Co-authored-by: Copilot <copilot@github.com>
This commit is contained in:
Ole
2026-05-18 21:31:52 +00:00
parent 6eedfffa4d
commit c9383788de
22 changed files with 1614 additions and 42 deletions
+6 -3
View File
@@ -2,6 +2,7 @@
import logging
import re
from urllib.parse import urljoin
from bs4 import BeautifulSoup
@@ -161,12 +162,14 @@ def extract_search_cards(html: str) -> list[FinnSearchCard]:
return cards
def find_next_page_url(html: str, base_url: str = "https://www.finn.no") -> str | None:
    """Return the absolute URL of the next FINN search page, if present.

    Args:
        html: Markup of a search results page.
        base_url: URL that the link's ``href`` is resolved against; an
            ``href`` that is already absolute is returned as-is by ``urljoin``.

    Returns:
        The resolved next-page URL, or ``None`` when the page has no
        ``a[rel='next']`` element or that element has no usable ``href``.
    """
    soup = BeautifulSoup(html, "html.parser")
    next_link = soup.select_one("a[rel='next']")
    if next_link is None:
        return None

    raw_href = next_link.get("href")
    if not raw_href:
        # Missing or empty attribute: nothing to follow.
        return None

    # clean_text may reduce the value to an empty string; treat that the same
    # as a missing link rather than resolving to base_url itself.
    href = clean_text(raw_href)
    if not href:
        return None

    # Resolve against base_url so callers always receive a fetchable URL,
    # never a bare relative href.
    return urljoin(base_url, href)
@@ -185,7 +188,7 @@ async def fetch_search_pages(
for _ in range(max_pages):
html = await fetch_search_page_cached(url, client=client, conn=conn, use_cache=use_cache)
all_cards.extend(extract_search_cards(html))
next_url = find_next_page_url(html)
next_url = find_next_page_url(html, base_url=start_url)
if not next_url:
break
url = next_url