@@ -2,6 +2,7 @@
|
||||
|
||||
import logging
|
||||
import re
|
||||
from urllib.parse import urljoin
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
@@ -161,12 +162,14 @@ def extract_search_cards(html: str) -> list[FinnSearchCard]:
|
||||
return cards
|
||||
|
||||
|
||||
def find_next_page_url(html: str, base_url: str = "https://www.finn.no") -> str | None:
    """Return the absolute URL of the next FINN search page, if present.

    Looks for the first ``<a rel="next">`` element in *html* and resolves
    its ``href`` against *base_url*.

    Args:
        html: Markup of a search results page.
        base_url: URL that a relative ``href`` is resolved against. An
            already-absolute ``href`` is returned unchanged by ``urljoin``.

    Returns:
        The next page URL, or ``None`` when there is no next link or its
        ``href`` is missing or empty after cleaning.
    """
    soup = BeautifulSoup(html, "html.parser")
    next_link = soup.select_one("a[rel='next']")
    if not next_link:
        return None

    raw_href = next_link.get("href")
    if not raw_href:
        return None

    # clean_text may reduce the value to an empty string; treat that the
    # same as a missing link instead of returning base_url itself.
    href = clean_text(raw_href)
    if not href:
        return None

    return urljoin(base_url, href)
|
||||
|
||||
|
||||
@@ -185,7 +188,7 @@ async def fetch_search_pages(
|
||||
for _ in range(max_pages):
|
||||
html = await fetch_search_page_cached(url, client=client, conn=conn, use_cache=use_cache)
|
||||
all_cards.extend(extract_search_cards(html))
|
||||
next_url = find_next_page_url(html)
|
||||
next_url = find_next_page_url(html, base_url=start_url)
|
||||
if not next_url:
|
||||
break
|
||||
url = next_url
|
||||
|
||||
Reference in New Issue
Block a user