"""FINN search scraping and parsing."""

import logging
import re
from urllib.parse import urljoin

from bs4 import BeautifulSoup

from . import cache
from .config import FINN_CACHE_TTL_SEARCH_MINUTES
from .http import HTTPClient
from .models import FinnSearchCard
from .parser import (
    clean_text,
    extract_finnkode_from_url,
    normalize_area,
    normalize_finnkode,
    normalize_number,
    normalize_price,
)

logger = logging.getLogger(__name__)

# CSS selectors shared by the link extractor and the card parser.
_CARD_SELECTOR = "article.listing-card, article.sf-search-ad"
_AD_LINK_SELECTOR = "a[href*='finnkode']"

# Number followed by an area unit, e.g. "72 m²", "72,5 m2", "72 kvm".
_AREA_RE = re.compile(r"(\d+(?:[.,]\d+)?)\s*(?:m²|m2|kvm)", re.I)

# A "kr" amount. The leading \d keeps a whitespace-only run in front of "kr"
# from counting as a price. Case-sensitive, like the original fallback.
_FIRST_PRICE_RE = re.compile(r"(\d[\d\s]*kr)")

_BEDROOMS_PATTERN = r"(\d+)\s*soverom"
_COMMON_COSTS_PATTERN = r"(?:Fellesutg|Felleskost(?:er)?)[^\d]*(\d+[\d\s]*)kr"


async def fetch_search_page(url: str, client: HTTPClient | None = None) -> str:
    """Fetch a FINN search page and return its HTML.

    Args:
        url: Address of the search page.
        client: HTTP client to use. When omitted, a new ``HTTPClient`` is
            created with ``request_delay_seconds=0.0``.

    Returns:
        The ``text`` attribute of the response returned by ``client.get``.
    """
    if client is None:
        client = HTTPClient(request_delay_seconds=0.0)
    response = await client.get(url)
    return response.text


async def fetch_search_page_cached(
    url: str,
    client: HTTPClient | None = None,
    conn: cache.sqlite3.Connection | None = None,
    use_cache: bool = True,
) -> str:
    """Fetch a FINN search page with optional SQLite caching.

    Args:
        url: Address of the search page; also used as the cache key.
        client: HTTP client to use; a default one is created when omitted.
        conn: Cache connection; ``cache.init_db()`` is called when omitted.
        use_cache: When true, a non-empty cached page is returned without
            hitting the network.

    Returns:
        The page HTML, either from the cache or freshly fetched.

    Note:
        A freshly fetched page is always written to the cache, even when
        ``use_cache`` is false; the flag only controls the read side.
    """
    if client is None:
        client = HTTPClient(request_delay_seconds=0.0)
    if conn is None:
        conn = cache.init_db()

    if use_cache:
        cached_html = cache.get_search_page(conn, url)
        if cached_html:
            logger.debug("Using cached search page: %s", url)
            return cached_html

    html = await fetch_search_page(url, client=client)
    cache.save_search_page(conn, url, html, ttl_minutes=FINN_CACHE_TTL_SEARCH_MINUTES)
    return html


def extract_ad_links(html: str) -> list[str]:
    """Extract listing URLs from FINN search HTML.

    For every listing card, the first anchor whose ``href`` contains
    ``finnkode`` is used. Cards without such an anchor, and hrefs that are
    empty after ``clean_text``, are skipped so the result never contains
    blank entries.

    Args:
        html: Raw HTML of a search result page.

    Returns:
        The cleaned href values in document order (not resolved against any
        base URL).
    """
    soup = BeautifulSoup(html, "html.parser")
    links: list[str] = []
    for article in soup.select(_CARD_SELECTOR):
        anchor = article.select_one(_AD_LINK_SELECTOR)
        href = anchor.get("href") if anchor is not None else None
        if not href:
            continue
        link = clean_text(href)
        if link:
            links.append(link)
    return links


def _extract_int_from_text(text: str, pattern: str) -> int | None:
    """Return group 1 of the first case-insensitive match, passed through ``normalize_number``."""
    match = re.search(pattern, text, re.I)
    if match:
        return normalize_number(match.group(1))
    return None


def _extract_area_from_text(text: str) -> int | None:
    """Return the last area figure ("<n> m²", "m2" or "kvm") found in *text*.

    The last match is used rather than the first; the numeric string is
    handed to ``normalize_area``.
    """
    matches = _AREA_RE.findall(text)
    if matches:
        return normalize_area(matches[-1])
    return None


def _extract_price_from_text(text: str, label: str) -> int | None:
    """Return the "kr" amount that follows *label* in *text*.

    Args:
        text: Text to search (case-insensitive).
        label: Regex fragment for the label; it is interpolated into the
            pattern as-is, so callers must escape any literal metacharacters.

    Returns:
        The matched amount passed through ``normalize_price``, or ``None``
        when the label is not followed by a digit-led "kr" amount.
    """
    # Require a leading digit so "Totalpris kr" (no amount) is not a match.
    pattern = rf"{label}[:\s]*(\d[\d\s]*kr)"
    match = re.search(pattern, text, re.I)
    if match:
        return normalize_price(match.group(1))
    return None


def _element_text(elem) -> str | None:
    """Return the cleaned text of a BeautifulSoup element, or ``None`` if it is missing."""
    if elem is None:
        return None
    return clean_text(elem.get_text())


def _parse_search_card(card) -> FinnSearchCard | None:
    """Build a ``FinnSearchCard`` from one listing ``<article>`` element.

    Dedicated CSS classes are preferred for each field; when one is absent
    the card's full text is searched with regexes instead. Returns ``None``
    when no finnkode can be determined for the card.
    """
    anchor = card.select_one(_AD_LINK_SELECTOR)
    url = anchor.get("href") if anchor is not None else ""
    # The data-id attribute wins; the URL is only consulted when it is missing.
    finnkode = normalize_finnkode(card.get("data-id") or extract_finnkode_from_url(url))
    if not finnkode:
        return None

    card_text = clean_text(card.get_text(" ") or "")

    bedrooms = None
    bedrooms_elem = card.select_one(".bedrooms")
    if bedrooms_elem is not None:
        bedrooms = normalize_number(bedrooms_elem.get_text())
    elif card_text:
        bedrooms = _extract_int_from_text(card_text, _BEDROOMS_PATTERN)

    common_costs = None
    common_costs_elem = card.select_one(".common-costs")
    if common_costs_elem is not None:
        common_costs = normalize_number(common_costs_elem.get_text())
    elif card_text:
        common_costs = _extract_int_from_text(card_text, _COMMON_COSTS_PATTERN)

    # Price resolution order: .price element, then a labelled "Totalpris"
    # amount, then the first "kr" amount anywhere in the card text.
    total_price = None
    price_elem = card.select_one(".price")
    if price_elem is not None:
        total_price = normalize_price(price_elem.get_text())
    if not total_price and card_text:
        total_price = _extract_price_from_text(card_text, r"Totalpris")
    if not total_price and card_text:
        first_price_match = _FIRST_PRICE_RE.search(card_text)
        if first_price_match:
            total_price = normalize_price(first_price_match.group(1))

    area_m2 = None
    area_elem = card.select_one(".area")
    if area_elem is not None:
        area_m2 = normalize_area(area_elem.get_text())
    elif card_text:
        area_m2 = _extract_area_from_text(card_text)

    return FinnSearchCard(
        finnkode=finnkode,
        url=url or "",
        title=_element_text(
            card.select_one(".title, h2.sf-realestate-heading, a.sf-search-ad-link")
        ),
        address=_element_text(card.select_one(".location, .sf-realestate-location")),
        area_m2=area_m2,
        asking_price=None,  # not extracted from search cards
        total_price=total_price,
        common_costs=common_costs,
        property_type=_element_text(card.select_one(".property-type")),
        ownership_type=_element_text(card.select_one(".ownership-type")),
        bedrooms=bedrooms,
        floor=None,  # not extracted from search cards
        broker_company=_element_text(card.select_one(".broker-company")),
    )


def extract_search_cards(html: str) -> list[FinnSearchCard]:
    """Parse FINN search HTML and return a list of FinnSearchCard objects.

    Cards for which no finnkode can be determined (neither from ``data-id``
    nor from the listing URL) are skipped.

    Args:
        html: Raw HTML of a search result page.

    Returns:
        One ``FinnSearchCard`` per parseable listing card, in document order.
    """
    logger.debug("Extracting FINN search cards")
    soup = BeautifulSoup(html, "html.parser")
    cards: list[FinnSearchCard] = []
    for card in soup.select(_CARD_SELECTOR):
        card_data = _parse_search_card(card)
        if card_data is None:
            logger.debug("Skipping card with missing finnkode")
            continue
        cards.append(card_data)
        logger.debug("Parsed FINN search card %s", card_data.finnkode)
    return cards


def find_next_page_url(html: str) -> str | None:
    """Return the FINN search next page URL if present.

    The value is the cleaned ``href`` of the first ``<a rel="next">`` element
    exactly as it appears in the markup, so it may be a relative URL.
    Returns ``None`` when there is no such link or its href is empty.
    """
    soup = BeautifulSoup(html, "html.parser")
    next_link = soup.select_one("a[rel='next']")
    if next_link is not None and next_link.get("href"):
        return clean_text(next_link.get("href"))
    return None


async def fetch_search_pages(
    start_url: str,
    max_pages: int = 1,
    client: HTTPClient | None = None,
    use_cache: bool = True,
) -> list[FinnSearchCard]:
    """Fetch paginated FINN search pages and parse search cards.

    Starting at *start_url*, follows ``rel="next"`` links until *max_pages*
    pages have been fetched, no next link is found, or the next link points
    at a page that was already visited.

    Args:
        start_url: URL of the first search page.
        max_pages: Upper bound on the number of pages fetched; values below 1
            fetch nothing.
        client: HTTP client to use; a default one is created when omitted.
        use_cache: Forwarded to ``fetch_search_page_cached``.

    Returns:
        The cards from all fetched pages, in fetch order.
    """
    if client is None:
        client = HTTPClient(request_delay_seconds=0.0)
    # NOTE(review): this connection is never closed here; confirm whether
    # cache.init_db() returns a shared connection before adding a close().
    conn = cache.init_db()

    url = start_url
    visited = {url}
    all_cards: list[FinnSearchCard] = []
    for _ in range(max_pages):
        html = await fetch_search_page_cached(url, client=client, conn=conn, use_cache=use_cache)
        all_cards.extend(extract_search_cards(html))

        next_href = find_next_page_url(html)
        if not next_href:
            break
        # Pagination hrefs may be relative ("?page=2"); resolve them against
        # the page they came from. Absolute hrefs pass through unchanged.
        next_url = urljoin(url, next_href)
        if next_url in visited:
            # A next link that loops back would only duplicate cards.
            logger.debug("Next page link already visited, stopping: %s", next_url)
            break
        visited.add(next_url)
        url = next_url
        logger.debug("Following next page link: %s", url)
    return all_cards