initial

2026-05-16 06:54:17 +00:00
commit 1399f61c1a
44 changed files with 6746 additions and 0 deletions
@@ -0,0 +1,194 @@
+"""FINN search scraping and parsing."""
+
+import logging
+import re
+
+from bs4 import BeautifulSoup
+
+from . import cache
+from .config import FINN_CACHE_TTL_SEARCH_MINUTES
+from .http import HTTPClient
+from .models import FinnSearchCard
+from .parser import (
+    clean_text,
+    extract_finnkode_from_url,
+    normalize_area,
+    normalize_finnkode,
+    normalize_number,
+    normalize_price,
+)
+
+logger = logging.getLogger(__name__)
+
+
+async def fetch_search_page(url: str, client: HTTPClient | None = None) -> str:
+    """Download a single FINN search result page and return its HTML.
+
+    Args:
+        url: Address of the search page to request.
+        client: HTTP client used for the request. When omitted (or falsy), a
+            fresh ``HTTPClient`` with ``request_delay_seconds=0.0`` is created.
+
+    Returns:
+        The ``text`` attribute of the response produced by ``client.get``.
+    """
+    if not client:
+        client = HTTPClient(request_delay_seconds=0.0)
+    page = await client.get(url)
+    return page.text
+
+
+async def fetch_search_page_cached(
+    url: str,
+    client: HTTPClient | None = None,
+    conn: cache.sqlite3.Connection | None = None,
+    use_cache: bool = True,
+) -> str:
+    """Return the HTML of a FINN search page, consulting the SQLite cache first.
+
+    Args:
+        url: Address of the search page; also used as the cache key.
+        client: HTTP client for cache misses. A ``HTTPClient`` with
+            ``request_delay_seconds=0.0`` is created when none is given.
+        conn: Cache database connection. ``cache.init_db()`` is called to
+            obtain one when none is given.
+        use_cache: When false the cache lookup is skipped, but the freshly
+            downloaded page is still written to the cache.
+
+    Returns:
+        The cached HTML when a non-empty entry is found, otherwise the HTML
+        that was just downloaded.
+    """
+    if not client:
+        client = HTTPClient(request_delay_seconds=0.0)
+    if not conn:
+        conn = cache.init_db()
+
+    stored_html = cache.get_search_page(conn, url) if use_cache else None
+    if stored_html:
+        logger.debug("Using cached search page: %s", url)
+        return stored_html
+
+    # Cache miss (or lookup bypassed): download, then store with the search TTL.
+    fresh_html = await fetch_search_page(url, client=client)
+    cache.save_search_page(
+        conn, url, fresh_html, ttl_minutes=FINN_CACHE_TTL_SEARCH_MINUTES
+    )
+    return fresh_html
+
+
+def extract_ad_links(html: str) -> list[str]:
+    """Collect the listing link of every search card found in FINN search HTML.
+
+    For each ``article.listing-card`` / ``article.sf-search-ad`` element, the
+    first anchor whose ``href`` contains ``finnkode`` is used. Cards without
+    such an anchor are skipped.
+
+    Args:
+        html: Raw HTML of a search result page.
+
+    Returns:
+        The cleaned ``href`` values in document order. An ``href`` that
+        ``clean_text`` reduces to a falsy value is stored as ``""``.
+    """
+    document = BeautifulSoup(html, "html.parser")
+    hrefs = []
+    for listing in document.select("article.listing-card, article.sf-search-ad"):
+        link = listing.select_one("a[href*='finnkode']")
+        if not link:
+            continue
+        href = link.get("href")
+        if not href:
+            continue
+        hrefs.append(clean_text(href) or "")
+    return hrefs
+
+
+def _extract_int_from_text(text: str, pattern: str) -> int | None:
+    """Search *text* case-insensitively and normalise the first capture group.
+
+    Args:
+        text: Text to search.
+        pattern: Regular expression with at least one capture group.
+
+    Returns:
+        ``normalize_number`` applied to group 1 of the first match, or
+        ``None`` when the pattern does not match.
+    """
+    found = re.search(pattern, text, re.I)
+    return normalize_number(found.group(1)) if found else None
+
+
+def _extract_area_from_text(text: str) -> int | None:
+    """Pull an area figure out of free text.
+
+    Looks for numbers (optionally with a ``.`` or ``,`` decimal part) followed
+    by ``m²``, ``m2`` or ``kvm``, ignoring case. When several are present the
+    last one in the text is used.
+
+    Returns:
+        ``normalize_area`` applied to the last matching number, or ``None``
+        when the text contains no area figure.
+    """
+    figures = re.findall(r"(\d+(?:[.,]\d+)?)\s*(?:m²|m2|kvm)", text, re.I)
+    if not figures:
+        return None
+    return normalize_area(figures[-1])
+
+
+def _extract_price_from_text(text: str, label: str) -> int | None:
+    """Find a ``<label> ... kr`` amount in free text.
+
+    Args:
+        text: Text to search (case-insensitive).
+        label: Regular-expression fragment that must precede the amount. It is
+            interpolated into the pattern as-is, without escaping.
+
+    Returns:
+        ``normalize_price`` applied to the captured ``"... kr"`` span, or
+        ``None`` when the label/amount pair is not found.
+    """
+    found = re.search(rf"{label}[:\s]*([\d\s]+kr)", text, re.I)
+    if found is None:
+        return None
+    return normalize_price(found.group(1))
+
+
+def extract_search_cards(html: str) -> list[FinnSearchCard]:
+    """Parse FINN search HTML and return a list of FinnSearchCard objects.
+
+    Every ``article.listing-card`` / ``article.sf-search-ad`` element is
+    treated as one card. A card is skipped when no finnkode can be derived
+    from its ``data-id`` attribute or from its ``finnkode`` link.
+
+    Each numeric field is read from its dedicated element when present
+    (``.bedrooms``, ``.common-costs``, ``.price``, ``.area``) and otherwise
+    recovered from the card's flattened text with regular expressions.
+    ``asking_price`` and ``floor`` are not available on search cards and are
+    always ``None``.
+
+    Args:
+        html: Raw HTML of a search result page.
+
+    Returns:
+        One ``FinnSearchCard`` per card with a usable finnkode, in document
+        order.
+    """
+    logger.debug("Extracting FINN search cards")
+    soup = BeautifulSoup(html, "html.parser")
+    cards: list[FinnSearchCard] = []
+
+    # Last-resort price pattern. It must start with a digit: the looser
+    # "[\d\s]+kr" also matches digit-less spans such as the " kr" in " krav",
+    # and because re.search returns the first match, such a span would hide a
+    # real price appearing later in the card text.
+    any_price_re = re.compile(r"(\d[\d\s]*kr)")
+
+    for card in soup.select("article.listing-card, article.sf-search-ad"):
+        data_id = card.get("data-id")
+        anchor = card.select_one("a[href*='finnkode']")
+        url = anchor.get("href") if anchor else ""
+        # Prefer the explicit data-id; fall back to the finnkode in the link.
+        finnkode = normalize_finnkode(data_id or extract_finnkode_from_url(url))
+        if not finnkode:
+            logger.debug("Skipping card with missing finnkode")
+            continue
+
+        title_elem = card.select_one(".title, h2.sf-realestate-heading, a.sf-search-ad-link")
+        address_elem = card.select_one(".location, .sf-realestate-location")
+        area_elem = card.select_one(".area")
+        price_elem = card.select_one(".price")
+        common_costs_elem = card.select_one(".common-costs")
+        bedrooms_elem = card.select_one(".bedrooms")
+        property_type_elem = card.select_one(".property-type")
+        ownership_type_elem = card.select_one(".ownership-type")
+        broker_elem = card.select_one(".broker-company")
+
+        # Flattened card text, used by every regex fallback below.
+        card_text = clean_text(card.get_text(" ") or "")
+
+        bedrooms = None
+        if bedrooms_elem:
+            bedrooms = normalize_number(bedrooms_elem.get_text())
+        elif card_text:
+            bedrooms = _extract_int_from_text(card_text, r"(\d+)\s*soverom")
+
+        common_costs = None
+        if common_costs_elem:
+            common_costs = normalize_number(common_costs_elem.get_text())
+        elif card_text:
+            common_costs = _extract_int_from_text(
+                card_text, r"(?:Fellesutg|Felleskost(?:er)?)[^\d]*(\d+[\d\s]*)kr"
+            )
+
+        # Price resolution order: dedicated element, then a labelled
+        # "Totalpris" amount, then the first "<digits> kr" amount in the text.
+        total_price = None
+        if price_elem:
+            total_price = normalize_price(price_elem.get_text())
+        if not total_price and card_text:
+            total_price = _extract_price_from_text(card_text, r"Totalpris")
+        if not total_price and card_text:
+            first_price_match = any_price_re.search(card_text)
+            if first_price_match:
+                total_price = normalize_price(first_price_match.group(1))
+
+        area_m2 = None
+        if area_elem:
+            area_m2 = normalize_area(area_elem.get_text())
+        elif card_text:
+            area_m2 = _extract_area_from_text(card_text)
+
+        card_data = FinnSearchCard(
+            finnkode=finnkode,
+            url=url or "",
+            title=clean_text(title_elem.get_text()) if title_elem else None,
+            address=clean_text(address_elem.get_text()) if address_elem else None,
+            area_m2=area_m2,
+            asking_price=None,
+            total_price=total_price,
+            common_costs=common_costs,
+            property_type=clean_text(property_type_elem.get_text()) if property_type_elem else None,
+            ownership_type=clean_text(ownership_type_elem.get_text())
+            if ownership_type_elem
+            else None,
+            bedrooms=bedrooms,
+            floor=None,
+            broker_company=clean_text(broker_elem.get_text()) if broker_elem else None,
+        )
+        cards.append(card_data)
+        logger.debug("Parsed FINN search card %s", finnkode)
+
+    return cards
+
+
+def find_next_page_url(html: str) -> str | None:
+    """Locate the pagination link that leads to the following search page.
+
+    Args:
+        html: Raw HTML of a search result page.
+
+    Returns:
+        The cleaned ``href`` of the first ``a[rel='next']`` element, exactly
+        as written in the page, or ``None`` when there is no such link or its
+        ``href`` is missing or empty.
+    """
+    document = BeautifulSoup(html, "html.parser")
+    pager_link = document.select_one("a[rel='next']")
+    if not pager_link:
+        return None
+    href = pager_link.get("href")
+    if not href:
+        return None
+    return clean_text(href)
+
+
+async def fetch_search_pages(
+    start_url: str,
+    max_pages: int = 1,
+    client: HTTPClient | None = None,
+    use_cache: bool = True,
+) -> list[FinnSearchCard]:
+    """Fetch paginated FINN search pages and parse search cards.
+
+    Starting at ``start_url``, each page is fetched through
+    ``fetch_search_page_cached`` and parsed, and its ``rel="next"`` link is
+    followed until ``max_pages`` pages have been read or no further page
+    exists.
+
+    Args:
+        start_url: URL of the first search page.
+        max_pages: Upper bound on the number of pages to fetch. Values below
+            one fetch nothing.
+        client: HTTP client shared by all requests. A ``HTTPClient`` with
+            ``request_delay_seconds=0.0`` is created when none is given.
+        use_cache: Forwarded to ``fetch_search_page_cached``.
+
+    Returns:
+        The cards of all fetched pages, in fetch order.
+    """
+    # Local import keeps this block self-contained; urljoin is stdlib.
+    from urllib.parse import urljoin
+
+    client = client or HTTPClient(request_delay_seconds=0.0)
+    conn = cache.init_db()
+    url = start_url
+    all_cards: list[FinnSearchCard] = []
+
+    for _ in range(max_pages):
+        html = await fetch_search_page_cached(url, client=client, conn=conn, use_cache=use_cache)
+        all_cards.extend(extract_search_cards(html))
+        next_url = find_next_page_url(html)
+        if not next_url:
+            break
+        # Pagination hrefs may be relative (e.g. "?page=2"); resolve them
+        # against the page they came from. Absolute links pass through.
+        resolved_url = urljoin(url, next_url)
+        if resolved_url == url:
+            # A "next" link pointing at the current page would only re-fetch
+            # it and duplicate its cards.
+            break
+        url = resolved_url
+        logger.debug("Following next page link: %s", url)
+
+    return all_cards