# finn-mcp/finn_eiendom/search.py

"""FINN search scraping and parsing."""

import logging
import re
from urllib.parse import urljoin

from bs4 import BeautifulSoup

from . import cache
from .config import FINN_CACHE_TTL_SEARCH_MINUTES
from .http import HTTPClient
from .models import FinnSearchCard
from .parser import (
    clean_text,
    extract_finnkode_from_url,
    normalize_area,
    normalize_finnkode,
    normalize_number,
    normalize_price,
)

logger = logging.getLogger(__name__)


async def fetch_search_page(url: str, client: HTTPClient | None = None) -> str:
    """Download one FINN search page and return its HTML.

    Args:
        url: Address of the search page to request.
        client: HTTP client used for the request. When falsy, a new
            ``HTTPClient`` with ``request_delay_seconds=0.0`` is created.

    Returns:
        The ``text`` attribute of the response returned by ``client.get``.
    """
    http_client = client if client else HTTPClient(request_delay_seconds=0.0)
    return (await http_client.get(url)).text


async def fetch_search_page_cached(
    url: str,
    client: HTTPClient | None = None,
    conn: cache.sqlite3.Connection | None = None,
    use_cache: bool = True,
) -> str:
    """Return the HTML of a FINN search page, consulting the cache first.

    Args:
        url: Address of the search page; also used as the cache key.
        client: HTTP client for cache misses. When falsy, a new
            ``HTTPClient`` with ``request_delay_seconds=0.0`` is created.
        conn: Cache connection. When falsy, ``cache.init_db()`` supplies one.
        use_cache: When false the cache lookup is skipped; the freshly
            fetched page is still written to the cache.

    Returns:
        The cached HTML when a non-empty entry is found, otherwise the
        freshly downloaded HTML.
    """
    http_client = client if client else HTTPClient(request_delay_seconds=0.0)
    db = conn if conn else cache.init_db()

    # Only touch the cache for reads when the caller allows it.
    hit = cache.get_search_page(db, url) if use_cache else None
    if hit:
        logger.debug("Using cached search page: %s", url)
        return hit

    fresh_html = await fetch_search_page(url, client=http_client)
    cache.save_search_page(
        db,
        url,
        fresh_html,
        ttl_minutes=FINN_CACHE_TTL_SEARCH_MINUTES,
    )
    return fresh_html


def extract_ad_links(html: str) -> list[str]:
    """Collect listing hrefs from FINN search HTML.

    For each ``article.listing-card`` / ``article.sf-search-ad`` element the
    first anchor whose ``href`` contains ``finnkode`` is used.

    Args:
        html: Markup of a FINN search results page.

    Returns:
        The cleaned ``href`` values in document order. An href that
        ``clean_text`` reduces to a falsy value is stored as ``""``.
    """
    document = BeautifulSoup(html, "html.parser")
    anchors = (
        listing.select_one("a[href*='finnkode']")
        for listing in document.select("article.listing-card, article.sf-search-ad")
    )
    return [
        clean_text(anchor.get("href")) or ""
        for anchor in anchors
        if anchor and anchor.get("href")
    ]


def _extract_int_from_text(text: str, pattern: str) -> int | None:
    match = re.search(pattern, text, re.I)
    if match:
        return normalize_number(match.group(1))
    return None


def _extract_area_from_text(text: str) -> int | None:
    matches = re.findall(r"(\d+(?:[.,]\d+)?)\s*(?:m²|m2|kvm)", text, re.I)
    if matches:
        return normalize_area(matches[-1])
    return None


def _extract_price_from_text(text: str, label: str) -> int | None:
    pattern = rf"{label}[:\s]*([\d\s]+kr)"
    match = re.search(pattern, text, re.I)
    if match:
        return normalize_price(match.group(1))
    return None


def extract_search_cards(html: str) -> list[FinnSearchCard]:
    """Parse FINN search HTML and return a list of FinnSearchCard objects.

    Every ``article.listing-card`` / ``article.sf-search-ad`` element is
    treated as one card. Dedicated child elements (``.price``, ``.area``,
    ``.bedrooms``, ``.common-costs``) are preferred; when one is absent the
    value is recovered from the card's flattened text with regular
    expressions.

    Args:
        html: Markup of a FINN search results page.

    Returns:
        One ``FinnSearchCard`` per article for which a finnkode could be
        determined, in document order. ``asking_price`` and ``floor`` are
        always ``None`` because this parser does not extract them.
    """
    logger.debug("Extracting FINN search cards")
    soup = BeautifulSoup(html, "html.parser")
    cards: list[FinnSearchCard] = []

    for card in soup.select("article.listing-card, article.sf-search-ad"):
        # The finnkode comes from the data-id attribute when present,
        # otherwise from the listing link.
        data_id = card.get("data-id")
        anchor = card.select_one("a[href*='finnkode']")
        url = anchor.get("href") if anchor else ""
        finnkode = normalize_finnkode(data_id or extract_finnkode_from_url(url))
        if not finnkode:
            logger.debug("Skipping card with missing finnkode")
            continue

        title_elem = card.select_one(".title, h2.sf-realestate-heading, a.sf-search-ad-link")
        address_elem = card.select_one(".location, .sf-realestate-location")
        area_elem = card.select_one(".area")
        price_elem = card.select_one(".price")
        common_costs_elem = card.select_one(".common-costs")
        bedrooms_elem = card.select_one(".bedrooms")
        property_type_elem = card.select_one(".property-type")
        ownership_type_elem = card.select_one(".ownership-type")
        broker_elem = card.select_one(".broker-company")

        # Flattened card text, used by the regex fallbacks below.
        card_text = clean_text(card.get_text(" ") or "")

        bedrooms = None
        if bedrooms_elem:
            bedrooms = normalize_number(bedrooms_elem.get_text())
        elif card_text:
            bedrooms = _extract_int_from_text(card_text, r"(\d+)\s*soverom")

        common_costs = None
        if common_costs_elem:
            common_costs = normalize_number(common_costs_elem.get_text())
        elif card_text:
            common_costs = _extract_int_from_text(
                card_text, r"(?:Fellesutg|Felleskost(?:er)?)[^\d]*(\d+[\d\s]*)kr"
            )

        # Price resolution order: dedicated element, then a "Totalpris"
        # label in the text, then the first "<digits> kr" amount anywhere.
        total_price = None
        if price_elem:
            total_price = normalize_price(price_elem.get_text())
        if not total_price and card_text:
            total_price = _extract_price_from_text(card_text, r"Totalpris")
        if not total_price and card_text:
            # Require a leading digit: without it the first hit can be the
            # " kr" at the start of an ordinary word, which would shadow a
            # real amount appearing later in the text.
            first_price_match = re.search(r"(\d[\d\s]*kr)", card_text)
            if first_price_match:
                total_price = normalize_price(first_price_match.group(1))

        area_m2 = None
        if area_elem:
            area_m2 = normalize_area(area_elem.get_text())
        elif card_text:
            area_m2 = _extract_area_from_text(card_text)

        card_data = FinnSearchCard(
            finnkode=finnkode,
            url=url or "",
            title=clean_text(title_elem.get_text()) if title_elem else None,
            address=clean_text(address_elem.get_text()) if address_elem else None,
            area_m2=area_m2,
            asking_price=None,
            total_price=total_price,
            common_costs=common_costs,
            property_type=clean_text(property_type_elem.get_text()) if property_type_elem else None,
            ownership_type=clean_text(ownership_type_elem.get_text())
            if ownership_type_elem
            else None,
            bedrooms=bedrooms,
            floor=None,
            broker_company=clean_text(broker_elem.get_text()) if broker_elem else None,
        )
        cards.append(card_data)
        logger.debug("Parsed FINN search card %s", finnkode)

    return cards


def find_next_page_url(html: str, base_url: str = "https://www.finn.no") -> str | None:
    """Return the FINN search next page URL if present.

    Args:
        html: Markup of a FINN search results page.
        base_url: URL that a relative ``href`` is resolved against.

    Returns:
        The absolute URL of the first anchor whose ``rel`` attribute contains
        the ``next`` token, or ``None`` when there is no such anchor or its
        ``href`` is empty.
    """
    soup = BeautifulSoup(html, "html.parser")
    # ``rel`` is a space-separated token list, so match "next" as one of the
    # tokens (``~=``) rather than as the entire attribute value.
    next_link = soup.select_one("a[rel~='next']")
    if next_link and next_link.get("href"):
        href = clean_text(next_link.get("href"))
        if href:
            return urljoin(base_url, href)
    return None


async def fetch_search_pages(
    start_url: str,
    max_pages: int = 1,
    client: HTTPClient | None = None,
    use_cache: bool = True,
) -> list[FinnSearchCard]:
    """Fetch paginated FINN search pages and parse search cards.

    Starting at *start_url*, each page is fetched (through the cache unless
    *use_cache* is false), parsed into cards, and its "next" link followed.

    Args:
        start_url: URL of the first search results page.
        max_pages: Upper bound on the number of pages fetched. A value of
            zero or less fetches nothing.
        client: HTTP client shared by all page requests. When falsy, a new
            ``HTTPClient`` with ``request_delay_seconds=0.0`` is created.
        use_cache: Forwarded to ``fetch_search_page_cached``.

    Returns:
        The cards of all fetched pages, in fetch order.
    """
    client = client or HTTPClient(request_delay_seconds=0.0)
    # NOTE(review): the connection from cache.init_db() is never closed here;
    # confirm whether init_db() hands out a shared connection before adding
    # cleanup.
    conn = cache.init_db()
    url = start_url
    visited: set[str] = set()
    all_cards: list[FinnSearchCard] = []

    for _ in range(max_pages):
        visited.add(url)
        html = await fetch_search_page_cached(url, client=client, conn=conn, use_cache=use_cache)
        all_cards.extend(extract_search_cards(html))
        # Relative links are resolved against the page they were found on.
        next_url = find_next_page_url(html, base_url=url)
        # Stop on the last page, or when the "next" link points back at a
        # page already fetched (which would only duplicate cards).
        if not next_url or next_url in visited:
            break
        url = next_url
        logger.debug("Following next page link: %s", url)

    return all_cards