"""FINN listing detail scraping and normalization."""

import logging
import re
from datetime import UTC, datetime

from bs4 import BeautifulSoup

from .http import HTTPClient
from .models import FinnAd
from .parser import (
    clean_text,
    extract_finnkode_from_url,
    normalize_area,
    normalize_finnkode,
    normalize_number,
    normalize_price,
    text_to_bool,
)

logger = logging.getLogger(__name__)

FINN_AD_URL_TEMPLATE = "https://www.finn.no/realestate/homes/ad.html?finnkode={}"

# Compiled once at import time; the patterns are the ones the helpers below rely on.
_FLOOR_RE = re.compile(r"(\d+)\s*\.?\s*etasje", re.IGNORECASE)
_DESCRIPTION_HEADING_RES = (
    re.compile(r"(?i)^om boligen"),
    re.compile(r"(?i)^beskrivelse"),
)


async def fetch_ad(finnkode: str, client: HTTPClient | None = None) -> str:
    """Fetch FINN listing HTML by finnkode.

    Args:
        finnkode: Listing identifier substituted into ``FINN_AD_URL_TEMPLATE``.
        client: HTTP client used for the request. When ``None``, a new
            ``HTTPClient(request_delay_seconds=0.0)`` is created for this call.

    Returns:
        The ``text`` attribute of the response returned by ``client.get``.
    """
    if client is None:
        # NOTE(review): this ad-hoc client is never explicitly closed here;
        # confirm whether HTTPClient needs explicit cleanup.
        client = HTTPClient(request_delay_seconds=0.0)
    response = await client.get(FINN_AD_URL_TEMPLATE.format(finnkode))
    return response.text


def _load_property_map(soup: BeautifulSoup) -> dict[str, str]:
    """Build a ``label -> value`` map from the page's ``<dt>``/``<dd>`` elements.

    Each ``<dt>`` is paired with the first ``<dd>`` sibling that follows it,
    rather than with the ``<dd>`` at the same position in the document. A term
    with zero or several descriptions therefore cannot shift every later
    label onto the wrong value.

    Args:
        soup: Parsed listing page.

    Returns:
        Mapping of cleaned, lower-cased ``<dt>`` text to cleaned ``<dd>`` text.
        Terms without a following ``<dd>`` sibling or with an empty label are
        omitted. When a label occurs more than once, the last occurrence wins.
    """
    properties: dict[str, str] = {}
    for term in soup.find_all("dt"):
        description = term.find_next_sibling("dd")
        if description is None:
            continue
        label = clean_text(term.get_text()) or ""
        if not label:
            # An unlabelled entry can never be looked up by name.
            continue
        properties[label.lower()] = clean_text(description.get_text()) or ""
    return properties


def _get_data_testid_value(soup: BeautifulSoup, testid: str) -> str | None:
    """Return the cleaned text of the first element with the given ``data-testid``.

    Args:
        soup: Parsed listing page.
        testid: Exact value of the ``data-testid`` attribute to look for.

    Returns:
        ``clean_text`` applied to the element's space-joined text, or ``None``
        when no element matches.
    """
    node = soup.select_one(f'[data-testid="{testid}"]')
    if node is None:
        return None
    return clean_text(node.get_text(" ", strip=True))


def _strip_labelled_text(text: str | None, labels: list[str]) -> str | None:
    """Remove the first matching leading label from ``text``.

    Args:
        text: Text that may start with one of ``labels``.
        labels: Candidate prefixes, compared case-insensitively and tried in order.

    Returns:
        ``None`` for empty/``None`` input; the cleaned remainder after the first
        label that prefixes ``text``; otherwise ``text`` unchanged.
    """
    if not text:
        return None
    lowered = text.lower()
    for label in labels:
        if lowered.startswith(label.lower()):
            return clean_text(text[len(label) :])
    return text


def _extract_floor_from_text(text: str | None) -> str | None:
    """Find a floor mention such as ``3. etasje`` in free text.

    Args:
        text: Text to search (``scrape_ad`` passes the page title).

    Returns:
        The floor formatted as ``"<digits>. etasje"``, or ``None`` when the text
        is empty or contains no match.
    """
    if not text:
        return None
    match = _FLOOR_RE.search(text)
    if match is None:
        return None
    return f"{match.group(1)}. etasje"


def _clean_description(text: str | None) -> str | None:
    """Strip a leading "om boligen" and then "beskrivelse" heading from ``text``.

    Args:
        text: Raw description text.

    Returns:
        ``None`` for empty/``None`` input, otherwise ``clean_text`` of the text
        with those leading headings (matched case-insensitively) removed.
    """
    if not text:
        return None
    cleaned = text
    # Order matters: the second heading is only matched after the first was removed.
    for heading_re in _DESCRIPTION_HEADING_RES:
        cleaned = heading_re.sub("", cleaned).strip()
    return clean_text(cleaned)


def _load_feature_text(soup: BeautifulSoup) -> str:
    """Return the text of the ``object-facilities`` element, or ``""`` if absent."""
    return _get_data_testid_value(soup, "object-facilities") or ""


def _extract_description(soup: BeautifulSoup) -> str | None:
    """Extract the listing description text.

    The container is the first element with ``data-testid="om boligen"``,
    falling back to the first ``.description`` element.

    Args:
        soup: Parsed listing page.

    Returns:
        The container's non-empty cleaned ``<p>`` texts joined by newlines; if it
        has no such paragraphs, its whole text passed through
        ``_clean_description``; ``None`` when no container exists.
    """
    node = soup.select_one('[data-testid="om boligen"]')
    if node is None:
        node = soup.select_one(".description")
    if node is None:
        return None

    paragraphs = [
        cleaned for p in node.select("p") if (cleaned := clean_text(p.get_text()))
    ]
    if paragraphs:
        return "\n".join(paragraphs)
    return _clean_description(node.get_text(" ", strip=True))


def scrape_ad(html: str, url: str | None = None) -> FinnAd:
    """Scrape a FINN listing HTML page into a FinnAd model.

    Values come from two places: elements carrying a ``data-testid`` attribute
    and the ``<dt>``/``<dd>`` property map. Address, district, ownership type
    and property type prefer the ``data-testid`` element. Prices, costs, area,
    room counts and construction year prefer the property map. The boolean
    amenity flags also consult the lower-cased ``object-facilities`` text.

    Args:
        html: Raw HTML of the listing page.
        url: URL the page was fetched from. It is stored on the result and used
            to derive ``finnkode``; when omitted both become ``""``.

    Returns:
        A ``FinnAd`` with ``broker_name`` and ``detail_fetched_at`` left as ``None``.
    """
    soup = BeautifulSoup(html, "html.parser")
    title_node = soup.select_one("h1")
    broker_node = soup.select_one(".broker-name")
    properties = _load_property_map(soup)
    feature_text = _load_feature_text(soup).lower()
    title_text = title_node.get_text() if title_node is not None else ""

    finnkode = normalize_finnkode(extract_finnkode_from_url(url or "")) or ""
    address = _get_data_testid_value(soup, "object-address") or properties.get("adresse")
    district = _get_data_testid_value(soup, "local-area-name") or properties.get("område")

    # The label stripped below is spelled "Eieform", so the property map must be
    # queried under that spelling; "eierform" is kept as a secondary fallback.
    ownership_type = (
        _strip_labelled_text(
            _get_data_testid_value(soup, "info-ownership-type"), ["Eieform", "Eiendomstype"]
        )
        or properties.get("eieform")
        or properties.get("eierform")
    )
    property_type = _strip_labelled_text(
        _get_data_testid_value(soup, "info-property-type"), ["Boligtype", "Eiendomstype"]
    ) or properties.get("eiendomstype")

    # NOTE(review): "incicative" is kept exactly as in the original selector;
    # confirm against the live markup before changing it.
    asking_price = normalize_price(
        properties.get("prisantydning")
        or _get_data_testid_value(soup, "pricing-incicative-price")
    )
    total_price_value = normalize_price(
        properties.get("totalpris") or _get_data_testid_value(soup, "pricing-total-price")
    )
    shared_debt = normalize_price(
        properties.get("fellesgjeld") or _get_data_testid_value(soup, "pricing-joint-debt")
    )
    common_costs = normalize_number(
        properties.get("felles utgifter")
        or _get_data_testid_value(soup, "pricing-common-monthly-cost")
    )
    area_m2 = normalize_area(
        properties.get("boligareal")
        or _get_data_testid_value(soup, "info-usable-i-area")
        or _get_data_testid_value(soup, "info-usable-area")
    )
    rooms = normalize_number(properties.get("rom") or _get_data_testid_value(soup, "info-rooms"))
    bedrooms = normalize_number(
        properties.get("soverom") or _get_data_testid_value(soup, "info-bedrooms")
    )
    floor = (
        properties.get("etasje")
        or _extract_floor_from_text(title_text)
        or _get_data_testid_value(soup, "info-floor")
    )
    construction_year = normalize_number(
        properties.get("byggeår") or _get_data_testid_value(soup, "info-construction-year")
    )
    energy_rating = properties.get("energimerking")
    heating = properties.get("oppvarming")

    has_balcony = text_to_bool(properties.get("balkonger/terrasser")) or "balkong" in feature_text
    has_terrace = "terrasse" in feature_text
    has_elevator = text_to_bool(properties.get("heis")) or "heis" in feature_text
    has_parking = (
        bool(properties.get("parkering/garasje"))
        or "parkering" in feature_text
        or "garasje" in feature_text
    )

    # The ".broker-name" element populates broker_company; broker_name itself is
    # left unset, exactly as before.
    broker_company = None
    if broker_node is not None:
        broker_company = clean_text(broker_node.get_text())

    listing_description = _extract_description(soup)

    return FinnAd(
        finnkode=finnkode,
        url=url or "",
        title=clean_text(title_text) if title_node is not None else None,
        address=address,
        postal_area=properties.get("postnummer"),
        district=district,
        property_type=property_type,
        ownership_type=ownership_type,
        asking_price=asking_price,
        total_price=total_price_value,
        shared_debt=shared_debt,
        common_costs=common_costs,
        municipal_fee=normalize_number(properties.get("kommunale avgifter")),
        other_fees=normalize_number(properties.get("andre utgifter")),
        area_m2=area_m2,
        rooms=rooms,
        bedrooms=bedrooms,
        floor=floor,
        construction_year=construction_year,
        energy_rating=energy_rating,
        heating=heating,
        has_balcony=has_balcony,
        has_terrace=has_terrace,
        has_elevator=has_elevator,
        has_parking=has_parking,
        listing_description=listing_description,
        broker_name=None,
        broker_company=broker_company,
        detail_fetched_at=None,
    )


async def fetch_ad_details(finnkode: str, client: HTTPClient | None = None) -> FinnAd:
    """Fetch FINN listing HTML and return a parsed FinnAd object.

    Args:
        finnkode: Listing identifier.
        client: Optional HTTP client forwarded to ``fetch_ad``.

    Returns:
        The scraped ``FinnAd`` with ``detail_fetched_at`` set to the current
        timezone-aware UTC time.
    """
    html = await fetch_ad(finnkode, client=client)
    ad = scrape_ad(html, url=FINN_AD_URL_TEMPLATE.format(finnkode))
    ad.detail_fetched_at = datetime.now(UTC)
    return ad