Files
finn-mcp/finn_eiendom/ad.py
T
2026-05-16 06:54:17 +00:00

194 lines
6.8 KiB
Python

"""FINN listing detail scraping and normalization."""
import logging
import re
from datetime import UTC, datetime
from bs4 import BeautifulSoup
from .http import HTTPClient
from .models import FinnAd
from .parser import (
clean_text,
extract_finnkode_from_url,
normalize_area,
normalize_finnkode,
normalize_number,
normalize_price,
text_to_bool,
)
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
# URL of a listing's detail page; the ``{}`` placeholder is filled with the finnkode.
FINN_AD_URL_TEMPLATE = "https://www.finn.no/realestate/homes/ad.html?finnkode={}"
async def fetch_ad(finnkode: str, client: HTTPClient | None = None) -> str:
    """Download the raw HTML of a FINN listing page.

    Args:
        finnkode: Listing identifier substituted into ``FINN_AD_URL_TEMPLATE``.
        client: HTTP client used for the request. When it is omitted (or
            falsy), a new ``HTTPClient`` with ``request_delay_seconds=0.0``
            is created for this call.

    Returns:
        The ``text`` attribute of the response returned by ``client.get``.
    """
    if not client:
        client = HTTPClient(request_delay_seconds=0.0)
    page = await client.get(FINN_AD_URL_TEMPLATE.format(finnkode))
    return page.text
def _load_property_map(soup: BeautifulSoup) -> dict[str, str]:
    """Collect the page's ``<dt>``/``<dd>`` texts into a lookup table.

    Every ``<dt>`` and every ``<dd>`` element of the document is gathered and
    the two lists are paired by position; surplus elements on either side are
    ignored. Keys are the cleaned, lower-cased ``<dt>`` texts and values the
    cleaned ``<dd>`` texts, with ``""`` used whenever cleaning gives a falsy
    result. If a key occurs more than once, the last occurrence wins.
    """
    labels = soup.find_all("dt")
    contents = soup.find_all("dd")
    # NOTE(review): pairing is purely positional, so a <dt> without its own
    # <dd> would shift every later pair -- confirm the markup is regular.
    return {
        (clean_text(label.get_text()) or "").lower(): clean_text(content.get_text()) or ""
        for label, content in zip(labels, contents, strict=False)
    }
def _get_data_testid_value(soup: BeautifulSoup, testid: str) -> str | None:
    """Return the cleaned text of the first element with the given test id.

    Args:
        soup: Parsed listing page.
        testid: Value of the ``data-testid`` attribute to look for.

    Returns:
        The element's text (pieces joined by single spaces and stripped),
        passed through ``clean_text``; ``None`` when no element matches.
    """
    selector = f'[data-testid="{testid}"]'
    element = soup.select_one(selector)
    if element:
        return clean_text(element.get_text(" ", strip=True))
    return None
def _strip_labelled_text(text: str | None, labels: list[str]) -> str | None:
    """Remove a leading label from ``text``.

    Labels are tried in the given order and compared case-insensitively. For
    the first one that ``text`` starts with, as many characters as the label
    is long are cut off the front and the rest is passed through
    ``clean_text``.

    Returns:
        ``None`` for empty or missing ``text``, the cleaned remainder when a
        label matched, and ``text`` unchanged otherwise.
    """
    if not text:
        return None
    lowered = text.lower()
    matched = next((label for label in labels if lowered.startswith(label.lower())), None)
    if matched is None:
        return text
    return clean_text(text[len(matched) :])
def _extract_floor_from_text(text: str | None) -> str | None:
if not text:
return None
match = re.search(r"(\d+)\s*\.?\s*etasje", text, re.IGNORECASE)
if match:
return f"{match.group(1)}. etasje"
return None
def _clean_description(text: str | None) -> str | None:
    """Strip leading section headings from a description and clean it.

    A leading "om boligen" and then a leading "beskrivelse" (both matched
    case-insensitively) are removed, with surrounding whitespace trimmed
    after each removal. The remainder is passed through ``clean_text``.

    Returns:
        ``None`` for empty or missing ``text``, otherwise whatever
        ``clean_text`` returns for the remainder.
    """
    if not text:
        return None
    remainder = text
    # Order matters: the second heading is only removed from the front of
    # what is left after the first one has been stripped.
    for heading_pattern in (r"(?i)^om boligen", r"(?i)^beskrivelse"):
        remainder = re.sub(heading_pattern, "", remainder).strip()
    return clean_text(remainder)
def _load_feature_text(soup: BeautifulSoup) -> str:
    """Return the text of the "object-facilities" element, or ``""`` if absent or empty."""
    facilities = _get_data_testid_value(soup, "object-facilities")
    return facilities if facilities else ""
def _extract_description(soup: BeautifulSoup) -> str | None:
    """Extract the listing's descriptive text.

    The description container is the element with ``data-testid="om boligen"``
    or, failing that, the first element with class ``description``. If it
    holds ``<p>`` elements with non-empty cleaned text, those texts are
    joined with newlines. Otherwise the container's whole text is run through
    ``_clean_description``.

    Returns:
        The description, or ``None`` when no container is found.
    """
    container = soup.select_one('[data-testid="om boligen"]')
    if not container:
        container = soup.select_one(".description")
    if not container:
        return None

    lines = []
    for paragraph in container.select("p"):
        cleaned = clean_text(paragraph.get_text())
        if cleaned:
            lines.append(cleaned)

    if not lines:
        # No usable paragraphs: fall back to the container's flattened text.
        return _clean_description(container.get_text(" ", strip=True))
    return "\n".join(lines)
def scrape_ad(html: str, url: str | None = None) -> FinnAd:
    """Scrape a FINN listing HTML page into a FinnAd model.

    Values are read from two places in the page: elements carrying a
    ``data-testid`` attribute, and the ``<dt>``/``<dd>`` property table
    (keyed by lower-cased label). Each field lists its sources in order of
    preference and takes the first one that yields a truthy value.

    Args:
        html: Markup of the listing page.
        url: Address the markup came from. It is stored on the result (``""``
            when missing) and is the only input the ``finnkode`` is extracted
            from; a falsy extraction result is stored as ``""``.

    Returns:
        A ``FinnAd`` with ``broker_name`` and ``detail_fetched_at`` left as
        ``None``.
    """
    soup = BeautifulSoup(html, "html.parser")
    title_node = soup.select_one("h1")
    title_text = title_node.get_text() if title_node else ""
    broker_node = soup.select_one(".broker-name")
    properties = _load_property_map(soup)
    feature_text = _load_feature_text(soup).lower()

    finnkode = normalize_finnkode(extract_finnkode_from_url(url or "")) or ""

    # Location and classification: data-testid elements first, table second.
    address = _get_data_testid_value(soup, "object-address") or properties.get("adresse")
    district = _get_data_testid_value(soup, "local-area-name") or properties.get("område")
    # The table is keyed by the lower-cased label, so a row labelled
    # "Eieform" (the label stripped just below) is stored as "eieform".
    # The previously used key "eierform" is kept as a further fallback.
    ownership_type = (
        _strip_labelled_text(
            _get_data_testid_value(soup, "info-ownership-type"), ["Eieform", "Eiendomstype"]
        )
        or properties.get("eieform")
        or properties.get("eierform")
    )
    property_type = _strip_labelled_text(
        _get_data_testid_value(soup, "info-property-type"), ["Boligtype", "Eiendomstype"]
    ) or properties.get("eiendomstype")

    # Prices and costs: table first, data-testid elements second.
    # NOTE(review): the test id below is spelled "incicative"; presumably it
    # mirrors the site's markup -- confirm before changing it.
    asking_price = normalize_price(
        properties.get("prisantydning") or _get_data_testid_value(soup, "pricing-incicative-price")
    )
    total_price_value = normalize_price(
        properties.get("totalpris") or _get_data_testid_value(soup, "pricing-total-price")
    )
    shared_debt = normalize_price(
        properties.get("fellesgjeld") or _get_data_testid_value(soup, "pricing-joint-debt")
    )
    common_costs = normalize_number(
        properties.get("felles utgifter")
        or _get_data_testid_value(soup, "pricing-common-monthly-cost")
    )

    # Size and layout.
    area_m2 = normalize_area(
        properties.get("boligareal")
        or _get_data_testid_value(soup, "info-usable-i-area")
        or _get_data_testid_value(soup, "info-usable-area")
    )
    rooms = normalize_number(properties.get("rom") or _get_data_testid_value(soup, "info-rooms"))
    bedrooms = normalize_number(
        properties.get("soverom") or _get_data_testid_value(soup, "info-bedrooms")
    )
    # The floor may only be mentioned in the headline, so the title is
    # consulted before the dedicated data-testid element.
    floor = (
        properties.get("etasje")
        or _extract_floor_from_text(title_text)
        or _get_data_testid_value(soup, "info-floor")
    )
    construction_year = normalize_number(
        properties.get("byggeår") or _get_data_testid_value(soup, "info-construction-year")
    )
    energy_rating = properties.get("energimerking")
    heating = properties.get("oppvarming")

    # Amenities: a table entry and/or a keyword in the facilities text.
    has_balcony = text_to_bool(properties.get("balkonger/terrasser")) or "balkong" in feature_text
    has_terrace = "terrasse" in feature_text
    has_elevator = text_to_bool(properties.get("heis")) or "heis" in feature_text
    has_parking = (
        bool(properties.get("parkering/garasje"))
        or "parkering" in feature_text
        or "garasje" in feature_text
    )

    # NOTE(review): the ".broker-name" text is stored as broker_company while
    # broker_name stays None -- confirm this mapping is intended.
    broker_company = None
    if broker_node:
        broker_company = clean_text(broker_node.get_text())

    listing_description = _extract_description(soup)

    ad = FinnAd(
        finnkode=finnkode,
        url=url or "",
        title=clean_text(title_text) if title_node else None,
        address=address,
        postal_area=properties.get("postnummer"),
        district=district,
        property_type=property_type,
        ownership_type=ownership_type,
        asking_price=asking_price,
        total_price=total_price_value,
        shared_debt=shared_debt,
        common_costs=common_costs,
        municipal_fee=normalize_number(properties.get("kommunale avgifter")),
        other_fees=normalize_number(properties.get("andre utgifter")),
        area_m2=area_m2,
        rooms=rooms,
        bedrooms=bedrooms,
        floor=floor,
        construction_year=construction_year,
        energy_rating=energy_rating,
        heating=heating,
        has_balcony=has_balcony,
        has_terrace=has_terrace,
        has_elevator=has_elevator,
        has_parking=has_parking,
        listing_description=listing_description,
        broker_name=None,
        broker_company=broker_company,
        detail_fetched_at=None,
    )
    return ad
async def fetch_ad_details(finnkode: str, client: HTTPClient | None = None) -> FinnAd:
    """Fetch a FINN listing and return it as a parsed, timestamped FinnAd.

    Args:
        finnkode: Listing identifier.
        client: Optional HTTP client, forwarded to ``fetch_ad``.

    Returns:
        The ``FinnAd`` produced by ``scrape_ad`` for the listing's URL, with
        ``detail_fetched_at`` set to the current time in UTC.
    """
    listing_url = FINN_AD_URL_TEMPLATE.format(finnkode)
    page_html = await fetch_ad(finnkode, client=client)
    listing = scrape_ad(page_html, url=listing_url)
    listing.detail_fetched_at = datetime.now(UTC)
    return listing