This commit is contained in:
Ole
2026-05-16 06:54:17 +00:00
commit 1399f61c1a
44 changed files with 6746 additions and 0 deletions
+193
View File
@@ -0,0 +1,193 @@
"""FINN listing detail scraping and normalization."""
import logging
import re
from datetime import UTC, datetime
from bs4 import BeautifulSoup
from .http import HTTPClient
from .models import FinnAd
from .parser import (
clean_text,
extract_finnkode_from_url,
normalize_area,
normalize_finnkode,
normalize_number,
normalize_price,
text_to_bool,
)
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# URL template for a single FINN listing page; the "{}" placeholder is filled
# with the listing's finnkode via ``str.format``.
FINN_AD_URL_TEMPLATE = "https://www.finn.no/realestate/homes/ad.html?finnkode={}"
async def fetch_ad(finnkode: str, client: HTTPClient | None = None) -> str:
    """Download the raw HTML of the FINN listing identified by ``finnkode``.

    Args:
        finnkode: Listing identifier substituted into ``FINN_AD_URL_TEMPLATE``.
        client: HTTP client used to issue the request. When omitted (or
            falsy), a new ``HTTPClient`` with a zero request delay is created
            for this call.

    Returns:
        The ``text`` attribute of the response returned by ``client.get``.
    """
    # NOTE(review): a client created here is never explicitly closed --
    # confirm whether HTTPClient requires cleanup.
    http = client if client else HTTPClient(request_delay_seconds=0.0)
    ad_url = FINN_AD_URL_TEMPLATE.format(finnkode)
    reply = await http.get(ad_url)
    return reply.text
def _load_property_map(soup: BeautifulSoup) -> dict[str, str]:
    """Map lower-cased ``<dt>`` labels to the text of their ``<dd>`` values.

    All ``<dt>`` and all ``<dd>`` elements in the document are collected
    separately and paired by position; surplus elements on either side are
    ignored. Missing text becomes an empty string, and a label that occurs
    more than once keeps the value of its last occurrence.
    """
    labels = soup.find_all("dt")
    values = soup.find_all("dd")
    return {
        (clean_text(label.get_text()) or "").lower(): clean_text(value.get_text()) or ""
        for label, value in zip(labels, values, strict=False)
    }
def _get_data_testid_value(soup: BeautifulSoup, testid: str) -> str | None:
    """Return the cleaned text of the first element with ``data-testid=testid``.

    The element's text fragments are stripped and joined with single spaces
    before being passed through ``clean_text``. Returns ``None`` when no
    element matches the selector.
    """
    selector = f'[data-testid="{testid}"]'
    element = soup.select_one(selector)
    if element:
        return clean_text(element.get_text(" ", strip=True))
    return None
def _strip_labelled_text(text: str | None, labels: list[str]) -> str | None:
if not text:
return None
for label in labels:
if text.lower().startswith(label.lower()):
return clean_text(text[len(label) :])
return text
def _extract_floor_from_text(text: str | None) -> str | None:
if not text:
return None
match = re.search(r"(\d+)\s*\.?\s*etasje", text, re.IGNORECASE)
if match:
return f"{match.group(1)}. etasje"
return None
def _clean_description(text: str | None) -> str | None:
if not text:
return None
cleaned = re.sub(r"(?i)^om boligen", "", text).strip()
cleaned = re.sub(r"(?i)^beskrivelse", "", cleaned).strip()
return clean_text(cleaned)
def _load_feature_text(soup: BeautifulSoup) -> str:
    """Return the text of the ``object-facilities`` element, or ``""`` if absent."""
    facilities = _get_data_testid_value(soup, "object-facilities")
    return facilities if facilities else ""
def _extract_description(soup: BeautifulSoup) -> str | None:
    """Extract the listing's free-text description.

    The container is the first element matching ``[data-testid="om boligen"]``
    or, failing that, ``.description``.

    Returns:
        The container's non-empty ``<p>`` texts (each passed through
        ``clean_text``) joined by newlines. If the container has no non-empty
        paragraphs, its whole text is run through ``_clean_description``
        instead. ``None`` when no container is found.
    """
    container = soup.select_one('[data-testid="om boligen"]') or soup.select_one(".description")
    if not container:
        return None
    # Clean each paragraph once and reuse the result for both the emptiness
    # filter and the collected value.
    paragraphs = [
        cleaned
        for paragraph in container.select("p")
        if (cleaned := clean_text(paragraph.get_text()))
    ]
    if paragraphs:
        return "\n".join(paragraphs)
    return _clean_description(container.get_text(" ", strip=True))
def scrape_ad(html: str, url: str | None = None) -> FinnAd:
    """Scrape a FINN listing HTML page into a FinnAd model.

    Field values come from two places in the page: the ``<dt>``/``<dd>``
    property map (keys lower-cased) and elements tagged with ``data-testid``
    attributes. For each field the sources are tried in the order written
    below and the first truthy value wins.

    Args:
        html: Raw HTML of the listing page.
        url: URL the page came from. It is stored on the model and used to
            derive ``finnkode``; treated as an empty string when omitted.

    Returns:
        A ``FinnAd`` populated from the page. ``broker_name`` and
        ``detail_fetched_at`` are always left as ``None`` here.
    """
    soup = BeautifulSoup(html, "html.parser")
    title_node = soup.select_one("h1")
    broker_node = soup.select_one(".broker-name")
    properties = _load_property_map(soup)
    feature_text = _load_feature_text(soup).lower()

    finnkode = normalize_finnkode(extract_finnkode_from_url(url or "")) or ""

    # --- Location and classification -------------------------------------
    address = _get_data_testid_value(soup, "object-address") or properties.get("adresse")
    district = _get_data_testid_value(soup, "local-area-name") or properties.get("område")
    ownership_type = (
        _strip_labelled_text(
            _get_data_testid_value(soup, "info-ownership-type"), ["Eieform", "Eiendomstype"]
        )
        # The historical "eierform" key is still tried first; "eieform" matches
        # the label spelling stripped above.
        or properties.get("eierform")
        or properties.get("eieform")
    )
    property_type = _strip_labelled_text(
        _get_data_testid_value(soup, "info-property-type"), ["Boligtype", "Eiendomstype"]
    ) or properties.get("eiendomstype")

    # --- Pricing -----------------------------------------------------------
    # NOTE(review): the "incicative" spelling is kept verbatim; presumably it
    # mirrors the attribute value emitted by the site -- confirm before
    # "correcting" it.
    asking_price = normalize_price(
        properties.get("prisantydning") or _get_data_testid_value(soup, "pricing-incicative-price")
    )
    total_price_value = normalize_price(
        properties.get("totalpris") or _get_data_testid_value(soup, "pricing-total-price")
    )
    shared_debt = normalize_price(
        properties.get("fellesgjeld") or _get_data_testid_value(soup, "pricing-joint-debt")
    )
    common_costs = normalize_number(
        properties.get("felles utgifter")
        or _get_data_testid_value(soup, "pricing-common-monthly-cost")
    )

    # --- Size and layout ---------------------------------------------------
    area_m2 = normalize_area(
        properties.get("boligareal")
        or _get_data_testid_value(soup, "info-usable-i-area")
        or _get_data_testid_value(soup, "info-usable-area")
    )
    rooms = normalize_number(properties.get("rom") or _get_data_testid_value(soup, "info-rooms"))
    bedrooms = normalize_number(
        properties.get("soverom") or _get_data_testid_value(soup, "info-bedrooms")
    )
    # The title heuristic is consulted before the dedicated "info-floor" element.
    floor = (
        properties.get("etasje")
        or _extract_floor_from_text(title_node.get_text() if title_node else "")
        or _get_data_testid_value(soup, "info-floor")
    )
    construction_year = normalize_number(
        properties.get("byggeår") or _get_data_testid_value(soup, "info-construction-year")
    )
    energy_rating = properties.get("energimerking")
    heating = properties.get("oppvarming")

    # --- Amenities: property map entries or keywords in the facilities text --
    has_balcony = text_to_bool(properties.get("balkonger/terrasser")) or "balkong" in feature_text
    has_terrace = "terrasse" in feature_text
    has_elevator = text_to_bool(properties.get("heis")) or "heis" in feature_text
    has_parking = (
        bool(properties.get("parkering/garasje"))
        or "parkering" in feature_text
        or "garasje" in feature_text
    )

    # NOTE(review): the ".broker-name" element's text is stored as
    # ``broker_company`` while ``broker_name`` is left unset -- confirm intended.
    broker_company = None
    if broker_node:
        broker_company = clean_text(broker_node.get_text())

    listing_description = _extract_description(soup)

    ad = FinnAd(
        finnkode=finnkode,
        url=url or "",
        title=clean_text(title_node.get_text()) if title_node else None,
        address=address,
        postal_area=properties.get("postnummer"),
        district=district,
        property_type=property_type,
        ownership_type=ownership_type,
        asking_price=asking_price,
        total_price=total_price_value,
        shared_debt=shared_debt,
        common_costs=common_costs,
        municipal_fee=normalize_number(properties.get("kommunale avgifter")),
        other_fees=normalize_number(properties.get("andre utgifter")),
        area_m2=area_m2,
        rooms=rooms,
        bedrooms=bedrooms,
        floor=floor,
        construction_year=construction_year,
        energy_rating=energy_rating,
        heating=heating,
        has_balcony=has_balcony,
        has_terrace=has_terrace,
        has_elevator=has_elevator,
        has_parking=has_parking,
        listing_description=listing_description,
        broker_name=None,
        broker_company=broker_company,
        detail_fetched_at=None,
    )
    return ad
async def fetch_ad_details(finnkode: str, client: HTTPClient | None = None) -> FinnAd:
    """Fetch a FINN listing by ``finnkode`` and return it as a parsed ``FinnAd``.

    The page is downloaded with ``fetch_ad``, parsed with ``scrape_ad`` using
    the listing URL built from ``FINN_AD_URL_TEMPLATE``, and the result's
    ``detail_fetched_at`` is set to the current UTC time.
    """
    page_html = await fetch_ad(finnkode, client=client)
    listing_url = FINN_AD_URL_TEMPLATE.format(finnkode)
    listing = scrape_ad(page_html, url=listing_url)
    listing.detail_fetched_at = datetime.now(UTC)
    return listing