c9383788de
Co-authored-by: Copilot <copilot@github.com>
198 lines
6.9 KiB
Python
198 lines
6.9 KiB
Python
"""FINN listing detail scraping and normalization."""
|
|
|
|
import logging
|
|
import re
|
|
from datetime import UTC, datetime
|
|
|
|
from bs4 import BeautifulSoup
|
|
|
|
from .http import HTTPClient
|
|
from .models import FinnAd
|
|
from .parser import (
|
|
clean_text,
|
|
extract_finnkode_from_url,
|
|
normalize_area,
|
|
normalize_finnkode,
|
|
normalize_number,
|
|
normalize_price,
|
|
text_to_bool,
|
|
)
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# URL template for a single FINN real-estate listing page; the `{}` placeholder
# is filled with the listing's finnkode via `str.format`.
FINN_AD_URL_TEMPLATE = "https://www.finn.no/realestate/homes/ad.html?finnkode={}"
|
|
|
|
|
|
async def fetch_ad(finnkode: str, client: HTTPClient | None = None) -> str:
    """Download the raw HTML of a FINN listing.

    Args:
        finnkode: Listing identifier substituted into ``FINN_AD_URL_TEMPLATE``.
        client: HTTP client to use. When falsy, a new ``HTTPClient`` with
            ``request_delay_seconds=0.0`` is created for this call.

    Returns:
        The ``text`` attribute of the response returned by ``client.get``.
    """
    http = client if client else HTTPClient(request_delay_seconds=0.0)
    listing_url = FINN_AD_URL_TEMPLATE.format(finnkode)
    reply = await http.get(listing_url)
    return reply.text
|
|
|
|
|
|
def _load_property_map(soup: BeautifulSoup) -> dict[str, str]:
    """Collect the page's ``<dt>``/``<dd>`` pairs into a label -> value mapping.

    Every ``<dt>`` and every ``<dd>`` in the document is gathered and the two
    lists are paired by position; surplus elements on either side are ignored.
    Labels are lower-cased, missing text becomes ``""``, and a repeated label
    keeps the last value seen.
    """
    terms = soup.find_all("dt")
    definitions = soup.find_all("dd")
    return {
        (clean_text(term.get_text()) or "").lower(): clean_text(definition.get_text()) or ""
        for term, definition in zip(terms, definitions, strict=False)
    }
|
|
|
|
|
|
def _get_data_testid_value(soup: BeautifulSoup, testid: str) -> str | None:
    """Return the cleaned text of the first element with the given ``data-testid``.

    Text fragments inside the element are stripped and joined with single
    spaces before being passed to ``clean_text``. Returns ``None`` when no
    element matches.
    """
    element = soup.select_one(f'[data-testid="{testid}"]')
    if element:
        return clean_text(element.get_text(" ", strip=True))
    return None
|
|
|
|
|
|
def _strip_labelled_text(text: str | None, labels: list[str]) -> str | None:
|
|
if not text:
|
|
return None
|
|
for label in labels:
|
|
if text.lower().startswith(label.lower()):
|
|
return clean_text(text[len(label) :])
|
|
return text
|
|
|
|
|
|
def _extract_floor_from_text(text: str | None) -> str | None:
|
|
if not text:
|
|
return None
|
|
match = re.search(r"(\d+)\s*\.?\s*etasje", text, re.IGNORECASE)
|
|
if match:
|
|
return f"{match.group(1)}. etasje"
|
|
return None
|
|
|
|
|
|
def _clean_description(text: str | None) -> str | None:
|
|
if not text:
|
|
return None
|
|
cleaned = re.sub(r"(?i)^om boligen", "", text).strip()
|
|
cleaned = re.sub(r"(?i)^beskrivelse", "", cleaned).strip()
|
|
return clean_text(cleaned)
|
|
|
|
|
|
def _load_feature_text(soup: BeautifulSoup) -> str:
    """Return the text of the ``object-facilities`` element, or ``""`` if absent or empty."""
    facilities = _get_data_testid_value(soup, "object-facilities")
    return facilities if facilities else ""
|
|
|
|
|
|
def _extract_description(soup: BeautifulSoup) -> str | None:
    """Extract the listing's free-text description.

    The container is the element with ``data-testid="om boligen"`` or, failing
    that, the first ``.description`` element. Non-empty ``<p>`` texts inside it
    are cleaned and joined with newlines. When the container has no usable
    paragraphs, its whole text is run through ``_clean_description`` instead.
    Returns ``None`` when no container is found.
    """
    container = soup.select_one('[data-testid="om boligen"]')
    if not container:
        container = soup.select_one(".description")
    if not container:
        return None

    lines = []
    for paragraph in container.select("p"):
        cleaned = clean_text(paragraph.get_text())
        if cleaned:
            lines.append(cleaned)

    if not lines:
        return _clean_description(container.get_text(" ", strip=True))
    return "\n".join(lines)
|
|
|
|
|
|
def scrape_ad(html: str, url: str | None = None) -> FinnAd:
    """Scrape a FINN listing HTML page into a FinnAd model.

    Values are read from two places on the page: the ``<dt>``/``<dd>`` pairs
    (looked up by lower-cased label via ``_load_property_map``) and elements
    carrying a ``data-testid`` attribute. Which source takes precedence
    differs per field and is spelled out field by field below.

    Args:
        html: Raw HTML of the listing page.
        url: URL the page came from. The finnkode is extracted from it; when
            no finnkode can be derived, ``finnkode`` falls back to ``""``.
            A missing URL is stored as ``""``.

    Returns:
        A ``FinnAd`` populated from the page. ``broker_name`` and
        ``detail_fetched_at`` are always ``None`` here; callers that fetch the
        page themselves are expected to stamp ``detail_fetched_at``.
    """
    soup = BeautifulSoup(html, "html.parser")
    title_node = soup.select_one("h1")
    broker_node = soup.select_one(".broker-name")

    properties = _load_property_map(soup)
    # Lower-cased once so the substring checks below are case-insensitive.
    feature_text = _load_feature_text(soup).lower()
    finnkode = normalize_finnkode(extract_finnkode_from_url(url or "")) or ""

    # Location and classification: data-testid element first, property list second.
    address = _get_data_testid_value(soup, "object-address") or properties.get("adresse")
    district = _get_data_testid_value(soup, "local-area-name") or properties.get("område")
    ownership_type = (
        _strip_labelled_text(
            _get_data_testid_value(soup, "info-ownership-type"), ["Eieform", "Eiendomstype"]
        )
        # The label stripped above is spelled "Eieform", so a matching <dt>
        # lower-cases to "eieform". The previously used "eierform" key is kept
        # as a fallback so pages that do use that spelling keep working.
        or properties.get("eieform")
        or properties.get("eierform")
    )
    property_type = _strip_labelled_text(
        _get_data_testid_value(soup, "info-property-type"), ["Boligtype", "Eiendomstype"]
    ) or properties.get("eiendomstype")

    # Prices and sizes: property list first, data-testid element second.
    # NOTE(review): "incicative" is kept verbatim; presumably it mirrors the
    # attribute value on the page -- confirm before "correcting" the spelling.
    asking_price = normalize_price(
        properties.get("prisantydning") or _get_data_testid_value(soup, "pricing-incicative-price")
    )
    total_price_value = normalize_price(
        properties.get("totalpris") or _get_data_testid_value(soup, "pricing-total-price")
    )
    shared_debt = normalize_price(
        properties.get("fellesgjeld") or _get_data_testid_value(soup, "pricing-joint-debt")
    )
    common_costs = normalize_number(
        properties.get("felles utgifter")
        or _get_data_testid_value(soup, "pricing-common-monthly-cost")
    )
    area_m2 = normalize_area(
        properties.get("boligareal")
        or _get_data_testid_value(soup, "info-usable-i-area")
        or _get_data_testid_value(soup, "info-usable-area")
    )
    rooms = normalize_number(properties.get("rom") or _get_data_testid_value(soup, "info-rooms"))
    bedrooms = normalize_number(
        properties.get("soverom") or _get_data_testid_value(soup, "info-bedrooms")
    )
    # Floor: explicit property, then a mention in the page title, then the data-testid element.
    floor = (
        properties.get("etasje")
        or _extract_floor_from_text(title_node.get_text() if title_node else "")
        or _get_data_testid_value(soup, "info-floor")
    )
    construction_year = normalize_number(
        properties.get("byggeår") or _get_data_testid_value(soup, "info-construction-year")
    )
    energy_rating = properties.get("energimerking")
    heating = properties.get("oppvarming")

    # Amenity flags: a property value and/or a keyword in the facilities text.
    has_balcony = text_to_bool(properties.get("balkonger/terrasser")) or "balkong" in feature_text
    has_terrace = "terrasse" in feature_text
    has_elevator = text_to_bool(properties.get("heis")) or "heis" in feature_text
    # NOTE(review): parking and garage share one property, so any non-empty
    # "parkering/garasje" value sets both flags -- confirm this is intended.
    has_parking = (
        bool(properties.get("parkering/garasje"))
        or "parkering" in feature_text
    )
    has_garage = (
        bool(properties.get("parkering/garasje"))
        or "garasje" in feature_text
    )

    # The ".broker-name" element's text is stored as the broker company;
    # FinnAd.broker_name itself is left unset.
    broker_company = None
    if broker_node:
        broker_company = clean_text(broker_node.get_text())

    listing_description = _extract_description(soup)

    return FinnAd(
        finnkode=finnkode,
        url=url or "",
        title=clean_text(title_node.get_text()) if title_node else None,
        address=address,
        postal_area=properties.get("postnummer"),
        district=district,
        property_type=property_type,
        ownership_type=ownership_type,
        asking_price=asking_price,
        total_price=total_price_value,
        shared_debt=shared_debt,
        common_costs=common_costs,
        municipal_fee=normalize_number(properties.get("kommunale avgifter")),
        other_fees=normalize_number(properties.get("andre utgifter")),
        area_m2=area_m2,
        rooms=rooms,
        bedrooms=bedrooms,
        floor=floor,
        construction_year=construction_year,
        energy_rating=energy_rating,
        heating=heating,
        has_balcony=has_balcony,
        has_terrace=has_terrace,
        has_elevator=has_elevator,
        has_parking=has_parking,
        has_garage=has_garage,
        listing_description=listing_description,
        broker_name=None,
        broker_company=broker_company,
        detail_fetched_at=None,
    )
|
|
|
|
|
|
async def fetch_ad_details(finnkode: str, client: HTTPClient | None = None) -> FinnAd:
    """Download a FINN listing and parse it into a ``FinnAd``.

    Args:
        finnkode: Listing identifier.
        client: Optional HTTP client, forwarded to ``fetch_ad``.

    Returns:
        The parsed ``FinnAd`` with ``detail_fetched_at`` set to the current
        timezone-aware UTC time.
    """
    page_html = await fetch_ad(finnkode, client=client)
    listing_url = FINN_AD_URL_TEMPLATE.format(finnkode)
    listing = scrape_ad(page_html, url=listing_url)
    listing.detail_fetched_at = datetime.now(tz=UTC)
    return listing
|