initial
This commit is contained in:
@@ -0,0 +1,193 @@
|
||||
"""FINN listing detail scraping and normalization."""
|
||||
|
||||
import logging
|
||||
import re
|
||||
from datetime import UTC, datetime
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from .http import HTTPClient
|
||||
from .models import FinnAd
|
||||
from .parser import (
|
||||
clean_text,
|
||||
extract_finnkode_from_url,
|
||||
normalize_area,
|
||||
normalize_finnkode,
|
||||
normalize_number,
|
||||
normalize_price,
|
||||
text_to_bool,
|
||||
)
|
||||
|
||||
# Module-level logger named after this module. No code in the visible part of
# the file emits through it yet.
logger = logging.getLogger(__name__)
|
||||
|
||||
# Listing URL template; the single ``{}`` placeholder is filled with the
# finnkode via ``str.format`` by the fetch helpers in this module.
FINN_AD_URL_TEMPLATE = "https://www.finn.no/realestate/homes/ad.html?finnkode={}"
|
||||
|
||||
|
||||
async def fetch_ad(finnkode: str, client: HTTPClient | None = None) -> str:
    """Download the raw HTML of a FINN listing page.

    Args:
        finnkode: Listing identifier substituted into ``FINN_AD_URL_TEMPLATE``.
        client: HTTP client used for the request. When omitted (or falsy), a
            new ``HTTPClient`` with ``request_delay_seconds=0.0`` is created
            for this call.

    Returns:
        The ``text`` attribute of the response returned by ``client.get``.
    """
    http = client if client else HTTPClient(request_delay_seconds=0.0)
    page = await http.get(FINN_AD_URL_TEMPLATE.format(finnkode))
    return page.text
|
||||
|
||||
|
||||
def _load_property_map(soup: BeautifulSoup) -> dict[str, str]:
    """Build a lowercase ``term -> value`` map from the page's ``<dt>``/``<dd>`` pairs.

    Each ``<dt>`` is paired with its own value element instead of zipping the
    global ``<dt>`` and ``<dd>`` lists positionally: with positional pairing a
    single term without a value (or with several values) shifts every pair
    that follows it.

    Args:
        soup: Parsed listing document.

    Returns:
        Mapping from the cleaned, lowercased term text to the cleaned value
        text (``""`` when cleaning yields nothing). Terms whose cleaned text
        is empty, or for which no ``<dd>`` can be found, are skipped. When a
        term occurs more than once, the last occurrence wins.
    """
    properties: dict[str, str] = {}
    for dt in soup.find_all("dt"):
        key = (clean_text(dt.get_text()) or "").lower()
        if not key:
            # A blank term can never be looked up; do not store it.
            continue
        # Prefer the <dd> in the same group as this <dt>. Fall back to the next
        # <dd> in document order for markup where the two are wrapped in
        # separate containers and therefore are not siblings.
        dd = dt.find_next_sibling("dd") or dt.find_next("dd")
        if dd is None:
            continue
        properties[key] = clean_text(dd.get_text()) or ""
    return properties
|
||||
|
||||
|
||||
def _get_data_testid_value(soup: BeautifulSoup, testid: str) -> str | None:
    """Return the cleaned text of the first element carrying a ``data-testid``.

    Args:
        soup: Parsed listing document.
        testid: Exact ``data-testid`` attribute value to look for.

    Returns:
        ``None`` when no element matches; otherwise the element's text, with
        fragments stripped and joined by single spaces, passed through
        ``clean_text``.
    """
    element = soup.select_one(f'[data-testid="{testid}"]')
    return clean_text(element.get_text(" ", strip=True)) if element else None
|
||||
|
||||
|
||||
def _strip_labelled_text(text: str | None, labels: list[str]) -> str | None:
|
||||
if not text:
|
||||
return None
|
||||
for label in labels:
|
||||
if text.lower().startswith(label.lower()):
|
||||
return clean_text(text[len(label) :])
|
||||
return text
|
||||
|
||||
|
||||
def _extract_floor_from_text(text: str | None) -> str | None:
|
||||
if not text:
|
||||
return None
|
||||
match = re.search(r"(\d+)\s*\.?\s*etasje", text, re.IGNORECASE)
|
||||
if match:
|
||||
return f"{match.group(1)}. etasje"
|
||||
return None
|
||||
|
||||
|
||||
def _clean_description(text: str | None) -> str | None:
|
||||
if not text:
|
||||
return None
|
||||
cleaned = re.sub(r"(?i)^om boligen", "", text).strip()
|
||||
cleaned = re.sub(r"(?i)^beskrivelse", "", cleaned).strip()
|
||||
return clean_text(cleaned)
|
||||
|
||||
|
||||
def _load_feature_text(soup: BeautifulSoup) -> str:
    """Return the text of the ``object-facilities`` element, or ``""`` if absent."""
    facilities = _get_data_testid_value(soup, "object-facilities")
    return facilities if facilities else ""
|
||||
|
||||
|
||||
def _extract_description(soup: BeautifulSoup) -> str | None:
    """Extract the listing description text.

    The container is the first element with ``data-testid="om boligen"``, or
    failing that the first element with class ``description``.

    Args:
        soup: Parsed listing document.

    Returns:
        ``None`` when no container is found. Otherwise the non-empty cleaned
        ``<p>`` texts joined by newlines; if there are none, the container's
        whole text run through ``_clean_description``.
    """
    container = soup.select_one('[data-testid="om boligen"]')
    if not container:
        container = soup.select_one(".description")
    if not container:
        return None

    lines = []
    for paragraph in container.select("p"):
        cleaned = clean_text(paragraph.get_text())
        if cleaned:
            lines.append(cleaned)
    if lines:
        return "\n".join(lines)

    # No usable paragraphs: fall back to the container's full text.
    return _clean_description(container.get_text(" ", strip=True))
|
||||
|
||||
|
||||
def scrape_ad(html: str, url: str | None = None) -> FinnAd:
    """Scrape a FINN listing HTML page into a FinnAd model.

    Values come from two places: the page's ``<dt>``/``<dd>`` pairs (keyed by
    lowercased label) and elements tagged with ``data-testid``. Address,
    district, ownership type and property type prefer the ``data-testid``
    element; prices, costs, area, room counts and construction year prefer the
    ``<dt>``/``<dd>`` value.

    Args:
        html: Raw HTML of the listing page.
        url: URL the page was fetched from. The finnkode is extracted from it;
            when omitted, both ``finnkode`` and ``url`` on the result are
            ``""``.

    Returns:
        A ``FinnAd`` populated from the page. ``broker_name`` and
        ``detail_fetched_at`` are always ``None`` here.
    """
    soup = BeautifulSoup(html, "html.parser")
    title_node = soup.select_one("h1")
    broker_name = soup.select_one(".broker-name")

    properties = _load_property_map(soup)
    feature_text = _load_feature_text(soup).lower()
    finnkode = normalize_finnkode(extract_finnkode_from_url(url or "")) or ""
    address = _get_data_testid_value(soup, "object-address") or properties.get("adresse")
    district = _get_data_testid_value(soup, "local-area-name") or properties.get("område")
    ownership_type = (
        _strip_labelled_text(
            _get_data_testid_value(soup, "info-ownership-type"), ["Eieform", "Eiendomstype"]
        )
        # The label stripped above is "Eieform", so a <dt>Eieform</dt> entry is
        # stored under "eieform". The previously used "eierform" key is kept
        # as a secondary fallback for backward compatibility.
        or properties.get("eieform")
        or properties.get("eierform")
    )
    property_type = _strip_labelled_text(
        _get_data_testid_value(soup, "info-property-type"), ["Boligtype", "Eiendomstype"]
    ) or properties.get("eiendomstype")

    asking_price = normalize_price(
        # NOTE(review): the "incicative" spelling is kept as-is; presumably it
        # mirrors the attribute value in FINN's markup - confirm before changing.
        properties.get("prisantydning") or _get_data_testid_value(soup, "pricing-incicative-price")
    )
    total_price_value = normalize_price(
        properties.get("totalpris") or _get_data_testid_value(soup, "pricing-total-price")
    )
    shared_debt = normalize_price(
        properties.get("fellesgjeld") or _get_data_testid_value(soup, "pricing-joint-debt")
    )
    common_costs = normalize_number(
        properties.get("felles utgifter")
        or _get_data_testid_value(soup, "pricing-common-monthly-cost")
    )
    area_m2 = normalize_area(
        properties.get("boligareal")
        or _get_data_testid_value(soup, "info-usable-i-area")
        or _get_data_testid_value(soup, "info-usable-area")
    )
    rooms = normalize_number(properties.get("rom") or _get_data_testid_value(soup, "info-rooms"))
    bedrooms = normalize_number(
        properties.get("soverom") or _get_data_testid_value(soup, "info-bedrooms")
    )
    # Floor: structured value first, then a mention in the title, then the
    # data-testid element.
    floor = (
        properties.get("etasje")
        or _extract_floor_from_text(title_node.get_text() if title_node else "")
        or _get_data_testid_value(soup, "info-floor")
    )
    construction_year = normalize_number(
        properties.get("byggeår") or _get_data_testid_value(soup, "info-construction-year")
    )
    energy_rating = properties.get("energimerking")
    heating = properties.get("oppvarming")

    # Amenity flags: a structured value where one exists, otherwise a keyword
    # hit in the lowercased facilities text.
    has_balcony = text_to_bool(properties.get("balkonger/terrasser")) or "balkong" in feature_text
    has_terrace = "terrasse" in feature_text
    has_elevator = text_to_bool(properties.get("heis")) or "heis" in feature_text
    # NOTE(review): any non-empty "parkering/garasje" value counts as parking,
    # unlike the text_to_bool handling above - confirm this is intended.
    has_parking = (
        bool(properties.get("parkering/garasje"))
        or "parkering" in feature_text
        or "garasje" in feature_text
    )

    # NOTE(review): the text of the ".broker-name" element is stored as
    # broker_company while broker_name is left None - confirm this mapping.
    broker_company = None
    if broker_name:
        broker_company = clean_text(broker_name.get_text())

    listing_description = _extract_description(soup)

    ad = FinnAd(
        finnkode=finnkode,
        url=url or "",
        title=clean_text(title_node.get_text()) if title_node else None,
        address=address,
        postal_area=properties.get("postnummer"),
        district=district,
        property_type=property_type,
        ownership_type=ownership_type,
        asking_price=asking_price,
        total_price=total_price_value,
        shared_debt=shared_debt,
        common_costs=common_costs,
        municipal_fee=normalize_number(properties.get("kommunale avgifter")),
        other_fees=normalize_number(properties.get("andre utgifter")),
        area_m2=area_m2,
        rooms=rooms,
        bedrooms=bedrooms,
        floor=floor,
        construction_year=construction_year,
        energy_rating=energy_rating,
        heating=heating,
        has_balcony=has_balcony,
        has_terrace=has_terrace,
        has_elevator=has_elevator,
        has_parking=has_parking,
        listing_description=listing_description,
        broker_name=None,
        broker_company=broker_company,
        detail_fetched_at=None,
    )
    return ad
|
||||
|
||||
|
||||
async def fetch_ad_details(finnkode: str, client: HTTPClient | None = None) -> FinnAd:
    """Fetch a FINN listing and parse it into a ``FinnAd``.

    Args:
        finnkode: Listing identifier.
        client: Optional HTTP client, forwarded to ``fetch_ad``.

    Returns:
        The ``FinnAd`` produced by ``scrape_ad`` for the listing URL, with
        ``detail_fetched_at`` set to the current UTC time.
    """
    listing_url = FINN_AD_URL_TEMPLATE.format(finnkode)
    page_html = await fetch_ad(finnkode, client=client)
    parsed = scrape_ad(page_html, url=listing_url)
    parsed.detail_fetched_at = datetime.now(UTC)
    return parsed
|
||||
Reference in New Issue
Block a user