This commit is contained in:
Ole
2026-05-16 06:54:17 +00:00
commit 1399f61c1a
44 changed files with 6746 additions and 0 deletions
+194
View File
@@ -0,0 +1,194 @@
"""FINN search scraping and parsing."""
import logging
import re
from bs4 import BeautifulSoup
from . import cache
from .config import FINN_CACHE_TTL_SEARCH_MINUTES
from .http import HTTPClient
from .models import FinnSearchCard
from .parser import (
clean_text,
extract_finnkode_from_url,
normalize_area,
normalize_finnkode,
normalize_number,
normalize_price,
)
logger = logging.getLogger(__name__)
async def fetch_search_page(url: str, client: HTTPClient | None = None) -> str:
    """Download one FINN search page and return its body text.

    Args:
        url: Address of the search page to request.
        client: HTTP client used for the request. When falsy, a new
            ``HTTPClient`` with ``request_delay_seconds=0.0`` is created.

    Returns:
        The ``text`` attribute of the response returned by ``client.get``.
    """
    if not client:
        client = HTTPClient(request_delay_seconds=0.0)
    return (await client.get(url)).text
async def fetch_search_page_cached(
    url: str,
    client: HTTPClient | None = None,
    conn: cache.sqlite3.Connection | None = None,
    use_cache: bool = True,
) -> str:
    """Return the HTML of a FINN search page, consulting the cache first.

    Args:
        url: Address of the search page; also used as the cache key.
        client: HTTP client for cache misses. When falsy, a new
            ``HTTPClient`` with ``request_delay_seconds=0.0`` is created.
        conn: Cache database connection. When falsy, ``cache.init_db()``
            is called to obtain one.
        use_cache: When false the cache lookup is skipped, but the freshly
            fetched page is still written to the cache.

    Returns:
        The cached HTML if a non-empty entry was found, otherwise the
        newly fetched HTML.
    """
    http = client if client else HTTPClient(request_delay_seconds=0.0)
    db = conn if conn else cache.init_db()

    # Only read from the cache when asked to; an empty/None entry is a miss.
    hit = cache.get_search_page(db, url) if use_cache else None
    if hit:
        logger.debug("Using cached search page: %s", url)
        return hit

    page = await fetch_search_page(url, client=http)
    cache.save_search_page(db, url, page, ttl_minutes=FINN_CACHE_TTL_SEARCH_MINUTES)
    return page
def extract_ad_links(html: str) -> list[str]:
    """Collect the listing hrefs found in FINN search result HTML.

    For every ``article.listing-card`` / ``article.sf-search-ad`` element the
    first anchor whose ``href`` contains ``finnkode`` is considered; articles
    without such an anchor (or with an empty href) are skipped.

    Args:
        html: Raw HTML of a search result page.

    Returns:
        One entry per matching article, in document order. Each href is
        passed through ``clean_text``; a falsy result becomes ``""``.
    """
    document = BeautifulSoup(html, "html.parser")
    candidate_anchors = (
        article.select_one("a[href*='finnkode']")
        for article in document.select("article.listing-card, article.sf-search-ad")
    )
    return [
        clean_text(anchor.get("href")) or ""
        for anchor in candidate_anchors
        if anchor and anchor.get("href")
    ]
def _extract_int_from_text(text: str, pattern: str) -> int | None:
match = re.search(pattern, text, re.I)
if match:
return normalize_number(match.group(1))
return None
def _extract_area_from_text(text: str) -> int | None:
matches = re.findall(r"(\d+(?:[.,]\d+)?)\s*(?:m²|m2|kvm)", text, re.I)
if matches:
return normalize_area(matches[-1])
return None
def _extract_price_from_text(text: str, label: str) -> int | None:
pattern = rf"{label}[:\s]*([\d\s]+kr)"
match = re.search(pattern, text, re.I)
if match:
return normalize_price(match.group(1))
return None
def extract_search_cards(html: str) -> list[FinnSearchCard]:
    """Turn FINN search result HTML into ``FinnSearchCard`` objects.

    Each ``article.listing-card`` / ``article.sf-search-ad`` element is one
    candidate card. A card is skipped when no finnkode can be derived from its
    ``data-id`` attribute or from the href of its first ``finnkode`` anchor.

    Numeric fields prefer a dedicated element (``.bedrooms``, ``.common-costs``,
    ``.price``, ``.area``) and otherwise fall back to pattern matching on the
    card's full text. ``asking_price`` and ``floor`` are always ``None``.

    Args:
        html: Raw HTML of a search result page.

    Returns:
        The parsed cards in document order.
    """

    def _cleaned_text_of(node, selector):
        # Cleaned text of the first match for *selector*, or None if absent.
        found = node.select_one(selector)
        return clean_text(found.get_text()) if found else None

    logger.debug("Extracting FINN search cards")
    document = BeautifulSoup(html, "html.parser")
    parsed: list[FinnSearchCard] = []

    for article in document.select("article.listing-card, article.sf-search-ad"):
        link = article.select_one("a[href*='finnkode']")
        url = link.get("href") if link else ""

        # Prefer the explicit data-id; otherwise derive the code from the URL.
        raw_code = article.get("data-id") or extract_finnkode_from_url(url)
        finnkode = normalize_finnkode(raw_code)
        if not finnkode:
            logger.debug("Skipping card with missing finnkode")
            continue

        card_text = clean_text(article.get_text(" ") or "")

        bedrooms_node = article.select_one(".bedrooms")
        if bedrooms_node:
            bedrooms = normalize_number(bedrooms_node.get_text())
        elif card_text:
            bedrooms = _extract_int_from_text(card_text, r"(\d+)\s*soverom")
        else:
            bedrooms = None

        costs_node = article.select_one(".common-costs")
        if costs_node:
            common_costs = normalize_number(costs_node.get_text())
        elif card_text:
            common_costs = _extract_int_from_text(
                card_text, r"(?:Fellesutg|Felleskost(?:er)?)[^\d]*(\d+[\d\s]*)kr"
            )
        else:
            common_costs = None

        price_node = article.select_one(".price")
        total_price = normalize_price(price_node.get_text()) if price_node else None
        if not total_price and card_text:
            # First fallback: an explicitly labelled total price.
            total_price = _extract_price_from_text(card_text, r"Totalpris")
            if not total_price:
                # Last resort: the first "... kr" amount anywhere in the card.
                any_price = re.search(r"([\d\s]+kr)", card_text)
                if any_price:
                    total_price = normalize_price(any_price.group(1))

        area_node = article.select_one(".area")
        if area_node:
            area_m2 = normalize_area(area_node.get_text())
        elif card_text:
            area_m2 = _extract_area_from_text(card_text)
        else:
            area_m2 = None

        parsed.append(
            FinnSearchCard(
                finnkode=finnkode,
                url=url or "",
                title=_cleaned_text_of(
                    article, ".title, h2.sf-realestate-heading, a.sf-search-ad-link"
                ),
                address=_cleaned_text_of(article, ".location, .sf-realestate-location"),
                area_m2=area_m2,
                asking_price=None,
                total_price=total_price,
                common_costs=common_costs,
                property_type=_cleaned_text_of(article, ".property-type"),
                ownership_type=_cleaned_text_of(article, ".ownership-type"),
                bedrooms=bedrooms,
                floor=None,
                broker_company=_cleaned_text_of(article, ".broker-company"),
            )
        )
        logger.debug("Parsed FINN search card %s", finnkode)

    return parsed
def find_next_page_url(html: str) -> str | None:
    """Locate the "next page" link in FINN search result HTML.

    Args:
        html: Raw HTML of a search result page.

    Returns:
        The ``href`` of the first ``a[rel='next']`` element, passed through
        ``clean_text``, or ``None`` when there is no such anchor or its href
        is empty. The href is returned as found in the markup.
    """
    document = BeautifulSoup(html, "html.parser")
    anchor = document.select_one("a[rel='next']")
    if not anchor or not anchor.get("href"):
        return None
    return clean_text(anchor.get("href"))
async def fetch_search_pages(
    start_url: str,
    max_pages: int = 1,
    client: HTTPClient | None = None,
    use_cache: bool = True,
) -> list[FinnSearchCard]:
    """Fetch paginated FINN search pages and parse their search cards.

    Starting at *start_url*, each page is fetched (through the cache layer),
    parsed into cards, and its "next" link followed until *max_pages* pages
    have been processed, no next link is present, or the next link points to
    a page that was already visited.

    Args:
        start_url: URL of the first search page.
        max_pages: Upper bound on the number of pages fetched; values below 1
            result in no requests and an empty list.
        client: HTTP client shared by all page requests. When falsy, a new
            ``HTTPClient`` with ``request_delay_seconds=0.0`` is created.
        use_cache: Forwarded to ``fetch_search_page_cached``.

    Returns:
        All cards from the visited pages, in page order.
    """
    # Function-scope import so this block does not depend on file-level edits.
    from urllib.parse import urljoin

    client = client or HTTPClient(request_delay_seconds=0.0)
    conn = cache.init_db()
    url = start_url
    visited: set[str] = set()
    all_cards: list[FinnSearchCard] = []
    for _ in range(max_pages):
        visited.add(url)
        html = await fetch_search_page_cached(url, client=client, conn=conn, use_cache=use_cache)
        all_cards.extend(extract_search_cards(html))
        next_href = find_next_page_url(html)
        if not next_href:
            break
        # The href may be relative (e.g. "?page=2"); resolve it against the
        # page it came from. Absolute hrefs pass through unchanged.
        next_url = urljoin(url, next_href)
        if next_url in visited:
            # A "next" link pointing back at a seen page would only yield
            # duplicate cards.
            logger.debug("Next page link loops back to %s; stopping", next_url)
            break
        url = next_url
        logger.debug("Following next page link: %s", url)
    return all_cards