c9383788de
Co-authored-by: Copilot <copilot@github.com>
198 lines
6.8 KiB
Python
198 lines
6.8 KiB
Python
"""FINN search scraping and parsing."""
|
|
|
|
import logging
|
|
import re
|
|
from urllib.parse import urljoin
|
|
|
|
from bs4 import BeautifulSoup
|
|
|
|
from . import cache
|
|
from .config import FINN_CACHE_TTL_SEARCH_MINUTES
|
|
from .http import HTTPClient
|
|
from .models import FinnSearchCard
|
|
from .parser import (
|
|
clean_text,
|
|
extract_finnkode_from_url,
|
|
normalize_area,
|
|
normalize_finnkode,
|
|
normalize_number,
|
|
normalize_price,
|
|
)
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
async def fetch_search_page(url: str, client: HTTPClient | None = None) -> str:
    """Download a single FINN search result page and return its HTML.

    Args:
        url: Address of the search page to request.
        client: HTTP client used for the request. When omitted (or falsy) a
            new ``HTTPClient`` with ``request_delay_seconds=0.0`` is created.

    Returns:
        The ``text`` attribute of the response returned by ``client.get``.
    """
    if not client:
        client = HTTPClient(request_delay_seconds=0.0)
    page = await client.get(url)
    return page.text
|
|
|
|
|
|
async def fetch_search_page_cached(
    url: str,
    client: HTTPClient | None = None,
    conn: cache.sqlite3.Connection | None = None,
    use_cache: bool = True,
) -> str:
    """Return the HTML of a FINN search page, consulting the SQLite cache first.

    Args:
        url: Address of the search page.
        client: HTTP client for cache misses; a new ``HTTPClient`` with
            ``request_delay_seconds=0.0`` is created when omitted.
        conn: Cache database connection; ``cache.init_db()`` is called when
            omitted.
        use_cache: When true, a non-empty cached copy is returned without any
            HTTP request. When false, the cache lookup is skipped.

    Returns:
        The cached HTML on a hit, otherwise the freshly fetched HTML.

    Note:
        A freshly fetched page is always written to the cache with a TTL of
        ``FINN_CACHE_TTL_SEARCH_MINUTES``, even when ``use_cache`` is false.
    """
    if not client:
        client = HTTPClient(request_delay_seconds=0.0)
    if not conn:
        conn = cache.init_db()

    # Only read from the cache when the caller allows it.
    hit = cache.get_search_page(conn, url) if use_cache else None
    if hit:
        logger.debug("Using cached search page: %s", url)
        return hit

    fresh = await fetch_search_page(url, client=client)
    cache.save_search_page(conn, url, fresh, ttl_minutes=FINN_CACHE_TTL_SEARCH_MINUTES)
    return fresh
|
|
|
|
|
|
def extract_ad_links(html: str) -> list[str]:
    """Collect the listing hrefs found in FINN search result HTML.

    For every ``article.listing-card`` / ``article.sf-search-ad`` element the
    first anchor whose ``href`` contains ``finnkode`` is taken.

    Args:
        html: Raw HTML of a search result page.

    Returns:
        The cleaned ``href`` values in document order. Hrefs are returned as
        found in the markup; they are not resolved against a base URL.
    """
    document = BeautifulSoup(html, "html.parser")
    found: list[str] = []
    for listing in document.select("article.listing-card, article.sf-search-ad"):
        link_tag = listing.select_one("a[href*='finnkode']")
        if not link_tag:
            continue
        href = link_tag.get("href")
        if not href:
            continue
        # clean_text may yield a falsy value; fall back to an empty string.
        found.append(clean_text(href) or "")
    return found
|
|
|
|
|
|
def _extract_int_from_text(text: str, pattern: str) -> int | None:
    """Search *text* case-insensitively for *pattern* and normalize group 1.

    Args:
        text: Text to search.
        pattern: Regular expression with at least one capture group.

    Returns:
        ``normalize_number`` applied to the first capture group, or ``None``
        when the pattern does not match.
    """
    found = re.search(pattern, text, re.I)
    return normalize_number(found.group(1)) if found else None
|
|
|
|
|
|
def _extract_area_from_text(text: str) -> int | None:
    """Extract an area figure from free text.

    Looks for numbers (optionally with a ``.`` or ``,`` decimal part) followed
    by ``m²``, ``m2`` or ``kvm`` (case-insensitive). When several are present,
    the last one in the text is used.

    Args:
        text: Text to search.

    Returns:
        ``normalize_area`` applied to the last matching number, or ``None``
        when the text contains no area figure.
    """
    last_value = None
    for hit in re.finditer(r"(\d+(?:[.,]\d+)?)\s*(?:m²|m2|kvm)", text, re.I):
        last_value = hit.group(1)
    if last_value is None:
        return None
    return normalize_area(last_value)
|
|
|
|
|
|
def _extract_price_from_text(text: str, label: str) -> int | None:
    """Extract the ``kr`` amount that follows *label* in free text.

    Args:
        text: Text to search (case-insensitive).
        label: Label preceding the amount. It is interpolated into the regular
            expression as-is (not escaped), so it is treated as a pattern.

    Returns:
        ``normalize_price`` applied to the matched "<digits/spaces>kr" span, or
        ``None`` when the label followed by such a span is not found.
    """
    found = re.search(rf"{label}[:\s]*([\d\s]+kr)", text, re.I)
    if not found:
        return None
    return normalize_price(found.group(1))
|
|
|
|
|
|
def extract_search_cards(html: str) -> list[FinnSearchCard]:
    """Turn FINN search result HTML into ``FinnSearchCard`` objects.

    Every ``article.listing-card`` / ``article.sf-search-ad`` element is one
    candidate card. Cards without a resolvable finnkode are skipped. Each
    numeric field is read from its dedicated element when present and
    otherwise recovered from the card's full text with a regular expression.

    Args:
        html: Raw HTML of a search result page.

    Returns:
        The parsed cards in document order. ``asking_price`` and ``floor`` are
        always ``None`` because they are not extracted here.
    """
    logger.debug("Extracting FINN search cards")
    document = BeautifulSoup(html, "html.parser")
    results: list[FinnSearchCard] = []

    def cleaned_text_of(element):
        """Return the cleaned text of *element*, or None when it is absent."""
        return clean_text(element.get_text()) if element else None

    for listing in document.select("article.listing-card, article.sf-search-ad"):
        link_tag = listing.select_one("a[href*='finnkode']")
        url = link_tag.get("href") if link_tag else ""

        # Prefer the data-id attribute; parse the URL only when it is missing.
        finnkode = normalize_finnkode(
            listing.get("data-id") or extract_finnkode_from_url(url)
        )
        if not finnkode:
            logger.debug("Skipping card with missing finnkode")
            continue

        # Whole-card text, used by the regex fallbacks below.
        card_text = clean_text(listing.get_text(" ") or "")

        # Bedrooms: dedicated element first, then "<n> soverom" in the text.
        bedrooms_tag = listing.select_one(".bedrooms")
        if bedrooms_tag:
            bedrooms = normalize_number(bedrooms_tag.get_text())
        elif card_text:
            bedrooms = _extract_int_from_text(card_text, r"(\d+)\s*soverom")
        else:
            bedrooms = None

        # Common costs: dedicated element first, then a labelled "kr" amount.
        costs_tag = listing.select_one(".common-costs")
        if costs_tag:
            common_costs = normalize_number(costs_tag.get_text())
        elif card_text:
            common_costs = _extract_int_from_text(
                card_text, r"(?:Fellesutg|Felleskost(?:er)?)[^\d]*(\d+[\d\s]*)kr"
            )
        else:
            common_costs = None

        # Total price: dedicated element, then the "Totalpris" label, then the
        # first "kr" amount anywhere in the card text.
        price_tag = listing.select_one(".price")
        total_price = normalize_price(price_tag.get_text()) if price_tag else None
        if not total_price and card_text:
            total_price = _extract_price_from_text(card_text, r"Totalpris")
            if not total_price:
                any_price = re.search(r"([\d\s]+kr)", card_text)
                if any_price:
                    total_price = normalize_price(any_price.group(1))

        # Area: dedicated element first, then the last area figure in the text.
        area_tag = listing.select_one(".area")
        if area_tag:
            area_m2 = normalize_area(area_tag.get_text())
        elif card_text:
            area_m2 = _extract_area_from_text(card_text)
        else:
            area_m2 = None

        results.append(
            FinnSearchCard(
                finnkode=finnkode,
                url=url or "",
                title=cleaned_text_of(
                    listing.select_one(
                        ".title, h2.sf-realestate-heading, a.sf-search-ad-link"
                    )
                ),
                address=cleaned_text_of(
                    listing.select_one(".location, .sf-realestate-location")
                ),
                area_m2=area_m2,
                asking_price=None,
                total_price=total_price,
                common_costs=common_costs,
                property_type=cleaned_text_of(listing.select_one(".property-type")),
                ownership_type=cleaned_text_of(listing.select_one(".ownership-type")),
                bedrooms=bedrooms,
                floor=None,
                broker_company=cleaned_text_of(listing.select_one(".broker-company")),
            )
        )
        logger.debug("Parsed FINN search card %s", finnkode)

    return results
|
|
|
|
|
|
def find_next_page_url(html: str, base_url: str = "https://www.finn.no") -> str | None:
    """Return the absolute URL of the next search page, if the page links to one.

    Args:
        html: Raw HTML of a search result page.
        base_url: URL that a relative ``href`` is resolved against.

    Returns:
        The next page URL joined onto ``base_url``, or ``None`` when no anchor
        with a ``next`` relation and a non-empty ``href`` is found.
    """
    soup = BeautifulSoup(html, "html.parser")
    # ``rel`` is a space-separated token list, so match "next" as a token (~=).
    # An exact-value match (=) would miss values such as rel="nofollow next".
    next_link = soup.select_one("a[rel~='next']")
    if next_link is None:
        return None

    raw_href = next_link.get("href")
    if not raw_href:
        return None

    href = clean_text(raw_href)
    return urljoin(base_url, href) if href else None
|
|
|
|
|
|
async def fetch_search_pages(
    start_url: str,
    max_pages: int = 1,
    client: HTTPClient | None = None,
    use_cache: bool = True,
) -> list[FinnSearchCard]:
    """Fetch up to ``max_pages`` paginated FINN search pages and parse their cards.

    Starting at ``start_url``, each page is fetched (through the SQLite cache
    unless ``use_cache`` is false), parsed with ``extract_search_cards`` and
    its "next" link followed.

    Args:
        start_url: URL of the first search page.
        max_pages: Maximum number of pages to fetch; values below 1 fetch
            nothing.
        client: HTTP client shared by all requests; a new ``HTTPClient`` with
            ``request_delay_seconds=0.0`` is created when omitted.
        use_cache: Passed through to ``fetch_search_page_cached``.

    Returns:
        The cards of all fetched pages, in fetch order.
    """
    client = client or HTTPClient(request_delay_seconds=0.0)
    # NOTE(review): this connection is never closed here. Confirm whether
    # cache.init_db() returns a shared connection before adding a close().
    conn = cache.init_db()
    url = start_url
    all_cards: list[FinnSearchCard] = []
    # Pages already scheduled, so a next link that points back to one of them
    # (e.g. a last page linking to itself) ends the crawl instead of
    # refetching the page and duplicating its cards.
    seen_urls = {start_url}

    for _ in range(max_pages):
        html = await fetch_search_page_cached(url, client=client, conn=conn, use_cache=use_cache)
        all_cards.extend(extract_search_cards(html))
        # A relative next link is relative to the page it appears on, so
        # resolve it against the current page URL rather than the start URL.
        next_url = find_next_page_url(html, base_url=url)
        if not next_url or next_url in seen_urls:
            break
        seen_urls.add(next_url)
        url = next_url
        logger.debug("Following next page link: %s", url)

    return all_cards
|