initial
This commit is contained in:
@@ -0,0 +1,194 @@
|
||||
"""FINN search scraping and parsing."""
|
||||
|
||||
import logging
|
||||
import re
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from . import cache
|
||||
from .config import FINN_CACHE_TTL_SEARCH_MINUTES
|
||||
from .http import HTTPClient
|
||||
from .models import FinnSearchCard
|
||||
from .parser import (
|
||||
clean_text,
|
||||
extract_finnkode_from_url,
|
||||
normalize_area,
|
||||
normalize_finnkode,
|
||||
normalize_number,
|
||||
normalize_price,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
async def fetch_search_page(url: str, client: HTTPClient | None = None) -> str:
    """Download one FINN search results page and return its HTML.

    Args:
        url: Address of the search page to request.
        client: HTTP client used for the request. When omitted (or falsy),
            a new ``HTTPClient`` with ``request_delay_seconds=0.0`` is
            created for this call.

    Returns:
        The ``text`` attribute of the response returned by ``client.get``.
    """
    if not client:
        client = HTTPClient(request_delay_seconds=0.0)
    page = await client.get(url)
    return page.text
|
||||
|
||||
|
||||
async def fetch_search_page_cached(
    url: str,
    client: HTTPClient | None = None,
    conn: cache.sqlite3.Connection | None = None,
    use_cache: bool = True,
) -> str:
    """Return the HTML of a FINN search page, consulting the cache first.

    Args:
        url: Address of the search page.
        client: HTTP client used on a cache miss. A new ``HTTPClient`` with
            no request delay is created when omitted.
        conn: Cache database connection. ``cache.init_db()`` is called to
            obtain one when omitted.
        use_cache: When false, the cache lookup is skipped. The freshly
            downloaded page is still written to the cache either way.

    Returns:
        The cached HTML when a non-empty entry is found, otherwise the
        newly fetched HTML.
    """
    if not client:
        client = HTTPClient(request_delay_seconds=0.0)
    if not conn:
        conn = cache.init_db()

    # Only the read side is gated by ``use_cache``; the write below always runs.
    hit = cache.get_search_page(conn, url) if use_cache else None
    if hit:
        logger.debug("Using cached search page: %s", url)
        return hit

    fresh = await fetch_search_page(url, client=client)
    cache.save_search_page(conn, url, fresh, ttl_minutes=FINN_CACHE_TTL_SEARCH_MINUTES)
    return fresh
|
||||
|
||||
|
||||
def extract_ad_links(html: str) -> list[str]:
    """Collect the listing hrefs found in FINN search result HTML.

    Each ``article.listing-card`` / ``article.sf-search-ad`` element
    contributes at most one link: the first anchor inside it whose ``href``
    contains ``finnkode``.

    Args:
        html: Raw HTML of a search results page.

    Returns:
        The hrefs in document order, each passed through ``clean_text``
        (an empty string is substituted if that yields a falsy value).
    """
    soup = BeautifulSoup(html, "html.parser")
    anchors = (
        article.select_one("a[href*='finnkode']")
        for article in soup.select("article.listing-card, article.sf-search-ad")
    )
    return [
        clean_text(anchor.get("href")) or ""
        for anchor in anchors
        if anchor and anchor.get("href")
    ]
|
||||
|
||||
|
||||
def _extract_int_from_text(text: str, pattern: str) -> int | None:
    """Search *text* for *pattern* and normalise its first capture group.

    Args:
        text: Text to search.
        pattern: Regular expression with at least one capture group; it is
            matched case-insensitively.

    Returns:
        ``normalize_number`` applied to group 1 of the first match, or
        ``None`` when the pattern does not match.
    """
    found = re.search(pattern, text, re.I)
    if found is None:
        return None
    return normalize_number(found.group(1))
|
||||
|
||||
|
||||
def _extract_area_from_text(text: str) -> int | None:
    """Pull an area figure out of free text.

    Looks for numbers (optionally with a ``.`` or ``,`` decimal part)
    followed by ``m²``, ``m2`` or ``kvm``, case-insensitively.

    Args:
        text: Text to search.

    Returns:
        ``normalize_area`` applied to the LAST such number in *text*, or
        ``None`` when there is none.
    """
    figures = re.findall(r"(\d+(?:[.,]\d+)?)\s*(?:m²|m2|kvm)", text, re.I)
    return normalize_area(figures[-1]) if figures else None
|
||||
|
||||
|
||||
def _extract_price_from_text(text: str, label: str) -> int | None:
    """Find a ``<label> ... kr`` amount in free text.

    Args:
        text: Text to search.
        label: Prefix that introduces the amount. It is interpolated into
            the regular expression unescaped, so it is treated as a regex
            fragment rather than a literal string.

    Returns:
        ``normalize_price`` applied to the digits/whitespace run ending in
        ``kr`` that follows the label (case-insensitive), or ``None`` when
        no such run is found.
    """
    found = re.search(rf"{label}[:\s]*([\d\s]+kr)", text, re.I)
    return normalize_price(found.group(1)) if found else None
|
||||
|
||||
|
||||
def _card_element_text(card, selector: str) -> str | None:
    """Return the cleaned text of the first element matching *selector*, or None."""
    element = card.select_one(selector)
    return clean_text(element.get_text()) if element else None


def _card_element_value(card, selector: str, normalize):
    """Return ``normalize(text)`` for the first element matching *selector*, or None."""
    element = card.select_one(selector)
    return normalize(element.get_text()) if element else None


def extract_search_cards(html: str) -> list[FinnSearchCard]:
    """Parse FINN search HTML and return a list of FinnSearchCard objects.

    Every ``article.listing-card`` / ``article.sf-search-ad`` element is
    treated as one card. A card is skipped when no finnkode can be derived
    from its ``data-id`` attribute or from its listing link.

    Numeric fields (bedrooms, common costs, area, total price) are read from
    their dedicated element first. If that element is missing or does not
    yield a value, the card's full text is searched as a fallback.

    Args:
        html: Raw HTML of a search results page.

    Returns:
        One ``FinnSearchCard`` per card with a finnkode, in document order.
        ``asking_price`` and ``floor`` are always ``None`` here.
    """
    logger.debug("Extracting FINN search cards")
    soup = BeautifulSoup(html, "html.parser")
    cards: list[FinnSearchCard] = []

    for card in soup.select("article.listing-card, article.sf-search-ad"):
        anchor = card.select_one("a[href*='finnkode']")
        url = anchor.get("href") if anchor else ""
        # Prefer the explicit data-id; fall back to the code in the link.
        finnkode = normalize_finnkode(
            card.get("data-id") or extract_finnkode_from_url(url)
        )
        if not finnkode:
            logger.debug("Skipping card with missing finnkode")
            continue

        # Flattened card text, used whenever a structured element is
        # missing or fails to parse.
        card_text = clean_text(card.get_text(" ") or "")

        bedrooms = _card_element_value(card, ".bedrooms", normalize_number)
        if bedrooms is None and card_text:
            bedrooms = _extract_int_from_text(card_text, r"(\d+)\s*soverom")

        common_costs = _card_element_value(card, ".common-costs", normalize_number)
        if common_costs is None and card_text:
            common_costs = _extract_int_from_text(
                card_text, r"(?:Fellesutg|Felleskost(?:er)?)[^\d]*(\d+[\d\s]*)kr"
            )

        total_price = _card_element_value(card, ".price", normalize_price)
        if not total_price and card_text:
            total_price = _extract_price_from_text(card_text, r"Totalpris")
        if not total_price and card_text:
            # Last resort: the first "<digits> kr" amount anywhere in the
            # card. The leading \d stops a bare " kr" (whitespace only)
            # from matching and hiding a real amount later in the text.
            first_price_match = re.search(r"(\d[\d\s]*kr)", card_text)
            if first_price_match:
                total_price = normalize_price(first_price_match.group(1))

        area_m2 = _card_element_value(card, ".area", normalize_area)
        if area_m2 is None and card_text:
            area_m2 = _extract_area_from_text(card_text)

        cards.append(
            FinnSearchCard(
                finnkode=finnkode,
                url=url or "",
                title=_card_element_text(
                    card, ".title, h2.sf-realestate-heading, a.sf-search-ad-link"
                ),
                address=_card_element_text(card, ".location, .sf-realestate-location"),
                area_m2=area_m2,
                asking_price=None,
                total_price=total_price,
                common_costs=common_costs,
                property_type=_card_element_text(card, ".property-type"),
                ownership_type=_card_element_text(card, ".ownership-type"),
                bedrooms=bedrooms,
                floor=None,
                broker_company=_card_element_text(card, ".broker-company"),
            )
        )
        logger.debug("Parsed FINN search card %s", finnkode)

    return cards
|
||||
|
||||
|
||||
def find_next_page_url(html: str) -> str | None:
    """Locate the "next page" link in FINN search HTML.

    Args:
        html: Raw HTML of a search results page.

    Returns:
        The ``href`` of the first ``<a rel="next">`` element, passed through
        ``clean_text``, or ``None`` when there is no such anchor or its
        ``href`` is missing/empty. The value is returned as written in the
        page and is not resolved against any base URL.
    """
    next_link = BeautifulSoup(html, "html.parser").select_one("a[rel='next']")
    href = next_link.get("href") if next_link else None
    return clean_text(href) if href else None
|
||||
|
||||
|
||||
async def fetch_search_pages(
    start_url: str,
    max_pages: int = 1,
    client: HTTPClient | None = None,
    use_cache: bool = True,
) -> list[FinnSearchCard]:
    """Fetch paginated FINN search pages and parse search cards.

    Starting at *start_url*, each page is fetched (through the cache layer),
    parsed into cards, and its ``rel="next"`` link followed until either
    *max_pages* pages have been processed or no next link is present.

    Args:
        start_url: URL of the first search results page.
        max_pages: Upper bound on the number of pages to fetch.
        client: HTTP client shared by all page requests. A new
            ``HTTPClient`` with no request delay is created when omitted.
        use_cache: Forwarded to ``fetch_search_page_cached`` for every page.

    Returns:
        The cards from all fetched pages, concatenated in page order.
    """
    # Local import: the module's import block does not provide urljoin.
    from urllib.parse import urljoin

    client = client or HTTPClient(request_delay_seconds=0.0)
    # One connection from cache.init_db() is reused for every page; it is
    # not closed here.
    conn = cache.init_db()
    url = start_url
    all_cards: list[FinnSearchCard] = []

    for _ in range(max_pages):
        html = await fetch_search_page_cached(url, client=client, conn=conn, use_cache=use_cache)
        all_cards.extend(extract_search_cards(html))
        next_url = find_next_page_url(html)
        if not next_url:
            break
        # Pagination hrefs may be relative (e.g. "?page=2"); resolve them
        # against the page they came from. Absolute URLs pass through
        # urljoin unchanged.
        url = urljoin(url, next_url)
        logger.debug("Following next page link: %s", url)

    return all_cards
|
||||
Reference in New Issue
Block a user