Enhance analysis functionality with parallel fetching and response shaping; add image processing for unit images
This commit is contained in:
+124
-41
@@ -16,7 +16,9 @@ The cached result is invalidated automatically the moment any piece of
|
||||
underlying data changes, because the deps_hash will differ.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
from typing import Any
|
||||
|
||||
from . import ad as ad_module
|
||||
from . import cache, eiendom_no, scoring, search
|
||||
@@ -43,6 +45,10 @@ from .models import EiendomUnit, FinnAd, SimilarUnit
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Max parallel ad + eiendom.no fetches in analyze_search phase 1; used as the
# size of the semaphore that gates the phase-1 fetch workers.
# High enough to be fast; low enough to avoid FINN rate-limiting.
FETCH_CONCURRENCY = 5
|
||||
|
||||
|
||||
def _normalize_description(text: str | None) -> str:
|
||||
return text.lower() if text else ""
|
||||
@@ -91,8 +97,6 @@ def _build_ad_summary(
|
||||
risks.append("Risk flags are detected in description or metadata.")
|
||||
if ad.common_costs and ad.common_costs > 5000:
|
||||
risks.append("Common costs are relatively high and should be reviewed.")
|
||||
if enriched and enriched.sale_status and enriched.sale_status.upper() != "FOR_SALE":
|
||||
risks.append("Eiendom.no sale status does not indicate an active sale.")
|
||||
if not enriched:
|
||||
risks.append("Missing Eiendom.no data increases uncertainty.")
|
||||
|
||||
@@ -208,8 +212,27 @@ async def analyze_ad(
|
||||
|
||||
result = {
|
||||
"finnkode": finn_ad.finnkode,
|
||||
"url": finn_ad.url,
|
||||
"title": finn_ad.title,
|
||||
"address": finn_ad.address,
|
||||
"district": finn_ad.district,
|
||||
"property_type": finn_ad.property_type,
|
||||
"ownership_type": finn_ad.ownership_type,
|
||||
"floor": finn_ad.floor,
|
||||
"area_m2": finn_ad.area_m2,
|
||||
"bedrooms": finn_ad.bedrooms,
|
||||
"rooms": finn_ad.rooms,
|
||||
"total_price": finn_ad.total_price,
|
||||
"asking_price": finn_ad.asking_price,
|
||||
"shared_debt": finn_ad.shared_debt,
|
||||
"common_costs": finn_ad.common_costs,
|
||||
"construction_year": finn_ad.construction_year,
|
||||
"has_balcony": finn_ad.has_balcony,
|
||||
"has_terrace": finn_ad.has_terrace,
|
||||
"has_elevator": finn_ad.has_elevator,
|
||||
"has_parking": finn_ad.has_parking,
|
||||
"has_garage": finn_ad.has_garage,
|
||||
"eiendom_unit_code": finn_ad.eiendom_unit_code,
|
||||
"score": scores,
|
||||
"categories": categories,
|
||||
"summary": summary,
|
||||
@@ -226,12 +249,26 @@ async def analyze_ad(
|
||||
return result
|
||||
|
||||
|
||||
async def _analyze_card(card, conn, *, include_eiendom_no: bool, client) -> dict:
|
||||
"""Fetch details + enrich a single search card. Raises on unrecoverable
|
||||
errors; the caller is responsible for catching and skipping."""
|
||||
finn_ad = cache.get_finn_ad(conn, card.finnkode, ttl_hours=FINN_CACHE_TTL_AD_HOURS)
|
||||
if finn_ad is None:
|
||||
finn_ad = await ad_module.fetch_ad_details(card.finnkode, client=client)
|
||||
async def _fetch_card_to_db(
|
||||
card,
|
||||
conn,
|
||||
*,
|
||||
include_eiendom_no: bool,
|
||||
client,
|
||||
) -> tuple["FinnAd | None", "str | None"]:
|
||||
"""Phase 1 worker: fetch ad details + resolve Eiendom.no unit, persist to DB.
|
||||
|
||||
Returns (finn_ad, unit_code). Both can be None on failure -- the caller
|
||||
treats None as a skip without aborting the whole batch.
|
||||
"""
|
||||
try:
|
||||
finn_ad = cache.get_finn_ad(conn, card.finnkode, ttl_hours=FINN_CACHE_TTL_AD_HOURS)
|
||||
if finn_ad is None:
|
||||
finn_ad = await ad_module.fetch_ad_details(card.finnkode, client=client)
|
||||
save_finn_ad(conn, finn_ad)
|
||||
except Exception as exc:
|
||||
logger.warning("Failed to fetch ad %s: %s", card.finnkode, exc)
|
||||
return None, None
|
||||
|
||||
unit_code = None
|
||||
if include_eiendom_no:
|
||||
@@ -239,11 +276,9 @@ async def _analyze_card(card, conn, *, include_eiendom_no: bool, client) -> dict
|
||||
matched_unit = await eiendom_no.search_unit_from_finn_url(card.url)
|
||||
unit_code = matched_unit.unit_code if matched_unit else None
|
||||
except Exception as exc:
|
||||
# A failed unit resolution is non-fatal -- proceed without enrichment.
|
||||
logger.warning("Eiendom.no unit search failed for %s: %s", card.finnkode, exc)
|
||||
unit_code = None
|
||||
|
||||
return await analyze_ad(finn_ad, unit_code=unit_code)
|
||||
return finn_ad, unit_code
|
||||
|
||||
|
||||
async def analyze_search(
|
||||
@@ -254,13 +289,24 @@ async def analyze_search(
|
||||
include_eiendom_no: bool = True,
|
||||
client=None,
|
||||
use_cache: bool = True,
|
||||
ctx: Any = None,
|
||||
) -> dict:
|
||||
"""Analyze a FINN search URL and enrich matching listings.
|
||||
|
||||
Search-level results are NOT cached as a whole (the search page itself
|
||||
is cached at the HTML level). Individual ad analyses ARE cached via
|
||||
``analyze_ad``, so re-running a search only re-scores ads whose
|
||||
underlying data has changed.
|
||||
Two-phase parallel execution
|
||||
----------------------------
|
||||
Phase 1 (parallel, I/O bound):
|
||||
All resale cards are fetched concurrently behind a semaphore of size
|
||||
``FETCH_CONCURRENCY``. Each worker fetches the ad detail page and
|
||||
resolves the Eiendom.no unit in one shot, then writes both to SQLite.
|
||||
Progress is reported via ``ctx`` if provided.
|
||||
|
||||
Phase 2 (sequential, cache bound):
|
||||
Scoring reads entirely from SQLite -- no network -- and is fast.
|
||||
Results are sorted by total score and returned.
|
||||
|
||||
Individual ad analyses ARE cached via ``analyze_ad``; re-running a search
|
||||
only re-scores ads whose underlying data has changed.
|
||||
"""
|
||||
conn = cache.init_db(FINN_CACHE_PATH)
|
||||
cards = await search.fetch_search_pages(
|
||||
@@ -269,38 +315,75 @@ async def analyze_search(
|
||||
client=client,
|
||||
use_cache=use_cache,
|
||||
)
|
||||
|
||||
resale_cards = [c for c in cards[:detail_limit] if _is_resale_listing(c.url)]
|
||||
skipped_count = len(cards[:detail_limit]) - len(resale_cards)
|
||||
|
||||
if ctx is not None:
|
||||
await ctx.info(
|
||||
f"Found {len(cards)} listings, {len(resale_cards)} resale ads to fetch."
|
||||
)
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Phase 1: parallel fetch to DB
|
||||
# ------------------------------------------------------------------
|
||||
fetched: dict[str, tuple] = {} # finnkode -> (FinnAd, unit_code | None)
|
||||
fetch_counter = 0
|
||||
sem = asyncio.Semaphore(FETCH_CONCURRENCY)
|
||||
|
||||
if not fetch_details:
|
||||
resale_cards = []
|
||||
|
||||
async def _fetch_worker(card, idx: int) -> None:
|
||||
nonlocal fetch_counter
|
||||
async with sem:
|
||||
finn_ad, unit_code = await _fetch_card_to_db(
|
||||
card, conn, include_eiendom_no=include_eiendom_no, client=client
|
||||
)
|
||||
fetched[card.finnkode] = (finn_ad, unit_code)
|
||||
fetch_counter += 1
|
||||
if ctx is not None:
|
||||
await ctx.report_progress(fetch_counter, len(resale_cards))
|
||||
status = "enriched" if unit_code else "no eiendom match"
|
||||
await ctx.info(
|
||||
f"[{fetch_counter}/{len(resale_cards)}] {card.finnkode} fetched ({status})"
|
||||
)
|
||||
|
||||
await asyncio.gather(*[_fetch_worker(c, i) for i, c in enumerate(resale_cards)])
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Phase 2: score from DB (reads cache, fast)
|
||||
# ------------------------------------------------------------------
|
||||
if ctx is not None:
|
||||
await ctx.info(f"All data fetched. Scoring {len(resale_cards)} ads...")
|
||||
|
||||
results = []
|
||||
enriched_count = 0
|
||||
skipped_count = 0
|
||||
cache_hits = 0
|
||||
|
||||
if fetch_details:
|
||||
for card in cards[:detail_limit]:
|
||||
# Project / new-build ads are not resale listings and fetch_ad_details
|
||||
# cannot resolve them -- skip up front rather than 404 mid-run.
|
||||
if not _is_resale_listing(card.url):
|
||||
logger.info("Skipping non-resale card %s (%s)", card.finnkode, card.url)
|
||||
skipped_count += 1
|
||||
continue
|
||||
for card in resale_cards:
|
||||
finn_ad, unit_code = fetched.get(card.finnkode, (None, None))
|
||||
if finn_ad is None:
|
||||
skipped_count += 1
|
||||
continue
|
||||
try:
|
||||
result = await analyze_ad(finn_ad, unit_code=unit_code)
|
||||
except Exception as exc:
|
||||
logger.warning("Skipping card %s during scoring: %s", card.finnkode, exc)
|
||||
skipped_count += 1
|
||||
continue
|
||||
|
||||
# One bad card (stale finnkode, removed ad, transient network error)
|
||||
# must not abort the whole search -- isolate each card.
|
||||
try:
|
||||
result = await _analyze_card(
|
||||
card, conn, include_eiendom_no=include_eiendom_no, client=client
|
||||
)
|
||||
except Exception as exc:
|
||||
logger.warning("Skipping card %s: %s", card.finnkode, exc)
|
||||
skipped_count += 1
|
||||
continue
|
||||
|
||||
if result.get("eiendom_unit"):
|
||||
enriched_count += 1
|
||||
# Track analysis cache hits via the absence of recompute logging
|
||||
# (the flag is not propagated up here; rely on debug logs).
|
||||
results.append(result)
|
||||
if result.get("eiendom_unit"):
|
||||
enriched_count += 1
|
||||
results.append(result)
|
||||
|
||||
results.sort(key=lambda item: item["score"].get("total", 0.0), reverse=True)
|
||||
|
||||
if ctx is not None:
|
||||
await ctx.info(
|
||||
f"Done. {len(results)} analyzed, {enriched_count} enriched, "
|
||||
f"{skipped_count} skipped."
|
||||
)
|
||||
|
||||
return {
|
||||
"search_url": search_url,
|
||||
"search_cards": [card.model_dump(mode="json") for card in cards],
|
||||
|
||||
Reference in New Issue
Block a user