Enhance analysis functionality with parallel fetching and response shaping; add image processing for unit images

2026-05-26 20:50:58 +00:00
parent 2933b8c1ea
commit 5b772b2ae5
4 changed files with 300 additions and 49 deletions
@@ -16,7 +16,9 @@ The cached result is invalidated automatically the moment any piece of
 underlying data changes, because the deps_hash will differ.
 """

+import asyncio
 import logging
+from typing import Any

 from . import ad as ad_module
 from . import cache, eiendom_no, scoring, search
@@ -43,6 +45,10 @@ from .models import EiendomUnit, FinnAd, SimilarUnit

 logger = logging.getLogger(__name__)

+# Max parallel ad + eiendom.no fetches in analyze_search phase 1.
+# High enough to be fast; low enough to avoid FINN rate-limiting.
+FETCH_CONCURRENCY = 5
+

 def _normalize_description(text: str | None) -> str:
    return text.lower() if text else ""
@@ -91,8 +97,6 @@ def _build_ad_summary(
        risks.append("Risk flags are detected in description or metadata.")
    if ad.common_costs and ad.common_costs > 5000:
        risks.append("Common costs are relatively high and should be reviewed.")
-    if enriched and enriched.sale_status and enriched.sale_status.upper() != "FOR_SALE":
-        risks.append("Eiendom.no sale status does not indicate an active sale.")
    if not enriched:
        risks.append("Missing Eiendom.no data increases uncertainty.")

@@ -208,8 +212,27 @@ async def analyze_ad(

    result = {
        "finnkode": finn_ad.finnkode,
+        "url": finn_ad.url,
        "title": finn_ad.title,
        "address": finn_ad.address,
+        "district": finn_ad.district,
+        "property_type": finn_ad.property_type,
+        "ownership_type": finn_ad.ownership_type,
+        "floor": finn_ad.floor,
+        "area_m2": finn_ad.area_m2,
+        "bedrooms": finn_ad.bedrooms,
+        "rooms": finn_ad.rooms,
+        "total_price": finn_ad.total_price,
+        "asking_price": finn_ad.asking_price,
+        "shared_debt": finn_ad.shared_debt,
+        "common_costs": finn_ad.common_costs,
+        "construction_year": finn_ad.construction_year,
+        "has_balcony": finn_ad.has_balcony,
+        "has_terrace": finn_ad.has_terrace,
+        "has_elevator": finn_ad.has_elevator,
+        "has_parking": finn_ad.has_parking,
+        "has_garage": finn_ad.has_garage,
+        "eiendom_unit_code": finn_ad.eiendom_unit_code,
        "score": scores,
        "categories": categories,
        "summary": summary,
@@ -226,12 +249,26 @@ async def analyze_ad(
    return result


-async def _analyze_card(card, conn, *, include_eiendom_no: bool, client) -> dict:
-    """Fetch details + enrich a single search card. Raises on unrecoverable
-    errors; the caller is responsible for catching and skipping."""
-    finn_ad = cache.get_finn_ad(conn, card.finnkode, ttl_hours=FINN_CACHE_TTL_AD_HOURS)
-    if finn_ad is None:
-        finn_ad = await ad_module.fetch_ad_details(card.finnkode, client=client)
+async def _fetch_card_to_db(
+    card,
+    conn,
+    *,
+    include_eiendom_no: bool,
+    client,
+) -> tuple["FinnAd | None", "str | None"]:
+    """Phase 1 worker: fetch ad details + resolve Eiendom.no unit, persist to DB.
+
+    Returns (finn_ad, unit_code).  A failed ad fetch yields (None, None) and the
+    caller skips that card; unit_code alone is None when no unit was resolved.
+    """
+    try:
+        finn_ad = cache.get_finn_ad(conn, card.finnkode, ttl_hours=FINN_CACHE_TTL_AD_HOURS)
+        if finn_ad is None:
+            finn_ad = await ad_module.fetch_ad_details(card.finnkode, client=client)
+            save_finn_ad(conn, finn_ad)
+    except Exception as exc:
+        logger.warning("Failed to fetch ad %s: %s", card.finnkode, exc)
+        return None, None

    unit_code = None
    if include_eiendom_no:
@@ -239,11 +276,9 @@ async def _analyze_card(card, conn, *, include_eiendom_no: bool, client) -> dict
            matched_unit = await eiendom_no.search_unit_from_finn_url(card.url)
            unit_code = matched_unit.unit_code if matched_unit else None
        except Exception as exc:
-            # A failed unit resolution is non-fatal -- proceed without enrichment.
            logger.warning("Eiendom.no unit search failed for %s: %s", card.finnkode, exc)
-            unit_code = None

-    return await analyze_ad(finn_ad, unit_code=unit_code)
+    return finn_ad, unit_code


 async def analyze_search(
@@ -254,13 +289,24 @@ async def analyze_search(
    include_eiendom_no: bool = True,
    client=None,
    use_cache: bool = True,
+    ctx: Any = None,
 ) -> dict:
    """Analyze a FINN search URL and enrich matching listings.

-    Search-level results are NOT cached as a whole (the search page itself
-    is cached at the HTML level).  Individual ad analyses ARE cached via
-    ``analyze_ad``, so re-running a search only re-scores ads whose
-    underlying data has changed.
+    Two-phase parallel execution
+    ----------------------------
+    Phase 1 (parallel, I/O bound):
+      Resale cards within ``detail_limit`` are fetched concurrently behind a
+      semaphore of size ``FETCH_CONCURRENCY``.  Each worker saves a freshly
+      fetched ad to SQLite and returns the resolved Eiendom.no unit code.
+      Progress is reported via ``ctx`` if provided.
+
+    Phase 2 (sequential):
+      Each fetched ad is scored via ``analyze_ad`` using the in-memory results
+      of phase 1.  Results are sorted by total score and returned.
+
+    Individual ad analyses ARE cached via ``analyze_ad``; re-running a search
+    only re-scores ads whose underlying data has changed.
    """
    conn = cache.init_db(FINN_CACHE_PATH)
    cards = await search.fetch_search_pages(
@@ -269,38 +315,75 @@ async def analyze_search(
        client=client,
        use_cache=use_cache,
    )
+
+    resale_cards = [c for c in cards[:detail_limit] if _is_resale_listing(c.url)]
+    skipped_count = len(cards[:detail_limit]) - len(resale_cards)
+
+    if ctx is not None:
+        await ctx.info(
+            f"Found {len(cards)} listings, {len(resale_cards)} resale ads to fetch."
+        )
+
+    # ------------------------------------------------------------------
+    # Phase 1: parallel fetch to DB
+    # ------------------------------------------------------------------
+    fetched: dict[str, tuple] = {}  # finnkode -> (FinnAd, unit_code | None)
+    fetch_counter = 0
+    sem = asyncio.Semaphore(FETCH_CONCURRENCY)
+
+    if not fetch_details:
+        resale_cards = []
+
+    async def _fetch_worker(card, idx: int) -> None:
+        nonlocal fetch_counter
+        async with sem:
+            finn_ad, unit_code = await _fetch_card_to_db(
+                card, conn, include_eiendom_no=include_eiendom_no, client=client
+            )
+            fetched[card.finnkode] = (finn_ad, unit_code)
+            fetch_counter += 1
+            if ctx is not None:
+                await ctx.report_progress(fetch_counter, len(resale_cards))
+                status = "enriched" if unit_code else "no eiendom match"
+                await ctx.info(
+                    f"[{fetch_counter}/{len(resale_cards)}] {card.finnkode} fetched ({status})"
+                )
+
+    await asyncio.gather(*[_fetch_worker(c, i) for i, c in enumerate(resale_cards)])
+
+    # ------------------------------------------------------------------
+    # Phase 2: score from DB (reads cache, fast)
+    # ------------------------------------------------------------------
+    if ctx is not None:
+        await ctx.info(f"All data fetched. Scoring {len(resale_cards)} ads...")
+
    results = []
    enriched_count = 0
-    skipped_count = 0
-    cache_hits = 0

-    if fetch_details:
-        for card in cards[:detail_limit]:
-            # Project / new-build ads are not resale listings and fetch_ad_details
-            # cannot resolve them -- skip up front rather than 404 mid-run.
-            if not _is_resale_listing(card.url):
-                logger.info("Skipping non-resale card %s (%s)", card.finnkode, card.url)
-                skipped_count += 1
-                continue
+    for card in resale_cards:
+        finn_ad, unit_code = fetched.get(card.finnkode, (None, None))
+        if finn_ad is None:
+            skipped_count += 1
+            continue
+        try:
+            result = await analyze_ad(finn_ad, unit_code=unit_code)
+        except Exception as exc:
+            logger.warning("Skipping card %s during scoring: %s", card.finnkode, exc)
+            skipped_count += 1
+            continue

-            # One bad card (stale finnkode, removed ad, transient network error)
-            # must not abort the whole search -- isolate each card.
-            try:
-                result = await _analyze_card(
-                    card, conn, include_eiendom_no=include_eiendom_no, client=client
-                )
-            except Exception as exc:
-                logger.warning("Skipping card %s: %s", card.finnkode, exc)
-                skipped_count += 1
-                continue
-
-            if result.get("eiendom_unit"):
-                enriched_count += 1
-            # Track analysis cache hits via the absence of recompute logging
-            # (the flag is not propagated up here; rely on debug logs).
-            results.append(result)
+        if result.get("eiendom_unit"):
+            enriched_count += 1
+        results.append(result)

    results.sort(key=lambda item: item["score"].get("total", 0.0), reverse=True)
+
+    if ctx is not None:
+        await ctx.info(
+            f"Done. {len(results)} analyzed, {enriched_count} enriched, "
+            f"{skipped_count} skipped."
+        )
+
    return {
        "search_url": search_url,
        "search_cards": [card.model_dump(mode="json") for card in cards],