Implement caching strategy for analysis results and enhance hash-aware data fetching

2026-05-26 13:54:49 +00:00
parent 46fd22c277
commit 22f30ebf00
3 changed files with 557 additions and 122 deletions
@@ -1,9 +1,37 @@
-"""Orchestration for FINN search + Eiendom.no enrichment + scoring."""
+"""Orchestration for FINN search + Eiendom.no enrichment + scoring.
+
+Analysis caching
+----------------
+``analyze_ad`` caches its result under a ``deps_hash``: a SHA-256 digest
+combining the stored content hashes of the raw ad, the eiendom unit, and
+the comparable sales used to produce it.  On a subsequent call the function:
+
+  1. Reads the three raw content hashes from the DB (no deserialisation).
+  2. Derives the same deps_hash from those hashes.
+  3. Checks analysis_cache for a matching (finnkode, deps_hash) row.
+  4. Returns the cached result immediately if found.
+  5. Otherwise runs the full scoring pipeline and writes to analysis_cache.
+
+The cached result is invalidated automatically the moment any piece of
+underlying data changes, because the deps_hash will differ.
+"""

 import logging

 from . import ad as ad_module
 from . import cache, eiendom_no, scoring, search
+from .cache import (
+    combine_hashes,
+    get_analysis,
+    get_eiendom_unit_hash,
+    get_finn_ad_hash,
+    get_similar_units_hash,
+    invalidate_analysis,
+    save_analysis,
+    save_eiendom_unit,
+    save_finn_ad,
+    save_similar_units,
+)
 from .config import (
    EIENDOM_NO_CACHE_TTL_HOURS,
    FINN_CACHE_PATH,
@@ -86,38 +114,93 @@ def _build_ad_summary(
    }


+def _compute_deps_hash(
+    conn,
+    finnkode: str,
+    unit_code: str | None,
+    listing_status: str = "RECENTLY_SOLD",
+) -> str:
+    """Derive a deps_hash from the three stored raw content hashes.
+
+    Reads only the hash column -- no payload deserialisation.
+    """
+    ad_hash = get_finn_ad_hash(conn, finnkode)
+    unit_hash = get_eiendom_unit_hash(conn, unit_code) if unit_code else None
+    comps_hash = (
+        get_similar_units_hash(conn, unit_code, listing_status) if unit_code else None
+    )
+    return combine_hashes(ad_hash, unit_hash, comps_hash)
+
+
 async def analyze_ad(
    finn_ad: FinnAd,
    unit_code: str | None = None,
 ) -> dict:
-    """Enrich a FinnAd and compute score summary."""
+    """Enrich a FinnAd and compute score summary.
+
+    Result is cached in analysis_cache keyed by deps_hash.  Recomputation
+    happens only when the underlying raw data has actually changed.
+    """
    conn = cache.init_db(FINN_CACHE_PATH)
+
+    # ------------------------------------------------------------------
+    # 1. Ensure the ad is in the DB so we have a stable hash to key on.
+    # ------------------------------------------------------------------
+    ad_hash, ad_changed = save_finn_ad(conn, finn_ad)
+
+    # ------------------------------------------------------------------
+    # 2. Fetch / refresh Eiendom.no data (cache-aware).
+    # ------------------------------------------------------------------
    enriched: EiendomUnit | None = None
-    similar_units: list[SimilarUnit] = []
+    unit_hash_changed = False

    if unit_code:
        enriched = cache.get_eiendom_unit(conn, unit_code)
        if enriched is None:
            enriched = await eiendom_no.enrich_ad_with_eiendom_no(finn_ad, unit_code)
            if enriched is not None:
-                cache.save_eiendom_unit(conn, enriched)
+                _, unit_hash_changed = save_eiendom_unit(conn, enriched)
+        # If already cached, unit_hash_changed stays False -- no new write.
+
+    # ------------------------------------------------------------------
+    # 3. Fetch / refresh similar units (cache-aware).
+    # ------------------------------------------------------------------
+    similar_units: list[SimilarUnit] = []
+    comps_hash_changed = False

    if enriched:
-        # Check cache for similar units first. The cache uses (unit_code,
-        # listing_status) as the key, so we must look it up by unit_code.
        similar_units = cache.get_similar_units(
            conn, enriched.unit_code, "RECENTLY_SOLD", ttl_hours=EIENDOM_NO_CACHE_TTL_HOURS
        )
-
        if not similar_units:
-            # Cache miss: build the vector and fetch fresh from Eiendom.no
-            # (unit_vector field from get_unit is None; build locally)
            vector = enriched.unit_vector or eiendom_no.build_unit_vector(enriched)
            if vector:
                similar_units = await eiendom_no.get_similar_units(vector)
-                # Save to cache
                if similar_units:
-                    cache.save_similar_units(conn, enriched.unit_code, "RECENTLY_SOLD", similar_units)
+                    _, comps_hash_changed = save_similar_units(
+                        conn, enriched.unit_code, "RECENTLY_SOLD", similar_units
+                    )
+
+    # ------------------------------------------------------------------
+    # 4. Derive deps_hash and check analysis_cache.
+    # ------------------------------------------------------------------
+    deps_hash = _compute_deps_hash(conn, finn_ad.finnkode, unit_code)
+
+    cached_analysis = get_analysis(conn, finn_ad.finnkode, deps_hash)
+    if cached_analysis is not None:
+        logger.debug("analysis_cache hit for %s -- skipping recompute", finn_ad.finnkode)
+        return cached_analysis
+
+    # ------------------------------------------------------------------
+    # 5. Cache miss: compute, store, return.
+    # ------------------------------------------------------------------
+    logger.debug(
+        "analysis_cache miss for %s (ad_changed=%s, unit_changed=%s, comps_changed=%s)",
+        finn_ad.finnkode,
+        ad_changed,
+        unit_hash_changed,
+        comps_hash_changed,
+    )

    scores = scoring.score_ad(finn_ad, enriched, similar_units)
    categories = scoring.classify_ad(scores)
@@ -130,10 +213,16 @@ async def analyze_ad(
        "score": scores,
        "categories": categories,
        "summary": summary,
-        "eiendom_unit": enriched.model_dump() if enriched else None,
-        "similar_units": [unit.model_dump() for unit in similar_units],
+        "eiendom_unit": enriched.model_dump(mode="json") if enriched else None,
+        "similar_units": [unit.model_dump(mode="json") for unit in similar_units],
    }
-    cache.save_finn_ad(conn, finn_ad)
+
+    # Round-trip through JSON to guarantee all values are serialisable
+    # (catches any datetime that survives model_dump, e.g. from scoring).
+    import json as _json
+    result = _json.loads(_json.dumps(result, default=str))
+
+    save_analysis(conn, finn_ad.finnkode, deps_hash, result)
    return result


@@ -166,7 +255,13 @@ async def analyze_search(
    client=None,
    use_cache: bool = True,
 ) -> dict:
-    """Analyze a FINN search URL and enrich matching listings."""
+    """Analyze a FINN search URL and enrich matching listings.
+
+    Search-level results are NOT cached as a whole (the search page itself
+    is cached at the HTML level).  Individual ad analyses ARE cached via
+    ``analyze_ad``, so re-running a search only re-scores ads whose
+    underlying data has changed.
+    """
    conn = cache.init_db(FINN_CACHE_PATH)
    cards = await search.fetch_search_pages(
        search_url,
@@ -177,6 +272,7 @@ async def analyze_search(
    results = []
    enriched_count = 0
    skipped_count = 0
+    cache_hits = 0

    if fetch_details:
        for card in cards[:detail_limit]:
@@ -200,12 +296,14 @@ async def analyze_search(

            if result.get("eiendom_unit"):
                enriched_count += 1
+            # NOTE: analysis-cache hits are not counted here -- analyze_ad does not
+            # report hit/miss to its caller; they are visible only in debug logs.
            results.append(result)

    results.sort(key=lambda item: item["score"].get("total", 0.0), reverse=True)
    return {
        "search_url": search_url,
-        "search_cards": [card.model_dump() for card in cards],
+        "search_cards": [card.model_dump(mode="json") for card in cards],
        "analysis": results,
        "summary": {
            "total_listings": len(cards),
@@ -213,4 +311,4 @@ async def analyze_search(
            "skipped_listings": skipped_count,
            "eiendom_enriched": enriched_count,
        },
-    }
+    }