Implement caching strategy for analysis results and enhance hash-aware data fetching

This commit is contained in:
Ole
2026-05-26 13:54:49 +00:00
parent 46fd22c277
commit 22f30ebf00
3 changed files with 557 additions and 122 deletions
+115 -17
View File
@@ -1,9 +1,37 @@
"""Orchestration for FINN search + Eiendom.no enrichment + scoring."""
"""Orchestration for FINN search + Eiendom.no enrichment + scoring.
Analysis caching
----------------
``analyze_ad`` caches its result under a ``deps_hash``: a SHA-256 digest
derived from the stored content hashes of the raw payloads of the ad, the
eiendom unit, and the comparable sales used to produce it (the payloads
themselves are not re-hashed). On a subsequent call the function:
1. Reads the three raw content hashes from the DB (no deserialisation).
2. Derives the same deps_hash from those hashes.
3. Checks analysis_cache for a matching (finnkode, deps_hash) row.
4. Returns the cached result immediately if found.
5. Otherwise runs the full scoring pipeline and writes to analysis_cache.
The cached result is invalidated automatically the moment any piece of
underlying data changes, because the deps_hash will differ.
"""
import logging
from . import ad as ad_module
from . import cache, eiendom_no, scoring, search
from .cache import (
combine_hashes,
get_analysis,
get_eiendom_unit_hash,
get_finn_ad_hash,
get_similar_units_hash,
invalidate_analysis,
save_analysis,
save_eiendom_unit,
save_finn_ad,
save_similar_units,
)
from .config import (
EIENDOM_NO_CACHE_TTL_HOURS,
FINN_CACHE_PATH,
@@ -86,38 +114,93 @@ def _build_ad_summary(
}
def _compute_deps_hash(
    conn,
    finnkode: str,
    unit_code: str | None,
    listing_status: str = "RECENTLY_SOLD",
) -> str:
    """Build the dependency hash that keys a cached analysis.

    Looks up the stored content hash of the FINN ad and, when ``unit_code``
    is given, the hashes of the Eiendom.no unit and of its similar-units
    set for ``listing_status``. The three values are then folded together
    with ``combine_hashes``.

    When ``unit_code`` is falsy, the unit and comparable-sales slots are
    passed to ``combine_hashes`` as ``None``.

    NOTE(review): the lookups are meant to read only the stored hash
    values, without deserialising payloads -- that depends on the cache
    helpers, which are not visible here.
    """
    finn_digest = get_finn_ad_hash(conn, finnkode)

    # Without a unit code there is nothing to look up on the Eiendom.no side.
    if not unit_code:
        return combine_hashes(finn_digest, None, None)

    unit_digest = get_eiendom_unit_hash(conn, unit_code)
    comps_digest = get_similar_units_hash(conn, unit_code, listing_status)
    return combine_hashes(finn_digest, unit_digest, comps_digest)
async def analyze_ad(
finn_ad: FinnAd,
unit_code: str | None = None,
) -> dict:
"""Enrich a FinnAd and compute score summary."""
"""Enrich a FinnAd and compute score summary.
Result is cached in analysis_cache keyed by deps_hash. Recomputation
happens only when the underlying raw data has actually changed.
"""
conn = cache.init_db(FINN_CACHE_PATH)
# ------------------------------------------------------------------
# 1. Ensure the ad is in the DB so we have a stable hash to key on.
# ------------------------------------------------------------------
ad_hash, ad_changed = save_finn_ad(conn, finn_ad)
# ------------------------------------------------------------------
# 2. Fetch / refresh Eiendom.no data (cache-aware).
# ------------------------------------------------------------------
enriched: EiendomUnit | None = None
similar_units: list[SimilarUnit] = []
unit_hash_changed = False
if unit_code:
enriched = cache.get_eiendom_unit(conn, unit_code)
if enriched is None:
enriched = await eiendom_no.enrich_ad_with_eiendom_no(finn_ad, unit_code)
if enriched is not None:
cache.save_eiendom_unit(conn, enriched)
_, unit_hash_changed = save_eiendom_unit(conn, enriched)
# If already cached, unit_hash_changed stays False -- no new write.
# ------------------------------------------------------------------
# 3. Fetch / refresh similar units (cache-aware).
# ------------------------------------------------------------------
similar_units: list[SimilarUnit] = []
comps_hash_changed = False
if enriched:
# Check cache for similar units first. The cache uses (unit_code,
# listing_status) as the key, so we must look it up by unit_code.
similar_units = cache.get_similar_units(
conn, enriched.unit_code, "RECENTLY_SOLD", ttl_hours=EIENDOM_NO_CACHE_TTL_HOURS
)
if not similar_units:
# Cache miss: build the vector and fetch fresh from Eiendom.no
# (unit_vector field from get_unit is None; build locally)
vector = enriched.unit_vector or eiendom_no.build_unit_vector(enriched)
if vector:
similar_units = await eiendom_no.get_similar_units(vector)
# Save to cache
if similar_units:
cache.save_similar_units(conn, enriched.unit_code, "RECENTLY_SOLD", similar_units)
_, comps_hash_changed = save_similar_units(
conn, enriched.unit_code, "RECENTLY_SOLD", similar_units
)
# ------------------------------------------------------------------
# 4. Derive deps_hash and check analysis_cache.
# ------------------------------------------------------------------
deps_hash = _compute_deps_hash(conn, finn_ad.finnkode, unit_code)
cached_analysis = get_analysis(conn, finn_ad.finnkode, deps_hash)
if cached_analysis is not None:
logger.debug("analysis_cache hit for %s -- skipping recompute", finn_ad.finnkode)
return cached_analysis
# ------------------------------------------------------------------
# 5. Cache miss: compute, store, return.
# ------------------------------------------------------------------
logger.debug(
"analysis_cache miss for %s (ad_changed=%s, unit_changed=%s, comps_changed=%s)",
finn_ad.finnkode,
ad_changed,
unit_hash_changed,
comps_hash_changed,
)
scores = scoring.score_ad(finn_ad, enriched, similar_units)
categories = scoring.classify_ad(scores)
@@ -130,10 +213,16 @@ async def analyze_ad(
"score": scores,
"categories": categories,
"summary": summary,
"eiendom_unit": enriched.model_dump() if enriched else None,
"similar_units": [unit.model_dump() for unit in similar_units],
"eiendom_unit": enriched.model_dump(mode="json") if enriched else None,
"similar_units": [unit.model_dump(mode="json") for unit in similar_units],
}
cache.save_finn_ad(conn, finn_ad)
# Round-trip through JSON to guarantee all values are serialisable
# (catches any datetime that survives model_dump, e.g. from scoring).
import json as _json
result = _json.loads(_json.dumps(result, default=str))
save_analysis(conn, finn_ad.finnkode, deps_hash, result)
return result
@@ -166,7 +255,13 @@ async def analyze_search(
client=None,
use_cache: bool = True,
) -> dict:
"""Analyze a FINN search URL and enrich matching listings."""
"""Analyze a FINN search URL and enrich matching listings.
Search-level results are NOT cached as a whole (the search page itself
is cached at the HTML level). Individual ad analyses ARE cached via
``analyze_ad``, so re-running a search only re-scores ads whose
underlying data has changed.
"""
conn = cache.init_db(FINN_CACHE_PATH)
cards = await search.fetch_search_pages(
search_url,
@@ -177,6 +272,7 @@ async def analyze_search(
results = []
enriched_count = 0
skipped_count = 0
cache_hits = 0
if fetch_details:
for card in cards[:detail_limit]:
@@ -200,12 +296,14 @@ async def analyze_search(
if result.get("eiendom_unit"):
enriched_count += 1
# NOTE: analysis-cache hits are not counted here. ``analyze_ad`` does not
# report whether it served a cached result, so ``cache_hits`` is not updated
# at this point; use the debug logs from ``analyze_ad`` to observe hits.
results.append(result)
results.sort(key=lambda item: item["score"].get("total", 0.0), reverse=True)
return {
"search_url": search_url,
"search_cards": [card.model_dump() for card in cards],
"search_cards": [card.model_dump(mode="json") for card in cards],
"analysis": results,
"summary": {
"total_listings": len(cards),
@@ -213,4 +311,4 @@ async def analyze_search(
"skipped_listings": skipped_count,
"eiendom_enriched": enriched_count,
},
}
}