Implement caching strategy for analysis results and enhance hash-aware data fetching
This commit is contained in:
+115
-17
@@ -1,9 +1,37 @@
|
||||
"""Orchestration for FINN search + Eiendom.no enrichment + scoring."""
|
||||
"""Orchestration for FINN search + Eiendom.no enrichment + scoring.
|
||||
|
||||
Analysis caching
|
||||
----------------
|
||||
``analyze_ad`` caches its result under a ``deps_hash`` that is the
|
||||
SHA-256 of the combined content hashes of the ad, the eiendom unit, and the
|
||||
comparable sales used to produce it. On a subsequent call the function:
|
||||
|
||||
1. Reads the three raw content hashes from the DB (no deserialisation).
|
||||
2. Derives the same deps_hash from those hashes.
|
||||
3. Checks analysis_cache for a matching (finnkode, deps_hash) row.
|
||||
4. Returns the cached result immediately if found.
|
||||
5. Otherwise runs the full scoring pipeline and writes to analysis_cache.
|
||||
|
||||
The cached result is invalidated automatically the moment any piece of
|
||||
underlying data changes, because the deps_hash will differ.
|
||||
"""
|
||||
|
||||
import logging
|
||||
|
||||
from . import ad as ad_module
|
||||
from . import cache, eiendom_no, scoring, search
|
||||
from .cache import (
|
||||
combine_hashes,
|
||||
get_analysis,
|
||||
get_eiendom_unit_hash,
|
||||
get_finn_ad_hash,
|
||||
get_similar_units_hash,
|
||||
invalidate_analysis,
|
||||
save_analysis,
|
||||
save_eiendom_unit,
|
||||
save_finn_ad,
|
||||
save_similar_units,
|
||||
)
|
||||
from .config import (
|
||||
EIENDOM_NO_CACHE_TTL_HOURS,
|
||||
FINN_CACHE_PATH,
|
||||
@@ -86,38 +114,93 @@ def _build_ad_summary(
|
||||
}
|
||||
|
||||
|
||||
def _compute_deps_hash(
    conn,
    finnkode: str,
    unit_code: str | None,
    listing_status: str = "RECENTLY_SOLD",
) -> str:
    """Build the dependency hash that keys a cached analysis.

    Looks up the stored content hash of the FINN ad and, when a
    ``unit_code`` is given, the stored hashes of the Eiendom.no unit and of
    its similar units for ``listing_status``.  The three values are then
    passed to ``combine_hashes``.  (The lookup helpers are expected to read
    only the hash column, without deserialising any payload.)

    Args:
        conn: Open cache database connection.
        finnkode: FINN ad identifier whose ad hash is looked up.
        unit_code: Eiendom.no unit code; when falsy, the unit and
            comparables hashes are passed on as ``None``.
        listing_status: Listing status used to select the similar-units
            cache entry.

    Returns:
        The combined hash produced by ``combine_hashes``.
    """
    ad_digest = get_finn_ad_hash(conn, finnkode)

    if unit_code:
        unit_digest = get_eiendom_unit_hash(conn, unit_code)
        comps_digest = get_similar_units_hash(conn, unit_code, listing_status)
    else:
        # Without a unit there is nothing to look up for either dependency.
        unit_digest = None
        comps_digest = None

    return combine_hashes(ad_digest, unit_digest, comps_digest)
|
||||
|
||||
|
||||
async def analyze_ad(
|
||||
finn_ad: FinnAd,
|
||||
unit_code: str | None = None,
|
||||
) -> dict:
|
||||
"""Enrich a FinnAd and compute score summary."""
|
||||
"""Enrich a FinnAd and compute score summary.
|
||||
|
||||
Result is cached in analysis_cache keyed by deps_hash. Recomputation
|
||||
happens only when the underlying raw data has actually changed.
|
||||
"""
|
||||
conn = cache.init_db(FINN_CACHE_PATH)
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# 1. Ensure the ad is in the DB so we have a stable hash to key on.
|
||||
# ------------------------------------------------------------------
|
||||
ad_hash, ad_changed = save_finn_ad(conn, finn_ad)
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# 2. Fetch / refresh Eiendom.no data (cache-aware).
|
||||
# ------------------------------------------------------------------
|
||||
enriched: EiendomUnit | None = None
|
||||
similar_units: list[SimilarUnit] = []
|
||||
unit_hash_changed = False
|
||||
|
||||
if unit_code:
|
||||
enriched = cache.get_eiendom_unit(conn, unit_code)
|
||||
if enriched is None:
|
||||
enriched = await eiendom_no.enrich_ad_with_eiendom_no(finn_ad, unit_code)
|
||||
if enriched is not None:
|
||||
cache.save_eiendom_unit(conn, enriched)
|
||||
_, unit_hash_changed = save_eiendom_unit(conn, enriched)
|
||||
# If already cached, unit_hash_changed stays False -- no new write.
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# 3. Fetch / refresh similar units (cache-aware).
|
||||
# ------------------------------------------------------------------
|
||||
similar_units: list[SimilarUnit] = []
|
||||
comps_hash_changed = False
|
||||
|
||||
if enriched:
|
||||
# Check cache for similar units first. The cache uses (unit_code,
|
||||
# listing_status) as the key, so we must look it up by unit_code.
|
||||
similar_units = cache.get_similar_units(
|
||||
conn, enriched.unit_code, "RECENTLY_SOLD", ttl_hours=EIENDOM_NO_CACHE_TTL_HOURS
|
||||
)
|
||||
|
||||
if not similar_units:
|
||||
# Cache miss: build the vector and fetch fresh from Eiendom.no
|
||||
# (unit_vector field from get_unit is None; build locally)
|
||||
vector = enriched.unit_vector or eiendom_no.build_unit_vector(enriched)
|
||||
if vector:
|
||||
similar_units = await eiendom_no.get_similar_units(vector)
|
||||
# Save to cache
|
||||
if similar_units:
|
||||
cache.save_similar_units(conn, enriched.unit_code, "RECENTLY_SOLD", similar_units)
|
||||
_, comps_hash_changed = save_similar_units(
|
||||
conn, enriched.unit_code, "RECENTLY_SOLD", similar_units
|
||||
)
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# 4. Derive deps_hash and check analysis_cache.
|
||||
# ------------------------------------------------------------------
|
||||
deps_hash = _compute_deps_hash(conn, finn_ad.finnkode, unit_code)
|
||||
|
||||
cached_analysis = get_analysis(conn, finn_ad.finnkode, deps_hash)
|
||||
if cached_analysis is not None:
|
||||
logger.debug("analysis_cache hit for %s -- skipping recompute", finn_ad.finnkode)
|
||||
return cached_analysis
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# 5. Cache miss: compute, store, return.
|
||||
# ------------------------------------------------------------------
|
||||
logger.debug(
|
||||
"analysis_cache miss for %s (ad_changed=%s, unit_changed=%s, comps_changed=%s)",
|
||||
finn_ad.finnkode,
|
||||
ad_changed,
|
||||
unit_hash_changed,
|
||||
comps_hash_changed,
|
||||
)
|
||||
|
||||
scores = scoring.score_ad(finn_ad, enriched, similar_units)
|
||||
categories = scoring.classify_ad(scores)
|
||||
@@ -130,10 +213,16 @@ async def analyze_ad(
|
||||
"score": scores,
|
||||
"categories": categories,
|
||||
"summary": summary,
|
||||
"eiendom_unit": enriched.model_dump() if enriched else None,
|
||||
"similar_units": [unit.model_dump() for unit in similar_units],
|
||||
"eiendom_unit": enriched.model_dump(mode="json") if enriched else None,
|
||||
"similar_units": [unit.model_dump(mode="json") for unit in similar_units],
|
||||
}
|
||||
cache.save_finn_ad(conn, finn_ad)
|
||||
|
||||
# Round-trip through JSON to guarantee all values are serialisable
|
||||
# (catches any datetime that survives model_dump, e.g. from scoring).
|
||||
import json as _json
|
||||
result = _json.loads(_json.dumps(result, default=str))
|
||||
|
||||
save_analysis(conn, finn_ad.finnkode, deps_hash, result)
|
||||
return result
|
||||
|
||||
|
||||
@@ -166,7 +255,13 @@ async def analyze_search(
|
||||
client=None,
|
||||
use_cache: bool = True,
|
||||
) -> dict:
|
||||
"""Analyze a FINN search URL and enrich matching listings."""
|
||||
"""Analyze a FINN search URL and enrich matching listings.
|
||||
|
||||
Search-level results are NOT cached as a whole (the search page itself
|
||||
is cached at the HTML level). Individual ad analyses ARE cached via
|
||||
``analyze_ad``, so re-running a search only re-scores ads whose
|
||||
underlying data has changed.
|
||||
"""
|
||||
conn = cache.init_db(FINN_CACHE_PATH)
|
||||
cards = await search.fetch_search_pages(
|
||||
search_url,
|
||||
@@ -177,6 +272,7 @@ async def analyze_search(
|
||||
results = []
|
||||
enriched_count = 0
|
||||
skipped_count = 0
|
||||
cache_hits = 0
|
||||
|
||||
if fetch_details:
|
||||
for card in cards[:detail_limit]:
|
||||
@@ -200,12 +296,14 @@ async def analyze_search(
|
||||
|
||||
if result.get("eiendom_unit"):
|
||||
enriched_count += 1
|
||||
# Track analysis cache hits via the absence of recompute logging
|
||||
# (the flag is not propagated up here; rely on debug logs).
|
||||
results.append(result)
|
||||
|
||||
results.sort(key=lambda item: item["score"].get("total", 0.0), reverse=True)
|
||||
return {
|
||||
"search_url": search_url,
|
||||
"search_cards": [card.model_dump() for card in cards],
|
||||
"search_cards": [card.model_dump(mode="json") for card in cards],
|
||||
"analysis": results,
|
||||
"summary": {
|
||||
"total_listings": len(cards),
|
||||
@@ -213,4 +311,4 @@ async def analyze_search(
|
||||
"skipped_listings": skipped_count,
|
||||
"eiendom_enriched": enriched_count,
|
||||
},
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user