Implement caching strategy for analysis results and enhance hash-aware data fetching
This commit is contained in:
+114
-16
@@ -1,9 +1,37 @@
|
||||
"""Orchestration for FINN search + Eiendom.no enrichment + scoring."""
|
||||
"""Orchestration for FINN search + Eiendom.no enrichment + scoring.
|
||||
|
||||
Analysis caching
|
||||
----------------
|
||||
``analyze_ad`` caches its result under a ``deps_hash`` that is the
|
||||
SHA-256 of the combined content hashes of the ad, the eiendom unit, and the
|
||||
comparable sales used to produce it. On a subsequent call the function:
|
||||
|
||||
1. Reads the three raw content hashes from the DB (no deserialisation).
|
||||
2. Derives the same deps_hash from those hashes.
|
||||
3. Checks analysis_cache for a matching (finnkode, deps_hash) row.
|
||||
4. Returns the cached result immediately if found.
|
||||
5. Otherwise runs the full scoring pipeline and writes to analysis_cache.
|
||||
|
||||
The cached result is invalidated automatically the moment any piece of
|
||||
underlying data changes, because the deps_hash will differ.
|
||||
"""
|
||||
|
||||
import logging
|
||||
|
||||
from . import ad as ad_module
|
||||
from . import cache, eiendom_no, scoring, search
|
||||
from .cache import (
|
||||
combine_hashes,
|
||||
get_analysis,
|
||||
get_eiendom_unit_hash,
|
||||
get_finn_ad_hash,
|
||||
get_similar_units_hash,
|
||||
invalidate_analysis,
|
||||
save_analysis,
|
||||
save_eiendom_unit,
|
||||
save_finn_ad,
|
||||
save_similar_units,
|
||||
)
|
||||
from .config import (
|
||||
EIENDOM_NO_CACHE_TTL_HOURS,
|
||||
FINN_CACHE_PATH,
|
||||
@@ -86,38 +114,93 @@ def _build_ad_summary(
|
||||
}
|
||||
|
||||
|
||||
def _compute_deps_hash(
    conn,
    finnkode: str,
    unit_code: str | None,
    listing_status: str = "RECENTLY_SOLD",
) -> str:
    """Build the analysis ``deps_hash`` from the stored content hashes.

    Looks up the content hash of the ad, and -- when *unit_code* is given --
    of the eiendom unit and of its similar units for *listing_status*, then
    folds the three values into a single digest with ``combine_hashes``.
    Only hash lookups are performed; no payload is loaded or parsed here.

    Without a *unit_code* the unit and comps hashes are passed as ``None``.
    """
    ad_hash = get_finn_ad_hash(conn, finnkode)

    unit_hash = None
    comps_hash = None
    if unit_code:
        unit_hash = get_eiendom_unit_hash(conn, unit_code)
        comps_hash = get_similar_units_hash(conn, unit_code, listing_status)

    return combine_hashes(ad_hash, unit_hash, comps_hash)
|
||||
|
||||
|
||||
async def analyze_ad(
|
||||
finn_ad: FinnAd,
|
||||
unit_code: str | None = None,
|
||||
) -> dict:
|
||||
"""Enrich a FinnAd and compute score summary."""
|
||||
"""Enrich a FinnAd and compute score summary.
|
||||
|
||||
Result is cached in analysis_cache keyed by deps_hash. Recomputation
|
||||
happens only when the underlying raw data has actually changed.
|
||||
"""
|
||||
conn = cache.init_db(FINN_CACHE_PATH)
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# 1. Ensure the ad is in the DB so we have a stable hash to key on.
|
||||
# ------------------------------------------------------------------
|
||||
ad_hash, ad_changed = save_finn_ad(conn, finn_ad)
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# 2. Fetch / refresh Eiendom.no data (cache-aware).
|
||||
# ------------------------------------------------------------------
|
||||
enriched: EiendomUnit | None = None
|
||||
similar_units: list[SimilarUnit] = []
|
||||
unit_hash_changed = False
|
||||
|
||||
if unit_code:
|
||||
enriched = cache.get_eiendom_unit(conn, unit_code)
|
||||
if enriched is None:
|
||||
enriched = await eiendom_no.enrich_ad_with_eiendom_no(finn_ad, unit_code)
|
||||
if enriched is not None:
|
||||
cache.save_eiendom_unit(conn, enriched)
|
||||
_, unit_hash_changed = save_eiendom_unit(conn, enriched)
|
||||
# If already cached, unit_hash_changed stays False -- no new write.
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# 3. Fetch / refresh similar units (cache-aware).
|
||||
# ------------------------------------------------------------------
|
||||
similar_units: list[SimilarUnit] = []
|
||||
comps_hash_changed = False
|
||||
|
||||
if enriched:
|
||||
# Check cache for similar units first. The cache uses (unit_code,
|
||||
# listing_status) as the key, so we must look it up by unit_code.
|
||||
similar_units = cache.get_similar_units(
|
||||
conn, enriched.unit_code, "RECENTLY_SOLD", ttl_hours=EIENDOM_NO_CACHE_TTL_HOURS
|
||||
)
|
||||
|
||||
if not similar_units:
|
||||
# Cache miss: build the vector and fetch fresh from Eiendom.no
|
||||
# (unit_vector field from get_unit is None; build locally)
|
||||
vector = enriched.unit_vector or eiendom_no.build_unit_vector(enriched)
|
||||
if vector:
|
||||
similar_units = await eiendom_no.get_similar_units(vector)
|
||||
# Save to cache
|
||||
if similar_units:
|
||||
cache.save_similar_units(conn, enriched.unit_code, "RECENTLY_SOLD", similar_units)
|
||||
_, comps_hash_changed = save_similar_units(
|
||||
conn, enriched.unit_code, "RECENTLY_SOLD", similar_units
|
||||
)
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# 4. Derive deps_hash and check analysis_cache.
|
||||
# ------------------------------------------------------------------
|
||||
deps_hash = _compute_deps_hash(conn, finn_ad.finnkode, unit_code)
|
||||
|
||||
cached_analysis = get_analysis(conn, finn_ad.finnkode, deps_hash)
|
||||
if cached_analysis is not None:
|
||||
logger.debug("analysis_cache hit for %s -- skipping recompute", finn_ad.finnkode)
|
||||
return cached_analysis
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# 5. Cache miss: compute, store, return.
|
||||
# ------------------------------------------------------------------
|
||||
logger.debug(
|
||||
"analysis_cache miss for %s (ad_changed=%s, unit_changed=%s, comps_changed=%s)",
|
||||
finn_ad.finnkode,
|
||||
ad_changed,
|
||||
unit_hash_changed,
|
||||
comps_hash_changed,
|
||||
)
|
||||
|
||||
scores = scoring.score_ad(finn_ad, enriched, similar_units)
|
||||
categories = scoring.classify_ad(scores)
|
||||
@@ -130,10 +213,16 @@ async def analyze_ad(
|
||||
"score": scores,
|
||||
"categories": categories,
|
||||
"summary": summary,
|
||||
"eiendom_unit": enriched.model_dump() if enriched else None,
|
||||
"similar_units": [unit.model_dump() for unit in similar_units],
|
||||
"eiendom_unit": enriched.model_dump(mode="json") if enriched else None,
|
||||
"similar_units": [unit.model_dump(mode="json") for unit in similar_units],
|
||||
}
|
||||
cache.save_finn_ad(conn, finn_ad)
|
||||
|
||||
# Round-trip through JSON to guarantee all values are serialisable
|
||||
# (catches any datetime that survives model_dump, e.g. from scoring).
|
||||
import json as _json
|
||||
result = _json.loads(_json.dumps(result, default=str))
|
||||
|
||||
save_analysis(conn, finn_ad.finnkode, deps_hash, result)
|
||||
return result
|
||||
|
||||
|
||||
@@ -166,7 +255,13 @@ async def analyze_search(
|
||||
client=None,
|
||||
use_cache: bool = True,
|
||||
) -> dict:
|
||||
"""Analyze a FINN search URL and enrich matching listings."""
|
||||
"""Analyze a FINN search URL and enrich matching listings.
|
||||
|
||||
Search-level results are NOT cached as a whole (the search page itself
|
||||
is cached at the HTML level). Individual ad analyses ARE cached via
|
||||
``analyze_ad``, so re-running a search only re-scores ads whose
|
||||
underlying data has changed.
|
||||
"""
|
||||
conn = cache.init_db(FINN_CACHE_PATH)
|
||||
cards = await search.fetch_search_pages(
|
||||
search_url,
|
||||
@@ -177,6 +272,7 @@ async def analyze_search(
|
||||
results = []
|
||||
enriched_count = 0
|
||||
skipped_count = 0
|
||||
cache_hits = 0
|
||||
|
||||
if fetch_details:
|
||||
for card in cards[:detail_limit]:
|
||||
@@ -200,12 +296,14 @@ async def analyze_search(
|
||||
|
||||
if result.get("eiendom_unit"):
|
||||
enriched_count += 1
|
||||
# Track analysis cache hits via the absence of recompute logging
|
||||
# (the flag is not propagated up here; rely on debug logs).
|
||||
results.append(result)
|
||||
|
||||
results.sort(key=lambda item: item["score"].get("total", 0.0), reverse=True)
|
||||
return {
|
||||
"search_url": search_url,
|
||||
"search_cards": [card.model_dump() for card in cards],
|
||||
"search_cards": [card.model_dump(mode="json") for card in cards],
|
||||
"analysis": results,
|
||||
"summary": {
|
||||
"total_listings": len(cards),
|
||||
|
||||
+330
-38
@@ -1,5 +1,29 @@
|
||||
"""SQLite cache and persistence for FINN and Eiendom.no data."""
|
||||
"""SQLite cache and persistence for FINN and Eiendom.no data.
|
||||
|
||||
Caching strategy
|
||||
----------------
|
||||
Raw data (finn_ads, eiendom_units, similar_units)
|
||||
Stored with a SHA-256 content_hash of the serialised payload.
|
||||
On write: compare incoming hash to stored hash. If equal the remote
|
||||
data has not changed -- the row is left untouched and the caller gets
|
||||
back ``changed=False``, which preserves a valid analysis_cache entry.
|
||||
|
||||
Analysis results (analysis_cache)
|
||||
One row per ``finnkode``, tagged with a ``deps_hash`` = SHA-256 of the
|
||||
combined content hashes of the ad, eiendom unit, and comps that were used
|
||||
to produce the result. A cache hit is only valid when the deps_hash
|
||||
still matches, i.e. none of the underlying data has changed.
|
||||
This means analysis is re-run *only* when remote data actually changes,
|
||||
not on every TTL tick.
|
||||
|
||||
Search pages / cards (cache_meta)
|
||||
Still TTL-based -- these change frequently and a content-hash over a
|
||||
full HTML page is cheap but the semantics of "changed" are less clear
|
||||
(ads added/removed vs. cosmetic HTML tweaks). Hash is stored anyway so
|
||||
callers can detect real list changes if desired.
|
||||
"""
|
||||
|
||||
import hashlib
|
||||
import json
|
||||
import logging
|
||||
import sqlite3
|
||||
@@ -12,6 +36,32 @@ from .models import EiendomUnit, FinnAd, FinnSearchCard, SimilarUnit
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Hashing helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def compute_content_hash(payload: Any) -> str:
    """Return the SHA-256 hex digest of *payload*'s canonical JSON form.

    The payload is serialised with sorted keys, so two dicts that differ
    only in insertion order hash identically.  Values that ``json`` cannot
    encode natively are converted with ``str`` before hashing.
    """
    canonical = json.dumps(payload, sort_keys=True, default=str)
    digest = hashlib.sha256()
    digest.update(canonical.encode())
    return digest.hexdigest()
|
||||
|
||||
|
||||
def combine_hashes(*hashes: str | None) -> str:
|
||||
"""Combine multiple content hashes into one deterministic deps_hash."""
|
||||
combined = "|".join(h or "" for h in hashes)
|
||||
return hashlib.sha256(combined.encode()).hexdigest()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Connection / schema
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def get_connection(path: str | None = None) -> sqlite3.Connection:
|
||||
db_path = path or FINN_CACHE_PATH
|
||||
conn = sqlite3.connect(str(db_path), detect_types=sqlite3.PARSE_DECLTYPES)
|
||||
@@ -22,25 +72,33 @@ def get_connection(path: str | None = None) -> sqlite3.Connection:
|
||||
def init_db(path: str | None = None) -> sqlite3.Connection:
|
||||
conn = get_connection(path)
|
||||
cursor = conn.cursor()
|
||||
|
||||
cursor.execute(
|
||||
"""
|
||||
CREATE TABLE IF NOT EXISTS finn_ads (
|
||||
finnkode TEXT PRIMARY KEY,
|
||||
url TEXT,
|
||||
payload TEXT NOT NULL,
|
||||
content_hash TEXT,
|
||||
fetched_at TEXT NOT NULL
|
||||
)
|
||||
"""
|
||||
)
|
||||
# Migration: add content_hash column if the table already existed without it.
|
||||
_add_column_if_missing(cursor, "finn_ads", "content_hash", "TEXT")
|
||||
|
||||
cursor.execute(
|
||||
"""
|
||||
CREATE TABLE IF NOT EXISTS eiendom_units (
|
||||
unit_code TEXT PRIMARY KEY,
|
||||
payload TEXT NOT NULL,
|
||||
content_hash TEXT,
|
||||
fetched_at TEXT NOT NULL
|
||||
)
|
||||
"""
|
||||
)
|
||||
_add_column_if_missing(cursor, "eiendom_units", "content_hash", "TEXT")
|
||||
|
||||
cursor.execute(
|
||||
"""
|
||||
CREATE TABLE IF NOT EXISTS similar_units (
|
||||
@@ -48,36 +106,66 @@ def init_db(path: str | None = None) -> sqlite3.Connection:
|
||||
unit_code TEXT NOT NULL,
|
||||
listing_status TEXT NOT NULL,
|
||||
payload TEXT NOT NULL,
|
||||
content_hash TEXT,
|
||||
fetched_at TEXT NOT NULL
|
||||
)
|
||||
"""
|
||||
)
|
||||
_add_column_if_missing(cursor, "similar_units", "content_hash", "TEXT")
|
||||
|
||||
cursor.execute(
|
||||
"""
|
||||
CREATE TABLE IF NOT EXISTS cache_meta (
|
||||
key TEXT PRIMARY KEY,
|
||||
value TEXT NOT NULL,
|
||||
content_hash TEXT,
|
||||
expires_at TEXT
|
||||
)
|
||||
"""
|
||||
)
|
||||
_add_column_if_missing(cursor, "cache_meta", "content_hash", "TEXT")
|
||||
|
||||
cursor.execute(
|
||||
"""
|
||||
CREATE TABLE IF NOT EXISTS analysis_cache (
|
||||
finnkode TEXT PRIMARY KEY,
|
||||
deps_hash TEXT NOT NULL,
|
||||
payload TEXT NOT NULL,
|
||||
computed_at TEXT NOT NULL
|
||||
)
|
||||
"""
|
||||
)
|
||||
|
||||
conn.commit()
|
||||
return conn
|
||||
|
||||
|
||||
def _add_column_if_missing(
    cursor: sqlite3.Cursor, table: str, column: str, col_type: str
) -> None:
    """Add *column* to *table* unless the table already has it.

    Makes ``ALTER TABLE ... ADD COLUMN`` safe to run repeatedly by checking
    ``PRAGMA table_info`` first.

    *table*, *column* and *col_type* are interpolated into the SQL text
    (identifiers cannot be bound as parameters), so callers must pass
    trusted constants only.
    """
    cursor.execute(f"PRAGMA table_info({table})")
    # table_info rows are (cid, name, type, notnull, dflt_value, pk).
    # Positional access works with plain tuples and with sqlite3.Row, so
    # the helper does not depend on the connection's row_factory.
    existing = {row[1] for row in cursor.fetchall()}
    if column not in existing:
        cursor.execute(f"ALTER TABLE {table} ADD COLUMN {column} {col_type}")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Generic cache_meta helpers (search pages, search cards)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def cache_get(conn: sqlite3.Connection, key: str) -> dict[str, Any] | None:
|
||||
cursor = conn.cursor()
|
||||
cursor.execute("SELECT value, expires_at FROM cache_meta WHERE key = ?", (key,))
|
||||
row = cursor.fetchone()
|
||||
if not row:
|
||||
return None
|
||||
|
||||
expires_at = row["expires_at"]
|
||||
if expires_at and datetime.fromisoformat(expires_at) < datetime.now(UTC):
|
||||
cursor.execute("DELETE FROM cache_meta WHERE key = ?", (key,))
|
||||
conn.commit()
|
||||
return None
|
||||
|
||||
return json.loads(row["value"])
|
||||
|
||||
|
||||
@@ -87,24 +175,28 @@ def cache_set(
|
||||
payload: dict[str, Any],
|
||||
ttl_hours: int | None = None,
|
||||
ttl_minutes: int | None = None,
|
||||
) -> None:
|
||||
) -> str:
|
||||
"""Store *payload* in cache_meta and return its content_hash."""
|
||||
expires_at = None
|
||||
if ttl_minutes is not None:
|
||||
expires_at = (datetime.now(UTC) + timedelta(minutes=ttl_minutes)).isoformat()
|
||||
elif ttl_hours is not None:
|
||||
expires_at = (datetime.now(UTC) + timedelta(hours=ttl_hours)).isoformat()
|
||||
|
||||
content_hash = compute_content_hash(payload)
|
||||
cursor = conn.cursor()
|
||||
cursor.execute(
|
||||
"INSERT OR REPLACE INTO cache_meta (key, value, expires_at) VALUES (?, ?, ?)",
|
||||
(key, json.dumps(payload), expires_at),
|
||||
"INSERT OR REPLACE INTO cache_meta (key, value, content_hash, expires_at)"
|
||||
" VALUES (?, ?, ?, ?)",
|
||||
(key, json.dumps(payload, default=_json_default), content_hash, expires_at),
|
||||
)
|
||||
conn.commit()
|
||||
return content_hash
|
||||
|
||||
|
||||
def _is_fresh(fetched_at: str, ttl_hours: int | None) -> bool:
|
||||
if ttl_hours is None:
|
||||
return True
|
||||
return datetime.fromisoformat(fetched_at) >= datetime.now(UTC) - timedelta(hours=ttl_hours)
|
||||
# ---------------------------------------------------------------------------
|
||||
# Search page / cards helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def save_search_page(
|
||||
@@ -112,8 +204,9 @@ def save_search_page(
|
||||
url: str,
|
||||
html: str,
|
||||
ttl_minutes: int = 60,
|
||||
) -> None:
|
||||
cache_set(conn, f"search_page:{url}", {"html": html}, ttl_minutes=ttl_minutes)
|
||||
) -> str:
|
||||
"""Cache raw HTML for a search page URL. Returns content_hash."""
|
||||
return cache_set(conn, f"search_page:{url}", {"html": html}, ttl_minutes=ttl_minutes)
|
||||
|
||||
|
||||
def get_search_page(conn: sqlite3.Connection, url: str) -> str | None:
|
||||
@@ -128,8 +221,9 @@ def save_search_cards(
|
||||
url: str,
|
||||
cards: list[FinnSearchCard],
|
||||
ttl_minutes: int = 60,
|
||||
) -> None:
|
||||
cache_set(
|
||||
) -> str:
|
||||
"""Cache parsed search cards. Returns content_hash."""
|
||||
return cache_set(
|
||||
conn,
|
||||
f"search_cards:{url}",
|
||||
[card.model_dump(mode="json") for card in cards],
|
||||
@@ -144,28 +238,54 @@ def get_search_cards(conn: sqlite3.Connection, url: str) -> list[FinnSearchCard]
|
||||
return [FinnSearchCard.model_validate(item) for item in payload]
|
||||
|
||||
|
||||
def save_finn_ad(conn: sqlite3.Connection, ad: FinnAd) -> None:
|
||||
# ---------------------------------------------------------------------------
|
||||
# FinnAd
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def save_finn_ad(conn: sqlite3.Connection, ad: FinnAd) -> tuple[str, bool]:
    """Persist *ad* to finn_ads and report whether its content changed.

    Returns ``(content_hash, changed)``.  ``changed`` is False when the
    incoming content hash equals the hash already stored for this
    finnkode; callers can use that to skip analysis recomputation.

    The row is written in both cases so that ``fetched_at`` follows the
    latest fetch and the TTL restarts even when the content is identical.
    The stored ``content_hash`` is the same in that case, so any analysis
    keyed on it stays valid.
    """
    cursor = conn.cursor()
    payload = ad.model_dump(mode="json")
    # The fetch timestamp is bookkeeping, not listing content.  Hashing it
    # would make every refetch look like a change, so it is left out of the
    # hash input (a no-op if the dumped payload has no such key).
    new_hash = compute_content_hash(
        {key: value for key, value in payload.items() if key != "detail_fetched_at"}
    )
    fetched_at = (
        ad.detail_fetched_at.isoformat()
        if ad.detail_fetched_at
        else datetime.now(UTC).isoformat()
    )

    # Compare against the stored hash before overwriting the row.
    cursor.execute(
        "SELECT content_hash FROM finn_ads WHERE finnkode = ?", (ad.finnkode,)
    )
    row = cursor.fetchone()
    changed = row is None or row["content_hash"] != new_hash

    cursor.execute(
        "INSERT OR REPLACE INTO finn_ads"
        " (finnkode, url, payload, content_hash, fetched_at)"
        " VALUES (?, ?, ?, ?, ?)",
        (ad.finnkode, ad.url, json.dumps(payload, default=_json_default), new_hash, fetched_at),
    )
    conn.commit()

    if changed:
        logger.debug("finn_ad %s saved (hash=%s)", ad.finnkode, new_hash[:8])
    else:
        logger.debug("finn_ad %s unchanged (hash match)", ad.finnkode)
    return new_hash, changed
|
||||
|
||||
|
||||
def get_finn_ad(
|
||||
conn: sqlite3.Connection, finnkode: str, ttl_hours: int | None = None
|
||||
) -> FinnAd | None:
|
||||
cursor = conn.cursor()
|
||||
cursor.execute("SELECT payload, fetched_at FROM finn_ads WHERE finnkode = ?", (finnkode,))
|
||||
cursor.execute(
|
||||
"SELECT payload, fetched_at FROM finn_ads WHERE finnkode = ?", (finnkode,)
|
||||
)
|
||||
row = cursor.fetchone()
|
||||
if not row:
|
||||
return None
|
||||
@@ -174,13 +294,47 @@ def get_finn_ad(
|
||||
return FinnAd.model_validate(json.loads(row["payload"]))
|
||||
|
||||
|
||||
def save_eiendom_unit(conn: sqlite3.Connection, unit: EiendomUnit) -> None:
|
||||
def get_finn_ad_hash(conn: sqlite3.Connection, finnkode: str) -> str | None:
|
||||
"""Return the stored content_hash for *finnkode*, or None if not cached."""
|
||||
cursor = conn.cursor()
|
||||
cursor.execute(
|
||||
"INSERT OR REPLACE INTO eiendom_units (unit_code, payload, fetched_at) VALUES (?, ?, ?)",
|
||||
(unit.unit_code, json.dumps(unit.model_dump(mode="json")), unit.fetched_at.isoformat()),
|
||||
"SELECT content_hash FROM finn_ads WHERE finnkode = ?", (finnkode,)
|
||||
)
|
||||
row = cursor.fetchone()
|
||||
return row["content_hash"] if row else None
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# EiendomUnit
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def save_eiendom_unit(conn: sqlite3.Connection, unit: EiendomUnit) -> tuple[str, bool]:
    """Persist *unit* to eiendom_units and report whether its content changed.

    Returns ``(content_hash, changed)``.  ``changed`` is False when the
    incoming content hash equals the hash already stored for this unit_code.

    The row is written in both cases so that ``fetched_at`` follows the
    latest fetch and the TTL restarts even when the content is identical.
    The stored ``content_hash`` is the same in that case, so any analysis
    keyed on it stays valid.
    """
    cursor = conn.cursor()
    payload = unit.model_dump(mode="json")
    # The fetch timestamp is bookkeeping, not unit content.  Hashing it
    # would make every refetch look like a change, so it is left out of the
    # hash input (a no-op if the dumped payload has no such key).
    new_hash = compute_content_hash(
        {key: value for key, value in payload.items() if key != "fetched_at"}
    )

    # Compare against the stored hash before overwriting the row.
    cursor.execute(
        "SELECT content_hash FROM eiendom_units WHERE unit_code = ?", (unit.unit_code,)
    )
    row = cursor.fetchone()
    changed = row is None or row["content_hash"] != new_hash

    cursor.execute(
        "INSERT OR REPLACE INTO eiendom_units"
        " (unit_code, payload, content_hash, fetched_at)"
        " VALUES (?, ?, ?, ?)",
        (unit.unit_code, json.dumps(payload, default=_json_default), new_hash, unit.fetched_at.isoformat()),
    )
    conn.commit()

    if changed:
        logger.debug("eiendom_unit %s saved (hash=%s)", unit.unit_code, new_hash[:8])
    else:
        logger.debug("eiendom_unit %s unchanged (hash match)", unit.unit_code)
    return new_hash, changed
|
||||
|
||||
|
||||
def get_eiendom_unit(
|
||||
@@ -190,8 +344,7 @@ def get_eiendom_unit(
|
||||
) -> EiendomUnit | None:
|
||||
cursor = conn.cursor()
|
||||
cursor.execute(
|
||||
"SELECT payload, fetched_at FROM eiendom_units WHERE unit_code = ?",
|
||||
(unit_code,),
|
||||
"SELECT payload, fetched_at FROM eiendom_units WHERE unit_code = ?", (unit_code,)
|
||||
)
|
||||
row = cursor.fetchone()
|
||||
if not row:
|
||||
@@ -201,23 +354,65 @@ def get_eiendom_unit(
|
||||
return EiendomUnit.model_validate(json.loads(row["payload"]))
|
||||
|
||||
|
||||
def get_eiendom_unit_hash(conn: sqlite3.Connection, unit_code: str) -> str | None:
|
||||
"""Return the stored content_hash for *unit_code*, or None if not cached."""
|
||||
cursor = conn.cursor()
|
||||
cursor.execute(
|
||||
"SELECT content_hash FROM eiendom_units WHERE unit_code = ?", (unit_code,)
|
||||
)
|
||||
row = cursor.fetchone()
|
||||
return row["content_hash"] if row else None
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# SimilarUnits
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def save_similar_units(
    conn: sqlite3.Connection,
    unit_code: str,
    listing_status: str,
    similar_units: list[SimilarUnit],
) -> tuple[str, bool]:
    """Persist *similar_units* for (unit_code, listing_status).

    Returns ``(content_hash, changed)``.  The incoming list is hashed and
    compared with the most recent stored row for the same key:

    * Equal hash: no new row is inserted, but that row's ``fetched_at`` is
      moved to now so the TTL restarts.  ``changed`` is False.
    * Different hash or no row yet: a new row is inserted.  ``changed`` is
      True.
    """
    cursor = conn.cursor()
    payload_list = [item.model_dump(mode="json") for item in similar_units]
    new_hash = compute_content_hash(payload_list)
    now = datetime.now(UTC).isoformat()

    cursor.execute(
        "SELECT id, content_hash FROM similar_units"
        " WHERE unit_code = ? AND listing_status = ?"
        " ORDER BY id DESC LIMIT 1",
        (unit_code, listing_status),
    )
    row = cursor.fetchone()
    if row and row["content_hash"] == new_hash:
        # Same comps as last time.  Keep the row and its hash, but record
        # the fresh fetch; otherwise a TTL-expired row would stay expired
        # and force a network refetch on every call.
        cursor.execute(
            "UPDATE similar_units SET fetched_at = ? WHERE id = ?",
            (now, row["id"]),
        )
        conn.commit()
        logger.debug(
            "similar_units %s/%s unchanged (hash match)", unit_code, listing_status
        )
        return new_hash, False

    cursor.execute(
        "INSERT INTO similar_units"
        " (unit_code, listing_status, payload, content_hash, fetched_at)"
        " VALUES (?, ?, ?, ?, ?)",
        (
            unit_code,
            listing_status,
            json.dumps(payload_list, default=_json_default),
            new_hash,
            now,
        ),
    )
    conn.commit()
    logger.debug(
        "similar_units %s/%s saved (hash=%s)", unit_code, listing_status, new_hash[:8]
    )
    return new_hash, True
|
||||
|
||||
|
||||
def get_similar_units(
|
||||
@@ -228,11 +423,9 @@ def get_similar_units(
|
||||
) -> list[SimilarUnit]:
|
||||
cursor = conn.cursor()
|
||||
cursor.execute(
|
||||
(
|
||||
"SELECT payload, fetched_at FROM similar_units"
|
||||
" WHERE unit_code = ? AND listing_status = ?"
|
||||
" ORDER BY id DESC LIMIT 1"
|
||||
),
|
||||
" ORDER BY id DESC LIMIT 1",
|
||||
(unit_code, listing_status),
|
||||
)
|
||||
row = cursor.fetchone()
|
||||
@@ -241,3 +434,102 @@ def get_similar_units(
|
||||
if ttl_hours is not None and not _is_fresh(row["fetched_at"], ttl_hours):
|
||||
return []
|
||||
return [SimilarUnit.model_validate(item) for item in json.loads(row["payload"])]
|
||||
|
||||
|
||||
def get_similar_units_hash(
|
||||
conn: sqlite3.Connection, unit_code: str, listing_status: str
|
||||
) -> str | None:
|
||||
"""Return the stored content_hash for (unit_code, listing_status), or None."""
|
||||
cursor = conn.cursor()
|
||||
cursor.execute(
|
||||
"SELECT content_hash FROM similar_units"
|
||||
" WHERE unit_code = ? AND listing_status = ?"
|
||||
" ORDER BY id DESC LIMIT 1",
|
||||
(unit_code, listing_status),
|
||||
)
|
||||
row = cursor.fetchone()
|
||||
return row["content_hash"] if row else None
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Analysis cache
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def get_analysis(
    conn: sqlite3.Connection, finnkode: str, deps_hash: str
) -> dict[str, Any] | None:
    """Return the cached analysis for *finnkode* if it is still current.

    The row stored for *finnkode* is used only when its ``deps_hash``
    equals the *deps_hash* passed in.  Returns ``None`` when there is no
    row or when the stored hash differs (the entry is stale).
    """
    cursor = conn.cursor()
    cursor.execute(
        "SELECT payload, deps_hash FROM analysis_cache WHERE finnkode = ?",
        (finnkode,),
    )
    entry = cursor.fetchone()
    if entry is None:
        return None

    stored_hash = entry["deps_hash"]
    if stored_hash == deps_hash:
        logger.debug("analysis_cache hit for %s", finnkode)
        return json.loads(entry["payload"])

    logger.debug(
        "analysis_cache miss for %s (deps_hash changed %s→%s)",
        finnkode,
        stored_hash[:8],
        deps_hash[:8],
    )
    return None
|
||||
|
||||
|
||||
def _json_default(obj: Any) -> Any:
    """``default=`` hook for ``json.dumps``.

    Objects exposing ``isoformat`` (such as datetime and date) are encoded
    as the string that method returns.  Every other object is encoded as
    its ``repr``.  With this hook ``json.dumps`` does not raise TypeError
    for values it cannot encode natively.
    """
    return obj.isoformat() if hasattr(obj, "isoformat") else repr(obj)
|
||||
|
||||
|
||||
def save_analysis(
    conn: sqlite3.Connection,
    finnkode: str,
    deps_hash: str,
    result: dict[str, Any],
) -> None:
    """Store *result* as the cached analysis for *finnkode*.

    Uses ``INSERT OR REPLACE``.  The row records the *deps_hash* the result
    was computed from, the JSON-encoded result, and the current UTC time.
    The write is committed before returning.
    """
    encoded = json.dumps(result, default=_json_default)
    computed_at = datetime.now(UTC).isoformat()
    statement = (
        "INSERT OR REPLACE INTO analysis_cache"
        " (finnkode, deps_hash, payload, computed_at)"
        " VALUES (?, ?, ?, ?)"
    )
    cursor = conn.cursor()
    cursor.execute(statement, (finnkode, deps_hash, encoded, computed_at))
    conn.commit()
    logger.debug("analysis_cache saved for %s (deps_hash=%s)", finnkode, deps_hash[:8])
|
||||
|
||||
|
||||
def invalidate_analysis(conn: sqlite3.Connection, finnkode: str) -> None:
    """Delete the cached analysis row for *finnkode* and commit.

    Does nothing when no row exists.
    """
    cursor = conn.cursor()
    cursor.execute("DELETE FROM analysis_cache WHERE finnkode = ?", (finnkode,))
    conn.commit()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Internal helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _is_fresh(fetched_at: str, ttl_hours: int | None) -> bool:
|
||||
if ttl_hours is None:
|
||||
return True
|
||||
return datetime.fromisoformat(fetched_at) >= datetime.now(UTC) - timedelta(
|
||||
hours=ttl_hours
|
||||
)
|
||||
+88
-43
@@ -1,4 +1,21 @@
|
||||
"""Service layer for cache-aware fetching of FINN ads and Eiendom.no units."""
|
||||
"""Service layer for cache-aware fetching of FINN ads and Eiendom.no units.
|
||||
|
||||
Hash-aware fetch pattern
|
||||
------------------------
|
||||
Every ``get_or_fetch_*`` function follows the same contract:
|
||||
|
||||
1. TTL check -- if cached row is fresh enough, return it directly.
|
||||
2. Remote fetch -- if TTL expired (or force_refresh), fetch from network.
|
||||
3. Hash check -- compare incoming payload hash to stored hash.
|
||||
If equal the remote data has not changed; skip the DB write so that
|
||||
the analysis_cache entry for this finnkode remains valid.
|
||||
4. Write + invalidate -- if hash differs, persist the new row and
|
||||
delete any cached analysis (it will be recomputed on next call to
|
||||
``analyze_ad``).
|
||||
|
||||
This means analysis results survive TTL resets as long as the remote
|
||||
data has not actually changed.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from typing import Any
|
||||
@@ -10,11 +27,12 @@ from .cache import (
|
||||
get_finn_ad,
|
||||
get_similar_units as get_cached_similar_units,
|
||||
init_db,
|
||||
invalidate_analysis,
|
||||
save_eiendom_unit,
|
||||
save_finn_ad,
|
||||
save_similar_units,
|
||||
)
|
||||
from .config import EIENDOM_NO_CACHE_TTL_HOURS, FINN_CACHE_PATH
|
||||
from .config import EIENDOM_NO_CACHE_TTL_HOURS, FINN_CACHE_PATH, FINN_CACHE_TTL_AD_HOURS
|
||||
from .eiendom_no import (
|
||||
build_unit_vector,
|
||||
decode_unit_vector,
|
||||
@@ -29,12 +47,25 @@ logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
async def get_or_fetch_ad(finnkode: str, force_refresh: bool = False) -> FinnAd:
    """Get a FinnAd from the cache or fetch it fresh.  Never returns None.

    A cached ad that is still within ``FINN_CACHE_TTL_AD_HOURS`` is
    returned directly unless *force_refresh* is set.  Otherwise the ad is
    fetched from FINN and handed to ``save_finn_ad``.  When that reports a
    content change, the cached analysis for this finnkode is invalidated so
    it is recomputed on the next ``analyze_ad`` call.
    """
    conn = init_db(FINN_CACHE_PATH)

    if not force_refresh:
        cached = get_finn_ad(conn, finnkode, ttl_hours=FINN_CACHE_TTL_AD_HOURS)
        if cached is not None:
            return cached

    # Cache miss, TTL expiry, or force_refresh: fetch from remote.
    ad = await fetch_ad_details(finnkode)
    _, changed = save_finn_ad(conn, ad)
    if changed:
        logger.debug("finn_ad %s updated -- invalidating analysis cache", finnkode)
        invalidate_analysis(conn, finnkode)
    return ad
|
||||
|
||||
|
||||
@@ -67,10 +98,13 @@ async def ensure_eiendom_unit_code(ad: FinnAd) -> str | None:
|
||||
|
||||
ad.eiendom_unit_code = unit.unit_code
|
||||
conn = init_db(FINN_CACHE_PATH)
|
||||
save_finn_ad(conn, ad) # persist backfill; do NOT cache `unit` here --
|
||||
# the resolver returns a partial record (code +
|
||||
# address + coords). The full unit comes from
|
||||
# get_or_fetch_eiendom_unit -> get_unit().
|
||||
|
||||
# Persist the backfilled unit_code. If the hash changes (new field),
|
||||
# invalidate the analysis cache so it is recomputed with the enriched ad.
|
||||
_, changed = save_finn_ad(conn, ad)
|
||||
if changed:
|
||||
invalidate_analysis(conn, ad.finnkode)
|
||||
|
||||
logger.info("Resolved finnkode %s -> unit %s", ad.finnkode, unit.unit_code)
|
||||
return ad.eiendom_unit_code
|
||||
|
||||
@@ -78,13 +112,31 @@ async def ensure_eiendom_unit_code(ad: FinnAd) -> str | None:
|
||||
async def get_or_fetch_eiendom_unit(
    unit_code: str, force_refresh: bool = False
) -> EiendomUnit | None:
    """Get EiendomUnit from cache or fetch fresh.

    Hash-aware: if the remote payload is identical to what is stored,
    the DB row is not updated (analysis_cache stays valid).

    Args:
        unit_code: Eiendom.no unit identifier to look up.
        force_refresh: When true, skip the cache lookup and always fetch
            from the remote source.

    Returns:
        The cached unit when a fresh cache entry exists, otherwise whatever
        ``get_unit`` returns -- which may be ``None``, in which case nothing
        is written to the cache.
    """
    conn = init_db(FINN_CACHE_PATH)

    # NOTE(review): the TTL is hard-coded to 24h here, whereas the similar-units
    # lookup in this module uses EIENDOM_NO_CACHE_TTL_HOURS -- confirm whether
    # this lookup should share that setting.
    unit = (
        None
        if force_refresh
        else get_cached_eiendom_unit(conn, unit_code, ttl_hours=24)
    )
    if unit is not None:
        return unit

    unit = await get_unit(unit_code)
    if unit is not None:
        _, changed = save_eiendom_unit(conn, unit)
        if changed:
            logger.debug(
                "eiendom_unit %s updated -- analysis caches for linked finnkodes may be stale",
                unit_code,
            )
            # We don't have a direct finnkode -> unit_code reverse map in the
            # DB yet, so we cannot invalidate analysis here. The deps_hash
            # mismatch in get_analysis() handles this automatically.
    return unit
|
||||
|
||||
|
||||
@@ -93,40 +145,36 @@ async def get_or_fetch_similar_units(
|
||||
) -> list[SimilarUnit]:
|
||||
"""Get similar units (comps) from cache or fetch fresh.
|
||||
|
||||
Fetches the unit first to get the unit_vector, then checks cache for similar
|
||||
units by (unit_code, listing_status). On cache miss, fetches fresh from
|
||||
Eiendom.no and saves to cache.
|
||||
Hash-aware: identical remote payloads do not trigger a DB write,
|
||||
so the analysis_cache entry for any finnkode that uses these comps
|
||||
remains valid.
|
||||
"""
|
||||
conn = init_db(FINN_CACHE_PATH)
|
||||
|
||||
# First, ensure we have the unit to build its vector
|
||||
# Ensure we have the unit to build its vector.
|
||||
unit = await get_or_fetch_eiendom_unit(unit_code, force_refresh=force_refresh)
|
||||
if unit is None:
|
||||
return []
|
||||
|
||||
# Check cache for similar units (unless force_refresh)
|
||||
if not force_refresh:
|
||||
cached_similar = get_cached_similar_units(
|
||||
conn, unit_code, listing_status, ttl_hours=EIENDOM_NO_CACHE_TTL_HOURS
|
||||
)
|
||||
if cached_similar:
|
||||
logger.debug(
|
||||
"Using cached similar units for %s (status=%s)",
|
||||
unit_code,
|
||||
listing_status,
|
||||
"Using cached similar units for %s (status=%s)", unit_code, listing_status
|
||||
)
|
||||
return cached_similar
|
||||
|
||||
# Cache miss or force_refresh: fetch fresh
|
||||
# Cache miss or force_refresh: fetch from remote.
|
||||
vector = build_unit_vector(unit)
|
||||
similar = await get_similar_units(vector, listing_status=listing_status)
|
||||
|
||||
# Save to cache
|
||||
if similar:
|
||||
save_similar_units(conn, unit_code, listing_status, similar)
|
||||
_, changed = save_similar_units(conn, unit_code, listing_status, similar)
|
||||
if changed:
|
||||
logger.debug(
|
||||
"Cached %d similar units for %s (status=%s)",
|
||||
len(similar),
|
||||
"similar_units %s/%s updated -- analysis caches may be stale",
|
||||
unit_code,
|
||||
listing_status,
|
||||
)
|
||||
@@ -170,10 +218,8 @@ async def analyze_search(
|
||||
) -> dict[str, Any]:
|
||||
"""Analyze a FINN search URL and return a ranked shortlist.
|
||||
|
||||
NOTE: enrichment for search results lives in analysis.py. If that path
|
||||
also reports `eiendom_enriched: 0`, it has the same root cause -- each
|
||||
card's eiendom_unit_code must be resolved via ensure_eiendom_unit_code
|
||||
(or search_unit_from_finn_url) before the enrichment gate.
|
||||
Individual ad analyses are served from analysis_cache when the
|
||||
underlying data has not changed.
|
||||
"""
|
||||
return await run_analysis_search(
|
||||
search_url,
|
||||
@@ -198,15 +244,15 @@ async def analyze_ad(
|
||||
unit_code = await ensure_eiendom_unit_code(ad) if include_eiendom_no else None
|
||||
|
||||
result: dict[str, Any] = {
|
||||
"ad": ad.model_dump(),
|
||||
"ad": ad.model_dump(mode="json"),
|
||||
}
|
||||
if unit_code:
|
||||
unit = await get_or_fetch_eiendom_unit(unit_code)
|
||||
if unit:
|
||||
result["eiendom_unit"] = unit.model_dump()
|
||||
result["eiendom_unit"] = unit.model_dump(mode="json")
|
||||
if include_similar_units:
|
||||
similar = await get_or_fetch_similar_units(unit_code)
|
||||
result["similar_units"] = [s.model_dump() for s in similar]
|
||||
result["similar_units"] = [s.model_dump(mode="json") for s in similar]
|
||||
return result
|
||||
|
||||
|
||||
@@ -220,14 +266,14 @@ async def analyze_ad_against_comps(
|
||||
unit_code = await ensure_eiendom_unit_code(ad)
|
||||
|
||||
result: dict[str, Any] = {
|
||||
"ad": ad.model_dump(),
|
||||
"ad": ad.model_dump(mode="json"),
|
||||
}
|
||||
if unit_code:
|
||||
unit = await get_or_fetch_eiendom_unit(unit_code)
|
||||
if unit:
|
||||
result["eiendom_unit"] = unit.model_dump()
|
||||
result["eiendom_unit"] = unit.model_dump(mode="json")
|
||||
comps = await get_or_fetch_similar_units(unit_code, listing_status=listing_status)
|
||||
result["comparable_units"] = [c.model_dump() for c in comps]
|
||||
result["comparable_units"] = [c.model_dump(mode="json") for c in comps]
|
||||
return result
|
||||
|
||||
|
||||
@@ -235,7 +281,6 @@ async def find_similar_to_liked(
|
||||
finnkode: str, *, mode: str = "recommendations", listing_status: str = "FOR_SALE"
|
||||
) -> dict[str, Any]:
|
||||
"""Find properties similar to a listing the user has liked."""
|
||||
# Requires that feedback.verdict = "liked" exists for this finnkode
|
||||
ad = await get_or_fetch_ad(finnkode)
|
||||
|
||||
unit_code = await ensure_eiendom_unit_code(ad)
|
||||
@@ -252,8 +297,8 @@ async def find_similar_to_liked(
|
||||
|
||||
similar = await get_or_fetch_similar_units(unit_code, listing_status=listing_status)
|
||||
return {
|
||||
"base_ad": ad.model_dump(),
|
||||
"similar_listings": [s.model_dump() for s in similar],
|
||||
"base_ad": ad.model_dump(mode="json"),
|
||||
"similar_listings": [s.model_dump(mode="json") for s in similar],
|
||||
"mode": mode,
|
||||
}
|
||||
|
||||
@@ -269,16 +314,16 @@ async def compare_ads(
|
||||
# Resolve before model_dump() -- see analyze_ad.
|
||||
unit_code = await ensure_eiendom_unit_code(ad) if include_eiendom_no else None
|
||||
|
||||
ad_data = ad.model_dump()
|
||||
ad_data = ad.model_dump(mode="json")
|
||||
if unit_code:
|
||||
unit = await get_or_fetch_eiendom_unit(unit_code)
|
||||
if unit:
|
||||
ad_data["eiendom_unit"] = unit.model_dump()
|
||||
ad_data["eiendom_unit"] = unit.model_dump(mode="json")
|
||||
if include_comps:
|
||||
comps = await get_or_fetch_similar_units(
|
||||
unit_code, listing_status="RECENTLY_SOLD"
|
||||
)
|
||||
ad_data["comps"] = [c.model_dump() for c in comps]
|
||||
ad_data["comps"] = [c.model_dump(mode="json") for c in comps]
|
||||
|
||||
ads.append(ad_data)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user