Files
finn-mcp/finn_eiendom/cache.py
T
ole 55d93894ac feat(refactor): Document refactoring progress and phases in markdown
feat(scripts): Add backfill script for content_hash in cache tables

feat(scripts): Create recompute script for analysis_cache population

test(tests): Implement comprehensive tests for analysis module functions

fix(tests): Update CLI tests to assert errors on stderr instead of stdout

fix(tests): Adjust MCP integration tests to pass context parameter correctly

fix(tests): Modify service tests to return hash on save functions for consistency
2026-05-29 15:16:57 +00:00

735 lines
24 KiB
Python

"""SQLite cache and persistence for FINN and Eiendom.no data.
Caching strategy
----------------
Raw data (finn_ads, eiendom_units, similar_units)
Stored with a SHA-256 content_hash of the serialised payload.
On write: compare incoming hash to stored hash. If equal the remote
data has not changed -- the row is left untouched and the caller gets
back ``changed=False``, which preserves a valid analysis_cache entry.
Analysis results (analysis_cache)
Keyed by ``(finnkode, deps_hash)`` where deps_hash = SHA-256 of the
combined raw payloads of the ad, eiendom unit, and comps that were used
to produce the result. A cache hit is only valid when the deps_hash
still matches, i.e. none of the underlying data has changed.
This means analysis is re-run *only* when remote data actually changes,
not on every TTL tick.
Search pages / cards (cache_meta)
Still TTL-based -- these change frequently and a content-hash over a
full HTML page is cheap but the semantics of "changed" are less clear
(ads added/removed vs. cosmetic HTML tweaks). Hash is stored anyway so
callers can detect real list changes if desired.
"""
import hashlib
import json
import logging
import sqlite3
from datetime import UTC, datetime, timedelta
from typing import Any
from .config import FINN_CACHE_PATH
from .models import EiendomUnit, FinnAd, FinnSearchCard, SimilarUnit
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Hashing helpers
# ---------------------------------------------------------------------------
def compute_content_hash(payload: Any) -> str:
"""Return a stable SHA-256 hex digest of *payload*.
*payload* can be a dict, list, or any JSON-serialisable value.
Keys are sorted so that insertion order does not affect the hash.
"""
serialised = json.dumps(payload, sort_keys=True, default=str)
return hashlib.sha256(serialised.encode()).hexdigest()
def combine_hashes(*hashes: str | None) -> str:
"""Combine multiple content hashes into one deterministic deps_hash."""
combined = "|".join(h or "" for h in hashes)
return hashlib.sha256(combined.encode()).hexdigest()
# ---------------------------------------------------------------------------
# Connection / schema
# ---------------------------------------------------------------------------
def get_connection(path: str | None = None) -> sqlite3.Connection:
db_path = path or FINN_CACHE_PATH
conn = sqlite3.connect(str(db_path), detect_types=sqlite3.PARSE_DECLTYPES)
conn.row_factory = sqlite3.Row
return conn
def init_db(path: str | None = None) -> sqlite3.Connection:
conn = get_connection(path)
cursor = conn.cursor()
cursor.execute(
"""
CREATE TABLE IF NOT EXISTS finn_ads (
finnkode TEXT PRIMARY KEY,
url TEXT,
payload TEXT NOT NULL,
content_hash TEXT,
fetched_at TEXT NOT NULL,
last_verified_at TEXT
)
"""
)
# Migrations: add columns if the table already existed without them.
_add_column_if_missing(cursor, "finn_ads", "content_hash", "TEXT")
_add_column_if_missing(cursor, "finn_ads", "last_verified_at", "TEXT")
cursor.execute(
"""
CREATE TABLE IF NOT EXISTS eiendom_units (
unit_code TEXT PRIMARY KEY,
payload TEXT NOT NULL,
content_hash TEXT,
fetched_at TEXT NOT NULL
)
"""
)
_add_column_if_missing(cursor, "eiendom_units", "content_hash", "TEXT")
cursor.execute(
"""
CREATE TABLE IF NOT EXISTS similar_units (
id INTEGER PRIMARY KEY AUTOINCREMENT,
unit_code TEXT NOT NULL,
listing_status TEXT NOT NULL,
payload TEXT NOT NULL,
content_hash TEXT,
fetched_at TEXT NOT NULL
)
"""
)
_add_column_if_missing(cursor, "similar_units", "content_hash", "TEXT")
cursor.execute(
"""
CREATE TABLE IF NOT EXISTS cache_meta (
key TEXT PRIMARY KEY,
value TEXT NOT NULL,
content_hash TEXT,
expires_at TEXT
)
"""
)
_add_column_if_missing(cursor, "cache_meta", "content_hash", "TEXT")
cursor.execute(
"""
CREATE TABLE IF NOT EXISTS analysis_cache (
finnkode TEXT PRIMARY KEY,
deps_hash TEXT NOT NULL,
payload TEXT NOT NULL,
computed_at TEXT NOT NULL
)
"""
)
# New tables for Phase 2 enhancements
cursor.execute(
"""
CREATE TABLE IF NOT EXISTS user_feedback (
finnkode TEXT PRIMARY KEY,
verdict TEXT NOT NULL,
notes TEXT,
created_at TEXT NOT NULL,
updated_at TEXT NOT NULL
)
"""
)
cursor.execute(
"""
CREATE TABLE IF NOT EXISTS price_history (
id INTEGER PRIMARY KEY AUTOINCREMENT,
finnkode TEXT NOT NULL,
total_price INTEGER,
asking_price INTEGER,
sale_status TEXT,
recorded_at TEXT NOT NULL
)
"""
)
cursor.execute("CREATE INDEX IF NOT EXISTS idx_price_history_finnkode_recorded ON price_history(finnkode, recorded_at)")
cursor.execute(
"""
CREATE TABLE IF NOT EXISTS search_runs (
id INTEGER PRIMARY KEY AUTOINCREMENT,
search_url TEXT NOT NULL,
finnkodes TEXT NOT NULL,
created_at TEXT NOT NULL
)
"""
)
cursor.execute("CREATE INDEX IF NOT EXISTS idx_search_runs_url_created ON search_runs(search_url, created_at)")
# Create indexes for efficient staleness queries
cursor.execute("CREATE INDEX IF NOT EXISTS idx_finn_ads_verified ON finn_ads(last_verified_at)")
cursor.execute("CREATE INDEX IF NOT EXISTS idx_eiendom_units_fetched ON eiendom_units(fetched_at)")
cursor.execute("CREATE INDEX IF NOT EXISTS idx_similar_units_fetched ON similar_units(fetched_at)")
conn.commit()
return conn
def _add_column_if_missing(
cursor: sqlite3.Cursor, table: str, column: str, col_type: str
) -> None:
"""ALTER TABLE … ADD COLUMN is idempotent via this guard."""
cursor.execute(f"PRAGMA table_info({table})")
existing = {row["name"] for row in cursor.fetchall()}
if column not in existing:
cursor.execute(f"ALTER TABLE {table} ADD COLUMN {column} {col_type}")
# ---------------------------------------------------------------------------
# Generic cache_meta helpers (search pages, search cards)
# ---------------------------------------------------------------------------
def cache_get(conn: sqlite3.Connection, key: str) -> dict[str, Any] | None:
cursor = conn.cursor()
cursor.execute("SELECT value, expires_at FROM cache_meta WHERE key = ?", (key,))
row = cursor.fetchone()
if not row:
return None
expires_at = row["expires_at"]
if expires_at and datetime.fromisoformat(expires_at) < datetime.now(UTC):
cursor.execute("DELETE FROM cache_meta WHERE key = ?", (key,))
conn.commit()
return None
return json.loads(row["value"])
def cache_set(
conn: sqlite3.Connection,
key: str,
payload: dict[str, Any],
ttl_hours: int | None = None,
ttl_minutes: int | None = None,
) -> str:
"""Store *payload* in cache_meta and return its content_hash."""
expires_at = None
if ttl_minutes is not None:
expires_at = (datetime.now(UTC) + timedelta(minutes=ttl_minutes)).isoformat()
elif ttl_hours is not None:
expires_at = (datetime.now(UTC) + timedelta(hours=ttl_hours)).isoformat()
content_hash = compute_content_hash(payload)
cursor = conn.cursor()
cursor.execute(
"INSERT OR REPLACE INTO cache_meta (key, value, content_hash, expires_at)"
" VALUES (?, ?, ?, ?)",
(key, json.dumps(payload, default=_json_default), content_hash, expires_at),
)
conn.commit()
return content_hash
# ---------------------------------------------------------------------------
# Search page / cards helpers
# ---------------------------------------------------------------------------
def save_search_page(
conn: sqlite3.Connection,
url: str,
html: str,
ttl_minutes: int = 60,
) -> str:
"""Cache raw HTML for a search page URL. Returns content_hash."""
return cache_set(conn, f"search_page:{url}", {"html": html}, ttl_minutes=ttl_minutes)
def get_search_page(conn: sqlite3.Connection, url: str) -> str | None:
payload = cache_get(conn, f"search_page:{url}")
if not payload:
return None
return payload.get("html")
def save_search_cards(
conn: sqlite3.Connection,
url: str,
cards: list[FinnSearchCard],
ttl_minutes: int = 60,
) -> str:
"""Cache parsed search cards. Returns content_hash."""
return cache_set(
conn,
f"search_cards:{url}",
[card.model_dump(mode="json") for card in cards],
ttl_minutes=ttl_minutes,
)
def get_search_cards(conn: sqlite3.Connection, url: str) -> list[FinnSearchCard]:
payload = cache_get(conn, f"search_cards:{url}")
if not payload:
return []
return [FinnSearchCard.model_validate(item) for item in payload]
# ---------------------------------------------------------------------------
# FinnAd
# ---------------------------------------------------------------------------
def save_finn_ad(conn: sqlite3.Connection, ad: FinnAd) -> tuple[str, bool]:
"""Persist *ad* to finn_ads.
Returns ``(content_hash, changed)`` where ``changed=False`` means the
remote payload is identical to what was already stored -- callers can
use this to skip analysis recomputation.
"""
cursor = conn.cursor()
payload = ad.model_dump(mode="json")
new_hash = compute_content_hash(payload)
fetched_at = (
ad.detail_fetched_at.isoformat()
if ad.detail_fetched_at
else datetime.now(UTC).isoformat()
)
# Update last_verified_at to now when saving (indicates we just checked the data)
last_verified_at = datetime.now(UTC).isoformat()
# Check existing hash before writing.
cursor.execute(
"SELECT content_hash FROM finn_ads WHERE finnkode = ?", (ad.finnkode,)
)
row = cursor.fetchone()
if row and row["content_hash"] == new_hash:
logger.debug("finn_ad %s unchanged (hash match)", ad.finnkode)
return new_hash, False
cursor.execute(
"INSERT OR REPLACE INTO finn_ads"
" (finnkode, url, payload, content_hash, fetched_at, last_verified_at)"
" VALUES (?, ?, ?, ?, ?, ?)",
(ad.finnkode, ad.url, json.dumps(payload, default=_json_default), new_hash, fetched_at, last_verified_at),
)
conn.commit()
logger.debug("finn_ad %s saved (hash=%s)", ad.finnkode, new_hash[:8])
return new_hash, True
def get_finn_ad(
conn: sqlite3.Connection, finnkode: str, ttl_hours: int | None = None
) -> FinnAd | None:
cursor = conn.cursor()
cursor.execute(
"SELECT payload, fetched_at FROM finn_ads WHERE finnkode = ?", (finnkode,)
)
row = cursor.fetchone()
if not row:
return None
if ttl_hours is not None and not _is_fresh(row["fetched_at"], ttl_hours):
return None
return FinnAd.model_validate(json.loads(row["payload"]))
def get_finn_ad_hash(conn: sqlite3.Connection, finnkode: str) -> str | None:
"""Return the stored content_hash for *finnkode*, or None if not cached."""
cursor = conn.cursor()
cursor.execute(
"SELECT content_hash FROM finn_ads WHERE finnkode = ?", (finnkode,)
)
row = cursor.fetchone()
return row["content_hash"] if row else None
# ---------------------------------------------------------------------------
# EiendomUnit
# ---------------------------------------------------------------------------
def save_eiendom_unit(conn: sqlite3.Connection, unit: EiendomUnit) -> tuple[str, bool]:
"""Persist *unit* to eiendom_units.
Returns ``(content_hash, changed)``.
"""
cursor = conn.cursor()
payload = unit.model_dump(mode="json")
new_hash = compute_content_hash(payload)
cursor.execute(
"SELECT content_hash FROM eiendom_units WHERE unit_code = ?", (unit.unit_code,)
)
row = cursor.fetchone()
if row and row["content_hash"] == new_hash:
logger.debug("eiendom_unit %s unchanged (hash match)", unit.unit_code)
return new_hash, False
cursor.execute(
"INSERT OR REPLACE INTO eiendom_units"
" (unit_code, payload, content_hash, fetched_at)"
" VALUES (?, ?, ?, ?)",
(unit.unit_code, json.dumps(payload, default=_json_default), new_hash, unit.fetched_at.isoformat()),
)
conn.commit()
logger.debug("eiendom_unit %s saved (hash=%s)", unit.unit_code, new_hash[:8])
return new_hash, True
def get_eiendom_unit(
conn: sqlite3.Connection,
unit_code: str,
ttl_hours: int | None = None,
) -> EiendomUnit | None:
cursor = conn.cursor()
cursor.execute(
"SELECT payload, fetched_at FROM eiendom_units WHERE unit_code = ?", (unit_code,)
)
row = cursor.fetchone()
if not row:
return None
if ttl_hours is not None and not _is_fresh(row["fetched_at"], ttl_hours):
return None
return EiendomUnit.model_validate(json.loads(row["payload"]))
def get_eiendom_unit_hash(conn: sqlite3.Connection, unit_code: str) -> str | None:
"""Return the stored content_hash for *unit_code*, or None if not cached."""
cursor = conn.cursor()
cursor.execute(
"SELECT content_hash FROM eiendom_units WHERE unit_code = ?", (unit_code,)
)
row = cursor.fetchone()
return row["content_hash"] if row else None
# ---------------------------------------------------------------------------
# SimilarUnits
# ---------------------------------------------------------------------------
def save_similar_units(
conn: sqlite3.Connection,
unit_code: str,
listing_status: str,
similar_units: list[SimilarUnit],
) -> tuple[str, bool]:
"""Persist *similar_units* for (unit_code, listing_status).
Returns ``(content_hash, changed)``.
"""
cursor = conn.cursor()
payload_list = [item.model_dump(mode="json") for item in similar_units]
new_hash = compute_content_hash(payload_list)
cursor.execute(
"SELECT payload, content_hash FROM similar_units"
" WHERE unit_code = ? AND listing_status = ?"
" ORDER BY id DESC LIMIT 1",
(unit_code, listing_status),
)
row = cursor.fetchone()
if row and row["content_hash"] == new_hash:
logger.debug(
"similar_units %s/%s unchanged (hash match)", unit_code, listing_status
)
return new_hash, False
cursor.execute(
"INSERT INTO similar_units"
" (unit_code, listing_status, payload, content_hash, fetched_at)"
" VALUES (?, ?, ?, ?, ?)",
(
unit_code,
listing_status,
json.dumps(payload_list, default=_json_default),
new_hash,
datetime.now(UTC).isoformat(),
),
)
conn.commit()
logger.debug(
"similar_units %s/%s saved (hash=%s)", unit_code, listing_status, new_hash[:8]
)
return new_hash, True
def get_similar_units(
conn: sqlite3.Connection,
unit_code: str,
listing_status: str,
ttl_hours: int | None = None,
) -> list[SimilarUnit]:
cursor = conn.cursor()
cursor.execute(
"SELECT payload, fetched_at FROM similar_units"
" WHERE unit_code = ? AND listing_status = ?"
" ORDER BY id DESC LIMIT 1",
(unit_code, listing_status),
)
row = cursor.fetchone()
if not row:
return []
if ttl_hours is not None and not _is_fresh(row["fetched_at"], ttl_hours):
return []
return [SimilarUnit.model_validate(item) for item in json.loads(row["payload"])]
def get_similar_units_hash(
conn: sqlite3.Connection, unit_code: str, listing_status: str
) -> str | None:
"""Return the stored content_hash for (unit_code, listing_status), or None."""
cursor = conn.cursor()
cursor.execute(
"SELECT content_hash FROM similar_units"
" WHERE unit_code = ? AND listing_status = ?"
" ORDER BY id DESC LIMIT 1",
(unit_code, listing_status),
)
row = cursor.fetchone()
return row["content_hash"] if row else None
# ---------------------------------------------------------------------------
# Analysis cache
# ---------------------------------------------------------------------------
def get_analysis(
conn: sqlite3.Connection, finnkode: str, deps_hash: str
) -> dict[str, Any] | None:
"""Return cached analysis for *finnkode* if deps_hash still matches.
``deps_hash`` encodes the combined hashes of the ad, eiendom unit, and
comps that were used to produce the analysis. Any change to underlying
data produces a different deps_hash and the cache is considered stale.
"""
cursor = conn.cursor()
cursor.execute(
"SELECT payload, deps_hash FROM analysis_cache WHERE finnkode = ?",
(finnkode,),
)
row = cursor.fetchone()
if not row:
return None
if row["deps_hash"] != deps_hash:
logger.debug(
"analysis_cache miss for %s (deps_hash changed %s%s)",
finnkode,
row["deps_hash"][:8],
deps_hash[:8],
)
return None
logger.debug("analysis_cache hit for %s", finnkode)
return json.loads(row["payload"])
def _json_default(obj: Any) -> Any:
"""Fallback serialiser for json.dumps.
Converts datetime/date → ISO string; anything else → repr string.
Means save_analysis never raises TypeError regardless of what scoring
or model_dump() emits.
"""
if hasattr(obj, "isoformat"):
return obj.isoformat()
return repr(obj)
def save_analysis(
conn: sqlite3.Connection,
finnkode: str,
deps_hash: str,
result: dict[str, Any],
) -> None:
"""Store an analysis result keyed by (finnkode, deps_hash)."""
cursor = conn.cursor()
cursor.execute(
"INSERT OR REPLACE INTO analysis_cache"
" (finnkode, deps_hash, payload, computed_at)"
" VALUES (?, ?, ?, ?)",
(finnkode, deps_hash, json.dumps(result, default=_json_default), datetime.now(UTC).isoformat()),
)
conn.commit()
logger.debug("analysis_cache saved for %s (deps_hash=%s)", finnkode, deps_hash[:8])
def invalidate_analysis(conn: sqlite3.Connection, finnkode: str) -> None:
"""Remove any cached analysis for *finnkode* (call after raw data changes)."""
conn.cursor().execute(
"DELETE FROM analysis_cache WHERE finnkode = ?", (finnkode,)
)
conn.commit()
# ---------------------------------------------------------------------------
# User feedback
# ---------------------------------------------------------------------------
def save_feedback(
conn: sqlite3.Connection, finnkode: str, verdict: str, notes: str | None = None
) -> dict[str, Any]:
"""Store user feedback/verdict for a FINN listing."""
cursor = conn.cursor()
now = datetime.now(UTC).isoformat()
cursor.execute(
"INSERT OR REPLACE INTO user_feedback"
" (finnkode, verdict, notes, created_at, updated_at)"
" VALUES (?, ?, ?, ?, ?)",
(finnkode, verdict, notes, now, now),
)
conn.commit()
logger.debug("feedback saved for %s (verdict=%s)", finnkode, verdict)
return {"finnkode": finnkode, "verdict": verdict, "notes": notes}
def get_feedback(conn: sqlite3.Connection, finnkode: str) -> dict[str, Any] | None:
"""Retrieve stored feedback for a FINN listing."""
cursor = conn.cursor()
cursor.execute(
"SELECT finnkode, verdict, notes, created_at, updated_at FROM user_feedback WHERE finnkode = ?",
(finnkode,),
)
row = cursor.fetchone()
if not row:
return None
return {
"finnkode": row["finnkode"],
"verdict": row["verdict"],
"notes": row["notes"],
"created_at": row["created_at"],
"updated_at": row["updated_at"],
}
def get_feedback_by_verdict(
conn: sqlite3.Connection, verdict: str, limit: int = 100
) -> list[dict[str, Any]]:
"""Retrieve all stored feedback with a given verdict."""
cursor = conn.cursor()
cursor.execute(
"SELECT finnkode, verdict, notes, created_at, updated_at FROM user_feedback"
" WHERE verdict = ? ORDER BY updated_at DESC LIMIT ?",
(verdict, limit),
)
return [
{
"finnkode": row["finnkode"],
"verdict": row["verdict"],
"notes": row["notes"],
"created_at": row["created_at"],
"updated_at": row["updated_at"],
}
for row in cursor.fetchall()
]
# ---------------------------------------------------------------------------
# Price history
# ---------------------------------------------------------------------------
def save_price_history(
conn: sqlite3.Connection,
finnkode: str,
total_price: int | None = None,
asking_price: int | None = None,
sale_status: str | None = None,
) -> None:
"""Record a price/status snapshot for a listing."""
cursor = conn.cursor()
cursor.execute(
"INSERT INTO price_history (finnkode, total_price, asking_price, sale_status, recorded_at)"
" VALUES (?, ?, ?, ?, ?)",
(finnkode, total_price, asking_price, sale_status, datetime.now(UTC).isoformat()),
)
conn.commit()
logger.debug("price_history recorded for %s (total=%s, asking=%s)", finnkode, total_price, asking_price)
def get_price_history(conn: sqlite3.Connection, finnkode: str, limit: int = 100) -> list[dict[str, Any]]:
"""Retrieve price history for a listing."""
cursor = conn.cursor()
cursor.execute(
"SELECT total_price, asking_price, sale_status, recorded_at FROM price_history"
" WHERE finnkode = ? ORDER BY recorded_at DESC LIMIT ?",
(finnkode, limit),
)
return [
{
"total_price": row["total_price"],
"asking_price": row["asking_price"],
"sale_status": row["sale_status"],
"recorded_at": row["recorded_at"],
}
for row in cursor.fetchall()
]
# ---------------------------------------------------------------------------
# Search runs
# ---------------------------------------------------------------------------
def save_search_run(
conn: sqlite3.Connection, search_url: str, finnkodes: list[str]
) -> None:
"""Record a search run with the finnkodes found."""
cursor = conn.cursor()
finnkodes_json = json.dumps(finnkodes)
cursor.execute(
"INSERT INTO search_runs (search_url, finnkodes, created_at)"
" VALUES (?, ?, ?)",
(search_url, finnkodes_json, datetime.now(UTC).isoformat()),
)
conn.commit()
logger.debug("search_run recorded for %s (%d finnkodes)", search_url, len(finnkodes))
def get_latest_search_run(conn: sqlite3.Connection, search_url: str) -> dict[str, Any] | None:
"""Retrieve the most recent search run for a URL."""
cursor = conn.cursor()
cursor.execute(
"SELECT search_url, finnkodes, created_at FROM search_runs"
" WHERE search_url = ? ORDER BY created_at DESC LIMIT 1",
(search_url,),
)
row = cursor.fetchone()
if not row:
return None
return {
"search_url": row["search_url"],
"finnkodes": json.loads(row["finnkodes"]),
"created_at": row["created_at"],
}
def delete_feedback(conn: sqlite3.Connection, finnkode: str) -> dict[str, Any]:
"""Delete stored feedback for a FINN listing."""
cursor = conn.cursor()
cursor.execute("DELETE FROM user_feedback WHERE finnkode = ?", (finnkode,))
conn.commit()
logger.debug("feedback deleted for %s", finnkode)
return {"finnkode": finnkode, "deleted": True}
# ---------------------------------------------------------------------------
# Internal helpers
# ---------------------------------------------------------------------------
def _is_fresh(fetched_at: str, ttl_hours: int | None) -> bool:
if ttl_hours is None:
return True
return datetime.fromisoformat(fetched_at) >= datetime.now(UTC) - timedelta(
hours=ttl_hours
)