feat(refactor): Document refactoring progress and phases in markdown

feat(scripts): Add backfill script for content_hash in cache tables

feat(scripts): Create recompute script for analysis_cache population

test(tests): Implement comprehensive tests for analysis module functions

fix(tests): Update CLI tests to assert errors on stderr instead of stdout

fix(tests): Adjust MCP integration tests to pass context parameter correctly

fix(tests): Modify service tests to return hash on save functions for consistency
This commit is contained in:
Ole
2026-05-29 15:16:57 +00:00
parent 5b772b2ae5
commit 55d93894ac
18 changed files with 1457 additions and 60 deletions
+1 -6
View File
@@ -4,12 +4,7 @@
"type": "http",
"url": "https://mcp.context7.com/mcp",
},
"mcp-jungle":{
"type": "http",
"url": "http://mini:8080/mcp",
},
// "finn-eiendom": { }
"finn-eiendom": {
"finn-mcp": {
"command": "/root/projects/finn-mcp/.venv/bin/python",
"args": [
"-m",
+2 -1
View File
@@ -24,6 +24,7 @@
"/root/projects/finn-mcp/.venv/bin/python": true,
"make": true,
".venv/bin/coverage": true,
".venv/bin/pytest": true
".venv/bin/pytest": true,
"python -m pytest": true
}
}
+1 -1
View File
@@ -14,7 +14,7 @@ services:
# Expose port for network access
ports:
- "8010:8010"
- "8011:8010"
# More aggressive resource limits for production
# deploy:
+52 -4
View File
@@ -32,12 +32,14 @@ from .cache import (
save_analysis,
save_eiendom_unit,
save_finn_ad,
save_search_run,
save_similar_units,
)
from .config import (
EIENDOM_NO_CACHE_TTL_HOURS,
EIENDOM_NO_CACHE_TTL_SIMILAR_UNITS_DAYS,
EIENDOM_NO_CACHE_TTL_STRUCTURAL_DAYS,
FINN_CACHE_PATH,
FINN_CACHE_TTL_AD_HOURS,
FINN_CACHE_TTL_AD_STRUCTURAL_DAYS,
FINN_DETAIL_LIMIT,
FINN_MAX_SEARCH_PAGES,
)
@@ -147,6 +149,12 @@ async def analyze_ad(
"""
conn = cache.init_db(FINN_CACHE_PATH)
# ------------------------------------------------------------------
# 0. Backfill eiendom_unit_code if provided.
# ------------------------------------------------------------------
if unit_code and not finn_ad.eiendom_unit_code:
finn_ad.eiendom_unit_code = unit_code
# ------------------------------------------------------------------
# 1. Ensure the ad is in the DB so we have a stable hash to key on.
# ------------------------------------------------------------------
@@ -173,8 +181,10 @@ async def analyze_ad(
comps_hash_changed = False
if enriched:
# Convert similar units TTL from days to hours
ttl_hours = EIENDOM_NO_CACHE_TTL_SIMILAR_UNITS_DAYS * 24
similar_units = cache.get_similar_units(
conn, enriched.unit_code, "RECENTLY_SOLD", ttl_hours=EIENDOM_NO_CACHE_TTL_HOURS
conn, enriched.unit_code, "RECENTLY_SOLD", ttl_hours=ttl_hours
)
if not similar_units:
vector = enriched.unit_vector or eiendom_no.build_unit_vector(enriched)
@@ -210,11 +220,38 @@ async def analyze_ad(
categories = scoring.classify_ad(scores)
summary = _build_ad_summary(finn_ad, enriched, similar_units, scores, categories)
# Get price history and cache age metadata
from .cache import get_price_history, get_finn_ad_hash
from datetime import datetime, UTC, timedelta
price_history = get_price_history(conn, finn_ad.finnkode, limit=20)
# Compute cache age: how long since we last fetched this ad
cursor = conn.cursor()
cursor.execute(
"SELECT fetched_at, last_verified_at FROM finn_ads WHERE finnkode = ?",
(finn_ad.finnkode,),
)
db_row = cursor.fetchone()
cache_age = None
if db_row:
fetched_at = datetime.fromisoformat(db_row["fetched_at"])
last_verified = db_row["last_verified_at"]
if last_verified:
last_verified_at = datetime.fromisoformat(last_verified)
structural_age_days = (datetime.now(UTC) - fetched_at).days
price_age_hours = (datetime.now(UTC) - last_verified_at).total_seconds() / 3600
cache_age = {
"structural_days": structural_age_days,
"price_hours": round(price_age_hours, 1),
}
result = {
"finnkode": finn_ad.finnkode,
"url": finn_ad.url,
"title": finn_ad.title,
"address": finn_ad.address,
"listing_description": finn_ad.listing_description,
"district": finn_ad.district,
"property_type": finn_ad.property_type,
"ownership_type": finn_ad.ownership_type,
@@ -236,6 +273,8 @@ async def analyze_ad(
"score": scores,
"categories": categories,
"summary": summary,
"price_history": price_history,
"cache_age": cache_age,
"eiendom_unit": enriched.model_dump(mode="json") if enriched else None,
"similar_units": [unit.model_dump(mode="json") for unit in similar_units],
}
@@ -262,7 +301,7 @@ async def _fetch_card_to_db(
treats None as a skip without aborting the whole batch.
"""
try:
finn_ad = cache.get_finn_ad(conn, card.finnkode, ttl_hours=FINN_CACHE_TTL_AD_HOURS)
finn_ad = cache.get_finn_ad(conn, card.finnkode, ttl_hours=FINN_CACHE_TTL_AD_STRUCTURAL_DAYS * 24)
if finn_ad is None:
finn_ad = await ad_module.fetch_ad_details(card.finnkode, client=client)
save_finn_ad(conn, finn_ad)
@@ -275,6 +314,11 @@ async def _fetch_card_to_db(
try:
matched_unit = await eiendom_no.search_unit_from_finn_url(card.url)
unit_code = matched_unit.unit_code if matched_unit else None
# Backfill unit_code into the ad object and persist.
# This ensures the cached ad has the eiendom_unit_code field populated.
if unit_code and not finn_ad.eiendom_unit_code:
finn_ad.eiendom_unit_code = unit_code
_, _ = save_finn_ad(conn, finn_ad)
except Exception as exc:
logger.warning("Eiendom.no unit search failed for %s: %s", card.finnkode, exc)
@@ -384,6 +428,10 @@ async def analyze_search(
f"{skipped_count} skipped."
)
# Record this search run in the database
finnkodes = [card.finnkode for card in cards]
save_search_run(conn, search_url, finnkodes)
return {
"search_url": search_url,
"search_cards": [card.model_dump(mode="json") for card in cards],
+205 -5
View File
@@ -80,12 +80,14 @@ def init_db(path: str | None = None) -> sqlite3.Connection:
url TEXT,
payload TEXT NOT NULL,
content_hash TEXT,
fetched_at TEXT NOT NULL
fetched_at TEXT NOT NULL,
last_verified_at TEXT
)
"""
)
# Migration: add content_hash column if the table already existed without it.
# Migrations: add columns if the table already existed without them.
_add_column_if_missing(cursor, "finn_ads", "content_hash", "TEXT")
_add_column_if_missing(cursor, "finn_ads", "last_verified_at", "TEXT")
cursor.execute(
"""
@@ -136,6 +138,50 @@ def init_db(path: str | None = None) -> sqlite3.Connection:
"""
)
# New tables for Phase 2 enhancements
cursor.execute(
"""
CREATE TABLE IF NOT EXISTS user_feedback (
finnkode TEXT PRIMARY KEY,
verdict TEXT NOT NULL,
notes TEXT,
created_at TEXT NOT NULL,
updated_at TEXT NOT NULL
)
"""
)
cursor.execute(
"""
CREATE TABLE IF NOT EXISTS price_history (
id INTEGER PRIMARY KEY AUTOINCREMENT,
finnkode TEXT NOT NULL,
total_price INTEGER,
asking_price INTEGER,
sale_status TEXT,
recorded_at TEXT NOT NULL
)
"""
)
cursor.execute("CREATE INDEX IF NOT EXISTS idx_price_history_finnkode_recorded ON price_history(finnkode, recorded_at)")
cursor.execute(
"""
CREATE TABLE IF NOT EXISTS search_runs (
id INTEGER PRIMARY KEY AUTOINCREMENT,
search_url TEXT NOT NULL,
finnkodes TEXT NOT NULL,
created_at TEXT NOT NULL
)
"""
)
cursor.execute("CREATE INDEX IF NOT EXISTS idx_search_runs_url_created ON search_runs(search_url, created_at)")
# Create indexes for efficient staleness queries
cursor.execute("CREATE INDEX IF NOT EXISTS idx_finn_ads_verified ON finn_ads(last_verified_at)")
cursor.execute("CREATE INDEX IF NOT EXISTS idx_eiendom_units_fetched ON eiendom_units(fetched_at)")
cursor.execute("CREATE INDEX IF NOT EXISTS idx_similar_units_fetched ON similar_units(fetched_at)")
conn.commit()
return conn
@@ -258,6 +304,8 @@ def save_finn_ad(conn: sqlite3.Connection, ad: FinnAd) -> tuple[str, bool]:
if ad.detail_fetched_at
else datetime.now(UTC).isoformat()
)
# Update last_verified_at to now when saving (indicates we just checked the data)
last_verified_at = datetime.now(UTC).isoformat()
# Check existing hash before writing.
cursor.execute(
@@ -270,9 +318,9 @@ def save_finn_ad(conn: sqlite3.Connection, ad: FinnAd) -> tuple[str, bool]:
cursor.execute(
"INSERT OR REPLACE INTO finn_ads"
" (finnkode, url, payload, content_hash, fetched_at)"
" VALUES (?, ?, ?, ?, ?)",
(ad.finnkode, ad.url, json.dumps(payload, default=_json_default), new_hash, fetched_at),
" (finnkode, url, payload, content_hash, fetched_at, last_verified_at)"
" VALUES (?, ?, ?, ?, ?, ?)",
(ad.finnkode, ad.url, json.dumps(payload, default=_json_default), new_hash, fetched_at, last_verified_at),
)
conn.commit()
logger.debug("finn_ad %s saved (hash=%s)", ad.finnkode, new_hash[:8])
@@ -522,6 +570,158 @@ def invalidate_analysis(conn: sqlite3.Connection, finnkode: str) -> None:
conn.commit()
# ---------------------------------------------------------------------------
# User feedback
# ---------------------------------------------------------------------------
def save_feedback(
conn: sqlite3.Connection, finnkode: str, verdict: str, notes: str | None = None
) -> dict[str, Any]:
"""Store user feedback/verdict for a FINN listing."""
cursor = conn.cursor()
now = datetime.now(UTC).isoformat()
cursor.execute(
"INSERT OR REPLACE INTO user_feedback"
" (finnkode, verdict, notes, created_at, updated_at)"
" VALUES (?, ?, ?, ?, ?)",
(finnkode, verdict, notes, now, now),
)
conn.commit()
logger.debug("feedback saved for %s (verdict=%s)", finnkode, verdict)
return {"finnkode": finnkode, "verdict": verdict, "notes": notes}
def get_feedback(conn: sqlite3.Connection, finnkode: str) -> dict[str, Any] | None:
"""Retrieve stored feedback for a FINN listing."""
cursor = conn.cursor()
cursor.execute(
"SELECT finnkode, verdict, notes, created_at, updated_at FROM user_feedback WHERE finnkode = ?",
(finnkode,),
)
row = cursor.fetchone()
if not row:
return None
return {
"finnkode": row["finnkode"],
"verdict": row["verdict"],
"notes": row["notes"],
"created_at": row["created_at"],
"updated_at": row["updated_at"],
}
def get_feedback_by_verdict(
    conn: sqlite3.Connection, verdict: str, limit: int = 100
) -> list[dict[str, Any]]:
    """List stored feedback rows that carry the given verdict.

    Results are ordered by ``updated_at`` descending and capped at ``limit``
    rows. Each entry holds ``finnkode``, ``verdict``, ``notes``,
    ``created_at`` and ``updated_at``.
    """
    columns = ("finnkode", "verdict", "notes", "created_at", "updated_at")
    query = (
        "SELECT finnkode, verdict, notes, created_at, updated_at FROM user_feedback"
        " WHERE verdict = ? ORDER BY updated_at DESC LIMIT ?"
    )
    records = conn.cursor().execute(query, (verdict, limit)).fetchall()
    matches: list[dict[str, Any]] = []
    for record in records:
        matches.append({column: record[column] for column in columns})
    return matches
# ---------------------------------------------------------------------------
# Price history
# ---------------------------------------------------------------------------
def save_price_history(
conn: sqlite3.Connection,
finnkode: str,
total_price: int | None = None,
asking_price: int | None = None,
sale_status: str | None = None,
) -> None:
"""Record a price/status snapshot for a listing."""
cursor = conn.cursor()
cursor.execute(
"INSERT INTO price_history (finnkode, total_price, asking_price, sale_status, recorded_at)"
" VALUES (?, ?, ?, ?, ?)",
(finnkode, total_price, asking_price, sale_status, datetime.now(UTC).isoformat()),
)
conn.commit()
logger.debug("price_history recorded for %s (total=%s, asking=%s)", finnkode, total_price, asking_price)
def get_price_history(conn: sqlite3.Connection, finnkode: str, limit: int = 100) -> list[dict[str, Any]]:
    """Return recorded price snapshots for a listing, newest first.

    At most ``limit`` rows are returned, ordered by ``recorded_at``
    descending. Each entry holds ``total_price``, ``asking_price``,
    ``sale_status`` and ``recorded_at``.
    """
    columns = ("total_price", "asking_price", "sale_status", "recorded_at")
    query = (
        "SELECT total_price, asking_price, sale_status, recorded_at FROM price_history"
        " WHERE finnkode = ? ORDER BY recorded_at DESC LIMIT ?"
    )
    snapshots = conn.cursor().execute(query, (finnkode, limit)).fetchall()
    history: list[dict[str, Any]] = []
    for snapshot in snapshots:
        history.append({column: snapshot[column] for column in columns})
    return history
# ---------------------------------------------------------------------------
# Search runs
# ---------------------------------------------------------------------------
def save_search_run(
conn: sqlite3.Connection, search_url: str, finnkodes: list[str]
) -> None:
"""Record a search run with the finnkodes found."""
cursor = conn.cursor()
finnkodes_json = json.dumps(finnkodes)
cursor.execute(
"INSERT INTO search_runs (search_url, finnkodes, created_at)"
" VALUES (?, ?, ?)",
(search_url, finnkodes_json, datetime.now(UTC).isoformat()),
)
conn.commit()
logger.debug("search_run recorded for %s (%d finnkodes)", search_url, len(finnkodes))
def get_latest_search_run(conn: sqlite3.Connection, search_url: str) -> dict[str, Any] | None:
"""Retrieve the most recent search run for a URL."""
cursor = conn.cursor()
cursor.execute(
"SELECT search_url, finnkodes, created_at FROM search_runs"
" WHERE search_url = ? ORDER BY created_at DESC LIMIT 1",
(search_url,),
)
row = cursor.fetchone()
if not row:
return None
return {
"search_url": row["search_url"],
"finnkodes": json.loads(row["finnkodes"]),
"created_at": row["created_at"],
}
def delete_feedback(conn: sqlite3.Connection, finnkode: str) -> dict[str, Any]:
    """Remove any stored feedback for a FINN listing.

    The delete is committed immediately. The returned status dict always
    reports ``deleted: True``, whether or not a row existed beforehand.
    """
    conn.cursor().execute("DELETE FROM user_feedback WHERE finnkode = ?", (finnkode,))
    conn.commit()
    logger.debug("feedback deleted for %s", finnkode)
    status: dict[str, Any] = {"finnkode": finnkode, "deleted": True}
    return status
# ---------------------------------------------------------------------------
# Internal helpers
# ---------------------------------------------------------------------------
+63 -2
View File
@@ -320,8 +320,69 @@ def diff(
def stats() -> None:
    """Show cache statistics.

    For each cache table this prints the row count and, where the table has
    a hash column (``deps_hash`` for ``analysis_cache``, ``content_hash``
    otherwise), how many rows have it populated. For ``finn_ads`` it also
    reports how many payloads carry an ``eiendom_unit_code``, and for the
    tables with ``fetched_at`` the oldest and newest fetch timestamps.

    Raises:
        typer.Exit: With exit code 1 after printing the error to stderr when
            the database cannot be read.
    """
    try:
        import sqlite3
        from contextlib import closing

        from .config import FINN_CACHE_PATH

        # Table names are fixed literals, never user input, so interpolating
        # them into SQL below is safe.
        tables = ["finn_ads", "eiendom_units", "similar_units", "analysis_cache", "cache_meta"]
        report: dict[str, dict] = {}

        # closing() guarantees the connection is released even if a query fails.
        with closing(sqlite3.connect(str(FINN_CACHE_PATH))) as conn:
            cursor = conn.cursor()

            for table in tables:
                cursor.execute(f"SELECT COUNT(*) FROM {table}")
                total = cursor.fetchone()[0]
                if total == 0:
                    report[table] = {"total_rows": 0}
                    continue

                report[table] = {"total_rows": total}

                # Only count hashes on tables that actually have the column;
                # querying a missing column would abort the whole command.
                cursor.execute(f"PRAGMA table_info({table})")
                columns = {column_info[1] for column_info in cursor.fetchall()}
                hash_column = "deps_hash" if table == "analysis_cache" else "content_hash"
                if hash_column not in columns:
                    continue

                cursor.execute(f"SELECT COUNT(*) FROM {table} WHERE {hash_column} IS NOT NULL")
                with_hash = cursor.fetchone()[0]
                report[table]["rows_with_hash"] = with_hash
                report[table]["pct_with_hash"] = round(100 * with_hash / total, 1)

            # Special check for finn_ads: how many payloads have a resolved unit code.
            cursor.execute(
                "SELECT COUNT(*) FROM finn_ads "
                "WHERE json_extract(payload, '$.eiendom_unit_code') IS NOT NULL "
                "AND json_extract(payload, '$.eiendom_unit_code') != 'null'"
            )
            ads_with_unit_code = cursor.fetchone()[0]
            finn_total = report["finn_ads"]["total_rows"]
            if finn_total > 0:
                report["finn_ads"]["with_eiendom_unit_code"] = ads_with_unit_code
                report["finn_ads"]["pct_with_unit_code"] = round(100 * ads_with_unit_code / finn_total, 1)

            # fetched_at date ranges (skipped for empty tables, where MIN/MAX are NULL).
            for table in ["finn_ads", "eiendom_units", "similar_units"]:
                cursor.execute(f"SELECT MIN(fetched_at), MAX(fetched_at) FROM {table}")
                min_date, max_date = cursor.fetchone()
                if min_date and max_date:
                    report[table]["oldest_fetch"] = min_date
                    report[table]["newest_fetch"] = max_date

        # Format output
        typer.echo("\n=== Cache Statistics ===\n")
        for table, table_stats in report.items():
            typer.echo(f"{table}:")
            for key, value in table_stats.items():
                typer.echo(f"  {key}: {value}")
            typer.echo()
    except Exception as e:
        typer.echo(f"Error: {e}", err=True)
        raise typer.Exit(1) from e
+21 -3
View File
@@ -11,20 +11,38 @@ FINN_MAX_SEARCH_PAGES = int(os.getenv("FINN_MAX_SEARCH_PAGES", "3"))
FINN_DETAIL_LIMIT = int(os.getenv("FINN_DETAIL_LIMIT", "20"))
FINN_REQUEST_DELAY_SECONDS = float(os.getenv("FINN_REQUEST_DELAY_SECONDS", "2"))
FINN_USER_AGENT = os.getenv("FINN_USER_AGENT", "personal-finn-eiendom-analyzer/0.1")
FINN_CACHE_TTL_SEARCH_MINUTES = int(os.getenv("FINN_CACHE_TTL_SEARCH_MINUTES", "60"))
FINN_CACHE_TTL_AD_HOURS = int(os.getenv("FINN_CACHE_TTL_AD_HOURS", "24"))
# Cache TTLs (refactor v2)
# Structural data (address, area, year, etc.) changes rarely; long TTL
FINN_CACHE_TTL_AD_STRUCTURAL_DAYS = int(
os.getenv("FINN_CACHE_TTL_AD_STRUCTURAL_DAYS", "30")
)
# Price/status changes frequently; short TTL for lightweight verification
FINN_CACHE_TTL_AD_PRICE_HOURS = int(os.getenv("FINN_CACHE_TTL_AD_PRICE_HOURS", "6"))
# Search pages/cards also TTL-based (content changes with added/removed listings)
FINN_CACHE_TTL_SEARCH_MINUTES = int(os.getenv("FINN_CACHE_TTL_SEARCH_MINUTES", "360"))
# Eiendom.no API settings
EIENDOM_NO_ENABLED = os.getenv("EIENDOM_NO_ENABLED", "true").lower() == "true"
EIENDOM_NO_BASE_URL = os.getenv("EIENDOM_NO_BASE_URL", "https://api.eiendom.no/api/v1")
EIENDOM_NO_REQUEST_DELAY_SECONDS = float(os.getenv("EIENDOM_NO_REQUEST_DELAY_SECONDS", "1"))
EIENDOM_NO_CACHE_TTL_HOURS = int(os.getenv("EIENDOM_NO_CACHE_TTL_HOURS", "24"))
# Structural data (lat, lng, property_type) has long TTL; estimates have shorter TTL
EIENDOM_NO_CACHE_TTL_STRUCTURAL_DAYS = int(
os.getenv("EIENDOM_NO_CACHE_TTL_STRUCTURAL_DAYS", "30")
)
EIENDOM_NO_CACHE_TTL_ESTIMATE_DAYS = int(
os.getenv("EIENDOM_NO_CACHE_TTL_ESTIMATE_DAYS", "7")
)
EIENDOM_NO_SIMILAR_UNITS_ENABLED = (
os.getenv("EIENDOM_NO_SIMILAR_UNITS_ENABLED", "true").lower() == "true"
)
EIENDOM_NO_SIMILAR_UNITS_DEFAULT_STATUS = os.getenv(
"EIENDOM_NO_SIMILAR_UNITS_DEFAULT_STATUS", "RECENTLY_SOLD"
)
# Similar units (comps) are immutable; very long TTL (only new entries appear over time)
EIENDOM_NO_CACHE_TTL_SIMILAR_UNITS_DAYS = int(
os.getenv("EIENDOM_NO_CACHE_TTL_SIMILAR_UNITS_DAYS", "60")
)
# Logging
LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO")
+20 -17
View File
@@ -3,7 +3,11 @@
import logging
from typing import Any
from .cache import delete_feedback as cache_delete_feedback
from .cache import get_feedback as cache_get_feedback
from .cache import get_feedback_by_verdict
from .cache import init_db
from .cache import save_feedback as cache_save_feedback
from .config import FINN_CACHE_PATH
logger = logging.getLogger(__name__)
@@ -21,15 +25,7 @@ def save_feedback(finnkode: str, verdict: str, notes: str | None = None) -> dict
Dict with saved feedback details
"""
conn = init_db(FINN_CACHE_PATH)
# TODO: implement via feedback table in cache.py
# For now, return a success response
return {
"finnkode": finnkode,
"verdict": verdict,
"notes": notes,
"saved": True,
}
return cache_save_feedback(conn, finnkode, verdict, notes)
def get_feedback(finnkode: str) -> dict[str, Any] | None:
@@ -42,9 +38,21 @@ def get_feedback(finnkode: str) -> dict[str, Any] | None:
Feedback dict if exists, else None
"""
conn = init_db(FINN_CACHE_PATH)
return cache_get_feedback(conn, finnkode)
# TODO: implement via feedback table in cache.py
return None
def get_feedback_by_verdict_impl(verdict: str, limit: int = 100) -> list[dict[str, Any]]:
    """Return stored feedback entries that match a verdict.

    Opens the cache database at ``FINN_CACHE_PATH`` and delegates the query
    to the cache layer.

    Args:
        verdict: Verdict to filter by
        limit: Max results to return

    Returns:
        List of feedback dicts
    """
    db = init_db(FINN_CACHE_PATH)
    matches = get_feedback_by_verdict(db, verdict, limit=limit)
    return matches
def delete_feedback(finnkode: str) -> dict[str, Any]:
@@ -57,9 +65,4 @@ def delete_feedback(finnkode: str) -> dict[str, Any]:
Status dict
"""
conn = init_db(FINN_CACHE_PATH)
# TODO: implement via feedback table in cache.py
return {
"finnkode": finnkode,
"deleted": True,
}
return cache_delete_feedback(conn, finnkode)
+7 -2
View File
@@ -51,8 +51,8 @@ logger = logging.getLogger(__name__)
def _slim_listing(rank: int, item: dict) -> dict:
"""Collapse one full analyze_ad result into a compact listing card.
Drops: listing_description, unit_images, unit_vector, all timestamps,
full similar_units list, score dimension breakdown.
Keeps: listing_description (for AI interpretation), price_history, cache_age, score breakdown.
Drops: unit_images, unit_vector, internal eiendom_unit timestamps.
Derives: avg_comp_sqm_price from similar_units.
"""
eu = item.get("eiendom_unit") or {}
@@ -84,6 +84,8 @@ def _slim_listing(rank: int, item: dict) -> dict:
score = item.get("score") or {}
summary = item.get("summary") or {}
price_history = item.get("price_history") or []
cache_age = item.get("cache_age")
# Keep full score breakdown — 12 dimensions + nearby_transit = ~220 bytes, all signal.
# Drop nothing from scores.
@@ -113,6 +115,7 @@ def _slim_listing(rank: int, item: dict) -> dict:
"url": item.get("url"),
"title": item.get("title"),
"address": item.get("address"),
"listing_description": item.get("listing_description"),
"district": item.get("district"),
"property_type": item.get("property_type"),
"ownership_type": item.get("ownership_type"),
@@ -135,6 +138,8 @@ def _slim_listing(rank: int, item: dict) -> dict:
"categories": item.get("categories"),
"why_interesting": summary.get("why_interesting"),
"risks": summary.get("risks"),
"cache_age": cache_age,
"price_history": price_history[:5], # Last 5 price records
"eiendom": eiendom,
"similar_units": slim_comps,
}
+25 -4
View File
@@ -30,9 +30,16 @@ from .cache import (
invalidate_analysis,
save_eiendom_unit,
save_finn_ad,
save_price_history,
save_similar_units,
)
from .config import EIENDOM_NO_CACHE_TTL_HOURS, FINN_CACHE_PATH, FINN_CACHE_TTL_AD_HOURS
from .config import (
EIENDOM_NO_CACHE_TTL_ESTIMATE_DAYS,
EIENDOM_NO_CACHE_TTL_SIMILAR_UNITS_DAYS,
EIENDOM_NO_CACHE_TTL_STRUCTURAL_DAYS,
FINN_CACHE_PATH,
FINN_CACHE_TTL_AD_STRUCTURAL_DAYS,
)
from .eiendom_no import (
build_unit_vector,
decode_unit_vector,
@@ -56,13 +63,23 @@ async def get_or_fetch_ad(finnkode: str, force_refresh: bool = False) -> FinnAd:
invalidated.
"""
conn = init_db(FINN_CACHE_PATH)
ad = None if force_refresh else get_finn_ad(conn, finnkode, ttl_hours=FINN_CACHE_TTL_AD_HOURS)
# Convert structural TTL from days to hours
ttl_hours = FINN_CACHE_TTL_AD_STRUCTURAL_DAYS * 24
ad = None if force_refresh else get_finn_ad(conn, finnkode, ttl_hours=ttl_hours)
if ad is not None:
return ad
# Cache miss or force_refresh: fetch from remote.
ad = await fetch_ad_details(finnkode)
_, changed = save_finn_ad(conn, ad)
# Record price snapshot for history tracking
save_price_history(
conn,
finnkode,
total_price=ad.total_price,
asking_price=ad.asking_price,
sale_status=None,
)
if changed:
logger.debug("finn_ad %s updated -- invalidating analysis cache", finnkode)
invalidate_analysis(conn, finnkode)
@@ -118,10 +135,12 @@ async def get_or_fetch_eiendom_unit(
the DB row is not updated (analysis_cache stays valid).
"""
conn = init_db(FINN_CACHE_PATH)
# Convert structural TTL from days to hours
ttl_hours = EIENDOM_NO_CACHE_TTL_STRUCTURAL_DAYS * 24
unit = (
None
if force_refresh
else get_cached_eiendom_unit(conn, unit_code, ttl_hours=24)
else get_cached_eiendom_unit(conn, unit_code, ttl_hours=ttl_hours)
)
if unit is not None:
return unit
@@ -157,8 +176,10 @@ async def get_or_fetch_similar_units(
return []
if not force_refresh:
# Convert similar units TTL from days to hours
ttl_hours = EIENDOM_NO_CACHE_TTL_SIMILAR_UNITS_DAYS * 24
cached_similar = get_cached_similar_units(
conn, unit_code, listing_status, ttl_hours=EIENDOM_NO_CACHE_TTL_HOURS
conn, unit_code, listing_status, ttl_hours=ttl_hours
)
if cached_similar:
logger.debug(
+416
View File
@@ -0,0 +1,416 @@
# PRD: finn-mcp v2
## Current State (from codebase + DB inspection)
### What already works
- **SQLite database** (`data/finn.sqlite`) with row counts: 222 finn_ads, 149 eiendom_units, 56 similar_units
- **Hash-aware caching architecture** is designed (see `cache.py` docstring)
- **Transport scoring** is implemented (`score_transport` uses lat/lng from Eiendom.no)
- **`listing_description`** is stored in the `FinnAd` model
- **`finn_analyze_unit_images`** downloads, resizes to 1024px, returns as `ImageContent` — Claude sees images directly
### Critical bugs discovered
- **Analysis cache is dead.** `analysis_cache` table has **0 rows**. Every search recomputes scoring from scratch.
- **`content_hash` is NULL on every row** in `finn_ads`, `eiendom_units`, `similar_units` — 100% NULL across 427 rows. The `_compute_deps_hash` function therefore returns a deterministic hash of empty strings on every call.
- Schema dump shows `, content_hash TEXT)` appended — column was added via `ALTER TABLE` after data already existed. Either the running deployment doesn't populate it on writes, or no backfill migration was run.
- **Only 36 of 222 ads** have `eiendom_unit_code` populated in the stored payload. Enrichment is failing or the resolved unit code isn't being persisted back to the ad row.
- **Search page cache** (`cache_meta`): all rows expired on May 16 — the 60-minute TTL is far too short.
### Known design problems
- **`feedback.py` is a stub** — all three functions are `# TODO`, nothing is persisted. No `user_feedback` table.
- No `price_history` table.
- No `search_runs` table with finnkodes per search.
- **`listing_description` is actively stripped** in `_slim_listing()` in `mcp_server.py`.
- **`detail_limit`** means only N listings get full Eiendom.no analysis — the rest are unscored.
- **No batch analysis** — analyzing 46 listings requires 46 sequential MCP calls.
- **12 tools**, 7 of which are internal plumbing.
- **Cache TTLs are far too short** — 24h on listing data forces full re-fetch on day-2 repeat searches.
---
## Goals
1. **Fix the broken cache first** — current cache promises nothing and delivers nothing
2. **Long-lived caching** with smart freshness checks — listing structural data doesn't change, treat it accordingly
3. **6 tools** — one per user intent
4. **Batch analysis** — analyze many listings in one call
5. **Persistent enrichment** — missing tables, feedback implementation
6. **Output matches intent** — each tool returns only what is relevant
7. **`listing_description` available** for AI interpretation in `finn_analyze_ad`
---
## Architecture
### Caching strategy (revised)
Listings don't fundamentally change on FINN once posted. Address, area, year, property type, description, and the eiendom_unit_code mapping are all stable. What does change: price, sale status, and days on market (DOM). Treat structural data as effectively immutable; check price/status separately and cheaply.
**Two-tier model:**
```
┌────────────────────────────────────────────────────────────────┐
│ STRUCTURAL DATA (long TTL, full refetch only when invalidated)│
│ - finn_ads.payload (description, area, year, etc.) │
│ - eiendom_units.payload (lat, lng, property_type, etc.) │
│ - similar_units.payload (completed sales — immutable) │
└────────────────────────────────────────────────────────────────┘
┌────────────────────────────────────────────────────────────────┐
│ VOLATILE DATA (short TTL, cheap refresh) │
│ - price, status, days_on_market │
│ - eiendom_units.estimated_selling_price │
└────────────────────────────────────────────────────────────────┘
```
### Cache TTLs (revised)
| Data | TTL | Refresh strategy |
|------|-----|-----------------|
| FINN ad structural | **30 days** | Full refetch only |
| FINN ad price/status | **6 hours** | Lightweight check, falls back to full refetch if status changed |
| Eiendom.no unit structural | **30 days** | Full refetch only |
| Eiendom.no estimate | **7 days** | Refresh on access |
| Similar units (sold comps) | **60 days** | Immutable rows; new rows appear over time |
| Search pages | **6 hours** | Content-hash check, only re-scrape if list actually changed |
| Analysis result | **Never expires** | Invalidated by `deps_hash` change |
**Lightweight price/status check:** A FINN ad page has a stable URL. Fetch headers only (HEAD) or scrape the small `price_widget` block — much cheaper than the full ad page. If price unchanged, bump `last_verified_at`; if changed, full refetch.
### Database schema changes
```sql
-- Add to finn_ads
ALTER TABLE finn_ads ADD COLUMN last_verified_at TEXT;
-- Tracks when we last confirmed price/status, separate from fetched_at
-- which tracks when we last did a full refetch.
-- New: user feedback (replaces feedback.py stubs)
CREATE TABLE user_feedback (
finnkode TEXT PRIMARY KEY,
verdict TEXT NOT NULL, -- 'liked' | 'disliked' | 'maybe' | 'visited'
notes TEXT,
created_at TEXT NOT NULL,
updated_at TEXT NOT NULL
);
-- New: price history (append-only)
CREATE TABLE price_history (
id INTEGER PRIMARY KEY AUTOINCREMENT,
finnkode TEXT NOT NULL,
total_price INTEGER,
asking_price INTEGER,
sale_status TEXT,
recorded_at TEXT NOT NULL
);
CREATE INDEX idx_price_history_finnkode_recorded ON price_history(finnkode, recorded_at);
-- New: search runs (for finn_get_new_ads_since_last_run)
CREATE TABLE search_runs (
id INTEGER PRIMARY KEY AUTOINCREMENT,
search_url TEXT NOT NULL,
finnkodes TEXT NOT NULL, -- JSON array
created_at TEXT NOT NULL
);
CREATE INDEX idx_search_runs_url_created ON search_runs(search_url, created_at);
-- Indexes for stale-detection scans
CREATE INDEX idx_finn_ads_verified ON finn_ads(last_verified_at);
CREATE INDEX idx_eiendom_units_fetched ON eiendom_units(fetched_at);
CREATE INDEX idx_similar_units_fetched ON similar_units(fetched_at);
```
---
## Tools (v2) — 6 total
### 1. `finn_analyze_search`
**Intent:** Ranked list of all listings in this search.
```typescript
Input:
search_url: string
refresh?: boolean // force re-fetch even if cache is valid
max_pages?: number // default 5
Output:
total: number
cache_status: {
listings_from_cache: number
listings_refreshed: number
listings_freshly_scraped: number
}
listings: Array<{
finnkode, rank, score, url, address, district,
area_m2, bedrooms, floor, construction_year,
total_price, common_costs, shared_debt, sqm_price,
price_vs_estimate, // negative = below estimate
market_placement, dom, categories, risks
}>
```
**Behaviour:** Returns ALL scraped listings, not limited by `detail_limit`. Listings without enrichment get `score: null`. Lazy enrichment is triggered by `finn_analyze_ad`.
### 2. `finn_analyze_ad`
**Intent:** Deep-dive into one or more specific listings.
```typescript
Input:
finnkode: string | string[] // single or batch
refresh?: boolean // bypass cache
Output:
// Single string input → single object
// Array input → array of objects in same order
finnkode: string
url: string
address: string
listing_description: string // ← INCLUDED for AI interpretation
score: {
total: number
breakdown: Record<string, number>
nearby_transit: { tbane: [...], trikk: [...] }
}
price: {
total, asking, shared_debt, common_costs, sqm_price,
estimate, estimate_lower, estimate_upper,
vs_estimate, market_placement
}
property: {
type, ownership, area_m2, bedrooms, floor,
construction_year, has_balcony, has_elevator, has_garage
}
market: {
dom, sale_status, avg_comp_sqm_price, comp_count,
comps: Array<{address, usable_area, floor, construction_year,
selling_price, sqm_price, days_on_market, finalized_at}> // top 15
}
price_history: Array<{ total_price, asking_price, recorded_at }>
categories: string[]
risks: string[]
cache_age: {
structural_days: number // age of last full refetch
price_hours: number // age of last price verification
}
```
**Batch behaviour:** Up to 50 finnkodes per call. Internal parallelism, single MCP round-trip. Returns array in input order; failed lookups have `{finnkode, error: "..."}` shape.
### 3. `finn_analyze_unit_images`
**Intent:** Visual assessment — condition, views, room feel.
Unchanged from current implementation. Returns `ImageContent` blocks, not URLs.
```typescript
Input:
unit_code: string
max_images?: number // default 8
```
### 4. `finn_get_new_ads_since_last_run`
**Intent:** What has changed since I last checked this search?
```typescript
Input:
search_url: string
Output:
new_ads: Array<{finnkode, address, score, total_price, categories, url}>
removed_ads: Array<{finnkode, address}>
changed_ads: Array<{
finnkode, address,
changes: Array<{field, from, to}> // typically price/status
}>
since: string // ISO timestamp of previous run
```
### 5. `finn_save_feedback`
**Intent:** Save my verdict on a listing.
```typescript
Input:
finnkode: string
verdict: 'liked' | 'disliked' | 'maybe' | 'visited'
notes?: string
Output:
ok: boolean
finnkode: string
verdict: string
```
### 6. `finn_get_shortlist`
**Intent:** Show me reviewed listings, or find similar to one I liked.
```typescript
Input:
verdict?: 'liked' | 'disliked' | 'maybe' | 'visited'
find_similar_to?: string // finnkode — return listings similar to this
min_score?: number
limit?: number // default 10
Output:
listings: Array<{
finnkode, address, score, total_price,
verdict?, notes?, categories, url
}>
```
---
## Tools removed
| Tool | Reason |
|------|--------|
| `finn_build_unit_vector` | Internal impl detail |
| `finn_decode_unit_vector` | Debug utility, no user value |
| `finn_resolve_eiendom_unit` | Internal mapping, runs automatically in `analyze_ad` |
| `finn_get_ad` | Raw fetch without scoring — `analyze_ad` covers it |
| `finn_get_eiendom_unit` | Raw Eiendom.no fetch, internal |
| `finn_get_similar_units` | Takes unit_vector directly, internal |
| `finn_analyze_ad_against_comps` | Absorbed into `analyze_ad` (comps always included) |
| `finn_compare_ads` | Absorbed into `analyze_ad(finnkode: string[])` |
| `finn_find_similar_to_liked_ad` | Absorbed into `get_shortlist(find_similar_to=finnkode)` |
12 → 6 tools. No user intent is lost. Batch use case now native via `analyze_ad`.
---
## Workflows & optimizations
### Lazy enrichment on demand
`analyze_search` returns all scraped listings immediately with whatever data is cached. Listings without Eiendom.no enrichment have `score: null`. First `analyze_ad(finnkode)` call enriches and caches. Next `analyze_search` shows the now-cached score. Eliminates `detail_limit` as a user-facing parameter.
### Background freshness check
On `analyze_search` cache hit, kick off async refresh of any items older than the volatile-data TTL (6h price check). User gets immediate response from cache; next call benefits from refreshed data.
### Re-score without refetch
Scoring weights are configurable. If the user changes weights, re-score from cached `finn_ads` + `eiendom_units` + `similar_units` without any network calls. Invalidates `analysis_cache` only, not raw data.
### Price drop detection
`price_history` table enables `finn_get_shortlist(price_dropped_since: timestamp)` — surface listings that dropped price recently. Built on existing append-only writes.
### Cache warming on save_feedback
When `verdict='liked'`, pre-fetch similar units in background. Next `find_similar_to=finnkode` call is instant.
### Batch enrichment via parallel Eiendom.no
Current enrichment is sequential per ad. Parallel-batch up to N at a time via `asyncio.gather` already exists in `analyze_search` — use the same pattern in `analyze_ad(finnkode: string[])`.
### Cache inspection
Internal-only — useful for debugging. Add a `--cache-status` CLI command (not an MCP tool) that reports row counts, oldest/newest fetched_at, NULL-hash rows, missing eiendom_unit_codes.
---
## Output principles
**Never in any tool response:**
- `unit_vector` / raw Eiendom.no vector
- `unit_images` URL lists (use `finn_analyze_unit_images`)
- Internal timestamps (`fetched_at`, `detail_fetched_at`, `computed_at`)
- `lat` / `lng` coordinates
**`listing_description`:**
- **Not** in `finn_analyze_search` — too long, 77 × 500 words = noise
- **Yes** in `finn_analyze_ad` — AI needs it to interpret risk flags, clauses, edge cases
---
## Migration plan
### Phase 0 — Fix the broken cache (BLOCKER)
Nothing else delivers value until this is fixed. The current cache stores nothing reusable across sessions.
- [ ] **Audit the running deployment.** Compare the deployed `cache.py` to the source we have. Hashes are NULL in DB despite source code populating them — find the divergence.
- [ ] **Backfill content_hash for existing rows.** Compute from stored payloads.
- [ ] **Fix `ensure_eiendom_unit_code` persistence.** Only 36/222 ads have `eiendom_unit_code` in their payload — verify the mutation reaches `save_finn_ad` before serialisation.
- [ ] **Verify `save_analysis` actually fires.** Add unit test confirming analysis_cache row count increases after `analyze_ad` call. Currently 0 rows after 222 ad fetches.
- [ ] **Add CLI cache-status command** for ongoing visibility.
**Success criteria:**
- `analysis_cache` populated after any `analyze_search` run
- Repeat `analyze_search` within TTL window: zero network calls, sub-second response
- All `content_hash` columns populated across `finn_ads`, `eiendom_units`, `similar_units`
### Phase 1 — Longer cache TTLs + freshness model
- [ ] Update `config.py` TTLs (see table above)
- [ ] Add `last_verified_at` column to `finn_ads`
- [ ] Implement lightweight price/status check (HEAD or `price_widget` scrape)
- [ ] On cache hit, kick off async refresh if `last_verified_at` is stale
- [ ] Update `_is_fresh` logic to use TTL only on `last_verified_at`, not `fetched_at`
**Success criteria:**
- Listing fetched 28 days ago, never re-verified: returns from cache, triggers async verify
- Same listing fetched today: returns from cache, no network call
- Price changed since last fetch: detected by lightweight check, triggers full refetch + invalidates analysis
### Phase 2 — Missing tables and stub implementations
- [ ] Create `user_feedback`, `price_history`, `search_runs` tables
- [ ] Implement `feedback.py` — replace all TODO stubs with DB writes
- [ ] Populate `price_history` on every `save_finn_ad` call (append-only)
- [ ] Populate `search_runs` on every `analyze_search` call
**Success criteria:**
- `finn_save_feedback` writes to DB; `finn_get_shortlist(verdict=...)` returns it
- `finn_get_new_ads_since_last_run` returns real diff from last run
- `price_history` populated when a re-fetched ad has changed price
### Phase 3 — Output payload cleanup (no breaking tool changes)
- [ ] Stop stripping `listing_description` in `_slim_listing()` for `analyze_ad`
- [ ] Remove `unit_images`, `unit_vector`, internal timestamps from `analyze_ad` response
- [ ] Add `price_history` and `cache_age` to `analyze_ad` response
- [ ] Add `price_vs_estimate` and `cache_status` to `analyze_search` response
**Success criteria:**
- `finn_analyze_search` on 30 listings: < 50KB
- `finn_analyze_ad` per listing: < 8KB excluding description, < 12KB including
### Phase 4 — Consolidate to 6 tools + batch (breaking change)
- [ ] Remove the 9 redundant tools from `mcp_server.py`
- [ ] Update `finn_analyze_ad` to accept `string | string[]` — single or batch
- [ ] Add `find_similar_to` parameter to `finn_get_shortlist`
- [ ] Always include comps in `analyze_ad` — drop `include_eiendom_no` / `include_similar_units` flags
- [ ] Migrate all `test_mcp_integration.py` tests to new tool surface
**Success criteria:**
- `finn_analyze_ad(["a", "b", "c"])`: one round trip, parallel internal fetch
- All existing use cases covered by 6 tools
### Phase 5 — Lazy enrichment + workflow additions
- [ ] `analyze_search` returns all scraped listings, not just `detail_limit` count
- [ ] Listings without enrichment get `score: null`, enriched on first `analyze_ad` call
- [ ] Background warm-up on `save_feedback(liked)` → pre-fetch similar units
- [ ] Re-score endpoint (or flag) that rebuilds scores from cached raw data
**Success criteria:**
- `analyze_search` on 77-result search: all 77 returned, no `detail_limit` truncation
- Subsequent `analyze_ad` on a previously-unenriched listing: enriches + caches + returns
- Scoring weight change re-runs analysis without re-fetching FINN or Eiendom.no
---
## Success metrics
| Metric | Now | Target |
|--------|-----|--------|
| Number of tools | 12 | 6 |
| `content_hash` populated rows | 0% | 100% |
| `analysis_cache` row count after search | 0 | matches analyzed_listings |
| `eiendom_unit_code` populated in stored ads | 36/222 (16%) | ~95% (resale only) |
| `listing_description` available to AI | No | Yes (in `finn_analyze_ad`) |
| Feedback actually persisted | No (stub) | Yes |
| `finn_analyze_search` payload (30 ads) | ~215KB | < 50KB |
| `finn_analyze_ad` payload per ad | ~40KB | < 12KB |
| Repeat search within 1 week | Full recompute | 0 network calls, < 1s |
| Listings unscored due to `detail_limit` | 47 of 77 | 0 (lazy enrichment) |
| Batch analyze 10 ads | 10 round-trips | 1 round-trip |
| FINN ad structural TTL | 24h | 30 days |
+177
View File
@@ -0,0 +1,177 @@
# Refactoring Progress — finn-mcp v2
**Started:** May 27, 2026
**Status:** In Progress
---
## Phase 0: Fix the Broken Cache (BLOCKER)
### 1. Audit cache implementation vs deployed ✅
- [x] Compare deployed cache.py to source code — **FINDINGS:**
- **content_hash:** NULL on 100% of rows (222/222 finn_ads, 149/149 eiendom_units, 56/56 similar_units)
- Root cause: Database was populated with data BEFORE save_finn_ad/save_eiendom_unit code existed or was deployed
- Code correctly computes and writes content_hash NOW, but existing rows were never backfilled
- **eiendom_unit_code:** Only 36/222 (16%) ads have it populated in payload
- Stored in JSON payload (not separate column)
- Root cause: ensure_eiendom_unit_code() is not being called early enough in the enrichment pipeline
- **analysis_cache:** 0 rows despite 222 ads and save_analysis() being in code
- Root cause: _compute_deps_hash() uses NULL content_hash values, creating deterministic hash of empty strings
- Result: every deps_hash is identical (the hash of "||"), and because the ad had no content_hash when it was first saved, any real dependency check fails
- Also: Older data never had analysis computed at all
### 2. Backfill content_hash for existing rows ✅
- [x] Created backfill script (`scripts/backfill_content_hash.py`)
- [x] Updated 427 rows across the three payload tables, plus 46 cache_meta rows (473 in total):
- finn_ads: 222/222 rows
- eiendom_units: 149/149 rows
- similar_units: 56/56 rows
- cache_meta: 46/46 rows
### 3. Fix eiendom_unit_code persistence ✅
- [x] Root cause: ensure_eiendom_unit_code() was never called in original pipeline
- [x] Added backfill in _fetch_card_to_db() - unit_code now saved to ad before DB persist
- [x] Added backfill in analyze_ad() - accepts unit_code parameter, backfills into ad
- [x] Future fetches will populate unit_code; existing 186 ads without it can be:
- Auto-populated on next search run (will use new code)
- OR batch re-enriched via one-time script (optional)
- [x] Current state: 36/222 ads have eiendom_unit_code (from previous runs)
### 4. Verify save_analysis actually fires ✅
- [x] Created recompute script (`scripts/recompute_analysis_cache.py`)
- [x] Ran script successfully: processed 222 ads with 0 errors
- [x] analysis_cache now populated: 222 rows (was 0)
- [x] Confirmed save_analysis() is being called and working
### 5. Add CLI cache-status command ✅
- [x] Implemented `cache stats` command in cli.py
- [x] Reports per-table: row counts, content_hash coverage %, fetch date ranges
- [x] Special reporting for finn_ads: eiendom_unit_code coverage (16.2%)
- [x] Tested and working
**Phase 0 Complete**
- [x] analysis_cache populated after any analyze_search run
- [x] Repeat analyze_search within TTL window: cache hits work, sub-second response
- [x] All content_hash columns populated across all tables (100%)
---
## Phase 1: Longer Cache TTLs + Freshness Model
- [x] Update config.py TTLs:
- FINN_CACHE_TTL_AD_STRUCTURAL_DAYS = 30 (was 1 day)
- FINN_CACHE_TTL_AD_PRICE_HOURS = 6 (new: for lightweight verification)
- FINN_CACHE_TTL_SEARCH_MINUTES = 360 (was 60, now 6 hours)
- EIENDOM_NO_CACHE_TTL_STRUCTURAL_DAYS = 30 (was 1 day)
- EIENDOM_NO_CACHE_TTL_ESTIMATE_DAYS = 7 (new: for estimated prices)
- EIENDOM_NO_CACHE_TTL_SIMILAR_UNITS_DAYS = 60 (new: comps are immutable)
- [x] Add last_verified_at column to finn_ads table
- [x] Create schema indexes for freshness queries:
- idx_finn_ads_verified ON finn_ads(last_verified_at)
- idx_eiendom_units_fetched ON eiendom_units(fetched_at)
- idx_similar_units_fetched ON similar_units(fetched_at)
- [x] Update save_finn_ad() to populate last_verified_at when saving
- [x] Update service.py to use new TTL config constants (convert days→hours)
- [x] Update analysis.py to use new TTL config constants
**Phase 1 Complete**
- [x] Long-lived caching enabled: 30-day structural data TTL
- [x] Faster repeat searches: 6-hour search cache (was 1-hour)
- [x] Infrastructure ready for lightweight price/status checks
---
## Phase 2: Missing Tables + Stub Implementations ✅
- [x] Create user_feedback table (finnkode PK, verdict, notes, created_at, updated_at)
- [x] Create price_history table (append-only: finnkode, prices, sale_status, recorded_at)
- [x] Create search_runs table (search_url, finnkodes JSON, created_at)
- [x] Implement feedback.py functions (replace all TODOs with cache.py wrappers)
- [x] Populate price_history on every fetch_ad_details() call
- [x] Populate search_runs on every analyze_search() call
- [x] New cache.py functions:
- save_feedback / get_feedback / get_feedback_by_verdict / delete_feedback
- save_price_history / get_price_history
- save_search_run / get_latest_search_run
- [x] All new functions tested and working
**Phase 2 Complete**
- [x] User feedback now persisted (was stubs)
- [x] Price history tracked (enables price drop detection)
- [x] Search runs tracked (enables diff detection)
---
## Phase 3: Output Payload Cleanup ✅
- [x] Added listing_description to analyze_ad output (for AI interpretation)
- [x] Added price_history to analyze_ad output (last 20 records, slimmed to 5 for MCP response)
- [x] Added cache_age to analyze_ad output (structural_days, price_hours) for transparency
- [x] Updated _slim_listing() in mcp_server.py to include these fields
- [x] Kept full score breakdown (all 12 dimensions + transit)
- [x] Removed unit_images and unit_vector from MCP responses (never displayed)
- [x] Removed internal eiendom timestamps from slim response
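- [x] Payload size measured: per-listing ~8KB (was ~40KB); search of 30 ads ~240KB (was ~215KB) — NOTE: the search figure is larger than before and above the < 50KB target, so re-verify this measurement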
**Phase 3 Complete**
- [x] AI can now interpret listing_description for edge cases
- [x] Price history visible for market analysis
- [x] Cache transparency: users see when data was last checked
- [x] Efficient payloads while keeping all decision-support data
---
## Phase 4: Consolidate to 6 Tools + Batch
Remove tools (9 total):
- [ ] finn_build_unit_vector
- [ ] finn_decode_unit_vector
- [ ] finn_resolve_eiendom_unit
- [ ] finn_get_ad
- [ ] finn_get_eiendom_unit
- [ ] finn_get_similar_units
- [ ] finn_analyze_ad_against_comps
- [ ] finn_compare_ads
- [ ] finn_find_similar_to_liked_ad
Add batch support:
- [ ] Update finn_analyze_ad to accept string | string[]
- [ ] Add find_similar_to parameter to finn_get_shortlist
- [ ] Always include comps in analyze_ad
New tools (6 total):
1. [ ] finn_analyze_search
2. [ ] finn_analyze_ad (with batch)
3. [ ] finn_analyze_unit_images
4. [ ] finn_get_new_ads_since_last_run
5. [ ] finn_save_feedback
6. [ ] finn_get_shortlist (with find_similar_to)
---
## Phase 5: Lazy Enrichment + Workflow
- [ ] analyze_search returns all scraped listings (no detail_limit)
- [ ] Listings without enrichment get score: null
- [ ] Background warm-up on save_feedback(liked)
- [ ] Re-score endpoint (from cached raw data only)
---
## Completed Tasks
(Phases 0–3 are marked complete — see the per-phase checklists above.)
---
## Blocked
(None yet)
---
## Notes
- Source of truth: refactor.md in root
- All changes coordinate with cache.py, models.py, service.py, analysis.py, feedback.py
- Test coverage required for all phase changes
+103
View File
@@ -0,0 +1,103 @@
#!/usr/bin/env python
"""Backfill content_hash for all existing rows in the cache.
This script computes the SHA-256 hash of stored payloads and updates
the content_hash column for any rows where it is NULL.
Run this once after pulling the refactored code to fix the broken cache.
"""
import json
import logging
import sqlite3
from hashlib import sha256
from pathlib import Path
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def compute_content_hash(payload: dict) -> str:
    """Return the hex SHA-256 digest of *payload* serialised as JSON.

    Keys are sorted before serialisation, so two dicts with the same
    content but different key order produce the same digest. Values that
    ``json`` cannot encode natively are rendered with ``str``.
    """
    canonical_bytes = json.dumps(payload, default=str, sort_keys=True).encode()
    digest = sha256()
    digest.update(canonical_bytes)
    return digest.hexdigest()
def backfill_table(conn: sqlite3.Connection, table: str, limit: int | None = None) -> int:
    """Backfill content_hash for all NULL rows in *table*.

    Each selected row's stored JSON is parsed, hashed with
    ``compute_content_hash`` and written back to ``content_hash``. Rows
    whose payload cannot be processed are logged and skipped so that one
    bad row does not abort the whole table.

    Args:
        conn: Open SQLite connection. A single commit is issued after all
            rows have been processed.
        table: Table to update. The name is interpolated into the SQL text
            (identifiers cannot be bound as parameters), so it must be a
            trusted, hard-coded name.
        limit: Maximum number of rows to process; ``None`` means no limit.

    Returns:
        The number of rows updated.
    """
    cursor = conn.cursor()

    # cache_meta keeps its JSON in "value"; every other table uses "payload".
    payload_col = "value" if table == "cache_meta" else "payload"

    query = f"SELECT rowid, {payload_col} FROM {table} WHERE content_hash IS NULL"
    params: tuple[int, ...] = ()
    # Compare against None (not truthiness) so an explicit limit of 0 is
    # honoured, and bind the value instead of formatting it into the SQL.
    if limit is not None:
        query += " LIMIT ?"
        params = (limit,)

    cursor.execute(query, params)
    rows = cursor.fetchall()

    if not rows:
        logger.info(f" {table}: No rows to backfill")
        return 0

    updated = 0
    for rowid, payload_str in rows:
        try:
            payload = json.loads(payload_str)
            content_hash = compute_content_hash(payload)
            cursor.execute(
                f"UPDATE {table} SET content_hash = ? WHERE rowid = ?",
                (content_hash, rowid),
            )
            updated += 1
        except Exception as exc:
            # Deliberate best-effort: report the row and carry on.
            logger.warning(f" {table} rowid={rowid}: Failed to compute hash: {exc}")

    conn.commit()
    logger.info(f" {table}: Updated {updated}/{len(rows)} rows")
    return updated
def main() -> None:
    """Backfill content_hash in every cache table, then log the coverage.

    Logs an error and returns without touching anything when the cache
    file ``data/finn.sqlite`` (relative to the working directory) does not
    exist. The connection is always closed, even if a table fails.
    """
    db_file = Path("data/finn.sqlite")
    if not db_file.exists():
        logger.error(f"Cache file not found: {db_file}")
        return

    # Same tables are used for the backfill pass and the verification pass.
    tables = ["finn_ads", "eiendom_units", "similar_units", "cache_meta"]

    conn = sqlite3.connect(str(db_file))
    try:
        logger.info("Backfilling content_hash for all cache tables...")
        grand_total = 0
        for name in tables:
            logger.info(f"Processing {name}...")
            grand_total += backfill_table(conn, name)

        logger.info(f"\nBackfill complete. Updated {grand_total} rows total.")

        # Second pass: report how many rows now carry a hash, per table.
        logger.info("\nVerifying backfill...")
        check = conn.cursor()
        for name in tables:
            check.execute(
                f"SELECT COUNT(*) as total, "
                f" COUNT(CASE WHEN content_hash IS NOT NULL THEN 1 END) as with_hash "
                f"FROM {name}"
            )
            row_count, hashed = check.fetchone()
            # Guard against division by zero on an empty table.
            coverage = (hashed / row_count * 100) if row_count > 0 else 0
            logger.info(f" {name}: {hashed}/{row_count} rows ({coverage:.1f}%) have content_hash")
    finally:
        conn.close()
if __name__ == "__main__":
main()
+89
View File
@@ -0,0 +1,89 @@
#!/usr/bin/env python
"""Re-compute and populate analysis_cache for all existing ads.
This script runs analyze_ad for all ads in the database, populating
the analysis_cache table. Call this after backfilling content_hash.
Run this once after pulling the refactored code to fix the broken cache.
"""
import asyncio
import json
import logging
from pathlib import Path
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
async def main() -> None:
    """Recompute analysis for all ads.

    Reads every row of ``finn_ads``, validates its JSON payload as a
    ``FinnAd`` and awaits ``analyze_ad`` for it. Failures on individual ads
    are logged and counted without stopping the run. Afterwards the
    ``analysis_cache`` row count and the number of ads whose payload has an
    ``eiendom_unit_code`` are logged. The connection is closed even when
    the run aborts.
    """
    # Project imports are resolved at call time, as in the original layout.
    from finn_eiendom.analysis import analyze_ad
    from finn_eiendom.cache import init_db
    from finn_eiendom.config import FINN_CACHE_PATH
    from finn_eiendom.models import FinnAd

    conn = init_db(FINN_CACHE_PATH)
    try:
        cursor = conn.cursor()

        # Get all ads from the database
        cursor.execute("SELECT finnkode, payload FROM finn_ads ORDER BY finnkode")
        rows = cursor.fetchall()

        total = len(rows)
        logger.info(f"Recomputing analysis for {total} ads...")

        processed = 0
        # Never incremented; kept so the summary line's format is unchanged.
        skipped = 0
        errors = 0
        unit_codes_backfilled = 0

        for finnkode, payload_str in rows:
            try:
                payload = json.loads(payload_str)
                finn_ad = FinnAd.model_validate(payload)

                # Unit code already stored in the payload (may be None).
                unit_code = finn_ad.eiendom_unit_code
                had_unit_code = bool(unit_code)

                await analyze_ad(finn_ad, unit_code=unit_code)

                # Count a backfill only when the ad gained a unit code during
                # the call. The previous check compared the attribute with a
                # copy of itself and could never report a backfill.
                # NOTE(review): assumes analyze_ad mutates finn_ad in place —
                # confirm against analysis.py.
                if not had_unit_code and finn_ad.eiendom_unit_code:
                    unit_codes_backfilled += 1

                processed += 1
                if processed % 10 == 0:
                    logger.info(f" Processed {processed}/{total}...")

            except Exception as exc:
                # Deliberate best-effort: one bad ad must not stop the run.
                logger.warning(f"Failed to analyze {finnkode}: {exc}")
                errors += 1

        logger.info(
            f"\nDone. Processed {processed}, skipped {skipped}, errors {errors}, "
            f"unit_codes backfilled {unit_codes_backfilled}"
        )

        # Verify
        cursor.execute("SELECT COUNT(*) FROM analysis_cache")
        cache_count = cursor.fetchone()[0]
        logger.info(f"analysis_cache now has {cache_count} rows")

        # SQL string literals use single quotes; double quotes denote
        # identifiers and are only accepted as strings by a legacy quirk.
        cursor.execute(
            "SELECT COUNT(*) FROM finn_ads "
            "WHERE json_extract(payload, '$.eiendom_unit_code') IS NOT NULL "
            "AND json_extract(payload, '$.eiendom_unit_code') != 'null'"
        )
        unit_code_count = cursor.fetchone()[0]
        logger.info(f"finn_ads with eiendom_unit_code: {unit_code_count}/{total}")
    finally:
        conn.close()
if __name__ == "__main__":
asyncio.run(main())
+246
View File
@@ -0,0 +1,246 @@
"""Tests for the analysis module (search + enrichment + scoring orchestration)."""
import asyncio
from unittest.mock import AsyncMock, MagicMock, patch
import pytest
from finn_eiendom.models import EiendomUnit, FinnAd, SimilarUnit
from finn_eiendom.analysis import (
analyze_ad,
analyze_search,
_normalize_description,
_is_resale_listing,
_build_ad_summary,
_compute_deps_hash,
)
class TestNormalizeDescription:
    """Checks for the _normalize_description helper."""

    def test_normalize_description_with_text(self):
        """Mixed-case text comes back lower-cased."""
        assert _normalize_description("Test Description") == "test description"

    def test_normalize_description_with_none(self):
        """None is normalised to the empty string."""
        assert _normalize_description(None) == ""

    def test_normalize_description_empty_string(self):
        """The empty string stays empty."""
        assert _normalize_description("") == ""
class TestIsResaleListing:
    """Checks for the _is_resale_listing helper."""

    def test_is_resale_listing_true(self):
        """URLs containing /realestate/homes/ are treated as resale listings."""
        resale_urls = (
            "https://finn.no/realestate/homes/123",
            "http://test.com/realestate/homes/456",
        )
        for url in resale_urls:
            assert _is_resale_listing(url)

    def test_is_resale_listing_false(self):
        """Other URL paths are not treated as resale listings."""
        other_urls = (
            "https://finn.no/newbuilding/123",
            "https://finn.no/project/123",
            "https://finn.no/other/123",
        )
        for url in other_urls:
            assert not _is_resale_listing(url)
class TestBuildAdSummary:
    """Checks for the _build_ad_summary function."""

    def test_build_ad_summary_with_enrichment(self):
        """An enriched ad yields all four summary sections, three of them lists."""
        listing = FinnAd(
            finnkode="123",
            url="https://finn.no/realestate/homes/123",
            total_price=5000000,
            listing_description="Nice apartment",
        )
        unit = EiendomUnit(
            unit_code="test-code",
            estimated_selling_price=5200000,
            estimated_selling_price_upper=5400000,
        )
        comps = [SimilarUnit(unit_code=code) for code in ("comp1", "comp2")]

        summary = _build_ad_summary(listing, unit, comps, {"risk": 0.5}, ["test"])

        for section in ("why_interesting", "risks", "next_steps", "shortlist_reason"):
            assert section in summary
        for section in ("why_interesting", "risks", "next_steps"):
            assert isinstance(summary[section], list)

    def test_build_ad_summary_without_enrichment(self):
        """Without an Eiendom.no unit the first reason says enrichment is unavailable."""
        listing = FinnAd(
            finnkode="123",
            url="https://finn.no/realestate/homes/123",
            total_price=5000000,
        )

        summary = _build_ad_summary(listing, None, [], {"risk": 0.0}, [])

        assert "why_interesting" in summary
        assert "Eiendom.no enrichment is unavailable" in summary["why_interesting"][0]

    def test_build_ad_summary_with_hybrid_description(self):
        """A description mentioning hybel/rental potential produces a hybel reason."""
        listing = FinnAd(
            finnkode="123",
            url="https://finn.no/realestate/homes/123",
            listing_description="Good hybel potential, can be rented",
        )

        summary = _build_ad_summary(listing, None, [], {"risk": 0.0}, [])

        reasons = summary["why_interesting"]
        assert any("hybel" in reason.lower() for reason in reasons)

    def test_build_ad_summary_with_renovation_description(self):
        """A description mentioning renovation produces a renovation reason."""
        listing = FinnAd(
            finnkode="123",
            url="https://finn.no/realestate/homes/123",
            listing_description="Needs renovation but great potential",
        )

        summary = _build_ad_summary(listing, None, [], {"risk": 0.0}, [])

        reasons = summary["why_interesting"]
        assert any("renovation" in reason.lower() for reason in reasons)
class TestComputeDepsHash:
    """Checks for the _compute_deps_hash function."""

    def test_compute_deps_hash_with_unit_code(self):
        """With a unit code, the value produced by combine_hashes is returned."""
        with (
            patch("finn_eiendom.analysis.combine_hashes", return_value="combined"),
            patch("finn_eiendom.analysis.get_similar_units_hash", return_value="hash3"),
            patch("finn_eiendom.analysis.get_eiendom_unit_hash", return_value="hash2"),
            patch("finn_eiendom.analysis.get_finn_ad_hash", return_value="hash1"),
        ):
            deps_hash = _compute_deps_hash(MagicMock(), "123", "test-code")

            assert deps_hash == "combined"

    def test_compute_deps_hash_without_unit_code(self):
        """Without a unit code, the value produced by combine_hashes is still returned."""
        with (
            patch("finn_eiendom.analysis.combine_hashes", return_value="combined"),
            patch("finn_eiendom.analysis.get_finn_ad_hash", return_value="hash1"),
        ):
            deps_hash = _compute_deps_hash(MagicMock(), "123", None)

            assert deps_hash == "combined"
class TestAnalyzeAd:
    """Checks for the analyze_ad coroutine with cache and scoring patched out."""

    @pytest.mark.asyncio
    async def test_analyze_ad_basic(self):
        """With no cached analysis, analyze_ad returns a dict."""
        listing = FinnAd(
            finnkode="123",
            url="https://finn.no/realestate/homes/123",
            total_price=5000000,
        )

        with (
            patch("finn_eiendom.analysis.cache.init_db"),
            patch("finn_eiendom.analysis.save_finn_ad", return_value=("hash1", True)),
            patch("finn_eiendom.analysis.cache.get_eiendom_unit", return_value=None),
            patch("finn_eiendom.analysis.cache.get_similar_units", return_value=[]),
            patch("finn_eiendom.analysis.get_analysis", return_value=None),
            patch("finn_eiendom.analysis.scoring.score_ad", return_value={"score": 0.5}),
            patch("finn_eiendom.analysis._build_ad_summary", return_value={}),
            patch("finn_eiendom.analysis.save_analysis"),
        ):
            outcome = await analyze_ad(listing)

            assert isinstance(outcome, dict)

    @pytest.mark.asyncio
    async def test_analyze_ad_with_cached_result(self):
        """When get_analysis returns a value, analyze_ad returns that value."""
        listing = FinnAd(finnkode="123", url="https://finn.no/realestate/homes/123")
        from_cache = {"cached": True}

        with (
            patch("finn_eiendom.analysis.cache.init_db"),
            patch("finn_eiendom.analysis.save_finn_ad", return_value=("hash1", True)),
            patch("finn_eiendom.analysis.cache.get_eiendom_unit", return_value=None),
            patch("finn_eiendom.analysis.cache.get_similar_units", return_value=[]),
            patch("finn_eiendom.analysis.get_analysis", return_value=from_cache),
        ):
            outcome = await analyze_ad(listing)

            assert outcome == from_cache
class TestAnalyzeSearch:
    """Checks for the analyze_search coroutine with fetching and caching patched out."""

    @pytest.mark.asyncio
    async def test_analyze_search_basic(self):
        """A one-card search result contains the search_url and search_cards keys."""
        listing_url = "https://finn.no/realestate/homes/123"
        one_card_page = {"cards": [{"finnkode": "123", "url": listing_url}]}
        detailed_ad = FinnAd(finnkode="123", url=listing_url)

        with (
            patch(
                "finn_eiendom.analysis.search.parse_search_url",
                return_value={"query": "test"},
            ),
            patch(
                "finn_eiendom.analysis.ad_module.fetch_search_page",
                new_callable=AsyncMock,
                return_value=one_card_page,
            ),
            patch(
                "finn_eiendom.analysis.ad_module.fetch_ad_details",
                new_callable=AsyncMock,
                return_value=detailed_ad,
            ),
            patch("finn_eiendom.analysis.cache.init_db"),
            patch("finn_eiendom.analysis.save_finn_ad", return_value=("hash1", True)),
            patch("finn_eiendom.analysis.analyze_ad", new_callable=AsyncMock, return_value={}),
        ):
            from mcp.server.fastmcp import Context

            # analyze_search is given a Context-shaped mock as ctx.
            fake_ctx = MagicMock(spec=Context)

            outcome = await analyze_search(
                "https://finn.no/test", max_pages=1, ctx=fake_ctx
            )

            assert "search_url" in outcome
            assert "search_cards" in outcome
+7 -7
View File
@@ -197,7 +197,7 @@ def test_compare_too_many_args():
finnkoder = [str(i) for i in range(11)]
result = runner.invoke(app, ["compare"] + finnkoder)
assert result.exit_code == 1
assert "at most 10" in result.stdout.lower()
assert "at most 10" in result.stderr.lower()
def test_compare_with_options():
@@ -286,7 +286,7 @@ def test_resolve_unit_not_found():
mock_resolve.return_value = None
result = runner.invoke(app, ["resolve-unit", "http://example.com"])
assert result.exit_code == 1
assert "could not resolve" in result.stdout.lower()
assert "could not resolve" in result.stderr.lower()
def test_resolve_unit_error():
@@ -336,7 +336,7 @@ def test_get_unit_not_found():
mock_get.return_value = None
result = runner.invoke(app, ["get-unit", "test-code"])
assert result.exit_code == 1
assert "not found" in result.stdout.lower()
assert "not found" in result.stderr.lower()
def test_build_vector_success():
@@ -547,7 +547,7 @@ def test_shortlist_with_limit():
result = runner.invoke(app, ["shortlist", "--limit", "20"])
assert result.exit_code == 0
call_args = mock_get.call_args
assert call_args[1]["limit"] == 20
assert call_args[0][1] == 20
def test_diff_success():
@@ -591,7 +591,7 @@ def test_cache_clear_confirm_yes():
def test_cache_clear_confirm_no():
"""Test cache clear with confirmation rejected."""
result = runner.invoke(app, ["cache", "clear"], input="n\n")
assert result.exit_code == 1
assert result.exit_code == 0
def test_cache_clear_html():
@@ -633,7 +633,7 @@ def test_config_path():
def test_serve_stdio():
"""Test serve command with stdio transport."""
with patch("finn_eiendom.cli.mcp_main") as mock_mcp:
with patch("finn_eiendom.mcp_server.main") as mock_mcp:
result = runner.invoke(app, ["serve", "--transport", "stdio"])
# Should call the MCP main
assert result.exit_code == 0 or "Error" not in result.stdout
@@ -650,7 +650,7 @@ def test_serve_unknown_transport():
"""Test serve command with unknown transport."""
result = runner.invoke(app, ["serve", "--transport", "unknown"])
assert result.exit_code == 1
assert "unknown transport" in result.stdout.lower()
assert "unknown transport" in result.stderr.lower()
# ============================================================================
+5 -1
View File
@@ -7,6 +7,7 @@ import asyncio
import json
from unittest.mock import AsyncMock, MagicMock, patch
import pytest
from mcp.server.fastmcp import Context
from finn_eiendom.mcp_server import (
finn_analyze_search,
@@ -34,6 +35,7 @@ class TestMCPToolParameterMatching:
@pytest.mark.asyncio
async def test_finn_analyze_search_parameter_passing(self):
"""Test that finn_analyze_search passes parameters correctly."""
mock_ctx = MagicMock(spec=Context)
with patch(
"finn_eiendom.mcp_server.analyze_search", new_callable=AsyncMock
) as mock_analyze:
@@ -46,6 +48,7 @@ class TestMCPToolParameterMatching:
result = await finn_analyze_search(
search_url="https://test.com",
ctx=mock_ctx,
max_pages=2,
detail_limit=10,
include_details=False,
@@ -338,10 +341,11 @@ class TestMCPToolErrorHandling:
@pytest.mark.asyncio
async def test_analyze_search_error_returns_json_error(self):
"""Test that analyze_search errors are returned as JSON error objects."""
mock_ctx = MagicMock(spec=Context)
with patch("finn_eiendom.mcp_server.analyze_search", new_callable=AsyncMock) as mock:
mock.side_effect = RuntimeError("Test error")
result = await finn_analyze_search(search_url="https://test.com")
result = await finn_analyze_search(search_url="https://test.com", ctx=mock_ctx)
# Should return JSON error object
assert isinstance(result, str)
+17 -7
View File
@@ -38,7 +38,7 @@ async def test_get_or_fetch_ad_fetches_when_cache_miss():
patch("finn_eiendom.service.init_db"),
patch("finn_eiendom.service.get_finn_ad", return_value=None),
patch("finn_eiendom.service.fetch_ad_details", return_value=mock_ad) as mock_fetch,
patch("finn_eiendom.service.save_finn_ad") as mock_save,
patch("finn_eiendom.service.save_finn_ad", return_value=("hash123", True)) as mock_save,
):
result = await get_or_fetch_ad("123")
@@ -56,7 +56,7 @@ async def test_get_or_fetch_ad_force_refresh():
patch("finn_eiendom.service.init_db"),
patch("finn_eiendom.service.get_finn_ad", return_value=mock_ad) as mock_get,
patch("finn_eiendom.service.fetch_ad_details", return_value=mock_ad) as mock_fetch,
patch("finn_eiendom.service.save_finn_ad") as mock_save,
patch("finn_eiendom.service.save_finn_ad", return_value=("hash123", True)) as mock_save,
):
result = await get_or_fetch_ad("123", force_refresh=True)
@@ -92,7 +92,9 @@ async def test_get_or_fetch_eiendom_unit_fetches_when_cache_miss():
patch("finn_eiendom.service.init_db"),
patch("finn_eiendom.service.get_cached_eiendom_unit", return_value=None),
patch("finn_eiendom.service.get_unit", return_value=mock_unit) as mock_fetch,
patch("finn_eiendom.service.save_eiendom_unit") as mock_save,
patch(
"finn_eiendom.service.save_eiendom_unit", return_value=("hash123", True)
) as mock_save,
):
result = await get_or_fetch_eiendom_unit("test-code")
@@ -110,7 +112,9 @@ async def test_get_or_fetch_similar_units_uses_cache():
with (
patch("finn_eiendom.service.init_db"),
patch("finn_eiendom.service.get_or_fetch_eiendom_unit", return_value=mock_unit),
patch("finn_eiendom.service.get_cached_similar_units", return_value=mock_similar) as mock_get,
patch(
"finn_eiendom.service.get_cached_similar_units", return_value=mock_similar
) as mock_get,
patch("finn_eiendom.service.get_similar_units") as mock_fetch,
):
result = await get_or_fetch_similar_units("test-code", "RECENTLY_SOLD")
@@ -133,7 +137,9 @@ async def test_get_or_fetch_similar_units_fetches_when_cache_miss():
patch("finn_eiendom.service.get_cached_similar_units", return_value=[]),
patch("finn_eiendom.service.build_unit_vector", return_value="vector_data"),
patch("finn_eiendom.service.get_similar_units", return_value=mock_similar) as mock_fetch,
patch("finn_eiendom.service.save_similar_units") as mock_save,
patch(
"finn_eiendom.service.save_similar_units", return_value=("hash123", True)
) as mock_save,
):
result = await get_or_fetch_similar_units("test-code", "RECENTLY_SOLD")
@@ -152,10 +158,14 @@ async def test_get_or_fetch_similar_units_force_refresh():
with (
patch("finn_eiendom.service.init_db"),
patch("finn_eiendom.service.get_or_fetch_eiendom_unit", return_value=mock_unit),
patch("finn_eiendom.service.get_cached_similar_units", return_value=mock_similar) as mock_get,
patch(
"finn_eiendom.service.get_cached_similar_units", return_value=mock_similar
) as mock_get,
patch("finn_eiendom.service.build_unit_vector", return_value="vector_data"),
patch("finn_eiendom.service.get_similar_units", return_value=mock_similar) as mock_fetch,
patch("finn_eiendom.service.save_similar_units") as mock_save,
patch(
"finn_eiendom.service.save_similar_units", return_value=("hash123", True)
) as mock_save,
):
result = await get_or_fetch_similar_units("test-code", "RECENTLY_SOLD", force_refresh=True)