feat(refactor): Document refactoring progress and phases in markdown

feat(scripts): Add backfill script for content_hash in cache tables

feat(scripts): Create recompute script for analysis_cache population

test(tests): Implement comprehensive tests for analysis module functions

fix(tests): Update CLI tests to assert errors on stderr instead of stdout

fix(tests): Adjust MCP integration tests to pass context parameter correctly

fix(tests): Modify service tests to return hash on save functions for consistency
This commit is contained in:
Ole
2026-05-29 15:16:57 +00:00
parent 5b772b2ae5
commit 55d93894ac
18 changed files with 1457 additions and 60 deletions
+103
View File
@@ -0,0 +1,103 @@
#!/usr/bin/env python
"""Backfill content_hash for all existing rows in the cache.
This script computes the SHA-256 hash of stored payloads and updates
the content_hash column for any rows where it is NULL.
Run this once after pulling the refactored code to fix the broken cache.
"""
import json
import logging
import sqlite3
from hashlib import sha256
from pathlib import Path
# Configure logging at INFO level so the progress messages below are emitted
# when the script is run directly.
logging.basicConfig(level=logging.INFO)
# Module-level logger shared by every function in this script.
logger = logging.getLogger(__name__)
def compute_content_hash(payload: dict) -> str:
    """Return the hex SHA-256 digest of *payload* rendered as JSON.

    The payload is dumped with sorted keys, so two dicts with the same
    contents produce the same digest regardless of key order. Values that
    the JSON encoder cannot handle natively are converted with ``str``.
    """
    canonical_json = json.dumps(payload, sort_keys=True, default=str)
    digest = sha256(canonical_json.encode())
    return digest.hexdigest()
def backfill_table(conn: sqlite3.Connection, table: str, limit: int | None = None) -> int:
    """Backfill content_hash for all NULL rows in *table*.

    Args:
        conn: Open SQLite connection. The updates are committed on it before
            this function returns.
        table: Name of the table to process. It is interpolated into the SQL
            text (identifiers cannot be bound as parameters), so only pass
            trusted, hard-coded table names.
        limit: Maximum number of NULL-hash rows to process. ``None`` means
            no limit; ``0`` processes no rows.

    Returns the number of rows updated.
    """
    cursor = conn.cursor()

    # cache_meta keeps its JSON in "value"; every other table uses "payload".
    payload_col = "value" if table == "cache_meta" else "payload"

    query = f"SELECT rowid, {payload_col} FROM {table} WHERE content_hash IS NULL"
    params: tuple[int, ...] = ()
    # Compare against None rather than truthiness so that limit=0 is honoured,
    # and bind the value instead of splicing it into the SQL text.
    if limit is not None:
        query += " LIMIT ?"
        params = (limit,)

    rows = cursor.execute(query, params).fetchall()
    if not rows:
        logger.info(" %s: No rows to backfill", table)
        return 0

    update_sql = f"UPDATE {table} SET content_hash = ? WHERE rowid = ?"
    updated = 0
    for rowid, payload_str in rows:
        try:
            content_hash = compute_content_hash(json.loads(payload_str))
            cursor.execute(update_sql, (content_hash, rowid))
        except Exception as exc:
            # Best effort by design: one unreadable row (NULL or malformed
            # JSON, failed update) must not abort the rest of the backfill.
            logger.warning(" %s rowid=%s: Failed to compute hash: %s", table, rowid, exc)
        else:
            updated += 1

    conn.commit()
    logger.info(" %s: Updated %s/%s rows", table, updated, len(rows))
    return updated
def main() -> None:
    """Backfill all cache tables, then log how many rows now carry a hash.

    Raises:
        SystemExit: With status 1 when the cache database file is missing, so
            that shells and automation see the failure instead of a
            successful exit.
    """
    # One list drives both the backfill and the verification pass so the two
    # can never drift apart.
    tables = ("finn_ads", "eiendom_units", "similar_units", "cache_meta")

    # Relative path: resolved against the current working directory.
    cache_path = Path("data/finn.sqlite")
    if not cache_path.exists():
        logger.error("Cache file not found: %s", cache_path)
        raise SystemExit(1)

    conn = sqlite3.connect(str(cache_path))
    try:
        logger.info("Backfilling content_hash for all cache tables...")
        total_updated = 0
        for table in tables:
            logger.info("Processing %s...", table)
            total_updated += backfill_table(conn, table)
        logger.info("\nBackfill complete. Updated %s rows total.", total_updated)

        # Verify: report, per table, how many rows have a non-NULL hash.
        logger.info("\nVerifying backfill...")
        cursor = conn.cursor()
        for table in tables:
            cursor.execute(
                f"SELECT COUNT(*) as total, "
                f" COUNT(CASE WHEN content_hash IS NOT NULL THEN 1 END) as with_hash "
                f"FROM {table}"
            )
            total, with_hash = cursor.fetchone()
            # Guard against division by zero for empty tables.
            pct = (with_hash / total * 100) if total > 0 else 0
            logger.info(
                " %s: %s/%s rows (%.1f%%) have content_hash",
                table,
                with_hash,
                total,
                pct,
            )
    finally:
        # Always release the database handle, even if a table is missing.
        conn.close()
# Run the backfill only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
+89
View File
@@ -0,0 +1,89 @@
#!/usr/bin/env python
"""Re-compute and populate analysis_cache for all existing ads.
This script runs analyze_ad for all ads in the database, populating
the analysis_cache table. Call this after backfilling content_hash.
Run this once after pulling the refactored code to fix the broken cache.
"""
import asyncio
import json
import logging
from pathlib import Path
# Configure logging at INFO level so the progress messages below are emitted
# when the script is run directly.
logging.basicConfig(level=logging.INFO)
# Module-level logger used by main() in this script.
logger = logging.getLogger(__name__)
async def main() -> None:
    """Recompute analysis for all ads.

    Reads every row of ``finn_ads``, validates its JSON payload as a
    ``FinnAd`` and awaits ``analyze_ad`` for it. A failure on one ad is
    logged and counted, and processing continues with the next ad. After the
    loop the row count of ``analysis_cache`` and the number of ads whose
    stored payload carries an ``eiendom_unit_code`` are logged.

    The "unit_codes backfilled" figure is the growth in that persisted
    unit-code count over the run.
    NOTE(review): this assumes analyze_ad writes a backfilled unit code back
    into finn_ads.payload — confirm against finn_eiendom.analysis.
    """
    from finn_eiendom.analysis import analyze_ad
    from finn_eiendom.cache import init_db
    from finn_eiendom.config import FINN_CACHE_PATH
    from finn_eiendom.models import FinnAd

    # String literals are single-quoted: SQLite parses double-quoted tokens as
    # identifiers and only falls back to strings through a deprecated quirk
    # that some builds disable.
    unit_code_query = (
        "SELECT COUNT(*) FROM finn_ads "
        "WHERE json_extract(payload, '$.eiendom_unit_code') IS NOT NULL "
        "AND json_extract(payload, '$.eiendom_unit_code') != 'null'"
    )

    conn = init_db(FINN_CACHE_PATH)
    try:
        cursor = conn.cursor()

        # Get all ads from the database
        cursor.execute("SELECT finnkode, payload FROM finn_ads ORDER BY finnkode")
        rows = cursor.fetchall()
        total = len(rows)
        logger.info("Recomputing analysis for %s ads...", total)

        processed = 0
        # Nothing is skipped at present; kept so the summary line stays stable.
        skipped = 0
        errors = 0
        # Ads whose stored payload already had a unit code before this run,
        # counted with the same rule as unit_code_query.
        unit_codes_before = 0

        for finnkode, payload_str in rows:
            try:
                payload = json.loads(payload_str)
                existing = payload.get("eiendom_unit_code") if isinstance(payload, dict) else None
                if existing is not None and existing != "null":
                    unit_codes_before += 1

                finn_ad = FinnAd.model_validate(payload)
                # unit_code comes from the payload and may be None; analyze_ad
                # is expected to populate analysis_cache for this ad.
                await analyze_ad(finn_ad, unit_code=finn_ad.eiendom_unit_code)

                processed += 1
                if processed % 10 == 0:
                    logger.info(" Processed %s/%s...", processed, total)
            except Exception as exc:
                # Best effort by design: one bad ad must not stop the run.
                logger.warning("Failed to analyze %s: %s", finnkode, exc)
                errors += 1

        # Read the persisted count before the summary so the backfill figure
        # reflects what actually changed in the database.
        cursor.execute(unit_code_query)
        unit_code_count = cursor.fetchone()[0]
        unit_codes_backfilled = max(unit_code_count - unit_codes_before, 0)

        logger.info(
            "\nDone. Processed %s, skipped %s, errors %s, unit_codes backfilled %s",
            processed,
            skipped,
            errors,
            unit_codes_backfilled,
        )

        # Verify
        cursor.execute("SELECT COUNT(*) FROM analysis_cache")
        cache_count = cursor.fetchone()[0]
        logger.info("analysis_cache now has %s rows", cache_count)
        logger.info("finn_ads with eiendom_unit_code: %s/%s", unit_code_count, total)
    finally:
        # Release the connection even if a query or the loop setup fails.
        conn.close()
# Start the event loop and run the recompute only when executed as a script.
if __name__ == "__main__":
    asyncio.run(main())