Files
finn-mcp/scripts/backfill_content_hash.py
T
ole 55d93894ac feat(refactor): Document refactoring progress and phases in markdown
feat(scripts): Add backfill script for content_hash in cache tables

feat(scripts): Create recompute script for analysis_cache population

test(tests): Implement comprehensive tests for analysis module functions

fix(tests): Update CLI tests to assert errors on stderr instead of stdout

fix(tests): Adjust MCP integration tests to pass context parameter correctly

fix(tests): Modify service tests to return hash on save functions for consistency
2026-05-29 15:16:57 +00:00

104 lines
3.2 KiB
Python

#!/usr/bin/env python
"""Backfill content_hash for all existing rows in the cache.
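This script computes the SHA-256 hash of each stored JSON payload and writes
it to the content_hash column of every row where that column is NULL.
Run this once after pulling the refactored code to fix the broken cache; the
database is expected at data/finn.sqlite relative to the working directory.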
"""
import json
import logging
import sqlite3
from hashlib import sha256
from pathlib import Path
# Configure the root logger so INFO-level progress messages are emitted.
logging.basicConfig(level=logging.INFO)
# Module-level logger shared by the functions in this script that report progress.
logger = logging.getLogger(__name__)
def compute_content_hash(payload: dict) -> str:
    """Return the hex SHA-256 digest of *payload* serialised as JSON.

    The payload is dumped with sorted keys so that two dicts with the same
    items hash identically regardless of insertion order. Values the JSON
    encoder cannot handle natively are converted with ``str``.
    """
    canonical = json.dumps(payload, default=str, sort_keys=True).encode()
    digest = sha256()
    digest.update(canonical)
    return digest.hexdigest()
def backfill_table(conn: sqlite3.Connection, table: str, limit: int | None = None) -> int:
    """Backfill content_hash for all NULL rows in *table*.

    Args:
        conn: Open SQLite connection. It is committed once, after every
            selected row has been processed.
        table: Name of the table to process. Identifiers cannot be bound as
            SQL parameters, so the name is interpolated into the statement
            text and must come from trusted code, never from user input.
        limit: Maximum number of rows to process, or ``None`` for no limit.
            ``0`` selects no rows (it is no longer treated as "no limit").

    Returns:
        The number of rows updated.

    Raises:
        sqlite3.Error: If the SELECT, an UPDATE, or the commit fails. Database
            errors are deliberately not swallowed; only rows whose payload
            cannot be parsed or hashed are skipped with a warning.
    """
    # cache_meta keeps its JSON in "value"; every other table uses "payload".
    payload_col = "value" if table == "cache_meta" else "payload"

    query = f"SELECT rowid, {payload_col} FROM {table} WHERE content_hash IS NULL"
    params: tuple[int, ...] = ()
    # Compare against None so that an explicit limit of 0 is honoured, and
    # bind the value instead of formatting it into the SQL text.
    if limit is not None:
        query += " LIMIT ?"
        params = (limit,)

    cursor = conn.cursor()
    # Materialise the result first: the same cursor is reused for the UPDATEs.
    rows = cursor.execute(query, params).fetchall()
    if not rows:
        logger.info(f" {table}: No rows to backfill")
        return 0

    update_sql = f"UPDATE {table} SET content_hash = ? WHERE rowid = ?"
    updated = 0
    for rowid, payload_str in rows:
        # Only the parse/hash step is best-effort; a bad payload skips the row.
        try:
            content_hash = compute_content_hash(json.loads(payload_str))
        except (TypeError, ValueError, RecursionError) as exc:
            logger.warning(f" {table} rowid={rowid}: Failed to compute hash: {exc}")
            continue
        cursor.execute(update_sql, (content_hash, rowid))
        updated += 1

    conn.commit()
    logger.info(f" {table}: Updated {updated}/{len(rows)} rows")
    return updated
def main() -> None:
    """Run the backfill over every cache table, then report hash coverage.

    Opens ``data/finn.sqlite`` relative to the current working directory. If
    the file does not exist, an error is logged and the function returns
    without touching anything. The connection is always closed on exit.
    """
    db_file = Path("data/finn.sqlite")
    if not db_file.exists():
        logger.error(f"Cache file not found: {db_file}")
        return

    # One list drives both passes so they always cover the same tables.
    tables = ["finn_ads", "eiendom_units", "similar_units", "cache_meta"]

    conn = sqlite3.connect(str(db_file))
    try:
        # Pass 1: fill in the missing hashes, table by table.
        logger.info("Backfilling content_hash for all cache tables...")
        total_updated = 0
        for name in tables:
            logger.info(f"Processing {name}...")
            total_updated += backfill_table(conn, name)
        logger.info(f"\nBackfill complete. Updated {total_updated} rows total.")

        # Pass 2: count how many rows per table now carry a hash.
        logger.info("\nVerifying backfill...")
        cur = conn.cursor()
        for name in tables:
            cur.execute(
                f"SELECT COUNT(*) as total, "
                f" COUNT(CASE WHEN content_hash IS NOT NULL THEN 1 END) as with_hash "
                f"FROM {name}"
            )
            total, with_hash = cur.fetchone()
            # Guard against division by zero for empty tables.
            if total > 0:
                pct = with_hash / total * 100
            else:
                pct = 0
            logger.info(f" {name}: {with_hash}/{total} rows ({pct:.1f}%) have content_hash")
    finally:
        conn.close()
# Run the backfill only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()