#!/usr/bin/env python """Backfill content_hash for all existing rows in the cache. This script computes the SHA-256 hash of stored payloads and updates the content_hash column for any rows where it is NULL. Run this once after pulling the refactored code to fix the broken cache. """ import json import logging import sqlite3 from hashlib import sha256 from pathlib import Path logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) def compute_content_hash(payload: dict) -> str: """Compute SHA-256 hash of JSON payload.""" serialised = json.dumps(payload, sort_keys=True, default=str) return sha256(serialised.encode()).hexdigest() def backfill_table(conn: sqlite3.Connection, table: str, limit: int | None = None) -> int: """Backfill content_hash for all NULL rows in *table*. Returns the number of rows updated. """ cursor = conn.cursor() # Determine which column contains the payload payload_col = "value" if table == "cache_meta" else "payload" # Get all rows with NULL content_hash query = f"SELECT rowid, {payload_col} FROM {table} WHERE content_hash IS NULL" if limit: query += f" LIMIT {limit}" cursor.execute(query) rows = cursor.fetchall() if not rows: logger.info(f" {table}: No rows to backfill") return 0 updated = 0 for rowid, payload_str in rows: try: payload = json.loads(payload_str) content_hash = compute_content_hash(payload) cursor.execute( f"UPDATE {table} SET content_hash = ? WHERE rowid = ?", (content_hash, rowid), ) updated += 1 except Exception as exc: logger.warning(f" {table} rowid={rowid}: Failed to compute hash: {exc}") conn.commit() logger.info(f" {table}: Updated {updated}/{len(rows)} rows") return updated def main() -> None: """Backfill all cache tables.""" cache_path = Path("data/finn.sqlite") if not cache_path.exists(): logger.error(f"Cache file not found: {cache_path}") return conn = sqlite3.connect(str(cache_path)) try: logger.info("Backfilling content_hash for all cache tables...") total_updated = 0 for table in ["finn_ads", "eiendom_units", "similar_units", "cache_meta"]: logger.info(f"Processing {table}...") updated = backfill_table(conn, table) total_updated += updated logger.info(f"\nBackfill complete. Updated {total_updated} rows total.") # Verify logger.info("\nVerifying backfill...") cursor = conn.cursor() for table in ["finn_ads", "eiendom_units", "similar_units", "cache_meta"]: cursor.execute( f"SELECT COUNT(*) as total, " f" COUNT(CASE WHEN content_hash IS NOT NULL THEN 1 END) as with_hash " f"FROM {table}" ) total, with_hash = cursor.fetchone() pct = (with_hash / total * 100) if total > 0 else 0 logger.info(f" {table}: {with_hash}/{total} rows ({pct:.1f}%) have content_hash") finally: conn.close() if __name__ == "__main__": main()