feat(refactor): Document refactoring progress and phases in markdown
feat(scripts): Add backfill script for content_hash in cache tables
feat(scripts): Create recompute script for analysis_cache population
test(tests): Implement comprehensive tests for analysis module functions
fix(tests): Update CLI tests to assert errors on stderr instead of stdout
fix(tests): Adjust MCP integration tests to pass context parameter correctly
fix(tests): Modify service tests to return hash on save functions for consistency
This commit is contained in:
@@ -0,0 +1,103 @@
|
||||
#!/usr/bin/env python
|
||||
"""Backfill content_hash for all existing rows in the cache.
|
||||
|
||||
This script computes the SHA-256 hash of stored payloads and updates
|
||||
the content_hash column for any rows where it is NULL.
|
||||
|
||||
Run this once after pulling the refactored code to fix the broken cache.
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import sqlite3
|
||||
from hashlib import sha256
|
||||
from pathlib import Path
|
||||
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def compute_content_hash(payload: dict) -> str:
    """Return the hex SHA-256 digest of *payload* rendered as canonical JSON.

    Keys are sorted so that two dicts with the same content hash identically
    regardless of insertion order. Values the JSON encoder cannot handle
    natively are converted with ``str`` before hashing.
    """
    canonical_bytes = json.dumps(payload, default=str, sort_keys=True).encode()
    hasher = sha256()
    hasher.update(canonical_bytes)
    return hasher.hexdigest()
|
||||
|
||||
|
||||
def backfill_table(conn: sqlite3.Connection, table: str, limit: int | None = None) -> int:
    """Backfill content_hash for all NULL rows in *table*.

    Each selected row's JSON payload is parsed, hashed with
    ``compute_content_hash`` and written back to the ``content_hash`` column.
    Rows whose payload cannot be parsed or hashed are logged and left NULL.
    All successful hashes are written in one batch and committed once.

    Args:
        conn: Open SQLite connection. Database errors raised while selecting
            or updating propagate to the caller; nothing is committed then.
        table: Table to process. It is interpolated into the SQL text as an
            identifier (identifiers cannot be bound as parameters), so it
            must be a trusted, hard-coded name.
        limit: Maximum number of rows to process. ``None`` means no limit;
            ``0`` processes no rows.

    Returns:
        The number of rows updated.
    """
    cursor = conn.cursor()

    # cache_meta stores its JSON in "value"; every other table uses "payload".
    payload_col = "value" if table == "cache_meta" else "payload"

    query = f"SELECT rowid, {payload_col} FROM {table} WHERE content_hash IS NULL"
    params: tuple[int, ...] = ()
    # Compare against None so that an explicit limit of 0 is honoured, and
    # bind the value instead of formatting it into the SQL text.
    if limit is not None:
        query += " LIMIT ?"
        params = (limit,)

    rows = cursor.execute(query, params).fetchall()
    if not rows:
        logger.info(" %s: No rows to backfill", table)
        return 0

    # Only parsing and hashing are best-effort. The UPDATE is kept outside
    # the try so a database failure is not misreported as a hashing failure.
    updates: list[tuple[str, int]] = []
    for rowid, payload_str in rows:
        try:
            content_hash = compute_content_hash(json.loads(payload_str))
        except Exception as exc:
            logger.warning(" %s rowid=%s: Failed to compute hash: %s", table, rowid, exc)
            continue
        updates.append((content_hash, rowid))

    cursor.executemany(
        f"UPDATE {table} SET content_hash = ? WHERE rowid = ?",
        updates,
    )
    conn.commit()

    updated = len(updates)
    logger.info(" %s: Updated %d/%d rows", table, updated, len(rows))
    return updated
|
||||
|
||||
|
||||
def main() -> None:
    """Backfill content_hash in every cache table, then log the coverage.

    Looks for the cache database at ``data/finn.sqlite`` relative to the
    current working directory; logs an error and returns if it is missing.
    The connection is closed even when a step raises.
    """
    db_file = Path("data/finn.sqlite")
    if not db_file.exists():
        logger.error(f"Cache file not found: {db_file}")
        return

    # Same tables, same order, for both the backfill and the verification pass.
    table_names = ["finn_ads", "eiendom_units", "similar_units", "cache_meta"]

    connection = sqlite3.connect(str(db_file))
    try:
        logger.info("Backfilling content_hash for all cache tables...")

        grand_total = 0
        for name in table_names:
            logger.info(f"Processing {name}...")
            grand_total += backfill_table(connection, name)

        logger.info(f"\nBackfill complete. Updated {grand_total} rows total.")

        # Verification: report how many rows per table now carry a hash.
        logger.info("\nVerifying backfill...")
        reader = connection.cursor()
        for table in table_names:
            reader.execute(
                f"SELECT COUNT(*) as total, "
                f" COUNT(CASE WHEN content_hash IS NOT NULL THEN 1 END) as with_hash "
                f"FROM {table}"
            )
            row_count, hashed_count = reader.fetchone()
            if row_count > 0:
                share = hashed_count / row_count * 100
            else:
                share = 0
            logger.info(f" {table}: {hashed_count}/{row_count} rows ({share:.1f}%) have content_hash")
    finally:
        connection.close()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,89 @@
|
||||
#!/usr/bin/env python
|
||||
"""Re-compute and populate analysis_cache for all existing ads.
|
||||
|
||||
This script runs analyze_ad for all ads in the database, populating
|
||||
the analysis_cache table. Call this after backfilling content_hash.
|
||||
|
||||
Run this once after pulling the refactored code to fix the broken cache.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
from pathlib import Path
|
||||
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
async def main() -> None:
    """Recompute analysis for all ads.

    Reads every row of ``finn_ads``, validates its payload as a ``FinnAd``
    and awaits ``analyze_ad`` for it, one ad at a time. A failure on a single
    ad is logged and counted without stopping the run. Afterwards the row
    count of ``analysis_cache`` and the number of ads carrying an
    ``eiendom_unit_code`` are logged. The connection is closed even when a
    query raises.
    """
    from finn_eiendom.analysis import analyze_ad
    from finn_eiendom.cache import init_db
    from finn_eiendom.config import FINN_CACHE_PATH
    from finn_eiendom.models import FinnAd

    # SQL string literals use single quotes. SQLite treats double-quoted
    # tokens as identifiers and only falls back to string literals as a
    # legacy quirk that can be disabled at build time.
    unit_code_count_sql = (
        "SELECT COUNT(*) FROM finn_ads "
        "WHERE json_extract(payload, '$.eiendom_unit_code') IS NOT NULL "
        "AND json_extract(payload, '$.eiendom_unit_code') != 'null'"
    )

    conn = init_db(FINN_CACHE_PATH)
    try:
        cursor = conn.cursor()

        # Get all ads from the database.
        cursor.execute("SELECT finnkode, payload FROM finn_ads ORDER BY finnkode")
        rows = cursor.fetchall()
        total = len(rows)

        # Baseline for the "backfilled" figure reported at the end.
        cursor.execute(unit_code_count_sql)
        unit_codes_before = cursor.fetchone()[0]

        logger.info("Recomputing analysis for %d ads...", total)

        processed = 0
        errors = 0

        for finnkode, payload_str in rows:
            try:
                finn_ad = FinnAd.model_validate(json.loads(payload_str))
                # unit_code may be None; analyze_ad is expected to populate
                # analysis_cache and backfill a missing unit code.
                # NOTE(review): confirm against analyze_ad's implementation.
                await analyze_ad(finn_ad, unit_code=finn_ad.eiendom_unit_code)
            except Exception as exc:
                # Best-effort: one bad ad must not abort the whole run.
                logger.warning("Failed to analyze %s: %s", finnkode, exc)
                errors += 1
                continue

            processed += 1
            if processed % 10 == 0:
                logger.info(" Processed %d/%d...", processed, total)

        # Verify
        cursor.execute("SELECT COUNT(*) FROM analysis_cache")
        cache_count = cursor.fetchone()[0]

        cursor.execute(unit_code_count_sql)
        unit_code_count = cursor.fetchone()[0]

        # Count backfills from the stored payloads. The previous in-loop
        # check compared a value with its own copy and could never fire.
        # Assumes analyze_ad persists new unit codes into finn_ads.payload
        # and that they are visible on this connection - TODO confirm.
        unit_codes_backfilled = unit_code_count - unit_codes_before

        logger.info(
            "\nDone. Processed %d, errors %d, unit_codes backfilled %d",
            processed,
            errors,
            unit_codes_backfilled,
        )
        logger.info("analysis_cache now has %d rows", cache_count)
        logger.info("finn_ads with eiendom_unit_code: %d/%d", unit_code_count, total)
    finally:
        conn.close()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
Reference in New Issue
Block a user