From 1399f61c1a88824c69c6745ec95f6c499316de31 Mon Sep 17 00:00:00 2001 From: Ole Date: Sat, 16 May 2026 06:54:17 +0000 Subject: [PATCH] initial --- .env.example | 18 + .github/copilot-instructions.md | 181 ++ .../instructions/clean-code.instructions.md | 150 ++ .github/instructions/cli.instructions.md | 158 ++ .github/instructions/docs.instructions.md | 118 ++ .github/instructions/mcp.instructions.md | 192 ++ .github/instructions/python.instructions.md | 80 + .github/instructions/tests.instructions.md | 199 +++ .gitignore | 33 + .vscode/extensions.json | 10 + .vscode/mcp.json | 8 + .vscode/settings.json | 23 + AGENTS.md | 178 ++ IMPLEMENTATION.md | 384 ++++ Makefile | 47 + PRD.md | 1556 +++++++++++++++++ PROJECT.md | 162 ++ README.md | 160 ++ USAGE.md | 503 ++++++ finn_eiendom/__init__.py | 36 + finn_eiendom/ad.py | 193 ++ finn_eiendom/analysis.py | 175 ++ finn_eiendom/cache.py | 243 +++ finn_eiendom/config.py | 30 + finn_eiendom/eiendom_no.py | 236 +++ finn_eiendom/http.py | 122 ++ finn_eiendom/mcp_server.py | 160 ++ finn_eiendom/models.py | 128 ++ finn_eiendom/parser.py | 88 + finn_eiendom/scoring.py | 146 ++ finn_eiendom/search.py | 194 ++ finn_eiendom/service.py | 35 + pyproject.toml | 49 + tests/__init__.py | 1 + tests/fixtures.py | 236 +++ tests/test_ad.py | 45 + tests/test_cache.py | 71 + tests/test_eiendom_no.py | 44 + tests/test_http.py | 83 + tests/test_mcp_server.py | 69 + tests/test_parser.py | 45 + tests/test_scoring.py | 22 + tests/test_search.py | 38 + tests/test_service.py | 97 + 44 files changed, 6746 insertions(+) create mode 100644 .env.example create mode 100644 .github/copilot-instructions.md create mode 100644 .github/instructions/clean-code.instructions.md create mode 100644 .github/instructions/cli.instructions.md create mode 100644 .github/instructions/docs.instructions.md create mode 100644 .github/instructions/mcp.instructions.md create mode 100644 .github/instructions/python.instructions.md create mode 100644 
.github/instructions/tests.instructions.md create mode 100644 .gitignore create mode 100644 .vscode/extensions.json create mode 100644 .vscode/mcp.json create mode 100644 .vscode/settings.json create mode 100644 AGENTS.md create mode 100644 IMPLEMENTATION.md create mode 100644 Makefile create mode 100644 PRD.md create mode 100644 PROJECT.md create mode 100644 README.md create mode 100644 USAGE.md create mode 100644 finn_eiendom/__init__.py create mode 100644 finn_eiendom/ad.py create mode 100644 finn_eiendom/analysis.py create mode 100644 finn_eiendom/cache.py create mode 100644 finn_eiendom/config.py create mode 100644 finn_eiendom/eiendom_no.py create mode 100644 finn_eiendom/http.py create mode 100644 finn_eiendom/mcp_server.py create mode 100644 finn_eiendom/models.py create mode 100644 finn_eiendom/parser.py create mode 100644 finn_eiendom/scoring.py create mode 100644 finn_eiendom/search.py create mode 100644 finn_eiendom/service.py create mode 100644 pyproject.toml create mode 100644 tests/__init__.py create mode 100644 tests/fixtures.py create mode 100644 tests/test_ad.py create mode 100644 tests/test_cache.py create mode 100644 tests/test_eiendom_no.py create mode 100644 tests/test_http.py create mode 100644 tests/test_mcp_server.py create mode 100644 tests/test_parser.py create mode 100644 tests/test_scoring.py create mode 100644 tests/test_search.py create mode 100644 tests/test_service.py diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..bf75d09 --- /dev/null +++ b/.env.example @@ -0,0 +1,18 @@ +FINN_CACHE_PATH=/data/finn.sqlite +FINN_MAX_SEARCH_PAGES=3 +FINN_DETAIL_LIMIT=20 +FINN_REQUEST_DELAY_SECONDS=2 +FINN_CACHE_TTL_SEARCH_MINUTES=60 +FINN_CACHE_TTL_AD_HOURS=24 +FINN_USER_AGENT=personal-finn-eiendom-analyzer/0.1 + +EIENDOM_NO_ENABLED=true +EIENDOM_NO_BASE_URL=https://api.eiendom.no/api/v1 +EIENDOM_NO_CACHE_TTL_HOURS=24 +EIENDOM_NO_REQUEST_DELAY_SECONDS=1 +EIENDOM_NO_SIMILAR_UNITS_ENABLED=true 
+EIENDOM_NO_SIMILAR_UNITS_DEFAULT_STATUS=RECENTLY_SOLD + +LOG_LEVEL=DEBUG +MCP_HOST=0.0.0.0 +MCP_PORT=8000 \ No newline at end of file diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md new file mode 100644 index 0000000..9ca9de4 --- /dev/null +++ b/.github/copilot-instructions.md @@ -0,0 +1,181 @@ +# Copilot instructions for finn-eiendom-mcp + +This project is a private, self-hosted Python platform for analyzing FINN real-estate listings. It exposes the same code through three coordinated front ends: + +1. A **Python library** (`finn_eiendom`) — source of truth. +2. An **MCP server** (FastMCP, stdio + optional HTTP) over `finn_eiendom/mcp_server.py`. +3. A **CLI** (`finn-eiendom`) over `finn_eiendom/cli.py`. + +All three share the same `service.py`, `formatting.py`, `cache.py`, and `models.py`. Code lives in exactly one place and is called from both front ends. See `PRD.md` §17 for the full ownership rules — that section is the constitution. + +--- + +## Source of truth + +Read in this order: + +1. `PRD.md` — product and architecture, especially §17. +2. `PROJECT.md` — module map. +3. `AGENTS.md` — workflow. +4. `.github/instructions/*.md` — per-topic rules. 
+ +--- + +## Module layout + +``` +finn_eiendom/ + config.py # env vars, defaults, TTLs + models.py # Pydantic v2 models + parser.py # number/area/date/URL/finnkode normalization + http.py # async HTTP (httpx) with delay + retry + user-agent + cache.py # SQLite (sqlite3) schema + persistence + search.py # FINN search HTML parsing + pagination + ad.py # FINN listing HTML parsing + eiendom_no.py # Eiendom.no unit search/detail, unit_vector, similar-units + scoring.py # score model + classifications + feedback.py # verdicts + soft preference signal + analysis.py # orchestration + shortlist + summary + service.py # get_or_fetch_* + thin facade for MCP and CLI + formatting.py # render_* helpers shared by MCP and CLI + mcp_server.py # FastMCP wrappers around service.py + cli.py # typer-based CLI wrappers around service.py + __main__.py # python -m finn_eiendom → CLI entry +``` + +--- + +## The five hard rules + +Enforced by `tests/test_architecture.py`: + +1. **`mcp_server.py` and `cli.py` are siblings.** They never import from each other. Both import only from `service`, `formatting`, `models`, `config`, stdlib, and their own framework (`mcp` / `typer`). +2. **`service.py` is the only orchestrator.** Nothing above it touches HTTP or SQLite directly. +3. **`httpx` lives only in `http.py`.** +4. **`sqlite3` lives only in `cache.py`.** +5. **Output formatting lives only in `formatting.py`.** Never inline in CLI or MCP tool bodies. + +--- + +## Development workflow — local venv + +Default runtime is a project-local virtualenv. Docker is supported for packaging but optional for development. + +```bash +uv venv # or: python3.12 -m venv .venv +source .venv/bin/activate +uv pip install -e ".[dev]" # or: pip install -e ".[dev]" + +# from now on: +pytest +ruff check . +ruff format . +mypy finn_eiendom +finn-eiendom --help +finn-eiendom-mcp # stdio MCP server +``` + +**Never** install packages globally. **Never** add a dependency without updating `pyproject.toml`. 
+ +--- + +## Coding rules + +* Python 3.12+. +* Pydantic v2 with `model_config = ConfigDict(...)`. No v1 `class Config:` blocks. +* Type hints on every function signature. +* Async I/O for all network and DB code paths through `service.py`. +* Dependency injection for HTTP/cache clients in tests. +* Small, focused functions. One job per function. See `clean-code.instructions.md`. +* Errors raise with actionable messages; the MCP boundary translates them to `{"error": True, "code": ..., "message": ...}`. +* stdio MCP servers log to **stderr only**. + +--- + +## Code ownership — the short version + +| Concern | Lives in | +| -------------------------------------- | ------------------------------ | +| FINN search HTML parsing | `search.py` | +| FINN listing HTML parsing | `ad.py` | +| Norwegian number / area / URL regexes | `parser.py` | +| HTTP fetching + retry + delay | `http.py` | +| SQLite reads / writes | `cache.py` | +| Eiendom.no unit search/detail/comps | `eiendom_no.py` | +| `unit_vector` encode/decode (msgpack) | `eiendom_no.py` | +| Scoring + classification | `scoring.py` | +| Feedback storage | `feedback.py` | +| Cache-aware orchestration | `service.py` (`get_or_fetch_*`)| +| Shortlist + summary assembly | `analysis.py` | +| End-to-end runs | `service.py` (`analyze_search`)| +| MCP tool definitions | `mcp_server.py` | +| CLI command definitions | `cli.py` | +| Output rendering | `formatting.py` | +| Env-var defaults | `config.py` | +| Pydantic models | `models.py` | + +Full table with "never lives in" column is in `PRD.md` §17.2. + +--- + +## Adding a feature + +1. Decide the home using the table above (and `PRD.md` §17.2). +2. Implement in `service.py` (or `analysis.py` if pure orchestration). +3. Add a service-level test. +4. Add a thin MCP tool — `response_format`-aware. +5. Add a thin CLI command — `--format`-aware. +6. Add a renderer in `formatting.py`. +7. Test MCP and CLI registration. +8. Update PRD and instruction docs. 
+ +If the MCP tool body or CLI command body grows past ~20 lines, push logic down to `service.py`. + +--- + +## Documentation lookups — use context7 + +When uncertain about an external library API (FastMCP, Pydantic v2, Typer, httpx, msgpack, pytest-asyncio, respx, BeautifulSoup), call the **`context7` MCP server** *before* writing code. Don't rely on training-data memory. + +``` +context7:resolve-library-id → library_id +context7:query-docs(library_id, topic) → authoritative snippets +``` + +Details in `.github/instructions/docs.instructions.md`. + +--- + +## Clean code is a hard requirement + +See `clean-code.instructions.md`. DRY, single-responsibility, descriptive names, type hints, no dead code, comments explain why not what. If duplication slips in, the right answer is to extract it — not to copy the second instance. + +--- + +## Product behavior + +The MVP does one thing well: + +``` +FINN search URL in + → relevant property candidates out + → enriched with Eiendom.no estimates + → similar-units / comps + → explanations + → risks + → next steps + → broker questions +``` + +Always explain: + +* why a property is interesting, +* price vs estimate, +* price vs comparable sales, +* renovation upside, +* hybel / rental potential, +* technical / legal risks, +* uncertainty / confidence, +* next questions for the broker. + +Scores and estimates are decision support, not advice. Surface uncertainty, never hide it. \ No newline at end of file diff --git a/.github/instructions/clean-code.instructions.md b/.github/instructions/clean-code.instructions.md new file mode 100644 index 0000000..1ba8f38 --- /dev/null +++ b/.github/instructions/clean-code.instructions.md @@ -0,0 +1,150 @@ +--- +name: Clean code rules +description: Best-practice standards for all production and test code +applyTo: "**/*.py" +--- + +# Clean code rules + +These rules apply everywhere — every module, every function, every test. They are intentionally opinionated. 
If a rule conflicts with the architecture rules in `PRD.md` §17, the architecture rules win. If it conflicts with another best practice here, pick the one that produces the simpler, more readable result. + +## Single responsibility + +* One job per function. If a function name needs "and" to describe it, it's two functions. +* One job per module. `parser.py` parses. `cache.py` caches. `formatting.py` formats. Don't mix. +* One job per class. We rarely need classes outside Pydantic models, dataclasses, and the `HTTPClient`. Avoid OO for OO's sake. + +## Function size + +* Aim for under **30 lines** of body. +* Past **50 lines** it's a code smell — extract helpers. +* If you've got more than **3 levels of nesting**, the function wants splitting (extract the inner block into a helper named after what it does). + +## Naming + +* Names describe **intent**, not implementation. `get_or_fetch_ad`, not `process_ad`. `render_shortlist_markdown`, not `format2`. +* Verbs for actions (`fetch_`, `parse_`, `score_`, `render_`). +* Nouns for data (`FinnAd`, `EiendomUnit`, `shortlist`). +* Boolean variables / parameters read as predicates: `force_refresh`, `include_eiendom_no`, `is_recently_sold`. Not `flag`, not `do_thing`. +* Avoid abbreviations except those well-established in the domain (`url`, `ad`, `nok`, `bra`, `sqm`). +* Norwegian terms stay Norwegian when they're domain vocabulary (`hybel`, `fellesgjeld`, `finnkode`). Don't translate `finnkode` to `finn_code` — it's a proper noun. + +## Type hints + +Required on every function signature, including private helpers. Mypy in strict mode is the goal. + +```python +# ❌ +def parse(html, base_url=None): + ... + +# ✅ +def parse(html: str, base_url: str | None = None) -> FinnAd | None: + ... +``` + +Use modern syntax: `X | None` over `Optional[X]`, `list[int]` over `List[int]`, `dict[str, Any]` over `Dict[str, Any]`. + +## Comments + +* Comments explain **WHY**, never **WHAT**. The code already says what. 
+* If a comment is needed to explain *what* a line does, the line wants renaming or extracting. +* Use docstrings for public functions, classes, and modules. One-line summary, blank line, optional details and examples. +* No commented-out code. Delete it. Git remembers. +* No `# TODO` without a date or issue reference. `# TODO(2026-05): replace once Eiendom.no confirms ...` is fine. + +## DRY — Don't Repeat Yourself + +If you write the same logic, regex, SQL, or format string **twice**, extract it. The decision table in `PRD.md` §17.2 tells you where it belongs. + +The pre-merge anti-duplication checklist (from `PRD.md` §17.4): + +1. Is this logic already implemented somewhere? (`grep` the function name and obvious keywords.) +2. If I'm copy-pasting from another file, am I about to duplicate behavior that should live in one shared function? +3. Can a new caller use an existing `service.py` function instead of writing its own orchestration? +4. Is the same Pydantic field defined in two models? Factor out a base model. +5. Am I formatting output in two places (CLI + MCP)? Move it to `formatting.py`. +6. Am I opening a SQLite connection outside `cache.py`? Move it. +7. Am I building an httpx call outside `http.py`? Move it. +8. Am I writing a Norwegian-number / area / finnkode regex outside `parser.py`? Move it. +9. Am I adding an env-var lookup outside `config.py`? Move it. +10. Did I add a new behavior with only one front end (MCP or CLI)? If it should exist in both, the service function is missing. + +A small amount of duplication is acceptable to keep boundaries clean — see `PRD.md` §17.8. Past a handful of lines, extract. + +## Errors + +* **Fail loudly** with actionable messages. + + ```python + # ❌ + raise ValueError("bad input") + + # ✅ + raise ValueError(f"Unknown listing_status {status!r}; expected one of {VALID_LISTING_STATUSES}") + ``` + +* **No silent failures.** `except Exception: pass` is forbidden. 
Catch the specific exception, log it, and either recover or re-raise. + +* **Service raises; MCP wraps.** Service functions raise normal exceptions. The MCP tool boundary translates them into `{"error": True, "code": ..., "message": ...}`. CLI lets typer handle non-zero exits. + +* **Graceful degradation is explicit.** If Eiendom.no enrichment fails, return a result with `eiendom_unit=None` and a warning, not a silently-missing field. + +## State + +* No global mutable state. The only module-level constants allowed are configuration values loaded from env in `config.py`. +* No module-level caches (dicts, lists) that mutate. Use `cache.py` if you need persistence. +* Pass dependencies in (HTTP clients, DB connections) for testability. + +## Dead code + +* No commented-out code. +* No unused imports (ruff catches these — fix them, don't add `# noqa`). +* No unused parameters (use `_` or remove). +* No `if False:` blocks "for later". +* Functions and classes that aren't called anywhere — delete them. Git keeps history. + +## Magic numbers and strings + +Anything that influences behavior and isn't self-explanatory belongs in `config.py` (env-controlled) or as a named module-level constant near the top of the file. + +```python +# ❌ +if days > 90: + confidence = "low" + +# ✅ +COMPS_STALE_AFTER_DAYS = 90 + +if days > COMPS_STALE_AFTER_DAYS: + confidence = "low" +``` + +URLs, timeouts, retries, TTLs, status codes — never inline. + +## Imports + +* Standard library first, third-party second, local last, separated by blank lines. +* Ruff's `I` rules sort and group these — run `ruff check . --fix`. +* No wildcard imports. +* No relative imports above one level (`from ..thing import x` is a smell; refactor). +* Each module's allowed import set is enforced by `tests/test_architecture.py`. + +## Tests are first-class code + +Same rules. Same type hints. Same naming. Same DRY. If a fixture is used in three test files, it goes in `conftest.py`. 
If three tests share a setup, factor it into a fixture. + +## Reviewing your own change before commit + +A 60-second self-review: + +1. Did I add a function that already exists somewhere? (`grep` it.) +2. Did I bypass `service.py`, `http.py`, `cache.py`, or `formatting.py`? +3. Is everything typed? +4. Did I leave a `print()`, `breakpoint()`, or commented-out block behind? +5. Does the test for this change actually fail without the change? +6. Did I update `PRD.md` or the relevant instruction file if I changed an architectural rule? + +## When in doubt about a library API + +Use the `context7` MCP server instead of guessing. See `docs.instructions.md`. Training-data memory of `pydantic.field_validator`, `typer.Option`, `mcp.tool` annotations, or `httpx.AsyncClient` is unreliable — they all change between versions. \ No newline at end of file diff --git a/.github/instructions/cli.instructions.md b/.github/instructions/cli.instructions.md new file mode 100644 index 0000000..4f2692b --- /dev/null +++ b/.github/instructions/cli.instructions.md @@ -0,0 +1,158 @@ +--- +name: CLI rules +description: Rules for the typer-based finn-eiendom CLI +applyTo: "finn_eiendom/cli.py,finn_eiendom/__main__.py" +--- + +# CLI rules + +The CLI is a **thin wrapper** over `service.py`. It is a sibling of `mcp_server.py` — they never call each other and they share the same underlying service functions. Every CLI command maps 1:1 to a service function with the same parameters and defaults. + +## Framework + +Built with [`typer`](https://typer.tiangolo.com/). One `typer.Typer` app: + +```python +# finn_eiendom/cli.py +import asyncio, typer +from . 
import service, formatting + +app = typer.Typer(no_args_is_help=True, add_completion=False) +``` + +Entry points in `pyproject.toml`: + +```toml +[project.scripts] +finn-eiendom-mcp = "finn_eiendom.mcp_server:main" +finn-eiendom = "finn_eiendom.cli:app" +``` + +Plus `finn_eiendom/__main__.py`: + +```python +from .cli import app + +if __name__ == "__main__": + app() +``` + +So `python -m finn_eiendom ...` works without installation. + +## Command body shape + +```python +@app.command() +def analyze_search( + url: str, + max_pages: int = 3, + detail_limit: int = 20, + no_details: bool = typer.Option(False, "--no-details"), + no_eiendom: bool = typer.Option(False, "--no-eiendom"), + with_similar: bool = typer.Option(False, "--with-similar"), + format: str = typer.Option("json", "--format"), +) -> None: + """Analyze a FINN search URL and return a ranked shortlist.""" + result = asyncio.run(service.analyze_search( + search_url=url, + max_pages=max_pages, + detail_limit=detail_limit, + include_details=not no_details, + include_eiendom_no=not no_eiendom, + include_similar_units_for_shortlist=with_similar, + )) + typer.echo(formatting.render_shortlist(result, format)) +``` + +Rules: + +* The command body has at most three sections: option parsing (handled by typer), one `service.<function>` call, one `typer.echo(formatting.render_<thing>(result, format))`. +* If the body has more than ~20 lines, the logic belongs in `service.py`. +* No `print()` — use `typer.echo()` for stdout, `typer.echo(..., err=True)` for stderr. +* No business logic, no rendering, no SQLite, no HTTP, no parsing. + +## Formats + +Every command that produces structured output accepts `--format`: + +* `--format json` (default) — full structured output, pipeable into `jq`. +* `--format markdown` — human-readable. +* `--format table` — terminal table (only where it makes sense: `analyze-search`, `compare`, `shortlist`, `diff`). + +All three render paths are produced by `formatting.py`. Never format inline in `cli.py`.
Unsupported values raise `ValueError` with a list of supported formats — typer surfaces this as a non-zero exit. + +## Commands + +```text +finn-eiendom analyze-search [--max-pages 3] [--detail-limit 20] [--no-details] [--no-eiendom] [--with-similar] [--format ...] +finn-eiendom get-ad [--force-refresh] [--no-eiendom] [--with-similar] [--format ...] +finn-eiendom compare [--no-eiendom] [--no-comps] [--format ...] +finn-eiendom save-feedback [--notes "..."] +finn-eiendom shortlist [--run-id ID] [--limit 10] [--format ...] +finn-eiendom diff [--format ...] +finn-eiendom resolve-unit +finn-eiendom get-unit [--force-refresh] +finn-eiendom enrich-ad [--with-similar] +finn-eiendom build-vector +finn-eiendom decode-vector +finn-eiendom similar-units [--status RECENTLY_SOLD|FOR_SALE|CURRENT] +finn-eiendom similar-to-liked [--mode recommendations|comps] [--status ...] +finn-eiendom analyze-against-comps +finn-eiendom cache stats | clear | clear-html | clear-json +finn-eiendom serve [--transport stdio|http] [--host 127.0.0.1] [--port 8010] +finn-eiendom config show | path +finn-eiendom doctor +finn-eiendom version +``` + +Sub-command groups (`cache`, `config`) use `typer.Typer` sub-apps: + +```python +cache_app = typer.Typer(help="Cache management") +app.add_typer(cache_app, name="cache") + +@cache_app.command("stats") +def cache_stats() -> None: + typer.echo(formatting.render_cache_stats(service.get_cache_stats(), "json")) +``` + +## Async glue + +Service functions are async; CLI commands are sync. Always use `asyncio.run(service.<function>(...))` at the call boundary. Don't sprinkle `async def` across CLI commands — typer expects sync handlers. + +## Exit codes + +* `0` — success. +* `1` — runtime error (raised exception in service). +* `2` — usage error (typer's default for bad options). + +Let exceptions propagate from `service.py` and rely on typer's default handling. Only catch where you want a more specific exit code or message.
+ +## What stays out of cli.py + +* `import httpx`, `import sqlite3`, `import msgpack` — never. +* `from .ad import ...`, `from .search import ...`, `from .eiendom_no import ...`, `from .scoring import ...`, `from .cache import ...`, `from .http import ...` — never. +* Inline formatting logic — goes in `formatting.py`. +* MCP imports (no `from .mcp_server import ...`). + +Allowed imports in `cli.py`: + +```python +import asyncio, json, sys +import typer +from . import service, formatting, config +from .models import FinnAd, EiendomUnit, SimilarUnit # only for type hints +``` + +`tests/test_architecture.py` enforces this. + +## When uncertain about typer + +Use `context7` instead of guessing: + +``` +context7:resolve-library-id → "tiangolo/typer" +context7:query-docs(id, "Typer sub-apps and option groups") +``` + +See `docs.instructions.md`. \ No newline at end of file diff --git a/.github/instructions/docs.instructions.md b/.github/instructions/docs.instructions.md new file mode 100644 index 0000000..0278246 --- /dev/null +++ b/.github/instructions/docs.instructions.md @@ -0,0 +1,118 @@ +--- +name: Documentation lookups via context7 MCP +description: How and when to use the context7 MCP server for library documentation +applyTo: "**/*.py,**/*.md,**/*.toml,**/*.yaml,**/*.yml" +--- + +# Documentation lookups — use context7 + +When you are uncertain about a library's API, **call the `context7` MCP server before writing code**. Do not rely on training-data memory. Pydantic, FastMCP, Typer, httpx, and pytest all evolve quickly; what was true two releases ago is often wrong now. + +## When to use context7 + +Use it **before** writing code involving any of these: + +* **FastMCP / MCP Python SDK** — `@mcp.tool()` signatures, `ToolAnnotations`, `mcp.run(transport=...)`, resource and prompt decorators, server lifecycle, streamable-HTTP setup. 
+* **Pydantic v2** — `BaseModel`, `Field`, `ConfigDict`, `model_validator`, `field_validator`, `model_dump` / `model_dump_json`, discriminated unions, `Annotated[...]` with validators. +* **Typer** — `Typer()` apps, `typer.Option`, `typer.Argument`, sub-apps via `add_typer`, callbacks, exit codes, testing with `CliRunner`. +* **httpx** — `AsyncClient`, timeouts, transports, retries, `Response` API. +* **respx** — mocking httpx, `respx.mock`, `route.mock`, match patterns. +* **msgpack** — packing/unpacking, type extensions, raw vs string mode. +* **base64** — `urlsafe_b64encode`, padding handling. +* **pytest** / **pytest-asyncio** — fixtures, parametrize, async tests, markers, `tmp_path`, `monkeypatch`. +* **BeautifulSoup** / **lxml** — selectors, parser flavors, element traversal. +* **typer.testing.CliRunner** — invoking apps, asserting on stdout/stderr/exit codes. + +Use it **also** when: + +* A test fails with an error like `AttributeError: 'BaseModel' object has no attribute 'dict'` (Pydantic v1 vs v2 confusion). +* You see a `DeprecationWarning` from a third-party library and aren't sure of the modern replacement. +* You're about to copy a code pattern from memory that feels "old". + +## When NOT to use it + +* Pure Python stdlib (`json`, `pathlib`, `dataclasses`, `typing`) — these are stable and well-known. +* Project-internal modules — read the source. +* Generic programming questions ("what's a list comprehension") — use your own knowledge. +* FINN / Eiendom.no API behavior — these are not in context7. Use fixtures from prior runs in `tests/fixtures/` and the endpoint notes in `PRD.md` §9. + +## How to use it + +Two-step pattern: + +### 1. Resolve the library ID + +``` +context7:resolve-library-id(query="fastmcp") +context7:resolve-library-id(query="pydantic") +context7:resolve-library-id(query="typer") +``` + +Returns the canonical library ID (e.g. `pydantic/pydantic`, `fastapi/typer`). Pick the most-starred / official-looking match. + +### 2. 
Query the docs + +``` +context7:query-docs( + context7CompatibleLibraryID="pydantic/pydantic", + topic="field validators v2 mode after", + tokens=3000, +) +``` + +* **Keep the topic focused.** "Pydantic v2 field validators with mode=after on Optional[str]" beats "Pydantic validation". +* **Cap tokens** to roughly what you need (1500–4000 is usually plenty). The default is fine for most calls. +* **Use library-specific terminology** in the topic — "discriminator field" for Pydantic, "tool annotations" for FastMCP, "sub-apps" for Typer. + +### Worked examples + +**Q: How do I declare a FastMCP tool with read-only annotations?** + +``` +context7:resolve-library-id(query="modelcontextprotocol python sdk") +context7:query-docs(context7CompatibleLibraryID="", + topic="FastMCP @mcp.tool ToolAnnotations readOnlyHint") +``` + +**Q: How do I write a Pydantic v2 model_validator that runs after field validation?** + +``` +context7:resolve-library-id(query="pydantic") +context7:query-docs(context7CompatibleLibraryID="pydantic/pydantic", + topic="model_validator mode='after' v2") +``` + +**Q: How do I mock an async httpx POST with respx?** + +``` +context7:resolve-library-id(query="respx") +context7:query-docs(context7CompatibleLibraryID="", + topic="respx mock async httpx POST json body") +``` + +**Q: How do I add a Typer sub-app for `cache` commands?** + +``` +context7:resolve-library-id(query="typer") +context7:query-docs(context7CompatibleLibraryID="", + topic="Typer add_typer sub-application command groups") +``` + +## After the lookup + +* Cite or summarize what you found in a code comment **only when** the snippet documents a non-obvious API choice — otherwise the code is enough. +* If context7 returns nothing useful, fall back to: + 1. The library's official docs site. + 2. The library's repo `README` / `examples/`. + 3. The smallest possible spike (a 5-line script in the venv) to verify behavior. 
+ +## Anti-patterns + +* **Don't** invent a method signature from memory and hope. If you're not 100% sure of an API, look it up. +* **Don't** copy patterns from old Stack Overflow answers without verifying — Pydantic, FastMCP, and Typer all had breaking changes recently. +* **Don't** silence a warning instead of fixing the deprecation. Look up the modern API. +* **Don't** query context7 for FINN or Eiendom.no — those endpoints aren't in any public docs index. Use `tests/fixtures/` and `PRD.md` §9. + +## Network configuration note + +`context7` is configured as a connected MCP server in this environment. If a call fails with a connection error, surface it clearly — don't fall back to guessing. \ No newline at end of file diff --git a/.github/instructions/mcp.instructions.md b/.github/instructions/mcp.instructions.md new file mode 100644 index 0000000..4f62fc2 --- /dev/null +++ b/.github/instructions/mcp.instructions.md @@ -0,0 +1,192 @@ +--- +name: MCP rules +description: Rules for FastMCP tools, resources, and prompts +applyTo: "finn_eiendom/mcp_server.py,finn_eiendom/**/*mcp*.py" +--- + +# MCP server rules + +The MCP server is a **thin wrapper** over `service.py`. It owns: + +* Tool registration with `@mcp.tool()` and annotations. +* Pydantic input schemas (these double as tool documentation). +* Error wrapping at the protocol boundary. +* JSON / markdown response formatting via `formatting.py`. + +It does **not** own: + +* Parsing, scraping, scoring, cache, or HTTP fetching logic. +* SQLite or `httpx` access. +* Any orchestration of "check cache, else fetch, else save" — that's `service.py`. + +## Server bootstrap + +```python +# finn_eiendom/mcp_server.py +import sys, logging +from mcp.server.fastmcp import FastMCP + +logging.basicConfig(stream=sys.stderr, level=logging.INFO, + format="%(asctime)s %(levelname)s %(name)s %(message)s") + +mcp = FastMCP("finn_eiendom_mcp") + +# ... tools registered here ... 
+ +def main() -> None: + mcp.run(transport="stdio") + +if __name__ == "__main__": + main() +``` + +stdio servers **must** log to stderr only — anything on stdout breaks the JSON-RPC frame. + +## Tool naming + +All tools use the `finn_` prefix so they don't collide with other MCP servers running in the same Claude Desktop: + +* `finn_analyze_search` +* `finn_get_ad` +* `finn_compare_ads` +* `finn_save_feedback` +* `finn_get_shortlist` +* `finn_get_new_ads_since_last_run` +* `finn_resolve_eiendom_unit` +* `finn_get_eiendom_unit` +* `finn_enrich_ad` +* `finn_build_unit_vector` +* `finn_decode_unit_vector` +* `finn_get_similar_units` +* `finn_find_similar_to_liked_ad` +* `finn_analyze_ad_against_comps` + +## Tool body shape + +Every tool body looks like this: + +```python +@mcp.tool( + annotations=ToolAnnotations( + title="Analyze a FINN search URL", + readOnlyHint=True, + destructiveHint=False, + openWorldHint=True, + ) +) +async def finn_analyze_search(input: AnalyzeSearchInput) -> str: + """Analyze a FINN search URL and return a ranked shortlist.""" + try: + result = await service.analyze_search( + search_url=input.search_url, + max_pages=input.max_pages, + detail_limit=input.detail_limit, + include_details=input.include_details, + include_eiendom_no=input.include_eiendom_no, + include_similar_units_for_shortlist=input.include_similar_units_for_shortlist, + ) + return formatting.render_shortlist(result, input.response_format) + except Exception as e: + log.exception("finn_analyze_search failed") + return json.dumps({ + "error": True, + "code": type(e).__name__, + "message": str(e), + }) +``` + +Notes: + +* Every tool delegates to `service.<function>` in one call. +* Every tool wraps in try/except and returns the error envelope as a JSON string. +* Output rendering goes through `formatting.py`, never inline. +* If the tool body needs more than ~20 lines, logic has leaked out of the service layer — push it back down.
+ +## Input schemas + +Every tool has a Pydantic v2 input model. Schemas live with the tool in `mcp_server.py` (they document the tool to LLM clients). Reuse from `models.py` only when the same shape is also a domain object — otherwise keep them as tool-local input types. + +```python +class AnalyzeSearchInput(BaseModel): + search_url: str = Field(..., description="Full FINN search URL") + max_pages: int = Field(default=3, ge=1, le=10) + detail_limit: int = Field(default=20, ge=1, le=100) + include_details: bool = True + include_eiendom_no: bool = True + include_similar_units_for_shortlist: bool = False + response_format: Literal["json", "markdown"] = "json" +``` + +## Annotations + +Set the right hints: + +* Read-only tools (most of them): `readOnlyHint=True, destructiveHint=False, openWorldHint=True`. +* `finn_save_feedback`: `readOnlyHint=False, destructiveHint=False, idempotentHint=False`. + +## Response format + +Tools accept a `response_format` parameter (`"json"` or `"markdown"`): + +* `"json"` — return `json.dumps(result_dict)`. +* `"markdown"` — return `formatting.render_(result, "markdown")`. + +Errors are always returned as the JSON error envelope regardless of `response_format`. + +## What stays out of mcp_server.py + +* `import httpx` — never. +* `import sqlite3` — never. +* `from .ad import ...`, `from .search import ...`, `from .eiendom_no import ...`, `from .scoring import ...`, `from .cache import ...`, `from .http import ...` — never. Go through `service`. +* Output formatting logic — goes in `formatting.py`. +* Cache management — goes in `service.py`. + +Allowed imports in `mcp_server.py`: + +```python +import json, logging, sys +from typing import Literal, Optional +from mcp.server.fastmcp import FastMCP +from mcp.server.fastmcp.utilities import ToolAnnotations +from pydantic import BaseModel, Field +from . import service, formatting +from .models import FinnAd, EiendomUnit, SimilarUnit # only if needed for type hints +from . 
import config +``` + +`tests/test_architecture.py` enforces this. + +## Resources and prompts + +When you add resources or prompts, they follow the same rule: thin wrappers over `service.py` and `formatting.py`. Resources: + +``` +finn://preferences/current +finn://search-runs/latest +finn://search-runs/{id} +finn://ads/{finnkode} +finn://ads/{finnkode}/enriched +finn://shortlist/latest +finn://feedback/{finnkode} +finn://eiendom-units/{unitCode} +finn://eiendom-units/{unitCode}/similar/{listingStatus} +``` + +Prompts: `evaluate_property_for_user`, `compare_properties_for_user`, `refine_search_from_feedback`, `find_more_like_this`. + +## When uncertain about FastMCP + +Use `context7` for FastMCP / MCP SDK questions instead of guessing: + +``` +context7:resolve-library-id → "modelcontextprotocol/python-sdk" or similar +context7:query-docs(id, "FastMCP tool annotations") → snippets +``` + +See `docs.instructions.md`. + +## Transports + +* Default: stdio. `finn-eiendom-mcp` is the entry point. +* Optional: Streamable HTTP via `finn-eiendom serve --transport http --port 8010`. Path: `POST /mcp`. Operational endpoints: `GET /health`, `GET /version`, `GET /debug/config`. +* Keep tools transport-agnostic. No request/response shape depends on the transport. \ No newline at end of file diff --git a/.github/instructions/python.instructions.md b/.github/instructions/python.instructions.md new file mode 100644 index 0000000..80dadc7 --- /dev/null +++ b/.github/instructions/python.instructions.md @@ -0,0 +1,80 @@ +--- +name: Python project rules +description: Python conventions for the FINN/Eiendom MCP server +applyTo: "**/*.py" +--- + +# Python conventions + +## Runtime + +* Python **3.12+**. +* Project-local virtualenv at `.venv/` (created by `uv venv` or `python3.12 -m venv .venv`). +* All commands run inside the activated venv. +* Editable install: `uv pip install -e ".[dev]"` (or `pip install -e ".[dev]"`). 
+* Never install packages globally; never use `sudo pip`; never mutate host Python. +* Add new dependencies to `pyproject.toml` in the same change that uses them. + +## Language + +* Use Python 3.12 syntax. Prefer `X | None` over `Optional[X]`, `list[int]` over `List[int]`, structural pattern matching where it actually helps. +* **Type hints on every function signature**, including private helpers. `mypy --strict finn_eiendom` is the target. +* Async-first for I/O. Sync code is fine for parsing, scoring, and cache access (SQLite). +* Pydantic v2 for all structured domain models, with `model_config = ConfigDict(...)`. No v1 `class Config:` blocks. + +## Prefer + +* Small, pure functions for parsing, normalization, and scoring. +* Explicit return types and explicit exceptions. +* Dependency injection for HTTP clients and DB connections in tests (pass `client` / `conn` as args; let services own the defaults). +* Domain names from the PRD (`FinnAd`, `EiendomUnit`, `SimilarUnit`, `analyze_search`, `get_or_fetch_ad`). +* `dataclass` for internal value objects that don't cross the API boundary; Pydantic for anything serialized or validated. + +## Avoid + +* Global mutable state (module-level dicts as caches, etc.). The only allowed module-level state is configuration loaded from env in `config.py`. +* Hardcoded URLs, credentials, paths, or magic numbers anywhere outside `config.py`. +* `httpx` imports anywhere except `finn_eiendom/http.py`. +* `sqlite3` imports anywhere except `finn_eiendom/cache.py`. +* `BeautifulSoup` imports anywhere except `finn_eiendom/search.py` and `finn_eiendom/ad.py`. +* `msgpack` imports anywhere except `finn_eiendom/eiendom_no.py`. +* Scraping, scoring, cache, or HTTP fetching logic inside MCP tool or CLI command bodies. +* Direct network calls in unit tests — use `respx` and fixtures. +* `print()` for logging — use the `logging` module. stdio MCP server logs go to **stderr only**. 
+* Bare `except:` or `except Exception: pass` — catch the specific exception or let it propagate. + +## External fetches + +All external fetches must support: + +* Configurable request delay (`FINN_REQUEST_DELAY_SECONDS`, `EIENDOM_NO_REQUEST_DELAY_SECONDS`). +* Cache lookup before fetch. +* Retry on 5xx with exponential backoff (`1s, 2s, 4s`). +* Graceful failure that returns `None` or empty rather than raising, when the caller can degrade. +* Structured logging at INFO for success, WARNING for retry, ERROR for final failure. + +## Best practices + +* **Single responsibility per function.** If a function name needs "and" to describe it, it's two functions. +* **Function length:** aim for under 30 lines. Past 50 lines it's a code smell — extract helpers. +* **Cyclomatic complexity:** if you've got more than 3 levels of nesting, the function wants splitting. +* **Naming:** `get_or_fetch_ad`, not `process_ad`. Verbs for actions, nouns for data. Avoid abbreviations except those well-known in the domain (`url`, `ad`, `nok`). +* **DRY:** if you write the same logic, regex, SQL, or format string twice, extract it. The decision table in `PRD.md` §17.2 tells you where it belongs. +* **Comments explain WHY**, not WHAT. The code already says what. +* **Errors are loud:** raise with actionable messages (`f"Unknown listing_status {status!r}; expected one of {VALID_STATUSES}"`). The MCP boundary wraps them as `{"error": True, ...}`. + +## When uncertain about a library API + +Use the `context7` MCP server **before** writing code: + +1. `context7:resolve-library-id` with the package name → canonical library ID. +2. `context7:query-docs` with that ID + focused topic. + +See `docs.instructions.md`. Don't guess from training memory — Pydantic, FastMCP, and Typer all change. + +## Tooling + +* `ruff check .` — lint. Target Python 3.12. Active rules: `E F I UP B SIM`. +* `ruff format .` — format. Line length 100. +* `mypy --strict finn_eiendom` — type-check. 
+* `pytest` — run the full suite. \ No newline at end of file diff --git a/.github/instructions/tests.instructions.md b/.github/instructions/tests.instructions.md new file mode 100644 index 0000000..920e54e --- /dev/null +++ b/.github/instructions/tests.instructions.md @@ -0,0 +1,199 @@ +--- +name: Test rules +description: Testing conventions for parser, cache, scoring, service, MCP, CLI, and architecture +applyTo: "tests/**/*.py" +--- + +# Test rules + +## Runtime + +Tests run in the project-local `.venv`. From the project root with the venv activated: + +```bash +pytest # full suite +pytest tests/test_service.py -v # one file +pytest -k "shortlist" # one keyword +pytest --lf # rerun last failures +``` + +`pytest-asyncio` is in `[tool.pytest.ini_options]` with `asyncio_mode = "auto"` — `async def` tests run without an `@pytest.mark.asyncio` decorator. + +## Never do live network calls + +No real HTTP in unit tests. Mock with `respx` (sits in front of `httpx.AsyncClient`): + +```python +import respx, httpx +from finn_eiendom import http as http_module + +@respx.mock +async def test_finn_search_fetch_uses_user_agent(): + route = respx.get("https://www.finn.no/realestate/homes/search.html").mock( + return_value=httpx.Response(200, html=SAMPLE_FINN_SEARCH_HTML) + ) + client = http_module.HTTPClient(user_agent="test-agent") + resp = await client.get("https://www.finn.no/realestate/homes/search.html") + assert resp.status_code == 200 + assert route.calls.last.request.headers["user-agent"] == "test-agent" +``` + +## Fixtures + +Fixture-driven testing for parsers and APIs: + +* FINN search HTML → `tests/fixtures/finn_search.html`. +* FINN listing HTML → `tests/fixtures/finn_ad_*.html`. +* Eiendom.no unit search JSON → `tests/fixtures/eiendom_unit_search.json`. +* Eiendom.no unit detail JSON → `tests/fixtures/eiendom_unit_detail.json`. +* Eiendom.no similar-units JSON → `tests/fixtures/eiendom_similar.json`. + +Loader helpers in `tests/fixtures.py` (e.g. 
`SAMPLE_FINN_SEARCH_HTML`, `SAMPLE_EIENDOM_UNIT_JSON`). Add new fixtures here, don't inline large strings in test files. + +## Test layout + +``` +tests/ + fixtures/ # raw HTML / JSON inputs + fixtures.py # loader helpers + conftest.py # shared pytest fixtures (tmp DB, http client, etc.) + test_parser.py # number/area/date/URL/finnkode normalization + test_search.py # FINN search HTML → cards + test_ad.py # FINN listing HTML → FinnAd + test_eiendom_no.py # unit search/detail/similar JSON, unit_vector encode/decode + test_scoring.py # all scoring components + classifier + test_cache.py # SQLite read/write/TTL + test_http.py # retry on 5xx, raise on 4xx, delay applied (new) + test_service.py # get_or_fetch_*, analyze_* (new) + test_formatting.py # render_* json/markdown/table (new) + test_mcp_server.py # tool registration + error envelope (expanded) + test_cli.py # typer CliRunner (new) + test_architecture.py # import-graph invariants (new) +``` + +## What to test per category + +### Parsers (`test_parser`, `test_search`, `test_ad`, `test_eiendom_no`) + +* Missing fields → `None`, not exception. +* Norwegian number formats: `7 200 991 kr`, `kr 7 200 991`, `7.200.991`. +* URL normalization (relative → absolute). +* Finnkode extraction from various URL shapes. +* Area parsing: `77 m²`, `77m2`, `77 kvm`. +* Price parsing (asking vs total vs shared debt). +* Eiendom.no JSON edge cases: empty `units`, missing `valuation`, missing `latestMarketData`. + +### Unit vectors (`test_eiendom_no`) + +* msgpack encoding + base64url without padding. +* Decode roundtrip. +* Missing optional fields (floor, rooms, built). +* Both lon/lat orderings handled. + +### Scoring (`test_scoring`) + +* Each component in isolation. +* Total clamped to 0–100. +* Risk penalties applied (negative range). +* Bargain classification triggers on the expected signal mix. +* Hybel classification: documented / possible / unclear / not relevant. 
+* Explainability: explanation list non-empty when score is non-trivial. + +### Cache (`test_cache`) + +* Read after write returns same object. +* TTL expiry returns `None`. +* JSON roundtrip preserves all fields. +* `init_db` is idempotent on existing DBs. + +### HTTP (`test_http`) + +* Retries on 500/502/503/504 with backoff (count exactly N retries). +* Raises immediately on 404 / 4xx. +* Applies `request_delay` between calls. +* Honors `user_agent`. + +### Service (`test_service`) + +The service tests are the heart of the suite. They cover orchestration end-to-end against fixtures. + +* `test_get_or_fetch_ad_uses_cache` — second call hits cache, no HTTP. +* `test_get_or_fetch_ad_fetches_when_cache_miss` — first call hits HTTP, then writes cache. +* `test_get_or_fetch_ad_force_refresh` — `force_refresh=True` bypasses cache. +* `test_analyze_search_with_fixtures` — full run from search HTML → shortlist. +* `test_find_similar_to_liked_uses_liked_feedback` — only seeds from `liked` verdicts. + +Use a tmp SQLite DB via the `tmp_path` pytest fixture: + +```python +@pytest.fixture +def tmp_db(tmp_path, monkeypatch): + db_path = tmp_path / "finn.sqlite" + monkeypatch.setenv("FINN_CACHE_PATH", str(db_path)) + return db_path +``` + +### Formatting (`test_formatting`) + +* `render_shortlist(result, "json")` is parseable JSON and roundtrips. +* `render_shortlist(result, "markdown")` contains the score and at least one risk. +* `render_(result, "xml")` raises `ValueError` listing supported formats. + +### MCP (`test_mcp_server`) + +* `test_mcp_server_has_correct_tools` — all 14 `finn_*` tool names registered. +* `test_finn_decode_unit_vector_returns_json` — happy path. +* `test_finn_analyze_search_handles_error` — error envelope shape: `{"error": True, "code": ..., "message": ...}`. + +Use the `mcp` SDK's testing helpers; don't spawn a subprocess. 
+ +### CLI (`test_cli`) + +Use Typer's `CliRunner`: + +```python +from typer.testing import CliRunner +from finn_eiendom.cli import app + +runner = CliRunner() + +def test_cli_help(): + result = runner.invoke(app, ["--help"]) + assert result.exit_code == 0 + assert "analyze-search" in result.stdout +``` + +Patch `service.` with `monkeypatch` so CLI tests don't exercise the full stack — that's covered by `test_service.py`. + +### Architecture (`test_architecture`) + +Static checks of the module dependency graph: + +* No `import httpx` outside `finn_eiendom/http.py`. +* No `import sqlite3` outside `finn_eiendom/cache.py`. +* No `BeautifulSoup` import outside `search.py` and `ad.py`. +* No `msgpack` import outside `eiendom_no.py`. +* `mcp_server.py` only imports from `service`, `formatting`, `models`, `config`, `mcp`, stdlib, `pydantic`. +* `cli.py` only imports from `service`, `formatting`, `models`, `config`, `typer`, stdlib. +* `service.py` does not import from `mcp_server` or `cli`. + +Implementation: walk `.py` files under `finn_eiendom/` with `ast`, collect imports, assert allowed sets per module. + +## Best practices + +* One assertion per test (or per closely related group). Long tests die in painful ways. +* Test names describe the behavior: `test_get_or_fetch_ad_uses_cache_within_ttl`. +* Use `monkeypatch` for env vars and `tmp_path` for files. No `os.environ` mutation. +* No `time.sleep` — use `freezegun` if a test depends on time, or refactor the code under test to take a `now` parameter. +* No "smoke tests" that ping real servers — those go under a separately-marked `pytest -m live` suite and are not part of CI. + +## When uncertain about test tooling + +Use `context7` for pytest, respx, freezegun, or Typer testing: + +``` +context7:resolve-library-id → "pytest-dev/pytest" / "lundberg/respx" +context7:query-docs(id, "respx mock httpx async post") +``` + +See `docs.instructions.md`. 
+ \ No newline at end of file diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..a4d8bd9 --- /dev/null +++ b/.gitignore @@ -0,0 +1,33 @@ +# Python +__pycache__/ +*.py[cod] +*.egg-info/ +.pytest_cache/ +.mypy_cache/ +.ruff_cache/ +.coverage +htmlcov/ + +# Virtualenvs +.venv/ +venv/ + +# uv +# uv.lock + +# Env +.env +.env.local + +# Data/cache +data/*.sqlite +data/*.sqlite-* +data/*.db +data/*.db-* + +# Editor +.DS_Store +.idea/ + +# Logs +*.log \ No newline at end of file diff --git a/.vscode/extensions.json b/.vscode/extensions.json new file mode 100644 index 0000000..82ee0ac --- /dev/null +++ b/.vscode/extensions.json @@ -0,0 +1,10 @@ +{ + "recommendations": [ + "github.copilot", + "github.copilot-chat", + "ms-python.python", + "charliermarsh.ruff", + "ms-azuretools.vscode-docker", + "tamasfe.even-better-toml" + ] +} \ No newline at end of file diff --git a/.vscode/mcp.json b/.vscode/mcp.json new file mode 100644 index 0000000..38b62ae --- /dev/null +++ b/.vscode/mcp.json @@ -0,0 +1,8 @@ +{ + "servers": { + "context7": { + "type": "http", + "url": "https://mcp.context7.com/mcp" + } + } +} \ No newline at end of file diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..375eed9 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,23 @@ +{ + "python.defaultInterpreterPath": ".venv/bin/python", + "python.testing.pytestEnabled": true, + "python.testing.unittestEnabled": false, + "python.testing.pytestArgs": [ + "tests" + ], + "editor.formatOnSave": true, + "[python]": { + "editor.defaultFormatter": "charliermarsh.ruff" + }, + "ruff.enable": true, + "chat.instructionsFilesLocations": { + ".github/instructions": true + }, + "github.copilot.chat.codeGeneration.useInstructionFiles": true, + "files.exclude": { + "**/__pycache__": true, + "**/.pytest_cache": true, + "**/.mypy_cache": true, + "**/.ruff_cache": true + } +} \ No newline at end of file diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 
0000000..af932a7 --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,178 @@ +# AGENTS.md — Workflow for AI agents on finn-eiendom-mcp + +This is the master doc for any AI agent (Claude, Copilot, Cursor, etc.) working in this repo. Read this first, then the more specific files it references. + +--- + +## Read order + +Before changing code, read: + +1. **`PRD.md`** — what we're building and why. Especially §17 ("Code ownership and anti-duplication") — that section is the constitution. +2. **`PROJECT.md`** — module map. +3. This file — workflow. +4. The relevant `.github/instructions/*.md`: + * `python.instructions.md` — Python conventions. + * `mcp.instructions.md` — MCP tool rules. + * `cli.instructions.md` — CLI command rules. + * `tests.instructions.md` — testing conventions. + * `clean-code.instructions.md` — best practices and DRY enforcement. + * `docs.instructions.md` — when and how to use the **context7** MCP server for library documentation. + +If something in code contradicts the PRD, the PRD wins. If you change behavior, update both the PRD and the relevant instruction file in the same change. + +--- + +## Runtime — local venv (default) + +This project runs in a project-local virtualenv. Docker is supported for packaging but is not required for development. + +### One-time setup + +```bash +# from the project root +uv venv # or: python3.12 -m venv .venv +source .venv/bin/activate +uv pip install -e ".[dev]" # or: pip install -e ".[dev]" +``` + +Python **3.12+** is required. + +### Daily commands + +All commands are run inside the activated `.venv`: + +```bash +pytest # tests +ruff check . # lint +ruff format . # format +mypy --strict finn_eiendom # type-check +finn-eiendom --help # CLI entrypoint +finn-eiendom-mcp # MCP server (stdio) +finn-eiendom serve --transport http --port 8010 # MCP server (HTTP) +``` + +### Never + +* Install packages globally (`pip install ...` outside a venv). +* Use `sudo pip`. +* Mutate the host Python. 
+* Add dependencies without updating `pyproject.toml`. + +### Adding a dependency + +```bash +uv pip install {package} # ad-hoc, then: +# edit pyproject.toml to record it +uv pip install -e ".[dev]" # reinstall in editable mode +``` + +--- + +## Architecture in one screen + +``` +cli.py (typer) mcp_server.py (FastMCP) ← thin, parallel front ends + \ / + \ / + service.py ← orchestration: get_or_fetch, analyze_* + ↓ + analysis.py ← shortlist + summary + ↓ + search / ad / eiendom_no / scoring / feedback + ↓ + parser / http / cache + ↓ + FINN HTML + Eiendom.no JSON + SQLite +``` + +`formatting.py` sits next to `service.py` and is shared by both CLI and MCP for `json`, `markdown`, and `table` rendering. + +**The single-home rule:** every piece of logic has exactly one home. If you're tempted to add it in two places, you're wrong about one — push it down a layer and call it from both. See `PRD.md` §17.2 for the full ownership table. + +--- + +## The five hard rules + +These are non-negotiable. Architecture tests in `tests/test_architecture.py` enforce them. + +1. **`mcp_server.py` and `cli.py` are siblings.** They never call each other. Both call only `service`, `formatting`, `models`, and `config`. +2. **`service.py` is the only place that combines cache + fetch.** Nothing above it touches HTTP or SQLite directly. +3. **`httpx` lives in `http.py`. Nowhere else.** +4. **`sqlite3` lives in `cache.py`. Nowhere else.** +5. **Output formatting lives in `formatting.py`.** No inline rendering in CLI or MCP tool bodies. + +If you have to break one of these to ship a feature, the feature is wrong — fix the design first. + +--- + +## Adding a feature — the checklist + +For any new tool / command / behavior: + +1. Decide the home using the table in `PRD.md` §17.2. +2. Write the function in `service.py` (or extend `analysis.py` if it's pure orchestration). +3. Add a test in `tests/test_service.py`. +4. Add a thin MCP tool in `mcp_server.py` — `response_format` aware. +5. 
Add a thin CLI command in `cli.py` — `--format` aware. +6. Add the renderer in `formatting.py` if output is non-trivial. +7. Add tests in `tests/test_mcp_server.py` and `tests/test_cli.py`. +8. Update `PRD.md` and any affected `.github/instructions/*.md`. + +If steps 4 or 5 need more than ~20 lines, logic has leaked out of the service layer. Push it back down. + +--- + +## Clean code + +See `.github/instructions/clean-code.instructions.md`. Highlights: + +* Type hints everywhere. +* Functions stay small; one job per function. +* Names describe intent (`get_or_fetch_ad`, not `process`). +* Comments explain **why**, never **what** the code already says. +* DRY: if you write the same regex / SQL / format string twice, extract it. +* Errors fail loudly with actionable messages. No silent `except: pass`. +* No dead code, no commented-out blocks left in the tree. + +--- + +## Documentation lookups — use context7 + +When uncertain about a library's API (FastMCP decorators, Pydantic v2 validators, Typer command patterns, httpx async, msgpack, pytest-asyncio, respx, BeautifulSoup selectors, etc.), **use the `context7` MCP server**. Do not guess from training-data memory. + +Pattern (full details in `.github/instructions/docs.instructions.md`): + +1. `context7:resolve-library-id` with the library name → get the canonical ID. +2. `context7:query-docs` with that ID + a focused topic. + +Use context7 *before* writing the code, not after a test fails. If context7 returns nothing useful, search the library's official docs, then write the smallest possible spike to verify. + +--- + +## Safety and compliance + +* Private, low-frequency use only. +* Respect FINN / Eiendom.no rate limits and bot protection. +* Cache aggressively; never bulk-harvest. +* stdio MCP servers log to **stderr only** — anything on stdout breaks the JSON-RPC frame. +* Scores and estimates are decision support, never legal / technical / financial advice. 
+ +--- + +## Implementation order (Phase 2) + +Follow `PRD.md` §29 step-by-step. Each step is independently mergeable: + +1. Switch dev workflow to local venv + update instruction files (this change). +2. Pydantic v2 cleanup. +3. Service layer + tests. +4. Formatting layer + tests. +5. HTTP retry on 5xx + tests. +6. Replace FastAPI with FastMCP stdio server. +7. CLI with typer. +8. Diff workflow. +9. Compare workflow. +10. Similar-to-liked. +11. Architecture tests. +12. README + Claude Desktop config. \ No newline at end of file diff --git a/IMPLEMENTATION.md b/IMPLEMENTATION.md new file mode 100644 index 0000000..ab35d52 --- /dev/null +++ b/IMPLEMENTATION.md @@ -0,0 +1,384 @@ +# IMPLEMENTATION.md — Phase 2 build runbook + +How to drive Phase 2 (the 12 steps in `PRD.md` §29) to completion using an AI agent. Each step has its own kickoff prompt, files affected, and "done" criteria. Run them in order. Each step is independently mergeable. + +--- + +## 0. Pre-flight + +Before starting step 1: + +1. **You are in the project root.** `ls -la` shows `pyproject.toml`, `PRD.md`, and `finn_eiendom/`. + +2. **Venv is healthy.** From the project root: + + ```bash + source .venv/bin/activate + pytest -x # green except for any pre-existing FastMCP-related skips + ruff check . # zero issues + ``` + +3. **Docs are in place.** Re-confirm `PRD.md` §17 (code ownership) is current — every step below references it. + +If any of these fail, stop and fix before proceeding. + +--- + +## How to use this runbook + +For each step: + +1. Create a feature branch: `git checkout -b feat/phase2-step-{n}-{slug}` off `chore/cleanup-phase-2-prep`. +2. Open a fresh agent chat with repo access. Paste the kickoff prompt verbatim. +3. Let the agent propose, implement, and test. Push back where it skips tests or violates §17. +4. When all "done" boxes are checked, merge into `chore/cleanup-phase-2-prep`. +5. Move to the next step. + +Each kickoff prompt assumes the agent reads PRD.md, AGENTS.md, and the relevant instruction files first — that's encoded in the prompt. 
+ +After step 12, merge `chore/cleanup-phase-2-prep` into `main`. + +--- + +## Step 1 — Dev workflow already switched to local venv + +This step is **done** by the time `CLEANUP.md` is merged. The instruction files and `AGENTS.md` already use local venv. Sanity check: + +```bash +source .venv/bin/activate +which finn-eiendom 2>/dev/null || echo "expected: not yet installed; entry points come in steps 6 and 7" +ruff check . # zero issues +pytest -x # green (allow mcp_server failures) +``` + +Move on. + +--- + +## Step 2 — Pydantic v2 cleanup + +### Kickoff prompt + +> Read **PRD.md** (especially §17 code ownership and A8 acceptance criterion), **`.github/instructions/python.instructions.md`**, and **`.github/instructions/clean-code.instructions.md`**. +> +> Implement Phase 2 step 2: convert every Pydantic model in `finn_eiendom/models.py` from v1 (`class Config:`) to v2 (`model_config = ConfigDict(...)`). Use `context7:query-docs` on `pydantic/pydantic` if you're not sure of the v2 syntax — don't guess. +> +> Add `tests/test_models.py` with a JSON roundtrip test per model. +> +> Run `ruff check .`, `ruff format .`, and `pytest tests/test_models.py -v` before declaring done. + +### Files + +* `finn_eiendom/models.py` (edit) +* `tests/test_models.py` (new) + +### Done when + +* `grep -rn "class Config:" finn_eiendom/` produces zero output. +* `pytest tests/test_models.py` is green. +* Existing tests still pass. + +--- + +## Step 3 — Service layer + +### Kickoff prompt + +> Read **PRD.md** §16 (Service layer) and §17 (code ownership), **`.github/instructions/python.instructions.md`** and **`.github/instructions/clean-code.instructions.md`**. 
+> +> Create `finn_eiendom/service.py` with the public surface listed in PRD §16: `get_or_fetch_ad`, `get_or_fetch_eiendom_unit`, `get_or_fetch_similar_units`, `analyze_search`, `analyze_ad`, `analyze_ad_against_comps`, `find_similar_to_liked`, `compare_ads`, `resolve_eiendom_unit_from_finn_url`, `build_unit_vector_for_unit_code`, `decode_unit_vector_to_dict`, `save_feedback`, `get_shortlist`, `get_new_ads_since_last_run`. +> +> Each function: +> 1. Opens its own SQLite connection via `cache.init_db(FINN_CACHE_PATH)`. +> 2. Reads cache first with TTLs from `config.py`. +> 3. On miss or `force_refresh=True`, calls the fetcher in `ad.py` / `eiendom_no.py`. +> 4. Writes the fresh result back. +> 5. Returns a typed model or dict. +> +> Do not duplicate behavior from `analysis.py` — delegate to it. Add `tests/test_service.py` covering the five service tests listed in PRD §25.2. + +### Files + +* `finn_eiendom/service.py` (new) +* `tests/test_service.py` (new) +* `tests/conftest.py` (may need a `tmp_db` fixture if it doesn't exist) + +### Done when + +* `pytest tests/test_service.py` is green. +* `service.py` imports only from `models`, `config`, `cache`, `analysis`, `ad`, `eiendom_no`, `feedback`, `scoring`, stdlib. +* No `import httpx` or `import sqlite3` outside their owners. + +--- + +## Step 4 — Formatting layer + +### Kickoff prompt + +> Read **PRD.md** §17.6 (shared formatting module) and §19 (output formats), **`.github/instructions/clean-code.instructions.md`**. +> +> Create `finn_eiendom/formatting.py` with these renderers (signatures in PRD §17.6): `render_ad`, `render_shortlist`, `render_comparison`, `render_diff`, `render_similar_units`, `render_unit`, `render_score_breakdown`, plus `render_cache_stats` for the CLI cache subcommand. +> +> Each renderer accepts `(payload, fmt: Literal["json","markdown","table"]) -> str`. Unsupported formats raise `ValueError` listing supported options. 
Table rendering only applies where it makes sense (shortlist, comparison, diff, similar-units). +> +> Add `tests/test_formatting.py` covering the three tests listed in PRD §25.5. + +### Files + +* `finn_eiendom/formatting.py` (new) +* `tests/test_formatting.py` (new) + +### Done when + +* `pytest tests/test_formatting.py` is green. +* `render_*` is the *only* place that formats output. No inline rendering anywhere else (verified by reading diffs of steps 6 and 7). + +--- + +## Step 5 — HTTP retry on 5xx + +### Kickoff prompt + +> Read **PRD.md** A9 (acceptance criterion), **`.github/instructions/python.instructions.md`**. +> +> Extend `HTTPClient.get()` in `finn_eiendom/http.py` to retry on 5xx responses (500/502/503/504) with exponential backoff `1s, 2s, 4s`, up to `retries` attempts (default 3). Surface 4xx as `httpx.HTTPStatusError` immediately. Apply the existing `request_delay` between any two calls. +> +> If you're unsure about `httpx` retry semantics or `respx` test patterns, use `context7`. +> +> Add `tests/test_http.py` covering the three tests listed in PRD §25.6 using `respx`. + +### Files + +* `finn_eiendom/http.py` (edit) +* `tests/test_http.py` (new) + +### Done when + +* `pytest tests/test_http.py` is green. +* `httpx` imports remain confined to `http.py`. + +--- + +## Step 6 — Replace FastAPI with FastMCP + +### Kickoff prompt + +> Read **PRD.md** §14 (MCP design — every tool and input schema), §17 (code ownership), and **`.github/instructions/mcp.instructions.md`** end-to-end. +> +> Rewrite `finn_eiendom/mcp_server.py` from scratch: +> - Use `from mcp.server.fastmcp import FastMCP`. +> - Configure stderr-only logging. +> - Register all 14 tools listed in PRD §14.1 with the `finn_` prefix. +> - Each tool body has the shape in `mcp.instructions.md` §"Tool body shape": one `service.` call, one `formatting.render_*` call, try/except returning the JSON error envelope. +> - Input schemas as in PRD §14.2. 
+> - Annotations: `readOnlyHint=True` for all except `finn_save_feedback`. +> - `main()` calls `mcp.run(transport="stdio")`. +> - Add `finn-eiendom-mcp = "finn_eiendom.mcp_server:main"` to `[project.scripts]` in `pyproject.toml`. +> +> If unsure about FastMCP annotations or transport options, use `context7:query-docs` on the MCP Python SDK. +> +> Rewrite `tests/test_mcp_server.py` to cover the three tests in PRD §25.3. Use the SDK's testing helpers — do not spawn a subprocess. +> +> Verify: `finn-eiendom-mcp` boots over stdio, `mcp dev finn_eiendom/mcp_server.py` lists all 14 tools. + +### Files + +* `finn_eiendom/mcp_server.py` (full rewrite) +* `tests/test_mcp_server.py` (full rewrite) +* `pyproject.toml` (edit `[project.scripts]`) + +### Done when + +* `mcp_server.py` imports only `service`, `formatting`, `models`, `config`, stdlib, `mcp`, `pydantic`. +* All 14 tools registered. +* `pytest tests/test_mcp_server.py` is green. +* `grep -rn "FastAPI" finn_eiendom/` is empty. + +--- + +## Step 7 — CLI + +### Kickoff prompt + +> Read **PRD.md** §15 (CLI design — every command and option) and **`.github/instructions/cli.instructions.md`** end-to-end. +> +> Create `finn_eiendom/cli.py` with a `typer.Typer` app exposing all commands in PRD §15.1, plus `finn_eiendom/__main__.py` that calls the app. Add to `pyproject.toml`: +> ``` +> [project.scripts] +> finn-eiendom = "finn_eiendom.cli:app" +> ``` +> +> Each command: +> - Translates options into a `service.` call. +> - Calls `formatting.render_*(result, format)` and `typer.echo(...)`. +> - No business logic, no inline rendering. +> - Body under ~20 lines. +> +> Sub-app for `cache` (stats/clear/clear-html/clear-json) and `config` (show/path). `serve` accepts `--transport stdio|http` and dispatches to `mcp_server.main()` or the HTTP transport. +> +> If unsure about Typer sub-apps or `CliRunner`, use `context7`. +> +> Add `tests/test_cli.py` covering the five tests in PRD §25.4 using `typer.testing.CliRunner`. 
Mock `service.*` with `monkeypatch` — do not exercise the full stack here, that's `test_service.py`. + +### Files + +* `finn_eiendom/cli.py` (new) +* `finn_eiendom/__main__.py` (new) +* `tests/test_cli.py` (new) +* `pyproject.toml` (edit) + +### Done when + +* `finn-eiendom --help` lists every command in PRD §15.1. +* `cli.py` imports only `service`, `formatting`, `models`, `config`, stdlib, `typer`. +* `pytest tests/test_cli.py` is green. + +--- + +## Step 8 — Diff workflow (new / removed / changed) + +### Kickoff prompt + +> Read **PRD.md** §10.8, §13 (search_runs table), workflow I in §18, and **`.github/instructions/clean-code.instructions.md`**. +> +> Implement: +> 1. `search_runs` and `scores` tables in `cache.py` (use existing migration pattern). +> 2. `service.get_new_ads_since_last_run(search_url)` that compares against the previous run for the same `normalized_url` and returns `{new_ads, removed_ads, changed_ads}` with price/common_costs/status diffs on changed. +> 3. `finn_get_new_ads_since_last_run` MCP tool. +> 4. `finn-eiendom diff {search_url}` CLI command. +> 5. `formatting.render_diff(result, fmt)`. +> +> Add tests covering: empty previous-run case, all-new case, mixed new+removed+changed case. + +### Done when + +* The three new tests pass. +* MCP and CLI both expose the same behavior with identical defaults. + +--- + +## Step 9 — Compare workflow + +### Kickoff prompt + +> Read **PRD.md** workflow K in §18 and §14.2 (`CompareAdsInput`). +> +> Implement `service.compare_ads(finnkoder, include_eiendom_no=True, include_comps=True)` returning a comparison table + winners by category (best value / lifestyle / hybel / bargain / safest / highest risk / most overpriced). +> +> Wire `finn_compare_ads` MCP tool and `finn-eiendom compare {finnkode} {finnkode} ...` CLI command. Add `formatting.render_comparison`. Tests for service and CLI. + +### Done when + +* `finn-eiendom compare 462400360 461153194 --format markdown` produces a readable comparison. 
+* Service test covers the winners-by-category logic. + +--- + +## Step 10 — Similar-to-liked + +### Kickoff prompt + +> Read **PRD.md** workflow G in §18 and `FindSimilarToLikedInput` in §14.2. +> +> Implement `service.find_similar_to_liked(finnkode, mode, listing_status)`: +> 1. Load FinnAd; verify `feedback` has `verdict=liked` for this finnkode. +> 2. Ensure Eiendom.no enrichment + unit_vector exist. +> 3. Fetch similar-units (prefer `FOR_SALE` for recommendations, `RECENTLY_SOLD` for comps). +> 4. Score candidates against user preferences. +> 5. Return ranked recommendations. +> +> Wire MCP tool and CLI command. Tests covering: no liked feedback raises clear error; happy path returns ranked list. + +### Done when + +* `finn-eiendom similar-to-liked 462400360` returns ranked candidates when the listing has a liked verdict, and a clear error otherwise. + +--- + +## Step 11 — Architecture tests + +### Kickoff prompt + +> Read **PRD.md** A10 (architecture acceptance criterion) and §17.3 (layering invariants). +> +> Create `tests/test_architecture.py` that walks every `.py` file under `finn_eiendom/` with `ast`, collects all `import` and `from X import Y` statements, and asserts the layering invariants in PRD A10: +> - No `httpx` outside `http.py`. +> - No `sqlite3` outside `cache.py`. +> - No `BeautifulSoup` outside `search.py` / `ad.py`. +> - No `msgpack` outside `eiendom_no.py`. +> - `mcp_server.py` and `cli.py` import only from the allowed set. +> - `service.py` never imports `mcp_server` or `cli`. +> +> Add a parametrize'd test per invariant so failures show which module violated which rule. Failures should print the offending import line and module. + +### Done when + +* `pytest tests/test_architecture.py` is green. +* Deliberately introducing a violation (e.g. `import httpx` in `service.py`) makes a test fail with a clear message. 
+ +--- + +## Step 12 — README + Claude Desktop config + final verification + +### Kickoff prompt + +> Read **PRD.md** §21 (deployment), §22 (MVP scope), §24 (all acceptance criteria), **README.md** and **USAGE.md**. +> +> Update `README.md` and `USAGE.md` so every command, env var, and Claude Desktop snippet matches what was actually built in steps 1–11. Verify with the user's exact paths. +> +> Run the full A1–A11 acceptance check: +> +> - A1: `finn-eiendom-mcp` boots over stdio; `mcp dev finn_eiendom/mcp_server.py` lists all 14 tools. +> - A2: `finn-eiendom --help` lists every §15.1 command; each command runs against fixtures. +> - A3 – A9: matching service tests pass. +> - A10: `pytest tests/test_architecture.py` is green. +> - A11: `ruff check .` is clean; `pytest` is fully green; `mypy --strict finn_eiendom` passes or is documented as a gap. +> +> Report any failures with specific file/line references — don't paper over them. + +### Files + +* `README.md` (edit to match reality) +* `USAGE.md` (edit to match reality) + +### Done when + +* All 11 acceptance criteria in PRD §24 pass. +* README + USAGE quickstart examples actually work end-to-end on a fresh clone. + +--- + +## Definition of done for the whole phase + +Merge `chore/cleanup-phase-2-prep` into `main` when **every** box is checked: + +* [ ] All 12 steps merged in order. +* [ ] `finn-eiendom-mcp` boots over stdio with all 14 tools. +* [ ] `finn-eiendom --help` lists every command in PRD §15.1. +* [ ] `pytest` is green, including the new `test_service.py`, `test_cli.py`, `test_http.py`, `test_formatting.py`, `test_models.py`, `test_architecture.py`. +* [ ] `ruff check .` is clean. +* [ ] `mypy --strict finn_eiendom` passes or has a documented exception list. +* [ ] `README.md` and `USAGE.md` quickstart examples work on a fresh clone in under 5 minutes. +* [ ] Claude Desktop config in USAGE.md is verified to work against your installation. 
+ +--- + +## When a step blocks + +If a step blocks on an unclear requirement: + +1. Re-read the relevant PRD section. +2. Check `PRD.md` §28 (open questions) — the answer may be a deferred decision. +3. If still unclear, write the question down, pick the simplest interpretation, mark it `# TODO(): revisit ` in code, and move on. + +If a step blocks on a library question (FastMCP, Pydantic v2, Typer, httpx, msgpack, respx): + +1. Use `context7` — see `.github/instructions/docs.instructions.md`. +2. If context7 returns nothing useful, write the smallest possible spike in `scratch/` (gitignored) to verify behavior. + +If a step blocks on §17 (code ownership) — i.e. it feels like the right answer requires putting logic in the "wrong" place: + +1. Stop. +2. Re-read PRD §17.2 (decision table) and §17.3 (layering invariants). +3. Ask whether the service layer is actually missing a function. Usually it is. +4. Add the missing service function instead of bending the layering. diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..a1c0a36 --- /dev/null +++ b/Makefile @@ -0,0 +1,47 @@ +.PHONY: help venv install dev test test-fast lint format typecheck check clean serve mcp doctor + +PYTHON ?= python3.12 +VENV ?= .venv +BIN = $(VENV)/bin + +help: ## Show this help + @grep -E '^[a-zA-Z_-]+:.*?## ' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf " \033[36m%-12s\033[0m %s\n", $$1, $$2}' + +venv: ## Create the local virtualenv + uv venv $(VENV) 2>/dev/null || $(PYTHON) -m venv $(VENV) + @echo "Activate with: source $(BIN)/activate" + +install: venv ## Install the package (editable) with dev extras + uv pip install --python $(BIN)/python -e ".[dev]" 2>/dev/null || $(BIN)/pip install -e ".[dev]" + +dev: install ## Alias for install + +test: ## Run the full test suite + $(BIN)/pytest + +test-fast: ## Run tests, fail fast, verbose + $(BIN)/pytest -x -v + +lint: ## Lint with ruff + $(BIN)/ruff check . + +format: ## Auto-format with ruff + $(BIN)/ruff format . 
+ +typecheck: ## Static type-check with mypy + $(BIN)/mypy finn_eiendom + +check: lint typecheck test ## Run lint + typecheck + tests + +clean: ## Remove caches and build artifacts + rm -rf .pytest_cache .ruff_cache .mypy_cache build dist *.egg-info + find . -type d -name __pycache__ -prune -exec rm -rf {} + + +serve: ## Start the MCP server over HTTP on port 8010 + $(BIN)/finn-eiendom serve --transport http --port 8010 + +mcp: ## Start the MCP server over stdio + $(BIN)/finn-eiendom-mcp + +doctor: ## Smoke-check the install + $(BIN)/finn-eiendom doctor diff --git a/PRD.md b/PRD.md new file mode 100644 index 0000000..482c96e --- /dev/null +++ b/PRD.md @@ -0,0 +1,1556 @@ +# PRD: finn-eiendom-mcp — Personal Real Estate Scout + +> Private, self-hosted property analysis platform built around a FINN scraper, an Eiendom.no enrichment layer, a scoring engine, and a SQLite cache. Exposed through three coordinated entry points: a **Python library** (`finn_eiendom`), an **MCP server** (FastMCP, stdio + optional HTTP), and a **CLI** (`finn-eiendom`). The Python library is the source of truth — MCP and CLI are thin, parallel front ends over the same service layer. + +--- + +## 1. Summary + +`finn-eiendom-mcp` analyzes a FINN real-estate search URL and returns a ranked shortlist of properties enriched with Eiendom.no estimates, comparable recently-sold units, scoring, risk flags, and broker questions. The same domain code powers: + +1. **MCP tools** for Claude Desktop / AI clients / n8n / agents. +2. **A CLI** for terminal-driven manual analysis and shell scripting. +3. **A Python library** that tests and notebooks can call directly. + +```text +FINN search URL + → listings (search cards) + → FINN details + → Eiendom.no enrichment (unit search + unit detail) + → unit_vector (built locally) + → similar-units / comps + → scoring + categorization + → shortlist + risks + next steps + broker questions +``` + +This is a **private, low-frequency decision-support tool**. 
Not a SaaS, not a crawler, not a bidding tool, not legal/technical/financial advice. + +--- + +## 2. Why three entry points + +| Layer | Audience | Transport | Purpose | +| ---------------- | ------------------------------------- | -------------------- | ----------------------------------------------------------------------------------- | +| Python library | tests, notebooks, custom scripts | in-process | Source of truth. Pure functions + async I/O. No global state beyond SQLite path. | +| MCP server | Claude Desktop, n8n, AI agents | stdio + streamableHttp | LLM-driven analysis, shortlisting, broker prep. | +| CLI | terminal, cron, ad-hoc debugging | stdio | Quick checks, smoke tests, scripted runs, demonstrations of new behavior. | + +The architectural rule: **all three layers call the same service functions**. MCP tools and CLI commands are thin wrappers around `service.py`. If a change goes into one, equivalent behavior appears in the others. + +--- + +## 3. User context & preferences + +User and partner are searching for a home in the Oslo area, roughly 9–12 MNOK depending on total monthly cost, rental/hybel potential, and property quality. Important preferences: + +* Good location and quality of life. +* Enough space and strong floor plan. +* Minimum 2 bedrooms, preferably more. +* Balcony, terrace, views, sun, sea/nature proximity. +* Hybel/rental potential or flexible layout. +* Willing to renovate themselves if the price is right. +* Renovation need is **not** automatically negative. +* Strong interest in **bargain candidates** where competition may be lower due to older standard or poor presentation. +* Avoid uncontrolled technical/legal risk: moisture, rot, illegal hybel, unapproved changes, severe TG3, unclear housing-association finances. + +--- + +## 4. Problem + +FINN search results are not ranked by the user's actual decision criteria. Manually triaging dozens of listings is slow and inconsistent. 
The current process lacks: + +* Automated extraction of FINN search and listing data. +* Linking FINN listings to structured Eiendom.no units. +* Price evaluation against Eiendom.no estimates and comparable sales. +* Similar-property discovery from listings the user already likes. +* Consistent scoring of price, location, layout, risk, renovation upside, hybel potential. +* Local history of seen listings, changes, scores, and feedback. +* Integration with AI clients and shell tooling. + +--- + +## 5. Goals + +The system shall: + +1. Accept a FINN real estate search URL via library, MCP tool, or CLI command. +2. Parse FINN search pages and extract listing cards, URLs, and finnkoder. +3. Fetch FINN listing detail pages and parse into a structured `FinnAd`. +4. Normalize Norwegian numbers, areas, currencies, dates, URLs. +5. Resolve each FINN URL to an Eiendom.no `unitCode` and fetch the unit detail. +6. Build a base64url-encoded `unit_vector` from unit detail and fetch similar-units / comps. +7. Score each listing using FINN data, Eiendom.no estimates, comps, user preferences, and risk signals. +8. Return a ranked shortlist with reasons, risks, next steps, and broker questions. +9. Cache HTML, JSON, parsed ads, units, comps, scores, and feedback in SQLite. +10. Detect new/removed/changed listings between runs of the same search URL. +11. Store user feedback (`liked`, `rejected`, `interesting`, `risk`, `viewing_candidate`, etc.) and surface it in subsequent runs. +12. Expose all of the above through MCP tools, CLI commands, and Python functions with consistent semantics. +13. Run locally in a project-local virtualenv. Docker is supported but optional. + +--- + +## 6. Non-goals + +MVP shall not: + +* Crawl all of FINN or Eiendom.no. +* Bypass rate limits, bot protection, authentication, or access controls. +* Bulk-harvest or redistribute data. +* Contact brokers automatically. +* Place bids automatically. +* Interpret full PDF condition reports. 
+* Provide official valuation, legal advice, technical inspection, or mortgage advice. +* Expose a public SaaS service. +* Build a web UI. + +--- + +## 7. Primary use cases + +| ID | Use case | Description | +| ---- | ----------------------------- | ------------------------------------------------------------------------------------ | +| UC1 | Analyze FINN search | Paste a FINN search URL → ranked shortlist with reasons/risks/next steps. | +| UC2 | Find bargain candidates | Surface listings with renovation need or weak presentation that may be underpriced. | +| UC3 | Separate renovation from risk | Treat cosmetic renovation as upside; flag technical/legal risk. | +| UC4 | Compare listings | Side-by-side comparison of multiple finnkoder. | +| UC5 | Save feedback | Mark listings as liked, rejected, interesting, risk, viewing candidate, etc. | +| UC6 | Find new listings since last run | Show new/removed/changed listings vs the prior run of the same search URL. | +| UC7 | Broker questions | Generate concrete questions based on risks, deviations, hybel status, comps. | +| UC8 | Eiendom.no enrichment | Add estimates, coordinates, area, rooms, floor, year, market data. | +| UC9 | Price fairness | Classify price as cheap / fair / expensive vs estimate and comps. | +| UC10 | Similar to liked | Find properties similar to listings the user has explicitly liked. | +| UC11 | Comparable sales | Fetch similar recently sold units to support valuation and bargain scoring. | + +--- + +## 8. Inputs + +Supported inputs across all three layers: + +* FINN search URL. +* FINN listing URL. +* Finnkode (string of digits). +* List of finnkoder. +* Eiendom.no `unitCode`. +* Eiendom.no `unit_vector` (base64url string). +* User feedback verdict + notes. +* Optional scoring/preference overrides (JSON or env). + +Example FINN search URL: + +```text +https://www.finn.no/realestate/homes/search.html?bbox=...&area_from=60&min_bedrooms=2&price_collective_to=12000000&... +``` + +--- + +## 9. 
External endpoints + +### 9.1 FINN HTML + +Not JSON. Parse HTML, cache aggressively, run at low frequency. + +| Method | URL pattern | Purpose | +| ------ | ---------------------------------------------------------------------------------------- | ---------------------------------------------------------- | +| GET | `https://www.finn.no/realestate/homes/search.html?{query_params}` | Parse search result cards, listing URLs, finnkoder. | +| GET | `https://www.finn.no/realestate/homes/search.html?{query_params}&page={N}` | Pagination. | +| GET | `https://www.finn.no/realestate/homes/ad.html?finnkode={finnkode}` | Parse listing detail page. | +| GET | `{calendar_ics_url_from_listing_html}` | Optional: parse viewing times (prefer parsing from listing HTML first). | + +Important search params: `bbox`, `location`, `area_from`, `area_to`, `price_collective_to`, `price_collective_from`, `min_bedrooms`, `facilities`, `floor_navigator`, `lifecycle`, `page`, `stored-id`. + +### 9.2 Eiendom.no + +Real JSON API. Used for enrichment, valuation, and similar-units. + +#### 9.2.1 Resolve FINN listing → Eiendom.no unitCode + +``` +GET https://api.eiendom.no/api/v1/geodata/units/search/?search={url_encoded_finn_listing_url_or_address} +``` + +Returns: + +```json +{ + "units": [ + { + "unitCode": "c-gxw-xmyum-s2a", + "address": "Gunnar Schjelderups v. 
11D H0502, Oslo", + "geometry": { "type": "Point", "coordinates": [10.77, 59.95] } + } + ], + "summary": { "totalUnitsFound": 1, "totalCitiesFound": 1 } +} +``` + +#### 9.2.2 Fetch unit detail + +``` +GET https://api.eiendom.no/api/v1/geodata/units/{unitCode}/ +``` + +Important response fields: `unitCode`, `address`, `unitName`, `streetAddress`, `postalName`, `registrationCode`, `geometry.coordinates`, `specification.{propertyType, floor, rooms, constructionYear, usableArea}`, `valuation.{estimatedSellingPrice, estimatedSellingPriceLower, estimatedSellingPriceUpper}`, `latestMarketData.{listingPrice, monthlyCosts, squareMeterPrice, daysOnMarket, saleStatus, marketPlacementScore}`. + +#### 9.2.3 Build `unit_vector` (local, not HTTP) + +Encoding step before similar-units. Generated from unit detail data: + +```json +{ + "lon": 10.7803, + "lat": 59.9287, + "ptype": "APARTMENT", + "floor": 8, + "rooms": 5, + "built": 2005, + "area": 80, + "price": 8491082 +} +``` + +Encoding: `unit_vector = base64url_without_padding(msgpack(payload))`. + +Library functions (in `eiendom_no.py` only): + +* `build_unit_vector(unit) -> str` +* `decode_unit_vector(unit_vector) -> dict` + +#### 9.2.4 Fetch similar-units + +``` +GET https://api.eiendom.no/api/v1/geodata/units/similar/?unit_vector={unit_vector} +``` + +Returns a list of comparable units with `unitCode`, `address`, `geometry`, `specification`, and `marketData.{listingPrice, jointDebt, monthlyCosts, sellingPrice, squareMeterPrice, daysOnMarket, saleStatus, finalizedAt}`. + +`listing_status` (RECENTLY_SOLD / FOR_SALE / CURRENT) is implemented as a **local filter** over the returned `marketData.saleStatus` and `finalizedAt`. Only pass it to the API if later experimentation confirms server-side support. 
+ +### 9.3 Optional Hjemla (disabled by default) + +``` +GET https://consumer-service-hjemla-prod.propcloud.no/public/market/address-list +``` + +Params: `marketType`, `period`, `marketStates`, `unittypes`, bbox (`swLat`, `neLat`, `swLng`, `neLng`), `limit`, `randomize`. + +Useful for bbox-level market snapshots. Disabled in MVP via `HJEMLA_ENABLED=false`. + +### 9.4 MCP server endpoint + +stdio is the default. Optional Streamable HTTP on `POST http://{host}:8010/mcp`. Operational endpoints when running HTTP: `GET /health`, `GET /version`, `GET /debug/config`. + +--- + +## 10. Functional requirements + +### 10.1 FINN search extraction + +Fetch and parse FINN search pages. Extract and deduplicate by finnkode. Support pagination via `page=N` and respect `FINN_MAX_SEARCH_PAGES`. Search-card fields when available: finnkode, URL, title, address/area, area, asking_price, total_price, common_costs, ownership_type, property_type, bedrooms, floor, viewing time, broker. + +### 10.2 FINN listing detail extraction + +Fetch and parse individual listing pages. Fields when available: finnkode, URL, title, address, postal_area, district, property_type, ownership_type, asking_price, total_price, shared_debt, common_costs, fees, municipal_fees, BRA/BRA-i/BRA-e/BRA-b, P-room, rooms, bedrooms, floor, construction_year, energy_rating, heating, balcony/terrace, elevator, parking/garage, viewings, listing_description, broker_name, broker_company, document_links. + +### 10.3 Normalization + +* Norwegian formatted numbers: `7 200 991 kr` → `7200991`. +* Areas: `77 m²` → `77`. +* Dates/viewings → ISO 8601. +* URLs → absolute. +* Missing values → `null`. +* Finnkode and Eiendom.no unitCode as strings. + +### 10.4 Eiendom.no enrichment + +Enabled by default. Flow: FINN listing URL → unit search → `unitCode` → unit detail → structured market data. 
Store: unit_code, address, coordinates, registration code, property_type, floor, rooms, construction_year, usable_area, estimated_selling_price + lower/upper, latest market data (listing_price, sqm_price, monthly_costs, days_on_market, sale_status), market_placement, raw JSON. + +If enrichment fails, the analysis continues with FINN data only and marks enrichment as `unavailable`. + +### 10.5 Similar-units / `unit_vector` + +Required functions: `build_unit_vector(unit)`, `decode_unit_vector(unit_vector)`, `get_similar_units(unit_vector, listing_status)`. Supported listing statuses: `RECENTLY_SOLD` (default for comps), `FOR_SALE` (active recommendations), `CURRENT` (if confirmed). Similar-unit fields when available: unit_code, address, coordinates, property_type, floor, rooms, construction_year, area, listing_price, selling_price, shared_debt, common_costs, sqm_price, days_on_market, sale_status, finalized_at, raw JSON. + +### 10.6 Cache and history + +SQLite. Default TTLs: + +| Data | Default TTL | +| -------------------- | ----------------------: | +| Search results | 30–60 minutes | +| FINN listing details | 6–24 hours | +| Eiendom.no unit data | 24 hours | +| Similar-units | 24 hours | +| Feedback/history | Permanent until deleted | + +### 10.7 Feedback + +Verdict vocabulary: `liked`, `rejected`, `interesting`, `bargain_candidate`, `risk_object`, `viewing_candidate`, `viewed`, `too_expensive`, `too_small`, `too_far_out`, `too_high_risk`, `likes_location`, `likes_layout`, `dislikes_area`. Stored permanently. `liked` listings are used as seeds for similar-to-liked recommendations. Feedback can be used as a soft scoring signal. + +### 10.8 Diffs between runs + +For a normalized search URL, the system shall compare finnkoder against the previous run and report `new_ads`, `removed_ads`, and `changed_ads` (price, common costs, status). Optionally re-fetch only new or changed details. + +--- + +## 11. 
Scoring and classification + +### 11.1 Score model (clamped to 0–100) + +| Category | Range | +| ------------------------------------- | ----: | +| Economy / total cost | 0–20 | +| Eiendom.no estimate / market position | 0–20 | +| Comparable sales / similar-units | 0–20 | +| Location | 0–15 | +| Layout and potential | 0–20 | +| Outdoor space / view / sun | 0–15 | +| Hybel / rental potential | 0–10 | +| Renovation / bargain upside | 0–15 | +| Technical / legal risk | -20–0 | + +### 11.2 Categories + +`bargain_candidate`, `safe_candidate`, `lifestyle_candidate`, `hybel_candidate`, `renovation_candidate`, `similar_to_liked`, `comparable_sale_match`, `risk_object`, `too_expensive`, `not_interesting`, `manual_review_required`. + +### 11.3 Bargain candidate logic + +A listing may be a bargain candidate when several of these are true: low sqm price vs comps, listing price below estimate, price near lower estimate interval, sqm price below similar recently sold, older standard / renovation need / weak presentation, strong underlying location/layout, suitable size, risk appears controllable. + +### 11.4 Renovation logic + +Renovation need is not automatically negative. + +* **Opportunity:** older standard, modernization need, renovation object, cosmetic wear, outdated kitchen/surfaces, weak presentation, layout improvement potential. +* **Risk:** moisture, rot, mold, drainage issues, load-bearing concerns, illegal/unapproved changes, non-approved hybel, serious electrical/wet-room deviations, TG3 with high cost or safety implications. + +### 11.5 Hybel / rental logic + +* **Positive:** hybel, rental unit, separate entrance, extra bathroom/kitchenette, basement/sokkel, secondary section, stated rental income. +* **Risk:** not approved, not applied for, not building-reported, only "disposable room", not approved for permanent residence, board approval required. + +Output classifies as: documented legal hybel / possible hybel potential / unclear/risky hybel / not relevant. 
+ +### 11.6 Market and comparable outputs + +Market estimate: `market_score`, `price_vs_estimate_pct`, `price_position` (`below_estimate` / `within_estimate_range` / `above_estimate` / `unknown`), `sqm_price_position` (`cheap` / `normal` / `expensive` / `unknown`). + +Comparable: `comparable_score`, `comps_count`, `avg_selling_price`, `median_selling_price` (where possible), `avg_sqm_price`, `sqm_price_delta_pct`, `price_delta_pct`, `confidence` (`low` / `medium` / `high`). + +Risk factors: too few comps, comps too far away, large differences in area/rooms/floor/year, old sale dates, low confidence. + +--- + +## 12. Technical architecture + +```text +AI client / Claude Desktop / n8n / agent ← MCP layer + ↓ + FastMCP (stdio | streamable HTTP) + +User in a terminal ← CLI layer + ↓ + finn-eiendom CLI (typer) + +Python tests / notebooks / custom scripts ← Library layer + ↓ + import finn_eiendom + + ──────── all three above share ──────── + + finn_eiendom.formatting ← render_* for json/markdown/table + ↓ + finn_eiendom.service ← orchestration: get_or_fetch, analyze_* + ↓ + finn_eiendom.analysis ← shortlist + summary building + ↓ + search / ad / eiendom_no / scoring / feedback + ↓ + finn_eiendom.cache (SQLite) ← html, json, ads, units, comps, scores, feedback + ↓ + finn_eiendom.http (httpx) ← delay, retry, user-agent + ↓ + FINN HTML + Eiendom.no JSON (+ optional Hjemla) +``` + +### 12.1 Module layout + +```text +finn_eiendom/ + __init__.py + config.py # env / defaults / TTLs + models.py # Pydantic v2 models + parser.py # number/area/date/URL/finnkode normalization + http.py # async HTTP with delay, retry, user-agent + cache.py # SQLite schema + persistence + search.py # FINN search HTML parsing + pagination + ad.py # FINN listing HTML parsing + eiendom_no.py # unit search/detail, unit_vector, similar-units + scoring.py # score model + classifications + feedback.py # verdicts + soft preference signal + analysis.py # orchestration + shortlist + summary + service.py # 
get_or_fetch_* + thin facade for MCP and CLI + formatting.py # render_* helpers shared by MCP and CLI + mcp_server.py # FastMCP wrappers around service + cli.py # typer-based CLI wrappers around service + __main__.py # python -m finn_eiendom → CLI entry +``` + +### 12.2 Layering rules + +* `mcp_server.py` and `cli.py` are **thin**. They translate inputs to service calls and format outputs via `formatting.py`. +* `service.py` orchestrates cache + fetch. Every read should consult the cache first; every fresh fetch should write back. +* `analysis.py` orchestrates the full shortlist run: search → details → enrichment → comps → scoring → summary. +* Domain modules (`search`, `ad`, `eiendom_no`, `scoring`, `feedback`) are pure or only depend on `http`/`cache`. +* No layer above the service may call `httpx` or `sqlite3` directly. + +--- + +## 13. Data model + +SQLite. Existing schema already implements `finn_ads`, `eiendom_units`, `similar_units`, and `cache_meta`. MVP additions: `search_runs`, `scores`, `feedback`. + +```sql +CREATE TABLE finn_ads ( + finnkode TEXT PRIMARY KEY, + url TEXT, + payload TEXT NOT NULL, -- JSON-serialized FinnAd + fetched_at TEXT NOT NULL +); + +CREATE TABLE eiendom_units ( + unit_code TEXT PRIMARY KEY, + payload TEXT NOT NULL, -- JSON-serialized EiendomUnit + fetched_at TEXT NOT NULL +); + +CREATE TABLE similar_units ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + unit_code TEXT NOT NULL, + listing_status TEXT NOT NULL, + payload TEXT NOT NULL, -- JSON array of SimilarUnit + fetched_at TEXT NOT NULL +); + +CREATE TABLE cache_meta ( + key TEXT PRIMARY KEY, -- e.g. 
search_page:{url}, search_cards:{url} + value TEXT NOT NULL, + expires_at TEXT +); + +CREATE TABLE search_runs ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + search_url TEXT NOT NULL, + normalized_url TEXT NOT NULL, + created_at TEXT NOT NULL, + total_found INTEGER, + total_parsed INTEGER, + total_scored INTEGER, + result_json TEXT -- shortlist snapshot +); + +CREATE TABLE scores ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + finnkode TEXT NOT NULL, + search_run_id INTEGER, + total_score REAL, + economy REAL, + market_position REAL, + comparable_sales REAL, + location REAL, + layout REAL, + outdoor REAL, + rental_potential REAL, + renovation REAL, + risk REAL, + categories_json TEXT, + explanation_json TEXT, + created_at TEXT NOT NULL +); + +CREATE TABLE feedback ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + finnkode TEXT NOT NULL, + verdict TEXT NOT NULL, + notes TEXT, + created_at TEXT NOT NULL +); +``` + +--- + +## 14. MCP design + +### 14.1 Tools + +All tool names use the `finn_` prefix to avoid collisions when the server runs alongside others. + +| Tool | Purpose | Read-only | +| ------------------------------------- | ---------------------------------------------------------------- | :-------: | +| `finn_analyze_search` | Analyze a FINN search URL and return a ranked shortlist. | yes | +| `finn_get_ad` | Fetch structured data for one finnkode. | yes | +| `finn_compare_ads` | Compare multiple listings side by side. | yes | +| `finn_save_feedback` | Store feedback/verdict/notes. | no | +| `finn_get_shortlist` | Fetch stored shortlist from a search run. | yes | +| `finn_get_new_ads_since_last_run` | Detect new/removed/changed listings vs the previous run. | yes | +| `finn_resolve_eiendom_unit` | Map FINN URL → Eiendom.no `unitCode`. | yes | +| `finn_get_eiendom_unit` | Fetch Eiendom.no unit detail by `unitCode`. | yes | +| `finn_enrich_ad` | Combine FINN listing and Eiendom.no enrichment. 
| yes | +| `finn_build_unit_vector` | Build a base64url `unit_vector` from a `unitCode`. | yes | +| `finn_decode_unit_vector` | Decode a `unit_vector` for inspection/debugging. | yes | +| `finn_get_similar_units` | Fetch comps/recommendations from `unit_vector`. | yes | +| `finn_find_similar_to_liked_ad` | Find properties similar to a listing the user has liked. | yes | +| `finn_analyze_ad_against_comps` | Evaluate one listing against `RECENTLY_SOLD` comps. | yes | + +All read-only tools set `readOnlyHint=True, destructiveHint=False, openWorldHint=True`. `finn_save_feedback` sets `readOnlyHint=False, destructiveHint=False, idempotentHint=False`. + +### 14.2 Tool input schemas (Pydantic v2) + +```python +class AnalyzeSearchInput(BaseModel): + search_url: str = Field(..., description="Full FINN search URL") + max_pages: int = Field(default=3, ge=1, le=10) + detail_limit: int = Field(default=20, ge=1, le=100) + include_details: bool = True + include_eiendom_no: bool = True + include_similar_units_for_shortlist: bool = False + response_format: Literal["json", "markdown"] = "json" + +class GetAdInput(BaseModel): + finnkode: str = Field(..., pattern=r"^\d+$") + force_refresh: bool = False + include_eiendom_no: bool = True + include_similar_units: bool = False + +class ResolveUnitInput(BaseModel): + finn_url: str + +class GetUnitInput(BaseModel): + unit_code: str + force_refresh: bool = False + +class BuildUnitVectorInput(BaseModel): + unit_code: str + +class DecodeUnitVectorInput(BaseModel): + unit_vector: str + +class SimilarUnitsInput(BaseModel): + unit_vector: str + listing_status: Literal["RECENTLY_SOLD", "FOR_SALE", "CURRENT"] = "RECENTLY_SOLD" + force_refresh: bool = False + +class FindSimilarToLikedInput(BaseModel): + finnkode: str + mode: Literal["recommendations", "comps"] = "recommendations" + listing_status: Literal["RECENTLY_SOLD", "FOR_SALE", "CURRENT"] = "FOR_SALE" + +class AnalyzeAgainstCompsInput(BaseModel): + finnkode: str + listing_status: 
Literal["RECENTLY_SOLD"] = "RECENTLY_SOLD" + +class SaveFeedbackInput(BaseModel): + finnkode: str + verdict: str + notes: Optional[str] = None + +class CompareAdsInput(BaseModel): + finnkoder: List[str] = Field(..., min_length=2, max_length=10) + include_eiendom_no: bool = True + include_comps: bool = True +``` + +### 14.3 Tool response convention + +Every tool body wraps execution in try/except and returns a JSON string. Errors return: + +```python +return json.dumps({"error": True, "code": "", "message": str(e)}) +``` + +This keeps the protocol layer happy and lets the LLM react to recoverable failures. + +When `response_format="markdown"`, return human-readable formatted text instead of JSON — produced by `formatting.py`, never inline. + +### 14.4 Resources + +```text +finn://preferences/current +finn://search-runs/latest +finn://search-runs/{id} +finn://ads/{finnkode} +finn://ads/{finnkode}/enriched +finn://shortlist/latest +finn://feedback/{finnkode} +finn://eiendom-units/{unitCode} +finn://eiendom-units/{unitCode}/similar/{listingStatus} +``` + +### 14.5 Prompts + +* `evaluate_property_for_user` +* `compare_properties_for_user` +* `refine_search_from_feedback` +* `find_more_like_this` + +Evaluation prompt template output: category, score, short assessment, why interesting, Eiendom.no estimate, comparable sales, main risks, bargain potential, questions for broker, should we view it. + +### 14.6 Entry point + +```python +# finn_eiendom/mcp_server.py +from mcp.server.fastmcp import FastMCP +mcp = FastMCP("finn_eiendom_mcp") + +# ... tools defined here ... + +def main() -> None: + mcp.run(transport="stdio") + +if __name__ == "__main__": + main() +``` + +`pyproject.toml`: + +```toml +[project.scripts] +finn-eiendom-mcp = "finn_eiendom.mcp_server:main" +finn-eiendom = "finn_eiendom.cli:app" +``` + +--- + +## 15. CLI design + +Built with `typer`. Every command maps 1:1 to a service function — same parameters, same defaults, same outputs. 
+ +### 15.1 Commands + +```text +finn-eiendom analyze-search <url> [--max-pages 3] [--detail-limit 20] [--no-details] [--no-eiendom] [--with-similar] [--format json|markdown|table] +finn-eiendom get-ad <finnkode> [--force-refresh] [--no-eiendom] [--with-similar] [--format ...] +finn-eiendom compare <finnkode> <finnkode> [<finnkode>...] [--no-eiendom] [--no-comps] [--format ...] +finn-eiendom save-feedback <finnkode> <verdict> [--notes "..."] +finn-eiendom shortlist [--run-id ID] [--limit 10] [--format ...] +finn-eiendom diff <url> [--format ...] ← new / removed / changed +finn-eiendom resolve-unit <finn-url> +finn-eiendom get-unit <unit-code> [--force-refresh] +finn-eiendom enrich-ad <finnkode> [--with-similar] +finn-eiendom build-vector <unit-code> +finn-eiendom decode-vector <unit-vector> +finn-eiendom similar-units <unit-code> [--status RECENTLY_SOLD|FOR_SALE|CURRENT] +finn-eiendom similar-to-liked <finnkode> [--mode recommendations|comps] [--status ...] +finn-eiendom analyze-against-comps <finnkode> +finn-eiendom cache stats | clear | clear-html | clear-json +finn-eiendom serve [--transport stdio|http] [--host 127.0.0.1] [--port 8010] +finn-eiendom config show | path +finn-eiendom doctor ← run a few smoke checks: cache reachable, eiendom.no reachable, finn reachable +finn-eiendom version +``` + +### 15.2 Output formats + +* `--format json` — full structured output (default for piping into `jq`). +* `--format markdown` — same data, human-readable. +* `--format table` — concise terminal table (for `analyze-search`, `compare`, `shortlist`, `diff`). + +All three are produced by `finn_eiendom.formatting`. CLI never formats inline. + +### 15.3 Examples + +```bash +# Triage a search live +finn-eiendom analyze-search 'https://www.finn.no/realestate/homes/search.html?location=...' 
--format table + +# Drill into one listing +finn-eiendom get-ad 462400360 --format markdown + +# Compare two finalists +finn-eiendom compare 462400360 461153194 --format markdown + +# Mark a listing as liked, then ask for similar +finn-eiendom save-feedback 462400360 liked --notes "great layout, check fellesgjeld" +finn-eiendom similar-to-liked 462400360 + +# Operate the MCP server in HTTP mode for n8n +finn-eiendom serve --transport http --port 8010 +``` + +### 15.4 CLI implementation pattern + +```python +# finn_eiendom/cli.py +import asyncio, typer +from . import service, formatting + +app = typer.Typer(no_args_is_help=True, add_completion=False) + +@app.command() +def analyze_search( + url: str, + max_pages: int = 3, + detail_limit: int = 20, + no_details: bool = typer.Option(False, "--no-details"), + no_eiendom: bool = typer.Option(False, "--no-eiendom"), + with_similar: bool = typer.Option(False, "--with-similar"), + format: str = typer.Option("json", "--format"), +) -> None: + result = asyncio.run(service.analyze_search( + search_url=url, + max_pages=max_pages, + detail_limit=detail_limit, + include_details=not no_details, + include_eiendom_no=not no_eiendom, + include_similar_units_for_shortlist=with_similar, + )) + typer.echo(formatting.render_shortlist(result, format)) +``` + +CLI commands are wrappers — no business logic, no rendering. If you need to add behavior, it goes in `service.py` and gets a matching MCP tool. If you need to change rendering, edit `formatting.py`. + +--- + +## 16. Service layer + +The keystone of the architecture. + +```python +# finn_eiendom/service.py — public surface + +async def get_or_fetch_ad(finnkode: str, force_refresh: bool = False) -> FinnAd: ... +async def get_or_fetch_eiendom_unit(unit_code: str, force_refresh: bool = False) -> Optional[EiendomUnit]: ... +async def get_or_fetch_similar_units(unit_code: str, listing_status: str = "RECENTLY_SOLD", force_refresh: bool = False) -> list[SimilarUnit]: ... 
+ +async def analyze_search(search_url: str, *, max_pages=3, detail_limit=20, include_details=True, include_eiendom_no=True, include_similar_units_for_shortlist=False) -> dict: ... +async def analyze_ad(finnkode: str, *, include_eiendom_no=True, include_similar_units=False) -> dict: ... +async def analyze_ad_against_comps(finnkode: str, listing_status: str = "RECENTLY_SOLD") -> dict: ... +async def find_similar_to_liked(finnkode: str, *, mode="recommendations", listing_status="FOR_SALE") -> dict: ... +async def compare_ads(finnkoder: list[str], *, include_eiendom_no=True, include_comps=True) -> dict: ... + +async def resolve_eiendom_unit_from_finn_url(finn_url: str) -> Optional[EiendomUnit]: ... +def build_unit_vector_for_unit_code(unit_code: str) -> dict: ... +def decode_unit_vector_to_dict(unit_vector: str) -> dict: ... + +def save_feedback(finnkode: str, verdict: str, notes: Optional[str] = None) -> dict: ... +def get_shortlist(run_id: Optional[int] = None, limit: int = 10) -> dict: ... +def get_new_ads_since_last_run(search_url: str) -> dict: ... +``` + +Every function: + +1. Opens its own SQLite connection via `cache.init_db(FINN_CACHE_PATH)`. +2. Reads from cache first, with TTLs from `config.py`. +3. On cache miss (or `force_refresh=True`), calls the relevant fetch function in `ad.py` / `eiendom_no.py`. +4. Writes the fresh result back to the cache. +5. Returns a typed model or dict, never `None` unexpectedly — failures raise with clear messages. + +--- + +## 17. Code ownership and anti-duplication + +This section is the constitution. Everything else flexes; this does not. The goal is one home for every piece of logic and one obvious answer to "where does this go?". + +### 17.1 The single-home rule + +Every piece of logic has exactly one home. If you're tempted to add it in two places, you're wrong about one of them — push it down a layer and call it from both. + +### 17.2 Decision table — "where does this go?" 
+ +| Concern | Lives in | Never in | +| -------------------------------------------------- | --------------------------------- | -------------------------------------------------------------- | +| Parsing FINN search HTML | `search.py` | `mcp_server`, `cli`, `analysis`, `scripts` | +| Parsing FINN listing HTML | `ad.py` | `mcp_server`, `cli`, `analysis`, `scripts` | +| Norwegian number / date / URL / finnkode normalization | `parser.py` | inline anywhere — if you write a regex twice, extract it | +| HTTP requests, retry, delay, user-agent | `http.py` | `search` / `ad` / `eiendom_no` using `httpx` directly | +| SQLite reads/writes | `cache.py` | every other module — go through cache helpers | +| Eiendom.no unit search / unit detail | `eiendom_no.py` | `ad`, `search`, `analysis` (call eiendom_no, don't reimplement)| +| `unit_vector` encode / decode | `eiendom_no.py` | `mcp_server`, `cli` (call it; don't pack msgpack inline) | +| Similar-units fetching + local filtering | `eiendom_no.py` | `analysis`, `service` (call `get_similar_units`) | +| Score components | `scoring.py` | `analysis` (use `score_ad`), `mcp_server`, `cli` | +| Category assignment | `scoring.py` (`classify_ad`) | `analysis`, `mcp_server`, `cli` | +| Feedback storage + retrieval | `feedback.py` | `mcp_server`, `cli`, `analysis` | +| "Get from cache, else fetch, else save" | `service.py` (`get_or_fetch_*`) | `mcp_server`, `cli`, `analysis` (always go through service) | +| Shortlist + summary assembly | `analysis.py` | `mcp_server`, `cli` | +| End-to-end orchestration (search → shortlist) | `service.py` (`analyze_search`) | `mcp_server`, `cli` (they just call it) | +| MCP tool definitions + annotations | `mcp_server.py` | `service`, `cli` | +| MCP error wrapping `{"error": True, ...}` | `mcp_server.py` only | `service` (which raises), `cli` (which has its own exit codes) | +| CLI command definitions + Typer plumbing | `cli.py` | `service`, `mcp_server` | +| Output formatting (json / markdown / table) | 
`formatting.py` | inline in `mcp_server.py` or `cli.py` | +| Env-var defaults | `config.py` | hardcoded anywhere | +| Pydantic models | `models.py` | redefined locally; subclass only if needed | + +### 17.3 Layering invariants + +The dependency graph is acyclic and points downward: + +``` +cli.py ─┐ + ├──> service.py ──> analysis.py ──> search / ad / eiendom_no / scoring / feedback +mcp_server.py ─┘ │ + │ ├──> parser.py + │ └──> http.py / cache.py + └──> formatting.py +``` + +Hard rules: + +* `mcp_server.py` and `cli.py` are **siblings** and never call each other. +* Neither MCP nor CLI imports from `search`, `ad`, `eiendom_no`, `scoring`, `feedback`, `cache`, or `http`. They import from `service`, `models`, and `formatting` only. +* `service.py` does not import from `mcp_server` or `cli`. +* `analysis.py` does not open SQLite connections directly — it goes through `cache.py` functions. +* `search.py`, `ad.py`, `eiendom_no.py` do not open SQLite directly — they call cache helpers passed in or imported from `cache.py`. +* Nothing except `http.py` uses `httpx` directly. If `import httpx` appears anywhere else, move it. +* Nothing except `cache.py` uses `sqlite3` directly. +* Nothing except `parser.py` defines Norwegian-text regexes. + +### 17.4 Anti-duplication checklist + +Before merging any change, ask: + +1. Is this logic already implemented somewhere? (`grep` the function name and obvious keywords.) +2. If I'm copy-pasting from another file, am I about to duplicate behavior that should live in one shared function? +3. Can a new caller use an existing `service.py` function instead of writing its own orchestration? +4. Is the same Pydantic field defined in two models? If yes, factor out a base model. +5. Am I formatting output in two places (CLI + MCP)? Move it to `formatting.py`. +6. Am I opening a SQLite connection outside `cache.py`? Move it. +7. Am I building an httpx call outside `http.py`? Move it. +8. 
Am I writing a Norwegian-number / area / finnkode regex outside `parser.py`? Move it. +9. Am I adding an env-var lookup outside `config.py`? Move it. +10. Did I add a new behavior with only one front end (MCP or CLI)? If it should exist in both, the service function is missing. + +### 17.5 Examples — what NOT to do + +**Bad:** MCP tool reaches into `ad.py` directly. + +```python +# ❌ in mcp_server.py +from .ad import fetch_ad_details +@mcp.tool() +async def finn_get_ad(...): + ad = await fetch_ad_details(...) # bypasses cache! +``` + +**Good:** MCP tool goes through `service.py`. + +```python +# ✅ in mcp_server.py +from .service import get_or_fetch_ad +@mcp.tool() +async def finn_get_ad(...): + ad = await get_or_fetch_ad(finnkode, force_refresh=force_refresh) + return ad.model_dump_json() +``` + +**Bad:** CLI formats output inline that MCP also needs. + +```python +# ❌ in cli.py +def _render_shortlist_markdown(result): ... # 80 lines of formatting +# later in mcp_server.py, the same 80 lines copy-pasted +``` + +**Good:** Shared formatter. + +```python +# ✅ in finn_eiendom/formatting.py +def render_shortlist(result: dict, fmt: str) -> str: ... +# cli.py and mcp_server.py both call render_shortlist(result, fmt) +``` + +**Bad:** Service inlines parsing or HTTP. + +```python +# ❌ in service.py +async def get_or_fetch_ad(...): + html = await httpx.AsyncClient().get(url) # http belongs in http.py + soup = BeautifulSoup(html.text, "html.parser") # parsing belongs in ad.py +``` + +**Good:** Service delegates. 
+ +```python +# ✅ in service.py +async def get_or_fetch_ad(finnkode, force_refresh=False): + conn = cache.init_db(FINN_CACHE_PATH) + if not force_refresh: + cached = cache.get_finn_ad(conn, finnkode, ttl_hours=FINN_CACHE_TTL_AD_HOURS) + if cached: + return cached + ad = await ad_module.fetch_ad_details(finnkode) + cache.save_finn_ad(conn, ad) + return ad +``` + +### 17.6 The shared `formatting.py` module + +Output formatting (JSON / markdown / table) is shared between CLI (`--format`) and MCP (`response_format`). Centralize all renderers here: + +```python +# finn_eiendom/formatting.py +def render_ad(ad: FinnAd, fmt: str) -> str: ... +def render_shortlist(result: dict, fmt: str) -> str: ... +def render_comparison(result: dict, fmt: str) -> str: ... +def render_diff(result: dict, fmt: str) -> str: ... +def render_similar_units(units: list[SimilarUnit], fmt: str) -> str: ... +def render_unit(unit: EiendomUnit, fmt: str) -> str: ... +def render_score_breakdown(scores: dict, fmt: str) -> str: ... +``` + +CLI and MCP both call these. Neither has its own renderer. `fmt` accepts `"json"`, `"markdown"`, `"table"` (only where table makes sense). Unsupported values raise `ValueError` with a list of supported formats. + +### 17.7 Adding a new feature — the checklist + +For any new tool / command / behavior: + +1. **Decide the home.** Use the table in §17.2. +2. **Write the service function** in `service.py` (or extend `analysis.py` if it's pure orchestration of existing services). +3. **Add a test** for the service function in `tests/test_service.py`. +4. **Add the MCP tool** in `mcp_server.py` — thin wrapper, `response_format` aware. +5. **Add the CLI command** in `cli.py` — thin wrapper, `--format` aware. +6. **Add formatter** in `formatting.py` if output is non-trivial. +7. **Add a test** for the MCP tool registration in `tests/test_mcp_server.py`. +8. **Add a test** for the CLI command in `tests/test_cli.py`. +9. 
**Update docs** — README and the relevant `.github/instructions/*.md` if new patterns are introduced. + +If step 4 or 5 needs more than ~20 lines, you've put logic in the wrong layer. Push it down. + +### 17.8 Acceptable duplication + +A few small repetitions are tolerated to keep boundaries clean: + +* Trivial `model_dump()` / `model_dump_json()` calls at MCP and CLI boundaries. +* `try/except → format error` blocks at each MCP tool (kept identical via a helper if it grows). +* Pydantic input schema declarations at each MCP tool (they document the tool). + +Anything beyond a handful of lines is duplication and goes into a helper. + +--- + +## 18. Workflows + +### A. Analyze FINN search + +``` +Input: FINN search URL +Steps: + 1. Normalize URL. + 2. Check search-page cache (TTL 60min). + 3. Fetch page 1, parse cards. + 4. If max_pages > 1, fetch page 2..N. + 5. Deduplicate by finnkode. + 6. Record a search_run. + 7. Pre-score from card data. + 8. Select top N for detail fetch. + 9. Run workflow B for each. + 10. Score + classify each. + 11. Sort by total score. + 12. Persist scores; persist shortlist snapshot. + 13. Return shortlist + summary. +``` + +### B. Fetch and parse FINN listing + +``` +Input: finnkode +Steps: + 1. Build https://www.finn.no/realestate/homes/ad.html?finnkode={n}. + 2. Check finn_ads cache (TTL 24h). + 3. Fetch HTML, parse with ad.scrape_ad(). + 4. Normalize numbers/areas/dates via parser.py. + 5. save_finn_ad(). +Output: FinnAd. +``` + +### C. Eiendom.no enrichment + +``` +Input: FINN listing URL or finnkode +Steps: + 1. Build full FINN URL. + 2. Cache check on unit search. + 3. eiendom_no.search_unit_from_finn_url(). + 4. Pick best match. + 5. Save unitCode on the ad. + 6. Cache check on unit detail. + 7. eiendom_no.get_unit(unitCode). + 8. save_eiendom_unit(). + 9. Compute FINN-vs-Eiendom.no mismatch warnings. +Output: EiendomUnit + mismatch list (or unavailable). +``` + +### D. Build unit_vector + +``` +Input: EiendomUnit +Steps: + 1. 
Extract lon/lat from geometry. + 2. propertyType → ptype. + 3. floor / rooms / constructionYear / usableArea. + 4. Choose price: listingPrice → estimatedSellingPrice → FINN total_price. + 5. msgpack.packb + urlsafe_b64encode (strip "="). + 6. Persist unit_vector on eiendom_units. +Output: unit_vector + payload. +``` + +### E. Fetch similar-units / comps + +``` +Input: unitCode, listing_status=RECENTLY_SOLD +Steps: + 1. Load EiendomUnit; ensure unit_vector exists. + 2. Cache check on similar_units. + 3. eiendom_no.get_similar_units(unit_vector). + 4. Normalize and filter locally: + RECENTLY_SOLD → saleStatus=SOLD and finalizedAt is set + FOR_SALE → saleStatus=FORSALE + 5. Compute summary: count, avg/median selling price, avg sqm price, avg DOM. + 6. save_similar_units(). +Output: similar_units[] + comps_summary + confidence. +``` + +### F. Score property + +``` +Input: FinnAd, EiendomUnit, similar_units, user_prefs, feedback +Steps: + 1. economy / market / comparable / location / layout / outdoor / hybel / renovation / risk. + 2. Clamp total to 0–100. + 3. Assign categories. + 4. Build explanation: why_interesting, risks, next_steps, broker_questions. +Output: scores dict + categories + summary. +``` + +### G. Find similar to liked + +``` +Input: finnkode with verdict=liked +Steps: + 1. Load FinnAd. + 2. Ensure Eiendom.no enrichment + unit_vector. + 3. Fetch similar-units (prefer FOR_SALE). + 4. Score candidates against user preferences. + 5. Return ranked recommendations. +``` + +### H. Analyze one listing against comps + +``` +Input: finnkode +Steps: + 1. workflow B → enrich (C) → comps (E, RECENTLY_SOLD). + 2. Compare listing price vs comp avg/median; sqm price vs comp avg. + 3. Compute confidence and classify cheap/fair/expensive. +Output: price_position, sqm_price_position, comparable_score, confidence, comps_summary, warnings. +``` + +### I. Detect new / removed / changed listings + +``` +Input: FINN search URL +Steps: + 1. workflow A (no detail fetch needed). 
+ 2. Compare finnkoder against previous search_run for same normalized_url. + 3. For changed ads, diff price/common_costs/status. + 4. Optionally workflow B on new + changed only. +Output: new_ads[], removed_ads[], changed_ads[]. +``` + +### J. Feedback loop + +``` +Input: finnkode + verdict + notes +Steps: + 1. INSERT into feedback. + 2. Update ad status. + 3. If verdict=liked: mark as seed for similar-to-liked recommendations. + 4. If verdict=rejected: store rejection reason. + 5. Future analyses use feedback as a soft preference signal. +``` + +### K. Compare multiple listings + +``` +Input: finnkoder[] +Steps: + 1. workflow B + C for each. + 2. Optionally workflow E. + 3. Build comparison table. + 4. Identify winners by category: best value / lifestyle / hybel / bargain / safest / highest risk / most overpriced. +Output: comparison_table + winners_by_category + recommendation + risks + broker_questions. +``` + +--- + +## 19. Output formats + +### 19.1 Shortlist item + +```text +1. [Title/address] – Score 84/100 + Category: Bargain candidate + Price: 7,200,000 total / 77 m² / 93,500 NOK per m² + Eiendom.no: Estimate 7,650,000 / range 6,900,000–8,400,000 + Comps: 12 similar recently sold / avg 98,000 NOK per m² + + Why interesting: + - Good size for price. + - Balcony and view. + - Renovation need may reduce competition. + - Flexible layout. + - Price looks low vs estimate and comps. + + Risks: + - Check wet rooms in condition report. + - Common costs need review. + - Hybel potential is not documented. + - Comparable confidence is medium. + + Next steps: + - Open listing. + - Read condition report. + - Check FINN vs Eiendom.no mismatches. + - Ask broker about planned cost increases. + - Consider viewing. +``` + +### 19.2 Analysis summary + +```text +Analyzed 83 listings. +Fetched details for 20. +Eiendom.no-enriched 18. +Fetched similar-units for 7 shortlisted listings. +Shortlisted 8. + +Best bargain candidate: ... +Best safe candidate: ... 
+Best hybel candidate: ... +Best price vs estimate: ... +Best price vs comps: ... +Highest risk: ... +Most overpriced: ... +``` + +--- + +## 20. Configuration + +| Variable | Default | Purpose | +| ----------------------------------------- | -------------------------------: | -------------------------------- | +| `FINN_CACHE_PATH` | `data/finn.sqlite` | SQLite DB path | +| `FINN_MAX_SEARCH_PAGES` | `3` | Max search pages | +| `FINN_DETAIL_LIMIT` | `20` | Max detailed listings per run | +| `FINN_REQUEST_DELAY_SECONDS` | `2` | Delay between FINN requests | +| `FINN_USER_AGENT` | `personal-finn-eiendom-analyzer/0.1` | HTTP User-Agent | +| `FINN_CACHE_TTL_SEARCH_MINUTES` | `60` | Search cache TTL | +| `FINN_CACHE_TTL_AD_HOURS` | `24` | Listing cache TTL | +| `EIENDOM_NO_ENABLED` | `true` | Enable Eiendom.no enrichment | +| `EIENDOM_NO_BASE_URL` | `https://api.eiendom.no/api/v1` | API base URL | +| `EIENDOM_NO_CACHE_TTL_HOURS` | `24` | Unit/similar cache TTL | +| `EIENDOM_NO_REQUEST_DELAY_SECONDS` | `1` | Delay between Eiendom.no calls | +| `EIENDOM_NO_SIMILAR_UNITS_ENABLED` | `true` | Enable similar-units | +| `EIENDOM_NO_SIMILAR_UNITS_DEFAULT_STATUS` | `RECENTLY_SOLD` | Default comps status | +| `HJEMLA_ENABLED` | `false` | Enable optional Hjemla API | +| `LOG_LEVEL` | `INFO` | Logging level | +| `MCP_TRANSPORT` | `stdio` | `stdio` or `streamable_http` | +| `MCP_HTTP_HOST` | `127.0.0.1` | Streamable HTTP bind | +| `MCP_HTTP_PORT` | `8010` | Streamable HTTP port | + +--- + +## 21. Deployment + +The default runtime is a project-local virtualenv. Docker is supported but optional. + +### 21.1 Local install (default) + +```bash +# in the project root +uv venv # or: python3.12 -m venv .venv +source .venv/bin/activate +uv pip install -e ".[dev]" # or: pip install -e ".[dev]" + +# now available: +finn-eiendom --help # CLI +finn-eiendom-mcp # MCP server over stdio +finn-eiendom serve --transport http --port 8010 # MCP server over HTTP +pytest # tests +ruff check . 
# lint +``` + +For a global CLI install: + +```bash +uv tool install . +# or +pipx install . +``` + +### 21.2 Claude Desktop integration (stdio) + +`~/Library/Application Support/Claude/claude_desktop_config.json`: + +```json +{ + "mcpServers": { + "finn-eiendom": { + "command": "/Users/ole/code/finn-mcp/.venv/bin/finn-eiendom-mcp", + "args": [], + "env": { + "FINN_CACHE_PATH": "/Users/ole/code/finn-mcp/data/finn.sqlite", + "EIENDOM_NO_ENABLED": "true" + } + } + } +} +``` + +Or, with `uv` from the project root: + +```json +{ + "mcpServers": { + "finn-eiendom": { + "command": "uv", + "args": ["run", "finn-eiendom-mcp"], + "cwd": "/Users/ole/code/finn-mcp" + } + } +} +``` + +### 21.3 Docker Compose (optional) + +```yaml +services: + finn-eiendom-mcp: + build: . + container_name: finn-eiendom-mcp + restart: unless-stopped + ports: + - "8010:8010" + environment: + FINN_CACHE_PATH: /data/finn.sqlite + EIENDOM_NO_ENABLED: "true" + EIENDOM_NO_SIMILAR_UNITS_ENABLED: "true" + MCP_TRANSPORT: streamable_http + MCP_HTTP_HOST: 0.0.0.0 + MCP_HTTP_PORT: "8010" + volumes: + - ./data:/data + command: ["finn-eiendom", "serve", "--transport", "http", "--host", "0.0.0.0", "--port", "8010"] +``` + +### 21.4 Dockerfile + +```dockerfile +FROM python:3.12-slim +WORKDIR /app + +RUN apt-get update && apt-get install -y --no-install-recommends gcc \ + && rm -rf /var/lib/apt/lists/* + +COPY pyproject.toml . +COPY finn_eiendom ./finn_eiendom +RUN pip install --no-cache-dir . + +EXPOSE 8010 +CMD ["finn-eiendom-mcp"] +``` + +--- + +## 22. MVP scope + +### Must have + +* Local venv install (`uv venv` + `pip install -e .[dev]`). +* Python core package with all modules listed in §12.1. +* `service.py` with `get_or_fetch_*` helpers. +* `formatting.py` shared between CLI and MCP. +* SQLite cache/history (existing schema retained, `search_runs` + `scores` + `feedback` added). +* FastMCP server with all tools in §14.1 except `finn_compare_ads` (deferred to "should have"). 
+* CLI with all commands in §15.1 except `serve --transport http` and `cache clear-*` variants (deferred). +* FINN search + listing extraction. +* Eiendom.no enrichment enabled by default. +* `unit_vector` build + decode. +* Similar-units/comps with local filtering. +* Scoring on all nine components with category assignment. +* Feedback storage. +* Shortlist output with reasons, risks, next steps, broker questions. +* Pydantic v2 models with `model_config` (no v1 `Config`). +* HTTP retry on 5xx in addition to connection errors. +* MCP entry-point registered in `pyproject.toml`. +* README + `.github/instructions/*.md` describing the architecture and ownership rules. + +### Should have + +* Pagination. +* Price per m² across the board. +* Component score breakdown in output. +* Generated broker questions. +* `finn_get_new_ads_since_last_run` / `finn-eiendom diff`. +* `finn_compare_ads` / `finn-eiendom compare`. +* Feedback-based scoring adjustment. +* `finn_find_similar_to_liked_ad` / `finn-eiendom similar-to-liked`. +* CLI `--format markdown` + `--format table`. +* CLI `serve --transport http`. +* CLI `cache stats|clear|clear-html|clear-json`. + +### Later + +* Web UI / dashboard. +* n8n workflow templates. +* PDF condition-report analysis. +* Geocoding / travel-time / sun / noise overlays. +* Push notifications. +* Price-drop monitoring. +* LLM-based listing-text scoring. +* Optional Hjemla integration. + +--- + +## 23. Roadmap + +### Phase 0 — Spike (largely done) + +* Parse one FINN search result, extract finnkoder, parse 3–5 listings. +* Resolve FINN URL → Eiendom.no `unitCode`, fetch unit detail, generate `unit_vector`, fetch similar-units with `RECENTLY_SOLD`. + +### Phase 1 — Core MVP (mostly done) + +* Stable parser, SQLite cache, Eiendom.no enrichment, similar-units/comps, basic scoring. +* Fixture-based tests for parsers, cache, scoring. + +### Phase 2 — MCP / CLI MVP (this PRD) + +* Replace FastAPI with FastMCP stdio server. 
+* Add `service.py` and `formatting.py`. +* Add `cli.py` (typer) and `__main__.py`. +* Wire MCP tools and CLI commands into the service + formatting layers. +* Pydantic v2 `model_config` cleanup. +* HTTP retry on 5xx. +* New tests: `tests/test_service.py`, expanded `tests/test_mcp_server.py`, new `tests/test_cli.py`, new `tests/test_http.py`, new `tests/test_formatting.py`, new `tests/test_architecture.py`. +* Switch from Docker-only workflow to local venv as default; keep Docker as an optional packaging path. + +### Phase 3 — Personal scoring v2 + +* Tighter user-preference weights, stronger bargain/risk/hybel logic, better confidence handling, generated broker questions. + +### Phase 4 — Agent / workflow + +* Cron / scheduled runs, diff notifications, n8n templates, Slack/Discord output. + +### Phase 5 — Dashboard + +* React/TanStack UI for shortlist, feedback, comps, history. + +--- + +## 24. Acceptance criteria + +### A1. MCP server + +Given a fresh local venv install, `finn-eiendom-mcp` starts via `mcp.run(transport="stdio")` without error. Running `mcp dev finn_eiendom/mcp_server.py` shows all tools listed in §14.1. + +### A2. CLI + +Given `pip install -e .`, `finn-eiendom --help` lists every command in §15.1. Each command runs end-to-end against cached fixtures with no live network calls and produces JSON, markdown, or table output as requested via `formatting.py`. + +### A3. Search analysis + +Given a valid FINN search URL, `service.analyze_search()` returns a ranked shortlist sorted by total score, with at least the fields: `summary`, `shortlist`, `search_url`. Cards are deduplicated by finnkode. Identical reruns within the search-cache TTL are served from cache. + +### A4. Listing detail + +Given a valid finnkode, `service.get_or_fetch_ad()` returns a `FinnAd` with at least `finnkode`, `url`, `title`, `address`, `total_price`, `area_m2`, `listing_description`. Missing fields are `None`, not raised. Subsequent calls within the TTL hit the cache. + +### A5. 
Feedback + +Given a finnkode and verdict, `service.save_feedback()` writes a `feedback` row. `liked` verdicts are surfaced by `service.find_similar_to_liked()`. + +### A6. Eiendom.no enrichment + +Given a FINN listing URL, the system resolves a `unitCode`, fetches the unit detail, stores estimate / coordinates / area / rooms / year / market data, and uses them in scoring. Enrichment failures degrade gracefully — the `eiendom_unit` field is `None` in the result, no exception escapes the service. + +### A7. Similar-units + +Given a `unitCode`, the system builds (or loads) a cached `unit_vector`, calls similar-units with the requested `listing_status`, returns structured comps, caches the result, and emits a comps summary with count, average price, average sqm price. + +### A8. Pydantic v2 + +`FinnAd`, `EiendomUnit`, `SimilarUnit` use `model_config = ConfigDict(...)`. No `class Config:` blocks remain. + +### A9. HTTP retry + +`HTTPClient.get()` retries 5xx responses with exponential backoff (`1s, 2s, 4s`) up to `retries` attempts, and surfaces 4xx as `httpx.HTTPStatusError` immediately. + +### A10. No-duplication / architecture invariants + +A static check (`tests/test_architecture.py`) verifies: + +* No `import httpx` outside `finn_eiendom/http.py`. +* No `import sqlite3` outside `finn_eiendom/cache.py`. +* No `BeautifulSoup` import outside `finn_eiendom/search.py` or `finn_eiendom/ad.py`. +* No `msgpack` import outside `finn_eiendom/eiendom_no.py`. +* `mcp_server.py` only imports from `service`, `formatting`, `models`, `config`, and stdlib + `mcp`. +* `cli.py` only imports from `service`, `formatting`, `models`, `config`, and stdlib + `typer`. + +### A11. Tooling + +`ruff check .` returns zero issues. `pytest` passes. `mypy --strict finn_eiendom` passes (or is documented as a known gap). + +--- + +## 25. Test strategy + +### 25.1 Unit tests + +* `tests/test_parser.py` — number/date/URL/finnkode normalization. +* `tests/test_search.py` — FINN search HTML → cards. 
+* `tests/test_ad.py` — FINN listing HTML → FinnAd. +* `tests/test_eiendom_no.py` — unit search/detail/similar JSON parsers, `unit_vector` encode/decode. +* `tests/test_scoring.py` — all scoring components + classifier. +* `tests/test_cache.py` — read/write/TTL behavior. + +### 25.2 Service tests (new) + +* `tests/test_service.py` + * `test_get_or_fetch_ad_uses_cache` + * `test_get_or_fetch_ad_fetches_when_cache_miss` + * `test_get_or_fetch_ad_force_refresh` + * `test_analyze_search_with_fixtures` + * `test_find_similar_to_liked_uses_liked_feedback` + +### 25.3 MCP tests + +* `tests/test_mcp_server.py` + * `test_mcp_server_has_correct_tools` + * `test_finn_decode_unit_vector_returns_json` + * `test_finn_analyze_search_handles_error` + +### 25.4 CLI tests (new) + +Use Typer's `CliRunner`. + +* `tests/test_cli.py` + * `test_cli_help` + * `test_cli_analyze_search_table_format` + * `test_cli_get_ad_json_format` + * `test_cli_save_feedback_persists_row` + * `test_cli_decode_vector` + +### 25.5 Formatting tests (new) + +* `tests/test_formatting.py` + * `test_render_shortlist_json_roundtrips` + * `test_render_shortlist_markdown_contains_score` + * `test_render_unsupported_format_raises_valueerror` + +### 25.6 HTTP tests (new) + +Use `respx`. + +* `tests/test_http.py` + * `test_get_retries_on_500` + * `test_get_raises_on_404` + * `test_post_delay_applied` + +### 25.7 Architecture tests (new) + +* `tests/test_architecture.py` — static import-graph checks listed in A10. + +### 25.8 Manual / smoke tests + +* `finn-eiendom doctor` runs. +* Real FINN URL run; compare top-3 with manual judgment. +* Save 5 feedback rows; rerun; verify scoring shift. +* Mark one ad liked; run `similar-to-liked`; sanity-check candidates. + +--- + +## 26. 
Logging, safety, compliance + +Log: start/end of analysis, pages/listings/details fetched, Eiendom.no enrichments attempted/found/failed, similar-units attempted/found/failed, cache hits/misses, parse errors, request errors, debug-level scoring details. + +Safety / compliance: + +* Private, low-frequency, user-triggered use only. +* Configurable request delays and User-Agent. +* Cache aggressively to minimize requests. +* No public redistribution of FINN/Eiendom.no data. +* No public exposure without auth — prefer LAN / Tailscale / reverse proxy. +* Scores, estimates, and comps are decision support, not official valuation, legal, or technical advice. +* stdio MCP servers must log to **stderr only** (`logging.basicConfig(stream=sys.stderr, ...)`). + +--- + +## 27. Risks & mitigations + +| Risk | Impact | Mitigation | +| ------------------------------------ | ---------------------- | -------------------------------------------- | +| FINN HTML changes | Parser breaks | Fixture tests, resilient selectors | +| Eiendom.no API/JSON changes | Enrichment/comps break | JSON fixtures, graceful fallback | +| Unit-vector format changes | Similar-units breaks | Unit tests, fall back to cache, mark unavailable | +| Too many requests | Blocking / unwanted load | Delay, cache, low-frequency use | +| Bad scoring | Poor recommendations | Explain score and uncertainty | +| Legal/technical interpretation wrong | Bad decisions | Present as broker questions, not facts | +| User overtrusts score | Missed risks | Always show risks and next steps | +| Public MCP exposure | Misuse | LAN / Tailscale / auth-only | +| stdio server writes to stdout | Breaks JSON-RPC frame | Configure logging to stderr; architecture test| +| Duplication of logic | Drift between MCP/CLI/library | Code-ownership table + architecture tests | + +--- + +## 28. Open questions + +1. Should `service.py` open one shared `sqlite3.Connection` per process or one per call? 
(current code opens per call — fine but worth measuring.) +2. Store raw HTML permanently or only parsed output? Default: only parsed, raw HTML under TTL. +3. How aggressively to detail-fetch in `analyze_search`? Default: top 20 cards. +4. Hardcode scoring weights or expose via YAML / env? Default: hardcoded for MVP; YAML in Phase 3. +5. Should feedback affect scoring in MVP, or only be stored? Default: stored only; soft signal in Phase 3. +6. Multiple scoring profiles (lifestyle / bargain / hybel / safe)? Default: single profile in MVP. +7. Permanently store Eiendom.no data or TTL only? Default: TTL only; review later. +8. How to handle FINN-vs-Eiendom.no mismatches (area, price)? Default: store both, surface as warning, never silently overwrite. +9. Which `listing_status` values does similar-units accept server-side? Verify in spike before relying on it. +10. Should recommendations use only `liked` listings, or also high-scoring listings without feedback? Default: liked only. +11. Should `serve --transport http` ship in MVP? Default: yes for cron/n8n users; stdio still default for Claude Desktop. + +--- + +## 29. First implementation plan (Phase 2) + +Step by step, each step independently mergeable. + +1. **Switch dev workflow to local venv.** Update `AGENTS.md`, `copilot-instructions.md`, `python.instructions.md`, `tests.instructions.md`. Add `clean-code.instructions.md`, `cli.instructions.md`, and `docs.instructions.md`. +2. **Pydantic v2 cleanup** — replace `class Config` with `model_config = ConfigDict(...)` in `models.py`. Add roundtrip test. +3. **Service layer** — create `finn_eiendom/service.py` with `get_or_fetch_*` and orchestration helpers. Add `tests/test_service.py`. +4. **Formatting layer** — create `finn_eiendom/formatting.py` with all `render_*` helpers. Add `tests/test_formatting.py`. +5. **HTTP retry** — extend `HTTPClient.get()` with 5xx retry + exponential backoff. Add `tests/test_http.py`. +6. 
**Replace FastAPI with FastMCP** — rewrite `finn_eiendom/mcp_server.py` against `service.py` + `formatting.py`. Add stdio `main()`. Add `[project.scripts]` entry `finn-eiendom-mcp`. Expand `tests/test_mcp_server.py`. +7. **CLI** — create `finn_eiendom/cli.py` (typer) and `finn_eiendom/__main__.py`. Add `[project.scripts]` entry `finn-eiendom`. Add `tests/test_cli.py`. +8. **Diff workflow** — implement `search_runs` table + `service.get_new_ads_since_last_run` + matching MCP tool + CLI `diff` command. +9. **Compare workflow** — implement `service.compare_ads` + MCP tool + CLI `compare` command. +10. **Similar-to-liked** — implement `service.find_similar_to_liked` + MCP tool + CLI `similar-to-liked` command. +11. **Architecture tests** — `tests/test_architecture.py` enforcing A10. +12. **README + Claude Desktop config** — document install paths for both CLI and MCP using local venv. + +Definition of done for the whole phase: + +* [ ] `finn-eiendom-mcp` boots over stdio with all tools listed. +* [ ] `finn-eiendom --help` lists every command in §15.1. +* [ ] `pytest` is green, including new `test_service.py`, `test_cli.py`, `test_http.py`, `test_formatting.py`, `test_architecture.py`. +* [ ] `ruff check .` is clean. +* [ ] README documents Claude Desktop config and a CLI quickstart using local venv. +* [ ] All acceptance criteria in §24 pass. + +--- + +## 30. Final product statement + +> **Build a compact, private, self-hosted property analysis platform whose source of truth is a typed Python library, and whose user-facing surfaces are (a) an MCP server for LLM agents, (b) a CLI for terminals and cron, and (c) a Python API for tests and notebooks. 
All three share the same service layer, the same formatting layer, and the same SQLite cache.** + +The MVP does one thing well: + +> **FINN search in → relevant property candidates out, enriched with Eiendom.no estimates, similar-units, explanation, risk, and next steps.** \ No newline at end of file diff --git a/PROJECT.md b/PROJECT.md new file mode 100644 index 0000000..3b6f4b9 --- /dev/null +++ b/PROJECT.md @@ -0,0 +1,162 @@ +# PROJECT.md — module map + +The repo at a glance. For the why and the rules, read [`PRD.md`](PRD.md) §12 and §17. For the workflow, read [`AGENTS.md`](AGENTS.md). + +--- + +## Source tree + +``` +finn-mcp/ +├── pyproject.toml +├── Makefile +├── README.md ← user-facing overview +├── USAGE.md ← full user guide +├── PRD.md ← product spec + architecture (§17 = constitution) +├── PROJECT.md ← this file +├── AGENTS.md ← workflow for AI agents and contributors +├── CLEANUP.md ← pre-Phase-2 cleanup runbook +├── IMPLEMENTATION.md ← Phase 2 build runbook (12 steps) +│ +├── .github/ +│ ├── copilot-instructions.md +│ └── instructions/ +│ ├── python.instructions.md +│ ├── mcp.instructions.md +│ ├── cli.instructions.md +│ ├── tests.instructions.md +│ ├── clean-code.instructions.md +│ └── docs.instructions.md ← context7 lookup rules +│ +├── finn_eiendom/ ← the package +│ ├── __init__.py +│ ├── __main__.py ← python -m finn_eiendom → CLI +│ ├── config.py ← env vars, defaults, TTLs +│ ├── models.py ← Pydantic v2 models +│ ├── parser.py ← Norwegian number/area/URL/finnkode normalization +│ ├── http.py ← async httpx client w/ retry + delay +│ ├── cache.py ← SQLite schema + persistence +│ ├── search.py ← FINN search HTML parsing +│ ├── ad.py ← FINN listing HTML parsing +│ ├── eiendom_no.py ← Eiendom.no unit search/detail, unit_vector, comps +│ ├── scoring.py ← score model + classifications +│ ├── feedback.py ← verdicts + soft preference signal +│ ├── analysis.py ← shortlist + summary assembly +│ ├── service.py ← get_or_fetch_* + thin facade for MCP and CLI +│ 
├── formatting.py ← render_* helpers (json/markdown/table) — shared by MCP and CLI +│ ├── mcp_server.py ← FastMCP wrappers around service.py +│ └── cli.py ← typer wrappers around service.py +│ +├── tests/ +│ ├── conftest.py +│ ├── fixtures.py +│ ├── fixtures/ ← HTML + JSON samples +│ ├── test_parser.py +│ ├── test_search.py +│ ├── test_ad.py +│ ├── test_eiendom_no.py +│ ├── test_scoring.py +│ ├── test_cache.py +│ ├── test_http.py ← retry + delay behavior +│ ├── test_service.py ← get_or_fetch_* + analyze_* +│ ├── test_formatting.py ← render_* roundtrips +│ ├── test_models.py ← Pydantic v2 roundtrips +│ ├── test_mcp_server.py ← tool registration + error envelope +│ ├── test_cli.py ← Typer CliRunner +│ └── test_architecture.py ← import-graph invariants (PRD A10) +│ +└── data/ ← gitignored; SQLite cache lives here + └── finn.sqlite +``` + +--- + +## Module responsibilities + +Single-home rule: every concern lives in exactly one module. See `PRD.md` §17.2 for the full table. + +| Module | Owns | Imports allowed | +| --------------- | --------------------------------------------------------------------- | ---------------------------------------------------------- | +| `config.py` | env-var loading, defaults, TTL constants | stdlib | +| `models.py` | Pydantic v2 models | stdlib, `pydantic` | +| `parser.py` | Norwegian text normalization (numbers, dates, URLs, finnkode) | stdlib | +| `http.py` | async `httpx.AsyncClient`, retry on 5xx, delay, user-agent | stdlib, `httpx` | +| `cache.py` | SQLite schema, reads, writes, TTL | stdlib, `sqlite3`, `models` | +| `search.py` | FINN search HTML → cards (BeautifulSoup) | stdlib, `bs4`, `parser`, `http`, `cache`, `models` | +| `ad.py` | FINN listing HTML → `FinnAd` (BeautifulSoup) | stdlib, `bs4`, `parser`, `http`, `cache`, `models` | +| `eiendom_no.py` | Eiendom.no unit search/detail, unit_vector, similar-units (msgpack) | stdlib, `msgpack`, `http`, `cache`, `models` | +| `scoring.py` | 9 score components, total clamping, category 
classifier | stdlib, `models` | +| `feedback.py` | feedback storage and retrieval | stdlib, `cache`, `models` | +| `analysis.py` | shortlist + summary assembly | stdlib, `search`, `ad`, `eiendom_no`, `scoring`, `feedback`| +| `service.py` | cache-aware orchestration; the only place that combines fetch + cache | stdlib, `config`, `cache`, `analysis`, `ad`, `eiendom_no`, `feedback`, `scoring`, `models` | +| `formatting.py` | render_* helpers (json/markdown/table) | stdlib, `models` | +| `mcp_server.py` | FastMCP tool definitions, error wrapping, stdio/HTTP entry | stdlib, `mcp`, `pydantic`, `service`, `formatting`, `config`, `models` | +| `cli.py` | typer command definitions, --format dispatch | stdlib, `typer`, `service`, `formatting`, `config`, `models` | + +`mcp_server.py` and `cli.py` are siblings — they never import each other. `service.py` never imports `mcp_server` or `cli`. `tests/test_architecture.py` enforces all of this. + +--- + +## Entry points + +Defined in `pyproject.toml`: + +```toml +[project.scripts] +finn-eiendom-mcp = "finn_eiendom.mcp_server:main" +finn-eiendom = "finn_eiendom.cli:app" +``` + +So you have: + +* `finn-eiendom-mcp` — MCP server over stdio (what Claude Desktop calls). +* `finn-eiendom` — CLI with all subcommands. +* `python -m finn_eiendom` — same as `finn-eiendom` (via `__main__.py`). +* `import finn_eiendom` — the library, for tests and notebooks. + +--- + +## Dependency graph + +``` + cli.py mcp_server.py + ↓ ↓ + └──> formatting.py <──┘ + │ + ↓ + service.py + ↓ + analysis.py + ↓ + ┌───────────┼──────────────┐ + ↓ ↓ ↓ + search.py ad.py eiendom_no.py scoring.py feedback.py + │ │ │ │ │ + ↓ ↓ ↓ ↓ ↓ + parser.py parser.py cache.py models.py cache.py + │ │ │ + ↓ ↓ ↓ + http.py http.py http.py +``` + +Bottom layer: `parser.py`, `http.py`, `cache.py`, `models.py`, `config.py`. They depend only on stdlib + one third-party library each. + +The graph is acyclic and points downward. Every arrow can be drawn; no arrow can be drawn upward. 
+ +--- + +## Where to add things + +| You want to… | Add it to… | +| ----------------------------------------- | --------------------------------------- | +| Parse a new FINN field | `ad.py` or `search.py` + `models.py` | +| Add a new score component | `scoring.py` | +| Add a new env var | `config.py` | +| Add a new MCP tool | `mcp_server.py` (after `service.py`) | +| Add a new CLI command | `cli.py` (after `service.py`) | +| Change how something renders | `formatting.py` | +| Add a new orchestration / workflow | `service.py` (then add MCP + CLI) | +| Speak to a new external API | new module next to `eiendom_no.py` | +| Add a new SQLite table | `cache.py` | + +For anything else — read `PRD.md` §17.2 and §17.7. diff --git a/README.md b/README.md new file mode 100644 index 0000000..65fc26d --- /dev/null +++ b/README.md @@ -0,0 +1,160 @@ +# finn-eiendom-mcp + +> **Private, self-hosted property analysis platform for Norwegian real estate.** Analyzes FINN listings, enriches with Eiendom.no estimates, scores against personal preferences, and surfaces bargain candidates, hybel potential, renovation upside, and risk flags. Exposed through an MCP server, a CLI, and a Python library — all sharing one service layer. + +This is a **personal tool**. Not a SaaS, not a crawler, not legal/financial advice. Run locally, low frequency, your own data. + +--- + +## What it does + +``` +FINN search URL → ranked shortlist of homes + with reasons, risks, comps, broker questions +``` + +Specifically: + +* Parses FINN search and listing pages. +* Resolves each listing to an Eiendom.no `unitCode` for valuation and similar-units. +* Builds a `unit_vector` and fetches recently-sold comparables. +* Scores 9 components (economy, market position, comps, location, layout, outdoor, hybel, renovation, risk). +* Classifies as *bargain*, *safe*, *hybel*, *renovation*, *lifestyle*, or *risk*. +* Caches everything in SQLite; remembers what you've liked or rejected. 
+* Detects new / removed / changed listings between runs. + +--- + +## Three ways to use it + +| Surface | When you want… | Entry point | +| --------------- | -------------------------------------------------------------- | ----------------------- | +| **CLI** | Quick triage in a terminal, scripting, cron | `finn-eiendom ...` | +| **MCP server** | Claude Desktop, n8n, AI agents — conversational analysis | `finn-eiendom-mcp` | +| **Python lib** | Tests, notebooks, custom scripts | `import finn_eiendom` | + +All three call the same underlying `service.py` — same defaults, same semantics, same results. + +--- + +## Quick start + +### Requirements + +* Python **3.12+** +* `uv` (recommended) or `pip` + +### Install + +```bash +git clone <repo-url> finn-mcp +cd finn-mcp + +uv venv # or: python3.12 -m venv .venv +source .venv/bin/activate +uv pip install -e ".[dev]" # or: pip install -e ".[dev]" +``` + +### First run (CLI) + +```bash +# Triage a FINN search +finn-eiendom analyze-search 'https://www.finn.no/realestate/homes/search.html?location=...' --format table + +# Drill into one listing +finn-eiendom get-ad 462400360 --format markdown + +# Mark a listing as liked +finn-eiendom save-feedback 462400360 liked --notes "great layout, check fellesgjeld" + +# Find similar properties to liked listings +finn-eiendom similar-to-liked 462400360 +``` + +### First run (Claude Desktop) + +Add to `~/Library/Application Support/Claude/claude_desktop_config.json` (macOS) or the equivalent on Linux: + +```json +{ + "mcpServers": { + "finn-eiendom": { + "command": "/absolute/path/to/finn-mcp/.venv/bin/finn-eiendom-mcp", + "env": { + "FINN_CACHE_PATH": "/absolute/path/to/finn-mcp/data/finn.sqlite", + "EIENDOM_NO_ENABLED": "true" + } + } + } +} +``` + +Restart Claude Desktop. 
Then in any chat: + +> Analyze this FINN search and shortlist the top 5 for a couple in Oslo with a 9–12 MNOK budget, willing to renovate, prefer hybel potential: +> `https://www.finn.no/realestate/homes/search.html?location=...` + +For deep usage — every command, every MCP tool, every env var — see [`USAGE.md`](USAGE.md). + +--- + +## Architecture in one screen + +``` + CLI (typer) MCP server (FastMCP) ← thin, parallel front ends + \ / + \ / + service.py ← cache + fetch orchestration + ↓ + analysis.py ← shortlist + summary + ↓ + search / ad / eiendom_no / scoring / feedback + ↓ + parser / http / cache (SQLite) + ↓ + FINN HTML + Eiendom.no JSON +``` + +`formatting.py` lives next to `service.py` and is shared by both CLI and MCP for JSON / markdown / table rendering. + +**Key rule:** CLI and MCP are siblings. They never call each other. Both call the same `service.py` functions. See [`PRD.md`](PRD.md) §17 for the full code-ownership constitution. + +--- + +## Project documents + +Read in this order depending on what you're doing: + +| If you want to… | Read | +| ------------------------------------- | --------------------------------------------------- | +| Use the tool | This README, then [`USAGE.md`](USAGE.md) | +| Understand the design | [`PRD.md`](PRD.md), especially §1, §12, §17 | +| Contribute / extend / hack on it | [`AGENTS.md`](AGENTS.md), then [`PROJECT.md`](PROJECT.md), then `.github/instructions/*.md` | +| Run the cleanup pass on the repo | [`CLEANUP.md`](CLEANUP.md) | +| Build out unfinished features | [`IMPLEMENTATION.md`](IMPLEMENTATION.md) | + +--- + +## Status + +* **Phase 0 (spike):** done. +* **Phase 1 (core MVP):** mostly done. +* **Phase 2 (MCP + CLI):** in progress — driven by [`IMPLEMENTATION.md`](IMPLEMENTATION.md). +* **Phase 3+ (scoring v2, agent workflows, dashboard):** future. + +--- + +## Safety and compliance + +* Private, low-frequency, user-triggered use only. No public deployment. 
+* Configurable request delays (`FINN_REQUEST_DELAY_SECONDS`, `EIENDOM_NO_REQUEST_DELAY_SECONDS`) — defaults are conservative. +* Aggressive caching to minimize external requests. +* No bypassing of rate limits, bot protection, authentication, or access controls. +* No public redistribution of FINN or Eiendom.no data. +* Scores, estimates, and comparable sales are **decision support, not advice**. Don't substitute this for a real broker, lawyer, or technical inspector. + +--- + +## License / use + +Personal project. Not for redistribution. Don't expose the MCP HTTP transport on a public interface — keep it on LAN, Tailscale, or behind auth. diff --git a/USAGE.md b/USAGE.md new file mode 100644 index 0000000..9bbb039 --- /dev/null +++ b/USAGE.md @@ -0,0 +1,503 @@ +# USAGE.md — finn-eiendom user guide + +How to use the tool day-to-day. Covers installation, every CLI command, every MCP tool, Claude Desktop integration, common workflows, environment variables, and troubleshooting. + +For the why and the architecture, see [`README.md`](README.md) and [`PRD.md`](PRD.md). + +--- + +## 1. Installation + +### Requirements + +* Python **3.12 or newer** (check with `python3 --version`) +* `uv` (recommended) or `pip` +* macOS, Linux, or WSL2 on Windows + +### Install + +```bash +git clone <repo-url> finn-mcp +cd finn-mcp + +# Option A: uv (preferred — fast) +uv venv +source .venv/bin/activate +uv pip install -e ".[dev]" + +# Option B: pip +python3.12 -m venv .venv +source .venv/bin/activate +pip install -e ".[dev]" +``` + +Verify: + +```bash +finn-eiendom --help +finn-eiendom-mcp --help # may exit immediately on stdio mode; that's fine +finn-eiendom doctor # smoke-checks cache, FINN, Eiendom.no reachability +``` + +### Updating + +```bash +git pull +source .venv/bin/activate +uv pip install -e ".[dev]" +``` + +If `pyproject.toml` added dependencies, the final `uv pip install` command picks them up. 
+ +### Global install (optional) + +If you want `finn-eiendom` available system-wide without activating the venv: + +```bash +uv tool install . +# or +pipx install . +``` + +--- + +## 2. First-time setup + +### Set up the data directory + +```bash +mkdir -p data +``` + +SQLite cache lives there at `data/finn.sqlite` by default. Override with `FINN_CACHE_PATH` if you want it elsewhere. + +### Optional: environment file + +Create `.env` in the project root for your usual settings: + +```bash +FINN_CACHE_PATH=data/finn.sqlite +FINN_MAX_SEARCH_PAGES=3 +FINN_DETAIL_LIMIT=20 +EIENDOM_NO_ENABLED=true +EIENDOM_NO_SIMILAR_UNITS_ENABLED=true +LOG_LEVEL=INFO +``` + +See §7 for the full list of variables. + +### Verify + +```bash +finn-eiendom doctor +``` + +This pings the cache, reaches FINN once, reaches Eiendom.no once, and reports any failures. + +--- + +## 3. CLI reference + +Every command runs inside the activated venv. + +### 3.1 Analyze a FINN search + +```bash +finn-eiendom analyze-search '<finn-search-url>' [options] +``` + +| Option | Default | Purpose | +| ------------------- | ------- | ---------------------------------------------------------- | +| `--max-pages N` | `3` | Pages of search results to fetch. | +| `--detail-limit N` | `20` | How many listings to detail-fetch from the result set. | +| `--no-details` | off | Skip detail fetches; use only search-card data. | +| `--no-eiendom` | off | Skip Eiendom.no enrichment. | +| `--with-similar` | off | Fetch similar-units / comps for shortlisted listings. | +| `--format FMT` | `json` | `json`, `markdown`, or `table`. 
| + +Examples: + +```bash +# Triage in the terminal +finn-eiendom analyze-search 'https://www.finn.no/realestate/homes/search.html?location=0.20061&min_bedrooms=2&price_collective_to=12000000' --format table + +# Full JSON for piping into jq +finn-eiendom analyze-search '<finn-search-url>' --format json | jq '.shortlist[].title' + +# Detailed run with comps +finn-eiendom analyze-search '<finn-search-url>' --detail-limit 30 --with-similar --format markdown +``` + +### 3.2 Drill into one listing + +```bash +finn-eiendom get-ad <finnkode> [options] +``` + +| Option | Default | Purpose | +| ------------------- | ------- | -------------------------------------------------- | +| `--force-refresh` | off | Bypass the 24h cache and refetch. | +| `--no-eiendom` | off | Skip Eiendom.no enrichment. | +| `--with-similar` | off | Fetch similar-units / comps. | +| `--format FMT` | `json` | `json` or `markdown`. | + +```bash +finn-eiendom get-ad 462400360 --format markdown +finn-eiendom get-ad 462400360 --force-refresh --with-similar +``` + +### 3.3 Compare listings + +```bash +finn-eiendom compare <finnkode> <finnkode> [...] [options] +``` + +| Option | Default | Purpose | +| ---------------- | ------- | -------------------------------------- | +| `--no-eiendom` | off | Skip Eiendom.no enrichment. | +| `--no-comps` | off | Skip similar-units / comps. | +| `--format FMT` | `json` | `json`, `markdown`, or `table`. | + +```bash +finn-eiendom compare 462400360 461153194 --format markdown +finn-eiendom compare 462400360 461153194 459333210 --format table +``` + +Up to 10 finnkoder per call. + +### 3.4 Feedback + +```bash +finn-eiendom save-feedback <finnkode> <verdict> [--notes "..."] +``` + +Verdict vocabulary: `liked`, `rejected`, `interesting`, `bargain_candidate`, `risk_object`, `viewing_candidate`, `viewed`, `too_expensive`, `too_small`, `too_far_out`, `too_high_risk`, `likes_location`, `likes_layout`, `dislikes_area`. 
+ +```bash +finn-eiendom save-feedback 462400360 liked --notes "balcony, view, check wet rooms" +finn-eiendom save-feedback 461153194 rejected --notes "too far from city center" +``` + +`liked` verdicts feed the `similar-to-liked` command. + +### 3.5 New / removed / changed listings + +```bash +finn-eiendom diff '<finn-search-url>' [--format FMT] +``` + +Compares the current search results against the previous run for the same normalized URL and reports new finnkoder, removed finnkoder, and changed listings (price, common costs, status). + +```bash +finn-eiendom diff '<finn-search-url>' --format table +``` + +Useful as a daily cron: + +```bash +0 9 * * * cd /path/to/finn-mcp && .venv/bin/finn-eiendom diff 'https://www.finn.no/...' --format markdown >> diff.log +``` + +### 3.6 Shortlist history + +```bash +finn-eiendom shortlist [--run-id ID] [--limit N] [--format FMT] +``` + +Without `--run-id`, returns the latest saved shortlist. + +### 3.7 Eiendom.no commands + +```bash +finn-eiendom resolve-unit '<finn-ad-url>' # find unitCode for a FINN listing +finn-eiendom get-unit <unitCode> [--force-refresh] # fetch unit detail +finn-eiendom enrich-ad <finnkode> [--with-similar] # FINN + Eiendom.no combined +finn-eiendom build-vector <unitCode> # build the base64url unit_vector +finn-eiendom decode-vector <unit_vector> # decode for inspection +finn-eiendom similar-units <unit_vector> [--status RECENTLY_SOLD|FOR_SALE|CURRENT] +``` + +### 3.8 Find similar to liked + +```bash +finn-eiendom similar-to-liked <finnkode> [--mode recommendations|comps] [--status STATUS] +``` + +The listing must have a `liked` feedback row. Defaults to `mode=recommendations`, `status=FOR_SALE` — i.e. find active listings similar to this one. Use `--mode comps --status RECENTLY_SOLD` to get comparable sales instead. 
+ +### 3.9 Price analysis against comps + +```bash +finn-eiendom analyze-against-comps <finnkode> +``` + +Returns `price_position` (`below_estimate` / `within_range` / `above_estimate`), `sqm_price_position` (`cheap` / `normal` / `expensive`), `comparable_score`, and a `confidence` label. 
+ +### 3.10 Cache management + +```bash +finn-eiendom cache stats # row counts and TTL summary +finn-eiendom cache clear # purge everything except feedback +finn-eiendom cache clear-html # only purge raw HTML +finn-eiendom cache clear-json # only purge raw JSON +``` + +Feedback is never purged by `cache clear` — feedback is permanent until explicitly deleted via SQL. + +### 3.11 MCP server + +```bash +finn-eiendom serve # stdio (default) +finn-eiendom serve --transport http --port 8010 # HTTP for n8n / multi-client +``` + +In HTTP mode the server listens on `http://127.0.0.1:8010/mcp` with operational endpoints `GET /health`, `GET /version`, `GET /debug/config`. + +There's also a shorthand `finn-eiendom-mcp` that starts stdio mode directly — that's what Claude Desktop calls. + +### 3.12 Misc + +```bash +finn-eiendom config show # print resolved configuration +finn-eiendom config path # print SQLite cache path +finn-eiendom doctor # smoke checks +finn-eiendom version +``` + +--- + +## 4. MCP tools (for Claude Desktop / n8n / agents) + +All tools use the `finn_` prefix. They mirror the CLI commands 1:1 — same defaults, same semantics. + +| Tool | Purpose | +| ------------------------------------- | ---------------------------------------------------------------- | +| `finn_analyze_search` | Analyze a FINN search URL and return a ranked shortlist. | +| `finn_get_ad` | Fetch structured data for one finnkode. | +| `finn_compare_ads` | Compare multiple listings side by side. | +| `finn_save_feedback` | Store feedback/verdict/notes. | +| `finn_get_shortlist` | Fetch a stored shortlist from a previous run. | +| `finn_get_new_ads_since_last_run` | Detect new / removed / changed listings. | +| `finn_resolve_eiendom_unit` | Map FINN URL → Eiendom.no `unitCode`. | +| `finn_get_eiendom_unit` | Fetch Eiendom.no unit detail by `unitCode`. | +| `finn_enrich_ad` | Combine FINN listing + Eiendom.no enrichment. | +| `finn_build_unit_vector` | Build a `unit_vector` from a `unitCode`. 
| +| `finn_decode_unit_vector` | Decode a `unit_vector` for inspection. | +| `finn_get_similar_units` | Fetch comps / recommendations. | +| `finn_find_similar_to_liked_ad` | Find properties similar to one you liked. | +| `finn_analyze_ad_against_comps` | Evaluate a listing against `RECENTLY_SOLD` comps. | + +Every tool accepts a `response_format` parameter (`"json"` or `"markdown"`). Errors come back as `{"error": true, "code": "", "message": "..."}`. + +--- + +## 5. Claude Desktop setup + +### Config file + +* macOS: `~/Library/Application Support/Claude/claude_desktop_config.json` +* Linux: `~/.config/Claude/claude_desktop_config.json` + +### Direct entry-point (recommended) + +```json +{ + "mcpServers": { + "finn-eiendom": { + "command": "/absolute/path/to/finn-mcp/.venv/bin/finn-eiendom-mcp", + "env": { + "FINN_CACHE_PATH": "/absolute/path/to/finn-mcp/data/finn.sqlite", + "EIENDOM_NO_ENABLED": "true", + "EIENDOM_NO_SIMILAR_UNITS_ENABLED": "true", + "LOG_LEVEL": "INFO" + } + } + } +} +``` + +The `command` **must** be the absolute path to the venv's `finn-eiendom-mcp` binary. Don't rely on `$PATH` here — Claude Desktop doesn't inherit your shell environment. + +### Alternative: via `uv` + +```json +{ + "mcpServers": { + "finn-eiendom": { + "command": "uv", + "args": ["run", "finn-eiendom-mcp"], + "cwd": "/absolute/path/to/finn-mcp" + } + } +} +``` + +### Verify + +1. Restart Claude Desktop. +2. Look for `finn-eiendom` in the MCP servers indicator (usually a hammer icon). +3. Ask in any chat: *"Use the finn-eiendom server to analyze this search: ..."* + +If it doesn't show up, check the Claude Desktop logs: + +* macOS: `~/Library/Logs/Claude/mcp-server-finn-eiendom.log` +* Linux: `~/.local/share/Claude/logs/mcp-server-finn-eiendom.log` + +stdout output from the server is a fatal error — the server must only log to stderr. + +--- + +## 6. Common workflows + +### 6.1 Daily triage + +```bash +# Morning routine +finn-eiendom diff 'https://www.finn.no/...' 
--format table +# Detail-fetch only what's new or changed +finn-eiendom analyze-search 'https://www.finn.no/...' --detail-limit 10 --format markdown +``` + +### 6.2 Weekly deep dive in Claude Desktop + +> Read my latest finn-eiendom shortlist and group the top 10 by category (bargain / safe / hybel / lifestyle). For each, summarize the three most important risks and the three most important broker questions. + +### 6.3 Pre-viewing prep + +```bash +# Mark candidates for viewing +finn-eiendom save-feedback 462400360 viewing_candidate --notes "Saturday 14:00" +# Get the full data + comps +finn-eiendom get-ad 462400360 --with-similar --format markdown > viewing_prep_462400360.md +``` + +Then in Claude Desktop: + +> Read the saved markdown for finnkode 462400360 and prepare a viewing checklist: wet rooms to inspect, common-costs questions, hybel-approval question, neighbor questions. + +### 6.4 Comparing finalists + +```bash +finn-eiendom compare 462400360 461153194 459333210 --format markdown > finalists.md +``` + +### 6.5 Build a recommendation set from liked properties + +```bash +# After you've liked a few +finn-eiendom save-feedback 462400360 liked +finn-eiendom save-feedback 461153194 liked + +# Get recommendations similar to each +finn-eiendom similar-to-liked 462400360 --mode recommendations --status FOR_SALE +finn-eiendom similar-to-liked 461153194 --mode recommendations --status FOR_SALE +``` + +--- + +## 7. 
Environment variables + +| Variable | Default | Purpose | +| ----------------------------------------- | -------------------------------: | -------------------------------- | +| `FINN_CACHE_PATH` | `data/finn.sqlite` | SQLite DB path | +| `FINN_MAX_SEARCH_PAGES` | `3` | Max search pages per analyze | +| `FINN_DETAIL_LIMIT` | `20` | Max detail fetches per analyze | +| `FINN_REQUEST_DELAY_SECONDS` | `2` | Seconds between FINN requests | +| `FINN_USER_AGENT` | `personal-finn-eiendom-analyzer/0.1` | HTTP User-Agent | +| `FINN_CACHE_TTL_SEARCH_MINUTES` | `60` | Search cache TTL | +| `FINN_CACHE_TTL_AD_HOURS` | `24` | Listing cache TTL | +| `EIENDOM_NO_ENABLED` | `true` | Enable Eiendom.no enrichment | +| `EIENDOM_NO_BASE_URL` | `https://api.eiendom.no/api/v1` | API base URL | +| `EIENDOM_NO_CACHE_TTL_HOURS` | `24` | Unit/similar cache TTL | +| `EIENDOM_NO_REQUEST_DELAY_SECONDS` | `1` | Seconds between Eiendom.no calls | +| `EIENDOM_NO_SIMILAR_UNITS_ENABLED` | `true` | Enable similar-units | +| `EIENDOM_NO_SIMILAR_UNITS_DEFAULT_STATUS` | `RECENTLY_SOLD` | Default comps status | +| `HJEMLA_ENABLED` | `false` | Enable optional Hjemla API | +| `LOG_LEVEL` | `INFO` | Log level | +| `MCP_TRANSPORT` | `stdio` | `stdio` or `streamable_http` | +| `MCP_HTTP_HOST` | `127.0.0.1` | HTTP bind address | +| `MCP_HTTP_PORT` | `8010` | HTTP port | + +Set them in `.env`, in your shell, or in the Claude Desktop `env` block per §5. + +--- + +## 8. Troubleshooting + +### Claude Desktop doesn't see the server + +1. The `command` path must be absolute and point at the venv's binary. +2. Check `~/Library/Logs/Claude/mcp-server-finn-eiendom.log` (macOS) for a Python traceback. +3. The server **must not** write to stdout — any `print()` in the code breaks JSON-RPC. If you're hacking on it and see a frame parse error, that's the cause. +4. Restart Claude Desktop after config changes (`Cmd+Q`, not just close the window). 
+ +### "Module not found" when running CLI + +The venv isn't activated, or the package isn't installed in editable mode. + +```bash +source .venv/bin/activate +uv pip install -e ".[dev]" +``` + +### Eiendom.no enrichment is `unavailable` + +This is graceful degradation when: + +* The FINN URL can't be matched to a `unitCode` (rare, but happens for unusual addresses). +* Eiendom.no rate-limited the request or returned 5xx. +* The unit was deleted from Eiendom.no's index. + +Check the log for the warning. The listing analysis continues without enrichment. + +### Similar-units returns nothing + +* Verify `EIENDOM_NO_SIMILAR_UNITS_ENABLED=true`. +* The `unit_vector` might be empty / malformed — check `finn-eiendom decode-vector <unit_vector>`. +* Try `--status FOR_SALE` if `RECENTLY_SOLD` is sparse, or vice versa. + +### Slow first run + +The first analyze fills the cache. Subsequent runs are much faster. Tune `FINN_REQUEST_DELAY_SECONDS` and `EIENDOM_NO_REQUEST_DELAY_SECONDS` if you're impatient — but don't drop them too low: the delays, like the cache, exist to keep the tool polite toward FINN and Eiendom.no. + +### Stale results + +Cache TTLs: + +* Search: 60 minutes +* FINN listing: 24 hours +* Eiendom.no unit: 24 hours +* Similar-units: 24 hours + +Force a refresh with `--force-refresh` on `get-ad` or `get-unit`, or wipe with `finn-eiendom cache clear`. + +### `pytest` fails after pulling new changes + +```bash +source .venv/bin/activate +uv pip install -e ".[dev]" # re-sync dependencies +pytest -x # find the first failure +``` + +If a test fails with a network-related error, that's a bug — tests should never hit the network. Report it. + +--- + +## 9. What this tool is not + +* Not a public API. Don't expose the HTTP transport on the open internet. +* Not financial, legal, or valuation advice. Scores and estimates are decision support. +* Not a bidding agent. It will never contact a broker or place a bid for you. +* Not a crawler. 
Use it for the searches you'd be manually browsing anyway — at your own pace. 
+* Not a substitute for a real condition report (`tilstandsrapport`), a real lawyer, or a real broker. + +--- + +## 10. Getting help + +* [`README.md`](README.md) — overview +* [`PRD.md`](PRD.md) — full product spec and architecture +* [`AGENTS.md`](AGENTS.md) — workflow rules for contributors +* [`.github/instructions/*.md`](.github/instructions/) — per-topic conventions + +For bugs, open an issue in the repo with: the exact command run, the full traceback or unexpected output, the version (`finn-eiendom version`), and a redacted FINN URL if relevant. diff --git a/finn_eiendom/__init__.py b/finn_eiendom/__init__.py new file mode 100644 index 0000000..abbba63 --- /dev/null +++ b/finn_eiendom/__init__.py @@ -0,0 +1,36 @@ +"""FINN Real Estate MCP Server - Private property analysis platform.""" + +__version__ = "0.1.0" +__author__ = "FINN Scout" + +from . import ad, analysis, cache, config, eiendom_no, scoring, search +from .http import HTTPClient +from .models import EiendomUnit, FinnAd, FinnSearchCard, SimilarUnit, UnitVector +from .parser import ( + extract_finnkode_from_url, + normalize_area, + normalize_finnkode, + normalize_number, + normalize_price, +) + +__all__ = [ + "config", + "FinnAd", + "FinnSearchCard", + "EiendomUnit", + "SimilarUnit", + "UnitVector", + "normalize_price", + "normalize_area", + "normalize_number", + "normalize_finnkode", + "extract_finnkode_from_url", + "HTTPClient", + "ad", + "analysis", + "cache", + "eiendom_no", + "scoring", + "search", +] diff --git a/finn_eiendom/ad.py b/finn_eiendom/ad.py new file mode 100644 index 0000000..e4db961 --- /dev/null +++ b/finn_eiendom/ad.py @@ -0,0 +1,193 @@ +"""FINN listing detail scraping and normalization.""" + +import logging +import re +from datetime import UTC, datetime + +from bs4 import BeautifulSoup + +from .http import HTTPClient +from .models import FinnAd +from .parser import ( + clean_text, + extract_finnkode_from_url, + normalize_area, + normalize_finnkode, + normalize_number, + 
normalize_price, + text_to_bool, +) + +logger = logging.getLogger(__name__) + +FINN_AD_URL_TEMPLATE = "https://www.finn.no/realestate/homes/ad.html?finnkode={}" + + +async def fetch_ad(finnkode: str, client: HTTPClient | None = None) -> str: + """Fetch FINN listing HTML by finnkode.""" + client = client or HTTPClient(request_delay_seconds=0.0) + url = FINN_AD_URL_TEMPLATE.format(finnkode) + response = await client.get(url) + return response.text + + +def _load_property_map(soup: BeautifulSoup) -> dict[str, str]: + properties: dict[str, str] = {} + for dt, dd in zip(soup.find_all("dt"), soup.find_all("dd"), strict=False): + key = clean_text(dt.get_text()) or "" + value = clean_text(dd.get_text()) or "" + properties[key.lower()] = value + return properties + + +def _get_data_testid_value(soup: BeautifulSoup, testid: str) -> str | None: + node = soup.select_one(f'[data-testid="{testid}"]') + if not node: + return None + return clean_text(node.get_text(" ", strip=True)) + + +def _strip_labelled_text(text: str | None, labels: list[str]) -> str | None: + if not text: + return None + for label in labels: + if text.lower().startswith(label.lower()): + return clean_text(text[len(label) :]) + return text + + +def _extract_floor_from_text(text: str | None) -> str | None: + if not text: + return None + match = re.search(r"(\d+)\s*\.?\s*etasje", text, re.IGNORECASE) + if match: + return f"{match.group(1)}. 
etasje" + return None + + +def _clean_description(text: str | None) -> str | None: + if not text: + return None + cleaned = re.sub(r"(?i)^om boligen", "", text).strip() + cleaned = re.sub(r"(?i)^beskrivelse", "", cleaned).strip() + return clean_text(cleaned) + + +def _load_feature_text(soup: BeautifulSoup) -> str: + return _get_data_testid_value(soup, "object-facilities") or "" + + +def _extract_description(soup: BeautifulSoup) -> str | None: + node = soup.select_one('[data-testid="om boligen"]') or soup.select_one(".description") + if not node: + return None + paragraphs = [clean_text(p.get_text()) for p in node.select("p") if clean_text(p.get_text())] + if paragraphs: + return "\n".join(paragraphs) + return _clean_description(node.get_text(" ", strip=True)) + + +def scrape_ad(html: str, url: str | None = None) -> FinnAd: + """Scrape a FINN listing HTML page into a FinnAd model.""" + soup = BeautifulSoup(html, "html.parser") + title_node = soup.select_one("h1") + broker_name = soup.select_one(".broker-name") + + properties = _load_property_map(soup) + feature_text = _load_feature_text(soup).lower() + finnkode = normalize_finnkode(extract_finnkode_from_url(url or "")) or "" + address = _get_data_testid_value(soup, "object-address") or properties.get("adresse") + district = _get_data_testid_value(soup, "local-area-name") or properties.get("område") + ownership_type = _strip_labelled_text( + _get_data_testid_value(soup, "info-ownership-type"), ["Eieform", "Eiendomstype"] + ) or properties.get("eierform") + property_type = _strip_labelled_text( + _get_data_testid_value(soup, "info-property-type"), ["Boligtype", "Eiendomstype"] + ) or properties.get("eiendomstype") + + asking_price = normalize_price( + properties.get("prisantydning") or _get_data_testid_value(soup, "pricing-incicative-price") + ) + total_price_value = normalize_price( + properties.get("totalpris") or _get_data_testid_value(soup, "pricing-total-price") + ) + shared_debt = normalize_price( + 
properties.get("fellesgjeld") or _get_data_testid_value(soup, "pricing-joint-debt") + ) + common_costs = normalize_number( + properties.get("felles utgifter") + or _get_data_testid_value(soup, "pricing-common-monthly-cost") + ) + area_m2 = normalize_area( + properties.get("boligareal") + or _get_data_testid_value(soup, "info-usable-i-area") + or _get_data_testid_value(soup, "info-usable-area") + ) + rooms = normalize_number(properties.get("rom") or _get_data_testid_value(soup, "info-rooms")) + bedrooms = normalize_number( + properties.get("soverom") or _get_data_testid_value(soup, "info-bedrooms") + ) + floor = ( + properties.get("etasje") + or _extract_floor_from_text(title_node.get_text() if title_node else "") + or _get_data_testid_value(soup, "info-floor") + ) + construction_year = normalize_number( + properties.get("byggeår") or _get_data_testid_value(soup, "info-construction-year") + ) + energy_rating = properties.get("energimerking") + heating = properties.get("oppvarming") + has_balcony = text_to_bool(properties.get("balkonger/terrasser")) or "balkong" in feature_text + has_terrace = "terrasse" in feature_text + has_elevator = text_to_bool(properties.get("heis")) or "heis" in feature_text + has_parking = ( + bool(properties.get("parkering/garasje")) + or "parkering" in feature_text + or "garasje" in feature_text + ) + broker_company = None + if broker_name: + broker_company = clean_text(broker_name.get_text()) + + listing_description = _extract_description(soup) + + ad = FinnAd( + finnkode=finnkode, + url=url or "", + title=clean_text(title_node.get_text()) if title_node else None, + address=address, + postal_area=properties.get("postnummer"), + district=district, + property_type=property_type, + ownership_type=ownership_type, + asking_price=asking_price, + total_price=total_price_value, + shared_debt=shared_debt, + common_costs=common_costs, + municipal_fee=normalize_number(properties.get("kommunale avgifter")), + 
other_fees=normalize_number(properties.get("andre utgifter")), + area_m2=area_m2, + rooms=rooms, + bedrooms=bedrooms, + floor=floor, + construction_year=construction_year, + energy_rating=energy_rating, + heating=heating, + has_balcony=has_balcony, + has_terrace=has_terrace, + has_elevator=has_elevator, + has_parking=has_parking, + listing_description=listing_description, + broker_name=None, + broker_company=broker_company, + detail_fetched_at=None, + ) + return ad + + +async def fetch_ad_details(finnkode: str, client: HTTPClient | None = None) -> FinnAd: + """Fetch FINN listing HTML and return a parsed FinnAd object.""" + html = await fetch_ad(finnkode, client=client) + ad = scrape_ad(html, url=FINN_AD_URL_TEMPLATE.format(finnkode)) + ad.detail_fetched_at = datetime.now(UTC) + return ad diff --git a/finn_eiendom/analysis.py b/finn_eiendom/analysis.py new file mode 100644 index 0000000..01620f6 --- /dev/null +++ b/finn_eiendom/analysis.py @@ -0,0 +1,175 @@ +"""Orchestration for FINN search + Eiendom.no enrichment + scoring.""" + +import logging + +from . import ad as ad_module +from . 
import cache, eiendom_no, scoring, search +from .config import ( + FINN_CACHE_PATH, + FINN_CACHE_TTL_AD_HOURS, + FINN_DETAIL_LIMIT, + FINN_MAX_SEARCH_PAGES, +) +from .models import EiendomUnit, FinnAd, SimilarUnit + +logger = logging.getLogger(__name__) + + +def _normalize_description(text: str | None) -> str: + return text.lower() if text else "" + + +def _build_ad_summary( + ad: FinnAd, + enriched: EiendomUnit | None, + similar_units: list[SimilarUnit], + scores: dict, + categories: list[str], +) -> dict: + description = _normalize_description(ad.listing_description) + reasons = [] + risks = [] + next_steps = [ + "Open the FINN listing and condition report.", + "Review the Eiendom.no estimate and comparable sales.", + "Ask the broker about renovation status and approvals.", + ] + + if enriched and enriched.estimated_selling_price and ad.total_price: + if ad.total_price < enriched.estimated_selling_price: + reasons.append("Listing price is below Eiendom.no estimate.") + elif ad.total_price <= enriched.estimated_selling_price_upper: + reasons.append("Price sits within the local estimate range.") + else: + reasons.append("Listing price is above the estimate range.") + else: + reasons.append("Eiendom.no enrichment is unavailable or incomplete.") + + if "utsikt" in description or ad.has_balcony or ad.has_terrace: + reasons.append("Outdoor space or view potential is positive.") + if "hybel" in description or "leie" in description: + reasons.append("Potential hybel/rental opportunity is mentioned.") + if "potensial" in description or "renover" in description: + reasons.append("Renovation or improvement potential is highlighted.") + + if scores.get("risk", 0.0) < 0: + risks.append("Risk flags are detected in description or metadata.") + if ad.common_costs and ad.common_costs > 5000: + risks.append("Common costs are relatively high and should be reviewed.") + if enriched and enriched.sale_status and enriched.sale_status.upper() != "FOR_SALE": + risks.append("Eiendom.no 
sale status does not indicate an active sale.") + if not enriched: + risks.append("Missing Eiendom.no data increases uncertainty.") + + if not any("Eiendom.no" in step for step in next_steps): + next_steps.append("Verify the property on Eiendom.no and reconcile any mismatches.") + + if similar_units: + next_steps.append("Review the comparable units and average sqm prices.") + else: + next_steps.append("Comparable sales are unavailable; treat valuation with caution.") + + return { + "why_interesting": reasons, + "risks": risks, + "next_steps": next_steps, + "shortlist_reason": ", ".join(reasons[:3]) + if reasons + else "Review details and seller disclosures.", + } + + +async def analyze_ad( + finn_ad: FinnAd, + unit_code: str | None = None, +) -> dict: + """Enrich a FinnAd and compute score summary.""" + conn = cache.init_db(FINN_CACHE_PATH) + enriched: EiendomUnit | None = None + similar_units: list[SimilarUnit] = [] + + if unit_code: + enriched = cache.get_eiendom_unit(conn, unit_code) + if enriched is None: + enriched = await eiendom_no.enrich_ad_with_eiendom_no(finn_ad, unit_code) + if enriched is not None: + cache.save_eiendom_unit(conn, enriched) + + if enriched and enriched.unit_vector: + similar_units = cache.get_similar_units(conn, enriched.unit_code, "RECENTLY_SOLD") + if not similar_units: + similar_units = await eiendom_no.get_similar_units(enriched.unit_vector) + if similar_units: + cache.save_similar_units(conn, enriched.unit_code, "RECENTLY_SOLD", similar_units) + + scores = scoring.score_ad(finn_ad, enriched, similar_units) + categories = scoring.classify_ad(scores) + summary = _build_ad_summary(finn_ad, enriched, similar_units, scores, categories) + + result = { + "finnkode": finn_ad.finnkode, + "title": finn_ad.title, + "address": finn_ad.address, + "score": scores, + "categories": categories, + "summary": summary, + "eiendom_unit": enriched.model_dump() if enriched else None, + "similar_units": [unit.model_dump() for unit in similar_units], + } + 
cache.save_finn_ad(conn, finn_ad) + return result + + +async def analyze_search( + search_url: str, + max_pages: int = FINN_MAX_SEARCH_PAGES, + fetch_details: bool = True, + detail_limit: int = FINN_DETAIL_LIMIT, + include_eiendom_no: bool = True, + client=None, + use_cache: bool = True, +) -> dict: + """Analyze a FINN search URL and enrich matching listings.""" + conn = cache.init_db(FINN_CACHE_PATH) + cards = await search.fetch_search_pages( + search_url, + max_pages=max_pages, + client=client, + use_cache=use_cache, + ) + results = [] + enriched_count = 0 + + if fetch_details: + for card in cards[:detail_limit]: + finn_ad = cache.get_finn_ad(conn, card.finnkode, ttl_hours=FINN_CACHE_TTL_AD_HOURS) + if finn_ad is None: + finn_ad = await ad_module.fetch_ad_details(card.finnkode, client=client) + unit_code = None + if include_eiendom_no: + try: + matched_unit = await eiendom_no.search_unit_from_finn_url(card.url) + except Exception as exc: + logger.warning("Eiendom.no unit search failed: %s", exc) + matched_unit = None + unit_code = ( + matched_unit.unit_code + if matched_unit + else eiendom_no.resolve_unit_from_finn_url(card.url) + ) + result = await analyze_ad(finn_ad, unit_code=unit_code) + if result.get("eiendom_unit"): + enriched_count += 1 + results.append(result) + + results.sort(key=lambda item: item["score"].get("total", 0.0), reverse=True) + return { + "search_url": search_url, + "search_cards": [card.model_dump() for card in cards], + "analysis": results, + "summary": { + "total_listings": len(cards), + "analyzed_listings": len(results), + "eiendom_enriched": enriched_count, + }, + } diff --git a/finn_eiendom/cache.py b/finn_eiendom/cache.py new file mode 100644 index 0000000..8bf78ba --- /dev/null +++ b/finn_eiendom/cache.py @@ -0,0 +1,243 @@ +"""SQLite cache and persistence for FINN and Eiendom.no data.""" + +import json +import logging +import sqlite3 +from datetime import UTC, datetime, timedelta +from typing import Any + +from .config import 
FINN_CACHE_PATH +from .models import EiendomUnit, FinnAd, FinnSearchCard, SimilarUnit + +logger = logging.getLogger(__name__) + + +def get_connection(path: str | None = None) -> sqlite3.Connection: + db_path = path or FINN_CACHE_PATH + conn = sqlite3.connect(str(db_path), detect_types=sqlite3.PARSE_DECLTYPES) + conn.row_factory = sqlite3.Row + return conn + + +def init_db(path: str | None = None) -> sqlite3.Connection: + conn = get_connection(path) + cursor = conn.cursor() + cursor.execute( + """ + CREATE TABLE IF NOT EXISTS finn_ads ( + finnkode TEXT PRIMARY KEY, + url TEXT, + payload TEXT NOT NULL, + fetched_at TEXT NOT NULL + ) + """ + ) + cursor.execute( + """ + CREATE TABLE IF NOT EXISTS eiendom_units ( + unit_code TEXT PRIMARY KEY, + payload TEXT NOT NULL, + fetched_at TEXT NOT NULL + ) + """ + ) + cursor.execute( + """ + CREATE TABLE IF NOT EXISTS similar_units ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + unit_code TEXT NOT NULL, + listing_status TEXT NOT NULL, + payload TEXT NOT NULL, + fetched_at TEXT NOT NULL + ) + """ + ) + cursor.execute( + """ + CREATE TABLE IF NOT EXISTS cache_meta ( + key TEXT PRIMARY KEY, + value TEXT NOT NULL, + expires_at TEXT + ) + """ + ) + conn.commit() + return conn + + +def cache_get(conn: sqlite3.Connection, key: str) -> dict[str, Any] | None: + cursor = conn.cursor() + cursor.execute("SELECT value, expires_at FROM cache_meta WHERE key = ?", (key,)) + row = cursor.fetchone() + if not row: + return None + + expires_at = row["expires_at"] + if expires_at and datetime.fromisoformat(expires_at) < datetime.now(UTC): + cursor.execute("DELETE FROM cache_meta WHERE key = ?", (key,)) + conn.commit() + return None + + return json.loads(row["value"]) + + +def cache_set( + conn: sqlite3.Connection, + key: str, + payload: dict[str, Any], + ttl_hours: int | None = None, + ttl_minutes: int | None = None, +) -> None: + expires_at = None + if ttl_minutes is not None: + expires_at = (datetime.now(UTC) + 
timedelta(minutes=ttl_minutes)).isoformat() + elif ttl_hours is not None: + expires_at = (datetime.now(UTC) + timedelta(hours=ttl_hours)).isoformat() + cursor = conn.cursor() + cursor.execute( + "INSERT OR REPLACE INTO cache_meta (key, value, expires_at) VALUES (?, ?, ?)", + (key, json.dumps(payload), expires_at), + ) + conn.commit() + + +def _is_fresh(fetched_at: str, ttl_hours: int | None) -> bool: + if ttl_hours is None: + return True + return datetime.fromisoformat(fetched_at) >= datetime.now(UTC) - timedelta(hours=ttl_hours) + + +def save_search_page( + conn: sqlite3.Connection, + url: str, + html: str, + ttl_minutes: int = 60, +) -> None: + cache_set(conn, f"search_page:{url}", {"html": html}, ttl_minutes=ttl_minutes) + + +def get_search_page(conn: sqlite3.Connection, url: str) -> str | None: + payload = cache_get(conn, f"search_page:{url}") + if not payload: + return None + return payload.get("html") + + +def save_search_cards( + conn: sqlite3.Connection, + url: str, + cards: list[FinnSearchCard], + ttl_minutes: int = 60, +) -> None: + cache_set( + conn, + f"search_cards:{url}", + [card.model_dump(mode="json") for card in cards], + ttl_minutes=ttl_minutes, + ) + + +def get_search_cards(conn: sqlite3.Connection, url: str) -> list[FinnSearchCard]: + payload = cache_get(conn, f"search_cards:{url}") + if not payload: + return [] + return [FinnSearchCard.model_validate(item) for item in payload] + + +def save_finn_ad(conn: sqlite3.Connection, ad: FinnAd) -> None: + cursor = conn.cursor() + payload = ad.model_dump(mode="json") + cursor.execute( + "INSERT OR REPLACE INTO finn_ads (finnkode, url, payload, fetched_at) VALUES (?, ?, ?, ?)", + ( + ad.finnkode, + ad.url, + json.dumps(payload), + ad.detail_fetched_at.isoformat() + if ad.detail_fetched_at + else datetime.now(UTC).isoformat(), + ), + ) + conn.commit() + + +def get_finn_ad( + conn: sqlite3.Connection, finnkode: str, ttl_hours: int | None = None +) -> FinnAd | None: + cursor = conn.cursor() + 
cursor.execute("SELECT payload, fetched_at FROM finn_ads WHERE finnkode = ?", (finnkode,)) + row = cursor.fetchone() + if not row: + return None + if ttl_hours is not None and not _is_fresh(row["fetched_at"], ttl_hours): + return None + return FinnAd.model_validate(json.loads(row["payload"])) + + +def save_eiendom_unit(conn: sqlite3.Connection, unit: EiendomUnit) -> None: + cursor = conn.cursor() + cursor.execute( + "INSERT OR REPLACE INTO eiendom_units (unit_code, payload, fetched_at) VALUES (?, ?, ?)", + (unit.unit_code, json.dumps(unit.model_dump(mode="json")), unit.fetched_at.isoformat()), + ) + conn.commit() + + +def get_eiendom_unit( + conn: sqlite3.Connection, + unit_code: str, + ttl_hours: int | None = None, +) -> EiendomUnit | None: + cursor = conn.cursor() + cursor.execute( + "SELECT payload, fetched_at FROM eiendom_units WHERE unit_code = ?", + (unit_code,), + ) + row = cursor.fetchone() + if not row: + return None + if ttl_hours is not None and not _is_fresh(row["fetched_at"], ttl_hours): + return None + return EiendomUnit.model_validate(json.loads(row["payload"])) + + +def save_similar_units( + conn: sqlite3.Connection, + unit_code: str, + listing_status: str, + similar_units: list[SimilarUnit], +) -> None: + cursor = conn.cursor() + payload = json.dumps([item.model_dump(mode="json") for item in similar_units]) + cursor.execute( + ( + "INSERT INTO similar_units" + " (unit_code, listing_status, payload, fetched_at)" + " VALUES (?, ?, ?, ?)" + ), + (unit_code, listing_status, payload, datetime.now(UTC).isoformat()), + ) + conn.commit() + + +def get_similar_units( + conn: sqlite3.Connection, + unit_code: str, + listing_status: str, + ttl_hours: int | None = None, +) -> list[SimilarUnit]: + cursor = conn.cursor() + cursor.execute( + ( + "SELECT payload, fetched_at FROM similar_units" + " WHERE unit_code = ? AND listing_status = ?" 
+ " ORDER BY id DESC LIMIT 1" + ), + (unit_code, listing_status), + ) + row = cursor.fetchone() + if not row: + return [] + if ttl_hours is not None and not _is_fresh(row["fetched_at"], ttl_hours): + return [] + return [SimilarUnit.model_validate(item) for item in json.loads(row["payload"])] diff --git a/finn_eiendom/config.py b/finn_eiendom/config.py new file mode 100644 index 0000000..c56b9e7 --- /dev/null +++ b/finn_eiendom/config.py @@ -0,0 +1,30 @@ +"""Configuration and environment variables.""" + +import os +from pathlib import Path + +# Cache and database +FINN_CACHE_PATH = os.getenv("FINN_CACHE_PATH", str(Path("data/finn.sqlite"))) + +# FINN API settings +FINN_MAX_SEARCH_PAGES = int(os.getenv("FINN_MAX_SEARCH_PAGES", "3")) +FINN_DETAIL_LIMIT = int(os.getenv("FINN_DETAIL_LIMIT", "20")) +FINN_REQUEST_DELAY_SECONDS = float(os.getenv("FINN_REQUEST_DELAY_SECONDS", "2")) +FINN_USER_AGENT = os.getenv("FINN_USER_AGENT", "personal-finn-eiendom-analyzer/0.1") +FINN_CACHE_TTL_SEARCH_MINUTES = int(os.getenv("FINN_CACHE_TTL_SEARCH_MINUTES", "60")) +FINN_CACHE_TTL_AD_HOURS = int(os.getenv("FINN_CACHE_TTL_AD_HOURS", "24")) + +# Eiendom.no API settings +EIENDOM_NO_ENABLED = os.getenv("EIENDOM_NO_ENABLED", "true").lower() == "true" +EIENDOM_NO_BASE_URL = os.getenv("EIENDOM_NO_BASE_URL", "https://api.eiendom.no/api/v1") +EIENDOM_NO_REQUEST_DELAY_SECONDS = float(os.getenv("EIENDOM_NO_REQUEST_DELAY_SECONDS", "1")) +EIENDOM_NO_CACHE_TTL_HOURS = int(os.getenv("EIENDOM_NO_CACHE_TTL_HOURS", "24")) +EIENDOM_NO_SIMILAR_UNITS_ENABLED = ( + os.getenv("EIENDOM_NO_SIMILAR_UNITS_ENABLED", "true").lower() == "true" +) +EIENDOM_NO_SIMILAR_UNITS_DEFAULT_STATUS = os.getenv( + "EIENDOM_NO_SIMILAR_UNITS_DEFAULT_STATUS", "RECENTLY_SOLD" +) + +# Logging +LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO") diff --git a/finn_eiendom/eiendom_no.py b/finn_eiendom/eiendom_no.py new file mode 100644 index 0000000..64bb5c0 --- /dev/null +++ b/finn_eiendom/eiendom_no.py @@ -0,0 +1,236 @@ +"""Eiendom.no 
enrichment, unit vector, and similar units client.""" + +import base64 +import logging +from typing import Any + +import msgpack + +from .config import ( + EIENDOM_NO_BASE_URL, + EIENDOM_NO_ENABLED, + EIENDOM_NO_REQUEST_DELAY_SECONDS, + EIENDOM_NO_SIMILAR_UNITS_DEFAULT_STATUS, +) +from .http import HTTPClient +from .models import EiendomUnit, SimilarUnit, UnitVector +from .parser import extract_finnkode_from_url, normalize_finnkode + +logger = logging.getLogger(__name__) + + +def _extract_coordinates(geometry: dict) -> tuple[float | None, float | None]: + if not isinstance(geometry, dict): + return None, None + coords = geometry.get("coordinates") or [] + if isinstance(coords, (list, tuple)) and len(coords) >= 2: + return coords[0], coords[1] + return None, None + + +def parse_eiendom_unit_json(unit_data: dict) -> EiendomUnit: + geometry = unit_data.get("geometry", {}) + lon, lat = _extract_coordinates(geometry) + specification = unit_data.get("specification", {}) + valuation = unit_data.get("valuation", {}) + market = unit_data.get("latestMarketData", {}) + + return EiendomUnit( + unit_code=unit_data.get("unitCode", ""), + address=unit_data.get("address") or unit_data.get("streetAddress"), + lat=lat or unit_data.get("lat"), + lng=lon or unit_data.get("lon"), + property_type=specification.get("propertyType") or unit_data.get("propertyType"), + floor=specification.get("floor") or unit_data.get("floor"), + rooms=specification.get("rooms") or unit_data.get("rooms"), + construction_year=specification.get("constructionYear") + or unit_data.get("constructionYear"), + usable_area=specification.get("usableArea") or unit_data.get("usableArea"), + estimated_selling_price=valuation.get("estimatedSellingPrice") + or unit_data.get("estimatedSellingPrice"), + estimated_selling_price_lower=valuation.get("estimatedSellingPriceLower") + or unit_data.get("estimatedSellingPriceLower"), + estimated_selling_price_upper=valuation.get("estimatedSellingPriceUpper") + or 
unit_data.get("estimatedSellingPriceUpper"), + listing_price=market.get("listingPrice") or unit_data.get("listingPrice"), + listing_sqm_price=market.get("squareMeterPrice") + or unit_data.get("listingSquareMeterPrice"), + common_costs=market.get("monthlyCosts") + or market.get("commonCosts") + or unit_data.get("commonCosts"), + days_on_market=market.get("daysOnMarket") or unit_data.get("daysOnMarket"), + sale_status=market.get("saleStatus") or unit_data.get("saleStatus"), + market_placement_score=market.get("marketPlacementScore") + or unit_data.get("marketPlacementScore"), + ) + + +def parse_similar_units_json(response_data: dict) -> list[SimilarUnit]: + units: list[SimilarUnit] = [] + for item in response_data.get("units", []): + geometry = item.get("geometry", {}) + lon, lat = _extract_coordinates(geometry) + specification = item.get("specification", {}) + market = item.get("marketData", {}) + units.append( + SimilarUnit( + unit_code=item.get("unitCode", ""), + address=item.get("address"), + lat=lat or item.get("lat"), + lng=lon or item.get("lon"), + property_type=specification.get("propertyType") or item.get("propertyType"), + floor=specification.get("floor") or item.get("floor"), + rooms=specification.get("rooms") or item.get("rooms"), + construction_year=specification.get("constructionYear") + or item.get("constructionYear"), + usable_area=specification.get("usableArea") or item.get("usableArea"), + listing_price=market.get("listingPrice") or item.get("listingPrice"), + selling_price=market.get("sellingPrice") or item.get("sellingPrice"), + shared_debt=market.get("jointDebt") or item.get("sharedDebt"), + common_costs=market.get("monthlyCosts") or item.get("commonCosts"), + sqm_price=market.get("squareMeterPrice") or item.get("squareMeterPrice"), + days_on_market=market.get("daysOnMarket") or item.get("daysOnMarket"), + sale_status=market.get("saleStatus") or item.get("saleStatus"), + finalized_at=item.get("finalizedAt") or market.get("finalizedAt"), + 
listing_status=item.get("listingStatus", "RECENTLY_SOLD"), + ) + ) + return units + + +def build_unit_vector(unit: EiendomUnit) -> str: + """Build a base64url-encoded unit_vector from EiendomUnit data.""" + payload = UnitVector( + lon=unit.lng or 0.0, + lat=unit.lat or 0.0, + ptype=unit.property_type or "APARTMENT", + floor=unit.floor, + rooms=unit.rooms, + built=unit.construction_year, + area=unit.usable_area, + price=unit.listing_price or unit.estimated_selling_price, + ) + packed = msgpack.packb(payload.model_dump(), use_bin_type=True) + encoded = base64.urlsafe_b64encode(packed).decode("utf-8").rstrip("=") + return encoded + + +def decode_unit_vector(vector_str: str) -> dict: + """Decode a base64url unit_vector for debugging.""" + padding = 4 - (len(vector_str) % 4) + if padding != 4: + vector_str += "=" * padding + packed = base64.urlsafe_b64decode(vector_str.encode("utf-8")) + return msgpack.unpackb(packed, raw=False) + + +async def search_unit_from_finn_url( + finn_url: str, + client: HTTPClient | None = None, +) -> EiendomUnit | None: + if not EIENDOM_NO_ENABLED or not finn_url: + logger.info("Eiendom.no unit search is disabled or finn_url is empty") + return None + + client = client or HTTPClient( + base_url=EIENDOM_NO_BASE_URL, + request_delay_seconds=EIENDOM_NO_REQUEST_DELAY_SECONDS, + ) + response = await client.get( + "/geodata/units/search/", + params={"search": finn_url}, + ) + data = response.json() + units = data.get("units", []) + if not units: + return None + return parse_eiendom_unit_json(units[0]) + + +async def get_unit( + unit_code: str, + client: HTTPClient | None = None, +) -> EiendomUnit | None: + if not EIENDOM_NO_ENABLED: + logger.info("Eiendom.no enrichment is disabled") + return None + + client = client or HTTPClient( + base_url=EIENDOM_NO_BASE_URL, + request_delay_seconds=EIENDOM_NO_REQUEST_DELAY_SECONDS, + ) + path = f"/geodata/units/{unit_code}/" + response = await client.get(path) + data = response.json() + units = 
data.get("units") or [] + if not units and isinstance(data, dict) and data.get("unitCode"): + return parse_eiendom_unit_json(data) + if not units: + return None + return parse_eiendom_unit_json(units[0]) + + +async def get_eiendom_unit( + unit_code: str, + client: HTTPClient | None = None, +) -> EiendomUnit | None: + return await get_unit(unit_code, client=client) + + +async def get_similar_units( + unit_vector: str, + listing_status: str = EIENDOM_NO_SIMILAR_UNITS_DEFAULT_STATUS, + client: HTTPClient | None = None, +) -> list[SimilarUnit]: + if not EIENDOM_NO_ENABLED: + logger.info("Eiendom.no similar-units disabled") + return [] + + client = client or HTTPClient( + base_url=EIENDOM_NO_BASE_URL, + request_delay_seconds=EIENDOM_NO_REQUEST_DELAY_SECONDS, + ) + response = await client.get( + "/geodata/units/similar/", + params={"unit_vector": unit_vector}, + ) + data = response.json() + units = parse_similar_units_json(data) + + listing_status = (listing_status or "").upper() + if listing_status == "RECENTLY_SOLD": + units = [ + unit + for unit in units + if unit.sale_status and unit.sale_status.upper() == "SOLD" and unit.finalized_at + ] + elif listing_status == "FOR_SALE": + units = [ + unit for unit in units if unit.sale_status and unit.sale_status.upper() == "FORSALE" + ] + + return units + + +def resolve_unit_from_finn_url(finn_url: str) -> str | None: + """Resolve the FINN URL into a unit identifier or unitCode placeholder.""" + if not finn_url: + return None + candidate = normalize_finnkode(extract_finnkode_from_url(finn_url)) + if candidate: + return candidate + return None + + +async def enrich_ad_with_eiendom_no( + ad: Any, + unit_code: str | None = None, + client: HTTPClient | None = None, +) -> EiendomUnit | None: + if not unit_code: + return None + unit = await get_eiendom_unit(unit_code, client=client) + if unit is None: + return None + unit.unit_vector = build_unit_vector(unit) + return unit diff --git a/finn_eiendom/http.py b/finn_eiendom/http.py new 
file mode 100644 index 0000000..32bcbbb --- /dev/null +++ b/finn_eiendom/http.py @@ -0,0 +1,122 @@ +"""HTTP client with retries, delays, and error handling.""" + +import asyncio +import logging + +import httpx + +logger = logging.getLogger(__name__) + + +class HTTPClient: + """HTTP client with configurable retries, delays, and timeout.""" + + def __init__( + self, + base_url: str = "", + user_agent: str = "personal-finn-eiendom-analyzer/0.1", + request_delay_seconds: float = 0.0, + retries: int = 1, + timeout_seconds: float = 30.0, + ): + """ + Initialize HTTP client. + + Args: + base_url: Base URL for requests + user_agent: User-Agent header value + request_delay_seconds: Delay between requests (to be respectful) + retries: Number of retry attempts for failed connections + timeout_seconds: Request timeout + """ + self.base_url = base_url + self.user_agent = user_agent + self.request_delay_seconds = request_delay_seconds + self.timeout = httpx.Timeout(timeout_seconds) + self.transport = httpx.AsyncHTTPTransport(retries=retries) + self.last_request_time: float | None = None + + async def get(self, url: str, **kwargs) -> httpx.Response: + """ + Make async GET request with delay and error handling. 
+ + Args: + url: URL to fetch + **kwargs: Additional httpx arguments + + Returns: + httpx.Response + + Raises: + httpx.HTTPStatusError if status is 4xx or 5xx + """ + headers = kwargs.pop("headers", {}) + if "User-Agent" not in headers: + headers["User-Agent"] = self.user_agent + + for attempt in range(self._get_retries() + 1): + await self._apply_delay() + + async with httpx.AsyncClient( + timeout=self.timeout, + base_url=self.base_url if not url.startswith("http") else "", + ) as client: + try: + response = await client.get(url, headers=headers, **kwargs) + if response.status_code < 500: + response.raise_for_status() + logger.debug(f"GET {url} -> {response.status_code}") + return response + if attempt < self._get_retries(): + await asyncio.sleep(2**attempt) + continue + response.raise_for_status() + return response + except httpx.HTTPStatusError as e: + logger.error(f"HTTP {e.response.status_code} for {url}") + raise + except httpx.RequestError as e: + logger.error(f"Request failed for {url}: {e}") + raise + + def _get_retries(self) -> int: + """Get retries count from transport.""" + if hasattr(self.transport, "_retries"): + return self.transport._retries + return 1 + + async def post(self, url: str, **kwargs) -> httpx.Response: + """Make async POST request with delay and error handling.""" + headers = kwargs.pop("headers", {}) + if "User-Agent" not in headers: + headers["User-Agent"] = self.user_agent + + for attempt in range(self._get_retries() + 1): + await self._apply_delay() + + async with httpx.AsyncClient( + timeout=self.timeout, + base_url=self.base_url if not url.startswith("http") else "", + ) as client: + try: + response = await client.post(url, headers=headers, **kwargs) + if response.status_code < 500: + response.raise_for_status() + logger.debug(f"POST {url} -> {response.status_code}") + return response + if attempt < self._get_retries(): + await asyncio.sleep(2**attempt) + continue + response.raise_for_status() + return response + except 
httpx.HTTPStatusError as e: + logger.error(f"HTTP {e.response.status_code} for {url}") + raise + except httpx.RequestError as e: + logger.error(f"Request failed for {url}: {e}") + raise + + async def _apply_delay(self): + """Apply delay between requests if configured.""" + if self.request_delay_seconds > 0: + await asyncio.sleep(self.request_delay_seconds) diff --git a/finn_eiendom/mcp_server.py b/finn_eiendom/mcp_server.py new file mode 100644 index 0000000..3658f07 --- /dev/null +++ b/finn_eiendom/mcp_server.py @@ -0,0 +1,160 @@ +"""FastMCP stdio server for FINN real estate analysis and Eiendom.no enrichment.""" + +import json +import logging + +from mcp.server.fastmcp import FastMCP + +from .analysis import analyze_search +from .eiendom_no import ( + build_unit_vector, + decode_unit_vector, + get_similar_units, + get_unit, + search_unit_from_finn_url, +) +from .service import get_or_fetch_ad, get_or_fetch_eiendom_unit + +logger = logging.getLogger(__name__) + +mcp = FastMCP("finn_eiendom_mcp") + + +@mcp.tool( + description=( + "Analyze a FINN.no real estate search URL. Scrapes listing cards," + " fetches details, enriches with Eiendom.no data, scores, and ranks." + ) +) +async def finn_analyze_search( + search_url: str, + max_pages: int = 3, + detail_limit: int = 20, + include_details: bool = True, + include_eiendom_no: bool = True, +) -> str: + """Analyze a FINN search URL and return ranked listing results.""" + try: + result = await analyze_search( + search_url, + max_pages=max_pages, + fetch_details=include_details, + detail_limit=detail_limit, + include_eiendom_no=include_eiendom_no, + ) + return json.dumps(result) + except Exception as e: + logger.error(f"Error analyzing search: {e}") + return json.dumps({"error": True, "message": str(e)}) + + +@mcp.tool( + description=( + "Fetch full detail for a FINN listing by finnkode." + " Checks cache first; use force_refresh=True to bypass." 
+ ) +) +async def finn_get_ad(finnkode: str, force_refresh: bool = False) -> str: + """Fetch FINN ad details by finnkode.""" + try: + ad = await get_or_fetch_ad(finnkode, force_refresh=force_refresh) + return ad.model_dump_json() + except Exception as e: + logger.error(f"Error fetching ad {finnkode}: {e}") + return json.dumps({"error": True, "message": str(e)}) + + +@mcp.tool( + description="Resolve an Eiendom.no unit_code from a FINN listing URL. " + "Returns unit_code, address, lat, lng or an error if not found." +) +async def finn_resolve_eiendom_unit(finn_url: str) -> str: + """Resolve Eiendom.no unit from FINN URL.""" + try: + unit = await search_unit_from_finn_url(finn_url) + if unit is None: + return json.dumps( + { + "error": True, + "message": "Eiendom.no unit could not be resolved from FINN URL", + } + ) + return json.dumps( + { + "unit_code": unit.unit_code, + "address": unit.address, + "lat": unit.lat, + "lng": unit.lng, + } + ) + except Exception as e: + logger.error(f"Error resolving unit from {finn_url}: {e}") + return json.dumps({"error": True, "message": str(e)}) + + +@mcp.tool( + description="Fetch full Eiendom.no unit data by unit_code. Checks SQLite cache (24h TTL)." +) +async def finn_get_eiendom_unit(unit_code: str, force_refresh: bool = False) -> str: + """Fetch Eiendom.no unit details by unit_code.""" + try: + unit = await get_or_fetch_eiendom_unit(unit_code, force_refresh=force_refresh) + if unit is None: + return json.dumps({"error": True, "message": "Eiendom.no unit not found"}) + return unit.model_dump_json() + except Exception as e: + logger.error(f"Error fetching unit {unit_code}: {e}") + return json.dumps({"error": True, "message": str(e)}) + + +@mcp.tool( + description="Fetch comparable recently-sold or for-sale units from Eiendom.no using a " + "base64-encoded unit vector. Returns list of similar units with sale prices." 
+)
+async def finn_get_similar_units(unit_vector: str, listing_status: str = "RECENTLY_SOLD") -> str:
+    """Fetch similar units from Eiendom.no as JSON (datetimes serialised as ISO strings)."""
+    try:
+        units = await get_similar_units(unit_vector, listing_status)
+        return json.dumps([unit.model_dump(mode="json") for unit in units])
+    except Exception as e:
+        logger.error(f"Error fetching similar units: {e}")
+        return json.dumps({"error": True, "message": str(e)})
+
+
+@mcp.tool(
+    description="Build a base64-encoded unit vector for a given Eiendom.no unit_code. "
+    "The vector is used as input to finn_get_similar_units."
+)
+async def finn_build_unit_vector(unit_code: str) -> str:
+    """Build unit vector for Eiendom.no unit."""
+    try:
+        unit = await get_unit(unit_code)
+        if unit is None:
+            return json.dumps({"error": True, "message": "Eiendom.no unit not found"})
+        return json.dumps({"unit_code": unit.unit_code, "unit_vector": build_unit_vector(unit)})
+    except Exception as e:
+        logger.error(f"Error building unit vector for {unit_code}: {e}")
+        return json.dumps({"error": True, "message": str(e)})
+
+
+@mcp.tool(
+    description="Decode a base64 unit vector into human-readable JSON (lat, lon, property type, "
+    "floor, rooms, construction year, area, price)."
+) +def finn_decode_unit_vector(unit_vector: str) -> str: + """Decode unit vector to readable format.""" + try: + result = decode_unit_vector(unit_vector) + return json.dumps(result) + except Exception as e: + logger.error(f"Error decoding unit vector: {e}") + return json.dumps({"error": True, "message": str(e)}) + + +def main() -> None: + """Run the FastMCP stdio server.""" + mcp.run(transport="stdio") + + +if __name__ == "__main__": + main() diff --git a/finn_eiendom/models.py b/finn_eiendom/models.py new file mode 100644 index 0000000..7ef876f --- /dev/null +++ b/finn_eiendom/models.py @@ -0,0 +1,128 @@ +"""Pydantic models for FINN ads and Eiendom.no units.""" + +from datetime import UTC, datetime + +from pydantic import BaseModel, ConfigDict, Field + + +class FinnSearchCard(BaseModel): + """FINN search result card (minimal fields from search listing).""" + + finnkode: str + url: str + title: str | None = None + address: str | None = None + area_m2: int | None = None + asking_price: int | None = None + total_price: int | None = None + common_costs: int | None = None + property_type: str | None = None + ownership_type: str | None = None + bedrooms: int | None = None + floor: str | None = None + broker_company: str | None = None + + +class FinnAd(BaseModel): + """FINN listing detail with all available fields.""" + + finnkode: str + url: str + title: str | None = None + address: str | None = None + postal_area: str | None = None + district: str | None = None + property_type: str | None = None + ownership_type: str | None = None + asking_price: int | None = None + total_price: int | None = None + shared_debt: int | None = None + common_costs: int | None = None + municipal_fee: int | None = None + other_fees: int | None = None + area_m2: int | None = None + rooms: int | None = None + bedrooms: int | None = None + floor: str | None = None + construction_year: int | None = None + energy_rating: str | None = None + heating: str | None = None + has_balcony: bool | None = 
None + has_terrace: bool | None = None + has_elevator: bool | None = None + has_parking: bool | None = None + has_garage: bool | None = None + listing_description: str | None = None + broker_name: str | None = None + broker_company: str | None = None + first_seen_at: datetime = Field(default_factory=lambda: datetime.now(UTC)) + last_seen_at: datetime = Field(default_factory=lambda: datetime.now(UTC)) + detail_fetched_at: datetime | None = None + eiendom_unit_code: str | None = None + + model_config = ConfigDict(serializers={datetime: lambda v: v.isoformat()}) + + +class EiendomUnit(BaseModel): + """Eiendom.no unit detail with market data.""" + + unit_code: str + address: str | None = None + lat: float | None = None + lng: float | None = None + property_type: str | None = None + floor: int | None = None + rooms: int | None = None + construction_year: int | None = None + usable_area: int | None = None + estimated_selling_price: int | None = None + estimated_selling_price_lower: int | None = None + estimated_selling_price_upper: int | None = None + listing_price: int | None = None + listing_sqm_price: int | None = None + common_costs: int | None = None + days_on_market: int | None = None + sale_status: str | None = None + market_placement_score: str | None = None + unit_vector: str | None = None + fetched_at: datetime = Field(default_factory=lambda: datetime.now(UTC)) + + model_config = ConfigDict(serializers={datetime: lambda v: v.isoformat()}) + + +class SimilarUnit(BaseModel): + """Eiendom.no similar unit (comp) result.""" + + unit_code: str + address: str | None = None + lat: float | None = None + lng: float | None = None + property_type: str | None = None + floor: int | None = None + rooms: int | None = None + construction_year: int | None = None + usable_area: int | None = None + listing_price: int | None = None + selling_price: int | None = None + shared_debt: int | None = None + common_costs: int | None = None + sqm_price: int | None = None + days_on_market: 
int | None = None
+    sale_status: str | None = None
+    finalized_at: datetime | None = None
+    listing_status: str = Field(default="RECENTLY_SOLD")
+
+    model_config = ConfigDict(serializers={datetime: lambda v: v.isoformat() if v else None})
+
+
+class UnitVector(BaseModel):
+    """Unit vector payload for similar-units API."""
+
+    lon: float
+    lat: float
+    ptype: str  # property type: APARTMENT, HOUSE, etc.
+    floor: int | None = None
+    rooms: int | None = None
+    built: int | None = None  # construction year
+    area: int | None = None  # usable area
+    price: int | None = None  # listing or estimated price
diff --git a/finn_eiendom/parser.py b/finn_eiendom/parser.py
new file mode 100644
index 0000000..147a0a2
--- /dev/null
+++ b/finn_eiendom/parser.py
@@ -0,0 +1,88 @@
+"""Normalization and parsing helpers."""
+
+import re
+
+
+def normalize_price(price_str: str | None) -> int | None:
+    """
+    Normalize Norwegian formatted price to integer.
+    Example: "7 200 991 kr" -> 7200991
+    """
+    if not price_str:
+        return None
+    # Remove "kr" and spaces, keep only digits
+    normalized = re.sub(r"[^\d]", "", price_str)
+    try:
+        return int(normalized) if normalized else None
+    except ValueError:
+        return None
+
+
+def normalize_area(area_str: str | None) -> int | None:
+    """
+    Normalize area string to integer (decimals truncated, any whitespace ignored).
+    Example: "77 m²" -> 77
+    """
+    if not area_str:
+        return None
+    cleaned = "".join(area_str.split())  # drop every whitespace kind, incl. NBSP thousands gaps
+    match = re.search(r"(\d+(?:[.,]\d+)?)", cleaned)
+    if match:
+        value = match.group(1).replace(",", ".")
+        try:
+            return int(float(value))
+        except ValueError:
+            return None
+    return None
+
+
+def normalize_number(num_str: str | None) -> int | None:
+    """
+    Normalize Norwegian formatted number to integer.
+    Handles text like "3 500 kr/mnd" and "7,2".
+    """
+    if not num_str:
+        return None
+    cleaned = re.sub(r"[^\d,\.]", "", num_str)
+    cleaned = cleaned.replace(" ", "")
+    if "," in cleaned:
+        cleaned = cleaned.replace(".", "").replace(",", ".")
+    else:
+        cleaned = cleaned.replace(".", "")
+    try:
+        return int(float(cleaned)) if cleaned else None
+    except ValueError:
+        return None
+
+
+def normalize_finnkode(finnkode: str | None) -> str | None:
+    """Normalize finnkode to string, strip whitespace."""
+    if not finnkode:
+        return None
+    return str(finnkode).strip()
+
+
+def extract_finnkode_from_url(url: str) -> str | None:
+    """
+    Extract finnkode from FINN URL.
+    Example: https://www.finn.no/realestate/homes/ad.html?finnkode=462400360 -> 462400360
+    """
+    match = re.search(r"finnkode=(\d+)", url)
+    if match:
+        return match.group(1)
+    return None
+
+
+def text_to_bool(text: str | None) -> bool:
+    """Convert text to boolean."""
+    if not text:
+        return False
+    return text.lower() in ("ja", "yes", "true", "1", "y")
+
+
+def clean_text(text: str | None) -> str | None:
+    """Clean and normalize text: strip, collapse whitespace."""
+    if not text:
+        return None
+    cleaned = " ".join(text.split())
+    return cleaned if cleaned else None
diff --git a/finn_eiendom/scoring.py b/finn_eiendom/scoring.py
new file mode 100644
index 0000000..64627f9
--- /dev/null
+++ b/finn_eiendom/scoring.py
@@ -0,0 +1,146 @@
+"""Scoring engine for FINN listings enriched with Eiendom.no data."""
+
+import logging
+from typing import Any
+
+from .models import EiendomUnit, SimilarUnit
+
+logger = logging.getLogger(__name__)
+
+
+def _clamp(value: float, min_value: float, max_value: float) -> float:
+    return max(min_value, min(max_value, value))
+
+
+def score_market_position(unit: EiendomUnit | None) -> float:
+    if unit is None or not unit.estimated_selling_price or unit.listing_price is None:
+        return 0.0  # no unit, no listing price, or a missing/zero estimate (would divide by 0)
+    ratio = unit.listing_price / unit.estimated_selling_price
+    if ratio <= 0.9:
+        return 20.0
+    if ratio <= 1.0:
+        return 16.0 + (1.0 - ratio) *
40.0 + if ratio <= 1.1: + return 12.0 - (ratio - 1.0) * 40.0 + return 5.0 + + +def score_economy(ad: Any, unit: EiendomUnit | None) -> float: + if ad.total_price is None: + return 0.0 + if unit and unit.estimated_selling_price: + ratio = ad.total_price / unit.estimated_selling_price + if ratio <= 0.95: + return 20.0 + if ratio <= 1.0: + return 15.0 + if ratio <= 1.05: + return 10.0 + return 6.0 + if ad.asking_price and ad.total_price <= ad.asking_price: + return 12.0 + return 8.0 + + +def score_comparable_sales(listings: list[SimilarUnit], listing_price: int | None) -> float: + if not listings or listing_price is None: + return 0.0 + selling_prices = [unit.selling_price for unit in listings if unit.selling_price] + if not selling_prices: + return 0.0 + average = sum(selling_prices) / len(selling_prices) + ratio = listing_price / average + score = (1.0 - abs(ratio - 1.0)) * 20.0 + return float(_clamp(score, 0.0, 20.0)) + + +def score_location(address: str | None, district: str | None) -> float: + if not address and not district: + return 0.0 + if district and "oslo" in district.lower(): + return 15.0 + if address and "oslo" in address.lower(): + return 12.0 + return 7.0 + + +def score_layout_and_potential(description: str | None, rooms: int | None) -> float: + score = 0.0 + if rooms and rooms >= 4: + score += 10.0 + if description and "potensial" in description.lower(): + score += 8.0 + return float(_clamp(score, 0.0, 20.0)) + + +def score_outdoor_and_view(description: str | None) -> float: + if not description: + return 0.0 + score = 5.0 if "utsikt" in description.lower() or "balkong" in description.lower() else 0.0 + return float(_clamp(score, 0.0, 15.0)) + + +def score_rental_potential(description: str | None) -> float: + if not description: + return 0.0 + score = 10.0 if "hybel" in description.lower() or "leie" in description.lower() else 0.0 + return score + + +def score_renovation_upside(description: str | None, asking_price: int | None) -> float: + score = 
0.0 + if description and "renover" in description.lower(): + score += 10.0 + if asking_price and asking_price > 0: + score += 5.0 + return float(_clamp(score, 0.0, 15.0)) + + +def score_risk(description: str | None, unit: EiendomUnit | None) -> float: + if unit is None: + return -10.0 + if description and "usikker" in description.lower(): + return -10.0 + return 0.0 + + +def score_ad( + ad: Any, unit: EiendomUnit | None, similar_units: list[SimilarUnit] +) -> dict[str, float]: + scores = { + "economy": score_economy(ad, unit), + "market_position": score_market_position(unit), + "comparable_sales": score_comparable_sales( + similar_units, ad.total_price or ad.asking_price + ), + "location": score_location(ad.address, ad.district), + "layout": score_layout_and_potential(ad.listing_description, ad.rooms), + "outdoor": score_outdoor_and_view(ad.listing_description), + "rental_potential": score_rental_potential(ad.listing_description), + "renovation": score_renovation_upside(ad.listing_description, ad.asking_price), + "risk": score_risk(ad.listing_description, unit), + } + scores["total"] = float(_clamp(sum(scores.values()), 0.0, 100.0)) + return scores + + +def classify_ad(scores: dict[str, float]) -> list[str]: + categories: list[str] = [] + total = scores.get("total", 0.0) + if total >= 70: + categories.append("bargain_candidate") + if total >= 60: + categories.append("safe_candidate") + if 50 <= total < 70: + categories.append("lifestyle_candidate") + if scores.get("renovation", 0.0) >= 8: + categories.append("renovation_candidate") + if scores.get("rental_potential", 0.0) >= 5: + categories.append("hybel_candidate") + if scores.get("risk", 0.0) < 0: + categories.append("risk_object") + if total < 30: + categories.append("not_interesting") + if 30 <= total < 60: + categories.append("manual_review_required") + return categories diff --git a/finn_eiendom/search.py b/finn_eiendom/search.py new file mode 100644 index 0000000..86ea72c --- /dev/null +++ 
b/finn_eiendom/search.py @@ -0,0 +1,194 @@ +"""FINN search scraping and parsing.""" + +import logging +import re + +from bs4 import BeautifulSoup + +from . import cache +from .config import FINN_CACHE_TTL_SEARCH_MINUTES +from .http import HTTPClient +from .models import FinnSearchCard +from .parser import ( + clean_text, + extract_finnkode_from_url, + normalize_area, + normalize_finnkode, + normalize_number, + normalize_price, +) + +logger = logging.getLogger(__name__) + + +async def fetch_search_page(url: str, client: HTTPClient | None = None) -> str: + """Fetch a FINN search page HTML.""" + client = client or HTTPClient(request_delay_seconds=0.0) + response = await client.get(url) + return response.text + + +async def fetch_search_page_cached( + url: str, + client: HTTPClient | None = None, + conn: cache.sqlite3.Connection | None = None, + use_cache: bool = True, +) -> str: + """Fetch a FINN search page with optional SQLite caching.""" + client = client or HTTPClient(request_delay_seconds=0.0) + conn = conn or cache.init_db() + if use_cache: + cached_html = cache.get_search_page(conn, url) + if cached_html: + logger.debug("Using cached search page: %s", url) + return cached_html + + html = await fetch_search_page(url, client=client) + cache.save_search_page(conn, url, html, ttl_minutes=FINN_CACHE_TTL_SEARCH_MINUTES) + return html + + +def extract_ad_links(html: str) -> list[str]: + """Extract listing URLs from FINN search HTML.""" + soup = BeautifulSoup(html, "html.parser") + links = [] + for article in soup.select("article.listing-card, article.sf-search-ad"): + anchor = article.select_one("a[href*='finnkode']") + if anchor and anchor.get("href"): + links.append(clean_text(anchor.get("href")) or "") + return links + + +def _extract_int_from_text(text: str, pattern: str) -> int | None: + match = re.search(pattern, text, re.I) + if match: + return normalize_number(match.group(1)) + return None + + +def _extract_area_from_text(text: str) -> int | None: + matches = 
re.findall(r"(\d+(?:[.,]\d+)?)\s*(?:m²|m2|kvm)", text, re.I) + if matches: + return normalize_area(matches[-1]) + return None + + +def _extract_price_from_text(text: str, label: str) -> int | None: + pattern = rf"{label}[:\s]*([\d\s]+kr)" + match = re.search(pattern, text, re.I) + if match: + return normalize_price(match.group(1)) + return None + + +def extract_search_cards(html: str) -> list[FinnSearchCard]: + """Parse FINN search HTML and return a list of FinnSearchCard objects.""" + logger.debug("Extracting FINN search cards") + soup = BeautifulSoup(html, "html.parser") + cards: list[FinnSearchCard] = [] + + for card in soup.select("article.listing-card, article.sf-search-ad"): + data_id = card.get("data-id") + anchor = card.select_one("a[href*='finnkode']") + url = anchor.get("href") if anchor else "" + finnkode = normalize_finnkode(data_id or extract_finnkode_from_url(url)) + if not finnkode: + logger.debug("Skipping card with missing finnkode") + continue + + title_elem = card.select_one(".title, h2.sf-realestate-heading, a.sf-search-ad-link") + address_elem = card.select_one(".location, .sf-realestate-location") + area_elem = card.select_one(".area") + price_elem = card.select_one(".price") + common_costs_elem = card.select_one(".common-costs") + bedrooms_elem = card.select_one(".bedrooms") + property_type_elem = card.select_one(".property-type") + ownership_type_elem = card.select_one(".ownership-type") + broker_elem = card.select_one(".broker-company") + + card_text = clean_text(card.get_text(" ") or "") + + bedrooms = None + if bedrooms_elem: + bedrooms = normalize_number(bedrooms_elem.get_text()) + elif card_text: + bedrooms = _extract_int_from_text(card_text, r"(\d+)\s*soverom") + + common_costs = None + if common_costs_elem: + common_costs = normalize_number(common_costs_elem.get_text()) + elif card_text: + common_costs = _extract_int_from_text( + card_text, r"(?:Fellesutg|Felleskost(?:er)?)[^\d]*(\d+[\d\s]*)kr" + ) + + total_price = None + if 
price_elem: + total_price = normalize_price(price_elem.get_text()) + if not total_price and card_text: + total_price = _extract_price_from_text(card_text, r"Totalpris") + if not total_price and card_text: + first_price_match = re.search(r"([\d\s]+kr)", card_text) + if first_price_match: + total_price = normalize_price(first_price_match.group(1)) + + area_m2 = None + if area_elem: + area_m2 = normalize_area(area_elem.get_text()) + elif card_text: + area_m2 = _extract_area_from_text(card_text) + + card_data = FinnSearchCard( + finnkode=finnkode, + url=url or "", + title=clean_text(title_elem.get_text()) if title_elem else None, + address=clean_text(address_elem.get_text()) if address_elem else None, + area_m2=area_m2, + asking_price=None, + total_price=total_price, + common_costs=common_costs, + property_type=clean_text(property_type_elem.get_text()) if property_type_elem else None, + ownership_type=clean_text(ownership_type_elem.get_text()) + if ownership_type_elem + else None, + bedrooms=bedrooms, + floor=None, + broker_company=clean_text(broker_elem.get_text()) if broker_elem else None, + ) + cards.append(card_data) + logger.debug("Parsed FINN search card %s", finnkode) + + return cards + + +def find_next_page_url(html: str) -> str | None: + """Return the FINN search next page URL if present.""" + soup = BeautifulSoup(html, "html.parser") + next_link = soup.select_one("a[rel='next']") + if next_link and next_link.get("href"): + return clean_text(next_link.get("href")) + return None + + +async def fetch_search_pages( + start_url: str, + max_pages: int = 1, + client: HTTPClient | None = None, + use_cache: bool = True, +) -> list[FinnSearchCard]: + """Fetch paginated FINN search pages and parse search cards.""" + client = client or HTTPClient(request_delay_seconds=0.0) + conn = cache.init_db() + url = start_url + all_cards: list[FinnSearchCard] = [] + + for _ in range(max_pages): + html = await fetch_search_page_cached(url, client=client, conn=conn, 
use_cache=use_cache) + all_cards.extend(extract_search_cards(html)) + next_url = find_next_page_url(html) + if not next_url: + break + url = next_url + logger.debug("Following next page link: %s", url) + + return all_cards diff --git a/finn_eiendom/service.py b/finn_eiendom/service.py new file mode 100644 index 0000000..bf11192 --- /dev/null +++ b/finn_eiendom/service.py @@ -0,0 +1,35 @@ +"""Service layer for cache-aware fetching of FINN ads and Eiendom.no units.""" + +import logging + +from .ad import fetch_ad_details +from .cache import get_eiendom_unit as get_cached_eiendom_unit +from .cache import get_finn_ad, init_db, save_eiendom_unit, save_finn_ad +from .config import FINN_CACHE_PATH +from .eiendom_no import get_unit +from .models import EiendomUnit, FinnAd + +logger = logging.getLogger(__name__) + + +async def get_or_fetch_ad(finnkode: str, force_refresh: bool = False) -> FinnAd: + """Get FinnAd from cache or fetch fresh. Never returns None.""" + conn = init_db(FINN_CACHE_PATH) + ad = None if force_refresh else get_finn_ad(conn, finnkode, ttl_hours=24) + if ad is None: + ad = await fetch_ad_details(finnkode) + save_finn_ad(conn, ad) + return ad + + +async def get_or_fetch_eiendom_unit( + unit_code: str, force_refresh: bool = False +) -> EiendomUnit | None: + """Get EiendomUnit from cache or fetch fresh.""" + conn = init_db(FINN_CACHE_PATH) + unit = None if force_refresh else get_cached_eiendom_unit(conn, unit_code, ttl_hours=24) + if unit is None: + unit = await get_unit(unit_code) + if unit is not None: + save_eiendom_unit(conn, unit) + return unit diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..9d5102f --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,49 @@ +[project] +name = "finn-eiendom-mcp" +version = "0.1.0" +description = "Private FINN and Eiendom.no real estate MCP scout" +readme = "README.md" +requires-python = ">=3.12" +dependencies = [ + "beautifulsoup4>=4.12.0", + "httpx>=0.27.0", + "lxml>=5.0.0", + 
"mcp[cli]>=1.0.0", + "msgpack>=1.0.0", + "pydantic>=2.8.0", + "pydantic-settings>=2.4.0", + "python-dotenv>=1.0.0", +] + +[project.scripts] +finn-eiendom-mcp = "finn_eiendom.mcp_server:main" + +[dependency-groups] +dev = [ + "ipython>=8.0.0", + "mypy>=1.10.0", + "pytest>=8.0.0", + "pytest-asyncio>=0.23.0", + "respx>=0.21.0", + "ruff>=0.6.0", +] + +[tool.ruff] +line-length = 100 +target-version = "py312" + +[tool.ruff.lint] +select = ["E", "F", "I", "UP", "B", "SIM"] +ignore = [] + +[tool.ruff.lint.per-file-ignores] +"tests/fixtures.py" = ["E501"] + +[tool.pytest.ini_options] +testpaths = ["tests"] +asyncio_mode = "auto" + +[tool.mypy] +python_version = "3.12" +strict = true +plugins = [] \ No newline at end of file diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..4e9d29c --- /dev/null +++ b/tests/__init__.py @@ -0,0 +1 @@ +"""Test fixtures and utilities.""" diff --git a/tests/fixtures.py b/tests/fixtures.py new file mode 100644 index 0000000..aeab59a --- /dev/null +++ b/tests/fixtures.py @@ -0,0 +1,236 @@ +"""Fixture data for testing without hitting live APIs.""" +# noqa: E501 + +SAMPLE_FINN_SEARCH_HTML = """ + + +FINN.no - Leiligheter til salgs + + + + +""" + +# noqa: E501 +SAMPLE_FINN_SEARCH_HTML_NEW = """ + + +FINN.no - Leiligheter til salgs + +
+ +
+ + +""" + +SAMPLE_FINN_LISTING_HTML = """ + + +Flott 3-roms i Ferner - FINN.no + +
+
+

Flott 3-roms i Ferner

+
Totalpris: 7 200 991 kr
+
+
+
+
Adresse
+
Fernerveien 42, 0554 Oslo
+
Område
+
Grünerløkka
+
Postnummer
+
0554
+
Eierform
+
Eierbolig
+
Eiendomstype
+
Leilighet
+
Prisantydning
+
7 200 000 kr
+
Totalpris
+
7 200 991 kr
+
Fellesgjeld
+
0 kr
+
Felles utgifter
+
3 500 kr/mnd
+
Boligareal
+
77 m²
+
Rom
+
4
+
Soverom
+
3
+
Etasje
+
4. etasje
+
Byggeår
+
2005
+
Energimerking
+
C
+
Oppvarming
+
Fjernvarme
+
Balkonger/terrasser
+
Ja, balkonger
+
Heis
+
Ja
+
Parkering/garasje
+
Privat parkering
+
+
+
+

Beskrivelse

+

Flott beliggenhet med fin utsikt over Oslo. Moderne kjøkken og bad.

+

Klar til visning!

+
+
+
+ Meglerhuset AS + Telefon: 21 00 00 00 +
+
+
+ + +""" + +SAMPLE_FINN_LISTING_HTML_NEW = """ + + +Romslig 5-roms i 5.etasje med heisadkomst + +
+

Romslig 5-roms i 5.etasje med heisadkomst | 2 hybler | 4 balkonger | Ingen dokavgift!

+ Hegdehaugsveien 3, 0352 Oslo + Homansbyen +
+
Prisantydning10 900 000 kr
+
Totalpris
10 986 901 kr
+
Fellesgjeld
76 911 kr
+
Felleskost/mnd.
12 011 kr
+
+
+
BoligtypeLeilighet
+
EieformAndel
+
Soverom2
+
Rom5
+
Byggeår1938
+
Internt bruksareal124 m² (BRA-i)
+
+
FasiliteterBalkong/TerrasseParkettHeis
+
+

Om boligen

+

Her bor du med kort vei til daglige behov og offentlig transport.

+
+
+ + +""" + +SAMPLE_EIENDOM_UNIT_JSON = { + "units": [ + { + "unitCode": "c-gxw-xmyum-s2a", + "address": "Fernerveien 42, 0554 Oslo", + "municipality": "Oslo", + "lat": 59.9287, + "lon": 10.7803, + "propertyType": "APARTMENT", + "floor": 4, + "rooms": 4, + "constructionYear": 2005, + "usableArea": 77, + "estimatedSellingPrice": 7650000, + "estimatedSellingPriceLower": 6900000, + "estimatedSellingPriceUpper": 8400000, + "listingPrice": 7200000, + "listingSquareMeterPrice": 93500, + "commonCosts": 3500, + "daysOnMarket": 12, + "saleStatus": "FOR_SALE", + "marketPlacementScore": "ABOVE_AVERAGE", + "similarUnitCount": 12, + "averageSquareMeterPrice": 98000, + } + ] +} + +SAMPLE_EIENDOM_SIMILAR_UNITS_JSON = { + "units": [ + { + "unitCode": "c-recent-1", + "address": "Birketveien 10, 0554 Oslo", + "lat": 59.9290, + "lon": 10.7810, + "propertyType": "APARTMENT", + "floor": 3, + "rooms": 3, + "constructionYear": 2004, + "usableArea": 75, + "listingPrice": 7100000, + "sellingPrice": 7050000, + "sharedDebt": 0, + "commonCosts": 3400, + "squareMeterPrice": 94000, + "daysOnMarket": 18, + "saleStatus": "SOLD", + "finalizedAt": "2024-05-01", + }, + { + "unitCode": "c-recent-2", + "address": "Sommers gate 5, 0554 Oslo", + "lat": 59.9280, + "lon": 10.7820, + "propertyType": "APARTMENT", + "floor": 2, + "rooms": 4, + "constructionYear": 2006, + "usableArea": 80, + "listingPrice": 7400000, + "sellingPrice": 7350000, + "sharedDebt": 0, + "commonCosts": 3600, + "squareMeterPrice": 91875, + "daysOnMarket": 22, + "saleStatus": "SOLD", + "finalizedAt": "2024-04-28", + }, + ] +} diff --git a/tests/test_ad.py b/tests/test_ad.py new file mode 100644 index 0000000..6f6450f --- /dev/null +++ b/tests/test_ad.py @@ -0,0 +1,45 @@ +from finn_eiendom.ad import scrape_ad +from tests.fixtures import SAMPLE_FINN_LISTING_HTML, SAMPLE_FINN_LISTING_HTML_NEW + + +def test_scrape_ad(): + ad = scrape_ad( + SAMPLE_FINN_LISTING_HTML, + url="https://www.finn.no/realestate/homes/ad.html?finnkode=462400360", + ) 
+ assert ad.finnkode == "462400360" + assert ad.title == "Flott 3-roms i Ferner" + assert ad.address == "Fernerveien 42, 0554 Oslo" + assert ad.area_m2 == 77 + assert ad.asking_price == 7200000 + assert ad.total_price == 7200991 + assert ad.common_costs == 3500 + assert ad.rooms == 4 + assert ad.bedrooms == 3 + assert ad.floor == "4. etasje" + assert ad.construction_year == 2005 + assert ad.energy_rating == "C" + assert ad.heating == "Fjernvarme" + assert "Moderne kjøkken" in ad.listing_description + assert ad.broker_company == "Meglerhuset AS" + + +def test_scrape_ad_new_structure(): + ad = scrape_ad( + SAMPLE_FINN_LISTING_HTML_NEW, + url="https://www.finn.no/realestate/homes/ad.html?finnkode=455978973", + ) + assert ad.finnkode == "455978973" + assert ad.title.startswith("Romslig 5-roms i 5.etasje") + assert ad.address == "Hegdehaugsveien 3, 0352 Oslo" + assert ad.property_type == "Leilighet" + assert ad.ownership_type == "Andel" + assert ad.asking_price == 10900000 + assert ad.total_price == 10986901 + assert ad.common_costs == 12011 + assert ad.area_m2 == 124 + assert ad.rooms == 5 + assert ad.bedrooms == 2 + assert ad.construction_year == 1938 + assert ad.floor == "5. 
etasje" + assert "kort vei" in ad.listing_description.lower() diff --git a/tests/test_cache.py b/tests/test_cache.py new file mode 100644 index 0000000..d35b718 --- /dev/null +++ b/tests/test_cache.py @@ -0,0 +1,71 @@ +import tempfile +from datetime import UTC, datetime, timedelta +from pathlib import Path + +from finn_eiendom.cache import ( + get_eiendom_unit, + get_finn_ad, + get_search_page, + get_similar_units, + init_db, + save_eiendom_unit, + save_finn_ad, + save_search_page, + save_similar_units, +) +from finn_eiendom.models import EiendomUnit, FinnAd, SimilarUnit + + +def test_cache_roundtrip(): + with tempfile.TemporaryDirectory() as tmpdir: + db_path = Path(tmpdir) / "cache.sqlite" + conn = init_db(str(db_path)) + + ad = FinnAd(finnkode="1234", url="https://example.com", title="Test") + save_finn_ad(conn, ad) + loaded_ad = get_finn_ad(conn, "1234") + assert loaded_ad is not None + assert loaded_ad.finnkode == "1234" + assert loaded_ad.url == "https://example.com" + + unit = EiendomUnit(unit_code="abc", address="Oslo") + save_eiendom_unit(conn, unit) + loaded_unit = get_eiendom_unit(conn, "abc") + assert loaded_unit is not None + assert loaded_unit.address == "Oslo" + + comps = [ + SimilarUnit(unit_code="x1"), + SimilarUnit(unit_code="x2"), + ] + save_similar_units(conn, "abc", "RECENTLY_SOLD", comps) + loaded_comps = get_similar_units(conn, "abc", "RECENTLY_SOLD") + assert len(loaded_comps) == 2 + assert loaded_comps[0].unit_code == "x1" + + +def test_search_page_cache_roundtrip(): + with tempfile.TemporaryDirectory() as tmpdir: + conn = init_db(str(Path(tmpdir) / "cache.sqlite")) + + html = "search page" + url = "https://www.finn.no/realestate/homes/search.html" + + save_search_page(conn, url, html, ttl_minutes=5) + loaded_html = get_search_page(conn, url) + assert loaded_html == html + + +def test_finn_ad_cache_ttl_expiration(): + with tempfile.TemporaryDirectory() as tmpdir: + conn = init_db(str(Path(tmpdir) / "cache.sqlite")) + + ad = FinnAd( + 
finnkode="1234", + url="https://example.com", + title="Test", + detail_fetched_at=datetime.now(UTC) - timedelta(hours=2), + ) + save_finn_ad(conn, ad) + expired_ad = get_finn_ad(conn, "1234", ttl_hours=1) + assert expired_ad is None diff --git a/tests/test_eiendom_no.py b/tests/test_eiendom_no.py new file mode 100644 index 0000000..43eba03 --- /dev/null +++ b/tests/test_eiendom_no.py @@ -0,0 +1,44 @@ +from finn_eiendom.eiendom_no import ( + build_unit_vector, + decode_unit_vector, + parse_eiendom_unit_json, + parse_similar_units_json, + resolve_unit_from_finn_url, +) +from tests.fixtures import ( + SAMPLE_EIENDOM_SIMILAR_UNITS_JSON, + SAMPLE_EIENDOM_UNIT_JSON, +) + + +def test_parse_eiendom_unit_json(): + unit = parse_eiendom_unit_json(SAMPLE_EIENDOM_UNIT_JSON["units"][0]) + assert unit.unit_code == "c-gxw-xmyum-s2a" + assert unit.address == "Fernerveien 42, 0554 Oslo" + assert unit.estimated_selling_price == 7650000 + assert unit.listing_sqm_price == 93500 + + +def test_unit_vector_roundtrip(): + unit = parse_eiendom_unit_json(SAMPLE_EIENDOM_UNIT_JSON["units"][0]) + vector = build_unit_vector(unit) + decoded = decode_unit_vector(vector) + assert decoded["ptype"] == "APARTMENT" + assert decoded["area"] == 77 + assert decoded["price"] == 7200000 + assert isinstance(decoded, dict) + assert decoded["lon"] == unit.lng + + +def test_parse_similar_units_json(): + comps = parse_similar_units_json(SAMPLE_EIENDOM_SIMILAR_UNITS_JSON) + assert len(comps) == 2 + assert comps[0].unit_code == "c-recent-1" + assert comps[1].selling_price == 7350000 + + +def test_resolve_unit_from_finn_url(): + unit_code = resolve_unit_from_finn_url( + "https://www.finn.no/realestate/homes/ad.html?finnkode=462400360" + ) + assert unit_code == "462400360" diff --git a/tests/test_http.py b/tests/test_http.py new file mode 100644 index 0000000..81506ba --- /dev/null +++ b/tests/test_http.py @@ -0,0 +1,83 @@ +"""Tests for HTTP client retry logic.""" + +import httpx +import pytest +import respx + +from 
finn_eiendom.http import HTTPClient + + +@pytest.mark.asyncio +async def test_get_retries_on_500(): + """Test that HTTPClient retries on 500 errors and succeeds on second attempt.""" + client = HTTPClient(request_delay_seconds=0.0, retries=2) + + with respx.mock: + route = respx.get("https://example.com/api") + route.side_effect = [ + httpx.Response(500, text="Server Error"), + httpx.Response(200, text="Success"), + ] + + response = await client.get("https://example.com/api") + assert response.status_code == 200 + + +@pytest.mark.asyncio +async def test_get_raises_on_404(): + """Test that HTTPClient raises on 4xx errors immediately.""" + client = HTTPClient(request_delay_seconds=0.0, retries=2) + + with respx.mock: + respx.get("https://example.com/api").mock(return_value=httpx.Response(404)) + + with pytest.raises(httpx.HTTPStatusError) as exc_info: + await client.get("https://example.com/api") + + assert exc_info.value.response.status_code == 404 + + +@pytest.mark.asyncio +async def test_get_retries_on_502_bad_gateway(): + """Test that HTTPClient retries on 502 Bad Gateway.""" + client = HTTPClient(request_delay_seconds=0.0, retries=2) + + with respx.mock: + route = respx.get("https://example.com/api") + route.side_effect = [ + httpx.Response(502, text="Bad Gateway"), + httpx.Response(200, text="Success"), + ] + + response = await client.get("https://example.com/api") + assert response.status_code == 200 + + +@pytest.mark.asyncio +async def test_post_retries_on_503(): + """Test that HTTPClient retries POST on 503 Service Unavailable.""" + client = HTTPClient(request_delay_seconds=0.0, retries=2) + + with respx.mock: + route = respx.post("https://example.com/api") + route.side_effect = [ + httpx.Response(503, text="Service Unavailable"), + httpx.Response(201, json={"success": True}), + ] + + response = await client.post("https://example.com/api", json={"test": "data"}) + assert response.status_code == 201 + + +@pytest.mark.asyncio +async def 
test_get_eventually_fails_on_persistent_500(): + """Test that HTTPClient gives up after max retries.""" + client = HTTPClient(request_delay_seconds=0.0, retries=1) + + with respx.mock: + respx.get("https://example.com/api").mock(return_value=httpx.Response(500)) + + with pytest.raises(httpx.HTTPStatusError) as exc_info: + await client.get("https://example.com/api") + + assert exc_info.value.response.status_code == 500 diff --git a/tests/test_mcp_server.py b/tests/test_mcp_server.py new file mode 100644 index 0000000..cd9d92b --- /dev/null +++ b/tests/test_mcp_server.py @@ -0,0 +1,69 @@ +"""Tests for the MCP server tools.""" + +import json + +from finn_eiendom.mcp_server import ( + finn_decode_unit_vector, + mcp, +) + + +def test_mcp_server_has_correct_tools(): + """Assert that the MCP server has all expected tools.""" + import asyncio + + async def check_tools(): + tools = await mcp.list_tools() + tool_names = {tool.name for tool in tools} + expected_tools = { + "finn_analyze_search", + "finn_get_ad", + "finn_resolve_eiendom_unit", + "finn_get_eiendom_unit", + "finn_get_similar_units", + "finn_build_unit_vector", + "finn_decode_unit_vector", + } + assert expected_tools.issubset(tool_names), f"Missing tools: {expected_tools - tool_names}" + + asyncio.run(check_tools()) + + +def test_finn_decode_unit_vector_returns_json(): + """Test that finn_decode_unit_vector returns valid JSON with expected keys.""" + from unittest.mock import patch + + test_vector = { + "lon": 10.7, + "lat": 59.9, + "ptype": "APARTMENT", + "floor": 3, + "rooms": 3, + "built": 2000, + "area": 80, + "price": 5000000, + } + + with patch("finn_eiendom.mcp_server.decode_unit_vector", return_value=test_vector): + result = finn_decode_unit_vector("dGVzdA==") + + data = json.loads(result) + assert "lon" in data + assert "lat" in data + assert "ptype" in data + assert data["lat"] == 59.9 + assert data["lon"] == 10.7 + + +def test_finn_decode_unit_vector_error_handling(): + """Test that 
finn_decode_unit_vector handles errors gracefully.""" + from unittest.mock import patch + + with patch( + "finn_eiendom.mcp_server.decode_unit_vector", side_effect=Exception("decode failed") + ): + result = finn_decode_unit_vector("invalid") + + data = json.loads(result) + assert data.get("error") is True + assert "message" in data diff --git a/tests/test_parser.py b/tests/test_parser.py new file mode 100644 index 0000000..5dba2cf --- /dev/null +++ b/tests/test_parser.py @@ -0,0 +1,45 @@ +from finn_eiendom.parser import ( + clean_text, + extract_finnkode_from_url, + normalize_area, + normalize_finnkode, + normalize_number, + normalize_price, +) + + +def test_normalize_price(): + assert normalize_price("7 200 991 kr") == 7200991 + assert normalize_price("1 234") == 1234 + assert normalize_price(None) is None + + +def test_normalize_area(): + assert normalize_area("77 m²") == 77 + assert normalize_area("100,5 m²") == 100 + assert normalize_area("") is None + + +def test_normalize_number(): + assert normalize_number("3 500 kr/mnd") == 3500 + assert normalize_number("7,2") == 7 + assert normalize_number("1.234") == 1234 + assert normalize_number(None) is None + + +def test_normalize_finnkode(): + assert normalize_finnkode(" 462400360 ") == "462400360" + assert normalize_finnkode(None) is None + + +def test_extract_finnkode_from_url(): + assert ( + extract_finnkode_from_url("https://www.finn.no/realestate/homes/ad.html?finnkode=462400360") + == "462400360" + ) + assert extract_finnkode_from_url("https://www.finn.no/realestate/homes/ad.html") is None + + +def test_clean_text(): + assert clean_text(" Hello world \n") == "Hello world" + assert clean_text(None) is None diff --git a/tests/test_scoring.py b/tests/test_scoring.py new file mode 100644 index 0000000..33f2029 --- /dev/null +++ b/tests/test_scoring.py @@ -0,0 +1,22 @@ +from finn_eiendom.models import EiendomUnit, FinnAd +from finn_eiendom.scoring import classify_ad, score_ad + + +def test_score_ad_and_classify(): 
+ ad = FinnAd( + finnkode="462400360", + url="https://www.finn.no/realestate/homes/ad.html?finnkode=462400360", + title="Flott 3-roms i Ferner", + ) + unit = EiendomUnit( + unit_code="c-gxw-xmyum-s2a", + estimated_selling_price=7650000, + listing_price=7200000, + property_type="APARTMENT", + usable_area=77, + rooms=4, + ) + scores = score_ad(ad, unit, []) + assert scores["market_position"] >= 0 + categories = classify_ad(scores) + assert isinstance(categories, list) diff --git a/tests/test_search.py b/tests/test_search.py new file mode 100644 index 0000000..495918a --- /dev/null +++ b/tests/test_search.py @@ -0,0 +1,38 @@ +from finn_eiendom.search import extract_ad_links, extract_search_cards +from tests.fixtures import SAMPLE_FINN_SEARCH_HTML, SAMPLE_FINN_SEARCH_HTML_NEW + + +def test_extract_search_cards(): + cards = extract_search_cards(SAMPLE_FINN_SEARCH_HTML) + assert len(cards) == 2 + assert cards[0].finnkode == "462400360" + assert cards[0].url.endswith("finnkode=462400360") + assert cards[0].area_m2 == 77 + assert cards[0].total_price == 7200991 + assert cards[0].common_costs == 3500 + assert cards[1].bedrooms == 2 + + +def test_extract_search_cards_new_format(): + cards = extract_search_cards(SAMPLE_FINN_SEARCH_HTML_NEW) + assert len(cards) == 1 + assert cards[0].finnkode == "462880791" + assert cards[0].url.endswith("finnkode=462880791") + assert cards[0].address == "Lofotgata 4B, Oslo" + assert cards[0].area_m2 == 62 + assert cards[0].total_price == 7253377 + assert cards[0].common_costs == 7067 + assert cards[0].bedrooms == 2 + + +def test_extract_ad_links(): + links = extract_ad_links(SAMPLE_FINN_SEARCH_HTML) + assert len(links) == 2 + assert "finnkode=462400360" in links[0] + assert "finnkode=460784945" in links[1] + + +def test_extract_ad_links_new_format(): + links = extract_ad_links(SAMPLE_FINN_SEARCH_HTML_NEW) + assert len(links) == 1 + assert "finnkode=462880791" in links[0] diff --git a/tests/test_service.py b/tests/test_service.py new file mode 
100644 index 0000000..f6a69f7 --- /dev/null +++ b/tests/test_service.py @@ -0,0 +1,97 @@ +"""Tests for the service layer (cache-aware fetching).""" + +from unittest.mock import patch + +import pytest + +from finn_eiendom.models import EiendomUnit, FinnAd +from finn_eiendom.service import get_or_fetch_ad, get_or_fetch_eiendom_unit + + +@pytest.mark.asyncio +async def test_get_or_fetch_ad_uses_cache(): + """Test that get_or_fetch_ad returns cached ad without fetching.""" + mock_ad = FinnAd(finnkode="123", url="http://example.com") + + with ( + patch("finn_eiendom.service.init_db"), + patch("finn_eiendom.service.get_finn_ad", return_value=mock_ad) as mock_get, + patch("finn_eiendom.service.fetch_ad_details") as mock_fetch, + ): + result = await get_or_fetch_ad("123") + + assert result.finnkode == "123" + mock_get.assert_called_once() + mock_fetch.assert_not_called() + + +@pytest.mark.asyncio +async def test_get_or_fetch_ad_fetches_when_cache_miss(): + """Test that get_or_fetch_ad fetches when cache is empty.""" + mock_ad = FinnAd(finnkode="123", url="http://example.com") + + with ( + patch("finn_eiendom.service.init_db"), + patch("finn_eiendom.service.get_finn_ad", return_value=None), + patch("finn_eiendom.service.fetch_ad_details", return_value=mock_ad) as mock_fetch, + patch("finn_eiendom.service.save_finn_ad") as mock_save, + ): + result = await get_or_fetch_ad("123") + + assert result.finnkode == "123" + mock_fetch.assert_called_once_with("123") + mock_save.assert_called_once() + + +@pytest.mark.asyncio +async def test_get_or_fetch_ad_force_refresh(): + """Test that force_refresh=True bypasses cache.""" + mock_ad = FinnAd(finnkode="123", url="http://example.com") + + with ( + patch("finn_eiendom.service.init_db"), + patch("finn_eiendom.service.get_finn_ad", return_value=mock_ad) as mock_get, + patch("finn_eiendom.service.fetch_ad_details", return_value=mock_ad) as mock_fetch, + patch("finn_eiendom.service.save_finn_ad") as mock_save, + ): + result = await 
get_or_fetch_ad("123", force_refresh=True) + + assert result.finnkode == "123" + mock_get.assert_not_called() + mock_fetch.assert_called_once_with("123") + mock_save.assert_called_once() + + +@pytest.mark.asyncio +async def test_get_or_fetch_eiendom_unit_uses_cache(): + """Test that get_or_fetch_eiendom_unit returns cached unit without fetching.""" + mock_unit = EiendomUnit(unit_code="test-code") + + with ( + patch("finn_eiendom.service.init_db"), + patch("finn_eiendom.service.get_cached_eiendom_unit", return_value=mock_unit) as mock_get, + patch("finn_eiendom.service.get_unit") as mock_fetch, + ): + result = await get_or_fetch_eiendom_unit("test-code") + + assert result.unit_code == "test-code" + mock_get.assert_called_once() + mock_fetch.assert_not_called() + + +@pytest.mark.asyncio +async def test_get_or_fetch_eiendom_unit_fetches_when_cache_miss(): + """Test that get_or_fetch_eiendom_unit fetches when cache is empty.""" + mock_unit = EiendomUnit(unit_code="test-code") + + with ( + patch("finn_eiendom.service.init_db"), + patch("finn_eiendom.service.get_cached_eiendom_unit", return_value=None), + patch("finn_eiendom.service.get_unit", return_value=mock_unit) as mock_fetch, + patch("finn_eiendom.service.save_eiendom_unit") as mock_save, + ): + result = await get_or_fetch_eiendom_unit("test-code") + + assert result.unit_code == "test-code" + mock_fetch.assert_called_once_with("test-code") + mock_save.assert_called_once()