"""Normalization and parsing helpers."""
|
|
|
|
import re
|
|
|
|
|
|
def normalize_price(price_str: str | None) -> int | None:
|
|
"""
|
|
Normalize Norwegian formatted price to integer.
|
|
Example: "7 200 991 kr" -> 7200991
|
|
"""
|
|
if not price_str:
|
|
return None
|
|
# Remove "kr" and spaces, keep only digits
|
|
normalized = re.sub(r"[^\d]", "", price_str)
|
|
try:
|
|
return int(normalized) if normalized else None
|
|
except ValueError:
|
|
return None
|
|
|
|
|
|
def normalize_area(area_str: str | None) -> int | None:
|
|
"""
|
|
Normalize area string to integer.
|
|
Example: "77 m²" -> 77
|
|
"""
|
|
if not area_str:
|
|
return None
|
|
cleaned = area_str.replace(" ", "")
|
|
match = re.search(r"(\d+(?:[.,]\d+)?)", cleaned)
|
|
if match:
|
|
value = match.group(1).replace(",", ".")
|
|
try:
|
|
return int(float(value))
|
|
except ValueError:
|
|
return None
|
|
return None
|
|
|
|
|
|
def normalize_number(num_str: str | None) -> int | None:
|
|
"""
|
|
Normalize Norwegian formatted number to integer.
|
|
Handles text like "3 500 kr/mnd" and "7,2".
|
|
"""
|
|
if not num_str:
|
|
return None
|
|
cleaned = re.sub(r"[^\d,\.]", "", num_str)
|
|
cleaned = cleaned.replace(" ", "")
|
|
if "," in cleaned:
|
|
cleaned = cleaned.replace(".", "").replace(",", ".")
|
|
else:
|
|
cleaned = cleaned.replace(".", "")
|
|
try:
|
|
return int(float(cleaned)) if cleaned else None
|
|
except ValueError:
|
|
return None
|
|
|
|
|
|
def normalize_finnkode(finnkode: str | None) -> str | None:
|
|
"""Normalize finnkode to string, strip whitespace."""
|
|
if not finnkode:
|
|
return None
|
|
return str(finnkode).strip()
|
|
|
|
|
|
def extract_finnkode_from_url(url: str) -> str | None:
|
|
"""
|
|
Extract finnkode from FINN URL.
|
|
Example: https://www.finn.no/realestate/homes/ad.html?finnkode=462400360 -> 462400360
|
|
"""
|
|
match = re.search(r"finnkode=(\d+)", url)
|
|
if match:
|
|
return match.group(1)
|
|
return None
|
|
|
|
|
|
def text_to_bool(text: str | None) -> bool:
|
|
"""Convert text to boolean."""
|
|
if not text:
|
|
return False
|
|
return text.lower() in ("ja", "yes", "true", "1", "y")
|
|
|
|
|
|
def clean_text(text: str | None) -> str | None:
|
|
"""Clean and normalize text: strip, collapse whitespace."""
|
|
if not text:
|
|
return None
|
|
cleaned = " ".join(text.split())
|
|
return cleaned if cleaned else None
|