"""
build_parcel_centroids_59.py
============================
Download the Nord (59) cadastre parcel GeoJSON from Etalab and build a
lightweight parcel-ID → centroid lookup table used by build_dvf_historique.py.

Output:
  data/2026/parcelles_59_centroids.json   — {parcel_id_14: [lat, lon]}
  data/2026/cadastre-59-parcelles.json.gz — raw download (kept as cache)

Parcel ID format (14 chars, matches etalab cadastre + geo-DVF):
  {dept_2}{commune_3}{prefixe_3}{section_2}{plan_4}
  e.g. "59350000AK0216"

Memory: the GeoJSON is parsed as a stream and never loaded whole, so parser
memory stays bounded regardless of file size.
RAM requirement: < 300 MB (dominated by the in-memory centroid dict).
Disk: 215 MB download + ~40 MB output.

Run once before build_dvf_historique.py:
    python build_parcel_centroids_59.py
"""

from __future__ import annotations

import gzip
import json
import logging
import os
import urllib.request

# Console logging for the whole script: INFO and above, prefixed with a
# wall-clock timestamp and the level name.
logging.basicConfig(
    format="%(asctime)s [%(levelname)s] %(message)s",
    datefmt="%H:%M:%S",
    level=logging.INFO,
)
log = logging.getLogger("build_parcel_centroids_59")

# Remote archive holding the parcel GeoJSON for département 59.
_SOURCE_URL = "/".join(
    (
        "https://cadastre.data.gouv.fr/data/etalab-cadastre/latest",
        "geojson/departements/59",
        "cadastre-59-parcelles.json.gz",
    )
)

# All artefacts live in <directory of this file>/data/2026.
_DATA_DIR = os.path.join(os.path.dirname(__file__), "data", "2026")
# Raw download, kept on disk as a cache.
_GZ_PATH = os.path.join(_DATA_DIR, "cadastre-59-parcelles.json.gz")
# Generated parcel-ID -> [lat, lon] lookup table.
_OUTPUT_PATH = os.path.join(_DATA_DIR, "parcelles_59_centroids.json")


# ---------------------------------------------------------------------------
# Download
# ---------------------------------------------------------------------------

def _download(force: bool = False) -> None:
    """
    Download the gzipped parcel GeoJSON to ``_GZ_PATH``.

    The archive is streamed to ``_GZ_PATH + ".tmp"`` in 1 MB chunks and only
    renamed onto ``_GZ_PATH`` once the transfer finished, so a failed or
    interrupted download never replaces a good cached file.

    Args:
        force: when False (default) and ``_GZ_PATH`` already exists, nothing
            is downloaded. When True the file is fetched again.

    Raises:
        OSError: on network or disk errors (``urllib.error.URLError`` is an
            ``OSError`` subclass), or when the server announced a
            Content-Length and a different number of bytes was received.
    """
    if os.path.exists(_GZ_PATH) and not force:
        size_mb = os.path.getsize(_GZ_PATH) / 1_048_576
        log.info("Cadastre GeoJSON already present (%.0f Mo) — skip (--force-dl to re-download)", size_mb)
        return

    log.info("Downloading cadastre-59-parcelles.json.gz from Etalab…")
    os.makedirs(_DATA_DIR, exist_ok=True)
    tmp = _GZ_PATH + ".tmp"

    req = urllib.request.Request(
        _SOURCE_URL,
        headers={"User-Agent": "agent-immobilier/1.0 (parcel centroid builder)"},
    )
    chunk_size = 1 * 1024 * 1024  # 1 MB
    log_step = 10 * chunk_size    # progress line every ~10 MB

    try:
        with urllib.request.urlopen(req, timeout=600) as resp, open(tmp, "wb") as out:
            # 0 means "size unknown" (header absent); progress and the
            # completeness check are skipped in that case.
            total = int(resp.headers.get("Content-Length", 0))
            downloaded = 0
            next_log = log_step
            while True:
                chunk = resp.read(chunk_size)
                if not chunk:
                    break
                out.write(chunk)
                downloaded += len(chunk)
                # Threshold-based so it works even if a read returns fewer
                # bytes than requested.
                if total and downloaded >= next_log:
                    log.info("  %.0f / %.0f Mo  (%.0f%%)",
                             downloaded / 1_048_576, total / 1_048_576,
                             100 * downloaded / total)
                    next_log += log_step

        # A chunked read loop simply ends on a dropped connection; compare
        # against the announced size so a truncated archive is not cached.
        if total and downloaded != total:
            raise OSError(
                f"Incomplete download: got {downloaded} of {total} bytes from {_SOURCE_URL}"
            )

        os.replace(tmp, _GZ_PATH)
    finally:
        # After a successful os.replace the temp file no longer exists; on
        # any failure this removes the partial download.
        if os.path.exists(tmp):
            os.remove(tmp)

    log.info("Download complete → %s (%.0f Mo)", _GZ_PATH, os.path.getsize(_GZ_PATH) / 1_048_576)


# ---------------------------------------------------------------------------
# Streaming GeoJSON parser
# ---------------------------------------------------------------------------

def _iter_features(gz_path: str):
    """
    Yield parsed feature dicts from a gzipped GeoJSON FeatureCollection
    without loading the entire file into memory.

    Uses json.JSONDecoder.raw_decode() to parse one feature at a time from
    an accumulating text buffer, so peak RAM ≈ size of one feature (<10 KB).
    """
    decoder = json.JSONDecoder()

    with gzip.open(gz_path, "rt", encoding="utf-8") as f:
        # Skip header until we reach the features array
        buf = ""
        while '"features":' not in buf:
            chunk = f.read(8192)
            if not chunk:
                return
            buf += chunk

        # Advance past '"features":['
        idx = buf.index('"features":') + len('"features":')
        buf = buf[idx:].lstrip()
        if buf.startswith("["):
            buf = buf[1:]

        # Stream features one by one
        read_chunk = lambda: f.read(65536)  # 64 KB per read
        while True:
            buf = buf.lstrip(" \t\n\r,")

            if not buf:
                chunk = read_chunk()
                if not chunk:
                    return
                buf += chunk
                continue

            if buf[0] == "]":
                return  # end of features array

            if buf[0] == "{":
                # Attempt to parse one feature
                try:
                    obj, end = decoder.raw_decode(buf)
                    yield obj
                    buf = buf[end:]
                except json.JSONDecodeError:
                    # Incomplete JSON — need more data
                    chunk = read_chunk()
                    if not chunk:
                        return
                    buf += chunk
            else:
                # Unexpected char — read more
                chunk = read_chunk()
                if not chunk:
                    return
                buf += chunk


# ---------------------------------------------------------------------------
# Centroid computation
# ---------------------------------------------------------------------------

def _centroid(geometry: dict) -> tuple[float, float] | None:
    """
    Return (lat, lon) centroid of a GeoJSON geometry.
    For Polygon: averages exterior ring coordinates.
    For MultiPolygon: uses the ring with the most vertices.
    Returns None if geometry is invalid.
    """
    gtype  = geometry.get("type", "")
    coords = geometry.get("coordinates")
    if not coords:
        return None

    if gtype == "Polygon":
        ring = coords[0]
    elif gtype == "MultiPolygon":
        ring = max(coords, key=lambda p: len(p[0]))[0]
    else:
        return None

    if not ring:
        return None

    lons = [pt[0] for pt in ring]
    lats = [pt[1] for pt in ring]
    return round(sum(lats) / len(lats), 7), round(sum(lons) / len(lons), 7)


# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------

def build_centroids(force_dl: bool = False, force_build: bool = False) -> None:
    """
    Build the parcel-ID → [lat, lon] JSON cache at ``_OUTPUT_PATH``.

    Steps: ensure the gzipped GeoJSON is available (``_download``), stream
    its features (``_iter_features``), compute one centroid per parcel
    (``_centroid``) and dump the resulting dict as compact JSON.

    The output is written to ``_OUTPUT_PATH + ".tmp"`` and renamed into
    place, so an interrupted run cannot leave a truncated cache behind.

    Args:
        force_dl: passed to ``_download`` as ``force``.
        force_build: rebuild even when ``_OUTPUT_PATH`` already exists.
    """
    if os.path.exists(_OUTPUT_PATH) and not force_build:
        try:
            with open(_OUTPUT_PATH, encoding="utf-8") as f:
                existing = json.load(f)
        except ValueError:
            # json.JSONDecodeError and UnicodeDecodeError are both
            # ValueError subclasses: the cache is unusable, so rebuild it.
            log.warning("Centroid cache %s is unreadable — rebuilding", _OUTPUT_PATH)
        else:
            log.info(
                "Centroid cache already present (%d parcels) → %s",
                len(existing), _OUTPUT_PATH,
            )
            log.info("Use --force-build to rebuild.")
            return

    _download(force=force_dl)

    log.info("Streaming and parsing parcelles GeoJSON…")
    centroids: dict[str, list[float]] = {}
    count      = 0
    no_geom    = 0

    for count, feat in enumerate(_iter_features(_GZ_PATH), start=1):
        # Checked before any `continue` so a progress line is emitted every
        # 100 000 features even when that feature itself is skipped.
        if count % 100_000 == 0:
            log.info("  %d features parsed, %d Nord centroids built", count, len(centroids))

        parcel_id = feat.get("id") or (feat.get("properties") or {}).get("id", "")
        if not isinstance(parcel_id, str) or not parcel_id.startswith("59"):
            continue  # safety: only Nord parcels
        geometry = feat.get("geometry") or {}
        c = _centroid(geometry)
        if c is None:
            no_geom += 1
            continue
        centroids[parcel_id] = list(c)  # [lat, lon]

    log.info(
        "Parsing complete: %d features total, %d Nord parcel centroids, %d no-geometry",
        count, len(centroids), no_geom,
    )
    if not centroids:
        log.warning("No centroid was built — the written cache will be empty")

    log.info("Writing centroid cache → %s", _OUTPUT_PATH)
    tmp_path = _OUTPUT_PATH + ".tmp"
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(centroids, f, separators=(",", ":"))
        os.replace(tmp_path, _OUTPUT_PATH)
    finally:
        # Only still present if the dump or the rename failed.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

    size_mb = os.path.getsize(_OUTPUT_PATH) / 1_048_576
    log.info("Done: %d parcels, %.1f Mo", len(centroids), size_mb)


if __name__ == "__main__":
    import argparse

    # Command-line interface: two independent boolean switches.
    cli = argparse.ArgumentParser(
        description="Build Nord (59) parcel centroid cache from Etalab cadastre"
    )
    for flag, help_text in (
        ("--force-dl", "Re-download the GeoJSON even if already cached"),
        ("--force-build", "Rebuild centroid JSON even if already present"),
    ):
        cli.add_argument(flag, action="store_true", help=help_text)
    options = cli.parse_args()

    rule = "=" * 60

    # Banner summarising where data comes from and where it goes.
    print(
        "\n".join(
            (
                "",
                rule,
                "  Cadastre Nord (59) — parcel centroid builder",
                rule,
                f"  Source  : {_SOURCE_URL}",
                f"  GZ cache: {_GZ_PATH}",
                f"  Output  : {_OUTPUT_PATH}",
                "",
            )
        )
    )

    build_centroids(force_dl=options.force_dl, force_build=options.force_build)

    # Footer pointing at the follow-up script.
    print(
        "\n".join(
            (
                "",
                "  Next step:",
                "    python build_dvf_historique.py --force",
                rule,
            )
        )
    )
