"""
build_securite_lille.py
=======================
One-time preprocessing: filters the national MdI security data file down to
Lille métropole communes and writes a lighter Excel subset used by the rest of
the pipeline (build_securite_nord.py → build_iris_data.py).

Input  (NOT committed — large national file, ~30 MB):
  data/2026/securite-info-complements-data.gouv-2025-geographie2025-produit-le2026-02-03.xlsx
  Download: https://www.data.gouv.fr/fr/datasets/statistiques-et-indicateurs-de-la-securite-au-quotidien/

Output (committed — lightweight subset):
  data/2026/securite_lille_metro.xlsx

The output file is already committed to the repository.
Re-run only if the source file changes (e.g. new annual release).

Usage:
    python build_securite_lille.py
"""

from __future__ import annotations

import os
import sys

import pandas as pd

# All paths are anchored on the directory containing this script, so the
# result does not depend on the current working directory.
_ROOT = os.path.dirname(os.path.abspath(__file__))
_DATA_DIR = os.path.join(_ROOT, "data", "2026")

# National workbook that is read (must be downloaded manually).
_SRC = os.path.join(
    _DATA_DIR,
    "securite-info-complements-data.gouv-2025-geographie2025-produit-le2026-02-03.xlsx",
)
# Filtered workbook that is written.
_DST = os.path.join(_DATA_DIR, "securite_lille_metro.xlsx")

# Commune names looked up verbatim in the LIBGEO column (exact match,
# accents and hyphens included).
TARGET_CITIES = [
    "Wasquehal",
    "Marcq-en-Barœul",
    "Lille",
    "Croix",
    "Roubaix",
    "Tourcoing",
    "Lezennes",
    "Mons-en-Barœul",
    "Hem",
    "Saint-André-lez-Lille",
    "Mouvaux",
    "Lambersart",
    "Marquette-lez-Lille",
    "Wambrechies",
    "Linselles",
    "Quesnoy-sur-Deûle",
]

# Communes selected by CODGEO instead of by name, because their label
# (apostrophes / special characters) is not matched by name.
EXTRA_CODGEO = [
    "59009",  # Villeneuve-d'Ascq
]


def main() -> None:
    """Write the Lille-métropole subset of the national security workbook.

    Reads three sheets from ``_SRC``. From "zonages supracommunaux" it keeps
    the rows whose ``DEP`` is "59" and whose ``LIBGEO`` is listed in
    ``TARGET_CITIES`` or whose ``CODGEO`` is listed in ``EXTRA_CODGEO``
    (one row per ``CODGEO``). The filtered sheet and the two other sheets,
    unchanged, are written to ``_DST``.

    Prints the matched communes and any requested name or code that was not
    found. Exits with status 1 if the source workbook does not exist.
    """
    if not os.path.exists(_SRC):
        print(f"[!] Source file not found: {_SRC}")
        print("    Download from: https://www.data.gouv.fr/fr/datasets/"
              "statistiques-et-indicateurs-de-la-securite-au-quotidien/")
        sys.exit(1)

    # Use the workbook as a context manager so its file handle is released
    # as soon as the three sheets are loaded.
    with pd.ExcelFile(_SRC, engine="openpyxl") as xl:
        df_zones = xl.parse("zonages supracommunaux")
        df_lib   = xl.parse("libelles supracommunaux")
        df_codes = xl.parse("codes-libelles")

    # Compare codes as strings so the filter works whatever dtype Excel gave.
    dep59    = df_zones[df_zones["DEP"].astype(str) == "59"]
    by_name  = dep59[dep59["LIBGEO"].isin(TARGET_CITIES)]
    by_code  = dep59[dep59["CODGEO"].astype(str).isin(EXTRA_CODGEO)]
    # A commune selected both by name and by code must appear only once.
    filtered = pd.concat([by_name, by_code]).drop_duplicates(subset="CODGEO").copy()

    print(f"Matched {len(filtered)} / {len(TARGET_CITIES) + len(EXTRA_CODGEO)} cities:")
    for _, row in filtered.sort_values("LIBGEO").iterrows():
        print(f"  {row['CODGEO']}  {row['LIBGEO']}  [{row['zone_competence']}]")

    # Sorted so the diagnostic is stable from one run to the next.
    missing_names = sorted(set(TARGET_CITIES) - set(filtered["LIBGEO"]))
    if missing_names:
        print(f"\n[!] Not matched by name: {', '.join(missing_names)}")

    # Codes requested explicitly can disappear too (e.g. geography changes);
    # report them instead of silently writing an incomplete subset.
    missing_codes = sorted(set(EXTRA_CODGEO) - set(filtered["CODGEO"].astype(str)))
    if missing_codes:
        print(f"\n[!] Not matched by code: {', '.join(missing_codes)}")

    os.makedirs(os.path.dirname(_DST), exist_ok=True)
    with pd.ExcelWriter(_DST, engine="openpyxl") as writer:
        filtered.to_excel(writer, sheet_name="zonages supracommunaux", index=False)
        df_lib.to_excel(writer,   sheet_name="libelles supracommunaux", index=False)
        df_codes.to_excel(writer, sheet_name="codes-libelles",          index=False)

    print(f"\nSaved -> {_DST}")


# Run the preprocessing only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
