Add monolithic archive build mode with clean+validate

2026-02-11 14:36:00 +01:00
parent 3422979ebf
commit 0c07d169b3
3 changed files with 429 additions and 20 deletions
--- a/README.md
+++ b/README.md
@@ -40,7 +40,7 @@ This repository converts DGM1 elevation tiles into Unity-ready 16-bit PNG height
 ### How the export works
 - Heightmaps: the pipeline builds `work/dgm.vrt` from all `raw/dgm1/*.tif`, computes a global min/max once (legacy fallback), and warps each tile footprint to `heightmap.out_res` with `srcNodata=-9999`. Per-tile min/max are computed from the warped tile and used to scale PNGs to `[0, 65535]` by default (`heightmap.use_tile_minmax=false` keeps global scaling). `export_unity/tile_index.csv` records `global_min/global_max`, `tile_min/tile_max`, and `tile_key = f"{floor((xmin + overlap_x) / tile_size_x)}_{floor((ymin + overlap_y) / tile_size_y)}"` (defaults: `tile_size_x=1000.0`, `tile_size_y=1000.0`, `overlap_x=0.5`, `overlap_y=0.5` in `[tile_key]`).
 - Orthophotos: `work/dop.vrt` is built from `raw/dop20/jp2/*.jp2`; the manifest drives the cropping bounds. JPEG tiles are written to `export_unity/ortho_jpg/` with matching `.jgw` worldfiles. If the manifest is missing, the orthophoto export aborts—run the heightmap export first or use `--export all`.
- Archives: `--build-from-archive` expands every `*.zip` under `archive/*` into the matching `raw/*` directories and copies `archive/dop20/filelist.txt` next to `raw/dop20/` for the downloader.
+- Archives: `--build-from-archive` first extracts the monolithic ZIP (`archive/archive_raw.zip` when present, or the path given via `--archive-raw-zip`), then expands every `*.zip` in the per-dataset folders under `archive/` into the matching `raw/*` directories; files from dataset zips overwrite those from the monolithic ZIP. It also copies `archive/dop20/filelist.txt` next to `raw/dop20/` for the downloader.
 - Cleanup: temporary `_tmp.tif` and GDAL aux XML files under `work/` and `raw/dgm1/` are removed at the end of the heightmap export; avoid storing non-GDAL metadata in those folders.

 ### Key Commands
@@ -48,7 +48,8 @@ This repository converts DGM1 elevation tiles into Unity-ready 16-bit PNG height
 - Run export pipeline: `uv run python geodata_to_unity.py --export all`
 - Inspect an output tile: `gdalinfo export_unity/height_png16/<tile>.png | head`
 - Override config paths: use `--config <path>`, `--raw-dgm1-path <dir>`, `--raw-dop20-path <dir>`.
- Build raws from archives: `uv run python geodata_to_unity.py --build-from-archive --export all` (unzips `archive/*`; dop20 filelist stays in `archive/dop20/` for the downloader).
+- Build raws from archives: `uv run python geodata_to_unity.py --build-from-archive --export all` (uses `archive/archive_raw.zip` when present, then overlays `archive/*/*.zip`).
+- Deterministic submission rebuild: `uv run python geodata_to_unity.py --build-from-archive --clean-raw --validate --export all --force-vrt`.
 - Rebuild VRTs after moving data: add `--force-vrt`.

 ### Workflow Notes
@@ -58,6 +59,8 @@ This repository converts DGM1 elevation tiles into Unity-ready 16-bit PNG height
 - Keep file names stable to avoid churn in Unity scenes; re-exports overwrite in place.
 - Large raw datasets are intentionally excluded from version control—document download sources or scripts instead of committing data.
 - Additional inputs: download helper lives in `scripts/dlscript_dop20.sh` and pulls JP2/J2W/XML orthophotos listed in `archive/dop20/filelist.txt` (one URL per line); `archive/` can hold zipped 3D building tiles for future use.
+- `--clean-raw` only removes managed ingestion dirs (`raw/dgm1`, `raw/dop20/jp2`, `raw/dop20/j2w`, `raw/dop20/meta`, `raw/citygml/lod1`, `raw/citygml/lod2`) and intentionally keeps custom masks.
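+- `--build-from-archive` writes `work/archive_materialize_report.json` on every run; adding `--validate` records validation results in that report and fails the run when core datasets are missing (`dgm1 tif`, `dop20 jp2`, `citygml lod2`), while optional sidecar gaps are reported as warnings.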
 - Handoff to Unity: copy/sync `export_unity/height_png16/` and `export_unity/tile_index.csv` into `DTrierFlood/Assets/GeoData/` before running the Unity-side importer. Keep `heightmap.out_res` aligned with the importer’s expected resolution (currently 1025).

 ### Orthophotos (textures)
--- a/geodata_pipeline/setup_helpers.py
+++ b/geodata_pipeline/setup_helpers.py
@@ -1,9 +1,11 @@
 from __future__ import annotations

+import json
 import glob
 import os
 import shutil
 import zipfile
+from datetime import datetime, timezone
 from typing import Iterable

 from .config import Config, ensure_default_config
@@ -31,33 +33,404 @@ def _paths_from_config(cfg: Config) -> Iterable[str]:
    ]


-def materialize_archives(cfg: Config) -> None:
-    """Best-effort expansion of archive zips into raw inputs."""
+def materialize_archives(
+    cfg: Config,
+    *,
+    clean_raw: bool = False,
+    validate: bool = False,
+    archive_raw_zip: str | None = None,
+) -> int:
+    """Expand archive zips into raw inputs with optional cleanup and validation."""
    ensure_directories(cfg)
-    _unpack_all(cfg.archives.dgm1_dir, cfg.raw.dgm1_dir)
-    _unpack_all(cfg.archives.citygml_lod1_dir, cfg.raw.citygml_lod1_dir)
-    _unpack_all(cfg.archives.citygml_lod2_dir, cfg.raw.citygml_lod2_dir)
-    _unpack_all(cfg.archives.dop20_dir, cfg.raw.dop20_dir)
+
+    report = {
+        "schema_version": 1,
+        "timestamp_utc": datetime.now(timezone.utc).isoformat(),
+        "mode": {
+            "clean_raw": bool(clean_raw),
+            "validate": bool(validate),
+        },
+        "paths": {
+            "archive_raw_zip_default": os.path.join("archive", "archive_raw.zip"),
+            "archive_raw_zip_requested": archive_raw_zip or "",
+            "raw_dgm1_dir": cfg.raw.dgm1_dir,
+            "raw_dop20_jp2_dir": cfg.raw.dop20_dir,
+            "raw_citygml_lod1_dir": cfg.raw.citygml_lod1_dir,
+            "raw_citygml_lod2_dir": cfg.raw.citygml_lod2_dir,
+        },
+        "clean_raw": {
+            "removed_dirs": [],
+            "errors": [],
+        },
+        "monolithic_zip": {
+            "used": False,
+            "path": "",
+            "entries_total": 0,
+            "entries_extracted": 0,
+            "entries_skipped": 0,
+            "files_overwritten": 0,
+            "unsafe_entries": 0,
+            "errors": [],
+        },
+        "per_dataset": {},
+        "filelist_copy": {
+            "source": os.path.join(cfg.archives.dop20_dir, "filelist.txt"),
+            "destination": os.path.join(os.path.dirname(cfg.raw.dop20_dir), "filelist.txt"),
+            "copied": False,
+            "missing_source": False,
+            "error": "",
+        },
+        "validation": {
+            "enabled": bool(validate),
+            "passed": True,
+            "errors": [],
+            "warnings": [],
+            "counts": {},
+        },
+    }
+
+    if clean_raw:
+        _clean_managed_raw_dirs(cfg, report)
+
+    monolithic_zip = archive_raw_zip or os.path.join("archive", "archive_raw.zip")
+    report["monolithic_zip"]["path"] = monolithic_zip
+
+    if archive_raw_zip and not os.path.isfile(monolithic_zip):
+        msg = f"[archive] Requested monolithic ZIP not found: {monolithic_zip}"
+        print(msg)
+        report["monolithic_zip"]["errors"].append(msg)
+        _write_materialize_report(cfg, report)
+        return 1
+
+    if os.path.isfile(monolithic_zip):
+        report["monolithic_zip"]["used"] = True
+        if not _extract_monolithic_zip(monolithic_zip, cfg, report):
+            _write_materialize_report(cfg, report)
+            return 1
+    else:
+        print(f"[archive] Monolithic ZIP not found (optional): {monolithic_zip}")
+
+    unpack_ok = True
+    unpack_ok &= _unpack_all(
+        cfg.archives.dgm1_dir,
+        cfg.raw.dgm1_dir,
+        report_node=report["per_dataset"].setdefault("dgm1", {}),
+    )
+    unpack_ok &= _unpack_all(
+        cfg.archives.citygml_lod1_dir,
+        cfg.raw.citygml_lod1_dir,
+        report_node=report["per_dataset"].setdefault("citygml_lod1", {}),
+    )
+    unpack_ok &= _unpack_all(
+        cfg.archives.citygml_lod2_dir,
+        cfg.raw.citygml_lod2_dir,
+        report_node=report["per_dataset"].setdefault("citygml_lod2", {}),
+    )
+    unpack_ok &= _unpack_all(
+        cfg.archives.dop20_dir,
+        cfg.raw.dop20_dir,
+        report_node=report["per_dataset"].setdefault("dop20_jp2", {}),
+    )
+    if not unpack_ok:
+        _write_materialize_report(cfg, report)
+        return 1
+
    _copy_filelist(
-        os.path.join(cfg.archives.dop20_dir, "filelist.txt"),
-        os.path.join(os.path.dirname(cfg.raw.dop20_dir), "filelist.txt"),
+        report["filelist_copy"]["source"],
+        report["filelist_copy"]["destination"],
+        report["filelist_copy"],
    )

+    if validate:
+        valid = _validate_materialized_raw(cfg, report)
+        _write_materialize_report(cfg, report)
+        if not valid:
+            return 1
+    else:
+        _write_materialize_report(cfg, report)

-def _unpack_all(archive_dir: str, dest_dir: str) -> None:
+    print(
+        "[archive] Materialization complete: "
+        f"monolithic_used={report['monolithic_zip']['used']}, "
+        f"report=work/archive_materialize_report.json"
+    )
+    return 0
+
+
+def _clean_managed_raw_dirs(cfg: Config, report: dict) -> None:
+    dop20_root = os.path.dirname(cfg.raw.dop20_dir)
+    managed_dirs = [
+        cfg.raw.dgm1_dir,
+        cfg.raw.citygml_lod1_dir,
+        cfg.raw.citygml_lod2_dir,
+        cfg.raw.dop20_dir,
+        os.path.join(dop20_root, "j2w"),
+        os.path.join(dop20_root, "meta"),
+    ]
+
+    for path in managed_dirs:
+        if os.path.isdir(path):
+            try:
+                shutil.rmtree(path)
+                report["clean_raw"]["removed_dirs"].append(path)
+                print(f"[archive] Removed raw dir: {path}")
+            except OSError as exc:
+                msg = f"[archive] Failed to remove {path}: {exc}"
+                report["clean_raw"]["errors"].append(msg)
+                print(msg)
+
+        os.makedirs(path, exist_ok=True)
+
+
+def _extract_monolithic_zip(zip_path: str, cfg: Config, report: dict) -> bool:
+    targets = _monolithic_targets(cfg)
+    node = report["monolithic_zip"]
+
+    try:
+        with zipfile.ZipFile(zip_path, "r") as zf:
+            entries = zf.infolist()
+            node["entries_total"] = len(entries)
+            for info in entries:
+                if info.is_dir():
+                    continue
+                parts = _safe_zip_parts(info.filename)
+                if parts is None:
+                    node["unsafe_entries"] += 1
+                    node["entries_skipped"] += 1
+                    continue
+
+                resolved = _resolve_monolithic_member(parts, targets)
+                if resolved is None:
+                    node["entries_skipped"] += 1
+                    continue
+
+                _, out_path = resolved
+                if os.path.exists(out_path):
+                    node["files_overwritten"] += 1
+                _extract_member(zf, info, out_path)
+                node["entries_extracted"] += 1
+    except (OSError, zipfile.BadZipFile, zipfile.LargeZipFile) as exc:
+        msg = f"[archive] Could not unpack monolithic ZIP {zip_path}: {exc}"
+        node["errors"].append(msg)
+        print(msg)
+        return False
+
+    print(
+        f"[archive] Monolithic ZIP extracted: {zip_path} "
+        f"(extracted={node['entries_extracted']}, skipped={node['entries_skipped']})"
+    )
+    return True
+
+
+def _monolithic_targets(cfg: Config) -> dict[str, dict]:
+    dop20_root = os.path.dirname(cfg.raw.dop20_dir)
+    return {
+        "dgm1": {
+            "dest": cfg.raw.dgm1_dir,
+            "markers": [("raw", "dgm1"), ("dgm1",)],
+        },
+        "dop20_jp2": {
+            "dest": cfg.raw.dop20_dir,
+            "markers": [("raw", "dop20", "jp2"), ("dop20", "jp2")],
+        },
+        "dop20_j2w": {
+            "dest": os.path.join(dop20_root, "j2w"),
+            "markers": [("raw", "dop20", "j2w"), ("dop20", "j2w")],
+        },
+        "dop20_meta": {
+            "dest": os.path.join(dop20_root, "meta"),
+            "markers": [("raw", "dop20", "meta"), ("dop20", "meta")],
+        },
+        "citygml_lod1": {
+            "dest": cfg.raw.citygml_lod1_dir,
+            "markers": [("raw", "citygml", "lod1"), ("citygml", "lod1")],
+        },
+        "citygml_lod2": {
+            "dest": cfg.raw.citygml_lod2_dir,
+            "markers": [("raw", "citygml", "lod2"), ("citygml", "lod2")],
+        },
+        "dop20_filelist": {
+            "dest": os.path.join(dop20_root, "filelist.txt"),
+            "markers": [("raw", "dop20"), ("dop20",)],
+        },
+    }
+
+
+def _resolve_monolithic_member(parts: list[str], targets: dict[str, dict]) -> tuple[str, str] | None:
+    lower_parts = [p.lower() for p in parts]
+    # Prefer more specific markers first.
+    keys = (
+        "dop20_jp2",
+        "dop20_j2w",
+        "dop20_meta",
+        "citygml_lod1",
+        "citygml_lod2",
+        "dgm1",
+        "dop20_filelist",
+    )
+    for key in keys:
+        target = targets[key]
+        for marker in target["markers"]:
+            idx = _find_marker(lower_parts, marker)
+            if idx is None:
+                continue
+            tail = parts[idx + len(marker) :]
+            if key == "dop20_filelist":
+                if len(tail) == 1 and tail[0].lower() == "filelist.txt":
+                    return key, target["dest"]
+                continue
+            if not tail:
+                continue
+            return key, os.path.join(target["dest"], *tail)
+    return None
+
+
+def _find_marker(parts: list[str], marker: tuple[str, ...]) -> int | None:
+    width = len(marker)
+    if width == 0 or len(parts) < width:
+        return None
+    for idx in range(0, len(parts) - width + 1):
+        if tuple(parts[idx : idx + width]) == marker:
+            return idx
+    return None
+
+
+def _safe_zip_parts(member_name: str) -> list[str] | None:
+    # Normalize to POSIX separators to make archive parsing deterministic.
+    normalized = member_name.replace("\\", "/")
+    normalized = normalized.strip("/")
+    if not normalized:
+        return None
+
+    parts = []
+    for part in normalized.split("/"):
+        token = part.strip()
+        if token in ("", "."):
+            continue
+        if token == "..":
+            return None
+        parts.append(token)
+    if not parts:
+        return None
+    return parts
+
+
+def _extract_member(zf: zipfile.ZipFile, info: zipfile.ZipInfo, out_path: str) -> None:
+    os.makedirs(os.path.dirname(out_path), exist_ok=True)
+    with zf.open(info, "r") as src, open(out_path, "wb") as dst:
+        shutil.copyfileobj(src, dst)
+
+
+def _unpack_all(archive_dir: str, dest_dir: str, *, report_node: dict) -> bool:
    os.makedirs(dest_dir, exist_ok=True)
-    for zpath in glob.glob(os.path.join(archive_dir, "*.zip")):
+    zips = sorted(glob.glob(os.path.join(archive_dir, "*.zip")))
+    report_node["archive_dir"] = archive_dir
+    report_node["dest_dir"] = dest_dir
+    report_node["zip_count"] = len(zips)
+    report_node["files_extracted"] = 0
+    report_node["files_overwritten"] = 0
+    report_node["unsafe_entries"] = 0
+    report_node["errors"] = []
+
+    for zpath in zips:
        print(f"Unpacking {zpath} -> {dest_dir}")
-        with zipfile.ZipFile(zpath, "r") as zf:
-            zf.extractall(dest_dir)
+        try:
+            with zipfile.ZipFile(zpath, "r") as zf:
+                for info in zf.infolist():
+                    if info.is_dir():
+                        continue
+                    parts = _safe_zip_parts(info.filename)
+                    if parts is None:
+                        report_node["unsafe_entries"] += 1
+                        continue
+                    out_path = os.path.join(dest_dir, *parts)
+                    if os.path.exists(out_path):
+                        report_node["files_overwritten"] += 1
+                    _extract_member(zf, info, out_path)
+                    report_node["files_extracted"] += 1
+        except (OSError, zipfile.BadZipFile, zipfile.LargeZipFile) as exc:
+            msg = f"[archive] Could not unpack {zpath}: {exc}"
+            report_node["errors"].append(msg)
+            print(msg)
+            return False
+    return True


-def _copy_filelist(src: str, dest: str) -> None:
+def _copy_filelist(src: str, dest: str, report_node: dict) -> None:
    if not os.path.exists(src):
+        report_node["missing_source"] = True
+        print(f"[archive] Optional dop20 filelist missing: {src}")
        return
    os.makedirs(os.path.dirname(dest), exist_ok=True)
-    shutil.copy2(src, dest)
-    print(f"Copied filelist: {src} -> {dest}")
+    try:
+        shutil.copy2(src, dest)
+        report_node["copied"] = True
+        print(f"Copied filelist: {src} -> {dest}")
+    except OSError as exc:
+        report_node["error"] = str(exc)
+        print(f"[archive] Failed to copy filelist {src} -> {dest}: {exc}")
+
+
+def _count_ext(root: str, suffixes: tuple[str, ...]) -> int:
+    if not os.path.isdir(root):
+        return 0
+    total = 0
+    for cur_root, _, files in os.walk(root):
+        for name in files:
+            if name.lower().endswith(suffixes):
+                total += 1
+    return total
+
+
+def _validate_materialized_raw(cfg: Config, report: dict) -> bool:
+    node = report["validation"]
+    dop20_root = os.path.dirname(cfg.raw.dop20_dir)
+
+    counts = {
+        "dgm1_tif": _count_ext(cfg.raw.dgm1_dir, (".tif", ".tiff")),
+        "dop20_jp2": _count_ext(cfg.raw.dop20_dir, (".jp2",)),
+        "citygml_lod2": _count_ext(cfg.raw.citygml_lod2_dir, (".gml", ".xml")),
+        "citygml_lod1": _count_ext(cfg.raw.citygml_lod1_dir, (".gml", ".xml")),
+        "dop20_j2w": _count_ext(os.path.join(dop20_root, "j2w"), (".j2w", ".wld")),
+        "dop20_meta": _count_ext(os.path.join(dop20_root, "meta"), (".xml",)),
+    }
+    node["counts"] = counts
+
+    if counts["dgm1_tif"] == 0:
+        node["errors"].append(f"Missing required DGM1 TIFFs in {cfg.raw.dgm1_dir}")
+    if counts["dop20_jp2"] == 0:
+        node["errors"].append(f"Missing required DOP20 JP2s in {cfg.raw.dop20_dir}")
+    if counts["citygml_lod2"] == 0:
+        node["errors"].append(f"Missing required CityGML LoD2 files in {cfg.raw.citygml_lod2_dir}")
+
+    if counts["citygml_lod1"] == 0:
+        node["warnings"].append(f"No CityGML LoD1 files found in {cfg.raw.citygml_lod1_dir}")
+    if counts["dop20_j2w"] == 0:
+        node["warnings"].append(f"No DOP20 worldfiles found in {os.path.join(dop20_root, 'j2w')}")
+    if counts["dop20_meta"] == 0:
+        node["warnings"].append(f"No DOP20 metadata XML files found in {os.path.join(dop20_root, 'meta')}")
+    if not os.path.exists(os.path.join(dop20_root, "filelist.txt")):
+        node["warnings"].append(f"No dop20 filelist found at {os.path.join(dop20_root, 'filelist.txt')}")
+
+    for msg in node["warnings"]:
+        print(f"[archive][validate] Warning: {msg}")
+    for msg in node["errors"]:
+        print(f"[archive][validate] Error: {msg}")
+
+    node["passed"] = len(node["errors"]) == 0
+    if node["passed"]:
+        print("[archive][validate] Validation passed.")
+    else:
+        print("[archive][validate] Validation failed.")
+    return bool(node["passed"])
+
+
+def _write_materialize_report(cfg: Config, report: dict) -> None:
+    out_path = os.path.join(cfg.work.work_dir, "archive_materialize_report.json")
+    os.makedirs(os.path.dirname(out_path), exist_ok=True)
+    with open(out_path, "w", encoding="utf-8") as handle:
+        json.dump(report, handle, indent=2)
+    print(f"[archive] Wrote report: {out_path}")


 __all__ = ["ensure_directories", "materialize_archives", "ensure_default_config"]
--- a/geodata_to_unity.py
+++ b/geodata_to_unity.py
@@ -51,7 +51,22 @@ def parse_args(argv: Iterable[str] | None = None) -> argparse.Namespace:
    parser.add_argument(
        "--build-from-archive",
        action="store_true",
-        help="Populate raw inputs from archives (unzips zips, leaves dop20 filelist in archive).",
+        help="Populate raw inputs from archives (supports monolithic archive/archive_raw.zip + per-dataset zips).",
+    )
+    parser.add_argument(
+        "--clean-raw",
+        action="store_true",
+        help="With --build-from-archive: remove managed raw input dirs before unpacking archives.",
+    )
+    parser.add_argument(
+        "--validate",
+        action="store_true",
+        help="With --build-from-archive: validate materialized raw inputs and fail on missing core datasets.",
+    )
+    parser.add_argument(
+        "--archive-raw-zip",
+        default=None,
+        help="Optional path to a monolithic raw ZIP (default when omitted: archive/archive_raw.zip if present).",
    )
    parser.add_argument(
        "--setup",
@@ -170,6 +185,10 @@ def _download_requests_lpolpg(download_config: str, requested: list[str] | None)
 def main(argv: Iterable[str] | None = None) -> int:
    args = parse_args(argv)
    cfg = load_config(args)
+
+    if (args.clean_raw or args.validate or args.archive_raw_zip) and not args.build_from_archive:
+        print("[archive] --clean-raw/--validate/--archive-raw-zip are ignored without --build-from-archive.")
+
    target_export = None
    action_flags = args.download or args.split_lpolpg or args.erode_rivers
    if args.export is not None:
@@ -181,12 +200,26 @@ def main(argv: Iterable[str] | None = None) -> int:
        ensure_directories(cfg)
        print(f"Directories ensured. Config at {args.config}.")
        if args.build_from_archive:
-            materialize_archives(cfg)
+            archive_exit = materialize_archives(
+                cfg,
+                clean_raw=args.clean_raw,
+                validate=args.validate,
+                archive_raw_zip=args.archive_raw_zip,
+            )
+            if archive_exit != 0:
+                return archive_exit
        if args.export is None and not args.download:
            return 0

    if args.build_from_archive and not args.setup:
-        materialize_archives(cfg)
+        archive_exit = materialize_archives(
+            cfg,
+            clean_raw=args.clean_raw,
+            validate=args.validate,
+            archive_raw_zip=args.archive_raw_zip,
+        )
+        if archive_exit != 0:
+            return archive_exit

    if args.download:
        datasets = (