diff --git a/README.md b/README.md
index 99a8291..3f1febe 100644
--- a/README.md
+++ b/README.md
@@ -40,7 +40,7 @@ This repository converts DGM1 elevation tiles into Unity-ready 16-bit PNG height
 ### How the export works
 - Heightmaps: the pipeline builds `work/dgm.vrt` from all `raw/dgm1/*.tif`, computes a global min/max once (legacy fallback), and warps each tile footprint to `heightmap.out_res` with `srcNodata=-9999`. Per-tile min/max are computed from the warped tile and used to scale PNGs to `[0, 65535]` by default (`heightmap.use_tile_minmax=false` keeps global scaling). `export_unity/tile_index.csv` records `global_min/global_max`, `tile_min/tile_max`, and `tile_key = f"{floor((xmin + overlap_x) / tile_size_x)}_{floor((ymin + overlap_y) / tile_size_y)}"` (defaults: `tile_size_x=1000.0`, `tile_size_y=1000.0`, `overlap_x=0.5`, `overlap_y=0.5` in `[tile_key]`).
 - Orthophotos: `work/dop.vrt` is built from `raw/dop20/jp2/*.jp2`; the manifest drives the cropping bounds. JPEG tiles are written to `export_unity/ortho_jpg/` with matching `.jgw` worldfiles. If the manifest is missing, the orthophoto export aborts—run the heightmap export first or use `--export all`.
-- Archives: `--build-from-archive` expands every `*.zip` under `archive/*` into the matching `raw/*` directories and copies `archive/dop20/filelist.txt` next to `raw/dop20/` for the downloader.
+- Archives: `--build-from-archive` supports a monolithic ZIP (`archive/archive_raw.zip`) and expands every `*.zip` under `archive/*` into the matching `raw/*` directories; dataset zips overlay the monolithic data. It also copies `archive/dop20/filelist.txt` next to `raw/dop20/` for the downloader.
 - Cleanup: temporary `_tmp.tif` and GDAL aux XML files under `work/` and `raw/dgm1/` are removed at the end of the heightmap export; avoid storing non-GDAL metadata in those folders.
 
 ### Key Commands
@@ -48,7 +48,8 @@ This repository converts DGM1 elevation tiles into Unity-ready 16-bit PNG height
 - Run export pipeline: `uv run python geodata_to_unity.py --export all`
 - Inspect an output tile: `gdalinfo export_unity/height_png16/.png | head`
 - Override config paths: use `--config `, `--raw-dgm1-path `, `--raw-dop20-path `.
-- Build raws from archives: `uv run python geodata_to_unity.py --build-from-archive --export all` (unzips `archive/*`; dop20 filelist stays in `archive/dop20/` for the downloader).
+- Build raws from archives: `uv run python geodata_to_unity.py --build-from-archive --export all` (uses `archive/archive_raw.zip` when present, then overlays `archive/*/*.zip`).
+- Deterministic submission rebuild: `uv run python geodata_to_unity.py --build-from-archive --clean-raw --validate --export all --force-vrt`.
 - Rebuild VRTs after moving data: add `--force-vrt`.
 
 ### Workflow Notes
@@ -58,6 +59,8 @@ This repository converts DGM1 elevation tiles into Unity-ready 16-bit PNG height
 - Keep file names stable to avoid churn in Unity scenes; re-exports overwrite in place.
 - Large raw datasets are intentionally excluded from version control—document download sources or scripts instead of committing data.
 - Additional inputs: download helper lives in `scripts/dlscript_dop20.sh` and pulls JP2/J2W/XML orthophotos listed in `archive/dop20/filelist.txt` (one URL per line); `archive/` can hold zipped 3D building tiles for future use.
+- `--clean-raw` only removes managed ingestion dirs (`raw/dgm1`, `raw/dop20/jp2`, `raw/dop20/j2w`, `raw/dop20/meta`, `raw/citygml/lod1`, `raw/citygml/lod2`) and intentionally keeps custom masks.
+- `--validate` writes `work/archive_materialize_report.json` and fails only when core datasets are missing (`dgm1 tif`, `dop20 jp2`, `citygml lod2`); optional sidecar gaps are warnings.
 - Handoff to Unity: copy/sync `export_unity/height_png16/` and `export_unity/tile_index.csv` into `DTrierFlood/Assets/GeoData/` before running the Unity-side importer. Keep `heightmap.out_res` aligned with the importer’s expected resolution (currently 1025).
 
 ### Orthophotos (textures)
diff --git a/geodata_pipeline/setup_helpers.py b/geodata_pipeline/setup_helpers.py
index 5767ba3..7d820ad 100644
--- a/geodata_pipeline/setup_helpers.py
+++ b/geodata_pipeline/setup_helpers.py
@@ -1,9 +1,11 @@
 from __future__ import annotations
 
+import json
 import glob
 import os
 import shutil
 import zipfile
+from datetime import datetime, timezone
 from typing import Iterable
 
 from .config import Config, ensure_default_config
@@ -31,33 +33,404 @@ def _paths_from_config(cfg: Config) -> Iterable[str]:
     ]
 
 
-def materialize_archives(cfg: Config) -> None:
-    """Best-effort expansion of archive zips into raw inputs."""
+def materialize_archives(
+    cfg: Config,
+    *,
+    clean_raw: bool = False,
+    validate: bool = False,
+    archive_raw_zip: str | None = None,
+) -> int:
+    """Expand archive zips into raw inputs with optional cleanup and validation."""
     ensure_directories(cfg)
-    _unpack_all(cfg.archives.dgm1_dir, cfg.raw.dgm1_dir)
-    _unpack_all(cfg.archives.citygml_lod1_dir, cfg.raw.citygml_lod1_dir)
-    _unpack_all(cfg.archives.citygml_lod2_dir, cfg.raw.citygml_lod2_dir)
-    _unpack_all(cfg.archives.dop20_dir, cfg.raw.dop20_dir)
+
+    report = {
+        "schema_version": 1,
+        "timestamp_utc": datetime.now(timezone.utc).isoformat(),
+        "mode": {
+            "clean_raw": bool(clean_raw),
+            "validate": bool(validate),
+        },
+        "paths": {
+            "archive_raw_zip_default": os.path.join("archive", "archive_raw.zip"),
+            "archive_raw_zip_requested": archive_raw_zip or "",
+            "raw_dgm1_dir": cfg.raw.dgm1_dir,
+            "raw_dop20_jp2_dir": cfg.raw.dop20_dir,
+            "raw_citygml_lod1_dir": cfg.raw.citygml_lod1_dir,
+            "raw_citygml_lod2_dir": cfg.raw.citygml_lod2_dir,
+        },
+        "clean_raw": {
+            "removed_dirs": [],
+            "errors": [],
+        },
+        "monolithic_zip": {
+            "used": False,
+            "path": "",
+            "entries_total": 0,
+            "entries_extracted": 0,
+            "entries_skipped": 0,
+            "files_overwritten": 0,
+            "unsafe_entries": 0,
+            "errors": [],
+        },
+        "per_dataset": {},
+        "filelist_copy": {
+            "source": os.path.join(cfg.archives.dop20_dir, "filelist.txt"),
+            "destination": os.path.join(os.path.dirname(cfg.raw.dop20_dir), "filelist.txt"),
+            "copied": False,
+            "missing_source": False,
+            "error": "",
+        },
+        "validation": {
+            "enabled": bool(validate),
+            "passed": True,
+            "errors": [],
+            "warnings": [],
+            "counts": {},
+        },
+    }
+
+    if clean_raw:
+        _clean_managed_raw_dirs(cfg, report)
+
+    monolithic_zip = archive_raw_zip or os.path.join("archive", "archive_raw.zip")
+    report["monolithic_zip"]["path"] = monolithic_zip
+
+    if archive_raw_zip and not os.path.isfile(monolithic_zip):
+        msg = f"[archive] Requested monolithic ZIP not found: {monolithic_zip}"
+        print(msg)
+        report["monolithic_zip"]["errors"].append(msg)
+        _write_materialize_report(cfg, report)
+        return 1
+
+    if os.path.isfile(monolithic_zip):
+        report["monolithic_zip"]["used"] = True
+        if not _extract_monolithic_zip(monolithic_zip, cfg, report):
+            _write_materialize_report(cfg, report)
+            return 1
+    else:
+        print(f"[archive] Monolithic ZIP not found (optional): {monolithic_zip}")
+
+    unpack_ok = True
+    unpack_ok &= _unpack_all(
+        cfg.archives.dgm1_dir,
+        cfg.raw.dgm1_dir,
+ report_node=report["per_dataset"].setdefault("dgm1", {}), + ) + unpack_ok &= _unpack_all( + cfg.archives.citygml_lod1_dir, + cfg.raw.citygml_lod1_dir, + report_node=report["per_dataset"].setdefault("citygml_lod1", {}), + ) + unpack_ok &= _unpack_all( + cfg.archives.citygml_lod2_dir, + cfg.raw.citygml_lod2_dir, + report_node=report["per_dataset"].setdefault("citygml_lod2", {}), + ) + unpack_ok &= _unpack_all( + cfg.archives.dop20_dir, + cfg.raw.dop20_dir, + report_node=report["per_dataset"].setdefault("dop20_jp2", {}), + ) + if not unpack_ok: + _write_materialize_report(cfg, report) + return 1 + _copy_filelist( - os.path.join(cfg.archives.dop20_dir, "filelist.txt"), - os.path.join(os.path.dirname(cfg.raw.dop20_dir), "filelist.txt"), + report["filelist_copy"]["source"], + report["filelist_copy"]["destination"], + report["filelist_copy"], ) + if validate: + valid = _validate_materialized_raw(cfg, report) + _write_materialize_report(cfg, report) + if not valid: + return 1 + else: + _write_materialize_report(cfg, report) -def _unpack_all(archive_dir: str, dest_dir: str) -> None: + print( + "[archive] Materialization complete: " + f"monolithic_used={report['monolithic_zip']['used']}, " + f"report=work/archive_materialize_report.json" + ) + return 0 + + +def _clean_managed_raw_dirs(cfg: Config, report: dict) -> None: + dop20_root = os.path.dirname(cfg.raw.dop20_dir) + managed_dirs = [ + cfg.raw.dgm1_dir, + cfg.raw.citygml_lod1_dir, + cfg.raw.citygml_lod2_dir, + cfg.raw.dop20_dir, + os.path.join(dop20_root, "j2w"), + os.path.join(dop20_root, "meta"), + ] + + for path in managed_dirs: + if os.path.isdir(path): + try: + shutil.rmtree(path) + report["clean_raw"]["removed_dirs"].append(path) + print(f"[archive] Removed raw dir: {path}") + except OSError as exc: + msg = f"[archive] Failed to remove {path}: {exc}" + report["clean_raw"]["errors"].append(msg) + print(msg) + + os.makedirs(path, exist_ok=True) + + +def _extract_monolithic_zip(zip_path: str, cfg: Config, report: dict) -> bool: + targets = _monolithic_targets(cfg) + node = report["monolithic_zip"] + + try: + with zipfile.ZipFile(zip_path, "r") as zf: + entries = zf.infolist() + node["entries_total"] = len(entries) + for info in entries: + if info.is_dir(): + continue + parts = _safe_zip_parts(info.filename) + if parts is None: + node["unsafe_entries"] += 1 + node["entries_skipped"] += 1 + continue + + resolved = _resolve_monolithic_member(parts, targets) + if resolved is None: + node["entries_skipped"] += 1 + continue + + _, out_path = resolved + if os.path.exists(out_path): + node["files_overwritten"] += 1 + _extract_member(zf, info, out_path) + node["entries_extracted"] += 1 + except (OSError, zipfile.BadZipFile, zipfile.LargeZipFile) as exc: + msg = f"[archive] Could not unpack monolithic ZIP {zip_path}: {exc}" + node["errors"].append(msg) + print(msg) + return False + + print( + f"[archive] Monolithic ZIP extracted: {zip_path} " + f"(extracted={node['entries_extracted']}, skipped={node['entries_skipped']})" + ) + return True + + +def _monolithic_targets(cfg: Config) -> dict[str, dict]: + dop20_root = os.path.dirname(cfg.raw.dop20_dir) + return { + "dgm1": { + "dest": cfg.raw.dgm1_dir, + "markers": [("raw", "dgm1"), ("dgm1",)], + }, + "dop20_jp2": { + "dest": cfg.raw.dop20_dir, + "markers": [("raw", "dop20", "jp2"), ("dop20", "jp2")], + }, + "dop20_j2w": { + "dest": os.path.join(dop20_root, "j2w"), + "markers": [("raw", "dop20", "j2w"), ("dop20", "j2w")], + }, + "dop20_meta": { + "dest": os.path.join(dop20_root, "meta"), + "markers": 
[("raw", "dop20", "meta"), ("dop20", "meta")], + }, + "citygml_lod1": { + "dest": cfg.raw.citygml_lod1_dir, + "markers": [("raw", "citygml", "lod1"), ("citygml", "lod1")], + }, + "citygml_lod2": { + "dest": cfg.raw.citygml_lod2_dir, + "markers": [("raw", "citygml", "lod2"), ("citygml", "lod2")], + }, + "dop20_filelist": { + "dest": os.path.join(dop20_root, "filelist.txt"), + "markers": [("raw", "dop20"), ("dop20",)], + }, + } + + +def _resolve_monolithic_member(parts: list[str], targets: dict[str, dict]) -> tuple[str, str] | None: + lower_parts = [p.lower() for p in parts] + # Prefer more specific markers first. + keys = ( + "dop20_jp2", + "dop20_j2w", + "dop20_meta", + "citygml_lod1", + "citygml_lod2", + "dgm1", + "dop20_filelist", + ) + for key in keys: + target = targets[key] + for marker in target["markers"]: + idx = _find_marker(lower_parts, marker) + if idx is None: + continue + tail = parts[idx + len(marker) :] + if key == "dop20_filelist": + if len(tail) == 1 and tail[0].lower() == "filelist.txt": + return key, target["dest"] + continue + if not tail: + continue + return key, os.path.join(target["dest"], *tail) + return None + + +def _find_marker(parts: list[str], marker: tuple[str, ...]) -> int | None: + width = len(marker) + if width == 0 or len(parts) < width: + return None + for idx in range(0, len(parts) - width + 1): + if tuple(parts[idx : idx + width]) == marker: + return idx + return None + + +def _safe_zip_parts(member_name: str) -> list[str] | None: + # Normalize to POSIX separators to make archive parsing deterministic. + normalized = member_name.replace("\\", "/") + normalized = normalized.strip("/") + if not normalized: + return None + + parts = [] + for part in normalized.split("/"): + token = part.strip() + if token in ("", "."): + continue + if token == "..": + return None + parts.append(token) + if not parts: + return None + return parts + + +def _extract_member(zf: zipfile.ZipFile, info: zipfile.ZipInfo, out_path: str) -> None: + os.makedirs(os.path.dirname(out_path), exist_ok=True) + with zf.open(info, "r") as src, open(out_path, "wb") as dst: + shutil.copyfileobj(src, dst) + + +def _unpack_all(archive_dir: str, dest_dir: str, *, report_node: dict) -> bool: os.makedirs(dest_dir, exist_ok=True) - for zpath in glob.glob(os.path.join(archive_dir, "*.zip")): + zips = sorted(glob.glob(os.path.join(archive_dir, "*.zip"))) + report_node["archive_dir"] = archive_dir + report_node["dest_dir"] = dest_dir + report_node["zip_count"] = len(zips) + report_node["files_extracted"] = 0 + report_node["files_overwritten"] = 0 + report_node["unsafe_entries"] = 0 + report_node["errors"] = [] + + for zpath in zips: print(f"Unpacking {zpath} -> {dest_dir}") - with zipfile.ZipFile(zpath, "r") as zf: - zf.extractall(dest_dir) + try: + with zipfile.ZipFile(zpath, "r") as zf: + for info in zf.infolist(): + if info.is_dir(): + continue + parts = _safe_zip_parts(info.filename) + if parts is None: + report_node["unsafe_entries"] += 1 + continue + out_path = os.path.join(dest_dir, *parts) + if os.path.exists(out_path): + report_node["files_overwritten"] += 1 + _extract_member(zf, info, out_path) + report_node["files_extracted"] += 1 + except (OSError, zipfile.BadZipFile, zipfile.LargeZipFile) as exc: + msg = f"[archive] Could not unpack {zpath}: {exc}" + report_node["errors"].append(msg) + print(msg) + return False + return True -def _copy_filelist(src: str, dest: str) -> None: +def _copy_filelist(src: str, dest: str, report_node: dict) -> None: if not os.path.exists(src): + 
report_node["missing_source"] = True + print(f"[archive] Optional dop20 filelist missing: {src}") return os.makedirs(os.path.dirname(dest), exist_ok=True) - shutil.copy2(src, dest) - print(f"Copied filelist: {src} -> {dest}") + try: + shutil.copy2(src, dest) + report_node["copied"] = True + print(f"Copied filelist: {src} -> {dest}") + except OSError as exc: + report_node["error"] = str(exc) + print(f"[archive] Failed to copy filelist {src} -> {dest}: {exc}") + + +def _count_ext(root: str, suffixes: tuple[str, ...]) -> int: + if not os.path.isdir(root): + return 0 + total = 0 + for cur_root, _, files in os.walk(root): + for name in files: + if name.lower().endswith(suffixes): + total += 1 + return total + + +def _validate_materialized_raw(cfg: Config, report: dict) -> bool: + node = report["validation"] + dop20_root = os.path.dirname(cfg.raw.dop20_dir) + + counts = { + "dgm1_tif": _count_ext(cfg.raw.dgm1_dir, (".tif", ".tiff")), + "dop20_jp2": _count_ext(cfg.raw.dop20_dir, (".jp2",)), + "citygml_lod2": _count_ext(cfg.raw.citygml_lod2_dir, (".gml", ".xml")), + "citygml_lod1": _count_ext(cfg.raw.citygml_lod1_dir, (".gml", ".xml")), + "dop20_j2w": _count_ext(os.path.join(dop20_root, "j2w"), (".j2w", ".wld")), + "dop20_meta": _count_ext(os.path.join(dop20_root, "meta"), (".xml",)), + } + node["counts"] = counts + + if counts["dgm1_tif"] == 0: + node["errors"].append(f"Missing required DGM1 TIFFs in {cfg.raw.dgm1_dir}") + if counts["dop20_jp2"] == 0: + node["errors"].append(f"Missing required DOP20 JP2s in {cfg.raw.dop20_dir}") + if counts["citygml_lod2"] == 0: + node["errors"].append(f"Missing required CityGML LoD2 files in {cfg.raw.citygml_lod2_dir}") + + if counts["citygml_lod1"] == 0: + node["warnings"].append(f"No CityGML LoD1 files found in {cfg.raw.citygml_lod1_dir}") + if counts["dop20_j2w"] == 0: + node["warnings"].append(f"No DOP20 worldfiles found in {os.path.join(dop20_root, 'j2w')}") + if counts["dop20_meta"] == 0: + node["warnings"].append(f"No DOP20 metadata XML files found in {os.path.join(dop20_root, 'meta')}") + if not os.path.exists(os.path.join(dop20_root, "filelist.txt")): + node["warnings"].append(f"No dop20 filelist found at {os.path.join(dop20_root, 'filelist.txt')}") + + for msg in node["warnings"]: + print(f"[archive][validate] Warning: {msg}") + for msg in node["errors"]: + print(f"[archive][validate] Error: {msg}") + + node["passed"] = len(node["errors"]) == 0 + if node["passed"]: + print("[archive][validate] Validation passed.") + else: + print("[archive][validate] Validation failed.") + return bool(node["passed"]) + + +def _write_materialize_report(cfg: Config, report: dict) -> None: + out_path = os.path.join(cfg.work.work_dir, "archive_materialize_report.json") + os.makedirs(os.path.dirname(out_path), exist_ok=True) + with open(out_path, "w", encoding="utf-8") as handle: + json.dump(report, handle, indent=2) + print(f"[archive] Wrote report: {out_path}") __all__ = ["ensure_directories", "materialize_archives", "ensure_default_config"] diff --git a/geodata_to_unity.py b/geodata_to_unity.py index be8da97..534dc9b 100644 --- a/geodata_to_unity.py +++ b/geodata_to_unity.py @@ -51,7 +51,22 @@ def parse_args(argv: Iterable[str] | None = None) -> argparse.Namespace: parser.add_argument( "--build-from-archive", action="store_true", - help="Populate raw inputs from archives (unzips zips, leaves dop20 filelist in archive).", + help="Populate raw inputs from archives (supports monolithic archive/archive_raw.zip + per-dataset zips).", + ) + parser.add_argument( + 
"--clean-raw", + action="store_true", + help="With --build-from-archive: remove managed raw input dirs before unpacking archives.", + ) + parser.add_argument( + "--validate", + action="store_true", + help="With --build-from-archive: validate materialized raw inputs and fail on missing core datasets.", + ) + parser.add_argument( + "--archive-raw-zip", + default=None, + help="Optional path to a monolithic raw ZIP (default when omitted: archive/archive_raw.zip if present).", ) parser.add_argument( "--setup", @@ -170,6 +185,10 @@ def _download_requests_lpolpg(download_config: str, requested: list[str] | None) def main(argv: Iterable[str] | None = None) -> int: args = parse_args(argv) cfg = load_config(args) + + if (args.clean_raw or args.validate or args.archive_raw_zip) and not args.build_from_archive: + print("[archive] --clean-raw/--validate/--archive-raw-zip are ignored without --build-from-archive.") + target_export = None action_flags = args.download or args.split_lpolpg or args.erode_rivers if args.export is not None: @@ -181,12 +200,26 @@ def main(argv: Iterable[str] | None = None) -> int: ensure_directories(cfg) print(f"Directories ensured. Config at {args.config}.") if args.build_from_archive: - materialize_archives(cfg) + archive_exit = materialize_archives( + cfg, + clean_raw=args.clean_raw, + validate=args.validate, + archive_raw_zip=args.archive_raw_zip, + ) + if archive_exit != 0: + return archive_exit if args.export is None and not args.download: return 0 if args.build_from_archive and not args.setup: - materialize_archives(cfg) + archive_exit = materialize_archives( + cfg, + clean_raw=args.clean_raw, + validate=args.validate, + archive_raw_zip=args.archive_raw_zip, + ) + if archive_exit != 0: + return archive_exit if args.download: datasets = (