"""Materialize raw input datasets from archive ZIPs.

Expands per-dataset archive ZIPs (plus an optional monolithic ZIP) into the
configured raw directories, with optional cleanup and validation, and writes
a JSON report describing what was done.
"""

from __future__ import annotations

import glob
import json
import os
import shutil
import zipfile
from datetime import datetime, timezone
from typing import Iterable

from .config import Config, ensure_default_config


def ensure_directories(cfg: Config) -> None:
    for path in _paths_from_config(cfg):
        os.makedirs(path, exist_ok=True)


def _paths_from_config(cfg: Config) -> Iterable[str]:
    return [
        cfg.raw.dgm1_dir,
        cfg.raw.dop20_dir,
        cfg.raw.citygml_lod1_dir,
        cfg.raw.citygml_lod2_dir,
        cfg.archives.dgm1_dir,
        cfg.archives.dop20_dir,
        cfg.archives.citygml_lod1_dir,
        cfg.archives.citygml_lod2_dir,
        cfg.work.work_dir,
        cfg.export.heightmap_dir,
        cfg.export.ortho_dir,
        cfg.swe_lod.out_dir,
    ]


def materialize_archives(
    cfg: Config,
    *,
    clean_raw: bool = False,
    validate: bool = False,
    archive_raw_zip: str | None = None,
) -> int:
    """Expand archive zips into raw inputs with optional cleanup and validation."""
    ensure_directories(cfg)
    report = {
        "schema_version": 1,
        "timestamp_utc": datetime.now(timezone.utc).isoformat(),
        "mode": {
            "clean_raw": bool(clean_raw),
            "validate": bool(validate),
        },
        "paths": {
            "archive_raw_zip_default": os.path.join("archive", "archive_raw.zip"),
            "archive_raw_zip_requested": archive_raw_zip or "",
            "raw_dgm1_dir": cfg.raw.dgm1_dir,
            "raw_dop20_jp2_dir": cfg.raw.dop20_dir,
            "raw_citygml_lod1_dir": cfg.raw.citygml_lod1_dir,
            "raw_citygml_lod2_dir": cfg.raw.citygml_lod2_dir,
        },
        "clean_raw": {
            "removed_dirs": [],
            "errors": [],
        },
        "monolithic_zip": {
            "used": False,
            "path": "",
            "entries_total": 0,
            "entries_extracted": 0,
            "entries_skipped": 0,
            "files_overwritten": 0,
            "unsafe_entries": 0,
            "errors": [],
        },
        "per_dataset": {},
        "filelist_copy": {
            "source": os.path.join(cfg.archives.dop20_dir, "filelist.txt"),
            "destination": os.path.join(os.path.dirname(cfg.raw.dop20_dir), "filelist.txt"),
            "copied": False,
            "missing_source": False,
            "error": "",
        },
        "validation": {
            "enabled": bool(validate),
            "passed": True,
            "errors": [],
            "warnings": [],
            "counts": {},
        },
    }

    if clean_raw:
        _clean_managed_raw_dirs(cfg, report)

    # The monolithic ZIP is optional unless explicitly requested.
    monolithic_zip = archive_raw_zip or os.path.join("archive", "archive_raw.zip")
    report["monolithic_zip"]["path"] = monolithic_zip
    if archive_raw_zip and not os.path.isfile(monolithic_zip):
        msg = f"[archive] Requested monolithic ZIP not found: {monolithic_zip}"
        print(msg)
        report["monolithic_zip"]["errors"].append(msg)
        _write_materialize_report(cfg, report)
        return 1
    if os.path.isfile(monolithic_zip):
        report["monolithic_zip"]["used"] = True
        if not _extract_monolithic_zip(monolithic_zip, cfg, report):
            _write_materialize_report(cfg, report)
            return 1
    else:
        print(f"[archive] Monolithic ZIP not found (optional): {monolithic_zip}")

    unpack_ok = True
    unpack_ok &= _unpack_all(
        cfg.archives.dgm1_dir,
        cfg.raw.dgm1_dir,
        report_node=report["per_dataset"].setdefault("dgm1", {}),
    )
    unpack_ok &= _unpack_all(
        cfg.archives.citygml_lod1_dir,
        cfg.raw.citygml_lod1_dir,
        report_node=report["per_dataset"].setdefault("citygml_lod1", {}),
    )
    unpack_ok &= _unpack_all(
        cfg.archives.citygml_lod2_dir,
        cfg.raw.citygml_lod2_dir,
        report_node=report["per_dataset"].setdefault("citygml_lod2", {}),
    )
    unpack_ok &= _unpack_all(
        cfg.archives.dop20_dir,
        cfg.raw.dop20_dir,
        report_node=report["per_dataset"].setdefault("dop20_jp2", {}),
    )
    if not unpack_ok:
        _write_materialize_report(cfg, report)
        return 1

    _copy_filelist(
        report["filelist_copy"]["source"],
        report["filelist_copy"]["destination"],
        report["filelist_copy"],
    )

    if validate:
        valid = _validate_materialized_raw(cfg, report)
        _write_materialize_report(cfg, report)
        if not valid:
            return 1
    else:
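        # Validation was skipped; still persist the report so callers can
        # inspect exactly what was extracted.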
        _write_materialize_report(cfg, report)

    print(
        "[archive] Materialization complete: "
        f"monolithic_used={report['monolithic_zip']['used']}, "
        f"report={os.path.join(cfg.work.work_dir, 'archive_materialize_report.json')}"
    )
    return 0


def _clean_managed_raw_dirs(cfg: Config, report: dict) -> None:
    dop20_root = os.path.dirname(cfg.raw.dop20_dir)
    managed_dirs = [
        cfg.raw.dgm1_dir,
        cfg.raw.citygml_lod1_dir,
        cfg.raw.citygml_lod2_dir,
        cfg.raw.dop20_dir,
        os.path.join(dop20_root, "j2w"),
        os.path.join(dop20_root, "meta"),
    ]
    for path in managed_dirs:
        if os.path.isdir(path):
            try:
                shutil.rmtree(path)
                report["clean_raw"]["removed_dirs"].append(path)
                print(f"[archive] Removed raw dir: {path}")
            except OSError as exc:
                msg = f"[archive] Failed to remove {path}: {exc}"
                report["clean_raw"]["errors"].append(msg)
                print(msg)
        # Recreate every managed directory so later steps can rely on it.
        os.makedirs(path, exist_ok=True)


def _extract_monolithic_zip(zip_path: str, cfg: Config, report: dict) -> bool:
    targets = _monolithic_targets(cfg)
    node = report["monolithic_zip"]
    try:
        with zipfile.ZipFile(zip_path, "r") as zf:
            entries = zf.infolist()
            node["entries_total"] = len(entries)
            for info in entries:
                if info.is_dir():
                    continue
                parts = _safe_zip_parts(info.filename)
                if parts is None:
                    node["unsafe_entries"] += 1
                    node["entries_skipped"] += 1
                    continue
                resolved = _resolve_monolithic_member(parts, targets)
                if resolved is None:
                    node["entries_skipped"] += 1
                    continue
                _, out_path = resolved
                if os.path.exists(out_path):
                    node["files_overwritten"] += 1
                _extract_member(zf, info, out_path)
                node["entries_extracted"] += 1
    except (OSError, zipfile.BadZipFile, zipfile.LargeZipFile) as exc:
        msg = f"[archive] Could not unpack monolithic ZIP {zip_path}: {exc}"
        node["errors"].append(msg)
        print(msg)
        return False
    print(
        f"[archive] Monolithic ZIP extracted: {zip_path} "
        f"(extracted={node['entries_extracted']}, skipped={node['entries_skipped']})"
    )
    return True


def _monolithic_targets(cfg: Config) -> dict[str, dict]:
    dop20_root = os.path.dirname(cfg.raw.dop20_dir)
    return {
        "dgm1": {
            "dest": cfg.raw.dgm1_dir,
            "markers": [("raw", "dgm1"), ("dgm1",)],
        },
        "dop20_jp2": {
            "dest": cfg.raw.dop20_dir,
            "markers": [("raw", "dop20", "jp2"), ("dop20", "jp2")],
        },
        "dop20_j2w": {
            "dest": os.path.join(dop20_root, "j2w"),
            "markers": [("raw", "dop20", "j2w"), ("dop20", "j2w")],
        },
        "dop20_meta": {
            "dest": os.path.join(dop20_root, "meta"),
            "markers": [("raw", "dop20", "meta"), ("dop20", "meta")],
        },
        "citygml_lod1": {
            "dest": cfg.raw.citygml_lod1_dir,
            "markers": [("raw", "citygml", "lod1"), ("citygml", "lod1")],
        },
        "citygml_lod2": {
            "dest": cfg.raw.citygml_lod2_dir,
            "markers": [("raw", "citygml", "lod2"), ("citygml", "lod2")],
        },
        "dop20_filelist": {
            "dest": os.path.join(dop20_root, "filelist.txt"),
            "markers": [("raw", "dop20"), ("dop20",)],
        },
    }


def _resolve_monolithic_member(parts: list[str], targets: dict[str, dict]) -> tuple[str, str] | None:
    lower_parts = [p.lower() for p in parts]
    # Prefer more specific markers first.
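    # The key order below tries the most specific dataset markers before the
    # generic ("dop20",) filelist fallback, so each archive path is routed to
    # the most specific matching target.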
    keys = (
        "dop20_jp2",
        "dop20_j2w",
        "dop20_meta",
        "citygml_lod1",
        "citygml_lod2",
        "dgm1",
        "dop20_filelist",
    )
    for key in keys:
        target = targets[key]
        for marker in target["markers"]:
            idx = _find_marker(lower_parts, marker)
            if idx is None:
                continue
            tail = parts[idx + len(marker):]
            if key == "dop20_filelist":
                if len(tail) == 1 and tail[0].lower() == "filelist.txt":
                    return key, target["dest"]
                continue
            if not tail:
                continue
            return key, os.path.join(target["dest"], *tail)
    return None


def _find_marker(parts: list[str], marker: tuple[str, ...]) -> int | None:
    width = len(marker)
    if width == 0 or len(parts) < width:
        return None
    for idx in range(0, len(parts) - width + 1):
        if tuple(parts[idx:idx + width]) == marker:
            return idx
    return None


def _safe_zip_parts(member_name: str) -> list[str] | None:
    # Normalize to POSIX separators to make archive parsing deterministic.
    normalized = member_name.replace("\\", "/")
    normalized = normalized.strip("/")
    if not normalized:
        return None
    parts = []
    for part in normalized.split("/"):
        token = part.strip()
        if token in ("", "."):
            continue
        if token == "..":
            return None
        parts.append(token)
    if not parts:
        return None
    return parts


def _extract_member(zf: zipfile.ZipFile, info: zipfile.ZipInfo, out_path: str) -> None:
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    with zf.open(info, "r") as src, open(out_path, "wb") as dst:
        shutil.copyfileobj(src, dst)


def _unpack_all(archive_dir: str, dest_dir: str, *, report_node: dict) -> bool:
    os.makedirs(dest_dir, exist_ok=True)
    zips = sorted(glob.glob(os.path.join(archive_dir, "*.zip")))
    report_node["archive_dir"] = archive_dir
    report_node["dest_dir"] = dest_dir
    report_node["zip_count"] = len(zips)
    report_node["files_extracted"] = 0
    report_node["files_overwritten"] = 0
    report_node["unsafe_entries"] = 0
    report_node["errors"] = []
    for zpath in zips:
        print(f"Unpacking {zpath} -> {dest_dir}")
        try:
            with zipfile.ZipFile(zpath, "r") as zf:
                for info in zf.infolist():
                    if info.is_dir():
                        continue
                    parts = _safe_zip_parts(info.filename)
                    if parts is None:
                        report_node["unsafe_entries"] += 1
                        continue
                    out_path = os.path.join(dest_dir, *parts)
                    if os.path.exists(out_path):
                        report_node["files_overwritten"] += 1
                    _extract_member(zf, info, out_path)
                    report_node["files_extracted"] += 1
        except (OSError, zipfile.BadZipFile, zipfile.LargeZipFile) as exc:
            msg = f"[archive] Could not unpack {zpath}: {exc}"
            report_node["errors"].append(msg)
            print(msg)
            return False
    return True


def _copy_filelist(src: str, dest: str, report_node: dict) -> None:
    if not os.path.exists(src):
        report_node["missing_source"] = True
        print(f"[archive] Optional dop20 filelist missing: {src}")
        return
    os.makedirs(os.path.dirname(dest), exist_ok=True)
    try:
        shutil.copy2(src, dest)
        report_node["copied"] = True
        print(f"Copied filelist: {src} -> {dest}")
    except OSError as exc:
        report_node["error"] = str(exc)
        print(f"[archive] Failed to copy filelist {src} -> {dest}: {exc}")


def _count_ext(root: str, suffixes: tuple[str, ...]) -> int:
    if not os.path.isdir(root):
        return 0
    total = 0
    for cur_root, _, files in os.walk(root):
        for name in files:
            if name.lower().endswith(suffixes):
                total += 1
    return total


def _validate_materialized_raw(cfg: Config, report: dict) -> bool:
    node = report["validation"]
    dop20_root = os.path.dirname(cfg.raw.dop20_dir)
    counts = {
        "dgm1_tif": _count_ext(cfg.raw.dgm1_dir, (".tif", ".tiff")),
        "dop20_jp2": _count_ext(cfg.raw.dop20_dir, (".jp2",)),
        "citygml_lod2": _count_ext(cfg.raw.citygml_lod2_dir, (".gml", ".xml")),
        "citygml_lod1": _count_ext(cfg.raw.citygml_lod1_dir, (".gml", ".xml")),
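        # j2w worldfiles and meta XML are optional inputs; zero counts for
        # them only produce warnings below, never hard errors.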
(".gml", ".xml")), "dop20_j2w": _count_ext(os.path.join(dop20_root, "j2w"), (".j2w", ".wld")), "dop20_meta": _count_ext(os.path.join(dop20_root, "meta"), (".xml",)), } node["counts"] = counts if counts["dgm1_tif"] == 0: node["errors"].append(f"Missing required DGM1 TIFFs in {cfg.raw.dgm1_dir}") if counts["dop20_jp2"] == 0: node["errors"].append(f"Missing required DOP20 JP2s in {cfg.raw.dop20_dir}") if counts["citygml_lod2"] == 0: node["errors"].append(f"Missing required CityGML LoD2 files in {cfg.raw.citygml_lod2_dir}") if counts["citygml_lod1"] == 0: node["warnings"].append(f"No CityGML LoD1 files found in {cfg.raw.citygml_lod1_dir}") if counts["dop20_j2w"] == 0: node["warnings"].append(f"No DOP20 worldfiles found in {os.path.join(dop20_root, 'j2w')}") if counts["dop20_meta"] == 0: node["warnings"].append(f"No DOP20 metadata XML files found in {os.path.join(dop20_root, 'meta')}") if not os.path.exists(os.path.join(dop20_root, "filelist.txt")): node["warnings"].append(f"No dop20 filelist found at {os.path.join(dop20_root, 'filelist.txt')}") for msg in node["warnings"]: print(f"[archive][validate] Warning: {msg}") for msg in node["errors"]: print(f"[archive][validate] Error: {msg}") node["passed"] = len(node["errors"]) == 0 if node["passed"]: print("[archive][validate] Validation passed.") else: print("[archive][validate] Validation failed.") return bool(node["passed"]) def _write_materialize_report(cfg: Config, report: dict) -> None: out_path = os.path.join(cfg.work.work_dir, "archive_materialize_report.json") os.makedirs(os.path.dirname(out_path), exist_ok=True) with open(out_path, "w", encoding="utf-8") as handle: json.dump(report, handle, indent=2) print(f"[archive] Wrote report: {out_path}") __all__ = ["ensure_directories", "materialize_archives", "ensure_default_config"]