# GeoData/geodata_pipeline/setup_helpers.py
from __future__ import annotations
import json
import glob
import os
import shutil
import zipfile
from datetime import datetime, timezone
from typing import Iterable
from .config import Config, ensure_default_config


def ensure_directories(cfg: Config) -> None:
    """Create every directory referenced by the config, if it is missing."""
    for path in _paths_from_config(cfg):
        os.makedirs(path, exist_ok=True)


def _paths_from_config(cfg: Config) -> Iterable[str]:
    return [
        cfg.raw.dgm1_dir,
        cfg.raw.dop20_dir,
        cfg.raw.citygml_lod1_dir,
        cfg.raw.citygml_lod2_dir,
        cfg.archives.dgm1_dir,
        cfg.archives.dop20_dir,
        cfg.archives.citygml_lod1_dir,
        cfg.archives.citygml_lod2_dir,
        cfg.work.work_dir,
        cfg.export.heightmap_dir,
        cfg.export.ortho_dir,
        cfg.swe_lod.out_dir,
    ]


def materialize_archives(
    cfg: Config,
    *,
    clean_raw: bool = False,
    validate: bool = False,
    archive_raw_zip: str | None = None,
) -> int:
    """Expand archive zips into raw inputs with optional cleanup and validation.

    Returns 0 on success and 1 on failure; a JSON report is written to the
    work directory in either case.
    """
    ensure_directories(cfg)
    report = {
        "schema_version": 1,
        "timestamp_utc": datetime.now(timezone.utc).isoformat(),
        "mode": {
            "clean_raw": bool(clean_raw),
            "validate": bool(validate),
        },
        "paths": {
            "archive_raw_zip_default": os.path.join("archive", "archive_raw.zip"),
            "archive_raw_zip_requested": archive_raw_zip or "",
            "raw_dgm1_dir": cfg.raw.dgm1_dir,
            "raw_dop20_jp2_dir": cfg.raw.dop20_dir,
            "raw_citygml_lod1_dir": cfg.raw.citygml_lod1_dir,
            "raw_citygml_lod2_dir": cfg.raw.citygml_lod2_dir,
        },
        "clean_raw": {
            "removed_dirs": [],
            "errors": [],
        },
        "monolithic_zip": {
            "used": False,
            "path": "",
            "entries_total": 0,
            "entries_extracted": 0,
            "entries_skipped": 0,
            "files_overwritten": 0,
            "unsafe_entries": 0,
            "errors": [],
        },
        "per_dataset": {},
        "filelist_copy": {
            "source": os.path.join(cfg.archives.dop20_dir, "filelist.txt"),
            "destination": os.path.join(os.path.dirname(cfg.raw.dop20_dir), "filelist.txt"),
            "copied": False,
            "missing_source": False,
            "error": "",
        },
        "validation": {
            "enabled": bool(validate),
            "passed": True,
            "errors": [],
            "warnings": [],
            "counts": {},
        },
    }
    if clean_raw:
        _clean_managed_raw_dirs(cfg, report)
    monolithic_zip = archive_raw_zip or os.path.join("archive", "archive_raw.zip")
    report["monolithic_zip"]["path"] = monolithic_zip
    if archive_raw_zip and not os.path.isfile(monolithic_zip):
        msg = f"[archive] Requested monolithic ZIP not found: {monolithic_zip}"
        print(msg)
        report["monolithic_zip"]["errors"].append(msg)
        _write_materialize_report(cfg, report)
        return 1
    if os.path.isfile(monolithic_zip):
        report["monolithic_zip"]["used"] = True
        if not _extract_monolithic_zip(monolithic_zip, cfg, report):
            _write_materialize_report(cfg, report)
            return 1
    else:
        print(f"[archive] Monolithic ZIP not found (optional): {monolithic_zip}")
    unpack_ok = True
    unpack_ok &= _unpack_all(
        cfg.archives.dgm1_dir,
        cfg.raw.dgm1_dir,
        report_node=report["per_dataset"].setdefault("dgm1", {}),
    )
    unpack_ok &= _unpack_all(
        cfg.archives.citygml_lod1_dir,
        cfg.raw.citygml_lod1_dir,
        report_node=report["per_dataset"].setdefault("citygml_lod1", {}),
    )
    unpack_ok &= _unpack_all(
        cfg.archives.citygml_lod2_dir,
        cfg.raw.citygml_lod2_dir,
        report_node=report["per_dataset"].setdefault("citygml_lod2", {}),
    )
    unpack_ok &= _unpack_all(
        cfg.archives.dop20_dir,
        cfg.raw.dop20_dir,
        report_node=report["per_dataset"].setdefault("dop20_jp2", {}),
    )
    if not unpack_ok:
        _write_materialize_report(cfg, report)
        return 1
    _copy_filelist(
        report["filelist_copy"]["source"],
        report["filelist_copy"]["destination"],
        report["filelist_copy"],
    )
    if validate:
        valid = _validate_materialized_raw(cfg, report)
        _write_materialize_report(cfg, report)
        if not valid:
            return 1
    else:
        _write_materialize_report(cfg, report)
    print(
        "[archive] Materialization complete: "
        f"monolithic_used={report['monolithic_zip']['used']}, "
        f"report={os.path.join(cfg.work.work_dir, 'archive_materialize_report.json')}"
    )
    return 0


def _clean_managed_raw_dirs(cfg: Config, report: dict) -> None:
    """Remove and recreate the raw directories managed by this module."""
    dop20_root = os.path.dirname(cfg.raw.dop20_dir)
    managed_dirs = [
        cfg.raw.dgm1_dir,
        cfg.raw.citygml_lod1_dir,
        cfg.raw.citygml_lod2_dir,
        cfg.raw.dop20_dir,
        os.path.join(dop20_root, "j2w"),
        os.path.join(dop20_root, "meta"),
    ]
    for path in managed_dirs:
        if os.path.isdir(path):
            try:
                shutil.rmtree(path)
                report["clean_raw"]["removed_dirs"].append(path)
                print(f"[archive] Removed raw dir: {path}")
            except OSError as exc:
                msg = f"[archive] Failed to remove {path}: {exc}"
                report["clean_raw"]["errors"].append(msg)
                print(msg)
        # Recreate each managed directory so later steps can rely on it existing.
        os.makedirs(path, exist_ok=True)


def _extract_monolithic_zip(zip_path: str, cfg: Config, report: dict) -> bool:
    """Extract recognised members of a monolithic ZIP into their raw targets."""
    targets = _monolithic_targets(cfg)
    node = report["monolithic_zip"]
    try:
        with zipfile.ZipFile(zip_path, "r") as zf:
            entries = zf.infolist()
            node["entries_total"] = len(entries)
            for info in entries:
                if info.is_dir():
                    continue
                parts = _safe_zip_parts(info.filename)
                if parts is None:
                    node["unsafe_entries"] += 1
                    node["entries_skipped"] += 1
                    continue
                resolved = _resolve_monolithic_member(parts, targets)
                if resolved is None:
                    node["entries_skipped"] += 1
                    continue
                _, out_path = resolved
                if os.path.exists(out_path):
                    node["files_overwritten"] += 1
                _extract_member(zf, info, out_path)
                node["entries_extracted"] += 1
    except (OSError, zipfile.BadZipFile, zipfile.LargeZipFile) as exc:
        msg = f"[archive] Could not unpack monolithic ZIP {zip_path}: {exc}"
        node["errors"].append(msg)
        print(msg)
        return False
    print(
        f"[archive] Monolithic ZIP extracted: {zip_path} "
        f"(extracted={node['entries_extracted']}, skipped={node['entries_skipped']})"
    )
    return True
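
# Routing note: members matching none of the marker tuples defined by
# _monolithic_targets() below are skipped (tallied in entries_skipped), so a
# monolithic ZIP never writes outside the known dataset directories.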


def _monolithic_targets(cfg: Config) -> dict[str, dict]:
    """Map each dataset key to its destination and the path markers that identify it."""
    dop20_root = os.path.dirname(cfg.raw.dop20_dir)
    return {
        "dgm1": {
            "dest": cfg.raw.dgm1_dir,
            "markers": [("raw", "dgm1"), ("dgm1",)],
        },
        "dop20_jp2": {
            "dest": cfg.raw.dop20_dir,
            "markers": [("raw", "dop20", "jp2"), ("dop20", "jp2")],
        },
        "dop20_j2w": {
            "dest": os.path.join(dop20_root, "j2w"),
            "markers": [("raw", "dop20", "j2w"), ("dop20", "j2w")],
        },
        "dop20_meta": {
            "dest": os.path.join(dop20_root, "meta"),
            "markers": [("raw", "dop20", "meta"), ("dop20", "meta")],
        },
        "citygml_lod1": {
            "dest": cfg.raw.citygml_lod1_dir,
            "markers": [("raw", "citygml", "lod1"), ("citygml", "lod1")],
        },
        "citygml_lod2": {
            "dest": cfg.raw.citygml_lod2_dir,
            "markers": [("raw", "citygml", "lod2"), ("citygml", "lod2")],
        },
        "dop20_filelist": {
            "dest": os.path.join(dop20_root, "filelist.txt"),
            "markers": [("raw", "dop20"), ("dop20",)],
        },
    }


def _resolve_monolithic_member(parts: list[str], targets: dict[str, dict]) -> tuple[str, str] | None:
    """Resolve safe ZIP path parts to (dataset_key, output_path), or None if unmatched."""
    lower_parts = [p.lower() for p in parts]
    # Prefer more specific markers first.
    keys = (
        "dop20_jp2",
        "dop20_j2w",
        "dop20_meta",
        "citygml_lod1",
        "citygml_lod2",
        "dgm1",
        "dop20_filelist",
    )
    for key in keys:
        target = targets[key]
        for marker in target["markers"]:
            idx = _find_marker(lower_parts, marker)
            if idx is None:
                continue
            tail = parts[idx + len(marker) :]
            if key == "dop20_filelist":
                if len(tail) == 1 and tail[0].lower() == "filelist.txt":
                    return key, target["dest"]
                continue
            if not tail:
                continue
            return key, os.path.join(target["dest"], *tail)
    return None
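
# Worked example for _resolve_monolithic_member (derived from the tables above):
#   parts = ["archive_raw", "raw", "dop20", "jp2", "tile.jp2"]
#   -> the ("raw", "dop20", "jp2") marker matches at index 1, the tail is
#      ["tile.jp2"], and the result is
#      ("dop20_jp2", os.path.join(cfg.raw.dop20_dir, "tile.jp2")).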


def _find_marker(parts: list[str], marker: tuple[str, ...]) -> int | None:
    """Return the first index where ``marker`` occurs as a contiguous run in ``parts``."""
    width = len(marker)
    if width == 0 or len(parts) < width:
        return None
    for idx in range(0, len(parts) - width + 1):
        if tuple(parts[idx : idx + width]) == marker:
            return idx
    return None
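
# Examples:
#   _find_marker(["raw", "dop20", "jp2", "tile.jp2"], ("dop20", "jp2")) -> 1
#   _find_marker(["raw", "dgm1"], ("dop20",)) -> None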


def _safe_zip_parts(member_name: str) -> list[str] | None:
    """Split a ZIP member name into safe path parts; None if empty or escaping."""
    # Normalize to POSIX separators to make archive parsing deterministic.
    normalized = member_name.replace("\\", "/")
    normalized = normalized.strip("/")
    if not normalized:
        return None
    parts = []
    for part in normalized.split("/"):
        token = part.strip()
        if token in ("", "."):
            continue
        if token == "..":
            return None
        parts.append(token)
    if not parts:
        return None
    return parts
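
# _safe_zip_parts rejects traversal and normalizes separators, e.g.:
#   _safe_zip_parts("a/../b")           -> None (".." would escape the dest)
#   _safe_zip_parts("/raw//dgm1/t.tif") -> ["raw", "dgm1", "t.tif"]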


def _extract_member(zf: zipfile.ZipFile, info: zipfile.ZipInfo, out_path: str) -> None:
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    with zf.open(info, "r") as src, open(out_path, "wb") as dst:
        shutil.copyfileobj(src, dst)


def _unpack_all(archive_dir: str, dest_dir: str, *, report_node: dict) -> bool:
    """Unpack every *.zip in ``archive_dir`` into ``dest_dir``, recording stats."""
    os.makedirs(dest_dir, exist_ok=True)
    zips = sorted(glob.glob(os.path.join(archive_dir, "*.zip")))
    report_node["archive_dir"] = archive_dir
    report_node["dest_dir"] = dest_dir
    report_node["zip_count"] = len(zips)
    report_node["files_extracted"] = 0
    report_node["files_overwritten"] = 0
    report_node["unsafe_entries"] = 0
    report_node["errors"] = []
    for zpath in zips:
        print(f"Unpacking {zpath} -> {dest_dir}")
        try:
            with zipfile.ZipFile(zpath, "r") as zf:
                for info in zf.infolist():
                    if info.is_dir():
                        continue
                    parts = _safe_zip_parts(info.filename)
                    if parts is None:
                        report_node["unsafe_entries"] += 1
                        continue
                    out_path = os.path.join(dest_dir, *parts)
                    if os.path.exists(out_path):
                        report_node["files_overwritten"] += 1
                    _extract_member(zf, info, out_path)
                    report_node["files_extracted"] += 1
        except (OSError, zipfile.BadZipFile, zipfile.LargeZipFile) as exc:
            msg = f"[archive] Could not unpack {zpath}: {exc}"
            report_node["errors"].append(msg)
            print(msg)
            return False
    return True
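
# Note: ZIPs are processed in sorted filename order and members are written
# with plain overwrite, so when two archives ship the same relative path the
# lexicographically later one wins; each collision is counted in
# report_node["files_overwritten"].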


def _copy_filelist(src: str, dest: str, report_node: dict) -> None:
    if not os.path.exists(src):
        report_node["missing_source"] = True
        print(f"[archive] Optional dop20 filelist missing: {src}")
        return
    os.makedirs(os.path.dirname(dest), exist_ok=True)
    try:
        shutil.copy2(src, dest)
        report_node["copied"] = True
        print(f"Copied filelist: {src} -> {dest}")
    except OSError as exc:
        report_node["error"] = str(exc)
        print(f"[archive] Failed to copy filelist {src} -> {dest}: {exc}")


def _count_ext(root: str, suffixes: tuple[str, ...]) -> int:
    if not os.path.isdir(root):
        return 0
    total = 0
    for _cur_root, _, files in os.walk(root):
        for name in files:
            if name.lower().endswith(suffixes):
                total += 1
    return total


def _validate_materialized_raw(cfg: Config, report: dict) -> bool:
    """Count expected raw files per dataset; missing required datasets fail validation."""
    node = report["validation"]
    dop20_root = os.path.dirname(cfg.raw.dop20_dir)
    counts = {
        "dgm1_tif": _count_ext(cfg.raw.dgm1_dir, (".tif", ".tiff")),
        "dop20_jp2": _count_ext(cfg.raw.dop20_dir, (".jp2",)),
        "citygml_lod2": _count_ext(cfg.raw.citygml_lod2_dir, (".gml", ".xml")),
        "citygml_lod1": _count_ext(cfg.raw.citygml_lod1_dir, (".gml", ".xml")),
        "dop20_j2w": _count_ext(os.path.join(dop20_root, "j2w"), (".j2w", ".wld")),
        "dop20_meta": _count_ext(os.path.join(dop20_root, "meta"), (".xml",)),
    }
    node["counts"] = counts
    if counts["dgm1_tif"] == 0:
        node["errors"].append(f"Missing required DGM1 TIFFs in {cfg.raw.dgm1_dir}")
    if counts["dop20_jp2"] == 0:
        node["errors"].append(f"Missing required DOP20 JP2s in {cfg.raw.dop20_dir}")
    if counts["citygml_lod2"] == 0:
        node["errors"].append(f"Missing required CityGML LoD2 files in {cfg.raw.citygml_lod2_dir}")
    if counts["citygml_lod1"] == 0:
        node["warnings"].append(f"No CityGML LoD1 files found in {cfg.raw.citygml_lod1_dir}")
    if counts["dop20_j2w"] == 0:
        node["warnings"].append(f"No DOP20 worldfiles found in {os.path.join(dop20_root, 'j2w')}")
    if counts["dop20_meta"] == 0:
        node["warnings"].append(f"No DOP20 metadata XML files found in {os.path.join(dop20_root, 'meta')}")
    if not os.path.exists(os.path.join(dop20_root, "filelist.txt")):
        node["warnings"].append(f"No dop20 filelist found at {os.path.join(dop20_root, 'filelist.txt')}")
    for msg in node["warnings"]:
        print(f"[archive][validate] Warning: {msg}")
    for msg in node["errors"]:
        print(f"[archive][validate] Error: {msg}")
    node["passed"] = len(node["errors"]) == 0
    if node["passed"]:
        print("[archive][validate] Validation passed.")
    else:
        print("[archive][validate] Validation failed.")
    return bool(node["passed"])


def _write_materialize_report(cfg: Config, report: dict) -> None:
    out_path = os.path.join(cfg.work.work_dir, "archive_materialize_report.json")
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    with open(out_path, "w", encoding="utf-8") as handle:
        json.dump(report, handle, indent=2)
    print(f"[archive] Wrote report: {out_path}")


__all__ = ["ensure_directories", "materialize_archives", "ensure_default_config"]
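
# Usage sketch (assumptions: ensure_default_config() is re-exported from
# .config and is assumed here to take no arguments and return a Config; the
# import path is inferred from the repository layout; check .config for the
# actual signature before relying on this):
#
#     from geodata_pipeline.setup_helpers import (
#         ensure_default_config,
#         ensure_directories,
#         materialize_archives,
#     )
#
#     cfg = ensure_default_config()
#     ensure_directories(cfg)
#     raise SystemExit(materialize_archives(cfg, clean_raw=True, validate=True))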