#!/usr/bin/env python3 """Download configured geodata tiles into raw/ based on a TOML config.""" from __future__ import annotations import argparse import os import shutil import sys import time import queue import threading import zipfile from concurrent.futures import Future, as_completed from dataclasses import dataclass from typing import Dict, Iterable, List, Optional, Tuple from urllib.parse import urlparse try: import tomllib except ImportError: # pragma: no cover - tomllib is required raise SystemExit("tomllib is required (Python 3.11+).") import requests DEFAULT_CONFIG = "geodata_download.toml" DEFAULT_OUTPUT_DIR = "raw" OUTPUT_SUBDIRS = { "dgm1": "dgm1", "dom1": "dom1", "dop20": "dop20", "geb3dlo": os.path.join("citygml", "lod2"), "citygml": os.path.join("citygml", "lod2"), "bdom20rgbi": "bdom20rgbi", "lpg": "lpolpg", "lpo": "lpolpg", "lpolpg": "lpolpg", } FILE_TYPE_SUBDIRS = { "image": "jp2", "worldfile": "j2w", "metadata": "meta", } @dataclass(frozen=True) class DownloadTask: dataset: str url: str output_path: str download_key: str unzip: bool = False unzip_dir: Optional[str] = None class DownloadLogger: def __init__( self, log_file: Optional[str] = None, log_format: Optional[str] = None, report_progress: bool = True, ) -> None: self._log_file = log_file self._log_format = log_format self._report_progress = report_progress def log(self, level: str, message: str) -> None: line = self._format(level, message) if self._report_progress: print(line) if self._log_file: with open(self._log_file, "a", encoding="utf-8") as fh: fh.write(line + "\n") def _format(self, level: str, message: str) -> str: if not self._log_format: return f"[{level}] {message}" timestamp = time.strftime("%Y-%m-%d %H:%M:%S") try: return self._log_format.format(timestamp=timestamp, level=level, message=message) except Exception: return f"[{level}] {message}" class DownloadProgress: def __init__(self, total: int, enabled: bool) -> None: self._total = max(total, 1) self._enabled = enabled self._downloaded = 0 self._missing = 0 self._failed = 0 self._bytes = 0 self._lock = threading.Lock() self._start = time.time() self._last_label = "" self._last_render = 0.0 self._min_interval = 0.25 def add_bytes(self, bytes_delta: int, label: Optional[str] = None) -> None: if not self._enabled or bytes_delta <= 0: return with self._lock: self._bytes += bytes_delta if label: self._last_label = os.path.basename(label) self._render_locked(force=False) def set_counts(self, downloaded: int, missing: int, failed: int, label: Optional[str] = None) -> None: if not self._enabled: return with self._lock: self._downloaded = downloaded self._missing = missing self._failed = failed if label: self._last_label = os.path.basename(label) self._render_locked(force=True) def finish(self) -> None: if not self._enabled: return sys.stderr.write("\n") sys.stderr.flush() def _render_locked(self, force: bool) -> None: now = time.time() if not force and (now - self._last_render) < self._min_interval: return self._last_render = now elapsed = max(now - self._start, 0.001) done = self._downloaded + self._missing + self._failed rate = done / elapsed remaining = max(self._total - done, 0) eta = int(remaining / rate) if rate > 0 else 0 bytes_mb = self._bytes / (1024 * 1024) bytes_gb = bytes_mb / 1024 byte_rate = bytes_mb / elapsed width = 28 filled = int(width * done / self._total) bar = "#" * filled + "-" * (width - filled) line = ( f"\r[{bar}] {done}/{self._total} " f"{rate:.2f}/s eta {eta}s ok={self._downloaded} miss={self._missing} fail={self._failed} " f"{bytes_gb:.1f}GB {byte_rate:.1f}MB/s " f"{self._last_label}" ) sys.stderr.write(line[:200]) sys.stderr.flush() class DaemonThreadPool: """Minimal daemon-thread pool that supports submit() + shutdown().""" def __init__(self, max_workers: int) -> None: if max_workers <= 0: raise ValueError("max_workers must be greater than 0") self._tasks: queue.Queue = queue.Queue() self._threads: list[threading.Thread] = [] self._shutdown = False for idx in range(max_workers): thread = threading.Thread( name=f"download-worker-{idx}", target=self._worker, daemon=True, ) thread.start() self._threads.append(thread) def submit(self, fn, *args, **kwargs) -> Future: if self._shutdown: raise RuntimeError("Thread pool already shutdown") future: Future = Future() self._tasks.put((future, fn, args, kwargs)) return future def shutdown(self, wait: bool = True, cancel_futures: bool = False) -> None: self._shutdown = True if cancel_futures: while True: try: item = self._tasks.get_nowait() except queue.Empty: break if item is None: continue future, _, _, _ = item future.cancel() for _ in self._threads: self._tasks.put(None) if wait: for thread in self._threads: thread.join() def _worker(self) -> None: while True: item = self._tasks.get() if item is None: return future, fn, args, kwargs = item if not future.set_running_or_notify_cancel(): continue try: result = fn(*args, **kwargs) except BaseException as exc: future.set_exception(exc) else: future.set_result(result) def _load_toml(path: str) -> dict: if not os.path.exists(path): raise SystemExit(f"Config not found: {path}") with open(path, "rb") as fh: return tomllib.load(fh) def _parse_download_targets(cfg: dict) -> Tuple[Dict[str, dict], Optional[str]]: download_cfg = cfg.get("download", {}) if not isinstance(download_cfg, dict): raise SystemExit("download must be a table in the TOML config.") has_targets = any(isinstance(value, dict) for value in download_cfg.values()) if has_targets: targets = {name: value for name, value in download_cfg.items() if isinstance(value, dict)} if not targets: raise SystemExit("download targets must be tables, e.g. [download.geobasis].") default_key = download_cfg.get("default_target") or download_cfg.get("default") return targets, default_key return {"default": download_cfg}, "default" def _parse_tile_ranges(cfg: dict) -> Dict[str, dict]: ranges = cfg.get("tile_ranges", {}) if not isinstance(ranges, dict): raise SystemExit("tile_ranges must be a table in the TOML config.") return ranges def _iter_tiles(range_cfg: dict) -> Iterable[Tuple[int, int]]: x_start = int(range_cfg["x_start"]) x_end = int(range_cfg["x_end"]) x_step = int(range_cfg.get("x_step", 1)) y_start = int(range_cfg["y_start"]) y_end = int(range_cfg["y_end"]) y_step = int(range_cfg.get("y_step", 1)) for x in range(x_start, x_end + 1, x_step): for y in range(y_start, y_end + 1, y_step): yield x, y def _override_range(range_cfg: dict, start: Tuple[int, int], end: Tuple[int, int]) -> dict: new_range = dict(range_cfg) new_range["x_start"] = start[0] new_range["y_start"] = start[1] new_range["x_end"] = end[0] new_range["y_end"] = end[1] return new_range def _resolve_output_dir(dataset_key: str, dataset_cfg: dict) -> str: if dataset_key in OUTPUT_SUBDIRS: return OUTPUT_SUBDIRS[dataset_key] return dataset_cfg.get("output_subdir", dataset_key) def _format_url( template: str, base_url: str, x: Optional[int], y: Optional[int], extra: dict, ) -> str: format_vars = {"base_url": base_url} if x is not None: format_vars["x"] = x if y is not None: format_vars["y"] = y for key, value in extra.items(): if isinstance(value, (str, int, float)): format_vars[key] = value return template.format(**format_vars) def _select_download_target( download_targets: Dict[str, dict], default_key: Optional[str], dataset_key: str, dataset_cfg: dict, ) -> Tuple[str, dict]: requested = dataset_cfg.get("download") if requested: if requested not in download_targets: raise SystemExit(f"{dataset_key}: download target not found: {requested}") return requested, download_targets[requested] if default_key: if default_key not in download_targets: raise SystemExit(f"download.default_target not found: {default_key}") return default_key, download_targets[default_key] if len(download_targets) == 1: name = next(iter(download_targets)) return name, download_targets[name] raise SystemExit( f"{dataset_key}: multiple download targets defined; set datasets.{dataset_key}.download " "or download.default_target.", ) def _resolve_url(source_cfg: dict, base_url: str, x: Optional[int], y: Optional[int]) -> str: if "url" in source_cfg: return str(source_cfg["url"]) template = source_cfg.get("url_template") if not template: raise SystemExit("url_template or url must be defined for the dataset.") return _format_url(str(template), base_url, x, y, source_cfg) def _build_tasks( cfg: dict, datasets: Dict[str, dict], tile_ranges: Dict[str, dict], download_targets: Dict[str, dict], default_target: Optional[str], start_override: Optional[Tuple[int, int]], end_override: Optional[Tuple[int, int]], ) -> List[DownloadTask]: tasks: List[DownloadTask] = [] for dataset_key, dataset_cfg in datasets.items(): download_key, download_cfg = _select_download_target( download_targets, default_target, dataset_key, dataset_cfg, ) base_url = download_cfg.get("base_url", "").rstrip("/") base_output_dir = download_cfg.get("output_directory", DEFAULT_OUTPUT_DIR) base_subdir = _resolve_output_dir(dataset_key, dataset_cfg) dataset_out_dir = os.path.join(base_output_dir, base_subdir) unzip = bool(dataset_cfg.get("unzip", False)) unzip_dir = dataset_out_dir tile_range_key = dataset_cfg.get("tile_range") if tile_range_key: if tile_range_key not in tile_ranges: raise SystemExit(f"{dataset_key}: tile_range not found: {tile_range_key}") range_cfg = tile_ranges[tile_range_key] if start_override and end_override: range_cfg = _override_range(range_cfg, start_override, end_override) for x, y in _iter_tiles(range_cfg): if "files" in dataset_cfg: for file_cfg in dataset_cfg["files"]: file_type = file_cfg.get("type", "file") file_subdir = FILE_TYPE_SUBDIRS.get(file_type, file_type) out_dir = os.path.join(dataset_out_dir, file_subdir) url = _resolve_url(file_cfg, base_url, x, y) filename = os.path.basename(urlparse(url).path) tasks.append( DownloadTask( dataset_key, url, os.path.join(out_dir, filename), download_key, unzip, unzip_dir, ) ) else: url = _resolve_url(dataset_cfg, base_url, x, y) filename = os.path.basename(urlparse(url).path) tasks.append( DownloadTask( dataset_key, url, os.path.join(dataset_out_dir, filename), download_key, unzip, unzip_dir, ) ) else: if "files" in dataset_cfg: for file_cfg in dataset_cfg["files"]: file_type = file_cfg.get("type", "file") file_subdir = FILE_TYPE_SUBDIRS.get(file_type, file_type) out_dir = os.path.join(dataset_out_dir, file_subdir) url = _resolve_url(file_cfg, base_url, None, None) filename = os.path.basename(urlparse(url).path) tasks.append( DownloadTask( dataset_key, url, os.path.join(out_dir, filename), download_key, unzip, unzip_dir, ) ) else: url = _resolve_url(dataset_cfg, base_url, None, None) filename = os.path.basename(urlparse(url).path) tasks.append( DownloadTask( dataset_key, url, os.path.join(dataset_out_dir, filename), download_key, unzip, unzip_dir, ) ) return tasks def _select_datasets(cfg: dict, requested: Optional[List[str]]) -> Dict[str, dict]: datasets = cfg.get("datasets", {}) if not isinstance(datasets, dict) or not datasets: raise SystemExit("datasets must be defined in the TOML config.") if requested: missing = [name for name in requested if name not in datasets] if missing: raise SystemExit(f"Unknown dataset(s): {', '.join(missing)}") selected = {name: datasets[name] for name in requested} else: selected = {name: ds for name, ds in datasets.items() if ds.get("enabled", True)} if not selected: raise SystemExit("No datasets selected for download.") return selected def _safe_remove_dir(base_dir: str, rel_dir: str) -> None: target = os.path.abspath(os.path.join(base_dir, rel_dir)) base = os.path.abspath(base_dir) if os.path.commonpath([target, base]) != base: raise SystemExit(f"Refusing to delete outside base dir: {target}") if target == base: raise SystemExit(f"Refusing to delete base dir: {target}") if os.path.exists(target): shutil.rmtree(target) def _unzipped_marker_path(task: DownloadTask) -> str: unzip_dir = task.unzip_dir or os.path.dirname(task.output_path) marker_name = f".{os.path.basename(task.output_path)}.unzipped" return os.path.join(unzip_dir, marker_name) def _needs_unzip(task: DownloadTask) -> bool: if not task.unzip: return False if not os.path.exists(task.output_path): return False return not os.path.exists(_unzipped_marker_path(task)) def _maybe_unzip(task: DownloadTask, logger: DownloadLogger) -> bool: if not task.unzip: return True if not os.path.exists(task.output_path): logger.log("ERROR", f"Unzip failed; missing file: {task.output_path}") return False unzip_dir = task.unzip_dir or os.path.dirname(task.output_path) os.makedirs(unzip_dir, exist_ok=True) try: with zipfile.ZipFile(task.output_path, "r") as zf: zf.extractall(unzip_dir) except (zipfile.BadZipFile, OSError) as exc: logger.log("ERROR", f"Unzip failed: {task.output_path} ({exc})") return False marker = _unzipped_marker_path(task) try: with open(marker, "w", encoding="utf-8") as fh: fh.write(time.strftime("%Y-%m-%d %H:%M:%S")) except OSError as exc: logger.log("WARN", f"Could not write unzip marker: {marker} ({exc})") logger.log("INFO", f"Unzipped: {task.output_path} -> {unzip_dir}") return True def _download_task( session: requests.Session, task: DownloadTask, timeout: int, verify: bool | str, retries: int, stop_event: threading.Event, progress: DownloadProgress, ) -> Tuple[str, DownloadTask, Optional[str]]: os.makedirs(os.path.dirname(task.output_path), exist_ok=True) tmp_path = f"{task.output_path}.part" if stop_event.is_set(): return "aborted", task, "Interrupted" for attempt in range(retries + 1): if stop_event.is_set(): return "aborted", task, "Interrupted" try: with session.get(task.url, stream=True, timeout=timeout, verify=verify) as resp: if resp.status_code in (404, 410): return "missing", task, f"HTTP {resp.status_code}" if resp.status_code >= 400: return "failed", task, f"HTTP {resp.status_code}" resp.raise_for_status() with open(tmp_path, "wb") as fh: for chunk in resp.iter_content(chunk_size=1024 * 1024): if stop_event.is_set(): return "aborted", task, "Interrupted" if chunk: fh.write(chunk) progress.add_bytes(len(chunk), task.output_path) os.replace(tmp_path, task.output_path) return "downloaded", task, None except requests.RequestException as exc: if attempt >= retries: return "failed", task, str(exc) time.sleep(1.0 + attempt * 0.5) except OSError as exc: return "failed", task, str(exc) if os.path.exists(tmp_path): try: os.remove(tmp_path) except OSError: pass return "failed", task, "Unknown error" def run_download( config_path: str, requested_datasets: Optional[List[str]] = None, start_override: Optional[Tuple[int, int]] = None, end_override: Optional[Tuple[int, int]] = None, clean_downloads: bool = False, ca_bundle_override: Optional[str] = None, ) -> int: cfg = _load_toml(config_path) download_targets, default_target = _parse_download_targets(cfg) tile_ranges = _parse_tile_ranges(cfg) datasets = _select_datasets(cfg, requested_datasets) logging_cfg = cfg.get("logging", {}) progress_enabled = bool(logging_cfg.get("report_progress", True)) logger = DownloadLogger( logging_cfg.get("log_file"), logging_cfg.get("log_format"), progress_enabled, ) if len(download_targets) > 1 and not default_target: default_target = next(iter(download_targets)) logger.log( "WARN", "No download.default_target set; using " f"{default_target} as default for datasets without a download key.", ) if clean_downloads: for dataset_key, dataset_cfg in datasets.items(): download_key, download_cfg = _select_download_target( download_targets, default_target, dataset_key, dataset_cfg, ) base_output_dir = download_cfg.get("output_directory", DEFAULT_OUTPUT_DIR) _safe_remove_dir(base_output_dir, _resolve_output_dir(dataset_key, dataset_cfg)) tasks = _build_tasks( cfg, datasets, tile_ranges, download_targets, default_target, start_override, end_override, ) if not tasks: logger.log("INFO", "No download tasks generated.") return 0 tasks_by_target: Dict[str, List[DownloadTask]] = {} for task in tasks: tasks_by_target.setdefault(task.download_key, []).append(task) total_downloaded = 0 total_missing = 0 total_failed = 0 total_skipped = 0 total_unzipped_existing = 0 for target_key, target_cfg in download_targets.items(): target_tasks = tasks_by_target.get(target_key) if not target_tasks: continue target_label = f"[{target_key}] " skip_existing = not clean_downloads pending: List[DownloadTask] = [] unzip_only: List[DownloadTask] = [] skipped = 0 for task in target_tasks: if skip_existing and os.path.exists(task.output_path): if _needs_unzip(task): unzip_only.append(task) else: skipped += 1 continue pending.append(task) if skipped: logger.log("INFO", f"{target_label}Skipped {skipped} existing file(s).") if unzip_only: logger.log("INFO", f"{target_label}Queued {len(unzip_only)} unzip(s) for existing files.") verify_ssl = target_cfg.get("verify_ssl", True) ca_bundle = ca_bundle_override or target_cfg.get("ca_bundle") if verify_ssl and ca_bundle: if os.path.exists(ca_bundle): verify = ca_bundle source = "CLI" if ca_bundle_override else "config" logger.log("INFO", f"{target_label}Using CA bundle ({source}): {ca_bundle}") else: verify = True logger.log("WARN", f"{target_label}CA bundle not found, using system trust: {ca_bundle}") else: verify = bool(verify_ssl) if not verify_ssl: logger.log("WARN", f"{target_label}TLS verification disabled by config.") timeout = int(target_cfg.get("timeout_seconds", 300)) retries = int(target_cfg.get("retry_attempts", 3)) parallel = int(target_cfg.get("parallel_downloads", 4)) user_agent = target_cfg.get("user_agent", "geodata-download/1.0") downloaded = 0 missing = 0 failed = 0 if pending: progress = DownloadProgress(len(pending), progress_enabled) stop_event = threading.Event() interrupted = False with requests.Session() as session: session.headers.update({"User-Agent": user_agent}) executor = DaemonThreadPool(max_workers=parallel) futures = [ executor.submit(_download_task, session, task, timeout, verify, retries, stop_event, progress) for task in pending ] try: for future in as_completed(futures): status, task, detail = future.result() if status == "downloaded": downloaded += 1 if task.unzip and not _maybe_unzip(task, logger): failed += 1 elif status == "missing": missing += 1 logger.log("WARN", f"{target_label}Missing tile: {task.url} ({detail})") elif status == "aborted": failed += 1 else: failed += 1 extra = f" ({detail})" if detail else "" logger.log("ERROR", f"{target_label}Failed: {task.url}{extra}") progress.set_counts(downloaded, missing, failed, task.output_path) except KeyboardInterrupt: interrupted = True stop_event.set() logger.log("WARN", f"{target_label}Interrupted; stopping downloads.") finally: if interrupted: executor.shutdown(wait=False, cancel_futures=True) else: executor.shutdown(wait=True) progress.finish() if interrupted: return 130 else: logger.log("INFO", f"{target_label}Nothing to download.") if unzip_only: for task in unzip_only: if _maybe_unzip(task, logger): total_unzipped_existing += 1 else: failed += 1 total_downloaded += downloaded total_missing += missing total_failed += failed total_skipped += skipped logger.log( "INFO", "Done. " f"Downloaded={total_downloaded}, Missing={total_missing}, Failed={total_failed}, " f"Skipped={total_skipped}, UnzippedExisting={total_unzipped_existing}.", ) return 1 if total_failed else 0 def parse_args(argv: Optional[Iterable[str]] = None) -> argparse.Namespace: parser = argparse.ArgumentParser(description="Download geodata tiles from TOML config.") parser.add_argument( "--config", default=DEFAULT_CONFIG, help="Path to geodata_download.toml.", ) parser.add_argument( "--datasets", help="Comma-separated dataset keys to download (default: enabled datasets).", ) parser.add_argument( "--start", nargs=2, type=int, metavar=("X", "Y"), help="Override tile range start (x y).", ) parser.add_argument( "--end", nargs=2, type=int, metavar=("X", "Y"), help="Override tile range end (x y).", ) parser.add_argument( "--clean-downloads", action="store_true", help="Delete selected dataset folders before downloading.", ) parser.add_argument( "--ca-bundle", help="Path to a CA bundle file to override the config.", ) return parser.parse_args(argv) def main(argv: Optional[Iterable[str]] = None) -> int: args = parse_args(argv) datasets = [name.strip() for name in args.datasets.split(",")] if args.datasets else None start = tuple(args.start) if args.start else None end = tuple(args.end) if args.end else None if (start is None) != (end is None): raise SystemExit("--start and --end must be provided together.") return run_download( config_path=args.config, requested_datasets=datasets, start_override=start, end_override=end, clean_downloads=args.clean_downloads, ca_bundle_override=args.ca_bundle, ) if __name__ == "__main__": sys.exit(main())