#!/usr/bin/env python3
"""Download configured geodata tiles into raw/ based on a TOML config."""

from __future__ import annotations

import argparse
import os
import queue
import shutil
import sys
import threading
import time
import zipfile
from concurrent.futures import Future, as_completed
from dataclasses import dataclass
from typing import Dict, Iterable, List, Optional, Tuple
from urllib.parse import urlparse

try:
    import tomllib
except ImportError:  # pragma: no cover - tomllib is required
    raise SystemExit("tomllib is required (Python 3.11+).")

import requests

DEFAULT_CONFIG = "geodata_download.toml"
DEFAULT_OUTPUT_DIR = "raw"

# Maps dataset keys to their subdirectory below the output directory.
OUTPUT_SUBDIRS = {
    "dgm1": "dgm1",
    "dom1": "dom1",
    "dop20": "dop20",
    "geb3dlo": os.path.join("citygml", "lod2"),
    "citygml": os.path.join("citygml", "lod2"),
    "bdom20rgbi": "bdom20rgbi",
    "lpg": "lpolpg",
    "lpo": "lpolpg",
    "lpolpg": "lpolpg",
}

# Maps per-file "type" values to subdirectories within a dataset folder.
FILE_TYPE_SUBDIRS = {
    "image": "jp2",
    "worldfile": "j2w",
    "metadata": "meta",
}


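# Illustrative sketch of the TOML layout this script consumes; the target name,
# dataset key, URL, and tile numbers below are placeholders, not real endpoints:
#
#   [download.geobasis]
#   base_url = "https://example.org/tiles"
#   output_directory = "raw"
#   parallel_downloads = 4
#   retry_attempts = 3
#
#   [tile_ranges.city]
#   x_start = 350
#   x_end = 360
#   y_start = 5600
#   y_end = 5610
#
#   [datasets.dgm1]
#   tile_range = "city"
#   url_template = "{base_url}/dgm1_{x}_{y}.xyz.gz"

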
@dataclass(frozen=True)
class DownloadTask:
    """One file to fetch: where from, where to, and whether to unzip it."""

    dataset: str
    url: str
    output_path: str
    download_key: str
    unzip: bool = False
    unzip_dir: Optional[str] = None


class DownloadLogger:
    """Writes log lines to stdout and, optionally, to a log file."""

    def __init__(
        self,
        log_file: Optional[str] = None,
        log_format: Optional[str] = None,
        report_progress: bool = True,
    ) -> None:
        self._log_file = log_file
        self._log_format = log_format
        self._report_progress = report_progress

    def log(self, level: str, message: str) -> None:
        line = self._format(level, message)
        if self._report_progress:
            print(line)
        if self._log_file:
            with open(self._log_file, "a", encoding="utf-8") as fh:
                fh.write(line + "\n")

    def _format(self, level: str, message: str) -> str:
        if not self._log_format:
            return f"[{level}] {message}"
        timestamp = time.strftime("%Y-%m-%d %H:%M:%S")
        try:
            return self._log_format.format(timestamp=timestamp, level=level, message=message)
        except Exception:
            # Fall back to the plain format if the configured template is malformed.
            return f"[{level}] {message}"


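# A hypothetical log_format value such as "{timestamp} [{level}] {message}"
# would render a line like: 2024-01-01 12:00:00 [INFO] Skipped 3 existing file(s).

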
class DownloadProgress:
    """Thread-safe, throttled progress bar rendered to stderr."""

    def __init__(self, total: int, enabled: bool) -> None:
        self._total = max(total, 1)
        self._enabled = enabled
        self._downloaded = 0
        self._missing = 0
        self._failed = 0
        self._bytes = 0
        self._lock = threading.Lock()
        self._start = time.time()
        self._last_label = ""
        self._last_render = 0.0
        self._min_interval = 0.25  # seconds between repaints, to avoid flooding stderr

    def add_bytes(self, bytes_delta: int, label: Optional[str] = None) -> None:
        if not self._enabled or bytes_delta <= 0:
            return
        with self._lock:
            self._bytes += bytes_delta
            if label:
                self._last_label = os.path.basename(label)
            self._render_locked(force=False)

    def set_counts(self, downloaded: int, missing: int, failed: int, label: Optional[str] = None) -> None:
        if not self._enabled:
            return
        with self._lock:
            self._downloaded = downloaded
            self._missing = missing
            self._failed = failed
            if label:
                self._last_label = os.path.basename(label)
            self._render_locked(force=True)

    def finish(self) -> None:
        if not self._enabled:
            return
        sys.stderr.write("\n")
        sys.stderr.flush()

    def _render_locked(self, force: bool) -> None:
        # Caller must hold self._lock.
        now = time.time()
        if not force and (now - self._last_render) < self._min_interval:
            return
        self._last_render = now
        elapsed = max(now - self._start, 0.001)
        done = self._downloaded + self._missing + self._failed
        rate = done / elapsed
        remaining = max(self._total - done, 0)
        eta = int(remaining / rate) if rate > 0 else 0
        bytes_mb = self._bytes / (1024 * 1024)
        bytes_gb = bytes_mb / 1024
        byte_rate = bytes_mb / elapsed
        width = 28
        filled = int(width * done / self._total)
        bar = "#" * filled + "-" * (width - filled)
        line = (
            f"\r[{bar}] {done}/{self._total} "
            f"{rate:.2f}/s eta {eta}s ok={self._downloaded} miss={self._missing} fail={self._failed} "
            f"{bytes_gb:.1f}GB {byte_rate:.1f}MB/s "
            f"{self._last_label}"
        )
        sys.stderr.write(line[:200])  # clamp so long filenames cannot wrap the bar
        sys.stderr.flush()


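# A rendered progress line looks roughly like this (values illustrative):
#   [##############--------------] 50/100 2.50/s eta 20s ok=48 miss=1 fail=1 1.2GB 30.0MB/s dgm1_350_5600.xyz.gz

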
class DaemonThreadPool:
    """Minimal daemon-thread pool that supports submit() + shutdown()."""

    def __init__(self, max_workers: int) -> None:
        if max_workers <= 0:
            raise ValueError("max_workers must be greater than 0")
        self._tasks: queue.Queue = queue.Queue()
        self._threads: list[threading.Thread] = []
        self._shutdown = False
        for idx in range(max_workers):
            thread = threading.Thread(
                name=f"download-worker-{idx}",
                target=self._worker,
                daemon=True,
            )
            thread.start()
            self._threads.append(thread)

    def submit(self, fn, *args, **kwargs) -> Future:
        if self._shutdown:
            raise RuntimeError("Thread pool already shut down")
        future: Future = Future()
        self._tasks.put((future, fn, args, kwargs))
        return future

    def shutdown(self, wait: bool = True, cancel_futures: bool = False) -> None:
        self._shutdown = True
        if cancel_futures:
            # Drain queued work and cancel futures that have not started yet.
            while True:
                try:
                    item = self._tasks.get_nowait()
                except queue.Empty:
                    break
                if item is None:
                    continue
                future, _, _, _ = item
                future.cancel()
        # One sentinel per worker thread tells each one to exit.
        for _ in self._threads:
            self._tasks.put(None)
        if wait:
            for thread in self._threads:
                thread.join()

    def _worker(self) -> None:
        while True:
            item = self._tasks.get()
            if item is None:
                return
            future, fn, args, kwargs = item
            if not future.set_running_or_notify_cancel():
                continue
            try:
                result = fn(*args, **kwargs)
            except BaseException as exc:  # propagate any error through the future
                future.set_exception(exc)
            else:
                future.set_result(result)


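# Minimal usage sketch for DaemonThreadPool (values illustrative):
#
#   pool = DaemonThreadPool(max_workers=2)
#   future = pool.submit(pow, 2, 10)
#   assert future.result() == 1024
#   pool.shutdown(wait=True)

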
def _load_toml(path: str) -> dict:
    if not os.path.exists(path):
        raise SystemExit(f"Config not found: {path}")
    with open(path, "rb") as fh:
        return tomllib.load(fh)


def _parse_download_targets(cfg: dict) -> Tuple[Dict[str, dict], Optional[str]]:
    download_cfg = cfg.get("download", {})
    if not isinstance(download_cfg, dict):
        raise SystemExit("download must be a table in the TOML config.")
    has_targets = any(isinstance(value, dict) for value in download_cfg.values())
    if has_targets:
        targets = {name: value for name, value in download_cfg.items() if isinstance(value, dict)}
        if not targets:
            raise SystemExit("download targets must be tables, e.g. [download.geobasis].")
        default_key = download_cfg.get("default_target") or download_cfg.get("default")
        return targets, default_key
    # Flat [download] table without sub-tables: treat it as a single target.
    return {"default": download_cfg}, "default"


def _parse_tile_ranges(cfg: dict) -> Dict[str, dict]:
    ranges = cfg.get("tile_ranges", {})
    if not isinstance(ranges, dict):
        raise SystemExit("tile_ranges must be a table in the TOML config.")
    return ranges


def _iter_tiles(range_cfg: dict) -> Iterable[Tuple[int, int]]:
    """Yield (x, y) tile coordinates over the configured inclusive ranges."""
    x_start = int(range_cfg["x_start"])
    x_end = int(range_cfg["x_end"])
    x_step = int(range_cfg.get("x_step", 1))
    y_start = int(range_cfg["y_start"])
    y_end = int(range_cfg["y_end"])
    y_step = int(range_cfg.get("y_step", 1))

    for x in range(x_start, x_end + 1, x_step):
        for y in range(y_start, y_end + 1, y_step):
            yield x, y


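# For example, {"x_start": 1, "x_end": 2, "y_start": 5, "y_end": 6} yields
# (1, 5), (1, 6), (2, 5), (2, 6): both ends are inclusive, steps default to 1.

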
def _override_range(range_cfg: dict, start: Tuple[int, int], end: Tuple[int, int]) -> dict:
    new_range = dict(range_cfg)
    new_range["x_start"] = start[0]
    new_range["y_start"] = start[1]
    new_range["x_end"] = end[0]
    new_range["y_end"] = end[1]
    return new_range


def _resolve_output_dir(dataset_key: str, dataset_cfg: dict) -> str:
    if dataset_key in OUTPUT_SUBDIRS:
        return OUTPUT_SUBDIRS[dataset_key]
    return dataset_cfg.get("output_subdir", dataset_key)


def _format_url(
    template: str,
    base_url: str,
    x: Optional[int],
    y: Optional[int],
    extra: dict,
) -> str:
    format_vars = {"base_url": base_url}
    if x is not None:
        format_vars["x"] = x
    if y is not None:
        format_vars["y"] = y
    # Scalar config values may also be referenced from the template.
    for key, value in extra.items():
        if isinstance(value, (str, int, float)):
            format_vars[key] = value
    return template.format(**format_vars)


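# For example, the template "{base_url}/dgm1_{x}_{y}.xyz.gz" with the placeholder
# base_url "https://example.org/tiles", x=350, and y=5600 resolves to
# "https://example.org/tiles/dgm1_350_5600.xyz.gz".

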
def _select_download_target(
    download_targets: Dict[str, dict],
    default_key: Optional[str],
    dataset_key: str,
    dataset_cfg: dict,
) -> Tuple[str, dict]:
    requested = dataset_cfg.get("download")
    if requested:
        if requested not in download_targets:
            raise SystemExit(f"{dataset_key}: download target not found: {requested}")
        return requested, download_targets[requested]
    if default_key:
        if default_key not in download_targets:
            raise SystemExit(f"download.default_target not found: {default_key}")
        return default_key, download_targets[default_key]
    if len(download_targets) == 1:
        name = next(iter(download_targets))
        return name, download_targets[name]
    raise SystemExit(
        f"{dataset_key}: multiple download targets defined; set datasets.{dataset_key}.download "
        "or download.default_target."
    )


def _resolve_url(source_cfg: dict, base_url: str, x: Optional[int], y: Optional[int]) -> str:
    # A literal "url" wins over a "url_template".
    if "url" in source_cfg:
        return str(source_cfg["url"])
    template = source_cfg.get("url_template")
    if not template:
        raise SystemExit("url_template or url must be defined for the dataset.")
    return _format_url(str(template), base_url, x, y, source_cfg)


def _build_tasks(
    cfg: dict,
    datasets: Dict[str, dict],
    tile_ranges: Dict[str, dict],
    download_targets: Dict[str, dict],
    default_target: Optional[str],
    start_override: Optional[Tuple[int, int]],
    end_override: Optional[Tuple[int, int]],
) -> List[DownloadTask]:
    tasks: List[DownloadTask] = []

    for dataset_key, dataset_cfg in datasets.items():
        download_key, download_cfg = _select_download_target(
            download_targets,
            default_target,
            dataset_key,
            dataset_cfg,
        )
        base_url = download_cfg.get("base_url", "").rstrip("/")
        base_output_dir = download_cfg.get("output_directory", DEFAULT_OUTPUT_DIR)
        base_subdir = _resolve_output_dir(dataset_key, dataset_cfg)
        dataset_out_dir = os.path.join(base_output_dir, base_subdir)
        unzip = bool(dataset_cfg.get("unzip", False))
        unzip_dir = dataset_out_dir

        # Tiled datasets expand to one task per (x, y); untiled datasets get a
        # single task with x and y left unset.
        coords: Iterable[Tuple[Optional[int], Optional[int]]]
        tile_range_key = dataset_cfg.get("tile_range")
        if tile_range_key:
            if tile_range_key not in tile_ranges:
                raise SystemExit(f"{dataset_key}: tile_range not found: {tile_range_key}")
            range_cfg = tile_ranges[tile_range_key]
            if start_override and end_override:
                range_cfg = _override_range(range_cfg, start_override, end_override)
            coords = _iter_tiles(range_cfg)
        else:
            coords = [(None, None)]

        for x, y in coords:
            if "files" in dataset_cfg:
                # Multi-file datasets sort each file type into its own subdirectory.
                for file_cfg in dataset_cfg["files"]:
                    file_type = file_cfg.get("type", "file")
                    file_subdir = FILE_TYPE_SUBDIRS.get(file_type, file_type)
                    out_dir = os.path.join(dataset_out_dir, file_subdir)
                    url = _resolve_url(file_cfg, base_url, x, y)
                    filename = os.path.basename(urlparse(url).path)
                    tasks.append(
                        DownloadTask(
                            dataset_key,
                            url,
                            os.path.join(out_dir, filename),
                            download_key,
                            unzip,
                            unzip_dir,
                        )
                    )
            else:
                url = _resolve_url(dataset_cfg, base_url, x, y)
                filename = os.path.basename(urlparse(url).path)
                tasks.append(
                    DownloadTask(
                        dataset_key,
                        url,
                        os.path.join(dataset_out_dir, filename),
                        download_key,
                        unzip,
                        unzip_dir,
                    )
                )

    return tasks


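# Resulting path layout, using the "dop20" dataset as an example: files of type
# "image" land in raw/dop20/jp2/<filename>, worldfiles in raw/dop20/j2w, and
# metadata in raw/dop20/meta (per OUTPUT_SUBDIRS and FILE_TYPE_SUBDIRS above).

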
def _select_datasets(cfg: dict, requested: Optional[List[str]]) -> Dict[str, dict]:
    datasets = cfg.get("datasets", {})
    if not isinstance(datasets, dict) or not datasets:
        raise SystemExit("datasets must be defined in the TOML config.")

    if requested:
        missing = [name for name in requested if name not in datasets]
        if missing:
            raise SystemExit(f"Unknown dataset(s): {', '.join(missing)}")
        selected = {name: datasets[name] for name in requested}
    else:
        selected = {name: ds for name, ds in datasets.items() if ds.get("enabled", True)}

    if not selected:
        raise SystemExit("No datasets selected for download.")
    return selected


def _safe_remove_dir(base_dir: str, rel_dir: str) -> None:
    """Delete base_dir/rel_dir, refusing anything outside (or equal to) base_dir."""
    target = os.path.abspath(os.path.join(base_dir, rel_dir))
    base = os.path.abspath(base_dir)
    if os.path.commonpath([target, base]) != base:
        raise SystemExit(f"Refusing to delete outside base dir: {target}")
    if target == base:
        raise SystemExit(f"Refusing to delete base dir: {target}")
    if os.path.exists(target):
        shutil.rmtree(target)


def _unzipped_marker_path(task: DownloadTask) -> str:
    # Hidden marker file recording that this archive was already extracted.
    unzip_dir = task.unzip_dir or os.path.dirname(task.output_path)
    marker_name = f".{os.path.basename(task.output_path)}.unzipped"
    return os.path.join(unzip_dir, marker_name)


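# Example: once raw/citygml/lod2/tile.zip is extracted, the marker is written to
# raw/citygml/lod2/.tile.zip.unzipped and later runs skip re-extraction.

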
def _needs_unzip(task: DownloadTask) -> bool:
    if not task.unzip:
        return False
    if not os.path.exists(task.output_path):
        return False
    return not os.path.exists(_unzipped_marker_path(task))


def _maybe_unzip(task: DownloadTask, logger: DownloadLogger) -> bool:
    if not task.unzip:
        return True
    if not os.path.exists(task.output_path):
        logger.log("ERROR", f"Unzip failed; missing file: {task.output_path}")
        return False
    unzip_dir = task.unzip_dir or os.path.dirname(task.output_path)
    os.makedirs(unzip_dir, exist_ok=True)
    try:
        with zipfile.ZipFile(task.output_path, "r") as zf:
            zf.extractall(unzip_dir)
    except (zipfile.BadZipFile, OSError) as exc:
        logger.log("ERROR", f"Unzip failed: {task.output_path} ({exc})")
        return False
    # Write the marker so the next run skips re-extracting this archive.
    marker = _unzipped_marker_path(task)
    try:
        with open(marker, "w", encoding="utf-8") as fh:
            fh.write(time.strftime("%Y-%m-%d %H:%M:%S"))
    except OSError as exc:
        logger.log("WARN", f"Could not write unzip marker: {marker} ({exc})")
    logger.log("INFO", f"Unzipped: {task.output_path} -> {unzip_dir}")
    return True


def _download_task(
    session: requests.Session,
    task: DownloadTask,
    timeout: int,
    verify: bool | str,
    retries: int,
    stop_event: threading.Event,
    progress: DownloadProgress,
) -> Tuple[str, DownloadTask, Optional[str]]:
    os.makedirs(os.path.dirname(task.output_path), exist_ok=True)
    tmp_path = f"{task.output_path}.part"

    def _cleanup_tmp() -> None:
        # Drop a stale partial file so aborted runs do not leave .part debris behind.
        if os.path.exists(tmp_path):
            try:
                os.remove(tmp_path)
            except OSError:
                pass

    if stop_event.is_set():
        return "aborted", task, "Interrupted"

    for attempt in range(retries + 1):
        if stop_event.is_set():
            return "aborted", task, "Interrupted"
        try:
            with session.get(task.url, stream=True, timeout=timeout, verify=verify) as resp:
                if resp.status_code in (404, 410):
                    return "missing", task, f"HTTP {resp.status_code}"
                if resp.status_code >= 400:
                    return "failed", task, f"HTTP {resp.status_code}"
                aborted = False
                with open(tmp_path, "wb") as fh:
                    for chunk in resp.iter_content(chunk_size=1024 * 1024):
                        if stop_event.is_set():
                            aborted = True
                            break
                        if chunk:
                            fh.write(chunk)
                            progress.add_bytes(len(chunk), task.output_path)
                if aborted:
                    _cleanup_tmp()
                    return "aborted", task, "Interrupted"
                # Atomic rename: readers never observe a half-written file.
                os.replace(tmp_path, task.output_path)
                return "downloaded", task, None
        except requests.RequestException as exc:
            if attempt >= retries:
                _cleanup_tmp()
                return "failed", task, str(exc)
            time.sleep(1.0 + attempt * 0.5)  # linear backoff before the next attempt
        except OSError as exc:
            _cleanup_tmp()
            return "failed", task, str(exc)

    _cleanup_tmp()
    return "failed", task, "Unknown error"


def run_download(
    config_path: str,
    requested_datasets: Optional[List[str]] = None,
    start_override: Optional[Tuple[int, int]] = None,
    end_override: Optional[Tuple[int, int]] = None,
    clean_downloads: bool = False,
    ca_bundle_override: Optional[str] = None,
) -> int:
    cfg = _load_toml(config_path)
    download_targets, default_target = _parse_download_targets(cfg)
    tile_ranges = _parse_tile_ranges(cfg)
    datasets = _select_datasets(cfg, requested_datasets)

    logging_cfg = cfg.get("logging", {})
    progress_enabled = bool(logging_cfg.get("report_progress", True))
    logger = DownloadLogger(
        logging_cfg.get("log_file"),
        logging_cfg.get("log_format"),
        progress_enabled,
    )

    if len(download_targets) > 1 and not default_target:
        default_target = next(iter(download_targets))
        logger.log(
            "WARN",
            "No download.default_target set; using "
            f"{default_target} as default for datasets without a download key.",
        )

    if clean_downloads:
        for dataset_key, dataset_cfg in datasets.items():
            download_key, download_cfg = _select_download_target(
                download_targets,
                default_target,
                dataset_key,
                dataset_cfg,
            )
            base_output_dir = download_cfg.get("output_directory", DEFAULT_OUTPUT_DIR)
            _safe_remove_dir(base_output_dir, _resolve_output_dir(dataset_key, dataset_cfg))

    tasks = _build_tasks(
        cfg,
        datasets,
        tile_ranges,
        download_targets,
        default_target,
        start_override,
        end_override,
    )

    if not tasks:
        logger.log("INFO", "No download tasks generated.")
        return 0

    # Group tasks by download target so each target gets its own session settings.
    tasks_by_target: Dict[str, List[DownloadTask]] = {}
    for task in tasks:
        tasks_by_target.setdefault(task.download_key, []).append(task)

    total_downloaded = 0
    total_missing = 0
    total_failed = 0
    total_skipped = 0
    total_unzipped_existing = 0

    for target_key, target_cfg in download_targets.items():
        target_tasks = tasks_by_target.get(target_key)
        if not target_tasks:
            continue

        target_label = f"[{target_key}] "
        skip_existing = not clean_downloads
        pending: List[DownloadTask] = []
        unzip_only: List[DownloadTask] = []
        skipped = 0

        for task in target_tasks:
            if skip_existing and os.path.exists(task.output_path):
                if _needs_unzip(task):
                    unzip_only.append(task)
                else:
                    skipped += 1
                continue
            pending.append(task)

        if skipped:
            logger.log("INFO", f"{target_label}Skipped {skipped} existing file(s).")
        if unzip_only:
            logger.log("INFO", f"{target_label}Queued {len(unzip_only)} unzip(s) for existing files.")

        # Resolve TLS verification: a CA bundle (CLI override wins) when it exists,
        # otherwise system trust, or no verification when disabled in the config.
        verify_ssl = target_cfg.get("verify_ssl", True)
        ca_bundle = ca_bundle_override or target_cfg.get("ca_bundle")
        if verify_ssl and ca_bundle:
            if os.path.exists(ca_bundle):
                verify = ca_bundle
                source = "CLI" if ca_bundle_override else "config"
                logger.log("INFO", f"{target_label}Using CA bundle ({source}): {ca_bundle}")
            else:
                verify = True
                logger.log("WARN", f"{target_label}CA bundle not found, using system trust: {ca_bundle}")
        else:
            verify = bool(verify_ssl)
            if not verify_ssl:
                logger.log("WARN", f"{target_label}TLS verification disabled by config.")

        timeout = int(target_cfg.get("timeout_seconds", 300))
        retries = int(target_cfg.get("retry_attempts", 3))
        parallel = int(target_cfg.get("parallel_downloads", 4))
        user_agent = target_cfg.get("user_agent", "geodata-download/1.0")

        downloaded = 0
        missing = 0
        failed = 0

        if pending:
            progress = DownloadProgress(len(pending), progress_enabled)
            stop_event = threading.Event()
            interrupted = False

            with requests.Session() as session:
                session.headers.update({"User-Agent": user_agent})
                executor = DaemonThreadPool(max_workers=parallel)
                futures = [
                    executor.submit(_download_task, session, task, timeout, verify, retries, stop_event, progress)
                    for task in pending
                ]
                try:
                    for future in as_completed(futures):
                        status, task, detail = future.result()
                        if status == "downloaded":
                            downloaded += 1
                            if task.unzip and not _maybe_unzip(task, logger):
                                failed += 1
                        elif status == "missing":
                            missing += 1
                            logger.log("WARN", f"{target_label}Missing tile: {task.url} ({detail})")
                        elif status == "aborted":
                            failed += 1
                        else:
                            failed += 1
                            extra = f" ({detail})" if detail else ""
                            logger.log("ERROR", f"{target_label}Failed: {task.url}{extra}")
                        progress.set_counts(downloaded, missing, failed, task.output_path)
                except KeyboardInterrupt:
                    interrupted = True
                    stop_event.set()
                    logger.log("WARN", f"{target_label}Interrupted; stopping downloads.")
                finally:
                    if interrupted:
                        executor.shutdown(wait=False, cancel_futures=True)
                    else:
                        executor.shutdown(wait=True)

            progress.finish()
            if interrupted:
                return 130  # conventional exit code for SIGINT
        else:
            logger.log("INFO", f"{target_label}Nothing to download.")

        for task in unzip_only:
            if _maybe_unzip(task, logger):
                total_unzipped_existing += 1
            else:
                failed += 1

        total_downloaded += downloaded
        total_missing += missing
        total_failed += failed
        total_skipped += skipped

    logger.log(
        "INFO",
        "Done. "
        f"Downloaded={total_downloaded}, Missing={total_missing}, Failed={total_failed}, "
        f"Skipped={total_skipped}, UnzippedExisting={total_unzipped_existing}.",
    )
    return 1 if total_failed else 0


def parse_args(argv: Optional[Iterable[str]] = None) -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="Download geodata tiles from TOML config.")
    parser.add_argument(
        "--config",
        default=DEFAULT_CONFIG,
        help="Path to geodata_download.toml.",
    )
    parser.add_argument(
        "--datasets",
        help="Comma-separated dataset keys to download (default: enabled datasets).",
    )
    parser.add_argument(
        "--start",
        nargs=2,
        type=int,
        metavar=("X", "Y"),
        help="Override tile range start (x y).",
    )
    parser.add_argument(
        "--end",
        nargs=2,
        type=int,
        metavar=("X", "Y"),
        help="Override tile range end (x y).",
    )
    parser.add_argument(
        "--clean-downloads",
        action="store_true",
        help="Delete selected dataset folders before downloading.",
    )
    parser.add_argument(
        "--ca-bundle",
        help="Path to a CA bundle file to override the config.",
    )
    return parser.parse_args(argv)


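# Example invocations (dataset keys and tile numbers are illustrative; the
# script filename is a placeholder):
#
#   python geodata_download.py
#   python geodata_download.py --datasets dgm1,dop20
#   python geodata_download.py --start 350 5600 --end 360 5610 --clean-downloads

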
def main(argv: Optional[Iterable[str]] = None) -> int:
    args = parse_args(argv)
    # Filter out empty names so a trailing comma (e.g. "dgm1,") is harmless.
    datasets = [name.strip() for name in args.datasets.split(",") if name.strip()] if args.datasets else None
    start = tuple(args.start) if args.start else None
    end = tuple(args.end) if args.end else None
    if (start is None) != (end is None):
        raise SystemExit("--start and --end must be provided together.")

    return run_download(
        config_path=args.config,
        requested_datasets=datasets,
        start_override=start,
        end_override=end,
        clean_downloads=args.clean_downloads,
        ca_bundle_override=args.ca_bundle,
    )


if __name__ == "__main__":
    sys.exit(main())