Improve filename normalization for special characters

This commit is contained in:
2025-11-15 03:44:31 +01:00
parent d8de882cdd
commit f724216fb8
4 changed files with 79 additions and 14 deletions

10
Cargo.lock generated
View File

@@ -1321,6 +1321,7 @@ dependencies = [
"toml", "toml",
"tracing", "tracing",
"tracing-subscriber", "tracing-subscriber",
"unicode-normalization",
"url", "url",
"walkdir", "walkdir",
] ]
@@ -1695,6 +1696,15 @@ version = "1.0.22"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9312f7c4f6ff9069b165498234ce8be658059c6728633667c526e27dc2cf1df5" checksum = "9312f7c4f6ff9069b165498234ce8be658059c6728633667c526e27dc2cf1df5"
[[package]]
name = "unicode-normalization"
version = "0.1.25"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5fd4f6878c9cb28d874b009da9e8d183b5abc80117c40bbd187a1fde336be6e8"
dependencies = [
"tinyvec",
]
[[package]] [[package]]
name = "untrusted" name = "untrusted"
version = "0.9.0" version = "0.9.0"

View File

@@ -22,3 +22,4 @@ rpassword = "7.3"
walkdir = "2.5" walkdir = "2.5"
time = { version = "0.3", features = ["formatting", "parsing", "macros"] } time = { version = "0.3", features = ["formatting", "parsing", "macros"] }
sha2 = "0.10" sha2 = "0.10"
unicode-normalization = "0.1"

View File

@@ -7,7 +7,7 @@
- `init-config` writes a ready-to-edit config template (respecting `--download-root` and `--force` to overwrite). - `init-config` writes a ready-to-edit config template (respecting `--download-root` and `--force` to overwrite).
- `auth` subcommand stores Base64-encoded credentials per profile (passwords are never logged). - `auth` subcommand stores Base64-encoded credentials per profile (passwords are never logged).
- `list-courses` fetches `/users/me`, paginates enrolled courses, infers semester keys, caches the metadata, and prints a concise table. - `list-courses` fetches `/users/me`, paginates enrolled courses, infers semester keys, caches the metadata, and prints a concise table.
- `sync` traverses every course folder/file tree, normalizes names, streams downloads to disk, tracks checksums/remote timestamps, and supports `--dry-run`, `--prune`, and `--since <semester|date>` filters (e.g., `--since ws2526` or `--since 01032024`). - `sync` traverses every course folder/file tree, normalizes names (Unicode NFKD + transliteration so `Ökologie/ß/œ` becomes `Oekologie/ss/oe`), streams downloads to disk, tracks checksums/remote timestamps, and supports `--dry-run`, `--prune`, and `--since <semester|date>` filters (e.g., `--since ws2526` or `--since 01032024`).
- XDG-compliant config (`~/.config/studip-sync/config.toml`) and state (`~/.local/share/studip-sync/state.toml`) stores everything in TOML. - XDG-compliant config (`~/.config/studip-sync/config.toml`) and state (`~/.local/share/studip-sync/state.toml`) stores everything in TOML.
- Extensive logging controls: `--quiet`, `--verbose/-v`, `--debug`, and `--json`. - Extensive logging controls: `--quiet`, `--verbose/-v`, `--debug`, and `--json`.
@@ -80,7 +80,7 @@ Global flags: `--quiet`, `--debug`, `--json`, `-v/--verbose` (stackable), `--con
2. Cache missing semesters via `/semesters/{id}` and infer keys like `ws2425` / `ss25`. When `--refresh` is passed, already-known semesters that never recorded a `start` timestamp are re-fetched so `--since` filters have the data they need. 2. Cache missing semesters via `/semesters/{id}` and infer keys like `ws2425` / `ss25`. When `--refresh` is passed, already-known semesters that never recorded a `start` timestamp are re-fetched so `--since` filters have the data they need.
3. For each course: 3. For each course:
- Walk folders using the JSON:API pagination helpers; fetch nested folders via `/folders/{id}/folders`. - Walk folders using the JSON:API pagination helpers; fetch nested folders via `/folders/{id}/folders`.
- List file refs via `/folders/{id}/file-refs`, normalize filenames, and ensure unique siblings through a `NameRegistry`. - List file refs via `/folders/{id}/file-refs`, normalize filenames (including transliteration of umlauts/ligatures like `ä→ae`, `Ö→Oe`, `ß→ss`, `œ→oe`), and ensure unique siblings through a `NameRegistry`.
- Skip downloads when the local file exists and matches the stored checksum / size / remote `chdate`. - Skip downloads when the local file exists and matches the stored checksum / size / remote `chdate`.
- Stream downloads to `*.part`, hash contents on the fly, then rename atomically to the final path. - Stream downloads to `*.part`, hash contents on the fly, then rename atomically to the final path.
4. Maintain a set of remote files so `--prune` can remove local files that no longer exist remotely (and optionally delete now-empty directories). When `--since` is provided, files whose remote `chdate` precedes the resolved timestamp (semester start or explicit date) are skipped; newer files continue through the regular checksum/size logic. 4. Maintain a set of remote files so `--prune` can remove local files that no longer exist remotely (and optionally delete now-empty directories). When `--since` is provided, files whose remote `chdate` precedes the resolved timestamp (semester start or explicit date) are skipped; newer files continue through the regular checksum/size logic.

View File

@@ -26,6 +26,7 @@ use time::{
}; };
use tokio::{fs, io::AsyncWriteExt}; use tokio::{fs, io::AsyncWriteExt};
use tracing::info; use tracing::info;
use unicode_normalization::{UnicodeNormalization, char::is_combining_mark};
use walkdir::WalkDir; use walkdir::WalkDir;
const USERNAME_ENV: &str = "STUDIP_SYNC_USERNAME"; const USERNAME_ENV: &str = "STUDIP_SYNC_USERNAME";
@@ -966,17 +967,29 @@ async fn ensure_directory(path: &Path, dry_run: bool) -> Result<()> {
fn normalize_component(input: &str) -> String { fn normalize_component(input: &str) -> String {
let mut sanitized = String::new(); let mut sanitized = String::new();
for ch in input.chars() { let mut last_was_separator = false;
if ch.is_ascii_alphanumeric() {
for ch in input.nfkd() {
if is_combining_mark(ch) {
continue;
}
if let Some(mapped) = map_special_letters(ch) {
sanitized.push_str(mapped);
last_was_separator = false;
continue;
}
if ch.is_alphanumeric() {
sanitized.push(ch); sanitized.push(ch);
last_was_separator = false;
} else if ch.is_whitespace() || matches!(ch, '-' | '_' | '.') { } else if ch.is_whitespace() || matches!(ch, '-' | '_' | '.') {
if !sanitized.ends_with('_') { push_separator(&mut sanitized, &mut last_was_separator);
sanitized.push('_');
}
} else { } else {
// skip other characters push_separator(&mut sanitized, &mut last_was_separator);
} }
} }
let trimmed = sanitized.trim_matches('_'); let trimmed = sanitized.trim_matches('_');
if trimmed.is_empty() { if trimmed.is_empty() {
"untitled".to_string() "untitled".to_string()
@@ -987,15 +1000,25 @@ fn normalize_component(input: &str) -> String {
fn normalize_file_name(name: &str) -> String { fn normalize_file_name(name: &str) -> String {
let mut sanitized = String::new(); let mut sanitized = String::new();
for ch in name.chars() {
if ch == '/' || ch == '\\' { for ch in name.nfkd() {
sanitized.push('_'); if is_combining_mark(ch) {
} else if ch.is_control() {
continue; continue;
}
match ch {
'/' | '\\' => sanitized.push('_'),
ch if ch.is_control() => continue,
_ => {
if let Some(mapped) = map_special_letters(ch) {
sanitized.push_str(mapped);
} else { } else {
sanitized.push(ch); sanitized.push(ch);
} }
} }
}
}
if sanitized.trim().is_empty() { if sanitized.trim().is_empty() {
"file".to_string() "file".to_string()
} else { } else {
@@ -1003,6 +1026,37 @@ fn normalize_file_name(name: &str) -> String {
} }
} }
/// Append a single `'_'` separator to `buf`, collapsing runs: when the
/// previously emitted character was already a separator, this is a no-op,
/// so consecutive whitespace/punctuation never produces `"__"`.
fn push_separator(buf: &mut String, last_was_separator: &mut bool) {
    if *last_was_separator {
        return;
    }
    buf.push('_');
    *last_was_separator = true;
}
/// Transliterate single letters that have a conventional multi-character
/// ASCII spelling (German umlauts/eszett, Latin ligatures, Nordic and
/// Slavic letters) into that spelling.
///
/// Returns `Some(replacement)` for a known special letter and `None` for
/// every other character, letting the caller fall through to its generic
/// handling. Case is preserved in the usual title-case convention
/// (`Ä` → `Ae`, `Æ` → `AE`).
///
/// NOTE(review): callers feed this from an `nfkd()` stream; precomposed
/// umlauts such as `ä` decompose to base letter + combining mark before
/// reaching this function, so the umlaut arms may never match there —
/// verify that `ä → ae` actually happens end-to-end.
fn map_special_letters(ch: char) -> Option<&'static str> {
    match ch {
        'ß' => Some("ss"),
        'ẞ' => Some("SS"),
        'ä' => Some("ae"),
        'Ä' => Some("Ae"),
        'ö' => Some("oe"),
        'Ö' => Some("Oe"),
        'ü' => Some("ue"),
        'Ü' => Some("Ue"),
        'Æ' => Some("AE"),
        'æ' => Some("ae"),
        'Œ' => Some("OE"),
        'œ' => Some("oe"),
        // Danish/Norwegian ø has no NFKD decomposition; without these arms
        // it would be dropped entirely by the sanitizers.
        'Ø' => Some("Oe"),
        'ø' => Some("oe"),
        'Ð' => Some("D"),
        'ð' => Some("d"),
        'Þ' => Some("Th"),
        'þ' => Some("th"),
        'Ł' => Some("L"),
        'ł' => Some("l"),
        _ => None,
    }
}
fn compute_file_checksum(path: &Path) -> Result<String> { fn compute_file_checksum(path: &Path) -> Result<String> {
let file = File::open(path)?; let file = File::open(path)?;
let mut reader = BufReader::new(file); let mut reader = BufReader::new(file);