From f724216fb865efb872fef0878ad97ec4d909cc23 Mon Sep 17 00:00:00 2001 From: Matthias Puchstein Date: Sat, 15 Nov 2025 03:44:31 +0100 Subject: [PATCH] Improve filename normalization for special characters --- Cargo.lock | 10 +++++++ Cargo.toml | 1 + README.md | 4 +-- src/cli.rs | 78 +++++++++++++++++++++++++++++++++++++++++++++--------- 4 files changed, 79 insertions(+), 14 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c7d8b49..635cf69 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1321,6 +1321,7 @@ dependencies = [ "toml", "tracing", "tracing-subscriber", + "unicode-normalization", "url", "walkdir", ] @@ -1695,6 +1696,15 @@ version = "1.0.22" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9312f7c4f6ff9069b165498234ce8be658059c6728633667c526e27dc2cf1df5" +[[package]] +name = "unicode-normalization" +version = "0.1.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5fd4f6878c9cb28d874b009da9e8d183b5abc80117c40bbd187a1fde336be6e8" +dependencies = [ + "tinyvec", +] + [[package]] name = "untrusted" version = "0.9.0" diff --git a/Cargo.toml b/Cargo.toml index a667eee..a0918d5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -22,3 +22,4 @@ rpassword = "7.3" walkdir = "2.5" time = { version = "0.3", features = ["formatting", "parsing", "macros"] } sha2 = "0.10" +unicode-normalization = "0.1" diff --git a/README.md b/README.md index 9ce2369..cf9d4e5 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ - `init-config` writes a ready-to-edit config template (respecting `--download-root` and `--force` to overwrite). - `auth` subcommand stores Base64-encoded credentials per profile (passwords are never logged). - `list-courses` fetches `/users/me`, paginates enrolled courses, infers semester keys, caches the metadata, and prints a concise table. 
-- `sync` traverses every course folder/file tree, normalizes names, streams downloads to disk, tracks checksums/remote timestamps, and supports `--dry-run`, `--prune`, and `--since ` filters (e.g., `--since ws2526` or `--since 01032024`). +- `sync` traverses every course folder/file tree, normalizes names (Unicode NFKD + transliteration so `Ökologie/ß/œ` becomes `Oekologie/ss/oe`), streams downloads to disk, tracks checksums/remote timestamps, and supports `--dry-run`, `--prune`, and `--since ` filters (e.g., `--since ws2526` or `--since 01032024`). - XDG-compliant config (`~/.config/studip-sync/config.toml`) and state (`~/.local/share/studip-sync/state.toml`) stores everything in TOML. - Extensive logging controls: `--quiet`, `--verbose/-v`, `--debug`, and `--json`. @@ -80,7 +80,7 @@ Global flags: `--quiet`, `--debug`, `--json`, `-v/--verbose` (stackable), `--con 2. Cache missing semesters via `/semesters/{id}` and infer keys like `ws2425` / `ss25`. When `--refresh` is passed, already-known semesters that never recorded a `start` timestamp are re-fetched so `--since` filters have the data they need. 3. For each course: - Walk folders using the JSON:API pagination helpers; fetch nested folders via `/folders/{id}/folders`. - - List file refs via `/folders/{id}/file-refs`, normalize filenames, and ensure unique siblings through a `NameRegistry`. + - List file refs via `/folders/{id}/file-refs`, normalize filenames (including transliteration of umlauts/ligatures like `ä→ae`, `Ö→Oe`, `ß→ss`, `œ→oe`), and ensure unique siblings through a `NameRegistry`. - Skip downloads when the local file exists and matches the stored checksum / size / remote `chdate`. - Stream downloads to `*.part`, hash contents on the fly, then rename atomically to the final path. 4. Maintain a set of remote files so `--prune` can remove local files that no longer exist remotely (and optionally delete now-empty directories). 
When `--since` is provided, files whose remote `chdate` precedes the resolved timestamp (semester start or explicit date) are skipped; newer files continue through the regular checksum/size logic.
diff --git a/src/cli.rs b/src/cli.rs
index b2f857e..0d0d1a9 100644
--- a/src/cli.rs
+++ b/src/cli.rs
@@ -26,6 +26,7 @@ use time::{
 };
 use tokio::{fs, io::AsyncWriteExt};
 use tracing::info;
+use unicode_normalization::{UnicodeNormalization, char::is_combining_mark};
 use walkdir::WalkDir;
 
 const USERNAME_ENV: &str = "STUDIP_SYNC_USERNAME";
@@ -966,17 +967,20 @@ async fn ensure_directory(path: &Path, dry_run: bool) -> Result<()> {
 
 fn normalize_component(input: &str) -> String {
     let mut sanitized = String::new();
-    for ch in input.chars() {
-        if ch.is_ascii_alphanumeric() {
+    let mut last_was_separator = false;
+
+    // `transliterate` has already expanded umlauts/ligatures and dropped combining marks,
+    // so only the alphanumeric-vs-separator split remains here.
+    for ch in transliterate(input).chars() {
+        if ch.is_alphanumeric() {
             sanitized.push(ch);
-        } else if ch.is_whitespace() || matches!(ch, '-' | '_' | '.') {
-            if !sanitized.ends_with('_') {
-                sanitized.push('_');
-            }
+            last_was_separator = false;
         } else {
-            // skip other characters
+            // Whitespace, `-`, `_`, `.` and every other symbol collapse into a single `_`.
+            push_separator(&mut sanitized, &mut last_was_separator);
         }
     }
+
     let trimmed = sanitized.trim_matches('_');
     if trimmed.is_empty() {
         "untitled".to_string()
@@ -987,15 +991,16 @@ fn normalize_component(input: &str) -> String {
 
 fn normalize_file_name(name: &str) -> String {
     let mut sanitized = String::new();
-    for ch in name.chars() {
-        if ch == '/' || ch == '\\' {
-            sanitized.push('_');
-        } else if ch.is_control() {
-            continue;
-        } else {
-            sanitized.push(ch);
+
+    for ch in transliterate(name).chars() {
+        match ch {
+            // Path separators must not survive inside a single file name.
+            '/' | '\\' => sanitized.push('_'),
+            ch if ch.is_control() => continue,
+            _ => sanitized.push(ch),
         }
     }
+
     if sanitized.trim().is_empty() {
         "file".to_string()
     } else {
@@ -1003,6 +1008,59 @@ fn normalize_file_name(name: &str) -> String {
     }
 }
 
+/// Appends a single `_` to `buf` unless the previous push was already a separator.
+fn push_separator(buf: &mut String, last_was_separator: &mut bool) {
+    if !*last_was_separator {
+        buf.push('_');
+        *last_was_separator = true;
+    }
+}
+
+/// Folds `input` towards ASCII: table transliteration first, then NFKD with marks dropped.
+fn transliterate(input: &str) -> String {
+    let mut out = String::with_capacity(input.len());
+
+    // Compose before the lookup so precomposed `ä` and decomposed `a` + U+0308 both reach
+    // `map_special_letters`; iterating NFKD directly would split the umlaut and yield `a`.
+    for composed in input.nfc() {
+        match map_special_letters(composed) {
+            Some(mapped) => out.push_str(mapped),
+            None => out.extend(
+                std::iter::once(composed)
+                    .nfkd()
+                    .filter(|ch| !is_combining_mark(*ch)),
+            ),
+        }
+    }
+
+    out
+}
+
+/// Maps umlauts, ligatures and a few other Latin letters to an ASCII transliteration.
+fn map_special_letters(ch: char) -> Option<&'static str> {
+    match ch {
+        'ß' => Some("ss"),
+        'ẞ' => Some("SS"),
+        'ä' => Some("ae"),
+        'Ä' => Some("Ae"),
+        'ö' => Some("oe"),
+        'Ö' => Some("Oe"),
+        'ü' => Some("ue"),
+        'Ü' => Some("Ue"),
+        'Æ' => Some("AE"),
+        'æ' => Some("ae"),
+        'Œ' => Some("OE"),
+        'œ' => Some("oe"),
+        'Ð' => Some("D"),
+        'ð' => Some("d"),
+        'Þ' => Some("Th"),
+        'þ' => Some("th"),
+        'Ł' => Some("L"),
+        'ł' => Some("l"),
+        _ => None,
+    }
+}
+
 fn compute_file_checksum(path: &Path) -> Result {
     let file = File::open(path)?;
     let mut reader = BufReader::new(file);