Improve filename normalization for special characters

This commit is contained in:
2025-11-15 03:44:31 +01:00
parent d8de882cdd
commit f724216fb8
4 changed files with 79 additions and 14 deletions

10
Cargo.lock generated
View File

@@ -1321,6 +1321,7 @@ dependencies = [
"toml", "toml",
"tracing", "tracing",
"tracing-subscriber", "tracing-subscriber",
"unicode-normalization",
"url", "url",
"walkdir", "walkdir",
] ]
@@ -1695,6 +1696,15 @@ version = "1.0.22"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9312f7c4f6ff9069b165498234ce8be658059c6728633667c526e27dc2cf1df5" checksum = "9312f7c4f6ff9069b165498234ce8be658059c6728633667c526e27dc2cf1df5"
[[package]]
name = "unicode-normalization"
version = "0.1.25"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5fd4f6878c9cb28d874b009da9e8d183b5abc80117c40bbd187a1fde336be6e8"
dependencies = [
"tinyvec",
]
[[package]] [[package]]
name = "untrusted" name = "untrusted"
version = "0.9.0" version = "0.9.0"

View File

@@ -22,3 +22,4 @@ rpassword = "7.3"
walkdir = "2.5" walkdir = "2.5"
time = { version = "0.3", features = ["formatting", "parsing", "macros"] } time = { version = "0.3", features = ["formatting", "parsing", "macros"] }
sha2 = "0.10" sha2 = "0.10"
unicode-normalization = "0.1"

View File

@@ -7,7 +7,7 @@
- `init-config` writes a ready-to-edit config template (respecting `--download-root` and `--force` to overwrite). - `init-config` writes a ready-to-edit config template (respecting `--download-root` and `--force` to overwrite).
- `auth` subcommand stores Base64-encoded credentials per profile (passwords are never logged). - `auth` subcommand stores Base64-encoded credentials per profile (passwords are never logged).
- `list-courses` fetches `/users/me`, paginates enrolled courses, infers semester keys, caches the metadata, and prints a concise table. - `list-courses` fetches `/users/me`, paginates enrolled courses, infers semester keys, caches the metadata, and prints a concise table.
- `sync` traverses every course folder/file tree, normalizes names, streams downloads to disk, tracks checksums/remote timestamps, and supports `--dry-run`, `--prune`, and `--since <semester|date>` filters (e.g., `--since ws2526` or `--since 01032024`). - `sync` traverses every course folder/file tree, normalizes names (Unicode NFKD + transliteration so `Ökologie/ß/œ` becomes `Oekologie/ss/oe`), streams downloads to disk, tracks checksums/remote timestamps, and supports `--dry-run`, `--prune`, and `--since <semester|date>` filters (e.g., `--since ws2526` or `--since 01032024`).
- XDG-compliant config (`~/.config/studip-sync/config.toml`) and state (`~/.local/share/studip-sync/state.toml`) stores everything in TOML. - XDG-compliant config (`~/.config/studip-sync/config.toml`) and state (`~/.local/share/studip-sync/state.toml`) stores everything in TOML.
- Extensive logging controls: `--quiet`, `--verbose/-v`, `--debug`, and `--json`. - Extensive logging controls: `--quiet`, `--verbose/-v`, `--debug`, and `--json`.
@@ -80,7 +80,7 @@ Global flags: `--quiet`, `--debug`, `--json`, `-v/--verbose` (stackable), `--con
2. Cache missing semesters via `/semesters/{id}` and infer keys like `ws2425` / `ss25`. When `--refresh` is passed, already-known semesters that never recorded a `start` timestamp are re-fetched so `--since` filters have the data they need. 2. Cache missing semesters via `/semesters/{id}` and infer keys like `ws2425` / `ss25`. When `--refresh` is passed, already-known semesters that never recorded a `start` timestamp are re-fetched so `--since` filters have the data they need.
3. For each course: 3. For each course:
- Walk folders using the JSON:API pagination helpers; fetch nested folders via `/folders/{id}/folders`. - Walk folders using the JSON:API pagination helpers; fetch nested folders via `/folders/{id}/folders`.
- List file refs via `/folders/{id}/file-refs`, normalize filenames, and ensure unique siblings through a `NameRegistry`. - List file refs via `/folders/{id}/file-refs`, normalize filenames (including transliteration of umlauts/ligatures like `ä→ae`, `Ö→Oe`, `ß→ss`, `œ→oe`), and ensure unique siblings through a `NameRegistry`.
- Skip downloads when the local file exists and matches the stored checksum / size / remote `chdate`. - Skip downloads when the local file exists and matches the stored checksum / size / remote `chdate`.
- Stream downloads to `*.part`, hash contents on the fly, then rename atomically to the final path. - Stream downloads to `*.part`, hash contents on the fly, then rename atomically to the final path.
4. Maintain a set of remote files so `--prune` can remove local files that no longer exist remotely (and optionally delete now-empty directories). When `--since` is provided, files whose remote `chdate` precedes the resolved timestamp (semester start or explicit date) are skipped; newer files continue through the regular checksum/size logic. 4. Maintain a set of remote files so `--prune` can remove local files that no longer exist remotely (and optionally delete now-empty directories). When `--since` is provided, files whose remote `chdate` precedes the resolved timestamp (semester start or explicit date) are skipped; newer files continue through the regular checksum/size logic.

View File

@@ -26,6 +26,7 @@ use time::{
}; };
use tokio::{fs, io::AsyncWriteExt}; use tokio::{fs, io::AsyncWriteExt};
use tracing::info; use tracing::info;
use unicode_normalization::{UnicodeNormalization, char::is_combining_mark};
use walkdir::WalkDir; use walkdir::WalkDir;
const USERNAME_ENV: &str = "STUDIP_SYNC_USERNAME"; const USERNAME_ENV: &str = "STUDIP_SYNC_USERNAME";
@@ -966,17 +967,29 @@ async fn ensure_directory(path: &Path, dry_run: bool) -> Result<()> {
fn normalize_component(input: &str) -> String { fn normalize_component(input: &str) -> String {
let mut sanitized = String::new(); let mut sanitized = String::new();
for ch in input.chars() { let mut last_was_separator = false;
if ch.is_ascii_alphanumeric() {
for ch in input.nfkd() {
if is_combining_mark(ch) {
continue;
}
if let Some(mapped) = map_special_letters(ch) {
sanitized.push_str(mapped);
last_was_separator = false;
continue;
}
if ch.is_alphanumeric() {
sanitized.push(ch); sanitized.push(ch);
last_was_separator = false;
} else if ch.is_whitespace() || matches!(ch, '-' | '_' | '.') { } else if ch.is_whitespace() || matches!(ch, '-' | '_' | '.') {
if !sanitized.ends_with('_') { push_separator(&mut sanitized, &mut last_was_separator);
sanitized.push('_');
}
} else { } else {
// skip other characters push_separator(&mut sanitized, &mut last_was_separator);
} }
} }
let trimmed = sanitized.trim_matches('_'); let trimmed = sanitized.trim_matches('_');
if trimmed.is_empty() { if trimmed.is_empty() {
"untitled".to_string() "untitled".to_string()
@@ -987,15 +1000,25 @@ fn normalize_component(input: &str) -> String {
fn normalize_file_name(name: &str) -> String { fn normalize_file_name(name: &str) -> String {
let mut sanitized = String::new(); let mut sanitized = String::new();
for ch in name.chars() {
if ch == '/' || ch == '\\' { for ch in name.nfkd() {
sanitized.push('_'); if is_combining_mark(ch) {
} else if ch.is_control() {
continue; continue;
}
match ch {
'/' | '\\' => sanitized.push('_'),
ch if ch.is_control() => continue,
_ => {
if let Some(mapped) = map_special_letters(ch) {
sanitized.push_str(mapped);
} else { } else {
sanitized.push(ch); sanitized.push(ch);
} }
} }
}
}
if sanitized.trim().is_empty() { if sanitized.trim().is_empty() {
"file".to_string() "file".to_string()
} else { } else {
@@ -1003,6 +1026,37 @@ fn normalize_file_name(name: &str) -> String {
} }
} }
/// Append a single `'_'` separator to `buf`, collapsing runs: when the
/// previously emitted character was already a separator, this is a no-op,
/// so consecutive whitespace/punctuation never produces `"__"`.
fn push_separator(buf: &mut String, last_was_separator: &mut bool) {
    if *last_was_separator {
        return;
    }
    buf.push('_');
    *last_was_separator = true;
}
/// Transliterate single letters that have a conventional multi-character
/// ASCII spelling (German umlauts/eszett, Latin ligatures, Nordic and
/// Slavic letters) into that spelling.
///
/// Returns `Some(replacement)` for a known special letter and `None` for
/// every other character, letting the caller fall through to its generic
/// handling. Case is preserved in the usual title-case convention
/// (`Ä` → `Ae`, `Æ` → `AE`).
///
/// NOTE(review): callers feed this from an `nfkd()` stream; precomposed
/// umlauts such as `ä` decompose to base letter + combining mark before
/// reaching this function, so the umlaut arms may never match there —
/// verify that `ä → ae` actually happens end-to-end.
fn map_special_letters(ch: char) -> Option<&'static str> {
    match ch {
        'ß' => Some("ss"),
        'ẞ' => Some("SS"),
        'ä' => Some("ae"),
        'Ä' => Some("Ae"),
        'ö' => Some("oe"),
        'Ö' => Some("Oe"),
        'ü' => Some("ue"),
        'Ü' => Some("Ue"),
        'Æ' => Some("AE"),
        'æ' => Some("ae"),
        'Œ' => Some("OE"),
        'œ' => Some("oe"),
        // Danish/Norwegian ø has no NFKD decomposition; without these arms
        // it would be dropped entirely by the sanitizers.
        'Ø' => Some("Oe"),
        'ø' => Some("oe"),
        'Ð' => Some("D"),
        'ð' => Some("d"),
        'Þ' => Some("Th"),
        'þ' => Some("th"),
        'Ł' => Some("L"),
        'ł' => Some("l"),
        _ => None,
    }
}
fn compute_file_checksum(path: &Path) -> Result<String> { fn compute_file_checksum(path: &Path) -> Result<String> {
let file = File::open(path)?; let file = File::open(path)?;
let mut reader = BufReader::new(file); let mut reader = BufReader::new(file);