Improve filename normalization for special characters

This commit is contained in:
2025-11-15 03:44:31 +01:00
parent d8de882cdd
commit f724216fb8
4 changed files with 79 additions and 14 deletions

10
Cargo.lock generated
View File

@@ -1321,6 +1321,7 @@ dependencies = [
"toml",
"tracing",
"tracing-subscriber",
"unicode-normalization",
"url",
"walkdir",
]
@@ -1695,6 +1696,15 @@ version = "1.0.22"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9312f7c4f6ff9069b165498234ce8be658059c6728633667c526e27dc2cf1df5"
[[package]]
name = "unicode-normalization"
version = "0.1.25"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5fd4f6878c9cb28d874b009da9e8d183b5abc80117c40bbd187a1fde336be6e8"
dependencies = [
"tinyvec",
]
[[package]]
name = "untrusted"
version = "0.9.0"

View File

@@ -22,3 +22,4 @@ rpassword = "7.3"
walkdir = "2.5"
time = { version = "0.3", features = ["formatting", "parsing", "macros"] }
sha2 = "0.10"
unicode-normalization = "0.1"

View File

@@ -7,7 +7,7 @@
- `init-config` writes a ready-to-edit config template (respecting `--download-root` and `--force` to overwrite).
- `auth` subcommand stores Base64-encoded credentials per profile (passwords are never logged).
- `list-courses` fetches `/users/me`, paginates enrolled courses, infers semester keys, caches the metadata, and prints a concise table.
- `sync` traverses every course folder/file tree, normalizes names, streams downloads to disk, tracks checksums/remote timestamps, and supports `--dry-run`, `--prune`, and `--since <semester|date>` filters (e.g., `--since ws2526` or `--since 01032024`).
- `sync` traverses every course folder/file tree, normalizes names (Unicode NFKD + transliteration so `Ökologie/ß/œ` becomes `Oekologie/ss/oe`), streams downloads to disk, tracks checksums/remote timestamps, and supports `--dry-run`, `--prune`, and `--since <semester|date>` filters (e.g., `--since ws2526` or `--since 01032024`).
- XDG-compliant config (`~/.config/studip-sync/config.toml`) and state (`~/.local/share/studip-sync/state.toml`) files store everything in TOML.
- Extensive logging controls: `--quiet`, `--verbose/-v`, `--debug`, and `--json`.
@@ -80,7 +80,7 @@ Global flags: `--quiet`, `--debug`, `--json`, `-v/--verbose` (stackable), `--con
2. Cache missing semesters via `/semesters/{id}` and infer keys like `ws2425` / `ss25`. When `--refresh` is passed, already-known semesters that never recorded a `start` timestamp are re-fetched so `--since` filters have the data they need.
3. For each course:
- Walk folders using the JSON:API pagination helpers; fetch nested folders via `/folders/{id}/folders`.
- List file refs via `/folders/{id}/file-refs`, normalize filenames, and ensure unique siblings through a `NameRegistry`.
- List file refs via `/folders/{id}/file-refs`, normalize filenames (including transliteration of umlauts/ligatures like `ä→ae`, `Ö→Oe`, `ß→ss`, `œ→oe`), and ensure unique siblings through a `NameRegistry`.
- Skip downloads when the local file exists and matches the stored checksum / size / remote `chdate`.
- Stream downloads to `*.part`, hash contents on the fly, then rename atomically to the final path.
4. Maintain a set of remote files so `--prune` can remove local files that no longer exist remotely (and optionally delete now-empty directories). When `--since` is provided, files whose remote `chdate` precedes the resolved timestamp (semester start or explicit date) are skipped; newer files continue through the regular checksum/size logic.

View File

@@ -26,6 +26,7 @@ use time::{
};
use tokio::{fs, io::AsyncWriteExt};
use tracing::info;
use unicode_normalization::{UnicodeNormalization, char::is_combining_mark};
use walkdir::WalkDir;
const USERNAME_ENV: &str = "STUDIP_SYNC_USERNAME";
@@ -966,17 +967,29 @@ async fn ensure_directory(path: &Path, dry_run: bool) -> Result<()> {
fn normalize_component(input: &str) -> String {
let mut sanitized = String::new();
for ch in input.chars() {
if ch.is_ascii_alphanumeric() {
let mut last_was_separator = false;
for ch in input.nfkd() {
if is_combining_mark(ch) {
continue;
}
if let Some(mapped) = map_special_letters(ch) {
sanitized.push_str(mapped);
last_was_separator = false;
continue;
}
if ch.is_alphanumeric() {
sanitized.push(ch);
last_was_separator = false;
} else if ch.is_whitespace() || matches!(ch, '-' | '_' | '.') {
if !sanitized.ends_with('_') {
sanitized.push('_');
}
push_separator(&mut sanitized, &mut last_was_separator);
} else {
// skip other characters
push_separator(&mut sanitized, &mut last_was_separator);
}
}
let trimmed = sanitized.trim_matches('_');
if trimmed.is_empty() {
"untitled".to_string()
@@ -987,15 +1000,25 @@ fn normalize_component(input: &str) -> String {
fn normalize_file_name(name: &str) -> String {
let mut sanitized = String::new();
for ch in name.chars() {
if ch == '/' || ch == '\\' {
sanitized.push('_');
} else if ch.is_control() {
for ch in name.nfkd() {
if is_combining_mark(ch) {
continue;
} else {
sanitized.push(ch);
}
match ch {
'/' | '\\' => sanitized.push('_'),
ch if ch.is_control() => continue,
_ => {
if let Some(mapped) = map_special_letters(ch) {
sanitized.push_str(mapped);
} else {
sanitized.push(ch);
}
}
}
}
if sanitized.trim().is_empty() {
"file".to_string()
} else {
@@ -1003,6 +1026,37 @@ fn normalize_file_name(name: &str) -> String {
}
}
/// Appends a single `_` separator to `buf` unless one was just emitted.
///
/// `last_was_separator` is the caller-owned flag tracking whether the most
/// recent thing pushed was a separator. When it is already `true` the call is
/// a no-op; otherwise `_` is appended and the flag is set to `true`, so runs
/// of separator-like input collapse into one underscore.
fn push_separator(buf: &mut String, last_was_separator: &mut bool) {
    // Guard: a separator is already the most recent output, nothing to add.
    if *last_was_separator {
        return;
    }
    *last_was_separator = true;
    buf.push('_');
}
/// Transliterates a single special Latin letter into its ASCII replacement.
///
/// Returns `Some(replacement)` for the letters in the table below (German
/// sharp s and umlauts, the `Æ`/`Œ` ligatures, eth, thorn and the stroked
/// `Ł`), and `None` for every other character so the caller can decide how
/// to handle it.
///
/// NOTE(review): the visible callers pass characters taken from an `.nfkd()`
/// iterator after skipping combining marks. Precomposed umlauts (`ä`, `ö`,
/// `ü` and their capitals) canonically decompose into a base letter plus a
/// combining diaeresis, so those entries are presumably never reached from
/// that path — confirm and, if so, apply this mapping before decomposition.
fn map_special_letters(ch: char) -> Option<&'static str> {
    // (letter, ASCII replacement) pairs; each letter appears exactly once.
    const TRANSLITERATIONS: [(char, &str); 18] = [
        ('ß', "ss"),
        ('ẞ', "SS"),
        ('ä', "ae"),
        ('Ä', "Ae"),
        ('ö', "oe"),
        ('Ö', "Oe"),
        ('ü', "ue"),
        ('Ü', "Ue"),
        ('Æ', "AE"),
        ('æ', "ae"),
        ('Œ', "OE"),
        ('œ', "oe"),
        ('Ð', "D"),
        ('ð', "d"),
        ('Þ', "Th"),
        ('þ', "th"),
        ('Ł', "L"),
        ('ł', "l"),
    ];

    TRANSLITERATIONS
        .iter()
        .find(|(letter, _)| *letter == ch)
        .map(|&(_, replacement)| replacement)
}
fn compute_file_checksum(path: &Path) -> Result<String> {
let file = File::open(path)?;
let mut reader = BufReader::new(file);