Improve filename normalization for special characters
This commit is contained in:
10
Cargo.lock
generated
10
Cargo.lock
generated
@@ -1321,6 +1321,7 @@ dependencies = [
|
||||
"toml",
|
||||
"tracing",
|
||||
"tracing-subscriber",
|
||||
"unicode-normalization",
|
||||
"url",
|
||||
"walkdir",
|
||||
]
|
||||
@@ -1695,6 +1696,15 @@ version = "1.0.22"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9312f7c4f6ff9069b165498234ce8be658059c6728633667c526e27dc2cf1df5"
|
||||
|
||||
[[package]]
|
||||
name = "unicode-normalization"
|
||||
version = "0.1.25"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5fd4f6878c9cb28d874b009da9e8d183b5abc80117c40bbd187a1fde336be6e8"
|
||||
dependencies = [
|
||||
"tinyvec",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "untrusted"
|
||||
version = "0.9.0"
|
||||
|
||||
@@ -22,3 +22,4 @@ rpassword = "7.3"
|
||||
walkdir = "2.5"
|
||||
time = { version = "0.3", features = ["formatting", "parsing", "macros"] }
|
||||
sha2 = "0.10"
|
||||
unicode-normalization = "0.1"
|
||||
|
||||
@@ -7,7 +7,7 @@
|
||||
- `init-config` writes a ready-to-edit config template (respecting `--download-root` and `--force` to overwrite).
|
||||
- `auth` subcommand stores Base64-encoded credentials per profile (passwords are never logged).
|
||||
- `list-courses` fetches `/users/me`, paginates enrolled courses, infers semester keys, caches the metadata, and prints a concise table.
|
||||
- `sync` traverses every course folder/file tree, normalizes names, streams downloads to disk, tracks checksums/remote timestamps, and supports `--dry-run`, `--prune`, and `--since <semester|date>` filters (e.g., `--since ws2526` or `--since 01032024`).
|
||||
- `sync` traverses every course folder/file tree, normalizes names (Unicode NFKD + transliteration so `Ökologie/ß/œ` becomes `Oekologie/ss/oe`), streams downloads to disk, tracks checksums/remote timestamps, and supports `--dry-run`, `--prune`, and `--since <semester|date>` filters (e.g., `--since ws2526` or `--since 01032024`).
|
||||
- XDG-compliant config (`~/.config/studip-sync/config.toml`) and state (`~/.local/share/studip-sync/state.toml`) files store everything in TOML.
|
||||
- Extensive logging controls: `--quiet`, `--verbose/-v`, `--debug`, and `--json`.
|
||||
|
||||
@@ -80,7 +80,7 @@ Global flags: `--quiet`, `--debug`, `--json`, `-v/--verbose` (stackable), `--con
|
||||
2. Cache missing semesters via `/semesters/{id}` and infer keys like `ws2425` / `ss25`. When `--refresh` is passed, already-known semesters that never recorded a `start` timestamp are re-fetched so `--since` filters have the data they need.
|
||||
3. For each course:
|
||||
- Walk folders using the JSON:API pagination helpers; fetch nested folders via `/folders/{id}/folders`.
|
||||
- List file refs via `/folders/{id}/file-refs`, normalize filenames, and ensure unique siblings through a `NameRegistry`.
|
||||
- List file refs via `/folders/{id}/file-refs`, normalize filenames (including transliteration of umlauts/ligatures like `ä→ae`, `Ö→Oe`, `ß→ss`, `œ→oe`), and ensure unique siblings through a `NameRegistry`.
|
||||
- Skip downloads when the local file exists and matches the stored checksum / size / remote `chdate`.
|
||||
- Stream downloads to `*.part`, hash contents on the fly, then rename atomically to the final path.
|
||||
4. Maintain a set of remote files so `--prune` can remove local files that no longer exist remotely (and optionally delete now-empty directories). When `--since` is provided, files whose remote `chdate` precedes the resolved timestamp (semester start or explicit date) are skipped; newer files continue through the regular checksum/size logic.
|
||||
|
||||
74
src/cli.rs
74
src/cli.rs
@@ -26,6 +26,7 @@ use time::{
|
||||
};
|
||||
use tokio::{fs, io::AsyncWriteExt};
|
||||
use tracing::info;
|
||||
use unicode_normalization::{UnicodeNormalization, char::is_combining_mark};
|
||||
use walkdir::WalkDir;
|
||||
|
||||
const USERNAME_ENV: &str = "STUDIP_SYNC_USERNAME";
|
||||
@@ -966,17 +967,29 @@ async fn ensure_directory(path: &Path, dry_run: bool) -> Result<()> {
|
||||
|
||||
fn normalize_component(input: &str) -> String {
|
||||
let mut sanitized = String::new();
|
||||
for ch in input.chars() {
|
||||
if ch.is_ascii_alphanumeric() {
|
||||
let mut last_was_separator = false;
|
||||
|
||||
for ch in input.nfkd() {
|
||||
if is_combining_mark(ch) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if let Some(mapped) = map_special_letters(ch) {
|
||||
sanitized.push_str(mapped);
|
||||
last_was_separator = false;
|
||||
continue;
|
||||
}
|
||||
|
||||
if ch.is_alphanumeric() {
|
||||
sanitized.push(ch);
|
||||
last_was_separator = false;
|
||||
} else if ch.is_whitespace() || matches!(ch, '-' | '_' | '.') {
|
||||
if !sanitized.ends_with('_') {
|
||||
sanitized.push('_');
|
||||
}
|
||||
push_separator(&mut sanitized, &mut last_was_separator);
|
||||
} else {
|
||||
// skip other characters
|
||||
push_separator(&mut sanitized, &mut last_was_separator);
|
||||
}
|
||||
}
|
||||
|
||||
let trimmed = sanitized.trim_matches('_');
|
||||
if trimmed.is_empty() {
|
||||
"untitled".to_string()
|
||||
@@ -987,15 +1000,25 @@ fn normalize_component(input: &str) -> String {
|
||||
|
||||
fn normalize_file_name(name: &str) -> String {
|
||||
let mut sanitized = String::new();
|
||||
for ch in name.chars() {
|
||||
if ch == '/' || ch == '\\' {
|
||||
sanitized.push('_');
|
||||
} else if ch.is_control() {
|
||||
|
||||
for ch in name.nfkd() {
|
||||
if is_combining_mark(ch) {
|
||||
continue;
|
||||
}
|
||||
|
||||
match ch {
|
||||
'/' | '\\' => sanitized.push('_'),
|
||||
ch if ch.is_control() => continue,
|
||||
_ => {
|
||||
if let Some(mapped) = map_special_letters(ch) {
|
||||
sanitized.push_str(mapped);
|
||||
} else {
|
||||
sanitized.push(ch);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if sanitized.trim().is_empty() {
|
||||
"file".to_string()
|
||||
} else {
|
||||
@@ -1003,6 +1026,37 @@ fn normalize_file_name(name: &str) -> String {
|
||||
}
|
||||
}
|
||||
|
||||
/// Appends a single `_` separator to `buf` unless one was just written.
///
/// `last_was_separator` is the caller-owned flag that tracks whether the most
/// recent output was a separator. When it is already `true` the call is a
/// no-op; otherwise `_` is pushed and the flag is set to `true`, so a run of
/// consecutive separator-like input characters yields exactly one underscore.
fn push_separator(buf: &mut String, last_was_separator: &mut bool) {
    if *last_was_separator {
        // A separator was already emitted for this run; nothing to add.
        return;
    }

    *last_was_separator = true;
    buf.push('_');
}
|
||||
|
||||
/// Returns the ASCII transliteration for a handful of Latin letters, or
/// `None` when `ch` is not one of them.
///
/// Covered letters: German sharp s (`ß`, `ẞ`), German umlauts (`ä ö ü` and
/// their capitals), the ligatures `Æ æ Œ œ`, eth (`Ð ð`), thorn (`Þ þ`) and
/// the stroked L (`Ł ł`). Capital umlauts and thorn map to title case
/// (`"Ae"`, `"Th"`), while capital ligatures and `ẞ` map to all caps
/// (`"AE"`, `"SS"`).
///
/// NOTE(review): the callers visible in this file iterate over `nfkd()`
/// output and drop combining marks before calling this function. Under that
/// decomposition precomposed umlauts arrive as a bare base letter followed by
/// a combining diaeresis, so the six umlaut entries below would never match
/// from those call sites — confirm whether the lookup should run before
/// decomposition instead.
fn map_special_letters(ch: char) -> Option<&'static str> {
    // Ordered (source letter, replacement) pairs; each letter appears once,
    // so the first hit is the only hit.
    const TRANSLITERATIONS: &[(char, &str)] = &[
        ('ß', "ss"),
        ('ẞ', "SS"),
        ('ä', "ae"),
        ('Ä', "Ae"),
        ('ö', "oe"),
        ('Ö', "Oe"),
        ('ü', "ue"),
        ('Ü', "Ue"),
        ('Æ', "AE"),
        ('æ', "ae"),
        ('Œ', "OE"),
        ('œ', "oe"),
        ('Ð', "D"),
        ('ð', "d"),
        ('Þ', "Th"),
        ('þ', "th"),
        ('Ł', "L"),
        ('ł', "l"),
    ];

    TRANSLITERATIONS
        .iter()
        .find(|&&(letter, _)| letter == ch)
        .map(|&(_, replacement)| replacement)
}
|
||||
|
||||
fn compute_file_checksum(path: &Path) -> Result<String> {
|
||||
let file = File::open(path)?;
|
||||
let mut reader = BufReader::new(file);
|
||||
|
||||
Reference in New Issue
Block a user