Improve filename normalization for special characters
This commit is contained in:
10
Cargo.lock
generated
10
Cargo.lock
generated
@@ -1321,6 +1321,7 @@ dependencies = [
|
|||||||
"toml",
|
"toml",
|
||||||
"tracing",
|
"tracing",
|
||||||
"tracing-subscriber",
|
"tracing-subscriber",
|
||||||
|
"unicode-normalization",
|
||||||
"url",
|
"url",
|
||||||
"walkdir",
|
"walkdir",
|
||||||
]
|
]
|
||||||
@@ -1695,6 +1696,15 @@ version = "1.0.22"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "9312f7c4f6ff9069b165498234ce8be658059c6728633667c526e27dc2cf1df5"
|
checksum = "9312f7c4f6ff9069b165498234ce8be658059c6728633667c526e27dc2cf1df5"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "unicode-normalization"
|
||||||
|
version = "0.1.25"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "5fd4f6878c9cb28d874b009da9e8d183b5abc80117c40bbd187a1fde336be6e8"
|
||||||
|
dependencies = [
|
||||||
|
"tinyvec",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "untrusted"
|
name = "untrusted"
|
||||||
version = "0.9.0"
|
version = "0.9.0"
|
||||||
|
|||||||
@@ -22,3 +22,4 @@ rpassword = "7.3"
|
|||||||
walkdir = "2.5"
|
walkdir = "2.5"
|
||||||
time = { version = "0.3", features = ["formatting", "parsing", "macros"] }
|
time = { version = "0.3", features = ["formatting", "parsing", "macros"] }
|
||||||
sha2 = "0.10"
|
sha2 = "0.10"
|
||||||
|
unicode-normalization = "0.1"
|
||||||
|
|||||||
@@ -7,7 +7,7 @@
|
|||||||
- `init-config` writes a ready-to-edit config template (respecting `--download-root` and `--force` to overwrite).
|
- `init-config` writes a ready-to-edit config template (respecting `--download-root` and `--force` to overwrite).
|
||||||
- `auth` subcommand stores Base64-encoded credentials per profile (passwords are never logged).
|
- `auth` subcommand stores Base64-encoded credentials per profile (passwords are never logged).
|
||||||
- `list-courses` fetches `/users/me`, paginates enrolled courses, infers semester keys, caches the metadata, and prints a concise table.
|
- `list-courses` fetches `/users/me`, paginates enrolled courses, infers semester keys, caches the metadata, and prints a concise table.
|
||||||
- `sync` traverses every course folder/file tree, normalizes names, streams downloads to disk, tracks checksums/remote timestamps, and supports `--dry-run`, `--prune`, and `--since <semester|date>` filters (e.g., `--since ws2526` or `--since 01032024`).
|
- `sync` traverses every course folder/file tree, normalizes names (Unicode NFKD + transliteration so `Ökologie/ß/œ` becomes `Oekologie/ss/oe`), streams downloads to disk, tracks checksums/remote timestamps, and supports `--dry-run`, `--prune`, and `--since <semester|date>` filters (e.g., `--since ws2526` or `--since 01032024`).
|
||||||
- XDG-compliant config (`~/.config/studip-sync/config.toml`) and state (`~/.local/share/studip-sync/state.toml`) store everything in TOML.
|
- XDG-compliant config (`~/.config/studip-sync/config.toml`) and state (`~/.local/share/studip-sync/state.toml`) store everything in TOML.
|
||||||
- Extensive logging controls: `--quiet`, `--verbose/-v`, `--debug`, and `--json`.
|
- Extensive logging controls: `--quiet`, `--verbose/-v`, `--debug`, and `--json`.
|
||||||
|
|
||||||
@@ -80,7 +80,7 @@ Global flags: `--quiet`, `--debug`, `--json`, `-v/--verbose` (stackable), `--con
|
|||||||
2. Cache missing semesters via `/semesters/{id}` and infer keys like `ws2425` / `ss25`. When `--refresh` is passed, already-known semesters that never recorded a `start` timestamp are re-fetched so `--since` filters have the data they need.
|
2. Cache missing semesters via `/semesters/{id}` and infer keys like `ws2425` / `ss25`. When `--refresh` is passed, already-known semesters that never recorded a `start` timestamp are re-fetched so `--since` filters have the data they need.
|
||||||
3. For each course:
|
3. For each course:
|
||||||
- Walk folders using the JSON:API pagination helpers; fetch nested folders via `/folders/{id}/folders`.
|
- Walk folders using the JSON:API pagination helpers; fetch nested folders via `/folders/{id}/folders`.
|
||||||
- List file refs via `/folders/{id}/file-refs`, normalize filenames, and ensure unique siblings through a `NameRegistry`.
|
- List file refs via `/folders/{id}/file-refs`, normalize filenames (including transliteration of umlauts/ligatures like `ä→ae`, `Ö→Oe`, `ß→ss`, `œ→oe`), and ensure unique siblings through a `NameRegistry`.
|
||||||
- Skip downloads when the local file exists and matches the stored checksum / size / remote `chdate`.
|
- Skip downloads when the local file exists and matches the stored checksum / size / remote `chdate`.
|
||||||
- Stream downloads to `*.part`, hash contents on the fly, then rename atomically to the final path.
|
- Stream downloads to `*.part`, hash contents on the fly, then rename atomically to the final path.
|
||||||
4. Maintain a set of remote files so `--prune` can remove local files that no longer exist remotely (and optionally delete now-empty directories). When `--since` is provided, files whose remote `chdate` precedes the resolved timestamp (semester start or explicit date) are skipped; newer files continue through the regular checksum/size logic.
|
4. Maintain a set of remote files so `--prune` can remove local files that no longer exist remotely (and optionally delete now-empty directories). When `--since` is provided, files whose remote `chdate` precedes the resolved timestamp (semester start or explicit date) are skipped; newer files continue through the regular checksum/size logic.
|
||||||
|
|||||||
74
src/cli.rs
74
src/cli.rs
@@ -26,6 +26,7 @@ use time::{
|
|||||||
};
|
};
|
||||||
use tokio::{fs, io::AsyncWriteExt};
|
use tokio::{fs, io::AsyncWriteExt};
|
||||||
use tracing::info;
|
use tracing::info;
|
||||||
|
use unicode_normalization::{UnicodeNormalization, char::is_combining_mark};
|
||||||
use walkdir::WalkDir;
|
use walkdir::WalkDir;
|
||||||
|
|
||||||
const USERNAME_ENV: &str = "STUDIP_SYNC_USERNAME";
|
const USERNAME_ENV: &str = "STUDIP_SYNC_USERNAME";
|
||||||
@@ -966,17 +967,29 @@ async fn ensure_directory(path: &Path, dry_run: bool) -> Result<()> {
|
|||||||
|
|
||||||
fn normalize_component(input: &str) -> String {
|
fn normalize_component(input: &str) -> String {
|
||||||
let mut sanitized = String::new();
|
let mut sanitized = String::new();
|
||||||
for ch in input.chars() {
|
let mut last_was_separator = false;
|
||||||
if ch.is_ascii_alphanumeric() {
|
|
||||||
|
for ch in input.nfkd() {
|
||||||
|
if is_combining_mark(ch) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if let Some(mapped) = map_special_letters(ch) {
|
||||||
|
sanitized.push_str(mapped);
|
||||||
|
last_was_separator = false;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if ch.is_alphanumeric() {
|
||||||
sanitized.push(ch);
|
sanitized.push(ch);
|
||||||
|
last_was_separator = false;
|
||||||
} else if ch.is_whitespace() || matches!(ch, '-' | '_' | '.') {
|
} else if ch.is_whitespace() || matches!(ch, '-' | '_' | '.') {
|
||||||
if !sanitized.ends_with('_') {
|
push_separator(&mut sanitized, &mut last_was_separator);
|
||||||
sanitized.push('_');
|
|
||||||
}
|
|
||||||
} else {
|
} else {
|
||||||
// skip other characters
|
push_separator(&mut sanitized, &mut last_was_separator);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
let trimmed = sanitized.trim_matches('_');
|
let trimmed = sanitized.trim_matches('_');
|
||||||
if trimmed.is_empty() {
|
if trimmed.is_empty() {
|
||||||
"untitled".to_string()
|
"untitled".to_string()
|
||||||
@@ -987,15 +1000,25 @@ fn normalize_component(input: &str) -> String {
|
|||||||
|
|
||||||
fn normalize_file_name(name: &str) -> String {
|
fn normalize_file_name(name: &str) -> String {
|
||||||
let mut sanitized = String::new();
|
let mut sanitized = String::new();
|
||||||
for ch in name.chars() {
|
|
||||||
if ch == '/' || ch == '\\' {
|
for ch in name.nfkd() {
|
||||||
sanitized.push('_');
|
if is_combining_mark(ch) {
|
||||||
} else if ch.is_control() {
|
|
||||||
continue;
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
match ch {
|
||||||
|
'/' | '\\' => sanitized.push('_'),
|
||||||
|
ch if ch.is_control() => continue,
|
||||||
|
_ => {
|
||||||
|
if let Some(mapped) = map_special_letters(ch) {
|
||||||
|
sanitized.push_str(mapped);
|
||||||
} else {
|
} else {
|
||||||
sanitized.push(ch);
|
sanitized.push(ch);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if sanitized.trim().is_empty() {
|
if sanitized.trim().is_empty() {
|
||||||
"file".to_string()
|
"file".to_string()
|
||||||
} else {
|
} else {
|
||||||
@@ -1003,6 +1026,37 @@ fn normalize_file_name(name: &str) -> String {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Appends a single `_` to `buf` unless the previous emission was already a separator.
///
/// `last_was_separator` is the caller-owned flag that tracks this: when it is `false`,
/// one underscore is pushed and the flag is set to `true`; when it is already `true`,
/// the call does nothing. This function never clears the flag — the callers shown in
/// this file reset it to `false` themselves after pushing a non-separator character.
fn push_separator(buf: &mut String, last_was_separator: &mut bool) {
|
||||||
|
if !*last_was_separator {
|
||||||
|
buf.push('_');
|
||||||
|
*last_was_separator = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns an ASCII transliteration for a small, fixed set of Latin letters
/// (sharp s, German umlauts, the Æ/Œ ligatures, eth, thorn, and L with stroke),
/// or `None` for every other character.
///
/// The replacement is a `&'static str` because several letters expand to two
/// characters (for example `ß` -> `"ss"`, `Þ` -> `"Th"`).
///
/// NOTE(review): the callers visible in this file pass characters taken from
/// `nfkd()` and skip combining marks before calling this function. NFKD splits
/// precomposed `ä`/`ö`/`ü` (and their upper-case forms) into a base letter plus
/// U+0308, so the umlaut arms below presumably never match on that path and `Ö`
/// would be emitted as `O` rather than `Oe`. Confirm, and if so apply this mapping
/// before decomposition (or handle base letter + U+0308 in the callers).
fn map_special_letters(ch: char) -> Option<&'static str> {
|
||||||
|
match ch {
|
||||||
|
'ß' => Some("ss"),
|
||||||
|
'ẞ' => Some("SS"),
|
||||||
|
'ä' => Some("ae"),
|
||||||
|
'Ä' => Some("Ae"),
|
||||||
|
'ö' => Some("oe"),
|
||||||
|
'Ö' => Some("Oe"),
|
||||||
|
'ü' => Some("ue"),
|
||||||
|
'Ü' => Some("Ue"),
|
||||||
|
'Æ' => Some("AE"),
|
||||||
|
'æ' => Some("ae"),
|
||||||
|
'Œ' => Some("OE"),
|
||||||
|
'œ' => Some("oe"),
|
||||||
|
'Ð' => Some("D"),
|
||||||
|
'ð' => Some("d"),
|
||||||
|
'Þ' => Some("Th"),
|
||||||
|
'þ' => Some("th"),
|
||||||
|
'Ł' => Some("L"),
|
||||||
|
'ł' => Some("l"),
|
||||||
|
|
||||||
|
_ => None,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
fn compute_file_checksum(path: &Path) -> Result<String> {
|
fn compute_file_checksum(path: &Path) -> Result<String> {
|
||||||
let file = File::open(path)?;
|
let file = File::open(path)?;
|
||||||
let mut reader = BufReader::new(file);
|
let mut reader = BufReader::new(file);
|
||||||
|
|||||||
Reference in New Issue
Block a user