From b5ea4b901cc74c482a6547ad43aabd938c9752f6 Mon Sep 17 00:00:00 2001 From: Matthias Puchstein Date: Sun, 16 Nov 2025 00:01:46 +0100 Subject: [PATCH] Handle external redirects and add sync options --- Cargo.lock | 62 +++++++++++++++ Cargo.toml | 2 +- README.md | 11 ++- src/cli.rs | 175 ++++++++++++++++++++++++++++++++++++------- src/studip_client.rs | 105 +++++++++++++++++++++----- 5 files changed, 304 insertions(+), 51 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 635cf69..adae4f5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -258,6 +258,35 @@ version = "0.4.30" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3a9b614a5787ef0c8802a55766480563cb3a93b435898c422ed2a359cf811582" +[[package]] +name = "cookie" +version = "0.18.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ddef33a339a91ea89fb53151bd0a4689cfce27055c291dfa69945475d22c747" +dependencies = [ + "percent-encoding", + "time", + "version_check", +] + +[[package]] +name = "cookie_store" +version = "0.21.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2eac901828f88a5241ee0600950ab981148a18f2f756900ffba1b125ca6a3ef9" +dependencies = [ + "cookie", + "document-features", + "idna", + "log", + "publicsuffix", + "serde", + "serde_derive", + "serde_json", + "time", + "url", +] + [[package]] name = "cpufeatures" version = "0.2.17" @@ -337,6 +366,15 @@ dependencies = [ "syn", ] +[[package]] +name = "document-features" +version = "0.2.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d4b8a88685455ed29a21542a33abd9cb6510b6b129abadabdcef0f4c55bc8f61" +dependencies = [ + "litrs", +] + [[package]] name = "equivalent" version = "1.0.2" @@ -773,6 +811,12 @@ version = "0.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6373607a59f0be73a39b6fe456b8192fcc3585f602af20751600e974dd455e77" +[[package]] +name = "litrs" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "11d3d7f243d5c5a8b9bb5d6dd2b1602c0cb0b9db1621bafc7ed66e35ff9fe092" + [[package]] name = "log" version = "0.4.28" @@ -905,6 +949,22 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "psl-types" +version = "2.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "33cb294fe86a74cbcf50d4445b37da762029549ebeea341421c7c70370f86cac" + +[[package]] +name = "publicsuffix" +version = "2.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6f42ea446cab60335f76979ec15e12619a2165b5ae2c12166bef27d283a9fadf" +dependencies = [ + "idna", + "psl-types", +] + [[package]] name = "quinn" version = "0.11.9" @@ -1041,6 +1101,8 @@ dependencies = [ "async-compression", "base64", "bytes", + "cookie", + "cookie_store", "futures-core", "futures-util", "http", diff --git a/Cargo.toml b/Cargo.toml index a0918d5..79fad94 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -8,7 +8,7 @@ anyhow = "1.0" base64 = "0.22" clap = { version = "4.5", features = ["derive"] } directories = "5.0" -reqwest = { version = "0.12", default-features = false, features = ["json", "stream", "gzip", "brotli", "deflate", "rustls-tls"] } +reqwest = { version = "0.12", default-features = false, features = ["json", "stream", "gzip", "brotli", "deflate", "rustls-tls", "cookies"] } serde = { version = "1.0", features = ["derive"] } serde_json = "1.0" thiserror = "1.0" diff --git a/README.md b/README.md index cf9d4e5..8f4271b 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ 
 - `init-config` writes a ready-to-edit config template (respecting `--download-root` and `--force` to overwrite).
 - `auth` subcommand stores Base64-encoded credentials per profile (passwords are never logged).
 - `list-courses` fetches `/users/me`, paginates enrolled courses, infers semester keys, caches the metadata, and prints a concise table.
-- `sync` traverses every course folder/file tree, normalizes names (Unicode NFKD + transliteration so `Ökologie/ß/œ` becomes `Oekologie/ss/oe`), streams downloads to disk, tracks checksums/remote timestamps, and supports `--dry-run`, `--prune`, and `--since` filters (e.g., `--since ws2526` or `--since 01032024`).
+- `sync` traverses every course folder/file tree, normalizes names (Unicode NFKD + transliteration so `Ökologie/ß/œ` becomes `Oekologie/ss/oe`), streams downloads to disk, tracks checksums/remote timestamps, and supports `--dry-run`, `--prune`, `--prune-empty-dirs`, `--write-external-links`, and `--since` filters (e.g., `--since ws2526` or `--since 01032024`).
 - XDG-compliant config (`~/.config/studip-sync/config.toml`) and state (`~/.local/share/studip-sync/state.toml`) store everything in TOML.
 - Extensive logging controls: `--quiet`, `--verbose/-v`, `--debug`, and `--json`.
@@ -41,7 +41,12 @@
 cargo run -- sync --dry-run
 
 # Run the real sync (omit --dry-run); add --prune to delete stray files
+# or --prune-empty-dirs to only remove empty directories
 cargo run -- sync --prune
+cargo run -- sync --prune-empty-dirs
+# Use --write-external-links to drop .url shortcuts whenever Stud.IP
+# points to files hosted on third-party sites you can't fetch directly
+cargo run -- sync --write-external-links
 ```
 
 Use `--profile`, `--config-dir`, or `--data-dir` when working with multiple identities or non-standard paths.
@@ -70,7 +75,7 @@ max_concurrent_downloads = 3  # placeholder for future concurrency control
 | `init-config` | Write a default config template (fails if config exists unless forced). | `--force`, `--download-root` |
 | `auth` | Collect username/password, encode them, and save them to the active profile. | `--non-interactive`, `--username`, `--password` |
 | `list-courses` | List cached or freshly fetched courses with semester keys and IDs. | `--refresh` |
-| `sync` | Download files for every enrolled course into the local tree. | `--dry-run`, `--prune`, `--since` |
+| `sync` | Download files for every enrolled course into the local tree. | `--dry-run`, `--prune`, `--prune-empty-dirs`, `--write-external-links`, `--since` |
 
 Global flags: `--quiet`, `--debug`, `--json`, `-v/--verbose` (stackable), `--config-dir`, `--data-dir` (state + default downloads), `--profile`.
@@ -83,7 +88,7 @@
 - List file refs via `/folders/{id}/file-refs`, normalize filenames (including transliteration of umlauts/ligatures like `ä→ae`, `Ö→Oe`, `ß→ss`, `œ→oe`), and ensure unique siblings through a `NameRegistry`.
 - Skip downloads when the local file exists and matches the stored checksum / size / remote `chdate`.
 - Stream downloads to `*.part`, hash contents on the fly, then rename atomically to the final path.
-4. Maintain a set of remote files so `--prune` can remove local files that no longer exist remotely (and optionally delete now-empty directories). When `--since` is provided, files whose remote `chdate` precedes the resolved timestamp (semester start or explicit date) are skipped; newer files continue through the regular checksum/size logic.
+4. 
Maintain a set of remote files so `--prune` can remove local files that no longer exist remotely (and clean up any directories left empty). When `--prune-empty-dirs` is used instead, only empty directories are removed without touching files. When `--write-external-links` is enabled, any file that redirects to an unsupported host gets a `filename.ext.url` shortcut so you can open it manually later. When `--since` is provided, files whose remote `chdate` precedes the resolved timestamp (semester start or explicit date) are skipped; newer files continue through the regular checksum/size logic. 5. `--dry-run` prints planned work but never writes to disk. ## Development Notes diff --git a/src/cli.rs b/src/cli.rs index 0d0d1a9..d2ed4e1 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -5,11 +5,14 @@ use crate::{ paths::{AppPaths, PathOverrides}, semesters, state::{CourseState, ProfileState, SemesterState, StateFile}, - studip_client::{Course, FileRef, Folder, SemesterData, SemesterResponse, StudipClient}, + studip_client::{ + Course, FileRef, Folder, SemesterData, SemesterResponse, StudipClient, StudipHttpError, + }, }; use anyhow::{Context, anyhow, bail}; use base64::{Engine, engine::general_purpose::STANDARD as BASE64}; use clap::{ArgAction, Parser, Subcommand, ValueHint}; +use reqwest::StatusCode; use rpassword::prompt_password; use sha2::{Digest, Sha256}; use std::{ @@ -25,8 +28,9 @@ use time::{ format_description::well_known::Rfc3339, macros::format_description, }; use tokio::{fs, io::AsyncWriteExt}; -use tracing::info; +use tracing::{info, warn}; use unicode_normalization::{UnicodeNormalization, char::is_combining_mark}; +use url::Url; use walkdir::WalkDir; const USERNAME_ENV: &str = "STUDIP_SYNC_USERNAME"; @@ -94,6 +98,12 @@ pub struct SyncArgs { pub dry_run: bool, #[arg(long = "prune", action = ArgAction::SetTrue)] pub prune: bool, + /// Remove empty directories under the download root once syncing is finished. + #[arg(long = "prune-empty-dirs", action = ArgAction::SetTrue)] + pub prune_empty_dirs: bool, + /// Write .url shortcuts for files that redirect to unsupported external hosts. 
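+    /// For example, a download of `script.pdf` that bounces to an
+    /// unsupported host leaves a `script.pdf.url` shortcut at the path
+    /// where the file itself would have been written.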
+    #[arg(long = "write-external-links", action = ArgAction::SetTrue)]
+    pub write_external_links: bool,
     #[arg(long = "since")]
     pub since: Option<String>,
 }
@@ -487,23 +497,44 @@ impl SyncArgs {
 
         if self.prune {
             let prune = prune_local(&download_root, &remote_files, self.dry_run)?;
-            stats.pruned_files = prune.removed_files;
-            stats.pruned_dirs = prune.removed_dirs;
+            stats.pruned_files += prune.removed_files;
+            stats.pruned_dirs += prune.removed_dirs;
+        }
+
+        if self.prune_empty_dirs && !self.prune {
+            let removed = prune_empty_directories(&download_root, self.dry_run)?;
+            stats.pruned_dirs += removed;
         }
 
         info!(
             profile = ctx.profile_name(),
             dry_run = self.dry_run,
             prune = self.prune,
+            prune_empty_dirs = self.prune_empty_dirs,
+            write_external_links = self.write_external_links,
             downloaded = stats.downloaded,
             skipped = stats.skipped,
             planned = stats.planned,
             skipped_since = stats.skipped_since,
+            skipped_external = stats.skipped_external,
             pruned_files = stats.pruned_files,
             pruned_dirs = stats.pruned_dirs,
             since = self.since.as_deref().unwrap_or(""),
             "sync completed"
         );
+
+        if !stats.skipped_external_details.is_empty() {
+            println!("\nSkipped external downloads:");
+            for detail in &stats.skipped_external_details {
+                println!(
+                    "  {} -> {} {}",
+                    detail.path.display(),
+                    detail.status,
+                    detail.url
+                );
+            }
+        }
+
         Ok(())
     }
 }
@@ -674,6 +705,8 @@ struct SyncStats {
     skipped_since: usize,
     pruned_files: usize,
     pruned_dirs: usize,
+    skipped_external: usize,
+    skipped_external_details: Vec<ExternalSkip>,
 }
 
 #[derive(Default)]
@@ -842,10 +875,45 @@ async fn sync_file_ref(
     }
 
     if needs_download {
-        let checksum = download_file_to(client, &file_ref, &local_path).await?;
-        update_file_state(ctx, &file_ref, &local_path, Some(checksum))?;
-        println!("Downloaded {}", relative_path.display());
-        stats.downloaded += 1;
+        match download_file_to(client, &file_ref, &local_path).await {
+            Ok(checksum) => {
+                if args.write_external_links {
+                    remove_external_link(&local_path).await?;
+                }
+                update_file_state(ctx, &file_ref, &local_path, Some(checksum))?;
+                println!("Downloaded {}", relative_path.display());
+                stats.downloaded += 1;
+            }
+            Err(err) => {
+                if let Some(http_err) = err.downcast_ref::<StudipHttpError>()
+                    && http_err.external
+                {
+                    warn!(
+                        target: "studip_sync",
+                        url = %http_err.url,
+                        status = %http_err.status,
+                        "External download failed; skipping"
+                    );
+                    println!(
+                        "Skipped {} (external download failed: {} {})",
+                        relative_path.display(),
+                        http_err.status,
+                        http_err.url
+                    );
+                    stats.skipped_external += 1;
+                    if args.write_external_links {
+                        write_external_link(&local_path, &http_err.url, args.dry_run).await?;
+                    }
+                    stats.skipped_external_details.push(ExternalSkip {
+                        path: relative_path.clone(),
+                        status: http_err.status,
+                        url: http_err.url.clone(),
+                    });
+                    return Ok(());
+                }
+                return Err(err);
+            }
+        }
     } else {
         stats.skipped += 1;
     }
@@ -983,8 +1051,6 @@ fn normalize_component(input: &str) -> String {
         if ch.is_alphanumeric() {
             sanitized.push(ch);
             last_was_separator = false;
-        } else if ch.is_whitespace() || matches!(ch, '-' | '_' | '.') {
-            push_separator(&mut sanitized, &mut last_was_separator);
         } else {
             push_separator(&mut sanitized, &mut last_was_separator);
         }
@@ -1102,6 +1168,18 @@ fn prune_local(
         }
     }
 
+    stats.removed_dirs += prune_empty_directories(root, dry_run)?;
+
+    Ok(stats)
+}
+
+fn prune_empty_directories(root: &Path, dry_run: bool) -> Result<usize> {
+    if !root.exists() {
+        return Ok(0);
+    }
+
+    let mut removed = 0;
+
     for entry in WalkDir::new(root)
         .contents_first(true)
         .into_iter()
         .filter_map(...)
     {
         if 
entry.path() == root {
             continue;
         }
-        if entry
+
+        let is_empty = entry
             .path()
             .read_dir()
-            .map(|mut i| i.next().is_none())
-            .unwrap_or(false)
-        {
-            if dry_run {
-                println!(
-                    "Would remove empty directory {}",
-                    entry
-                        .path()
-                        .strip_prefix(root)
-                        .unwrap_or(entry.path())
-                        .display()
-                );
-            } else {
-                std::fs::remove_dir(entry.path()).ok();
-            }
-            stats.removed_dirs += 1;
+            .map(|mut iter| iter.next().is_none())
+            .unwrap_or(false);
+        if !is_empty {
+            continue;
         }
+
+        let rel = entry.path().strip_prefix(root).unwrap_or(entry.path());
+        if dry_run {
+            println!("Would remove empty directory {}", rel.display());
+        } else {
+            let _ = std::fs::remove_dir(entry.path());
+        }
+        removed += 1;
     }
 
-    Ok(stats)
+    Ok(removed)
+}
+
+async fn write_external_link(destination: &Path, url: &Url, dry_run: bool) -> Result<()> {
+    let link_path = external_link_path(destination);
+    if dry_run {
+        println!(
+            "Would write external link {} -> {}",
+            link_path.display(),
+            url
+        );
+        return Ok(());
+    }
+
+    if let Some(parent) = link_path.parent() {
+        fs::create_dir_all(parent).await?;
+    }
+
+    let mut file = fs::File::create(&link_path).await?;
+    let content = format!("[InternetShortcut]\nURL={}\n", url);
+    file.write_all(content.as_bytes()).await?;
+    file.flush().await?;
+    Ok(())
+}
+
+async fn remove_external_link(destination: &Path) -> Result<()> {
+    let link_path = external_link_path(destination);
+    if tokio::fs::try_exists(&link_path).await.unwrap_or(false) {
+        let _ = fs::remove_file(link_path).await;
+    }
+    Ok(())
+}
+
+fn external_link_path(destination: &Path) -> PathBuf {
+    let mut name = destination
+        .file_name()
+        .map(|n| n.to_os_string())
+        .unwrap_or_default();
+    name.push(".url");
+    destination.with_file_name(name)
 }
 
 #[derive(Default)]
@@ -1298,3 +1412,8 @@ max_concurrent_downloads = 3
 "#
     )
 }
+struct ExternalSkip {
+    path: PathBuf,
+    status: StatusCode,
+    url: Url,
+}
diff --git a/src/studip_client.rs b/src/studip_client.rs
index d9163dd..744dda5 100644
--- a/src/studip_client.rs
+++ b/src/studip_client.rs
@@ -1,10 +1,12 @@
 use crate::{Result, config::ConfigProfile};
 use anyhow::{Context, anyhow, bail};
 use reqwest::{
-    Client, Response,
-    header::{AUTHORIZATION, HeaderValue},
+    Client, Response, StatusCode,
+    header::{AUTHORIZATION, HeaderValue, LOCATION},
+    redirect::Policy,
 };
 use serde::{Deserialize, de::DeserializeOwned};
+use thiserror::Error;
 use url::Url;
 
 #[derive(Clone)]
@@ -29,6 +31,8 @@ impl StudipClient {
 
         let http = Client::builder()
             .user_agent("studip-sync/0.1")
+            .cookie_store(true)
+            .redirect(Policy::none())
             .build()
             .context("Failed to build HTTP client")?;
@@ -125,27 +129,90 @@
         Ok(items)
     }
 
     fn download_endpoint(&self, path: &str) -> Result<Url> {
-        let normalized = path.trim_start_matches('/');
-        self.base.join(normalized).map_err(Into::into)
+        if let Ok(url) = Url::parse(path) {
+            return Ok(url);
+        }
+
+        self.base.join(path).map_err(Into::into)
     }
 
     async fn send_request(&self, url: Url) -> Result<Response> {
-        let response = self
-            .http
-            .get(url.clone())
-            .header(AUTHORIZATION, self.auth_header.clone())
-            .send()
-            .await
-            .with_context(|| format!("GET {}", url))?;
-
-        if !response.status().is_success() {
-            let status = response.status();
-            let body = response.text().await.unwrap_or_default();
-            bail!("Stud.IP request failed ({status}) - {body}");
-        }
-
-        Ok(response)
+        self.follow_redirects(url, 10).await
     }
+
+    async fn follow_redirects(&self, url: Url, max_redirects: usize) -> Result<Response> {
+        let mut current_url = url;
+        let mut redirects_left = max_redirects;
+        let mut 
include_auth = true; + + loop { + let mut request = self.http.get(current_url.clone()); + if include_auth { + request = request.header(AUTHORIZATION, self.auth_header.clone()); + } + let response = request + .send() + .await + .with_context(|| format!("GET {}", current_url))?; + + if response.status().is_redirection() { + if redirects_left == 0 { + bail!( + "Exceeded redirect limit while requesting {}", + response.url() + ); + } + + let location = response.headers().get(LOCATION).ok_or_else(|| { + anyhow!("Redirect from {} missing Location header", response.url()) + })?; + let location = location + .to_str() + .context("Invalid redirect Location header")?; + let next_url = if let Ok(absolute) = Url::parse(location) { + absolute + } else { + response + .url() + .join(location) + .with_context(|| format!("Invalid redirect location {location}"))? + }; + + if include_auth && next_url.origin() != self.base.origin() { + include_auth = false; + } + + current_url = next_url; + redirects_left -= 1; + continue; + } + + if !response.status().is_success() { + let status = response.status(); + let final_url = response.url().clone(); + let body = response.text().await.unwrap_or_default(); + let external = final_url.origin() != self.base.origin(); + return Err(StudipHttpError { + status, + url: final_url, + body, + external, + } + .into()); + } + + return Ok(response); + } + } +} + +#[derive(Debug, Error)] +#[error("Stud.IP request failed ({status}) at {url} - {body}")] +pub struct StudipHttpError { + pub status: StatusCode, + pub url: Url, + pub body: String, + pub external: bool, } fn build_root_and_api_urls(profile: &ConfigProfile) -> Result<(Url, Url)> {
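
The heart of the client change is the manual redirect walk above: building the client with `redirect::Policy::none()` makes every hop surface as a 3xx response, each `Location` (absolute or relative) is resolved against the current URL, and the `Authorization` header is dropped for good the moment a hop leaves the Stud.IP origin, so Basic credentials are never replayed to third-party file hosts. Below is a standalone sketch of the same loop for poking at the behaviour in isolation (assumptions: reqwest 0.12, `url` 2.x, and `anyhow`; the function name `get_with_scoped_auth` and its parameters are illustrative, not part of this patch):

```rust
use anyhow::{Context, Result, anyhow, bail};
use reqwest::{
    Client, Response,
    header::{AUTHORIZATION, LOCATION},
};
use url::Url;

/// Follow up to `max_hops` redirects by hand. The Authorization header is
/// sent only while the request stays on `base`'s origin; once a hop leaves
/// that origin it is dropped and never re-attached, even if a later hop
/// redirects back.
async fn get_with_scoped_auth(
    http: &Client, // must be built with .redirect(reqwest::redirect::Policy::none())
    base: &Url,
    auth_header: &str,
    mut url: Url,
    max_hops: usize,
) -> Result<Response> {
    let mut send_auth = true;
    for _ in 0..=max_hops {
        let mut request = http.get(url.clone());
        if send_auth {
            request = request.header(AUTHORIZATION, auth_header.to_owned());
        }
        let response = request.send().await.with_context(|| format!("GET {url}"))?;
        if !response.status().is_redirection() {
            return Ok(response);
        }
        let location = response
            .headers()
            .get(LOCATION)
            .ok_or_else(|| anyhow!("redirect from {} without Location", response.url()))?
            .to_str()
            .context("non-UTF-8 Location header")?;
        // A Location value may be absolute or relative to the redirecting URL.
        let next = Url::parse(location).or_else(|_| response.url().join(location))?;
        if send_auth && next.origin() != base.origin() {
            send_auth = false; // leaving Stud.IP: stop sending credentials
        }
        url = next;
    }
    bail!("exceeded {max_hops} redirects for {url}")
}
```

Any final non-redirect response is returned as-is; in the patch proper, `follow_redirects` additionally wraps a non-success final response whose origin differs from the API base in `StudipHttpError` with `external: true`, which is exactly what `sync_file_ref` keys on to write a `.url` shortcut instead of failing the whole sync.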