[feat] add input validation, enhanced error messages, and new integration tests

This commit is contained in:
2025-08-12 09:49:11 +02:00
parent 8ebdf876ed
commit 75cfb6f160
5 changed files with 181 additions and 24 deletions

40
Cargo.lock generated
View File

@@ -93,9 +93,9 @@ dependencies = [
[[package]] [[package]]
name = "anyhow" name = "anyhow"
version = "1.0.98" version = "1.0.99"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e16d2d3311acee920a9eb8d33b8cbc1787ce4a264e85f964c2404b969bdcd487" checksum = "b0674a1ddeecb70197781e945de4b3b8ffb61fa939a5597bcf48503737663100"
[[package]] [[package]]
name = "atomic-waker" name = "atomic-waker"
@@ -190,9 +190,9 @@ checksum = "d71b6127be86fdcfddb610f7182ac57211d4b18a3e9c82eb2d17662f2227ad6a"
[[package]] [[package]]
name = "cc" name = "cc"
version = "1.2.31" version = "1.2.32"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c3a42d84bb6b69d3a8b3eaacf0d88f179e1929695e1ad012b6cf64d9caaa5fd2" checksum = "2352e5597e9c544d5e6d9c95190d5d27738ade584fa8db0a16e130e5c2b5296e"
dependencies = [ dependencies = [
"shlex", "shlex",
] ]
@@ -239,9 +239,9 @@ dependencies = [
[[package]] [[package]]
name = "clap" name = "clap"
version = "4.5.43" version = "4.5.44"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "50fd97c9dc2399518aa331917ac6f274280ec5eb34e555dd291899745c48ec6f" checksum = "1c1f056bae57e3e54c3375c41ff79619ddd13460a17d7438712bd0d83fda4ff8"
dependencies = [ dependencies = [
"clap_builder", "clap_builder",
"clap_derive", "clap_derive",
@@ -249,9 +249,9 @@ dependencies = [
[[package]] [[package]]
name = "clap_builder" name = "clap_builder"
version = "4.5.43" version = "4.5.44"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c35b5830294e1fa0462034af85cc95225a4cb07092c088c55bda3147cfcd8f65" checksum = "b3e7f4214277f3c7aa526a59dd3fbe306a370daee1f8b7b8c987069cd8e888a8"
dependencies = [ dependencies = [
"anstream", "anstream",
"anstyle", "anstyle",
@@ -261,9 +261,9 @@ dependencies = [
[[package]] [[package]]
name = "clap_complete" name = "clap_complete"
version = "4.5.56" version = "4.5.57"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "67e4efcbb5da11a92e8a609233aa1e8a7d91e38de0be865f016d14700d45a7fd" checksum = "4d9501bd3f5f09f7bbee01da9a511073ed30a80cd7a509f1214bb74eadea71ad"
dependencies = [ dependencies = [
"clap", "clap",
] ]
@@ -564,9 +564,9 @@ checksum = "07e28edb80900c19c28f1072f2e8aeca7fa06b23cd4169cefe1af5aa3260783f"
[[package]] [[package]]
name = "glob" name = "glob"
version = "0.3.2" version = "0.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a8d1add55171497b4705a648c6b583acafb01d58050a51727785f0b2c8e0a2b2" checksum = "0cc23270f6e1808e30a928bdc84dea0b9b4136a8bc82338574f23baf47bbd280"
[[package]] [[package]]
name = "h2" name = "h2"
@@ -940,9 +940,9 @@ dependencies = [
[[package]] [[package]]
name = "libc" name = "libc"
version = "0.2.174" version = "0.2.175"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1171693293099992e19cddea4e8b849964e9846f4acee11b3948bcc337be8776" checksum = "6a82ae493e598baaea5209805c49bbf2ea7de956d50d7da0da1164f9c6d28543"
[[package]] [[package]]
name = "libloading" name = "libloading"
@@ -1190,9 +1190,9 @@ dependencies = [
[[package]] [[package]]
name = "proc-macro2" name = "proc-macro2"
version = "1.0.95" version = "1.0.97"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "02b3e5e68a3a1a02aad3ec490a98007cbc13c37cbe84a3cd7b8e406d76e7f778" checksum = "d61789d7719defeb74ea5fe81f2fdfdbd28a803847077cecce2ff14e1472f6f1"
dependencies = [ dependencies = [
"unicode-ident", "unicode-ident",
] ]
@@ -1363,9 +1363,9 @@ dependencies = [
[[package]] [[package]]
name = "rustversion" name = "rustversion"
version = "1.0.21" version = "1.0.22"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8a0d197bd2c9dc6e53b84da9556a69ba4cdfab8619eb41a8bd1cc2027a0f6b1d" checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d"
[[package]] [[package]]
name = "ryu" name = "ryu"
@@ -1477,9 +1477,9 @@ checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64"
[[package]] [[package]]
name = "slab" name = "slab"
version = "0.4.10" version = "0.4.11"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "04dc19736151f35336d325007ac991178d504a119863a2fcb3758cdb5e52c50d" checksum = "7a2ae44ef20feb57a68b23d846850f861394c2e02dc425a50098ae8c90267589"
[[package]] [[package]]
name = "smallvec" name = "smallvec"

View File

@@ -694,10 +694,11 @@ pub fn decode_audio_to_pcm_f32_ffmpeg(audio_path: &Path) -> Result<Vec<f32>> {
} }
}; };
if !output.status.success() { if !output.status.success() {
let stderr = String::from_utf8_lossy(&output.stderr);
return Err(anyhow!( return Err(anyhow!(
"ffmpeg failed for {}: {}", "Failed to decode audio from {} using ffmpeg. This may indicate the file is not a valid or supported audio/video file, is corrupted, or cannot be opened. ffmpeg stderr: {}",
audio_path.display(), audio_path.display(),
String::from_utf8_lossy(&output.stderr) stderr.trim()
)); ));
} }
let bytes = output.stdout; let bytes = output.stdout;

View File

@@ -186,6 +186,22 @@ fn is_audio_file(path: &Path) -> bool {
false false
} }
fn validate_input_path(path: &Path) -> anyhow::Result<()> {
use anyhow::{anyhow, Context};
let display = path.display();
if !path.exists() {
return Err(anyhow!("Input not found: {}", display));
}
let md = std::fs::metadata(path).with_context(|| format!("Failed to stat input: {}", display))?;
if md.is_dir() {
return Err(anyhow!("Input is a directory (expected a file): {}", display));
}
// Attempt to open to catch permission errors early
std::fs::File::open(path)
.with_context(|| format!("Failed to open input file: {}", display))
.map(|_| ())
}
struct LastModelCleanup { struct LastModelCleanup {
path: PathBuf, path: PathBuf,
} }
@@ -315,6 +331,18 @@ fn run() -> Result<()> {
return Err(anyhow!("No input files provided")); return Err(anyhow!("No input files provided"));
} }
// Preflight: validate each input path and type
for inp in &inputs {
let p = Path::new(inp);
validate_input_path(p)?;
if !(is_audio_file(p) || is_json_file(p)) {
return Err(anyhow!(
"Unsupported input type (expected .json transcript or common audio/video): {}",
p.display()
));
}
}
// Language must be provided via CLI when transcribing audio (no detection from JSON/env) // Language must be provided via CLI when transcribing audio (no detection from JSON/env)
let lang_hint: Option<String> = if let Some(ref l) = args.language { let lang_hint: Option<String> = if let Some(ref l) = args.language {
normalize_lang_code(l).or_else(|| Some(l.trim().to_lowercase())) normalize_lang_code(l).or_else(|| Some(l.trim().to_lowercase()))

View File

@@ -620,10 +620,13 @@ fn download_one_model(client: &Client, models_dir: &Path, entry: &ModelEntry) ->
url url
); );
let mut resp = client let mut resp = client
.get(url) .get(url.clone())
.send() .send()
.and_then(|r| r.error_for_status()) .and_then(|r| r.error_for_status())
.context("Failed to download model")?; .with_context(|| format!(
"Failed to download model {} from {}. If your terminal has display/TTY issues, try running with --no-progress.",
entry.name, url
))?;
let tmp_path = models_dir.join(format!("ggml-{}.bin.part", entry.name)); let tmp_path = models_dir.join(format!("ggml-{}.bin.part", entry.name));
if tmp_path.exists() { if tmp_path.exists() {

View File

@@ -0,0 +1,125 @@
// SPDX-License-Identifier: MIT
// Validation and error-handling integration tests
use std::fs;
use std::io::Read;
use std::path::PathBuf;
use std::process::Command;
fn bin() -> &'static str {
env!("CARGO_BIN_EXE_polyscribe")
}
fn manifest_path(relative: &str) -> PathBuf {
let mut p = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
p.push(relative);
p
}
#[test]
fn error_on_missing_input_file() {
let exe = bin();
let missing = manifest_path("input/definitely_missing_123.json");
let out = Command::new(exe)
.arg(missing.as_os_str())
.output()
.expect("failed to run polyscribe with missing input");
assert!(!out.status.success(), "command should fail on missing input");
let stderr = String::from_utf8_lossy(&out.stderr);
assert!(
stderr.contains("Input not found") || stderr.contains("No input files provided"),
"stderr should mention missing input; got: {}",
stderr
);
}
#[test]
fn error_on_directory_as_input() {
let exe = bin();
// Use the repo's input directory which exists and is a directory
let input_dir = manifest_path("input");
let out = Command::new(exe)
.arg(input_dir.as_os_str())
.output()
.expect("failed to run polyscribe with directory input");
assert!(!out.status.success(), "command should fail on dir input");
let stderr = String::from_utf8_lossy(&out.stderr);
assert!(
stderr.contains("directory") || stderr.contains("Unsupported input type"),
"stderr should mention directory/unsupported; got: {}",
stderr
);
}
#[test]
fn error_on_no_ffmpeg_present() {
let exe = bin();
// Create a tiny temp .wav file (may be empty; ffmpeg will be attempted but PATH will be empty)
let tmp_dir = manifest_path("target/tmp/itest_no_ffmpeg");
let _ = fs::remove_dir_all(&tmp_dir);
fs::create_dir_all(&tmp_dir).unwrap();
let wav = tmp_dir.join("dummy.wav");
fs::write(&wav, b"\0\0\0\0").unwrap();
let out = Command::new(exe)
.arg(wav.as_os_str())
.env("PATH", "") // simulate ffmpeg missing
.env_remove("WHISPER_MODEL")
.env("POLYSCRIBE_MODELS_BASE_COPY_DIR", manifest_path("models").as_os_str())
.arg("--language").arg("en")
.output()
.expect("failed to run polyscribe with empty PATH");
assert!(
!out.status.success(),
"command should fail when ffmpeg is not found"
);
let stderr = String::from_utf8_lossy(&out.stderr);
assert!(
stderr.contains("ffmpeg not found") || stderr.contains("Failed to execute ffmpeg"),
"stderr should mention ffmpeg not found; got: {}",
stderr
);
}
#[cfg(unix)]
#[test]
fn error_on_readonly_output_dir() {
use std::os::unix::fs::PermissionsExt;
let exe = bin();
let input1 = manifest_path("input/1-s0wlz.json");
// Prepare a read-only directory
let tmp_dir = manifest_path("target/tmp/itest_readonly_out");
let _ = fs::remove_dir_all(&tmp_dir);
fs::create_dir_all(&tmp_dir).unwrap();
let mut perms = fs::metadata(&tmp_dir).unwrap().permissions();
perms.set_mode(0o555); // read & execute, no write
fs::set_permissions(&tmp_dir, perms).unwrap();
let out = Command::new(exe)
.arg(input1.as_os_str())
.arg("-o")
.arg(tmp_dir.as_os_str())
.output()
.expect("failed to run polyscribe with read-only output dir");
// Restore perms for cleanup
let mut perms2 = fs::metadata(&tmp_dir).unwrap().permissions();
perms2.set_mode(0o755);
let _ = fs::set_permissions(&tmp_dir, perms2);
assert!(
!out.status.success(),
"command should fail when outputs cannot be created"
);
let stderr = String::from_utf8_lossy(&out.stderr);
assert!(
stderr.contains("Failed to create output") || stderr.contains("permission"),
"stderr should mention failure to create outputs; got: {}",
stderr
);
// Cleanup
let _ = fs::remove_dir_all(&tmp_dir);
}