diff --git a/Cargo.lock b/Cargo.lock index cafd5f5..0b41454 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,6 +2,15 @@ # It is not intended for manual editing. version = 4 +[[package]] +name = "aho-corasick" +version = "1.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" +dependencies = [ + "memchr", +] + [[package]] name = "android-tzdata" version = "0.1.1" @@ -79,6 +88,32 @@ version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" +[[package]] +name = "bindgen" +version = "0.71.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f58bf3d7db68cfbac37cfc485a8d711e87e064c3d0fe0435b92f7a407f9d6b3" +dependencies = [ + "bitflags", + "cexpr", + "clang-sys", + "itertools", + "log", + "prettyplease", + "proc-macro2", + "quote", + "regex", + "rustc-hash", + "shlex", + "syn", +] + +[[package]] +name = "bitflags" +version = "2.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b8e56985ec62d17e9c1001dc89c88ecd7dc08e47eba5ec7c29c7b5eeecde967" + [[package]] name = "bumpalo" version = "3.19.0" @@ -94,6 +129,15 @@ dependencies = [ "shlex", ] +[[package]] +name = "cexpr" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6fac387a98bb7c37292057cffc56d62ecb629900026402633ae9160df93a8766" +dependencies = [ + "nom", +] + [[package]] name = "cfg-if" version = "1.0.1" @@ -114,6 +158,17 @@ dependencies = [ "windows-link", ] +[[package]] +name = "clang-sys" +version = "1.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b023947811758c97c59bf9d1c188fd619ad4718dcaa767947df1cadb14f39f4" +dependencies = [ + "glob", + "libc", + "libloading", +] + [[package]] name = "clap" version = "4.5.43" @@ -154,6 +209,15 @@ version = "0.7.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b94f61472cee1439c0b966b47e3aca9ae07e45d070759512cd390ea2bebc6675" +[[package]] +name = "cmake" +version = "0.1.54" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e7caa3f9de89ddbe2c607f4101924c5abec803763ae9534e4f4d7d8f84aa81f0" +dependencies = [ + "cc", +] + [[package]] name = "colorchoice" version = "1.0.4" @@ -176,14 +240,33 @@ dependencies = [ "serde", "serde_json", "toml", + "whisper-rs", ] +[[package]] +name = "either" +version = "1.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" + [[package]] name = "equivalent" version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" +[[package]] +name = "fs_extra" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" + +[[package]] +name = "glob" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8d1add55171497b4705a648c6b583acafb01d58050a51727785f0b2c8e0a2b2" + [[package]] name = "hashbrown" version = "0.15.5" @@ -236,6 +319,15 @@ version = "1.70.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" +[[package]] +name = "itertools" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "413ee7dfc52ee1a4949ceeb7dbc8a33f2d6c088194d9f922fb8318faf1f01186" +dependencies = [ + "either", +] + [[package]] name = "itoa" version = "1.0.15" @@ -258,6 +350,16 @@ version = "0.2.174" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1171693293099992e19cddea4e8b849964e9846f4acee11b3948bcc337be8776" +[[package]] +name = "libloading" +version = "0.8.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "07033963ba89ebaf1584d767badaa2e8fcec21aedea6b8c0346d487d49c28667" +dependencies = [ + "cfg-if", + "windows-targets", +] + [[package]] name = "log" version = "0.4.27" @@ -270,6 +372,22 @@ version = "2.7.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "32a282da65faaf38286cf3be983213fcf1d2e2a58700e808f83f4ea9a4804bc0" +[[package]] +name = "minimal-lexical" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" + +[[package]] +name = "nom" +version = "7.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a" +dependencies = [ + "memchr", + "minimal-lexical", +] + [[package]] name = "num-traits" version = "0.2.19" @@ -291,6 +409,16 @@ version = "1.70.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a4895175b425cb1f87721b59f0f286c2092bd4af812243672510e1ac53e2e0ad" +[[package]] +name = "prettyplease" +version = "0.2.36" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff24dfcda44452b9816fff4cd4227e1bb73ff5a2f1bc1105aa92fb8565ce44d2" +dependencies = [ + "proc-macro2", + "syn", +] + [[package]] name = "proc-macro2" version = "1.0.95" @@ -309,6 +437,41 @@ dependencies = [ "proc-macro2", ] +[[package]] +name = "regex" +version = "1.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.4.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" + +[[package]] +name = "rustc-hash" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "357703d41365b4b27c590e3ed91eabb1b663f07c4c084095e60cbed4362dff0d" + [[package]] name = "rustversion" version = "1.0.21" @@ -496,6 +659,25 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "whisper-rs" +version = "0.14.3" +source = "git+https://github.com/tazz4843/whisper-rs#135b60b85a15714862806b6ea9f76abec38156f1" +dependencies = [ + "whisper-rs-sys", +] + +[[package]] +name = "whisper-rs-sys" +version = "0.13.0" +source = "git+https://github.com/tazz4843/whisper-rs#135b60b85a15714862806b6ea9f76abec38156f1" +dependencies = [ + "bindgen", + "cfg-if", + "cmake", + "fs_extra", +] + [[package]] name = "windows-core" version = "0.61.2" diff --git a/Cargo.toml b/Cargo.toml index b7750f3..6c5d1ed 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -10,3 +10,8 @@ serde = { version = "1.0.219", features = ["derive"] } serde_json = "1.0.142" toml = "0.8" chrono = { version = "0.4", features = ["clock"] } +whisper-rs = { git = "https://github.com/tazz4843/whisper-rs", optional = true } + +[features] +default = ["native-whisper"] +native-whisper = ["whisper-rs"] diff --git a/src/main.rs b/src/main.rs index 70e0b34..4d013fc 100644 --- a/src/main.rs +++ b/src/main.rs @@ -8,16 +8,23 @@ use clap::Parser; use serde::{Deserialize, Serialize}; use chrono::Local; +#[cfg(feature = "native-whisper")] +use whisper_rs::{FullParams, SamplingStrategy, WhisperContext, WhisperContextParameters}; + #[derive(Parser, Debug)] -#[command(name = "merge_transcripts", version, about = "Merge multiple JSON transcripts into one")] +#[command(name = "merge_transcripts", version, about = "Merge multiple JSON transcripts into one or transcribe audio using native whisper")] struct Args { - /// Input JSON files to merge + /// Input .json transcript files or audio files to merge/transcribe #[arg(required = true)] inputs: Vec, - /// Output file path (if omitted, writes to stdout) + /// Output file path base (date prefix will be added); if omitted, writes JSON to stdout #[arg(short, long, value_name = "FILE")] output: Option, + + /// Language code to use for transcription (e.g., en, de). No auto-detection. + #[arg(short, long, value_name = "LANG")] + language: Option, } #[derive(Debug, Deserialize)] @@ -78,7 +85,7 @@ fn render_srt(items: &[OutputEntry]) -> String { out } -// --- Helpers for audio transcription via openai-whisper --- +// --- Helpers for audio transcription --- fn is_json_file(path: &Path) -> bool { matches!(path.extension().and_then(|s| s.to_str()).map(|s| s.to_lowercase()), Some(ext) if ext == "json") } @@ -93,75 +100,153 @@ fn is_audio_file(path: &Path) -> bool { false } -fn ensure_whisper_venv() -> Result { - let venv_dir = Path::new(".venv"); - let bin_dir = venv_dir.join("bin"); - let python_bin = bin_dir.join("python"); - // Create venv if needed - if !python_bin.exists() { - let status = Command::new("python") - .arg("-m").arg("venv").arg(".venv") - .status() - .with_context(|| "Failed to execute: python -m venv .venv")?; - if !status.success() { - return Err(anyhow!("Creating virtual environment failed (python -m venv .venv)")); - } - } - // Install openai-whisper if needed - let whisper_bin = bin_dir.join("whisper"); - if !whisper_bin.exists() { - let status = Command::new(&python_bin) - .arg("-m").arg("pip").arg("install").arg("-U").arg("openai-whisper") - .status() - .with_context(|| "Failed to execute: .venv/bin/python -m pip install -U openai-whisper")?; - if !status.success() { - return Err(anyhow!("Installing openai-whisper failed")); - } - } - Ok(whisper_bin) +fn normalize_lang_code(input: &str) -> Option { + let mut s = input.trim().to_lowercase(); + if s.is_empty() || s == "auto" || s == "c" || s == "posix" { return None; } + if let Some((lhs, _)) = s.split_once('.') { s = lhs.to_string(); } + if let Some((lhs, _)) = s.split_once('_') { s = lhs.to_string(); } + let code = match s.as_str() { + // ISO codes directly + "en"=>"en","de"=>"de","es"=>"es","fr"=>"fr","it"=>"it","pt"=>"pt","nl"=>"nl","ru"=>"ru","pl"=>"pl", + "uk"=>"uk","cs"=>"cs","sv"=>"sv","no"=>"no","da"=>"da","fi"=>"fi","hu"=>"hu","tr"=>"tr","el"=>"el", + "zh"=>"zh","ja"=>"ja","ko"=>"ko","ar"=>"ar","he"=>"he","hi"=>"hi","ro"=>"ro","bg"=>"bg","sk"=>"sk", + // Common English names + "english"=>"en","german"=>"de","spanish"=>"es","french"=>"fr","italian"=>"it","portuguese"=>"pt", + "dutch"=>"nl","russian"=>"ru","polish"=>"pl","ukrainian"=>"uk","czech"=>"cs","swedish"=>"sv", + "norwegian"=>"no","danish"=>"da","finnish"=>"fi","hungarian"=>"hu","turkish"=>"tr","greek"=>"el", + "chinese"=>"zh","japanese"=>"ja","korean"=>"ko","arabic"=>"ar","hebrew"=>"he","hindi"=>"hi", + "romanian"=>"ro","bulgarian"=>"bg","slovak"=>"sk", + _ => return None, + }; + Some(code.to_string()) } -fn run_whisper(audio_path: &Path, out_dir: &Path, whisper_bin: &Path) -> Result { - create_dir_all(out_dir).with_context(|| format!("Failed to create whisper output dir: {}", out_dir.display()))?; - let out_dir_str = out_dir.to_string_lossy().to_string(); - // Try with -o first (as per requested command) - let attempt = |use_short_o: bool| -> Result<()> { - let mut cmd = Command::new(whisper_bin); - cmd.arg(audio_path) - .arg("--model").arg("turbo") - .arg("--output_format").arg("json"); - if use_short_o { - cmd.arg("-o").arg(&out_dir_str); - } else { - cmd.arg("--output_dir").arg(&out_dir_str); + +#[cfg(feature = "native-whisper")] +fn find_model_file() -> Result { + let models_dir = Path::new("models"); + if !models_dir.exists() { + return Err(anyhow!("No models directory found at {}", models_dir.display())); + } + let mut candidates: Vec = Vec::new(); + let rd = std::fs::read_dir(models_dir) + .with_context(|| format!("Failed to read models directory: {}", models_dir.display()))?; + for entry in rd { + let entry = entry?; + let path = entry.path(); + if path.is_file() { + if let Some(ext) = path.extension().and_then(|s| s.to_str()).map(|s| s.to_lowercase()) { + if ext == "bin" { + candidates.push(path); + } + } } - let output = cmd.output() - .with_context(|| format!("Failed to execute whisper for {}", audio_path.display()))?; - if !output.status.success() { + } + if candidates.is_empty() { + return Err(anyhow!("No Whisper model files (*.bin) found in {}", models_dir.display())); + } + if candidates.len() == 1 { + return Ok(candidates.remove(0)); + } + // Multiple models: prompt user to choose + eprintln!("Multiple Whisper models found in {}:", models_dir.display()); + for (i, p) in candidates.iter().enumerate() { + eprintln!(" {}) {}", i + 1, p.display()); + } + eprint!("Select model by number [1-{}]: ", candidates.len()); + io::stderr().flush().ok(); + let mut input = String::new(); + io::stdin().read_line(&mut input).context("Failed to read selection")?; + let sel: usize = input.trim().parse().map_err(|_| anyhow!("Invalid selection: {}", input.trim()))?; + if sel == 0 || sel > candidates.len() { + return Err(anyhow!("Selection out of range")); + } + Ok(candidates.swap_remove(sel - 1)) +} + +#[cfg(feature = "native-whisper")] +fn decode_audio_to_pcm_f32_ffmpeg(audio_path: &Path) -> Result> { + let output = Command::new("ffmpeg") + .arg("-i").arg(audio_path) + .arg("-f").arg("f32le") + .arg("-ac").arg("1") + .arg("-ar").arg("16000") + .arg("pipe:1") + .output() + .with_context(|| format!("Failed to execute ffmpeg for {}", audio_path.display()))?; + if !output.status.success() { + return Err(anyhow!( + "ffmpeg failed for {}: {}", + audio_path.display(), + String::from_utf8_lossy(&output.stderr) + )); + } + let bytes = output.stdout; + if bytes.len() % 4 != 0 { + // Truncate to nearest multiple of 4 bytes to avoid partial f32 + let truncated = bytes.len() - (bytes.len() % 4); + let mut v = Vec::with_capacity(truncated / 4); + for chunk in bytes[..truncated].chunks_exact(4) { + let arr = [chunk[0], chunk[1], chunk[2], chunk[3]]; + v.push(f32::from_le_bytes(arr)); + } + Ok(v) + } else { + let mut v = Vec::with_capacity(bytes.len() / 4); + for chunk in bytes.chunks_exact(4) { + let arr = [chunk[0], chunk[1], chunk[2], chunk[3]]; + v.push(f32::from_le_bytes(arr)); + } + Ok(v) + } +} + +#[cfg(feature = "native-whisper")] +fn transcribe_native(audio_path: &Path, speaker: &str, lang_opt: Option<&str>) -> Result> { + let pcm = decode_audio_to_pcm_f32_ffmpeg(audio_path)?; + let model = find_model_file()?; + let is_en_only = model + .file_name() + .and_then(|s| s.to_str()) + .map(|s| s.contains(".en.") || s.ends_with(".en.bin")) + .unwrap_or(false); + if let Some(lang) = lang_opt { + if is_en_only && lang != "en" { return Err(anyhow!( - "Whisper transcription failed for {}: {}", - audio_path.display(), - String::from_utf8_lossy(&output.stderr) + "Selected model is English-only ({}), but a non-English language hint '{}' was provided. Please use a multilingual model like models/ggml-base.bin or set WHISPER_MODEL accordingly.", + model.display(), + lang )); } - Ok(()) - }; - - if let Err(_e) = attempt(true) { - // Fallback to --output_dir if -o not supported by CLI installed - attempt(false)?; } + let model_str = model.to_str().ok_or_else(|| anyhow!("Model path not valid UTF-8: {}", model.display()))?; + let ctx = WhisperContext::new_with_params(model_str, WhisperContextParameters::default()) + .with_context(|| format!("Failed to load Whisper model at {}", model.display()))?; + let mut state = ctx.create_state() + .map_err(|e| anyhow!("Failed to create Whisper state: {:?}", e))?; - // Try to locate the resulting JSON file - let stem = audio_path.file_stem().and_then(|s| s.to_str()).unwrap_or("transcript"); - let cand1 = out_dir.join(format!("{}.json", stem)); - if cand1.exists() { return Ok(cand1); } - if let Some(file_name) = audio_path.file_name().and_then(|s| s.to_str()) { - let cand2 = out_dir.join(format!("{}.json", file_name)); - if cand2.exists() { return Ok(cand2); } + let mut params = FullParams::new(SamplingStrategy::Greedy { best_of: 1 }); + let n_threads = std::thread::available_parallelism().map(|n| n.get() as i32).unwrap_or(1); + params.set_n_threads(n_threads); + params.set_translate(false); + if let Some(lang) = lang_opt { params.set_language(Some(lang)); } + + state.full(params, &pcm) + .map_err(|e| anyhow!("Whisper full() failed: {:?}", e))?; + + let num_segments = state.full_n_segments().map_err(|e| anyhow!("Failed to get segments: {:?}", e))?; + let mut items = Vec::new(); + for i in 0..num_segments { + let text = state.full_get_segment_text(i) + .map_err(|e| anyhow!("Failed to get segment text: {:?}", e))?; + let t0 = state.full_get_segment_t0(i).map_err(|e| anyhow!("Failed to get segment t0: {:?}", e))?; + let t1 = state.full_get_segment_t1(i).map_err(|e| anyhow!("Failed to get segment t1: {:?}", e))?; + let start = (t0 as f64) * 0.01; + let end = (t1 as f64) * 0.01; + items.push(OutputEntry { id: 0, speaker: speaker.to_string(), start, end, text: text.trim().to_string() }); } - Err(anyhow!("Could not find whisper JSON output in {} for input {}", out_dir.display(), audio_path.display())) + Ok(items) } fn main() -> Result<()> { @@ -182,22 +267,16 @@ fn main() -> Result<()> { return Err(anyhow!("No input files provided")); } - // Determine directory for intermediate Whisper transcriptions - let transcripts_dir: PathBuf = if let Some(ref out_path) = output_path { - let base_path = Path::new(out_path); - if let Some(parent) = base_path.parent() { - if parent.as_os_str().is_empty() { - PathBuf::from("output").join("transcripts") - } else { - parent.join("transcripts") - } - } else { - PathBuf::from("output").join("transcripts") - } + // Language must be provided via CLI when transcribing audio (no detection from JSON/env) + let lang_hint: Option = if let Some(ref l) = args.language { + normalize_lang_code(l).or_else(|| Some(l.trim().to_lowercase())) } else { - PathBuf::from("output").join("transcripts") + None }; - let mut whisper_bin_opt: Option = None; + let any_audio = inputs.iter().any(|p| is_audio_file(Path::new(p))); + if any_audio && lang_hint.is_none() { + return Err(anyhow!("Please specify --language (e.g., --language en). Language detection was removed.")); + } let mut entries: Vec = Vec::new(); @@ -211,16 +290,18 @@ fn main() -> Result<()> { let mut buf = String::new(); if is_audio_file(path) { - // Ensure whisper is available - if whisper_bin_opt.is_none() { - whisper_bin_opt = Some(ensure_whisper_venv()?); + #[cfg(feature = "native-whisper")] + { + let items = transcribe_native(path, &speaker, lang_hint.as_deref())?; + for e in items { + entries.push(e); + } + continue; + } + #[cfg(not(feature = "native-whisper"))] + { + return Err(anyhow!("Python transcription has been removed. Please build with --features native-whisper to transcribe audio.")); } - let whisper_bin = whisper_bin_opt.as_ref().unwrap(); - let json_path = run_whisper(path, &transcripts_dir, whisper_bin)?; - File::open(&json_path) - .with_context(|| format!("Failed to open Whisper JSON: {}", json_path.display()))? - .read_to_string(&mut buf) - .with_context(|| format!("Failed to read Whisper JSON: {}", json_path.display()))?; } else if is_json_file(path) { File::open(path) .with_context(|| format!("Failed to open: {}", input_path))?