From 36902e8de185ad20ff0774e2c0928c8039b6fc4b Mon Sep 17 00:00:00 2001 From: vikingowl Date: Fri, 8 Aug 2025 04:19:23 +0200 Subject: [PATCH] [feat] add support for audio transcription using openai-whisper --- src/main.rs | 134 +++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 127 insertions(+), 7 deletions(-) diff --git a/src/main.rs b/src/main.rs index fe5caae..70e0b34 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,6 +1,7 @@ use std::fs::{File, create_dir_all}; use std::io::{self, Read, Write}; -use std::path::Path; +use std::path::{Path, PathBuf}; +use std::process::Command; use anyhow::{anyhow, Context, Result}; use clap::Parser; @@ -77,6 +78,92 @@ fn render_srt(items: &[OutputEntry]) -> String { out } +// --- Helpers for audio transcription via openai-whisper --- +fn is_json_file(path: &Path) -> bool { + matches!(path.extension().and_then(|s| s.to_str()).map(|s| s.to_lowercase()), Some(ext) if ext == "json") +} + +fn is_audio_file(path: &Path) -> bool { + if let Some(ext) = path.extension().and_then(|s| s.to_str()).map(|s| s.to_lowercase()) { + let exts = [ + "mp3","wav","m4a","mp4","aac","flac","ogg","wma","webm","mkv","mov","avi","m4b","3gp","opus","aiff","alac" + ]; + return exts.contains(&ext.as_str()); + } + false +} + +fn ensure_whisper_venv() -> Result<PathBuf> { + let venv_dir = Path::new(".venv"); + let bin_dir = venv_dir.join("bin"); + let python_bin = bin_dir.join("python"); + // Create venv if needed + if !python_bin.exists() { + let status = Command::new("python") + .arg("-m").arg("venv").arg(".venv") + .status() + .with_context(|| "Failed to execute: python -m venv .venv")?; + if !status.success() { + return Err(anyhow!("Creating virtual environment failed (python -m venv .venv)")); + } + } + // Install openai-whisper if needed + let whisper_bin = bin_dir.join("whisper"); + if !whisper_bin.exists() { + let status = Command::new(&python_bin) + .arg("-m").arg("pip").arg("install").arg("-U").arg("openai-whisper") + .status() + 
.with_context(|| "Failed to execute: .venv/bin/python -m pip install -U openai-whisper")?; + if !status.success() { + return Err(anyhow!("Installing openai-whisper failed")); + } + } + Ok(whisper_bin) +} + +fn run_whisper(audio_path: &Path, out_dir: &Path, whisper_bin: &Path) -> Result<PathBuf> { + create_dir_all(out_dir).with_context(|| format!("Failed to create whisper output dir: {}", out_dir.display()))?; + let out_dir_str = out_dir.to_string_lossy().to_string(); + + // Try with -o first (as per requested command) + let attempt = |use_short_o: bool| -> Result<()> { + let mut cmd = Command::new(whisper_bin); + cmd.arg(audio_path) + .arg("--model").arg("turbo") + .arg("--output_format").arg("json"); + if use_short_o { + cmd.arg("-o").arg(&out_dir_str); + } else { + cmd.arg("--output_dir").arg(&out_dir_str); + } + let output = cmd.output() + .with_context(|| format!("Failed to execute whisper for {}", audio_path.display()))?; + if !output.status.success() { + return Err(anyhow!( + "Whisper transcription failed for {}: {}", + audio_path.display(), + String::from_utf8_lossy(&output.stderr) + )); + } + Ok(()) + }; + + if let Err(_e) = attempt(true) { + // Fallback to --output_dir if -o not supported by CLI installed + attempt(false)?; + } + + // Try to locate the resulting JSON file + let stem = audio_path.file_stem().and_then(|s| s.to_str()).unwrap_or("transcript"); + let cand1 = out_dir.join(format!("{}.json", stem)); + if cand1.exists() { return Ok(cand1); } + if let Some(file_name) = audio_path.file_name().and_then(|s| s.to_str()) { + let cand2 = out_dir.join(format!("{}.json", file_name)); + if cand2.exists() { return Ok(cand2); } + } + Err(anyhow!("Could not find whisper JSON output in {} for input {}", out_dir.display(), audio_path.display())) +} + fn main() -> Result<()> { let args = Args::parse(); @@ -95,23 +182,56 @@ fn main() -> Result<()> { return Err(anyhow!("No input files provided")); } + // Determine directory for intermediate Whisper transcriptions + let 
transcripts_dir: PathBuf = if let Some(ref out_path) = output_path { + let base_path = Path::new(out_path); + if let Some(parent) = base_path.parent() { + if parent.as_os_str().is_empty() { + PathBuf::from("output").join("transcripts") + } else { + parent.join("transcripts") + } + } else { + PathBuf::from("output").join("transcripts") + } + } else { + PathBuf::from("output").join("transcripts") + }; + let mut whisper_bin_opt: Option<PathBuf> = None; + let mut entries: Vec<OutputEntry> = Vec::new(); for input_path in &inputs { - let speaker = Path::new(input_path) + let path = Path::new(input_path); + let speaker = path .file_stem() .and_then(|s| s.to_str()) .unwrap_or("speaker") .to_string(); let mut buf = String::new(); - File::open(input_path) - .with_context(|| format!("Failed to open: {}", input_path))? - .read_to_string(&mut buf) - .with_context(|| format!("Failed to read: {}", input_path))?; + if is_audio_file(path) { + // Ensure whisper is available + if whisper_bin_opt.is_none() { + whisper_bin_opt = Some(ensure_whisper_venv()?); + } + let whisper_bin = whisper_bin_opt.as_ref().unwrap(); + let json_path = run_whisper(path, &transcripts_dir, whisper_bin)?; + File::open(&json_path) + .with_context(|| format!("Failed to open Whisper JSON: {}", json_path.display()))? + .read_to_string(&mut buf) + .with_context(|| format!("Failed to read Whisper JSON: {}", json_path.display()))?; + } else if is_json_file(path) { + File::open(path) + .with_context(|| format!("Failed to open: {}", input_path))? + .read_to_string(&mut buf) + .with_context(|| format!("Failed to read: {}", input_path))?; + } else { + return Err(anyhow!(format!("Unsupported input type (expected .json or audio media): {}", input_path))); + } let root: InputRoot = serde_json::from_str(&buf) - .with_context(|| format!("Invalid JSON: {}", input_path))?; + .with_context(|| format!("Invalid JSON transcript parsed from {}", input_path))?; for seg in root.segments { entries.push(OutputEntry {