[feat] add support for audio transcription using openai-whisper

This commit is contained in:
2025-08-08 04:19:23 +02:00
parent a57d631f03
commit 36902e8de1

View File

@@ -1,6 +1,7 @@
use std::fs::{File, create_dir_all}; use std::fs::{File, create_dir_all};
use std::io::{self, Read, Write}; use std::io::{self, Read, Write};
use std::path::Path; use std::path::{Path, PathBuf};
use std::process::Command;
use anyhow::{anyhow, Context, Result}; use anyhow::{anyhow, Context, Result};
use clap::Parser; use clap::Parser;
@@ -77,6 +78,92 @@ fn render_srt(items: &[OutputEntry]) -> String {
out out
} }
// --- Helpers for audio transcription via openai-whisper ---
/// Return `true` when `path` has a `.json` extension (case-insensitive).
fn is_json_file(path: &Path) -> bool {
    match path.extension().and_then(|e| e.to_str()) {
        Some(ext) => ext.to_lowercase() == "json",
        None => false,
    }
}
/// Return `true` when `path` carries an extension (case-insensitive) of a
/// media format whisper can transcribe. The list deliberately includes
/// video containers (mp4, mkv, mov, …) since whisper extracts their audio.
fn is_audio_file(path: &Path) -> bool {
    const MEDIA_EXTS: [&str; 17] = [
        "mp3","wav","m4a","mp4","aac","flac","ogg","wma","webm","mkv","mov","avi","m4b","3gp","opus","aiff","alac"
    ];
    path.extension()
        .and_then(|e| e.to_str())
        .map(|e| e.to_lowercase())
        .map_or(false, |ext| MEDIA_EXTS.contains(&ext.as_str()))
}
/// Ensure a local Python virtual environment at `.venv` with the
/// `openai-whisper` package installed, and return the path to the
/// `whisper` CLI binary inside it.
///
/// Both steps are idempotent: venv creation is skipped when the venv's
/// interpreter already exists, and installation is skipped when the
/// `whisper` entry point already exists, so repeated calls are cheap.
///
/// # Errors
/// Fails when no Python interpreter is available, when venv creation
/// fails, or when installing `openai-whisper` fails.
fn ensure_whisper_venv() -> Result<PathBuf> {
    let venv_dir = Path::new(".venv");
    // NOTE(review): unix venv layout; on Windows the binaries live in
    // `Scripts` instead of `bin` — confirm if Windows support is needed.
    let bin_dir = venv_dir.join("bin");
    let python_bin = bin_dir.join("python");

    // Create the venv if its interpreter is missing.
    if !python_bin.exists() {
        // Prefer `python3` (the only name shipped on many modern systems,
        // e.g. Debian/Ubuntu and macOS), falling back to plain `python`.
        let created = ["python3", "python"].iter().any(|py| {
            Command::new(py)
                .args(["-m", "venv", ".venv"])
                .status()
                .map(|s| s.success())
                .unwrap_or(false)
        });
        if !created {
            return Err(anyhow!(
                "Creating virtual environment failed (tried `python3 -m venv .venv` and `python -m venv .venv`)"
            ));
        }
    }

    // Install openai-whisper only when its CLI entry point is absent.
    let whisper_bin = bin_dir.join("whisper");
    if !whisper_bin.exists() {
        let status = Command::new(&python_bin)
            .args(["-m", "pip", "install", "-U", "openai-whisper"])
            .status()
            .with_context(|| "Failed to execute: .venv/bin/python -m pip install -U openai-whisper")?;
        if !status.success() {
            return Err(anyhow!("Installing openai-whisper failed"));
        }
    }

    Ok(whisper_bin)
}
fn run_whisper(audio_path: &Path, out_dir: &Path, whisper_bin: &Path) -> Result<PathBuf> {
create_dir_all(out_dir).with_context(|| format!("Failed to create whisper output dir: {}", out_dir.display()))?;
let out_dir_str = out_dir.to_string_lossy().to_string();
// Try with -o first (as per requested command)
let attempt = |use_short_o: bool| -> Result<()> {
let mut cmd = Command::new(whisper_bin);
cmd.arg(audio_path)
.arg("--model").arg("turbo")
.arg("--output_format").arg("json");
if use_short_o {
cmd.arg("-o").arg(&out_dir_str);
} else {
cmd.arg("--output_dir").arg(&out_dir_str);
}
let output = cmd.output()
.with_context(|| format!("Failed to execute whisper for {}", audio_path.display()))?;
if !output.status.success() {
return Err(anyhow!(
"Whisper transcription failed for {}: {}",
audio_path.display(),
String::from_utf8_lossy(&output.stderr)
));
}
Ok(())
};
if let Err(_e) = attempt(true) {
// Fallback to --output_dir if -o not supported by CLI installed
attempt(false)?;
}
// Try to locate the resulting JSON file
let stem = audio_path.file_stem().and_then(|s| s.to_str()).unwrap_or("transcript");
let cand1 = out_dir.join(format!("{}.json", stem));
if cand1.exists() { return Ok(cand1); }
if let Some(file_name) = audio_path.file_name().and_then(|s| s.to_str()) {
let cand2 = out_dir.join(format!("{}.json", file_name));
if cand2.exists() { return Ok(cand2); }
}
Err(anyhow!("Could not find whisper JSON output in {} for input {}", out_dir.display(), audio_path.display()))
}
fn main() -> Result<()> { fn main() -> Result<()> {
let args = Args::parse(); let args = Args::parse();
@@ -95,23 +182,56 @@ fn main() -> Result<()> {
return Err(anyhow!("No input files provided")); return Err(anyhow!("No input files provided"));
} }
// Determine directory for intermediate Whisper transcriptions.
// Rule: when an explicit output path was given, transcripts go next to it
// in `<output parent>/transcripts`; in every other case they go under the
// default `output/transcripts`.
let transcripts_dir: PathBuf = if let Some(ref out_path) = output_path {
let base_path = Path::new(out_path);
if let Some(parent) = base_path.parent() {
// `parent()` returns an empty path for bare file names like
// "out.json"; treat that the same as having no parent.
if parent.as_os_str().is_empty() {
PathBuf::from("output").join("transcripts")
} else {
parent.join("transcripts")
}
} else {
PathBuf::from("output").join("transcripts")
}
} else {
PathBuf::from("output").join("transcripts")
};
// Lazily-resolved path to the `whisper` CLI; populated on the first audio
// input so pure-JSON runs never touch the Python venv.
let mut whisper_bin_opt: Option<PathBuf> = None;
let mut entries: Vec<OutputEntry> = Vec::new(); let mut entries: Vec<OutputEntry> = Vec::new();
for input_path in &inputs { for input_path in &inputs {
let speaker = Path::new(input_path) let path = Path::new(input_path);
let speaker = path
.file_stem() .file_stem()
.and_then(|s| s.to_str()) .and_then(|s| s.to_str())
.unwrap_or("speaker") .unwrap_or("speaker")
.to_string(); .to_string();
let mut buf = String::new(); let mut buf = String::new();
File::open(input_path) if is_audio_file(path) {
.with_context(|| format!("Failed to open: {}", input_path))? // Ensure whisper is available
.read_to_string(&mut buf) if whisper_bin_opt.is_none() {
.with_context(|| format!("Failed to read: {}", input_path))?; whisper_bin_opt = Some(ensure_whisper_venv()?);
}
let whisper_bin = whisper_bin_opt.as_ref().unwrap();
let json_path = run_whisper(path, &transcripts_dir, whisper_bin)?;
File::open(&json_path)
.with_context(|| format!("Failed to open Whisper JSON: {}", json_path.display()))?
.read_to_string(&mut buf)
.with_context(|| format!("Failed to read Whisper JSON: {}", json_path.display()))?;
} else if is_json_file(path) {
File::open(path)
.with_context(|| format!("Failed to open: {}", input_path))?
.read_to_string(&mut buf)
.with_context(|| format!("Failed to read: {}", input_path))?;
} else {
return Err(anyhow!(format!("Unsupported input type (expected .json or audio media): {}", input_path)));
}
let root: InputRoot = serde_json::from_str(&buf) let root: InputRoot = serde_json::from_str(&buf)
.with_context(|| format!("Invalid JSON: {}", input_path))?; .with_context(|| format!("Invalid JSON transcript parsed from {}", input_path))?;
for seg in root.segments { for seg in root.segments {
entries.push(OutputEntry { entries.push(OutputEntry {