From f47f3f32a36825ad645b38b39c5480515b051e88 Mon Sep 17 00:00:00 2001 From: vikingowl Date: Fri, 8 Aug 2025 19:53:00 +0200 Subject: [PATCH] [feat] add example scripts for transcription, model downloading, and updates; improve documentation with guides for CI, packaging, and development --- CONTRIBUTING.md | 32 +++++++++ README.md | 77 ++++++++++++++++++++++ docs/ci.md | 26 ++++++++ docs/design.md | 37 +++++++++++ docs/development.md | 73 +++++++++++++++++++++ docs/faq.md | 26 ++++++++ docs/release-packaging.md | 33 ++++++++++ docs/usage.md | 86 +++++++++++++++++++++++++ examples/download_models_interactive.sh | 11 ++++ examples/transcribe_file.sh | 13 ++++ examples/update_models.sh | 13 ++++ src/backend.rs | 1 + src/models.rs | 1 + 13 files changed, 429 insertions(+) create mode 100644 CONTRIBUTING.md create mode 100644 README.md create mode 100644 docs/ci.md create mode 100644 docs/design.md create mode 100644 docs/development.md create mode 100644 docs/faq.md create mode 100644 docs/release-packaging.md create mode 100644 docs/usage.md create mode 100644 examples/download_models_interactive.sh create mode 100644 examples/transcribe_file.sh create mode 100644 examples/update_models.sh diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..08fa35b --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,32 @@ +# Contributing to PolyScribe + +Thanks for your interest in contributing! This guide explains the workflow and the checklist to follow before opening a Pull Request. + +Workflow (fork → branch → PR) +1) Fork the repository to your account. +2) Create a feature branch: +- git checkout -b feat/short-description +3) Make changes with focused commits and good messages. +4) Run the checklist below. +5) Push and open a Pull Request against the main repository. 
+ +Developer checklist (before opening a PR) +- Build: + - cargo build (preferably without warnings) +- Tests: + - cargo test (all tests pass) +- Lints: + - cargo clippy --all-targets -- -D warnings (fix warnings) +- Documentation: + - Update README/docs for user-visible changes + - Update CHANGELOG.md if applicable +- Tests for changes: + - Add or update tests for bug fixes and new features where reasonable + +Local development tips +- Use `cargo run -- <args>` during development. +- For faster feedback, keep examples in the examples/ folder handy. +- Keep functions small and focused; prefer clear error messages with context. + +Code of conduct +- Be respectful and constructive. Assume good intent. diff --git a/README.md b/README.md new file mode 100644 index 0000000..fb8def6 --- /dev/null +++ b/README.md @@ -0,0 +1,77 @@ +# PolyScribe + +PolyScribe is a fast, local-first CLI for transcribing audio/video and merging existing JSON transcripts. It uses whisper-rs under the hood, can discover and download Whisper models automatically, and supports CPU and optional GPU backends (CUDA, ROCm/HIP, Vulkan). + +Key features +- Transcribe audio and common video files using ffmpeg for audio extraction. +- Merge multiple JSON transcripts, or merge and also keep per-file outputs. +- Model management: interactive downloader and non-interactive updater with hash verification. +- GPU backend selection at runtime; auto-detects available accelerators. +- Clean outputs (JSON and SRT), speaker naming prompts, and useful logging controls. + +Quickstart +1) Install Rust (rustup) and ffmpeg, then build: +- rustup install stable +- rustup default stable +- cargo build --release + +2) Download a model (first run can prompt you): +- ./target/release/polyscribe --download-models + +3) Transcribe a file: +- ./target/release/polyscribe -v -o output my_audio.mp3 +This writes JSON and SRT into the output directory with a date prefix. 
+ +Model locations +- Development (debug builds): ./models next to the project. +- Packaged/release builds: $XDG_DATA_HOME/polyscribe/models or ~/.local/share/polyscribe/models. +- Override via env var: POLYSCRIBE_MODELS_DIR=/path/to/models. +- Force a specific model file via env var: WHISPER_MODEL=/path/to/model.bin. + +Most-used CLI flags +- -o, --output FILE_OR_DIR: Output path base (date prefix added). If omitted, JSON prints to stdout. +- -m, --merge: Merge all inputs into one output; otherwise one output per input. +- --merge-and-separate: Write both merged output and separate per-input outputs (requires -o dir). +- --set-speaker-names: Prompt for a speaker label per input file. +- --update-models: Verify/update local models by size/hash against the upstream manifest. +- --download-models: Interactive model list + multi-select download. +- --language LANG: Language code hint (e.g., en, de). English-only models reject non-en hints. +- --gpu-backend [auto|cpu|cuda|hip|vulkan]: Select backend (auto by default). +- --gpu-layers N: Offload N layers to GPU when supported. +- -v/--verbose (repeatable): Increase log verbosity. -vv shows very detailed logs. +- -q/--quiet: Suppress non-error logs (stderr); does not silence stdout results. +- --no-interaction: Never prompt; suitable for CI. + +Minimal usage examples +- Transcribe an audio file to JSON/SRT: + - ./target/release/polyscribe -o output samples/podcast_clip.mp3 +- Merge multiple transcripts into one: + - ./target/release/polyscribe -m -o output merged input/a.json input/b.json +- Update local models non-interactively (good for CI): + - ./target/release/polyscribe --update-models --no-interaction -q + +Running tests and tools +- cargo test +- cargo clippy --all-targets -- -D warnings +- cargo build (preferably without warnings) + +Model downloader +- Interactive: ./target/release/polyscribe --download-models +- Non-interactive: relies on defaults; set WHISPER_MODEL or POLYSCRIBE_MODELS_DIR when needed. 
+ +Documentation index +- docs/usage.md – complete CLI reference and workflows +- docs/development.md – build, run, and contribute locally +- docs/design.md – architecture overview and decisions +- docs/release-packaging.md – packaging notes for distributions +- docs/faq.md – common issues and solutions +- docs/ci.md – minimal CI checklist and job outline +- CONTRIBUTING.md – PR checklist and workflow + +CI status: [CI badge placeholder] + +Examples +See the examples/ directory for copy-paste scripts: +- examples/transcribe_file.sh +- examples/update_models.sh +- examples/download_models_interactive.sh diff --git a/docs/ci.md b/docs/ci.md new file mode 100644 index 0000000..7976bdf --- /dev/null +++ b/docs/ci.md @@ -0,0 +1,26 @@ +# CI checklist and job outline + +Checklist to keep docs and code healthy in CI +- Build: cargo build --all-targets --locked +- Tests: cargo test --all --locked +- Lints: cargo clippy --all-targets -- -D warnings +- Optional: check README and docs snippets (basic smoke run of examples scripts) + - bash examples/update_models.sh (can be skipped offline) + - bash examples/transcribe_file.sh (use a tiny sample file if available) + +Example GitHub Actions job (outline) +- name: Rust + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: dtolnay/rust-toolchain@stable + - name: Build + run: cargo build --all-targets --locked + - name: Test + run: cargo test --all --locked + - name: Clippy + run: cargo clippy --all-targets -- -D warnings + +Notes +- For GPU features, set up appropriate runners and add `--features gpu-cuda|gpu-hip|gpu-vulkan` where applicable. +- For docs-only changes, jobs still build/test to ensure doctests and examples compile when enabled. 
diff --git a/docs/design.md b/docs/design.md new file mode 100644 index 0000000..3c24d76 --- /dev/null +++ b/docs/design.md @@ -0,0 +1,37 @@ +# Design + +Overview +PolyScribe is a CLI that orchestrates: +- CLI parsing and I/O (main.rs) +- Core library (lib.rs) exposing reusable logic +- Backends for transcription (backend.rs) that bind to whisper-rs +- Model management (models.rs) that discovers/downloads/verifies models + +Data flow +1) CLI collects inputs (media or JSON), options (merge, speaker names, language, GPU backend), and output path. +2) For media, audio is extracted via ffmpeg to PCM f32 in-memory. +3) A Whisper model is selected (env var override, last-used, interactive download, or non-interactive default). +4) The selected backend performs transcription via whisper-rs producing segments. +5) Segments are merged/organized and written to JSON and SRT as requested. + +Key decisions +- Local-first: default to local models in ./models (debug) or XDG data dir (release) for predictable behavior. +- Whisper model selection: last-used cache (.last_model) provides stable default across runs. +- Non-interactive mode: avoid prompts for CI; download a sensible default if needed. +- Logging: simple macros (elog!/wlog!/ilog!/dlog!) with quiet/verbose controls; stderr used for diagnostics. +- GPU selection: runtime auto-detect with compile-time feature gates per backend. + +Model discovery & verification (conceptual) +- Remote model list pulled from Hugging Face repositories. +- For each model entry we track name, size, and optionally SHA-256. +- Downloads verify size and hash when available; updates compare local files against the manifest. +- Best local model is chosen based on reasonable heuristics (e.g., prefer larger quantized variants when available) to balance quality and speed. + +Extensibility +- New backends: implement TranscribeBackend and add selection wiring in select_backend. 
+- New model sources: extend models.rs to read additional manifests or repositories. +- Packaging: respect XDG_DATA_HOME/HOME; allow POLYSCRIBE_MODELS_DIR override; avoid hard-coding system paths. + +Binary naming and CLI surface +- Binary is `polyscribe`. +- Keep CLI flags stable and documented; add new flags conservatively. diff --git a/docs/development.md b/docs/development.md new file mode 100644 index 0000000..757e295 --- /dev/null +++ b/docs/development.md @@ -0,0 +1,73 @@ +# Development + +This document describes how to build, test, and run PolyScribe locally, and how to work with models during development. + +Prerequisites +- Rust toolchain via rustup (recommended). +- ffmpeg installed and on PATH. +- For GPU builds: appropriate toolkits and libraries installed (CUDA/ROCm/Vulkan). + +Rust toolchain +- Target stable Rust. +- Recommended: + - rustup install stable + - rustup default stable + +Build +- CPU-only (default): + - cargo build + - cargo test +- Enable GPU features at build time: + - CUDA: cargo build --features gpu-cuda + - HIP: cargo build --features gpu-hip + - Vulkan: cargo build --features gpu-vulkan + +Run locally +- Development model directory defaults to ./models when built in debug mode. +- You can override: + - POLYSCRIBE_MODELS_DIR=/path/to/models + - WHISPER_MODEL=/path/to/model.bin (forces a specific file) +- Example run: + - cargo run -- -v -o output samples/example.mp3 + +Models during development +- Interactive downloader: + - cargo run -- --download-models +- Non-interactive update (checks sizes/hashes, downloads if missing): + - cargo run -- --update-models --no-interaction -q + +Tests +- Run all tests: + - cargo test +- The test suite includes CLI-oriented integration tests and unit tests. Some tests simulate GPU detection using env vars (POLYSCRIBE_TEST_FORCE_*). Do not rely on these flags in production code. 
+ +Clippy +- Run lint checks and treat warnings as errors: + - cargo clippy --all-targets -- -D warnings +- Common warnings can often be fixed by simplifying code, removing unused imports, and following idiomatic patterns. + +Code layout +- src/lib.rs: core library surface and re-exports + - backend: runtime backend selection and transcription glue + - models: model discovery, manifest, download/update logic +- src/main.rs: CLI parsing, I/O orchestration, logging, and workflows +- tests/: integration tests + +Adding a feature +- Find the closest existing module (backend/models/main) and add a small, focused unit test. +- Keep user-facing changes documented in README/docs and update CHANGELOG.md. +- Prefer small functions with clear responsibilities; avoid exposing unnecessary items publicly. +- Follow existing logging style (elog!/wlog!/ilog!/dlog!). + +Running the model downloader +- Interactive: + - cargo run -- --download-models +- Non-interactive suggestions for CI: + - POLYSCRIBE_MODELS_DIR=$PWD/models \ + cargo run -- --update-models --no-interaction -q + +Env var examples for local testing +- Use a local copy of models and a specific model file: + - export POLYSCRIBE_MODELS_DIR=$PWD/models + - export WHISPER_MODEL=$PWD/models/large-v3-turbo-q8_0.bin +- Test manifests/offline copies are handled internally. For full offline runs, pre-populate models/ with the desired .bin files. diff --git a/docs/faq.md b/docs/faq.md new file mode 100644 index 0000000..ecb1557 --- /dev/null +++ b/docs/faq.md @@ -0,0 +1,26 @@ +# FAQ + +Models are missing — what do I do? +- Run `polyscribe --download-models` to pick and download models interactively. +- For CI/non-interactive, set POLYSCRIBE_MODELS_DIR to a writable dir and run `polyscribe --update-models --no-interaction`. +- You can also point to a specific file via `WHISPER_MODEL=/path/to/model.bin`. 
+ +I get timeouts or slow downloads +- Try again later, use a closer mirror if one is available (by setting upstream env vars), or download the model files manually into the models dir. +- Ensure your network allows Hugging Face downloads. + +Non-interactive CI runs hang or fail +- Add `--no-interaction` to disable prompts. +- Set `POLYSCRIBE_MODELS_DIR` to a known location and pre-populate models or run `--update-models`. +- Use `-q` to reduce noise in logs; use `-v` or `-vv` when debugging failures. + +GPU was not detected +- Ensure you built with the matching feature (`gpu-cuda`, `gpu-hip`, or `gpu-vulkan`). +- Install the relevant runtime (CUDA toolkit/driver, ROCm libraries, Vulkan loader/SDK) and ensure libraries are on the loader path. +- Force CPU backend with `--gpu-backend cpu` to verify the rest of the pipeline. + +Which model directory is used in releases? +- For packaged binaries, PolyScribe uses `$XDG_DATA_HOME/polyscribe/models` or `~/.local/share/polyscribe/models` by default. Override with `POLYSCRIBE_MODELS_DIR`. + +SRT timestamps look wrong +- SRT times are derived from model timestamps. If your input has variable sample rate or corrupted timestamps, ensure ffmpeg can decode it; consider re-encoding the audio. diff --git a/docs/release-packaging.md b/docs/release-packaging.md new file mode 100644 index 0000000..e429ff4 --- /dev/null +++ b/docs/release-packaging.md @@ -0,0 +1,33 @@ +# Release packaging + +Model directory layout +- Recommended system path for models: + - $XDG_DATA_HOME/polyscribe/models + - Fallback: ~/.local/share/polyscribe/models +- Allow override with POLYSCRIBE_MODELS_DIR. +- Keep models outside of /usr so users can update without root. + +Binary naming +- Install binary as `polyscribe`. 
+ +Linux distribution tips (Arch example) +- Package name: polyscribe +- Runtime dependencies: + - ffmpeg + - (optional) CUDA/ROCm/Vulkan runtimes matching enabled features +- Build: + - CPU-only: cargo build --release + - CUDA: cargo build --release --features gpu-cuda + - HIP: cargo build --release --features gpu-hip + - Vulkan: cargo build --release --features gpu-vulkan +- Place README and docs under /usr/share/doc/polyscribe. +- Do not place models under /usr/share directly; prefer XDG data path resolved at runtime. + +Man page and completions +- Generate at install time and store under conventional locations: + - polyscribe man > /usr/share/man/man1/polyscribe.1 + - polyscribe completions bash > /usr/share/bash-completion/completions/polyscribe + - polyscribe completions zsh > /usr/share/zsh/site-functions/_polyscribe + +Non-interactive behavior for CI and packaging +- Use --no-interaction and set POLYSCRIBE_MODELS_DIR to a writable location during build/test steps. diff --git a/docs/usage.md b/docs/usage.md new file mode 100644 index 0000000..f378280 --- /dev/null +++ b/docs/usage.md @@ -0,0 +1,86 @@ +# Usage + +PolyScribe is a command-line tool. Run `polyscribe -h` at any time to see the latest help. + +Common patterns +- Single file to transcript (JSON + SRT): + - polyscribe -o output path/to/audio_or_video.mp3 +- Multiple files → merged transcript: + - polyscribe -m -o output merged path/a.mp3 path/b.mp4 +- Multiple files → both merged and separate outputs: + - polyscribe --merge-and-separate -o output path/a.json path/b.json +- Prompt for speaker names per input: + - polyscribe --set-speaker-names -o output path/a.mp3 path/b.mp4 + +CLI reference +- Positional arguments: + - inputs: One or more .json transcripts or media files (audio/video). When media files are given, PolyScribe extracts audio via ffmpeg. +- Flags: + - -o, --output FILE_OR_DIR + - Output base path. For directories, date prefix is added and both .json and .srt are created. 
If omitted, JSON prints to stdout. + - -m, --merge + - Merge all inputs into a single output instead of one output per input. + - --merge-and-separate + - Write both a merged output and separate outputs (requires -o directory). + - --set-speaker-names + - Prompt for a speaker label per input (useful for multi-speaker datasets). + - --language LANG + - Language hint (e.g., en, de). English-only models reject non-en hints. + - --gpu-backend [auto|cpu|cuda|hip|vulkan] + - Choose runtime backend. Default is auto (prefers CUDA → HIP → Vulkan → CPU), depending on detection. + - --gpu-layers N + - Number of layers to offload to the GPU when supported. + - --download-models + - Launch interactive model downloader (lists Hugging Face models; multi-select to download). + - --update-models + - Verify/update local models by comparing sizes and hashes with the upstream manifest. + - -v, --verbose (repeatable) + - Increase log verbosity; use -vv for very detailed logs. + - -q, --quiet + - Suppress non-error logs to stderr; does not affect stdout outputs. + - --no-interaction + - Disable all interactive prompts (for CI). Combine with env vars to control behavior. + - Subcommands: + - completions <shell>: Write shell completion script to stdout. + - man: Write a man page to stdout. + +Expected outputs +- For each processed input or merged group, PolyScribe produces: + - A JSON transcript file with segments (id, speaker, start, end, text). + - An SRT subtitle file with timestamps and text (speaker: prefixed when provided). +- When -o is used with a directory, outputs are written into that directory with a YYYY-MM-DD prefix. 
+ +Typical workflows +1) Single file → transcript: +- polyscribe -o output media/example.mp3 + +2) Multiple files → merged transcript: +- polyscribe -m -o output merged media/a.mp3 media/b.mp4 media/c.wav + +3) Multiple files → both merged and individual transcripts: +- polyscribe --merge-and-separate -o output media/a.json media/b.json + +4) Video → extract audio automatically: +- polyscribe -o output videos/talk.mp4 +(Requires ffmpeg on PATH.) + +Model locations +- Development builds (debug): ./models is used by default. +- Packaged releases: $XDG_DATA_HOME/polyscribe/models or ~/.local/share/polyscribe/models. +- Override: + - POLYSCRIBE_MODELS_DIR=/path/to/models + - WHISPER_MODEL=/path/to/specific_model.bin (forces exact model file). + +Environment variables +- POLYSCRIBE_MODELS_DIR: Override default models directory. +- WHISPER_MODEL: Point directly to a model file. +- XDG_DATA_HOME/HOME: Used to resolve default model path for release builds. +- CI/GITHUB_ACTIONS: When set, PolyScribe assumes non-TTY in some paths and may avoid prompts. +- Test-only toggles (used by our tests; not recommended in production): + - POLYSCRIBE_TEST_FORCE_CUDA=1 + - POLYSCRIBE_TEST_FORCE_HIP=1 + - POLYSCRIBE_TEST_FORCE_VULKAN=1 + +Notes +- GPU selection depends on both build features and runtime detection. Build with the corresponding cargo features (see development.md) for CUDA/HIP/Vulkan support. +- English-only models cannot be used with non-English language hints. 
diff --git a/examples/download_models_interactive.sh b/examples/download_models_interactive.sh new file mode 100644 index 0000000..42f9145 --- /dev/null +++ b/examples/download_models_interactive.sh @@ -0,0 +1,11 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Launch the interactive model downloader and select models to install + +BIN=${BIN:-./target/release/polyscribe} +MODELS_DIR=${POLYSCRIBE_MODELS_DIR:-$PWD/models} +export POLYSCRIBE_MODELS_DIR="$MODELS_DIR" + +mkdir -p "$MODELS_DIR" +"$BIN" --download-models diff --git a/examples/transcribe_file.sh b/examples/transcribe_file.sh new file mode 100644 index 0000000..844daf6 --- /dev/null +++ b/examples/transcribe_file.sh @@ -0,0 +1,13 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Transcribe an audio/video file to JSON and SRT into ./output +# Requires a model; first run may prompt to download. + +BIN=${BIN:-./target/release/polyscribe} +INPUT=${1:-samples/example.mp3} +OUTDIR=${OUTDIR:-output} + +mkdir -p "$OUTDIR" +"$BIN" -v -o "$OUTDIR" "$INPUT" +echo "Done. See $OUTDIR for JSON/SRT files." diff --git a/examples/update_models.sh b/examples/update_models.sh new file mode 100644 index 0000000..9b8de0a --- /dev/null +++ b/examples/update_models.sh @@ -0,0 +1,13 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Verify/update local models non-interactively (useful in CI) + +BIN=${BIN:-./target/release/polyscribe} +MODELS_DIR=${POLYSCRIBE_MODELS_DIR:-$PWD/models} +export POLYSCRIBE_MODELS_DIR="$MODELS_DIR" + +mkdir -p "$MODELS_DIR" +"$BIN" --update-models --no-interaction -q + +echo "Models updated in $MODELS_DIR" diff --git a/src/backend.rs b/src/backend.rs index c985a16..484b483 100644 --- a/src/backend.rs +++ b/src/backend.rs @@ -1,3 +1,4 @@ +//! Transcription backend selection and implementations (CPU/GPU) used by PolyScribe. 
use crate::OutputEntry; use crate::{decode_audio_to_pcm_f32_ffmpeg, find_model_file}; use anyhow::{Context, Result, anyhow}; diff --git a/src/models.rs b/src/models.rs index b81416c..5985eb7 100644 --- a/src/models.rs +++ b/src/models.rs @@ -1,3 +1,4 @@ +//! Model discovery, selection, and downloading logic for PolyScribe. use std::collections::BTreeMap; use std::env; use std::fs::{File, create_dir_all};