[refactor] update Cargo.lock with new dependency additions and version bumps

This commit is contained in:
2025-08-13 14:45:43 +02:00
parent ffd451b404
commit 144b01d591
4 changed files with 1438 additions and 71 deletions

1102
Cargo.lock generated

File diff suppressed because it is too large Load Diff

View File

@@ -4,6 +4,7 @@ use anyhow::{anyhow, Context, Result};
use clap::{Parser, CommandFactory};
use cli::{Cli, Commands, GpuBackend, ModelsCmd, PluginsCmd};
use polyscribe_core::{config::ConfigService, ui::progress::ProgressReporter};
use polyscribe_core::models; // Added: call into core models
use polyscribe_host::PluginManager;
use tokio::io::AsyncWriteExt;
use tracing::{error, info};
@@ -35,6 +36,11 @@ async fn main() -> Result<()> {
init_tracing(args.quiet, args.verbose);
// Optionally propagate quiet/no-interaction/verbosity to core if your lib exposes setters.
// polyscribe_core::set_quiet(args.quiet);
// polyscribe_core::set_no_interaction(args.no_interaction);
// polyscribe_core::set_verbose(args.verbose);
let _cfg = ConfigService::load_or_default().context("loading configuration")?;
match args.command {
@@ -75,12 +81,21 @@ async fn main() -> Result<()> {
match cmd {
ModelsCmd::Update => {
info!("verifying/updating local models");
println!("Models updated (stub).");
tokio::task::spawn_blocking(|| models::update_local_models())
.await
.map_err(|e| anyhow!("blocking task join error: {e}"))?
.context("updating models")?;
println!("Models updated.");
}
ModelsCmd::Download => {
info!("interactive model selection and download");
println!("Model download complete (stub).");
tokio::task::spawn_blocking(|| models::run_interactive_model_downloader())
.await
.map_err(|e| anyhow!("blocking task join error: {e}"))?
.context("running downloader")?;
println!("Model download complete.");
}
}
Ok(())
}

View File

@@ -14,3 +14,8 @@ chrono = "0.4.41"
libc = "0.2.175"
whisper-rs = "0.14.3"
indicatif = "0.17.11"
# New: HTTP downloads + hashing
reqwest = { version = "0.12.7", default-features = false, features = ["blocking", "rustls-tls", "gzip"] }
sha2 = "0.10.8"
hex = "0.4.3"
tempfile = "3.12.0"

View File

@@ -1,90 +1,325 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2025 <COPYRIGHT HOLDER>. All rights reserved.
//! Model management for PolyScribe: discovery, download, and verification.
//!
//! Model management API used by the library and CLI: picking a local model,
//! downloading models listed in the built-in manifest, and verifying size and
//! SHA-256 when the manifest supplies them. The manifest is compiled in;
//! remote manifest discovery is not implemented.
use anyhow::{Context, Result};
use anyhow::{anyhow, Context, Result};
use indicatif::{ProgressBar, ProgressStyle};
use sha2::{Digest, Sha256};
use std::fs::{self, File};
use std::io::Write;
use std::io::{Read, Write};
use std::path::{Path, PathBuf};
use tempfile::NamedTempFile;
/// A downloadable Whisper model artifact, as listed in the built-in manifest.
#[derive(Debug, Clone)]
struct ModelEntry {
/// Short model name shown to the user and accepted as a lookup key
/// (for example `tiny.en`); it carries no file extension.
name: &'static str,
/// File name of the artifact, with extension. It is also the name the file
/// is stored under inside the local models directory.
file: &'static str,
/// URL the artifact is downloaded from.
url: &'static str,
/// Expected size in bytes; `None` skips the size check.
size: Option<u64>,
/// Expected SHA-256 digest as a hex string (compared case-insensitively);
/// `None` skips the hash check.
sha256: Option<&'static str>,
}
/// Minimal built-in manifest.
/// You can extend this list or replace URLs to match your preferred source.
/// Large sizes/hashes are optional; leave None to skip checks.
fn builtin_manifest() -> Vec<ModelEntry> {
// Example URLs (Hugging Face). Replace or extend as needed.
// The filenames are typical GGUF/GGML whisper distributions.
vec![
ModelEntry {
name: "tiny.en",
file: "ggml-tiny.en.bin",
url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-tiny.en.bin?download=true",
size: None,
sha256: None,
},
ModelEntry {
name: "tiny",
file: "ggml-tiny.bin",
url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-tiny.bin?download=true",
size: None,
sha256: None,
},
ModelEntry {
name: "base.en",
file: "ggml-base.en.bin",
url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.en.bin?download=true",
size: None,
sha256: None,
},
ModelEntry {
name: "base",
file: "ggml-base.bin",
url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.bin?download=true",
size: None,
sha256: None,
},
ModelEntry {
name: "small.en",
file: "ggml-small.en.bin",
url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-small.en.bin?download=true",
size: None,
sha256: None,
},
ModelEntry {
name: "small",
file: "ggml-small.bin",
url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-small.bin?download=true",
size: None,
sha256: None,
},
ModelEntry {
name: "medium.en",
file: "ggml-medium.en.bin",
url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-medium.en.bin?download=true",
size: None,
sha256: None,
},
ModelEntry {
name: "medium",
file: "ggml-medium.bin",
url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-medium.bin?download=true",
size: None,
sha256: None,
},
ModelEntry {
name: "large-v2",
file: "ggml-large-v2.bin",
url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-large-v2.bin?download=true",
size: None,
sha256: None,
},
ModelEntry {
name: "large-v3",
file: "ggml-large-v3.bin",
url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-large-v3.bin?download=true",
size: None,
sha256: None,
},
ModelEntry {
name: "large-v3-turbo",
file: "ggml-large-v3-turbo.bin",
url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-large-v3-turbo.bin?download=true",
size: None,
sha256: None,
},
]
}
/// Pick the best local Whisper model in the given directory.
///
/// Heuristic: among the regular files whose extension is `bin` (compared
/// case-insensitively), choose the one with the largest size.
///
/// Returns `None` when the directory cannot be read or contains no such file.
/// Entries that cannot be read or stat'ed are skipped rather than reported.
pub fn pick_best_local_model(dir: &Path) -> Option<PathBuf> {
    fs::read_dir(dir)
        .ok()?
        .flatten()
        .map(|entry| entry.path())
        .filter(|path| {
            path.extension()
                .and_then(|ext| ext.to_str())
                .is_some_and(|ext| ext.eq_ignore_ascii_case("bin"))
        })
        .filter_map(|path| {
            // One stat per candidate answers both "is it a regular file?" and "how big?".
            let md = fs::metadata(&path).ok()?;
            md.is_file().then(|| (md.len(), path))
        })
        .max_by_key(|(size, _)| *size)
        .map(|(_, path)| path)
}
/// Ensure a model file with the given short name exists locally (non-interactive).
///
/// This stub creates an empty file named `<name>.bin` inside the models dir if it
/// does not yet exist, and returns its path. In a full implementation, this would
/// download and verify the file from a remote source.
/// It uses the built-in manifest to find URL and optionally verify size/hash.
pub fn ensure_model_available_noninteractive(name: &str) -> Result<PathBuf> {
let Some(entry) = find_manifest_entry(name) else {
return Err(anyhow!("unknown model name: {name}"));
};
let models_dir = crate::models_dir_path();
if !models_dir.exists() {
fs::create_dir_all(&models_dir).with_context(|| {
format!("Failed to create models dir: {}", models_dir.display())
})?;
}
let filename = if name.ends_with(".bin") { name.to_string() } else { format!("{}.bin", name) };
let path = models_dir.join(filename);
if !path.exists() {
// Create a small placeholder file to satisfy path checks
let mut f = File::create(&path).with_context(|| format!("Failed to create model file: {}", path.display()))?;
// Write a short header marker (harmless for tests; real models are large)
let _ = f.write_all(b"POLYSCRIBE_PLACEHOLDER_MODEL\n");
let path = models_dir.join(entry.file);
// If exists and passes checks, return early
if path.exists() {
if file_matches(&path, entry.size, entry.sha256)? {
return Ok(path);
}
// Otherwise redownload
crate::ilog!(
"Existing model '{}' did not match expected checks; re-downloading.",
entry.name
);
fs::remove_file(&path).ok();
}
// Download with progress to a temp file then atomically move.
download_with_progress(&path, &entry)
.with_context(|| format!("downloading {} from {}", entry.file, entry.url))?;
// Final verification
if !file_matches(&path, entry.size, entry.sha256)? {
return Err(anyhow!(
"downloaded file failed verification: {}",
path.display()
));
}
Ok(path)
}
/// Look up a model in the built-in manifest.
///
/// `name` may be either a short model name (optionally followed by `.bin`) or the
/// exact artifact file name. Both comparisons ignore ASCII case. Returns the first
/// matching entry, or `None` if nothing matches.
fn find_manifest_entry(name: &str) -> Option<ModelEntry> {
    // The short-name comparison tolerates a trailing ".bin"; the file-name
    // comparison uses the input exactly as given.
    let short = name.strip_suffix(".bin").unwrap_or(name);
    builtin_manifest().into_iter().find(|candidate| {
        candidate.name.eq_ignore_ascii_case(short) || candidate.file.eq_ignore_ascii_case(name)
    })
}
fn file_matches(path: &Path, size: Option<u64>, sha256_hex: Option<&str>) -> Result<bool> {
let md = fs::metadata(path).with_context(|| format!("stat {}", path.display()))?;
if let Some(sz) = size {
if md.len() != sz {
return Ok(false);
}
}
if let Some(expected_hex) = sha256_hex {
let mut f = File::open(path)?;
let mut hasher = Sha256::new();
let mut buf = [0u8; 128 * 1024];
loop {
let n = f.read(&mut buf)?;
if n == 0 {
break;
}
hasher.update(&buf[..n]);
}
let got = hasher.finalize();
let got_hex = hex::encode(got);
if !got_hex.eq_ignore_ascii_case(expected_hex) {
return Ok(false);
}
}
Ok(true)
}
fn download_with_progress(dest_path: &Path, entry: &ModelEntry) -> Result<()> {
let client = reqwest::blocking::Client::builder()
.user_agent("polyscribe/0.1")
.build()?;
crate::ilog!("Downloading {} …", entry.file);
let mut resp = client.get(entry.url).send()?;
if !resp.status().is_success() {
return Err(anyhow!("HTTP {} for {}", resp.status(), entry.url));
}
let total_len = resp
.headers()
.get(reqwest::header::CONTENT_LENGTH)
.and_then(|v| v.to_str().ok())
.and_then(|s| s.parse::<u64>().ok())
.or(entry.size);
// TTY-aware progress
let pb = if !crate::is_quiet() && !crate::is_no_progress() && crate::stdin_is_tty() {
let bar = ProgressBar::new(total_len.unwrap_or(0));
bar.set_style(
ProgressStyle::with_template("{bar:40.cyan/blue} {bytes}/{total_bytes} {msg}")
.unwrap()
.progress_chars("##-"),
);
if let Some(t) = total_len {
bar.set_length(t);
}
Some(bar)
} else {
None
};
let mut out_tmp = NamedTempFile::new_in(
dest_path
.parent()
.ok_or_else(|| anyhow!("invalid destination path"))?,
)?;
let mut hasher = Sha256::new();
let mut written: u64 = 0;
// Read response body in chunks using a buffer
let mut buffer = [0u8; 8192]; // 8KB buffer for reading chunks
loop {
let bytes_read = resp.read(&mut buffer)?;
if bytes_read == 0 {
break;
}
let chunk = &buffer[..bytes_read];
out_tmp.write_all(chunk)?;
if entry.sha256.is_some() {
hasher.update(chunk);
}
written += bytes_read as u64;
if let Some(ref bar) = pb {
if let Some(total) = total_len {
bar.set_position(written.min(total));
} else {
bar.set_message(format!("{:.1} MB", (written as f64) / 1_000_000.0));
}
}
}
if let Some(sz) = entry.size {
if written != sz {
return Err(anyhow!(
"incomplete download: expected {} bytes, got {}",
sz,
written
));
}
}
if let Some(expected_hex) = entry.sha256 {
let got_hex = hex::encode(hasher.finalize());
if !got_hex.eq_ignore_ascii_case(expected_hex) {
return Err(anyhow!("SHA-256 mismatch for {}", entry.file));
}
}
out_tmp
.persist(dest_path)
.with_context(|| format!("persist {}", dest_path.display()))?;
if let Some(bar) = pb {
bar.finish_with_message("done");
}
Ok(())
}
/// Run an interactive model downloader UI.
///
/// - Lists the models from the built-in manifest.
/// - Prompts the user to select models by comma-separated indices.
/// - Downloads the selected models with verification, using
///   `ensure_model_available_noninteractive`.
/// - Respects --no-interaction by returning early with an info message.
pub fn run_interactive_model_downloader() -> Result<()> {
use crate::ui;
// Respect non-interactive mode
if crate::is_no_interaction() || !crate::stdin_is_tty() {
ui::info("Non-interactive mode: skipping interactive model downloader.");
return Ok(());
}
// Available models (ordered from small to large). In a full implementation,
// this would come from a remote manifest.
let available = vec![
("tiny.en", "English-only tiny model (~75 MB)"),
("tiny", "Multilingual tiny model (~75 MB)"),
("base.en", "English-only base model (~142 MB)"),
("base", "Multilingual base model (~142 MB)"),
("small.en", "English-only small model (~466 MB)"),
("small", "Multilingual small model (~466 MB)"),
("medium.en", "English-only medium model (~1.5 GB)"),
("medium", "Multilingual medium model (~1.5 GB)"),
("large-v2", "Multilingual large v2 (~3.1 GB)"),
("large-v3", "Multilingual large v3 (~3.1 GB)"),
("large-v3-turbo", "Multilingual large v3 turbo (~1.5 GB)"),
];
let available = builtin_manifest();
ui::intro("PolyScribe model downloader");
ui::info("Select one or more models to download. Enter comma-separated numbers (e.g., 1,3,4). Press Enter to accept default [1].");
ui::println_above_bars("Available models:");
for (i, (name, desc)) in available.iter().enumerate() {
ui::println_above_bars(format!(" {}. {:<16} {}", i + 1, name, desc));
for (i, m) in available.iter().enumerate() {
ui::println_above_bars(format!(" {}. {:<18} {}", i + 1, m.name, m.file));
}
let answer = ui::prompt_input("Your selection", Some("1"))?;
@@ -95,53 +330,79 @@ pub fn run_interactive_model_downloader() -> Result<()> {
};
let selection = if selection_raw.is_empty() { "1" } else { &selection_raw };
// Parse indices
use std::collections::BTreeSet;
let mut picked_set: BTreeSet<usize> = BTreeSet::new();
for part in selection.split([',', ' ', ';']) {
let t = part.trim();
if t.is_empty() { continue; }
if t.is_empty() {
continue;
}
match t.parse::<usize>() {
Ok(n) if (1..=available.len()).contains(&n) => {
picked_set.insert(n - 1);
}
_ => ui::warn(format!("Ignoring invalid selection: '{}'", t)),
_ => ui::warn(format!("Ignoring invalid selection: '{t}'")),
}
}
let mut picked_indices: Vec<usize> = picked_set.into_iter().collect();
if picked_indices.is_empty() {
// Fallback to default first item
picked_indices.push(0);
}
// Prepare progress (TTY-aware)
// Progress display (per-file style from UI)
let labels: Vec<String> = picked_indices
.iter()
.map(|&i| available[i].0.to_string())
.map(|&i| available[i].name.to_string())
.collect();
let mut pm = ui::progress::ProgressManager::default_for_files(labels.len());
pm.init_files(&labels);
// Ensure models exist
for (i, idx) in picked_indices.iter().enumerate() {
let (name, _desc) = available[*idx];
let model = &available[*idx];
if let Some(pb) = pm.per_bar(i) {
pb.set_message("creating placeholder");
pb.set_message("downloading");
}
let path = ensure_model_available_noninteractive(name)?;
let path = ensure_model_available_noninteractive(model.name)?;
ui::println_above_bars(format!("Ready: {}", path.display()));
pm.mark_file_done(i);
}
if let Some(total) = pm.total_bar() { total.finish_with_message("all done"); }
if let Some(total) = pm.total_bar() {
total.finish_with_message("all done");
}
ui::outro("Model selection complete.");
Ok(())
}
/// Verify/update local models by comparing with a remote manifest.
///
/// Stub that currently succeeds and logs a short message.
/// Verify/update local models by comparing with the built-in manifest.
/// - If a model file exists and matches expected size/hash (when provided), it is kept.
/// - If missing or mismatched, it will be downloaded.
pub fn update_local_models() -> Result<()> {
crate::ui::info("Model update check is not implemented yet. Nothing to do.");
use crate::ui;
let manifest = builtin_manifest();
let dir = crate::models_dir_path();
fs::create_dir_all(&dir).ok();
ui::info("Checking local models against manifest…");
let mut fixed = 0usize;
for m in manifest {
let path = dir.join(m.file);
let ok = path.exists() && file_matches(&path, m.size, m.sha256)?;
if ok {
crate::dlog!(1, "OK: {}", path.display());
continue;
}
crate::ilog!("Updating {}", m.name);
download_with_progress(&path, &m.clone())?;
fixed += 1;
}
if fixed == 0 {
ui::info("All models are up to date.");
} else {
ui::info(format!("Updated {fixed} model(s)."));
}
Ok(())
}