Files
owlen/crates/llm/core/src/retry.rs
vikingowl 10c8e2baae feat(v2): complete multi-LLM providers, TUI redesign, and advanced agent features
Multi-LLM Provider Support:
- Add llm-core crate with LlmProvider trait abstraction
- Implement Anthropic Claude API client with streaming
- Implement OpenAI API client with streaming
- Add token counting with SimpleTokenCounter and ClaudeTokenCounter
- Add retry logic with exponential backoff and jitter

Borderless TUI Redesign:
- Rewrite theme system with terminal capability detection (Full/Unicode256/Basic)
- Add provider tabs component with keybind switching [1]/[2]/[3]
- Implement vim-modal input (Normal/Insert/Visual/Command modes)
- Redesign chat panel with timestamps and streaming indicators
- Add multi-provider status bar with cost tracking
- Add Nerd Font icons with graceful ASCII fallbacks
- Add syntax highlighting (syntect) and markdown rendering (pulldown-cmark)

Advanced Agent Features:
- Add system prompt builder with configurable components
- Enhance subagent orchestration with parallel execution
- Add git integration module for safe command detection
- Add streaming tool results via channels
- Expand tool set: AskUserQuestion, TodoWrite, LS, MultiEdit, BashOutput, KillShell
- Add WebSearch with provider abstraction

Plugin System Enhancement:
- Add full agent definition parsing from YAML frontmatter
- Add skill system with progressive disclosure
- Wire plugin hooks into HookManager

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-02 17:24:14 +01:00

387 lines
12 KiB
Rust

//! Error recovery and retry logic for LLM operations
//!
//! This module provides configurable retry strategies with exponential backoff
//! for handling transient failures when communicating with LLM providers.
use crate::LlmError;
use rand::Rng;
use std::time::Duration;
/// Configuration for retry behavior
///
/// Controls how many times a failed operation is retried and how the delay
/// between attempts grows: exponential backoff starting at
/// `initial_delay_ms`, scaled by `backoff_multiplier` each attempt, and
/// capped at `max_delay_ms`.
//
// All fields are small `Copy` scalars, so the config itself derives `Copy`;
// `PartialEq` makes configurations directly comparable in tests.
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct RetryConfig {
    /// Maximum number of retry attempts (0 disables retries entirely)
    pub max_retries: u32,
    /// Initial delay before first retry (in milliseconds)
    pub initial_delay_ms: u64,
    /// Maximum delay between retries (in milliseconds)
    pub max_delay_ms: u64,
    /// Multiplier for exponential backoff
    pub backoff_multiplier: f32,
}
impl Default for RetryConfig {
fn default() -> Self {
Self {
max_retries: 3,
initial_delay_ms: 1000,
max_delay_ms: 30000,
backoff_multiplier: 2.0,
}
}
}
impl RetryConfig {
/// Create a new retry configuration with custom values
pub fn new(
max_retries: u32,
initial_delay_ms: u64,
max_delay_ms: u64,
backoff_multiplier: f32,
) -> Self {
Self {
max_retries,
initial_delay_ms,
max_delay_ms,
backoff_multiplier,
}
}
/// Create a configuration with no retries
pub fn no_retry() -> Self {
Self {
max_retries: 0,
initial_delay_ms: 0,
max_delay_ms: 0,
backoff_multiplier: 1.0,
}
}
/// Create a configuration with aggressive retries for rate-limited scenarios
pub fn aggressive() -> Self {
Self {
max_retries: 5,
initial_delay_ms: 2000,
max_delay_ms: 60000,
backoff_multiplier: 2.5,
}
}
}
/// Determines whether an error is retryable
///
/// # Arguments
/// * `error` - The error to check
///
/// # Returns
/// `true` if the error is transient and the operation should be retried,
/// `false` if the error is permanent and retrying won't help
pub fn is_retryable_error(error: &LlmError) -> bool {
    match error {
        // Rate limits and timeouts are transient by nature — always retry.
        LlmError::RateLimit { .. } | LlmError::Timeout(_) => true,
        // HTTP failures are retried only when the message looks like a
        // server-side (5xx) problem.
        LlmError::Http(msg) => [
            "500",
            "502",
            "503",
            "504",
            "Internal Server Error",
            "Bad Gateway",
            "Service Unavailable",
            "Gateway Timeout",
        ]
        .iter()
        .any(|needle| msg.contains(needle)),
        // Everything else is treated as permanent:
        // - Auth: needs user intervention (fresh credentials).
        // - Json: the payload is malformed; retrying yields the same bytes.
        // - Api: typically a client-side request problem.
        // - Provider: might be transient, but we stay conservative.
        // - Stream: stream errors are typically not retryable.
        LlmError::Auth(_)
        | LlmError::Json(_)
        | LlmError::Api { .. }
        | LlmError::Provider(_)
        | LlmError::Stream(_) => false,
    }
}
/// Strategy for retrying failed operations with exponential backoff
///
/// Wraps a [`RetryConfig`] and drives the retry loop; see
/// [`RetryStrategy::execute`] for the entry point.
#[derive(Debug, Clone)]
pub struct RetryStrategy {
    // Retry parameters governing attempt count and backoff timing.
    config: RetryConfig,
}
impl RetryStrategy {
    /// Create a new retry strategy with the given configuration
    pub fn new(config: RetryConfig) -> Self {
        Self { config }
    }

    /// Create a retry strategy with default configuration
    pub fn default_config() -> Self {
        Self::new(RetryConfig::default())
    }

    /// Execute an async operation with retries
    ///
    /// Only errors classified as transient by [`is_retryable_error`] are
    /// retried; permanent errors are returned immediately. Between attempts
    /// the strategy sleeps with exponential backoff plus jitter.
    ///
    /// # Arguments
    /// * `operation` - A function that returns a Future producing a Result
    ///
    /// # Returns
    /// The result of the operation, or the last error if all retries fail
    ///
    /// # Example
    /// ```ignore
    /// let strategy = RetryStrategy::default_config();
    /// let result = strategy.execute(|| async {
    ///     // Your LLM API call here
    ///     llm_client.chat(&messages, &options, None).await
    /// }).await?;
    /// ```
    pub async fn execute<F, T, Fut>(&self, operation: F) -> Result<T, LlmError>
    where
        F: Fn() -> Fut,
        Fut: std::future::Future<Output = Result<T, LlmError>>,
    {
        let mut attempt = 0;
        loop {
            // Try the operation
            match operation().await {
                Ok(result) => return Ok(result),
                Err(err) => {
                    // Permanent errors are surfaced immediately — retrying
                    // cannot help.
                    if !is_retryable_error(&err) {
                        return Err(err);
                    }
                    attempt += 1;
                    // Give up once the retry budget is exhausted.
                    if attempt > self.config.max_retries {
                        return Err(err);
                    }
                    // Calculate delay with exponential backoff and jitter
                    let delay = self.delay_for_attempt(attempt);
                    // NOTE(review): plain stderr logging; consider switching
                    // to `tracing` if the crate adopts structured logging.
                    eprintln!(
                        "Retry attempt {}/{} after {:?}",
                        attempt, self.config.max_retries, delay
                    );
                    // Sleep before next attempt
                    tokio::time::sleep(delay).await;
                }
            }
        }
    }

    /// Calculate the delay for a given attempt number with jitter
    ///
    /// Uses exponential backoff: delay = initial_delay * (backoff_multiplier ^ (attempt - 1))
    /// Adds random jitter of ±10% to prevent thundering herd problems
    ///
    /// # Arguments
    /// * `attempt` - The attempt number (1-indexed); an out-of-contract
    ///   `attempt == 0` is clamped to the first attempt instead of
    ///   underflowing
    ///
    /// # Returns
    /// The delay duration to wait before the next retry
    fn delay_for_attempt(&self, attempt: u32) -> Duration {
        // `saturating_sub` guards against `attempt == 0`, which with plain
        // `attempt - 1` would underflow and panic in debug builds.
        let exponent = attempt.saturating_sub(1) as i32;
        // Widen the f32 multiplier to f64 *before* exponentiation so
        // repeated attempts don't accumulate single-precision error.
        let base_delay_ms =
            self.config.initial_delay_ms as f64 * f64::from(self.config.backoff_multiplier).powi(exponent);
        // Cap at max_delay_ms
        let capped_delay_ms = base_delay_ms.min(self.config.max_delay_ms as f64);
        // Add jitter: ±10%
        let mut rng = rand::thread_rng();
        let jitter_factor = rng.gen_range(0.9..=1.1);
        let final_delay_ms = capped_delay_ms * jitter_factor;
        Duration::from_millis(final_delay_ms as u64)
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::sync::atomic::{AtomicU32, Ordering};
    use std::sync::Arc;

    // Defaults must stay in sync with RetryConfig::default().
    #[test]
    fn test_default_retry_config() {
        let config = RetryConfig::default();
        assert_eq!(config.max_retries, 3);
        assert_eq!(config.initial_delay_ms, 1000);
        assert_eq!(config.max_delay_ms, 30000);
        assert_eq!(config.backoff_multiplier, 2.0);
    }

    // no_retry() must yield a budget of zero retries.
    #[test]
    fn test_no_retry_config() {
        let config = RetryConfig::no_retry();
        assert_eq!(config.max_retries, 0);
    }

    // Spot-checks the transient/permanent classification for each error kind.
    #[test]
    fn test_is_retryable_error() {
        // Retryable errors
        assert!(is_retryable_error(&LlmError::RateLimit {
            retry_after_secs: Some(60)
        }));
        assert!(is_retryable_error(&LlmError::Timeout(
            "Request timed out".to_string()
        )));
        assert!(is_retryable_error(&LlmError::Http(
            "500 Internal Server Error".to_string()
        )));
        assert!(is_retryable_error(&LlmError::Http(
            "503 Service Unavailable".to_string()
        )));
        // Non-retryable errors
        assert!(!is_retryable_error(&LlmError::Auth(
            "Invalid API key".to_string()
        )));
        assert!(!is_retryable_error(&LlmError::Json(
            "Invalid JSON".to_string()
        )));
        assert!(!is_retryable_error(&LlmError::Api {
            message: "Invalid request".to_string(),
            code: Some("400".to_string())
        }));
        // A 4xx HTTP error is client-side and must not be retried.
        assert!(!is_retryable_error(&LlmError::Http(
            "400 Bad Request".to_string()
        )));
    }

    // Jitter is ±10%, so each window below is [0.9 * base, 1.1 * base].
    #[test]
    fn test_delay_calculation() {
        let config = RetryConfig::default();
        let strategy = RetryStrategy::new(config);
        // Test that delays increase exponentially
        let delay1 = strategy.delay_for_attempt(1);
        let delay2 = strategy.delay_for_attempt(2);
        let delay3 = strategy.delay_for_attempt(3);
        // Base delays should be around 1000ms, 2000ms, 4000ms (with jitter)
        assert!(delay1.as_millis() >= 900 && delay1.as_millis() <= 1100);
        assert!(delay2.as_millis() >= 1800 && delay2.as_millis() <= 2200);
        assert!(delay3.as_millis() >= 3600 && delay3.as_millis() <= 4400);
    }

    // Backoff must never exceed max_delay_ms (plus the +10% jitter margin).
    #[test]
    fn test_delay_max_cap() {
        let config = RetryConfig {
            max_retries: 10,
            initial_delay_ms: 1000,
            max_delay_ms: 5000,
            backoff_multiplier: 2.0,
        };
        let strategy = RetryStrategy::new(config);
        // Even with high attempt numbers, delay should be capped
        let delay = strategy.delay_for_attempt(10);
        assert!(delay.as_millis() <= 5500); // max + jitter
    }

    // A successful first call should not trigger any retries.
    #[tokio::test]
    async fn test_retry_success_on_first_attempt() {
        let strategy = RetryStrategy::default_config();
        // Shared counter records how many times the closure actually ran.
        let call_count = Arc::new(AtomicU32::new(0));
        let count_clone = call_count.clone();
        let result = strategy
            .execute(|| {
                let count = count_clone.clone();
                async move {
                    count.fetch_add(1, Ordering::SeqCst);
                    Ok::<_, LlmError>(42)
                }
            })
            .await;
        assert_eq!(result.unwrap(), 42);
        assert_eq!(call_count.load(Ordering::SeqCst), 1);
    }

    // Fails twice with a retryable error, then succeeds on the third call.
    #[tokio::test]
    async fn test_retry_success_after_retries() {
        let config = RetryConfig::new(3, 10, 100, 2.0); // Fast retries for testing
        let strategy = RetryStrategy::new(config);
        let call_count = Arc::new(AtomicU32::new(0));
        let count_clone = call_count.clone();
        let result = strategy
            .execute(|| {
                let count = count_clone.clone();
                async move {
                    let current = count.fetch_add(1, Ordering::SeqCst) + 1;
                    if current < 3 {
                        Err(LlmError::Timeout("Timeout".to_string()))
                    } else {
                        Ok(42)
                    }
                }
            })
            .await;
        assert_eq!(result.unwrap(), 42);
        assert_eq!(call_count.load(Ordering::SeqCst), 3);
    }

    // Always-failing operation: expect initial attempt + max_retries calls.
    #[tokio::test]
    async fn test_retry_exhausted() {
        let config = RetryConfig::new(2, 10, 100, 2.0); // Fast retries for testing
        let strategy = RetryStrategy::new(config);
        let call_count = Arc::new(AtomicU32::new(0));
        let count_clone = call_count.clone();
        let result = strategy
            .execute(|| {
                let count = count_clone.clone();
                async move {
                    count.fetch_add(1, Ordering::SeqCst);
                    Err::<(), _>(LlmError::Timeout("Always fails".to_string()))
                }
            })
            .await;
        assert!(result.is_err());
        assert_eq!(call_count.load(Ordering::SeqCst), 3); // Initial attempt + 2 retries
    }

    // Permanent errors (Auth) must short-circuit without any retry.
    #[tokio::test]
    async fn test_non_retryable_error() {
        let strategy = RetryStrategy::default_config();
        let call_count = Arc::new(AtomicU32::new(0));
        let count_clone = call_count.clone();
        let result = strategy
            .execute(|| {
                let count = count_clone.clone();
                async move {
                    count.fetch_add(1, Ordering::SeqCst);
                    Err::<(), _>(LlmError::Auth("Invalid API key".to_string()))
                }
            })
            .await;
        assert!(result.is_err());
        assert_eq!(call_count.load(Ordering::SeqCst), 1); // Should not retry
    }
}