Multi-LLM Provider Support: - Add llm-core crate with LlmProvider trait abstraction - Implement Anthropic Claude API client with streaming - Implement OpenAI API client with streaming - Add token counting with SimpleTokenCounter and ClaudeTokenCounter - Add retry logic with exponential backoff and jitter Borderless TUI Redesign: - Rewrite theme system with terminal capability detection (Full/Unicode256/Basic) - Add provider tabs component with keybind switching [1]/[2]/[3] - Implement vim-modal input (Normal/Insert/Visual/Command modes) - Redesign chat panel with timestamps and streaming indicators - Add multi-provider status bar with cost tracking - Add Nerd Font icons with graceful ASCII fallbacks - Add syntax highlighting (syntect) and markdown rendering (pulldown-cmark) Advanced Agent Features: - Add system prompt builder with configurable components - Enhance subagent orchestration with parallel execution - Add git integration module for safe command detection - Add streaming tool results via channels - Expand tool set: AskUserQuestion, TodoWrite, LS, MultiEdit, BashOutput, KillShell - Add WebSearch with provider abstraction Plugin System Enhancement: - Add full agent definition parsing from YAML frontmatter - Add skill system with progressive disclosure - Wire plugin hooks into HookManager 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
387 lines
12 KiB
Rust
387 lines
12 KiB
Rust
//! Error recovery and retry logic for LLM operations
|
|
//!
|
|
//! This module provides configurable retry strategies with exponential backoff
|
|
//! for handling transient failures when communicating with LLM providers.
|
|
|
|
use crate::LlmError;
|
|
use rand::Rng;
|
|
use std::time::Duration;
|
|
|
|
/// Configuration for retry behavior
///
/// Controls how many times a failed operation is re-attempted and how the
/// delay between attempts grows. Consumed by `RetryStrategy`.
#[derive(Debug, Clone)]
pub struct RetryConfig {
    /// Maximum number of retry attempts
    /// (the operation runs at most `max_retries + 1` times in total)
    pub max_retries: u32,
    /// Initial delay before first retry (in milliseconds)
    pub initial_delay_ms: u64,
    /// Maximum delay between retries (in milliseconds); caps the
    /// exponential growth so waits stay bounded
    pub max_delay_ms: u64,
    /// Multiplier for exponential backoff — each successive delay is the
    /// previous one scaled by this factor
    pub backoff_multiplier: f32,
}
|
|
|
|
impl Default for RetryConfig {
|
|
fn default() -> Self {
|
|
Self {
|
|
max_retries: 3,
|
|
initial_delay_ms: 1000,
|
|
max_delay_ms: 30000,
|
|
backoff_multiplier: 2.0,
|
|
}
|
|
}
|
|
}
|
|
|
|
impl RetryConfig {
|
|
/// Create a new retry configuration with custom values
|
|
pub fn new(
|
|
max_retries: u32,
|
|
initial_delay_ms: u64,
|
|
max_delay_ms: u64,
|
|
backoff_multiplier: f32,
|
|
) -> Self {
|
|
Self {
|
|
max_retries,
|
|
initial_delay_ms,
|
|
max_delay_ms,
|
|
backoff_multiplier,
|
|
}
|
|
}
|
|
|
|
/// Create a configuration with no retries
|
|
pub fn no_retry() -> Self {
|
|
Self {
|
|
max_retries: 0,
|
|
initial_delay_ms: 0,
|
|
max_delay_ms: 0,
|
|
backoff_multiplier: 1.0,
|
|
}
|
|
}
|
|
|
|
/// Create a configuration with aggressive retries for rate-limited scenarios
|
|
pub fn aggressive() -> Self {
|
|
Self {
|
|
max_retries: 5,
|
|
initial_delay_ms: 2000,
|
|
max_delay_ms: 60000,
|
|
backoff_multiplier: 2.5,
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Determines whether an error is retryable
|
|
///
|
|
/// # Arguments
|
|
/// * `error` - The error to check
|
|
///
|
|
/// # Returns
|
|
/// `true` if the error is transient and the operation should be retried,
|
|
/// `false` if the error is permanent and retrying won't help
|
|
pub fn is_retryable_error(error: &LlmError) -> bool {
|
|
match error {
|
|
// Always retry rate limits
|
|
LlmError::RateLimit { .. } => true,
|
|
|
|
// Always retry timeouts
|
|
LlmError::Timeout(_) => true,
|
|
|
|
// Retry HTTP errors that are server-side (5xx)
|
|
LlmError::Http(msg) => {
|
|
// Check if the error message contains a 5xx status code
|
|
msg.contains("500")
|
|
|| msg.contains("502")
|
|
|| msg.contains("503")
|
|
|| msg.contains("504")
|
|
|| msg.contains("Internal Server Error")
|
|
|| msg.contains("Bad Gateway")
|
|
|| msg.contains("Service Unavailable")
|
|
|| msg.contains("Gateway Timeout")
|
|
}
|
|
|
|
// Don't retry authentication errors - they need user intervention
|
|
LlmError::Auth(_) => false,
|
|
|
|
// Don't retry JSON parsing errors - the data is malformed
|
|
LlmError::Json(_) => false,
|
|
|
|
// Don't retry API errors - these are typically client-side issues
|
|
LlmError::Api { .. } => false,
|
|
|
|
// Provider errors might be transient, but we conservatively don't retry
|
|
LlmError::Provider(_) => false,
|
|
|
|
// Stream errors are typically not retryable
|
|
LlmError::Stream(_) => false,
|
|
}
|
|
}
|
|
|
|
/// Strategy for retrying failed operations with exponential backoff
///
/// Wraps a [`RetryConfig`] and drives the retry loop; see `execute` for
/// the entry point.
#[derive(Debug, Clone)]
pub struct RetryStrategy {
    // Backoff parameters governing the retry loop.
    config: RetryConfig,
}
|
|
|
|
impl RetryStrategy {
|
|
/// Create a new retry strategy with the given configuration
|
|
pub fn new(config: RetryConfig) -> Self {
|
|
Self { config }
|
|
}
|
|
|
|
/// Create a retry strategy with default configuration
|
|
pub fn default_config() -> Self {
|
|
Self::new(RetryConfig::default())
|
|
}
|
|
|
|
/// Execute an async operation with retries
|
|
///
|
|
/// # Arguments
|
|
/// * `operation` - A function that returns a Future producing a Result
|
|
///
|
|
/// # Returns
|
|
/// The result of the operation, or the last error if all retries fail
|
|
///
|
|
/// # Example
|
|
/// ```ignore
|
|
/// let strategy = RetryStrategy::default_config();
|
|
/// let result = strategy.execute(|| async {
|
|
/// // Your LLM API call here
|
|
/// llm_client.chat(&messages, &options, None).await
|
|
/// }).await?;
|
|
/// ```
|
|
pub async fn execute<F, T, Fut>(&self, operation: F) -> Result<T, LlmError>
|
|
where
|
|
F: Fn() -> Fut,
|
|
Fut: std::future::Future<Output = Result<T, LlmError>>,
|
|
{
|
|
let mut attempt = 0;
|
|
|
|
loop {
|
|
// Try the operation
|
|
match operation().await {
|
|
Ok(result) => return Ok(result),
|
|
Err(err) => {
|
|
// Check if we should retry
|
|
if !is_retryable_error(&err) {
|
|
return Err(err);
|
|
}
|
|
|
|
attempt += 1;
|
|
|
|
// Check if we've exhausted retries
|
|
if attempt > self.config.max_retries {
|
|
return Err(err);
|
|
}
|
|
|
|
// Calculate delay with exponential backoff and jitter
|
|
let delay = self.delay_for_attempt(attempt);
|
|
|
|
// Log retry attempt (in a real implementation, you might use tracing)
|
|
eprintln!(
|
|
"Retry attempt {}/{} after {:?}",
|
|
attempt, self.config.max_retries, delay
|
|
);
|
|
|
|
// Sleep before next attempt
|
|
tokio::time::sleep(delay).await;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Calculate the delay for a given attempt number with jitter
|
|
///
|
|
/// Uses exponential backoff: delay = initial_delay * (backoff_multiplier ^ (attempt - 1))
|
|
/// Adds random jitter of ±10% to prevent thundering herd problems
|
|
///
|
|
/// # Arguments
|
|
/// * `attempt` - The attempt number (1-indexed)
|
|
///
|
|
/// # Returns
|
|
/// The delay duration to wait before the next retry
|
|
fn delay_for_attempt(&self, attempt: u32) -> Duration {
|
|
// Calculate base delay with exponential backoff
|
|
let base_delay_ms = self.config.initial_delay_ms as f64
|
|
* self.config.backoff_multiplier.powi((attempt - 1) as i32) as f64;
|
|
|
|
// Cap at max_delay_ms
|
|
let capped_delay_ms = base_delay_ms.min(self.config.max_delay_ms as f64);
|
|
|
|
// Add jitter: ±10%
|
|
let mut rng = rand::thread_rng();
|
|
let jitter_factor = rng.gen_range(0.9..=1.1);
|
|
let final_delay_ms = capped_delay_ms * jitter_factor;
|
|
|
|
Duration::from_millis(final_delay_ms as u64)
|
|
}
|
|
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;
    use std::sync::atomic::{AtomicU32, Ordering};
    use std::sync::Arc;

    #[test]
    fn test_default_retry_config() {
        let cfg = RetryConfig::default();
        assert_eq!(cfg.max_retries, 3);
        assert_eq!(cfg.initial_delay_ms, 1000);
        assert_eq!(cfg.max_delay_ms, 30000);
        assert_eq!(cfg.backoff_multiplier, 2.0);
    }

    #[test]
    fn test_no_retry_config() {
        assert_eq!(RetryConfig::no_retry().max_retries, 0);
    }

    #[test]
    fn test_is_retryable_error() {
        // Transient failures must be retried.
        let transient = [
            LlmError::RateLimit {
                retry_after_secs: Some(60),
            },
            LlmError::Timeout("Request timed out".to_string()),
            LlmError::Http("500 Internal Server Error".to_string()),
            LlmError::Http("503 Service Unavailable".to_string()),
        ];
        for err in &transient {
            assert!(is_retryable_error(err));
        }

        // Permanent failures must not be retried.
        let permanent = [
            LlmError::Auth("Invalid API key".to_string()),
            LlmError::Json("Invalid JSON".to_string()),
            LlmError::Api {
                message: "Invalid request".to_string(),
                code: Some("400".to_string()),
            },
            LlmError::Http("400 Bad Request".to_string()),
        ];
        for err in &permanent {
            assert!(!is_retryable_error(err));
        }
    }

    #[test]
    fn test_delay_calculation() {
        let strategy = RetryStrategy::new(RetryConfig::default());

        // Base delays double each attempt: ~1000ms, ~2000ms, ~4000ms,
        // each subject to ±10% jitter.
        for (idx, base) in [1000u128, 2000, 4000].into_iter().enumerate() {
            let measured = strategy.delay_for_attempt((idx + 1) as u32).as_millis();
            assert!(measured >= base * 9 / 10 && measured <= base * 11 / 10);
        }
    }

    #[test]
    fn test_delay_max_cap() {
        let strategy = RetryStrategy::new(RetryConfig {
            max_retries: 10,
            initial_delay_ms: 1000,
            max_delay_ms: 5000,
            backoff_multiplier: 2.0,
        });

        // Even with high attempt numbers the delay stays at max + jitter.
        assert!(strategy.delay_for_attempt(10).as_millis() <= 5500);
    }

    #[tokio::test]
    async fn test_retry_success_on_first_attempt() {
        let strategy = RetryStrategy::default_config();
        let calls = Arc::new(AtomicU32::new(0));
        let calls_ref = calls.clone();

        let outcome = strategy
            .execute(|| {
                let counter = calls_ref.clone();
                async move {
                    counter.fetch_add(1, Ordering::SeqCst);
                    Ok::<_, LlmError>(42)
                }
            })
            .await;

        assert_eq!(outcome.unwrap(), 42);
        assert_eq!(calls.load(Ordering::SeqCst), 1);
    }

    #[tokio::test]
    async fn test_retry_success_after_retries() {
        // Millisecond-scale delays keep the test fast.
        let strategy = RetryStrategy::new(RetryConfig::new(3, 10, 100, 2.0));
        let calls = Arc::new(AtomicU32::new(0));
        let calls_ref = calls.clone();

        let outcome = strategy
            .execute(|| {
                let counter = calls_ref.clone();
                async move {
                    // Fail twice with a retryable error, then succeed.
                    if counter.fetch_add(1, Ordering::SeqCst) + 1 < 3 {
                        Err(LlmError::Timeout("Timeout".to_string()))
                    } else {
                        Ok(42)
                    }
                }
            })
            .await;

        assert_eq!(outcome.unwrap(), 42);
        assert_eq!(calls.load(Ordering::SeqCst), 3);
    }

    #[tokio::test]
    async fn test_retry_exhausted() {
        // Millisecond-scale delays keep the test fast.
        let strategy = RetryStrategy::new(RetryConfig::new(2, 10, 100, 2.0));
        let calls = Arc::new(AtomicU32::new(0));
        let calls_ref = calls.clone();

        let outcome = strategy
            .execute(|| {
                let counter = calls_ref.clone();
                async move {
                    counter.fetch_add(1, Ordering::SeqCst);
                    Err::<(), _>(LlmError::Timeout("Always fails".to_string()))
                }
            })
            .await;

        assert!(outcome.is_err());
        // One initial attempt plus two retries.
        assert_eq!(calls.load(Ordering::SeqCst), 3);
    }

    #[tokio::test]
    async fn test_non_retryable_error() {
        let strategy = RetryStrategy::default_config();
        let calls = Arc::new(AtomicU32::new(0));
        let calls_ref = calls.clone();

        let outcome = strategy
            .execute(|| {
                let counter = calls_ref.clone();
                async move {
                    counter.fetch_add(1, Ordering::SeqCst);
                    Err::<(), _>(LlmError::Auth("Invalid API key".to_string()))
                }
            })
            .await;

        assert!(outcome.is_err());
        // Auth errors are permanent, so the operation runs exactly once.
        assert_eq!(calls.load(Ordering::SeqCst), 1);
    }
}
|