Files
owlen/crates/llm/core/src/retry.rs
vikingowl 10c8e2baae feat(v2): complete multi-LLM providers, TUI redesign, and advanced agent features
Multi-LLM Provider Support:
- Add llm-core crate with LlmProvider trait abstraction
- Implement Anthropic Claude API client with streaming
- Implement OpenAI API client with streaming
- Add token counting with SimpleTokenCounter and ClaudeTokenCounter
- Add retry logic with exponential backoff and jitter

Borderless TUI Redesign:
- Rewrite theme system with terminal capability detection (Full/Unicode256/Basic)
- Add provider tabs component with keybind switching [1]/[2]/[3]
- Implement vim-modal input (Normal/Insert/Visual/Command modes)
- Redesign chat panel with timestamps and streaming indicators
- Add multi-provider status bar with cost tracking
- Add Nerd Font icons with graceful ASCII fallbacks
- Add syntax highlighting (syntect) and markdown rendering (pulldown-cmark)

Advanced Agent Features:
- Add system prompt builder with configurable components
- Enhance subagent orchestration with parallel execution
- Add git integration module for safe command detection
- Add streaming tool results via channels
- Expand tool set: AskUserQuestion, TodoWrite, LS, MultiEdit, BashOutput, KillShell
- Add WebSearch with provider abstraction

Plugin System Enhancement:
- Add full agent definition parsing from YAML frontmatter
- Add skill system with progressive disclosure
- Wire plugin hooks into HookManager

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-02 17:24:14 +01:00

387 lines
12 KiB
Rust

//! Error recovery and retry logic for LLM operations
//!
//! This module provides configurable retry strategies with exponential backoff
//! for handling transient failures when communicating with LLM providers.
use crate::LlmError;
use rand::Rng;
use std::time::Duration;
/// Configuration for retry behavior
///
/// Controls how many times a failed operation is retried and how the delay
/// between attempts grows: exponential backoff starting at
/// `initial_delay_ms`, scaled by `backoff_multiplier` each attempt, and
/// capped at `max_delay_ms`.
//
// All fields are small `Copy` scalars, so the config itself derives `Copy`;
// `PartialEq` makes configurations directly comparable in tests.
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct RetryConfig {
    /// Maximum number of retry attempts (0 disables retries entirely)
    pub max_retries: u32,
    /// Initial delay before first retry (in milliseconds)
    pub initial_delay_ms: u64,
    /// Maximum delay between retries (in milliseconds)
    pub max_delay_ms: u64,
    /// Multiplier for exponential backoff
    pub backoff_multiplier: f32,
}
impl Default for RetryConfig {
fn default() -> Self {
Self {
max_retries: 3,
initial_delay_ms: 1000,
max_delay_ms: 30000,
backoff_multiplier: 2.0,
}
}
}
impl RetryConfig {
/// Create a new retry configuration with custom values
pub fn new(
max_retries: u32,
initial_delay_ms: u64,
max_delay_ms: u64,
backoff_multiplier: f32,
) -> Self {
Self {
max_retries,
initial_delay_ms,
max_delay_ms,
backoff_multiplier,
}
}
/// Create a configuration with no retries
pub fn no_retry() -> Self {
Self {
max_retries: 0,
initial_delay_ms: 0,
max_delay_ms: 0,
backoff_multiplier: 1.0,
}
}
/// Create a configuration with aggressive retries for rate-limited scenarios
pub fn aggressive() -> Self {
Self {
max_retries: 5,
initial_delay_ms: 2000,
max_delay_ms: 60000,
backoff_multiplier: 2.5,
}
}
}
/// Determines whether an error is retryable
///
/// # Arguments
/// * `error` - The error to check
///
/// # Returns
/// `true` if the error is transient and the operation should be retried,
/// `false` if the error is permanent and retrying won't help
pub fn is_retryable_error(error: &LlmError) -> bool {
    match error {
        // Rate limits and timeouts are transient by nature — always retry.
        LlmError::RateLimit { .. } | LlmError::Timeout(_) => true,
        // HTTP failures are retried only when the message looks like a
        // server-side (5xx) problem.
        LlmError::Http(msg) => [
            "500",
            "502",
            "503",
            "504",
            "Internal Server Error",
            "Bad Gateway",
            "Service Unavailable",
            "Gateway Timeout",
        ]
        .iter()
        .any(|needle| msg.contains(needle)),
        // Everything else is treated as permanent:
        // - Auth: needs user intervention (fresh credentials).
        // - Json: the payload is malformed; retrying yields the same bytes.
        // - Api: typically a client-side request problem.
        // - Provider: might be transient, but we stay conservative.
        // - Stream: stream errors are typically not retryable.
        LlmError::Auth(_)
        | LlmError::Json(_)
        | LlmError::Api { .. }
        | LlmError::Provider(_)
        | LlmError::Stream(_) => false,
    }
}
/// Strategy for retrying failed operations with exponential backoff
///
/// Wraps a [`RetryConfig`] and drives the retry loop; see
/// [`RetryStrategy::execute`] for the entry point.
#[derive(Debug, Clone)]
pub struct RetryStrategy {
    // Retry parameters governing attempt count and backoff timing.
    config: RetryConfig,
}
impl RetryStrategy {
    /// Create a new retry strategy with the given configuration
    pub fn new(config: RetryConfig) -> Self {
        Self { config }
    }

    /// Create a retry strategy with default configuration
    pub fn default_config() -> Self {
        Self::new(RetryConfig::default())
    }

    /// Execute an async operation with retries
    ///
    /// Only errors classified as transient by [`is_retryable_error`] are
    /// retried; permanent errors are returned immediately. Between attempts
    /// the strategy sleeps with exponential backoff plus jitter.
    ///
    /// # Arguments
    /// * `operation` - A function that returns a Future producing a Result
    ///
    /// # Returns
    /// The result of the operation, or the last error if all retries fail
    ///
    /// # Example
    /// ```ignore
    /// let strategy = RetryStrategy::default_config();
    /// let result = strategy.execute(|| async {
    ///     // Your LLM API call here
    ///     llm_client.chat(&messages, &options, None).await
    /// }).await?;
    /// ```
    pub async fn execute<F, T, Fut>(&self, operation: F) -> Result<T, LlmError>
    where
        F: Fn() -> Fut,
        Fut: std::future::Future<Output = Result<T, LlmError>>,
    {
        let mut attempt = 0;
        loop {
            // Try the operation
            match operation().await {
                Ok(result) => return Ok(result),
                Err(err) => {
                    // Permanent errors are surfaced immediately — retrying
                    // cannot help.
                    if !is_retryable_error(&err) {
                        return Err(err);
                    }
                    attempt += 1;
                    // Give up once the retry budget is exhausted.
                    if attempt > self.config.max_retries {
                        return Err(err);
                    }
                    // Calculate delay with exponential backoff and jitter
                    let delay = self.delay_for_attempt(attempt);
                    // NOTE(review): plain stderr logging; consider switching
                    // to `tracing` if the crate adopts structured logging.
                    eprintln!(
                        "Retry attempt {}/{} after {:?}",
                        attempt, self.config.max_retries, delay
                    );
                    // Sleep before next attempt
                    tokio::time::sleep(delay).await;
                }
            }
        }
    }

    /// Calculate the delay for a given attempt number with jitter
    ///
    /// Uses exponential backoff: delay = initial_delay * (backoff_multiplier ^ (attempt - 1))
    /// Adds random jitter of ±10% to prevent thundering herd problems
    ///
    /// # Arguments
    /// * `attempt` - The attempt number (1-indexed); an out-of-contract
    ///   `attempt == 0` is clamped to the first attempt instead of
    ///   underflowing
    ///
    /// # Returns
    /// The delay duration to wait before the next retry
    fn delay_for_attempt(&self, attempt: u32) -> Duration {
        // `saturating_sub` guards against `attempt == 0`, which with plain
        // `attempt - 1` would underflow and panic in debug builds.
        let exponent = attempt.saturating_sub(1) as i32;
        // Widen the f32 multiplier to f64 *before* exponentiation so
        // repeated attempts don't accumulate single-precision error.
        let base_delay_ms =
            self.config.initial_delay_ms as f64 * f64::from(self.config.backoff_multiplier).powi(exponent);
        // Cap at max_delay_ms
        let capped_delay_ms = base_delay_ms.min(self.config.max_delay_ms as f64);
        // Add jitter: ±10%
        let mut rng = rand::thread_rng();
        let jitter_factor = rng.gen_range(0.9..=1.1);
        let final_delay_ms = capped_delay_ms * jitter_factor;
        Duration::from_millis(final_delay_ms as u64)
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::sync::atomic::{AtomicU32, Ordering};
    use std::sync::Arc;

    // Defaults must stay in sync with RetryConfig::default().
    #[test]
    fn test_default_retry_config() {
        let config = RetryConfig::default();
        assert_eq!(config.max_retries, 3);
        assert_eq!(config.initial_delay_ms, 1000);
        assert_eq!(config.max_delay_ms, 30000);
        assert_eq!(config.backoff_multiplier, 2.0);
    }

    // no_retry() must yield a budget of zero retries.
    #[test]
    fn test_no_retry_config() {
        let config = RetryConfig::no_retry();
        assert_eq!(config.max_retries, 0);
    }

    // Spot-checks the transient/permanent classification for each error kind.
    #[test]
    fn test_is_retryable_error() {
        // Retryable errors
        assert!(is_retryable_error(&LlmError::RateLimit {
            retry_after_secs: Some(60)
        }));
        assert!(is_retryable_error(&LlmError::Timeout(
            "Request timed out".to_string()
        )));
        assert!(is_retryable_error(&LlmError::Http(
            "500 Internal Server Error".to_string()
        )));
        assert!(is_retryable_error(&LlmError::Http(
            "503 Service Unavailable".to_string()
        )));
        // Non-retryable errors
        assert!(!is_retryable_error(&LlmError::Auth(
            "Invalid API key".to_string()
        )));
        assert!(!is_retryable_error(&LlmError::Json(
            "Invalid JSON".to_string()
        )));
        assert!(!is_retryable_error(&LlmError::Api {
            message: "Invalid request".to_string(),
            code: Some("400".to_string())
        }));
        // A 4xx HTTP error is client-side and must not be retried.
        assert!(!is_retryable_error(&LlmError::Http(
            "400 Bad Request".to_string()
        )));
    }

    // Jitter is ±10%, so each window below is [0.9 * base, 1.1 * base].
    #[test]
    fn test_delay_calculation() {
        let config = RetryConfig::default();
        let strategy = RetryStrategy::new(config);
        // Test that delays increase exponentially
        let delay1 = strategy.delay_for_attempt(1);
        let delay2 = strategy.delay_for_attempt(2);
        let delay3 = strategy.delay_for_attempt(3);
        // Base delays should be around 1000ms, 2000ms, 4000ms (with jitter)
        assert!(delay1.as_millis() >= 900 && delay1.as_millis() <= 1100);
        assert!(delay2.as_millis() >= 1800 && delay2.as_millis() <= 2200);
        assert!(delay3.as_millis() >= 3600 && delay3.as_millis() <= 4400);
    }

    // Backoff must never exceed max_delay_ms (plus the +10% jitter margin).
    #[test]
    fn test_delay_max_cap() {
        let config = RetryConfig {
            max_retries: 10,
            initial_delay_ms: 1000,
            max_delay_ms: 5000,
            backoff_multiplier: 2.0,
        };
        let strategy = RetryStrategy::new(config);
        // Even with high attempt numbers, delay should be capped
        let delay = strategy.delay_for_attempt(10);
        assert!(delay.as_millis() <= 5500); // max + jitter
    }

    // A successful first call should not trigger any retries.
    #[tokio::test]
    async fn test_retry_success_on_first_attempt() {
        let strategy = RetryStrategy::default_config();
        // Shared counter records how many times the closure actually ran.
        let call_count = Arc::new(AtomicU32::new(0));
        let count_clone = call_count.clone();
        let result = strategy
            .execute(|| {
                let count = count_clone.clone();
                async move {
                    count.fetch_add(1, Ordering::SeqCst);
                    Ok::<_, LlmError>(42)
                }
            })
            .await;
        assert_eq!(result.unwrap(), 42);
        assert_eq!(call_count.load(Ordering::SeqCst), 1);
    }

    // Fails twice with a retryable error, then succeeds on the third call.
    #[tokio::test]
    async fn test_retry_success_after_retries() {
        let config = RetryConfig::new(3, 10, 100, 2.0); // Fast retries for testing
        let strategy = RetryStrategy::new(config);
        let call_count = Arc::new(AtomicU32::new(0));
        let count_clone = call_count.clone();
        let result = strategy
            .execute(|| {
                let count = count_clone.clone();
                async move {
                    let current = count.fetch_add(1, Ordering::SeqCst) + 1;
                    if current < 3 {
                        Err(LlmError::Timeout("Timeout".to_string()))
                    } else {
                        Ok(42)
                    }
                }
            })
            .await;
        assert_eq!(result.unwrap(), 42);
        assert_eq!(call_count.load(Ordering::SeqCst), 3);
    }

    // Always-failing operation: expect initial attempt + max_retries calls.
    #[tokio::test]
    async fn test_retry_exhausted() {
        let config = RetryConfig::new(2, 10, 100, 2.0); // Fast retries for testing
        let strategy = RetryStrategy::new(config);
        let call_count = Arc::new(AtomicU32::new(0));
        let count_clone = call_count.clone();
        let result = strategy
            .execute(|| {
                let count = count_clone.clone();
                async move {
                    count.fetch_add(1, Ordering::SeqCst);
                    Err::<(), _>(LlmError::Timeout("Always fails".to_string()))
                }
            })
            .await;
        assert!(result.is_err());
        assert_eq!(call_count.load(Ordering::SeqCst), 3); // Initial attempt + 2 retries
    }

    // Permanent errors (Auth) must short-circuit without any retry.
    #[tokio::test]
    async fn test_non_retryable_error() {
        let strategy = RetryStrategy::default_config();
        let call_count = Arc::new(AtomicU32::new(0));
        let count_clone = call_count.clone();
        let result = strategy
            .execute(|| {
                let count = count_clone.clone();
                async move {
                    count.fetch_add(1, Ordering::SeqCst);
                    Err::<(), _>(LlmError::Auth("Invalid API key".to_string()))
                }
            })
            .await;
        assert!(result.is_err());
        assert_eq!(call_count.load(Ordering::SeqCst), 1); // Should not retry
    }
}