Release v1.9.0: Add video-comparer skill and enhance transcript-fixer
## New Skill: video-comparer v1.0.0 - Compare original and compressed videos with interactive HTML reports - Calculate quality metrics (PSNR, SSIM) for compression analysis - Generate frame-by-frame visual comparisons (slider, side-by-side, grid) - Extract video metadata (codec, resolution, bitrate, duration) - Multi-platform FFmpeg support with security features ## transcript-fixer Enhancements - Add async AI processor for parallel processing - Add connection pool management for database operations - Add concurrency manager and rate limiter - Add audit log retention and database migrations - Add health check and metrics monitoring - Add comprehensive test suite (8 new test files) - Enhance security with domain and path validators ## Marketplace Updates - Update marketplace version from 1.8.0 to 1.9.0 - Update skills count from 15 to 16 - Update documentation (README.md, CLAUDE.md, CHANGELOG.md) 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
377
transcript-fixer/scripts/utils/retry_logic.py
Normal file
377
transcript-fixer/scripts/utils/retry_logic.py
Normal file
@@ -0,0 +1,377 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Retry Logic with Exponential Backoff
|
||||
|
||||
CRITICAL FIX: Implements retry for transient failures
|
||||
ISSUE: Critical-4 in Engineering Excellence Plan
|
||||
|
||||
This module provides:
|
||||
1. Exponential backoff retry logic
|
||||
2. Error categorization (transient vs permanent)
|
||||
3. Configurable retry strategies
|
||||
4. Async retry support
|
||||
|
||||
Author: Chief Engineer
|
||||
Date: 2025-10-28
|
||||
Priority: P0 - Critical
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import time
|
||||
from typing import TypeVar, Callable, Any, Optional, Set
|
||||
from functools import wraps
|
||||
from dataclasses import dataclass
|
||||
import httpx
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
T = TypeVar('T')
|
||||
|
||||
|
||||
@dataclass
|
||||
class RetryConfig:
|
||||
"""
|
||||
Configuration for retry behavior.
|
||||
|
||||
Attributes:
|
||||
max_attempts: Maximum number of retry attempts (default: 3)
|
||||
base_delay: Initial delay between retries in seconds (default: 1.0)
|
||||
max_delay: Maximum delay between retries in seconds (default: 60.0)
|
||||
exponential_base: Multiplier for exponential backoff (default: 2.0)
|
||||
jitter: Add randomness to avoid thundering herd (default: True)
|
||||
"""
|
||||
max_attempts: int = 3
|
||||
base_delay: float = 1.0
|
||||
max_delay: float = 60.0
|
||||
exponential_base: float = 2.0
|
||||
jitter: bool = True
|
||||
|
||||
|
||||
# Transient errors that should be retried
|
||||
TRANSIENT_EXCEPTIONS: Set[type] = {
|
||||
# Network errors
|
||||
httpx.ConnectTimeout,
|
||||
httpx.ReadTimeout,
|
||||
httpx.WriteTimeout,
|
||||
httpx.PoolTimeout,
|
||||
httpx.ConnectError,
|
||||
httpx.ReadError,
|
||||
httpx.WriteError,
|
||||
|
||||
# HTTP status codes (will check separately)
|
||||
# 408 Request Timeout
|
||||
# 429 Too Many Requests
|
||||
# 500 Internal Server Error
|
||||
# 502 Bad Gateway
|
||||
# 503 Service Unavailable
|
||||
# 504 Gateway Timeout
|
||||
}
|
||||
|
||||
# Status codes that indicate transient failures
|
||||
TRANSIENT_STATUS_CODES: Set[int] = {
|
||||
408, # Request Timeout
|
||||
429, # Too Many Requests
|
||||
500, # Internal Server Error
|
||||
502, # Bad Gateway
|
||||
503, # Service Unavailable
|
||||
504, # Gateway Timeout
|
||||
}
|
||||
|
||||
# Permanent errors that should NOT be retried
|
||||
PERMANENT_EXCEPTIONS: Set[type] = {
|
||||
# Authentication/Authorization
|
||||
httpx.HTTPStatusError, # Will check status code
|
||||
|
||||
# Validation errors
|
||||
ValueError,
|
||||
KeyError,
|
||||
TypeError,
|
||||
}
|
||||
|
||||
|
||||
def is_transient_error(exception: Exception) -> bool:
|
||||
"""
|
||||
Determine if an exception represents a transient failure.
|
||||
|
||||
Transient errors:
|
||||
- Network timeouts
|
||||
- Connection errors
|
||||
- Server overload (429, 503)
|
||||
- Temporary server errors (500, 502, 504)
|
||||
|
||||
Permanent errors:
|
||||
- Authentication failures (401, 403)
|
||||
- Not found (404)
|
||||
- Validation errors (400, 422)
|
||||
|
||||
Args:
|
||||
exception: Exception to categorize
|
||||
|
||||
Returns:
|
||||
True if error is transient and should be retried
|
||||
"""
|
||||
# Check exception type
|
||||
if type(exception) in TRANSIENT_EXCEPTIONS:
|
||||
return True
|
||||
|
||||
# Check HTTP status codes
|
||||
if isinstance(exception, httpx.HTTPStatusError):
|
||||
return exception.response.status_code in TRANSIENT_STATUS_CODES
|
||||
|
||||
# Default: treat as permanent
|
||||
return False
|
||||
|
||||
|
||||
def calculate_delay(
|
||||
attempt: int,
|
||||
config: RetryConfig
|
||||
) -> float:
|
||||
"""
|
||||
Calculate delay for exponential backoff.
|
||||
|
||||
Formula: min(base_delay * (exponential_base ** attempt), max_delay)
|
||||
With optional jitter to avoid thundering herd.
|
||||
|
||||
Args:
|
||||
attempt: Current attempt number (0-indexed)
|
||||
config: Retry configuration
|
||||
|
||||
Returns:
|
||||
Delay in seconds
|
||||
|
||||
Example:
|
||||
>>> calculate_delay(0, RetryConfig(base_delay=1.0, exponential_base=2.0))
|
||||
1.0
|
||||
>>> calculate_delay(1, RetryConfig(base_delay=1.0, exponential_base=2.0))
|
||||
2.0
|
||||
>>> calculate_delay(2, RetryConfig(base_delay=1.0, exponential_base=2.0))
|
||||
4.0
|
||||
"""
|
||||
delay = config.base_delay * (config.exponential_base ** attempt)
|
||||
delay = min(delay, config.max_delay)
|
||||
|
||||
if config.jitter:
|
||||
import random
|
||||
# Add ±25% jitter
|
||||
jitter_amount = delay * 0.25
|
||||
delay = delay + random.uniform(-jitter_amount, jitter_amount)
|
||||
|
||||
return max(0, delay) # Ensure non-negative
|
||||
|
||||
|
||||
def retry_sync(
|
||||
config: Optional[RetryConfig] = None,
|
||||
on_retry: Optional[Callable[[Exception, int], None]] = None
|
||||
):
|
||||
"""
|
||||
Decorator for synchronous retry logic with exponential backoff.
|
||||
|
||||
Args:
|
||||
config: Retry configuration (uses defaults if None)
|
||||
on_retry: Optional callback called on each retry attempt
|
||||
|
||||
Example:
|
||||
>>> @retry_sync(RetryConfig(max_attempts=3))
|
||||
... def fetch_data():
|
||||
... return call_api()
|
||||
|
||||
Raises:
|
||||
Original exception if all retries exhausted
|
||||
"""
|
||||
if config is None:
|
||||
config = RetryConfig()
|
||||
|
||||
def decorator(func: Callable[..., T]) -> Callable[..., T]:
|
||||
@wraps(func)
|
||||
def wrapper(*args: Any, **kwargs: Any) -> T:
|
||||
last_exception: Optional[Exception] = None
|
||||
|
||||
for attempt in range(config.max_attempts):
|
||||
try:
|
||||
return func(*args, **kwargs)
|
||||
|
||||
except Exception as e:
|
||||
last_exception = e
|
||||
|
||||
# Check if error is transient
|
||||
if not is_transient_error(e):
|
||||
logger.error(
|
||||
f"{func.__name__} failed with permanent error: {e}"
|
||||
)
|
||||
raise
|
||||
|
||||
# Last attempt?
|
||||
if attempt >= config.max_attempts - 1:
|
||||
logger.error(
|
||||
f"{func.__name__} failed after {config.max_attempts} attempts: {e}"
|
||||
)
|
||||
raise
|
||||
|
||||
# Calculate delay
|
||||
delay = calculate_delay(attempt, config)
|
||||
|
||||
logger.warning(
|
||||
f"{func.__name__} attempt {attempt + 1}/{config.max_attempts} "
|
||||
f"failed with transient error: {e}. "
|
||||
f"Retrying in {delay:.1f}s..."
|
||||
)
|
||||
|
||||
# Call retry callback if provided
|
||||
if on_retry:
|
||||
on_retry(e, attempt)
|
||||
|
||||
# Wait before retry
|
||||
time.sleep(delay)
|
||||
|
||||
# Should never reach here, but satisfy type checker
|
||||
if last_exception:
|
||||
raise last_exception
|
||||
raise RuntimeError("Retry logic error")
|
||||
|
||||
return wrapper
|
||||
return decorator
|
||||
|
||||
|
||||
def retry_async(
|
||||
config: Optional[RetryConfig] = None,
|
||||
on_retry: Optional[Callable[[Exception, int], None]] = None
|
||||
):
|
||||
"""
|
||||
Decorator for asynchronous retry logic with exponential backoff.
|
||||
|
||||
Args:
|
||||
config: Retry configuration (uses defaults if None)
|
||||
on_retry: Optional callback called on each retry attempt
|
||||
|
||||
Example:
|
||||
>>> @retry_async(RetryConfig(max_attempts=3))
|
||||
... async def fetch_data():
|
||||
... return await call_api_async()
|
||||
|
||||
Raises:
|
||||
Original exception if all retries exhausted
|
||||
"""
|
||||
if config is None:
|
||||
config = RetryConfig()
|
||||
|
||||
def decorator(func: Callable[..., Any]) -> Callable[..., Any]:
|
||||
@wraps(func)
|
||||
async def wrapper(*args: Any, **kwargs: Any) -> Any:
|
||||
last_exception: Optional[Exception] = None
|
||||
|
||||
for attempt in range(config.max_attempts):
|
||||
try:
|
||||
return await func(*args, **kwargs)
|
||||
|
||||
except Exception as e:
|
||||
last_exception = e
|
||||
|
||||
# Check if error is transient
|
||||
if not is_transient_error(e):
|
||||
logger.error(
|
||||
f"{func.__name__} failed with permanent error: {e}"
|
||||
)
|
||||
raise
|
||||
|
||||
# Last attempt?
|
||||
if attempt >= config.max_attempts - 1:
|
||||
logger.error(
|
||||
f"{func.__name__} failed after {config.max_attempts} attempts: {e}"
|
||||
)
|
||||
raise
|
||||
|
||||
# Calculate delay
|
||||
delay = calculate_delay(attempt, config)
|
||||
|
||||
logger.warning(
|
||||
f"{func.__name__} attempt {attempt + 1}/{config.max_attempts} "
|
||||
f"failed with transient error: {e}. "
|
||||
f"Retrying in {delay:.1f}s..."
|
||||
)
|
||||
|
||||
# Call retry callback if provided
|
||||
if on_retry:
|
||||
on_retry(e, attempt)
|
||||
|
||||
# Wait before retry (async)
|
||||
await asyncio.sleep(delay)
|
||||
|
||||
# Should never reach here, but satisfy type checker
|
||||
if last_exception:
|
||||
raise last_exception
|
||||
raise RuntimeError("Retry logic error")
|
||||
|
||||
return wrapper
|
||||
return decorator
|
||||
|
||||
|
||||
# Example usage and testing
|
||||
if __name__ == "__main__":
|
||||
import logging
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
|
||||
# Test synchronous retry
|
||||
print("=== Testing Synchronous Retry ===")
|
||||
|
||||
attempt_count = 0
|
||||
|
||||
@retry_sync(RetryConfig(max_attempts=3, base_delay=0.1))
|
||||
def flaky_function():
|
||||
global attempt_count
|
||||
attempt_count += 1
|
||||
print(f"Attempt {attempt_count}")
|
||||
|
||||
if attempt_count < 3:
|
||||
raise httpx.ConnectTimeout("Connection timeout")
|
||||
return "Success!"
|
||||
|
||||
try:
|
||||
result = flaky_function()
|
||||
print(f"Result: {result}")
|
||||
except Exception as e:
|
||||
print(f"Failed: {e}")
|
||||
|
||||
# Test async retry
|
||||
print("\n=== Testing Asynchronous Retry ===")
|
||||
|
||||
async def test_async():
|
||||
attempt_count = 0
|
||||
|
||||
@retry_async(RetryConfig(max_attempts=3, base_delay=0.1))
|
||||
async def async_flaky_function():
|
||||
nonlocal attempt_count
|
||||
attempt_count += 1
|
||||
print(f"Async attempt {attempt_count}")
|
||||
|
||||
if attempt_count < 2:
|
||||
raise httpx.ReadTimeout("Read timeout")
|
||||
return "Async success!"
|
||||
|
||||
try:
|
||||
result = await async_flaky_function()
|
||||
print(f"Result: {result}")
|
||||
except Exception as e:
|
||||
print(f"Failed: {e}")
|
||||
|
||||
asyncio.run(test_async())
|
||||
|
||||
# Test permanent error (should not retry)
|
||||
print("\n=== Testing Permanent Error (No Retry) ===")
|
||||
|
||||
attempt_count = 0
|
||||
|
||||
@retry_sync(RetryConfig(max_attempts=3, base_delay=0.1))
|
||||
def permanent_error_function():
|
||||
global attempt_count
|
||||
attempt_count += 1
|
||||
print(f"Attempt {attempt_count}")
|
||||
raise ValueError("Invalid input") # Permanent error
|
||||
|
||||
try:
|
||||
result = permanent_error_function()
|
||||
except ValueError as e:
|
||||
print(f"Correctly failed immediately: {e}")
|
||||
print(f"Attempts made: {attempt_count} (should be 1)")
|
||||
Reference in New Issue
Block a user