#!/usr/bin/env python3 """ Concurrency Management Module - Production-Grade Concurrent Request Handling CRITICAL FIX (P1-9): Tune concurrent request handling for optimal performance Features: - Semaphore-based request limiting - Circuit breaker pattern for fault tolerance - Backpressure handling - Request queue management - Integration with rate limiter - Concurrent operation monitoring - Adaptive concurrency tuning Use cases: - API request management - Database query concurrency - File operation limiting - Resource-intensive tasks """ from __future__ import annotations import asyncio import logging import time import threading from contextlib import asynccontextmanager, contextmanager from dataclasses import dataclass, field from datetime import datetime, timedelta from enum import Enum from typing import Optional, Dict, Any, Callable, TypeVar, Final from collections import deque logger = logging.getLogger(__name__) T = TypeVar('T') class CircuitState(Enum): """Circuit breaker states""" CLOSED = "closed" # Normal operation OPEN = "open" # Failing, rejecting requests HALF_OPEN = "half_open" # Testing if service recovered @dataclass class ConcurrencyConfig: """Configuration for concurrency management""" max_concurrent: int = 10 # Maximum concurrent operations max_queue_size: int = 100 # Maximum queued requests timeout: float = 30.0 # Operation timeout in seconds enable_backpressure: bool = True # Enable backpressure when queue full enable_circuit_breaker: bool = True # Enable circuit breaker circuit_failure_threshold: int = 5 # Failures before opening circuit circuit_recovery_timeout: float = 60.0 # Seconds before attempting recovery circuit_success_threshold: int = 2 # Successes needed to close circuit enable_adaptive_tuning: bool = False # Adjust concurrency based on performance min_concurrent: int = 2 # Minimum concurrent (for adaptive tuning) max_response_time: float = 5.0 # Target max response time (for adaptive tuning) @dataclass class ConcurrencyMetrics: """Metrics for concurrency monitoring""" total_requests: int = 0 successful_requests: int = 0 failed_requests: int = 0 rejected_requests: int = 0 # Rejected due to backpressure timeout_requests: int = 0 active_operations: int = 0 queued_operations: int = 0 avg_response_time_ms: float = 0.0 current_concurrency: int = 0 circuit_state: CircuitState = CircuitState.CLOSED circuit_failures: int = 0 last_updated: datetime = field(default_factory=datetime.now) def to_dict(self) -> Dict[str, Any]: """Convert to dictionary""" return { 'total_requests': self.total_requests, 'successful_requests': self.successful_requests, 'failed_requests': self.failed_requests, 'rejected_requests': self.rejected_requests, 'timeout_requests': self.timeout_requests, 'active_operations': self.active_operations, 'queued_operations': self.queued_operations, 'avg_response_time_ms': round(self.avg_response_time_ms, 2), 'current_concurrency': self.current_concurrency, 'circuit_state': self.circuit_state.value, 'circuit_failures': self.circuit_failures, 'success_rate': round( self.successful_requests / max(self.total_requests, 1) * 100, 2 ), 'last_updated': self.last_updated.isoformat() } class BackpressureError(Exception): """Raised when backpressure limits are exceeded""" pass class CircuitBreakerOpenError(Exception): """Raised when circuit breaker is open""" pass class ConcurrencyManager: """ Production-grade concurrency management with advanced features Features: - Semaphore-based limiting (prevents resource exhaustion) - Circuit breaker pattern (fault tolerance) - Backpressure handling (graceful degradation) - Request queue management (fairness) - Performance monitoring (observability) - Adaptive tuning (optimization) """ def __init__(self, config: ConcurrencyConfig = None): """ Initialize concurrency manager Args: config: Concurrency configuration """ self.config = config or ConcurrencyConfig() # Semaphore for concurrency limiting self._semaphore = asyncio.Semaphore(self.config.max_concurrent) self._sync_semaphore = threading.Semaphore(self.config.max_concurrent) # Queue for pending requests self._queue: deque = deque(maxlen=self.config.max_queue_size) self._queue_lock = threading.Lock() # Metrics tracking self._metrics = ConcurrencyMetrics() self._metrics.current_concurrency = self.config.max_concurrent self._metrics_lock = threading.Lock() # Response time tracking for adaptive tuning self._response_times: deque = deque(maxlen=100) # Last 100 responses self._response_times_lock = threading.Lock() # Circuit breaker state self._circuit_state = CircuitState.CLOSED self._circuit_failures = 0 self._circuit_last_failure_time: Optional[float] = None self._circuit_successes = 0 self._circuit_lock = threading.Lock() logger.info(f"ConcurrencyManager initialized: max_concurrent={self.config.max_concurrent}") def _check_circuit_breaker(self) -> None: """Check circuit breaker state and potentially transition""" if not self.config.enable_circuit_breaker: return with self._circuit_lock: if self._circuit_state == CircuitState.OPEN: # Check if recovery timeout has elapsed if self._circuit_last_failure_time: elapsed = time.time() - self._circuit_last_failure_time if elapsed >= self.config.circuit_recovery_timeout: logger.info("Circuit breaker: OPEN -> HALF_OPEN (recovery timeout elapsed)") self._circuit_state = CircuitState.HALF_OPEN self._circuit_successes = 0 else: raise CircuitBreakerOpenError( f"Circuit breaker is OPEN. Retry after " f"{self.config.circuit_recovery_timeout - elapsed:.1f}s" ) elif self._circuit_state == CircuitState.HALF_OPEN: # In half-open state, allow limited requests through pass def _record_success(self) -> None: """Record successful operation for circuit breaker""" if not self.config.enable_circuit_breaker: return with self._circuit_lock: if self._circuit_state == CircuitState.HALF_OPEN: self._circuit_successes += 1 if self._circuit_successes >= self.config.circuit_success_threshold: logger.info("Circuit breaker: HALF_OPEN -> CLOSED (recovered)") self._circuit_state = CircuitState.CLOSED self._circuit_failures = 0 self._circuit_successes = 0 def _record_failure(self) -> None: """Record failed operation for circuit breaker""" if not self.config.enable_circuit_breaker: return with self._circuit_lock: self._circuit_failures += 1 self._circuit_last_failure_time = time.time() if self._circuit_state == CircuitState.CLOSED: if self._circuit_failures >= self.config.circuit_failure_threshold: logger.warning( f"Circuit breaker: CLOSED -> OPEN " f"({self._circuit_failures} failures)" ) self._circuit_state = CircuitState.OPEN with self._metrics_lock: self._metrics.circuit_state = CircuitState.OPEN elif self._circuit_state == CircuitState.HALF_OPEN: # Failure during recovery - back to OPEN logger.warning("Circuit breaker: HALF_OPEN -> OPEN (recovery failed)") self._circuit_state = CircuitState.OPEN self._circuit_successes = 0 def _update_response_time(self, response_time_ms: float) -> None: """Update response time metrics""" with self._response_times_lock: self._response_times.append(response_time_ms) # Update average if len(self._response_times) > 0: avg = sum(self._response_times) / len(self._response_times) with self._metrics_lock: self._metrics.avg_response_time_ms = avg def _adjust_concurrency(self) -> None: """Adaptive concurrency tuning based on performance""" if not self.config.enable_adaptive_tuning: return with self._response_times_lock: if len(self._response_times) < 10: return # Not enough data avg_time = sum(self._response_times) / len(self._response_times) target_time = self.config.max_response_time * 1000 # Convert to ms current_concurrency = self.config.max_concurrent if avg_time > target_time * 1.5: # Response time too high - decrease concurrency new_concurrency = max( self.config.min_concurrent, current_concurrency - 1 ) if new_concurrency != current_concurrency: logger.info( f"Adaptive tuning: Decreasing concurrency " f"{current_concurrency} -> {new_concurrency} " f"(avg response time: {avg_time:.1f}ms)" ) self.config.max_concurrent = new_concurrency # Note: Can't easily adjust asyncio.Semaphore, # would need to recreate it elif avg_time < target_time * 0.5: # Response time low - can increase concurrency new_concurrency = min( 20, # Hard cap current_concurrency + 1 ) if new_concurrency != current_concurrency: logger.info( f"Adaptive tuning: Increasing concurrency " f"{current_concurrency} -> {new_concurrency} " f"(avg response time: {avg_time:.1f}ms)" ) self.config.max_concurrent = new_concurrency @asynccontextmanager async def acquire(self, timeout: Optional[float] = None): """ Async context manager to acquire concurrency slot Args: timeout: Optional timeout override Raises: BackpressureError: If queue is full and backpressure is enabled CircuitBreakerOpenError: If circuit breaker is open asyncio.TimeoutError: If timeout exceeded Example: async with manager.acquire(): result = await some_async_operation() """ timeout = timeout or self.config.timeout start_time = time.time() # Check circuit breaker self._check_circuit_breaker() # Check backpressure if self.config.enable_backpressure: with self._metrics_lock: if self._metrics.queued_operations >= self.config.max_queue_size: self._metrics.rejected_requests += 1 raise BackpressureError( f"Queue full ({self.config.max_queue_size} operations pending). " "Try again later." ) # Update queue metrics with self._metrics_lock: self._metrics.queued_operations += 1 self._metrics.total_requests += 1 try: # Acquire semaphore with timeout async with asyncio.timeout(timeout): async with self._semaphore: # Update active metrics with self._metrics_lock: self._metrics.queued_operations -= 1 self._metrics.active_operations += 1 operation_start = time.time() try: yield # Record success response_time_ms = (time.time() - operation_start) * 1000 self._update_response_time(response_time_ms) self._record_success() with self._metrics_lock: self._metrics.successful_requests += 1 except Exception as e: # Record failure self._record_failure() with self._metrics_lock: self._metrics.failed_requests += 1 raise finally: # Update active metrics with self._metrics_lock: self._metrics.active_operations -= 1 # Adaptive tuning self._adjust_concurrency() except asyncio.TimeoutError: with self._metrics_lock: self._metrics.timeout_requests += 1 self._metrics.queued_operations -= 1 elapsed = time.time() - start_time raise asyncio.TimeoutError( f"Operation timed out after {elapsed:.1f}s " f"(timeout: {timeout}s)" ) @contextmanager def acquire_sync(self, timeout: Optional[float] = None): """ Synchronous context manager to acquire concurrency slot Args: timeout: Optional timeout override Example: with manager.acquire_sync(): result = some_operation() """ timeout = timeout or self.config.timeout start_time = time.time() # Check circuit breaker self._check_circuit_breaker() # Check backpressure if self.config.enable_backpressure: with self._metrics_lock: if self._metrics.queued_operations >= self.config.max_queue_size: self._metrics.rejected_requests += 1 raise BackpressureError( f"Queue full ({self.config.max_queue_size} operations pending)" ) # Update queue metrics with self._metrics_lock: self._metrics.queued_operations += 1 self._metrics.total_requests += 1 acquired = False try: # Acquire semaphore with timeout acquired = self._sync_semaphore.acquire(timeout=timeout) if not acquired: raise TimeoutError(f"Failed to acquire semaphore within {timeout}s") # Update active metrics with self._metrics_lock: self._metrics.queued_operations -= 1 self._metrics.active_operations += 1 operation_start = time.time() try: yield # Record success response_time_ms = (time.time() - operation_start) * 1000 self._update_response_time(response_time_ms) self._record_success() with self._metrics_lock: self._metrics.successful_requests += 1 except Exception as e: # Record failure self._record_failure() with self._metrics_lock: self._metrics.failed_requests += 1 raise finally: # Update active metrics with self._metrics_lock: self._metrics.active_operations -= 1 finally: if acquired: self._sync_semaphore.release() else: with self._metrics_lock: self._metrics.timeout_requests += 1 self._metrics.queued_operations -= 1 def get_metrics(self) -> ConcurrencyMetrics: """Get current concurrency metrics""" with self._metrics_lock: # Update circuit state with self._circuit_lock: self._metrics.circuit_state = self._circuit_state self._metrics.circuit_failures = self._circuit_failures self._metrics.last_updated = datetime.now() return ConcurrencyMetrics(**self._metrics.__dict__) def reset_circuit_breaker(self) -> None: """Manually reset circuit breaker to CLOSED state""" with self._circuit_lock: logger.info("Manually resetting circuit breaker to CLOSED") self._circuit_state = CircuitState.CLOSED self._circuit_failures = 0 self._circuit_successes = 0 self._circuit_last_failure_time = None def get_status(self) -> Dict[str, Any]: """Get human-readable status""" metrics = self.get_metrics() return { 'status': 'healthy' if metrics.circuit_state == CircuitState.CLOSED else 'degraded', 'concurrency': { 'current': metrics.current_concurrency, 'active': metrics.active_operations, 'queued': metrics.queued_operations, }, 'performance': { 'avg_response_time_ms': metrics.avg_response_time_ms, 'success_rate': round( metrics.successful_requests / max(metrics.total_requests, 1) * 100, 2 ) }, 'circuit_breaker': { 'state': metrics.circuit_state.value, 'failures': metrics.circuit_failures, }, 'requests': { 'total': metrics.total_requests, 'successful': metrics.successful_requests, 'failed': metrics.failed_requests, 'rejected': metrics.rejected_requests, 'timeout': metrics.timeout_requests, } } # Global instance for convenience _global_manager: Optional[ConcurrencyManager] = None _global_manager_lock = threading.Lock() def get_concurrency_manager(config: Optional[ConcurrencyConfig] = None) -> ConcurrencyManager: """ Get global concurrency manager instance (singleton pattern) Args: config: Optional configuration (only used on first call) Returns: Global ConcurrencyManager instance """ global _global_manager with _global_manager_lock: if _global_manager is None: _global_manager = ConcurrencyManager(config) return _global_manager def reset_concurrency_manager() -> None: """Reset global concurrency manager (mainly for testing)""" global _global_manager with _global_manager_lock: _global_manager = None