#!/usr/bin/env python3
"""
Error Recovery Testing Module

CRITICAL FIX (P1-10): Comprehensive error recovery testing

This module tests the system's ability to recover from various failure scenarios:
- Database failures and transaction rollbacks
- Network failures and retries
- File system errors
- Concurrent access conflicts
- Resource exhaustion
- Timeout handling
- Data corruption

Author: Chief Engineer (ISTJ, 20 years experience)
Date: 2025-10-29
Priority: P1 - High
"""

from __future__ import annotations

import asyncio
import logging
import pytest
import sqlite3
import tempfile
import threading
import time
from pathlib import Path
from typing import Any, List, Optional
from unittest.mock import Mock, patch, MagicMock

# Add parent directory to path
import sys
sys.path.insert(0, str(Path(__file__).parent.parent))

from core.connection_pool import ConnectionPool, PoolExhaustedError
from core.correction_repository import CorrectionRepository, DatabaseError
from utils.retry_logic import retry_sync, retry_async, RetryConfig, is_transient_error
from utils.concurrency_manager import (
    ConcurrencyManager,
    ConcurrencyConfig,
    BackpressureError,
    CircuitBreakerOpenError
)
from utils.rate_limiter import RateLimiter, RateLimitConfig, RateLimitExceeded

logger = logging.getLogger(__name__)


# ==================== Test Fixtures ====================

@pytest.fixture
def temp_db_path():
    """Create temporary database for testing"""
    with tempfile.TemporaryDirectory() as tmp_dir:
        db_path = Path(tmp_dir) / "test.db"
        yield db_path


@pytest.fixture
def connection_pool(temp_db_path):
    """Create connection pool for testing"""
    pool = ConnectionPool(temp_db_path, max_connections=3, pool_timeout=2.0)
    yield pool
    pool.close_all()


@pytest.fixture
def correction_repository(temp_db_path):
    """Create correction repository for testing"""
    repo = CorrectionRepository(temp_db_path, max_connections=3)
    yield repo
    # Cleanup handled by temp_db_path


@pytest.fixture
def concurrency_manager():
    """Create concurrency manager for testing"""
    config = ConcurrencyConfig(
        max_concurrent=3,
        max_queue_size=5,
        enable_circuit_breaker=True,
        circuit_failure_threshold=3
    )
    return ConcurrencyManager(config)


# ==================== Database Error Recovery Tests ====================

class TestDatabaseErrorRecovery:
    """Test database error recovery mechanisms"""

    def test_transaction_rollback_on_error(self, correction_repository):
        """
        Test that database transactions are rolled back on error.

        Scenario: Try to insert correction with invalid confidence value.
        Expected: Error is raised, no data is modified.
        """
        # Add a correction successfully
        correction_repository.add_correction(
            from_text="test1",
            to_text="corrected1",
            domain="general",
            source="manual",
            confidence=0.9
        )

        # Verify it was added
        corrections = correction_repository.get_all_corrections(domain="general")
        initial_count = len(corrections)
        assert initial_count >= 1

        # Try to add correction with invalid confidence (should fail)
        from utils.domain_validator import ValidationError
        with pytest.raises((ValidationError, DatabaseError)):
            correction_repository.add_correction(
                from_text="test_invalid",
                to_text="corrected",
                domain="general",
                source="manual",
                confidence=1.5  # Invalid: must be 0.0-1.0
            )

        # Verify no new corrections were added
        corrections = correction_repository.get_all_corrections(domain="general")
        assert len(corrections) == initial_count

    def test_connection_pool_recovery_from_exhaustion(self, connection_pool):
        """
        Test that connection pool recovers after exhaustion.

        Scenario: Exhaust all connections, then release them.
        Expected: Pool should become available again.
        """
        connections = []

        # Acquire all connections using context managers properly
        for i in range(3):
            ctx = connection_pool.get_connection()
            conn = ctx.__enter__()
            connections.append((ctx, conn))

        # Try to acquire one more (should timeout with pool_timeout=2.0)
        with pytest.raises((PoolExhaustedError, TimeoutError)):
            with connection_pool.get_connection():
                pass

        # Release all connections properly
        for ctx, conn in connections:
            try:
                ctx.__exit__(None, None, None)
            except:
                pass  # Ignore errors during cleanup

        # Should be able to acquire connection again
        with connection_pool.get_connection() as conn:
            assert conn is not None

    def test_database_recovery_from_corruption(self, temp_db_path):
        """
        Test that system handles corrupted database gracefully.

        Scenario: Create corrupted database file.
        Expected: System should detect corruption and handle it.
        """
        # Create a corrupted database file
        with open(temp_db_path, 'wb') as f:
            f.write(b'This is not a valid SQLite database')

        # Try to create repository (should fail gracefully)
        with pytest.raises((sqlite3.DatabaseError, DatabaseError, FileNotFoundError)):
            repo = CorrectionRepository(temp_db_path)
            repo.get_all_corrections()

    def test_concurrent_write_conflict_recovery(self, temp_db_path):
        """
        Test recovery from concurrent write conflicts.

        Scenario: Multiple threads try to write to same record.
        Expected: First write succeeds, subsequent ones update (UPSERT behavior).

        Note: Each thread needs its own CorrectionRepository instance
        due to SQLite's thread-safety limitations.
        """
        results = []
        errors = []

        def write_correction(thread_id, db_path):
            try:
                # Each thread creates its own repository
                from core.correction_repository import CorrectionRepository
                thread_repo = CorrectionRepository(db_path, max_connections=1)

                thread_repo.add_correction(
                    from_text="concurrent_test",
                    to_text=f"corrected_{thread_id}",
                    domain="general",
                    source="manual"
                )
                results.append(thread_id)
            except Exception as e:
                errors.append((thread_id, str(e)))

        # Start multiple threads
        threads = [threading.Thread(target=write_correction, args=(i, temp_db_path)) for i in range(5)]
        for t in threads:
            t.start()
        for t in threads:
            t.join()

        # Due to UPSERT behavior, all should succeed (they update the same record)
        assert len(results) + len(errors) == 5

        # Verify database is still consistent
        verify_repo = CorrectionRepository(temp_db_path)
        corrections = verify_repo.get_all_corrections()
        assert any(c.from_text == "concurrent_test" for c in corrections)

        # Should only have one record (UNIQUE constraint + UPSERT)
        concurrent_corrections = [c for c in corrections if c.from_text == "concurrent_test"]
        assert len(concurrent_corrections) == 1


# ==================== Network Error Recovery Tests ====================

class TestNetworkErrorRecovery:
    """Test network error recovery mechanisms"""

    @pytest.mark.asyncio
    async def test_retry_on_transient_network_error(self):
        """
        Test that transient network errors trigger retry.

        Scenario: API call fails with timeout, then succeeds on retry.
        Expected: Operation succeeds after retry.
        """
        attempt_count = [0]

        @retry_async(RetryConfig(max_attempts=3, base_delay=0.1))
        async def flaky_network_call():
            attempt_count[0] += 1
            if attempt_count[0] < 3:
                import httpx
                raise httpx.ConnectTimeout("Connection timeout")
            return "success"

        result = await flaky_network_call()
        assert result == "success"
        assert attempt_count[0] == 3

    @pytest.mark.asyncio
    async def test_no_retry_on_permanent_error(self):
        """
        Test that permanent errors are not retried.

        Scenario: API call fails with authentication error.
        Expected: Error is raised immediately without retry.
        """
        attempt_count = [0]

        @retry_async(RetryConfig(max_attempts=3, base_delay=0.1))
        async def auth_error_call():
            attempt_count[0] += 1
            raise ValueError("Invalid credentials")  # Permanent error

        with pytest.raises(ValueError):
            await auth_error_call()

        # Should fail immediately without retry
        assert attempt_count[0] == 1

    def test_transient_error_classification(self):
        """
        Test correct classification of transient vs permanent errors.

        Scenario: Various exception types.
        Expected: Correct classification for each type.
        """
        import httpx

        # Transient errors
        assert is_transient_error(httpx.ConnectTimeout("timeout")) == True
        assert is_transient_error(httpx.ReadTimeout("timeout")) == True
        assert is_transient_error(httpx.ConnectError("connection failed")) == True

        # Permanent errors
        assert is_transient_error(ValueError("invalid input")) == False
        assert is_transient_error(KeyError("not found")) == False


# ==================== Concurrency Error Recovery Tests ====================

class TestConcurrencyErrorRecovery:
    """Test concurrent operation error recovery"""

    @pytest.mark.asyncio
    async def test_circuit_breaker_opens_after_failures(self, concurrency_manager):
        """
        Test that circuit breaker opens after threshold failures.

        Scenario: Multiple consecutive failures.
        Expected: Circuit opens, subsequent requests rejected.
        """
        # Cause 3 failures (threshold)
        for i in range(3):
            try:
                async with concurrency_manager.acquire():
                    raise Exception("Simulated failure")
            except Exception:
                pass

        # Circuit should be OPEN now
        with pytest.raises(CircuitBreakerOpenError):
            async with concurrency_manager.acquire():
                pass

    @pytest.mark.asyncio
    async def test_circuit_breaker_recovery(self, concurrency_manager):
        """
        Test that circuit breaker can recover after timeout.

        Scenario: Circuit opens, then recovery timeout elapses, then success.
        Expected: Circuit transitions OPEN → HALF_OPEN → CLOSED.
        """
        # Configure short recovery timeout for testing
        concurrency_manager.config.circuit_recovery_timeout = 0.5

        # Cause failures to open circuit
        for i in range(3):
            try:
                async with concurrency_manager.acquire():
                    raise Exception("Failure")
            except Exception:
                pass

        # Circuit should be OPEN
        metrics = concurrency_manager.get_metrics()
        assert metrics.circuit_state.value == "open"

        # Wait for recovery timeout
        await asyncio.sleep(0.6)

        # Try a successful operation (should transition to HALF_OPEN then CLOSED)
        async with concurrency_manager.acquire():
            pass  # Success

        # One more success to fully close
        async with concurrency_manager.acquire():
            pass

        # Circuit should be CLOSED
        metrics = concurrency_manager.get_metrics()
        assert metrics.circuit_state.value in ("closed", "half_open")

    @pytest.mark.asyncio
    async def test_backpressure_handling(self):
        """
        Test that backpressure prevents system overload.

        Scenario: Queue fills up beyond max_queue_size.
        Expected: Additional requests are rejected with BackpressureError.
        """
        # Create manager with small limits for testing
        config = ConcurrencyConfig(
            max_concurrent=1,
            max_queue_size=2,
            enable_backpressure=True
        )
        manager = ConcurrencyManager(config)

        async def slow_task():
            async with manager.acquire():
                await asyncio.sleep(0.5)

        # Start tasks that will fill queue
        tasks = []
        rejected_count = 0

        for i in range(6):  # Try to start 6 tasks (more than queue can hold)
            try:
                task = asyncio.create_task(slow_task())
                tasks.append(task)
                await asyncio.sleep(0.01)  # Small delay between starts
            except BackpressureError:
                rejected_count += 1

        # Wait a bit then cancel remaining tasks
        await asyncio.sleep(0.1)
        for task in tasks:
            if not task.done():
                task.cancel()

        # Gather results (ignore cancellation errors)
        results = await asyncio.gather(*tasks, return_exceptions=True)

        # Check metrics
        metrics = manager.get_metrics()

        # Either direct BackpressureError or rejected in metrics
        assert rejected_count > 0 or metrics.rejected_requests > 0


# ==================== Resource Error Recovery Tests ====================

class TestResourceErrorRecovery:
    """Test resource error recovery mechanisms"""

    def test_rate_limiter_recovery_after_limit_reached(self):
        """
        Test that rate limiter allows requests after window resets.

        Scenario: Exhaust rate limit, wait for window reset.
        Expected: New requests are allowed after reset.
        """
        config = RateLimitConfig(
            max_requests=3,
            window_seconds=0.5,  # Short window for testing
        )
        limiter = RateLimiter(config)

        # Exhaust limit
        for i in range(3):
            assert limiter.acquire(blocking=False) == True

        # Should be exhausted
        assert limiter.acquire(blocking=False) == False

        # Wait for window reset
        time.sleep(0.6)

        # Should be available again
        assert limiter.acquire(blocking=False) == True

    @pytest.mark.asyncio
    async def test_timeout_recovery(self, concurrency_manager):
        """
        Test that timeouts are handled gracefully.

        Scenario: Operation exceeds timeout.
        Expected: Operation is cancelled, resources released.
        """
        with pytest.raises(asyncio.TimeoutError):
            async with concurrency_manager.acquire(timeout=0.1):
                await asyncio.sleep(1.0)  # Exceeds timeout

        # Verify metrics were updated
        metrics = concurrency_manager.get_metrics()
        assert metrics.timeout_requests > 0

    def test_file_lock_recovery_after_timeout(self, temp_db_path):
        """
        Test recovery from file lock timeouts.

        Scenario: Lock held too long, timeout occurs.
        Expected: Lock is released, subsequent operations succeed.
        """
        from filelock import FileLock, Timeout as FileLockTimeout

        lock_path = temp_db_path.parent / "test.lock"
        lock = FileLock(str(lock_path), timeout=0.5)

        # Acquire lock
        with lock.acquire():
            # Try to acquire again (should timeout)
            lock2 = FileLock(str(lock_path), timeout=0.2)
            with pytest.raises(FileLockTimeout):
                with lock2.acquire():
                    pass

        # Lock should be released, can acquire now
        with lock.acquire():
            pass  # Success


# ==================== Data Corruption Recovery Tests ====================

class TestDataCorruptionRecovery:
    """Test data corruption detection and recovery"""

    def test_invalid_data_detection(self, correction_repository):
        """
        Test that invalid data is detected and rejected.

        Scenario: Attempt to insert invalid data.
        Expected: Validation error, database remains consistent.
        """
        # Try to insert correction with invalid confidence
        with pytest.raises(DatabaseError):
            correction_repository.add_correction(
                from_text="test",
                to_text="corrected",
                domain="general",
                source="manual",
                confidence=1.5  # Invalid (must be 0.0-1.0)
            )

        # Verify database is still consistent
        corrections = correction_repository.get_all_corrections()
        assert all(0.0 <= c.confidence <= 1.0 for c in corrections)

    def test_encoding_error_recovery(self):
        """
        Test recovery from encoding errors.

        Scenario: Process text with invalid encoding.
        Expected: Error is handled, processing continues.
        """
        from core.change_extractor import ChangeExtractor, InputValidationError

        extractor = ChangeExtractor()

        # Test with invalid UTF-8 sequences
        invalid_text = b'\x80\x81\x82'.decode('utf-8', errors='replace')

        try:
            # Should handle gracefully or raise specific error
            changes = extractor.extract_changes(invalid_text, "corrected")
        except InputValidationError as e:
            # Expected - validation caught the issue
            assert "UTF-8" in str(e) or "encoding" in str(e).lower()


# ==================== Integration Error Recovery Tests ====================

class TestIntegrationErrorRecovery:
    """Test end-to-end error recovery scenarios"""

    def test_full_system_recovery_from_multiple_failures(
        self, correction_repository, concurrency_manager
    ):
        """
        Test that system recovers from multiple simultaneous failures.

        Scenario: Database error + rate limit + concurrency limit.
        Expected: System degrades gracefully, recovers when possible.
        """
        # Record initial state
        initial_corrections = len(correction_repository.get_all_corrections())

        # Simulate various failures
        failures = []

        # 1. Try to add duplicate correction (database error)
        correction_repository.add_correction(
            from_text="multi_fail_test",
            to_text="original",
            domain="general",
            source="manual"
        )

        try:
            correction_repository.add_correction(
                from_text="multi_fail_test",  # Duplicate
                to_text="duplicate",
                domain="general",
                source="manual"
            )
        except DatabaseError:
            failures.append("database")

        # 2. Simulate concurrency failure
        async def test_concurrency():
            try:
                # Cause circuit breaker to open
                for i in range(3):
                    try:
                        async with concurrency_manager.acquire():
                            raise Exception("Failure")
                    except Exception:
                        pass

                # Circuit should be open
                with pytest.raises(CircuitBreakerOpenError):
                    async with concurrency_manager.acquire():
                        pass
                failures.append("concurrency")
            except Exception:
                pass

        asyncio.run(test_concurrency())

        # Verify system is still operational
        corrections = correction_repository.get_all_corrections()
        assert len(corrections) == initial_corrections + 1

        # Verify metrics were recorded
        metrics = concurrency_manager.get_metrics()
        assert metrics.failed_requests > 0

    @pytest.mark.asyncio
    async def test_cascading_failure_prevention(self):
        """
        Test that failures don't cascade through the system.

        Scenario: One component fails, others continue working.
        Expected: Failure is isolated, system remains operational.
        """
        # This test verifies isolation between components
        config = ConcurrencyConfig(
            max_concurrent=2,
            enable_circuit_breaker=True,
            circuit_failure_threshold=3
        )
        manager1 = ConcurrencyManager(config)
        manager2 = ConcurrencyManager(config)

        # Cause failures in manager1
        for i in range(3):
            try:
                async with manager1.acquire():
                    raise Exception("Failure")
            except Exception:
                pass

        # manager1 circuit should be open
        metrics1 = manager1.get_metrics()
        assert metrics1.circuit_state.value == "open"

        # manager2 should still work
        async with manager2.acquire():
            pass  # Success

        metrics2 = manager2.get_metrics()
        assert metrics2.circuit_state.value == "closed"


# ==================== Test Runner ====================

if __name__ == "__main__":
    # Run tests with pytest
    pytest.main([__file__, "-v", "-s"])