Release v1.9.0: Add video-comparer skill and enhance transcript-fixer

## New Skill: video-comparer v1.0.0 - Compare original and compressed videos with interactive HTML reports - Calculate quality metrics (PSNR, SSIM) for compression analysis - Generate frame-by-frame visual comparisons (slider, side-by-side, grid) - Extract video metadata (codec, resolution, bitrate, duration) - Multi-platform FFmpeg support with security features ## transcript-fixer Enhancements - Add async AI processor for parallel processing - Add connection pool management for database operations - Add concurrency manager and rate limiter - Add audit log retention and database migrations - Add health check and metrics monitoring - Add comprehensive test suite (8 new test files) - Enhance security with domain and path validators ## Marketplace Updates - Update marketplace version from 1.8.0 to 1.9.0 - Update skills count from 15 to 16 - Update documentation (README.md, CLAUDE.md, CHANGELOG.md) 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-10-30 00:23:12 +08:00
parent bd0aa12004
commit 9b724f33e3
49 changed files with 15357 additions and 270 deletions
--- a/transcript-fixer/scripts/utils/init.py
+++ b/transcript-fixer/scripts/utils/init.py
@@ -4,13 +4,127 @@ Utils Module - Utility Functions and Tools
 This module contains utility functions:
 - diff_generator: Multi-format diff report generation
 - validation: Configuration validation
+- health_check: System health monitoring (P1-4 fix)
+- metrics: Metrics collection and monitoring (P1-7 fix)
+- rate_limiter: Production-grade rate limiting (P1-8 fix)
+- config: Centralized configuration management (P1-5 fix)
+- database_migration: Database migration system (P1-6 fix)
+- concurrency_manager: Concurrent request handling (P1-9 fix)
+- audit_log_retention: Audit log retention and compliance (P1-11 fix)
 """

 from .diff_generator import generate_full_report
 from .validation import validate_configuration, print_validation_summary
+from .health_check import HealthChecker, CheckLevel, HealthStatus, format_health_output
+from .metrics import get_metrics, format_metrics_summary, MetricsCollector
+from .rate_limiter import (
+    RateLimiter,
+    RateLimitConfig,
+    RateLimitStrategy,
+    RateLimitExceeded,
+    RateLimitPresets,
+    get_rate_limiter,
+)
+from .config import (
+    Config,
+    Environment,
+    DatabaseConfig,
+    APIConfig,
+    PathConfig,
+    get_config,
+    set_config,
+    reset_config,
+    create_example_config,
+)
+from .database_migration import (
+    DatabaseMigrationManager,
+    Migration,
+    MigrationRecord,
+    MigrationDirection,
+    MigrationStatus,
+)
+from .migrations import (
+    MIGRATION_REGISTRY,
+    LATEST_VERSION,
+    get_migration,
+    get_migrations_up_to,
+    get_migrations_from,
+)
+from .db_migrations_cli import create_migration_cli
+from .concurrency_manager import (
+    ConcurrencyManager,
+    ConcurrencyConfig,
+    ConcurrencyMetrics,
+    CircuitState,
+    BackpressureError,
+    CircuitBreakerOpenError,
+    get_concurrency_manager,
+    reset_concurrency_manager,
+)
+from .audit_log_retention import (
+    AuditLogRetentionManager,
+    RetentionPolicy,
+    RetentionPeriod,
+    CleanupStrategy,
+    CleanupResult,
+    ComplianceReport,
+    CRITICAL_ACTIONS,
+    get_retention_manager,
+    reset_retention_manager,
+)

 __all__ = [
    'generate_full_report',
    'validate_configuration',
    'print_validation_summary',
+    'HealthChecker',
+    'CheckLevel',
+    'HealthStatus',
+    'format_health_output',
+    'get_metrics',
+    'format_metrics_summary',
+    'MetricsCollector',
+    'RateLimiter',
+    'RateLimitConfig',
+    'RateLimitStrategy',
+    'RateLimitExceeded',
+    'RateLimitPresets',
+    'get_rate_limiter',
+    'Config',
+    'Environment',
+    'DatabaseConfig',
+    'APIConfig',
+    'PathConfig',
+    'get_config',
+    'set_config',
+    'reset_config',
+    'create_example_config',
+    'DatabaseMigrationManager',
+    'Migration',
+    'MigrationRecord',
+    'MigrationDirection',
+    'MigrationStatus',
+    'MIGRATION_REGISTRY',
+    'LATEST_VERSION',
+    'get_migration',
+    'get_migrations_up_to',
+    'get_migrations_from',
+    'create_migration_cli',
+    'ConcurrencyManager',
+    'ConcurrencyConfig',
+    'ConcurrencyMetrics',
+    'CircuitState',
+    'BackpressureError',
+    'CircuitBreakerOpenError',
+    'get_concurrency_manager',
+    'reset_concurrency_manager',
+    'AuditLogRetentionManager',
+    'RetentionPolicy',
+    'RetentionPeriod',
+    'CleanupStrategy',
+    'CleanupResult',
+    'ComplianceReport',
+    'CRITICAL_ACTIONS',
+    'get_retention_manager',
+    'reset_retention_manager',
 ]
--- a/transcript-fixer/scripts/utils/audit_log_retention.py
+++ b/transcript-fixer/scripts/utils/audit_log_retention.py
@@ -0,0 +1,709 @@
+#!/usr/bin/env python3
+"""
+Audit Log Retention Management Module
+
+CRITICAL FIX (P1-11): Production-grade audit log retention and compliance
+
+Features:
+- Configurable retention policies per entity type
+- Automatic cleanup of expired logs
+- Archive capability for long-term storage
+- Compliance reporting (GDPR, SOX, etc.)
+- Selective retention based on criticality
+- Restoration from archives
+
+Compliance Standards:
+- GDPR: Right to erasure, data minimization
+- SOX: 7-year retention for financial records
+- HIPAA: 6-year retention for healthcare data
+- Industry best practices
+
+Author: Chief Engineer (ISTJ, 20 years experience)
+Date: 2025-10-29
+Priority: P1 - High
+"""
+
+from __future__ import annotations
+
+import gzip
+import json
+import logging
+import sqlite3
+from datetime import datetime, timedelta
+from dataclasses import dataclass, asdict
+from enum import Enum
+from pathlib import Path
+from typing import Dict, List, Optional, Any, Final
+from contextlib import contextmanager
+
+logger = logging.getLogger(__name__)
+
+
+class RetentionPeriod(Enum):
+    """Standard retention periods"""
+    SHORT = 30  # 30 days - operational logs
+    MEDIUM = 90  # 90 days - default
+    LONG = 180  # 180 days - 6 months
+    ANNUAL = 365  # 1 year
+    COMPLIANCE_SOX = 2555  # 7 years for SOX compliance
+    COMPLIANCE_HIPAA = 2190  # 6 years for HIPAA
+    PERMANENT = -1  # Never delete
+
+
+class CleanupStrategy(Enum):
+    """Cleanup strategies"""
+    DELETE = "delete"  # Permanent deletion
+    ARCHIVE = "archive"  # Move to archive before deletion
+    ANONYMIZE = "anonymize"  # Remove PII, keep metadata
+
+
+@dataclass
+class RetentionPolicy:
+    """Retention policy configuration"""
+    entity_type: str
+    retention_days: int
+    strategy: CleanupStrategy = CleanupStrategy.ARCHIVE
+    critical_action_retention_days: Optional[int] = None  # Extended retention for critical actions
+    is_active: bool = True
+    description: Optional[str] = None
+
+    def __post_init__(self):
+        """Validate retention policy"""
+        if self.retention_days < -1:
+            raise ValueError("retention_days must be -1 (permanent) or positive")
+        if self.critical_action_retention_days and self.critical_action_retention_days < self.retention_days:
+            raise ValueError("critical_action_retention_days must be >= retention_days")
+
+
+@dataclass
+class CleanupResult:
+    """Result of cleanup operation"""
+    entity_type: str
+    records_scanned: int
+    records_deleted: int
+    records_archived: int
+    records_anonymized: int
+    execution_time_ms: int
+    errors: List[str]
+    success: bool
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Convert to dictionary"""
+        return asdict(self)
+
+
+@dataclass
+class ComplianceReport:
+    """Compliance report for audit purposes"""
+    report_date: datetime
+    total_audit_logs: int
+    oldest_log_date: Optional[datetime]
+    newest_log_date: Optional[datetime]
+    logs_by_entity_type: Dict[str, int]
+    retention_violations: List[str]
+    archived_logs_count: int
+    storage_size_mb: float
+    is_compliant: bool
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Convert to dictionary"""
+        result = asdict(self)
+        result['report_date'] = self.report_date.isoformat()
+        if self.oldest_log_date:
+            result['oldest_log_date'] = self.oldest_log_date.isoformat()
+        if self.newest_log_date:
+            result['newest_log_date'] = self.newest_log_date.isoformat()
+        return result
+
+
+# Critical actions that require extended retention
+CRITICAL_ACTIONS: Final[set] = {
+    'delete_correction',
+    'update_correction',
+    'approve_learned_suggestion',
+    'reject_learned_suggestion',
+    'system_config_change',
+    'migration_applied',
+    'security_event',
+}
+
+
+class AuditLogRetentionManager:
+    """
+    Production-grade audit log retention management
+
+    Features:
+    - Automatic cleanup based on retention policies
+    - Archival to compressed files
+    - Compliance reporting
+    - Selective retention for critical actions
+    - Transaction safety
+    """
+
+    def __init__(self, db_path: Path, archive_dir: Optional[Path] = None):
+        """
+        Initialize retention manager
+
+        Args:
+            db_path: Path to SQLite database
+            archive_dir: Directory for archived logs (defaults to db_path.parent / 'archives')
+        """
+        self.db_path = Path(db_path)
+        self.archive_dir = archive_dir or (self.db_path.parent / "archives")
+        self.archive_dir.mkdir(parents=True, exist_ok=True)
+
+        # Default retention policies (can be overridden in database)
+        self.default_policies = {
+            'correction': RetentionPolicy(
+                entity_type='correction',
+                retention_days=RetentionPeriod.ANNUAL.value,
+                strategy=CleanupStrategy.ARCHIVE,
+                critical_action_retention_days=RetentionPeriod.COMPLIANCE_SOX.value,
+                description='Correction operations'
+            ),
+            'suggestion': RetentionPolicy(
+                entity_type='suggestion',
+                retention_days=RetentionPeriod.MEDIUM.value,
+                strategy=CleanupStrategy.ARCHIVE,
+                description='Learning suggestions'
+            ),
+            'system': RetentionPolicy(
+                entity_type='system',
+                retention_days=RetentionPeriod.COMPLIANCE_SOX.value,
+                strategy=CleanupStrategy.ARCHIVE,
+                description='System configuration changes'
+            ),
+            'migration': RetentionPolicy(
+                entity_type='migration',
+                retention_days=RetentionPeriod.PERMANENT.value,
+                strategy=CleanupStrategy.ARCHIVE,
+                description='Database migrations'
+            ),
+        }
+
+    @contextmanager
+    def _get_connection(self):
+        """Get database connection"""
+        conn = sqlite3.connect(str(self.db_path))
+        conn.row_factory = sqlite3.Row
+        try:
+            yield conn
+        finally:
+            conn.close()
+
+    @contextmanager
+    def _transaction(self):
+        """Transaction context manager"""
+        with self._get_connection() as conn:
+            cursor = conn.cursor()
+            cursor.execute("BEGIN")
+            try:
+                yield cursor
+                conn.commit()
+            except Exception:
+                conn.rollback()
+                raise
+
+    def load_retention_policies(self) -> Dict[str, RetentionPolicy]:
+        """
+        Load retention policies from database
+
+        Returns:
+            Dictionary of policies by entity_type
+        """
+        policies = dict(self.default_policies)
+
+        try:
+            with self._get_connection() as conn:
+                cursor = conn.cursor()
+                cursor.execute("""
+                    SELECT entity_type, retention_days, is_active, description
+                    FROM retention_policies
+                    WHERE is_active = 1
+                """)
+
+                for row in cursor.fetchall():
+                    entity_type = row['entity_type']
+                    # Update default policy or create new one
+                    if entity_type in policies:
+                        policies[entity_type].retention_days = row['retention_days']
+                        policies[entity_type].is_active = bool(row['is_active'])
+                    else:
+                        policies[entity_type] = RetentionPolicy(
+                            entity_type=entity_type,
+                            retention_days=row['retention_days'],
+                            is_active=bool(row['is_active']),
+                            description=row['description']
+                        )
+
+        except sqlite3.Error as e:
+            logger.warning(f"Failed to load retention policies from database: {e}")
+            # Continue with default policies
+
+        return policies
+
+    def _archive_logs(self, logs: List[Dict[str, Any]], entity_type: str) -> Path:
+        """
+        Archive logs to compressed file
+
+        Args:
+            logs: List of log records
+            entity_type: Entity type being archived
+
+        Returns:
+            Path to archive file
+        """
+        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+        archive_file = self.archive_dir / f"audit_log_{entity_type}_{timestamp}.json.gz"
+
+        with gzip.open(archive_file, 'wt', encoding='utf-8') as f:
+            json.dump(logs, f, indent=2, default=str)
+
+        logger.info(f"Archived {len(logs)} logs to {archive_file}")
+        return archive_file
+
+    def _anonymize_log(self, log: Dict[str, Any]) -> Dict[str, Any]:
+        """
+        Anonymize log record (remove PII while keeping metadata)
+
+        Args:
+            log: Log record
+
+        Returns:
+            Anonymized log record
+        """
+        anonymized = dict(log)
+
+        # Remove/mask PII fields
+        if 'user' in anonymized and anonymized['user']:
+            anonymized['user'] = 'ANONYMIZED'
+
+        if 'details' in anonymized and anonymized['details']:
+            # Keep only non-PII metadata
+            try:
+                details = json.loads(anonymized['details'])
+                # Remove potential PII
+                for key in list(details.keys()):
+                    if any(pii in key.lower() for pii in ['email', 'name', 'ip', 'address']):
+                        details[key] = 'ANONYMIZED'
+                anonymized['details'] = json.dumps(details)
+            except (json.JSONDecodeError, TypeError):
+                anonymized['details'] = 'ANONYMIZED'
+
+        return anonymized
+
+    def cleanup_expired_logs(
+        self,
+        entity_type: Optional[str] = None,
+        dry_run: bool = False
+    ) -> List[CleanupResult]:
+        """
+        Clean up expired audit logs based on retention policies
+
+        Args:
+            entity_type: Specific entity type to clean (None for all)
+            dry_run: If True, only simulate without actual deletion
+
+        Returns:
+            List of cleanup results per entity type
+        """
+        policies = self.load_retention_policies()
+        results = []
+
+        # Filter policies
+        if entity_type:
+            if entity_type not in policies:
+                logger.warning(f"No retention policy found for entity_type: {entity_type}")
+                return results
+            policies = {entity_type: policies[entity_type]}
+
+        for entity_type, policy in policies.items():
+            if not policy.is_active:
+                logger.info(f"Skipping inactive policy for {entity_type}")
+                continue
+
+            if policy.retention_days == RetentionPeriod.PERMANENT.value:
+                logger.info(f"Permanent retention for {entity_type}, skipping cleanup")
+                continue
+
+            result = self._cleanup_entity_type(policy, dry_run)
+            results.append(result)
+
+        return results
+
+    def _cleanup_entity_type(
+        self,
+        policy: RetentionPolicy,
+        dry_run: bool = False
+    ) -> CleanupResult:
+        """
+        Clean up logs for specific entity type
+
+        Args:
+            policy: Retention policy to apply
+            dry_run: Simulation mode
+
+        Returns:
+            Cleanup result
+        """
+        start_time = datetime.now()
+        entity_type = policy.entity_type
+        errors = []
+
+        records_scanned = 0
+        records_deleted = 0
+        records_archived = 0
+        records_anonymized = 0
+
+        try:
+            # Calculate cutoff date
+            cutoff_date = datetime.now() - timedelta(days=policy.retention_days)
+
+            # Extended retention for critical actions
+            critical_cutoff_date = None
+            if policy.critical_action_retention_days:
+                critical_cutoff_date = datetime.now() - timedelta(
+                    days=policy.critical_action_retention_days
+                )
+
+            with self._transaction() as cursor:
+                # Find expired logs
+                cursor.execute("""
+                    SELECT * FROM audit_log
+                    WHERE entity_type = ?
+                    AND timestamp < ?
+                    ORDER BY timestamp ASC
+                """, (entity_type, cutoff_date.isoformat()))
+
+                expired_logs = [dict(row) for row in cursor.fetchall()]
+                records_scanned = len(expired_logs)
+
+                if records_scanned == 0:
+                    logger.info(f"No expired logs found for {entity_type}")
+                    return CleanupResult(
+                        entity_type=entity_type,
+                        records_scanned=0,
+                        records_deleted=0,
+                        records_archived=0,
+                        records_anonymized=0,
+                        execution_time_ms=0,
+                        errors=[],
+                        success=True
+                    )
+
+                # Filter out critical actions with extended retention
+                logs_to_process = []
+                for log in expired_logs:
+                    action = log.get('action', '')
+                    if action in CRITICAL_ACTIONS and critical_cutoff_date:
+                        log_date = datetime.fromisoformat(log['timestamp'])
+                        if log_date >= critical_cutoff_date:
+                            # Skip - still within critical retention period
+                            continue
+                    logs_to_process.append(log)
+
+                if not logs_to_process:
+                    logger.info(f"All expired logs for {entity_type} are critical, skipping")
+                    return CleanupResult(
+                        entity_type=entity_type,
+                        records_scanned=records_scanned,
+                        records_deleted=0,
+                        records_archived=0,
+                        records_anonymized=0,
+                        execution_time_ms=0,
+                        errors=[],
+                        success=True
+                    )
+
+                if dry_run:
+                    logger.info(
+                        f"[DRY RUN] Would process {len(logs_to_process)} logs "
+                        f"for {entity_type} with strategy {policy.strategy.value}"
+                    )
+                    return CleanupResult(
+                        entity_type=entity_type,
+                        records_scanned=records_scanned,
+                        records_deleted=len(logs_to_process) if policy.strategy == CleanupStrategy.DELETE else 0,
+                        records_archived=len(logs_to_process) if policy.strategy == CleanupStrategy.ARCHIVE else 0,
+                        records_anonymized=len(logs_to_process) if policy.strategy == CleanupStrategy.ANONYMIZE else 0,
+                        execution_time_ms=0,
+                        errors=[],
+                        success=True
+                    )
+
+                # Execute cleanup strategy
+                log_ids = [log['id'] for log in logs_to_process]
+
+                if policy.strategy == CleanupStrategy.ARCHIVE:
+                    # Archive before deletion
+                    try:
+                        archive_path = self._archive_logs(logs_to_process, entity_type)
+                        records_archived = len(logs_to_process)
+                        logger.info(f"Archived to {archive_path}")
+                    except Exception as e:
+                        errors.append(f"Archive failed: {e}")
+                        raise
+
+                    # Delete archived logs
+                    cursor.execute(f"""
+                        DELETE FROM audit_log
+                        WHERE id IN ({','.join('?' * len(log_ids))})
+                    """, log_ids)
+                    records_deleted = cursor.rowcount
+
+                elif policy.strategy == CleanupStrategy.DELETE:
+                    # Direct deletion (permanent)
+                    cursor.execute(f"""
+                        DELETE FROM audit_log
+                        WHERE id IN ({','.join('?' * len(log_ids))})
+                    """, log_ids)
+                    records_deleted = cursor.rowcount
+
+                elif policy.strategy == CleanupStrategy.ANONYMIZE:
+                    # Anonymize in place
+                    for log in logs_to_process:
+                        anonymized = self._anonymize_log(log)
+                        cursor.execute("""
+                            UPDATE audit_log
+                            SET user = ?, details = ?
+                            WHERE id = ?
+                        """, (anonymized['user'], anonymized['details'], log['id']))
+                    records_anonymized = len(logs_to_process)
+
+                # Record cleanup in history
+                execution_time_ms = int((datetime.now() - start_time).total_seconds() * 1000)
+
+                cursor.execute("""
+                    INSERT INTO cleanup_history
+                    (entity_type, records_deleted, execution_time_ms, success)
+                    VALUES (?, ?, ?, 1)
+                """, (entity_type, records_deleted + records_anonymized, execution_time_ms))
+
+                logger.info(
+                    f"Cleanup completed for {entity_type}: "
+                    f"deleted={records_deleted}, archived={records_archived}, "
+                    f"anonymized={records_anonymized}"
+                )
+
+        except Exception as e:
+            logger.error(f"Cleanup failed for {entity_type}: {e}")
+            errors.append(str(e))
+
+            # Record failure in history
+            try:
+                with self._transaction() as cursor:
+                    execution_time_ms = int((datetime.now() - start_time).total_seconds() * 1000)
+                    cursor.execute("""
+                        INSERT INTO cleanup_history
+                        (entity_type, records_deleted, execution_time_ms, success, error_message)
+                        VALUES (?, 0, ?, 0, ?)
+                    """, (entity_type, execution_time_ms, str(e)))
+            except Exception:
+                pass  # Best effort
+
+            return CleanupResult(
+                entity_type=entity_type,
+                records_scanned=records_scanned,
+                records_deleted=0,
+                records_archived=0,
+                records_anonymized=0,
+                execution_time_ms=int((datetime.now() - start_time).total_seconds() * 1000),
+                errors=errors,
+                success=False
+            )
+
+        execution_time_ms = int((datetime.now() - start_time).total_seconds() * 1000)
+
+        return CleanupResult(
+            entity_type=entity_type,
+            records_scanned=records_scanned,
+            records_deleted=records_deleted,
+            records_archived=records_archived,
+            records_anonymized=records_anonymized,
+            execution_time_ms=execution_time_ms,
+            errors=errors,
+            success=len(errors) == 0
+        )
+
+    def generate_compliance_report(self) -> ComplianceReport:
+        """
+        Generate compliance report for audit purposes
+
+        Returns:
+            Compliance report with statistics and violations
+        """
+        with self._get_connection() as conn:
+            cursor = conn.cursor()
+
+            # Total audit logs
+            cursor.execute("SELECT COUNT(*) as count FROM audit_log")
+            total_logs = cursor.fetchone()['count']
+
+            # Date range
+            cursor.execute("""
+                SELECT
+                    MIN(timestamp) as oldest,
+                    MAX(timestamp) as newest
+                FROM audit_log
+            """)
+            row = cursor.fetchone()
+            oldest_log_date = datetime.fromisoformat(row['oldest']) if row['oldest'] else None
+            newest_log_date = datetime.fromisoformat(row['newest']) if row['newest'] else None
+
+            # Logs by entity type
+            cursor.execute("""
+                SELECT entity_type, COUNT(*) as count
+                FROM audit_log
+                GROUP BY entity_type
+            """)
+            logs_by_entity_type = {row['entity_type']: row['count'] for row in cursor.fetchall()}
+
+            # Check for retention violations
+            violations = []
+            policies = self.load_retention_policies()
+
+            for entity_type, policy in policies.items():
+                if policy.retention_days == RetentionPeriod.PERMANENT.value:
+                    continue
+
+                cutoff_date = datetime.now() - timedelta(days=policy.retention_days)
+
+                cursor.execute("""
+                    SELECT COUNT(*) as count
+                    FROM audit_log
+                    WHERE entity_type = ? AND timestamp < ?
+                """, (entity_type, cutoff_date.isoformat()))
+
+                expired_count = cursor.fetchone()['count']
+                if expired_count > 0:
+                    violations.append(
+                        f"{entity_type}: {expired_count} logs exceed retention period "
+                        f"of {policy.retention_days} days"
+                    )
+
+            # Archived logs count (count .gz files)
+            archived_count = len(list(self.archive_dir.glob("audit_log_*.json.gz")))
+
+            # Storage size
+            storage_size_mb = 0.0
+            db_size = self.db_path.stat().st_size if self.db_path.exists() else 0
+            storage_size_mb = db_size / (1024 * 1024)
+
+            # Archive size
+            for archive_file in self.archive_dir.glob("*.gz"):
+                storage_size_mb += archive_file.stat().st_size / (1024 * 1024)
+
+            is_compliant = len(violations) == 0
+
+            return ComplianceReport(
+                report_date=datetime.now(),
+                total_audit_logs=total_logs,
+                oldest_log_date=oldest_log_date,
+                newest_log_date=newest_log_date,
+                logs_by_entity_type=logs_by_entity_type,
+                retention_violations=violations,
+                archived_logs_count=archived_count,
+                storage_size_mb=round(storage_size_mb, 2),
+                is_compliant=is_compliant
+            )
+
+    def restore_from_archive(
+        self,
+        archive_file: Path,
+        verify_only: bool = False
+    ) -> int:
+        """
+        Restore logs from archive file
+
+        Args:
+            archive_file: Path to archive file
+            verify_only: If True, only verify archive integrity
+
+        Returns:
+            Number of logs restored (or that would be restored)
+        """
+        if not archive_file.exists():
+            raise FileNotFoundError(f"Archive file not found: {archive_file}")
+
+        try:
+            with gzip.open(archive_file, 'rt', encoding='utf-8') as f:
+                logs = json.load(f)
+
+            if verify_only:
+                logger.info(f"Archive {archive_file.name} contains {len(logs)} logs")
+                return len(logs)
+
+            # Restore logs
+            with self._transaction() as cursor:
+                restored_count = 0
+                for log in logs:
+                    # Check if log already exists
+                    cursor.execute("""
+                        SELECT id FROM audit_log
+                        WHERE id = ?
+                    """, (log['id'],))
+
+                    if cursor.fetchone():
+                        continue  # Skip duplicates
+
+                    # Insert log
+                    cursor.execute("""
+                        INSERT INTO audit_log
+                        (id, timestamp, action, entity_type, entity_id, user, details, success, error_message)
+                        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
+                    """, (
+                        log['id'],
+                        log['timestamp'],
+                        log['action'],
+                        log['entity_type'],
+                        log.get('entity_id'),
+                        log.get('user'),
+                        log.get('details'),
+                        log.get('success', 1),
+                        log.get('error_message')
+                    ))
+                    restored_count += 1
+
+                logger.info(f"Restored {restored_count} logs from {archive_file.name}")
+                return restored_count
+
+        except Exception as e:
+            logger.error(f"Failed to restore from archive {archive_file}: {e}")
+            raise
+
+
+# Global instance for convenience
+_global_manager: Optional[AuditLogRetentionManager] = None
+
+
+def get_retention_manager(
+    db_path: Optional[Path] = None,
+    archive_dir: Optional[Path] = None
+) -> AuditLogRetentionManager:
+    """
+    Get global retention manager instance (singleton pattern)
+
+    Args:
+        db_path: Database path (only used on first call)
+        archive_dir: Archive directory (only used on first call)
+
+    Returns:
+        Global AuditLogRetentionManager instance
+    """
+    global _global_manager
+
+    if _global_manager is None:
+        if db_path is None:
+            from utils.config import get_config
+            config = get_config()
+            db_path = config.database.path
+
+        _global_manager = AuditLogRetentionManager(db_path, archive_dir)
+
+    return _global_manager
+
+
+def reset_retention_manager() -> None:
+    """Reset global retention manager (mainly for testing)"""
+    global _global_manager
+    _global_manager = None
--- a/transcript-fixer/scripts/utils/concurrency_manager.py
+++ b/transcript-fixer/scripts/utils/concurrency_manager.py
@@ -0,0 +1,524 @@
+#!/usr/bin/env python3
+"""
+Concurrency Management Module - Production-Grade Concurrent Request Handling
+
+CRITICAL FIX (P1-9): Tune concurrent request handling for optimal performance
+
+Features:
+- Semaphore-based request limiting
+- Circuit breaker pattern for fault tolerance
+- Backpressure handling
+- Request queue management
+- Integration with rate limiter
+- Concurrent operation monitoring
+- Adaptive concurrency tuning
+
+Use cases:
+- API request management
+- Database query concurrency
+- File operation limiting
+- Resource-intensive tasks
+"""
+
+from __future__ import annotations
+
+import asyncio
+import logging
+import time
+import threading
+from contextlib import asynccontextmanager, contextmanager
+from dataclasses import dataclass, field
+from datetime import datetime, timedelta
+from enum import Enum
+from typing import Optional, Dict, Any, Callable, TypeVar, Final
+from collections import deque
+
+logger = logging.getLogger(__name__)
+
+T = TypeVar('T')
+
+
+class CircuitState(Enum):
+    """Circuit breaker states"""
+    CLOSED = "closed"  # Normal operation
+    OPEN = "open"  # Failing, rejecting requests
+    HALF_OPEN = "half_open"  # Testing if service recovered
+
+
+@dataclass
+class ConcurrencyConfig:
+    """Configuration for concurrency management"""
+    max_concurrent: int = 10  # Maximum concurrent operations
+    max_queue_size: int = 100  # Maximum queued requests
+    timeout: float = 30.0  # Operation timeout in seconds
+    enable_backpressure: bool = True  # Enable backpressure when queue full
+    enable_circuit_breaker: bool = True  # Enable circuit breaker
+    circuit_failure_threshold: int = 5  # Failures before opening circuit
+    circuit_recovery_timeout: float = 60.0  # Seconds before attempting recovery
+    circuit_success_threshold: int = 2  # Successes needed to close circuit
+    enable_adaptive_tuning: bool = False  # Adjust concurrency based on performance
+    min_concurrent: int = 2  # Minimum concurrent (for adaptive tuning)
+    max_response_time: float = 5.0  # Target max response time (for adaptive tuning)
+
+
+@dataclass
+class ConcurrencyMetrics:
+    """Metrics for concurrency monitoring"""
+    total_requests: int = 0
+    successful_requests: int = 0
+    failed_requests: int = 0
+    rejected_requests: int = 0  # Rejected due to backpressure
+    timeout_requests: int = 0
+    active_operations: int = 0
+    queued_operations: int = 0
+    avg_response_time_ms: float = 0.0
+    current_concurrency: int = 0
+    circuit_state: CircuitState = CircuitState.CLOSED
+    circuit_failures: int = 0
+    last_updated: datetime = field(default_factory=datetime.now)
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Convert to dictionary"""
+        return {
+            'total_requests': self.total_requests,
+            'successful_requests': self.successful_requests,
+            'failed_requests': self.failed_requests,
+            'rejected_requests': self.rejected_requests,
+            'timeout_requests': self.timeout_requests,
+            'active_operations': self.active_operations,
+            'queued_operations': self.queued_operations,
+            'avg_response_time_ms': round(self.avg_response_time_ms, 2),
+            'current_concurrency': self.current_concurrency,
+            'circuit_state': self.circuit_state.value,
+            'circuit_failures': self.circuit_failures,
+            'success_rate': round(
+                self.successful_requests / max(self.total_requests, 1) * 100, 2
+            ),
+            'last_updated': self.last_updated.isoformat()
+        }
+
+
+class BackpressureError(Exception):
+    """Raised when backpressure limits are exceeded"""
+    pass
+
+
+class CircuitBreakerOpenError(Exception):
+    """Raised when circuit breaker is open"""
+    pass
+
+
+class ConcurrencyManager:
+    """
+    Production-grade concurrency management with advanced features
+
+    Features:
+    - Semaphore-based limiting (prevents resource exhaustion)
+    - Circuit breaker pattern (fault tolerance)
+    - Backpressure handling (graceful degradation)
+    - Request queue management (fairness)
+    - Performance monitoring (observability)
+    - Adaptive tuning (optimization)
+    """
+
+    def __init__(self, config: ConcurrencyConfig = None):
+        """
+        Initialize concurrency manager
+
+        Args:
+            config: Concurrency configuration
+        """
+        self.config = config or ConcurrencyConfig()
+
+        # Semaphore for concurrency limiting
+        self._semaphore = asyncio.Semaphore(self.config.max_concurrent)
+        self._sync_semaphore = threading.Semaphore(self.config.max_concurrent)
+
+        # Queue for pending requests
+        self._queue: deque = deque(maxlen=self.config.max_queue_size)
+        self._queue_lock = threading.Lock()
+
+        # Metrics tracking
+        self._metrics = ConcurrencyMetrics()
+        self._metrics.current_concurrency = self.config.max_concurrent
+        self._metrics_lock = threading.Lock()
+
+        # Response time tracking for adaptive tuning
+        self._response_times: deque = deque(maxlen=100)  # Last 100 responses
+        self._response_times_lock = threading.Lock()
+
+        # Circuit breaker state
+        self._circuit_state = CircuitState.CLOSED
+        self._circuit_failures = 0
+        self._circuit_last_failure_time: Optional[float] = None
+        self._circuit_successes = 0
+        self._circuit_lock = threading.Lock()
+
+        logger.info(f"ConcurrencyManager initialized: max_concurrent={self.config.max_concurrent}")
+
+    def _check_circuit_breaker(self) -> None:
+        """Check circuit breaker state and potentially transition"""
+        if not self.config.enable_circuit_breaker:
+            return
+
+        with self._circuit_lock:
+            if self._circuit_state == CircuitState.OPEN:
+                # Check if recovery timeout has elapsed
+                if self._circuit_last_failure_time:
+                    elapsed = time.time() - self._circuit_last_failure_time
+                    if elapsed >= self.config.circuit_recovery_timeout:
+                        logger.info("Circuit breaker: OPEN -> HALF_OPEN (recovery timeout elapsed)")
+                        self._circuit_state = CircuitState.HALF_OPEN
+                        self._circuit_successes = 0
+                    else:
+                        raise CircuitBreakerOpenError(
+                            f"Circuit breaker is OPEN. Retry after "
+                            f"{self.config.circuit_recovery_timeout - elapsed:.1f}s"
+                        )
+
+            elif self._circuit_state == CircuitState.HALF_OPEN:
+                # In half-open state, allow limited requests through
+                pass
+
+    def _record_success(self) -> None:
+        """Record successful operation for circuit breaker"""
+        if not self.config.enable_circuit_breaker:
+            return
+
+        with self._circuit_lock:
+            if self._circuit_state == CircuitState.HALF_OPEN:
+                self._circuit_successes += 1
+                if self._circuit_successes >= self.config.circuit_success_threshold:
+                    logger.info("Circuit breaker: HALF_OPEN -> CLOSED (recovered)")
+                    self._circuit_state = CircuitState.CLOSED
+                    self._circuit_failures = 0
+                    self._circuit_successes = 0
+
+    def _record_failure(self) -> None:
+        """Record failed operation for circuit breaker"""
+        if not self.config.enable_circuit_breaker:
+            return
+
+        with self._circuit_lock:
+            self._circuit_failures += 1
+            self._circuit_last_failure_time = time.time()
+
+            if self._circuit_state == CircuitState.CLOSED:
+                if self._circuit_failures >= self.config.circuit_failure_threshold:
+                    logger.warning(
+                        f"Circuit breaker: CLOSED -> OPEN "
+                        f"({self._circuit_failures} failures)"
+                    )
+                    self._circuit_state = CircuitState.OPEN
+                    with self._metrics_lock:
+                        self._metrics.circuit_state = CircuitState.OPEN
+
+            elif self._circuit_state == CircuitState.HALF_OPEN:
+                # Failure during recovery - back to OPEN
+                logger.warning("Circuit breaker: HALF_OPEN -> OPEN (recovery failed)")
+                self._circuit_state = CircuitState.OPEN
+                self._circuit_successes = 0
+
+    def _update_response_time(self, response_time_ms: float) -> None:
+        """Update response time metrics"""
+        with self._response_times_lock:
+            self._response_times.append(response_time_ms)
+
+            # Update average
+            if len(self._response_times) > 0:
+                avg = sum(self._response_times) / len(self._response_times)
+                with self._metrics_lock:
+                    self._metrics.avg_response_time_ms = avg
+
+    def _adjust_concurrency(self) -> None:
+        """Adaptive concurrency tuning based on performance"""
+        if not self.config.enable_adaptive_tuning:
+            return
+
+        with self._response_times_lock:
+            if len(self._response_times) < 10:
+                return  # Not enough data
+
+            avg_time = sum(self._response_times) / len(self._response_times)
+            target_time = self.config.max_response_time * 1000  # Convert to ms
+
+            current_concurrency = self.config.max_concurrent
+
+            if avg_time > target_time * 1.5:
+                # Response time too high - decrease concurrency
+                new_concurrency = max(
+                    self.config.min_concurrent,
+                    current_concurrency - 1
+                )
+                if new_concurrency != current_concurrency:
+                    logger.info(
+                        f"Adaptive tuning: Decreasing concurrency "
+                        f"{current_concurrency} -> {new_concurrency} "
+                        f"(avg response time: {avg_time:.1f}ms)"
+                    )
+                    self.config.max_concurrent = new_concurrency
+                    # Note: Can't easily adjust asyncio.Semaphore,
+                    # would need to recreate it
+
+            elif avg_time < target_time * 0.5:
+                # Response time low - can increase concurrency
+                new_concurrency = min(
+                    20,  # Hard cap
+                    current_concurrency + 1
+                )
+                if new_concurrency != current_concurrency:
+                    logger.info(
+                        f"Adaptive tuning: Increasing concurrency "
+                        f"{current_concurrency} -> {new_concurrency} "
+                        f"(avg response time: {avg_time:.1f}ms)"
+                    )
+                    self.config.max_concurrent = new_concurrency
+
+    @asynccontextmanager
+    async def acquire(self, timeout: Optional[float] = None):
+        """
+        Async context manager to acquire concurrency slot
+
+        Args:
+            timeout: Optional timeout override
+
+        Raises:
+            BackpressureError: If queue is full and backpressure is enabled
+            CircuitBreakerOpenError: If circuit breaker is open
+            asyncio.TimeoutError: If timeout exceeded
+
+        Example:
+            async with manager.acquire():
+                result = await some_async_operation()
+        """
+        timeout = timeout or self.config.timeout
+        start_time = time.time()
+
+        # Check circuit breaker
+        self._check_circuit_breaker()
+
+        # Check backpressure
+        if self.config.enable_backpressure:
+            with self._metrics_lock:
+                if self._metrics.queued_operations >= self.config.max_queue_size:
+                    self._metrics.rejected_requests += 1
+                    raise BackpressureError(
+                        f"Queue full ({self.config.max_queue_size} operations pending). "
+                        "Try again later."
+                    )
+
+        # Update queue metrics
+        with self._metrics_lock:
+            self._metrics.queued_operations += 1
+            self._metrics.total_requests += 1
+
+        try:
+            # Acquire semaphore with timeout
+            async with asyncio.timeout(timeout):
+                async with self._semaphore:
+                    # Update active metrics
+                    with self._metrics_lock:
+                        self._metrics.queued_operations -= 1
+                        self._metrics.active_operations += 1
+
+                    operation_start = time.time()
+
+                    try:
+                        yield
+
+                        # Record success
+                        response_time_ms = (time.time() - operation_start) * 1000
+                        self._update_response_time(response_time_ms)
+                        self._record_success()
+
+                        with self._metrics_lock:
+                            self._metrics.successful_requests += 1
+
+                    except Exception as e:
+                        # Record failure
+                        self._record_failure()
+
+                        with self._metrics_lock:
+                            self._metrics.failed_requests += 1
+
+                        raise
+
+                    finally:
+                        # Update active metrics
+                        with self._metrics_lock:
+                            self._metrics.active_operations -= 1
+
+                        # Adaptive tuning
+                        self._adjust_concurrency()
+
+        except asyncio.TimeoutError:
+            with self._metrics_lock:
+                self._metrics.timeout_requests += 1
+                self._metrics.queued_operations -= 1
+
+            elapsed = time.time() - start_time
+            raise asyncio.TimeoutError(
+                f"Operation timed out after {elapsed:.1f}s "
+                f"(timeout: {timeout}s)"
+            )
+
+    @contextmanager
+    def acquire_sync(self, timeout: Optional[float] = None):
+        """
+        Synchronous context manager to acquire concurrency slot
+
+        Args:
+            timeout: Optional timeout override
+
+        Example:
+            with manager.acquire_sync():
+                result = some_operation()
+        """
+        timeout = timeout or self.config.timeout
+        start_time = time.time()
+
+        # Check circuit breaker
+        self._check_circuit_breaker()
+
+        # Check backpressure
+        if self.config.enable_backpressure:
+            with self._metrics_lock:
+                if self._metrics.queued_operations >= self.config.max_queue_size:
+                    self._metrics.rejected_requests += 1
+                    raise BackpressureError(
+                        f"Queue full ({self.config.max_queue_size} operations pending)"
+                    )
+
+        # Update queue metrics
+        with self._metrics_lock:
+            self._metrics.queued_operations += 1
+            self._metrics.total_requests += 1
+
+        acquired = False
+        try:
+            # Acquire semaphore with timeout
+            acquired = self._sync_semaphore.acquire(timeout=timeout)
+
+            if not acquired:
+                raise TimeoutError(f"Failed to acquire semaphore within {timeout}s")
+
+            # Update active metrics
+            with self._metrics_lock:
+                self._metrics.queued_operations -= 1
+                self._metrics.active_operations += 1
+
+            operation_start = time.time()
+
+            try:
+                yield
+
+                # Record success
+                response_time_ms = (time.time() - operation_start) * 1000
+                self._update_response_time(response_time_ms)
+                self._record_success()
+
+                with self._metrics_lock:
+                    self._metrics.successful_requests += 1
+
+            except Exception as e:
+                # Record failure
+                self._record_failure()
+
+                with self._metrics_lock:
+                    self._metrics.failed_requests += 1
+
+                raise
+
+            finally:
+                # Update active metrics
+                with self._metrics_lock:
+                    self._metrics.active_operations -= 1
+
+        finally:
+            if acquired:
+                self._sync_semaphore.release()
+            else:
+                with self._metrics_lock:
+                    self._metrics.timeout_requests += 1
+                    self._metrics.queued_operations -= 1
+
+    def get_metrics(self) -> ConcurrencyMetrics:
+        """Get current concurrency metrics"""
+        with self._metrics_lock:
+            # Update circuit state
+            with self._circuit_lock:
+                self._metrics.circuit_state = self._circuit_state
+                self._metrics.circuit_failures = self._circuit_failures
+
+            self._metrics.last_updated = datetime.now()
+            return ConcurrencyMetrics(**self._metrics.__dict__)
+
+    def reset_circuit_breaker(self) -> None:
+        """Manually reset circuit breaker to CLOSED state"""
+        with self._circuit_lock:
+            logger.info("Manually resetting circuit breaker to CLOSED")
+            self._circuit_state = CircuitState.CLOSED
+            self._circuit_failures = 0
+            self._circuit_successes = 0
+            self._circuit_last_failure_time = None
+
+    def get_status(self) -> Dict[str, Any]:
+        """Get human-readable status"""
+        metrics = self.get_metrics()
+
+        return {
+            'status': 'healthy' if metrics.circuit_state == CircuitState.CLOSED else 'degraded',
+            'concurrency': {
+                'current': metrics.current_concurrency,
+                'active': metrics.active_operations,
+                'queued': metrics.queued_operations,
+            },
+            'performance': {
+                'avg_response_time_ms': metrics.avg_response_time_ms,
+                'success_rate': round(
+                    metrics.successful_requests / max(metrics.total_requests, 1) * 100, 2
+                )
+            },
+            'circuit_breaker': {
+                'state': metrics.circuit_state.value,
+                'failures': metrics.circuit_failures,
+            },
+            'requests': {
+                'total': metrics.total_requests,
+                'successful': metrics.successful_requests,
+                'failed': metrics.failed_requests,
+                'rejected': metrics.rejected_requests,
+                'timeout': metrics.timeout_requests,
+            }
+        }
+
+
+# Global instance for convenience
+_global_manager: Optional[ConcurrencyManager] = None
+_global_manager_lock = threading.Lock()
+
+
+def get_concurrency_manager(config: Optional[ConcurrencyConfig] = None) -> ConcurrencyManager:
+    """
+    Get global concurrency manager instance (singleton pattern)
+
+    Args:
+        config: Optional configuration (only used on first call)
+
+    Returns:
+        Global ConcurrencyManager instance
+    """
+    global _global_manager
+
+    with _global_manager_lock:
+        if _global_manager is None:
+            _global_manager = ConcurrencyManager(config)
+        return _global_manager
+
+
+def reset_concurrency_manager() -> None:
+    """Reset global concurrency manager (mainly for testing)"""
+    global _global_manager
+
+    with _global_manager_lock:
+        _global_manager = None
--- a/transcript-fixer/scripts/utils/config.py
+++ b/transcript-fixer/scripts/utils/config.py
@@ -0,0 +1,538 @@
+#!/usr/bin/env python3
+"""
+Configuration Management Module
+
+CRITICAL FIX (P1-5): Production-grade configuration management
+
+Features:
+- Centralized configuration (single source of truth)
+- Environment-based config (dev/staging/prod)
+- Type-safe access with validation
+- Multiple config sources (env vars, files, defaults)
+- Config schema validation
+- Secure secrets management
+
+Use cases:
+- Application configuration
+- Environment-specific settings
+- API keys and secrets management
+- Path configuration
+- Feature flags
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+import os
+from dataclasses import dataclass, field
+from enum import Enum
+from pathlib import Path
+from typing import Optional, Dict, Any, Final
+
+logger = logging.getLogger(__name__)
+
+
+class Environment(Enum):
+    """Application environment"""
+    DEVELOPMENT = "development"
+    STAGING = "staging"
+    PRODUCTION = "production"
+    TEST = "test"
+
+
+@dataclass
+class DatabaseConfig:
+    """Database configuration"""
+    path: Path
+    max_connections: int = 5
+    connection_timeout: float = 30.0
+    enable_wal_mode: bool = True  # Write-Ahead Logging for better concurrency
+
+    def __post_init__(self):
+        """Validate database configuration"""
+        if self.max_connections <= 0:
+            raise ValueError("max_connections must be positive")
+        if self.connection_timeout <= 0:
+            raise ValueError("connection_timeout must be positive")
+
+        # Ensure database directory exists
+        self.path = Path(self.path)
+        self.path.parent.mkdir(parents=True, exist_ok=True)
+
+
+@dataclass
+class APIConfig:
+    """API configuration"""
+    api_key: Optional[str] = None
+    base_url: Optional[str] = None
+    timeout: float = 60.0
+    max_retries: int = 3
+    retry_backoff: float = 1.0  # Exponential backoff base (seconds)
+
+    def __post_init__(self):
+        """Validate API configuration"""
+        if self.timeout <= 0:
+            raise ValueError("timeout must be positive")
+        if self.max_retries < 0:
+            raise ValueError("max_retries must be non-negative")
+        if self.retry_backoff < 0:
+            raise ValueError("retry_backoff must be non-negative")
+
+
+@dataclass
+class PathConfig:
+    """Path configuration"""
+    config_dir: Path
+    data_dir: Path
+    log_dir: Path
+    cache_dir: Path
+
+    def __post_init__(self):
+        """Validate and create directories"""
+        self.config_dir = Path(self.config_dir)
+        self.data_dir = Path(self.data_dir)
+        self.log_dir = Path(self.log_dir)
+        self.cache_dir = Path(self.cache_dir)
+
+        # Create all directories
+        for dir_path in [self.config_dir, self.data_dir, self.log_dir, self.cache_dir]:
+            dir_path.mkdir(parents=True, exist_ok=True)
+
+
+@dataclass
+class ResourceLimits:
+    """Resource limits configuration"""
+    max_text_length: int = 1_000_000  # 1MB max text
+    max_file_size: int = 10_000_000  # 10MB max file
+    max_concurrent_tasks: int = 10
+    max_memory_mb: int = 512
+    rate_limit_requests: int = 100
+    rate_limit_window_seconds: float = 60.0
+
+    def __post_init__(self):
+        """Validate resource limits"""
+        if self.max_text_length <= 0:
+            raise ValueError("max_text_length must be positive")
+        if self.max_file_size <= 0:
+            raise ValueError("max_file_size must be positive")
+        if self.max_concurrent_tasks <= 0:
+            raise ValueError("max_concurrent_tasks must be positive")
+
+
+@dataclass
+class FeatureFlags:
+    """Feature flags for conditional functionality"""
+    enable_learning: bool = True
+    enable_metrics: bool = True
+    enable_health_checks: bool = True
+    enable_rate_limiting: bool = True
+    enable_caching: bool = True
+    enable_auto_approval: bool = False  # Auto-approve learned suggestions
+
+
+@dataclass
+class Config:
+    """
+    Main configuration class - Single source of truth for all configuration.
+
+    Configuration precedence (highest to lowest):
+    1. Environment variables
+    2. Config file (if provided)
+    3. Default values
+    """
+
+    # Environment
+    environment: Environment = Environment.DEVELOPMENT
+
+    # Sub-configurations
+    database: DatabaseConfig = field(default_factory=lambda: DatabaseConfig(
+        path=Path.home() / ".transcript-fixer" / "corrections.db"
+    ))
+    api: APIConfig = field(default_factory=APIConfig)
+    paths: PathConfig = field(default_factory=lambda: PathConfig(
+        config_dir=Path.home() / ".transcript-fixer",
+        data_dir=Path.home() / ".transcript-fixer" / "data",
+        log_dir=Path.home() / ".transcript-fixer" / "logs",
+        cache_dir=Path.home() / ".transcript-fixer" / "cache",
+    ))
+    resources: ResourceLimits = field(default_factory=ResourceLimits)
+    features: FeatureFlags = field(default_factory=FeatureFlags)
+
+    # Application metadata
+    app_name: str = "transcript-fixer"
+    app_version: str = "1.0.0"
+    debug: bool = False
+
+    def __post_init__(self):
+        """Post-initialization validation"""
+        logger.debug(f"Config initialized for environment: {self.environment.value}")
+
+    @classmethod
+    def from_env(cls) -> Config:
+        """
+        Create configuration from environment variables.
+
+        Environment variables:
+            - TRANSCRIPT_FIXER_ENV: Environment (development/staging/production)
+            - TRANSCRIPT_FIXER_CONFIG_DIR: Config directory path
+            - TRANSCRIPT_FIXER_DB_PATH: Database path
+            - GLM_API_KEY: API key for GLM service
+            - ANTHROPIC_API_KEY: Alternative API key
+            - ANTHROPIC_BASE_URL: API base URL
+            - TRANSCRIPT_FIXER_DEBUG: Enable debug mode (1/true/yes)
+
+        Returns:
+            Config instance with values from environment variables
+        """
+        # Parse environment
+        env_str = os.getenv("TRANSCRIPT_FIXER_ENV", "development").lower()
+        try:
+            environment = Environment(env_str)
+        except ValueError:
+            logger.warning(f"Invalid environment '{env_str}', defaulting to development")
+            environment = Environment.DEVELOPMENT
+
+        # Parse debug flag
+        debug_str = os.getenv("TRANSCRIPT_FIXER_DEBUG", "0").lower()
+        debug = debug_str in ("1", "true", "yes", "on")
+
+        # Parse paths
+        config_dir = Path(os.getenv(
+            "TRANSCRIPT_FIXER_CONFIG_DIR",
+            str(Path.home() / ".transcript-fixer")
+        ))
+
+        # Database config
+        db_path = Path(os.getenv(
+            "TRANSCRIPT_FIXER_DB_PATH",
+            str(config_dir / "corrections.db")
+        ))
+        db_max_connections = int(os.getenv("TRANSCRIPT_FIXER_DB_MAX_CONNECTIONS", "5"))
+
+        database = DatabaseConfig(
+            path=db_path,
+            max_connections=db_max_connections,
+        )
+
+        # API config
+        api_key = os.getenv("GLM_API_KEY") or os.getenv("ANTHROPIC_API_KEY")
+        base_url = os.getenv("ANTHROPIC_BASE_URL")
+        api_timeout = float(os.getenv("TRANSCRIPT_FIXER_API_TIMEOUT", "60.0"))
+
+        api = APIConfig(
+            api_key=api_key,
+            base_url=base_url,
+            timeout=api_timeout,
+        )
+
+        # Path config
+        paths = PathConfig(
+            config_dir=config_dir,
+            data_dir=config_dir / "data",
+            log_dir=config_dir / "logs",
+            cache_dir=config_dir / "cache",
+        )
+
+        # Resource limits
+        resources = ResourceLimits(
+            max_concurrent_tasks=int(os.getenv("TRANSCRIPT_FIXER_MAX_CONCURRENT", "10")),
+            rate_limit_requests=int(os.getenv("TRANSCRIPT_FIXER_RATE_LIMIT", "100")),
+        )
+
+        # Feature flags
+        features = FeatureFlags(
+            enable_learning=os.getenv("TRANSCRIPT_FIXER_ENABLE_LEARNING", "1") != "0",
+            enable_metrics=os.getenv("TRANSCRIPT_FIXER_ENABLE_METRICS", "1") != "0",
+            enable_auto_approval=os.getenv("TRANSCRIPT_FIXER_AUTO_APPROVE", "0") == "1",
+        )
+
+        return cls(
+            environment=environment,
+            database=database,
+            api=api,
+            paths=paths,
+            resources=resources,
+            features=features,
+            debug=debug,
+        )
+
+    @classmethod
+    def from_file(cls, config_path: Path) -> Config:
+        """
+        Load configuration from JSON file.
+
+        Args:
+            config_path: Path to JSON config file
+
+        Returns:
+            Config instance with values from file
+
+        Raises:
+            FileNotFoundError: If config file doesn't exist
+            ValueError: If config file is invalid
+        """
+        config_path = Path(config_path)
+
+        if not config_path.exists():
+            raise FileNotFoundError(f"Config file not found: {config_path}")
+
+        try:
+            with open(config_path, 'r', encoding='utf-8') as f:
+                data = json.load(f)
+        except json.JSONDecodeError as e:
+            raise ValueError(f"Invalid JSON in config file: {e}")
+
+        # Parse environment
+        env_str = data.get("environment", "development")
+        try:
+            environment = Environment(env_str)
+        except ValueError:
+            logger.warning(f"Invalid environment '{env_str}', defaulting to development")
+            environment = Environment.DEVELOPMENT
+
+        # Parse database config
+        db_data = data.get("database", {})
+        database = DatabaseConfig(
+            path=Path(db_data.get("path", str(Path.home() / ".transcript-fixer" / "corrections.db"))),
+            max_connections=db_data.get("max_connections", 5),
+            connection_timeout=db_data.get("connection_timeout", 30.0),
+        )
+
+        # Parse API config
+        api_data = data.get("api", {})
+        api = APIConfig(
+            api_key=api_data.get("api_key"),
+            base_url=api_data.get("base_url"),
+            timeout=api_data.get("timeout", 60.0),
+            max_retries=api_data.get("max_retries", 3),
+        )
+
+        # Parse path config
+        paths_data = data.get("paths", {})
+        config_dir = Path(paths_data.get("config_dir", str(Path.home() / ".transcript-fixer")))
+        paths = PathConfig(
+            config_dir=config_dir,
+            data_dir=Path(paths_data.get("data_dir", str(config_dir / "data"))),
+            log_dir=Path(paths_data.get("log_dir", str(config_dir / "logs"))),
+            cache_dir=Path(paths_data.get("cache_dir", str(config_dir / "cache"))),
+        )
+
+        # Parse resource limits
+        resources_data = data.get("resources", {})
+        resources = ResourceLimits(
+            max_text_length=resources_data.get("max_text_length", 1_000_000),
+            max_file_size=resources_data.get("max_file_size", 10_000_000),
+            max_concurrent_tasks=resources_data.get("max_concurrent_tasks", 10),
+        )
+
+        # Parse feature flags
+        features_data = data.get("features", {})
+        features = FeatureFlags(
+            enable_learning=features_data.get("enable_learning", True),
+            enable_metrics=features_data.get("enable_metrics", True),
+            enable_auto_approval=features_data.get("enable_auto_approval", False),
+        )
+
+        return cls(
+            environment=environment,
+            database=database,
+            api=api,
+            paths=paths,
+            resources=resources,
+            features=features,
+            debug=data.get("debug", False),
+        )
+
+    def save_to_file(self, config_path: Path) -> None:
+        """
+        Save configuration to JSON file.
+
+        Args:
+            config_path: Path to save config file
+        """
+        config_path = Path(config_path)
+        config_path.parent.mkdir(parents=True, exist_ok=True)
+
+        data = {
+            "environment": self.environment.value,
+            "database": {
+                "path": str(self.database.path),
+                "max_connections": self.database.max_connections,
+                "connection_timeout": self.database.connection_timeout,
+            },
+            "api": {
+                "api_key": self.api.api_key,
+                "base_url": self.api.base_url,
+                "timeout": self.api.timeout,
+                "max_retries": self.api.max_retries,
+            },
+            "paths": {
+                "config_dir": str(self.paths.config_dir),
+                "data_dir": str(self.paths.data_dir),
+                "log_dir": str(self.paths.log_dir),
+                "cache_dir": str(self.paths.cache_dir),
+            },
+            "resources": {
+                "max_text_length": self.resources.max_text_length,
+                "max_file_size": self.resources.max_file_size,
+                "max_concurrent_tasks": self.resources.max_concurrent_tasks,
+            },
+            "features": {
+                "enable_learning": self.features.enable_learning,
+                "enable_metrics": self.features.enable_metrics,
+                "enable_auto_approval": self.features.enable_auto_approval,
+            },
+            "debug": self.debug,
+        }
+
+        with open(config_path, 'w', encoding='utf-8') as f:
+            json.dump(data, f, indent=2, ensure_ascii=False)
+
+        logger.info(f"Configuration saved to {config_path}")
+
+    def validate(self) -> tuple[list[str], list[str]]:
+        """
+        Validate configuration completeness and correctness.
+
+        Returns:
+            Tuple of (errors, warnings)
+        """
+        errors = []
+        warnings = []
+
+        # Check API key for production
+        if self.environment == Environment.PRODUCTION:
+            if not self.api.api_key:
+                errors.append("API key is required in production environment")
+        elif not self.api.api_key:
+            warnings.append("API key not set (required for AI corrections)")
+
+        # Check database path
+        if not self.database.path.parent.exists():
+            errors.append(f"Database directory doesn't exist: {self.database.path.parent}")
+
+        # Check paths exist
+        for name, path in [
+            ("config_dir", self.paths.config_dir),
+            ("data_dir", self.paths.data_dir),
+            ("log_dir", self.paths.log_dir),
+        ]:
+            if not path.exists():
+                warnings.append(f"{name} doesn't exist: {path}")
+
+        # Check resource limits are reasonable
+        if self.resources.max_concurrent_tasks > 50:
+            warnings.append(f"max_concurrent_tasks is very high: {self.resources.max_concurrent_tasks}")
+
+        return errors, warnings
+
+    def get_database_url(self) -> str:
+        """Get database connection URL"""
+        return f"sqlite:///{self.database.path}"
+
+    def is_production(self) -> bool:
+        """Check if running in production"""
+        return self.environment == Environment.PRODUCTION
+
+    def is_development(self) -> bool:
+        """Check if running in development"""
+        return self.environment == Environment.DEVELOPMENT
+
+
+# Global configuration instance
+_config: Optional[Config] = None
+
+
+def get_config() -> Config:
+    """
+    Get global configuration instance (singleton pattern).
+
+    Returns:
+        Config instance loaded from environment variables
+    """
+    global _config
+
+    if _config is None:
+        # Load from environment by default
+        _config = Config.from_env()
+        logger.info(f"Configuration loaded: {_config.environment.value}")
+
+        # Validate
+        errors, warnings = _config.validate()
+        if errors:
+            logger.error(f"Configuration errors: {errors}")
+        if warnings:
+            logger.warning(f"Configuration warnings: {warnings}")
+
+    return _config
+
+
+def set_config(config: Config) -> None:
+    """
+    Set global configuration instance (for testing or manual config).
+
+    Args:
+        config: Config instance to set globally
+    """
+    global _config
+    _config = config
+    logger.info(f"Configuration set: {config.environment.value}")
+
+
+def reset_config() -> None:
+    """Reset global configuration (mainly for testing)"""
+    global _config
+    _config = None
+    logger.debug("Configuration reset")
+
+
+# Example configuration file template
+CONFIG_FILE_TEMPLATE: Final[str] = """{
+  "environment": "development",
+  "database": {
+    "path": "~/.transcript-fixer/corrections.db",
+    "max_connections": 5,
+    "connection_timeout": 30.0
+  },
+  "api": {
+    "api_key": "your-api-key-here",
+    "base_url": null,
+    "timeout": 60.0,
+    "max_retries": 3
+  },
+  "paths": {
+    "config_dir": "~/.transcript-fixer",
+    "data_dir": "~/.transcript-fixer/data",
+    "log_dir": "~/.transcript-fixer/logs",
+    "cache_dir": "~/.transcript-fixer/cache"
+  },
+  "resources": {
+    "max_text_length": 1000000,
+    "max_file_size": 10000000,
+    "max_concurrent_tasks": 10
+  },
+  "features": {
+    "enable_learning": true,
+    "enable_metrics": true,
+    "enable_auto_approval": false
+  },
+  "debug": false
+}
+"""
+
+
+def create_example_config(output_path: Path) -> None:
+    """
+    Create example configuration file.
+
+    Args:
+        output_path: Path to write example config
+    """
+    output_path = Path(output_path)
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+
+    with open(output_path, 'w', encoding='utf-8') as f:
+        f.write(CONFIG_FILE_TEMPLATE)
+
+    logger.info(f"Example config created: {output_path}")
--- a/transcript-fixer/scripts/utils/database_migration.py
+++ b/transcript-fixer/scripts/utils/database_migration.py
@@ -0,0 +1,567 @@
+#!/usr/bin/env python3
+"""
+Database Migration Module - Production-Grade Migration Strategy
+
+CRITICAL FIX (P1-6): Production database migration system
+
+Features:
+- Versioned migrations with forward and rollback capability
+- Migration history tracking
+- Atomic transactions with rollback support
+- Dry-run mode for testing
+- Migration validation and verification
+- Backward compatibility checks
+
+Migration Types:
+- Forward: Apply new schema changes
+- Rollback: Revert to previous version
+- Validation: Check migration safety
+- Dry-run: Test migrations without applying
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+import sqlite3
+from datetime import datetime
+from enum import Enum
+from pathlib import Path
+from typing import Dict, List, Optional, Tuple, Any, Callable
+from contextlib import contextmanager
+from dataclasses import dataclass, asdict
+import hashlib
+
+logger = logging.getLogger(__name__)
+
+
+class MigrationDirection(Enum):
+    """Migration direction"""
+    FORWARD = "forward"
+    BACKWARD = "backward"
+
+
+class MigrationStatus(Enum):
+    """Migration execution status"""
+    PENDING = "pending"
+    RUNNING = "running"
+    COMPLETED = "completed"
+    FAILED = "failed"
+    ROLLED_BACK = "rolled_back"
+
+
+@dataclass
+class Migration:
+    """Migration definition"""
+    version: str
+    name: str
+    description: str
+    forward_sql: str
+    backward_sql: Optional[str] = None  # For rollback capability
+    dependencies: List[str] = None  # List of required migration versions
+    check_function: Optional[Callable] = None  # Validation function
+    is_breaking: bool = False  # If True, requires explicit confirmation
+
+    def __post_init__(self):
+        if self.dependencies is None:
+            self.dependencies = []
+
+    def get_hash(self) -> str:
+        """Get hash of migration content for integrity checking"""
+        content = f"{self.version}:{self.name}:{self.forward_sql}"
+        return hashlib.sha256(content.encode('utf-8')).hexdigest()
+
+
+@dataclass
+class MigrationRecord:
+    """Migration execution record"""
+    id: int
+    version: str
+    name: str
+    status: MigrationStatus
+    direction: MigrationDirection
+    execution_time_ms: int
+    checksum: str
+    executed_at: str = ""
+    error_message: Optional[str] = None
+    details: Optional[Dict[str, Any]] = None
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Convert to dictionary for serialization"""
+        result = asdict(self)
+        result['status'] = self.status.value
+        result['direction'] = self.direction.value
+        return result
+
+
+class DatabaseMigrationManager:
+    """
+    Production-grade database migration manager
+
+    Handles versioned schema migrations with:
+    - Automatic rollback on failure
+    - Migration history tracking
+    - Dependency resolution
+    - Safety checks and validation
+    """
+
+    def __init__(self, db_path: Path):
+        """
+        Initialize migration manager
+
+        Args:
+            db_path: Path to SQLite database file
+        """
+        self.db_path = Path(db_path)
+        self.migrations: Dict[str, Migration] = {}
+        self._ensure_migration_table()
+
+    def register_migration(self, migration: Migration) -> None:
+        """
+        Register a migration definition
+
+        Args:
+            migration: Migration to register
+        """
+        if migration.version in self.migrations:
+            raise ValueError(f"Migration version {migration.version} already registered")
+
+        # Validate dependencies exist
+        for dep_version in migration.dependencies:
+            if dep_version not in self.migrations:
+                raise ValueError(f"Dependency migration {dep_version} not found")
+
+        self.migrations[migration.version] = migration
+        logger.info(f"Registered migration {migration.version}: {migration.name}")
+
+    def _ensure_migration_table(self) -> None:
+        """Create migration tracking table if not exists"""
+        with self._get_connection() as conn:
+            cursor = conn.cursor()
+
+            # Create migration history table
+            cursor.execute('''
+                CREATE TABLE IF NOT EXISTS schema_migrations (
+                    id INTEGER PRIMARY KEY AUTOINCREMENT,
+                    version TEXT NOT NULL UNIQUE,
+                    name TEXT NOT NULL,
+                    status TEXT NOT NULL CHECK(status IN ('pending', 'running', 'completed', 'failed', 'rolled_back')),
+                    direction TEXT NOT NULL CHECK(direction IN ('forward', 'backward')),
+                    execution_time_ms INTEGER NOT NULL CHECK(execution_time_ms >= 0),
+                    checksum TEXT NOT NULL,
+                    executed_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
+                    error_message TEXT,
+                    details TEXT
+                )
+            ''')
+
+            # Create index for faster queries
+            cursor.execute('''
+                CREATE INDEX IF NOT EXISTS idx_migrations_version
+                ON schema_migrations(version)
+            ''')
+
+            cursor.execute('''
+                CREATE INDEX IF NOT EXISTS idx_migrations_executed_at
+                ON schema_migrations(executed_at DESC)
+            ''')
+
+            # Insert initial migration record if table is empty
+            cursor.execute('''
+                INSERT OR IGNORE INTO schema_migrations
+                (version, name, status, direction, execution_time_ms, checksum)
+                VALUES ('0.0', 'Initial empty schema', 'completed', 'forward', 0, 'empty')
+            ''')
+
+            conn.commit()
+
+    @contextmanager
+    def _get_connection(self):
+        """Get database connection with proper error handling"""
+        conn = sqlite3.connect(str(self.db_path))
+        conn.execute("PRAGMA foreign_keys = ON")
+        try:
+            yield conn
+        finally:
+            conn.close()
+
+    @contextmanager
+    def _transaction(self):
+        """Context manager for database transactions"""
+        with self._get_connection() as conn:
+            cursor = conn.cursor()
+            cursor.execute("BEGIN")
+            try:
+                yield cursor
+                conn.commit()
+            except Exception:
+                conn.rollback()
+                raise
+
+    def get_current_version(self) -> str:
+        """
+        Get current database schema version
+
+        Returns:
+            Current version string
+        """
+        with self._get_connection() as conn:
+            cursor = conn.cursor()
+            cursor.execute('''
+                SELECT version FROM schema_migrations
+                WHERE status = 'completed' AND direction = 'forward'
+                ORDER BY executed_at DESC LIMIT 1
+            ''')
+            result = cursor.fetchone()
+            return result[0] if result else "0.0"
+
+    def get_migration_history(self) -> List[MigrationRecord]:
+        """
+        Get migration execution history
+
+        Returns:
+            List of migration records, most recent first
+        """
+        with self._get_connection() as conn:
+            cursor = conn.cursor()
+            cursor.execute('''
+                SELECT id, version, name, status, direction,
+                       execution_time_ms, checksum, error_message,
+                       executed_at, details
+                FROM schema_migrations
+                ORDER BY executed_at DESC
+            ''')
+
+            records = []
+            for row in cursor.fetchall():
+                record = MigrationRecord(
+                    id=row[0],
+                    version=row[1],
+                    name=row[2],
+                    status=MigrationStatus(row[3]),
+                    direction=MigrationDirection(row[4]),
+                    execution_time_ms=row[5],
+                    checksum=row[6],
+                    error_message=row[7],
+                    executed_at=row[8],
+                    details=json.loads(row[9]) if row[9] else None
+                )
+                records.append(record)
+
+            return records
+
+    def _validate_migration(self, migration: Migration) -> Tuple[bool, List[str]]:
+        """
+        Validate migration safety
+
+        Args:
+            migration: Migration to validate
+
+        Returns:
+            Tuple of (is_valid, error_messages)
+        """
+        errors = []
+
+        # Check migration hash
+        if migration.get_hash() != migration.get_hash():  # Simple consistency check
+            errors.append("Migration content is inconsistent")
+
+        # Run custom validation function if provided
+        if migration.check_function:
+            try:
+                with self._get_connection() as conn:
+                    is_valid, validation_error = migration.check_function(conn, migration)
+                    if not is_valid:
+                        errors.append(validation_error)
+            except Exception as e:
+                errors.append(f"Validation function failed: {e}")
+
+        return len(errors) == 0, errors
+
+    def _execute_migration_sql(self, cursor: sqlite3.Cursor, sql: str) -> None:
+        """
+        Execute migration SQL safely
+
+        Args:
+            cursor: Database cursor
+            sql: SQL to execute
+        """
+        # Split SQL into individual statements
+        statements = [s.strip() for s in sql.split(';') if s.strip()]
+
+        for statement in statements:
+            if statement:
+                cursor.execute(statement)
+
+    def _run_migration(self, migration: Migration, direction: MigrationDirection,
+                     dry_run: bool = False) -> None:
+        """
+        Run a single migration
+
+        Args:
+            migration: Migration to run
+            direction: Migration direction
+            dry_run: If True, only validate without executing
+        """
+        start_time = datetime.now()
+
+        # Select appropriate SQL
+        if direction == MigrationDirection.FORWARD:
+            sql = migration.forward_sql
+        elif direction == MigrationDirection.BACKWARD:
+            if not migration.backward_sql:
+                raise ValueError(f"Migration {migration.version} cannot be rolled back")
+            sql = migration.backward_sql
+        else:
+            raise ValueError(f"Invalid migration direction: {direction}")
+
+        # Validate migration
+        is_valid, errors = self._validate_migration(migration)
+        if not is_valid:
+            raise ValueError(f"Migration validation failed: {'; '.join(errors)}")
+
+        if dry_run:
+            logger.info(f"[DRY RUN] Would apply {direction.value} migration {migration.version}")
+            return
+
+        # Record migration start
+        with self._transaction() as cursor:
+            # Insert running record
+            cursor.execute('''
+                INSERT INTO schema_migrations
+                (version, name, status, direction, execution_time_ms, checksum)
+                VALUES (?, ?, 'running', ?, 0, ?)
+            ''', (migration.version, migration.name, direction.value, migration.get_hash()))
+
+            # Execute migration
+            try:
+                self._execute_migration_sql(cursor, sql)
+
+                # Calculate execution time
+                execution_time_ms = int((datetime.now() - start_time).total_seconds() * 1000)
+
+                # Update record as completed
+                cursor.execute('''
+                    UPDATE schema_migrations
+                    SET status = 'completed', execution_time_ms = ?
+                    WHERE version = ? AND status = 'running' AND direction = ?
+                    ORDER BY executed_at DESC LIMIT 1
+                ''', (execution_time_ms, migration.version, direction.value))
+
+                logger.info(f"Successfully applied {direction.value} migration {migration.version} "
+                           f"in {execution_time_ms}ms")
+
+            except Exception as e:
+                execution_time_ms = int((datetime.now() - start_time).total_seconds() * 1000)
+
+                # Update record as failed
+                cursor.execute('''
+                    UPDATE schema_migrations
+                    SET status = 'failed', error_message = ?
+                    WHERE version = ? AND status = 'running' AND direction = ?
+                    ORDER BY executed_at DESC LIMIT 1
+                ''', (str(e), migration.version, direction.value))
+
+                logger.error(f"Migration {migration.version} failed: {e}")
+                raise RuntimeError(f"Migration {migration.version} failed: {e}")
+
+    def get_pending_migrations(self) -> List[Migration]:
+        """
+        Get list of pending migrations
+
+        Returns:
+            List of migrations that need to be applied
+        """
+        current_version = self.get_current_version()
+        pending = []
+
+        # Get all migration versions
+        all_versions = sorted(self.migrations.keys(), key=lambda v: tuple(map(int, v.split('.'))))
+
+        for version in all_versions:
+            if version > current_version:
+                migration = self.migrations[version]
+                pending.append(migration)
+
+        return pending
+
+    def migrate_to_version(self, target_version: str, dry_run: bool = False,
+                          force: bool = False) -> None:
+        """
+        Migrate database to target version
+
+        Args:
+            target_version: Target version to migrate to
+            dry_run: If True, only validate without executing
+            force: If True, skip breaking change confirmation
+        """
+        current_version = self.get_current_version()
+        logger.info(f"Current version: {current_version}, Target version: {target_version}")
+
+        # Validate target version exists
+        if target_version != "latest" and target_version not in self.migrations:
+            raise ValueError(f"Target version {target_version} not found")
+
+        # Determine migration path
+        if target_version == "latest":
+            # Migrate forward to latest
+            target_migration = max(self.migrations.keys(), key=lambda v: tuple(map(int, v.split('.'))))
+        else:
+            target_migration = target_version
+
+        if target_migration > current_version:
+            # Forward migration
+            self._migrate_forward(current_version, target_migration, dry_run, force)
+        elif target_migration < current_version:
+            # Rollback
+            self._migrate_backward(current_version, target_migration, dry_run, force)
+        else:
+            logger.info("Database is already at target version")
+
+    def _migrate_forward(self, from_version: str, to_version: str,
+                         dry_run: bool = False, force: bool = False) -> None:
+        """Execute forward migrations"""
+        all_versions = sorted(self.migrations.keys(), key=lambda v: tuple(map(int, v.split('.'))))
+
+        for version in all_versions:
+            if version > from_version and version <= to_version:
+                migration = self.migrations[version]
+
+                # Check for breaking changes
+                if migration.is_breaking and not force:
+                    raise RuntimeError(
+                        f"Migration {migration.version} is a breaking change. "
+                        f"Use --force to apply."
+                    )
+
+                # Check dependencies
+                for dep in migration.dependencies:
+                    if dep > from_version:
+                        raise RuntimeError(
+                            f"Migration {migration.version} requires dependency {dep} "
+                            f"which is not yet applied"
+                        )
+
+                self._run_migration(migration, MigrationDirection.FORWARD, dry_run)
+
+    def _migrate_backward(self, from_version: str, to_version: str,
+                          dry_run: bool = False, force: bool = False) -> None:
+        """Execute rollback migrations"""
+        all_versions = sorted(self.migrations.keys(), key=lambda v: tuple(map(int, v.split('.'))), reverse=True)
+
+        for version in all_versions:
+            if version <= from_version and version > to_version:
+                migration = self.migrations[version]
+
+                if not migration.backward_sql:
+                    raise RuntimeError(f"Migration {migration.version} cannot be rolled back")
+
+                # Check if migration would break other migrations
+                dependent_migrations = [
+                    v for v, m in self.migrations.items()
+                    if version in m.dependencies and v <= from_version
+                ]
+                if dependent_migrations and not force:
+                    raise RuntimeError(
+                        f"Cannot rollback {version} because it has dependencies: "
+                        f"{', '.join(dependent_migrations)}"
+                    )
+
+                self._run_migration(migration, MigrationDirection.BACKWARD, dry_run)
+
+    def rollback_migration(self, version: str, dry_run: bool = False,
+                          force: bool = False) -> None:
+        """
+        Rollback a specific migration
+
+        Args:
+            version: Migration version to rollback
+            dry_run: If True, only validate without executing
+            force: If True, skip safety checks
+        """
+        if version not in self.migrations:
+            raise ValueError(f"Migration {version} not found")
+
+        migration = self.migrations[version]
+        if not migration.backward_sql:
+            raise ValueError(f"Migration {version} cannot be rolled back")
+
+        # Check if migration has been applied
+        history = self.get_migration_history()
+        applied_versions = [m.version for m in history if m.status == MigrationStatus.COMPLETED]
+
+        if version not in applied_versions:
+            raise ValueError(f"Migration {version} has not been applied")
+
+        # Check for dependent migrations
+        dependent_migrations = [
+            v for v, m in self.migrations.items()
+            if version in m.dependencies and v in applied_versions
+        ]
+        if dependent_migrations and not force:
+            raise RuntimeError(
+                f"Cannot rollback {version} because it has dependencies: "
+                f"{', '.join(dependent_migrations)}"
+            )
+
+        logger.info(f"Rolling back migration {version}")
+        self._run_migration(migration, MigrationDirection.BACKWARD, dry_run)
+
+    def get_migration_plan(self, target_version: str = "latest") -> List[Dict[str, Any]]:
+        """
+        Get migration execution plan
+
+        Args:
+            target_version: Target version to plan for
+
+        Returns:
+            List of migration steps with details
+        """
+        current_version = self.get_current_version()
+        plan = []
+
+        if target_version == "latest":
+            target_version = max(self.migrations.keys(), key=lambda v: tuple(map(int, v.split('.'))))
+
+        all_versions = sorted(self.migrations.keys(), key=lambda v: tuple(map(int, v.split('.'))))
+
+        for version in all_versions:
+            if version > current_version and version <= target_version:
+                migration = self.migrations[version]
+                step = {
+                    'version': version,
+                    'name': migration.name,
+                    'description': migration.description,
+                    'is_breaking': migration.is_breaking,
+                    'dependencies': migration.dependencies,
+                    'has_rollback': migration.backward_sql is not None
+                }
+                plan.append(step)
+
+        return plan
+
+    def validate_migration_safety(self, target_version: str = "latest") -> Tuple[bool, List[str]]:
+        """
+        Validate migration plan for safety issues
+
+        Args:
+            target_version: Target version to validate
+
+        Returns:
+            Tuple of (is_safe, safety_issues)
+        """
+        plan = self.get_migration_plan(target_version)
+        issues = []
+
+        for step in plan:
+            migration = self.migrations[step['version']]
+
+            # Check breaking changes
+            if migration.is_breaking:
+                issues.append(f"Breaking change in {step['version']}: {step['name']}")
+
+            # Check rollback capability
+            if not migration.backward_sql:
+                issues.append(f"Migration {step['version']} cannot be rolled back")
+
+        return len(issues) == 0, issues
--- a/transcript-fixer/scripts/utils/db_migrations_cli.py
+++ b/transcript-fixer/scripts/utils/db_migrations_cli.py
@@ -0,0 +1,385 @@
+#!/usr/bin/env python3
+"""
+Database Migration CLI - Migration Management Commands
+
+CRITICAL FIX (P1-6): Production database migration CLI commands
+
+Features:
+- Run migrations with dry-run support
+- Migration status and history
+- Rollback capability
+- Migration validation
+- Migration planning
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import logging
+import sys
+from pathlib import Path
+from typing import Dict, Any, List
+from dataclasses import asdict
+
+from .database_migration import DatabaseMigrationManager, MigrationRecord, MigrationStatus
+from .migrations import MIGRATION_REGISTRY, LATEST_VERSION, get_migration, get_migrations_up_to
+from .config import get_config
+
+logger = logging.getLogger(__name__)
+
+
+class DatabaseMigrationCLI:
+    """CLI interface for database migrations"""
+
+    def __init__(self, db_path: Path = None):
+        """
+        Initialize migration CLI
+
+        Args:
+            db_path: Database path (uses config if not provided)
+        """
+        if db_path is None:
+            config = get_config()
+            db_path = config.database.path
+
+        self.db_path = Path(db_path)
+        self.migration_manager = DatabaseMigrationManager(self.db_path)
+
+        # Register all migrations
+        for migration in MIGRATION_REGISTRY.values():
+            self.migration_manager.register_migration(migration)
+
+    def cmd_status(self, args) -> None:
+        """
+        Show migration status
+
+        Args:
+            args: Command line arguments
+        """
+        try:
+            current_version = self.migration_manager.get_current_version()
+            history = self.migration_manager.get_migration_history()
+            pending = self.migration_manager.get_pending_migrations()
+
+            print("Database Migration Status")
+            print("=" * 40)
+            print(f"Database Path: {self.db_path}")
+            print(f"Current Version: {current_version}")
+            print(f"Latest Version: {LATEST_VERSION}")
+            print(f"Pending Migrations: {len(pending)}")
+            print(f"Total Migrations Applied: {len([h for h in history if h.status == MigrationStatus.COMPLETED])}")
+
+            if pending:
+                print("\nPending Migrations:")
+                for migration in pending:
+                    print(f"  - {migration.version}: {migration.name}")
+
+            if history:
+                print("\nRecent Migration History:")
+                for i, record in enumerate(history[:5]):
+                    status_icon = "✅" if record.status == MigrationStatus.COMPLETED else "❌"
+                    print(f"  {status_icon} {record.version}: {record.name} ({record.status.value})")
+
+        except Exception as e:
+            print(f"❌ Error getting status: {e}")
+            sys.exit(1)
+
+    def cmd_history(self, args) -> None:
+        """
+        Show migration history
+
+        Args:
+            args: Command line arguments
+        """
+        try:
+            history = self.migration_manager.get_migration_history()
+
+            if not history:
+                print("No migration history found")
+                return
+
+            if args.format == 'json':
+                records = [record.to_dict() for record in history]
+                print(json.dumps(records, indent=2, default=str))
+            else:
+                print("Migration History")
+                print("=" * 40)
+                for record in history:
+                    status_icon = {
+                        MigrationStatus.COMPLETED: "✅",
+                        MigrationStatus.FAILED: "❌",
+                        MigrationStatus.ROLLED_BACK: "↩️",
+                        MigrationStatus.RUNNING: "⏳",
+                    }.get(record.status, "❓")
+
+                    print(f"{status_icon} {record.version} ({record.direction.value})")
+                    print(f"  Name: {record.name}")
+                    print(f"  Status: {record.status.value}")
+                    print(f"  Executed: {record.executed_at}")
+                    print(f"  Duration: {record.execution_time_ms}ms")
+
+                    if record.error_message:
+                        print(f"  Error: {record.error_message}")
+                    print()
+
+        except Exception as e:
+            print(f"❌ Error getting history: {e}")
+            sys.exit(1)
+
+    def cmd_migrate(self, args) -> None:
+        """
+        Run migrations
+
+        Args:
+            args: Command line arguments
+        """
+        try:
+            target_version = args.version if args.version else LATEST_VERSION
+            dry_run = args.dry_run
+            force = args.force
+
+            print(f"Running migrations to version: {target_version}")
+            if dry_run:
+                print("🚨 DRY RUN MODE - No changes will be applied")
+            if force:
+                print("🚨 FORCE MODE - Safety checks bypassed")
+
+            # Get migration plan
+            plan = self.migration_manager.get_migration_plan(target_version)
+
+            if not plan:
+                print("✅ No migrations to apply")
+                return
+
+            print(f"\nMigration Plan:")
+            print("=" * 40)
+            for i, step in enumerate(plan, 1):
+                breaking_icon = "🔴" if step.get('is_breaking') else "🟢"
+                print(f"{i}. {breaking_icon} {step['version']}: {step['name']}")
+                print(f"   Description: {step['description']}")
+                if step.get('dependencies'):
+                    print(f"   Dependencies: {', '.join(step['dependencies'])}")
+                if step.get('is_breaking'):
+                    print("   ⚠️  Breaking change - may require data migration")
+                print()
+
+            if not args.yes and not dry_run:
+                response = input("Continue with migration? (y/N): ")
+                if response.lower() != 'y':
+                    print("Migration cancelled")
+                    return
+
+            # Run migration
+            self.migration_manager.migrate_to_version(target_version, dry_run, force)
+
+            if dry_run:
+                print("✅ Dry run completed successfully")
+            else:
+                print("✅ Migration completed successfully")
+
+                # Show new status
+                new_version = self.migration_manager.get_current_version()
+                print(f"Database is now at version: {new_version}")
+
+        except Exception as e:
+            print(f"❌ Migration failed: {e}")
+            sys.exit(1)
+
+    def cmd_rollback(self, args) -> None:
+        """
+        Rollback migration
+
+        Args:
+            args: Command line arguments
+        """
+        try:
+            target_version = args.version
+            dry_run = args.dry_run
+            force = args.force
+
+            if not target_version:
+                print("❌ Target version is required for rollback")
+                sys.exit(1)
+
+            current_version = self.migration_manager.get_current_version()
+
+            print(f"Rolling back from version {current_version} to {target_version}")
+            if dry_run:
+                print("🚨 DRY RUN MODE - No changes will be applied")
+            if force:
+                print("🚨 FORCE MODE - Safety checks bypassed")
+
+            # Warn about potential data loss
+            if not args.yes and not dry_run:
+                response = input("⚠️  WARNING: Rollback may cause data loss. Continue? (y/N): ")
+                if response.lower() != 'y':
+                    print("Rollback cancelled")
+                    return
+
+            # Run rollback
+            self.migration_manager.migrate_to_version(target_version, dry_run, force)
+
+            if dry_run:
+                print("✅ Dry run completed successfully")
+            else:
+                print("✅ Rollback completed successfully")
+
+                # Show new status
+                new_version = self.migration_manager.get_current_version()
+                print(f"Database is now at version: {new_version}")
+
+        except Exception as e:
+            print(f"❌ Rollback failed: {e}")
+            sys.exit(1)
+
+    def cmd_plan(self, args) -> None:
+        """
+        Show migration plan
+
+        Args:
+            args: Command line arguments
+        """
+        try:
+            target_version = args.version if args.version else LATEST_VERSION
+            plan = self.migration_manager.get_migration_plan(target_version)
+
+            if not plan:
+                print("✅ No migrations to apply")
+                return
+
+            print(f"Migration Plan (to version {target_version})")
+            print("=" * 50)
+
+            current_version = self.migration_manager.get_current_version()
+            print(f"Current Version: {current_version}")
+            print(f"Target Version: {target_version}")
+            print()
+
+            for i, step in enumerate(plan, 1):
+                breaking_icon = "🔴" if step.get('is_breaking') else "🟢"
+                rollback_icon = "✅" if step.get('has_rollback') else "❌"
+
+                print(f"{i}. {breaking_icon} {step['version']}: {step['name']}")
+                print(f"   Description: {step['description']}")
+                print(f"   Rollback: {rollback_icon}")
+
+                if step.get('dependencies'):
+                    print(f"   Dependencies: {', '.join(step['dependencies'])}")
+
+                print()
+
+            # Safety validation
+            is_safe, issues = self.migration_manager.validate_migration_safety(target_version)
+            if is_safe:
+                print("✅ Migration plan is safe")
+            else:
+                print("⚠️  Safety issues detected:")
+                for issue in issues:
+                    print(f"   - {issue}")
+
+        except Exception as e:
+            print(f"❌ Error getting migration plan: {e}")
+            sys.exit(1)
+
+    def cmd_validate(self, args) -> None:
+        """
+        Validate migration safety
+
+        Args:
+            args: Command line arguments
+        """
+        try:
+            target_version = args.version if args.version else LATEST_VERSION
+
+            is_safe, issues = self.migration_manager.validate_migration_safety(target_version)
+
+            if is_safe:
+                print("✅ Migration plan is safe")
+                sys.exit(0)
+            else:
+                print("❌ Migration safety issues found:")
+                for issue in issues:
+                    print(f"   - {issue}")
+                sys.exit(1)
+
+        except Exception as e:
+            print(f"❌ Validation failed: {e}")
+            sys.exit(1)
+
+    def cmd_create_migration(self, args) -> None:
+        """
+        Create a new migration template
+
+        Args:
+            args: Command line arguments
+        """
+        try:
+            version = args.version
+            name = args.name
+            description = args.description
+
+            if not version or not name:
+                print("❌ Version and name are required")
+                sys.exit(1)
+
+            # Check if migration already exists
+            if version in MIGRATION_REGISTRY:
+                print(f"❌ Migration {version} already exists")
+                sys.exit(1)
+
+            # Create migration template
+            template = f'''
+# Migration {version}: {name}
+# Description: {description}
+
+from __future__ import annotations
+import sqlite3
+from typing import Tuple
+from .database_migration import Migration
+from utils.migrations import get_migration
+
+
+def _validate_migration(conn: sqlite3.Connection, migration: Migration) -> Tuple[bool, str]:
+    """Validate migration"""
+    # Add custom validation logic here
+    return True, "Migration validation passed"
+
+
+MIGRATION_{version.replace(".", "_")} = Migration(
+    version="{version}",
+    name="{name}",
+    description="{description}",
+    forward_sql=\"\"\"
+    -- Add your forward migration SQL here
+    \"\"\",
+    backward_sql=\"\"\"
+    -- Add your backward migration SQL here (optional)
+    \"\"\",
+    dependencies=["2.2"],  # List required migrations
+    check_function=_validate_migration,
+    is_breaking=False  # Set to True for breaking changes
+)
+
+# Add to MIGRATION_REGISTRY in migrations.py
+# ALL_MIGRATIONS.append(MIGRATION_{version.replace(".", "_")})
+# MIGRATION_REGISTRY["{version}"] = MIGRATION_{version.replace(".", "_")}
+# LATEST_VERSION = "{version}"  # Update if this is the latest
+            '''.strip()
+
+            print("Migration Template:")
+            print("=" * 50)
+            print(template)
+            print("\n⚠️  Remember to:")
+            print("1. Add the migration to ALL_MIGRATIONS list in migrations.py")
+            print("2. Update MIGRATION_REGISTRY and LATEST_VERSION")
+            print("3. Test the migration before deploying")
+
+        except Exception as e:
+            print(f"❌ Error creating template: {e}")
+            sys.exit(1)
+
+
+def create_migration_cli(db_path: Path = None) -> DatabaseMigrationCLI:
+    """Create migration CLI instance"""
+    return DatabaseMigrationCLI(db_path)
--- a/transcript-fixer/scripts/utils/domain_validator.py
+++ b/transcript-fixer/scripts/utils/domain_validator.py
@@ -0,0 +1,317 @@
+#!/usr/bin/env python3
+"""
+Domain Validation and Input Sanitization
+
+CRITICAL FIX: Prevents SQL injection via domain parameter
+ISSUE: Critical-3 in Engineering Excellence Plan
+
+This module provides:
+1. Domain whitelist validation
+2. Input sanitization for text fields
+3. SQL injection prevention helpers
+
+Author: Chief Engineer
+Date: 2025-10-28
+Priority: P0 - Critical
+"""
+
+from __future__ import annotations
+
+from typing import Final, Set
+import re
+
+# Domain whitelist - ONLY these values are allowed
+VALID_DOMAINS: Final[Set[str]] = {
+    'general',
+    'embodied_ai',
+    'finance',
+    'medical',
+    'legal',
+    'technical',
+}
+
+# Source whitelist
+VALID_SOURCES: Final[Set[str]] = {
+    'manual',
+    'learned',
+    'imported',
+    'ai_suggested',
+    'community',
+}
+
+# Maximum text lengths to prevent DoS
+MAX_FROM_TEXT_LENGTH: Final[int] = 500
+MAX_TO_TEXT_LENGTH: Final[int] = 500
+MAX_NOTES_LENGTH: Final[int] = 2000
+MAX_USER_LENGTH: Final[int] = 100
+
+
+class ValidationError(Exception):
+    """Input validation failed"""
+    pass
+
+
+def validate_domain(domain: str) -> str:
+    """
+    Validate domain against whitelist.
+
+    CRITICAL: Prevents SQL injection via domain parameter.
+    Domain is used in WHERE clauses - must be whitelisted.
+
+    Args:
+        domain: Domain string to validate
+
+    Returns:
+        Validated domain (guaranteed to be in whitelist)
+
+    Raises:
+        ValidationError: If domain not in whitelist
+
+    Examples:
+        >>> validate_domain('general')
+        'general'
+
+        >>> validate_domain('hacked"; DROP TABLE corrections--')
+        ValidationError: Invalid domain
+    """
+    if not domain:
+        raise ValidationError("Domain cannot be empty")
+
+    domain = domain.strip().lower()
+
+    # Check again after stripping (whitespace-only input)
+    if not domain:
+        raise ValidationError("Domain cannot be empty")
+
+    if domain not in VALID_DOMAINS:
+        raise ValidationError(
+            f"Invalid domain: '{domain}'. "
+            f"Valid domains: {sorted(VALID_DOMAINS)}"
+        )
+
+    return domain
+
+
+def validate_source(source: str) -> str:
+    """
+    Validate source against whitelist.
+
+    Args:
+        source: Source string to validate
+
+    Returns:
+        Validated source
+
+    Raises:
+        ValidationError: If source not in whitelist
+    """
+    if not source:
+        raise ValidationError("Source cannot be empty")
+
+    source = source.strip().lower()
+
+    if source not in VALID_SOURCES:
+        raise ValidationError(
+            f"Invalid source: '{source}'. "
+            f"Valid sources: {sorted(VALID_SOURCES)}"
+        )
+
+    return source
+
+
+def sanitize_text_field(text: str, max_length: int, field_name: str = "field") -> str:
+    """
+    Sanitize text input with length validation.
+
+    Prevents:
+    - Excessively long inputs (DoS)
+    - Binary data
+    - Control characters (except whitespace)
+
+    Args:
+        text: Text to sanitize
+        max_length: Maximum allowed length
+        field_name: Field name for error messages
+
+    Returns:
+        Sanitized text
+
+    Raises:
+        ValidationError: If validation fails
+    """
+    if not text:
+        raise ValidationError(f"{field_name} cannot be empty")
+
+    if not isinstance(text, str):
+        raise ValidationError(f"{field_name} must be a string")
+
+    # Check length
+    if len(text) > max_length:
+        raise ValidationError(
+            f"{field_name} too long: {len(text)} chars "
+            f"(max: {max_length})"
+        )
+
+    # Check for null bytes (can break SQLite)
+    if '\x00' in text:
+        raise ValidationError(f"{field_name} contains null bytes")
+
+    # Remove other control characters except tab, newline, carriage return
+    sanitized = ''.join(
+        char for char in text
+        if ord(char) >= 32 or char in '\t\n\r'
+    )
+
+    if not sanitized.strip():
+        raise ValidationError(f"{field_name} is empty after sanitization")
+
+    return sanitized
+
+
+def validate_correction_inputs(
+    from_text: str,
+    to_text: str,
+    domain: str,
+    source: str,
+    notes: str | None = None,
+    added_by: str | None = None
+) -> tuple[str, str, str, str, str | None, str | None]:
+    """
+    Validate all inputs for correction creation.
+
+    Comprehensive validation in one function.
+    Call this before any database operation.
+
+    Args:
+        from_text: Original text
+        to_text: Corrected text
+        domain: Domain name
+        source: Source type
+        notes: Optional notes
+        added_by: Optional user
+
+    Returns:
+        Tuple of (sanitized from_text, to_text, domain, source, notes, added_by)
+
+    Raises:
+        ValidationError: If any validation fails
+
+    Example:
+        >>> validate_correction_inputs(
+        ...     "teh", "the", "general", "manual", None, "user123"
+        ... )
+        ('teh', 'the', 'general', 'manual', None, 'user123')
+    """
+    # Validate domain and source (whitelist)
+    domain = validate_domain(domain)
+    source = validate_source(source)
+
+    # Sanitize text fields
+    from_text = sanitize_text_field(from_text, MAX_FROM_TEXT_LENGTH, "from_text")
+    to_text = sanitize_text_field(to_text, MAX_TO_TEXT_LENGTH, "to_text")
+
+    # Optional fields
+    if notes is not None:
+        notes = sanitize_text_field(notes, MAX_NOTES_LENGTH, "notes")
+
+    if added_by is not None:
+        added_by = sanitize_text_field(added_by, MAX_USER_LENGTH, "added_by")
+
+    return from_text, to_text, domain, source, notes, added_by
+
+
+def validate_confidence(confidence: float) -> float:
+    """
+    Validate confidence score is in valid range.
+
+    Args:
+        confidence: Confidence score
+
+    Returns:
+        Validated confidence
+
+    Raises:
+        ValidationError: If out of range
+    """
+    if not isinstance(confidence, (int, float)):
+        raise ValidationError("Confidence must be a number")
+
+    if not 0.0 <= confidence <= 1.0:
+        raise ValidationError(
+            f"Confidence must be between 0.0 and 1.0, got: {confidence}"
+        )
+
+    return float(confidence)
+
+
+def is_safe_sql_identifier(identifier: str) -> bool:
+    """
+    Check if string is a safe SQL identifier.
+
+    Safe identifiers:
+    - Only alphanumeric and underscores
+    - Start with letter or underscore
+    - Max 64 chars
+
+    Use this for table/column names if dynamically constructing SQL.
+    (Though we should avoid this entirely - use parameterized queries!)
+
+    Args:
+        identifier: String to check
+
+    Returns:
+        True if safe to use as SQL identifier
+    """
+    if not identifier:
+        return False
+
+    if len(identifier) > 64:
+        return False
+
+    # Must match: ^[a-zA-Z_][a-zA-Z0-9_]*$
+    pattern = r'^[a-zA-Z_][a-zA-Z0-9_]*$'
+    return bool(re.match(pattern, identifier))
+
+
+# Example usage and testing
+if __name__ == "__main__":
+    print("Testing domain_validator.py")
+    print("=" * 60)
+
+    # Test valid domain
+    try:
+        result = validate_domain("general")
+        print(f"✓ Valid domain: {result}")
+    except ValidationError as e:
+        print(f"✗ Unexpected error: {e}")
+
+    # Test invalid domain
+    try:
+        result = validate_domain("hacked'; DROP TABLE--")
+        print(f"✗ Should have failed: {result}")
+    except ValidationError as e:
+        print(f"✓ Correctly rejected: {e}")
+
+    # Test text sanitization
+    try:
+        result = sanitize_text_field("hello\x00world", 100, "test")
+        print(f"✗ Should have rejected null byte")
+    except ValidationError as e:
+        print(f"✓ Correctly rejected null byte: {e}")
+
+    # Test full validation
+    try:
+        result = validate_correction_inputs(
+            from_text="teh",
+            to_text="the",
+            domain="general",
+            source="manual",
+            notes="Typo fix",
+            added_by="test_user"
+        )
+        print(f"✓ Full validation passed: {result[0]} → {result[1]}")
+    except ValidationError as e:
+        print(f"✗ Unexpected error: {e}")
+
+    print("=" * 60)
+    print("✅ All validation tests completed")
--- a/transcript-fixer/scripts/utils/health_check.py
+++ b/transcript-fixer/scripts/utils/health_check.py
@@ -0,0 +1,654 @@
+#!/usr/bin/env python3
+"""
+Health Check Module - System Health Monitoring
+
+CRITICAL FIX (P1-4): Production-grade health checks for monitoring
+
+Features:
+- Database connectivity and schema validation
+- File system access checks
+- Configuration validation
+- Dependency verification
+- Resource availability checks
+
+Health Check Levels:
+- Basic: Quick connectivity checks (< 100ms)
+- Standard: Full system validation (< 1s)
+- Deep: Comprehensive diagnostics (< 5s)
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+import os
+import sys
+import time
+from dataclasses import dataclass, asdict
+from enum import Enum
+from pathlib import Path
+from typing import List, Dict, Optional, Final
+
+logger = logging.getLogger(__name__)
+
+# Import configuration for centralized config management (P1-5 fix)
+from .config import get_config
+
+# Health check thresholds
+RESPONSE_TIME_WARNING: Final[float] = 1.0  # seconds
+RESPONSE_TIME_CRITICAL: Final[float] = 5.0  # seconds
+MIN_DISK_SPACE_MB: Final[int] = 100  # MB
+
+
+class HealthStatus(Enum):
+    """Health status levels"""
+    HEALTHY = "healthy"
+    DEGRADED = "degraded"
+    UNHEALTHY = "unhealthy"
+    UNKNOWN = "unknown"
+
+
+class CheckLevel(Enum):
+    """Health check thoroughness levels"""
+    BASIC = "basic"        # Quick checks (< 100ms)
+    STANDARD = "standard"  # Full validation (< 1s)
+    DEEP = "deep"          # Comprehensive (< 5s)
+
+
+@dataclass
+class HealthCheckResult:
+    """Result of a single health check"""
+    name: str
+    status: HealthStatus
+    message: str
+    duration_ms: float
+    details: Optional[Dict] = None
+    error: Optional[str] = None
+
+    def to_dict(self) -> Dict:
+        """Convert to dictionary"""
+        result = asdict(self)
+        result['status'] = self.status.value
+        return result
+
+
+@dataclass
+class SystemHealth:
+    """Overall system health status"""
+    status: HealthStatus
+    timestamp: str
+    duration_ms: float
+    checks: List[HealthCheckResult]
+    summary: Dict[str, int]
+
+    def to_dict(self) -> Dict:
+        """Convert to dictionary"""
+        return {
+            'status': self.status.value,
+            'timestamp': self.timestamp,
+            'duration_ms': round(self.duration_ms, 2),
+            'checks': [check.to_dict() for check in self.checks],
+            'summary': self.summary
+        }
+
+    def to_json(self) -> str:
+        """Convert to JSON string"""
+        return json.dumps(self.to_dict(), indent=2, ensure_ascii=False)
+
+
+class HealthChecker:
+    """
+    System health checker with configurable thoroughness levels.
+
+    CRITICAL FIX (P1-4): Enables monitoring and observability
+    """
+
+    def __init__(self, config_dir: Optional[Path] = None):
+        """
+        Initialize health checker
+
+        Args:
+            config_dir: Configuration directory (defaults to ~/.transcript-fixer)
+        """
+        # P1-5 FIX: Use centralized configuration
+        config = get_config()
+
+        # For backward compatibility, still accept config_dir parameter
+        self.config_dir = config_dir or config.paths.config_dir
+        self.db_path = config.database.path
+
+    def check_health(self, level: CheckLevel = CheckLevel.STANDARD) -> SystemHealth:
+        """
+        Perform health check at specified level
+
+        Args:
+            level: Thoroughness level (BASIC, STANDARD, DEEP)
+
+        Returns:
+            SystemHealth with overall status and individual check results
+        """
+        start_time = time.time()
+        checks: List[HealthCheckResult] = []
+
+        logger.info(f"Starting health check (level: {level.value})")
+
+        # Always run basic checks
+        checks.append(self._check_config_directory())
+        checks.append(self._check_database())
+
+        # Standard level: add configuration checks
+        if level in (CheckLevel.STANDARD, CheckLevel.DEEP):
+            checks.append(self._check_api_key())
+            checks.append(self._check_dependencies())
+            checks.append(self._check_disk_space())
+
+        # Deep level: add comprehensive diagnostics
+        if level == CheckLevel.DEEP:
+            checks.append(self._check_database_schema())
+            checks.append(self._check_file_permissions())
+            checks.append(self._check_python_version())
+
+        # Calculate overall status
+        duration_ms = (time.time() - start_time) * 1000
+        overall_status = self._calculate_overall_status(checks)
+
+        # Generate summary
+        summary = {
+            'total': len(checks),
+            'healthy': sum(1 for c in checks if c.status == HealthStatus.HEALTHY),
+            'degraded': sum(1 for c in checks if c.status == HealthStatus.DEGRADED),
+            'unhealthy': sum(1 for c in checks if c.status == HealthStatus.UNHEALTHY),
+        }
+
+        # Check for slow response time
+        if duration_ms > RESPONSE_TIME_CRITICAL * 1000:
+            logger.warning(f"Health check took {duration_ms:.0f}ms (critical threshold)")
+        elif duration_ms > RESPONSE_TIME_WARNING * 1000:
+            logger.warning(f"Health check took {duration_ms:.0f}ms (warning threshold)")
+
+        return SystemHealth(
+            status=overall_status,
+            timestamp=time.strftime("%Y-%m-%d %H:%M:%S"),
+            duration_ms=duration_ms,
+            checks=checks,
+            summary=summary
+        )
+
+    def _calculate_overall_status(self, checks: List[HealthCheckResult]) -> HealthStatus:
+        """Calculate overall system status from individual checks"""
+        if not checks:
+            return HealthStatus.UNKNOWN
+
+        # Any unhealthy check = system unhealthy
+        if any(c.status == HealthStatus.UNHEALTHY for c in checks):
+            return HealthStatus.UNHEALTHY
+
+        # Any degraded check = system degraded
+        if any(c.status == HealthStatus.DEGRADED for c in checks):
+            return HealthStatus.DEGRADED
+
+        # All healthy = system healthy
+        if all(c.status == HealthStatus.HEALTHY for c in checks):
+            return HealthStatus.HEALTHY
+
+        return HealthStatus.UNKNOWN
+
+    def _check_config_directory(self) -> HealthCheckResult:
+        """Check configuration directory exists and is writable"""
+        start_time = time.time()
+        name = "config_directory"
+
+        try:
+            # Check existence
+            if not self.config_dir.exists():
+                return HealthCheckResult(
+                    name=name,
+                    status=HealthStatus.UNHEALTHY,
+                    message="Configuration directory does not exist",
+                    duration_ms=(time.time() - start_time) * 1000,
+                    details={'path': str(self.config_dir)},
+                    error="Directory not found"
+                )
+
+            # Check writability
+            test_file = self.config_dir / ".health_check_test"
+            try:
+                test_file.touch()
+                test_file.unlink()
+            except (PermissionError, OSError) as e:
+                return HealthCheckResult(
+                    name=name,
+                    status=HealthStatus.DEGRADED,
+                    message="Configuration directory not writable",
+                    duration_ms=(time.time() - start_time) * 1000,
+                    details={'path': str(self.config_dir)},
+                    error=str(e)
+                )
+
+            return HealthCheckResult(
+                name=name,
+                status=HealthStatus.HEALTHY,
+                message="Configuration directory accessible",
+                duration_ms=(time.time() - start_time) * 1000,
+                details={'path': str(self.config_dir)}
+            )
+
+        except Exception as e:
+            logger.exception("Config directory check failed")
+            return HealthCheckResult(
+                name=name,
+                status=HealthStatus.UNHEALTHY,
+                message="Configuration directory check failed",
+                duration_ms=(time.time() - start_time) * 1000,
+                error=str(e)
+            )
+
+    def _check_database(self) -> HealthCheckResult:
+        """Check database exists and is accessible"""
+        start_time = time.time()
+        name = "database"
+
+        try:
+            if not self.db_path.exists():
+                return HealthCheckResult(
+                    name=name,
+                    status=HealthStatus.DEGRADED,
+                    message="Database not initialized",
+                    duration_ms=(time.time() - start_time) * 1000,
+                    details={'path': str(self.db_path)},
+                    error="Database file not found"
+                )
+
+            # Try to open database
+            import sqlite3
+            try:
+                conn = sqlite3.connect(str(self.db_path), timeout=5.0)
+                cursor = conn.execute("SELECT COUNT(*) FROM sqlite_master WHERE type='table'")
+                table_count = cursor.fetchone()[0]
+                conn.close()
+
+                return HealthCheckResult(
+                    name=name,
+                    status=HealthStatus.HEALTHY,
+                    message="Database accessible",
+                    duration_ms=(time.time() - start_time) * 1000,
+                    details={
+                        'path': str(self.db_path),
+                        'tables': table_count,
+                        'size_kb': self.db_path.stat().st_size // 1024
+                    }
+                )
+
+            except sqlite3.Error as e:
+                return HealthCheckResult(
+                    name=name,
+                    status=HealthStatus.UNHEALTHY,
+                    message="Database connection failed",
+                    duration_ms=(time.time() - start_time) * 1000,
+                    details={'path': str(self.db_path)},
+                    error=str(e)
+                )
+
+        except Exception as e:
+            logger.exception("Database check failed")
+            return HealthCheckResult(
+                name=name,
+                status=HealthStatus.UNHEALTHY,
+                message="Database check failed",
+                duration_ms=(time.time() - start_time) * 1000,
+                error=str(e)
+            )
+
+    def _check_api_key(self) -> HealthCheckResult:
+        """Check API key is configured"""
+        start_time = time.time()
+        name = "api_key"
+
+        try:
+            # P1-5 FIX: Use centralized configuration
+            config = get_config()
+            api_key = config.api.api_key
+
+            if not api_key:
+                return HealthCheckResult(
+                    name=name,
+                    status=HealthStatus.DEGRADED,
+                    message="API key not configured",
+                    duration_ms=(time.time() - start_time) * 1000,
+                    details={'env_vars_checked': ['GLM_API_KEY', 'ANTHROPIC_API_KEY']},
+                    error="No API key found in environment"
+                )
+
+            # Check key format (don't validate by calling API)
+            if len(api_key) < 10:
+                return HealthCheckResult(
+                    name=name,
+                    status=HealthStatus.DEGRADED,
+                    message="API key format suspicious",
+                    duration_ms=(time.time() - start_time) * 1000,
+                    details={'key_length': len(api_key)},
+                    error="API key too short"
+                )
+
+            return HealthCheckResult(
+                name=name,
+                status=HealthStatus.HEALTHY,
+                message="API key configured",
+                duration_ms=(time.time() - start_time) * 1000,
+                details={'key_length': len(api_key), 'masked_key': api_key[:8] + '***'}
+            )
+
+        except Exception as e:
+            logger.exception("API key check failed")
+            return HealthCheckResult(
+                name=name,
+                status=HealthStatus.UNHEALTHY,
+                message="API key check failed",
+                duration_ms=(time.time() - start_time) * 1000,
+                error=str(e)
+            )
+
+    def _check_dependencies(self) -> HealthCheckResult:
+        """Check required dependencies are installed"""
+        start_time = time.time()
+        name = "dependencies"
+
+        required_modules = ['httpx', 'filelock']
+        missing = []
+        installed = []
+
+        try:
+            for module in required_modules:
+                try:
+                    __import__(module)
+                    installed.append(module)
+                except ImportError:
+                    missing.append(module)
+
+            if missing:
+                return HealthCheckResult(
+                    name=name,
+                    status=HealthStatus.UNHEALTHY,
+                    message=f"Missing dependencies: {', '.join(missing)}",
+                    duration_ms=(time.time() - start_time) * 1000,
+                    details={'installed': installed, 'missing': missing},
+                    error=f"Install with: pip install {' '.join(missing)}"
+                )
+
+            return HealthCheckResult(
+                name=name,
+                status=HealthStatus.HEALTHY,
+                message="All dependencies installed",
+                duration_ms=(time.time() - start_time) * 1000,
+                details={'installed': installed}
+            )
+
+        except Exception as e:
+            logger.exception("Dependencies check failed")
+            return HealthCheckResult(
+                name=name,
+                status=HealthStatus.UNHEALTHY,
+                message="Dependencies check failed",
+                duration_ms=(time.time() - start_time) * 1000,
+                error=str(e)
+            )
+
+    def _check_disk_space(self) -> HealthCheckResult:
+        """Check available disk space"""
+        start_time = time.time()
+        name = "disk_space"
+
+        try:
+            import shutil
+            stat = shutil.disk_usage(self.config_dir.parent)
+
+            free_mb = stat.free / (1024 * 1024)
+            total_mb = stat.total / (1024 * 1024)
+            used_percent = (stat.used / stat.total) * 100
+
+            if free_mb < MIN_DISK_SPACE_MB:
+                return HealthCheckResult(
+                    name=name,
+                    status=HealthStatus.UNHEALTHY,
+                    message=f"Low disk space: {free_mb:.0f}MB free",
+                    duration_ms=(time.time() - start_time) * 1000,
+                    details={
+                        'free_mb': round(free_mb, 2),
+                        'total_mb': round(total_mb, 2),
+                        'used_percent': round(used_percent, 1)
+                    },
+                    error=f"Less than {MIN_DISK_SPACE_MB}MB available"
+                )
+
+            return HealthCheckResult(
+                name=name,
+                status=HealthStatus.HEALTHY,
+                message=f"Sufficient disk space: {free_mb:.0f}MB free",
+                duration_ms=(time.time() - start_time) * 1000,
+                details={
+                    'free_mb': round(free_mb, 2),
+                    'total_mb': round(total_mb, 2),
+                    'used_percent': round(used_percent, 1)
+                }
+            )
+
+        except Exception as e:
+            logger.exception("Disk space check failed")
+            return HealthCheckResult(
+                name=name,
+                status=HealthStatus.UNKNOWN,
+                message="Disk space check failed",
+                duration_ms=(time.time() - start_time) * 1000,
+                error=str(e)
+            )
+
+    def _check_database_schema(self) -> HealthCheckResult:
+        """Check database schema is valid (deep check)"""
+        start_time = time.time()
+        name = "database_schema"
+
+        expected_tables = [
+            'corrections', 'context_rules', 'correction_history',
+            'correction_changes', 'learned_suggestions', 'suggestion_examples',
+            'system_config', 'audit_log'
+        ]
+
+        try:
+            if not self.db_path.exists():
+                return HealthCheckResult(
+                    name=name,
+                    status=HealthStatus.DEGRADED,
+                    message="Database not initialized",
+                    duration_ms=(time.time() - start_time) * 1000,
+                    error="Cannot check schema - database missing"
+                )
+
+            import sqlite3
+            conn = sqlite3.connect(str(self.db_path), timeout=5.0)
+            cursor = conn.execute(
+                "SELECT name FROM sqlite_master WHERE type='table' ORDER BY name"
+            )
+            actual_tables = [row[0] for row in cursor.fetchall()]
+            conn.close()
+
+            missing = [t for t in expected_tables if t not in actual_tables]
+            extra = [t for t in actual_tables if t not in expected_tables and not t.startswith('sqlite_')]
+
+            if missing:
+                return HealthCheckResult(
+                    name=name,
+                    status=HealthStatus.DEGRADED,
+                    message=f"Missing tables: {', '.join(missing)}",
+                    duration_ms=(time.time() - start_time) * 1000,
+                    details={
+                        'expected': expected_tables,
+                        'actual': actual_tables,
+                        'missing': missing,
+                        'extra': extra
+                    },
+                    error="Schema incomplete"
+                )
+
+            return HealthCheckResult(
+                name=name,
+                status=HealthStatus.HEALTHY,
+                message="Database schema valid",
+                duration_ms=(time.time() - start_time) * 1000,
+                details={
+                    'tables': actual_tables,
+                    'count': len(actual_tables)
+                }
+            )
+
+        except Exception as e:
+            logger.exception("Database schema check failed")
+            return HealthCheckResult(
+                name=name,
+                status=HealthStatus.UNHEALTHY,
+                message="Database schema check failed",
+                duration_ms=(time.time() - start_time) * 1000,
+                error=str(e)
+            )
+
+    def _check_file_permissions(self) -> HealthCheckResult:
+        """Check file permissions (deep check)"""
+        start_time = time.time()
+        name = "file_permissions"
+
+        try:
+            issues = []
+
+            # Check config directory permissions
+            if not os.access(self.config_dir, os.R_OK | os.W_OK | os.X_OK):
+                issues.append(f"Config dir: insufficient permissions")
+
+            # Check database permissions (if exists)
+            if self.db_path.exists():
+                if not os.access(self.db_path, os.R_OK | os.W_OK):
+                    issues.append(f"Database: read/write denied")
+
+            if issues:
+                return HealthCheckResult(
+                    name=name,
+                    status=HealthStatus.DEGRADED,
+                    message="Permission issues detected",
+                    duration_ms=(time.time() - start_time) * 1000,
+                    details={'issues': issues},
+                    error='; '.join(issues)
+                )
+
+            return HealthCheckResult(
+                name=name,
+                status=HealthStatus.HEALTHY,
+                message="File permissions correct",
+                duration_ms=(time.time() - start_time) * 1000
+            )
+
+        except Exception as e:
+            logger.exception("File permissions check failed")
+            return HealthCheckResult(
+                name=name,
+                status=HealthStatus.UNKNOWN,
+                message="File permissions check failed",
+                duration_ms=(time.time() - start_time) * 1000,
+                error=str(e)
+            )
+
+    def _check_python_version(self) -> HealthCheckResult:
+        """Check Python version (deep check)"""
+        start_time = time.time()
+        name = "python_version"
+
+        try:
+            version = sys.version_info
+            version_str = f"{version.major}.{version.minor}.{version.micro}"
+
+            # Minimum required: Python 3.8
+            if version < (3, 8):
+                return HealthCheckResult(
+                    name=name,
+                    status=HealthStatus.UNHEALTHY,
+                    message=f"Python version too old: {version_str}",
+                    duration_ms=(time.time() - start_time) * 1000,
+                    details={'version': version_str, 'minimum': '3.8'},
+                    error="Python 3.8+ required"
+                )
+
+            # Warn if using Python 3.12+ (may have compatibility issues)
+            if version >= (3, 13):
+                return HealthCheckResult(
+                    name=name,
+                    status=HealthStatus.DEGRADED,
+                    message=f"Python version very new: {version_str}",
+                    duration_ms=(time.time() - start_time) * 1000,
+                    details={'version': version_str, 'recommended': '3.8-3.12'},
+                    error="May have untested compatibility issues"
+                )
+
+            return HealthCheckResult(
+                name=name,
+                status=HealthStatus.HEALTHY,
+                message=f"Python version supported: {version_str}",
+                duration_ms=(time.time() - start_time) * 1000,
+                details={'version': version_str}
+            )
+
+        except Exception as e:
+            logger.exception("Python version check failed")
+            return HealthCheckResult(
+                name=name,
+                status=HealthStatus.UNKNOWN,
+                message="Python version check failed",
+                duration_ms=(time.time() - start_time) * 1000,
+                error=str(e)
+            )
+
+
+def format_health_output(health: SystemHealth, verbose: bool = False) -> str:
+    """
+    Format health check results for CLI output
+
+    Args:
+        health: SystemHealth object
+        verbose: Show detailed information
+
+    Returns:
+        Formatted string for display
+    """
+    lines = []
+
+    # Header - icon mapping
+    status_icon_map = {
+        HealthStatus.HEALTHY: "✅",
+        HealthStatus.DEGRADED: "⚠️",
+        HealthStatus.UNHEALTHY: "❌",
+        HealthStatus.UNKNOWN: "❓"
+    }
+
+    overall_icon = status_icon_map[health.status]
+
+    lines.append(f"\n{overall_icon} System Health: {health.status.value.upper()}")
+    lines.append(f"{'=' * 70}")
+    lines.append(f"Timestamp: {health.timestamp}")
+    lines.append(f"Duration: {health.duration_ms:.1f}ms")
+    lines.append(f"Checks: {health.summary['healthy']}/{health.summary['total']} passed")
+    lines.append("")
+
+    # Individual checks
+    for check in health.checks:
+        icon = status_icon_map.get(check.status, "❓")
+        lines.append(f"{icon} {check.name}: {check.message}")
+
+        if verbose and check.details:
+            for key, value in check.details.items():
+                lines.append(f"    {key}: {value}")
+
+        if check.error:
+            lines.append(f"    Error: {check.error}")
+
+        if verbose:
+            lines.append(f"    Duration: {check.duration_ms:.1f}ms")
+
+    lines.append(f"\n{'=' * 70}")
+
+    return "\n".join(lines)
--- a/transcript-fixer/scripts/utils/logging_config.py
+++ b/transcript-fixer/scripts/utils/logging_config.py
@@ -2,14 +2,26 @@
 """
 Logging Configuration for Transcript Fixer

+CRITICAL FIX: Enhanced with structured logging and error tracking
+ISSUE: Critical-4 in Engineering Excellence Plan
+
 Provides structured logging with rotation, levels, and audit trails.
+Added: Error rate monitoring, performance tracking, context enrichment
+
+Author: Chief Engineer
+Date: 2025-10-28
+Priority: P0 - Critical
 """

 import logging
 import logging.handlers
 import sys
+import json
+import time
 from pathlib import Path
-from typing import Optional
+from typing import Optional, Dict, Any
+from contextlib import contextmanager
+from datetime import datetime


 def setup_logging(
@@ -114,6 +126,156 @@ def get_audit_logger() -> logging.Logger:
    return logging.getLogger('audit')


+class ErrorCounter:
+    """
+    Track error rates for failure threshold monitoring.
+
+    CRITICAL FIX: Added for Critical-4
+    Prevents silent failures by monitoring error rates.
+
+    Usage:
+        counter = ErrorCounter(threshold=0.3)
+        for item in items:
+            try:
+                process(item)
+                counter.success()
+            except Exception:
+                counter.failure()
+                if counter.should_abort():
+                    logger.error("Error rate too high, aborting")
+                    break
+    """
+
+    def __init__(self, threshold: float = 0.3, window_size: int = 100):
+        """
+        Initialize error counter.
+
+        Args:
+            threshold: Failure rate threshold (0.3 = 30%)
+            window_size: Number of recent operations to track
+        """
+        self.threshold = threshold
+        self.window_size = window_size
+        self.results: list[bool] = []  # True = success, False = failure
+        self.total_successes = 0
+        self.total_failures = 0
+
+    def success(self) -> None:
+        """Record a successful operation"""
+        self.results.append(True)
+        self.total_successes += 1
+        if len(self.results) > self.window_size:
+            self.results.pop(0)
+
+    def failure(self) -> None:
+        """Record a failed operation"""
+        self.results.append(False)
+        self.total_failures += 1
+        if len(self.results) > self.window_size:
+            self.results.pop(0)
+
+    def failure_rate(self) -> float:
+        """Calculate current failure rate (rolling window)"""
+        if not self.results:
+            return 0.0
+        failures = sum(1 for r in self.results if not r)
+        return failures / len(self.results)
+
+    def should_abort(self) -> bool:
+        """Check if failure rate exceeds threshold"""
+        # Need minimum sample size before aborting
+        if len(self.results) < 10:
+            return False
+        return self.failure_rate() > self.threshold
+
+    def get_stats(self) -> Dict[str, Any]:
+        """Get error statistics"""
+        window_total = len(self.results)
+        window_failures = sum(1 for r in self.results if not r)
+        window_successes = window_total - window_failures
+
+        return {
+            "window_total": window_total,
+            "window_successes": window_successes,
+            "window_failures": window_failures,
+            "window_failure_rate": self.failure_rate(),
+            "total_successes": self.total_successes,
+            "total_failures": self.total_failures,
+            "threshold": self.threshold,
+            "should_abort": self.should_abort(),
+        }
+
+    def reset(self) -> None:
+        """Reset counters"""
+        self.results.clear()
+        self.total_successes = 0
+        self.total_failures = 0
+
+
+class TimedLogger:
+    """
+    Logger wrapper with automatic performance tracking.
+
+    CRITICAL FIX: Added for Critical-4
+    Automatically logs execution time for operations.
+
+    Usage:
+        logger = TimedLogger(logging.getLogger(__name__))
+        with logger.timed("chunk_processing", chunk_id=5):
+            process_chunk()
+        # Automatically logs: "chunk_processing completed in 123ms"
+    """
+
+    def __init__(self, logger: logging.Logger):
+        """
+        Initialize with a logger instance.
+
+        Args:
+            logger: Logger to wrap
+        """
+        self.logger = logger
+
+    @contextmanager
+    def timed(self, operation_name: str, **context: Any):
+        """
+        Context manager for timing operations.
+
+        Args:
+            operation_name: Name of operation
+            **context: Additional context to log
+
+        Yields:
+            None
+
+        Example:
+            >>> with logger.timed("api_call", chunk_id=5):
+            ...     call_api()
+            # Logs: "api_call completed in 123ms (chunk_id=5)"
+        """
+        start_time = time.time()
+
+        # Format context for logging
+        context_str = ", ".join(f"{k}={v}" for k, v in context.items())
+        if context_str:
+            context_str = f" ({context_str})"
+
+        self.logger.info(f"{operation_name} started{context_str}")
+
+        try:
+            yield
+        except Exception as e:
+            duration_ms = (time.time() - start_time) * 1000
+            self.logger.error(
+                f"{operation_name} failed in {duration_ms:.1f}ms{context_str}: {e}"
+            )
+            raise
+        else:
+            duration_ms = (time.time() - start_time) * 1000
+            self.logger.info(
+                f"{operation_name} completed in {duration_ms:.1f}ms{context_str}"
+            )
+
+
 # Example usage
 if __name__ == "__main__":
    setup_logging(level="DEBUG")
@@ -127,3 +289,21 @@ if __name__ == "__main__":

    audit_logger = get_audit_logger()
    audit_logger.info("User 'admin' added correction: '错误' → '正确'")
+
+    # Test ErrorCounter
+    print("\n--- Testing ErrorCounter ---")
+    counter = ErrorCounter(threshold=0.3)
+    for i in range(20):
+        if i % 4 == 0:
+            counter.failure()
+        else:
+            counter.success()
+
+    stats = counter.get_stats()
+    print(f"Stats: {json.dumps(stats, indent=2)}")
+
+    # Test TimedLogger
+    print("\n--- Testing TimedLogger ---")
+    timed_logger = TimedLogger(logger)
+    with timed_logger.timed("test_operation", item_count=100):
+        time.sleep(0.1)
--- a/transcript-fixer/scripts/utils/metrics.py
+++ b/transcript-fixer/scripts/utils/metrics.py
@@ -0,0 +1,535 @@
+#!/usr/bin/env python3
+"""
+Metrics Collection and Monitoring
+
+CRITICAL FIX (P1-7): Production-grade metrics and observability
+
+Features:
+- Real-time metrics collection
+- Time-series data storage (in-memory)
+- Prometheus-compatible export format
+- Common metrics: requests, errors, latency, throughput
+- Custom metric support
+- Thread-safe operations
+
+Metrics Types:
+- Counter: Monotonically increasing value (e.g., total requests)
+- Gauge: Point-in-time value (e.g., active connections)
+- Histogram: Distribution of values (e.g., response times)
+- Summary: Statistical summary (e.g., percentiles)
+"""
+
+from __future__ import annotations
+
+import logging
+import threading
+import time
+from collections import defaultdict, deque
+from dataclasses import dataclass, field
+from enum import Enum
+from typing import Dict, List, Optional, Deque, Final
+from contextlib import contextmanager
+import json
+
+logger = logging.getLogger(__name__)
+
+# Configuration constants
+MAX_HISTOGRAM_SAMPLES: Final[int] = 1000  # Keep last 1000 samples per histogram
+MAX_TIMESERIES_POINTS: Final[int] = 100   # Keep last 100 time series points
+PERCENTILES: Final[List[float]] = [0.5, 0.9, 0.95, 0.99]  # P50, P90, P95, P99
+
+
+class MetricType(Enum):
+    """Type of metric"""
+    COUNTER = "counter"
+    GAUGE = "gauge"
+    HISTOGRAM = "histogram"
+    SUMMARY = "summary"
+
+
+@dataclass
+class MetricValue:
+    """Single metric data point"""
+    timestamp: float
+    value: float
+    labels: Dict[str, str] = field(default_factory=dict)
+
+
+@dataclass
+class MetricSnapshot:
+    """Snapshot of a metric at a point in time"""
+    name: str
+    type: MetricType
+    value: float
+    labels: Dict[str, str]
+    help_text: str
+    timestamp: float
+
+    # Additional statistics for histograms
+    samples: Optional[int] = None
+    sum: Optional[float] = None
+    percentiles: Optional[Dict[str, float]] = None
+
+    def to_dict(self) -> Dict:
+        """Convert to dictionary"""
+        result = {
+            'name': self.name,
+            'type': self.type.value,
+            'value': self.value,
+            'labels': self.labels,
+            'help': self.help_text,
+            'timestamp': self.timestamp
+        }
+        if self.samples is not None:
+            result['samples'] = self.samples
+        if self.sum is not None:
+            result['sum'] = self.sum
+        if self.percentiles:
+            result['percentiles'] = self.percentiles
+        return result
+
+
+class Counter:
+    """
+    Counter metric - monotonically increasing value.
+
+    Use for: total requests, total errors, total API calls
+    """
+
+    def __init__(self, name: str, help_text: str = ""):
+        self.name = name
+        self.help_text = help_text
+        self._value = 0.0
+        self._lock = threading.Lock()
+        self._labels: Dict[str, str] = {}
+
+    def inc(self, amount: float = 1.0) -> None:
+        """Increment counter by amount"""
+        if amount < 0:
+            raise ValueError("Counter can only increase")
+
+        with self._lock:
+            self._value += amount
+
+    def get(self) -> float:
+        """Get current value"""
+        with self._lock:
+            return self._value
+
+    def snapshot(self) -> MetricSnapshot:
+        """Get current snapshot"""
+        return MetricSnapshot(
+            name=self.name,
+            type=MetricType.COUNTER,
+            value=self.get(),
+            labels=self._labels.copy(),
+            help_text=self.help_text,
+            timestamp=time.time()
+        )
+
+
+class Gauge:
+    """
+    Gauge metric - can increase or decrease.
+
+    Use for: active connections, memory usage, queue size
+    """
+
+    def __init__(self, name: str, help_text: str = ""):
+        self.name = name
+        self.help_text = help_text
+        self._value = 0.0
+        self._lock = threading.Lock()
+        self._labels: Dict[str, str] = {}
+
+    def set(self, value: float) -> None:
+        """Set gauge to specific value"""
+        with self._lock:
+            self._value = value
+
+    def inc(self, amount: float = 1.0) -> None:
+        """Increment gauge"""
+        with self._lock:
+            self._value += amount
+
+    def dec(self, amount: float = 1.0) -> None:
+        """Decrement gauge"""
+        with self._lock:
+            self._value -= amount
+
+    def get(self) -> float:
+        """Get current value"""
+        with self._lock:
+            return self._value
+
+    def snapshot(self) -> MetricSnapshot:
+        """Get current snapshot"""
+        return MetricSnapshot(
+            name=self.name,
+            type=MetricType.GAUGE,
+            value=self.get(),
+            labels=self._labels.copy(),
+            help_text=self.help_text,
+            timestamp=time.time()
+        )
+
+
+class Histogram:
+    """
+    Histogram metric - tracks distribution of values.
+
+    Use for: request latency, response sizes, processing times
+    """
+
+    def __init__(self, name: str, help_text: str = ""):
+        self.name = name
+        self.help_text = help_text
+        self._samples: Deque[float] = deque(maxlen=MAX_HISTOGRAM_SAMPLES)
+        self._count = 0
+        self._sum = 0.0
+        self._lock = threading.Lock()
+        self._labels: Dict[str, str] = {}
+
+    def observe(self, value: float) -> None:
+        """Record a new observation"""
+        with self._lock:
+            self._samples.append(value)
+            self._count += 1
+            self._sum += value
+
+    def get_percentile(self, percentile: float) -> float:
+        """
+        Calculate percentile value.
+
+        Args:
+            percentile: Value between 0 and 1 (e.g., 0.95 for P95)
+        """
+        with self._lock:
+            if not self._samples:
+                return 0.0
+
+            sorted_samples = sorted(self._samples)
+            index = int(len(sorted_samples) * percentile)
+            index = max(0, min(index, len(sorted_samples) - 1))
+            return sorted_samples[index]
+
+    def get_mean(self) -> float:
+        """Calculate mean value"""
+        with self._lock:
+            if self._count == 0:
+                return 0.0
+            return self._sum / self._count
+
+    def snapshot(self) -> MetricSnapshot:
+        """Get current snapshot with percentiles"""
+        percentiles = {
+            f"p{int(p * 100)}": self.get_percentile(p)
+            for p in PERCENTILES
+        }
+
+        return MetricSnapshot(
+            name=self.name,
+            type=MetricType.HISTOGRAM,
+            value=self.get_mean(),
+            labels=self._labels.copy(),
+            help_text=self.help_text,
+            timestamp=time.time(),
+            samples=len(self._samples),
+            sum=self._sum,
+            percentiles=percentiles
+        )
+
+
+class MetricsCollector:
+    """
+    Central metrics collector for the application.
+
+    CRITICAL FIX (P1-7): Thread-safe metrics collection and aggregation
+    """
+
+    def __init__(self):
+        self._counters: Dict[str, Counter] = {}
+        self._gauges: Dict[str, Gauge] = {}
+        self._histograms: Dict[str, Histogram] = {}
+        self._lock = threading.Lock()
+
+        # Initialize standard metrics
+        self._init_standard_metrics()
+
+    def _init_standard_metrics(self) -> None:
+        """Initialize standard application metrics"""
+        # Request metrics
+        self.register_counter("requests_total", "Total number of requests")
+        self.register_counter("requests_success", "Total successful requests")
+        self.register_counter("requests_failed", "Total failed requests")
+
+        # Performance metrics
+        self.register_histogram("request_duration_seconds", "Request duration in seconds")
+        self.register_histogram("api_call_duration_seconds", "API call duration in seconds")
+
+        # Resource metrics
+        self.register_gauge("active_connections", "Current active connections")
+        self.register_gauge("active_tasks", "Current active tasks")
+
+        # Database metrics
+        self.register_counter("db_queries_total", "Total database queries")
+        self.register_histogram("db_query_duration_seconds", "Database query duration")
+
+        # Error metrics
+        self.register_counter("errors_total", "Total errors")
+        self.register_counter("errors_by_type", "Errors by type")
+
+    def register_counter(self, name: str, help_text: str = "") -> Counter:
+        """Register a new counter metric"""
+        with self._lock:
+            if name not in self._counters:
+                self._counters[name] = Counter(name, help_text)
+            return self._counters[name]
+
+    def register_gauge(self, name: str, help_text: str = "") -> Gauge:
+        """Register a new gauge metric"""
+        with self._lock:
+            if name not in self._gauges:
+                self._gauges[name] = Gauge(name, help_text)
+            return self._gauges[name]
+
+    def register_histogram(self, name: str, help_text: str = "") -> Histogram:
+        """Register a new histogram metric"""
+        with self._lock:
+            if name not in self._histograms:
+                self._histograms[name] = Histogram(name, help_text)
+            return self._histograms[name]
+
+    def get_counter(self, name: str) -> Optional[Counter]:
+        """Get counter by name"""
+        return self._counters.get(name)
+
+    def get_gauge(self, name: str) -> Optional[Gauge]:
+        """Get gauge by name"""
+        return self._gauges.get(name)
+
+    def get_histogram(self, name: str) -> Optional[Histogram]:
+        """Get histogram by name"""
+        return self._histograms.get(name)
+
+    @contextmanager
+    def track_request(self, success: bool = True):
+        """
+        Context manager to track request metrics.
+
+        Usage:
+            with metrics.track_request():
+                # Do work
+                pass
+        """
+        start_time = time.time()
+        self.get_gauge("active_tasks").inc()
+
+        try:
+            yield
+            if success:
+                self.get_counter("requests_success").inc()
+        except Exception:
+            self.get_counter("requests_failed").inc()
+            raise
+        finally:
+            duration = time.time() - start_time
+            self.get_histogram("request_duration_seconds").observe(duration)
+            self.get_counter("requests_total").inc()
+            self.get_gauge("active_tasks").dec()
+
+    @contextmanager
+    def track_api_call(self):
+        """
+        Context manager to track API call metrics.
+
+        Usage:
+            with metrics.track_api_call():
+                response = await client.post(...)
+        """
+        start_time = time.time()
+
+        try:
+            yield
+        finally:
+            duration = time.time() - start_time
+            self.get_histogram("api_call_duration_seconds").observe(duration)
+
+    @contextmanager
+    def track_db_query(self):
+        """
+        Context manager to track database query metrics.
+
+        Usage:
+            with metrics.track_db_query():
+                cursor.execute(query)
+        """
+        start_time = time.time()
+
+        try:
+            yield
+        finally:
+            duration = time.time() - start_time
+            self.get_histogram("db_query_duration_seconds").observe(duration)
+            self.get_counter("db_queries_total").inc()
+
+    def get_all_snapshots(self) -> List[MetricSnapshot]:
+        """Get snapshots of all metrics"""
+        snapshots = []
+
+        with self._lock:
+            for counter in self._counters.values():
+                snapshots.append(counter.snapshot())
+
+            for gauge in self._gauges.values():
+                snapshots.append(gauge.snapshot())
+
+            for histogram in self._histograms.values():
+                snapshots.append(histogram.snapshot())
+
+        return snapshots
+
+    def to_json(self) -> str:
+        """Export all metrics as JSON"""
+        snapshots = self.get_all_snapshots()
+        data = {
+            'timestamp': time.time(),
+            'metrics': [s.to_dict() for s in snapshots]
+        }
+        return json.dumps(data, indent=2)
+
+    def to_prometheus(self) -> str:
+        """
+        Export metrics in Prometheus text format.
+
+        Format:
+        # HELP metric_name Description
+        # TYPE metric_name counter
+        metric_name{label="value"} 123.45 timestamp
+        """
+        lines = []
+        snapshots = self.get_all_snapshots()
+
+        for snapshot in snapshots:
+            # HELP line
+            lines.append(f"# HELP {snapshot.name} {snapshot.help_text}")
+
+            # TYPE line
+            lines.append(f"# TYPE {snapshot.name} {snapshot.type.value}")
+
+            # Metric line
+            labels_str = ",".join(f'{k}="{v}"' for k, v in snapshot.labels.items())
+            if labels_str:
+                labels_str = f"{{{labels_str}}}"
+
+            # For histograms, export percentiles
+            if snapshot.type == MetricType.HISTOGRAM and snapshot.percentiles:
+                for pct_name, pct_value in snapshot.percentiles.items():
+                    lines.append(
+                        f'{snapshot.name}_bucket{{le="{pct_name}"}}{labels_str} '
+                        f'{pct_value} {int(snapshot.timestamp * 1000)}'
+                    )
+                lines.append(
+                    f'{snapshot.name}_count{labels_str} '
+                    f'{snapshot.samples} {int(snapshot.timestamp * 1000)}'
+                )
+                lines.append(
+                    f'{snapshot.name}_sum{labels_str} '
+                    f'{snapshot.sum} {int(snapshot.timestamp * 1000)}'
+                )
+            else:
+                lines.append(
+                    f'{snapshot.name}{labels_str} '
+                    f'{snapshot.value} {int(snapshot.timestamp * 1000)}'
+                )
+
+            lines.append("")  # Blank line between metrics
+
+        return "\n".join(lines)
+
+    def get_summary(self) -> Dict:
+        """Get human-readable summary of key metrics"""
+        request_duration = self.get_histogram("request_duration_seconds")
+        api_duration = self.get_histogram("api_call_duration_seconds")
+        db_duration = self.get_histogram("db_query_duration_seconds")
+
+        return {
+            'requests': {
+                'total': int(self.get_counter("requests_total").get()),
+                'success': int(self.get_counter("requests_success").get()),
+                'failed': int(self.get_counter("requests_failed").get()),
+                'active': int(self.get_gauge("active_tasks").get()),
+                'avg_duration_ms': round(request_duration.get_mean() * 1000, 2),
+                'p95_duration_ms': round(request_duration.get_percentile(0.95) * 1000, 2),
+            },
+            'api_calls': {
+                'avg_duration_ms': round(api_duration.get_mean() * 1000, 2),
+                'p95_duration_ms': round(api_duration.get_percentile(0.95) * 1000, 2),
+            },
+            'database': {
+                'total_queries': int(self.get_counter("db_queries_total").get()),
+                'avg_duration_ms': round(db_duration.get_mean() * 1000, 2),
+                'p95_duration_ms': round(db_duration.get_percentile(0.95) * 1000, 2),
+            },
+            'errors': {
+                'total': int(self.get_counter("errors_total").get()),
+            },
+            'resources': {
+                'active_connections': int(self.get_gauge("active_connections").get()),
+                'active_tasks': int(self.get_gauge("active_tasks").get()),
+            }
+        }
+
+
+# Global metrics collector singleton
+_global_metrics: Optional[MetricsCollector] = None
+_metrics_lock = threading.Lock()
+
+
+def get_metrics() -> MetricsCollector:
+    """Get global metrics collector (singleton)"""
+    global _global_metrics
+
+    if _global_metrics is None:
+        with _metrics_lock:
+            if _global_metrics is None:
+                _global_metrics = MetricsCollector()
+                logger.info("Initialized global metrics collector")
+
+    return _global_metrics
+
+
+def format_metrics_summary(summary: Dict) -> str:
+    """Format metrics summary for CLI display"""
+    lines = [
+        "\n📊 Metrics Summary",
+        "=" * 70,
+        "",
+        "Requests:",
+        f"  Total: {summary['requests']['total']}",
+        f"  Success: {summary['requests']['success']}",
+        f"  Failed: {summary['requests']['failed']}",
+        f"  Active: {summary['requests']['active']}",
+        f"  Avg Duration: {summary['requests']['avg_duration_ms']}ms",
+        f"  P95 Duration: {summary['requests']['p95_duration_ms']}ms",
+        "",
+        "API Calls:",
+        f"  Avg Duration: {summary['api_calls']['avg_duration_ms']}ms",
+        f"  P95 Duration: {summary['api_calls']['p95_duration_ms']}ms",
+        "",
+        "Database:",
+        f"  Total Queries: {summary['database']['total_queries']}",
+        f"  Avg Duration: {summary['database']['avg_duration_ms']}ms",
+        f"  P95 Duration: {summary['database']['p95_duration_ms']}ms",
+        "",
+        "Errors:",
+        f"  Total: {summary['errors']['total']}",
+        "",
+        "Resources:",
+        f"  Active Connections: {summary['resources']['active_connections']}",
+        f"  Active Tasks: {summary['resources']['active_tasks']}",
+        "",
+        "=" * 70
+    ]
+
+    return "\n".join(lines)
--- a/transcript-fixer/scripts/utils/migrations.py
+++ b/transcript-fixer/scripts/utils/migrations.py
@@ -0,0 +1,468 @@
+#!/usr/bin/env python3
+"""
+Migration Definitions - Database Schema Migrations
+
+This module contains all database migrations for the transcript-fixer system.
+
+Migrations are defined here to ensure version control and proper migration ordering.
+Each migration has:
+- Unique version number
+- Forward SQL
+- Optional backward SQL (for rollback)
+- Dependencies on previous versions
+- Validation functions
+"""
+
+from __future__ import annotations
+
+import sqlite3
+import logging
+from typing import Dict, Any, Tuple, Optional
+
+from .database_migration import Migration
+
+logger = logging.getLogger(__name__)
+
+
+def _validate_schema_2_0(conn: sqlite3.Connection, migration: Migration) -> Tuple[bool, str]:
+    """Validate that schema v2.0 is correctly applied"""
+    cursor = conn.cursor()
+
+    # Check if all tables exist
+    expected_tables = {
+        'corrections', 'context_rules', 'correction_history',
+        'correction_changes', 'learned_suggestions',
+        'suggestion_examples', 'system_config', 'audit_log'
+    }
+
+    cursor.execute("SELECT name FROM sqlite_master WHERE type='table'")
+    existing_tables = {row[0] for row in cursor.fetchall()}
+
+    missing_tables = expected_tables - existing_tables
+    if missing_tables:
+        return False, f"Missing tables: {missing_tables}"
+
+    # Check system_config has required entries
+    cursor.execute("SELECT key FROM system_config WHERE key = 'schema_version'")
+    if not cursor.fetchone():
+        return False, "Missing schema_version in system_config"
+
+    return True, "Schema validation passed"
+
+
+# Migration from no schema to v1.0 (basic structure)
+MIGRATION_V1_0 = Migration(
+    version="1.0",
+    name="Initial Database Schema",
+    description="Create basic tables for correction storage",
+    forward_sql="""
+    -- Enable foreign keys
+    PRAGMA foreign_keys = ON;
+
+    -- Table: corrections
+    CREATE TABLE corrections (
+        id INTEGER PRIMARY KEY AUTOINCREMENT,
+        from_text TEXT NOT NULL,
+        to_text TEXT NOT NULL,
+        domain TEXT NOT NULL DEFAULT 'general',
+        source TEXT NOT NULL CHECK(source IN ('manual', 'learned', 'imported')),
+        confidence REAL NOT NULL DEFAULT 1.0 CHECK(confidence >= 0.0 AND confidence <= 1.0),
+        added_by TEXT,
+        added_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
+        usage_count INTEGER NOT NULL DEFAULT 0 CHECK(usage_count >= 0),
+        last_used TIMESTAMP,
+        notes TEXT,
+        is_active BOOLEAN NOT NULL DEFAULT 1,
+        UNIQUE(from_text, domain)
+    );
+
+    -- Table: correction_history
+    CREATE TABLE correction_history (
+        id INTEGER PRIMARY KEY AUTOINCREMENT,
+        filename TEXT NOT NULL,
+        domain TEXT NOT NULL,
+        run_timestamp TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
+        original_length INTEGER NOT NULL CHECK(original_length >= 0),
+        stage1_changes INTEGER NOT NULL DEFAULT 0 CHECK(stage1_changes >= 0),
+        stage2_changes INTEGER NOT NULL DEFAULT 0 CHECK(stage2_changes >= 0),
+        model TEXT,
+        execution_time_ms INTEGER CHECK(execution_time_ms >= 0),
+        success BOOLEAN NOT NULL DEFAULT 1,
+        error_message TEXT
+    );
+
+    -- Insert initial system config
+    CREATE TABLE system_config (
+        key TEXT PRIMARY KEY,
+        value TEXT NOT NULL,
+        value_type TEXT NOT NULL CHECK(value_type IN ('string', 'int', 'float', 'boolean', 'json')),
+        description TEXT,
+        updated_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP
+    );
+
+    INSERT OR IGNORE INTO system_config (key, value, value_type, description) VALUES
+        ('schema_version', '1.0', 'string', 'Database schema version'),
+        ('api_provider', 'GLM', 'string', 'API provider name'),
+        ('api_model', 'GLM-4.6', 'string', 'Default AI model');
+
+    -- Create indexes
+    CREATE INDEX idx_corrections_domain ON corrections(domain);
+    CREATE INDEX idx_corrections_source ON corrections(source);
+    CREATE INDEX idx_corrections_added_at ON corrections(added_at);
+    CREATE INDEX idx_corrections_is_active ON corrections(is_active);
+    CREATE INDEX idx_corrections_from_text ON corrections(from_text);
+    CREATE INDEX idx_history_run_timestamp ON correction_history(run_timestamp DESC);
+    CREATE INDEX idx_history_domain ON correction_history(domain);
+    CREATE INDEX idx_history_success ON correction_history(success);
+    """,
+    backward_sql="""
+    -- Drop indexes
+    DROP INDEX IF EXISTS idx_corrections_domain;
+    DROP INDEX IF EXISTS idx_corrections_source;
+    DROP INDEX IF EXISTS idx_corrections_added_at;
+    DROP INDEX IF EXISTS idx_corrections_is_active;
+    DROP INDEX IF EXISTS idx_corrections_from_text;
+    DROP INDEX IF EXISTS idx_history_run_timestamp;
+    DROP INDEX IF EXISTS idx_history_domain;
+    DROP INDEX IF EXISTS idx_history_success;
+
+    -- Drop tables
+    DROP TABLE IF EXISTS correction_history;
+    DROP TABLE IF EXISTS corrections;
+    DROP TABLE IF EXISTS system_config;
+    """,
+    dependencies=[],
+    check_function=None
+)
+
+# Migration from v1.0 to v2.0 (full schema)
+MIGRATION_V2_0 = Migration(
+    version="2.0",
+    name="Complete Schema Enhancement",
+    description="Add advanced tables for learning system and audit trail",
+    forward_sql="""
+    -- Enable foreign keys
+    PRAGMA foreign_keys = ON;
+
+    -- Add new tables
+    CREATE TABLE context_rules (
+        id INTEGER PRIMARY KEY AUTOINCREMENT,
+        pattern TEXT NOT NULL UNIQUE,
+        replacement TEXT NOT NULL,
+        description TEXT,
+        priority INTEGER NOT NULL DEFAULT 0,
+        is_active BOOLEAN NOT NULL DEFAULT 1,
+        added_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
+        added_by TEXT
+    );
+
+    CREATE TABLE correction_changes (
+        id INTEGER PRIMARY KEY AUTOINCREMENT,
+        history_id INTEGER NOT NULL,
+        line_number INTEGER,
+        from_text TEXT NOT NULL,
+        to_text TEXT NOT NULL,
+        rule_type TEXT NOT NULL CHECK(rule_type IN ('context', 'dictionary', 'ai')),
+        rule_id INTEGER,
+        context_before TEXT,
+        context_after TEXT,
+        FOREIGN KEY (history_id) REFERENCES correction_history(id) ON DELETE CASCADE
+    );
+
+    CREATE TABLE learned_suggestions (
+        id INTEGER PRIMARY KEY AUTOINCREMENT,
+        from_text TEXT NOT NULL,
+        to_text TEXT NOT NULL,
+        domain TEXT NOT NULL DEFAULT 'general',
+        frequency INTEGER NOT NULL DEFAULT 1 CHECK(frequency > 0),
+        confidence REAL NOT NULL CHECK(confidence >= 0.0 AND confidence <= 1.0),
+        first_seen TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
+        last_seen TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
+        status TEXT NOT NULL DEFAULT 'pending' CHECK(status IN ('pending', 'approved', 'rejected')),
+        reviewed_at TIMESTAMP,
+        reviewed_by TEXT,
+        UNIQUE(from_text, to_text, domain)
+    );
+
+    CREATE TABLE suggestion_examples (
+        id INTEGER PRIMARY KEY AUTOINCREMENT,
+        suggestion_id INTEGER NOT NULL,
+        filename TEXT NOT NULL,
+        line_number INTEGER,
+        context TEXT NOT NULL,
+        occurred_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
+        FOREIGN KEY (suggestion_id) REFERENCES learned_suggestions(id) ON DELETE CASCADE
+    );
+
+    CREATE TABLE audit_log (
+        id INTEGER PRIMARY KEY AUTOINCREMENT,
+        timestamp TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
+        action TEXT NOT NULL,
+        entity_type TEXT NOT NULL,
+        entity_id INTEGER,
+        user TEXT,
+        details TEXT,
+        success BOOLEAN NOT NULL DEFAULT 1,
+        error_message TEXT
+    );
+
+    -- Create indexes
+    CREATE INDEX idx_context_rules_priority ON context_rules(priority DESC);
+    CREATE INDEX idx_context_rules_is_active ON context_rules(is_active);
+    CREATE INDEX idx_changes_history_id ON correction_changes(history_id);
+    CREATE INDEX idx_changes_rule_type ON correction_changes(rule_type);
+    CREATE INDEX idx_suggestions_status ON learned_suggestions(status);
+    CREATE INDEX idx_suggestions_domain ON learned_suggestions(domain);
+    CREATE INDEX idx_suggestions_confidence ON learned_suggestions(confidence DESC);
+    CREATE INDEX idx_suggestions_frequency ON learned_suggestions(frequency DESC);
+    CREATE INDEX idx_examples_suggestion_id ON suggestion_examples(suggestion_id);
+    CREATE INDEX idx_audit_timestamp ON audit_log(timestamp DESC);
+    CREATE INDEX idx_audit_action ON audit_log(action);
+    CREATE INDEX idx_audit_entity_type ON audit_log(entity_type);
+    CREATE INDEX idx_audit_success ON audit_log(success);
+
+    -- Create views
+    CREATE VIEW active_corrections AS
+    SELECT
+        id, from_text, to_text, domain, source, confidence,
+        usage_count, last_used, added_at
+    FROM corrections
+    WHERE is_active = 1
+    ORDER BY domain, from_text;
+
+    CREATE VIEW pending_suggestions AS
+    SELECT
+        s.id, s.from_text, s.to_text, s.domain, s.frequency,
+        s.confidence, s.first_seen, s.last_seen, COUNT(e.id) as example_count
+    FROM learned_suggestions s
+    LEFT JOIN suggestion_examples e ON s.id = e.suggestion_id
+    WHERE s.status = 'pending'
+    GROUP BY s.id
+    ORDER BY s.confidence DESC, s.frequency DESC;
+
+    CREATE VIEW correction_statistics AS
+    SELECT
+        domain,
+        COUNT(*) as total_corrections,
+        COUNT(CASE WHEN source = 'manual' THEN 1 END) as manual_count,
+        COUNT(CASE WHEN source = 'learned' THEN 1 END) as learned_count,
+        COUNT(CASE WHEN source = 'imported' THEN 1 END) as imported_count,
+        SUM(usage_count) as total_usage,
+        MAX(added_at) as last_updated
+    FROM corrections
+    WHERE is_active = 1
+    GROUP BY domain;
+
+    -- Update system config
+    UPDATE system_config SET value = '2.0' WHERE key = 'schema_version';
+    INSERT OR IGNORE INTO system_config (key, value, value_type, description) VALUES
+        ('api_base_url', 'https://open.bigmodel.cn/api/anthropic', 'string', 'API endpoint URL'),
+        ('default_domain', 'general', 'string', 'Default correction domain'),
+        ('auto_learn_enabled', 'true', 'boolean', 'Enable automatic pattern learning'),
+        ('backup_enabled', 'true', 'boolean', 'Create backups before operations'),
+        ('learning_frequency_threshold', '3', 'int', 'Min frequency for learned suggestions'),
+        ('learning_confidence_threshold', '0.8', 'float', 'Min confidence for learned suggestions'),
+        ('history_retention_days', '90', 'int', 'Days to retain correction history'),
+        ('max_correction_length', '1000', 'int', 'Maximum length for correction text');
+    """,
+    backward_sql="""
+    -- Drop views
+    DROP VIEW IF EXISTS correction_statistics;
+    DROP VIEW IF EXISTS pending_suggestions;
+    DROP VIEW IF EXISTS active_corrections;
+
+    -- Drop indexes
+    DROP INDEX IF EXISTS idx_audit_success;
+    DROP INDEX IF EXISTS idx_audit_entity_type;
+    DROP INDEX IF EXISTS idx_audit_action;
+    DROP INDEX IF EXISTS idx_audit_timestamp;
+    DROP INDEX IF EXISTS idx_examples_suggestion_id;
+    DROP INDEX IF EXISTS idx_suggestions_frequency;
+    DROP INDEX IF EXISTS idx_suggestions_confidence;
+    DROP INDEX IF EXISTS idx_suggestions_domain;
+    DROP INDEX IF EXISTS idx_suggestions_status;
+    DROP INDEX IF EXISTS idx_changes_rule_type;
+    DROP INDEX IF EXISTS idx_changes_history_id;
+    DROP INDEX IF EXISTS idx_context_rules_is_active;
+    DROP INDEX IF EXISTS idx_context_rules_priority;
+
+    -- Drop tables
+    DROP TABLE IF EXISTS audit_log;
+    DROP TABLE IF EXISTS suggestion_examples;
+    DROP TABLE IF EXISTS learned_suggestions;
+    DROP TABLE IF EXISTS correction_changes;
+    DROP TABLE IF EXISTS context_rules;
+
+    -- Reset schema version
+    UPDATE system_config SET value = '1.0' WHERE key = 'schema_version';
+    DELETE FROM system_config WHERE key IN (
+        'api_base_url', 'default_domain', 'auto_learn_enabled',
+        'backup_enabled', 'learning_frequency_threshold',
+        'learning_confidence_threshold', 'history_retention_days',
+        'max_correction_length'
+    );
+    """,
+    dependencies=["1.0"],
+    check_function=_validate_schema_2_0,
+    is_breaking=False
+)
+
+# Migration from v2.0 to v2.1 (add performance optimizations)
+MIGRATION_V2_1 = Migration(
+    version="2.1",
+    name="Performance Optimizations",
+    description="Add indexes and constraints for better query performance",
+    forward_sql="""
+    -- Add composite indexes for common queries
+    CREATE INDEX idx_corrections_domain_active ON corrections(domain, is_active);
+    CREATE INDEX idx_corrections_domain_from_text ON corrections(domain, from_text);
+    CREATE INDEX idx_corrections_usage_count ON corrections(usage_count DESC);
+    CREATE INDEX idx_corrections_last_used ON corrections(last_used DESC);
+
+    -- Add indexes for learned_suggestions queries
+    CREATE INDEX idx_suggestions_domain_status ON learned_suggestions(domain, status);
+    CREATE INDEX idx_suggestions_domain_confidence ON learned_suggestions(domain, confidence DESC);
+    CREATE INDEX idx_suggestions_domain_frequency ON learned_suggestions(domain, frequency DESC);
+
+    -- Add indexes for audit_log queries
+    CREATE INDEX idx_audit_timestamp_entity ON audit_log(timestamp DESC, entity_type);
+    CREATE INDEX idx_audit_entity_type_id ON audit_log(entity_type, entity_id);
+
+    -- Add composite indexes for history queries
+    CREATE INDEX idx_history_domain_timestamp ON correction_history(domain, run_timestamp DESC);
+    CREATE INDEX idx_history_domain_success ON correction_history(domain, success, run_timestamp DESC);
+
+    -- Add index for frequently joined tables
+    CREATE INDEX idx_changes_history_rule_type ON correction_changes(history_id, rule_type);
+
+    -- Update system config
+    INSERT OR IGNORE INTO system_config (key, value, value_type, description) VALUES
+        ('performance_optimization_applied', 'true', 'boolean', 'Performance optimization v2.1 applied');
+    """,
+    backward_sql="""
+    -- Drop indexes
+    DROP INDEX IF EXISTS idx_changes_history_rule_type;
+    DROP INDEX IF EXISTS idx_history_domain_success;
+    DROP INDEX IF EXISTS idx_history_domain_timestamp;
+    DROP INDEX IF EXISTS idx_audit_entity_type_id;
+    DROP INDEX IF EXISTS idx_audit_timestamp_entity;
+    DROP INDEX IF EXISTS idx_suggestions_domain_frequency;
+    DROP INDEX IF EXISTS idx_suggestions_domain_confidence;
+    DROP INDEX IF EXISTS idx_suggestions_domain_status;
+    DROP INDEX IF EXISTS idx_corrections_last_used;
+    DROP INDEX IF EXISTS idx_corrections_usage_count;
+    DROP INDEX IF EXISTS idx_corrections_domain_from_text;
+    DROP INDEX IF EXISTS idx_corrections_domain_active;
+
+    -- Remove system config
+    DELETE FROM system_config WHERE key = 'performance_optimization_applied';
+    """,
+    dependencies=["2.0"],
+    check_function=None,
+    is_breaking=False
+)
+
+# Migration from v2.1 to v2.2 (add data retention policies)
+MIGRATION_V2_2 = Migration(
+    version="2.2",
+    name="Data Retention Policies",
+    description="Add retention policies and automatic cleanup mechanisms",
+    forward_sql="""
+    -- Add retention_policy table
+    CREATE TABLE retention_policies (
+        id INTEGER PRIMARY KEY AUTOINCREMENT,
+        entity_type TEXT NOT NULL CHECK(entity_type IN ('corrections', 'history', 'audits', 'suggestions')),
+        retention_days INTEGER NOT NULL CHECK(retention_days > 0),
+        is_active BOOLEAN NOT NULL DEFAULT 1,
+        created_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
+        updated_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
+        description TEXT
+    );
+
+    -- Insert default retention policies
+    INSERT INTO retention_policies (entity_type, retention_days, is_active, description) VALUES
+        ('history', 90, 1, 'Keep correction history for 90 days'),
+        ('audits', 180, 1, 'Keep audit logs for 180 days'),
+        ('suggestions', 30, 1, 'Keep rejected suggestions for 30 days'),
+        ('corrections', 365, 0, 'Keep all corrections by default');
+
+    -- Add cleanup_history table
+    CREATE TABLE cleanup_history (
+        id INTEGER PRIMARY KEY AUTOINCREMENT,
+        cleanup_date TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
+        entity_type TEXT NOT NULL,
+        records_deleted INTEGER NOT NULL CHECK(records_deleted >= 0),
+        execution_time_ms INTEGER NOT NULL CHECK(execution_time_ms >= 0),
+        success BOOLEAN NOT NULL DEFAULT 1,
+        error_message TEXT
+    );
+
+    -- Create indexes
+    CREATE INDEX idx_retention_entity_type ON retention_policies(entity_type);
+    CREATE INDEX idx_retention_is_active ON retention_policies(is_active);
+    CREATE INDEX idx_cleanup_date ON cleanup_history(cleanup_date DESC);
+
+    -- Update system config
+    INSERT OR IGNORE INTO system_config (key, value, value_type, description) VALUES
+        ('retention_cleanup_enabled', 'true', 'boolean', 'Enable automatic retention cleanup'),
+        ('retention_cleanup_hour', '2', 'int', 'Hour of day to run cleanup (0-23)'),
+        ('last_retention_cleanup', '', 'string', 'Timestamp of last retention cleanup');
+    """,
+    backward_sql="""
+    -- Drop retention cleanup tables
+    DROP TABLE IF EXISTS cleanup_history;
+    DROP TABLE IF EXISTS retention_policies;
+
+    -- Remove system config
+    DELETE FROM system_config WHERE key IN (
+        'retention_cleanup_enabled',
+        'retention_cleanup_hour',
+        'last_retention_cleanup'
+    );
+    """,
+    dependencies=["2.1"],
+    check_function=None,
+    is_breaking=False
+)
+
+# Registry of all migrations
+# Order matters - add new migrations at the end
+ALL_MIGRATIONS = [
+    MIGRATION_V1_0,
+    MIGRATION_V2_0,
+    MIGRATION_V2_1,
+    MIGRATION_V2_2,
+]
+
+# Migration registry by version
+MIGRATION_REGISTRY = {m.version: m for m in ALL_MIGRATIONS}
+
+# Latest version
+LATEST_VERSION = max(MIGRATION_REGISTRY.keys(), key=lambda v: tuple(map(int, v.split('.'))))
+
+
+def get_migration(version: str) -> Migration:
+    """Get migration by version"""
+    if version not in MIGRATION_REGISTRY:
+        raise ValueError(f"Migration version {version} not found")
+    return MIGRATION_REGISTRY[version]
+
+
+def get_migrations_up_to(target_version: str) -> list[Migration]:
+    """Get all migrations up to target version"""
+    versions = sorted(MIGRATION_REGISTRY.keys(), key=lambda v: tuple(map(int, v.split('.'))))
+    result = []
+    for version in versions:
+        if version <= target_version:
+            result.append(MIGRATION_REGISTRY[version])
+    return result
+
+
+def get_migrations_from(from_version: str) -> list[Migration]:
+    """Get all migrations from version onwards"""
+    versions = sorted(MIGRATION_REGISTRY.keys(), key=lambda v: tuple(map(int, v.split('.'))))
+    result = []
+    for version in versions:
+        if version > from_version:
+            result.append(MIGRATION_REGISTRY[version])
+    return result
--- a/transcript-fixer/scripts/utils/path_validator.py
+++ b/transcript-fixer/scripts/utils/path_validator.py
@@ -0,0 +1,478 @@
+#!/usr/bin/env python3
+"""
+Path Validation and Security
+
+CRITICAL FIX: Prevents path traversal and symlink attacks
+ISSUE: Critical-5 in Engineering Excellence Plan
+
+This module provides:
+1. Path whitelist validation
+2. Path traversal prevention (../)
+3. Symlink attack detection
+4. File extension validation
+5. Directory containment checks
+
+Author: Chief Engineer
+Date: 2025-10-28
+Priority: P0 - Critical
+"""
+
+from __future__ import annotations
+
+import os
+from pathlib import Path
+from typing import Set, Optional, Final, List
+import logging
+
+logger = logging.getLogger(__name__)
+
+# Allowed base directories (whitelist)
+# Only files under these directories can be accessed
+ALLOWED_BASE_DIRS: Final[Set[Path]] = {
+    Path.home() / ".transcript-fixer",  # Config/data directory
+    Path.home() / "Downloads",           # Common download location
+    Path.home() / "Documents",           # Common documents location
+    Path.home() / "Desktop",             # Desktop files
+    Path("/tmp"),                        # Temporary files
+}
+
+# Allowed file extensions for reading
+ALLOWED_READ_EXTENSIONS: Final[Set[str]] = {
+    '.md',      # Markdown
+    '.txt',     # Text
+    '.html',    # HTML output
+    '.json',    # JSON config
+    '.sql',     # SQL schema
+}
+
+# Allowed file extensions for writing
+ALLOWED_WRITE_EXTENSIONS: Final[Set[str]] = {
+    '.md',      # Markdown output
+    '.html',    # HTML diff
+    '.db',      # SQLite database
+    '.log',     # Log files
+}
+
+# Dangerous patterns to reject
+DANGEROUS_PATTERNS: Final[List[str]] = [
+    '..',      # Parent directory traversal
+    '\x00',    # Null byte
+    '\n',      # Newline injection
+    '\r',      # Carriage return injection
+]
+
+
+class PathValidationError(Exception):
+    """Path validation failed"""
+    pass
+
+
+class PathValidator:
+    """
+    Validates file paths for security.
+
+    Prevents:
+    - Path traversal attacks (../)
+    - Symlink attacks
+    - Access outside whitelisted directories
+    - Dangerous file types
+    - Null byte injection
+
+    Usage:
+        validator = PathValidator()
+        safe_path = validator.validate_input_path("/path/to/file.md")
+        safe_output = validator.validate_output_path("/path/to/output.md")
+    """
+
+    def __init__(
+        self,
+        allowed_base_dirs: Optional[Set[Path]] = None,
+        allowed_read_extensions: Optional[Set[str]] = None,
+        allowed_write_extensions: Optional[Set[str]] = None,
+        allow_symlinks: bool = False
+    ):
+        """
+        Initialize path validator.
+
+        Args:
+            allowed_base_dirs: Whitelist of allowed base directories
+            allowed_read_extensions: Allowed file extensions for reading
+            allowed_write_extensions: Allowed file extensions for writing
+            allow_symlinks: Allow symlinks (default: False for security)
+        """
+        self.allowed_base_dirs = allowed_base_dirs or ALLOWED_BASE_DIRS
+        self.allowed_read_extensions = allowed_read_extensions or ALLOWED_READ_EXTENSIONS
+        self.allowed_write_extensions = allowed_write_extensions or ALLOWED_WRITE_EXTENSIONS
+        self.allow_symlinks = allow_symlinks
+
+    def _check_dangerous_patterns(self, path_str: str) -> None:
+        """
+        Check for dangerous patterns in path string.
+
+        Args:
+            path_str: Path string to check
+
+        Raises:
+            PathValidationError: If dangerous pattern found
+        """
+        for pattern in DANGEROUS_PATTERNS:
+            if pattern in path_str:
+                raise PathValidationError(
+                    f"Dangerous pattern '{pattern}' detected in path: {path_str}"
+                )
+
+    def _is_under_allowed_directory(self, path: Path) -> bool:
+        """
+        Check if path is under any allowed base directory.
+
+        Args:
+            path: Resolved path to check
+
+        Returns:
+            True if path is under allowed directory
+        """
+        for allowed_dir in self.allowed_base_dirs:
+            try:
+                # Check if path is relative to allowed_dir
+                path.relative_to(allowed_dir)
+                return True
+            except ValueError:
+                # Not relative to this allowed_dir
+                continue
+
+        return False
+
+    def _check_symlink(self, path: Path) -> None:
+        """
+        Check for symlink attacks.
+
+        Args:
+            path: Path to check
+
+        Raises:
+            PathValidationError: If symlink detected and not allowed
+        """
+        if not self.allow_symlinks and path.is_symlink():
+            raise PathValidationError(
+                f"Symlink detected and not allowed: {path}"
+            )
+
+        # Check parent directories for symlinks (but stop at system dirs)
+        if not self.allow_symlinks:
+            current = path.parent
+
+            # Stop checking at common system directories (they may be symlinks on macOS)
+            system_dirs = {Path('/'), Path('/usr'), Path('/etc'), Path('/var')}
+
+            while current != current.parent:  # Until root
+                if current in system_dirs:
+                    break
+
+                if current.is_symlink():
+                    raise PathValidationError(
+                        f"Symlink in path hierarchy detected: {current}"
+                    )
+                current = current.parent
+
+    def _validate_extension(
+        self,
+        path: Path,
+        allowed_extensions: Set[str],
+        operation: str
+    ) -> None:
+        """
+        Validate file extension.
+
+        Args:
+            path: Path to validate
+            allowed_extensions: Set of allowed extensions
+            operation: Operation name (for error message)
+
+        Raises:
+            PathValidationError: If extension not allowed
+        """
+        extension = path.suffix.lower()
+
+        if extension not in allowed_extensions:
+            raise PathValidationError(
+                f"File extension '{extension}' not allowed for {operation}. "
+                f"Allowed: {sorted(allowed_extensions)}"
+            )
+
+    def validate_input_path(self, path_str: str) -> Path:
+        """
+        Validate an input file path for reading.
+
+        Security checks:
+        1. No dangerous patterns (.., null bytes, etc.)
+        2. Path resolves to absolute path
+        3. No symlinks (unless explicitly allowed)
+        4. Under allowed base directory
+        5. Allowed file extension for reading
+        6. File exists
+
+        Args:
+            path_str: Path string to validate
+
+        Returns:
+            Validated, resolved Path object
+
+        Raises:
+            PathValidationError: If validation fails
+
+        Example:
+            >>> validator = PathValidator()
+            >>> safe_path = validator.validate_input_path("~/Documents/file.md")
+            >>> # Returns: Path('/home/username/Documents/file.md') or similar
+        """
+        # Check dangerous patterns in raw string
+        self._check_dangerous_patterns(path_str)
+
+        # Convert to Path (but don't resolve yet - need to check symlinks first)
+        try:
+            path = Path(path_str).expanduser().absolute()
+        except Exception as e:
+            raise PathValidationError(f"Invalid path format: {path_str}") from e
+
+        # Check if file exists
+        if not path.exists():
+            raise PathValidationError(f"File does not exist: {path}")
+
+        # Check if it's a file (not directory)
+        if not path.is_file():
+            raise PathValidationError(f"Path is not a file: {path}")
+
+        # CRITICAL: Check for symlinks BEFORE resolving
+        self._check_symlink(path)
+
+        # Now resolve to get canonical path
+        path = path.resolve()
+
+        # Check if under allowed directory
+        if not self._is_under_allowed_directory(path):
+            raise PathValidationError(
+                f"Path not under allowed directories: {path}\n"
+                f"Allowed directories: {[str(d) for d in self.allowed_base_dirs]}"
+            )
+
+        # Check file extension
+        self._validate_extension(path, self.allowed_read_extensions, "reading")
+
+        logger.info(f"Input path validated: {path}")
+        return path
+
+    def validate_output_path(self, path_str: str, create_parent: bool = True) -> Path:
+        """
+        Validate an output file path for writing.
+
+        Security checks:
+        1. No dangerous patterns
+        2. Path resolves to absolute path
+        3. No symlinks in path hierarchy
+        4. Under allowed base directory
+        5. Allowed file extension for writing
+        6. Parent directory exists or can be created
+
+        Args:
+            path_str: Path string to validate
+            create_parent: Create parent directory if it doesn't exist
+
+        Returns:
+            Validated, resolved Path object
+
+        Raises:
+            PathValidationError: If validation fails
+
+        Example:
+            >>> validator = PathValidator()
+            >>> safe_path = validator.validate_output_path("~/Documents/output.md")
+            >>> # Returns: Path('/home/username/Documents/output.md') or similar
+        """
+        # Check dangerous patterns
+        self._check_dangerous_patterns(path_str)
+
+        # Convert to Path and resolve
+        try:
+            path = Path(path_str).expanduser().resolve()
+        except Exception as e:
+            raise PathValidationError(f"Invalid path format: {path_str}") from e
+
+        # Check parent directory exists
+        parent = path.parent
+        if not parent.exists():
+            if create_parent:
+                # Validate parent directory first
+                if not self._is_under_allowed_directory(parent):
+                    raise PathValidationError(
+                        f"Parent directory not under allowed directories: {parent}"
+                    )
+                try:
+                    parent.mkdir(parents=True, exist_ok=True)
+                    logger.info(f"Created parent directory: {parent}")
+                except Exception as e:
+                    raise PathValidationError(
+                        f"Failed to create parent directory: {parent}"
+                    ) from e
+            else:
+                raise PathValidationError(f"Parent directory does not exist: {parent}")
+
+        # Check for symlinks in path hierarchy (but file itself doesn't exist yet)
+        if not self.allow_symlinks:
+            current = parent
+            while current != current.parent:
+                if current.is_symlink():
+                    raise PathValidationError(
+                        f"Symlink in path hierarchy: {current}"
+                    )
+                current = current.parent
+
+        # Check if under allowed directory
+        if not self._is_under_allowed_directory(path):
+            raise PathValidationError(
+                f"Path not under allowed directories: {path}\n"
+                f"Allowed directories: {[str(d) for d in self.allowed_base_dirs]}"
+            )
+
+        # Check file extension
+        self._validate_extension(path, self.allowed_write_extensions, "writing")
+
+        logger.info(f"Output path validated: {path}")
+        return path
+
+    def add_allowed_directory(self, directory: str | Path) -> None:
+        """
+        Add a directory to the whitelist.
+
+        Args:
+            directory: Directory path to add
+
+        Example:
+            >>> validator.add_allowed_directory("/home/username/Projects")
+        """
+        dir_path = Path(directory).expanduser().resolve()
+        self.allowed_base_dirs.add(dir_path)
+        logger.info(f"Added allowed directory: {dir_path}")
+
+    def is_path_safe(self, path_str: str, for_writing: bool = False) -> bool:
+        """
+        Check if a path is safe without raising exceptions.
+
+        Args:
+            path_str: Path to check
+            for_writing: Check for writing (vs reading)
+
+        Returns:
+            True if path is safe
+
+        Example:
+            >>> if validator.is_path_safe("~/Documents/file.md"):
+            ...     process_file()
+        """
+        try:
+            if for_writing:
+                self.validate_output_path(path_str, create_parent=False)
+            else:
+                self.validate_input_path(path_str)
+            return True
+        except PathValidationError:
+            return False
+
+
+# Global validator instance
+_global_validator: Optional[PathValidator] = None
+
+
+def get_validator() -> PathValidator:
+    """
+    Get global validator instance.
+
+    Returns:
+        Global PathValidator instance
+
+    Example:
+        >>> validator = get_validator()
+        >>> safe_path = validator.validate_input_path("file.md")
+    """
+    global _global_validator
+    if _global_validator is None:
+        _global_validator = PathValidator()
+    return _global_validator
+
+
+# Convenience functions
+def validate_input_path(path_str: str) -> Path:
+    """Validate input path using global validator"""
+    return get_validator().validate_input_path(path_str)
+
+
+def validate_output_path(path_str: str, create_parent: bool = True) -> Path:
+    """Validate output path using global validator"""
+    return get_validator().validate_output_path(path_str, create_parent)
+
+
+def add_allowed_directory(directory: str | Path) -> None:
+    """Add allowed directory to global validator"""
+    get_validator().add_allowed_directory(directory)
+
+
+# Example usage and testing
+if __name__ == "__main__":
+    import logging
+    logging.basicConfig(level=logging.INFO)
+
+    print("=== Testing PathValidator ===\n")
+
+    validator = PathValidator()
+
+    # Test 1: Valid input path (create a test file first)
+    print("Test 1: Valid input path")
+    test_file = Path.home() / "Documents" / "test.md"
+    test_file.parent.mkdir(parents=True, exist_ok=True)
+    test_file.write_text("test")
+
+    try:
+        result = validator.validate_input_path(str(test_file))
+        print(f"✓ Valid: {result}\n")
+    except PathValidationError as e:
+        print(f"✗ Failed: {e}\n")
+
+    # Test 2: Path traversal attack
+    print("Test 2: Path traversal attack")
+    try:
+        result = validator.validate_input_path("../../etc/passwd")
+        print(f"✗ Should have failed: {result}\n")
+    except PathValidationError as e:
+        print(f"✓ Correctly rejected: {e}\n")
+
+    # Test 3: Invalid extension
+    print("Test 3: Invalid extension")
+    dangerous_file = Path.home() / "Documents" / "script.sh"
+    dangerous_file.write_text("#!/bin/bash")
+
+    try:
+        result = validator.validate_input_path(str(dangerous_file))
+        print(f"✗ Should have failed: {result}\n")
+    except PathValidationError as e:
+        print(f"✓ Correctly rejected: {e}\n")
+
+    # Test 4: Valid output path
+    print("Test 4: Valid output path")
+    try:
+        result = validator.validate_output_path(str(Path.home() / "Documents" / "output.html"))
+        print(f"✓ Valid: {result}\n")
+    except PathValidationError as e:
+        print(f"✗ Failed: {e}\n")
+
+    # Test 5: Null byte injection
+    print("Test 5: Null byte injection")
+    try:
+        result = validator.validate_input_path("file.md\x00.txt")
+        print(f"✗ Should have failed: {result}\n")
+    except PathValidationError as e:
+        print(f"✓ Correctly rejected: {e}\n")
+
+    # Cleanup
+    test_file.unlink(missing_ok=True)
+    dangerous_file.unlink(missing_ok=True)
+
+    print("=== All tests completed ===")
--- a/transcript-fixer/scripts/utils/rate_limiter.py
+++ b/transcript-fixer/scripts/utils/rate_limiter.py
@@ -0,0 +1,441 @@
+#!/usr/bin/env python3
+"""
+Rate Limiting Module
+
+CRITICAL FIX (P1-8): Production-grade rate limiting for API protection
+
+Features:
+- Token Bucket algorithm (smooth rate limiting)
+- Sliding Window algorithm (precise rate limiting)
+- Fixed Window algorithm (simple, memory-efficient)
+- Thread-safe operations
+- Burst support
+- Multiple rate limit tiers
+- Metrics integration
+
+Use cases:
+- API rate limiting (e.g., 100 requests/minute)
+- Resource protection (e.g., max 5 concurrent DB connections)
+- DoS prevention
+- Cost control (e.g., limit API calls)
+"""
+
+from __future__ import annotations
+
+import logging
+import threading
+import time
+from collections import deque
+from dataclasses import dataclass
+from enum import Enum
+from typing import Optional, Deque, Final
+from contextlib import contextmanager
+
+logger = logging.getLogger(__name__)
+
+
+class RateLimitStrategy(Enum):
+    """Rate limiting strategy"""
+    TOKEN_BUCKET = "token_bucket"
+    SLIDING_WINDOW = "sliding_window"
+    FIXED_WINDOW = "fixed_window"
+
+
+@dataclass
+class RateLimitConfig:
+    """Rate limit configuration"""
+    max_requests: int           # Maximum requests allowed
+    window_seconds: float       # Time window in seconds
+    strategy: RateLimitStrategy = RateLimitStrategy.TOKEN_BUCKET
+    burst_size: Optional[int] = None  # Burst allowance (for token bucket)
+
+    def __post_init__(self):
+        """Validate configuration"""
+        if self.max_requests <= 0:
+            raise ValueError("max_requests must be positive")
+        if self.window_seconds <= 0:
+            raise ValueError("window_seconds must be positive")
+
+        # Default burst size = max_requests (allow full burst)
+        if self.burst_size is None:
+            self.burst_size = self.max_requests
+
+
+class RateLimitExceeded(Exception):
+    """Raised when rate limit is exceeded"""
+    def __init__(self, message: str, retry_after: float):
+        super().__init__(message)
+        self.retry_after = retry_after  # Seconds to wait before retry
+
+
+class TokenBucketLimiter:
+    """
+    Token Bucket algorithm implementation.
+
+    Properties:
+    - Smooth rate limiting
+    - Allows bursts up to bucket capacity
+    - Memory efficient (O(1))
+    - Fast (O(1) per request)
+
+    Use for: API rate limiting, general request throttling
+    """
+
+    def __init__(self, config: RateLimitConfig):
+        self.config = config
+        self.capacity = config.burst_size or config.max_requests
+        self.refill_rate = config.max_requests / config.window_seconds
+
+        self._tokens = float(self.capacity)
+        self._last_refill = time.time()
+        self._lock = threading.Lock()
+
+        logger.debug(
+            f"TokenBucket initialized: capacity={self.capacity}, "
+            f"refill_rate={self.refill_rate:.2f}/s"
+        )
+
+    def _refill(self) -> None:
+        """Refill tokens based on elapsed time"""
+        now = time.time()
+        elapsed = now - self._last_refill
+
+        # Add tokens based on time elapsed
+        tokens_to_add = elapsed * self.refill_rate
+        self._tokens = min(self.capacity, self._tokens + tokens_to_add)
+        self._last_refill = now
+
+    def acquire(self, tokens: int = 1, blocking: bool = True, timeout: Optional[float] = None) -> bool:
+        """
+        Acquire tokens from bucket.
+
+        Args:
+            tokens: Number of tokens to acquire (default: 1)
+            blocking: If True, wait for tokens. If False, return immediately
+            timeout: Maximum time to wait (seconds). None = wait forever
+
+        Returns:
+            True if tokens acquired, False if rate limit exceeded (non-blocking only)
+
+        Raises:
+            RateLimitExceeded: If rate limit exceeded in blocking mode
+        """
+        if tokens <= 0:
+            raise ValueError("tokens must be positive")
+
+        start_time = time.time()
+
+        while True:
+            with self._lock:
+                self._refill()
+
+                if self._tokens >= tokens:
+                    # Sufficient tokens available
+                    self._tokens -= tokens
+                    return True
+
+                if not blocking:
+                    # Non-blocking mode - return immediately
+                    return False
+
+                # Calculate retry_after
+                tokens_needed = tokens - self._tokens
+                retry_after = tokens_needed / self.refill_rate
+
+            # Check timeout
+            if timeout is not None:
+                elapsed = time.time() - start_time
+                if elapsed >= timeout:
+                    raise RateLimitExceeded(
+                        f"Rate limit exceeded: need {tokens} tokens, have {self._tokens:.1f}",
+                        retry_after=retry_after
+                    )
+
+            # Wait before retry (but not longer than needed or timeout)
+            wait_time = min(retry_after, 0.1)  # Check at least every 100ms
+            if timeout is not None:
+                remaining_timeout = timeout - (time.time() - start_time)
+                wait_time = min(wait_time, remaining_timeout)
+
+            if wait_time > 0:
+                time.sleep(wait_time)
+
+    def get_available_tokens(self) -> float:
+        """Get current number of available tokens"""
+        with self._lock:
+            self._refill()
+            return self._tokens
+
+    def reset(self) -> None:
+        """Reset to full capacity"""
+        with self._lock:
+            self._tokens = float(self.capacity)
+            self._last_refill = time.time()
+
+
+class SlidingWindowLimiter:
+    """
+    Sliding Window algorithm implementation.
+
+    Properties:
+    - Precise rate limiting
+    - No "boundary problem" (unlike fixed window)
+    - Memory: O(max_requests)
+    - Fast: O(n) per request, where n = requests in window
+
+    Use for: Strict rate limits, billing, quota enforcement
+    """
+
+    def __init__(self, config: RateLimitConfig):
+        self.config = config
+        self.max_requests = config.max_requests
+        self.window_seconds = config.window_seconds
+
+        self._timestamps: Deque[float] = deque()
+        self._lock = threading.Lock()
+
+        logger.debug(
+            f"SlidingWindow initialized: max_requests={self.max_requests}, "
+            f"window={self.window_seconds}s"
+        )
+
+    def _cleanup_old_timestamps(self, now: float) -> None:
+        """Remove timestamps outside the window"""
+        cutoff = now - self.window_seconds
+        while self._timestamps and self._timestamps[0] < cutoff:
+            self._timestamps.popleft()
+
+    def acquire(self, tokens: int = 1, blocking: bool = True, timeout: Optional[float] = None) -> bool:
+        """
+        Acquire tokens (check if request allowed).
+
+        Args:
+            tokens: Number of requests to make (usually 1)
+            blocking: If True, wait for capacity. If False, return immediately
+            timeout: Maximum time to wait (seconds)
+
+        Returns:
+            True if allowed, False if rate limit exceeded (non-blocking only)
+
+        Raises:
+            RateLimitExceeded: If rate limit exceeded in blocking mode
+        """
+        if tokens <= 0:
+            raise ValueError("tokens must be positive")
+
+        start_time = time.time()
+
+        while True:
+            now = time.time()
+
+            with self._lock:
+                self._cleanup_old_timestamps(now)
+
+                current_count = len(self._timestamps)
+
+                if current_count + tokens <= self.max_requests:
+                    # Allowed - record timestamps
+                    for _ in range(tokens):
+                        self._timestamps.append(now)
+                    return True
+
+                if not blocking:
+                    # Non-blocking mode
+                    return False
+
+                # Calculate retry_after (when oldest request falls out of window)
+                if self._timestamps:
+                    oldest = self._timestamps[0]
+                    retry_after = oldest + self.window_seconds - now
+                else:
+                    retry_after = 0.1
+
+            # Check timeout
+            if timeout is not None:
+                elapsed = time.time() - start_time
+                if elapsed >= timeout:
+                    raise RateLimitExceeded(
+                        f"Rate limit exceeded: {current_count}/{self.max_requests} "
+                        f"requests in {self.window_seconds}s window",
+                        retry_after=max(retry_after, 0.1)
+                    )
+
+            # Wait before retry
+            wait_time = min(retry_after, 0.1)
+            if timeout is not None:
+                remaining_timeout = timeout - (time.time() - start_time)
+                wait_time = min(wait_time, remaining_timeout)
+
+            if wait_time > 0:
+                time.sleep(wait_time)
+
+    def get_current_count(self) -> int:
+        """Get current request count in window"""
+        with self._lock:
+            self._cleanup_old_timestamps(time.time())
+            return len(self._timestamps)
+
+    def reset(self) -> None:
+        """Reset (clear all timestamps)"""
+        with self._lock:
+            self._timestamps.clear()
+
+
+class RateLimiter:
+    """
+    Main rate limiter with configurable strategy.
+
+    CRITICAL FIX (P1-8): Thread-safe rate limiting for production use
+    """
+
+    def __init__(self, config: RateLimitConfig):
+        self.config = config
+
+        # Select implementation based on strategy
+        if config.strategy == RateLimitStrategy.TOKEN_BUCKET:
+            self._impl = TokenBucketLimiter(config)
+        elif config.strategy == RateLimitStrategy.SLIDING_WINDOW:
+            self._impl = SlidingWindowLimiter(config)
+        else:
+            raise ValueError(f"Unsupported strategy: {config.strategy}")
+
+        logger.info(
+            f"RateLimiter created: {config.strategy.value}, "
+            f"{config.max_requests}/{config.window_seconds}s"
+        )
+
+    def acquire(self, tokens: int = 1, blocking: bool = True, timeout: Optional[float] = None) -> bool:
+        """
+        Acquire permission to proceed.
+
+        Args:
+            tokens: Number of requests (default: 1)
+            blocking: Wait for availability (default: True)
+            timeout: Maximum wait time in seconds (default: None = forever)
+
+        Returns:
+            True if allowed, False if rate limit exceeded (non-blocking only)
+
+        Raises:
+            RateLimitExceeded: If rate limit exceeded in blocking mode
+        """
+        return self._impl.acquire(tokens=tokens, blocking=blocking, timeout=timeout)
+
+    @contextmanager
+    def limit(self, tokens: int = 1):
+        """
+        Context manager for rate-limited operations.
+
+        Usage:
+            with rate_limiter.limit():
+                # Make API call
+                response = client.post(...)
+
+        Raises:
+            RateLimitExceeded: If rate limit exceeded
+        """
+        self.acquire(tokens=tokens, blocking=True)
+        try:
+            yield
+        finally:
+            pass  # Tokens already consumed
+
+    def check_available(self) -> bool:
+        """Check if capacity available (non-blocking)"""
+        return self.acquire(tokens=1, blocking=False)
+
+    def reset(self) -> None:
+        """Reset rate limiter state"""
+        self._impl.reset()
+
+    def get_info(self) -> dict:
+        """Get current rate limiter information"""
+        info = {
+            'strategy': self.config.strategy.value,
+            'max_requests': self.config.max_requests,
+            'window_seconds': self.config.window_seconds,
+        }
+
+        if isinstance(self._impl, TokenBucketLimiter):
+            info['available_tokens'] = self._impl.get_available_tokens()
+            info['capacity'] = self._impl.capacity
+        elif isinstance(self._impl, SlidingWindowLimiter):
+            info['current_count'] = self._impl.get_current_count()
+
+        return info
+
+
+# Predefined rate limit configurations
+class RateLimitPresets:
+    """Common rate limit configurations"""
+
+    # API rate limits
+    API_CONSERVATIVE = RateLimitConfig(
+        max_requests=10,
+        window_seconds=60.0,
+        strategy=RateLimitStrategy.TOKEN_BUCKET
+    )
+
+    API_MODERATE = RateLimitConfig(
+        max_requests=60,
+        window_seconds=60.0,
+        strategy=RateLimitStrategy.TOKEN_BUCKET
+    )
+
+    API_AGGRESSIVE = RateLimitConfig(
+        max_requests=100,
+        window_seconds=60.0,
+        strategy=RateLimitStrategy.TOKEN_BUCKET
+    )
+
+    # Burst limits
+    BURST_ALLOWED = RateLimitConfig(
+        max_requests=50,
+        window_seconds=60.0,
+        burst_size=100,  # Allow double burst
+        strategy=RateLimitStrategy.TOKEN_BUCKET
+    )
+
+    # Strict limits (sliding window)
+    STRICT_LIMIT = RateLimitConfig(
+        max_requests=100,
+        window_seconds=60.0,
+        strategy=RateLimitStrategy.SLIDING_WINDOW
+    )
+
+
+# Global rate limiters
+_global_limiters: dict[str, RateLimiter] = {}
+_limiters_lock = threading.Lock()
+
+
+def get_rate_limiter(name: str, config: Optional[RateLimitConfig] = None) -> RateLimiter:
+    """
+    Get or create a named rate limiter.
+
+    Args:
+        name: Unique name for this rate limiter
+        config: Rate limit configuration (required if creating new)
+
+    Returns:
+        RateLimiter instance
+    """
+    global _global_limiters
+
+    with _limiters_lock:
+        if name not in _global_limiters:
+            if config is None:
+                raise ValueError(f"Rate limiter '{name}' not found and no config provided")
+
+            _global_limiters[name] = RateLimiter(config)
+            logger.info(f"Created global rate limiter: {name}")
+
+        return _global_limiters[name]
+
+
+def reset_all_limiters() -> None:
+    """Reset all global rate limiters (mainly for testing)"""
+    with _limiters_lock:
+        for limiter in _global_limiters.values():
+            limiter.reset()
+        logger.info("Reset all rate limiters")
--- a/transcript-fixer/scripts/utils/retry_logic.py
+++ b/transcript-fixer/scripts/utils/retry_logic.py
@@ -0,0 +1,377 @@
+#!/usr/bin/env python3
+"""
+Retry Logic with Exponential Backoff
+
+CRITICAL FIX: Implements retry for transient failures
+ISSUE: Critical-4 in Engineering Excellence Plan
+
+This module provides:
+1. Exponential backoff retry logic
+2. Error categorization (transient vs permanent)
+3. Configurable retry strategies
+4. Async retry support
+
+Author: Chief Engineer
+Date: 2025-10-28
+Priority: P0 - Critical
+"""
+
+from __future__ import annotations
+
+import asyncio
+import logging
+import time
+from typing import TypeVar, Callable, Any, Optional, Set
+from functools import wraps
+from dataclasses import dataclass
+import httpx
+
+logger = logging.getLogger(__name__)
+
+T = TypeVar('T')
+
+
+@dataclass
+class RetryConfig:
+    """
+    Configuration for retry behavior.
+
+    Attributes:
+        max_attempts: Maximum number of retry attempts (default: 3)
+        base_delay: Initial delay between retries in seconds (default: 1.0)
+        max_delay: Maximum delay between retries in seconds (default: 60.0)
+        exponential_base: Multiplier for exponential backoff (default: 2.0)
+        jitter: Add randomness to avoid thundering herd (default: True)
+    """
+    max_attempts: int = 3
+    base_delay: float = 1.0
+    max_delay: float = 60.0
+    exponential_base: float = 2.0
+    jitter: bool = True
+
+
+# Transient errors that should be retried
+TRANSIENT_EXCEPTIONS: Set[type] = {
+    # Network errors
+    httpx.ConnectTimeout,
+    httpx.ReadTimeout,
+    httpx.WriteTimeout,
+    httpx.PoolTimeout,
+    httpx.ConnectError,
+    httpx.ReadError,
+    httpx.WriteError,
+
+    # HTTP status codes (will check separately)
+    # 408 Request Timeout
+    # 429 Too Many Requests
+    # 500 Internal Server Error
+    # 502 Bad Gateway
+    # 503 Service Unavailable
+    # 504 Gateway Timeout
+}
+
+# Status codes that indicate transient failures
+TRANSIENT_STATUS_CODES: Set[int] = {
+    408,  # Request Timeout
+    429,  # Too Many Requests
+    500,  # Internal Server Error
+    502,  # Bad Gateway
+    503,  # Service Unavailable
+    504,  # Gateway Timeout
+}
+
+# Permanent errors that should NOT be retried
+PERMANENT_EXCEPTIONS: Set[type] = {
+    # Authentication/Authorization
+    httpx.HTTPStatusError,  # Will check status code
+
+    # Validation errors
+    ValueError,
+    KeyError,
+    TypeError,
+}
+
+
+def is_transient_error(exception: Exception) -> bool:
+    """
+    Determine if an exception represents a transient failure.
+
+    Transient errors:
+    - Network timeouts
+    - Connection errors
+    - Server overload (429, 503)
+    - Temporary server errors (500, 502, 504)
+
+    Permanent errors:
+    - Authentication failures (401, 403)
+    - Not found (404)
+    - Validation errors (400, 422)
+
+    Args:
+        exception: Exception to categorize
+
+    Returns:
+        True if error is transient and should be retried
+    """
+    # Check exception type
+    if type(exception) in TRANSIENT_EXCEPTIONS:
+        return True
+
+    # Check HTTP status codes
+    if isinstance(exception, httpx.HTTPStatusError):
+        return exception.response.status_code in TRANSIENT_STATUS_CODES
+
+    # Default: treat as permanent
+    return False
+
+
+def calculate_delay(
+    attempt: int,
+    config: RetryConfig
+) -> float:
+    """
+    Calculate delay for exponential backoff.
+
+    Formula: min(base_delay * (exponential_base ** attempt), max_delay)
+    With optional jitter to avoid thundering herd.
+
+    Args:
+        attempt: Current attempt number (0-indexed)
+        config: Retry configuration
+
+    Returns:
+        Delay in seconds
+
+    Example:
+        >>> calculate_delay(0, RetryConfig(base_delay=1.0, exponential_base=2.0))
+        1.0
+        >>> calculate_delay(1, RetryConfig(base_delay=1.0, exponential_base=2.0))
+        2.0
+        >>> calculate_delay(2, RetryConfig(base_delay=1.0, exponential_base=2.0))
+        4.0
+    """
+    delay = config.base_delay * (config.exponential_base ** attempt)
+    delay = min(delay, config.max_delay)
+
+    if config.jitter:
+        import random
+        # Add ±25% jitter
+        jitter_amount = delay * 0.25
+        delay = delay + random.uniform(-jitter_amount, jitter_amount)
+
+    return max(0, delay)  # Ensure non-negative
+
+
+def retry_sync(
+    config: Optional[RetryConfig] = None,
+    on_retry: Optional[Callable[[Exception, int], None]] = None
+):
+    """
+    Decorator for synchronous retry logic with exponential backoff.
+
+    Args:
+        config: Retry configuration (uses defaults if None)
+        on_retry: Optional callback called on each retry attempt
+
+    Example:
+        >>> @retry_sync(RetryConfig(max_attempts=3))
+        ... def fetch_data():
+        ...     return call_api()
+
+    Raises:
+        Original exception if all retries exhausted
+    """
+    if config is None:
+        config = RetryConfig()
+
+    def decorator(func: Callable[..., T]) -> Callable[..., T]:
+        @wraps(func)
+        def wrapper(*args: Any, **kwargs: Any) -> T:
+            last_exception: Optional[Exception] = None
+
+            for attempt in range(config.max_attempts):
+                try:
+                    return func(*args, **kwargs)
+
+                except Exception as e:
+                    last_exception = e
+
+                    # Check if error is transient
+                    if not is_transient_error(e):
+                        logger.error(
+                            f"{func.__name__} failed with permanent error: {e}"
+                        )
+                        raise
+
+                    # Last attempt?
+                    if attempt >= config.max_attempts - 1:
+                        logger.error(
+                            f"{func.__name__} failed after {config.max_attempts} attempts: {e}"
+                        )
+                        raise
+
+                    # Calculate delay
+                    delay = calculate_delay(attempt, config)
+
+                    logger.warning(
+                        f"{func.__name__} attempt {attempt + 1}/{config.max_attempts} "
+                        f"failed with transient error: {e}. "
+                        f"Retrying in {delay:.1f}s..."
+                    )
+
+                    # Call retry callback if provided
+                    if on_retry:
+                        on_retry(e, attempt)
+
+                    # Wait before retry
+                    time.sleep(delay)
+
+            # Should never reach here, but satisfy type checker
+            if last_exception:
+                raise last_exception
+            raise RuntimeError("Retry logic error")
+
+        return wrapper
+    return decorator
+
+
+def retry_async(
+    config: Optional[RetryConfig] = None,
+    on_retry: Optional[Callable[[Exception, int], None]] = None
+):
+    """
+    Decorator for asynchronous retry logic with exponential backoff.
+
+    Args:
+        config: Retry configuration (uses defaults if None)
+        on_retry: Optional callback called on each retry attempt
+
+    Example:
+        >>> @retry_async(RetryConfig(max_attempts=3))
+        ... async def fetch_data():
+        ...     return await call_api_async()
+
+    Raises:
+        Original exception if all retries exhausted
+    """
+    if config is None:
+        config = RetryConfig()
+
+    def decorator(func: Callable[..., Any]) -> Callable[..., Any]:
+        @wraps(func)
+        async def wrapper(*args: Any, **kwargs: Any) -> Any:
+            last_exception: Optional[Exception] = None
+
+            for attempt in range(config.max_attempts):
+                try:
+                    return await func(*args, **kwargs)
+
+                except Exception as e:
+                    last_exception = e
+
+                    # Check if error is transient
+                    if not is_transient_error(e):
+                        logger.error(
+                            f"{func.__name__} failed with permanent error: {e}"
+                        )
+                        raise
+
+                    # Last attempt?
+                    if attempt >= config.max_attempts - 1:
+                        logger.error(
+                            f"{func.__name__} failed after {config.max_attempts} attempts: {e}"
+                        )
+                        raise
+
+                    # Calculate delay
+                    delay = calculate_delay(attempt, config)
+
+                    logger.warning(
+                        f"{func.__name__} attempt {attempt + 1}/{config.max_attempts} "
+                        f"failed with transient error: {e}. "
+                        f"Retrying in {delay:.1f}s..."
+                    )
+
+                    # Call retry callback if provided
+                    if on_retry:
+                        on_retry(e, attempt)
+
+                    # Wait before retry (async)
+                    await asyncio.sleep(delay)
+
+            # Should never reach here, but satisfy type checker
+            if last_exception:
+                raise last_exception
+            raise RuntimeError("Retry logic error")
+
+        return wrapper
+    return decorator
+
+
+# Example usage and testing
+if __name__ == "__main__":
+    import logging
+    logging.basicConfig(level=logging.INFO)
+
+    # Test synchronous retry
+    print("=== Testing Synchronous Retry ===")
+
+    attempt_count = 0
+
+    @retry_sync(RetryConfig(max_attempts=3, base_delay=0.1))
+    def flaky_function():
+        global attempt_count
+        attempt_count += 1
+        print(f"Attempt {attempt_count}")
+
+        if attempt_count < 3:
+            raise httpx.ConnectTimeout("Connection timeout")
+        return "Success!"
+
+    try:
+        result = flaky_function()
+        print(f"Result: {result}")
+    except Exception as e:
+        print(f"Failed: {e}")
+
+    # Test async retry
+    print("\n=== Testing Asynchronous Retry ===")
+
+    async def test_async():
+        attempt_count = 0
+
+        @retry_async(RetryConfig(max_attempts=3, base_delay=0.1))
+        async def async_flaky_function():
+            nonlocal attempt_count
+            attempt_count += 1
+            print(f"Async attempt {attempt_count}")
+
+            if attempt_count < 2:
+                raise httpx.ReadTimeout("Read timeout")
+            return "Async success!"
+
+        try:
+            result = await async_flaky_function()
+            print(f"Result: {result}")
+        except Exception as e:
+            print(f"Failed: {e}")
+
+    asyncio.run(test_async())
+
+    # Test permanent error (should not retry)
+    print("\n=== Testing Permanent Error (No Retry) ===")
+
+    attempt_count = 0
+
+    @retry_sync(RetryConfig(max_attempts=3, base_delay=0.1))
+    def permanent_error_function():
+        global attempt_count
+        attempt_count += 1
+        print(f"Attempt {attempt_count}")
+        raise ValueError("Invalid input")  # Permanent error
+
+    try:
+        result = permanent_error_function()
+    except ValueError as e:
+        print(f"Correctly failed immediately: {e}")
+        print(f"Attempts made: {attempt_count} (should be 1)")
--- a/transcript-fixer/scripts/utils/security.py
+++ b/transcript-fixer/scripts/utils/security.py
@@ -0,0 +1,314 @@
+#!/usr/bin/env python3
+"""
+Security Utilities
+
+CRITICAL FIX: Secure handling of sensitive data
+ISSUE: Critical-2 in Engineering Excellence Plan
+
+This module provides:
+1. Secret masking for logs
+2. Secure memory handling
+3. API key validation
+4. Input sanitization
+
+Author: Chief Engineer
+Date: 2025-10-28
+Priority: P0 - Critical
+"""
+
+from __future__ import annotations
+
+import re
+import ctypes
+import sys
+from typing import Optional, Final
+
+# Constants
+MIN_API_KEY_LENGTH: Final[int] = 20  # Minimum reasonable API key length
+MASK_PREFIX_LENGTH: Final[int] = 4   # Show first 4 chars
+MASK_SUFFIX_LENGTH: Final[int] = 4   # Show last 4 chars
+
+
+def mask_secret(secret: str, visible_chars: int = 4) -> str:
+    """
+    Safely mask secrets for logging.
+
+    CRITICAL: Never log full secrets. Always use this function.
+
+    Args:
+        secret: The secret to mask (API key, token, password)
+        visible_chars: Number of chars to show at start/end (default: 4)
+
+    Returns:
+        Masked string like "7fb3...DPRR"
+
+    Examples:
+        >>> mask_secret("7fb3ab7b186242288fe93a27227b7149.bJCOEAsUfejvWDPR")
+        '7fb3...DPRR'
+
+        >>> mask_secret("short")
+        '***'
+
+        >>> mask_secret("")
+        '***'
+    """
+    if not secret:
+        return "***"
+
+    secret_len = len(secret)
+
+    # Very short secrets: completely hide
+    if secret_len < 2 * visible_chars:
+        return "***"
+
+    # Show prefix and suffix with ... in middle
+    prefix = secret[:visible_chars]
+    suffix = secret[-visible_chars:]
+
+    return f"{prefix}...{suffix}"
+
+
+def mask_secret_in_text(text: str, secret: str) -> str:
+    """
+    Replace all occurrences of secret in text with masked version.
+
+    Useful for sanitizing error messages, logs, etc.
+
+    Args:
+        text: Text that might contain secrets
+        secret: The secret to mask
+
+    Returns:
+        Text with secret masked
+
+    Examples:
+        >>> text = "API key example-fake-key-1234567890abcdef.test failed"
+        >>> secret = "example-fake-key-1234567890abcdef.test"
+        >>> mask_secret_in_text(text, secret)
+        'API key exam...test failed'
+    """
+    if not secret or not text:
+        return text
+
+    masked = mask_secret(secret)
+    return text.replace(secret, masked)
+
+
+def validate_api_key(key: str) -> bool:
+    """
+    Validate API key format (basic checks).
+
+    This doesn't verify if the key is valid with the API,
+    just checks if it looks reasonable.
+
+    Args:
+        key: API key to validate
+
+    Returns:
+        True if key format is valid
+
+    Checks:
+    - Not empty
+    - Minimum length (20 chars)
+    - No suspicious patterns (only whitespace, etc.)
+    """
+    if not key:
+        return False
+
+    # Remove whitespace
+    key_stripped = key.strip()
+
+    # Check minimum length
+    if len(key_stripped) < MIN_API_KEY_LENGTH:
+        return False
+
+    # Check it's not all spaces or special chars
+    if key_stripped.isspace():
+        return False
+
+    # Check it contains some alphanumeric characters
+    if not any(c.isalnum() for c in key_stripped):
+        return False
+
+    return True
+
+
+def sanitize_for_logging(text: str, max_length: int = 200) -> str:
+    """
+    Sanitize text for safe logging.
+
+    Prevents:
+    - Log injection attacks
+    - Excessively long log entries
+    - Binary data in logs
+    - Control characters
+
+    Args:
+        text: Text to sanitize
+        max_length: Maximum length (default: 200)
+
+    Returns:
+        Safe text for logging
+    """
+    if not text:
+        return ""
+
+    # Truncate if too long
+    if len(text) > max_length:
+        text = text[:max_length] + "... (truncated)"
+
+    # Remove control characters (except newline, tab)
+    text = ''.join(char for char in text if ord(char) >= 32 or char in '\n\t')
+
+    # Escape newlines to prevent log injection
+    text = text.replace('\n', '\\n').replace('\r', '\\r')
+
+    return text
+
+
+def detect_and_mask_api_keys(text: str) -> str:
+    """
+    Automatically detect and mask potential API keys in text.
+
+    Patterns detected:
+    - Typical API key formats (alphanumeric + special chars, 20+ chars)
+    - Bearer tokens
+    - Authorization headers
+
+    Args:
+        text: Text that might contain API keys
+
+    Returns:
+        Text with API keys masked
+
+    Warning:
+        This is heuristic-based and may have false positives/negatives.
+        Best practice: Don't let keys get into logs in the first place.
+    """
+    # Pattern for typical API keys
+    # Looks for: 20+ chars of alphanumeric, dots, dashes, underscores
+    api_key_pattern = r'\b[A-Za-z0-9._-]{20,}\b'
+
+    def replace_with_mask(match):
+        potential_key = match.group(0)
+        # Only mask if it looks like a real key
+        if validate_api_key(potential_key):
+            return mask_secret(potential_key)
+        return potential_key
+
+    # Replace potential keys
+    text = re.sub(api_key_pattern, replace_with_mask, text)
+
+    # Also mask Authorization headers
+    text = re.sub(
+        r'Authorization:\s*Bearer\s+([A-Za-z0-9._-]+)',
+        lambda m: f'Authorization: Bearer {mask_secret(m.group(1))}',
+        text,
+        flags=re.IGNORECASE
+    )
+
+    return text
+
+
+def zero_memory(data: str) -> None:
+    """
+    Attempt to overwrite sensitive data in memory.
+
+    NOTE: This is best-effort in Python due to string immutability.
+    Python strings cannot be truly zeroed. This is a defense-in-depth
+    measure that may help in some scenarios but is not guaranteed.
+
+    For truly secure secret handling, consider:
+    - Using memoryview/bytearray for mutable secrets
+    - Storing secrets in kernel memory (OS features)
+    - Hardware security modules (HSM)
+
+    Args:
+        data: String to attempt to zero
+
+    Limitations:
+        - Python strings are immutable
+        - GC may have already copied the data
+        - This is NOT cryptographically secure erasure
+    """
+    try:
+        # This is best-effort only
+        # Python strings are immutable, so we can't truly zero them
+        # But we can try to overwrite the memory location
+        location = id(data) + sys.getsizeof('')
+        size = len(data.encode('utf-8'))
+        ctypes.memset(location, 0, size)
+    except Exception:
+        # Silently fail - this is best-effort
+        pass
+
+
+class SecretStr:
+    """
+    Wrapper for secrets that prevents accidental logging.
+
+    Usage:
+        api_key = SecretStr("7fb3ab7b186242288fe93a27227b7149.bJCOEAsUfejvWDPR")
+        print(api_key)  # Prints: SecretStr(7fb3...DPRR)
+        print(api_key.get())  # Get actual value when needed
+
+    This prevents accidentally logging secrets:
+        logger.info(f"Using key: {api_key}")  # Safe! Automatically masked
+    """
+
+    def __init__(self, secret: str):
+        """
+        Initialize with secret value.
+
+        Args:
+            secret: The secret to wrap
+        """
+        self._secret = secret
+
+    def get(self) -> str:
+        """
+        Get the actual secret value.
+
+        Use this only when you need the real value.
+        Never log the result!
+
+        Returns:
+            The actual secret
+        """
+        return self._secret
+
+    def __str__(self) -> str:
+        """String representation (masked)"""
+        return f"SecretStr({mask_secret(self._secret)})"
+
+    def __repr__(self) -> str:
+        """Repr (masked)"""
+        return f"SecretStr({mask_secret(self._secret)})"
+
+    def __del__(self):
+        """Attempt to zero memory on deletion"""
+        zero_memory(self._secret)
+
+
+# Example usage and testing
+if __name__ == "__main__":
+    # Test masking (using fake example key for testing)
+    api_key = "example-fake-key-for-testing-only-not-real"
+    print(f"Original: {api_key}")
+    print(f"Masked: {mask_secret(api_key)}")
+
+    # Test in text
+    text = f"Connection failed with key {api_key}"
+    print(f"Sanitized: {mask_secret_in_text(text, api_key)}")
+
+    # Test SecretStr
+    secret = SecretStr(api_key)
+    print(f"SecretStr: {secret}")  # Automatically masked
+
+    # Test validation
+    print(f"Valid: {validate_api_key(api_key)}")
+    print(f"Invalid: {validate_api_key('short')}")
+
+    # Test auto-detection
+    log_text = f"ERROR: API request failed with key {api_key}"
+    print(f"Auto-masked: {detect_and_mask_api_keys(log_text)}")
--- a/transcript-fixer/scripts/utils/validation.py
+++ b/transcript-fixer/scripts/utils/validation.py
@@ -18,16 +18,6 @@ import os
 import sys
 from pathlib import Path

-# Handle imports for both standalone and package usage
-try:
-    from core import CorrectionRepository, CorrectionService
-except ImportError:
-    # Fallback for when run from scripts directory directly
-    import sys
-    from pathlib import Path
-    sys.path.insert(0, str(Path(__file__).parent.parent))
-    from core import CorrectionRepository, CorrectionService
-

 def validate_configuration() -> tuple[list[str], list[str]]:
    """
@@ -56,6 +46,10 @@ def validate_configuration() -> tuple[list[str], list[str]]:
    # Validate SQLite database
    if db_path.exists():
        try:
+            # CRITICAL FIX: Lazy import to prevent circular dependency
+            # circular import: core → utils.domain_validator → utils → utils.validation → core
+            from core import CorrectionRepository, CorrectionService
+
            repository = CorrectionRepository(db_path)
            service = CorrectionService(repository)

@@ -64,9 +58,9 @@ def validate_configuration() -> tuple[list[str], list[str]]:
            print(f"✅ Database valid: {stats['total_corrections']} corrections")

            # Check tables exist
-            conn = repository._get_connection()
-            cursor = conn.execute("SELECT name FROM sqlite_master WHERE type='table'")
-            tables = [row[0] for row in cursor.fetchall()]
+            with repository._pool.get_connection() as conn:
+                cursor = conn.execute("SELECT name FROM sqlite_master WHERE type='table'")
+                tables = [row[0] for row in cursor.fetchall()]

            expected_tables = [
                'corrections', 'context_rules', 'correction_history',