Release v1.8.0: Add transcript-fixer skill

## New Skill: transcript-fixer v1.0.0

Correct speech-to-text (ASR/STT) transcription errors through dictionary-based rules and AI-powered corrections with automatic pattern learning.

**Features:**
- Two-stage correction pipeline (dictionary + AI)
- Automatic pattern detection and learning
- Domain-specific dictionaries (general, embodied_ai, finance, medical)
- SQLite-based correction repository
- Team collaboration with import/export
- GLM API integration for AI corrections
- Cost optimization through dictionary promotion

**Use cases:**
- Correcting meeting notes, lecture recordings, or interview transcripts
- Fixing Chinese/English homophone errors and technical terminology
- Building domain-specific correction dictionaries
- Improving transcript accuracy through iterative learning

**Documentation:**
- Complete workflow guides in references/
- SQL query templates
- Troubleshooting guide
- Team collaboration patterns
- API setup instructions

**Marketplace updates:**
- Updated marketplace to v1.8.0
- Added transcript-fixer plugin (category: productivity)
- Updated README.md with skill description and use cases
- Updated CLAUDE.md with skill listing and counts

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-10-28 13:16:37 +08:00
parent d1041ac203
commit bd0aa12004
44 changed files with 7432 additions and 8 deletions
--- a/transcript-fixer/scripts/core/init.py
+++ b/transcript-fixer/scripts/core/init.py
@@ -0,0 +1,44 @@
+"""
+Core Module - Business Logic and Data Access
+
+This module contains the core business logic for transcript correction:
+- CorrectionRepository: Data access layer with ACID transactions
+- CorrectionService: Business logic layer with validation
+- DictionaryProcessor: Stage 1 dictionary-based corrections
+- AIProcessor: Stage 2 AI-powered corrections
+- LearningEngine: Pattern detection and learning
+"""
+
+# Core SQLite-based components (always available)
+from .correction_repository import CorrectionRepository, Correction, DatabaseError, ValidationError
+from .correction_service import CorrectionService, ValidationRules
+
+# Processing components (imported lazily to avoid dependency issues)
+def _lazy_import(name):
+    """Import and return one of the optional processor classes on demand.
+
+    The import statement for a class only runs when that class is requested,
+    so its module (and whatever it depends on) is not loaded at package
+    import time.
+
+    Raises:
+        ImportError: If ``name`` is not one of the known lazy classes.
+    """
+    if name == 'DictionaryProcessor':
+        from .dictionary_processor import DictionaryProcessor as loaded
+    elif name == 'AIProcessor':
+        from .ai_processor import AIProcessor as loaded
+    elif name == 'LearningEngine':
+        from .learning_engine import LearningEngine as loaded
+    else:
+        raise ImportError(f"Unknown module: {name}")
+    return loaded
+
+# Export main classes.
+# Only the eagerly imported names are listed here. DictionaryProcessor,
+# AIProcessor and LearningEngine are resolved on attribute access through the
+# module-level __getattr__, so a star-import does not load them.
+__all__ = [
+    'CorrectionRepository',
+    'CorrectionService',
+    'Correction',
+    'DatabaseError',
+    'ValidationError',
+    'ValidationRules',
+]
+
+# Make lazy imports available via __getattr__
+def __getattr__(name):
+    """Module-level attribute hook that serves the lazily imported classes.
+
+    Called by Python only when normal module attribute lookup fails.
+
+    Raises:
+        AttributeError: If ``name`` is not one of the lazy class names.
+    """
+    lazy_names = ('DictionaryProcessor', 'AIProcessor', 'LearningEngine')
+    if name not in lazy_names:
+        raise AttributeError(f"module '{__name__}' has no attribute '{name}'")
+    return _lazy_import(name)
--- a/transcript-fixer/scripts/core/ai_processor.py
+++ b/transcript-fixer/scripts/core/ai_processor.py
@@ -0,0 +1,214 @@
+#!/usr/bin/env python3
+"""
+AI Processor - Stage 2: AI-powered Text Corrections
+
+SINGLE RESPONSIBILITY: Process text using GLM API for intelligent corrections
+
+Features:
+- Split text into chunks for API processing
+- Call GLM-4.6 for context-aware corrections
+- Track AI-suggested changes
+- Handle API errors gracefully
+"""
+
+from __future__ import annotations
+
+import os
+import re
+from typing import List, Tuple
+from dataclasses import dataclass
+import httpx
+
+
+@dataclass
+class AIChange:
+    """Represents an AI-suggested change"""
+    # Index of the chunk the change belongs to; AIProcessor.process numbers
+    # chunks starting at 1.
+    chunk_index: int
+    # Text before correction (AIProcessor.process stores a 50-character
+    # preview followed by "...").
+    from_text: str
+    # Text after correction (same preview format).
+    to_text: str
+    confidence: float  # 0.0 to 1.0
+
+
+class AIProcessor:
+    """
+    Stage 2 Processor: AI-powered corrections using GLM-4.6
+
+    Process:
+    1. Split text into chunks (bounded by ``max_chunk_size`` characters)
+    2. Send each chunk to the GLM API, retrying once with the fallback model
+    3. Track changes for learning engine
+    4. Fall back to the original chunk text when every attempt fails
+    """
+
+    def __init__(self, api_key: str, model: str = "GLM-4.6",
+                 base_url: str = "https://open.bigmodel.cn/api/anthropic",
+                 fallback_model: str = "GLM-4.5-Air"):
+        """
+        Store the API credentials and model configuration.
+
+        Args:
+            api_key: GLM API key sent as a bearer token
+            model: Primary model name (default: GLM-4.6)
+            base_url: API base URL; "/v1/messages" is appended per request
+            fallback_model: Model tried once when the primary model fails
+        """
+        # Connection settings
+        self.base_url = base_url
+        self.api_key = api_key
+
+        # Model selection
+        self.model = model
+        self.fallback_model = fallback_model
+
+        # Upper bound, in characters, used when splitting text into chunks
+        self.max_chunk_size = 6000
+
+    def process(self, text: str, context: str = "") -> Tuple[str, List[AIChange]]:
+        """
+        Process text with AI corrections
+
+        Each chunk is sent to the primary model, then to the fallback model
+        if the primary call fails. When both fail the original chunk text is
+        kept. Corrected chunks are reassembled with the separators that were
+        between them in ``text``, so pieces of a force-split long paragraph
+        are not turned into separate paragraphs.
+
+        Args:
+            text: Text to correct
+            context: Optional domain/meeting context
+
+        Returns:
+            (corrected_text, list_of_changes); ``chunk_index`` in each change
+            is 1-based.
+        """
+        chunks = self._split_into_chunks(text)
+        separators = self._chunk_separators(text, chunks)
+        corrected_chunks = []
+        all_changes = []
+
+        print(f"📝 Processing {len(chunks)} chunks with {self.model}...")
+
+        for i, chunk in enumerate(chunks, 1):
+            print(f"   Chunk {i}/{len(chunks)}... ", end="", flush=True)
+
+            corrected_chunk = self._correct_with_fallback(chunk, context)
+            if corrected_chunk is None:
+                # Best effort: never lose text because the API is unavailable
+                print("   Using original text...")
+                corrected_chunks.append(chunk)
+                continue
+
+            corrected_chunks.append(corrected_chunk)
+
+            # TODO: Extract actual changes for learning
+            # For now, we assume the whole chunk changed. Corrections from
+            # the fallback model are recorded too.
+            if corrected_chunk != chunk:
+                all_changes.append(AIChange(
+                    chunk_index=i,
+                    from_text=chunk[:50] + "...",
+                    to_text=corrected_chunk[:50] + "...",
+                    confidence=0.9  # Placeholder
+                ))
+
+        corrected_text = "".join(
+            piece + separator
+            for piece, separator in zip(corrected_chunks, separators)
+        )
+        return corrected_text, all_changes
+
+    def _correct_with_fallback(self, chunk: str, context: str):
+        """Return the corrected chunk, or None when every model attempt failed."""
+        try:
+            corrected = self._process_chunk(chunk, context, self.model)
+            print("✓")
+            return corrected
+        except Exception as e:  # any API/parse failure triggers the fallback
+            print(f"✗ {str(e)[:50]}")
+
+        # Retry with fallback model
+        if self.fallback_model and self.fallback_model != self.model:
+            print(f"   Retrying with {self.fallback_model}... ", end="", flush=True)
+            try:
+                corrected = self._process_chunk(chunk, context, self.fallback_model)
+                print("✓")
+                return corrected
+            except Exception as e2:
+                print(f"✗ {str(e2)[:50]}")
+
+        return None
+
+    @staticmethod
+    def _chunk_separators(text: str, chunks: List[str]) -> List[str]:
+        """Return, for each chunk, the separator that followed it in ``text``.
+
+        Chunks are expected to be consecutive slices of ``text`` separated by
+        either a paragraph break or nothing. If that does not hold, every
+        boundary is treated as a paragraph break.
+        """
+        separators = []
+        cursor = 0
+        last_index = len(chunks) - 1
+        for index, chunk in enumerate(chunks):
+            cursor += len(chunk)
+            if index != last_index and text.startswith("\n\n", cursor):
+                separators.append("\n\n")
+                cursor += 2
+            else:
+                separators.append("")
+
+        rebuilt = "".join(c + s for c, s in zip(chunks, separators))
+        if rebuilt != text:
+            separators = ["\n\n"] * len(chunks)
+            if separators:
+                separators[-1] = ""
+        return separators
+
+    def _split_into_chunks(self, text: str) -> List[str]:
+        """
+        Split text into processable chunks
+
+        Strategy:
+        - Split by double newlines (paragraphs)
+        - Keep chunks at or under max_chunk_size characters
+        - Don't split mid-paragraph if possible; an oversized paragraph is
+          split at sentence ends, and an oversized sentence is cut at the
+          size limit
+
+        Every chunk is a consecutive slice of ``text``. Empty text yields no
+        chunks.
+        """
+        if not text:
+            return []
+
+        limit = self.max_chunk_size
+        chunks: List[str] = []
+        current_chunk: List[str] = []
+        # Length of '\n\n'.join(current_chunk) plus one trailing separator,
+        # i.e. what the joined chunk would measure before adding a paragraph.
+        current_length = 0
+
+        for para in text.split('\n\n'):
+            para_length = len(para)
+
+            # If single paragraph exceeds limit, force split
+            if para_length > limit:
+                if current_chunk:
+                    chunks.append('\n\n'.join(current_chunk))
+                    current_chunk = []
+                    current_length = 0
+                chunks.extend(self._split_long_paragraph(para, limit))
+
+            # Normal case: accumulate paragraphs
+            elif current_chunk and current_length + para_length > limit:
+                chunks.append('\n\n'.join(current_chunk))
+                current_chunk = [para]
+                current_length = para_length + 2  # +2 for \n\n
+            else:
+                current_chunk.append(para)
+                current_length += para_length + 2  # +2 for \n\n
+
+        if current_chunk:
+            chunks.append('\n\n'.join(current_chunk))
+
+        return chunks
+
+    @staticmethod
+    def _split_long_paragraph(para: str, limit: int) -> List[str]:
+        """Split one oversized paragraph into pieces of at most ``limit`` chars."""
+        step = max(1, limit)  # guard against a non-positive limit
+        # The capture group keeps each delimiter so it can be re-attached
+        parts = re.split(r'([。！？\n])', para)
+        pieces: List[str] = []
+        buffer = ""
+
+        for i in range(0, len(parts), 2):
+            sentence = parts[i] + (parts[i + 1] if i + 1 < len(parts) else "")
+            if len(buffer) + len(sentence) > limit:
+                if buffer:
+                    pieces.append(buffer)
+                    buffer = ""
+                # A single sentence longer than the limit is cut hard
+                while len(sentence) > step:
+                    pieces.append(sentence[:step])
+                    sentence = sentence[step:]
+            buffer += sentence
+
+        if buffer:
+            pieces.append(buffer)
+        return pieces
+
+    def _process_chunk(self, chunk: str, context: str, model: str) -> str:
+        """
+        Send one chunk to the messages endpoint and return the reply text.
+
+        Args:
+            chunk: Text to correct
+            context: Optional background passed into the prompt
+            model: Model name to put in the request body
+
+        Returns:
+            The ``text`` field of the first content item in the JSON response.
+
+        Raises:
+            Whatever httpx raises for transport errors or non-success status
+            codes, and KeyError/IndexError if the response has another shape.
+        """
+        payload = {
+            "model": model,
+            "max_tokens": 8000,
+            "temperature": 0.3,
+            "messages": [{"role": "user", "content": self._build_prompt(chunk, context)}]
+        }
+        request_headers = {
+            "anthropic-version": "2023-06-01",
+            "Authorization": f"Bearer {self.api_key}",
+            "content-type": "application/json"
+        }
+        endpoint = f"{self.base_url}/v1/messages"
+
+        with httpx.Client(timeout=60.0) as client:
+            response = client.post(endpoint, headers=request_headers, json=payload)
+            response.raise_for_status()
+            body = response.json()
+            return body["content"][0]["text"]
+
+    def _build_prompt(self, chunk: str, context: str) -> str:
+        """
+        Assemble the correction prompt sent to the model.
+
+        The prompt consists of fixed proofreading instructions, an optional
+        background section (only when ``context`` is non-empty), and the chunk
+        to correct followed by an instruction to output only the fixed text.
+        """
+        sections = ["""你是专业的会议记录校对专家。请修复以下会议转录中的语音识别错误。
+
+**修复原则**：
+1. 严格保留原有格式（时间戳、发言人标识、Markdown标记等）
+2. 修复明显的同音字错误
+3. 修复专业术语错误
+4. 修复语法错误，但保持口语化特征
+5. 不确定的地方保持原样，不要过度修改
+
+"""]
+
+        if context:
+            sections.append(f"\n**会议背景**：\n{context}\n")
+
+        sections.append(f"""
+**需要修复的内容**：
+{chunk}
+
+**请直接输出修复后的文本，不要添加任何解释或标注**：""")
+
+        return "".join(sections)
--- a/transcript-fixer/scripts/core/correction_repository.py
+++ b/transcript-fixer/scripts/core/correction_repository.py
@@ -0,0 +1,465 @@
+#!/usr/bin/env python3
+"""
+Correction Repository - SQLite Data Access Layer
+
+SINGLE RESPONSIBILITY: Manage database operations with ACID guarantees
+
+Thread-safe, transactional, and follows Repository pattern.
+All database operations are atomic and properly handle errors.
+"""
+
+from __future__ import annotations
+
+import sqlite3
+import logging
+from pathlib import Path
+from datetime import datetime, timedelta
+from typing import Dict, List, Optional, Tuple, Any
+from contextlib import contextmanager
+from dataclasses import dataclass, asdict
+import threading
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class Correction:
+    """Correction entity.
+
+    Built from a row of the ``corrections`` table by
+    ``CorrectionRepository._row_to_correction``.
+    """
+    id: Optional[int]
+    from_text: str  # original (incorrect) text
+    to_text: str  # corrected text
+    domain: str  # repository methods default this to "general"
+    source: str  # 'manual' | 'learned' | 'imported'
+    confidence: float
+    added_by: Optional[str]
+    added_at: str
+    usage_count: int  # incremented by CorrectionRepository.increment_usage
+    last_used: Optional[str]  # set by CorrectionRepository.increment_usage
+    notes: Optional[str]
+    is_active: bool  # False once soft-deleted via delete_correction
+
+
+@dataclass
+class ContextRule:
+    """Context-aware rule entity.
+
+    NOTE(review): no code in this part of the file reads or writes this
+    entity; field meanings below are taken from the names only — confirm
+    against the schema and the code that applies the rules.
+    """
+    id: Optional[int]
+    pattern: str  # presumably a regex or match pattern — TODO confirm
+    replacement: str
+    description: Optional[str]
+    priority: int
+    is_active: bool
+    added_at: str
+    added_by: Optional[str]
+
+
+@dataclass
+class LearnedSuggestion:
+    """Learned pattern suggestion.
+
+    NOTE(review): no code in this part of the file reads or writes this
+    entity; confirm field semantics against the learning engine and schema.
+    """
+    id: Optional[int]
+    from_text: str
+    to_text: str
+    domain: str
+    frequency: int
+    confidence: float
+    first_seen: str
+    last_seen: str
+    status: str  # 'pending' | 'approved' | 'rejected'
+    reviewed_at: Optional[str]
+    reviewed_by: Optional[str]
+
+
+class DatabaseError(Exception):
+    """Root of the repository's exception hierarchy; raised when a database operation fails."""
+
+
+class ValidationError(DatabaseError):
+    """Raised when data is rejected (missing record, violated constraint); a DatabaseError subtype."""
+
+
+class CorrectionRepository:
+    """
+    Repository for correction storage using SQLite.
+
+    Features:
+    - Explicit transactions (BEGIN IMMEDIATE / commit / rollback) for writes
+    - One connection per thread, kept in ``threading.local`` (not a pool)
+    - Parameterized queries for all caller-supplied values
+    - Failures surfaced as DatabaseError / ValidationError
+    - Audit logging of write operations
+    """
+
+    def __init__(self, db_path: Path):
+        """
+        Remember the database location and make sure the schema is applied.
+
+        Args:
+            db_path: Path to SQLite database file
+        """
+        # Per-thread storage for the lazily opened connection
+        self._local = threading.local()
+        self.db_path = db_path
+        self._ensure_database_exists()
+
+    def _get_connection(self) -> sqlite3.Connection:
+        """Return this thread's connection, opening and configuring it on first use."""
+        if hasattr(self._local, 'connection'):
+            return self._local.connection
+
+        conn = self._local.connection = sqlite3.connect(
+            self.db_path,
+            # None disables the sqlite3 module's implicit transaction
+            # handling; transactions are opened explicitly in _transaction().
+            isolation_level=None,
+            check_same_thread=False
+        )
+        # Rows support access by column name
+        conn.row_factory = sqlite3.Row
+        # Foreign key enforcement is off by default in SQLite
+        conn.execute("PRAGMA foreign_keys = ON")
+        return conn
+
+    @contextmanager
+    def _transaction(self):
+        """
+        Context manager for database transactions.
+
+        Opens a transaction with BEGIN IMMEDIATE (taking the write lock up
+        front), yields this thread's connection, commits when the block
+        finishes normally, and rolls back when it raises.
+
+        Raises:
+            DatabaseError: For any failure inside the block. Exceptions that
+                are already DatabaseError subclasses (e.g. ValidationError)
+                are re-raised unchanged so callers can tell them apart; all
+                other exceptions are wrapped, with the original as __cause__.
+        """
+        conn = self._get_connection()
+        try:
+            conn.execute("BEGIN IMMEDIATE")  # Acquire write lock immediately
+            yield conn
+            conn.commit()
+        except DatabaseError as e:
+            # Domain errors keep their concrete type after the rollback
+            conn.rollback()
+            logger.error(f"Transaction rolled back: {e}")
+            raise
+        except Exception as e:
+            conn.rollback()
+            logger.error(f"Transaction rolled back: {e}")
+            raise DatabaseError(f"Database operation failed: {e}") from e
+
+    def _ensure_database_exists(self) -> None:
+        """
+        Apply the SQL in ``schema.sql`` (located next to this module).
+
+        Raises:
+            FileNotFoundError: If the schema file is missing.
+            DatabaseError: If executing the schema script fails.
+        """
+        schema_file = Path(__file__).parent / "schema.sql"
+        if not schema_file.exists():
+            raise FileNotFoundError(f"Schema file not found: {schema_file}")
+
+        ddl = schema_file.read_text(encoding='utf-8')
+
+        # NOTE(review): sqlite3's executescript() commits any pending
+        # transaction before running the script, so the statements may not
+        # actually run inside the transaction opened here — confirm.
+        with self._transaction() as conn:
+            conn.executescript(ddl)
+
+        logger.info(f"Database initialized: {self.db_path}")
+
+    # ==================== Correction Operations ====================
+
+    def add_correction(
+        self,
+        from_text: str,
+        to_text: str,
+        domain: str = "general",
+        source: str = "manual",
+        confidence: float = 1.0,
+        added_by: Optional[str] = None,
+        notes: Optional[str] = None
+    ) -> int:
+        """
+        Add a correction, or overwrite the stored one for the same text/domain.
+
+        A plain INSERT is attempted first. If it hits a UNIQUE constraint the
+        existing row for (from_text, domain) is updated in the same
+        transaction instead. That row is also re-activated, so a correction
+        that was soft-deleted earlier can be added again. No input validation
+        is done here beyond what the database constraints enforce.
+
+        Args:
+            from_text: Original (incorrect) text
+            to_text: Corrected text
+            domain: Correction domain
+            source: Origin of correction
+            confidence: Confidence score (0.0-1.0)
+            added_by: User who added it
+            notes: Optional notes
+
+        Returns:
+            ID of the inserted or updated correction
+
+        Raises:
+            DatabaseError: If the operation fails. Constraint problems are
+                raised as ValidationError (a DatabaseError subclass) inside
+                the transaction.
+        """
+        with self._transaction() as conn:
+            try:
+                cursor = conn.execute("""
+                    INSERT INTO corrections
+                    (from_text, to_text, domain, source, confidence, added_by, notes)
+                    VALUES (?, ?, ?, ?, ?, ?, ?)
+                """, (from_text, to_text, domain, source, confidence, added_by, notes))
+            except sqlite3.IntegrityError as e:
+                if "UNIQUE constraint failed" not in str(e):
+                    raise ValidationError(f"Integrity constraint violated: {e}") from e
+
+                # Update existing correction instead (within same transaction).
+                # The row is matched regardless of is_active: assuming the
+                # UNIQUE constraint covers (from_text, domain), a soft-deleted
+                # row is what blocks the INSERT, so it is revived here.
+                logger.warning(f"Correction already exists, updating: {from_text}")
+                cursor = conn.execute("""
+                    UPDATE corrections
+                    SET to_text = ?, source = ?, confidence = ?,
+                        added_by = ?, notes = ?, added_at = CURRENT_TIMESTAMP,
+                        is_active = 1
+                    WHERE from_text = ? AND domain = ?
+                """, (to_text, source, confidence, added_by, notes, from_text, domain))
+
+                if cursor.rowcount == 0:
+                    raise ValidationError(f"Correction not found: {from_text} in domain {domain}")
+
+                # Get the ID of the updated row
+                row = conn.execute("""
+                    SELECT id FROM corrections
+                    WHERE from_text = ? AND domain = ?
+                """, (from_text, domain)).fetchone()
+                correction_id = row[0]
+                action, verb = "update_correction", "Updated"
+            else:
+                correction_id = cursor.lastrowid
+                action, verb = "add_correction", "Added"
+
+            # Audit log
+            self._audit_log(
+                conn,
+                action=action,
+                entity_type="correction",
+                entity_id=correction_id,
+                user=added_by,
+                details=f"{verb}: '{from_text}' → '{to_text}' (domain: {domain})"
+            )
+
+            logger.info(f"{verb} correction ID {correction_id}: {from_text} → {to_text}")
+            return correction_id
+
+    def get_correction(self, from_text: str, domain: str = "general") -> Optional[Correction]:
+        """Look up the active correction for ``from_text`` in ``domain``; None if absent."""
+        row = self._get_connection().execute("""
+            SELECT * FROM corrections
+            WHERE from_text = ? AND domain = ? AND is_active = 1
+        """, (from_text, domain)).fetchone()
+
+        if row is None:
+            return None
+        return self._row_to_correction(row)
+
+    def get_all_corrections(self, domain: Optional[str] = None, active_only: bool = True) -> List[Correction]:
+        """
+        Get all corrections, optionally filtered by domain.
+
+        Args:
+            domain: Only return this domain; a falsy value (None or "")
+                means all domains.
+            active_only: When True, skip soft-deleted rows.
+
+        Returns:
+            Corrections ordered by from_text when a domain is given,
+            otherwise by domain and then from_text.
+        """
+        conditions = []
+        params = []
+        if domain:
+            conditions.append("domain = ?")
+            params.append(domain)
+        if active_only:
+            conditions.append("is_active = 1")
+
+        # Only fixed SQL fragments are interpolated; values stay bound params
+        where_clause = f"WHERE {' AND '.join(conditions)}" if conditions else ""
+        order_by = "from_text" if domain else "domain, from_text"
+
+        cursor = self._get_connection().execute(
+            f"SELECT * FROM corrections {where_clause} ORDER BY {order_by}",
+            params
+        )
+        return [self._row_to_correction(row) for row in cursor.fetchall()]
+
+    def get_corrections_dict(self, domain: str = "general") -> Dict[str, str]:
+        """Return the active corrections of ``domain`` as a from_text → to_text mapping."""
+        mapping: Dict[str, str] = {}
+        for correction in self.get_all_corrections(domain=domain, active_only=True):
+            mapping[correction.from_text] = correction.to_text
+        return mapping
+
+    def update_correction(
+        self,
+        from_text: str,
+        to_text: str,
+        domain: str = "general",
+        updated_by: Optional[str] = None
+    ) -> int:
+        """
+        Update the replacement text of an existing active correction.
+
+        Args:
+            from_text: Original (incorrect) text identifying the correction
+            to_text: New corrected text
+            domain: Correction domain
+            updated_by: User recorded in the audit log
+
+        Returns:
+            Number of rows updated
+
+        Raises:
+            DatabaseError: If the operation fails; a missing correction is
+                raised as ValidationError inside the transaction.
+        """
+        with self._transaction() as conn:
+            cursor = conn.execute("""
+                UPDATE corrections
+                SET to_text = ?, added_at = CURRENT_TIMESTAMP
+                WHERE from_text = ? AND domain = ? AND is_active = 1
+            """, (to_text, from_text, domain))
+
+            if cursor.rowcount == 0:
+                raise ValidationError(f"Correction not found: {from_text} in domain {domain}")
+
+            # Look up the row ID so the audit entry identifies the entity,
+            # as the add_correction audit entries do.
+            row = conn.execute("""
+                SELECT id FROM corrections
+                WHERE from_text = ? AND domain = ? AND is_active = 1
+            """, (from_text, domain)).fetchone()
+
+            # Audit log
+            self._audit_log(
+                conn,
+                action="update_correction",
+                entity_type="correction",
+                entity_id=row[0] if row is not None else None,
+                user=updated_by,
+                details=f"Updated: '{from_text}' → '{to_text}' (domain: {domain})"
+            )
+
+            logger.info(f"Updated correction: {from_text} → {to_text}")
+            return cursor.rowcount
+
+    def delete_correction(self, from_text: str, domain: str = "general", deleted_by: Optional[str] = None) -> bool:
+        """
+        Soft delete a correction (mark as inactive).
+
+        Args:
+            from_text: Original (incorrect) text identifying the correction
+            domain: Correction domain
+            deleted_by: User recorded in the audit log
+
+        Returns:
+            True if an active correction was deactivated, False if none matched.
+        """
+        with self._transaction() as conn:
+            # Fetch the ID first: after the update the row no longer matches
+            # the is_active = 1 filter.
+            row = conn.execute("""
+                SELECT id FROM corrections
+                WHERE from_text = ? AND domain = ? AND is_active = 1
+            """, (from_text, domain)).fetchone()
+
+            cursor = conn.execute("""
+                UPDATE corrections
+                SET is_active = 0
+                WHERE from_text = ? AND domain = ? AND is_active = 1
+            """, (from_text, domain))
+
+            if cursor.rowcount == 0:
+                return False
+
+            self._audit_log(
+                conn,
+                action="delete_correction",
+                entity_type="correction",
+                entity_id=row[0] if row is not None else None,
+                user=deleted_by,
+                details=f"Deleted: '{from_text}' (domain: {domain})"
+            )
+            logger.info(f"Deleted correction: {from_text}")
+            return True
+
+    def increment_usage(self, from_text: str, domain: str = "general") -> None:
+        """Bump ``usage_count`` and refresh ``last_used`` for the matching active correction.
+
+        Does nothing (and raises nothing) when no active row matches.
+        """
+        lookup_key = (from_text, domain)
+        with self._transaction() as db:
+            db.execute("""
+                UPDATE corrections
+                SET usage_count = usage_count + 1,
+                    last_used = CURRENT_TIMESTAMP
+                WHERE from_text = ? AND domain = ? AND is_active = 1
+            """, lookup_key)
+
+    # ==================== Bulk Operations ====================
+
+    def bulk_import_corrections(
+        self,
+        corrections: Dict[str, str],
+        domain: str = "general",
+        source: str = "imported",
+        imported_by: Optional[str] = None,
+        merge: bool = True
+    ) -> Tuple[int, int, int]:
+        """
+        Bulk import corrections with conflict resolution.
+
+        All rows are written inside one transaction, followed by a single
+        audit log entry summarising the counts.
+
+        Args:
+            corrections: Mapping of from_text → to_text to import
+            domain: Domain assigned to every imported correction
+            source: Value stored in the ``source`` column
+            imported_by: User stored as ``added_by`` and in the audit log
+            merge: True updates existing active rows whose to_text differs
+                and skips identical ones; False uses INSERT OR REPLACE for
+                every row.
+
+        Returns:
+            Tuple of (inserted_count, updated_count, skipped_count)
+
+        Raises:
+            DatabaseError: If the transaction as a whole fails.
+        """
+        inserted, updated, skipped = 0, 0, 0
+
+        with self._transaction() as conn:
+            for from_text, to_text in corrections.items():
+                try:
+                    if merge:
+                        # Check if exists (only active rows are considered)
+                        cursor = conn.execute("""
+                            SELECT id, to_text FROM corrections
+                            WHERE from_text = ? AND domain = ? AND is_active = 1
+                        """, (from_text, domain))
+                        existing = cursor.fetchone()
+
+                        if existing:
+                            if existing['to_text'] != to_text:
+                                # Update
+                                conn.execute("""
+                                    UPDATE corrections
+                                    SET to_text = ?, source = ?, added_at = CURRENT_TIMESTAMP
+                                    WHERE from_text = ? AND domain = ? AND is_active = 1
+                                """, (to_text, source, from_text, domain))
+                                updated += 1
+                            else:
+                                # Identical mapping already stored
+                                skipped += 1
+                        else:
+                            # Insert
+                            # NOTE(review): if a soft-deleted row still
+                            # occupies a unique key, this INSERT presumably
+                            # fails and the entry is counted as skipped below
+                            # — confirm against schema.sql.
+                            conn.execute("""
+                                INSERT INTO corrections
+                                (from_text, to_text, domain, source, confidence, added_by)
+                                VALUES (?, ?, ?, ?, 1.0, ?)
+                            """, (from_text, to_text, domain, source, imported_by))
+                            inserted += 1
+                    else:
+                        # Replace mode: just insert. Replaced rows are also
+                        # counted as inserted, never as updated.
+                        conn.execute("""
+                            INSERT OR REPLACE INTO corrections
+                            (from_text, to_text, domain, source, confidence, added_by)
+                            VALUES (?, ?, ?, ?, 1.0, ?)
+                        """, (from_text, to_text, domain, source, imported_by))
+                        inserted += 1
+
+                except sqlite3.Error as e:
+                    # A failing row is logged and skipped; the loop and the
+                    # surrounding transaction continue with the next entry.
+                    logger.warning(f"Failed to import '{from_text}': {e}")
+                    skipped += 1
+
+            # Audit log
+            self._audit_log(
+                conn,
+                action="bulk_import",
+                entity_type="correction",
+                user=imported_by,
+                details=f"Imported {inserted} new, updated {updated}, skipped {skipped} (domain: {domain})"
+            )
+
+        logger.info(f"Bulk import: {inserted} inserted, {updated} updated, {skipped} skipped")
+        return (inserted, updated, skipped)
+
+    # ==================== Helper Methods ====================
+
+    def _row_to_correction(self, row: sqlite3.Row) -> Correction:
+        """Build a Correction from a ``corrections`` row, coercing ``is_active`` to bool."""
+        copied_columns = (
+            'id', 'from_text', 'to_text', 'domain', 'source', 'confidence',
+            'added_by', 'added_at', 'usage_count', 'last_used', 'notes',
+        )
+        fields = {column: row[column] for column in copied_columns}
+        # SQLite has no boolean type; the flag comes back as an integer
+        fields['is_active'] = bool(row['is_active'])
+        return Correction(**fields)
+
+    def _audit_log(
+        self,
+        conn: sqlite3.Connection,
+        action: str,
+        entity_type: str,
+        entity_id: Optional[int] = None,
+        user: Optional[str] = None,
+        details: Optional[str] = None,
+        success: bool = True,
+        error_message: Optional[str] = None
+    ) -> None:
+        """Insert one row into ``audit_log`` using the caller's connection.
+
+        The insert runs on ``conn`` as given; this method does not open or
+        commit a transaction itself.
+        """
+        entry = (action, entity_type, entity_id, user, details, success, error_message)
+        conn.execute("""
+            INSERT INTO audit_log (action, entity_type, entity_id, user, details, success, error_message)
+            VALUES (?, ?, ?, ?, ?, ?, ?)
+        """, entry)
+
+    def close(self) -> None:
+        """Close and forget the calling thread's connection, if one was opened."""
+        if not hasattr(self._local, 'connection'):
+            return
+
+        self._local.connection.close()
+        del self._local.connection
+        logger.info("Database connection closed")
--- a/transcript-fixer/scripts/core/correction_service.py
+++ b/transcript-fixer/scripts/core/correction_service.py
@@ -0,0 +1,524 @@
+#!/usr/bin/env python3
+"""
+Correction Service - Business Logic Layer
+
+SINGLE RESPONSIBILITY: Implement business rules and validation
+
+Orchestrates repository operations with comprehensive validation,
+error handling, and business logic enforcement.
+"""
+
+from __future__ import annotations
+
+import logging
+import os
+import re
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Tuple
+
+from .correction_repository import (
+    CorrectionRepository,
+    ValidationError,
+    DatabaseError
+)
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class ValidationRules:
+    """Tunable limits applied when validating correction input."""
+    # Inclusive bounds on correction text length, in characters
+    max_text_length: int = 1000
+    min_text_length: int = 1
+    # Maximum domain-name length, in characters
+    max_domain_length: int = 50
+    # Letters, digits, underscore and hyphen only. Anchored with \Z rather
+    # than $ because $ also matches just before a trailing newline, which
+    # would let a name such as "general\n" through re.match.
+    allowed_domain_pattern: str = r'^[a-zA-Z0-9_-]+\Z'
+    # Inclusive bounds on the confidence score
+    max_confidence: float = 1.0
+    min_confidence: float = 0.0
+
+
+class CorrectionService:
+    """
+    Service layer for correction management.
+
+    Responsibilities:
+    - Input validation and sanitization
+    - Business rule enforcement
+    - Conflict detection and resolution
+    - Statistics and reporting
+    - Integration with repository layer
+    """
+
+    # Windows reserved device names; a domain may not use any of them.
+    _RESERVED_DOMAIN_NAMES = frozenset(
+        ['con', 'prn', 'aux', 'nul']
+        + [f'com{i}' for i in range(1, 10)]
+        + [f'lpt{i}' for i in range(1, 10)]
+    )
+
+    def __init__(self, repository: CorrectionRepository, rules: Optional[ValidationRules] = None):
+        """
+        Initialize service with repository.
+
+        Args:
+            repository: Data access layer
+            rules: Validation rules (uses defaults if None)
+        """
+        self.repository = repository
+        self.rules = rules or ValidationRules()
+        self.db_path = repository.db_path
+        logger.info("CorrectionService initialized")
+
+    def initialize(self) -> None:
+        """
+        Initialize database (already done by repository, kept for API compatibility).
+        """
+        # Database is auto-initialized by repository on first access
+        logger.info(f"✅ Database ready: {self.db_path}")
+
+    @staticmethod
+    def _current_user() -> str:
+        """Return the user name from $USER or $USERNAME, or "unknown"."""
+        return os.getenv("USER") or os.getenv("USERNAME") or "unknown"
+
+    # ==================== Validation Methods ====================
+
+    def validate_correction_text(self, text: str, field_name: str = "text") -> None:
+        """
+        Validate correction text with comprehensive checks.
+
+        Checks, in order: not None/empty, is a string, not whitespace-only,
+        within the configured length bounds, no NULL bytes, no control
+        characters other than newline and tab.
+
+        Args:
+            text: Text to validate
+            field_name: Field name for error messages
+
+        Raises:
+            ValidationError: If validation fails
+        """
+        # Check not None or empty
+        if not text:
+            raise ValidationError(f"{field_name} cannot be None or empty")
+
+        # Non-string values (e.g. numbers from an imported JSON file) must be
+        # reported as validation failures, not crash on .strip() below
+        if not isinstance(text, str):
+            raise ValidationError(
+                f"{field_name} must be a string, got {type(text).__name__}"
+            )
+
+        # Check not only whitespace
+        if not text.strip():
+            raise ValidationError(f"{field_name} cannot be only whitespace")
+
+        # Check length constraints
+        if len(text) < self.rules.min_text_length:
+            raise ValidationError(
+                f"{field_name} too short: {len(text)} chars (min: {self.rules.min_text_length})"
+            )
+
+        if len(text) > self.rules.max_text_length:
+            raise ValidationError(
+                f"{field_name} too long: {len(text)} chars (max: {self.rules.max_text_length})"
+            )
+
+        # Check for NULL bytes first: '\x00' is also a control character, so
+        # testing it after the generic check would make this message unreachable
+        if '\x00' in text:
+            raise ValidationError(f"{field_name} contains NULL bytes")
+
+        # Check for control characters (except newline and tab)
+        invalid_chars = [c for c in text if ord(c) < 32 and c not in '\n\t']
+        if invalid_chars:
+            raise ValidationError(
+                f"{field_name} contains invalid control characters: {invalid_chars}"
+            )
+
+    def validate_domain_name(self, domain: str) -> None:
+        """
+        Validate domain name to prevent path traversal and injection.
+
+        Args:
+            domain: Domain name to validate
+
+        Raises:
+            ValidationError: If validation fails
+        """
+        if not domain:
+            raise ValidationError("Domain name cannot be empty")
+
+        if not isinstance(domain, str):
+            raise ValidationError(
+                f"Domain name must be a string, got {type(domain).__name__}"
+            )
+
+        if len(domain) > self.rules.max_domain_length:
+            raise ValidationError(
+                f"Domain name too long: {len(domain)} chars (max: {self.rules.max_domain_length})"
+            )
+
+        # Check pattern: only alphanumeric, underscore, hyphen.
+        # fullmatch (not match) so the whole name must conform: with match,
+        # a '$'-anchored pattern still accepts a trailing newline and an
+        # unanchored custom pattern accepts any name with a valid prefix.
+        if not re.fullmatch(self.rules.allowed_domain_pattern, domain):
+            raise ValidationError(
+                f"Domain name contains invalid characters: {domain}. "
+                f"Allowed pattern: {self.rules.allowed_domain_pattern}"
+            )
+
+        # Check for path traversal attempts (still relevant when a custom,
+        # more permissive pattern is configured)
+        if '..' in domain or '/' in domain or '\\' in domain:
+            raise ValidationError(f"Domain name contains path traversal: {domain}")
+
+        # Reserved names
+        if domain.lower() in self._RESERVED_DOMAIN_NAMES:
+            raise ValidationError(f"Domain name is reserved: {domain}")
+
+    def validate_confidence(self, confidence: float) -> None:
+        """
+        Validate confidence score.
+
+        Args:
+            confidence: Score to validate
+
+        Raises:
+            ValidationError: If not numeric or outside the configured range
+        """
+        if not isinstance(confidence, (int, float)):
+            raise ValidationError(f"Confidence must be numeric, got {type(confidence)}")
+
+        if not (self.rules.min_confidence <= confidence <= self.rules.max_confidence):
+            raise ValidationError(
+                f"Confidence must be between {self.rules.min_confidence} "
+                f"and {self.rules.max_confidence}, got {confidence}"
+            )
+
+    def validate_source(self, source: str) -> None:
+        """
+        Validate correction source.
+
+        Args:
+            source: One of 'manual', 'learned', 'imported'
+
+        Raises:
+            ValidationError: If source is not one of the allowed values
+        """
+        valid_sources = ['manual', 'learned', 'imported']
+        if source not in valid_sources:
+            raise ValidationError(
+                f"Invalid source: {source}. Must be one of: {valid_sources}"
+            )
+
+    # ==================== Correction Operations ====================
+
+    def add_correction(
+        self,
+        from_text: str,
+        to_text: str,
+        domain: str = "general",
+        source: str = "manual",
+        confidence: float = 1.0,
+        notes: Optional[str] = None
+    ) -> int:
+        """
+        Add a correction with full validation.
+
+        Args:
+            from_text: Original (incorrect) text
+            to_text: Corrected text
+            domain: Correction domain
+            source: Origin of correction
+            confidence: Confidence score
+            notes: Optional notes
+
+        Returns:
+            ID of inserted correction
+
+        Raises:
+            ValidationError: If validation fails
+            DatabaseError: If the repository fails (logged, then re-raised)
+        """
+        # Comprehensive validation
+        self.validate_correction_text(from_text, "from_text")
+        self.validate_correction_text(to_text, "to_text")
+        self.validate_domain_name(domain)
+        self.validate_source(source)
+        self.validate_confidence(confidence)
+
+        # Business rule: from_text and to_text should be different
+        if from_text.strip() == to_text.strip():
+            raise ValidationError(
+                f"from_text and to_text are identical: '{from_text}'"
+            )
+
+        added_by = self._current_user()
+
+        try:
+            correction_id = self.repository.add_correction(
+                from_text=from_text,
+                to_text=to_text,
+                domain=domain,
+                source=source,
+                confidence=confidence,
+                added_by=added_by,
+                notes=notes
+            )
+
+            logger.info(
+                f"Successfully added correction ID {correction_id}: "
+                f"'{from_text}' → '{to_text}' (domain: {domain})"
+            )
+            return correction_id
+
+        except DatabaseError as e:
+            logger.error(f"Failed to add correction: {e}")
+            raise
+
+    def get_corrections(self, domain: Optional[str] = None) -> Dict[str, str]:
+        """
+        Get corrections as a dictionary for processing.
+
+        Args:
+            domain: Optional domain filter; a falsy value means all domains
+
+        Returns:
+            Dictionary of corrections {from_text: to_text}. Without a domain
+            filter, entries sharing a from_text collapse to whichever one the
+            repository returns last.
+        """
+        if domain:
+            self.validate_domain_name(domain)
+            return self.repository.get_corrections_dict(domain)
+        else:
+            # Get all domains
+            all_corrections = self.repository.get_all_corrections(active_only=True)
+            return {c.from_text: c.to_text for c in all_corrections}
+
+    def remove_correction(
+        self,
+        from_text: str,
+        domain: str = "general"
+    ) -> bool:
+        """
+        Remove a correction (soft delete).
+
+        Args:
+            from_text: Text to remove
+            domain: Domain
+
+        Returns:
+            True if removed, False if not found
+
+        Raises:
+            ValidationError: If from_text or domain is invalid
+        """
+        self.validate_correction_text(from_text, "from_text")
+        self.validate_domain_name(domain)
+
+        deleted_by = self._current_user()
+
+        success = self.repository.delete_correction(from_text, domain, deleted_by)
+
+        if success:
+            logger.info(f"Removed correction: '{from_text}' (domain: {domain})")
+        else:
+            logger.warning(f"Correction not found: '{from_text}' (domain: {domain})")
+
+        return success
+
+    # ==================== Import/Export Operations ====================
+
+    def import_corrections(
+        self,
+        corrections: Dict[str, str],
+        domain: str = "general",
+        merge: bool = True,
+        validate_all: bool = True
+    ) -> Tuple[int, int, int]:
+        """
+        Import corrections with validation and conflict resolution.
+
+        Args:
+            corrections: Dictionary of corrections to import
+            domain: Target domain
+            merge: If True, merge with existing; if False, replace
+            validate_all: If True, validate all before import (safer but slower)
+
+        Returns:
+            Tuple of (inserted_count, updated_count, skipped_count)
+
+        Raises:
+            ValidationError: If the domain is invalid, the dictionary is
+                empty, or any entry fails validation (when validate_all=True)
+            DatabaseError: If the repository fails (logged, then re-raised)
+        """
+        self.validate_domain_name(domain)
+
+        if not corrections:
+            raise ValidationError("Cannot import empty corrections dictionary")
+
+        # Pre-validation (if requested): every entry is checked and logged
+        # before failing, so one run reports all invalid entries
+        if validate_all:
+            logger.info(f"Pre-validating {len(corrections)} corrections...")
+            invalid_count = 0
+            for from_text, to_text in corrections.items():
+                try:
+                    self.validate_correction_text(from_text, "from_text")
+                    self.validate_correction_text(to_text, "to_text")
+                except ValidationError as e:
+                    logger.error(f"Validation failed for '{from_text}' → '{to_text}': {e}")
+                    invalid_count += 1
+
+            if invalid_count > 0:
+                raise ValidationError(
+                    f"Pre-validation failed: {invalid_count}/{len(corrections)} corrections invalid"
+                )
+
+        # Detect conflicts if merge mode (reported only; the import proceeds)
+        if merge:
+            existing = self.repository.get_corrections_dict(domain)
+            conflicts = self._detect_conflicts(corrections, existing)
+
+            if conflicts:
+                logger.warning(
+                    f"Found {len(conflicts)} conflicts that will be overwritten"
+                )
+                for from_text, (old_val, new_val) in conflicts.items():
+                    logger.debug(f"Conflict: '{from_text}': '{old_val}' → '{new_val}'")
+
+        # Perform import
+        imported_by = self._current_user()
+
+        try:
+            inserted, updated, skipped = self.repository.bulk_import_corrections(
+                corrections=corrections,
+                domain=domain,
+                source="imported",
+                imported_by=imported_by,
+                merge=merge
+            )
+
+            logger.info(
+                f"Import complete: {inserted} inserted, {updated} updated, "
+                f"{skipped} skipped (domain: {domain})"
+            )
+
+            return (inserted, updated, skipped)
+
+        except DatabaseError as e:
+            logger.error(f"Import failed: {e}")
+            raise
+
+    def export_corrections(self, domain: str = "general") -> Dict[str, str]:
+        """
+        Export corrections for sharing.
+
+        Args:
+            domain: Domain to export
+
+        Returns:
+            Dictionary of corrections
+
+        Raises:
+            ValidationError: If the domain name is invalid
+        """
+        self.validate_domain_name(domain)
+
+        corrections = self.repository.get_corrections_dict(domain)
+
+        logger.info(f"Exported {len(corrections)} corrections (domain: {domain})")
+
+        return corrections
+
+    # ==================== Statistics and Reporting ====================
+
+    def get_statistics(self, domain: Optional[str] = None) -> Dict[str, Any]:
+        """
+        Get correction statistics.
+
+        Args:
+            domain: Optional domain filter; a falsy value means all domains
+
+        Returns:
+            Dictionary with total_corrections, by_source, total_usage,
+            average_usage, high_confidence_count (confidence >= 0.9) and
+            high_confidence_ratio. Averages and ratios are 0 when there are
+            no corrections.
+        """
+        if domain:
+            self.validate_domain_name(domain)
+            corrections = self.repository.get_all_corrections(domain=domain, active_only=True)
+        else:
+            corrections = self.repository.get_all_corrections(active_only=True)
+
+        # Calculate statistics
+        total = len(corrections)
+        by_source = {'manual': 0, 'learned': 0, 'imported': 0}
+        total_usage = 0
+        high_confidence = 0
+
+        for c in corrections:
+            by_source[c.source] = by_source.get(c.source, 0) + 1
+            total_usage += c.usage_count
+            if c.confidence >= 0.9:
+                high_confidence += 1
+
+        stats = {
+            'total_corrections': total,
+            'by_source': by_source,
+            'total_usage': total_usage,
+            'average_usage': total_usage / total if total > 0 else 0,
+            'high_confidence_count': high_confidence,
+            'high_confidence_ratio': high_confidence / total if total > 0 else 0
+        }
+
+        logger.debug(f"Statistics for domain '{domain}': {stats}")
+
+        return stats
+
+    # ==================== Helper Methods ====================
+
+    def _detect_conflicts(
+        self,
+        incoming: Dict[str, str],
+        existing: Dict[str, str]
+    ) -> Dict[str, Tuple[str, str]]:
+        """
+        Detect conflicts between incoming and existing corrections.
+
+        A conflict is a from_text present in both mappings with different
+        replacement text.
+
+        Returns:
+            Dictionary of conflicts {from_text: (existing_to, incoming_to)}
+        """
+        return {
+            from_text: (existing[from_text], incoming_to)
+            for from_text, incoming_to in incoming.items()
+            if from_text in existing and existing[from_text] != incoming_to
+        }
+
+    def load_context_rules(self) -> List[Dict]:
+        """
+        Load active context-aware regex rules.
+
+        Best effort: any failure is logged and an empty list is returned.
+
+        Returns:
+            List of rule dictionaries with pattern, replacement, description,
+            ordered by descending priority
+        """
+        try:
+            conn = self.repository._get_connection()
+            cursor = conn.execute("""
+                SELECT pattern, replacement, description
+                FROM context_rules
+                WHERE is_active = 1
+                ORDER BY priority DESC
+            """)
+
+            rules = [
+                {
+                    "pattern": row[0],
+                    "replacement": row[1],
+                    "description": row[2]
+                }
+                for row in cursor.fetchall()
+            ]
+
+            logger.debug(f"Loaded {len(rules)} context rules")
+            return rules
+
+        except Exception as e:
+            logger.error(f"Failed to load context rules: {e}")
+            return []
+
+    def save_history(self, filename: str, domain: str, original_length: int,
+                    stage1_changes: int, stage2_changes: int, model: str,
+                    changes: List[Dict]) -> None:
+        """
+        Save correction run history for learning.
+
+        Best effort: any failure is logged and not re-raised, so a history
+        write problem does not abort the caller.
+
+        Args:
+            filename: File that was corrected
+            domain: Correction domain
+            original_length: Original file length
+            stage1_changes: Number of Stage 1 changes
+            stage2_changes: Number of Stage 2 changes
+            model: AI model used
+            changes: List of individual changes
+        """
+        try:
+            with self.repository._transaction() as conn:
+                # Insert history record
+                cursor = conn.execute("""
+                    INSERT INTO correction_history
+                    (filename, domain, original_length, stage1_changes, stage2_changes, model)
+                    VALUES (?, ?, ?, ?, ?, ?)
+                """, (filename, domain, original_length, stage1_changes, stage2_changes, model))
+
+                history_id = cursor.lastrowid
+
+                # Insert individual changes, each linked to the history row
+                for change in changes:
+                    conn.execute("""
+                        INSERT INTO correction_changes
+                        (history_id, line_number, from_text, to_text, rule_type, context_before, context_after)
+                        VALUES (?, ?, ?, ?, ?, ?, ?)
+                    """, (
+                        history_id,
+                        change.get("line_number"),
+                        change.get("from_text", ""),
+                        change.get("to_text", ""),
+                        change.get("rule_type", "dictionary"),
+                        change.get("context_before"),
+                        change.get("context_after")
+                    ))
+
+                logger.info(f"Saved correction history for {filename}: {stage1_changes + stage2_changes} total changes")
+
+        except Exception as e:
+            logger.error(f"Failed to save history: {e}")
+
+    def close(self) -> None:
+        """Close underlying repository."""
+        self.repository.close()
+        logger.info("CorrectionService closed")
--- a/transcript-fixer/scripts/core/dictionary_processor.py
+++ b/transcript-fixer/scripts/core/dictionary_processor.py
@@ -0,0 +1,140 @@
+#!/usr/bin/env python3
+"""
+Dictionary Processor - Stage 1: Dictionary-based Text Corrections
+
+SINGLE RESPONSIBILITY: Apply dictionary and regex-based corrections to text
+
+Features:
+- Apply simple dictionary replacements
+- Apply context-aware regex rules
+- Track all changes for history
+- Case-sensitive literal matching for dictionary entries; regex rules match exactly as their patterns specify
+"""
+
+from __future__ import annotations
+
+import re
+from typing import Dict, List, Tuple
+from dataclasses import dataclass
+
+
+@dataclass
+class Change:
+    """Represents a single text change"""
+    # 1-based line of the match, counted in the text as it stood when the
+    # rule was applied
+    line_number: int
+    # Text that was matched: the dictionary key, or the regex match itself
+    from_text: str
+    # Replacement recorded for the match
+    to_text: str
+    rule_type: str  # "dictionary" or "context_rule"
+    # "corrections_dict" for dictionary entries; for context rules, the
+    # rule's description, falling back to its pattern when that is empty
+    rule_name: str
+
+
+class DictionaryProcessor:
+    """
+    Stage 1 Processor: Apply dictionary-based corrections
+
+    Process:
+    1. Apply context-aware regex rules first (more specific)
+    2. Apply simple dictionary replacements (more general)
+    3. Track all changes for learning
+
+    Rules and dictionary entries are applied one after another, each to the
+    output of the previous one, in the order they were supplied.
+    """
+
+    def __init__(self, corrections: Dict[str, str], context_rules: List[Dict]):
+        """
+        Initialize processor with corrections and rules
+
+        Args:
+            corrections: Dictionary of {wrong: correct} pairs
+            context_rules: List of context-aware regex rules; each needs
+                "pattern" and "replacement" keys, "description" is optional
+        """
+        self.corrections = corrections
+        self.context_rules = context_rules
+
+    def process(self, text: str) -> Tuple[str, List[Change]]:
+        """
+        Apply all corrections to text
+
+        Returns:
+            (corrected_text, list_of_changes), context-rule changes first
+        """
+        # Step 1: Apply context rules (more specific, higher priority)
+        after_rules, context_changes = self._apply_context_rules(text)
+
+        # Step 2: Apply dictionary replacements (more general)
+        corrected_text, dict_changes = self._apply_dictionary(after_rules)
+
+        return corrected_text, context_changes + dict_changes
+
+    def _apply_context_rules(self, text: str) -> Tuple[str, List[Change]]:
+        """
+        Apply context-aware regex rules
+
+        Returns:
+            (corrected_text, changes); each change records the text that the
+            match was actually replaced with (backreferences expanded)
+        """
+        changes = []
+        corrected = text
+
+        for rule in self.context_rules:
+            pattern = rule["pattern"]
+            replacement = rule["replacement"]
+            description = rule.get("description", "")
+
+            # Matches arrive left to right, so the line number is advanced
+            # by counting only the newlines since the previous match
+            line_num = 1
+            scanned_to = 0
+            for match in re.finditer(pattern, corrected):
+                line_num += corrected.count('\n', scanned_to, match.start())
+                scanned_to = match.start()
+                changes.append(Change(
+                    line_number=line_num,
+                    from_text=match.group(0),
+                    # Expand \1 / \g<name> so the record shows the real
+                    # replacement text, not the raw template
+                    to_text=match.expand(replacement),
+                    rule_type="context_rule",
+                    rule_name=description or pattern
+                ))
+
+            # Apply replacement
+            corrected = re.sub(pattern, replacement, corrected)
+
+        return corrected, changes
+
+    def _apply_dictionary(self, text: str) -> Tuple[str, List[Change]]:
+        """
+        Apply simple dictionary replacements
+
+        Matching is literal and case-sensitive. Empty keys are ignored.
+
+        Returns:
+            (corrected_text, changes), one change per replaced occurrence
+        """
+        changes = []
+        corrected = text
+
+        for wrong, correct in self.corrections.items():
+            # An empty key is found at every position without ever advancing
+            # the scan below, so it must be skipped to avoid an endless loop
+            if not wrong or wrong not in corrected:
+                continue
+
+            # Walk the non-overlapping occurrences left to right (the same
+            # ones str.replace substitutes), advancing the line counter by
+            # the newlines between consecutive hits
+            line_num = 1
+            scanned_to = 0
+            pos = corrected.find(wrong)
+            while pos != -1:
+                line_num += corrected.count('\n', scanned_to, pos)
+                scanned_to = pos
+                changes.append(Change(
+                    line_number=line_num,
+                    from_text=wrong,
+                    to_text=correct,
+                    rule_type="dictionary",
+                    rule_name="corrections_dict"
+                ))
+                pos = corrected.find(wrong, pos + len(wrong))
+
+            # Apply replacement
+            corrected = corrected.replace(wrong, correct)
+
+        return corrected, changes
+
+    def get_summary(self, changes: List[Change]) -> Dict[str, int]:
+        """
+        Generate summary statistics
+
+        Returns:
+            Counts keyed by "total_changes", "dictionary_changes" and
+            "context_rule_changes"
+        """
+        summary = {
+            "total_changes": len(changes),
+            "dictionary_changes": sum(1 for c in changes if c.rule_type == "dictionary"),
+            "context_rule_changes": sum(1 for c in changes if c.rule_type == "context_rule")
+        }
+        return summary
--- a/transcript-fixer/scripts/core/learning_engine.py
+++ b/transcript-fixer/scripts/core/learning_engine.py
@@ -0,0 +1,252 @@
+#!/usr/bin/env python3
+"""
+Learning Engine - Pattern Detection from Correction History
+
+SINGLE RESPONSIBILITY: Analyze history and suggest new corrections
+
+Features:
+- Analyze correction history for patterns
+- Detect frequently occurring corrections
+- Calculate confidence scores
+- Generate suggestions for user review
+- Track rejected suggestions to avoid re-suggesting
+"""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+from typing import List, Dict
+from dataclasses import dataclass, asdict
+from collections import defaultdict
+
+
+@dataclass
+class Suggestion:
+    """Represents a learned correction suggestion"""
+    # The (from_text, to_text) pair identifies the pattern being suggested
+    from_text: str
+    to_text: str
+    # Number of recorded occurrences of the pattern
+    frequency: int
+    # Score produced by the learning engine's confidence calculation
+    confidence: float
+    # Up to five occurrence records, each a dict with the keys
+    # file, line, context and timestamp
+    examples: List[Dict]
+    # Timestamps taken from the pattern's recorded occurrences
+    first_seen: str
+    last_seen: str
+    status: str  # "pending", "approved", "rejected"
+
+
+class LearningEngine:
+    """
+    Analyzes correction history to suggest new corrections
+
+    Algorithm:
+    1. Load all history files
+    2. Extract stage2 (AI) changes
+    3. Group by pattern (from_text → to_text)
+    4. Calculate frequency and confidence
+    5. Filter by thresholds
+    6. Save suggestions for user review
+    """
+
+    # Thresholds for suggesting corrections
+    MIN_FREQUENCY = 3  # Must appear at least 3 times
+    MIN_CONFIDENCE = 0.8  # Must have 80%+ confidence
+
+    def __init__(self, history_dir: Path, learned_dir: Path):
+        """
+        Initialize learning engine
+
+        Args:
+            history_dir: Directory containing correction history
+            learned_dir: Directory for learned suggestions (created on first
+                write if it does not exist)
+        """
+        self.history_dir = history_dir
+        self.learned_dir = learned_dir
+        self.pending_file = learned_dir / "pending_review.json"
+        self.rejected_file = learned_dir / "rejected.json"
+
+    def analyze_and_suggest(self) -> List[Suggestion]:
+        """
+        Analyze history and generate suggestions
+
+        Patterns on the rejected list, below MIN_FREQUENCY or below
+        MIN_CONFIDENCE are dropped. Surviving suggestions are merged into
+        the pending file.
+
+        Returns:
+            List of suggestions for user review
+        """
+        patterns = self._extract_patterns()
+        rejected = self._load_rejected()
+
+        suggestions = []
+        for pattern_key, occurrences in patterns.items():
+            # Never re-suggest what the user already rejected
+            if pattern_key in rejected:
+                continue
+
+            frequency = len(occurrences)
+            if frequency < self.MIN_FREQUENCY:
+                continue
+
+            confidence = self._calculate_confidence(occurrences)
+            if confidence < self.MIN_CONFIDENCE:
+                continue
+
+            # History files are not read in chronological order, so take the
+            # extremes rather than the first/last list entries.
+            # NOTE(review): assumes timestamps compare chronologically as
+            # stored (e.g. ISO-8601 strings) - confirm against the writer.
+            timestamps = [occurrence["timestamp"] for occurrence in occurrences]
+
+            from_text, to_text = pattern_key
+            suggestions.append(Suggestion(
+                from_text=from_text,
+                to_text=to_text,
+                frequency=frequency,
+                confidence=confidence,
+                examples=occurrences[:5],  # First 5 examples
+                first_seen=min(timestamps),
+                last_seen=max(timestamps),
+                status="pending"
+            ))
+
+        # Save new suggestions
+        if suggestions:
+            self._save_pending_suggestions(suggestions)
+
+        return suggestions
+
+    def approve_suggestion(self, from_text: str) -> bool:
+        """
+        Approve a suggestion (remove from pending)
+
+        Only the first pending entry with a matching from_text is removed.
+
+        Returns:
+            True if approved, False if not found
+        """
+        pending = self._load_pending_suggestions()
+
+        for index, suggestion in enumerate(pending):
+            if suggestion["from_text"] == from_text:
+                del pending[index]
+                self._save_suggestions(pending, self.pending_file)
+                return True
+
+        return False
+
+    def reject_suggestion(self, from_text: str, to_text: str) -> None:
+        """
+        Reject a suggestion (move to rejected list)
+
+        The pair is added to the rejected list even if it was not pending.
+        """
+        # Remove from pending
+        pending = self._load_pending_suggestions()
+        pending = [s for s in pending
+                  if not (s["from_text"] == from_text and s["to_text"] == to_text)]
+        self._save_suggestions(pending, self.pending_file)
+
+        # Add to rejected
+        rejected = self._load_rejected()
+        rejected.add((from_text, to_text))
+        self._save_rejected(rejected)
+
+    def list_pending(self) -> List[Dict]:
+        """List all pending suggestions"""
+        return self._load_pending_suggestions()
+
+    def _extract_patterns(self) -> Dict[tuple, List[Dict]]:
+        """
+        Extract all correction patterns from history
+
+        Returns:
+            Mapping of (from, to) to the list of occurrence records
+            (file, line, context, timestamp); empty if history_dir is missing
+        """
+        patterns = defaultdict(list)
+
+        if not self.history_dir.exists():
+            return patterns
+
+        # Sorted so occurrence order (and therefore the saved examples) is
+        # reproducible across runs and platforms
+        for history_file in sorted(self.history_dir.glob("*.json")):
+            with open(history_file, 'r', encoding='utf-8') as f:
+                data = json.load(f)
+
+            # Extract stage2 changes (AI corrections)
+            if "stages" in data and "stage2" in data["stages"]:
+                changes = data["stages"]["stage2"].get("changes", [])
+
+                for change in changes:
+                    key = (change["from"], change["to"])
+                    patterns[key].append({
+                        "file": data["filename"],
+                        "line": change.get("line", 0),
+                        "context": change.get("context", ""),
+                        "timestamp": data["timestamp"]
+                    })
+
+        return patterns
+
+    def _calculate_confidence(self, occurrences: List[Dict]) -> float:
+        """
+        Calculate confidence score for a pattern
+
+        Factors:
+        - Frequency (more = higher)
+        - Consistency (always same correction = higher)
+        - Recency (recent occurrences = higher)
+
+        Only the number of occurrences influences the result today:
+        consistency is constant and recency is a two-level placeholder.
+        With these weights a pattern needs at least 7 occurrences to reach
+        MIN_CONFIDENCE (0.8), so MIN_FREQUENCY (3) is not the binding
+        threshold.
+        """
+        # Base confidence from frequency, saturating at 10 occurrences
+        frequency_score = min(len(occurrences) / 10.0, 1.0)
+
+        # Consistency: always the same from→to mapping
+        consistency_score = 1.0  # Already consistent by grouping
+
+        # Recency: more recent = higher
+        # (Simplified: timestamps are not inspected)
+        recency_score = 0.9 if len(occurrences) > 1 else 0.8
+
+        # Weighted average
+        confidence = (
+            0.5 * frequency_score +
+            0.3 * consistency_score +
+            0.2 * recency_score
+        )
+
+        return confidence
+
+    def _load_pending_suggestions(self) -> List[Dict]:
+        """Load pending suggestions from file (empty list if missing or blank)"""
+        if not self.pending_file.exists():
+            return []
+
+        with open(self.pending_file, 'r', encoding='utf-8') as f:
+            content = f.read().strip()
+            if not content:
+                return []
+            return json.loads(content).get("suggestions", [])
+
+    def _save_pending_suggestions(self, suggestions: List[Suggestion]) -> None:
+        """
+        Merge suggestions into the pending file
+
+        Entries are keyed by (from_text, to_text): a new suggestion replaces
+        the stored entry for the same pair instead of being appended again,
+        so repeated analysis runs do not pile up duplicates.
+        """
+        merged = {
+            (entry.get("from_text"), entry.get("to_text")): entry
+            for entry in self._load_pending_suggestions()
+        }
+        for suggestion in suggestions:
+            merged[(suggestion.from_text, suggestion.to_text)] = asdict(suggestion)
+
+        self._save_suggestions(list(merged.values()), self.pending_file)
+
+    def _save_suggestions(self, suggestions: List[Dict], filepath: Path) -> None:
+        """Save suggestions to file, creating its directory if needed"""
+        data = {"suggestions": suggestions}
+        filepath.parent.mkdir(parents=True, exist_ok=True)
+        with open(filepath, 'w', encoding='utf-8') as f:
+            json.dump(data, f, ensure_ascii=False, indent=2)
+
+    def _load_rejected(self) -> set:
+        """Load rejected patterns as a set of (from, to) tuples"""
+        if not self.rejected_file.exists():
+            return set()
+
+        with open(self.rejected_file, 'r', encoding='utf-8') as f:
+            content = f.read().strip()
+            if not content:
+                return set()
+            data = json.loads(content)
+            return {(r["from"], r["to"]) for r in data.get("rejected", [])}
+
+    def _save_rejected(self, rejected: set) -> None:
+        """Save rejected patterns, creating the directory if needed"""
+        data = {
+            "rejected": [
+                {"from": from_text, "to": to_text}
+                # Sorted so the file content does not depend on set order
+                for from_text, to_text in sorted(rejected)
+            ]
+        }
+        self.rejected_file.parent.mkdir(parents=True, exist_ok=True)
+        with open(self.rejected_file, 'w', encoding='utf-8') as f:
+            json.dump(data, f, ensure_ascii=False, indent=2)
--- a/transcript-fixer/scripts/core/schema.sql
+++ b/transcript-fixer/scripts/core/schema.sql
@@ -0,0 +1,215 @@
+-- Transcript Fixer Database Schema v2.0
+-- Migration from JSON to SQLite for ACID compliance and scalability
+-- Author: ISTJ Chief Engineer
+-- Date: 2025-01-28
+
+-- Enable foreign keys
+-- NOTE(review): in SQLite this pragma applies only to the connection that runs
+-- it and is not stored in the database file. Confirm the application also sets
+-- it on every connection it opens; otherwise the ON DELETE CASCADE clauses in
+-- this schema are not enforced.
+PRAGMA foreign_keys = ON;
+
+-- Table: corrections
+-- Stores all correction mappings with metadata
+-- Each row maps from_text to to_text within a domain. UNIQUE(from_text, domain)
+-- allows at most one mapping per source text in each domain.
+CREATE TABLE IF NOT EXISTS corrections (
+    id INTEGER PRIMARY KEY AUTOINCREMENT,
+    from_text TEXT NOT NULL,
+    to_text TEXT NOT NULL,
+    domain TEXT NOT NULL DEFAULT 'general',
+    -- Origin of the rule; only the three values in the CHECK are accepted
+    source TEXT NOT NULL CHECK(source IN ('manual', 'learned', 'imported')),
+    -- Constrained to the range [0.0, 1.0]; defaults to 1.0
+    confidence REAL NOT NULL DEFAULT 1.0 CHECK(confidence >= 0.0 AND confidence <= 1.0),
+    added_by TEXT,
+    added_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
+    -- Non-negative counter; this schema defines no trigger that maintains
+    -- usage_count or last_used (presumably updated by the application - confirm)
+    usage_count INTEGER NOT NULL DEFAULT 0 CHECK(usage_count >= 0),
+    last_used TIMESTAMP,
+    notes TEXT,
+    -- Rows with is_active = 1 are the ones selected by the active_corrections
+    -- and correction_statistics views
+    is_active BOOLEAN NOT NULL DEFAULT 1,
+    UNIQUE(from_text, domain)
+);
+
+-- Secondary indexes on the columns used for filtering and sorting.
+-- NOTE(review): UNIQUE(from_text, domain) already creates an index whose leading
+-- column is from_text, so idx_corrections_from_text may be redundant - confirm
+-- before removing.
+CREATE INDEX IF NOT EXISTS idx_corrections_domain ON corrections(domain);
+CREATE INDEX IF NOT EXISTS idx_corrections_source ON corrections(source);
+CREATE INDEX IF NOT EXISTS idx_corrections_added_at ON corrections(added_at);
+CREATE INDEX IF NOT EXISTS idx_corrections_is_active ON corrections(is_active);
+CREATE INDEX IF NOT EXISTS idx_corrections_from_text ON corrections(from_text);
+
+-- Table: context_rules
+-- Regex-based context-aware correction rules
+-- Each pattern may appear only once (UNIQUE). Unlike corrections, this table
+-- has no domain column.
+CREATE TABLE IF NOT EXISTS context_rules (
+    id INTEGER PRIMARY KEY AUTOINCREMENT,
+    pattern TEXT NOT NULL UNIQUE,
+    replacement TEXT NOT NULL,
+    description TEXT,
+    -- Defaults to 0. NOTE(review): the DESC index below suggests higher values
+    -- are applied first - confirm against the code that loads the rules
+    priority INTEGER NOT NULL DEFAULT 0,
+    is_active BOOLEAN NOT NULL DEFAULT 1,
+    added_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
+    added_by TEXT
+);
+
+-- Indexes for reading rules in descending priority order and filtering by active flag
+CREATE INDEX IF NOT EXISTS idx_context_rules_priority ON context_rules(priority DESC);
+CREATE INDEX IF NOT EXISTS idx_context_rules_is_active ON context_rules(is_active);
+
+-- Table: correction_history
+-- Audit log for all correction runs
+-- One row per run. The per-change details live in correction_changes, which
+-- references this table through history_id.
+CREATE TABLE IF NOT EXISTS correction_history (
+    id INTEGER PRIMARY KEY AUTOINCREMENT,
+    filename TEXT NOT NULL,
+    domain TEXT NOT NULL,
+    run_timestamp TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
+    -- Non-negative; NOTE(review): unit (characters vs. bytes) is not defined here
+    original_length INTEGER NOT NULL CHECK(original_length >= 0),
+    -- Non-negative change counts for the two pipeline stages (the core module
+    -- docs describe stage 1 as dictionary-based and stage 2 as AI-powered)
+    stage1_changes INTEGER NOT NULL DEFAULT 0 CHECK(stage1_changes >= 0),
+    stage2_changes INTEGER NOT NULL DEFAULT 0 CHECK(stage2_changes >= 0),
+    -- Nullable, so a run can be recorded without a model name
+    model TEXT,
+    -- Nullable; when present it must be >= 0
+    execution_time_ms INTEGER CHECK(execution_time_ms >= 0),
+    -- Defaults to 1; error_message is nullable
+    success BOOLEAN NOT NULL DEFAULT 1,
+    error_message TEXT
+);
+
+-- Indexes for newest-first listing and for filtering by domain or outcome
+CREATE INDEX IF NOT EXISTS idx_history_run_timestamp ON correction_history(run_timestamp DESC);
+CREATE INDEX IF NOT EXISTS idx_history_domain ON correction_history(domain);
+CREATE INDEX IF NOT EXISTS idx_history_success ON correction_history(success);
+
+-- Table: correction_changes
+-- Detailed changes made in each correction run
+-- Child rows of correction_history. Deleting a history row deletes its changes
+-- (ON DELETE CASCADE, which requires foreign key enforcement to be enabled).
+CREATE TABLE IF NOT EXISTS correction_changes (
+    id INTEGER PRIMARY KEY AUTOINCREMENT,
+    history_id INTEGER NOT NULL,
+    line_number INTEGER,
+    from_text TEXT NOT NULL,
+    to_text TEXT NOT NULL,
+    -- Which kind of rule produced the change; only these three values are accepted
+    rule_type TEXT NOT NULL CHECK(rule_type IN ('context', 'dictionary', 'ai')),
+    -- Nullable and without a foreign key. NOTE(review): presumably points at
+    -- context_rules.id or corrections.id depending on rule_type - confirm
+    rule_id INTEGER,
+    context_before TEXT,
+    context_after TEXT,
+    FOREIGN KEY (history_id) REFERENCES correction_history(id) ON DELETE CASCADE
+);
+
+-- Indexes for fetching the changes of one run and for filtering by rule type
+CREATE INDEX IF NOT EXISTS idx_changes_history_id ON correction_changes(history_id);
+CREATE INDEX IF NOT EXISTS idx_changes_rule_type ON correction_changes(rule_type);
+
+-- Table: learned_suggestions
+-- AI-learned patterns pending user review
+-- A given (from_text, to_text, domain) combination is stored once (UNIQUE).
+CREATE TABLE IF NOT EXISTS learned_suggestions (
+    id INTEGER PRIMARY KEY AUTOINCREMENT,
+    from_text TEXT NOT NULL,
+    to_text TEXT NOT NULL,
+    domain TEXT NOT NULL DEFAULT 'general',
+    -- Strictly positive; defaults to 1
+    frequency INTEGER NOT NULL DEFAULT 1 CHECK(frequency > 0),
+    -- Required (no default) and constrained to [0.0, 1.0]
+    confidence REAL NOT NULL CHECK(confidence >= 0.0 AND confidence <= 1.0),
+    first_seen TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
+    last_seen TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
+    -- Review state; new rows start as 'pending'
+    status TEXT NOT NULL DEFAULT 'pending' CHECK(status IN ('pending', 'approved', 'rejected')),
+    -- Nullable; this schema does not enforce that they are set when status changes
+    reviewed_at TIMESTAMP,
+    reviewed_by TEXT,
+    UNIQUE(from_text, to_text, domain)
+);
+
+-- Indexes for filtering by status/domain and for ordering by confidence or frequency
+CREATE INDEX IF NOT EXISTS idx_suggestions_status ON learned_suggestions(status);
+CREATE INDEX IF NOT EXISTS idx_suggestions_domain ON learned_suggestions(domain);
+CREATE INDEX IF NOT EXISTS idx_suggestions_confidence ON learned_suggestions(confidence DESC);
+CREATE INDEX IF NOT EXISTS idx_suggestions_frequency ON learned_suggestions(frequency DESC);
+
+-- Table: suggestion_examples
+-- Example occurrences of learned patterns
+-- Child rows of learned_suggestions. Deleting a suggestion deletes its examples
+-- (ON DELETE CASCADE, which requires foreign key enforcement to be enabled).
+CREATE TABLE IF NOT EXISTS suggestion_examples (
+    id INTEGER PRIMARY KEY AUTOINCREMENT,
+    suggestion_id INTEGER NOT NULL,
+    filename TEXT NOT NULL,
+    -- Nullable, unlike filename and context
+    line_number INTEGER,
+    context TEXT NOT NULL,
+    occurred_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
+    FOREIGN KEY (suggestion_id) REFERENCES learned_suggestions(id) ON DELETE CASCADE
+);
+
+-- Index for fetching (and counting) the examples of one suggestion
+CREATE INDEX IF NOT EXISTS idx_examples_suggestion_id ON suggestion_examples(suggestion_id);
+
+-- Table: system_config
+-- System configuration and preferences
+-- Key/value store: every value is kept as TEXT, and value_type records which of
+-- the five listed types it represents.
+CREATE TABLE IF NOT EXISTS system_config (
+    key TEXT PRIMARY KEY,
+    value TEXT NOT NULL,
+    value_type TEXT NOT NULL CHECK(value_type IN ('string', 'int', 'float', 'boolean', 'json')),
+    description TEXT,
+    -- Set on insert only; this schema defines no trigger that refreshes it, so
+    -- updates must set updated_at explicitly (unless handled elsewhere)
+    updated_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP
+);
+
+-- Insert default configuration
+-- OR IGNORE skips rows whose key already exists, so re-running this script
+-- does not overwrite values that are already stored.
+INSERT OR IGNORE INTO system_config (key, value, value_type, description) VALUES
+    ('schema_version', '2.0', 'string', 'Database schema version'),
+    ('api_provider', 'GLM', 'string', 'API provider name'),
+    ('api_model', 'GLM-4.6', 'string', 'Default AI model'),
+    ('api_base_url', 'https://open.bigmodel.cn/api/anthropic', 'string', 'API endpoint URL'),
+    ('default_domain', 'general', 'string', 'Default correction domain'),
+    ('auto_learn_enabled', 'true', 'boolean', 'Enable automatic pattern learning'),
+    ('backup_enabled', 'true', 'boolean', 'Create backups before operations'),
+    ('learning_frequency_threshold', '3', 'int', 'Min frequency for learned suggestions'),
+    ('learning_confidence_threshold', '0.8', 'float', 'Min confidence for learned suggestions'),
+    ('history_retention_days', '90', 'int', 'Days to retain correction history'),
+    ('max_correction_length', '1000', 'int', 'Maximum length for correction text');
+
+-- Table: audit_log
+-- Comprehensive audit trail for all operations
+-- Generic log: action and entity_type are free-text (no CHECK constraint), and
+-- entity_id has no foreign key, so it can refer to a row in any table.
+CREATE TABLE IF NOT EXISTS audit_log (
+    id INTEGER PRIMARY KEY AUTOINCREMENT,
+    timestamp TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
+    action TEXT NOT NULL,
+    entity_type TEXT NOT NULL,
+    entity_id INTEGER,
+    user TEXT,
+    -- Nullable free-form text; its format is not defined by this schema
+    details TEXT,
+    -- Defaults to 1; error_message is nullable
+    success BOOLEAN NOT NULL DEFAULT 1,
+    error_message TEXT
+);
+
+-- Indexes for newest-first listing and for filtering by action, entity type or outcome
+CREATE INDEX IF NOT EXISTS idx_audit_timestamp ON audit_log(timestamp DESC);
+CREATE INDEX IF NOT EXISTS idx_audit_action ON audit_log(action);
+CREATE INDEX IF NOT EXISTS idx_audit_entity_type ON audit_log(entity_type);
+CREATE INDEX IF NOT EXISTS idx_audit_success ON audit_log(success);
+
+-- View: active_corrections
+-- Quick access to active corrections
+-- Returns only rows with is_active = 1, sorted by domain and then from_text.
+-- The added_by, notes and is_active columns of corrections are not exposed.
+CREATE VIEW IF NOT EXISTS active_corrections AS
+SELECT
+    id,
+    from_text,
+    to_text,
+    domain,
+    source,
+    confidence,
+    usage_count,
+    last_used,
+    added_at
+FROM corrections
+WHERE is_active = 1
+ORDER BY domain, from_text;
+
+-- View: pending_suggestions
+-- Quick access to suggestions pending review
+-- One row per learned_suggestions row whose status is 'pending', ordered by
+-- confidence and then frequency, both descending.
+CREATE VIEW IF NOT EXISTS pending_suggestions AS
+SELECT
+    s.id,
+    s.from_text,
+    s.to_text,
+    s.domain,
+    s.frequency,
+    s.confidence,
+    s.first_seen,
+    s.last_seen,
+    -- COUNT(e.id) ignores the NULL produced by the LEFT JOIN, so a suggestion
+    -- with no stored examples reports 0
+    COUNT(e.id) as example_count
+FROM learned_suggestions s
+LEFT JOIN suggestion_examples e ON s.id = e.suggestion_id
+WHERE s.status = 'pending'
+GROUP BY s.id
+ORDER BY s.confidence DESC, s.frequency DESC;
+
+-- View: correction_statistics
+-- Statistics per domain
+-- Aggregates only active corrections (is_active = 1), so a domain whose
+-- corrections are all inactive does not appear.
+CREATE VIEW IF NOT EXISTS correction_statistics AS
+SELECT
+    domain,
+    COUNT(*) as total_corrections,
+    -- CASE without ELSE yields NULL for non-matching rows, which COUNT skips,
+    -- so each expression counts only the rows of that source
+    COUNT(CASE WHEN source = 'manual' THEN 1 END) as manual_count,
+    COUNT(CASE WHEN source = 'learned' THEN 1 END) as learned_count,
+    COUNT(CASE WHEN source = 'imported' THEN 1 END) as imported_count,
+    SUM(usage_count) as total_usage,
+    -- Latest added_at in the domain, i.e. when a correction was last added
+    MAX(added_at) as last_updated
+FROM corrections
+WHERE is_active = 1
+GROUP BY domain;