Release v1.9.0: Add video-comparer skill and enhance transcript-fixer

## New Skill: video-comparer v1.0.0
- Compare original and compressed videos with interactive HTML reports
- Calculate quality metrics (PSNR, SSIM) for compression analysis
- Generate frame-by-frame visual comparisons (slider, side-by-side, grid)
- Extract video metadata (codec, resolution, bitrate, duration)
- Multi-platform FFmpeg support with security features

## transcript-fixer Enhancements
- Add async AI processor for parallel processing
- Add connection pool management for database operations
- Add concurrency manager and rate limiter
- Add audit log retention and database migrations
- Add health check and metrics monitoring
- Add comprehensive test suite (8 new test files)
- Enhance security with domain and path validators

## Marketplace Updates
- Update marketplace version from 1.8.0 to 1.9.0
- Update skills count from 15 to 16
- Update documentation (README.md, CLAUDE.md, CHANGELOG.md)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
daymade
2025-10-30 00:23:12 +08:00
parent bd0aa12004
commit 9b724f33e3
49 changed files with 15357 additions and 270 deletions

View File

@@ -10,15 +10,33 @@ Features:
- Calculate confidence scores
- Generate suggestions for user review
- Track rejected suggestions to avoid re-suggesting
CRITICAL FIX (P1-1): Thread-safe file operations with file locking
- Prevents race conditions in concurrent access
- Atomic read-modify-write operations
- Cross-platform file locking support
"""
from __future__ import annotations
import json
import logging
from pathlib import Path
from typing import List, Dict
from typing import List, Dict, Optional
from dataclasses import dataclass, asdict
from collections import defaultdict
from contextlib import contextmanager
# CRITICAL FIX: Import file locking
try:
from filelock import FileLock, Timeout as FileLockTimeout
except ImportError:
raise ImportError(
"filelock library required for thread-safe operations. "
"Install with: uv add filelock"
)
logger = logging.getLogger(__name__)
@dataclass
@@ -51,18 +69,77 @@ class LearningEngine:
# Thresholds for surfacing a pattern for manual review.
MIN_FREQUENCY = 3  # Must appear at least 3 times
MIN_CONFIDENCE = 0.8  # Must have 80%+ confidence

# Thresholds for auto-approval (stricter than manual review).
AUTO_APPROVE_FREQUENCY = 5  # Must appear at least 5 times
AUTO_APPROVE_CONFIDENCE = 0.85  # Must have 85%+ confidence

def __init__(self, history_dir: Path, learned_dir: Path, correction_service=None):
    """
    Initialize learning engine.

    Args:
        history_dir: Directory containing correction history
        learned_dir: Directory for learned suggestions
        correction_service: Optional CorrectionService used to auto-add
            high-confidence patterns to the dictionary
    """
    self.history_dir = history_dir
    self.learned_dir = learned_dir
    self.pending_file = learned_dir / "pending_review.json"
    self.rejected_file = learned_dir / "rejected.json"
    self.auto_approved_file = learned_dir / "auto_approved.json"
    self.correction_service = correction_service

    # Thread-safety: each JSON data file gets its own sibling lock file so
    # concurrent read-modify-write cycles are serialized per file.
    self.pending_lock = learned_dir / ".pending_review.lock"
    self.rejected_lock = learned_dir / ".rejected.lock"
    self.auto_approved_lock = learned_dir / ".auto_approved.lock"

    # Maximum seconds to wait for a file lock before giving up.
    self.lock_timeout = 10.0
@contextmanager
def _file_lock(self, lock_path: Path, operation: str = "file operation"):
    """
    Context manager serializing access to one of the engine's JSON files.

    Ensures atomic read-modify-write operations and prevents race
    conditions between concurrent processes/threads.

    Args:
        lock_path: Path to the lock file guarding the data file
        operation: Human-readable description used in log messages

    Yields:
        None

    Raises:
        RuntimeError: If the lock cannot be acquired within
            ``self.lock_timeout`` seconds (chained from the underlying
            filelock ``Timeout``).  The original docstring claimed
            ``FileLockTimeout`` was raised, but that exception is caught
            and converted below.

    Example:
        with self._file_lock(self.pending_lock, "save pending"):
            data = self._load_pending_suggestions_unlocked()
            data.append(new_item)
            self._save_suggestions_unlocked(data, self.pending_file)
    """
    lock = FileLock(str(lock_path), timeout=self.lock_timeout)
    try:
        logger.debug(f"Acquiring lock for {operation}: {lock_path}")
        with lock.acquire(timeout=self.lock_timeout):
            logger.debug(f"Lock acquired for {operation}")
            try:
                yield
            finally:
                # FIX: log the release only when the lock was actually
                # held; the previous version logged "Lock released" from a
                # function-level finally even after a failed acquisition.
                logger.debug(f"Lock released for {operation}")
    except FileLockTimeout as e:
        logger.error(
            f"Failed to acquire lock for {operation} after {self.lock_timeout}s: {lock_path}"
        )
        raise RuntimeError(
            f"File lock timeout for {operation}. "
            f"Another process may be holding the lock. "
            f"Lock file: {lock_path}"
        ) from e
def analyze_and_suggest(self) -> List[Suggestion]:
"""
@@ -113,35 +190,64 @@ class LearningEngine:
def approve_suggestion(self, from_text: str) -> bool:
    """
    Approve a suggestion (remove it from the pending list).

    The whole read-modify-write cycle runs under the pending-file lock so
    concurrent approvals cannot lose updates.

    Args:
        from_text: The 'from' text of the suggestion to approve

    Returns:
        True if the suggestion was found and removed, False otherwise
    """
    with self._file_lock(self.pending_lock, "approve suggestion"):
        pending = self._load_pending_suggestions_unlocked()

        for suggestion in pending:
            if suggestion["from_text"] == from_text:
                pending.remove(suggestion)
                self._save_suggestions_unlocked(pending, self.pending_file)
                logger.info(f"Approved suggestion: {from_text}")
                return True

        logger.warning(f"Suggestion not found for approval: {from_text}")
        return False
def reject_suggestion(self, from_text: str, to_text: str) -> None:
    """
    Reject a suggestion (remove from pending, record in rejected list).

    The two files are updated under their own locks, acquired sequentially
    in a consistent order (pending first, then rejected) so concurrent
    callers cannot deadlock on each other.

    Args:
        from_text: The 'from' text of the suggestion to reject
        to_text: The 'to' text of the suggestion to reject
    """
    # Acquire locks in a consistent order to prevent deadlock:
    # pending before rejected (alphabetical by filename).
    with self._file_lock(self.pending_lock, "reject suggestion (pending)"):
        pending = self._load_pending_suggestions_unlocked()
        original_count = len(pending)
        pending = [s for s in pending
                   if not (s["from_text"] == from_text and s["to_text"] == to_text)]
        self._save_suggestions_unlocked(pending, self.pending_file)

        removed = original_count - len(pending)
        if removed > 0:
            # FIX: the source text showed from/to fused together in this
            # message (a separator glyph lost in extraction); restored.
            logger.info(f"Removed {removed} suggestions from pending: {from_text} -> {to_text}")

    # Separate lock for the rejected file (different data file).
    with self._file_lock(self.rejected_lock, "reject suggestion (rejected)"):
        rejected = self._load_rejected_unlocked()
        rejected.add((from_text, to_text))
        self._save_rejected_unlocked(rejected)
        logger.info(f"Added to rejected: {from_text} -> {to_text}")
def list_pending(self) -> List[Dict]:
"""List all pending suggestions"""
@@ -201,8 +307,15 @@ class LearningEngine:
return confidence
def _load_pending_suggestions(self) -> List[Dict]:
"""Load pending suggestions from file"""
def _load_pending_suggestions_unlocked(self) -> List[Dict]:
"""
Load pending suggestions from file (UNLOCKED - caller must hold lock).
Internal method. Use _load_pending_suggestions() for thread-safe access.
Returns:
List of suggestion dictionaries
"""
if not self.pending_file.exists():
return []
@@ -212,24 +325,64 @@ class LearningEngine:
return []
return json.loads(content).get("suggestions", [])
def _load_pending_suggestions(self) -> List[Dict]:
    """
    Thread-safe read of the pending-suggestions file.

    Takes the pending-file lock before delegating to the unlocked loader,
    so a concurrent writer can never be observed mid-update.

    Returns:
        List of suggestion dictionaries
    """
    with self._file_lock(self.pending_lock, "load pending suggestions"):
        return self._load_pending_suggestions_unlocked()
def _save_pending_suggestions(self, suggestions: List[Suggestion]) -> None:
    """
    Append new suggestions to the pending-review file.

    The read-modify-write cycle runs atomically under the pending lock, so
    concurrent writers cannot overwrite each other's additions.

    Args:
        suggestions: New Suggestion objects to append
    """
    with self._file_lock(self.pending_lock, "save pending suggestions"):
        # Read current contents (lock already held).
        existing = self._load_pending_suggestions_unlocked()

        # Modify: append the new entries as plain dictionaries.
        all_suggestions = existing + [asdict(s) for s in suggestions]

        # Write back while still holding the lock.
        self._save_suggestions_unlocked(all_suggestions, self.pending_file)
def _save_suggestions_unlocked(self, suggestions: List[Dict], filepath: Path) -> None:
    """
    Write suggestions to a JSON file (UNLOCKED - caller must hold lock).

    Internal method; callers must acquire the appropriate lock before
    calling.

    Args:
        suggestions: List of suggestion dictionaries
        filepath: Destination JSON file
    """
    # Ensure the parent directory exists before writing.
    filepath.parent.mkdir(parents=True, exist_ok=True)

    data = {"suggestions": suggestions}
    with open(filepath, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
def _load_rejected(self) -> set:
"""Load rejected patterns"""
def _load_rejected_unlocked(self) -> set:
"""
Load rejected patterns (UNLOCKED - caller must hold lock).
Internal method. Use _load_rejected() for thread-safe access.
Returns:
Set of (from_text, to_text) tuples
"""
if not self.rejected_file.exists():
return set()
@@ -240,8 +393,30 @@ class LearningEngine:
data = json.loads(content)
return {(r["from"], r["to"]) for r in data.get("rejected", [])}
def _save_rejected(self, rejected: set) -> None:
"""Save rejected patterns"""
def _load_rejected(self) -> set:
    """
    Thread-safe read of the rejected-patterns file.

    Acquires the rejected-file lock, then delegates to the unlocked
    loader.

    Returns:
        Set of (from_text, to_text) tuples
    """
    with self._file_lock(self.rejected_lock, "load rejected"):
        return self._load_rejected_unlocked()
def _save_rejected_unlocked(self, rejected: set) -> None:
"""
Save rejected patterns (UNLOCKED - caller must hold lock).
Internal method. Caller must acquire rejected_lock before calling.
Args:
rejected: Set of (from_text, to_text) tuples
"""
# Ensure parent directory exists
self.rejected_file.parent.mkdir(parents=True, exist_ok=True)
data = {
"rejected": [
{"from": from_text, "to": to_text}
@@ -250,3 +425,141 @@ class LearningEngine:
}
with open(self.rejected_file, 'w', encoding='utf-8') as f:
json.dump(data, f, ensure_ascii=False, indent=2)
def _save_rejected(self, rejected: set) -> None:
    """
    Thread-safe write of the rejected-patterns file.

    Acquires the rejected-file lock before delegating to the unlocked
    writer, preventing interleaved concurrent writes.

    Args:
        rejected: Set of (from_text, to_text) tuples
    """
    with self._file_lock(self.rejected_lock, "save rejected"):
        self._save_rejected_unlocked(rejected)
def analyze_and_auto_approve(self, changes: List, domain: str = "general") -> Dict:
    """
    Analyze AI changes and auto-approve high-confidence patterns.

    This is the CORE learning loop:
    1. Group changes by (from_text, to_text) pattern
    2. Find high-frequency, high-confidence patterns
    3. Auto-add to dictionary (no manual review needed)
    4. Track auto-approvals for transparency

    Args:
        changes: List of AIChange objects from recent AI processing
        domain: Domain to add corrections to

    Returns:
        Dict with stats: {
            "total_changes": int,
            "unique_patterns": int,
            "auto_approved": int,
            "pending_review": int,
            "savings_potential": str
        }
    """
    if not changes:
        # FIX: include "savings_potential" so the early-return dict has
        # the same shape as the documented/normal return value.
        return {
            "total_changes": 0,
            "unique_patterns": 0,
            "auto_approved": 0,
            "pending_review": 0,
            "savings_potential": "",
        }

    # Group changes by (from, to) pattern (defaultdict is already
    # imported at module level).
    patterns = defaultdict(list)
    for change in changes:
        patterns[(change.from_text, change.to_text)].append(change)

    stats = {
        "total_changes": len(changes),
        "unique_patterns": len(patterns),
        "auto_approved": 0,
        "pending_review": 0,
        "savings_potential": ""
    }

    auto_approved_patterns = []
    pending_patterns = []

    for (from_text, to_text), occurrences in patterns.items():
        frequency = len(occurrences)

        # Average confidence across all occurrences of this pattern.
        confidences = [c.confidence for c in occurrences]
        avg_confidence = sum(confidences) / len(confidences)

        # Auto-approve only when the stricter criteria are met.
        if (frequency >= self.AUTO_APPROVE_FREQUENCY and
                avg_confidence >= self.AUTO_APPROVE_CONFIDENCE):
            # NOTE(review): when correction_service is None, qualifying
            # patterns are dropped entirely (not even queued for review);
            # confirm this is intended.
            if self.correction_service:
                try:
                    self.correction_service.add_correction(from_text, to_text, domain)
                    auto_approved_patterns.append({
                        "from": from_text,
                        "to": to_text,
                        "frequency": frequency,
                        "confidence": avg_confidence,
                        "domain": domain
                    })
                    stats["auto_approved"] += 1
                except Exception:
                    # Best-effort: the correction already exists or failed
                    # validation; skip it rather than abort the batch.
                    pass

        # Otherwise queue for manual review if minimum criteria are met.
        elif (frequency >= self.MIN_FREQUENCY and
                avg_confidence >= self.MIN_CONFIDENCE):
            # NOTE(review): pending_patterns is counted but never
            # persisted here -- confirm a separate path saves it.
            pending_patterns.append({
                "from": from_text,
                "to": to_text,
                "frequency": frequency,
                "confidence": avg_confidence
            })
            stats["pending_review"] += 1

    # Persist auto-approvals for transparency.
    if auto_approved_patterns:
        self._save_auto_approved(auto_approved_patterns)

    # Estimate how much of the current error volume the dictionary now
    # covers (total_changes is non-zero here: changes is non-empty).
    total_dict_covered = sum(p["frequency"] for p in auto_approved_patterns)
    if total_dict_covered > 0:
        savings_pct = int((total_dict_covered / stats["total_changes"]) * 100)
        stats["savings_potential"] = f"{savings_pct}% of current errors now handled by dictionary (free)"

    return stats
def _save_auto_approved(self, patterns: List[Dict]) -> None:
    """
    Save auto-approved patterns for transparency.

    Atomic read-modify-write under the auto-approved lock, so concurrent
    auto-approvals cannot lose each other's entries.

    Args:
        patterns: List of pattern dictionaries to append
    """
    with self._file_lock(self.auto_approved_lock, "save auto-approved"):
        # Load existing entries, if any.
        existing = []
        if self.auto_approved_file.exists():
            with open(self.auto_approved_file, 'r', encoding='utf-8') as f:
                content = f.read().strip()
            if content:
                # BUG FIX: the previous code called
                # json.load(json.loads(content) if isinstance(content, str) else f),
                # passing an already-parsed object back into json.load().
                # Parse the string exactly once with json.loads.
                data = json.loads(content)
                existing = data.get("auto_approved", [])

        # Append the new patterns.
        all_patterns = existing + patterns

        # Write back while still holding the lock.
        self.auto_approved_file.parent.mkdir(parents=True, exist_ok=True)
        data = {"auto_approved": all_patterns}
        with open(self.auto_approved_file, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=2)

        logger.info(f"Saved {len(patterns)} auto-approved patterns (total: {len(all_patterns)})")