Release v1.8.0: Add transcript-fixer skill

## New Skill: transcript-fixer v1.0.0

Correct speech-to-text (ASR/STT) transcription errors through dictionary-based rules and AI-powered corrections with automatic pattern learning.

**Features:**
- Two-stage correction pipeline (dictionary + AI)
- Automatic pattern detection and learning
- Domain-specific dictionaries (general, embodied_ai, finance, medical)
- SQLite-based correction repository
- Team collaboration with import/export
- GLM API integration for AI corrections
- Cost optimization through dictionary promotion

**Use cases:**
- Correcting meeting notes, lecture recordings, or interview transcripts
- Fixing Chinese/English homophone errors and technical terminology
- Building domain-specific correction dictionaries
- Improving transcript accuracy through iterative learning

**Documentation:**
- Complete workflow guides in references/
- SQL query templates
- Troubleshooting guide
- Team collaboration patterns
- API setup instructions

**Marketplace updates:**
- Updated marketplace to v1.8.0
- Added transcript-fixer plugin (category: productivity)
- Updated README.md with skill description and use cases
- Updated CLAUDE.md with skill listing and counts

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-10-28 13:16:37 +08:00
parent d1041ac203
commit bd0aa12004
44 changed files with 7432 additions and 8 deletions
--- a/transcript-fixer/scripts/core/learning_engine.py
+++ b/transcript-fixer/scripts/core/learning_engine.py
@@ -0,0 +1,252 @@
+#!/usr/bin/env python3
+"""
+Learning Engine - Pattern Detection from Correction History
+
+SINGLE RESPONSIBILITY: Analyze history and suggest new corrections
+
+Features:
+- Analyze correction history for patterns
+- Detect frequently occurring corrections
+- Calculate confidence scores
+- Generate suggestions for user review
+- Track rejected suggestions to avoid re-suggesting
+"""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+from typing import List, Dict
+from dataclasses import dataclass, asdict
+from collections import defaultdict
+
+
+@dataclass
+class Suggestion:
+    """Represents a learned correction suggestion"""
+    # Text before correction (the "from" value of a stage2 change)
+    from_text: str
+    # Text after correction (the "to" value of a stage2 change)
+    to_text: str
+    # Number of times this from→to pair was found in the history files
+    frequency: int
+    # Score returned by LearningEngine._calculate_confidence
+    confidence: float
+    # Up to 5 occurrence dicts; LearningEngine stores the keys
+    # file, line, context and timestamp in each one
+    examples: List[Dict]  # List of {file, line, context}
+    # Timestamp copied from a history file, marking when the pattern was
+    # first seen
+    first_seen: str
+    # Timestamp copied from a history file, marking when the pattern was
+    # last seen
+    last_seen: str
+    # LearningEngine only ever creates suggestions with status "pending"
+    status: str  # "pending", "approved", "rejected"
+
+
+class LearningEngine:
+    """
+    Analyzes correction history to suggest new corrections
+
+    Algorithm:
+    1. Load all history files
+    2. Extract stage2 (AI) changes
+    3. Group by pattern (from_text → to_text)
+    4. Calculate frequency and confidence
+    5. Filter by thresholds
+    6. Save suggestions for user review
+
+    Storage (both files live in learned_dir):
+    - pending_review.json: {"suggestions": [...]}, one entry per pattern
+    - rejected.json: {"rejected": [{"from": ..., "to": ...}, ...]}
+    """
+
+    # Thresholds for suggesting corrections
+    MIN_FREQUENCY = 3  # Must appear at least 3 times
+    # Must have 80%+ confidence. With the weights in _calculate_confidence a
+    # pattern seen n times (2 <= n <= 10) scores 0.48 + 0.05 * n, so this
+    # threshold is first reached at n = 7 and is stricter than MIN_FREQUENCY.
+    MIN_CONFIDENCE = 0.8
+
+    def __init__(self, history_dir: Path, learned_dir: Path):
+        """
+        Initialize learning engine
+
+        Args:
+            history_dir: Directory containing correction history
+            learned_dir: Directory for learned suggestions (created on the
+                first write if it does not exist)
+        """
+        self.history_dir = history_dir
+        self.learned_dir = learned_dir
+        self.pending_file = learned_dir / "pending_review.json"
+        self.rejected_file = learned_dir / "rejected.json"
+
+    def analyze_and_suggest(self) -> List[Suggestion]:
+        """
+        Analyze history and generate suggestions
+
+        Patterns that were rejected earlier, or that fall below
+        MIN_FREQUENCY / MIN_CONFIDENCE, are skipped. Qualifying suggestions
+        are merged into the pending file (one entry per pattern).
+
+        Returns:
+            List of suggestions that currently qualify for user review
+        """
+        patterns = self._extract_patterns()
+        rejected = self._load_rejected()
+
+        suggestions = []
+        for (from_text, to_text), occurrences in patterns.items():
+            # Never re-suggest something the user already rejected
+            if (from_text, to_text) in rejected:
+                continue
+
+            frequency = len(occurrences)
+            if frequency < self.MIN_FREQUENCY:
+                continue
+
+            confidence = self._calculate_confidence(occurrences)
+            if confidence < self.MIN_CONFIDENCE:
+                continue
+
+            # The order in which history files are read is not guaranteed to
+            # be chronological, so take the extremes instead of the first and
+            # last list elements.
+            # Assumes timestamps compare chronologically (e.g. ISO 8601
+            # strings) — TODO confirm against the history writer
+            timestamps = [entry["timestamp"] for entry in occurrences]
+
+            suggestions.append(Suggestion(
+                from_text=from_text,
+                to_text=to_text,
+                frequency=frequency,
+                confidence=confidence,
+                examples=occurrences[:5],  # At most 5 examples
+                first_seen=min(timestamps),
+                last_seen=max(timestamps),
+                status="pending"
+            ))
+
+        # Save new suggestions
+        if suggestions:
+            self._save_pending_suggestions(suggestions)
+
+        return suggestions
+
+    def approve_suggestion(self, from_text: str,
+                           to_text: str | None = None) -> bool:
+        """
+        Approve a suggestion (remove from pending)
+
+        Args:
+            from_text: Original text of the suggestion to approve
+            to_text: Corrected text of the suggestion. When omitted, the
+                first pending suggestion for from_text is approved.
+
+        Returns:
+            True if approved, False if not found
+        """
+        pending = self._load_pending_suggestions()
+
+        match = next(
+            (s for s in pending
+             if s["from_text"] == from_text
+             and (to_text is None or s.get("to_text") == to_text)),
+            None
+        )
+        if match is None:
+            return False
+
+        # Drop every entry for the approved pair so that duplicates left in
+        # an older pending file do not linger after approval
+        approved = (match["from_text"], match.get("to_text"))
+        remaining = [s for s in pending
+                     if (s["from_text"], s.get("to_text")) != approved]
+        self._save_suggestions(remaining, self.pending_file)
+        return True
+
+    def reject_suggestion(self, from_text: str, to_text: str) -> None:
+        """
+        Reject a suggestion (move to rejected list)
+
+        The pair is removed from the pending file and added to the rejected
+        file, which analyze_and_suggest consults to avoid re-suggesting it.
+
+        Args:
+            from_text: Original text of the suggestion to reject
+            to_text: Corrected text of the suggestion to reject
+        """
+        # Remove from pending
+        pending = self._load_pending_suggestions()
+        pending = [s for s in pending
+                  if not (s["from_text"] == from_text and s["to_text"] == to_text)]
+        self._save_suggestions(pending, self.pending_file)
+
+        # Add to rejected
+        rejected = self._load_rejected()
+        rejected.add((from_text, to_text))
+        self._save_rejected(rejected)
+
+    def list_pending(self) -> List[Dict]:
+        """List all pending suggestions"""
+        return self._load_pending_suggestions()
+
+    def _extract_patterns(self) -> Dict[tuple, List[Dict]]:
+        """
+        Extract all correction patterns from history
+
+        Reads every *.json file in history_dir and collects the entries
+        under stages → stage2 → changes. Files without that section are
+        ignored.
+
+        Returns:
+            Mapping of (from, to) → list of occurrence dicts with the keys
+            file, line, context and timestamp
+
+        Raises:
+            json.JSONDecodeError: If a history file is not valid JSON
+            KeyError: If a file with stage2 changes lacks "filename" or
+                "timestamp", or a change lacks "from" or "to"
+        """
+        patterns = defaultdict(list)
+
+        if not self.history_dir.exists():
+            return patterns
+
+        # Sorted so that the order of occurrences (and therefore the saved
+        # examples) does not depend on filesystem enumeration order
+        for history_file in sorted(self.history_dir.glob("*.json")):
+            with open(history_file, 'r', encoding='utf-8') as f:
+                data = json.load(f)
+
+            # Extract stage2 changes (AI corrections)
+            if "stages" in data and "stage2" in data["stages"]:
+                changes = data["stages"]["stage2"].get("changes", [])
+
+                for change in changes:
+                    key = (change["from"], change["to"])
+                    patterns[key].append({
+                        "file": data["filename"],
+                        "line": change.get("line", 0),
+                        "context": change.get("context", ""),
+                        "timestamp": data["timestamp"]
+                    })
+
+        return patterns
+
+    def _calculate_confidence(self, occurrences: List[Dict]) -> float:
+        """
+        Calculate confidence score for a pattern
+
+        Factors:
+        - Frequency (more = higher, saturating at 10 occurrences)
+        - Consistency (constant 1.0: occurrences are grouped by from→to)
+        - Recency (placeholder: 0.9 for more than one occurrence, else 0.8)
+
+        Args:
+            occurrences: Occurrence dicts of one pattern; only their count
+                is used
+
+        Returns:
+            Weighted average 0.5 * frequency + 0.3 * consistency +
+            0.2 * recency
+        """
+        # Base confidence from frequency
+        frequency_score = min(len(occurrences) / 10.0, 1.0)
+
+        # Consistency: always the same from→to mapping
+        consistency_score = 1.0  # Already consistent by grouping
+
+        # Recency: more recent = higher
+        # (Simplified: timestamps are not inspected)
+        recency_score = 0.9 if len(occurrences) > 1 else 0.8
+
+        # Weighted average
+        confidence = (
+            0.5 * frequency_score +
+            0.3 * consistency_score +
+            0.2 * recency_score
+        )
+
+        return confidence
+
+    def _load_pending_suggestions(self) -> List[Dict]:
+        """
+        Load pending suggestions from file
+
+        Returns:
+            The "suggestions" list of the pending file, or an empty list if
+            the file is missing, empty, or has no "suggestions" key
+        """
+        if not self.pending_file.exists():
+            return []
+
+        with open(self.pending_file, 'r', encoding='utf-8') as f:
+            content = f.read().strip()
+            if not content:
+                return []
+            return json.loads(content).get("suggestions", [])
+
+    def _save_pending_suggestions(self, suggestions: List[Suggestion]) -> None:
+        """
+        Merge suggestions into the pending file
+
+        Entries are keyed by (from_text, to_text): a suggestion for a pair
+        that is already pending replaces the stored entry in place instead
+        of being appended again, so repeated analysis runs do not pile up
+        duplicates.
+        """
+        # dict keeps insertion order, so existing entries keep their position
+        merged = {}
+        for entry in self._load_pending_suggestions():
+            merged[(entry.get("from_text"), entry.get("to_text"))] = entry
+
+        for suggestion in suggestions:
+            key = (suggestion.from_text, suggestion.to_text)
+            merged[key] = asdict(suggestion)
+
+        self._save_suggestions(list(merged.values()), self.pending_file)
+
+    def _save_suggestions(self, suggestions: List[Dict], filepath: Path) -> None:
+        """
+        Save suggestions to file
+
+        Args:
+            suggestions: Suggestion dicts to store under "suggestions"
+            filepath: Target file; missing parent directories are created
+        """
+        data = {"suggestions": suggestions}
+        # The learned directory may not exist yet on first use
+        filepath.parent.mkdir(parents=True, exist_ok=True)
+        with open(filepath, 'w', encoding='utf-8') as f:
+            json.dump(data, f, ensure_ascii=False, indent=2)
+
+    def _load_rejected(self) -> set:
+        """
+        Load rejected patterns
+
+        Returns:
+            Set of (from, to) tuples; empty if the rejected file is missing
+            or empty
+        """
+        if not self.rejected_file.exists():
+            return set()
+
+        with open(self.rejected_file, 'r', encoding='utf-8') as f:
+            content = f.read().strip()
+            if not content:
+                return set()
+            data = json.loads(content)
+            return {(r["from"], r["to"]) for r in data.get("rejected", [])}
+
+    def _save_rejected(self, rejected: set) -> None:
+        """
+        Save rejected patterns
+
+        Args:
+            rejected: Set of (from, to) tuples to persist
+        """
+        data = {
+            "rejected": [
+                {"from": from_text, "to": to_text}
+                # Sorted so the file content is stable between runs
+                for from_text, to_text in sorted(rejected)
+            ]
+        }
+        # The learned directory may not exist yet on first use
+        self.rejected_file.parent.mkdir(parents=True, exist_ok=True)
+        with open(self.rejected_file, 'w', encoding='utf-8') as f:
+            json.dump(data, f, ensure_ascii=False, indent=2)