Release v1.8.0: Add transcript-fixer skill
## New Skill: transcript-fixer v1.0.0

Correct speech-to-text (ASR/STT) transcription errors through dictionary-based rules and AI-powered corrections with automatic pattern learning.

**Features:**
- Two-stage correction pipeline (dictionary + AI)
- Automatic pattern detection and learning
- Domain-specific dictionaries (general, embodied_ai, finance, medical)
- SQLite-based correction repository
- Team collaboration with import/export
- GLM API integration for AI corrections
- Cost optimization through dictionary promotion

**Use cases:**
- Correcting meeting notes, lecture recordings, or interview transcripts
- Fixing Chinese/English homophone errors and technical terminology
- Building domain-specific correction dictionaries
- Improving transcript accuracy through iterative learning

**Documentation:**
- Complete workflow guides in references/
- SQL query templates
- Troubleshooting guide
- Team collaboration patterns
- API setup instructions

**Marketplace updates:**
- Updated marketplace to v1.8.0
- Added transcript-fixer plugin (category: productivity)
- Updated README.md with skill description and use cases
- Updated CLAUDE.md with skill listing and counts

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
252
transcript-fixer/scripts/core/learning_engine.py
Normal file
252
transcript-fixer/scripts/core/learning_engine.py
Normal file
@@ -0,0 +1,252 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Learning Engine - Pattern Detection from Correction History
|
||||
|
||||
SINGLE RESPONSIBILITY: Analyze history and suggest new corrections
|
||||
|
||||
Features:
|
||||
- Analyze correction history for patterns
|
||||
- Detect frequently occurring corrections
|
||||
- Calculate confidence scores
|
||||
- Generate suggestions for user review
|
||||
- Track rejected suggestions to avoid re-suggesting
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
from typing import List, Dict
|
||||
from dataclasses import dataclass, asdict
|
||||
from collections import defaultdict
|
||||
|
||||
|
||||
@dataclass
class Suggestion:
    """Represents a learned correction suggestion.

    Field order is part of the interface (positional construction and
    ``asdict`` output order), so it is kept exactly as originally declared.
    """

    # Text as it appeared before the correction.
    from_text: str
    # Text the correction replaced it with.
    to_text: str
    # Number of recorded occurrences of this from -> to pattern.
    frequency: int
    # Confidence score assigned to the pattern.
    confidence: float
    # Sample occurrences: list of {file, line, context}.
    examples: List[Dict]
    # Timestamp of the first recorded occurrence.
    first_seen: str
    # Timestamp of the last recorded occurrence.
    last_seen: str
    # One of "pending", "approved", "rejected".
    status: str
|
||||
|
||||
|
||||
class LearningEngine:
    """
    Analyzes correction history to suggest new corrections

    Algorithm:
    1. Load all history files
    2. Extract stage2 (AI) changes
    3. Group by pattern (from_text → to_text)
    4. Calculate frequency and confidence
    5. Filter by thresholds
    6. Save suggestions for user review
    """

    # Thresholds for suggesting corrections
    MIN_FREQUENCY = 3  # Must appear at least 3 times
    # Must have 80%+ confidence.
    # NOTE: with the weights used in _calculate_confidence, a pattern needs
    # at least 7 occurrences to reach 0.8 (6 -> 0.78, 7 -> 0.83), so in
    # practice this threshold is stricter than MIN_FREQUENCY.
    MIN_CONFIDENCE = 0.8

    def __init__(self, history_dir: Path, learned_dir: Path):
        """
        Initialize learning engine

        Args:
            history_dir: Directory containing correction history
            learned_dir: Directory for learned suggestions
        """
        # Coerce so plain string paths work with the `/` operator below.
        self.history_dir = Path(history_dir)
        self.learned_dir = Path(learned_dir)
        self.pending_file = self.learned_dir / "pending_review.json"
        self.rejected_file = self.learned_dir / "rejected.json"

    def analyze_and_suggest(self) -> List["Suggestion"]:
        """
        Analyze history and generate suggestions

        Patterns that were previously rejected, that occur fewer than
        MIN_FREQUENCY times, or that score below MIN_CONFIDENCE are skipped.
        Qualifying suggestions are merged into the pending-review file.

        Returns:
            List of suggestions for user review
        """
        patterns = self._extract_patterns()
        rejected = self._load_rejected()

        suggestions = []
        for (from_text, to_text), occurrences in patterns.items():
            if (from_text, to_text) in rejected:
                continue

            frequency = len(occurrences)
            if frequency < self.MIN_FREQUENCY:
                continue

            confidence = self._calculate_confidence(occurrences)
            if confidence < self.MIN_CONFIDENCE:
                continue

            # Occurrences arrive ordered by timestamp (see _extract_patterns),
            # so the first/last entries bound the observation window.
            suggestions.append(
                Suggestion(
                    from_text=from_text,
                    to_text=to_text,
                    frequency=frequency,
                    confidence=confidence,
                    examples=occurrences[:5],  # First 5 examples
                    first_seen=occurrences[0]["timestamp"],
                    last_seen=occurrences[-1]["timestamp"],
                    status="pending",
                )
            )

        # Only touch the pending file when there is something to record
        if suggestions:
            self._save_pending_suggestions(suggestions)

        return suggestions

    def approve_suggestion(self, from_text: str, to_text: "str | None" = None) -> bool:
        """
        Approve a suggestion (remove from pending)

        Args:
            from_text: Source text of the suggestion to approve
            to_text: Optional replacement text. When given, only a pending
                suggestion with this exact replacement is approved; when
                omitted, the first pending suggestion matching from_text is
                approved.

        Returns:
            True if approved, False if not found
        """
        pending = self._load_pending_suggestions()

        for index, suggestion in enumerate(pending):
            if suggestion["from_text"] != from_text:
                continue
            if to_text is not None and suggestion["to_text"] != to_text:
                continue

            del pending[index]
            self._save_suggestions(pending, self.pending_file)
            return True

        return False

    def reject_suggestion(self, from_text: str, to_text: str) -> None:
        """
        Reject a suggestion (move to rejected list)

        Args:
            from_text: Source text of the rejected pattern
            to_text: Replacement text of the rejected pattern
        """
        # Remove from pending
        remaining = [
            s for s in self._load_pending_suggestions()
            if s["from_text"] != from_text or s["to_text"] != to_text
        ]
        self._save_suggestions(remaining, self.pending_file)

        # Add to rejected
        rejected = self._load_rejected()
        rejected.add((from_text, to_text))
        self._save_rejected(rejected)

    def list_pending(self) -> List[Dict]:
        """List all pending suggestions"""
        return self._load_pending_suggestions()

    def _extract_patterns(self) -> Dict[tuple, List[Dict]]:
        """
        Extract all correction patterns from history

        Reads every ``*.json`` file in the history directory and groups the
        stage2 changes by ``(from, to)``.

        Returns:
            Mapping of (from, to) to a list of occurrence dicts with keys
            "file", "line", "context" and "timestamp", ordered by timestamp
            where the timestamps are mutually comparable.
        """
        patterns = defaultdict(list)

        if not self.history_dir.exists():
            return patterns

        # glob() order is arbitrary; sort for a deterministic traversal.
        for history_file in sorted(self.history_dir.glob("*.json")):
            with open(history_file, 'r', encoding='utf-8') as f:
                data = json.load(f)

            # Extract stage2 changes (AI corrections)
            if "stages" not in data or "stage2" not in data["stages"]:
                continue

            for change in data["stages"]["stage2"].get("changes", []):
                key = (change["from"], change["to"])
                patterns[key].append({
                    "file": data["filename"],
                    "line": change.get("line", 0),
                    "context": change.get("context", ""),
                    "timestamp": data["timestamp"],
                })

        # Callers read the first/last occurrence as first_seen/last_seen, so
        # order each group by timestamp.
        # NOTE(review): assumes timestamps sort chronologically (e.g. ISO-8601
        # strings) — confirm against the history writer.
        for key, occurrences in patterns.items():
            try:
                patterns[key] = sorted(occurrences, key=lambda occ: occ["timestamp"])
            except TypeError:
                # Unorderable/mixed timestamp types: keep file order.
                pass

        return patterns

    def _calculate_confidence(self, occurrences: List[Dict]) -> float:
        """
        Calculate confidence score for a pattern

        Factors:
        - Frequency (more = higher), saturating at 10 occurrences
        - Consistency (fixed at 1.0: grouping already guarantees one mapping)
        - Recency (simplified: 0.9 for more than one occurrence, else 0.8)

        Returns:
            Weighted average: 0.5 * frequency + 0.3 * consistency + 0.2 * recency
        """
        count = len(occurrences)

        # Base confidence from frequency
        frequency_score = min(count / 10.0, 1.0)

        # Consistency: always the same from→to mapping
        consistency_score = 1.0  # Already consistent by grouping

        # Recency: placeholder heuristic, does not inspect timestamps
        recency_score = 0.9 if count > 1 else 0.8

        # Weighted average
        return (
            0.5 * frequency_score +
            0.3 * consistency_score +
            0.2 * recency_score
        )

    def _load_pending_suggestions(self) -> List[Dict]:
        """Load pending suggestions from file (empty list if missing or blank)"""
        if not self.pending_file.exists():
            return []

        with open(self.pending_file, 'r', encoding='utf-8') as f:
            content = f.read().strip()

        if not content:
            return []
        return json.loads(content).get("suggestions", [])

    def _save_pending_suggestions(self, suggestions: List["Suggestion"]) -> None:
        """
        Merge suggestions into the pending file

        Entries are keyed by (from_text, to_text): a suggestion for a pattern
        that is already pending replaces the stored entry instead of being
        appended again, so repeated analysis runs do not create duplicates.
        """
        merged = {}
        for entry in self._load_pending_suggestions():
            merged[(entry.get("from_text"), entry.get("to_text"))] = entry

        for suggestion in suggestions:
            merged[(suggestion.from_text, suggestion.to_text)] = asdict(suggestion)

        self._save_suggestions(list(merged.values()), self.pending_file)

    def _save_suggestions(self, suggestions: List[Dict], filepath: Path) -> None:
        """Save suggestions to file, creating the parent directory if needed"""
        filepath = Path(filepath)
        filepath.parent.mkdir(parents=True, exist_ok=True)

        data = {"suggestions": suggestions}
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=2)

    def _load_rejected(self) -> set:
        """Load rejected patterns as a set of (from, to) tuples"""
        if not self.rejected_file.exists():
            return set()

        with open(self.rejected_file, 'r', encoding='utf-8') as f:
            content = f.read().strip()

        if not content:
            return set()

        data = json.loads(content)
        return {(r["from"], r["to"]) for r in data.get("rejected", [])}

    def _save_rejected(self, rejected: set) -> None:
        """Save rejected patterns, creating the parent directory if needed"""
        # Sets iterate in arbitrary order; sort so the file is stable across runs.
        ordered = sorted(rejected, key=lambda pair: (str(pair[0]), str(pair[1])))
        data = {
            "rejected": [
                {"from": from_text, "to": to_text}
                for from_text, to_text in ordered
            ]
        }

        self.rejected_file.parent.mkdir(parents=True, exist_ok=True)
        with open(self.rejected_file, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
|
||||
Reference in New Issue
Block a user