Files
claude-code-skills-reference/transcript-fixer/scripts/core/dictionary_processor.py
daymade bd0aa12004 Release v1.8.0: Add transcript-fixer skill
## New Skill: transcript-fixer v1.0.0

Correct speech-to-text (ASR/STT) transcription errors through dictionary-based rules and AI-powered corrections with automatic pattern learning.

**Features:**
- Two-stage correction pipeline (dictionary + AI)
- Automatic pattern detection and learning
- Domain-specific dictionaries (general, embodied_ai, finance, medical)
- SQLite-based correction repository
- Team collaboration with import/export
- GLM API integration for AI corrections
- Cost optimization through dictionary promotion

**Use cases:**
- Correcting meeting notes, lecture recordings, or interview transcripts
- Fixing Chinese/English homophone errors and technical terminology
- Building domain-specific correction dictionaries
- Improving transcript accuracy through iterative learning

**Documentation:**
- Complete workflow guides in references/
- SQL query templates
- Troubleshooting guide
- Team collaboration patterns
- API setup instructions

**Marketplace updates:**
- Updated marketplace to v1.8.0
- Added transcript-fixer plugin (category: productivity)
- Updated README.md with skill description and use cases
- Updated CLAUDE.md with skill listing and counts

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-10-28 13:16:37 +08:00

141 lines
4.3 KiB
Python

#!/usr/bin/env python3
"""
Dictionary Processor - Stage 1: Dictionary-based Text Corrections
SINGLE RESPONSIBILITY: Apply dictionary and regex-based corrections to text
Features:
- Apply simple dictionary replacements
- Apply context-aware regex rules
- Track all changes for history
- Case-sensitive matching (regex rules can opt into case-insensitivity with an inline flag such as (?i))
"""
from __future__ import annotations
import re
from typing import Dict, List, Tuple
from dataclasses import dataclass
@dataclass
class Change:
    """Record of one replacement made while correcting a transcript."""

    # 1-based line on which the replaced text was found
    line_number: int
    # Text exactly as it appeared before the correction
    from_text: str
    # Text recorded as the correction for that occurrence
    to_text: str
    # Which kind of rule fired: "dictionary" or "context_rule"
    rule_type: str
    # Label identifying the rule that produced this change
    rule_name: str
class DictionaryProcessor:
    """
    Stage 1 Processor: Apply dictionary-based corrections

    Process:
    1. Apply context-aware regex rules first (more specific)
    2. Apply simple dictionary replacements (more general)
    3. Track all changes for learning
    """

    def __init__(self, corrections: Dict[str, str], context_rules: List[Dict]):
        """
        Initialize processor with corrections and rules

        Args:
            corrections: Dictionary of {wrong: correct} pairs
            context_rules: List of context-aware regex rules; each rule is a
                dict with "pattern" and "replacement" keys and an optional
                "description" key
        """
        self.corrections = corrections
        self.context_rules = context_rules

    def process(self, text: str) -> Tuple[str, List[Change]]:
        """
        Apply all corrections to text

        Context rules run first, then dictionary replacements run on the
        output of the context rules.

        Returns:
            (corrected_text, list_of_changes)
        """
        # Step 1: Apply context rules (more specific, higher priority)
        corrected_text, context_changes = self._apply_context_rules(text)
        # Step 2: Apply dictionary replacements (more general)
        corrected_text, dict_changes = self._apply_dictionary(corrected_text)
        return corrected_text, context_changes + dict_changes

    def _apply_context_rules(self, text: str) -> Tuple[str, List[Change]]:
        """
        Apply context-aware regex rules

        Rules are applied in list order; each rule sees the text produced by
        the rules before it. One Change is recorded per regex match.

        Returns:
            (corrected_text, list_of_changes)

        Raises:
            KeyError: if a rule lacks "pattern" or "replacement"
        """
        changes: List[Change] = []
        corrected = text
        for rule in self.context_rules:
            pattern = rule["pattern"]
            replacement = rule["replacement"]
            rule_name = rule.get("description", "") or pattern

            # Matches arrive in increasing position, so newlines are counted
            # incrementally instead of rescanning the prefix for every match.
            line_num = 1
            scanned = 0
            for match in re.finditer(pattern, corrected):
                line_num += corrected.count("\n", scanned, match.start())
                scanned = match.start()
                # Record the text that re.sub will actually write: expand
                # backreferences such as \1 rather than storing the raw
                # template.
                if callable(replacement):
                    to_text = replacement(match)
                else:
                    to_text = match.expand(replacement)
                changes.append(Change(
                    line_number=line_num,
                    from_text=match.group(0),
                    to_text=to_text,
                    rule_type="context_rule",
                    rule_name=rule_name,
                ))

            # Apply replacement
            corrected = re.sub(pattern, replacement, corrected)
        return corrected, changes

    def _apply_dictionary(self, text: str) -> Tuple[str, List[Change]]:
        """
        Apply simple dictionary replacements

        Entries are applied in dictionary order; each entry sees the text
        produced by the entries before it. One Change is recorded per
        non-overlapping occurrence, matching what str.replace substitutes.

        Returns:
            (corrected_text, list_of_changes)
        """
        changes: List[Change] = []
        corrected = text
        for wrong, correct in self.corrections.items():
            # An empty key "matches" at every position without ever advancing
            # the search, which would loop forever; it cannot be a real
            # correction, so skip it.
            if not wrong or wrong not in corrected:
                continue

            # Walk the non-overlapping occurrences left to right, counting
            # newlines incrementally to get each occurrence's line number.
            line_num = 1
            scanned = 0
            pos = corrected.find(wrong)
            while pos != -1:
                line_num += corrected.count("\n", scanned, pos)
                scanned = pos
                changes.append(Change(
                    line_number=line_num,
                    from_text=wrong,
                    to_text=correct,
                    rule_type="dictionary",
                    rule_name="corrections_dict",
                ))
                pos = corrected.find(wrong, pos + len(wrong))

            # Apply replacement
            corrected = corrected.replace(wrong, correct)
        return corrected, changes

    def get_summary(self, changes: List[Change]) -> Dict[str, int]:
        """
        Generate summary statistics

        Returns:
            Dict with "total_changes", "dictionary_changes" and
            "context_rule_changes" counts
        """
        return {
            "total_changes": len(changes),
            "dictionary_changes": sum(
                1 for c in changes if c.rule_type == "dictionary"
            ),
            "context_rule_changes": sum(
                1 for c in changes if c.rule_type == "context_rule"
            ),
        }