Release v1.8.0: Add transcript-fixer skill

## New Skill: transcript-fixer v1.0.0

Correct speech-to-text (ASR/STT) transcription errors through dictionary-based rules and AI-powered corrections with automatic pattern learning.

**Features:**
- Two-stage correction pipeline (dictionary + AI)
- Automatic pattern detection and learning
- Domain-specific dictionaries (general, embodied_ai, finance, medical)
- SQLite-based correction repository
- Team collaboration with import/export
- GLM API integration for AI corrections
- Cost optimization through dictionary promotion

**Use cases:**
- Correcting meeting notes, lecture recordings, or interview transcripts
- Fixing Chinese/English homophone errors and technical terminology
- Building domain-specific correction dictionaries
- Improving transcript accuracy through iterative learning

**Documentation:**
- Complete workflow guides in references/
- SQL query templates
- Troubleshooting guide
- Team collaboration patterns
- API setup instructions

**Marketplace updates:**
- Updated marketplace to v1.8.0
- Added transcript-fixer plugin (category: productivity)
- Updated README.md with skill description and use cases
- Updated CLAUDE.md with skill listing and counts

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
daymade
2025-10-28 13:16:37 +08:00
parent d1041ac203
commit bd0aa12004
44 changed files with 7432 additions and 8 deletions

View File

@@ -0,0 +1,524 @@
#!/usr/bin/env python3
"""
Correction Service - Business Logic Layer
SINGLE RESPONSIBILITY: Implement business rules and validation
Orchestrates repository operations with comprehensive validation,
error handling, and business logic enforcement.
"""
from __future__ import annotations

import logging
import os
import re
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

from .correction_repository import (
    CorrectionRepository,
    DatabaseError,
    ValidationError,
)
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
@dataclass
class ValidationRules:
    """Tunable limits consulted by the correction validators."""

    # Character-count bounds for correction text.
    max_text_length: int = 1000
    min_text_length: int = 1

    # Longest domain name accepted.
    max_domain_length: int = 50

    # Regex a domain name must satisfy: ASCII letters, digits, '_' and '-'.
    allowed_domain_pattern: str = r'^[a-zA-Z0-9_-]+$'

    # Bounds for a correction's confidence score.
    max_confidence: float = 1.0
    min_confidence: float = 0.0
class CorrectionService:
    """
    Service layer for correction management.

    Responsibilities:
    - Input validation and sanitization
    - Business rule enforcement
    - Conflict detection and resolution
    - Statistics and reporting
    - Integration with repository layer
    """

    # Accepted values for a correction's ``source`` field. Kept as a list so
    # the validation error message renders exactly as before.
    _VALID_SOURCES = ['manual', 'learned', 'imported']

    # Windows reserved device names (compared case-insensitively). Domain
    # names are validated against path traversal, so names the OS treats
    # specially are refused as well.
    _RESERVED_DOMAINS = frozenset(
        ['con', 'prn', 'aux', 'nul']
        + [f'com{i}' for i in range(1, 10)]
        + [f'lpt{i}' for i in range(1, 10)]
    )

    def __init__(self, repository: CorrectionRepository, rules: Optional[ValidationRules] = None):
        """
        Initialize service with repository.

        Args:
            repository: Data access layer
            rules: Validation rules (uses defaults if None)
        """
        self.repository = repository
        self.rules = rules or ValidationRules()
        self.db_path = repository.db_path
        logger.info("CorrectionService initialized")

    def initialize(self) -> None:
        """
        Initialize database (already done by repository, kept for API compatibility).
        """
        # Nothing to do here beyond reporting the path: the original author
        # notes the repository initializes the database on first access.
        logger.info(f"✅ Database ready: {self.db_path}")

    # ==================== Validation Methods ====================

    def validate_correction_text(self, text: str, field_name: str = "text") -> None:
        """
        Validate correction text with comprehensive checks.

        Args:
            text: Text to validate
            field_name: Field name for error messages

        Raises:
            ValidationError: If the value is missing, not a string,
                whitespace-only, outside the configured length bounds, or
                contains NULL bytes / control characters other than
                newline and tab.
        """
        # Check not None or empty
        if not text:
            raise ValidationError(f"{field_name} cannot be None or empty")

        # Non-string input (bytes, numbers, ...) must surface as a validation
        # failure rather than an AttributeError/TypeError further down.
        if not isinstance(text, str):
            raise ValidationError(
                f"{field_name} must be a string, got {type(text).__name__}"
            )

        # Check not only whitespace
        if not text.strip():
            raise ValidationError(f"{field_name} cannot be only whitespace")

        # Check length constraints
        length = len(text)
        if length < self.rules.min_text_length:
            raise ValidationError(
                f"{field_name} too short: {length} chars (min: {self.rules.min_text_length})"
            )
        if length > self.rules.max_text_length:
            raise ValidationError(
                f"{field_name} too long: {length} chars (max: {self.rules.max_text_length})"
            )

        # NULL bytes get a dedicated message. This must run before the
        # generic control-character scan, which would otherwise catch '\x00'
        # first and leave this branch unreachable.
        if '\x00' in text:
            raise ValidationError(f"{field_name} contains NULL bytes")

        # Check for control characters (except newline and tab)
        invalid_chars = [c for c in text if ord(c) < 32 and c not in '\n\t']
        if invalid_chars:
            raise ValidationError(
                f"{field_name} contains invalid control characters: {invalid_chars}"
            )

    def validate_domain_name(self, domain: str) -> None:
        """
        Validate domain name to prevent path traversal and injection.

        Args:
            domain: Domain name to validate

        Raises:
            ValidationError: If the name is empty, not a string, too long,
                does not match the allowed pattern, contains path
                separators / '..', or is a reserved device name.
        """
        if not domain:
            raise ValidationError("Domain name cannot be empty")

        if not isinstance(domain, str):
            raise ValidationError(
                f"Domain name must be a string, got {type(domain).__name__}"
            )

        if len(domain) > self.rules.max_domain_length:
            raise ValidationError(
                f"Domain name too long: {len(domain)} chars (max: {self.rules.max_domain_length})"
            )

        # fullmatch, not match: with re.match a '$' anchor also matches just
        # before a trailing newline, so "general\n" would slip through.
        if not re.fullmatch(self.rules.allowed_domain_pattern, domain):
            raise ValidationError(
                f"Domain name contains invalid characters: {domain}. "
                f"Allowed pattern: {self.rules.allowed_domain_pattern}"
            )

        # Defense in depth: the default pattern already excludes these, but a
        # custom ValidationRules pattern might not.
        if '..' in domain or '/' in domain or '\\' in domain:
            raise ValidationError(f"Domain name contains path traversal: {domain}")

        # Reserved names
        if domain.lower() in self._RESERVED_DOMAINS:
            raise ValidationError(f"Domain name is reserved: {domain}")

    def validate_confidence(self, confidence: float) -> None:
        """
        Validate confidence score.

        Args:
            confidence: Score to check; must be an int or float within the
                configured [min_confidence, max_confidence] range.

        Raises:
            ValidationError: If the value is not numeric or out of range
                (NaN fails the range comparison and is rejected).
        """
        # NOTE(review): bool is a subclass of int, so True/False pass this
        # type check - confirm whether that is intended.
        if not isinstance(confidence, (int, float)):
            raise ValidationError(f"Confidence must be numeric, got {type(confidence)}")

        if not (self.rules.min_confidence <= confidence <= self.rules.max_confidence):
            raise ValidationError(
                f"Confidence must be between {self.rules.min_confidence} "
                f"and {self.rules.max_confidence}, got {confidence}"
            )

    def validate_source(self, source: str) -> None:
        """
        Validate correction source.

        Args:
            source: One of 'manual', 'learned', 'imported'.

        Raises:
            ValidationError: If ``source`` is not one of the accepted values.
        """
        if source not in self._VALID_SOURCES:
            raise ValidationError(
                f"Invalid source: {source}. Must be one of: {self._VALID_SOURCES}"
            )

    # ==================== Correction Operations ====================

    def add_correction(
        self,
        from_text: str,
        to_text: str,
        domain: str = "general",
        source: str = "manual",
        confidence: float = 1.0,
        notes: Optional[str] = None
    ) -> int:
        """
        Add a correction with full validation.

        Args:
            from_text: Original (incorrect) text
            to_text: Corrected text
            domain: Correction domain
            source: Origin of correction
            confidence: Confidence score
            notes: Optional notes (passed to the repository unvalidated)

        Returns:
            ID of inserted correction

        Raises:
            ValidationError: If validation fails
            DatabaseError: Re-raised from the repository after being logged
        """
        # Comprehensive validation
        self.validate_correction_text(from_text, "from_text")
        self.validate_correction_text(to_text, "to_text")
        self.validate_domain_name(domain)
        self.validate_source(source)
        self.validate_confidence(confidence)

        # Business rule: from_text and to_text should be different
        # (compared after stripping surrounding whitespace).
        if from_text.strip() == to_text.strip():
            raise ValidationError(
                f"from_text and to_text are identical: '{from_text}'"
            )

        added_by = self._current_user()

        try:
            correction_id = self.repository.add_correction(
                from_text=from_text,
                to_text=to_text,
                domain=domain,
                source=source,
                confidence=confidence,
                added_by=added_by,
                notes=notes
            )
        except DatabaseError as e:
            logger.error(f"Failed to add correction: {e}")
            raise

        logger.info(
            f"Successfully added correction ID {correction_id}: "
            f"'{from_text}' -> '{to_text}' (domain: {domain})"
        )
        return correction_id

    def get_corrections(self, domain: Optional[str] = None) -> Dict[str, str]:
        """
        Get corrections as a dictionary for processing.

        Args:
            domain: Optional domain filter. A falsy value (None or "")
                selects active corrections from every domain.

        Returns:
            Dictionary of corrections {from_text: to_text}. When no domain is
            given and the same from_text appears more than once, the entry
            returned last by the repository wins.
        """
        if domain:
            self.validate_domain_name(domain)
            return self.repository.get_corrections_dict(domain)

        # Get all domains
        all_corrections = self.repository.get_all_corrections(active_only=True)
        return {c.from_text: c.to_text for c in all_corrections}

    def remove_correction(
        self,
        from_text: str,
        domain: str = "general"
    ) -> bool:
        """
        Remove a correction (described by the original author as a soft
        delete; the deletion itself is performed by the repository).

        Args:
            from_text: Text to remove
            domain: Domain

        Returns:
            True if removed, False if not found

        Raises:
            ValidationError: If ``from_text`` or ``domain`` is invalid
        """
        self.validate_correction_text(from_text, "from_text")
        self.validate_domain_name(domain)

        deleted_by = self._current_user()
        success = self.repository.delete_correction(from_text, domain, deleted_by)

        if success:
            logger.info(f"Removed correction: '{from_text}' (domain: {domain})")
        else:
            logger.warning(f"Correction not found: '{from_text}' (domain: {domain})")
        return success

    # ==================== Import/Export Operations ====================

    def import_corrections(
        self,
        corrections: Dict[str, str],
        domain: str = "general",
        merge: bool = True,
        validate_all: bool = True
    ) -> Tuple[int, int, int]:
        """
        Import corrections with validation and conflict resolution.

        Args:
            corrections: Dictionary of corrections to import
            domain: Target domain
            merge: If True, merge with existing; if False, replace
            validate_all: If True, validate all before import (safer but
                slower). If False, entries reach the repository without
                text validation.

        Returns:
            Tuple of (inserted_count, updated_count, skipped_count)

        Raises:
            ValidationError: If the domain is invalid, the dictionary is
                empty, or any entry fails pre-validation (when
                validate_all=True)
            DatabaseError: Re-raised from the repository after being logged
        """
        self.validate_domain_name(domain)

        if not corrections:
            raise ValidationError("Cannot import empty corrections dictionary")

        # Pre-validation (if requested). Every entry is checked so that all
        # problems are logged before the import is rejected as a whole.
        # NOTE(review): unlike add_correction, this does not reject entries
        # whose from_text equals to_text - confirm whether that is intended.
        if validate_all:
            logger.info(f"Pre-validating {len(corrections)} corrections...")
            invalid_count = 0
            for from_text, to_text in corrections.items():
                try:
                    self.validate_correction_text(from_text, "from_text")
                    self.validate_correction_text(to_text, "to_text")
                except ValidationError as e:
                    logger.error(f"Validation failed for '{from_text}' -> '{to_text}': {e}")
                    invalid_count += 1

            if invalid_count > 0:
                raise ValidationError(
                    f"Pre-validation failed: {invalid_count}/{len(corrections)} corrections invalid"
                )

        # Detect conflicts if merge mode (reported only; the import proceeds).
        if merge:
            existing = self.repository.get_corrections_dict(domain)
            conflicts = self._detect_conflicts(corrections, existing)
            if conflicts:
                logger.warning(
                    f"Found {len(conflicts)} conflicts that will be overwritten"
                )
                for from_text, (old_val, new_val) in conflicts.items():
                    logger.debug(f"Conflict: '{from_text}': '{old_val}' -> '{new_val}'")

        # Perform import
        imported_by = self._current_user()
        try:
            inserted, updated, skipped = self.repository.bulk_import_corrections(
                corrections=corrections,
                domain=domain,
                source="imported",
                imported_by=imported_by,
                merge=merge
            )
        except DatabaseError as e:
            logger.error(f"Import failed: {e}")
            raise

        logger.info(
            f"Import complete: {inserted} inserted, {updated} updated, "
            f"{skipped} skipped (domain: {domain})"
        )
        return (inserted, updated, skipped)

    def export_corrections(self, domain: str = "general") -> Dict[str, str]:
        """
        Export corrections for sharing.

        Args:
            domain: Domain to export

        Returns:
            Dictionary of corrections

        Raises:
            ValidationError: If ``domain`` is invalid
        """
        self.validate_domain_name(domain)
        corrections = self.repository.get_corrections_dict(domain)
        logger.info(f"Exported {len(corrections)} corrections (domain: {domain})")
        return corrections

    # ==================== Statistics and Reporting ====================

    def get_statistics(self, domain: Optional[str] = None) -> Dict[str, Any]:
        """
        Get correction statistics.

        Args:
            domain: Optional domain filter; a falsy value covers all domains

        Returns:
            Dictionary with keys 'total_corrections', 'by_source',
            'total_usage', 'average_usage', 'high_confidence_count' and
            'high_confidence_ratio'. The two ratios are 0 when there are no
            corrections.
        """
        if domain:
            self.validate_domain_name(domain)
            corrections = self.repository.get_all_corrections(domain=domain, active_only=True)
        else:
            corrections = self.repository.get_all_corrections(active_only=True)

        # Calculate statistics
        total = len(corrections)
        by_source = {'manual': 0, 'learned': 0, 'imported': 0}
        total_usage = 0
        high_confidence = 0

        for c in corrections:
            # .get() tolerates sources outside the three pre-seeded keys.
            by_source[c.source] = by_source.get(c.source, 0) + 1
            total_usage += c.usage_count
            if c.confidence >= 0.9:
                high_confidence += 1

        stats = {
            'total_corrections': total,
            'by_source': by_source,
            'total_usage': total_usage,
            'average_usage': total_usage / total if total > 0 else 0,
            'high_confidence_count': high_confidence,
            'high_confidence_ratio': high_confidence / total if total > 0 else 0
        }
        logger.debug(f"Statistics for domain '{domain}': {stats}")
        return stats

    # ==================== Helper Methods ====================

    @staticmethod
    def _current_user() -> str:
        """Return the account name from $USER or $USERNAME, else "unknown"."""
        return os.getenv("USER") or os.getenv("USERNAME") or "unknown"

    def _detect_conflicts(
        self,
        incoming: Dict[str, str],
        existing: Dict[str, str]
    ) -> Dict[str, Tuple[str, str]]:
        """
        Detect conflicts between incoming and existing corrections.

        A conflict is a from_text present in both mappings with different
        replacement values.

        Returns:
            Dictionary of conflicts {from_text: (existing_to, incoming_to)}
        """
        return {
            from_text: (existing[from_text], incoming[from_text])
            for from_text in incoming.keys() & existing.keys()
            if existing[from_text] != incoming[from_text]
        }

    def load_context_rules(self) -> List[Dict]:
        """
        Load active context-aware regex rules.

        Returns:
            List of rule dictionaries with pattern, replacement, description,
            ordered by descending priority. An empty list is returned (and
            the error logged) if loading fails for any reason.
        """
        try:
            # NOTE(review): the connection is obtained from the repository
            # and not closed here - presumably the repository owns its
            # lifetime; confirm.
            conn = self.repository._get_connection()
            cursor = conn.execute("""
                SELECT pattern, replacement, description
                FROM context_rules
                WHERE is_active = 1
                ORDER BY priority DESC
            """)
            rules = [
                {
                    "pattern": row[0],
                    "replacement": row[1],
                    "description": row[2]
                }
                for row in cursor.fetchall()
            ]
            logger.debug(f"Loaded {len(rules)} context rules")
            return rules
        except Exception as e:
            # Deliberate best-effort: a failure here yields no rules rather
            # than an exception for the caller.
            logger.error(f"Failed to load context rules: {e}")
            return []

    def save_history(self, filename: str, domain: str, original_length: int,
                     stage1_changes: int, stage2_changes: int, model: str,
                     changes: List[Dict]) -> None:
        """
        Save correction run history for learning.

        Failures are logged and swallowed; this method does not raise.

        Args:
            filename: File that was corrected
            domain: Correction domain
            original_length: Original file length
            stage1_changes: Number of Stage 1 changes
            stage2_changes: Number of Stage 2 changes
            model: AI model used
            changes: List of individual changes (dicts; missing keys fall
                back to None, "" or "dictionary" as shown below)
        """
        try:
            with self.repository._transaction() as conn:
                # Insert history record
                cursor = conn.execute("""
                    INSERT INTO correction_history
                    (filename, domain, original_length, stage1_changes, stage2_changes, model)
                    VALUES (?, ?, ?, ?, ?, ?)
                """, (filename, domain, original_length, stage1_changes, stage2_changes, model))
                history_id = cursor.lastrowid

                # Insert individual changes, linked to the history row above.
                for change in changes:
                    conn.execute("""
                        INSERT INTO correction_changes
                        (history_id, line_number, from_text, to_text, rule_type, context_before, context_after)
                        VALUES (?, ?, ?, ?, ?, ?, ?)
                    """, (
                        history_id,
                        change.get("line_number"),
                        change.get("from_text", ""),
                        change.get("to_text", ""),
                        change.get("rule_type", "dictionary"),
                        change.get("context_before"),
                        change.get("context_after")
                    ))
            logger.info(f"Saved correction history for {filename}: {stage1_changes + stage2_changes} total changes")
        except Exception as e:
            # Deliberate best-effort: a failure to record history is logged
            # and not propagated to the caller.
            logger.error(f"Failed to save history: {e}")

    def close(self) -> None:
        """Close underlying repository."""
        self.repository.close()
        logger.info("CorrectionService closed")