## New Skill: transcript-fixer v1.0.0

Correct speech-to-text (ASR/STT) transcription errors through dictionary-based rules and AI-powered corrections with automatic pattern learning.

**Features:**
- Two-stage correction pipeline (dictionary + AI)
- Automatic pattern detection and learning
- Domain-specific dictionaries (general, embodied_ai, finance, medical)
- SQLite-based correction repository
- Team collaboration with import/export
- GLM API integration for AI corrections
- Cost optimization through dictionary promotion

**Use cases:**
- Correcting meeting notes, lecture recordings, or interview transcripts
- Fixing Chinese/English homophone errors and technical terminology
- Building domain-specific correction dictionaries
- Improving transcript accuracy through iterative learning

**Documentation:**
- Complete workflow guides in references/
- SQL query templates
- Troubleshooting guide
- Team collaboration patterns
- API setup instructions

**Marketplace updates:**
- Updated marketplace to v1.8.0
- Added transcript-fixer plugin (category: productivity)
- Updated README.md with skill description and use cases
- Updated CLAUDE.md with skill listing and counts

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
525 lines
17 KiB
Python
525 lines
17 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Correction Service - Business Logic Layer
|
|
|
|
SINGLE RESPONSIBILITY: Implement business rules and validation
|
|
|
|
Orchestrates repository operations with comprehensive validation,
|
|
error handling, and business logic enforcement.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import re
|
|
import os
|
|
import logging
|
|
from pathlib import Path
|
|
from typing import Dict, List, Optional, Tuple
|
|
from dataclasses import dataclass
|
|
|
|
from .correction_repository import (
|
|
CorrectionRepository,
|
|
ValidationError,
|
|
DatabaseError
|
|
)
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
@dataclass
class ValidationRules:
    """Tunable limits consulted by the service's input validators.

    Every field has a default, so ``ValidationRules()`` yields the stock
    configuration; pass keyword overrides to tighten or relax a limit.
    """

    # Inclusive upper / lower bounds on the length of a correction text.
    max_text_length: int = 1000
    min_text_length: int = 1

    # Longest domain name accepted, in characters.
    max_domain_length: int = 50

    # Regex a domain name must satisfy: letters, digits, underscore, hyphen.
    allowed_domain_pattern: str = r'^[a-zA-Z0-9_-]+$'

    # Inclusive upper / lower bounds on a confidence score.
    max_confidence: float = 1.0
    min_confidence: float = 0.0
|
|
|
|
|
|
class CorrectionService:
    """
    Service layer for correction management.

    Responsibilities:
    - Input validation and sanitization
    - Business rule enforcement
    - Conflict detection and resolution
    - Statistics and reporting
    - Integration with repository layer
    """

    # Accepted values for a correction's ``source`` field.
    _VALID_SOURCES = ('manual', 'learned', 'imported')

    # Windows reserved device names (CON, PRN, AUX, NUL, COM1-9, LPT1-9),
    # lower-cased; a domain equal to one of these is rejected.
    _RESERVED_DOMAIN_NAMES = frozenset(
        ['con', 'prn', 'aux', 'nul']
        + [f'com{i}' for i in range(1, 10)]
        + [f'lpt{i}' for i in range(1, 10)]
    )

    # Corrections at or above this confidence count as "high confidence"
    # in get_statistics().
    _HIGH_CONFIDENCE_THRESHOLD = 0.9

    def __init__(self, repository: "CorrectionRepository",
                 rules: "Optional[ValidationRules]" = None):
        """
        Initialize service with repository.

        Args:
            repository: Data access layer
            rules: Validation rules (uses defaults if None)
        """
        self.repository = repository
        self.rules = rules if rules is not None else ValidationRules()
        self.db_path = repository.db_path
        logger.info("CorrectionService initialized")

    def initialize(self) -> None:
        """
        Initialize database (already done by repository, kept for API compatibility).
        """
        # Database is auto-initialized by repository on first access
        logger.info(f"✅ Database ready: {self.db_path}")

    @staticmethod
    def _current_user() -> str:
        """Return the acting user from USER / USERNAME, or "unknown"."""
        return os.getenv("USER") or os.getenv("USERNAME") or "unknown"

    # ==================== Validation Methods ====================

    def validate_correction_text(self, text: str, field_name: str = "text") -> None:
        """
        Validate correction text with comprehensive checks.

        Checks, in order: non-empty, is a string, not whitespace-only,
        length within the configured bounds, no NULL bytes, no other
        control characters (newline and tab are allowed).

        Args:
            text: Text to validate
            field_name: Field name for error messages

        Raises:
            ValidationError: If validation fails
        """
        # Check not None or empty
        if not text:
            raise ValidationError(f"{field_name} cannot be None or empty")

        # Reject non-strings up front so callers get a ValidationError
        # rather than an AttributeError from .strip() below.
        if not isinstance(text, str):
            raise ValidationError(
                f"{field_name} must be a string, got {type(text).__name__}"
            )

        # Check not only whitespace
        if not text.strip():
            raise ValidationError(f"{field_name} cannot be only whitespace")

        # Check length constraints
        length = len(text)
        if length < self.rules.min_text_length:
            raise ValidationError(
                f"{field_name} too short: {length} chars (min: {self.rules.min_text_length})"
            )

        if length > self.rules.max_text_length:
            raise ValidationError(
                f"{field_name} too long: {length} chars (max: {self.rules.max_text_length})"
            )

        # Check for NULL bytes. This must run before the generic
        # control-character check, which would otherwise catch '\x00'
        # first and make this more specific message unreachable.
        if '\x00' in text:
            raise ValidationError(f"{field_name} contains NULL bytes")

        # Check for control characters (except newline and tab)
        invalid_chars = [c for c in text if ord(c) < 32 and c not in '\n\t']
        if invalid_chars:
            raise ValidationError(
                f"{field_name} contains invalid control characters: {invalid_chars}"
            )

    def validate_domain_name(self, domain: str) -> None:
        """
        Validate domain name to prevent path traversal and injection.

        Args:
            domain: Domain name to validate

        Raises:
            ValidationError: If validation fails
        """
        if not domain:
            raise ValidationError("Domain name cannot be empty")

        if not isinstance(domain, str):
            raise ValidationError(
                f"Domain name must be a string, got {type(domain).__name__}"
            )

        if len(domain) > self.rules.max_domain_length:
            raise ValidationError(
                f"Domain name too long: {len(domain)} chars (max: {self.rules.max_domain_length})"
            )

        # Check pattern: only alphanumeric, underscore, hyphen.
        # fullmatch (not match) is required: with re.match a '$'-anchored
        # pattern still accepts a trailing newline, e.g. "general\n".
        if not re.fullmatch(self.rules.allowed_domain_pattern, domain):
            raise ValidationError(
                f"Domain name contains invalid characters: {domain}. "
                f"Allowed pattern: {self.rules.allowed_domain_pattern}"
            )

        # Check for path traversal attempts (defence in depth in case a
        # custom allowed_domain_pattern is more permissive).
        if '..' in domain or '/' in domain or '\\' in domain:
            raise ValidationError(f"Domain name contains path traversal: {domain}")

        # Reserved names (Windows device names, case-insensitive)
        if domain.lower() in self._RESERVED_DOMAIN_NAMES:
            raise ValidationError(f"Domain name is reserved: {domain}")

    def validate_confidence(self, confidence: float) -> None:
        """
        Validate confidence score.

        Args:
            confidence: Value to check against the configured inclusive range

        Raises:
            ValidationError: If the value is not numeric or is out of range
        """
        if not isinstance(confidence, (int, float)):
            raise ValidationError(f"Confidence must be numeric, got {type(confidence)}")

        # NaN fails both comparisons and is therefore rejected here as well.
        if not (self.rules.min_confidence <= confidence <= self.rules.max_confidence):
            raise ValidationError(
                f"Confidence must be between {self.rules.min_confidence} "
                f"and {self.rules.max_confidence}, got {confidence}"
            )

    def validate_source(self, source: str) -> None:
        """
        Validate correction source.

        Args:
            source: Must be one of 'manual', 'learned', 'imported'

        Raises:
            ValidationError: If the source is not recognised
        """
        if source not in self._VALID_SOURCES:
            raise ValidationError(
                f"Invalid source: {source}. Must be one of: {list(self._VALID_SOURCES)}"
            )

    # ==================== Correction Operations ====================

    def add_correction(
        self,
        from_text: str,
        to_text: str,
        domain: str = "general",
        source: str = "manual",
        confidence: float = 1.0,
        notes: Optional[str] = None
    ) -> int:
        """
        Add a correction with full validation.

        Args:
            from_text: Original (incorrect) text
            to_text: Corrected text
            domain: Correction domain
            source: Origin of correction
            confidence: Confidence score
            notes: Optional notes (passed through to the repository unvalidated)

        Returns:
            ID of inserted correction

        Raises:
            ValidationError: If validation fails
            DatabaseError: If the repository fails to store the correction
        """
        # Comprehensive validation
        self.validate_correction_text(from_text, "from_text")
        self.validate_correction_text(to_text, "to_text")
        self.validate_domain_name(domain)
        self.validate_source(source)
        self.validate_confidence(confidence)

        # Business rule: from_text and to_text should be different
        # (compared after stripping surrounding whitespace)
        if from_text.strip() == to_text.strip():
            raise ValidationError(
                f"from_text and to_text are identical: '{from_text}'"
            )

        added_by = self._current_user()

        try:
            correction_id = self.repository.add_correction(
                from_text=from_text,
                to_text=to_text,
                domain=domain,
                source=source,
                confidence=confidence,
                added_by=added_by,
                notes=notes
            )
        except DatabaseError as e:
            logger.error(f"Failed to add correction: {e}")
            raise

        logger.info(
            f"Successfully added correction ID {correction_id}: "
            f"'{from_text}' → '{to_text}' (domain: {domain})"
        )
        return correction_id

    def get_corrections(self, domain: Optional[str] = None) -> Dict[str, str]:
        """
        Get corrections as a dictionary for processing.

        Args:
            domain: Optional domain filter; a falsy value (None or "")
                selects all domains

        Returns:
            Dictionary of corrections {from_text: to_text}. When no domain
            is given, entries from all domains are merged into one dict, so
            a from_text present in several domains keeps only the last
            value returned by the repository.

        Raises:
            ValidationError: If a domain is given and is invalid
        """
        if domain:
            self.validate_domain_name(domain)
            return self.repository.get_corrections_dict(domain)

        # Get all domains
        all_corrections = self.repository.get_all_corrections(active_only=True)
        return {c.from_text: c.to_text for c in all_corrections}

    def remove_correction(
        self,
        from_text: str,
        domain: str = "general"
    ) -> bool:
        """
        Remove a correction (soft delete).

        Args:
            from_text: Text to remove
            domain: Domain

        Returns:
            True if removed, False if not found

        Raises:
            ValidationError: If from_text or domain is invalid
        """
        self.validate_correction_text(from_text, "from_text")
        self.validate_domain_name(domain)

        deleted_by = self._current_user()
        success = self.repository.delete_correction(from_text, domain, deleted_by)

        if success:
            logger.info(f"Removed correction: '{from_text}' (domain: {domain})")
        else:
            logger.warning(f"Correction not found: '{from_text}' (domain: {domain})")

        return success

    # ==================== Import/Export Operations ====================

    def import_corrections(
        self,
        corrections: Dict[str, str],
        domain: str = "general",
        merge: bool = True,
        validate_all: bool = True
    ) -> Tuple[int, int, int]:
        """
        Import corrections with validation and conflict resolution.

        Args:
            corrections: Dictionary of corrections to import
            domain: Target domain
            merge: If True, merge with existing; if False, replace
            validate_all: If True, validate all before import (safer but slower)

        Returns:
            Tuple of (inserted_count, updated_count, skipped_count)

        Raises:
            ValidationError: If the domain is invalid, the dictionary is
                empty, or pre-validation fails (when validate_all=True)
            DatabaseError: If the repository import fails
        """
        self.validate_domain_name(domain)

        if not corrections:
            raise ValidationError("Cannot import empty corrections dictionary")

        # Pre-validation (if requested): every entry is checked and logged
        # before failing, so one run reports all invalid entries.
        if validate_all:
            logger.info(f"Pre-validating {len(corrections)} corrections...")
            invalid_count = 0
            for from_text, to_text in corrections.items():
                try:
                    self.validate_correction_text(from_text, "from_text")
                    self.validate_correction_text(to_text, "to_text")
                except ValidationError as e:
                    logger.error(f"Validation failed for '{from_text}' → '{to_text}': {e}")
                    invalid_count += 1

            if invalid_count > 0:
                raise ValidationError(
                    f"Pre-validation failed: {invalid_count}/{len(corrections)} corrections invalid"
                )

        # Detect conflicts if merge mode (informational only: conflicts
        # are logged, the import still proceeds)
        if merge:
            existing = self.repository.get_corrections_dict(domain)
            conflicts = self._detect_conflicts(corrections, existing)

            if conflicts:
                logger.warning(
                    f"Found {len(conflicts)} conflicts that will be overwritten"
                )
                for from_text, (old_val, new_val) in conflicts.items():
                    logger.debug(f"Conflict: '{from_text}': '{old_val}' → '{new_val}'")

        # Perform import
        imported_by = self._current_user()

        try:
            inserted, updated, skipped = self.repository.bulk_import_corrections(
                corrections=corrections,
                domain=domain,
                source="imported",
                imported_by=imported_by,
                merge=merge
            )
        except DatabaseError as e:
            logger.error(f"Import failed: {e}")
            raise

        logger.info(
            f"Import complete: {inserted} inserted, {updated} updated, "
            f"{skipped} skipped (domain: {domain})"
        )
        return (inserted, updated, skipped)

    def export_corrections(self, domain: str = "general") -> Dict[str, str]:
        """
        Export corrections for sharing.

        Args:
            domain: Domain to export

        Returns:
            Dictionary of corrections

        Raises:
            ValidationError: If the domain is invalid
        """
        self.validate_domain_name(domain)

        corrections = self.repository.get_corrections_dict(domain)
        logger.info(f"Exported {len(corrections)} corrections (domain: {domain})")
        return corrections

    # ==================== Statistics and Reporting ====================

    def get_statistics(self, domain: Optional[str] = None) -> Dict[str, object]:
        """
        Get correction statistics.

        Args:
            domain: Optional domain filter

        Returns:
            Dictionary of statistics with keys 'total_corrections',
            'by_source', 'total_usage', 'average_usage',
            'high_confidence_count' and 'high_confidence_ratio'.
            Averages and ratios are 0 when there are no corrections.

        Raises:
            ValidationError: If a domain is given and is invalid
        """
        if domain:
            self.validate_domain_name(domain)
            corrections = self.repository.get_all_corrections(domain=domain, active_only=True)
        else:
            corrections = self.repository.get_all_corrections(active_only=True)

        # Calculate statistics
        total = len(corrections)
        # Known sources always appear (possibly with 0); an unexpected
        # source value found in the data is counted under its own key.
        by_source = dict.fromkeys(self._VALID_SOURCES, 0)
        total_usage = 0
        high_confidence = 0

        for c in corrections:
            by_source[c.source] = by_source.get(c.source, 0) + 1
            total_usage += c.usage_count
            if c.confidence >= self._HIGH_CONFIDENCE_THRESHOLD:
                high_confidence += 1

        stats = {
            'total_corrections': total,
            'by_source': by_source,
            'total_usage': total_usage,
            'average_usage': total_usage / total if total > 0 else 0,
            'high_confidence_count': high_confidence,
            'high_confidence_ratio': high_confidence / total if total > 0 else 0
        }

        logger.debug(f"Statistics for domain '{domain}': {stats}")
        return stats

    # ==================== Helper Methods ====================

    def _detect_conflicts(
        self,
        incoming: Dict[str, str],
        existing: Dict[str, str]
    ) -> Dict[str, Tuple[str, str]]:
        """
        Detect conflicts between incoming and existing corrections.

        A conflict is a from_text present in both mappings with different
        to_text values. Results follow the iteration order of ``incoming``
        so that logging is deterministic.

        Returns:
            Dictionary of conflicts {from_text: (existing_to, incoming_to)}
        """
        return {
            from_text: (existing[from_text], incoming_to)
            for from_text, incoming_to in incoming.items()
            if from_text in existing and existing[from_text] != incoming_to
        }

    def load_context_rules(self) -> List[Dict]:
        """
        Load active context-aware regex rules.

        Best-effort: any failure is logged (with traceback) and an empty
        list is returned instead of raising.

        Returns:
            List of rule dictionaries with pattern, replacement, description
        """
        try:
            conn = self.repository._get_connection()
            cursor = conn.execute("""
                SELECT pattern, replacement, description
                FROM context_rules
                WHERE is_active = 1
                ORDER BY priority DESC
            """)

            rules = [
                {
                    "pattern": row[0],
                    "replacement": row[1],
                    "description": row[2]
                }
                for row in cursor.fetchall()
            ]

            logger.debug(f"Loaded {len(rules)} context rules")
            return rules

        except Exception as e:
            logger.exception(f"Failed to load context rules: {e}")
            return []

    def save_history(self, filename: str, domain: str, original_length: int,
                     stage1_changes: int, stage2_changes: int, model: str,
                     changes: List[Dict]) -> None:
        """
        Save correction run history for learning.

        Best-effort: a failure is logged (with traceback) and swallowed so
        that a history-write problem does not abort the correction run.

        Args:
            filename: File that was corrected
            domain: Correction domain
            original_length: Original file length
            stage1_changes: Number of Stage 1 changes
            stage2_changes: Number of Stage 2 changes
            model: AI model used
            changes: List of individual changes; each dict may provide
                line_number, from_text, to_text, rule_type,
                context_before and context_after
        """
        try:
            with self.repository._transaction() as conn:
                # Insert history record
                cursor = conn.execute("""
                    INSERT INTO correction_history
                    (filename, domain, original_length, stage1_changes, stage2_changes, model)
                    VALUES (?, ?, ?, ?, ?, ?)
                """, (filename, domain, original_length, stage1_changes, stage2_changes, model))

                history_id = cursor.lastrowid

                # Insert individual changes, linked to the history row
                for change in changes:
                    conn.execute("""
                        INSERT INTO correction_changes
                        (history_id, line_number, from_text, to_text, rule_type, context_before, context_after)
                        VALUES (?, ?, ?, ?, ?, ?, ?)
                    """, (
                        history_id,
                        change.get("line_number"),
                        change.get("from_text", ""),
                        change.get("to_text", ""),
                        change.get("rule_type", "dictionary"),
                        change.get("context_before"),
                        change.get("context_after")
                    ))

            logger.info(f"Saved correction history for {filename}: {stage1_changes + stage2_changes} total changes")

        except Exception as e:
            logger.exception(f"Failed to save history: {e}")

    def close(self) -> None:
        """Close underlying repository."""
        self.repository.close()
        logger.info("CorrectionService closed")
|