Release v1.8.0: Add transcript-fixer skill
## New Skill: transcript-fixer v1.0.0

Correct speech-to-text (ASR/STT) transcription errors through dictionary-based rules and AI-powered corrections with automatic pattern learning.

**Features:**
- Two-stage correction pipeline (dictionary + AI)
- Automatic pattern detection and learning
- Domain-specific dictionaries (general, embodied_ai, finance, medical)
- SQLite-based correction repository
- Team collaboration with import/export
- GLM API integration for AI corrections
- Cost optimization through dictionary promotion

**Use cases:**
- Correcting meeting notes, lecture recordings, or interview transcripts
- Fixing Chinese/English homophone errors and technical terminology
- Building domain-specific correction dictionaries
- Improving transcript accuracy through iterative learning

**Documentation:**
- Complete workflow guides in references/
- SQL query templates
- Troubleshooting guide
- Team collaboration patterns
- API setup instructions

**Marketplace updates:**
- Updated marketplace to v1.8.0
- Added transcript-fixer plugin (category: productivity)
- Updated README.md with skill description and use cases
- Updated CLAUDE.md with skill listing and counts

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
524
transcript-fixer/scripts/core/correction_service.py
Normal file
524
transcript-fixer/scripts/core/correction_service.py
Normal file
@@ -0,0 +1,524 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Correction Service - Business Logic Layer
|
||||
|
||||
SINGLE RESPONSIBILITY: Implement business rules and validation
|
||||
|
||||
Orchestrates repository operations with comprehensive validation,
|
||||
error handling, and business logic enforcement.
|
||||
"""
|
||||
|
||||
from __future__ import annotations

import logging
import os
import re
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

from .correction_repository import (
    CorrectionRepository,
    DatabaseError,
    ValidationError,
)
|
||||
|
||||
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
class ValidationRules:
    """
    Tunable limits consulted by the service's ``validate_*`` methods.

    Attributes:
        max_text_length: Longest correction text accepted, in characters.
        min_text_length: Shortest correction text accepted, in characters.
        max_domain_length: Longest domain name accepted, in characters.
        allowed_domain_pattern: Regular expression a domain name must match.
        max_confidence: Upper bound (inclusive) for a confidence score.
        min_confidence: Lower bound (inclusive) for a confidence score.
    """

    # --- correction text ---
    max_text_length: int = 1_000
    min_text_length: int = 1

    # --- domain names ---
    max_domain_length: int = 50
    allowed_domain_pattern: str = r"^[a-zA-Z0-9_-]+$"

    # --- confidence scores ---
    max_confidence: float = 1.0
    min_confidence: float = 0.0
|
||||
|
||||
|
||||
class CorrectionService:
    """
    Service layer for correction management.

    Every public operation validates its arguments against the configured
    ``ValidationRules`` and then delegates persistence to the repository.

    Responsibilities:
    - Input validation and sanitization
    - Business rule enforcement
    - Conflict detection and resolution
    - Statistics and reporting
    - Integration with repository layer
    """

    # Device names Windows reserves as file names; compared case-insensitively.
    _RESERVED_DOMAIN_NAMES = frozenset(
        ["con", "prn", "aux", "nul"]
        + [f"com{n}" for n in range(10)]
        + [f"lpt{n}" for n in range(10)]
    )

    # Accepted values for a correction's ``source`` field.
    _VALID_SOURCES = ("manual", "learned", "imported")

    # Confidence at or above this value counts as "high" in statistics.
    _HIGH_CONFIDENCE_THRESHOLD = 0.9

    def __init__(self, repository: CorrectionRepository, rules: Optional[ValidationRules] = None):
        """
        Initialize service with repository.

        Args:
            repository: Data access layer
            rules: Validation rules (uses defaults if None)
        """
        self.repository = repository
        self.rules = rules if rules is not None else ValidationRules()
        self.db_path = repository.db_path
        logger.info("CorrectionService initialized")

    def initialize(self) -> None:
        """
        Report that the database is ready (kept for API compatibility).

        This method performs no work of its own beyond logging; schema
        creation is expected to be handled by the repository.
        """
        logger.info("✅ Database ready: %s", self.db_path)

    # ==================== Validation Methods ====================

    def validate_correction_text(self, text: str, field_name: str = "text") -> None:
        """
        Validate correction text with comprehensive checks.

        Args:
            text: Text to validate
            field_name: Field name for error messages

        Raises:
            ValidationError: If the text is empty, not a string, only
                whitespace, outside the configured length limits, or
                contains NULL bytes / control characters other than
                newline and tab.
        """
        if not text:
            raise ValidationError(f"{field_name} cannot be None or empty")

        # Reject non-strings here so callers get a ValidationError rather
        # than an AttributeError/TypeError from the string operations below.
        if not isinstance(text, str):
            raise ValidationError(
                f"{field_name} must be a string, got {type(text).__name__}"
            )

        if not text.strip():
            raise ValidationError(f"{field_name} cannot be only whitespace")

        length = len(text)
        if length < self.rules.min_text_length:
            raise ValidationError(
                f"{field_name} too short: {length} chars (min: {self.rules.min_text_length})"
            )
        if length > self.rules.max_text_length:
            raise ValidationError(
                f"{field_name} too long: {length} chars (max: {self.rules.max_text_length})"
            )

        # NULL is itself a control character, so it must be checked first
        # or its dedicated error message can never be reached.
        if '\x00' in text:
            raise ValidationError(f"{field_name} contains NULL bytes")

        # Remaining control characters (newline and tab are allowed).
        invalid_chars = [c for c in text if ord(c) < 32 and c not in '\n\t']
        if invalid_chars:
            raise ValidationError(
                f"{field_name} contains invalid control characters: {invalid_chars}"
            )

    def validate_domain_name(self, domain: str) -> None:
        """
        Validate domain name to prevent path traversal and injection.

        Args:
            domain: Domain name to validate

        Raises:
            ValidationError: If the name is empty, not a string, too long,
                does not fully match the allowed pattern, contains path
                separators or ``..``, or is a Windows reserved device name.
        """
        if not domain:
            raise ValidationError("Domain name cannot be empty")

        if not isinstance(domain, str):
            raise ValidationError(
                f"Domain name must be a string, got {type(domain).__name__}"
            )

        if len(domain) > self.rules.max_domain_length:
            raise ValidationError(
                f"Domain name too long: {len(domain)} chars (max: {self.rules.max_domain_length})"
            )

        # fullmatch, not match: with re.match a trailing '$' also matches
        # just before a final newline, so "general\n" would slip through.
        if not re.fullmatch(self.rules.allowed_domain_pattern, domain):
            raise ValidationError(
                f"Domain name contains invalid characters: {domain}. "
                f"Allowed pattern: {self.rules.allowed_domain_pattern}"
            )

        # Defence in depth in case a custom pattern permits these characters.
        if '..' in domain or '/' in domain or '\\' in domain:
            raise ValidationError(f"Domain name contains path traversal: {domain}")

        if domain.lower() in self._RESERVED_DOMAIN_NAMES:
            raise ValidationError(f"Domain name is reserved: {domain}")

    def validate_confidence(self, confidence: float) -> None:
        """
        Validate confidence score.

        Args:
            confidence: Score to check

        Raises:
            ValidationError: If the value is not an int/float or lies outside
                the inclusive range configured in the rules.
        """
        if not isinstance(confidence, (int, float)):
            raise ValidationError(f"Confidence must be numeric, got {type(confidence)}")

        lower, upper = self.rules.min_confidence, self.rules.max_confidence
        if not (lower <= confidence <= upper):
            raise ValidationError(
                f"Confidence must be between {lower} "
                f"and {upper}, got {confidence}"
            )

    def validate_source(self, source: str) -> None:
        """
        Validate correction source.

        Args:
            source: Origin label to check

        Raises:
            ValidationError: If the source is not one of the accepted values.
        """
        if source not in self._VALID_SOURCES:
            raise ValidationError(
                f"Invalid source: {source}. Must be one of: {list(self._VALID_SOURCES)}"
            )

    # ==================== Correction Operations ====================

    def add_correction(
        self,
        from_text: str,
        to_text: str,
        domain: str = "general",
        source: str = "manual",
        confidence: float = 1.0,
        notes: Optional[str] = None
    ) -> int:
        """
        Add a correction with full validation.

        Args:
            from_text: Original (incorrect) text
            to_text: Corrected text
            domain: Correction domain
            source: Origin of correction
            confidence: Confidence score
            notes: Optional notes (passed through to the repository unvalidated)

        Returns:
            ID of inserted correction

        Raises:
            ValidationError: If validation fails
            DatabaseError: Re-raised after logging if the repository fails
        """
        self.validate_correction_text(from_text, "from_text")
        self.validate_correction_text(to_text, "to_text")
        self.validate_domain_name(domain)
        self.validate_source(source)
        self.validate_confidence(confidence)

        # Business rule: a correction must actually change the text.
        if from_text.strip() == to_text.strip():
            raise ValidationError(
                f"from_text and to_text are identical: '{from_text}'"
            )

        try:
            correction_id = self.repository.add_correction(
                from_text=from_text,
                to_text=to_text,
                domain=domain,
                source=source,
                confidence=confidence,
                added_by=self._current_user(),
                notes=notes
            )
        except DatabaseError as e:
            logger.error("Failed to add correction: %s", e)
            raise

        logger.info(
            "Successfully added correction ID %s: '%s' → '%s' (domain: %s)",
            correction_id, from_text, to_text, domain,
        )
        return correction_id

    def get_corrections(self, domain: Optional[str] = None) -> Dict[str, str]:
        """
        Get corrections as a dictionary for processing.

        Args:
            domain: Optional domain filter; a falsy value selects all domains

        Returns:
            Dictionary of corrections {from_text: to_text}. Without a domain
            filter, entries sharing a ``from_text`` collapse to whichever the
            repository returns last.

        Raises:
            ValidationError: If a domain is given and is invalid
        """
        if not domain:
            active = self.repository.get_all_corrections(active_only=True)
            return {c.from_text: c.to_text for c in active}

        self.validate_domain_name(domain)
        return self.repository.get_corrections_dict(domain)

    def remove_correction(
        self,
        from_text: str,
        domain: str = "general"
    ) -> bool:
        """
        Remove a correction via the repository's ``delete_correction``.

        Args:
            from_text: Text to remove
            domain: Domain

        Returns:
            True if removed, False if not found

        Raises:
            ValidationError: If ``from_text`` or ``domain`` is invalid
        """
        self.validate_correction_text(from_text, "from_text")
        self.validate_domain_name(domain)

        success = self.repository.delete_correction(
            from_text, domain, self._current_user()
        )

        if success:
            logger.info("Removed correction: '%s' (domain: %s)", from_text, domain)
        else:
            logger.warning("Correction not found: '%s' (domain: %s)", from_text, domain)

        return success

    # ==================== Import/Export Operations ====================

    def import_corrections(
        self,
        corrections: Dict[str, str],
        domain: str = "general",
        merge: bool = True,
        validate_all: bool = True
    ) -> Tuple[int, int, int]:
        """
        Import corrections with validation and conflict resolution.

        Args:
            corrections: Dictionary of corrections to import
            domain: Target domain
            merge: If True, merge with existing; if False, replace
            validate_all: If True, validate all before import (safer but slower)

        Returns:
            Tuple of (inserted_count, updated_count, skipped_count)

        Raises:
            ValidationError: If the domain is invalid, the dictionary is
                empty, or pre-validation fails (when validate_all=True)
            DatabaseError: Re-raised after logging if the repository fails
        """
        self.validate_domain_name(domain)

        if not corrections:
            raise ValidationError("Cannot import empty corrections dictionary")

        if validate_all:
            self._prevalidate_corrections(corrections)

        # In merge mode, report which existing entries will be overwritten.
        if merge:
            existing = self.repository.get_corrections_dict(domain)
            conflicts = self._detect_conflicts(corrections, existing)

            if conflicts:
                logger.warning(
                    "Found %s conflicts that will be overwritten", len(conflicts)
                )
                for from_text, (old_val, new_val) in conflicts.items():
                    logger.debug("Conflict: '%s': '%s' → '%s'", from_text, old_val, new_val)

        try:
            inserted, updated, skipped = self.repository.bulk_import_corrections(
                corrections=corrections,
                domain=domain,
                source="imported",
                imported_by=self._current_user(),
                merge=merge
            )
        except DatabaseError as e:
            logger.error("Import failed: %s", e)
            raise

        logger.info(
            "Import complete: %s inserted, %s updated, %s skipped (domain: %s)",
            inserted, updated, skipped, domain,
        )
        return (inserted, updated, skipped)

    def export_corrections(self, domain: str = "general") -> Dict[str, str]:
        """
        Export corrections for sharing.

        Args:
            domain: Domain to export

        Returns:
            Dictionary of corrections

        Raises:
            ValidationError: If the domain is invalid
        """
        self.validate_domain_name(domain)

        corrections = self.repository.get_corrections_dict(domain)
        logger.info("Exported %s corrections (domain: %s)", len(corrections), domain)
        return corrections

    # ==================== Statistics and Reporting ====================

    def get_statistics(self, domain: Optional[str] = None) -> Dict[str, Any]:
        """
        Get correction statistics.

        Args:
            domain: Optional domain filter; a falsy value covers all domains

        Returns:
            Dictionary with ``total_corrections``, ``by_source``,
            ``total_usage``, ``average_usage``, ``high_confidence_count`` and
            ``high_confidence_ratio``. The averages are 0 when there are no
            corrections.

        Raises:
            ValidationError: If a domain is given and is invalid
        """
        if domain:
            self.validate_domain_name(domain)
            corrections = self.repository.get_all_corrections(domain=domain, active_only=True)
        else:
            corrections = self.repository.get_all_corrections(active_only=True)

        total = len(corrections)
        # Pre-seed the known sources so they always appear, even at zero.
        by_source = dict.fromkeys(self._VALID_SOURCES, 0)
        total_usage = 0
        high_confidence = 0

        for c in corrections:
            by_source[c.source] = by_source.get(c.source, 0) + 1
            total_usage += c.usage_count
            if c.confidence >= self._HIGH_CONFIDENCE_THRESHOLD:
                high_confidence += 1

        stats = {
            'total_corrections': total,
            'by_source': by_source,
            'total_usage': total_usage,
            'average_usage': total_usage / total if total > 0 else 0,
            'high_confidence_count': high_confidence,
            'high_confidence_ratio': high_confidence / total if total > 0 else 0
        }

        logger.debug("Statistics for domain '%s': %s", domain, stats)
        return stats

    # ==================== Helper Methods ====================

    @staticmethod
    def _current_user() -> str:
        """Return the OS user from ``USER`` or ``USERNAME``, else ``"unknown"``."""
        return os.getenv("USER") or os.getenv("USERNAME") or "unknown"

    def _prevalidate_corrections(self, corrections: Dict[str, str]) -> None:
        """
        Validate every pair in ``corrections``, logging each failure.

        Raises:
            ValidationError: If at least one pair is invalid; the message
                reports how many of the pairs failed.
        """
        logger.info("Pre-validating %s corrections...", len(corrections))
        invalid_count = 0

        for from_text, to_text in corrections.items():
            try:
                self.validate_correction_text(from_text, "from_text")
                self.validate_correction_text(to_text, "to_text")
            except ValidationError as e:
                logger.error(
                    "Validation failed for '%s' → '%s': %s", from_text, to_text, e
                )
                invalid_count += 1

        if invalid_count > 0:
            raise ValidationError(
                f"Pre-validation failed: {invalid_count}/{len(corrections)} corrections invalid"
            )

    def _detect_conflicts(
        self,
        incoming: Dict[str, str],
        existing: Dict[str, str]
    ) -> Dict[str, Tuple[str, str]]:
        """
        Detect conflicts between incoming and existing corrections.

        A conflict is a key present in both mappings with different values.

        Returns:
            Dictionary of conflicts {from_text: (existing_to, incoming_to)},
            in the iteration order of ``incoming``.
        """
        return {
            from_text: (existing[from_text], new_to)
            for from_text, new_to in incoming.items()
            if from_text in existing and existing[from_text] != new_to
        }

    def load_context_rules(self) -> List[Dict]:
        """
        Load active context-aware regex rules.

        Reads ``context_rules`` rows with ``is_active = 1``, highest
        priority first, through the repository's connection.

        Returns:
            List of rule dictionaries with pattern, replacement, description.
            Any failure is logged and an empty list is returned instead.
        """
        try:
            conn = self.repository._get_connection()
            cursor = conn.execute("""
                SELECT pattern, replacement, description
                FROM context_rules
                WHERE is_active = 1
                ORDER BY priority DESC
            """)

            rules = [
                {
                    "pattern": row[0],
                    "replacement": row[1],
                    "description": row[2]
                }
                for row in cursor.fetchall()
            ]

            logger.debug("Loaded %s context rules", len(rules))
            return rules

        except Exception as e:
            # Best effort: callers get "no rules" rather than a crash.
            logger.error("Failed to load context rules: %s", e)
            return []

    def save_history(self, filename: str, domain: str, original_length: int,
                     stage1_changes: int, stage2_changes: int, model: str,
                     changes: List[Dict]) -> None:
        """
        Save correction run history for learning.

        Writes one ``correction_history`` row plus one ``correction_changes``
        row per entry in ``changes`` inside the repository's transaction
        context. Any failure is logged and swallowed; nothing is raised.

        Args:
            filename: File that was corrected
            domain: Correction domain
            original_length: Original file length
            stage1_changes: Number of Stage 1 changes
            stage2_changes: Number of Stage 2 changes
            model: AI model used
            changes: List of individual changes; missing ``from_text`` /
                ``to_text`` default to "" and ``rule_type`` to "dictionary"
        """
        try:
            # NOTE(review): presumably commits on success and rolls back on
            # error; confirm against the repository's _transaction().
            with self.repository._transaction() as conn:
                cursor = conn.execute("""
                    INSERT INTO correction_history
                    (filename, domain, original_length, stage1_changes, stage2_changes, model)
                    VALUES (?, ?, ?, ?, ?, ?)
                """, (filename, domain, original_length, stage1_changes, stage2_changes, model))

                # Links each change row to the history row just inserted.
                history_id = cursor.lastrowid

                for change in changes:
                    conn.execute("""
                        INSERT INTO correction_changes
                        (history_id, line_number, from_text, to_text, rule_type, context_before, context_after)
                        VALUES (?, ?, ?, ?, ?, ?, ?)
                    """, (
                        history_id,
                        change.get("line_number"),
                        change.get("from_text", ""),
                        change.get("to_text", ""),
                        change.get("rule_type", "dictionary"),
                        change.get("context_before"),
                        change.get("context_after")
                    ))

            logger.info(
                "Saved correction history for %s: %s total changes",
                filename, stage1_changes + stage2_changes,
            )

        except Exception as e:
            # Best effort: a history failure must not abort the correction run.
            logger.error("Failed to save history: %s", e)

    def close(self) -> None:
        """Close underlying repository."""
        self.repository.close()
        logger.info("CorrectionService closed")
|
||||
Reference in New Issue
Block a user