## New Skill: video-comparer v1.0.0

- Compare original and compressed videos with interactive HTML reports
- Calculate quality metrics (PSNR, SSIM) for compression analysis
- Generate frame-by-frame visual comparisons (slider, side-by-side, grid)
- Extract video metadata (codec, resolution, bitrate, duration)
- Multi-platform FFmpeg support with security features

## transcript-fixer Enhancements

- Add async AI processor for parallel processing
- Add connection pool management for database operations
- Add concurrency manager and rate limiter
- Add audit log retention and database migrations
- Add health check and metrics monitoring
- Add comprehensive test suite (8 new test files)
- Enhance security with domain and path validators

## Marketplace Updates

- Update marketplace version from 1.8.0 to 1.9.0
- Update skills count from 15 to 16
- Update documentation (README.md, CLAUDE.md, CHANGELOG.md)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
449 lines
15 KiB
Python
449 lines
15 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Change Extractor - Extract Precise From→To Changes
|
|
|
|
CRITICAL FEATURE: Extract specific corrections from AI results for learning
|
|
|
|
This enables the learning loop:
|
|
1. AI makes corrections → Extract specific from→to pairs
|
|
2. High-frequency patterns → Auto-add to dictionary
|
|
3. Next run → Dictionary handles learned patterns (free)
|
|
4. Progressive cost reduction → System gets smarter with use
|
|
|
|
CRITICAL FIX (P1-2): Comprehensive input validation
|
|
- Prevents DoS attacks from oversized input
|
|
- Type checking for all parameters
|
|
- Range validation for numeric arguments
|
|
- Protection against malicious input
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import difflib
|
|
import logging
|
|
import re
|
|
from dataclasses import dataclass
|
|
from typing import List, Tuple, Final
|
|
|
|
# Module-level logger; handler and level configuration is left to the application.
logger = logging.getLogger(__name__)

# Hard caps that bound the amount of work done per call (DoS prevention).
# The text limit is compared against len(str), i.e. it counts characters, not bytes.
MAX_CHANGES: Final[int] = 10_000  # Upper bound on changes returned by one extraction
MAX_TEXT_LENGTH: Final[int] = 1_000_000  # Upper bound on each input text, in characters
|
|
|
|
|
|
class InputValidationError(ValueError):
    """Signals that a caller-supplied argument failed validation.

    Derives from ValueError, so handlers written for ValueError also catch it.
    """
|
|
|
|
|
|
@dataclass
class ExtractedChange:
    """Represents a specific from→to change extracted from AI results.

    Identity (equality and hashing) is defined by the (from_text, to_text)
    pair only, so occurrences of the same correction at different positions
    or in different contexts collapse to one element in a set or dict key.
    """
    from_text: str
    to_text: str
    context_before: str  # Text immediately before the change (up to 20 chars)
    context_after: str   # Text immediately after the change (up to 20 chars)
    position: int        # Character position in original
    change_type: str     # 'word', 'phrase', 'punctuation' or 'mixed'
    confidence: float    # 0.0-1.0 score for this single occurrence

    def __hash__(self) -> int:
        """Hash on the from/to pair so instances can be deduplicated in sets."""
        return hash((self.from_text, self.to_text))

    def __eq__(self, other: object) -> bool:
        """Equality based on from/to text.

        Returns NotImplemented for foreign types so that comparisons such as
        ``change == None`` or ``change in mixed_list`` evaluate to False
        instead of raising AttributeError.
        """
        if not isinstance(other, ExtractedChange):
            return NotImplemented
        return (self.from_text, self.to_text) == (other.from_text, other.to_text)
|
|
|
|
|
|
class ChangeExtractor:
    """
    Extract precise from→to changes from before/after text pairs

    Strategy:
    1. Use difflib.SequenceMatcher for a character-level diff
    2. Filter out formatting-only changes (whitespace, letter case)
    3. Capture surrounding context for confidence scoring
    4. Classify change types
    5. Score confidence per change and per recurring pattern
    """

    # Characters of surrounding text captured on each side of a change.
    _CONTEXT_CHARS = 20

    # Compiled once; these are used for every candidate change.
    _WORD_PATTERN = re.compile(r'\w+')
    _WHITESPACE_PATTERN = re.compile(r'\s')
    _CJK_PATTERN = re.compile(r'[\u4e00-\u9fff]')

    def __init__(self, min_change_length: int = 1, max_change_length: int = 50):
        """
        Initialize extractor

        Args:
            min_change_length: Ignore changes shorter than this (chars)
                - Helps filter noise like single punctuation
                - Must be >= 1
            max_change_length: Ignore changes longer than this (chars)
                - Helps filter large rewrites (not corrections)
                - Must be >= min_change_length

        Raises:
            InputValidationError: If parameters are invalid

        Values above 1000 for max_change_length are accepted but logged as a
        warning because they may impact performance.
        """
        # Validate parameter types
        if not isinstance(min_change_length, int):
            raise InputValidationError(
                f"min_change_length must be int, got {type(min_change_length).__name__}"
            )

        if not isinstance(max_change_length, int):
            raise InputValidationError(
                f"max_change_length must be int, got {type(max_change_length).__name__}"
            )

        # Validate parameter ranges
        if min_change_length < 1:
            raise InputValidationError(
                f"min_change_length must be >= 1, got {min_change_length}"
            )

        if max_change_length < 1:
            raise InputValidationError(
                f"max_change_length must be >= 1, got {max_change_length}"
            )

        # Validate logical consistency (equal bounds are allowed)
        if min_change_length > max_change_length:
            raise InputValidationError(
                f"min_change_length ({min_change_length}) must be <= "
                f"max_change_length ({max_change_length})"
            )

        # Large upper bounds are legal but worth surfacing
        if max_change_length > 1000:
            logger.warning(
                "Large max_change_length (%s) may impact performance",
                max_change_length,
            )

        self.min_change_length = min_change_length
        self.max_change_length = max_change_length

        logger.debug(
            "ChangeExtractor initialized: min=%s, max=%s",
            min_change_length,
            max_change_length,
        )

    def extract_changes(self, original: str, corrected: str) -> List[ExtractedChange]:
        """
        Extract all from→to changes between original and corrected text

        Only 'replace' opcodes of the diff are considered; pure insertions
        and deletions are skipped. Each surviving change stores the
        whitespace-stripped from/to text, and its type and confidence are
        computed from that same stripped text so all fields agree.

        Args:
            original: Original text (before correction)
            corrected: Corrected text (after AI processing)

        Returns:
            List of ExtractedChange objects with context and confidence,
            in diff order, capped at MAX_CHANGES entries

        Raises:
            InputValidationError: If an argument is not a str, exceeds
                MAX_TEXT_LENGTH characters, or cannot be encoded as UTF-8
        """
        # Validate input types
        if not isinstance(original, str):
            raise InputValidationError(
                f"original must be str, got {type(original).__name__}"
            )

        if not isinstance(corrected, str):
            raise InputValidationError(
                f"corrected must be str, got {type(corrected).__name__}"
            )

        # Validate input length (DoS prevention)
        if len(original) > MAX_TEXT_LENGTH:
            raise InputValidationError(
                f"original text too long ({len(original)} chars). "
                f"Maximum allowed: {MAX_TEXT_LENGTH}"
            )

        if len(corrected) > MAX_TEXT_LENGTH:
            raise InputValidationError(
                f"corrected text too long ({len(corrected)} chars). "
                f"Maximum allowed: {MAX_TEXT_LENGTH}"
            )

        # Nothing to diff
        if not original and not corrected:
            logger.debug("Both texts are empty, returning empty changes list")
            return []

        # A str holding lone surrogates cannot be encoded as UTF-8; reject it
        # up front instead of propagating unencodable text to callers.
        try:
            original.encode('utf-8')
            corrected.encode('utf-8')
        except UnicodeError as e:
            raise InputValidationError(f"Invalid text encoding: {e}") from e

        logger.debug(
            "Extracting changes: original=%s chars, corrected=%s chars",
            len(original),
            len(corrected),
        )

        # NOTE(review): SequenceMatcher defaults to autojunk=True, which for
        # sequences of 200+ items ignores frequently occurring items when
        # seeding matches. On long character-level input this can coarsen the
        # diff into large 'replace' spans that the length filter then drops.
        # autojunk=False is more precise but can be much slower on inputs near
        # MAX_TEXT_LENGTH - confirm the trade-off before changing it.
        matcher = difflib.SequenceMatcher(None, original, corrected)
        changes: List[ExtractedChange] = []
        window = self._CONTEXT_CHARS

        for tag, i1, i2, j1, j2 in matcher.get_opcodes():
            if tag != 'replace':  # Only actual replacements (from→to)
                continue

            raw_from = original[i1:i2]
            raw_to = corrected[j1:j2]

            # Filter by length
            if not self._is_valid_change_length(raw_from, raw_to):
                continue

            # Filter formatting-only changes
            if self._is_formatting_only(raw_from, raw_to):
                continue

            # Strip once and use the same text for storage, classification
            # and scoring so the stored pair and its metadata cannot disagree.
            from_text = raw_from.strip()
            to_text = raw_to.strip()

            # Context is taken around the raw diff span in the original text
            context_before = original[max(0, i1 - window):i1]
            context_after = original[i2:i2 + window]

            changes.append(ExtractedChange(
                from_text=from_text,
                to_text=to_text,
                context_before=context_before,
                context_after=context_after,
                position=i1,
                change_type=self._classify_change(from_text, to_text),
                confidence=self._calculate_confidence(
                    from_text, to_text, context_before, context_after
                ),
            ))

            # Prevent DoS from excessive changes
            if len(changes) >= MAX_CHANGES:
                logger.warning(
                    "Reached maximum changes limit (%s), stopping extraction",
                    MAX_CHANGES,
                )
                break

        logger.debug("Extracted %s changes", len(changes))
        return changes

    def group_by_pattern(self, changes: List[ExtractedChange]) -> dict[Tuple[str, str], List[ExtractedChange]]:
        """
        Group changes by from→to pattern for frequency analysis

        Args:
            changes: List of ExtractedChange objects

        Returns:
            Dict mapping (from_text, to_text) to list of occurrences,
            preserving the input order within each group

        Raises:
            InputValidationError: If changes is not a list or contains an
                element that is not an ExtractedChange
        """
        if not isinstance(changes, list):
            raise InputValidationError(
                f"changes must be list, got {type(changes).__name__}"
            )

        grouped: dict[Tuple[str, str], List[ExtractedChange]] = {}
        for i, change in enumerate(changes):
            if not isinstance(change, ExtractedChange):
                raise InputValidationError(
                    f"changes[{i}] must be ExtractedChange, "
                    f"got {type(change).__name__}"
                )
            grouped.setdefault((change.from_text, change.to_text), []).append(change)

        logger.debug(
            "Grouped %s changes into %s patterns", len(changes), len(grouped)
        )
        return grouped

    def calculate_pattern_confidence(self, occurrences: List[ExtractedChange]) -> float:
        """
        Calculate overall confidence for a pattern based on multiple occurrences

        The score is a weighted sum of three factors:
        - 0.5 x mean confidence of the individual occurrences
        - 0.3 x frequency factor (occurrences / 5, capped at 1.0)
        - 0.2 x context diversity (distinct (before, after) contexts
          divided by the number of occurrences)

        Ambiguity across patterns (one from → several to) is not assessed
        here; only the occurrences passed in are inspected.

        Args:
            occurrences: List of ExtractedChange objects for same pattern

        Returns:
            Confidence score 0.0-1.0 rounded to two decimals; 0.0 for an
            empty list

        Raises:
            InputValidationError: If occurrences is not a list or contains
                an element that is not an ExtractedChange
        """
        if not isinstance(occurrences, list):
            raise InputValidationError(
                f"occurrences must be list, got {type(occurrences).__name__}"
            )

        # Handle empty list (also guards the divisions below)
        if not occurrences:
            return 0.0

        for i, occurrence in enumerate(occurrences):
            if not isinstance(occurrence, ExtractedChange):
                raise InputValidationError(
                    f"occurrences[{i}] must be ExtractedChange, "
                    f"got {type(occurrence).__name__}"
                )

        count = len(occurrences)

        # Base confidence from individual changes
        avg_confidence = sum(c.confidence for c in occurrences) / count

        # Frequency boost (more occurrences = higher confidence), max at 5
        frequency_factor = min(1.0, count / 5.0)

        # Context diversity (appears in different contexts = more reliable)
        unique_contexts = len({
            (c.context_before, c.context_after) for c in occurrences
        })
        diversity_factor = min(1.0, unique_contexts / count)

        # Combined confidence (weighted average)
        final_confidence = (
            0.5 * avg_confidence +
            0.3 * frequency_factor +
            0.2 * diversity_factor
        )

        return round(final_confidence, 2)

    def _is_valid_change_length(self, from_text: str, to_text: str) -> bool:
        """Check that both sides, after stripping whitespace, are within the configured length range"""
        lower, upper = self.min_change_length, self.max_change_length
        return (
            lower <= len(from_text.strip()) <= upper
            and lower <= len(to_text.strip()) <= upper
        )

    def _is_formatting_only(self, from_text: str, to_text: str) -> bool:
        """
        Check if change is formatting-only (whitespace, case)

        Returns True if we should ignore this change
        """
        # Remove every whitespace character, then compare case-insensitively.
        # Texts identical after whitespace removal are also identical after
        # lowercasing, so one comparison covers both cases.
        from_compact = ''.join(from_text.split())
        to_compact = ''.join(to_text.split())
        return from_compact.lower() == to_compact.lower()

    def _classify_change(self, from_text: str, to_text: str) -> str:
        """
        Classify the type of change

        Surrounding whitespace is ignored for every rule, so the label always
        describes the stripped text that gets stored on ExtractedChange.

        Rules, first match wins:
        - 'punctuation': both sides are one character and neither is a word
          character (letter, digit, underscore, CJK ideograph, ...)
        - 'phrase': either side contains whitespace of any kind
        - 'word': both sides consist solely of word characters; this includes
          single-character letter/ideograph swaps such as homophone fixes
        - 'mixed': anything else

        Returns: 'word', 'phrase', 'punctuation', 'mixed'
        """
        source = from_text.strip()
        target = to_text.strip()

        source_is_word = self._WORD_PATTERN.fullmatch(source) is not None
        target_is_word = self._WORD_PATTERN.fullmatch(target) is not None

        # Single non-word character on both sides = punctuation swap
        if (len(source) == 1 and len(target) == 1
                and not source_is_word and not target_is_word):
            return 'punctuation'

        # Any internal whitespace (space, tab, newline, U+3000, ...) = phrase
        if (self._WHITESPACE_PATTERN.search(source)
                or self._WHITESPACE_PATTERN.search(target)):
            return 'phrase'

        # Single word on both sides
        if source_is_word and target_is_word:
            return 'word'

        return 'mixed'

    def _calculate_confidence(
        self,
        from_text: str,
        to_text: str,
        context_before: str,
        context_after: str
    ) -> float:
        """
        Calculate confidence score for this change

        The score is a weighted sum of three factors:
        - 0.4 x length similarity (shorter length / longer length)
        - 0.3 x context clarity (total context length / 40, capped at 1.0)
        - 0.3 x CJK share (twice the fraction of characters in the
          U+4E00-U+9FFF range across both texts, capped at 1.0)

        Returns:
            Confidence score 0.0-1.0, rounded to two decimals
        """
        len_from = len(from_text)
        len_to = len(to_text)

        # Length similarity, guarding the division against empty input
        if len_from == 0 and len_to == 0:
            # Both empty - shouldn't happen due to upstream filtering
            length_score = 1.0
        elif len_from == 0 or len_to == 0:
            # One empty - low confidence (major rewrite)
            length_score = 0.0
        else:
            length_score = min(len_from, len_to) / max(len_from, len_to)

        # Context clarity (longer context = less ambiguous)
        context_score = min(
            1.0,
            (len(context_before) + len(context_after)) / (2.0 * self._CONTEXT_CHARS),
        )

        # Chinese character ratio (higher = likely homophone error)
        chinese_chars_from = len(self._CJK_PATTERN.findall(from_text))
        chinese_chars_to = len(self._CJK_PATTERN.findall(to_text))

        total_len = len_from + len_to
        if total_len == 0:
            chinese_score = 0.0
        else:
            chinese_ratio = (chinese_chars_from + chinese_chars_to) / total_len
            chinese_score = min(1.0, chinese_ratio * 2)  # Boost for Chinese

        # Combined score (weighted)
        confidence = (
            0.4 * length_score +
            0.3 * context_score +
            0.3 * chinese_score
        )

        return round(confidence, 2)
|