Files
claude-code-skills-reference/transcript-fixer/scripts/core/change_extractor.py
daymade 9b724f33e3 Release v1.9.0: Add video-comparer skill and enhance transcript-fixer
## New Skill: video-comparer v1.0.0
- Compare original and compressed videos with interactive HTML reports
- Calculate quality metrics (PSNR, SSIM) for compression analysis
- Generate frame-by-frame visual comparisons (slider, side-by-side, grid)
- Extract video metadata (codec, resolution, bitrate, duration)
- Multi-platform FFmpeg support with security features

## transcript-fixer Enhancements
- Add async AI processor for parallel processing
- Add connection pool management for database operations
- Add concurrency manager and rate limiter
- Add audit log retention and database migrations
- Add health check and metrics monitoring
- Add comprehensive test suite (8 new test files)
- Enhance security with domain and path validators

## Marketplace Updates
- Update marketplace version from 1.8.0 to 1.9.0
- Update skills count from 15 to 16
- Update documentation (README.md, CLAUDE.md, CHANGELOG.md)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-10-30 00:23:12 +08:00

449 lines
15 KiB
Python

#!/usr/bin/env python3
"""
Change Extractor - Extract Precise From→To Changes
CRITICAL FEATURE: Extract specific corrections from AI results for learning
This enables the learning loop:
1. AI makes corrections → Extract specific from→to pairs
2. High-frequency patterns → Auto-add to dictionary
3. Next run → Dictionary handles learned patterns (free)
4. Progressive cost reduction → System gets smarter with use
CRITICAL FIX (P1-2): Comprehensive input validation
- Prevents DoS attacks from oversized input
- Type checking for all parameters
- Range validation for numeric arguments
- Protection against malicious input
"""
from __future__ import annotations
import difflib
import logging
import re
from dataclasses import dataclass
from typing import List, Tuple, Final
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Security limits for DoS prevention
# Upper bound on each input text. It is compared against len() of a str, so it
# counts characters (code points), not bytes.
MAX_TEXT_LENGTH: Final[int] = 1_000_000
# Maximum number of changes collected by one extraction before it stops early.
MAX_CHANGES: Final[int] = 10_000
class InputValidationError(ValueError):
    """Signals that a caller-supplied argument failed validation.

    Subclasses ValueError, so existing ``except ValueError`` handlers also
    catch it.
    """
@dataclass
class ExtractedChange:
    """Represents a specific from→to change extracted from AI results.

    Equality and hashing consider only ``from_text`` and ``to_text``, so
    occurrences of the same correction found at different positions or in
    different contexts collapse to a single element in a set or dict key.
    """
    from_text: str
    to_text: str
    context_before: str  # 20 chars before
    context_after: str  # 20 chars after
    position: int  # Character position in original
    change_type: str  # 'word', 'phrase', 'punctuation'
    confidence: float  # 0.0-1.0 based on context consistency

    def __hash__(self) -> int:
        """Hash on the (from_text, to_text) pair, consistent with __eq__."""
        return hash((self.from_text, self.to_text))

    def __eq__(self, other: object) -> bool:
        """Compare by from/to text only.

        Returns NotImplemented for foreign types so that comparisons such as
        ``change == None`` evaluate to False instead of raising
        AttributeError.
        """
        if not isinstance(other, ExtractedChange):
            return NotImplemented
        return (self.from_text, self.to_text) == (other.from_text, other.to_text)
class ChangeExtractor:
    """
    Extract precise from→to changes from before/after text pairs

    Strategy:
    1. Diff the two texts character by character with difflib.SequenceMatcher
    2. Keep only 'replace' opcodes whose trimmed sides fit the length bounds
    3. Drop whitespace-only and case-only differences
    4. Record up to 20 characters of surrounding context from the original
    5. Classify each change and give it a heuristic confidence score
    """

    # Number of context characters captured on each side of a change.
    _CONTEXT_CHARS = 20
    # Matches characters in the CJK Unified Ideographs block (U+4E00-U+9FFF).
    # Compiled once because it is used for every extracted change.
    _CJK_PATTERN = re.compile(r'[\u4e00-\u9fff]')

    def __init__(self, min_change_length: int = 1, max_change_length: int = 50):
        """
        Initialize extractor

        Args:
            min_change_length: Ignore changes shorter than this (chars)
                - Must be >= 1
            max_change_length: Ignore changes longer than this (chars)
                - Helps filter large rewrites (not corrections)
                - Must be >= min_change_length

        Raises:
            InputValidationError: If parameters are invalid
        """
        # Validate parameter types
        if not isinstance(min_change_length, int):
            raise InputValidationError(
                f"min_change_length must be int, got {type(min_change_length).__name__}"
            )
        if not isinstance(max_change_length, int):
            raise InputValidationError(
                f"max_change_length must be int, got {type(max_change_length).__name__}"
            )

        # Validate parameter ranges
        if min_change_length < 1:
            raise InputValidationError(
                f"min_change_length must be >= 1, got {min_change_length}"
            )
        if max_change_length < 1:
            raise InputValidationError(
                f"max_change_length must be >= 1, got {max_change_length}"
            )

        # Validate logical consistency (equal bounds are allowed)
        if min_change_length > max_change_length:
            raise InputValidationError(
                f"min_change_length ({min_change_length}) must be <= "
                f"max_change_length ({max_change_length})"
            )

        # Very large bounds are accepted, but only with a warning
        if max_change_length > 1000:
            logger.warning(
                f"Large max_change_length ({max_change_length}) may impact performance"
            )

        self.min_change_length = min_change_length
        self.max_change_length = max_change_length
        logger.debug(
            f"ChangeExtractor initialized: min={min_change_length}, max={max_change_length}"
        )

    def extract_changes(self, original: str, corrected: str) -> List[ExtractedChange]:
        """
        Extract all from→to changes between original and corrected text

        Only 'replace' opcodes are considered; pure insertions and deletions
        are not reported. The stored from/to text is whitespace-trimmed, and
        the change type and confidence are computed from that same trimmed
        text so the metadata always describes what is stored.

        Args:
            original: Original text (before correction)
            corrected: Corrected text (after AI processing)

        Returns:
            List of ExtractedChange objects with context and confidence,
            in order of position, capped at MAX_CHANGES entries

        Raises:
            InputValidationError: If an input is not a str, is longer than
                MAX_TEXT_LENGTH, or cannot be encoded as UTF-8
        """
        # Validate input types
        if not isinstance(original, str):
            raise InputValidationError(
                f"original must be str, got {type(original).__name__}"
            )
        if not isinstance(corrected, str):
            raise InputValidationError(
                f"corrected must be str, got {type(corrected).__name__}"
            )

        # Validate input length (DoS prevention)
        if len(original) > MAX_TEXT_LENGTH:
            raise InputValidationError(
                f"original text too long ({len(original)} chars). "
                f"Maximum allowed: {MAX_TEXT_LENGTH}"
            )
        if len(corrected) > MAX_TEXT_LENGTH:
            raise InputValidationError(
                f"corrected text too long ({len(corrected)} chars). "
                f"Maximum allowed: {MAX_TEXT_LENGTH}"
            )

        # Handle empty strings gracefully
        if not original and not corrected:
            logger.debug("Both texts are empty, returning empty changes list")
            return []

        # Reject strings that cannot be encoded as UTF-8 (e.g. lone surrogates)
        try:
            original.encode('utf-8')
            corrected.encode('utf-8')
        except UnicodeError as e:
            raise InputValidationError(f"Invalid text encoding: {e}") from e

        logger.debug(
            f"Extracting changes: original={len(original)} chars, "
            f"corrected={len(corrected)} chars"
        )

        # Identical texts cannot contain a replacement, so skip the diff.
        if original == corrected:
            logger.debug("Extracted 0 changes")
            return []

        # NOTE(review): SequenceMatcher runs with its default autojunk
        # heuristic, which can coarsen character-level diffs on long inputs.
        # Confirm this trade-off against runtime is intended.
        matcher = difflib.SequenceMatcher(None, original, corrected)
        changes: List[ExtractedChange] = []
        span = self._CONTEXT_CHARS

        for tag, i1, i2, j1, j2 in matcher.get_opcodes():
            if tag != 'replace':  # Only actual replacements (from→to)
                continue

            raw_from = original[i1:i2]
            raw_to = corrected[j1:j2]

            # Filter by length
            if not self._is_valid_change_length(raw_from, raw_to):
                continue
            # Filter formatting-only changes
            if self._is_formatting_only(raw_from, raw_to):
                continue

            # Trim once, then derive all metadata from the trimmed text.
            from_text = raw_from.strip()
            to_text = raw_to.strip()

            # Context comes from the original text around the replaced span;
            # slicing clamps at the end of the string on its own.
            context_before = original[max(0, i1 - span):i1]
            context_after = original[i2:i2 + span]

            change_type = self._classify_change(from_text, to_text)
            confidence = self._calculate_confidence(
                from_text, to_text, context_before, context_after
            )

            changes.append(ExtractedChange(
                from_text=from_text,
                to_text=to_text,
                context_before=context_before,
                context_after=context_after,
                position=i1,
                change_type=change_type,
                confidence=confidence
            ))

            # Prevent DoS from excessive changes
            if len(changes) >= MAX_CHANGES:
                logger.warning(
                    f"Reached maximum changes limit ({MAX_CHANGES}), stopping extraction"
                )
                break

        logger.debug(f"Extracted {len(changes)} changes")
        return changes

    def group_by_pattern(self, changes: List[ExtractedChange]) -> dict[Tuple[str, str], List[ExtractedChange]]:
        """
        Group changes by from→to pattern for frequency analysis

        Args:
            changes: List of ExtractedChange objects

        Returns:
            Dict mapping (from_text, to_text) to list of occurrences,
            with keys in first-seen order

        Raises:
            InputValidationError: If changes is not a list or contains an
                element that is not an ExtractedChange
        """
        if not isinstance(changes, list):
            raise InputValidationError(
                f"changes must be list, got {type(changes).__name__}"
            )

        grouped: dict[Tuple[str, str], List[ExtractedChange]] = {}
        for i, change in enumerate(changes):
            if not isinstance(change, ExtractedChange):
                raise InputValidationError(
                    f"changes[{i}] must be ExtractedChange, "
                    f"got {type(change).__name__}"
                )
            grouped.setdefault((change.from_text, change.to_text), []).append(change)

        logger.debug(f"Grouped {len(changes)} changes into {len(grouped)} patterns")
        return grouped

    def calculate_pattern_confidence(self, occurrences: List[ExtractedChange]) -> float:
        """
        Calculate overall confidence for a pattern based on multiple occurrences

        The score is a weighted sum of:
        - 0.5 x mean of the individual confidences
        - 0.3 x frequency factor (occurrence count / 5, capped at 1.0)
        - 0.2 x context diversity (distinct contexts / occurrence count)

        Args:
            occurrences: List of ExtractedChange objects for same pattern

        Returns:
            Confidence score rounded to 2 decimals; 0.0 for an empty list

        Raises:
            InputValidationError: If occurrences is not a list or contains an
                element that is not an ExtractedChange
        """
        if not isinstance(occurrences, list):
            raise InputValidationError(
                f"occurrences must be list, got {type(occurrences).__name__}"
            )

        # Handle empty list
        if not occurrences:
            return 0.0

        for i, occurrence in enumerate(occurrences):
            if not isinstance(occurrence, ExtractedChange):
                raise InputValidationError(
                    f"occurrences[{i}] must be ExtractedChange, "
                    f"got {type(occurrence).__name__}"
                )

        count = len(occurrences)  # > 0 here, so the divisions below are safe

        # Base confidence from individual changes
        avg_confidence = sum(c.confidence for c in occurrences) / count

        # Frequency boost (more occurrences = higher confidence, max at 5)
        frequency_factor = min(1.0, count / 5.0)

        # Context diversity (appears in different contexts = more reliable)
        unique_contexts = len({
            (c.context_before, c.context_after) for c in occurrences
        })
        diversity_factor = min(1.0, unique_contexts / count)

        # Combined confidence (weighted sum)
        final_confidence = (
            0.5 * avg_confidence +
            0.3 * frequency_factor +
            0.2 * diversity_factor
        )
        return round(final_confidence, 2)

    def _is_valid_change_length(self, from_text: str, to_text: str) -> bool:
        """Check that both sides, after trimming whitespace, are within the length range"""
        lower = self.min_change_length
        upper = self.max_change_length
        return (
            lower <= len(from_text.strip()) <= upper
            and lower <= len(to_text.strip()) <= upper
        )

    def _is_formatting_only(self, from_text: str, to_text: str) -> bool:
        """
        Check if change is formatting-only (whitespace, case)

        Returns True if we should ignore this change
        """
        # Remove all whitespace, then compare
        from_compact = ''.join(from_text.split())
        to_compact = ''.join(to_text.split())

        # Same after removing whitespace = formatting only
        if from_compact == to_compact:
            return True
        # Only case difference = formatting only
        return from_compact.lower() == to_compact.lower()

    def _classify_change(self, from_text: str, to_text: str) -> str:
        """
        Classify the type of change

        Both sides are whitespace-trimmed before any check, so a stray
        leading or trailing space does not turn a word into a 'phrase'.

        Returns:
            'phrase' if either side contains a space,
            'word' if both sides consist solely of word characters (this
            includes single letters, digits and CJK ideographs),
            'punctuation' for any other single-character pair,
            'mixed' otherwise
        """
        source = from_text.strip()
        target = to_text.strip()

        # Contains space = phrase
        if ' ' in source or ' ' in target:
            return 'phrase'

        # Word characters only = word. This is checked before the
        # single-character rule so that e.g. a one-ideograph homophone fix is
        # not labelled punctuation.
        if re.fullmatch(r'\w+', source) and re.fullmatch(r'\w+', target):
            return 'word'

        # Single non-word character on at least one side = punctuation
        if len(source) == 1 and len(target) == 1:
            return 'punctuation'

        return 'mixed'

    def _calculate_confidence(
        self,
        from_text: str,
        to_text: str,
        context_before: str,
        context_after: str
    ) -> float:
        """
        Calculate confidence score for this change

        The score is a weighted sum of:
        - 0.4 x length similarity (shorter length / longer length)
        - 0.3 x context amount (total context chars / 40, capped at 1.0)
        - 0.3 x CJK share (twice the fraction of characters in
          U+4E00-U+9FFF across both sides, capped at 1.0)

        Returns:
            Confidence score 0.0-1.0, rounded to 2 decimals
        """
        len_from = len(from_text)
        len_to = len(to_text)

        # Length similarity, guarded against division by zero
        if len_from == 0 and len_to == 0:
            # Both empty - shouldn't happen due to upstream filtering
            length_score = 1.0
        elif len_from == 0 or len_to == 0:
            # One empty - low confidence (major rewrite)
            length_score = 0.0
        else:
            length_score = min(len_from, len_to) / max(len_from, len_to)

        # Context clarity (longer context = less ambiguous)
        context_score = min(
            1.0,
            (len(context_before) + len(context_after)) / (2.0 * self._CONTEXT_CHARS)
        )

        # CJK character ratio (higher = likely homophone error)
        cjk_count = (
            len(self._CJK_PATTERN.findall(from_text)) +
            len(self._CJK_PATTERN.findall(to_text))
        )
        total_len = len_from + len_to
        if total_len == 0:
            chinese_score = 0.0
        else:
            chinese_score = min(1.0, (cjk_count / total_len) * 2)  # Boost for Chinese

        # Combined score (weighted)
        confidence = (
            0.4 * length_score +
            0.3 * context_score +
            0.3 * chinese_score
        )
        return round(confidence, 2)