#!/usr/bin/env python3 """ Change Extractor - Extract Precise From→To Changes CRITICAL FEATURE: Extract specific corrections from AI results for learning This enables the learning loop: 1. AI makes corrections → Extract specific from→to pairs 2. High-frequency patterns → Auto-add to dictionary 3. Next run → Dictionary handles learned patterns (free) 4. Progressive cost reduction → System gets smarter with use CRITICAL FIX (P1-2): Comprehensive input validation - Prevents DoS attacks from oversized input - Type checking for all parameters - Range validation for numeric arguments - Protection against malicious input """ from __future__ import annotations import difflib import logging import re from dataclasses import dataclass from typing import List, Tuple, Final logger = logging.getLogger(__name__) # Security limits for DoS prevention MAX_TEXT_LENGTH: Final[int] = 1_000_000 # 1MB of text MAX_CHANGES: Final[int] = 10_000 # Maximum changes to extract class InputValidationError(ValueError): """Raised when input validation fails""" pass @dataclass class ExtractedChange: """Represents a specific from→to change extracted from AI results""" from_text: str to_text: str context_before: str # 20 chars before context_after: str # 20 chars after position: int # Character position in original change_type: str # 'word', 'phrase', 'punctuation' confidence: float # 0.0-1.0 based on context consistency def __hash__(self): """Allow use in sets for deduplication""" return hash((self.from_text, self.to_text)) def __eq__(self, other): """Equality based on from/to text""" return (self.from_text == other.from_text and self.to_text == other.to_text) class ChangeExtractor: """ Extract precise from→to changes from before/after text pairs Strategy: 1. Use difflib.SequenceMatcher for accurate diff 2. Filter out formatting-only changes 3. Extract context for confidence scoring 4. Classify change types 5. Calculate confidence based on consistency """ def __init__(self, min_change_length: int = 1, max_change_length: int = 50): """ Initialize extractor Args: min_change_length: Ignore changes shorter than this (chars) - Helps filter noise like single punctuation - Must be >= 1 max_change_length: Ignore changes longer than this (chars) - Helps filter large rewrites (not corrections) - Must be > min_change_length Raises: InputValidationError: If parameters are invalid CRITICAL FIX (P1-2): Added comprehensive parameter validation """ # CRITICAL FIX: Validate parameter types if not isinstance(min_change_length, int): raise InputValidationError( f"min_change_length must be int, got {type(min_change_length).__name__}" ) if not isinstance(max_change_length, int): raise InputValidationError( f"max_change_length must be int, got {type(max_change_length).__name__}" ) # CRITICAL FIX: Validate parameter ranges if min_change_length < 1: raise InputValidationError( f"min_change_length must be >= 1, got {min_change_length}" ) if max_change_length < 1: raise InputValidationError( f"max_change_length must be >= 1, got {max_change_length}" ) # CRITICAL FIX: Validate logical consistency if min_change_length > max_change_length: raise InputValidationError( f"min_change_length ({min_change_length}) must be <= " f"max_change_length ({max_change_length})" ) # CRITICAL FIX: Validate reasonable upper bounds (DoS prevention) if max_change_length > 1000: logger.warning( f"Large max_change_length ({max_change_length}) may impact performance" ) self.min_change_length = min_change_length self.max_change_length = max_change_length logger.debug( f"ChangeExtractor initialized: min={min_change_length}, max={max_change_length}" ) def extract_changes(self, original: str, corrected: str) -> List[ExtractedChange]: """ Extract all from→to changes between original and corrected text Args: original: Original text (before correction) corrected: Corrected text (after AI processing) Returns: List of ExtractedChange objects with context and confidence Raises: InputValidationError: If input validation fails CRITICAL FIX (P1-2): Comprehensive input validation to prevent: - DoS attacks from oversized input - Crashes from None/invalid input - Performance issues from malicious input """ # CRITICAL FIX: Validate input types if not isinstance(original, str): raise InputValidationError( f"original must be str, got {type(original).__name__}" ) if not isinstance(corrected, str): raise InputValidationError( f"corrected must be str, got {type(corrected).__name__}" ) # CRITICAL FIX: Validate input length (DoS prevention) if len(original) > MAX_TEXT_LENGTH: raise InputValidationError( f"original text too long ({len(original)} chars). " f"Maximum allowed: {MAX_TEXT_LENGTH}" ) if len(corrected) > MAX_TEXT_LENGTH: raise InputValidationError( f"corrected text too long ({len(corrected)} chars). " f"Maximum allowed: {MAX_TEXT_LENGTH}" ) # CRITICAL FIX: Handle empty strings gracefully if not original and not corrected: logger.debug("Both texts are empty, returning empty changes list") return [] # CRITICAL FIX: Validate text contains valid characters (not binary data) try: # Try to encode/decode to ensure valid text original.encode('utf-8') corrected.encode('utf-8') except UnicodeError as e: raise InputValidationError(f"Invalid text encoding: {e}") from e logger.debug( f"Extracting changes: original={len(original)} chars, " f"corrected={len(corrected)} chars" ) matcher = difflib.SequenceMatcher(None, original, corrected) changes = [] for tag, i1, i2, j1, j2 in matcher.get_opcodes(): if tag == 'replace': # Actual replacement (from→to) from_text = original[i1:i2] to_text = corrected[j1:j2] # Filter by length if not self._is_valid_change_length(from_text, to_text): continue # Filter formatting-only changes if self._is_formatting_only(from_text, to_text): continue # Extract context context_before = original[max(0, i1-20):i1] context_after = original[i2:min(len(original), i2+20)] # Classify change type change_type = self._classify_change(from_text, to_text) # Calculate confidence (based on text similarity and context) confidence = self._calculate_confidence( from_text, to_text, context_before, context_after ) changes.append(ExtractedChange( from_text=from_text.strip(), to_text=to_text.strip(), context_before=context_before, context_after=context_after, position=i1, change_type=change_type, confidence=confidence )) # CRITICAL FIX: Prevent DoS from excessive changes if len(changes) >= MAX_CHANGES: logger.warning( f"Reached maximum changes limit ({MAX_CHANGES}), stopping extraction" ) break logger.debug(f"Extracted {len(changes)} changes") return changes def group_by_pattern(self, changes: List[ExtractedChange]) -> dict[Tuple[str, str], List[ExtractedChange]]: """ Group changes by from→to pattern for frequency analysis Args: changes: List of ExtractedChange objects Returns: Dict mapping (from_text, to_text) to list of occurrences Raises: InputValidationError: If input is invalid CRITICAL FIX (P1-2): Added input validation """ # CRITICAL FIX: Validate input type if not isinstance(changes, list): raise InputValidationError( f"changes must be list, got {type(changes).__name__}" ) # CRITICAL FIX: Validate list elements grouped = {} for i, change in enumerate(changes): if not isinstance(change, ExtractedChange): raise InputValidationError( f"changes[{i}] must be ExtractedChange, " f"got {type(change).__name__}" ) key = (change.from_text, change.to_text) if key not in grouped: grouped[key] = [] grouped[key].append(change) logger.debug(f"Grouped {len(changes)} changes into {len(grouped)} patterns") return grouped def calculate_pattern_confidence(self, occurrences: List[ExtractedChange]) -> float: """ Calculate overall confidence for a pattern based on multiple occurrences Higher confidence if: - Appears in different contexts - Consistent across occurrences - Not ambiguous (one from → multiple to) Args: occurrences: List of ExtractedChange objects for same pattern Returns: Confidence score 0.0-1.0 Raises: InputValidationError: If input is invalid CRITICAL FIX (P1-2): Added input validation """ # CRITICAL FIX: Validate input type if not isinstance(occurrences, list): raise InputValidationError( f"occurrences must be list, got {type(occurrences).__name__}" ) # Handle empty list if not occurrences: return 0.0 # CRITICAL FIX: Validate list elements for i, occurrence in enumerate(occurrences): if not isinstance(occurrence, ExtractedChange): raise InputValidationError( f"occurrences[{i}] must be ExtractedChange, " f"got {type(occurrence).__name__}" ) # Base confidence from individual changes (safe division - len > 0) avg_confidence = sum(c.confidence for c in occurrences) / len(occurrences) # Frequency boost (more occurrences = higher confidence) frequency_factor = min(1.0, len(occurrences) / 5.0) # Max at 5 occurrences # Context diversity (appears in different contexts = more reliable) unique_contexts = len(set( (c.context_before, c.context_after) for c in occurrences )) diversity_factor = min(1.0, unique_contexts / len(occurrences)) # Combined confidence (weighted average) final_confidence = ( 0.5 * avg_confidence + 0.3 * frequency_factor + 0.2 * diversity_factor ) return round(final_confidence, 2) def _is_valid_change_length(self, from_text: str, to_text: str) -> bool: """Check if change is within valid length range""" from_len = len(from_text.strip()) to_len = len(to_text.strip()) # Both must be within range if from_len < self.min_change_length or from_len > self.max_change_length: return False if to_len < self.min_change_length or to_len > self.max_change_length: return False return True def _is_formatting_only(self, from_text: str, to_text: str) -> bool: """ Check if change is formatting-only (whitespace, case) Returns True if we should ignore this change """ # Strip whitespace and compare from_stripped = ''.join(from_text.split()) to_stripped = ''.join(to_text.split()) # Same after stripping whitespace = formatting only if from_stripped == to_stripped: return True # Only case difference = formatting only if from_stripped.lower() == to_stripped.lower(): return True return False def _classify_change(self, from_text: str, to_text: str) -> str: """ Classify the type of change Returns: 'word', 'phrase', 'punctuation', 'mixed' """ # Single character = punctuation or letter if len(from_text.strip()) == 1 and len(to_text.strip()) == 1: return 'punctuation' # Contains space = phrase if ' ' in from_text or ' ' in to_text: return 'phrase' # Single word if re.match(r'^\w+$', from_text) and re.match(r'^\w+$', to_text): return 'word' return 'mixed' def _calculate_confidence( self, from_text: str, to_text: str, context_before: str, context_after: str ) -> float: """ Calculate confidence score for this change Higher confidence if: - Similar length (likely homophone, not rewrite) - Clear context (not ambiguous) - Common error pattern (e.g., Chinese homophones) Returns: Confidence score 0.0-1.0 CRITICAL FIX (P1-2): Division by zero prevention """ # CRITICAL FIX: Length similarity (prevent division by zero) len_from = len(from_text) len_to = len(to_text) if len_from == 0 and len_to == 0: # Both empty - shouldn't happen due to upstream filtering, but handle it length_score = 1.0 elif len_from == 0 or len_to == 0: # One empty - low confidence (major rewrite) length_score = 0.0 else: # Normal case: calculate ratio safely len_ratio = min(len_from, len_to) / max(len_from, len_to) length_score = len_ratio # Context clarity (longer context = less ambiguous) context_score = min(1.0, (len(context_before) + len(context_after)) / 40.0) # Chinese character ratio (higher = likely homophone error) chinese_chars_from = len(re.findall(r'[\u4e00-\u9fff]', from_text)) chinese_chars_to = len(re.findall(r'[\u4e00-\u9fff]', to_text)) # CRITICAL FIX: Prevent division by zero total_len = len_from + len_to if total_len == 0: chinese_score = 0.0 else: chinese_ratio = (chinese_chars_from + chinese_chars_to) / total_len chinese_score = min(1.0, chinese_ratio * 2) # Boost for Chinese # Combined score (weighted) confidence = ( 0.4 * length_score + 0.3 * context_score + 0.3 * chinese_score ) return round(confidence, 2)