#!/usr/bin/env python3 """ Test Example Extractor - Extract real usage examples from test files Analyzes test files to extract meaningful code examples showing: - Object instantiation with real parameters - Method calls with expected behaviors - Configuration examples - Setup patterns from fixtures/setUp() - Multi-step workflows from integration tests Supports 9 languages: - Python (AST-based, deep analysis) - JavaScript, TypeScript, Go, Rust, Java, C#, PHP, Ruby (regex-based) Example usage: # Extract from directory python test_example_extractor.py tests/ --language python # Extract from single file python test_example_extractor.py --file tests/test_scraper.py # JSON output python test_example_extractor.py tests/ --json > examples.json # Filter by confidence python test_example_extractor.py tests/ --min-confidence 0.7 """ from dataclasses import dataclass, field, asdict from typing import List, Dict, Optional, Literal, Set from pathlib import Path import ast import re import hashlib import logging import argparse import json import sys # Configure logging logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s') logger = logging.getLogger(__name__) # ============================================================================ # DATA MODELS # ============================================================================ @dataclass class TestExample: """Single extracted usage example from test code""" # Identity example_id: str # Unique hash of example test_name: str # Test function/method name category: Literal["instantiation", "method_call", "config", "setup", "workflow"] # Code code: str # Actual example code language: str # Programming language # Context description: str # What this demonstrates expected_behavior: str # Expected outcome from assertions # Source file_path: str line_start: int line_end: int # Quality complexity_score: float # 0-1 scale (higher = more complex/valuable) confidence: float # 0-1 scale (higher = more confident 
extraction) # Optional fields (must come after required fields) setup_code: Optional[str] = None # Required setup code tags: List[str] = field(default_factory=list) # ["pytest", "mock", "async"] dependencies: List[str] = field(default_factory=list) # Imported modules ai_analysis: Optional[Dict] = None # AI-generated analysis (C3.6) def to_dict(self) -> dict: """Convert to dictionary for JSON serialization""" return asdict(self) def to_markdown(self) -> str: """Convert to markdown format""" md = f"### {self.test_name}\n\n" md += f"**Category**: {self.category} \n" md += f"**Description**: {self.description} \n" if self.expected_behavior: md += f"**Expected**: {self.expected_behavior} \n" md += f"**Confidence**: {self.confidence:.2f} \n" if self.tags: md += f"**Tags**: {', '.join(self.tags)} \n" # Add AI analysis if available (C3.6) if self.ai_analysis: md += f"\n**🤖 AI Analysis:** \n" if self.ai_analysis.get('explanation'): md += f"*{self.ai_analysis['explanation']}* \n" if self.ai_analysis.get('best_practices'): md += f"**Best Practices:** {', '.join(self.ai_analysis['best_practices'])} \n" if self.ai_analysis.get('tutorial_group'): md += f"**Tutorial Group:** {self.ai_analysis['tutorial_group']} \n" md += f"\n```{self.language.lower()}\n" if self.setup_code: md += f"# Setup\n{self.setup_code}\n\n" md += f"{self.code}\n```\n\n" md += f"*Source: {self.file_path}:{self.line_start}*\n\n" return md @dataclass class ExampleReport: """Summary of test example extraction results""" total_examples: int examples_by_category: Dict[str, int] examples_by_language: Dict[str, int] examples: List[TestExample] avg_complexity: float high_value_count: int # confidence > 0.7 file_path: Optional[str] = None # If single file directory: Optional[str] = None # If directory def to_dict(self) -> dict: """Convert to dictionary for JSON serialization""" return { "total_examples": self.total_examples, "examples_by_category": self.examples_by_category, "examples_by_language": 
self.examples_by_language, "avg_complexity": self.avg_complexity, "high_value_count": self.high_value_count, "file_path": self.file_path, "directory": self.directory, "examples": [ex.to_dict() for ex in self.examples] } def to_markdown(self) -> str: """Convert to markdown format""" md = "# Test Example Extraction Report\n\n" md += f"**Total Examples**: {self.total_examples} \n" md += f"**High Value Examples** (confidence > 0.7): {self.high_value_count} \n" md += f"**Average Complexity**: {self.avg_complexity:.2f} \n" md += "\n## Examples by Category\n\n" for category, count in sorted(self.examples_by_category.items()): md += f"- **{category}**: {count}\n" md += "\n## Examples by Language\n\n" for language, count in sorted(self.examples_by_language.items()): md += f"- **{language}**: {count}\n" md += "\n## Extracted Examples\n\n" for example in sorted(self.examples, key=lambda x: x.confidence, reverse=True): md += example.to_markdown() return md # ============================================================================ # PYTHON TEST ANALYZER (AST-based) # ============================================================================ class PythonTestAnalyzer: """Deep AST-based test example extraction for Python""" def __init__(self): self.trivial_patterns = { 'assertTrue(True)', 'assertFalse(False)', 'assertEqual(1, 1)', 'assertIsNone(None)', 'assertIsNotNone(None)', } def extract(self, file_path: str, code: str) -> List[TestExample]: """Extract examples from Python test file""" examples = [] try: tree = ast.parse(code) except SyntaxError as e: logger.warning(f"Failed to parse {file_path}: {e}") return [] # Extract imports for dependency tracking imports = self._extract_imports(tree) # Find test classes (unittest.TestCase) for node in ast.walk(tree): if isinstance(node, ast.ClassDef): if self._is_test_class(node): examples.extend(self._extract_from_test_class( node, file_path, imports )) # Find test functions (pytest) elif isinstance(node, ast.FunctionDef): if 
self._is_test_function(node): examples.extend(self._extract_from_test_function( node, file_path, imports )) return examples def _extract_imports(self, tree: ast.AST) -> List[str]: """Extract imported modules""" imports = [] for node in ast.walk(tree): if isinstance(node, ast.Import): imports.extend([alias.name for alias in node.names]) elif isinstance(node, ast.ImportFrom): if node.module: imports.append(node.module) return imports def _is_test_class(self, node: ast.ClassDef) -> bool: """Check if class is a test class""" # unittest.TestCase pattern for base in node.bases: if isinstance(base, ast.Name) and 'Test' in base.id: return True elif isinstance(base, ast.Attribute) and base.attr == 'TestCase': return True return False def _is_test_function(self, node: ast.FunctionDef) -> bool: """Check if function is a test function""" # pytest pattern: starts with test_ if node.name.startswith('test_'): return True # Has @pytest.mark decorator for decorator in node.decorator_list: if isinstance(decorator, ast.Attribute): if 'pytest' in ast.unparse(decorator): return True return False def _extract_from_test_class( self, class_node: ast.ClassDef, file_path: str, imports: List[str] ) -> List[TestExample]: """Extract examples from unittest.TestCase class""" examples = [] # Extract setUp method if exists setup_code = self._extract_setup_method(class_node) # Process each test method for node in class_node.body: if isinstance(node, ast.FunctionDef) and node.name.startswith('test_'): examples.extend(self._analyze_test_body( node, file_path, imports, setup_code=setup_code )) return examples def _extract_from_test_function( self, func_node: ast.FunctionDef, file_path: str, imports: List[str] ) -> List[TestExample]: """Extract examples from pytest test function""" # Check for fixture parameters fixture_setup = self._extract_fixtures(func_node) return self._analyze_test_body( func_node, file_path, imports, setup_code=fixture_setup ) def _extract_setup_method(self, class_node: 
ast.ClassDef) -> Optional[str]: """Extract setUp method code""" for node in class_node.body: if isinstance(node, ast.FunctionDef) and node.name == 'setUp': return ast.unparse(node.body) return None def _extract_fixtures(self, func_node: ast.FunctionDef) -> Optional[str]: """Extract pytest fixture parameters""" if not func_node.args.args: return None # Skip 'self' parameter params = [arg.arg for arg in func_node.args.args if arg.arg != 'self'] if params: return f"# Fixtures: {', '.join(params)}" return None def _analyze_test_body( self, func_node: ast.FunctionDef, file_path: str, imports: List[str], setup_code: Optional[str] = None ) -> List[TestExample]: """Analyze test function body for extractable patterns""" examples = [] # Get docstring for description docstring = ast.get_docstring(func_node) or func_node.name.replace('_', ' ') # Detect tags tags = self._detect_tags(func_node, imports) # Extract different pattern categories # 1. Instantiation patterns instantiations = self._find_instantiations(func_node, file_path, docstring, setup_code, tags, imports) examples.extend(instantiations) # 2. Method calls with assertions method_calls = self._find_method_calls_with_assertions(func_node, file_path, docstring, setup_code, tags, imports) examples.extend(method_calls) # 3. Configuration dictionaries configs = self._find_config_dicts(func_node, file_path, docstring, setup_code, tags, imports) examples.extend(configs) # 4. 
Multi-step workflows (integration tests) workflows = self._find_workflows(func_node, file_path, docstring, setup_code, tags, imports) examples.extend(workflows) return examples def _detect_tags(self, func_node: ast.FunctionDef, imports: List[str]) -> List[str]: """Detect test tags (pytest, mock, async, etc.)""" tags = [] # Check decorators for decorator in func_node.decorator_list: decorator_str = ast.unparse(decorator).lower() if 'pytest' in decorator_str: tags.append('pytest') if 'mock' in decorator_str: tags.append('mock') if 'async' in decorator_str or func_node.name.startswith('test_async'): tags.append('async') # Check if using unittest if 'unittest' in imports: tags.append('unittest') # Check function body for mock usage func_str = ast.unparse(func_node).lower() if 'mock' in func_str or 'patch' in func_str: tags.append('mock') return list(set(tags)) def _find_instantiations( self, func_node: ast.FunctionDef, file_path: str, description: str, setup_code: Optional[str], tags: List[str], imports: List[str] ) -> List[TestExample]: """Find object instantiation patterns: obj = ClassName(...)""" examples = [] for node in ast.walk(func_node): if isinstance(node, ast.Assign): if isinstance(node.value, ast.Call): # Check if meaningful instantiation if self._is_meaningful_instantiation(node): code = ast.unparse(node) # Skip trivial or mock-only if len(code) < 20 or 'Mock()' in code: continue # Get class name class_name = self._get_class_name(node.value) example = TestExample( example_id=self._generate_id(code), test_name=func_node.name, category="instantiation", code=code, language="Python", description=f"Instantiate {class_name}: {description}", expected_behavior=self._extract_assertion_after(func_node, node), setup_code=setup_code, file_path=file_path, line_start=node.lineno, line_end=node.end_lineno or node.lineno, complexity_score=self._calculate_complexity(code), confidence=0.8, tags=tags, dependencies=imports ) examples.append(example) return examples def 
_find_method_calls_with_assertions( self, func_node: ast.FunctionDef, file_path: str, description: str, setup_code: Optional[str], tags: List[str], imports: List[str] ) -> List[TestExample]: """Find method calls followed by assertions""" examples = [] statements = func_node.body for i, stmt in enumerate(statements): # Look for method calls if isinstance(stmt, ast.Expr) and isinstance(stmt.value, ast.Call): # Check if next statement is an assertion if i + 1 < len(statements): next_stmt = statements[i + 1] if self._is_assertion(next_stmt): method_call = ast.unparse(stmt) assertion = ast.unparse(next_stmt) code = f"{method_call}\n{assertion}" # Skip trivial assertions if any(trivial in assertion for trivial in self.trivial_patterns): continue example = TestExample( example_id=self._generate_id(code), test_name=func_node.name, category="method_call", code=code, language="Python", description=description, expected_behavior=assertion, setup_code=setup_code, file_path=file_path, line_start=stmt.lineno, line_end=next_stmt.end_lineno or next_stmt.lineno, complexity_score=self._calculate_complexity(code), confidence=0.85, tags=tags, dependencies=imports ) examples.append(example) return examples def _find_config_dicts( self, func_node: ast.FunctionDef, file_path: str, description: str, setup_code: Optional[str], tags: List[str], imports: List[str] ) -> List[TestExample]: """Find configuration dictionary patterns""" examples = [] for node in ast.walk(func_node): if isinstance(node, ast.Assign) and isinstance(node.value, ast.Dict): # Must have 2+ keys and be meaningful if len(node.value.keys) >= 2: code = ast.unparse(node) # Check if looks like configuration if self._is_config_dict(node.value): example = TestExample( example_id=self._generate_id(code), test_name=func_node.name, category="config", code=code, language="Python", description=f"Configuration example: {description}", expected_behavior=self._extract_assertion_after(func_node, node), setup_code=setup_code, 
file_path=file_path, line_start=node.lineno, line_end=node.end_lineno or node.lineno, complexity_score=self._calculate_complexity(code), confidence=0.75, tags=tags, dependencies=imports ) examples.append(example) return examples def _find_workflows( self, func_node: ast.FunctionDef, file_path: str, description: str, setup_code: Optional[str], tags: List[str], imports: List[str] ) -> List[TestExample]: """Find multi-step workflow patterns (integration tests)""" examples = [] # Check if this looks like an integration test (3+ meaningful steps) if len(func_node.body) >= 3 and self._is_integration_test(func_node): # Extract the full workflow code = ast.unparse(func_node.body) # Skip if too long (> 30 lines) if code.count('\n') > 30: return examples example = TestExample( example_id=self._generate_id(code), test_name=func_node.name, category="workflow", code=code, language="Python", description=f"Workflow: {description}", expected_behavior=self._extract_final_assertion(func_node), setup_code=setup_code, file_path=file_path, line_start=func_node.lineno, line_end=func_node.end_lineno or func_node.lineno, complexity_score=min(1.0, len(func_node.body) / 10), confidence=0.9, tags=tags + ['workflow', 'integration'], dependencies=imports ) examples.append(example) return examples # Helper methods def _is_meaningful_instantiation(self, node: ast.Assign) -> bool: """Check if instantiation has meaningful parameters""" if not isinstance(node.value, ast.Call): return False # Must have at least one argument or keyword argument call = node.value if call.args or call.keywords: return True return False def _get_class_name(self, call_node: ast.Call) -> str: """Extract class name from Call node""" if isinstance(call_node.func, ast.Name): return call_node.func.id elif isinstance(call_node.func, ast.Attribute): return call_node.func.attr return "UnknownClass" def _is_assertion(self, node: ast.stmt) -> bool: """Check if statement is an assertion""" if isinstance(node, ast.Assert): return 
True if isinstance(node, ast.Expr) and isinstance(node.value, ast.Call): call_str = ast.unparse(node.value).lower() assertion_methods = ['assert', 'expect', 'should'] return any(method in call_str for method in assertion_methods) return False def _is_config_dict(self, dict_node: ast.Dict) -> bool: """Check if dictionary looks like configuration""" # Keys should be strings for key in dict_node.keys: if not isinstance(key, ast.Constant) or not isinstance(key.value, str): return False return True def _is_integration_test(self, func_node: ast.FunctionDef) -> bool: """Check if test looks like an integration test""" test_name = func_node.name.lower() integration_keywords = ['workflow', 'integration', 'end_to_end', 'e2e', 'full'] return any(keyword in test_name for keyword in integration_keywords) def _extract_assertion_after(self, func_node: ast.FunctionDef, target_node: ast.AST) -> str: """Find assertion that follows the target node""" found_target = False for stmt in func_node.body: if stmt == target_node: found_target = True continue if found_target and self._is_assertion(stmt): return ast.unparse(stmt) return "" def _extract_final_assertion(self, func_node: ast.FunctionDef) -> str: """Extract the final assertion from test""" for stmt in reversed(func_node.body): if self._is_assertion(stmt): return ast.unparse(stmt) return "" def _calculate_complexity(self, code: str) -> float: """Calculate code complexity score (0-1)""" # Simple heuristic: more lines + more parameters = more complex lines = code.count('\n') + 1 params = code.count(',') + 1 complexity = min(1.0, (lines * 0.1) + (params * 0.05)) return round(complexity, 2) def _generate_id(self, code: str) -> str: """Generate unique ID for example""" return hashlib.md5(code.encode()).hexdigest()[:8] # ============================================================================ # GENERIC TEST ANALYZER (Regex-based for non-Python languages) # ============================================================================ 
class GenericTestAnalyzer:
    """Regex-based test example extraction for non-Python languages"""

    # Display names / shorthands mapped to the keys used in PATTERNS.
    LANGUAGE_ALIASES = {
        "c#": "csharp",
        "cs": "csharp",
        "js": "javascript",
        "ts": "typescript",
    }

    # Language-specific regex patterns
    PATTERNS = {
        "javascript": {
            "instantiation": r'(?:const|let|var)\s+(\w+)\s*=\s*new\s+(\w+)\(([^)]*)\)',
            "assertion": r'expect\(([^)]+)\)\.to(?:Equal|Be|Match)\(([^)]+)\)',
            # \b keeps calls such as emit('x') / submit('x') from matching.
            "test_function": r'\b(?:test|it)\(["\']([^"\']+)["\']',
            "config": r'(?:const|let)\s+config\s*=\s*\{[\s\S]{20,500}?\}',
        },
        "typescript": {
            # The ": Type" annotation is optional (inferred types are common).
            "instantiation": r'(?:const|let|var)\s+(\w+)(?::\s*\w+)?\s*=\s*new\s+(\w+)\(([^)]*)\)',
            "assertion": r'expect\(([^)]+)\)\.to(?:Equal|Be|Match)\(([^)]+)\)',
            "test_function": r'\b(?:test|it)\(["\']([^"\']+)["\']',
            "config": r'(?:const|let)\s+config(?::\s*\w+)?\s*=\s*\{[\s\S]{20,500}?\}',
        },
        "go": {
            "instantiation": r'(\w+)\s*:=\s*(\w+)\{([^}]+)\}',
            "assertion": r't\.(?:Error|Fatal)(?:f)?\(["\']([^"\']+)["\']',
            "test_function": r'func\s+(Test\w+)\(t\s+\*testing\.T\)',
            "table_test": r'tests\s*:=\s*\[\]struct\s*\{[\s\S]{50,1000}?\}',
        },
        "rust": {
            "instantiation": r'let\s+(\w+)\s*=\s*(\w+)::new\(([^)]*)\)',
            "assertion": r'assert(?:_eq)?!\(([^)]+)\)',
            "test_function": r'#\[test\]\s*fn\s+(\w+)\(\)',
        },
        "java": {
            "instantiation": r'(\w+)\s+(\w+)\s*=\s*new\s+(\w+)\(([^)]*)\)',
            "assertion": r'assert(?:Equals|True|False|NotNull)\(([^)]+)\)',
            "test_function": r'@Test\s+public\s+void\s+(\w+)\(\)',
        },
        "csharp": {
            "instantiation": r'var\s+(\w+)\s*=\s*new\s+(\w+)\(([^)]*)\)',
            "assertion": r'Assert\.(?:AreEqual|IsTrue|IsFalse|IsNotNull)\(([^)]+)\)',
            "test_function": r'\[Test\]\s+public\s+void\s+(\w+)\(\)',
        },
        "php": {
            "instantiation": r'\$(\w+)\s*=\s*new\s+(\w+)\(([^)]*)\)',
            "assertion": r'\$this->assert(?:Equals|True|False|NotNull)\(([^)]+)\)',
            "test_function": r'public\s+function\s+(test\w+)\(\)',
        },
        "ruby": {
            "instantiation": r'(\w+)\s*=\s*(\w+)\.new\(([^)]*)\)',
            "assertion": r'expect\(([^)]+)\)\.to\s+(?:eq|be|match)\(([^)]+)\)',
            "test_function": r'\b(?:test|it)\s+["\']([^"\']+)["\']',
        }
    }

    @classmethod
    def _pattern_key(cls, language: str) -> str:
        """Map a language name (e.g. ``"C#"``) to its lower-case PATTERNS key
        (``"csharp"``); names without an alias are only lower-cased."""
        key = language.strip().lower()
        return cls.LANGUAGE_ALIASES.get(key, key)

    def extract(self, file_path: str, code: str, language: str) -> "List[TestExample]":
        """Extract examples from test file using regex patterns.

        The body of a test is approximated as the text between the end of its
        ``test_function`` match and the start of the next one (or the end of
        the file). Only the ``instantiation`` and ``config`` patterns are
        applied to that body.

        Args:
            file_path: Path recorded on every example (the file is not read).
            code: Source text of the test file.
            language: Language name; display names such as ``"C#"`` are
                accepted and stored unchanged on the examples.

        Returns:
            Per test: its instantiation examples, then its config examples.
            An empty list for unsupported languages.
        """
        patterns = self.PATTERNS.get(self._pattern_key(language))
        if patterns is None:
            logger.warning(f"Language {language} not supported for regex extraction")
            return []

        # Compile once; "config" exists for some languages only.
        body_patterns = [
            (category, re.compile(patterns[category]))
            for category in ("instantiation", "config")
            if category in patterns
        ]

        # All test starts are collected up front so each body ends where the
        # next test begins, without re-scanning the source for every test.
        test_matches = list(re.finditer(patterns["test_function"], code))
        examples = []

        for index, match in enumerate(test_matches):
            test_name = match.group(1)
            start_pos = match.end()
            if index + 1 < len(test_matches):
                end_pos = test_matches[index + 1].start()
            else:
                end_pos = len(code)
            test_body = code[start_pos:end_pos]

            for category, regex in body_patterns:
                for found in regex.finditer(test_body):
                    examples.append(self._create_example(
                        test_name=test_name,
                        category=category,
                        code=found.group(0),
                        language=language,
                        file_path=file_path,
                        # 1-based line of the match within the whole file
                        line_number=code.count('\n', 0, start_pos + found.start()) + 1
                    ))

        return examples

    def _create_example(
        self,
        test_name: str,
        category: str,
        code: str,
        language: str,
        file_path: str,
        line_number: int
    ) -> "TestExample":
        """Create TestExample from regex match"""
        newline_count = code.count('\n')
        return TestExample(
            example_id=hashlib.md5(code.encode()).hexdigest()[:8],
            test_name=test_name,
            category=category,
            code=code,
            language=language,
            description=f"Test: {test_name}",
            expected_behavior="",
            file_path=file_path,
            line_start=line_number,
            line_end=line_number + newline_count,
            complexity_score=min(1.0, (newline_count + 1) * 0.1),
            confidence=0.6,  # Lower confidence for regex extraction
            tags=[],
            dependencies=[]
        )


# ============================================================================
# EXAMPLE QUALITY FILTER
# ============================================================================

class ExampleQualityFilter:
    """Filter out trivial or low-quality examples"""

    # Patterns that are only trivial as a whole line; as substrings they
    # would also reject "password", "bypass" or a JavaScript "...spread".
    _WHOLE_LINE_PATTERNS = frozenset({'pass', '...'})

    def __init__(self, min_confidence: float = 0.7, min_code_length: int = 20):
        self.min_confidence = min_confidence
        self.min_code_length = min_code_length

        # Trivial patterns to exclude
        self.trivial_patterns = [
            'Mock()',
            'MagicMock()',
            'assertTrue(True)',
            'assertFalse(False)',
            'assertEqual(1, 1)',
            'pass',
            '...',
        ]

    def filter(self, examples: "List[TestExample]") -> "List[TestExample]":
        """Return the examples that meet the confidence threshold, the
        minimum code length and contain no trivial pattern (input order is
        kept)."""
        return [
            example for example in examples
            if example.confidence >= self.min_confidence
            and len(example.code) >= self.min_code_length
            and not self._is_trivial(example.code)
        ]

    def _is_trivial(self, code: str) -> bool:
        """Check if code contains trivial patterns.

        ``pass`` and ``...`` must make up a whole (stripped) line; every other
        pattern matches anywhere in the code.
        """
        stripped_lines = {line.strip() for line in code.splitlines()}
        for pattern in self.trivial_patterns:
            if pattern in self._WHOLE_LINE_PATTERNS:
                if pattern in stripped_lines:
                    return True
            elif pattern in code:
                return True
        return False


# ============================================================================
# TEST EXAMPLE EXTRACTOR (Main Orchestrator)
# ============================================================================

class TestExampleExtractor:
    """Main orchestrator for test example extraction"""

    # The class name starts with "Test"; tell pytest not to try collecting it
    # when this module is imported into a test module.
    __test__ = False

    # Test file patterns
    TEST_PATTERNS = [
        'test_*.py',
        '*_test.py',
        'test*.js',
        '*test.js',
        'test*.ts',
        '*test.ts',
        '*_test.go',
        '*_test.rs',
        'Test*.java',
        'Test*.cs',
        '*Test.php',
        '*_spec.rb',
    ]

    # Language detection by extension
    LANGUAGE_MAP = {
        '.py': 'Python',
        '.js': 'JavaScript',
        '.ts': 'TypeScript',
        '.go': 'Go',
        '.rs': 'Rust',
        '.java': 'Java',
        '.cs': 'C#',
        '.php': 'PHP',
        '.rb': 'Ruby',
    }

    def __init__(
        self,
        min_confidence: float = 0.7,
        max_per_file: int = 10,
        languages: Optional[List[str]] = None,
        enhance_with_ai: bool = True
    ):
        """Set up the analyzers, the quality filter and (optionally) the AI
        enhancer.

        Args:
            min_confidence: Threshold handed to the quality filter.
            max_per_file: Maximum number of examples kept per file.
            languages: Language names to keep; None keeps every language.
            enhance_with_ai: Try to load the AI enhancer. When it cannot be
                initialized a warning is logged and enhancement is disabled.
        """
        self.python_analyzer = PythonTestAnalyzer()
        self.generic_analyzer = GenericTestAnalyzer()
        self.quality_filter = ExampleQualityFilter(min_confidence=min_confidence)
        self.max_per_file = max_per_file
        self.languages = [lang.lower() for lang in languages] if languages else None
        self.enhance_with_ai = enhance_with_ai

        # Initialize AI enhancer if enabled (C3.6)
        self.ai_enhancer = None
        if self.enhance_with_ai:
            try:
                from skill_seekers.cli.ai_enhancer import TestExampleEnhancer
                self.ai_enhancer = TestExampleEnhancer()
            except Exception as e:
                logger.warning(f"⚠️ Failed to initialize AI enhancer: {e}")
                self.enhance_with_ai = False

    def extract_from_directory(
        self,
        directory: Path,
        recursive: bool = True
    ) -> "ExampleReport":
        """Extract examples from all test files in directory.

        Raises:
            FileNotFoundError: If *directory* does not exist.
        """
        directory = Path(directory)
        if not directory.exists():
            raise FileNotFoundError(f"Directory not found: {directory}")

        # Find test files
        test_files = self._find_test_files(directory, recursive)
        logger.info(f"Found {len(test_files)} test files in {directory}")

        # Extract from each file
        all_examples = []
        for test_file in test_files:
            all_examples.extend(self.extract_from_file(test_file))

        # Generate report
        return self._create_report(all_examples, directory=str(directory))

    def extract_from_file(self, file_path: Path) -> "List[TestExample]":
        """Extract examples from single test file.

        Returns:
            The filtered examples, capped at ``max_per_file`` (highest
            confidence first when the cap applies). An empty list when the
            language is filtered out or the file cannot be read.

        Raises:
            FileNotFoundError: If *file_path* does not exist.
        """
        file_path = Path(file_path)
        if not file_path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")

        # Detect language
        language = self._detect_language(file_path)

        # Filter by language if specified. Both sides go through the alias
        # table so that "C#", "c#" and "csharp" all select .cs files.
        if self.languages:
            wanted = {GenericTestAnalyzer._pattern_key(lang) for lang in self.languages}
            if GenericTestAnalyzer._pattern_key(language) not in wanted:
                return []

        # Read file
        try:
            code = file_path.read_text(encoding='utf-8')
        except UnicodeDecodeError:
            logger.warning(f"Failed to read {file_path} (encoding error)")
            return []
        except OSError as e:
            # One unreadable file must not abort a whole directory scan.
            logger.warning(f"Failed to read {file_path}: {e}")
            return []

        # Extract examples based on language
        if language == 'Python':
            examples = self.python_analyzer.extract(str(file_path), code)
        else:
            examples = self.generic_analyzer.extract(str(file_path), code, language)

        # Apply quality filter
        filtered_examples = self.quality_filter.filter(examples)

        # Limit per file: keep the top N by confidence
        if len(filtered_examples) > self.max_per_file:
            filtered_examples = sorted(
                filtered_examples,
                key=lambda x: x.confidence,
                reverse=True
            )[:self.max_per_file]

        logger.info(f"Extracted {len(filtered_examples)} examples from {file_path.name}")
        return filtered_examples

    def _find_test_files(self, directory: Path, recursive: bool) -> List[Path]:
        """Find test files in directory.

        Returns:
            Regular files matching any of TEST_PATTERNS, de-duplicated and
            sorted so that results are stable between runs.
        """
        search = directory.rglob if recursive else directory.glob
        matches = {
            path
            for pattern in self.TEST_PATTERNS
            for path in search(pattern)
            if path.is_file()
        }
        return sorted(matches)

    def _detect_language(self, file_path: Path) -> str:
        """Detect programming language from file extension (``'Unknown'``
        for extensions missing from LANGUAGE_MAP)."""
        return self.LANGUAGE_MAP.get(file_path.suffix.lower(), 'Unknown')

    def _create_report(
        self,
        examples: "List[TestExample]",
        file_path: Optional[str] = None,
        directory: Optional[str] = None
    ) -> "ExampleReport":
        """Create summary report from examples.

        When AI enhancement is active, each example receives the
        ``ai_analysis`` returned at the same position by the enhancer.
        """
        # Enhance examples with AI analysis (C3.6)
        if self.enhance_with_ai and self.ai_enhancer and examples:
            # Convert examples to dict format for AI processing
            example_dicts = [ex.to_dict() for ex in examples]
            enhanced_dicts = self.ai_enhancer.enhance_examples(example_dicts)

            # zip() stops at the shorter sequence, so a short reply from the
            # enhancer leaves the remaining examples untouched.
            for example, enhanced in zip(examples, enhanced_dicts):
                if 'ai_analysis' in enhanced:
                    example.ai_analysis = enhanced['ai_analysis']

        # Count by category and by language
        examples_by_category = {}
        examples_by_language = {}
        for example in examples:
            examples_by_category[example.category] = \
                examples_by_category.get(example.category, 0) + 1
            examples_by_language[example.language] = \
                examples_by_language.get(example.language, 0) + 1

        # Calculate averages
        if examples:
            avg_complexity = sum(ex.complexity_score for ex in examples) / len(examples)
        else:
            avg_complexity = 0.0
        high_value_count = sum(1 for ex in examples if ex.confidence > 0.7)

        return ExampleReport(
            total_examples=len(examples),
            examples_by_category=examples_by_category,
            examples_by_language=examples_by_language,
            examples=examples,
            avg_complexity=round(avg_complexity, 2),
            high_value_count=high_value_count,
            file_path=file_path,
            directory=directory
        )


# ============================================================================
# COMMAND-LINE INTERFACE
# ============================================================================

def main():
    """Main entry point for CLI"""
    parser = argparse.ArgumentParser(
        description='Extract usage examples from test files',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Extract from directory
  %(prog)s tests/ --language python

  # Extract from single file
  %(prog)s --file tests/test_scraper.py

  # JSON output
  %(prog)s tests/ --json > examples.json

  # Filter by confidence
  %(prog)s tests/ --min-confidence 0.7
"""
    )

    parser.add_argument(
        'directory',
        nargs='?',
        help='Directory containing test files'
    )
    parser.add_argument(
        '--file',
        help='Single test file to analyze'
    )
    parser.add_argument(
        '--language',
        help='Filter by programming language (python, javascript, etc.)'
    )
    parser.add_argument(
        '--min-confidence',
        type=float,
        default=0.5,
        help='Minimum confidence threshold (0.0-1.0, default: 0.5)'
    )
    parser.add_argument(
        '--max-per-file',
        type=int,
        default=10,
        help='Maximum examples per file (default: 10)'
    )
    parser.add_argument(
        '--json',
        action='store_true',
        help='Output JSON format'
    )
    parser.add_argument(
        '--markdown',
        action='store_true',
        help='Output Markdown format'
    )
    parser.add_argument(
        '--recursive',
        action='store_true',
        default=True,
        help='Search directory recursively (default: True)'
    )
    # --recursive defaults to True, so a separate switch is the only way to
    # turn recursion off.
    parser.add_argument(
        '--no-recursive',
        dest='recursive',
        action='store_false',
        help='Only search the top level of the directory'
    )

    args = parser.parse_args()

    # Validate arguments
    if not args.directory and not args.file:
        parser.error("Either directory or --file must be specified")
    if not 0.0 <= args.min_confidence <= 1.0:
        parser.error("--min-confidence must be between 0.0 and 1.0")

    # Create extractor
    languages = [args.language] if args.language else None
    extractor = TestExampleExtractor(
        min_confidence=args.min_confidence,
        max_per_file=args.max_per_file,
        languages=languages
    )

    # Extract examples (--file takes precedence over the directory argument)
    try:
        if args.file:
            examples = extractor.extract_from_file(Path(args.file))
            report = extractor._create_report(examples, file_path=args.file)
        else:
            report = extractor.extract_from_directory(
                Path(args.directory),
                recursive=args.recursive
            )
    except FileNotFoundError as e:
        # Report a missing path as a usage error instead of a traceback.
        parser.error(str(e))

    # Output results
    if args.json:
        print(json.dumps(report.to_dict(), indent=2))
    elif args.markdown:
        print(report.to_markdown())
    else:
        # Human-readable summary
        print("\nTest Example Extraction Results")
        print("=" * 50)
        print(f"Total Examples: {report.total_examples}")
        print(f"High Value (confidence > 0.7): {report.high_value_count}")
        print(f"Average Complexity: {report.avg_complexity:.2f}")
        print("\nExamples by Category:")
        for category, count in sorted(report.examples_by_category.items()):
            print(f" {category}: {count}")
        print("\nExamples by Language:")
        for language, count in sorted(report.examples_by_language.items()):
            print(f" {language}: {count}")
        print("\nUse --json or --markdown for detailed output")


if __name__ == '__main__':
    main()