Adds full C3.x pipeline support for Kotlin (.kt, .kts): - Language detection patterns (40+ weighted patterns for data/sealed classes, coroutines, companion objects, KMP, etc.) - AST regex parser in code_analyzer.py (classes, objects, functions, extension functions, suspend functions) - Dependency extraction for Kotlin import statements (with alias support) - Design pattern adaptations (object→Singleton, companion→Factory, sealed→Strategy, data→Builder, Flow→Observer) - Test example extraction for JUnit 4/5, Kotest, MockK, Spek - Config detection for build.gradle.kts / settings.gradle.kts - Extension maps registered in codebase_scraper, unified_codebase_analyzer, github_scraper, generate_router Also fixes pre-existing parser count tests (35→36 for doctor command added in previous commit). Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1220 lines
45 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Test Example Extractor - Extract real usage examples from test files
|
|
|
|
Analyzes test files to extract meaningful code examples showing:
|
|
- Object instantiation with real parameters
|
|
- Method calls with expected behaviors
|
|
- Configuration examples
|
|
- Setup patterns from fixtures/setUp()
|
|
- Multi-step workflows from integration tests
|
|
|
|
Supports 11 languages:

- Python (AST-based, deep analysis)

- JavaScript, TypeScript, Go, Rust, Java, Kotlin, C#, PHP, Ruby, GDScript (regex-based)
|
|
|
|
Example usage:
|
|
# Extract from directory
|
|
python test_example_extractor.py tests/ --language python
|
|
|
|
# Extract from single file
|
|
python test_example_extractor.py --file tests/test_scraper.py
|
|
|
|
# JSON output
|
|
python test_example_extractor.py tests/ --json > examples.json
|
|
|
|
# Filter by confidence
|
|
python test_example_extractor.py tests/ --min-confidence 0.7
|
|
"""
|
|
|
|
import argparse
|
|
import ast
|
|
import hashlib
|
|
import json
|
|
import logging
|
|
import re
|
|
from dataclasses import asdict, dataclass, field
|
|
from pathlib import Path
|
|
from typing import Literal
|
|
|
|
# Configure logging
|
|
logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
# ============================================================================
|
|
# DATA MODELS
|
|
# ============================================================================
|
|
|
|
|
|
@dataclass
|
|
class TestExample:
|
|
"""Single extracted usage example from test code"""
|
|
|
|
# Identity
|
|
example_id: str # Unique hash of example
|
|
test_name: str # Test function/method name
|
|
category: Literal["instantiation", "method_call", "config", "setup", "workflow"]
|
|
|
|
# Code
|
|
code: str # Actual example code
|
|
language: str # Programming language
|
|
|
|
# Context
|
|
description: str # What this demonstrates
|
|
expected_behavior: str # Expected outcome from assertions
|
|
|
|
# Source
|
|
file_path: str
|
|
line_start: int
|
|
line_end: int
|
|
|
|
# Quality
|
|
complexity_score: float # 0-1 scale (higher = more complex/valuable)
|
|
confidence: float # 0-1 scale (higher = more confident extraction)
|
|
|
|
# Optional fields (must come after required fields)
|
|
setup_code: str | None = None # Required setup code
|
|
tags: list[str] = field(default_factory=list) # ["pytest", "mock", "async"]
|
|
dependencies: list[str] = field(default_factory=list) # Imported modules
|
|
ai_analysis: dict | None = None # AI-generated analysis (C3.6)
|
|
|
|
def to_dict(self) -> dict:
|
|
"""Convert to dictionary for JSON serialization"""
|
|
return asdict(self)
|
|
|
|
def to_markdown(self) -> str:
|
|
"""Convert to markdown format"""
|
|
md = f"### {self.test_name}\n\n"
|
|
md += f"**Category**: {self.category} \n"
|
|
md += f"**Description**: {self.description} \n"
|
|
if self.expected_behavior:
|
|
md += f"**Expected**: {self.expected_behavior} \n"
|
|
md += f"**Confidence**: {self.confidence:.2f} \n"
|
|
if self.tags:
|
|
md += f"**Tags**: {', '.join(self.tags)} \n"
|
|
|
|
# Add AI analysis if available (C3.6)
|
|
if self.ai_analysis:
|
|
md += "\n**🤖 AI Analysis:** \n"
|
|
if self.ai_analysis.get("explanation"):
|
|
md += f"*{self.ai_analysis['explanation']}* \n"
|
|
if self.ai_analysis.get("best_practices"):
|
|
md += f"**Best Practices:** {', '.join(self.ai_analysis['best_practices'])} \n"
|
|
if self.ai_analysis.get("tutorial_group"):
|
|
md += f"**Tutorial Group:** {self.ai_analysis['tutorial_group']} \n"
|
|
|
|
md += f"\n```{self.language.lower()}\n"
|
|
if self.setup_code:
|
|
md += f"# Setup\n{self.setup_code}\n\n"
|
|
md += f"{self.code}\n```\n\n"
|
|
md += f"*Source: {self.file_path}:{self.line_start}*\n\n"
|
|
return md
|
|
|
|
|
|
@dataclass
class ExampleReport:
    """Summary of test example extraction results"""

    total_examples: int
    examples_by_category: dict[str, int]
    examples_by_language: dict[str, int]
    examples: list[TestExample]
    avg_complexity: float
    high_value_count: int  # confidence > 0.7
    file_path: str | None = None  # If single file
    directory: str | None = None  # If directory

    def to_dict(self) -> dict:
        """Serialize the report (and every contained example) to a dict."""
        payload = {
            "total_examples": self.total_examples,
            "examples_by_category": self.examples_by_category,
            "examples_by_language": self.examples_by_language,
            "avg_complexity": self.avg_complexity,
            "high_value_count": self.high_value_count,
            "file_path": self.file_path,
            "directory": self.directory,
        }
        payload["examples"] = [example.to_dict() for example in self.examples]
        return payload

    def to_markdown(self) -> str:
        """Render the report as markdown: summary, breakdowns, then examples."""
        lines = [
            "# Test Example Extraction Report\n\n",
            f"**Total Examples**: {self.total_examples} \n",
            f"**High Value Examples** (confidence > 0.7): {self.high_value_count} \n",
            f"**Average Complexity**: {self.avg_complexity:.2f} \n",
        ]

        lines.append("\n## Examples by Category\n\n")
        for category, count in sorted(self.examples_by_category.items()):
            lines.append(f"- **{category}**: {count}\n")

        lines.append("\n## Examples by Language\n\n")
        for language, count in sorted(self.examples_by_language.items()):
            lines.append(f"- **{language}**: {count}\n")

        # Highest-confidence examples first.
        lines.append("\n## Extracted Examples\n\n")
        for example in sorted(self.examples, key=lambda ex: ex.confidence, reverse=True):
            lines.append(example.to_markdown())

        return "".join(lines)
|
|
|
|
|
|
# ============================================================================
|
|
# PYTHON TEST ANALYZER (AST-based)
|
|
# ============================================================================
|
|
|
|
|
|
class PythonTestAnalyzer:
    """Deep AST-based test example extraction for Python.

    Parses a test module and mines four categories of examples from each
    test: object instantiations, method calls paired with assertions,
    configuration dictionaries, and multi-step workflows.
    """

    def __init__(self):
        # Assertions too trivial to be worth surfacing as usage examples.
        self.trivial_patterns = {
            "assertTrue(True)",
            "assertFalse(False)",
            "assertEqual(1, 1)",
            "assertIsNone(None)",
            "assertIsNotNone(None)",
        }

    def extract(self, file_path: str, code: str) -> list[TestExample]:
        """Extract examples from a Python test file.

        Args:
            file_path: Recorded on each extracted example as its source.
            code: Source text of the test file.

        Returns:
            All extracted examples; empty list when the file fails to parse.
        """
        examples = []

        try:
            tree = ast.parse(code)
        except SyntaxError as e:
            logger.warning(f"Failed to parse {file_path}: {e}")
            return []

        # Extract imports for dependency tracking
        imports = self._extract_imports(tree)

        # ast.walk yields both a ClassDef and each of its method FunctionDefs.
        # Track methods already handled via their test class so the function
        # branch below does not extract them a second time (previously every
        # test_ method in a TestCase produced duplicate examples).
        handled_method_ids: set[int] = set()

        for node in ast.walk(tree):
            # Find test classes (unittest.TestCase)
            if isinstance(node, ast.ClassDef):
                if self._is_test_class(node):
                    examples.extend(self._extract_from_test_class(node, file_path, imports))
                    handled_method_ids.update(
                        id(child) for child in node.body if isinstance(child, ast.FunctionDef)
                    )

            # Find test functions (pytest)
            elif (
                isinstance(node, ast.FunctionDef)
                and id(node) not in handled_method_ids
                and self._is_test_function(node)
            ):
                examples.extend(self._extract_from_test_function(node, file_path, imports))

        return examples

    def _extract_imports(self, tree: ast.AST) -> list[str]:
        """Collect imported module names for the `dependencies` field."""
        imports = []
        for node in ast.walk(tree):
            if isinstance(node, ast.Import):
                imports.extend([alias.name for alias in node.names])
            elif isinstance(node, ast.ImportFrom) and node.module:
                imports.append(node.module)
        return imports

    def _is_test_class(self, node: ast.ClassDef) -> bool:
        """Check if class is a test class (unittest.TestCase pattern)."""
        for base in node.bases:
            # Direct base name containing "Test" (e.g. TestCase, MyTestBase).
            if isinstance(base, ast.Name) and "Test" in base.id:
                return True
            # Attribute form, e.g. unittest.TestCase.
            if isinstance(base, ast.Attribute) and base.attr == "TestCase":
                return True
        return False

    def _is_test_function(self, node: ast.FunctionDef) -> bool:
        """Check if function is a test function."""
        # pytest pattern: starts with test_
        if node.name.startswith("test_"):
            return True
        # Has @pytest.mark decorator
        for decorator in node.decorator_list:
            if isinstance(decorator, ast.Attribute) and "pytest" in ast.unparse(decorator):
                return True
        return False

    def _extract_from_test_class(
        self, class_node: ast.ClassDef, file_path: str, imports: list[str]
    ) -> list[TestExample]:
        """Extract examples from a unittest.TestCase class.

        The class's setUp body (if any) is attached to every example as
        setup_code.
        """
        examples = []

        # Extract setUp method if exists
        setup_code = self._extract_setup_method(class_node)

        # Process each test method
        for node in class_node.body:
            if isinstance(node, ast.FunctionDef) and node.name.startswith("test_"):
                examples.extend(
                    self._analyze_test_body(node, file_path, imports, setup_code=setup_code)
                )

        return examples

    def _extract_from_test_function(
        self, func_node: ast.FunctionDef, file_path: str, imports: list[str]
    ) -> list[TestExample]:
        """Extract examples from a pytest test function."""
        # Fixture parameters become a one-line setup comment.
        fixture_setup = self._extract_fixtures(func_node)

        return self._analyze_test_body(func_node, file_path, imports, setup_code=fixture_setup)

    def _extract_setup_method(self, class_node: ast.ClassDef) -> str | None:
        """Return the setUp method body as source text, or None if absent."""
        for node in class_node.body:
            if isinstance(node, ast.FunctionDef) and node.name == "setUp":
                # Unparse each statement explicitly; passing a list to
                # ast.unparse() relies on undocumented internal behavior.
                return "\n".join(ast.unparse(stmt) for stmt in node.body)
        return None

    def _extract_fixtures(self, func_node: ast.FunctionDef) -> str | None:
        """Summarize pytest fixture parameters as a setup comment, or None."""
        if not func_node.args.args:
            return None

        # Skip 'self' parameter
        params = [arg.arg for arg in func_node.args.args if arg.arg != "self"]
        if params:
            return f"# Fixtures: {', '.join(params)}"
        return None

    def _analyze_test_body(
        self,
        func_node: ast.FunctionDef,
        file_path: str,
        imports: list[str],
        setup_code: str | None = None,
    ) -> list[TestExample]:
        """Analyze a test function body for all extractable pattern categories."""
        examples = []

        # Get docstring for description (fall back to the humanized name).
        docstring = ast.get_docstring(func_node) or func_node.name.replace("_", " ")

        # Detect tags
        tags = self._detect_tags(func_node, imports)

        # 1. Instantiation patterns
        examples.extend(
            self._find_instantiations(func_node, file_path, docstring, setup_code, tags, imports)
        )

        # 2. Method calls with assertions
        examples.extend(
            self._find_method_calls_with_assertions(
                func_node, file_path, docstring, setup_code, tags, imports
            )
        )

        # 3. Configuration dictionaries
        examples.extend(
            self._find_config_dicts(func_node, file_path, docstring, setup_code, tags, imports)
        )

        # 4. Multi-step workflows (integration tests)
        examples.extend(
            self._find_workflows(func_node, file_path, docstring, setup_code, tags, imports)
        )

        return examples

    def _detect_tags(self, func_node: ast.FunctionDef, imports: list[str]) -> list[str]:
        """Detect test tags (pytest, mock, async, unittest); deduplicated."""
        tags = []

        # Check decorators
        for decorator in func_node.decorator_list:
            decorator_str = ast.unparse(decorator).lower()
            if "pytest" in decorator_str:
                tags.append("pytest")
            if "mock" in decorator_str:
                tags.append("mock")
            if "async" in decorator_str:
                tags.append("async")

        # Name-based async detection. Previously this check lived inside the
        # decorator loop, so decorator-less test_async_* tests were missed.
        if func_node.name.startswith("test_async"):
            tags.append("async")

        # Check if using unittest
        if "unittest" in imports:
            tags.append("unittest")

        # Check function body for mock usage
        func_str = ast.unparse(func_node).lower()
        if "mock" in func_str or "patch" in func_str:
            tags.append("mock")

        return list(set(tags))

    def _find_instantiations(
        self,
        func_node: ast.FunctionDef,
        file_path: str,
        description: str,
        setup_code: str | None,
        tags: list[str],
        imports: list[str],
    ) -> list[TestExample]:
        """Find object instantiation patterns: obj = ClassName(...)"""
        examples = []

        for node in ast.walk(func_node):
            # Check if meaningful instantiation
            if (
                isinstance(node, ast.Assign)
                and isinstance(node.value, ast.Call)
                and self._is_meaningful_instantiation(node)
            ):
                code = ast.unparse(node)

                # Skip trivial or mock-only
                if len(code) < 20 or "Mock()" in code:
                    continue

                # Get class name
                class_name = self._get_class_name(node.value)

                example = TestExample(
                    example_id=self._generate_id(code),
                    test_name=func_node.name,
                    category="instantiation",
                    code=code,
                    language="Python",
                    description=f"Instantiate {class_name}: {description}",
                    expected_behavior=self._extract_assertion_after(func_node, node),
                    setup_code=setup_code,
                    file_path=file_path,
                    line_start=node.lineno,
                    line_end=node.end_lineno or node.lineno,
                    complexity_score=self._calculate_complexity(code),
                    confidence=0.8,
                    tags=tags,
                    dependencies=imports,
                )
                examples.append(example)

        return examples

    def _find_method_calls_with_assertions(
        self,
        func_node: ast.FunctionDef,
        file_path: str,
        description: str,
        setup_code: str | None,
        tags: list[str],
        imports: list[str],
    ) -> list[TestExample]:
        """Find method calls immediately followed by an assertion statement."""
        examples = []

        statements = func_node.body
        for i, stmt in enumerate(statements):
            # Look for method calls and check if next statement is an assertion
            if (
                isinstance(stmt, ast.Expr)
                and isinstance(stmt.value, ast.Call)
                and i + 1 < len(statements)
            ):
                next_stmt = statements[i + 1]
                if self._is_assertion(next_stmt):
                    method_call = ast.unparse(stmt)
                    assertion = ast.unparse(next_stmt)

                    code = f"{method_call}\n{assertion}"

                    # Skip trivial assertions
                    if any(trivial in assertion for trivial in self.trivial_patterns):
                        continue

                    example = TestExample(
                        example_id=self._generate_id(code),
                        test_name=func_node.name,
                        category="method_call",
                        code=code,
                        language="Python",
                        description=description,
                        expected_behavior=assertion,
                        setup_code=setup_code,
                        file_path=file_path,
                        line_start=stmt.lineno,
                        line_end=next_stmt.end_lineno or next_stmt.lineno,
                        complexity_score=self._calculate_complexity(code),
                        confidence=0.85,
                        tags=tags,
                        dependencies=imports,
                    )
                    examples.append(example)

        return examples

    def _find_config_dicts(
        self,
        func_node: ast.FunctionDef,
        file_path: str,
        description: str,
        setup_code: str | None,
        tags: list[str],
        imports: list[str],
    ) -> list[TestExample]:
        """Find configuration dictionary patterns (string-keyed, 2+ entries)."""
        examples = []

        for node in ast.walk(func_node):
            # Must have 2+ keys and be meaningful
            if (
                isinstance(node, ast.Assign)
                and isinstance(node.value, ast.Dict)
                and len(node.value.keys) >= 2
            ):
                code = ast.unparse(node)

                # Check if looks like configuration
                if self._is_config_dict(node.value):
                    example = TestExample(
                        example_id=self._generate_id(code),
                        test_name=func_node.name,
                        category="config",
                        code=code,
                        language="Python",
                        description=f"Configuration example: {description}",
                        expected_behavior=self._extract_assertion_after(func_node, node),
                        setup_code=setup_code,
                        file_path=file_path,
                        line_start=node.lineno,
                        line_end=node.end_lineno or node.lineno,
                        complexity_score=self._calculate_complexity(code),
                        confidence=0.75,
                        tags=tags,
                        dependencies=imports,
                    )
                    examples.append(example)

        return examples

    def _find_workflows(
        self,
        func_node: ast.FunctionDef,
        file_path: str,
        description: str,
        setup_code: str | None,
        tags: list[str],
        imports: list[str],
    ) -> list[TestExample]:
        """Find multi-step workflow patterns (integration tests)."""
        examples = []

        # Check if this looks like an integration test (3+ meaningful steps)
        if len(func_node.body) >= 3 and self._is_integration_test(func_node):
            # Extract the full workflow; unparse statement-by-statement rather
            # than passing the list to ast.unparse() (undocumented behavior).
            code = "\n".join(ast.unparse(stmt) for stmt in func_node.body)

            # Skip if too long (> 30 lines)
            if code.count("\n") > 30:
                return examples

            example = TestExample(
                example_id=self._generate_id(code),
                test_name=func_node.name,
                category="workflow",
                code=code,
                language="Python",
                description=f"Workflow: {description}",
                expected_behavior=self._extract_final_assertion(func_node),
                setup_code=setup_code,
                file_path=file_path,
                line_start=func_node.lineno,
                line_end=func_node.end_lineno or func_node.lineno,
                complexity_score=min(1.0, len(func_node.body) / 10),
                confidence=0.9,
                tags=tags + ["workflow", "integration"],
                dependencies=imports,
            )
            examples.append(example)

        return examples

    # Helper methods

    def _is_meaningful_instantiation(self, node: ast.Assign) -> bool:
        """Check if instantiation has meaningful parameters."""
        if not isinstance(node.value, ast.Call):
            return False

        # Must have at least one argument or keyword argument
        call = node.value
        return bool(call.args or call.keywords)

    def _get_class_name(self, call_node: ast.Call) -> str:
        """Extract the callee's class name from a Call node."""
        if isinstance(call_node.func, ast.Name):
            return call_node.func.id
        elif isinstance(call_node.func, ast.Attribute):
            return call_node.func.attr
        return "UnknownClass"

    def _is_assertion(self, node: ast.stmt) -> bool:
        """Check if statement is an assertion (assert stmt or assert-like call)."""
        if isinstance(node, ast.Assert):
            return True

        if isinstance(node, ast.Expr) and isinstance(node.value, ast.Call):
            call_str = ast.unparse(node.value).lower()
            assertion_methods = ["assert", "expect", "should"]
            return any(method in call_str for method in assertion_methods)

        return False

    def _is_config_dict(self, dict_node: ast.Dict) -> bool:
        """Check if dictionary looks like configuration (all keys are strings)."""
        for key in dict_node.keys:
            if not isinstance(key, ast.Constant) or not isinstance(key.value, str):
                return False
        return True

    def _is_integration_test(self, func_node: ast.FunctionDef) -> bool:
        """Check if test looks like an integration test."""
        test_name = func_node.name.lower()
        # Expanded keyword list for better workflow detection
        integration_keywords = [
            "workflow",
            "integration",
            "end_to_end",
            "e2e",
            "full",
            "complete",
            "scenario",
            "flow",
            "multi_step",
            "multistep",
            "process",
            "chain",
            "sequence",
            "pipeline",
            "lifecycle",
        ]

        # Check test name for keywords
        if any(keyword in test_name for keyword in integration_keywords):
            return True

        # Heuristic: tests with 4+ assignments and 3+ calls are likely workflows
        assignments = sum(
            1 for n in ast.walk(func_node) if isinstance(n, (ast.Assign, ast.AugAssign))
        )
        calls = sum(1 for n in ast.walk(func_node) if isinstance(n, ast.Call))

        return assignments >= 4 and calls >= 3

    def _extract_assertion_after(self, func_node: ast.FunctionDef, target_node: ast.AST) -> str:
        """Return the first top-level assertion after target_node, or ""."""
        found_target = False
        for stmt in func_node.body:
            if stmt == target_node:
                found_target = True
                continue
            if found_target and self._is_assertion(stmt):
                return ast.unparse(stmt)
        return ""

    def _extract_final_assertion(self, func_node: ast.FunctionDef) -> str:
        """Return the last top-level assertion in the test, or ""."""
        for stmt in reversed(func_node.body):
            if self._is_assertion(stmt):
                return ast.unparse(stmt)
        return ""

    def _calculate_complexity(self, code: str) -> float:
        """Calculate code complexity score (0-1).

        Simple heuristic: more lines + more parameters = more complex.
        """
        lines = code.count("\n") + 1
        params = code.count(",") + 1

        complexity = min(1.0, (lines * 0.1) + (params * 0.05))
        return round(complexity, 2)

    def _generate_id(self, code: str) -> str:
        """Generate a short, deterministic ID for an example (non-cryptographic)."""
        return hashlib.md5(code.encode()).hexdigest()[:8]
|
|
|
|
|
|
# ============================================================================
|
|
# GENERIC TEST ANALYZER (Regex-based for non-Python languages)
|
|
# ============================================================================
|
|
|
|
|
|
class GenericTestAnalyzer:
    """Regex-based test example extraction for non-Python languages.

    Each supported language maps to a dict of named regex patterns. Tests are
    located with the "test_function" pattern; each test's (approximate) body
    is then scanned for instantiation / config / mock / injection patterns.
    """

    # Language-specific regex patterns
    PATTERNS = {
        "javascript": {
            "instantiation": r"(?:const|let|var)\s+(\w+)\s*=\s*new\s+(\w+)\(([^)]*)\)",
            "assertion": r"expect\(([^)]+)\)\.to(?:Equal|Be|Match)\(([^)]+)\)",
            "test_function": r'(?:test|it)\(["\']([^"\']+)["\']',
            "config": r"(?:const|let)\s+config\s*=\s*\{[\s\S]{20,500}?\}",
        },
        "typescript": {
            "instantiation": r"(?:const|let|var)\s+(\w+):\s*\w+\s*=\s*new\s+(\w+)\(([^)]*)\)",
            "assertion": r"expect\(([^)]+)\)\.to(?:Equal|Be|Match)\(([^)]+)\)",
            "test_function": r'(?:test|it)\(["\']([^"\']+)["\']',
            "config": r"(?:const|let)\s+config:\s*\w+\s*=\s*\{[\s\S]{20,500}?\}",
        },
        "go": {
            "instantiation": r"(\w+)\s*:=\s*(\w+)\{([^}]+)\}",
            "assertion": r't\.(?:Error|Fatal)(?:f)?\(["\']([^"\']+)["\']',
            "test_function": r"func\s+(Test\w+)\(t\s+\*testing\.T\)",
            "table_test": r"tests\s*:=\s*\[\]struct\s*\{[\s\S]{50,1000}?\}",
        },
        "rust": {
            "instantiation": r"let\s+(\w+)\s*=\s*(\w+)::new\(([^)]*)\)",
            "assertion": r"assert(?:_eq)?!\(([^)]+)\)",
            "test_function": r"#\[test\]\s*fn\s+(\w+)\(\)",
        },
        "java": {
            "instantiation": r"(\w+)\s+(\w+)\s*=\s*new\s+(\w+)\(([^)]*)\)",
            "assertion": r"assert(?:Equals|True|False|NotNull)\(([^)]+)\)",
            "test_function": r"@Test\s+public\s+void\s+(\w+)\(\)",
        },
        "kotlin": {
            # Object instantiation: val x = Foo(args) or val x: Type = Foo(args)
            "instantiation": r"(?:val|var)\s+(\w+)(?:\s*:\s*[\w<>.,\s?]+)?\s*=\s*(\w+)\(([^)]*)\)",
            # JUnit assertions + Kotest matchers
            "assertion": r"(?:assert(?:Equals|True|False|NotNull|That)\(([^)]+)\)|(\w+)\s+should(?:Be|Equal|Match|Have|Contain|Throw)\b)",
            # JUnit @Test, Kotest test functions, Spek describe/it
            "test_function": r"(?:@Test\s+fun\s+(\w+)\s*\(|fun\s+[\"']([^\"']+)[\"']\s*\(|(?:test|it|should)\s*\(\s*[\"']([^\"']+)[\"'])",
            # MockK mocking patterns
            "mock": r"(?:mockk<([\w<>]+)>\s*\(|every\s*\{\s*(\w+)\.(\w+)|verify\s*\{)",
            # Coroutine test patterns
            "coroutine_test": r"(?:runTest\s*\{|runBlocking\s*\{|testCoroutineDispatcher)",
        },
        "csharp": {
            # Object instantiation patterns (var, explicit type, generic)
            "instantiation": r"(?:var|[\w<>]+)\s+(\w+)\s*=\s*new\s+([\w<>]+)\(([^)]*)\)",
            # NUnit assertions (Assert.AreEqual, Assert.That, etc.)
            "assertion": r"Assert\.(?:AreEqual|AreNotEqual|IsTrue|IsFalse|IsNull|IsNotNull|That|Throws|DoesNotThrow|Greater|Less|Contains)\(([^)]+)\)",
            # NUnit test attributes ([Test], [TestCase], [TestCaseSource])
            "test_function": r"\[(?:Test|TestCase|TestCaseSource|Theory|Fact)\(?[^\]]*\)?\]\s*(?:\[[\w\(\)\"',\s]+\]\s*)*public\s+(?:async\s+)?(?:Task|void)\s+(\w+)\s*\(",
            # Setup/Teardown patterns
            "setup": r"\[(?:SetUp|OneTimeSetUp|TearDown|OneTimeTearDown)\]\s*public\s+(?:async\s+)?(?:Task|void)\s+(\w+)\s*\(",
            # Mock/substitute patterns (NSubstitute, Moq)
            "mock": r"(?:Substitute\.For<([\w<>]+)>|new\s+Mock<([\w<>]+)>|MockRepository\.GenerateMock<([\w<>]+)>)\(",
            # Dependency injection patterns (Zenject, etc.)
            "injection": r"Container\.(?:Bind|BindInterfacesTo|BindInterfacesAndSelfTo)<([\w<>]+)>",
            # Configuration/setup dictionaries
            "config": r"(?:var|[\w<>]+)\s+\w+\s*=\s*new\s+(?:Dictionary|List|HashSet)<[^>]+>\s*\{[\s\S]{20,500}?\}",
        },
        "php": {
            "instantiation": r"\$(\w+)\s*=\s*new\s+(\w+)\(([^)]*)\)",
            "assertion": r"\$this->assert(?:Equals|True|False|NotNull)\(([^)]+)\)",
            "test_function": r"public\s+function\s+(test\w+)\(\)",
        },
        "ruby": {
            "instantiation": r"(\w+)\s*=\s*(\w+)\.new\(([^)]*)\)",
            "assertion": r"expect\(([^)]+)\)\.to\s+(?:eq|be|match)\(([^)]+)\)",
            "test_function": r'(?:test|it)\s+["\']([^"\']+)["\']',
        },
        "gdscript": {
            # GDScript object instantiation (var x = Class.new(), preload, load)
            "instantiation": r"(?:var|const)\s+(\w+)\s*=\s*(?:(\w+)\.new\(|(?:preload|load)\([\"']([^\"']+)[\"']\)\.new\()",
            # GUT/gdUnit4 assertions
            "assertion": r"assert_(?:eq|ne|true|false|null|not_null|gt|lt|between|has|contains|typeof)\(([^)]+)\)",
            # Test functions: GUT (func test_*), gdUnit4 (@test), WAT (extends WAT.Test)
            "test_function": r"(?:@test\s+)?func\s+(test_\w+)\s*\(",
            # Signal connections and emissions
            "signal": r"(?:(\w+)\.connect\(|emit_signal\([\"'](\w+)[\"'])",
        },
    }

    # Language name normalization mapping. Defined exactly once — this class
    # previously declared it twice; the second definition shadowed the first
    # with identical content.
    LANGUAGE_ALIASES = {
        "c#": "csharp",
        "c++": "cpp",
        "c plus plus": "cpp",
    }

    def extract(self, file_path: str, code: str, language: str) -> list[TestExample]:
        """Extract examples from a test file using per-language regex patterns.

        Args:
            file_path: Recorded on each extracted example.
            code: Full source text of the test file.
            language: Language name (case-insensitive; aliases like "C#"
                are normalized via LANGUAGE_ALIASES).

        Returns:
            Extracted examples; empty list for unsupported languages.
        """
        examples = []

        language_lower = language.lower()
        # Normalize language name (e.g., "C#" -> "csharp")
        language_lower = self.LANGUAGE_ALIASES.get(language_lower, language_lower)

        if language_lower not in self.PATTERNS:
            logger.warning(f"Language {language} not supported for regex extraction")
            return []

        patterns = self.PATTERNS[language_lower]

        # Extract test functions
        test_functions = re.finditer(patterns["test_function"], code)

        for match in test_functions:
            # Alternation-based patterns (e.g. Kotlin's test_function) capture
            # the name in different groups depending on which branch matched;
            # take the first non-empty group instead of blindly reading
            # group(1), which can be None.
            test_name = next((g for g in match.groups() if g), "unknown")

            # Get test function body (approximate - find next function start)
            start_pos = match.end()
            next_match = re.search(patterns["test_function"], code[start_pos:])
            end_pos = start_pos + next_match.start() if next_match else len(code)
            test_body = code[start_pos:end_pos]

            # Extract instantiations
            for inst_match in re.finditer(patterns["instantiation"], test_body):
                examples.append(
                    self._create_example(
                        test_name=test_name,
                        category="instantiation",
                        code=inst_match.group(0),
                        language=language,
                        file_path=file_path,
                        line_number=code[: start_pos + inst_match.start()].count("\n") + 1,
                    )
                )

            # Extract config dictionaries (if pattern exists)
            if "config" in patterns:
                for config_match in re.finditer(patterns["config"], test_body):
                    examples.append(
                        self._create_example(
                            test_name=test_name,
                            category="config",
                            code=config_match.group(0),
                            language=language,
                            file_path=file_path,
                            line_number=code[: start_pos + config_match.start()].count("\n") + 1,
                        )
                    )

            # Extract mock/substitute patterns (if pattern exists)
            if "mock" in patterns:
                for mock_match in re.finditer(patterns["mock"], test_body):
                    examples.append(
                        self._create_example(
                            test_name=test_name,
                            category="setup",
                            code=mock_match.group(0),
                            language=language,
                            file_path=file_path,
                            line_number=code[: start_pos + mock_match.start()].count("\n") + 1,
                        )
                    )

            # Extract dependency injection patterns (if pattern exists)
            if "injection" in patterns:
                for inject_match in re.finditer(patterns["injection"], test_body):
                    examples.append(
                        self._create_example(
                            test_name=test_name,
                            category="setup",
                            code=inject_match.group(0),
                            language=language,
                            file_path=file_path,
                            line_number=code[: start_pos + inject_match.start()].count("\n") + 1,
                        )
                    )

        # Also extract setup/teardown methods (outside test functions)
        if "setup" in patterns:
            for setup_match in re.finditer(patterns["setup"], code):
                setup_name = setup_match.group(1)
                # Get setup function body: scan forward to the next setup
                # method, capped at 500 characters. (We are inside the
                # `"setup" in patterns` branch, so patterns["setup"] always
                # exists — the old .get() fallback could never trigger.)
                setup_start = setup_match.end()
                next_setup = re.search(patterns["setup"], code[setup_start:])
                setup_end = (
                    setup_start + next_setup.start()
                    if next_setup
                    else min(setup_start + 500, len(code))
                )
                setup_body = code[setup_start:setup_end]

                examples.append(
                    self._create_example(
                        test_name=setup_name,
                        category="setup",
                        code=setup_match.group(0) + setup_body[:200],  # Include some of the body
                        language=language,
                        file_path=file_path,
                        line_number=code[: setup_match.start()].count("\n") + 1,
                    )
                )

        return examples

    def _create_example(
        self,
        test_name: str,
        category: str,
        code: str,
        language: str,
        file_path: str,
        line_number: int,
    ) -> TestExample:
        """Build a TestExample from a regex match.

        Regex extraction cannot see assertions or dependencies, so
        expected_behavior/tags/dependencies are left empty and confidence is
        fixed at 0.6 (lower than the AST-based analyzer's).
        """
        return TestExample(
            example_id=hashlib.md5(code.encode()).hexdigest()[:8],
            test_name=test_name,
            category=category,
            code=code,
            language=language,
            description=f"Test: {test_name}",
            expected_behavior="",
            file_path=file_path,
            line_start=line_number,
            line_end=line_number + code.count("\n"),
            complexity_score=min(1.0, (code.count("\n") + 1) * 0.1),
            confidence=0.6,  # Lower confidence for regex extraction
            tags=[],
            dependencies=[],
        )
|
|
|
|
|
|
# ============================================================================
|
|
# EXAMPLE QUALITY FILTER
|
|
# ============================================================================
|
|
|
|
|
|
class ExampleQualityFilter:
    """Drop examples that are too uncertain, too short, or trivially boilerplate."""

    def __init__(self, min_confidence: float = 0.7, min_code_length: int = 20):
        self.min_confidence = min_confidence
        self.min_code_length = min_code_length

        # Code snippets that mark an example as throwaway boilerplate.
        # NOTE(review): matching is by substring, so "pass" also matches
        # e.g. "password" — confirm this is intentional.
        self.trivial_patterns = [
            "Mock()",
            "MagicMock()",
            "assertTrue(True)",
            "assertFalse(False)",
            "assertEqual(1, 1)",
            "pass",
            "...",
        ]

    def filter(self, examples: list[TestExample]) -> list[TestExample]:
        """Return only the examples that pass every quality gate."""
        return [
            example
            for example in examples
            if example.confidence >= self.min_confidence
            and len(example.code) >= self.min_code_length
            and not self._is_trivial(example.code)
        ]

    def _is_trivial(self, code: str) -> bool:
        """True when the code embeds any known trivial snippet."""
        for pattern in self.trivial_patterns:
            if pattern in code:
                return True
        return False
|
|
|
|
|
|
# ============================================================================
|
|
# TEST EXAMPLE EXTRACTOR (Main Orchestrator)
|
|
# ============================================================================
|
|
|
|
|
|
class TestExampleExtractor:
    """Main orchestrator for test example extraction.

    Discovers test files in a directory (or accepts a single file),
    dispatches to the appropriate analyzer (AST-based for Python,
    regex-based for other languages), applies quality filtering,
    optionally enhances results with AI analysis (C3.6), and produces
    an ExampleReport summary.
    """

    # Glob patterns identifying test files across supported languages.
    TEST_PATTERNS = [
        "test_*.py",
        "*_test.py",
        "test*.js",
        "*test.js",
        "*_test.go",
        "*_test.rs",
        "Test*.java",
        "*Test.kt",
        "Test*.kt",
        "*Spec.kt",  # Kotest/Spek naming convention
        "Test*.cs",
        "*Test.php",
        "*_spec.rb",
        "test_*.gd",  # GUT, gdUnit4, WAT test files
        "*_test.gd",
    ]

    # Language detection by file extension.
    LANGUAGE_MAP = {
        ".py": "Python",
        ".js": "JavaScript",
        ".ts": "TypeScript",
        ".go": "Go",
        ".rs": "Rust",
        ".java": "Java",
        ".kt": "Kotlin",
        ".kts": "Kotlin",
        ".cs": "C#",
        ".php": "PHP",
        ".rb": "Ruby",
        ".gd": "GDScript",
    }

    def __init__(
        self,
        min_confidence: float = 0.7,
        max_per_file: int = 10,
        languages: list[str] | None = None,
        enhance_with_ai: bool = True,
    ):
        """Create an extractor.

        Args:
            min_confidence: Minimum confidence for the quality filter.
            max_per_file: Cap on examples kept per test file.
            languages: Optional language allowlist (case-insensitive).
            enhance_with_ai: Whether to run the C3.6 AI enhancer.
        """
        self.python_analyzer = PythonTestAnalyzer()
        self.generic_analyzer = GenericTestAnalyzer()
        self.quality_filter = ExampleQualityFilter(min_confidence=min_confidence)
        self.max_per_file = max_per_file
        # Normalize once so per-file language filtering is case-insensitive.
        self.languages = [lang.lower() for lang in languages] if languages else None
        self.enhance_with_ai = enhance_with_ai

        # Initialize AI enhancer if enabled (C3.6); degrade gracefully to
        # plain extraction when the optional enhancer cannot be built.
        self.ai_enhancer = None
        if self.enhance_with_ai:
            try:
                from skill_seekers.cli.ai_enhancer import TestExampleEnhancer

                self.ai_enhancer = TestExampleEnhancer()
            except Exception as e:
                logger.warning(f"⚠️ Failed to initialize AI enhancer: {e}")
                self.enhance_with_ai = False

    def extract_from_directory(self, directory: Path, recursive: bool = True) -> ExampleReport:
        """Extract examples from all test files in *directory*.

        Raises:
            FileNotFoundError: If the directory does not exist.
        """
        directory = Path(directory)

        if not directory.exists():
            raise FileNotFoundError(f"Directory not found: {directory}")

        # Find test files
        test_files = self._find_test_files(directory, recursive)

        logger.info(f"Found {len(test_files)} test files in {directory}")

        # Extract from each file
        all_examples = []
        for test_file in test_files:
            examples = self.extract_from_file(test_file)
            all_examples.extend(examples)

        # Generate report
        return self._create_report(all_examples, directory=str(directory))

    def extract_from_file(self, file_path: Path) -> list[TestExample]:
        """Extract examples from a single test file.

        Returns an empty list when the file's language is filtered out
        or the file cannot be read.

        Raises:
            FileNotFoundError: If the file does not exist.
        """
        file_path = Path(file_path)

        if not file_path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")

        # Detect language
        language = self._detect_language(file_path)

        # Filter by language if specified
        if self.languages and language.lower() not in self.languages:
            return []

        # Read file; skip (don't abort the whole run) on undecodable or
        # unreadable files — e.g. deleted between discovery and read.
        try:
            code = file_path.read_text(encoding="utf-8")
        except (UnicodeDecodeError, OSError) as e:
            logger.warning(f"Failed to read {file_path}: {e}")
            return []

        # Extract examples based on language: Python gets deep AST
        # analysis, everything else the regex-based generic analyzer.
        if language == "Python":
            examples = self.python_analyzer.extract(str(file_path), code)
        else:
            examples = self.generic_analyzer.extract(str(file_path), code, language)

        # Apply quality filter
        filtered_examples = self.quality_filter.filter(examples)

        # Limit per file: keep the highest-confidence examples.
        if len(filtered_examples) > self.max_per_file:
            filtered_examples = sorted(
                filtered_examples, key=lambda x: x.confidence, reverse=True
            )[: self.max_per_file]

        logger.info(f"Extracted {len(filtered_examples)} examples from {file_path.name}")

        return filtered_examples

    def _find_test_files(self, directory: Path, recursive: bool) -> list[Path]:
        """Find test files in *directory*, deduplicated and sorted.

        Sorting gives deterministic extraction order across runs
        (a bare set() would yield arbitrary iteration order).
        """
        test_files = []

        for pattern in self.TEST_PATTERNS:
            if recursive:
                test_files.extend(directory.rglob(pattern))
            else:
                test_files.extend(directory.glob(pattern))

        return sorted(set(test_files))  # Dedupe + deterministic order

    def _detect_language(self, file_path: Path) -> str:
        """Detect programming language from file extension."""
        suffix = file_path.suffix.lower()
        return self.LANGUAGE_MAP.get(suffix, "Unknown")

    def _create_report(
        self,
        examples: list[TestExample],
        file_path: str | None = None,
        directory: str | None = None,
    ) -> ExampleReport:
        """Create a summary ExampleReport from extracted *examples*.

        Optionally enriches examples in place with AI analysis (C3.6)
        before aggregating category/language counts and averages.
        """
        # Enhance examples with AI analysis (C3.6)
        if self.enhance_with_ai and self.ai_enhancer and examples:
            # Convert examples to dict format for AI processing
            example_dicts = [ex.to_dict() for ex in examples]
            enhanced_dicts = self.ai_enhancer.enhance_examples(example_dicts)

            # Update examples with AI analysis (enhancer may return fewer
            # entries than it was given, hence the bounds check).
            for i, example in enumerate(examples):
                if i < len(enhanced_dicts) and "ai_analysis" in enhanced_dicts[i]:
                    example.ai_analysis = enhanced_dicts[i]["ai_analysis"]

        # Count by category
        examples_by_category = {}
        for example in examples:
            examples_by_category[example.category] = (
                examples_by_category.get(example.category, 0) + 1
            )

        # Count by language
        examples_by_language = {}
        for example in examples:
            examples_by_language[example.language] = (
                examples_by_language.get(example.language, 0) + 1
            )

        # Calculate averages (guard against division by zero)
        avg_complexity = (
            sum(ex.complexity_score for ex in examples) / len(examples) if examples else 0.0
        )
        # "High value" threshold mirrors the default min_confidence (0.7).
        high_value_count = sum(1 for ex in examples if ex.confidence > 0.7)

        return ExampleReport(
            total_examples=len(examples),
            examples_by_category=examples_by_category,
            examples_by_language=examples_by_language,
            examples=examples,
            avg_complexity=round(avg_complexity, 2),
            high_value_count=high_value_count,
            file_path=file_path,
            directory=directory,
        )
|
|
|
|
|
|
# ============================================================================
|
|
# COMMAND-LINE INTERFACE
|
|
# ============================================================================
|
|
|
|
|
|
def main():
    """Main entry point for CLI.

    Parses arguments, runs extraction over a directory or a single
    file, and prints the report as JSON, Markdown, or a human-readable
    summary.
    """
    parser = argparse.ArgumentParser(
        description="Extract usage examples from test files",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Extract from directory
  %(prog)s tests/ --language python

  # Extract from single file
  %(prog)s --file tests/test_scraper.py

  # JSON output
  %(prog)s tests/ --json > examples.json

  # Filter by confidence
  %(prog)s tests/ --min-confidence 0.7
""",
    )

    parser.add_argument("directory", nargs="?", help="Directory containing test files")
    parser.add_argument("--file", help="Single test file to analyze")
    parser.add_argument(
        "--language", help="Filter by programming language (python, javascript, etc.)"
    )
    parser.add_argument(
        "--min-confidence",
        type=float,
        default=0.5,
        help="Minimum confidence threshold (0.0-1.0, default: 0.5)",
    )
    parser.add_argument(
        "--max-per-file",
        type=int,
        default=10,
        help="Maximum examples per file (default: 10)",
    )
    parser.add_argument("--json", action="store_true", help="Output JSON format")
    parser.add_argument("--markdown", action="store_true", help="Output Markdown format")
    # BooleanOptionalAction provides both --recursive and --no-recursive;
    # the previous store_true with default=True made the flag a no-op
    # (recursion could never be disabled).
    parser.add_argument(
        "--recursive",
        action=argparse.BooleanOptionalAction,
        default=True,
        help="Search directory recursively (default: True)",
    )

    args = parser.parse_args()

    # Validate arguments: exactly one input source is required.
    if not args.directory and not args.file:
        parser.error("Either directory or --file must be specified")

    # Create extractor
    languages = [args.language] if args.language else None
    extractor = TestExampleExtractor(
        min_confidence=args.min_confidence,
        max_per_file=args.max_per_file,
        languages=languages,
    )

    # Extract examples
    if args.file:
        examples = extractor.extract_from_file(Path(args.file))
        report = extractor._create_report(examples, file_path=args.file)
    else:
        report = extractor.extract_from_directory(Path(args.directory), recursive=args.recursive)

    # Output results
    if args.json:
        print(json.dumps(report.to_dict(), indent=2))
    elif args.markdown:
        print(report.to_markdown())
    else:
        # Human-readable summary
        print("\nTest Example Extraction Results")
        print("=" * 50)
        print(f"Total Examples: {report.total_examples}")
        print(f"High Value (confidence > 0.7): {report.high_value_count}")
        print(f"Average Complexity: {report.avg_complexity:.2f}")
        print("\nExamples by Category:")
        for category, count in sorted(report.examples_by_category.items()):
            print(f"  {category}: {count}")
        print("\nExamples by Language:")
        for language, count in sorted(report.examples_by_language.items()):
            print(f"  {language}: {count}")
        print("\nUse --json or --markdown for detailed output")
|
|
|
|
|
|
# Script entry point: run the CLI only when executed directly,
# not when imported as a module.
if __name__ == "__main__":
    main()
|