Phase 1: Unified Config Format - Created config_validator.py with full validation - Supports multiple sources (documentation, github, pdf) - Backward compatible with legacy configs - Auto-converts legacy → unified format - Validates merge_mode and code_analysis_depth Phase 2: Deep Code Analysis - Created code_analyzer.py with language-specific parsers - Supports Python (AST), JavaScript/TypeScript (regex), C/C++ (regex) - Configurable depth: surface, deep, full - Extracts classes, functions, parameters, types, docstrings - Integrated into github_scraper.py Features: ✅ Unified config with sources array ✅ Code analysis depth: surface/deep/full ✅ Language detection and parser selection ✅ Signature extraction with full parameter info ✅ Type hints and default values captured ✅ Docstring extraction ✅ Example config: godot_unified.json Next: Conflict detection and merging 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
492 lines
16 KiB
Python
492 lines
16 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Code Analyzer for GitHub Repositories
|
|
|
|
Extracts code signatures at configurable depth levels:
|
|
- surface: File tree only (existing behavior)
|
|
- deep: Parse files for signatures, parameters, types
|
|
- full: Complete AST analysis (future enhancement)
|
|
|
|
Supports multiple languages with language-specific parsers.
|
|
"""
|
|
|
|
import ast
|
|
import re
|
|
import logging
|
|
from typing import Dict, List, Any, Optional
|
|
from dataclasses import dataclass, asdict
|
|
|
|
logging.basicConfig(level=logging.INFO)
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
@dataclass
class Parameter:
    """
    One parameter of a function or method signature.

    Attributes:
        name: Parameter name as written in the source.
        type_hint: Type annotation text, or None when the parameter
            carries no annotation.
        default: Default value text, or None when the parameter has
            no default.
    """

    name: str
    type_hint: Optional[str] = None
    default: Optional[str] = None
|
|
|
|
|
|
@dataclass
class FunctionSignature:
    """
    Signature of a function or method.

    Attributes:
        name: Function name.
        parameters: Parameters in declaration order.
        return_type: Return annotation text, or None when absent.
        docstring: Documentation string, or None when absent.
        line_number: 1-based line of the definition, or None when unknown.
        is_async: True for ``async`` functions.
        is_method: True when the function was found in a class body.
        decorators: Decorator expressions as text.  May be passed as None,
            in which case it is normalized to a fresh empty list.
    """

    name: str
    parameters: List[Parameter]
    return_type: Optional[str] = None
    docstring: Optional[str] = None
    line_number: Optional[int] = None
    is_async: bool = False
    is_method: bool = False
    # None is only a construction-time placeholder: a literal [] default
    # would be shared between instances, so __post_init__ swaps in a new list.
    decorators: Optional[List[str]] = None

    def __post_init__(self):
        """Replace a missing decorator list with a per-instance empty list."""
        if self.decorators is None:
            self.decorators = []
|
|
|
|
|
|
@dataclass
class ClassSignature:
    """
    Signature of a class.

    Attributes:
        name: Class name.
        base_classes: Names of the base classes, in declaration order.
        methods: Signatures of the methods found in the class.
        docstring: Documentation string, or None when absent.
        line_number: 1-based line of the definition, or None when unknown.
    """

    name: str
    base_classes: List[str]
    methods: List[FunctionSignature]
    docstring: Optional[str] = None
    line_number: Optional[int] = None
|
|
|
|
|
|
class CodeAnalyzer:
    """
    Extracts class and function signatures from source code.

    The amount of work is controlled by ``depth``: ``'surface'`` skips
    per-file analysis entirely, any other value (``'deep'``, ``'full'``)
    parses the file.  Python is parsed with :mod:`ast`.  JavaScript/TypeScript
    and C/C++ are scanned with regular expressions and a small brace matcher,
    so results for those languages are best-effort.
    """

    # Statement keywords that look like ``name(...) {`` but are not methods.
    _JS_CONTROL_KEYWORDS = frozenset({'if', 'for', 'while', 'switch', 'catch'})
    # Statement keywords that can appear in the "function name" position.
    _CPP_CONTROL_KEYWORDS = frozenset({'if', 'for', 'while', 'switch', 'return'})
    # Keywords that can precede a call (``return foo(x)``) and would otherwise
    # be mistaken for a return type.
    _CPP_NON_TYPE_KEYWORDS = frozenset({
        'return', 'else', 'new', 'delete', 'throw', 'case', 'goto', 'do',
        'co_return', 'co_await', 'co_yield',
    })

    _JS_CLASS_RE = re.compile(
        r'\bclass\s+(?P<name>\w+)'
        r'(?:\s*<[^<>{}]*>)?'                                   # TS generics
        r'(?:\s+extends\s+(?P<base>[\w.]+)(?:\s*<[^<>{}]*>)?)?'  # dotted bases
        r'(?:\s+implements\s+[^{]+?)?'                          # TS interfaces
        r'\s*\{'
    )
    _JS_FUNCTION_RE = re.compile(
        r'(?<![\w$.])(?:(?P<async>async)\s+)?function\s+(?P<name>\w+)'
        r'\s*\((?P<params>[^)]*)\)'
    )
    _JS_ARROW_RE = re.compile(
        r'\b(?:const|let|var)\s+(?P<name>\w+)\s*=\s*'
        r'(?:(?P<async>async)\s+)?\((?P<params>[^)]*)\)\s*=>'
    )
    # A method declaration: optional ``async``, a name, a parameter list, an
    # optional TypeScript return type and the opening brace of the body.
    _JS_METHOD_RE = re.compile(
        r'(?<![\w$.])(?:(?P<async>async)\s+)?(?P<name>[\w$]+)'
        r'\s*\((?P<params>[^)]*)\)\s*(?::[^{;=]+)?\{'
    )
    _CPP_CLASS_RE = re.compile(
        r'(?:\b(?P<enum>enum)\s+)?\bclass\s+(?P<name>\w+)'
        r'(?:\s*:\s*public\s+(?P<base>\w+))?\s*\{'
    )
    # ``ret name(params)`` where ``ret`` may be namespace-qualified and may be
    # followed by ``*``/``&``.  Whitespace is required somewhere between the
    # type and the name so that ``a*b(c)`` is not read as a declaration.
    _CPP_FUNCTION_RE = re.compile(
        r'(?<!\w)(?P<ret>(?:\w+::)*\w+)'
        r'(?P<ptr>\s+(?:\*+|&)\s*|\s*(?:\*+|&)\s+|\s+)'
        r'(?P<name>\w+)\s*\((?P<params>[^)]*)\)'
    )

    def __init__(self, depth: str = 'surface'):
        """
        Initialize code analyzer.

        Args:
            depth: Analysis depth ('surface', 'deep', 'full').  The value is
                stored as given; only 'surface' is treated specially.
        """
        self.depth = depth

    def analyze_file(self, file_path: str, content: str, language: str) -> Dict[str, Any]:
        """
        Analyze a single file based on depth level.

        Args:
            file_path: Path to file in repository (used for log messages).
            content: File content as string.
            language: Programming language name; 'Python', 'JavaScript',
                'TypeScript', 'C' and 'C++' are recognized (case-sensitive).

        Returns:
            Dict with 'classes' and 'functions' lists, or an empty dict when
            depth is 'surface', the language has no analyzer, or analysis
            fails for any reason (failures are logged, never raised).
        """
        if self.depth == 'surface':
            return {}  # Surface level doesn't analyze individual files

        logger.debug("Analyzing %s (language: %s, depth: %s)", file_path, language, self.depth)

        try:
            if language == 'Python':
                return self._analyze_python(content, file_path)
            elif language in ['JavaScript', 'TypeScript']:
                return self._analyze_javascript(content, file_path)
            elif language in ['C', 'C++']:
                return self._analyze_cpp(content, file_path)
            else:
                logger.debug("No analyzer for language: %s", language)
                return {}
        except Exception as e:
            # Best-effort boundary: one unparsable file must not abort a scan.
            logger.warning("Error analyzing %s: %s", file_path, e)
            return {}

    # ------------------------------------------------------------------
    # Shared helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _line_of(content: str, offset: int) -> int:
        """Return the 1-based line number of character ``offset`` in ``content``."""
        return content.count('\n', 0, offset) + 1

    @staticmethod
    def _signature_dict(name: str, parameters: List[Dict], return_type: Optional[str] = None,
                        line_number: Optional[int] = None, is_async: bool = False,
                        is_method: bool = False) -> Dict[str, Any]:
        """Build a function entry with the same keys as a serialized FunctionSignature."""
        return {
            'name': name,
            'parameters': parameters,
            'return_type': return_type,
            'docstring': None,
            'line_number': line_number,
            'is_async': is_async,
            'is_method': is_method,
            'decorators': []
        }

    @staticmethod
    def _split_parameters(params_str: str, angle_brackets: bool = False) -> List[str]:
        """
        Split a parameter list on commas that are not nested inside brackets.

        Args:
            params_str: Text between the parentheses of a declaration.
            angle_brackets: Also treat ``<``/``>`` as brackets (C++ templates).
                They are ignored after a top-level ``=`` so that default
                expressions such as ``1 << 3`` do not swallow later commas.

        Returns:
            The stripped pieces; pieces may be empty strings.
        """
        pieces = []
        depth = 0
        in_default = False
        start = 0
        for index, ch in enumerate(params_str):
            angle = angle_brackets and not in_default
            if ch in '([{' or (angle and ch == '<'):
                depth += 1
            elif ch in ')]}' or (angle and ch == '>'):
                depth = max(depth - 1, 0)
            elif ch == '=' and depth == 0:
                in_default = True
            elif ch == ',' and depth == 0:
                pieces.append(params_str[start:index].strip())
                start = index + 1
                in_default = False
        pieces.append(params_str[start:].strip())
        return pieces

    @staticmethod
    def _find_top_level(text: str, target: str) -> int:
        """
        Return the index of the first ``target`` outside ``()[]{}``, or -1.

        When searching for ``=`` the arrow token ``=>`` is skipped so that a
        TypeScript function type is not mistaken for a default value.
        """
        depth = 0
        for index, ch in enumerate(text):
            if ch in '([{':
                depth += 1
            elif ch in ')]}':
                depth = max(depth - 1, 0)
            elif ch == target and depth == 0:
                if target == '=' and text[index + 1:index + 2] == '>':
                    continue
                return index
        return -1

    # ------------------------------------------------------------------
    # Python
    # ------------------------------------------------------------------

    def _analyze_python(self, content: str, file_path: str) -> Dict[str, Any]:
        """
        Analyze Python file using AST.

        Every class in the file (including nested ones) is reported.  Every
        function is reported except those defined directly in a class body,
        which are reported as methods of that class instead.

        Returns:
            Dict with 'classes' and 'functions' (lists of plain dicts), or an
            empty dict when the file has a syntax error.
        """
        try:
            tree = ast.parse(content)
        except SyntaxError as e:
            logger.debug("Syntax error in %s: %s", file_path, e)
            return {}

        classes = []
        functions = []
        # ids of nodes that sit directly in a class body.  ast.walk is
        # breadth-first, so a class is always visited before its members and
        # a single pass is enough.
        class_member_ids = set()

        for node in ast.walk(tree):
            if isinstance(node, ast.ClassDef):
                class_member_ids.update(id(member) for member in node.body)
                classes.append(asdict(self._extract_python_class(node)))
            elif isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
                if id(node) not in class_member_ids:
                    functions.append(asdict(self._extract_python_function(node)))

        return {
            'classes': classes,
            'functions': functions
        }

    @staticmethod
    def _render_node(node) -> Optional[str]:
        """Return source text for an AST node, or None when it cannot be produced."""
        if not hasattr(ast, 'unparse'):  # ast.unparse needs Python 3.9+
            return None
        try:
            return ast.unparse(node)
        except Exception:
            return None

    def _python_parameter(self, arg, prefix: str = '') -> "Parameter":
        """Build a Parameter from an ``ast.arg``; ``prefix`` is '*' or '**' for variadics."""
        type_hint = self._render_node(arg.annotation) if arg.annotation else None
        return Parameter(name=prefix + arg.arg, type_hint=type_hint)

    def _extract_python_class(self, node: ast.ClassDef) -> "ClassSignature":
        """
        Extract class signature from AST node.

        Base classes are rendered as source text (so ``pkg.mod.Base`` and
        ``Generic[T]`` are kept intact).  Only functions defined directly in
        the class body are collected as methods.
        """
        bases = []
        for base in node.bases:
            text = self._render_node(base)
            if text is None:
                # Fallback for interpreters without ast.unparse.
                if isinstance(base, ast.Name):
                    text = base.id
                elif isinstance(base, ast.Attribute):
                    owner = getattr(base.value, 'id', None)
                    text = f"{owner}.{base.attr}" if owner is not None else base.attr
            if text is not None:
                bases.append(text)

        methods = [
            self._extract_python_function(item, is_method=True)
            for item in node.body
            if isinstance(item, (ast.FunctionDef, ast.AsyncFunctionDef))
        ]

        return ClassSignature(
            name=node.name,
            base_classes=bases,
            methods=methods,
            docstring=ast.get_docstring(node),
            line_number=node.lineno
        )

    def _extract_python_function(self, node, is_method: bool = False) -> "FunctionSignature":
        """
        Extract function signature from AST node.

        Parameters are listed in declaration order: positional-only,
        positional-or-keyword, ``*args`` (name prefixed with ``*``),
        keyword-only, ``**kwargs`` (name prefixed with ``**``).

        Args:
            node: An ``ast.FunctionDef`` or ``ast.AsyncFunctionDef``.
            is_method: Recorded on the result; set by the class extractor.
        """
        arguments = node.args

        # posonlyargs only exists on Python 3.8+.
        positional = list(getattr(arguments, 'posonlyargs', [])) + list(arguments.args)
        params = [self._python_parameter(arg) for arg in positional]

        # ``defaults`` covers the *tail* of the combined positional list.
        for param, default in zip(reversed(params), reversed(arguments.defaults)):
            param.default = self._render_node(default) or "..."

        if arguments.vararg is not None:
            params.append(self._python_parameter(arguments.vararg, prefix='*'))

        # kw_defaults is parallel to kwonlyargs, with None for "no default".
        for arg, default in zip(arguments.kwonlyargs, arguments.kw_defaults):
            param = self._python_parameter(arg)
            if default is not None:
                param.default = self._render_node(default) or "..."
            params.append(param)

        if arguments.kwarg is not None:
            params.append(self._python_parameter(arguments.kwarg, prefix='**'))

        return_type = self._render_node(node.returns) if node.returns else None

        decorators = []
        for decorator in node.decorator_list:
            text = self._render_node(decorator)
            if text is None and isinstance(decorator, ast.Name):
                text = decorator.id
            if text is not None:
                decorators.append(text)

        return FunctionSignature(
            name=node.name,
            parameters=params,
            return_type=return_type,
            docstring=ast.get_docstring(node),
            line_number=node.lineno,
            is_async=isinstance(node, ast.AsyncFunctionDef),
            is_method=is_method,
            decorators=decorators
        )

    # ------------------------------------------------------------------
    # JavaScript / TypeScript
    # ------------------------------------------------------------------

    def _analyze_javascript(self, content: str, file_path: str) -> Dict[str, Any]:
        """
        Analyze JavaScript/TypeScript file using regex patterns.

        Note: This is a simplified approach. For production, consider using
        a proper JS/TS parser like esprima or ts-morph.

        Returns:
            Dict with 'classes' (each with its methods) and 'functions'
            (``function`` declarations followed by arrow functions assigned
            to const/let/var).
        """
        classes = []
        functions = []

        for match in self._JS_CLASS_RE.finditer(content):
            body_start = match.end()
            body_end = self._find_js_block_end(content, body_start)
            if body_end != -1:
                methods = self._extract_js_methods(content[body_start:body_end])
            else:
                methods = []  # unbalanced braces: report the class without methods

            base_class = match.group('base')
            classes.append({
                'name': match.group('name'),
                'base_classes': [base_class] if base_class else [],
                'methods': methods,
                'docstring': None,
                'line_number': self._line_of(content, match.start())
            })

        for pattern in (self._JS_FUNCTION_RE, self._JS_ARROW_RE):
            for match in pattern.finditer(content):
                functions.append(self._signature_dict(
                    name=match.group('name'),
                    parameters=self._parse_js_parameters(match.group('params')),
                    line_number=self._line_of(content, match.start()),
                    # Test the keyword itself, not a substring of the match:
                    # ``function asyncHelper()`` is not async.
                    is_async=match.group('async') is not None,
                ))

        return {
            'classes': classes,
            'functions': functions
        }

    @staticmethod
    def _skip_js_literal(text: str, index: int) -> int:
        """
        Skip a string literal or comment that starts at ``index``.

        Returns:
            The index just past the literal/comment, or ``index`` unchanged
            when neither starts there.  Unterminated ``'``/``"`` strings end
            at the line break so a stray quote cannot swallow the whole file.
        """
        length = len(text)
        quote = text[index]
        if quote in '"\'`':
            cursor = index + 1
            while cursor < length:
                current = text[cursor]
                if current == '\\':
                    cursor += 2  # skip the escaped character
                    continue
                if current == quote:
                    return cursor + 1
                if current == '\n' and quote != '`':
                    return cursor
                cursor += 1
            return length
        if text.startswith('//', index):
            newline = text.find('\n', index)
            return length if newline == -1 else newline
        if text.startswith('/*', index):
            close = text.find('*/', index + 2)
            return length if close == -1 else close + 2
        return index

    def _find_js_block_end(self, content: str, start: int) -> int:
        """
        Find the ``}`` that closes a block whose body begins at ``start``.

        Braces inside string literals and comments are ignored.

        Returns:
            Index of the closing brace, or -1 when the block is never closed.
        """
        depth = 1
        index = start
        length = len(content)
        while index < length:
            resume = self._skip_js_literal(content, index)
            if resume > index:
                index = resume
                continue
            ch = content[index]
            if ch == '{':
                depth += 1
            elif ch == '}':
                depth -= 1
                if depth == 0:
                    return index
            index += 1
        return -1

    def _strip_nested_js_blocks(self, body: str) -> str:
        """
        Return ``body`` with the contents of every ``{...}`` block removed.

        Only the declarations at the top level of a class body remain, each
        block reduced to ``{}``.  Braces inside a parameter list (object
        defaults, destructuring) are kept verbatim.  Comments are replaced by
        a single space; string literals at the top level are preserved.
        """
        kept = []
        brace_depth = 0
        paren_depth = 0
        index = 0
        length = len(body)
        while index < length:
            resume = self._skip_js_literal(body, index)
            if resume > index:
                if brace_depth == 0:
                    is_string = body[index] in '"\'`'
                    kept.append(body[index:resume] if is_string else ' ')
                index = resume
                continue
            ch = body[index]
            if brace_depth == 0:
                if ch == '(':
                    paren_depth += 1
                elif ch == ')':
                    paren_depth = max(paren_depth - 1, 0)
                elif ch == '{' and paren_depth == 0:
                    brace_depth = 1
                kept.append(ch)
            elif ch == '{':
                brace_depth += 1
            elif ch == '}':
                brace_depth -= 1
                if brace_depth == 0:
                    kept.append(ch)
            index += 1
        return ''.join(kept)

    def _extract_js_methods(self, class_body: str) -> List[Dict]:
        """
        Extract method signatures from class body.

        Only declarations that sit directly in the class body and have a
        ``{...}`` body are reported; calls inside method bodies and bodiless
        declarations (abstract/overload signatures) are not.

        Args:
            class_body: Text between the class's opening and closing brace.

        Returns:
            One function entry per method, with 'line_number' set to None.
        """
        declarations = self._strip_nested_js_blocks(class_body)

        methods = []
        for match in self._JS_METHOD_RE.finditer(declarations):
            method_name = match.group('name')
            if method_name in self._JS_CONTROL_KEYWORDS:
                continue
            methods.append(self._signature_dict(
                name=method_name,
                parameters=self._parse_js_parameters(match.group('params')),
                is_async=match.group('async') is not None,
                is_method=True,
            ))
        return methods

    def _parse_js_parameters(self, params_str: str) -> List[Dict]:
        """
        Parse JavaScript parameter string.

        Handles ``name``, ``name = default`` and the TypeScript form
        ``name: type = default``.  Commas, ``=`` and ``:`` nested inside
        brackets (object/array defaults, destructuring) do not split.

        Returns:
            List of dicts with 'name', 'type_hint' and 'default' keys.
        """
        params = []
        for param in self._split_parameters(params_str):
            if not param:
                continue

            default = None
            name = param
            equals = self._find_top_level(param, '=')
            if equals != -1:
                name = param[:equals].strip()
                default = param[equals + 1:].strip()

            type_hint = None
            colon = self._find_top_level(name, ':')
            if colon != -1:
                type_hint = name[colon + 1:].strip()
                name = name[:colon].strip()

            params.append({
                'name': name,
                'type_hint': type_hint,
                'default': default
            })
        return params

    # ------------------------------------------------------------------
    # C / C++
    # ------------------------------------------------------------------

    def _analyze_cpp(self, content: str, file_path: str) -> Dict[str, Any]:
        """
        Analyze C/C++ header file using regex patterns.

        Note: This is a simplified approach focusing on header files.
        For production, consider using libclang or similar.

        Returns:
            Dict with 'classes' (methods are not extracted) and 'functions'
            (every ``type name(params)`` occurrence that is not a statement
            such as ``return foo(x)``).
        """
        classes = []
        functions = []

        for match in self._CPP_CLASS_RE.finditer(content):
            if match.group('enum'):
                continue  # ``enum class X {`` declares an enum, not a class
            base_class = match.group('base')
            classes.append({
                'name': match.group('name'),
                'base_classes': [base_class] if base_class else [],
                'methods': [],  # Simplified - would need to parse class body
                'docstring': None,
                'line_number': self._line_of(content, match.start())
            })

        for match in self._CPP_FUNCTION_RE.finditer(content):
            func_name = match.group('name')
            if func_name in self._CPP_CONTROL_KEYWORDS:
                continue
            if match.group('ret') in self._CPP_NON_TYPE_KEYWORDS:
                continue

            functions.append(self._signature_dict(
                name=func_name,
                parameters=self._parse_cpp_parameters(match.group('params')),
                return_type=(match.group('ret') + match.group('ptr')).strip(),
                line_number=self._line_of(content, match.start()),
            ))

        return {
            'classes': classes,
            'functions': functions
        }

    def _parse_cpp_parameters(self, params_str: str) -> List[Dict]:
        """
        Parse C++ parameter string.

        Each parameter is read as ``type name`` with an optional
        ``= default``.  Leading ``*``/``&`` on the name are moved to the
        type.  A parameter with a single token is treated as an unnamed
        parameter and gets the name "unknown".

        Returns:
            List of dicts with 'name', 'type_hint' and 'default' keys; empty
            for ``()`` and ``(void)``.
        """
        params = []
        if params_str.strip() == 'void':
            return params

        for param in self._split_parameters(params_str, angle_brackets=True):
            if not param:
                continue

            default = None
            equals = self._find_top_level(param, '=')
            if equals != -1:
                default = param[equals + 1:].strip()
                param = param[:equals].strip()

            parts = param.split()
            if len(parts) >= 2:
                param_type = ' '.join(parts[:-1])
                param_name = parts[-1]
                # ``char *s`` / ``T &ref``: the declarator belongs to the type.
                bare_name = param_name.lstrip('*&')
                if bare_name != param_name:
                    param_type += param_name[:len(param_name) - len(bare_name)]
                    param_name = bare_name or "unknown"
            else:
                param_type = param
                param_name = "unknown"

            params.append({
                'name': param_name,
                'type_hint': param_type,
                'default': default
            })
        return params
|
|
|
|
|
|
def _format_demo_parameter(param: Dict[str, Any]) -> str:
    """
    Render one parameter dict as ``name[: type][ = default]``.

    The type and default parts are omitted when the parameter has none, so an
    unannotated ``self`` prints as ``self`` rather than ``self: None``.
    """
    text = param['name']
    if param.get('type_hint'):
        text += f": {param['type_hint']}"
    if param.get('default'):
        text += f" = {param['default']}"
    return text


def main() -> None:
    """Run the analyzer on a small embedded Python sample and print a summary."""
    # Test the analyzer
    python_code = '''
class Node2D:
    """Base class for 2D nodes."""

    def move_local_x(self, delta: float, snap: bool = False) -> None:
        """Move node along local X axis."""
        pass

    async def tween_position(self, target: tuple, duration: float = 1.0):
        """Animate position to target."""
        pass

def create_sprite(texture: str) -> Node2D:
    """Create a new sprite node."""
    return Node2D()
'''

    analyzer = CodeAnalyzer(depth='deep')
    result = analyzer.analyze_file('test.py', python_code, 'Python')

    print("Analysis Result:")
    print(f"Classes: {len(result.get('classes', []))}")
    print(f"Functions: {len(result.get('functions', []))}")

    if result.get('classes'):
        cls = result['classes'][0]
        print(f"\nClass: {cls['name']}")
        print(f" Methods: {len(cls['methods'])}")
        for method in cls['methods']:
            params = ', '.join(_format_demo_parameter(p) for p in method['parameters'])
            print(f" {method['name']}({params}) -> {method['return_type']}")


if __name__ == '__main__':
    main()
|