This commit is contained in:
Pablo Estevez
2026-01-17 17:29:21 +00:00
parent c89f059712
commit 5ed767ff9a
144 changed files with 14142 additions and 16488 deletions

View File

@@ -24,65 +24,80 @@ Credits:
- pathspec for .gitignore support: https://pypi.org/project/pathspec/
"""
import argparse
import json
import logging
import os
import sys
import json
import argparse
import logging
from pathlib import Path
from typing import Dict, List, Optional, Any
from typing import Any
# Add parent directory to path for imports
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from skill_seekers.cli.code_analyzer import CodeAnalyzer
from skill_seekers.cli.api_reference_builder import APIReferenceBuilder
from skill_seekers.cli.dependency_analyzer import DependencyAnalyzer
from skill_seekers.cli.code_analyzer import CodeAnalyzer
from skill_seekers.cli.config_extractor import ConfigExtractor
from skill_seekers.cli.dependency_analyzer import DependencyAnalyzer
# Try to import pathspec for .gitignore support
try:
import pathspec
PATHSPEC_AVAILABLE = True
except ImportError:
PATHSPEC_AVAILABLE = False
# Configure logging once for the module. A second basicConfig() call would be
# a no-op because the root logger already has a handler after the first call.
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)
# Language extension mapping.
# Keys are lowercase file suffixes including the leading dot; values are the
# language names reported for files with that suffix. Note that ".h" is
# classified as C++ (not C).
LANGUAGE_EXTENSIONS = {
    ".py": "Python",
    ".js": "JavaScript",
    ".jsx": "JavaScript",
    ".ts": "TypeScript",
    ".tsx": "TypeScript",
    ".cpp": "C++",
    ".cc": "C++",
    ".cxx": "C++",
    ".h": "C++",
    ".hpp": "C++",
    ".hxx": "C++",
    ".c": "C",
    ".cs": "C#",
    ".go": "Go",
    ".rs": "Rust",
    ".java": "Java",
    ".rb": "Ruby",
    ".php": "PHP",
}
# Default directories to exclude.
# One entry per line with a trailing comma so adjacent string literals can
# never be implicitly concatenated into a single bogus name.
# NOTE(review): "*.egg-info" is a glob-style entry; confirm that
# should_exclude_dir() does pattern matching rather than exact membership.
DEFAULT_EXCLUDED_DIRS = {
    "node_modules",
    "venv",
    "__pycache__",
    ".git",
    ".svn",
    ".hg",
    "build",
    "dist",
    "target",
    ".pytest_cache",
    ".tox",
    ".mypy_cache",
    "htmlcov",
    "coverage",
    ".coverage",
    ".eggs",
    "*.egg-info",
    ".idea",
    ".vscode",
    ".vs",
    "__pypackages__",
}
@@ -97,10 +112,10 @@ def detect_language(file_path: Path) -> str:
Language name or 'Unknown'
"""
extension = file_path.suffix.lower()
return LANGUAGE_EXTENSIONS.get(extension, 'Unknown')
return LANGUAGE_EXTENSIONS.get(extension, "Unknown")
def load_gitignore(directory: Path) -> Optional[pathspec.PathSpec]:
def load_gitignore(directory: Path) -> pathspec.PathSpec | None:
"""
Load .gitignore file and create pathspec matcher.
@@ -115,14 +130,14 @@ def load_gitignore(directory: Path) -> Optional[pathspec.PathSpec]:
logger.warning("Install with: pip install pathspec")
return None
gitignore_path = directory / '.gitignore'
gitignore_path = directory / ".gitignore"
if not gitignore_path.exists():
logger.debug(f"No .gitignore found in {directory}")
return None
try:
with open(gitignore_path, 'r', encoding='utf-8') as f:
spec = pathspec.PathSpec.from_lines('gitwildmatch', f)
with open(gitignore_path, encoding="utf-8") as f:
spec = pathspec.PathSpec.from_lines("gitwildmatch", f)
logger.info(f"Loaded .gitignore from {gitignore_path}")
return spec
except Exception as e:
@@ -146,10 +161,10 @@ def should_exclude_dir(dir_name: str, excluded_dirs: set) -> bool:
def walk_directory(
root: Path,
patterns: Optional[List[str]] = None,
gitignore_spec: Optional[pathspec.PathSpec] = None,
excluded_dirs: Optional[set] = None
) -> List[Path]:
patterns: list[str] | None = None,
gitignore_spec: pathspec.PathSpec | None = None,
excluded_dirs: set | None = None,
) -> list[Path]:
"""
Walk directory tree and collect source files.
@@ -205,9 +220,9 @@ def walk_directory(
def analyze_codebase(
directory: Path,
output_dir: Path,
depth: str = 'deep',
languages: Optional[List[str]] = None,
file_patterns: Optional[List[str]] = None,
depth: str = "deep",
languages: list[str] | None = None,
file_patterns: list[str] | None = None,
build_api_reference: bool = True,
extract_comments: bool = True,
build_dependency_graph: bool = True,
@@ -216,8 +231,8 @@ def analyze_codebase(
build_how_to_guides: bool = True,
extract_config_patterns: bool = True,
enhance_with_ai: bool = True,
ai_mode: str = "auto"
) -> Dict[str, Any]:
ai_mode: str = "auto",
) -> dict[str, Any]:
"""
Analyze local codebase and extract code knowledge.
@@ -255,11 +270,7 @@ def analyze_codebase(
# Walk directory tree
logger.info("Scanning directory tree...")
files = walk_directory(
directory,
patterns=file_patterns,
gitignore_spec=gitignore_spec
)
files = walk_directory(directory, patterns=file_patterns, gitignore_spec=gitignore_spec)
logger.info(f"Found {len(files)} source files")
@@ -273,27 +284,25 @@ def analyze_codebase(
analyzer = CodeAnalyzer(depth=depth)
# Analyze each file
results = {'files': []}
results = {"files": []}
analyzed_count = 0
for file_path in files:
try:
content = file_path.read_text(encoding='utf-8', errors='ignore')
content = file_path.read_text(encoding="utf-8", errors="ignore")
language = detect_language(file_path)
if language == 'Unknown':
if language == "Unknown":
continue
# Analyze file
analysis = analyzer.analyze_file(str(file_path), content, language)
# Only include files with actual analysis results
if analysis and (analysis.get('classes') or analysis.get('functions')):
results['files'].append({
'file': str(file_path.relative_to(directory)),
'language': language,
**analysis
})
if analysis and (analysis.get("classes") or analysis.get("functions")):
results["files"].append(
{"file": str(file_path.relative_to(directory)), "language": language, **analysis}
)
analyzed_count += 1
if analyzed_count % 10 == 0:
@@ -306,17 +315,17 @@ def analyze_codebase(
logger.info(f"✅ Successfully analyzed {analyzed_count} files")
# Save results
output_json = output_dir / 'code_analysis.json'
with open(output_json, 'w', encoding='utf-8') as f:
output_json = output_dir / "code_analysis.json"
with open(output_json, "w", encoding="utf-8") as f:
json.dump(results, f, indent=2)
logger.info(f"📁 Saved analysis to: {output_json}")
# Build API reference if requested
if build_api_reference and results['files']:
if build_api_reference and results["files"]:
logger.info("Building API reference documentation...")
builder = APIReferenceBuilder(results)
api_output_dir = output_dir / 'api_reference'
api_output_dir = output_dir / "api_reference"
generated_files = builder.build_reference(api_output_dir)
logger.info(f"✅ Generated {len(generated_files)} API reference files")
logger.info(f"📁 API reference: {api_output_dir}")
@@ -329,10 +338,10 @@ def analyze_codebase(
# Analyze dependencies for all files
for file_path in files:
try:
content = file_path.read_text(encoding='utf-8', errors='ignore')
content = file_path.read_text(encoding="utf-8", errors="ignore")
language = detect_language(file_path)
if language != 'Unknown':
if language != "Unknown":
# Use relative path from directory for better graph readability
rel_path = str(file_path.relative_to(directory))
dep_analyzer.analyze_file(rel_path, content, language)
@@ -348,7 +357,7 @@ def analyze_codebase(
if cycles:
logger.warning(f"⚠️ Found {len(cycles)} circular dependencies:")
for i, cycle in enumerate(cycles[:5], 1): # Show first 5
cycle_str = ''.join(cycle) + f"{cycle[0]}"
cycle_str = "".join(cycle) + f"{cycle[0]}"
logger.warning(f" {i}. {cycle_str}")
if len(cycles) > 5:
logger.warning(f" ... and {len(cycles) - 5} more")
@@ -356,32 +365,34 @@ def analyze_codebase(
logger.info("✅ No circular dependencies found")
# Save dependency graph data
dep_output_dir = output_dir / 'dependencies'
dep_output_dir = output_dir / "dependencies"
dep_output_dir.mkdir(parents=True, exist_ok=True)
# Export as JSON
dep_json = dep_output_dir / 'dependency_graph.json'
with open(dep_json, 'w', encoding='utf-8') as f:
dep_json = dep_output_dir / "dependency_graph.json"
with open(dep_json, "w", encoding="utf-8") as f:
json.dump(dep_analyzer.export_json(), f, indent=2)
logger.info(f"📁 Saved dependency graph: {dep_json}")
# Export as Mermaid diagram
mermaid_file = dep_output_dir / 'dependency_graph.mmd'
mermaid_file = dep_output_dir / "dependency_graph.mmd"
mermaid_file.write_text(dep_analyzer.export_mermaid())
logger.info(f"📁 Saved Mermaid diagram: {mermaid_file}")
# Save statistics
stats = dep_analyzer.get_statistics()
stats_file = dep_output_dir / 'statistics.json'
with open(stats_file, 'w', encoding='utf-8') as f:
stats_file = dep_output_dir / "statistics.json"
with open(stats_file, "w", encoding="utf-8") as f:
json.dump(stats, f, indent=2)
logger.info(f"📊 Statistics: {stats['total_files']} files, "
f"{stats['total_dependencies']} dependencies, "
f"{stats['circular_dependencies']} cycles")
logger.info(
f"📊 Statistics: {stats['total_files']} files, "
f"{stats['total_dependencies']} dependencies, "
f"{stats['circular_dependencies']} cycles"
)
# Try to export as DOT (requires pydot)
try:
dot_file = dep_output_dir / 'dependency_graph.dot'
dot_file = dep_output_dir / "dependency_graph.dot"
dep_analyzer.export_dot(str(dot_file))
except:
pass # pydot not installed, skip DOT export
@@ -396,13 +407,11 @@ def analyze_codebase(
for file_path in files:
try:
content = file_path.read_text(encoding='utf-8', errors='ignore')
content = file_path.read_text(encoding="utf-8", errors="ignore")
language = detect_language(file_path)
if language != 'Unknown':
report = pattern_recognizer.analyze_file(
str(file_path), content, language
)
if language != "Unknown":
report = pattern_recognizer.analyze_file(str(file_path), content, language)
if report.patterns:
pattern_results.append(report.to_dict())
@@ -412,14 +421,14 @@ def analyze_codebase(
# Save pattern results
if pattern_results:
pattern_output = output_dir / 'patterns'
pattern_output = output_dir / "patterns"
pattern_output.mkdir(parents=True, exist_ok=True)
pattern_json = pattern_output / 'detected_patterns.json'
with open(pattern_json, 'w', encoding='utf-8') as f:
pattern_json = pattern_output / "detected_patterns.json"
with open(pattern_json, "w", encoding="utf-8") as f:
json.dump(pattern_results, f, indent=2)
total_patterns = sum(len(r['patterns']) for r in pattern_results)
total_patterns = sum(len(r["patterns"]) for r in pattern_results)
logger.info(f"✅ Detected {total_patterns} patterns in {len(pattern_results)} files")
logger.info(f"📁 Saved to: {pattern_json}")
else:
@@ -432,35 +441,31 @@ def analyze_codebase(
# Create extractor
test_extractor = TestExampleExtractor(
min_confidence=0.5,
max_per_file=10,
languages=languages,
enhance_with_ai=enhance_with_ai
min_confidence=0.5, max_per_file=10, languages=languages, enhance_with_ai=enhance_with_ai
)
# Extract examples from directory
try:
example_report = test_extractor.extract_from_directory(
directory,
recursive=True
)
example_report = test_extractor.extract_from_directory(directory, recursive=True)
if example_report.total_examples > 0:
# Save results
examples_output = output_dir / 'test_examples'
examples_output = output_dir / "test_examples"
examples_output.mkdir(parents=True, exist_ok=True)
# Save as JSON
examples_json = examples_output / 'test_examples.json'
with open(examples_json, 'w', encoding='utf-8') as f:
examples_json = examples_output / "test_examples.json"
with open(examples_json, "w", encoding="utf-8") as f:
json.dump(example_report.to_dict(), f, indent=2)
# Save as Markdown
examples_md = examples_output / 'test_examples.md'
examples_md.write_text(example_report.to_markdown(), encoding='utf-8')
examples_md = examples_output / "test_examples.md"
examples_md.write_text(example_report.to_markdown(), encoding="utf-8")
logger.info(f"✅ Extracted {example_report.total_examples} test examples "
f"({example_report.high_value_count} high-value)")
logger.info(
f"✅ Extracted {example_report.total_examples} test examples "
f"({example_report.high_value_count} high-value)"
)
logger.info(f"📁 Saved to: {examples_output}")
else:
logger.info("No test examples extracted")
@@ -479,25 +484,25 @@ def analyze_codebase(
guide_builder = HowToGuideBuilder(enhance_with_ai=enhance_with_ai)
# Build guides from workflow examples
tutorials_dir = output_dir / 'tutorials'
tutorials_dir = output_dir / "tutorials"
# Get workflow examples from the example_report if available
if 'example_report' in locals() and example_report and example_report.total_examples > 0:
if "example_report" in locals() and example_report and example_report.total_examples > 0:
# Convert example_report to list of dicts for processing
examples_list = example_report.to_dict().get('examples', [])
examples_list = example_report.to_dict().get("examples", [])
guide_collection = guide_builder.build_guides_from_examples(
examples_list,
grouping_strategy='ai-tutorial-group',
grouping_strategy="ai-tutorial-group",
output_dir=tutorials_dir,
enhance_with_ai=enhance_with_ai,
ai_mode=ai_mode
ai_mode=ai_mode,
)
if guide_collection and guide_collection.total_guides > 0:
# Save collection summary
collection_json = tutorials_dir / 'guide_collection.json'
with open(collection_json, 'w', encoding='utf-8') as f:
collection_json = tutorials_dir / "guide_collection.json"
with open(collection_json, "w", encoding="utf-8") as f:
json.dump(guide_collection.to_dict(), f, indent=2)
logger.info(f"✅ Built {guide_collection.total_guides} how-to guides")
@@ -524,9 +529,10 @@ def analyze_codebase(
result_dict = config_extractor.to_dict(extraction_result)
# AI Enhancement (if enabled)
if enhance_with_ai and ai_mode != 'none':
if enhance_with_ai and ai_mode != "none":
try:
from skill_seekers.cli.config_enhancer import ConfigEnhancer
logger.info(f"🤖 Enhancing config analysis with AI (mode: {ai_mode})...")
enhancer = ConfigEnhancer(mode=ai_mode)
result_dict = enhancer.enhance_config_result(result_dict)
@@ -535,28 +541,30 @@ def analyze_codebase(
logger.warning(f"⚠️ Config AI enhancement failed: {e}")
# Save results
config_output = output_dir / 'config_patterns'
config_output = output_dir / "config_patterns"
config_output.mkdir(parents=True, exist_ok=True)
# Save as JSON
config_json = config_output / 'config_patterns.json'
with open(config_json, 'w', encoding='utf-8') as f:
config_json = config_output / "config_patterns.json"
with open(config_json, "w", encoding="utf-8") as f:
json.dump(result_dict, f, indent=2)
# Save as Markdown (basic - AI enhancements in JSON only for now)
config_md = config_output / 'config_patterns.md'
config_md.write_text(extraction_result.to_markdown(), encoding='utf-8')
config_md = config_output / "config_patterns.md"
config_md.write_text(extraction_result.to_markdown(), encoding="utf-8")
# Count total settings across all files
total_settings = sum(len(cf.settings) for cf in extraction_result.config_files)
total_patterns = sum(len(cf.patterns) for cf in extraction_result.config_files)
logger.info(f"✅ Extracted {len(extraction_result.config_files)} config files "
f"with {total_settings} settings and {total_patterns} detected patterns")
logger.info(
f"✅ Extracted {len(extraction_result.config_files)} config files "
f"with {total_settings} settings and {total_patterns} detected patterns"
)
if 'ai_enhancements' in result_dict:
insights = result_dict['ai_enhancements'].get('overall_insights', {})
if insights.get('security_issues_found'):
if "ai_enhancements" in result_dict:
insights = result_dict["ai_enhancements"].get("overall_insights", {})
if insights.get("security_issues_found"):
logger.info(f"🔐 Security issues found: {insights['security_issues_found']}")
logger.info(f"📁 Saved to: {config_output}")
@@ -572,15 +580,15 @@ def analyze_codebase(
from skill_seekers.cli.architectural_pattern_detector import ArchitecturalPatternDetector
arch_detector = ArchitecturalPatternDetector(enhance_with_ai=enhance_with_ai)
arch_report = arch_detector.analyze(directory, results['files'])
arch_report = arch_detector.analyze(directory, results["files"])
if arch_report.patterns:
arch_output = output_dir / 'architecture'
arch_output = output_dir / "architecture"
arch_output.mkdir(parents=True, exist_ok=True)
# Save as JSON
arch_json = arch_output / 'architectural_patterns.json'
with open(arch_json, 'w', encoding='utf-8') as f:
arch_json = arch_output / "architectural_patterns.json"
with open(arch_json, "w", encoding="utf-8") as f:
json.dump(arch_report.to_dict(), f, indent=2)
logger.info(f"🏗️ Detected {len(arch_report.patterns)} architectural patterns")
@@ -601,7 +609,7 @@ def analyze_codebase(
build_dependency_graph=build_dependency_graph,
detect_patterns=detect_patterns,
extract_test_examples=extract_test_examples,
extract_config_patterns=extract_config_patterns
extract_config_patterns=extract_config_patterns,
)
return results
@@ -610,13 +618,13 @@ def analyze_codebase(
def _generate_skill_md(
output_dir: Path,
directory: Path,
results: Dict[str, Any],
results: dict[str, Any],
depth: str,
build_api_reference: bool,
build_dependency_graph: bool,
detect_patterns: bool,
extract_test_examples: bool,
extract_config_patterns: bool
extract_config_patterns: bool,
):
"""
Generate rich SKILL.md from codebase analysis results.
@@ -635,14 +643,14 @@ def _generate_skill_md(
repo_name = directory.name
# Generate skill name (lowercase, hyphens only, max 64 chars)
skill_name = repo_name.lower().replace('_', '-').replace(' ', '-')[:64]
skill_name = repo_name.lower().replace("_", "-").replace(" ", "-")[:64]
# Generate description
description = f"Local codebase analysis for {repo_name}"
# Count files by language
language_stats = _get_language_stats(results.get('files', []))
total_files = len(results.get('files', []))
language_stats = _get_language_stats(results.get("files", []))
total_files = len(results.get("files", []))
# Start building content
skill_content = f"""---
@@ -658,7 +666,7 @@ Local codebase analysis and documentation generated from code analysis.
**Path:** `{directory}`
**Files Analyzed:** {total_files}
**Languages:** {', '.join(language_stats.keys())}
**Languages:** {", ".join(language_stats.keys())}
**Analysis Depth:** {depth}
## When to Use This Skill
@@ -732,22 +740,22 @@ Use this skill when you need to:
skill_content += "This skill includes detailed reference documentation:\n\n"
refs_added = False
if build_api_reference and (output_dir / 'api_reference').exists():
if build_api_reference and (output_dir / "api_reference").exists():
skill_content += "- **API Reference**: `references/api_reference/` - Complete API documentation\n"
refs_added = True
if build_dependency_graph and (output_dir / 'dependencies').exists():
if build_dependency_graph and (output_dir / "dependencies").exists():
skill_content += "- **Dependencies**: `references/dependencies/` - Dependency graph and analysis\n"
refs_added = True
if detect_patterns and (output_dir / 'patterns').exists():
if detect_patterns and (output_dir / "patterns").exists():
skill_content += "- **Patterns**: `references/patterns/` - Detected design patterns\n"
refs_added = True
if extract_test_examples and (output_dir / 'test_examples').exists():
if extract_test_examples and (output_dir / "test_examples").exists():
skill_content += "- **Examples**: `references/test_examples/` - Usage examples from tests\n"
refs_added = True
if extract_config_patterns and (output_dir / 'config_patterns').exists():
if extract_config_patterns and (output_dir / "config_patterns").exists():
skill_content += "- **Configuration**: `references/config_patterns/` - Configuration patterns\n"
refs_added = True
if (output_dir / 'architecture').exists():
if (output_dir / "architecture").exists():
skill_content += "- **Architecture**: `references/architecture/` - Architectural patterns\n"
refs_added = True
@@ -762,34 +770,34 @@ Use this skill when you need to:
# Write SKILL.md
skill_path = output_dir / "SKILL.md"
skill_path.write_text(skill_content, encoding='utf-8')
skill_path.write_text(skill_content, encoding="utf-8")
line_count = len(skill_content.split('\n'))
line_count = len(skill_content.split("\n"))
logger.info(f"✅ Generated SKILL.md: {skill_path} ({line_count} lines)")
# Generate references/ directory structure
_generate_references(output_dir)
def _get_language_stats(files: list[dict]) -> dict[str, int]:
    """Count files by language from analysis results.

    Args:
        files: Per-file analysis result dicts. Only the ``language`` key is
            read; an entry without it is treated as ``"Unknown"``.

    Returns:
        Mapping of language name to file count, in first-seen order. Files
        whose language is ``"Unknown"`` are left out.
    """
    stats: dict[str, int] = {}
    for file_data in files:
        lang = file_data.get("language", "Unknown")
        if lang != "Unknown":
            stats[lang] = stats.get(lang, 0) + 1
    return stats
def _format_patterns_section(output_dir: Path) -> str:
"""Format design patterns section from patterns/detected_patterns.json."""
patterns_file = output_dir / 'patterns' / 'detected_patterns.json'
patterns_file = output_dir / "patterns" / "detected_patterns.json"
if not patterns_file.exists():
return ""
try:
with open(patterns_file, 'r', encoding='utf-8') as f:
with open(patterns_file, encoding="utf-8") as f:
patterns_data = json.load(f)
except Exception:
return ""
@@ -802,10 +810,10 @@ def _format_patterns_section(output_dir: Path) -> str:
by_class = {}
for pattern_file in patterns_data:
for pattern in pattern_file.get('patterns', []):
ptype = pattern.get('pattern_type', 'Unknown')
cls = pattern.get('class_name', '')
confidence = pattern.get('confidence', 0)
for pattern in pattern_file.get("patterns", []):
ptype = pattern.get("pattern_type", "Unknown")
cls = pattern.get("class_name", "")
confidence = pattern.get("confidence", 0)
# Skip low confidence
if confidence < 0.7:
@@ -813,7 +821,7 @@ def _format_patterns_section(output_dir: Path) -> str:
# Deduplicate by class
key = f"{cls}:{ptype}"
if key not in by_class or by_class[key]['confidence'] < confidence:
if key not in by_class or by_class[key]["confidence"] < confidence:
by_class[key] = pattern
# Count by type
@@ -836,22 +844,22 @@ def _format_patterns_section(output_dir: Path) -> str:
def _format_examples_section(output_dir: Path) -> str:
"""Format code examples section from test_examples/test_examples.json."""
examples_file = output_dir / 'test_examples' / 'test_examples.json'
examples_file = output_dir / "test_examples" / "test_examples.json"
if not examples_file.exists():
return ""
try:
with open(examples_file, 'r', encoding='utf-8') as f:
with open(examples_file, encoding="utf-8") as f:
examples_data = json.load(f)
except Exception:
return ""
examples = examples_data.get('examples', [])
examples = examples_data.get("examples", [])
if not examples:
return ""
# Filter high-value examples (complexity > 0.7)
high_value = [ex for ex in examples if ex.get('complexity_score', 0) > 0.7]
high_value = [ex for ex in examples if ex.get("complexity_score", 0) > 0.7]
if not high_value:
# If no high complexity, take any examples
@@ -864,11 +872,11 @@ def _format_examples_section(output_dir: Path) -> str:
content += "*High-quality examples extracted from test files (C3.2)*\n\n"
# Top 10 examples
for ex in sorted(high_value, key=lambda x: x.get('complexity_score', 0), reverse=True)[:10]:
desc = ex.get('description', 'Example')
lang = ex.get('language', 'python').lower()
code = ex.get('code', '')
complexity = ex.get('complexity_score', 0)
for ex in sorted(high_value, key=lambda x: x.get("complexity_score", 0), reverse=True)[:10]:
desc = ex.get("description", "Example")
lang = ex.get("language", "python").lower()
code = ex.get("code", "")
complexity = ex.get("complexity_score", 0)
content += f"**{desc}** (complexity: {complexity:.2f})\n\n"
content += f"```{lang}\n{code}\n```\n\n"
@@ -879,16 +887,16 @@ def _format_examples_section(output_dir: Path) -> str:
def _format_api_section(output_dir: Path) -> str:
"""Format API reference section."""
api_dir = output_dir / 'api_reference'
api_dir = output_dir / "api_reference"
if not api_dir.exists():
return ""
api_md = api_dir / 'api_reference.md'
api_md = api_dir / "api_reference.md"
if not api_md.exists():
return ""
try:
api_content = api_md.read_text(encoding='utf-8')
api_content = api_md.read_text(encoding="utf-8")
except Exception:
return ""
@@ -906,17 +914,17 @@ def _format_api_section(output_dir: Path) -> str:
def _format_architecture_section(output_dir: Path) -> str:
"""Format architecture section from architecture/architectural_patterns.json."""
arch_file = output_dir / 'architecture' / 'architectural_patterns.json'
arch_file = output_dir / "architecture" / "architectural_patterns.json"
if not arch_file.exists():
return ""
try:
with open(arch_file, 'r', encoding='utf-8') as f:
with open(arch_file, encoding="utf-8") as f:
arch_data = json.load(f)
except Exception:
return ""
patterns = arch_data.get('patterns', [])
patterns = arch_data.get("patterns", [])
if not patterns:
return ""
@@ -925,9 +933,9 @@ def _format_architecture_section(output_dir: Path) -> str:
content += "**Detected Architectural Patterns:**\n\n"
for pattern in patterns[:5]:
name = pattern.get('pattern_name', 'Unknown')
confidence = pattern.get('confidence', 0)
indicators = pattern.get('indicators', [])
name = pattern.get("pattern_name", "Unknown")
confidence = pattern.get("confidence", 0)
indicators = pattern.get("indicators", [])
content += f"- **{name}** (confidence: {confidence:.2f})\n"
if indicators:
@@ -940,22 +948,22 @@ def _format_architecture_section(output_dir: Path) -> str:
def _format_config_section(output_dir: Path) -> str:
"""Format configuration patterns section."""
config_file = output_dir / 'config_patterns' / 'config_patterns.json'
config_file = output_dir / "config_patterns" / "config_patterns.json"
if not config_file.exists():
return ""
try:
with open(config_file, 'r', encoding='utf-8') as f:
with open(config_file, encoding="utf-8") as f:
config_data = json.load(f)
except Exception:
return ""
config_files = config_data.get('config_files', [])
config_files = config_data.get("config_files", [])
if not config_files:
return ""
total_settings = sum(len(cf.get('settings', [])) for cf in config_files)
total_patterns = sum(len(cf.get('patterns', [])) for cf in config_files)
total_settings = sum(len(cf.get("settings", [])) for cf in config_files)
total_patterns = sum(len(cf.get("patterns", [])) for cf in config_files)
content = "## ⚙️ Configuration Patterns\n\n"
content += "*From C3.4 configuration analysis*\n\n"
@@ -966,7 +974,7 @@ def _format_config_section(output_dir: Path) -> str:
# List config file types found
file_types = {}
for cf in config_files:
ctype = cf.get('config_type', 'unknown')
ctype = cf.get("config_type", "unknown")
file_types[ctype] = file_types.get(ctype, 0) + 1
if file_types:
@@ -985,18 +993,18 @@ def _generate_references(output_dir: Path):
Creates a clean references/ directory that links to all analysis outputs.
"""
references_dir = output_dir / 'references'
references_dir = output_dir / "references"
references_dir.mkdir(exist_ok=True)
# Map analysis directories to reference names
mappings = {
'api_reference': 'api_reference',
'dependencies': 'dependencies',
'patterns': 'patterns',
'test_examples': 'test_examples',
'tutorials': 'tutorials',
'config_patterns': 'config_patterns',
'architecture': 'architecture'
"api_reference": "api_reference",
"dependencies": "dependencies",
"patterns": "patterns",
"test_examples": "test_examples",
"tutorials": "tutorials",
"config_patterns": "config_patterns",
"architecture": "architecture",
}
for source, target in mappings.items():
@@ -1007,9 +1015,11 @@ def _generate_references(output_dir: Path):
# Copy directory to references/ (not symlink, for portability)
if target_dir.exists():
import shutil
shutil.rmtree(target_dir)
import shutil
shutil.copytree(source_dir, target_dir)
logger.debug(f"Copied {source} → references/{target}")
@@ -1019,7 +1029,7 @@ def _generate_references(output_dir: Path):
def main():
"""Command-line interface for codebase analysis."""
parser = argparse.ArgumentParser(
description='Analyze local codebases and extract code knowledge',
description="Analyze local codebases and extract code knowledge",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
Examples:
@@ -1043,101 +1053,78 @@ Examples:
# Skip specific features
codebase-scraper --directory . --skip-patterns --skip-test-examples
"""
""",
)
parser.add_argument("--directory", required=True, help="Directory to analyze")
parser.add_argument("--output", default="output/codebase/", help="Output directory (default: output/codebase/)")
parser.add_argument(
'--directory',
required=True,
help='Directory to analyze'
"--depth", choices=["surface", "deep", "full"], default="deep", help="Analysis depth (default: deep)"
)
parser.add_argument("--languages", help="Comma-separated languages to analyze (e.g., Python,JavaScript,C++)")
parser.add_argument("--file-patterns", help="Comma-separated file patterns (e.g., *.py,src/**/*.js)")
parser.add_argument(
'--output',
default='output/codebase/',
help='Output directory (default: output/codebase/)'
)
parser.add_argument(
'--depth',
choices=['surface', 'deep', 'full'],
default='deep',
help='Analysis depth (default: deep)'
)
parser.add_argument(
'--languages',
help='Comma-separated languages to analyze (e.g., Python,JavaScript,C++)'
)
parser.add_argument(
'--file-patterns',
help='Comma-separated file patterns (e.g., *.py,src/**/*.js)'
)
parser.add_argument(
'--skip-api-reference',
action='store_true',
"--skip-api-reference",
action="store_true",
default=False,
help='Skip API reference markdown documentation generation (default: enabled)'
help="Skip API reference markdown documentation generation (default: enabled)",
)
parser.add_argument(
'--skip-dependency-graph',
action='store_true',
"--skip-dependency-graph",
action="store_true",
default=False,
help='Skip dependency graph and circular dependency detection (default: enabled)'
help="Skip dependency graph and circular dependency detection (default: enabled)",
)
parser.add_argument(
'--skip-patterns',
action='store_true',
"--skip-patterns",
action="store_true",
default=False,
help='Skip design pattern detection (Singleton, Factory, Observer, etc.) (default: enabled)'
help="Skip design pattern detection (Singleton, Factory, Observer, etc.) (default: enabled)",
)
parser.add_argument(
'--skip-test-examples',
action='store_true',
"--skip-test-examples",
action="store_true",
default=False,
help='Skip test example extraction (instantiation, method calls, configs, etc.) (default: enabled)'
help="Skip test example extraction (instantiation, method calls, configs, etc.) (default: enabled)",
)
parser.add_argument(
'--skip-how-to-guides',
action='store_true',
"--skip-how-to-guides",
action="store_true",
default=False,
help='Skip how-to guide generation from workflow examples (default: enabled)'
help="Skip how-to guide generation from workflow examples (default: enabled)",
)
parser.add_argument(
'--skip-config-patterns',
action='store_true',
"--skip-config-patterns",
action="store_true",
default=False,
help='Skip configuration pattern extraction from config files (JSON, YAML, TOML, ENV, etc.) (default: enabled)'
help="Skip configuration pattern extraction from config files (JSON, YAML, TOML, ENV, etc.) (default: enabled)",
)
parser.add_argument(
'--ai-mode',
choices=['auto', 'api', 'local', 'none'],
default='auto',
help='AI enhancement mode for how-to guides: auto (detect best), api (Claude API), local (Claude Code CLI), none (disable) (default: auto)'
)
parser.add_argument(
'--no-comments',
action='store_true',
help='Skip comment extraction'
)
parser.add_argument(
'--verbose',
action='store_true',
help='Enable verbose logging'
"--ai-mode",
choices=["auto", "api", "local", "none"],
default="auto",
help="AI enhancement mode for how-to guides: auto (detect best), api (Claude API), local (Claude Code CLI), none (disable) (default: auto)",
)
parser.add_argument("--no-comments", action="store_true", help="Skip comment extraction")
parser.add_argument("--verbose", action="store_true", help="Enable verbose logging")
# Check for deprecated flags
deprecated_flags = {
'--build-api-reference': '--skip-api-reference',
'--build-dependency-graph': '--skip-dependency-graph',
'--detect-patterns': '--skip-patterns',
'--extract-test-examples': '--skip-test-examples',
'--build-how-to-guides': '--skip-how-to-guides',
'--extract-config-patterns': '--skip-config-patterns'
"--build-api-reference": "--skip-api-reference",
"--build-dependency-graph": "--skip-dependency-graph",
"--detect-patterns": "--skip-patterns",
"--extract-test-examples": "--skip-test-examples",
"--build-how-to-guides": "--skip-how-to-guides",
"--extract-config-patterns": "--skip-config-patterns",
}
for old_flag, new_flag in deprecated_flags.items():
if old_flag in sys.argv:
logger.warning(f"⚠️ DEPRECATED: {old_flag} is deprecated. "
f"All features are now enabled by default. "
f"Use {new_flag} to disable this feature.")
logger.warning(
f"⚠️ DEPRECATED: {old_flag} is deprecated. "
f"All features are now enabled by default. "
f"Use {new_flag} to disable this feature."
)
args = parser.parse_args()
@@ -1158,12 +1145,12 @@ Examples:
# Parse languages
languages = None
if args.languages:
languages = [lang.strip() for lang in args.languages.split(',')]
languages = [lang.strip() for lang in args.languages.split(",")]
# Parse file patterns
file_patterns = None
if args.file_patterns:
file_patterns = [p.strip() for p in args.file_patterns.split(',')]
file_patterns = [p.strip() for p in args.file_patterns.split(",")]
# Analyze codebase
try:
@@ -1181,18 +1168,18 @@ Examples:
build_how_to_guides=not args.skip_how_to_guides,
extract_config_patterns=not args.skip_config_patterns,
enhance_with_ai=True, # Auto-disables if no API key present
ai_mode=args.ai_mode # NEW: AI enhancement mode for how-to guides
ai_mode=args.ai_mode, # NEW: AI enhancement mode for how-to guides
)
# Print summary
print(f"\n{'='*60}")
print(f"CODEBASE ANALYSIS COMPLETE")
print(f"{'='*60}")
print(f"\n{'=' * 60}")
print("CODEBASE ANALYSIS COMPLETE")
print(f"{'=' * 60}")
print(f"Files analyzed: {len(results['files'])}")
print(f"Output directory: {args.output}")
if args.build_api_reference:
print(f"API reference: {Path(args.output) / 'api_reference'}")
print(f"{'='*60}\n")
print(f"{'=' * 60}\n")
return 0
@@ -1202,9 +1189,10 @@ Examples:
except Exception as e:
logger.error(f"Analysis failed: {e}")
import traceback
traceback.print_exc()
return 1
# Script entry point: propagate main()'s integer return value as the process
# exit status.
if __name__ == "__main__":
    sys.exit(main())