Files
skill-seekers-reference/src/skill_seekers/cli/codebase_scraper.py
2026-01-17 17:48:15 +00:00

1227 lines
42 KiB
Python

#!/usr/bin/env python3
"""
Codebase Scraper CLI Tool
Standalone tool for analyzing local codebases without GitHub API.
Extracts code signatures, comments, and optionally generates API documentation.
Usage:
codebase-scraper --directory /path/to/repo --output output/codebase/
codebase-scraper --directory . --depth deep --languages Python,JavaScript
codebase-scraper --directory /path/to/repo --build-api-reference
Features:
- File tree walking with .gitignore support
- Multi-language code analysis (9 languages: Python, JavaScript/TypeScript, C/C++, C#, Go, Rust, Java, Ruby, PHP)
- API reference generation
- Comment extraction
- Dependency graph analysis
- Configurable depth levels
Credits:
- Language parsing patterns inspired by official language specifications
- NetworkX for dependency graph analysis: https://networkx.org/
- pathspec for .gitignore support: https://pypi.org/project/pathspec/
"""
from __future__ import annotations

import argparse
import fnmatch
import json
import logging
import os
import shutil
import sys
from pathlib import Path
from typing import Any
# Add parent directory to path for imports
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from skill_seekers.cli.api_reference_builder import APIReferenceBuilder
from skill_seekers.cli.code_analyzer import CodeAnalyzer
from skill_seekers.cli.config_extractor import ConfigExtractor
from skill_seekers.cli.dependency_analyzer import DependencyAnalyzer
# Try to import pathspec for .gitignore support
try:
import pathspec
PATHSPEC_AVAILABLE = True
except ImportError:
PATHSPEC_AVAILABLE = False
# Configure logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)
# Language extension mapping
LANGUAGE_EXTENSIONS = {
".py": "Python",
".js": "JavaScript",
".jsx": "JavaScript",
".ts": "TypeScript",
".tsx": "TypeScript",
".cpp": "C++",
".cc": "C++",
".cxx": "C++",
".h": "C++",
".hpp": "C++",
".hxx": "C++",
".c": "C",
".cs": "C#",
".go": "Go",
".rs": "Rust",
".java": "Java",
".rb": "Ruby",
".php": "PHP",
}
# Default directories to exclude
DEFAULT_EXCLUDED_DIRS = {
"node_modules",
"venv",
"__pycache__",
".git",
".svn",
".hg",
"build",
"dist",
"target",
".pytest_cache",
".tox",
".mypy_cache",
"htmlcov",
"coverage",
".coverage",
".eggs",
"*.egg-info",
".idea",
".vscode",
".vs",
"__pypackages__",
}
def detect_language(file_path: Path) -> str:
    """
    Detect programming language from file extension.

    Args:
        file_path: Path to source file

    Returns:
        Language name or 'Unknown' for unrecognized extensions
    """
    suffix = file_path.suffix.lower()
    if suffix in LANGUAGE_EXTENSIONS:
        return LANGUAGE_EXTENSIONS[suffix]
    return "Unknown"
def load_gitignore(directory: Path) -> pathspec.PathSpec | None:
    """
    Load .gitignore file and create pathspec matcher.

    Args:
        directory: Root directory to search for .gitignore

    Returns:
        PathSpec object if .gitignore found, None otherwise
    """
    # Without pathspec installed we cannot honor .gitignore at all.
    if not PATHSPEC_AVAILABLE:
        logger.warning("pathspec not installed - .gitignore support disabled")
        logger.warning("Install with: pip install pathspec")
        return None

    gitignore_path = directory / ".gitignore"
    if not gitignore_path.exists():
        logger.debug(f"No .gitignore found in {directory}")
        return None

    try:
        with open(gitignore_path, encoding="utf-8") as handle:
            matcher = pathspec.PathSpec.from_lines("gitwildmatch", handle)
        logger.info(f"Loaded .gitignore from {gitignore_path}")
        return matcher
    except Exception as e:
        # A malformed .gitignore is non-fatal: just proceed without it.
        logger.warning(f"Failed to load .gitignore: {e}")
        return None
def should_exclude_dir(dir_name: str, excluded_dirs: set) -> bool:
    """
    Check if directory should be excluded from analysis.

    Supports exact names (e.g. 'node_modules') as well as glob patterns
    (e.g. '*.egg-info'). DEFAULT_EXCLUDED_DIRS contains the glob entry
    '*.egg-info', which a plain membership test could never match, so
    egg-info directories were previously walked despite being listed.

    Args:
        dir_name: Directory name (a single path component, not a path)
        excluded_dirs: Set of directory names and/or glob patterns to exclude

    Returns:
        True if directory should be excluded
    """
    # Fast path: exact name match (covers the vast majority of entries).
    if dir_name in excluded_dirs:
        return True
    # Fall back to glob matching so entries like '*.egg-info' take effect.
    return any(fnmatch.fnmatch(dir_name, pattern) for pattern in excluded_dirs)
def walk_directory(
    root: Path,
    patterns: list[str] | None = None,
    gitignore_spec: pathspec.PathSpec | None = None,
    excluded_dirs: set | None = None,
) -> list[Path]:
    """
    Walk directory tree and collect source files.

    Args:
        root: Root directory to walk
        patterns: Optional file patterns to include (e.g., ['*.py', '*.js'])
        gitignore_spec: Optional PathSpec object for .gitignore rules
        excluded_dirs: Set of directory names to exclude

    Returns:
        Sorted list of source file paths
    """
    active_excludes = DEFAULT_EXCLUDED_DIRS if excluded_dirs is None else excluded_dirs
    root = Path(root).resolve()
    collected: list[Path] = []

    for dirpath, dirnames, filenames in os.walk(root):
        # Prune excluded directories in place so os.walk never descends
        # into them (os.walk honors in-place mutation of dirnames).
        dirnames[:] = [d for d in dirnames if not should_exclude_dir(d, active_excludes)]
        base = Path(dirpath)

        for filename in filenames:
            candidate = base / filename

            # Honor .gitignore rules when a spec was supplied.
            if gitignore_spec:
                try:
                    rel_path = candidate.relative_to(root)
                except ValueError:
                    # File is outside root, skip it
                    continue
                if gitignore_spec.match_file(str(rel_path)):
                    logger.debug(f"Skipping (gitignore): {rel_path}")
                    continue

            # Keep only files with a recognized source-code extension.
            if candidate.suffix.lower() not in LANGUAGE_EXTENSIONS:
                continue

            # Apply explicit include patterns, if any were given.
            if patterns and not any(candidate.match(p) for p in patterns):
                continue

            collected.append(candidate)

    return sorted(collected)
def analyze_codebase(
    directory: Path,
    output_dir: Path,
    depth: str = "deep",
    languages: list[str] | None = None,
    file_patterns: list[str] | None = None,
    build_api_reference: bool = True,
    extract_comments: bool = True,
    build_dependency_graph: bool = True,
    detect_patterns: bool = True,
    extract_test_examples: bool = True,
    build_how_to_guides: bool = True,
    extract_config_patterns: bool = True,
    enhance_with_ai: bool = True,
    ai_mode: str = "auto",
) -> dict[str, Any]:
    """
    Analyze local codebase and extract code knowledge.

    Args:
        directory: Directory to analyze
        output_dir: Output directory for results
        depth: Analysis depth (surface, deep, full)
        languages: Optional list of languages to analyze
        file_patterns: Optional file patterns to include
        build_api_reference: Generate API reference markdown
        extract_comments: Extract inline comments
            (NOTE(review): accepted for interface compatibility but not
            consulted anywhere in this function — presumably handled
            inside CodeAnalyzer; confirm)
        build_dependency_graph: Generate dependency graph and detect circular dependencies
        detect_patterns: Detect design patterns (Singleton, Factory, Observer, etc.)
        extract_test_examples: Extract usage examples from test files
        build_how_to_guides: Build how-to guides from workflow examples (C3.3)
        extract_config_patterns: Extract configuration patterns from config files (C3.4)
        enhance_with_ai: Enhance patterns and examples with AI analysis (C3.6)
        ai_mode: AI enhancement mode for how-to guides (auto, api, local, none)

    Returns:
        Analysis results dictionary ({"files": [...]})
    """
    # Resolve directory to absolute path to avoid relative_to() errors
    directory = Path(directory).resolve()
    logger.info(f"Analyzing codebase: {directory}")
    logger.info(f"Depth: {depth}")

    # Create output directory
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Load .gitignore
    gitignore_spec = load_gitignore(directory)

    # Walk directory tree
    logger.info("Scanning directory tree...")
    files = walk_directory(directory, patterns=file_patterns, gitignore_spec=gitignore_spec)
    logger.info(f"Found {len(files)} source files")

    # Filter by language if specified
    if languages:
        language_set = set(languages)
        files = [f for f in files if detect_language(f) in language_set]
        logger.info(f"Filtered to {len(files)} files for languages: {', '.join(languages)}")

    # Initialize code analyzer
    analyzer = CodeAnalyzer(depth=depth)

    # Analyze each file; per-file failures are logged and skipped so one
    # unreadable file never aborts the whole run.
    results = {"files": []}
    analyzed_count = 0
    for file_path in files:
        try:
            content = file_path.read_text(encoding="utf-8", errors="ignore")
            language = detect_language(file_path)
            if language == "Unknown":
                continue
            # Analyze file
            analysis = analyzer.analyze_file(str(file_path), content, language)
            # Only include files with actual analysis results
            if analysis and (analysis.get("classes") or analysis.get("functions")):
                results["files"].append(
                    {
                        "file": str(file_path.relative_to(directory)),
                        "language": language,
                        **analysis,
                    }
                )
                analyzed_count += 1
                if analyzed_count % 10 == 0:
                    logger.info(f"Analyzed {analyzed_count}/{len(files)} files...")
        except Exception as e:
            logger.warning(f"Error analyzing {file_path}: {e}")
            continue
    logger.info(f"✅ Successfully analyzed {analyzed_count} files")

    # Save results
    output_json = output_dir / "code_analysis.json"
    with open(output_json, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2)
    logger.info(f"📁 Saved analysis to: {output_json}")

    # Build API reference if requested
    if build_api_reference and results["files"]:
        logger.info("Building API reference documentation...")
        builder = APIReferenceBuilder(results)
        api_output_dir = output_dir / "api_reference"
        generated_files = builder.build_reference(api_output_dir)
        logger.info(f"✅ Generated {len(generated_files)} API reference files")
        logger.info(f"📁 API reference: {api_output_dir}")

    # Build dependency graph if requested (C2.6)
    if build_dependency_graph:
        logger.info("Building dependency graph...")
        dep_analyzer = DependencyAnalyzer()
        # Analyze dependencies for all files
        for file_path in files:
            try:
                content = file_path.read_text(encoding="utf-8", errors="ignore")
                language = detect_language(file_path)
                if language != "Unknown":
                    # Use relative path from directory for better graph readability
                    rel_path = str(file_path.relative_to(directory))
                    dep_analyzer.analyze_file(rel_path, content, language)
            except Exception as e:
                logger.warning(f"Error analyzing dependencies for {file_path}: {e}")
                continue
        # Build the graph (needed before cycle detection; return value unused)
        dep_analyzer.build_graph()
        # Detect circular dependencies
        cycles = dep_analyzer.detect_cycles()
        if cycles:
            logger.warning(f"⚠️ Found {len(cycles)} circular dependencies:")
            for i, cycle in enumerate(cycles[:5], 1):  # Show first 5
                # BUGFIX: the old code joined with "" and mashed all file
                # names together; render the cycle as A → B → A instead.
                cycle_str = " → ".join(cycle) + f" → {cycle[0]}"
                logger.warning(f" {i}. {cycle_str}")
            if len(cycles) > 5:
                logger.warning(f" ... and {len(cycles) - 5} more")
        else:
            logger.info("✅ No circular dependencies found")
        # Save dependency graph data
        dep_output_dir = output_dir / "dependencies"
        dep_output_dir.mkdir(parents=True, exist_ok=True)
        # Export as JSON
        dep_json = dep_output_dir / "dependency_graph.json"
        with open(dep_json, "w", encoding="utf-8") as f:
            json.dump(dep_analyzer.export_json(), f, indent=2)
        logger.info(f"📁 Saved dependency graph: {dep_json}")
        # Export as Mermaid diagram
        mermaid_file = dep_output_dir / "dependency_graph.mmd"
        mermaid_file.write_text(dep_analyzer.export_mermaid())
        logger.info(f"📁 Saved Mermaid diagram: {mermaid_file}")
        # Save statistics
        stats = dep_analyzer.get_statistics()
        stats_file = dep_output_dir / "statistics.json"
        with open(stats_file, "w", encoding="utf-8") as f:
            json.dump(stats, f, indent=2)
        logger.info(
            f"📊 Statistics: {stats['total_files']} files, "
            f"{stats['total_dependencies']} dependencies, "
            f"{stats['circular_dependencies']} cycles"
        )
        # Try to export as DOT (requires pydot)
        try:
            dot_file = dep_output_dir / "dependency_graph.dot"
            dep_analyzer.export_dot(str(dot_file))
        except Exception:
            # BUGFIX: was a bare `except:` which also swallowed
            # SystemExit/KeyboardInterrupt. DOT export is optional
            # (pydot not installed), so only ordinary errors are ignored.
            pass

    # Detect design patterns if requested (C3.1)
    if detect_patterns:
        logger.info("Detecting design patterns...")
        from skill_seekers.cli.pattern_recognizer import PatternRecognizer

        pattern_recognizer = PatternRecognizer(depth=depth, enhance_with_ai=enhance_with_ai)
        pattern_results = []
        for file_path in files:
            try:
                content = file_path.read_text(encoding="utf-8", errors="ignore")
                language = detect_language(file_path)
                if language != "Unknown":
                    report = pattern_recognizer.analyze_file(str(file_path), content, language)
                    if report.patterns:
                        pattern_results.append(report.to_dict())
            except Exception as e:
                logger.warning(f"Pattern detection failed for {file_path}: {e}")
                continue
        # Save pattern results
        if pattern_results:
            pattern_output = output_dir / "patterns"
            pattern_output.mkdir(parents=True, exist_ok=True)
            pattern_json = pattern_output / "detected_patterns.json"
            with open(pattern_json, "w", encoding="utf-8") as f:
                json.dump(pattern_results, f, indent=2)
            total_patterns = sum(len(r["patterns"]) for r in pattern_results)
            logger.info(f"✅ Detected {total_patterns} patterns in {len(pattern_results)} files")
            logger.info(f"📁 Saved to: {pattern_json}")
        else:
            logger.info("No design patterns detected")

    # Extract test examples if requested (C3.2)
    if extract_test_examples:
        logger.info("Extracting usage examples from test files...")
        from skill_seekers.cli.test_example_extractor import TestExampleExtractor

        # Create extractor
        test_extractor = TestExampleExtractor(
            min_confidence=0.5,
            max_per_file=10,
            languages=languages,
            enhance_with_ai=enhance_with_ai,
        )
        # Extract examples from directory
        try:
            example_report = test_extractor.extract_from_directory(directory, recursive=True)
            if example_report.total_examples > 0:
                # Save results
                examples_output = output_dir / "test_examples"
                examples_output.mkdir(parents=True, exist_ok=True)
                # Save as JSON
                examples_json = examples_output / "test_examples.json"
                with open(examples_json, "w", encoding="utf-8") as f:
                    json.dump(example_report.to_dict(), f, indent=2)
                # Save as Markdown
                examples_md = examples_output / "test_examples.md"
                examples_md.write_text(example_report.to_markdown(), encoding="utf-8")
                logger.info(
                    f"✅ Extracted {example_report.total_examples} test examples "
                    f"({example_report.high_value_count} high-value)"
                )
                logger.info(f"📁 Saved to: {examples_output}")
            else:
                logger.info("No test examples extracted")
        except Exception as e:
            logger.warning(f"Test example extraction failed: {e}")
            example_report = None

    # Build how-to guides from workflow examples (C3.3)
    if build_how_to_guides and extract_test_examples:
        logger.info("Building how-to guides from workflow examples...")
        try:
            from skill_seekers.cli.how_to_guide_builder import HowToGuideBuilder

            # Create guide builder
            guide_builder = HowToGuideBuilder(enhance_with_ai=enhance_with_ai)
            # Build guides from workflow examples
            tutorials_dir = output_dir / "tutorials"
            # Get workflow examples from the example_report if available
            if (
                "example_report" in locals()
                and example_report
                and example_report.total_examples > 0
            ):
                # Convert example_report to list of dicts for processing
                examples_list = example_report.to_dict().get("examples", [])
                guide_collection = guide_builder.build_guides_from_examples(
                    examples_list,
                    grouping_strategy="ai-tutorial-group",
                    output_dir=tutorials_dir,
                    enhance_with_ai=enhance_with_ai,
                    ai_mode=ai_mode,
                )
                if guide_collection and guide_collection.total_guides > 0:
                    # Save collection summary
                    collection_json = tutorials_dir / "guide_collection.json"
                    with open(collection_json, "w", encoding="utf-8") as f:
                        json.dump(guide_collection.to_dict(), f, indent=2)
                    logger.info(f"✅ Built {guide_collection.total_guides} how-to guides")
                    logger.info(f"📁 Saved to: {tutorials_dir}")
                else:
                    logger.info("No how-to guides generated (insufficient workflow examples)")
            else:
                logger.info("No workflow examples available for guide generation")
        except Exception as e:
            logger.warning(f"How-to guide building failed: {e}")

    # Extract configuration patterns (C3.4)
    if extract_config_patterns:
        logger.info("Extracting configuration patterns...")
        try:
            config_extractor = ConfigExtractor()
            # Extract config patterns from directory
            extraction_result = config_extractor.extract_from_directory(directory)
            if extraction_result.config_files:
                # Convert to dict for enhancement
                result_dict = config_extractor.to_dict(extraction_result)
                # AI Enhancement (if enabled)
                if enhance_with_ai and ai_mode != "none":
                    try:
                        from skill_seekers.cli.config_enhancer import ConfigEnhancer

                        logger.info(f"🤖 Enhancing config analysis with AI (mode: {ai_mode})...")
                        enhancer = ConfigEnhancer(mode=ai_mode)
                        result_dict = enhancer.enhance_config_result(result_dict)
                        logger.info("✅ AI enhancement complete")
                    except Exception as e:
                        logger.warning(f"⚠️ Config AI enhancement failed: {e}")
                # Save results
                config_output = output_dir / "config_patterns"
                config_output.mkdir(parents=True, exist_ok=True)
                # Save as JSON
                config_json = config_output / "config_patterns.json"
                with open(config_json, "w", encoding="utf-8") as f:
                    json.dump(result_dict, f, indent=2)
                # Save as Markdown (basic - AI enhancements in JSON only for now)
                config_md = config_output / "config_patterns.md"
                config_md.write_text(extraction_result.to_markdown(), encoding="utf-8")
                # Count total settings across all files
                total_settings = sum(len(cf.settings) for cf in extraction_result.config_files)
                total_patterns = sum(len(cf.patterns) for cf in extraction_result.config_files)
                logger.info(
                    f"✅ Extracted {len(extraction_result.config_files)} config files "
                    f"with {total_settings} settings and {total_patterns} detected patterns"
                )
                if "ai_enhancements" in result_dict:
                    insights = result_dict["ai_enhancements"].get("overall_insights", {})
                    if insights.get("security_issues_found"):
                        logger.info(
                            f"🔐 Security issues found: {insights['security_issues_found']}"
                        )
                logger.info(f"📁 Saved to: {config_output}")
            else:
                logger.info("No configuration files found")
        except Exception as e:
            logger.warning(f"Config pattern extraction failed: {e}")

    # Detect architectural patterns (C3.7)
    # Always run this - it provides high-level overview
    logger.info("Analyzing architectural patterns...")
    from skill_seekers.cli.architectural_pattern_detector import ArchitecturalPatternDetector

    arch_detector = ArchitecturalPatternDetector(enhance_with_ai=enhance_with_ai)
    arch_report = arch_detector.analyze(directory, results["files"])
    if arch_report.patterns:
        arch_output = output_dir / "architecture"
        arch_output.mkdir(parents=True, exist_ok=True)
        # Save as JSON
        arch_json = arch_output / "architectural_patterns.json"
        with open(arch_json, "w", encoding="utf-8") as f:
            json.dump(arch_report.to_dict(), f, indent=2)
        logger.info(f"🏗️ Detected {len(arch_report.patterns)} architectural patterns")
        for pattern in arch_report.patterns:
            logger.info(f" - {pattern.pattern_name} (confidence: {pattern.confidence:.2f})")
        logger.info(f"📁 Saved to: {arch_json}")
    else:
        logger.info("No clear architectural patterns detected")

    # Generate SKILL.md and references/ directory
    logger.info("Generating SKILL.md and references...")
    _generate_skill_md(
        output_dir=output_dir,
        directory=directory,
        results=results,
        depth=depth,
        build_api_reference=build_api_reference,
        build_dependency_graph=build_dependency_graph,
        detect_patterns=detect_patterns,
        extract_test_examples=extract_test_examples,
        extract_config_patterns=extract_config_patterns,
    )
    return results
def _generate_skill_md(
    output_dir: Path,
    directory: Path,
    results: dict[str, Any],
    depth: str,
    build_api_reference: bool,
    build_dependency_graph: bool,
    detect_patterns: bool,
    extract_test_examples: bool,
    extract_config_patterns: bool,
):
    """
    Generate rich SKILL.md from codebase analysis results.

    Creates a 300+ line skill file with:
    - Front matter (name, description)
    - Repository info (path, languages, file count)
    - When to Use section
    - Quick Reference (patterns, languages, stats)
    - Code Examples (from test files)
    - API Reference (from code analysis)
    - Architecture Overview
    - Configuration Patterns
    - Available References

    Args:
        output_dir: Output directory; SKILL.md is written here and the
            section helpers read the JSON artifacts previously saved
            under it (patterns/, test_examples/, architecture/, ...).
        directory: Analyzed repository root (its name becomes the skill name).
        results: Analysis results dict with a "files" list.
        depth: Analysis depth string, echoed into the header.
        build_api_reference / build_dependency_graph / detect_patterns /
        extract_test_examples / extract_config_patterns: Which analysis
            features ran; controls which sections/references are listed.
    """
    repo_name = directory.name
    # Generate skill name (lowercase, hyphens only, max 64 chars)
    # NOTE(review): only underscores/spaces are replaced; other characters
    # (dots, unicode) pass through — confirm downstream naming rules.
    skill_name = repo_name.lower().replace("_", "-").replace(" ", "-")[:64]
    # Generate description
    description = f"Local codebase analysis for {repo_name}"
    # Count files by language
    language_stats = _get_language_stats(results.get("files", []))
    total_files = len(results.get("files", []))
    # Start building content (YAML front matter + fixed header sections)
    skill_content = f"""---
name: {skill_name}
description: {description}
---
# {repo_name} Codebase
## Description
Local codebase analysis and documentation generated from code analysis.
**Path:** `{directory}`
**Files Analyzed:** {total_files}
**Languages:** {", ".join(language_stats.keys())}
**Analysis Depth:** {depth}
## When to Use This Skill
Use this skill when you need to:
- Understand the codebase architecture and design patterns
- Find implementation examples and usage patterns
- Review API documentation extracted from code
- Check configuration patterns and best practices
- Explore test examples and real-world usage
- Navigate the codebase structure efficiently
## ⚡ Quick Reference
### Codebase Statistics
"""
    # Language breakdown, largest language first
    skill_content += "**Languages:**\n"
    for lang, count in sorted(language_stats.items(), key=lambda x: x[1], reverse=True):
        percentage = (count / total_files * 100) if total_files > 0 else 0
        skill_content += f"- **{lang}**: {count} files ({percentage:.1f}%)\n"
    skill_content += "\n"
    # Analysis features performed
    skill_content += "**Analysis Performed:**\n"
    if build_api_reference:
        skill_content += "- ✅ API Reference (C2.5)\n"
    if build_dependency_graph:
        skill_content += "- ✅ Dependency Graph (C2.6)\n"
    if detect_patterns:
        skill_content += "- ✅ Design Patterns (C3.1)\n"
    if extract_test_examples:
        skill_content += "- ✅ Test Examples (C3.2)\n"
    if extract_config_patterns:
        skill_content += "- ✅ Configuration Patterns (C3.4)\n"
    skill_content += "- ✅ Architectural Analysis (C3.7)\n\n"
    # Add design patterns if available (reads patterns/detected_patterns.json)
    if detect_patterns:
        patterns_content = _format_patterns_section(output_dir)
        if patterns_content:
            skill_content += patterns_content
    # Add code examples if available (reads test_examples/test_examples.json)
    if extract_test_examples:
        examples_content = _format_examples_section(output_dir)
        if examples_content:
            skill_content += examples_content
    # Add API reference if available (reads api_reference/api_reference.md)
    if build_api_reference:
        api_content = _format_api_section(output_dir)
        if api_content:
            skill_content += api_content
    # Add architecture if available (always attempted — C3.7 always runs)
    arch_content = _format_architecture_section(output_dir)
    if arch_content:
        skill_content += arch_content
    # Add configuration patterns if available
    if extract_config_patterns:
        config_content = _format_config_section(output_dir)
        if config_content:
            skill_content += config_content
    # Available references: list only the output directories that exist
    skill_content += "## 📚 Available References\n\n"
    skill_content += "This skill includes detailed reference documentation:\n\n"
    refs_added = False
    if build_api_reference and (output_dir / "api_reference").exists():
        skill_content += (
            "- **API Reference**: `references/api_reference/` - Complete API documentation\n"
        )
        refs_added = True
    if build_dependency_graph and (output_dir / "dependencies").exists():
        skill_content += (
            "- **Dependencies**: `references/dependencies/` - Dependency graph and analysis\n"
        )
        refs_added = True
    if detect_patterns and (output_dir / "patterns").exists():
        skill_content += "- **Patterns**: `references/patterns/` - Detected design patterns\n"
        refs_added = True
    if extract_test_examples and (output_dir / "test_examples").exists():
        skill_content += "- **Examples**: `references/test_examples/` - Usage examples from tests\n"
        refs_added = True
    if extract_config_patterns and (output_dir / "config_patterns").exists():
        skill_content += (
            "- **Configuration**: `references/config_patterns/` - Configuration patterns\n"
        )
        refs_added = True
    if (output_dir / "architecture").exists():
        skill_content += "- **Architecture**: `references/architecture/` - Architectural patterns\n"
        refs_added = True
    if not refs_added:
        skill_content += "No additional references generated (analysis features disabled).\n"
    skill_content += "\n"
    # Footer
    skill_content += "---\n\n"
    skill_content += "**Generated by Skill Seeker** | Codebase Analyzer with C3.x Analysis\n"
    # Write SKILL.md
    skill_path = output_dir / "SKILL.md"
    skill_path.write_text(skill_content, encoding="utf-8")
    line_count = len(skill_content.split("\n"))
    logger.info(f"✅ Generated SKILL.md: {skill_path} ({line_count} lines)")
    # Generate references/ directory structure
    _generate_references(output_dir)
def _get_language_stats(files: list[dict]) -> dict[str, int]:
"""Count files by language from analysis results."""
stats = {}
for file_data in files:
# files is a list of dicts with 'language' key
lang = file_data.get("language", "Unknown")
if lang != "Unknown":
stats[lang] = stats.get(lang, 0) + 1
return stats
def _format_patterns_section(output_dir: Path) -> str:
"""Format design patterns section from patterns/detected_patterns.json."""
patterns_file = output_dir / "patterns" / "detected_patterns.json"
if not patterns_file.exists():
return ""
try:
with open(patterns_file, encoding="utf-8") as f:
patterns_data = json.load(f)
except Exception:
return ""
if not patterns_data:
return ""
# Count patterns by type (deduplicate by class, keep highest confidence)
pattern_counts = {}
by_class = {}
for pattern_file in patterns_data:
for pattern in pattern_file.get("patterns", []):
ptype = pattern.get("pattern_type", "Unknown")
cls = pattern.get("class_name", "")
confidence = pattern.get("confidence", 0)
# Skip low confidence
if confidence < 0.7:
continue
# Deduplicate by class
key = f"{cls}:{ptype}"
if key not in by_class or by_class[key]["confidence"] < confidence:
by_class[key] = pattern
# Count by type
pattern_counts[ptype] = pattern_counts.get(ptype, 0) + 1
if not pattern_counts:
return ""
content = "### 🎨 Design Patterns Detected\n\n"
content += "*From C3.1 codebase analysis (confidence > 0.7)*\n\n"
# Top 5 pattern types
for ptype, count in sorted(pattern_counts.items(), key=lambda x: x[1], reverse=True)[:5]:
content += f"- **{ptype}**: {count} instances\n"
content += f"\n*Total: {len(by_class)} high-confidence patterns*\n\n"
content += "*See `references/patterns/` for complete pattern analysis*\n\n"
return content
def _format_examples_section(output_dir: Path) -> str:
"""Format code examples section from test_examples/test_examples.json."""
examples_file = output_dir / "test_examples" / "test_examples.json"
if not examples_file.exists():
return ""
try:
with open(examples_file, encoding="utf-8") as f:
examples_data = json.load(f)
except Exception:
return ""
examples = examples_data.get("examples", [])
if not examples:
return ""
# Filter high-value examples (complexity > 0.7)
high_value = [ex for ex in examples if ex.get("complexity_score", 0) > 0.7]
if not high_value:
# If no high complexity, take any examples
high_value = examples[:10]
if not high_value:
return ""
content = "## 📝 Code Examples\n\n"
content += "*High-quality examples extracted from test files (C3.2)*\n\n"
# Top 10 examples
for ex in sorted(high_value, key=lambda x: x.get("complexity_score", 0), reverse=True)[:10]:
desc = ex.get("description", "Example")
lang = ex.get("language", "python").lower()
code = ex.get("code", "")
complexity = ex.get("complexity_score", 0)
content += f"**{desc}** (complexity: {complexity:.2f})\n\n"
content += f"```{lang}\n{code}\n```\n\n"
content += "*See `references/test_examples/` for all extracted examples*\n\n"
return content
def _format_api_section(output_dir: Path) -> str:
"""Format API reference section."""
api_dir = output_dir / "api_reference"
if not api_dir.exists():
return ""
api_md = api_dir / "api_reference.md"
if not api_md.exists():
return ""
try:
api_content = api_md.read_text(encoding="utf-8")
except Exception:
return ""
# Extract first section (up to 500 chars)
preview = api_content[:500]
if len(api_content) > 500:
preview += "..."
content = "## 🔧 API Reference\n\n"
content += "*Extracted from codebase analysis (C2.5)*\n\n"
content += preview + "\n\n"
content += "*See `references/api_reference/` for complete API documentation*\n\n"
return content
def _format_architecture_section(output_dir: Path) -> str:
"""Format architecture section from architecture/architectural_patterns.json."""
arch_file = output_dir / "architecture" / "architectural_patterns.json"
if not arch_file.exists():
return ""
try:
with open(arch_file, encoding="utf-8") as f:
arch_data = json.load(f)
except Exception:
return ""
patterns = arch_data.get("patterns", [])
if not patterns:
return ""
content = "## 🏗️ Architecture Overview\n\n"
content += "*From C3.7 architectural analysis*\n\n"
content += "**Detected Architectural Patterns:**\n\n"
for pattern in patterns[:5]:
name = pattern.get("pattern_name", "Unknown")
confidence = pattern.get("confidence", 0)
indicators = pattern.get("indicators", [])
content += f"- **{name}** (confidence: {confidence:.2f})\n"
if indicators:
content += f" - Indicators: {', '.join(indicators[:3])}\n"
content += f"\n*Total: {len(patterns)} architectural patterns detected*\n\n"
content += "*See `references/architecture/` for complete architectural analysis*\n\n"
return content
def _format_config_section(output_dir: Path) -> str:
"""Format configuration patterns section."""
config_file = output_dir / "config_patterns" / "config_patterns.json"
if not config_file.exists():
return ""
try:
with open(config_file, encoding="utf-8") as f:
config_data = json.load(f)
except Exception:
return ""
config_files = config_data.get("config_files", [])
if not config_files:
return ""
total_settings = sum(len(cf.get("settings", [])) for cf in config_files)
total_patterns = sum(len(cf.get("patterns", [])) for cf in config_files)
content = "## ⚙️ Configuration Patterns\n\n"
content += "*From C3.4 configuration analysis*\n\n"
content += f"**Configuration Files Analyzed:** {len(config_files)}\n"
content += f"**Total Settings:** {total_settings}\n"
content += f"**Patterns Detected:** {total_patterns}\n\n"
# List config file types found
file_types = {}
for cf in config_files:
ctype = cf.get("config_type", "unknown")
file_types[ctype] = file_types.get(ctype, 0) + 1
if file_types:
content += "**Configuration Types:**\n"
for ctype, count in sorted(file_types.items(), key=lambda x: x[1], reverse=True):
content += f"- {ctype}: {count} files\n"
content += "\n"
content += "*See `references/config_patterns/` for detailed configuration analysis*\n\n"
return content
def _generate_references(output_dir: Path):
    """
    Generate references/ directory structure from analysis output.

    Copies (not symlinks, for portability) each existing analysis output
    directory into a clean references/ directory, replacing any stale
    previous copy. The redundant duplicated `import shutil` statements
    inside the loop were removed; shutil is now imported at module level.

    Args:
        output_dir: Root analysis output directory.
    """
    references_dir = output_dir / "references"
    references_dir.mkdir(exist_ok=True)
    # Map analysis directories to reference names
    mappings = {
        "api_reference": "api_reference",
        "dependencies": "dependencies",
        "patterns": "patterns",
        "test_examples": "test_examples",
        "tutorials": "tutorials",
        "config_patterns": "config_patterns",
        "architecture": "architecture",
    }
    for source, target in mappings.items():
        source_dir = output_dir / source
        target_dir = references_dir / target
        if not (source_dir.exists() and source_dir.is_dir()):
            continue
        # Replace any stale copy so references/ stays in sync with output.
        if target_dir.exists():
            shutil.rmtree(target_dir)
        shutil.copytree(source_dir, target_dir)
        logger.debug(f"Copied {source} → references/{target}")
    logger.info(f"✅ Generated references directory: {references_dir}")
def main():
    """Command-line interface for codebase analysis.

    Parses CLI flags, validates the target directory, runs
    analyze_codebase(), and prints a short summary.

    Returns:
        Process exit code: 0 on success, 1 on invalid input or failure,
        130 on keyboard interrupt.
    """
    parser = argparse.ArgumentParser(
        description="Analyze local codebases and extract code knowledge",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
# Analyze current directory
codebase-scraper --directory . --output output/codebase/
# Deep analysis with API reference and dependency graph
codebase-scraper --directory /path/to/repo --depth deep --build-api-reference --build-dependency-graph
# Analyze only Python and JavaScript
codebase-scraper --directory . --languages Python,JavaScript
# Use file patterns
codebase-scraper --directory . --file-patterns "*.py,src/**/*.js"
# Full analysis with all features (default)
codebase-scraper --directory . --depth deep
# Surface analysis (fast, skip all analysis features)
codebase-scraper --directory . --depth surface --skip-api-reference --skip-dependency-graph --skip-patterns --skip-test-examples
# Skip specific features
codebase-scraper --directory . --skip-patterns --skip-test-examples
""",
    )
    parser.add_argument("--directory", required=True, help="Directory to analyze")
    parser.add_argument(
        "--output", default="output/codebase/", help="Output directory (default: output/codebase/)"
    )
    parser.add_argument(
        "--depth",
        choices=["surface", "deep", "full"],
        default="deep",
        help="Analysis depth (default: deep)",
    )
    parser.add_argument(
        "--languages", help="Comma-separated languages to analyze (e.g., Python,JavaScript,C++)"
    )
    parser.add_argument(
        "--file-patterns", help="Comma-separated file patterns (e.g., *.py,src/**/*.js)"
    )
    # All analysis features default to ON; each --skip-* flag turns one off.
    parser.add_argument(
        "--skip-api-reference",
        action="store_true",
        default=False,
        help="Skip API reference markdown documentation generation (default: enabled)",
    )
    parser.add_argument(
        "--skip-dependency-graph",
        action="store_true",
        default=False,
        help="Skip dependency graph and circular dependency detection (default: enabled)",
    )
    parser.add_argument(
        "--skip-patterns",
        action="store_true",
        default=False,
        help="Skip design pattern detection (Singleton, Factory, Observer, etc.) (default: enabled)",
    )
    parser.add_argument(
        "--skip-test-examples",
        action="store_true",
        default=False,
        help="Skip test example extraction (instantiation, method calls, configs, etc.) (default: enabled)",
    )
    parser.add_argument(
        "--skip-how-to-guides",
        action="store_true",
        default=False,
        help="Skip how-to guide generation from workflow examples (default: enabled)",
    )
    parser.add_argument(
        "--skip-config-patterns",
        action="store_true",
        default=False,
        help="Skip configuration pattern extraction from config files (JSON, YAML, TOML, ENV, etc.) (default: enabled)",
    )
    parser.add_argument(
        "--ai-mode",
        choices=["auto", "api", "local", "none"],
        default="auto",
        help="AI enhancement mode for how-to guides: auto (detect best), api (Claude API), local (Claude Code CLI), none (disable) (default: auto)",
    )
    parser.add_argument("--no-comments", action="store_true", help="Skip comment extraction")
    parser.add_argument("--verbose", action="store_true", help="Enable verbose logging")
    # Check for deprecated flags
    # NOTE(review): scans sys.argv directly because these flags are no longer
    # registered with argparse; parse_args() below will still reject them as
    # unrecognized after the warning prints — confirm that is the intent.
    deprecated_flags = {
        "--build-api-reference": "--skip-api-reference",
        "--build-dependency-graph": "--skip-dependency-graph",
        "--detect-patterns": "--skip-patterns",
        "--extract-test-examples": "--skip-test-examples",
        "--build-how-to-guides": "--skip-how-to-guides",
        "--extract-config-patterns": "--skip-config-patterns",
    }
    for old_flag, new_flag in deprecated_flags.items():
        if old_flag in sys.argv:
            logger.warning(
                f"⚠️ DEPRECATED: {old_flag} is deprecated. "
                f"All features are now enabled by default. "
                f"Use {new_flag} to disable this feature."
            )
    args = parser.parse_args()
    # Set logging level
    if args.verbose:
        logging.getLogger().setLevel(logging.DEBUG)
    # Validate directory
    directory = Path(args.directory)
    if not directory.exists():
        logger.error(f"Directory not found: {directory}")
        return 1
    if not directory.is_dir():
        logger.error(f"Not a directory: {directory}")
        return 1
    # Parse languages (comma-separated CLI value → list)
    languages = None
    if args.languages:
        languages = [lang.strip() for lang in args.languages.split(",")]
    # Parse file patterns (comma-separated CLI value → list)
    file_patterns = None
    if args.file_patterns:
        file_patterns = [p.strip() for p in args.file_patterns.split(",")]
    # Analyze codebase; each --skip-* flag is inverted into an enable flag.
    try:
        results = analyze_codebase(
            directory=directory,
            output_dir=Path(args.output),
            depth=args.depth,
            languages=languages,
            file_patterns=file_patterns,
            build_api_reference=not args.skip_api_reference,
            extract_comments=not args.no_comments,
            build_dependency_graph=not args.skip_dependency_graph,
            detect_patterns=not args.skip_patterns,
            extract_test_examples=not args.skip_test_examples,
            build_how_to_guides=not args.skip_how_to_guides,
            extract_config_patterns=not args.skip_config_patterns,
            enhance_with_ai=True,  # Auto-disables if no API key present
            ai_mode=args.ai_mode,  # NEW: AI enhancement mode for how-to guides
        )
        # Print summary
        print(f"\n{'=' * 60}")
        print("CODEBASE ANALYSIS COMPLETE")
        print(f"{'=' * 60}")
        print(f"Files analyzed: {len(results['files'])}")
        print(f"Output directory: {args.output}")
        if not args.skip_api_reference:
            print(f"API reference: {Path(args.output) / 'api_reference'}")
        print(f"{'=' * 60}\n")
        return 0
    except KeyboardInterrupt:
        # Conventional 130 = terminated by SIGINT
        logger.error("\nAnalysis interrupted by user")
        return 130
    except Exception as e:
        logger.error(f"Analysis failed: {e}")
        import traceback

        traceback.print_exc()
        return 1
# Script entry point: propagate main()'s return value as the process exit code.
if __name__ == "__main__":
    sys.exit(main())