Files
skill-seekers-reference/src/skill_seekers/cli/codebase_scraper.py
yusyus f214976ccd fix: apply review fixes from PR #309 and stabilize flaky benchmark test
Follow-up to PR #309 (perf: optimize with caching, pre-compiled regex,
O(1) lookups, and bisect line indexing). These fixes were committed to
the PR branch but missed the squash merge.

Review fixes (credit: PR #309 by copperlang2007):
1. Rename _pending_set -> _enqueued_urls to accurately reflect that the
   set tracks all ever-enqueued URLs, not just currently pending ones
2. Extract duplicated _build_line_index()/_offset_to_line() into shared
   build_line_index()/offset_to_line() in cli/utils.py (DRY)
3. Fix pre-existing bug: infer_categories() guard checked 'tutorial'
   but wrote to 'tutorials' key, risking silent overwrites
4. Remove unnecessary _store_results() closure in scrape_page()
5. Simplify parser pre-import in codebase_scraper.py

Benchmark stabilization:
- test_benchmark_metadata_overhead was flaky on CI (106.7% overhead
  observed, threshold 50%) because 5 iterations with mean averaging
  can't reliably measure microsecond-level differences
- Fix: 20 iterations, warm-up run, median instead of mean, threshold
  raised to 200% (guards catastrophic regression, not noise)

Ref: https://github.com/yusufkaraaslan/Skill_Seekers/pull/309
2026-03-14 23:39:23 +03:00

2497 lines
89 KiB
Python

#!/usr/bin/env python3
"""
Codebase Scraper CLI Tool
Standalone tool for analyzing local codebases without GitHub API.
Extracts code signatures, comments, and optionally generates API documentation.
Usage:
codebase-scraper --directory /path/to/repo --output output/codebase/
codebase-scraper --directory . --depth deep --languages Python,JavaScript
codebase-scraper --directory /path/to/repo --build-api-reference
Features:
- File tree walking with .gitignore support
- Multi-language code analysis (9 languages: Python, JavaScript/TypeScript, C/C++, C#, Go, Rust, Java, Ruby, PHP)
- API reference generation
- Comment extraction
- Dependency graph analysis
- Configurable depth levels
Credits:
- Language parsing patterns inspired by official language specifications
- NetworkX for dependency graph analysis: https://networkx.org/
- pathspec for .gitignore support: https://pypi.org/project/pathspec/
"""
import argparse
import json
import logging
import os
import re
import sys
from pathlib import Path
from typing import Any
from skill_seekers.cli.api_reference_builder import APIReferenceBuilder
from skill_seekers.cli.code_analyzer import CodeAnalyzer
from skill_seekers.cli.config_extractor import ConfigExtractor
from skill_seekers.cli.dependency_analyzer import DependencyAnalyzer
from skill_seekers.cli.signal_flow_analyzer import SignalFlowAnalyzer
from skill_seekers.cli.utils import setup_logging
# Try to import pathspec for .gitignore support.  The scraper degrades
# gracefully when it is missing: load_gitignore() checks this flag and
# disables .gitignore filtering instead of failing.
try:
    import pathspec

    PATHSPEC_AVAILABLE = True
except ImportError:
    PATHSPEC_AVAILABLE = False

# Module-level logger, configured by setup_logging() in cli/utils.py.
logger = logging.getLogger(__name__)
# Language extension mapping: lowercase file suffix -> language name used
# throughout the analysis pipeline.
LANGUAGE_EXTENSIONS = {
    ".py": "Python",
    ".js": "JavaScript",
    ".jsx": "JavaScript",
    ".ts": "TypeScript",
    ".tsx": "TypeScript",
    ".cpp": "C++",
    ".cc": "C++",
    ".cxx": "C++",
    ".h": "C++",  # plain C headers are also classified as C++ here
    ".hpp": "C++",
    ".hxx": "C++",
    ".c": "C",
    ".cs": "C#",
    ".gd": "GDScript",  # Godot scripting language
    ".tscn": "GodotScene",  # Godot scene files
    ".tres": "GodotResource",  # Godot resource files
    ".gdshader": "GodotShader",  # Godot shader files
    ".go": "Go",
    ".rs": "Rust",
    ".java": "Java",
    ".rb": "Ruby",
    ".php": "PHP",
}

# Documentation file extensions
MARKDOWN_EXTENSIONS = {".md", ".markdown", ".mdown", ".mkd"}
RST_EXTENSIONS = {".rst", ".rest"}  # ReStructuredText (Sphinx, Godot docs, etc.)
DOC_EXTENSIONS = MARKDOWN_EXTENSIONS | RST_EXTENSIONS  # All supported doc formats

# Common documentation folders to scan
DOC_FOLDERS = {"docs", "doc", "documentation", "wiki", ".github"}

# Root-level doc files → category mapping (matched by substring against the
# normalized file stem in categorize_markdown_file()).
ROOT_DOC_CATEGORIES = {
    "readme": "overview",
    "contributing": "contributing",
    "changelog": "changelog",
    "history": "changelog",
    "license": "license",
    "authors": "authors",
    "code_of_conduct": "community",
    "security": "security",
    "architecture": "architecture",
    "design": "architecture",
}

# Folder name → category mapping (matched by substring against each
# normalized path component in categorize_markdown_file()).
FOLDER_CATEGORIES = {
    "architecture": "architecture",
    "arch": "architecture",
    "design": "architecture",
    "guides": "guides",
    "guide": "guides",
    "tutorials": "guides",
    "tutorial": "guides",
    "howto": "guides",
    "how-to": "guides",
    "workflows": "workflows",
    "workflow": "workflows",
    "templates": "templates",
    "template": "templates",
    "api": "api",
    "reference": "api",
    "examples": "examples",
    "example": "examples",
    "specs": "specifications",
    "spec": "specifications",
    "rfcs": "specifications",
    "rfc": "specifications",
    "features": "features",
    "feature": "features",
}

# Default directories to exclude
DEFAULT_EXCLUDED_DIRS = {
    # Python/Node
    "node_modules",
    "venv",
    "__pycache__",
    ".git",
    ".svn",
    ".hg",
    "build",
    "dist",
    "target",
    ".pytest_cache",
    ".tox",
    ".mypy_cache",
    "htmlcov",
    "coverage",
    ".coverage",
    ".eggs",
    # NOTE(review): glob-style entry — only effective if the exclusion
    # check supports patterns; a plain name-membership test never matches it.
    "*.egg-info",
    # IDE
    ".idea",
    ".vscode",
    ".vs",
    "__pypackages__",
    # Unity (critical - contains massive build cache)
    "Library",
    "Temp",
    "Logs",
    "UserSettings",
    "MemoryCaptures",
    "Recordings",
    # Unreal Engine
    "Intermediate",
    "Saved",
    "DerivedDataCache",
    # Godot
    ".godot",
    ".import",
    # Misc
    "tmp",
    ".tmp",
}
def detect_language(file_path: Path) -> str:
    """
    Map a file's extension to its programming language.

    Args:
        file_path: Path to source file

    Returns:
        Language name from LANGUAGE_EXTENSIONS, or 'Unknown'
    """
    return LANGUAGE_EXTENSIONS.get(file_path.suffix.lower(), "Unknown")
def load_gitignore(directory: Path) -> "pathspec.PathSpec | None":
    """
    Load .gitignore file and create pathspec matcher.

    The return annotation is quoted (lazy) on purpose: annotations are
    evaluated eagerly at definition time, so an unquoted
    ``pathspec.PathSpec`` would raise NameError at import time whenever the
    optional pathspec dependency is missing — defeating the
    PATHSPEC_AVAILABLE fallback below.

    Args:
        directory: Root directory to search for .gitignore

    Returns:
        PathSpec object if .gitignore found and parsed, None otherwise
    """
    if not PATHSPEC_AVAILABLE:
        logger.warning("pathspec not installed - .gitignore support disabled")
        logger.warning("Install with: pip install pathspec")
        return None

    gitignore_path = directory / ".gitignore"
    if not gitignore_path.exists():
        logger.debug(f"No .gitignore found in {directory}")
        return None

    try:
        with open(gitignore_path, encoding="utf-8") as f:
            spec = pathspec.PathSpec.from_lines("gitwildmatch", f)
        logger.info(f"Loaded .gitignore from {gitignore_path}")
        return spec
    except Exception as e:
        # Best-effort: a malformed .gitignore disables filtering, not the run.
        logger.warning(f"Failed to load .gitignore: {e}")
        return None
def should_exclude_dir(dir_name: str, excluded_dirs: set) -> bool:
    """
    Check if directory should be excluded from analysis.

    Entries in ``excluded_dirs`` may be literal names (e.g. ".git") or
    fnmatch-style glob patterns (e.g. "*.egg-info").  The previous
    exact-membership check silently ignored glob entries, so the
    "*.egg-info" entry in DEFAULT_EXCLUDED_DIRS never matched anything.

    Args:
        dir_name: Directory name (a single path component, not a path)
        excluded_dirs: Set of directory names and/or glob patterns

    Returns:
        True if directory should be excluded
    """
    import fnmatch

    if dir_name in excluded_dirs:  # fast path for literal names
        return True
    # Case-sensitive glob match so behavior is identical across platforms.
    return any(fnmatch.fnmatchcase(dir_name, pattern) for pattern in excluded_dirs)
def walk_directory(
    root: Path,
    patterns: list[str] | None = None,
    gitignore_spec: "pathspec.PathSpec | None" = None,
    excluded_dirs: set | None = None,
) -> list[Path]:
    """
    Walk directory tree and collect source files.

    The pathspec annotation is quoted (lazy) so the module still imports
    when the optional pathspec dependency is missing; an unquoted
    annotation is evaluated at definition time and would raise NameError.

    Args:
        root: Root directory to walk
        patterns: Optional file patterns to include (e.g., ['*.py', '*.js'])
        gitignore_spec: Optional PathSpec object for .gitignore rules
        excluded_dirs: Set of directory names to exclude

    Returns:
        Sorted list of source file paths
    """
    if excluded_dirs is None:
        excluded_dirs = DEFAULT_EXCLUDED_DIRS
    files = []
    root = Path(root).resolve()
    for dirpath, dirnames, filenames in os.walk(root):
        current_dir = Path(dirpath)
        # Filter out excluded directories (in-place so os.walk prunes them)
        dirnames[:] = [d for d in dirnames if not should_exclude_dir(d, excluded_dirs)]
        for filename in filenames:
            file_path = current_dir / filename
            # Check .gitignore rules
            if gitignore_spec:
                try:
                    rel_path = file_path.relative_to(root)
                    if gitignore_spec.match_file(str(rel_path)):
                        logger.debug(f"Skipping (gitignore): {rel_path}")
                        continue
                except ValueError:
                    # File is outside root, skip it
                    continue
            # Only keep extensions we know how to analyze
            if file_path.suffix.lower() not in LANGUAGE_EXTENSIONS:
                continue
            # Check file patterns if provided
            if patterns and not any(file_path.match(pattern) for pattern in patterns):
                continue
            files.append(file_path)
    return sorted(files)
def walk_markdown_files(
    root: Path,
    gitignore_spec: "pathspec.PathSpec | None" = None,
    excluded_dirs: set | None = None,
) -> list[Path]:
    """
    Walk directory tree and collect documentation files (markdown + RST).

    The pathspec annotation is quoted (lazy) so the module still imports
    when the optional pathspec dependency is missing; an unquoted
    annotation is evaluated at definition time and would raise NameError.

    Args:
        root: Root directory to walk
        gitignore_spec: Optional PathSpec object for .gitignore rules
        excluded_dirs: Set of directory names to exclude

    Returns:
        Sorted list of documentation file paths
    """
    if excluded_dirs is None:
        excluded_dirs = DEFAULT_EXCLUDED_DIRS
    files = []
    root = Path(root).resolve()
    for dirpath, dirnames, filenames in os.walk(root):
        current_dir = Path(dirpath)
        # Filter out excluded directories (in-place so os.walk prunes them)
        dirnames[:] = [d for d in dirnames if not should_exclude_dir(d, excluded_dirs)]
        for filename in filenames:
            file_path = current_dir / filename
            # Check .gitignore rules
            if gitignore_spec:
                try:
                    rel_path = file_path.relative_to(root)
                    if gitignore_spec.match_file(str(rel_path)):
                        logger.debug(f"Skipping (gitignore): {rel_path}")
                        continue
                except ValueError:
                    continue
            # Check if documentation file (markdown or RST)
            if file_path.suffix.lower() not in DOC_EXTENSIONS:
                continue
            files.append(file_path)
    return sorted(files)
def categorize_markdown_file(file_path: Path, root: Path) -> str:
    """
    Categorize a markdown file based on its location and filename.

    Root-level files are matched by filename against ROOT_DOC_CATEGORIES;
    nested files are matched by their parent folder names against
    FOLDER_CATEGORIES.

    Args:
        file_path: Path to the markdown file
        root: Root directory of the project

    Returns:
        Category name (e.g., 'overview', 'guides', 'architecture')
    """
    try:
        rel = file_path.relative_to(root)
    except ValueError:
        return "other"

    def _norm(name: str) -> str:
        # Normalize separators so "How-To" and "how to" both match "how_to".
        return name.lower().replace("-", "_").replace(" ", "_")

    # Root-level files: categorize by filename substring.
    if len(rel.parts) == 1:
        stem = _norm(file_path.stem)
        for key, category in ROOT_DOC_CATEGORIES.items():
            if key in stem:
                return category
        return "overview"  # Default for root .md files

    # Nested files: categorize by any ancestor folder name (filename excluded).
    for folder in rel.parts[:-1]:
        folder_norm = _norm(folder)
        for key, category in FOLDER_CATEGORIES.items():
            if key in folder_norm:
                return category

    return "other"
def extract_markdown_structure(content: str) -> dict[str, Any]:
    """
    Extract structure from markdown content (headers, code blocks, links).

    Args:
        content: Markdown file content

    Returns:
        Dictionary with title, headers, code_blocks, links, word_count
        and line_count
    """
    lines = content.split("\n")

    # ATX headers (#, ##, ... up to ######); first h1 becomes the title.
    headers: list[dict[str, Any]] = []
    title = None
    for lineno, raw in enumerate(lines, start=1):
        m = re.match(r"^(#{1,6})\s+(.+)$", raw)
        if not m:
            continue
        depth = len(m.group(1))
        heading = m.group(2).strip()
        headers.append({"level": depth, "text": heading, "line": lineno})
        if depth == 1 and title is None:
            title = heading

    # Fenced code blocks (``` ... ```); empty blocks are dropped.
    code_blocks: list[dict[str, Any]] = []
    for m in re.finditer(r"```(\w*)\n(.*?)```", content, re.DOTALL):
        snippet = m.group(2).strip()
        if snippet:
            code_blocks.append(
                {
                    "language": m.group(1) or "text",
                    "code": snippet,  # Full code - no truncation
                    "full_length": len(snippet),
                }
            )

    # Inline links: [text](url)
    links = [
        {"text": m.group(1), "url": m.group(2)}
        for m in re.finditer(r"\[([^\]]+)\]\(([^)]+)\)", content)
    ]

    return {
        "title": title,
        "headers": headers,
        "code_blocks": code_blocks,
        "links": links,
        "word_count": len(content.split()),
        "line_count": len(lines),
    }
def extract_rst_structure(content: str) -> dict[str, Any]:
    """
    Extract structure from ReStructuredText (RST) content.
    Uses the enhanced unified RST parser for comprehensive extraction.
    RST uses underline-style headers:
    Title
    =====
    Section
    -------
    Subsection
    ~~~~~~~~~~
    Args:
        content: RST file content
    Returns:
        Dictionary with extracted structure including:
        - title: Document title
        - headers: List of headers with levels
        - code_blocks: Code blocks with language and content
        - tables: Tables with rows and headers
        - links: External links
        - cross_references: Internal cross-references
        - word_count: Total word count
        - line_count: Total line count
    """
    # Use the enhanced unified RST parser.  A failed import is also caught
    # here (ImportError subclasses Exception), which sends us to the
    # legacy fallback below.
    try:
        from skill_seekers.cli.parsers.extractors import RstParser

        parser = RstParser()
        result = parser.parse_string(content, "<string>")
        if result.success and result.document:
            doc = result.document
            # Convert to legacy structure format for backward compatibility
            structure = {
                "title": doc.title,
                "headers": [
                    {"level": h.level, "text": h.text, "line": h.source_line} for h in doc.headings
                ],
                "code_blocks": [
                    {
                        "language": cb.language or "text",
                        "code": cb.code,  # Full code - no truncation
                        "full_length": len(cb.code),
                        "quality_score": cb.quality_score,
                    }
                    for cb in doc.code_blocks
                ],
                "tables": [
                    {
                        "caption": t.caption,
                        "headers": t.headers,
                        "rows": t.rows,
                        "row_count": t.num_rows,
                        "col_count": t.num_cols,
                    }
                    for t in doc.tables
                ],
                "links": [
                    {"text": x.text or x.target, "url": x.target} for x in doc.external_links
                ],
                "cross_references": [
                    {"type": x.ref_type.value, "target": x.target} for x in doc.internal_links
                ],
                "word_count": len(content.split()),
                "line_count": len(content.split("\n")),
                # New enhanced fields
                "_enhanced": True,
                "_extraction_stats": {
                    "total_blocks": doc.stats.total_blocks,
                    "code_blocks": len(doc.code_blocks),
                    "tables": len(doc.tables),
                    "headings": len(doc.headings),
                    "cross_references": len(doc.internal_links),
                },
            }
            return structure
        # NOTE: if parsing succeeded but produced no document, we fall
        # through to the basic extraction below.
    except Exception as e:
        # Fall back to basic extraction if unified parser fails
        logger.warning(f"Enhanced RST parser failed: {e}, using basic parser")

    # Legacy basic extraction (fallback); "_enhanced": False flags results
    # produced by this path.
    structure = {
        "title": None,
        "headers": [],
        "code_blocks": [],
        "tables": [],
        "links": [],
        "cross_references": [],
        "word_count": len(content.split()),
        "line_count": len(content.split("\n")),
        "_enhanced": False,
    }
    lines = content.split("\n")
    underline_chars = ["=", "-", "~", "^", '"', "'", "`", ":", "."]
    # Extract headers (RST style: text on one line, underline on next).
    # The underline must be a run of one repeated punctuation character at
    # least as long as the title minus a 2-char tolerance.
    for i in range(len(lines) - 1):
        current_line = lines[i].strip()
        next_line = lines[i + 1].strip()
        if (
            current_line
            and next_line
            and len(set(next_line)) == 1
            and next_line[0] in underline_chars
            and len(next_line) >= len(current_line) - 2
        ):
            # Level approximated by the underline character's fixed rank in
            # underline_chars (RST proper assigns levels by order of first use).
            level = underline_chars.index(next_line[0]) + 1
            text = current_line.strip()
            structure["headers"].append({"level": level, "text": text, "line": i + 1})
            if structure["title"] is None:
                structure["title"] = text
    # Basic code block extraction: ".. code-block:: lang" directives with an
    # indented body (body ends at the next non-indented line or EOF).
    code_block_pattern = re.compile(
        r"\.\.\s+code-block::\s+(\w+)\s*\n\s+(.*?)(?=\n\S|\Z)", re.DOTALL
    )
    for match in code_block_pattern.finditer(content):
        language = match.group(1) or "text"
        code = match.group(2).strip()
        if code:
            structure["code_blocks"].append(
                {
                    "language": language,
                    "code": code,  # Full code - no truncation
                    "full_length": len(code),
                }
            )
    # Basic link extraction: `text <url>`_ inline external links.
    link_pattern = re.compile(r"`([^<`]+)\s+<([^>]+)>`_")
    for match in link_pattern.finditer(content):
        structure["links"].append({"text": match.group(1).strip(), "url": match.group(2)})
    return structure
def generate_markdown_summary(
    content: str, structure: dict[str, Any], max_length: int = 500
) -> str:
    """
    Generate a short summary of markdown content.

    Combines the title, an outline of the first h2/h3 sections, the first
    paragraph (truncated to 200 chars) and word/code-block stats.

    Args:
        content: Full markdown content
        structure: Extracted structure from extract_markdown_structure()
        max_length: Maximum summary length

    Returns:
        Summary string
    """
    parts: list[str] = []

    # Title, when known.
    doc_title = structure.get("title")
    if doc_title:
        parts.append(f"**{doc_title}**")

    # Outline: first 5 h2/h3 headers.
    section_headers = [h for h in structure.get("headers", []) if h["level"] in (2, 3)]
    if section_headers:
        parts.append("Sections: " + ", ".join(h["text"] for h in section_headers[:5]))

    # First paragraph: first run of consecutive non-empty lines that are
    # neither headers nor code fences.
    collected: list[str] = []
    started = False
    for raw in content.split("\n"):
        text = raw.strip()
        if text.startswith(("#", "```")):
            if started:
                break
            continue
        if text:
            started = True
            collected.append(text)
        elif started:
            break
    if collected:
        paragraph = " ".join(collected)
        if len(paragraph) > 200:
            paragraph = paragraph[:200] + "..."
        parts.append(paragraph)

    # Size stats.
    parts.append(
        f"({structure.get('word_count', 0)} words, "
        f"{len(structure.get('code_blocks', []))} code blocks)"
    )

    summary = "\n".join(parts)
    return summary if len(summary) <= max_length else summary[:max_length] + "..."
def process_markdown_docs(
    directory: Path,
    output_dir: Path,
    depth: str = "deep",
    gitignore_spec: "pathspec.PathSpec | None" = None,
    enhance_with_ai: bool = False,
    ai_mode: str = "none",
) -> dict[str, Any]:
    """
    Process all markdown documentation files in a directory.

    Fixes applied: the pathspec annotation is quoted (lazy) so the module
    still imports when the optional pathspec dependency is missing, and
    ``shutil`` is imported once instead of inside the per-file copy loop.

    Args:
        directory: Root directory to scan
        output_dir: Output directory for processed docs
        depth: Processing depth ('surface', 'deep', 'full')
        gitignore_spec: Optional .gitignore spec
        enhance_with_ai: Whether to use AI enhancement
        ai_mode: AI mode ('none', 'auto', 'api', 'local')

    Returns:
        Dictionary with processed documentation data
        (total_files, categories, files)
    """
    import shutil  # hoisted: was previously re-imported inside the copy loop

    logger.info("Scanning for markdown documentation...")
    # Find all markdown files
    md_files = walk_markdown_files(directory, gitignore_spec)
    logger.info(f"Found {len(md_files)} markdown files")
    if not md_files:
        return {"files": [], "categories": {}, "total_files": 0}

    # Process each file
    processed_docs = []
    categories = {}

    # Pre-import parsers once outside the loop
    try:
        from skill_seekers.cli.parsers.extractors import RstParser, MarkdownParser
    except ImportError:
        RstParser = None  # type: ignore[assignment,misc]
        MarkdownParser = None  # type: ignore[assignment,misc]
        logger.debug("Unified parsers not available, using legacy parsers")

    for md_path in md_files:
        try:
            content = md_path.read_text(encoding="utf-8", errors="ignore")
            rel_path = str(md_path.relative_to(directory))
            category = categorize_markdown_file(md_path, directory)
            doc_data = {
                "path": rel_path,
                "filename": md_path.name,
                "category": category,
                "size_bytes": len(content.encode("utf-8")),
            }
            # Surface depth: just path and category
            if depth == "surface":
                processed_docs.append(doc_data)
            else:
                # Deep/Full: extract structure and summary using unified parsers
                structure = None
                parsed_doc = None
                try:
                    if RstParser is None or MarkdownParser is None:
                        raise ImportError("Parsers not available")
                    # Use appropriate unified parser based on file extension
                    if md_path.suffix.lower() in RST_EXTENSIONS:
                        parser = RstParser()
                        result = parser.parse_string(content, str(md_path))
                        if result.success:
                            parsed_doc = result.document
                            # Convert to legacy structure format for backward compatibility
                            structure = {
                                "title": parsed_doc.title,
                                "headers": [
                                    {"level": h.level, "text": h.text, "line": h.source_line}
                                    for h in parsed_doc.headings
                                ],
                                "code_blocks": [
                                    {"language": cb.language, "code": cb.code}  # Full code
                                    for cb in parsed_doc.code_blocks
                                ],
                                "tables": len(parsed_doc.tables),
                                "cross_refs": len(parsed_doc.internal_links),
                                "directives": len(
                                    [b for b in parsed_doc.blocks if b.type.value == "admonition"]
                                ),
                                "word_count": parsed_doc.stats.total_blocks
                                if parsed_doc.stats
                                else 0,
                                "line_count": len(content.split("\n")),
                            }
                    else:
                        parser = MarkdownParser()
                        result = parser.parse_string(content, str(md_path))
                        if result.success:
                            parsed_doc = result.document
                            # Convert to legacy structure format
                            structure = {
                                "title": parsed_doc.title,
                                "headers": [
                                    {"level": h.level, "text": h.text, "line": h.source_line}
                                    for h in parsed_doc.headings
                                ],
                                "code_blocks": [
                                    {"language": cb.language, "code": cb.code}  # Full code
                                    for cb in parsed_doc.code_blocks
                                ],
                                "tables": len(parsed_doc.tables),
                                "images": len(parsed_doc.images),
                                "links": len(parsed_doc.external_links),
                                "word_count": parsed_doc.stats.total_blocks
                                if parsed_doc.stats
                                else 0,
                                "line_count": len(content.split("\n")),
                            }
                except ImportError:
                    # Fallback to old parsers if unified parsers not available
                    logger.debug("Unified parsers not available, using legacy parsers")
                    if md_path.suffix.lower() in RST_EXTENSIONS:
                        structure = extract_rst_structure(content)
                    else:
                        structure = extract_markdown_structure(content)

                # Generate summary
                if structure is None:
                    # Fallback if parsing failed
                    if md_path.suffix.lower() in RST_EXTENSIONS:
                        structure = extract_rst_structure(content)
                    else:
                        structure = extract_markdown_structure(content)
                summary = generate_markdown_summary(content, structure)
                doc_data.update(
                    {
                        "title": structure.get("title") or md_path.stem,
                        "structure": structure,
                        "summary": summary,
                        "content": content if depth == "full" else None,
                        "_enhanced": parsed_doc is not None,  # Mark if enhanced parser was used
                    }
                )
                # If we have rich parsed data, save it
                if parsed_doc:
                    doc_data["parsed_data"] = {
                        "tables": len(parsed_doc.tables),
                        "cross_references": len(parsed_doc.internal_links),
                        "code_blocks": len(parsed_doc.code_blocks),
                        "images": len(getattr(parsed_doc, "images", [])),
                        "quality_scores": {
                            "avg_code_quality": sum(
                                cb.quality_score or 0 for cb in parsed_doc.code_blocks
                            )
                            / len(parsed_doc.code_blocks)
                            if parsed_doc.code_blocks
                            else 0,
                        },
                    }
                processed_docs.append(doc_data)
            # Track categories
            if category not in categories:
                categories[category] = []
            categories[category].append(rel_path)
        except Exception as e:
            logger.warning(f"Failed to process {md_path}: {e}")
            continue

    # AI Enhancement (if enabled and enhance_level >= 2)
    if enhance_with_ai and ai_mode != "none" and processed_docs:
        logger.info("🤖 Enhancing documentation analysis with AI...")
        try:
            processed_docs = _enhance_docs_with_ai(processed_docs, ai_mode)
            logger.info("✅ AI documentation enhancement complete")
        except Exception as e:
            logger.warning(f"⚠️ AI enhancement failed: {e}")

    # Save processed docs to output
    docs_output_dir = output_dir / "documentation"
    docs_output_dir.mkdir(parents=True, exist_ok=True)

    # Copy files organized by category
    for doc in processed_docs:
        try:
            src_path = directory / doc["path"]
            category = doc["category"]
            category_dir = docs_output_dir / category
            category_dir.mkdir(parents=True, exist_ok=True)
            # Copy file to category folder
            dest_path = category_dir / doc["filename"]
            shutil.copy2(src_path, dest_path)
        except Exception as e:
            logger.debug(f"Failed to copy {doc['path']}: {e}")

    # Save documentation index
    index_data = {
        "total_files": len(processed_docs),
        "categories": categories,
        "files": processed_docs,
    }
    index_json = docs_output_dir / "documentation_index.json"
    with open(index_json, "w", encoding="utf-8") as f:
        json.dump(index_data, f, indent=2, default=str)

    # Save extraction summary (tables, cross-refs, etc.)
    enhanced_count = sum(1 for doc in processed_docs if doc.get("_enhanced", False))
    if enhanced_count > 0:
        total_tables = sum(doc.get("parsed_data", {}).get("tables", 0) for doc in processed_docs)
        total_xrefs = sum(
            doc.get("parsed_data", {}).get("cross_references", 0) for doc in processed_docs
        )
        total_code_blocks = sum(
            doc.get("parsed_data", {}).get("code_blocks", 0) for doc in processed_docs
        )
        extraction_summary = {
            "enhanced_files": enhanced_count,
            "total_files": len(processed_docs),
            "extraction_stats": {
                "tables": total_tables,
                "cross_references": total_xrefs,
                "code_blocks": total_code_blocks,
            },
            "parser_version": "unified_v1.0.0",
        }
        summary_json = docs_output_dir / "extraction_summary.json"
        with open(summary_json, "w", encoding="utf-8") as f:
            json.dump(extraction_summary, f, indent=2)
        logger.info("📊 Extraction Summary:")
        logger.info(f" - Enhanced files: {enhanced_count}/{len(processed_docs)}")
        logger.info(f" - Tables extracted: {total_tables}")
        logger.info(f" - Cross-references: {total_xrefs}")
        logger.info(f" - Code blocks: {total_code_blocks}")

    logger.info(
        f"✅ Processed {len(processed_docs)} documentation files in {len(categories)} categories"
    )
    logger.info(f"📁 Saved to: {docs_output_dir}")
    return index_data
def _enhance_docs_with_ai(docs: list[dict], ai_mode: str) -> list[dict]:
"""
Enhance documentation analysis with AI.
Args:
docs: List of processed document dictionaries
ai_mode: AI mode ('api' or 'local')
Returns:
Enhanced document list
"""
# Try API mode first
if ai_mode in ("api", "auto"):
api_key = os.environ.get("ANTHROPIC_API_KEY")
if api_key:
return _enhance_docs_api(docs, api_key)
# Fall back to LOCAL mode
if ai_mode in ("local", "auto"):
return _enhance_docs_local(docs)
return docs
def _enhance_docs_api(docs: list[dict], api_key: str) -> list[dict]:
    """Enhance docs using Claude API.

    Mutates the dicts in *docs* in place, attaching ``ai_description``,
    ``ai_topics`` and ``ai_related`` keys when the model returns parsable
    JSON.  Best-effort: any failure only logs a warning and *docs* is
    always returned.
    """
    try:
        import anthropic

        client = anthropic.Anthropic(api_key=api_key)
        # Batch documents for efficiency (one API call per 10 docs)
        batch_size = 10
        for i in range(0, len(docs), batch_size):
            batch = docs[i : i + batch_size]
            # Create prompt for batch; docs without a summary are skipped
            docs_text = "\n\n".join(
                [
                    f"## {d.get('title', d['filename'])}\nCategory: {d['category']}\nSummary: {d.get('summary', 'N/A')}"
                    for d in batch
                    if d.get("summary")
                ]
            )
            if not docs_text:
                continue
            prompt = f"""Analyze these documentation files and provide:
1. A brief description of what each document covers
2. Key topics/concepts mentioned
3. How they relate to each other
Documents:
{docs_text}
Return JSON with format:
{{"enhancements": [{{"filename": "...", "description": "...", "key_topics": [...], "related_to": [...]}}]}}"""
            response = client.messages.create(
                model="claude-sonnet-4-20250514",
                max_tokens=2000,
                messages=[{"role": "user", "content": prompt}],
            )
            # Parse response and merge enhancements into the batch dicts.
            try:
                # Grab the outermost {...} span; the model may wrap JSON in prose.
                json_match = re.search(r"\{.*\}", response.content[0].text, re.DOTALL)
                if json_match:
                    enhancements = json.loads(json_match.group())
                    for enh in enhancements.get("enhancements", []):
                        for doc in batch:
                            if doc["filename"] == enh.get("filename"):
                                doc["ai_description"] = enh.get("description")
                                doc["ai_topics"] = enh.get("key_topics", [])
                                doc["ai_related"] = enh.get("related_to", [])
            except Exception:
                # Malformed model output: skip enhancement for this batch.
                pass
    except Exception as e:
        logger.warning(f"API enhancement failed: {e}")
    return docs
def _enhance_docs_local(docs: list[dict]) -> list[dict]:
"""Enhance docs using Claude Code CLI (LOCAL mode)."""
import subprocess
import tempfile
# Prepare batch of docs for enhancement
docs_with_summary = [d for d in docs if d.get("summary")]
if not docs_with_summary:
return docs
docs_text = "\n\n".join(
[
f"## {d.get('title', d['filename'])}\nCategory: {d['category']}\nPath: {d['path']}\nSummary: {d.get('summary', 'N/A')}"
for d in docs_with_summary[:20] # Limit to 20 docs
]
)
prompt = f"""Analyze these documentation files from a codebase and provide insights.
For each document, provide:
1. A brief description of what it covers
2. Key topics/concepts
3. Related documents
Documents:
{docs_text}
Output JSON only:
{{"enhancements": [{{"filename": "...", "description": "...", "key_topics": ["..."], "related_to": ["..."]}}]}}"""
try:
with tempfile.NamedTemporaryFile(mode="w", suffix=".txt", delete=False) as f:
f.write(prompt)
prompt_file = f.name
result = subprocess.run(
["claude", "--dangerously-skip-permissions", "-p", prompt],
capture_output=True,
text=True,
timeout=120,
)
os.unlink(prompt_file)
if result.returncode == 0 and result.stdout:
json_match = re.search(r"\{.*\}", result.stdout, re.DOTALL)
if json_match:
enhancements = json.loads(json_match.group())
for enh in enhancements.get("enhancements", []):
for doc in docs:
if doc["filename"] == enh.get("filename"):
doc["ai_description"] = enh.get("description")
doc["ai_topics"] = enh.get("key_topics", [])
doc["ai_related"] = enh.get("related_to", [])
except Exception as e:
logger.warning(f"LOCAL enhancement failed: {e}")
return docs
def analyze_codebase(
directory: Path,
output_dir: Path,
depth: str = "deep",
languages: list[str] | None = None,
file_patterns: list[str] | None = None,
build_api_reference: bool = True,
extract_comments: bool = True,
build_dependency_graph: bool = True,
detect_patterns: bool = True,
extract_test_examples: bool = True,
build_how_to_guides: bool = True,
extract_config_patterns: bool = True,
extract_docs: bool = True,
enhance_level: int = 0,
skill_name: str | None = None,
skill_description: str | None = None,
doc_version: str = "",
) -> dict[str, Any]:
"""
Analyze local codebase and extract code knowledge.
Args:
directory: Directory to analyze
output_dir: Output directory for results
depth: Analysis depth (surface, deep, full)
languages: Optional list of languages to analyze
file_patterns: Optional file patterns to include
build_api_reference: Generate API reference markdown
extract_comments: Extract inline comments
build_dependency_graph: Generate dependency graph and detect circular dependencies
detect_patterns: Detect design patterns (Singleton, Factory, Observer, etc.)
extract_test_examples: Extract usage examples from test files
build_how_to_guides: Build how-to guides from workflow examples (C3.3)
extract_config_patterns: Extract configuration patterns from config files (C3.4)
extract_docs: Extract and process markdown documentation files (default: True)
enhance_level: AI enhancement level (0=off, 1=SKILL.md only, 2=+config+arch+docs, 3=full)
skill_name: Optional override for skill name (default: directory name)
skill_description: Optional override for skill description
Returns:
Analysis results dictionary
"""
# Determine AI enhancement settings based on level
# Level 0: No AI enhancement
# Level 1: SKILL.md only (handled in main.py)
# Level 2: Architecture + Config AI enhancement
# Level 3: Full AI enhancement (patterns, tests, config, architecture)
enhance_patterns = enhance_level >= 3
enhance_tests = enhance_level >= 3
enhance_config = enhance_level >= 2
enhance_architecture = enhance_level >= 2
ai_mode = "auto" if enhance_level > 0 else "none"
if enhance_level > 0:
level_names = {1: "SKILL.md only", 2: "SKILL.md+Architecture+Config", 3: "full"}
logger.info(
f"🤖 AI Enhancement Level: {enhance_level} ({level_names.get(enhance_level, 'unknown')})"
)
# Resolve directory to absolute path to avoid relative_to() errors
directory = Path(directory).resolve()
logger.info(f"Analyzing codebase: {directory}")
logger.info(f"Depth: {depth}")
# Create output directory
output_dir = Path(output_dir)
output_dir.mkdir(parents=True, exist_ok=True)
# Load .gitignore
gitignore_spec = load_gitignore(directory)
# Walk directory tree
logger.info("Scanning directory tree...")
files = walk_directory(directory, patterns=file_patterns, gitignore_spec=gitignore_spec)
logger.info(f"Found {len(files)} source files")
# Filter by language if specified
if languages:
language_set = set(languages)
files = [f for f in files if detect_language(f) in language_set]
logger.info(f"Filtered to {len(files)} files for languages: {', '.join(languages)}")
# Initialize code analyzer
analyzer = CodeAnalyzer(depth=depth)
# Analyze each file
results = {"files": []}
analyzed_count = 0
for file_path in files:
try:
content = file_path.read_text(encoding="utf-8", errors="ignore")
language = detect_language(file_path)
if language == "Unknown":
continue
# Analyze file
analysis = analyzer.analyze_file(str(file_path), content, language)
# Only include files with actual analysis results
# Check for any meaningful content (classes, functions, imports, nodes, properties, etc.)
# IMPORTANT: Include files with imports for framework detection (fixes #239)
has_content = (
analysis.get("classes")
or analysis.get("functions")
or analysis.get("imports") # Include import-only files (fixes #239)
or analysis.get("nodes") # Godot scenes
or analysis.get("properties") # Godot resources
or analysis.get("uniforms") # Godot shaders
or analysis.get("signals") # GDScript signals
or analysis.get("exports") # GDScript exports
)
if analysis and has_content:
results["files"].append(
{
"file": str(file_path.relative_to(directory)),
"language": language,
**analysis,
}
)
analyzed_count += 1
if analyzed_count % 10 == 0:
logger.info(f"Analyzed {analyzed_count}/{len(files)} files...")
except Exception as e:
logger.warning(f"Error analyzing {file_path}: {e}")
continue
logger.info(f"✅ Successfully analyzed {analyzed_count} files")
# Save results
output_json = output_dir / "code_analysis.json"
with open(output_json, "w", encoding="utf-8") as f:
json.dump(results, f, indent=2)
logger.info(f"📁 Saved analysis to: {output_json}")
# Build API reference if requested
if build_api_reference and results["files"]:
logger.info("Building API reference documentation...")
builder = APIReferenceBuilder(results)
api_output_dir = output_dir / "api_reference"
generated_files = builder.build_reference(api_output_dir)
logger.info(f"✅ Generated {len(generated_files)} API reference files")
logger.info(f"📁 API reference: {api_output_dir}")
# Build dependency graph if requested (C2.6)
if build_dependency_graph:
logger.info("Building dependency graph...")
dep_analyzer = DependencyAnalyzer()
# Analyze dependencies for all files
for file_path in files:
try:
content = file_path.read_text(encoding="utf-8", errors="ignore")
language = detect_language(file_path)
if language != "Unknown":
# Use relative path from directory for better graph readability
rel_path = str(file_path.relative_to(directory))
dep_analyzer.analyze_file(rel_path, content, language)
except Exception as e:
logger.warning(f"Error analyzing dependencies for {file_path}: {e}")
continue
# Build the graph
graph = dep_analyzer.build_graph()
# Detect circular dependencies
cycles = dep_analyzer.detect_cycles()
if cycles:
logger.warning(f"⚠️ Found {len(cycles)} circular dependencies:")
for i, cycle in enumerate(cycles[:5], 1): # Show first 5
cycle_str = "".join(cycle) + f"{cycle[0]}"
logger.warning(f" {i}. {cycle_str}")
if len(cycles) > 5:
logger.warning(f" ... and {len(cycles) - 5} more")
else:
logger.info("✅ No circular dependencies found")
# Save dependency graph data
dep_output_dir = output_dir / "dependencies"
dep_output_dir.mkdir(parents=True, exist_ok=True)
# Export as JSON
dep_json = dep_output_dir / "dependency_graph.json"
with open(dep_json, "w", encoding="utf-8") as f:
json.dump(dep_analyzer.export_json(), f, indent=2)
logger.info(f"📁 Saved dependency graph: {dep_json}")
# Export as Mermaid diagram
mermaid_file = dep_output_dir / "dependency_graph.mmd"
mermaid_file.write_text(dep_analyzer.export_mermaid())
logger.info(f"📁 Saved Mermaid diagram: {mermaid_file}")
# Save statistics
stats = dep_analyzer.get_statistics()
stats_file = dep_output_dir / "statistics.json"
with open(stats_file, "w", encoding="utf-8") as f:
json.dump(stats, f, indent=2)
logger.info(
f"📊 Statistics: {stats['total_files']} files, "
f"{stats['total_dependencies']} dependencies, "
f"{stats['circular_dependencies']} cycles"
)
# Try to export as DOT (requires pydot)
try:
dot_file = dep_output_dir / "dependency_graph.dot"
dep_analyzer.export_dot(str(dot_file))
except Exception:
pass # pydot not installed, skip DOT export
# Detect design patterns if requested (C3.1)
if detect_patterns:
logger.info("Detecting design patterns...")
from skill_seekers.cli.pattern_recognizer import PatternRecognizer
# Step 1: Detect patterns WITHOUT enhancement (collect all first)
pattern_recognizer = PatternRecognizer(depth=depth, enhance_with_ai=False)
pattern_results = []
for file_path in files:
try:
content = file_path.read_text(encoding="utf-8", errors="ignore")
language = detect_language(file_path)
if language != "Unknown":
report = pattern_recognizer.analyze_file(str(file_path), content, language)
if report.patterns:
pattern_results.append(report.to_dict())
except Exception as e:
logger.warning(f"Pattern detection failed for {file_path}: {e}")
continue
# Step 2: Enhance ALL patterns at once (batched across all files)
if enhance_patterns and pattern_results:
logger.info("🤖 Enhancing patterns with AI (batched)...")
from skill_seekers.cli.ai_enhancer import PatternEnhancer
enhancer = PatternEnhancer()
# Flatten all patterns from all files
all_patterns = []
pattern_map = [] # Track (report_idx, pattern_idx) for each pattern
for report_idx, report in enumerate(pattern_results):
for pattern_idx, pattern in enumerate(report.get("patterns", [])):
all_patterns.append(pattern)
pattern_map.append((report_idx, pattern_idx))
if all_patterns:
# Enhance all patterns in batches (this is where batching happens!)
enhanced_patterns = enhancer.enhance_patterns(all_patterns)
# Map enhanced patterns back to their reports
for i, (report_idx, pattern_idx) in enumerate(pattern_map):
if i < len(enhanced_patterns):
pattern_results[report_idx]["patterns"][pattern_idx] = enhanced_patterns[i]
# Save pattern results with multi-level filtering (Issue #240)
if pattern_results:
pattern_output = output_dir / "patterns"
pattern_output.mkdir(parents=True, exist_ok=True)
# Import filtering utilities
from skill_seekers.cli.pattern_recognizer import create_multi_level_report
# Create multi-level report
multi_level = create_multi_level_report(pattern_results)
stats = multi_level["statistics"]
# Save all patterns (unfiltered)
all_patterns_json = pattern_output / "all_patterns.json"
with open(all_patterns_json, "w", encoding="utf-8") as f:
json.dump(pattern_results, f, indent=2)
# Save high-confidence patterns (>= 0.70) for detailed analysis
high_confidence_json = pattern_output / "high_confidence_patterns.json"
with open(high_confidence_json, "w", encoding="utf-8") as f:
json.dump(multi_level["high_confidence"], f, indent=2)
# Save critical patterns (>= 0.80) for ARCHITECTURE.md
critical_json = pattern_output / "critical_patterns.json"
with open(critical_json, "w", encoding="utf-8") as f:
json.dump(multi_level["critical"], f, indent=2)
# Save summary statistics
summary_json = pattern_output / "summary.json"
with open(summary_json, "w", encoding="utf-8") as f:
json.dump(
{
"statistics": stats,
"thresholds": multi_level["thresholds"],
"files_analyzed": len(pattern_results),
},
f,
indent=2,
)
# Log results with breakdown by confidence
logger.info(f"✅ Detected {stats['total']} patterns in {len(pattern_results)} files")
logger.info(f" 🔴 Critical (≥0.80): {stats['critical_count']} patterns")
logger.info(f" 🟠 High (≥0.70): {stats['high_confidence_count']} patterns")
logger.info(f" 🟡 Medium (≥0.60): {stats['medium_count']} patterns")
logger.info(f" ⚪ Low (<0.60): {stats['low_count']} patterns")
logger.info(f"📁 Saved to: {pattern_output}/")
else:
logger.info("No design patterns detected")
# Extract test examples if requested (C3.2)
if extract_test_examples:
logger.info("Extracting usage examples from test files...")
from skill_seekers.cli.test_example_extractor import TestExampleExtractor
# Create extractor
test_extractor = TestExampleExtractor(
min_confidence=0.5,
max_per_file=10,
languages=languages,
enhance_with_ai=enhance_tests,
)
# Extract examples from directory
try:
example_report = test_extractor.extract_from_directory(directory, recursive=True)
if example_report.total_examples > 0:
# Save results
examples_output = output_dir / "test_examples"
examples_output.mkdir(parents=True, exist_ok=True)
# Save as JSON
examples_json = examples_output / "test_examples.json"
with open(examples_json, "w", encoding="utf-8") as f:
json.dump(example_report.to_dict(), f, indent=2)
# Save as Markdown
examples_md = examples_output / "test_examples.md"
examples_md.write_text(example_report.to_markdown(), encoding="utf-8")
logger.info(
f"✅ Extracted {example_report.total_examples} test examples "
f"({example_report.high_value_count} high-value)"
)
logger.info(f"📁 Saved to: {examples_output}")
else:
logger.info("No test examples extracted")
except Exception as e:
logger.warning(f"Test example extraction failed: {e}")
example_report = None
# Build how-to guides from workflow examples (C3.3)
if build_how_to_guides and extract_test_examples:
logger.info("Building how-to guides from workflow examples...")
try:
from skill_seekers.cli.how_to_guide_builder import HowToGuideBuilder
# Create guide builder (uses same enhance level as test examples)
guide_builder = HowToGuideBuilder(enhance_with_ai=enhance_tests)
# Build guides from workflow examples
tutorials_dir = output_dir / "tutorials"
# Get workflow examples from the example_report if available
if (
"example_report" in locals()
and example_report
and example_report.total_examples > 0
):
# Convert example_report to list of dicts for processing
examples_list = example_report.to_dict().get("examples", [])
guide_collection = guide_builder.build_guides_from_examples(
examples_list,
grouping_strategy="ai-tutorial-group",
output_dir=tutorials_dir,
enhance_with_ai=enhance_tests,
ai_mode=ai_mode,
)
if guide_collection and guide_collection.total_guides > 0:
# Save collection summary
collection_json = tutorials_dir / "guide_collection.json"
with open(collection_json, "w", encoding="utf-8") as f:
json.dump(guide_collection.to_dict(), f, indent=2)
logger.info(f"✅ Built {guide_collection.total_guides} how-to guides")
logger.info(f"📁 Saved to: {tutorials_dir}")
else:
logger.info("No how-to guides generated (insufficient workflow examples)")
else:
logger.info("No workflow examples available for guide generation")
except Exception as e:
logger.warning(f"How-to guide building failed: {e}")
# Extract configuration patterns (C3.4)
if extract_config_patterns:
logger.info("Extracting configuration patterns...")
try:
config_extractor = ConfigExtractor()
# Extract config patterns from directory
extraction_result = config_extractor.extract_from_directory(directory)
if extraction_result.config_files:
# Convert to dict for enhancement
result_dict = config_extractor.to_dict(extraction_result)
# AI Enhancement (if enabled - level 2+)
if enhance_config and ai_mode != "none":
try:
from skill_seekers.cli.config_enhancer import ConfigEnhancer
logger.info(f"🤖 Enhancing config analysis with AI (mode: {ai_mode})...")
enhancer = ConfigEnhancer(mode=ai_mode)
result_dict = enhancer.enhance_config_result(result_dict)
logger.info("✅ AI enhancement complete")
except Exception as e:
logger.warning(f"⚠️ Config AI enhancement failed: {e}")
# Save results
config_output = output_dir / "config_patterns"
config_output.mkdir(parents=True, exist_ok=True)
# Save as JSON
config_json = config_output / "config_patterns.json"
with open(config_json, "w", encoding="utf-8") as f:
json.dump(result_dict, f, indent=2)
# Save as Markdown (basic - AI enhancements in JSON only for now)
config_md = config_output / "config_patterns.md"
config_md.write_text(extraction_result.to_markdown(), encoding="utf-8")
# Count total settings across all files
total_settings = sum(len(cf.settings) for cf in extraction_result.config_files)
total_patterns = sum(len(cf.patterns) for cf in extraction_result.config_files)
logger.info(
f"✅ Extracted {len(extraction_result.config_files)} config files "
f"with {total_settings} settings and {total_patterns} detected patterns"
)
if "ai_enhancements" in result_dict:
insights = result_dict["ai_enhancements"].get("overall_insights", {})
if insights.get("security_issues_found"):
logger.info(
f"🔐 Security issues found: {insights['security_issues_found']}"
)
logger.info(f"📁 Saved to: {config_output}")
else:
logger.info("No configuration files found")
except Exception as e:
logger.warning(f"Config pattern extraction failed: {e}")
# Detect architectural patterns (C3.7)
# Always run this - it provides high-level overview
logger.info("Analyzing architectural patterns...")
from skill_seekers.cli.architectural_pattern_detector import ArchitecturalPatternDetector
arch_detector = ArchitecturalPatternDetector(enhance_with_ai=enhance_architecture)
arch_report = arch_detector.analyze(directory, results["files"])
# Save architecture analysis if we have patterns OR frameworks (fixes #239)
if arch_report.patterns or arch_report.frameworks_detected:
arch_output = output_dir / "architecture"
arch_output.mkdir(parents=True, exist_ok=True)
# Save as JSON
arch_json = arch_output / "architectural_patterns.json"
with open(arch_json, "w", encoding="utf-8") as f:
json.dump(arch_report.to_dict(), f, indent=2)
if arch_report.patterns:
logger.info(f"🏗️ Detected {len(arch_report.patterns)} architectural patterns")
for pattern in arch_report.patterns:
logger.info(f" - {pattern.pattern_name} (confidence: {pattern.confidence:.2f})")
else:
logger.info("No clear architectural patterns detected")
if arch_report.frameworks_detected:
logger.info(f"📦 Detected {len(arch_report.frameworks_detected)} frameworks")
logger.info(f"📁 Saved to: {arch_json}")
else:
logger.info("No architectural patterns or frameworks detected")
# Analyze signal flow patterns (C3.10) - Godot projects only
signal_analysis = None
has_godot_files = any(
f.get("language") in ("GDScript", "GodotScene", "GodotResource", "GodotShader")
for f in results.get("files", [])
)
if has_godot_files:
logger.info("Analyzing signal flow patterns (Godot)...")
try:
signal_analyzer = SignalFlowAnalyzer(results)
signal_output = signal_analyzer.save_analysis(output_dir, ai_mode)
signal_analysis = signal_analyzer.analyze()
stats = signal_analysis["statistics"]
logger.info(f"📡 Signal Analysis Complete:")
logger.info(f" - {stats['total_signals']} signal declarations")
logger.info(f" - {stats['total_connections']} signal connections")
logger.info(f" - {stats['total_emissions']} signal emissions")
logger.info(f" - {len(signal_analysis['patterns'])} patterns detected")
logger.info(f"📁 Saved to: {signal_output}")
except Exception as e:
logger.warning(f"Signal flow analysis failed: {e}")
# Extract markdown documentation (C3.9)
docs_data = None
if extract_docs:
logger.info("Extracting project documentation...")
try:
# Determine AI enhancement for docs (level 2+)
enhance_docs_ai = enhance_level >= 2
docs_data = process_markdown_docs(
directory=directory,
output_dir=output_dir,
depth=depth,
gitignore_spec=gitignore_spec,
enhance_with_ai=enhance_docs_ai,
ai_mode=ai_mode,
)
if docs_data and docs_data.get("total_files", 0) > 0:
logger.info(
f"✅ Extracted {docs_data['total_files']} documentation files "
f"in {len(docs_data.get('categories', {}))} categories"
)
else:
logger.info("No markdown documentation files found")
except Exception as e:
logger.warning(f"Documentation extraction failed: {e}")
docs_data = None
# Generate SKILL.md and references/ directory
logger.info("Generating SKILL.md and references...")
_generate_skill_md(
output_dir=output_dir,
directory=directory,
results=results,
depth=depth,
build_api_reference=build_api_reference,
build_dependency_graph=build_dependency_graph,
detect_patterns=detect_patterns,
extract_test_examples=extract_test_examples,
extract_config_patterns=extract_config_patterns,
extract_docs=extract_docs,
docs_data=docs_data,
skill_name=skill_name,
skill_description=skill_description,
doc_version=doc_version,
)
return results
def _generate_skill_md(
    output_dir: Path,
    directory: Path,
    results: dict[str, Any],
    depth: str,
    build_api_reference: bool,
    build_dependency_graph: bool,
    detect_patterns: bool,
    extract_test_examples: bool,
    extract_config_patterns: bool,
    extract_docs: bool = True,
    docs_data: dict[str, Any] | None = None,
    skill_name: str | None = None,
    skill_description: str | None = None,
    doc_version: str = "",
):
    """
    Generate rich SKILL.md from codebase analysis results.

    Creates a 300+ line skill file with:
    - Front matter (name, description)
    - Repository info (path, languages, file count)
    - When to Use section
    - Quick Reference (patterns, languages, stats)
    - Code Examples (from test files)
    - API Reference (from code analysis)
    - Architecture Overview
    - Configuration Patterns
    - Available References

    Args:
        output_dir: Directory where SKILL.md and references/ are written; the
            per-feature analysis subdirectories (patterns/, signals/, ...) are
            expected to already exist here.
        directory: Analyzed codebase root; its name seeds the skill name.
        results: Analysis results; only the "files" list is read here.
        depth: Analysis depth label, echoed verbatim into the document.
        build_api_reference: Whether the C2.5 API reference was generated.
        build_dependency_graph: Whether the C2.6 dependency graph was built.
        detect_patterns: Whether C3.1 design-pattern detection ran.
        extract_test_examples: Whether C3.2 test-example extraction ran.
        extract_config_patterns: Whether C3.4 config-pattern extraction ran.
        extract_docs: Whether C3.9 markdown documentation extraction ran.
        docs_data: Extracted documentation payload for the docs section.
        skill_name: Optional CLI override for the generated skill name.
        skill_description: Optional CLI override for the skill description.
        doc_version: Version string embedded in the YAML front matter.
    """
    repo_name = directory.name
    # Generate skill name (lowercase, hyphens only, max 64 chars)
    # Use CLI override if provided, otherwise derive from directory name
    if skill_name:
        skill_name = skill_name.lower().replace("_", "-").replace(" ", "-")[:64]
    else:
        skill_name = repo_name.lower().replace("_", "-").replace(" ", "-")[:64]
    # Generate description (use CLI override if provided)
    description = skill_description or f"Local codebase analysis for {repo_name}"
    # Count files by language
    language_stats = _get_language_stats(results.get("files", []))
    total_files = len(results.get("files", []))
    # Start building content
    skill_content = f"""---
name: {skill_name}
description: {description}
doc_version: {doc_version}
---
# {repo_name} Codebase
## Description
Local codebase analysis and documentation generated from code analysis.
**Path:** `{directory}`
**Files Analyzed:** {total_files}
**Languages:** {", ".join(language_stats.keys())}
**Analysis Depth:** {depth}
## When to Use This Skill
Use this skill when you need to:
- Understand the codebase architecture and design patterns
- Find implementation examples and usage patterns
- Review API documentation extracted from code
- Check configuration patterns and best practices
- Explore test examples and real-world usage
- Navigate the codebase structure efficiently
## ⚡ Quick Reference
### Codebase Statistics
"""
    # Language breakdown (most common language first)
    skill_content += "**Languages:**\n"
    for lang, count in sorted(language_stats.items(), key=lambda x: x[1], reverse=True):
        percentage = (count / total_files * 100) if total_files > 0 else 0
        skill_content += f"- **{lang}**: {count} files ({percentage:.1f}%)\n"
    skill_content += "\n"
    # Analysis features performed
    skill_content += "**Analysis Performed:**\n"
    if build_api_reference:
        skill_content += "- ✅ API Reference (C2.5)\n"
    if build_dependency_graph:
        skill_content += "- ✅ Dependency Graph (C2.6)\n"
    if detect_patterns:
        skill_content += "- ✅ Design Patterns (C3.1)\n"
    if extract_test_examples:
        skill_content += "- ✅ Test Examples (C3.2)\n"
    if extract_config_patterns:
        skill_content += "- ✅ Configuration Patterns (C3.4)\n"
    # Architectural analysis (C3.7) always runs, so it is listed unconditionally.
    skill_content += "- ✅ Architectural Analysis (C3.7)\n"
    if extract_docs:
        skill_content += "- ✅ Project Documentation (C3.9)\n"
    # Check if signal flow analysis was performed (its JSON exists on disk;
    # this must happen before _generate_references relocates output dirs).
    has_signal_analysis = (output_dir / "signals" / "signal_flow.json").exists()
    if has_signal_analysis:
        skill_content += "- ✅ Signal Flow Analysis (C3.10)\n"
    skill_content += "\n"
    # Add design patterns if available
    if detect_patterns:
        patterns_content = _format_patterns_section(output_dir)
        if patterns_content:
            skill_content += patterns_content
    # Add code examples if available
    if extract_test_examples:
        examples_content = _format_examples_section(output_dir)
        if examples_content:
            skill_content += examples_content
    # Add API reference if available
    if build_api_reference:
        api_content = _format_api_section(output_dir)
        if api_content:
            skill_content += api_content
    # Add architecture if available (no flag: C3.7 always runs)
    arch_content = _format_architecture_section(output_dir)
    if arch_content:
        skill_content += arch_content
    # Add configuration patterns if available
    if extract_config_patterns:
        config_content = _format_config_section(output_dir)
        if config_content:
            skill_content += config_content
    # Add signal flow analysis if available (C3.10)
    signal_content = _format_signal_flow_section(output_dir, results)
    if signal_content:
        skill_content += signal_content
    # Add project documentation if available
    if extract_docs and docs_data:
        docs_content = _format_documentation_section(output_dir, docs_data)
        if docs_content:
            skill_content += docs_content
    # Available references: only list directories that actually exist so the
    # document never points at missing material.
    skill_content += "## 📚 Available References\n\n"
    skill_content += "This skill includes detailed reference documentation:\n\n"
    refs_added = False
    if build_api_reference and (output_dir / "api_reference").exists():
        skill_content += (
            "- **API Reference**: `references/api_reference/` - Complete API documentation\n"
        )
        refs_added = True
    if build_dependency_graph and (output_dir / "dependencies").exists():
        skill_content += (
            "- **Dependencies**: `references/dependencies/` - Dependency graph and analysis\n"
        )
        refs_added = True
    if detect_patterns and (output_dir / "patterns").exists():
        skill_content += "- **Patterns**: `references/patterns/` - Detected design patterns\n"
        refs_added = True
    if extract_test_examples and (output_dir / "test_examples").exists():
        skill_content += "- **Examples**: `references/test_examples/` - Usage examples from tests\n"
        refs_added = True
    if extract_config_patterns and (output_dir / "config_patterns").exists():
        skill_content += (
            "- **Configuration**: `references/config_patterns/` - Configuration patterns\n"
        )
        refs_added = True
    if (output_dir / "architecture").exists():
        skill_content += "- **Architecture**: `references/architecture/` - Architectural patterns\n"
        refs_added = True
    if extract_docs and (output_dir / "documentation").exists():
        skill_content += (
            "- **Documentation**: `references/documentation/` - Project documentation\n"
        )
        refs_added = True
    if not refs_added:
        skill_content += "No additional references generated (analysis features disabled).\n"
    skill_content += "\n"
    # Footer
    skill_content += "---\n\n"
    skill_content += "**Generated by Skill Seeker** | Codebase Analyzer with C3.x Analysis\n"
    # Write SKILL.md
    skill_path = output_dir / "SKILL.md"
    skill_path.write_text(skill_content, encoding="utf-8")
    line_count = len(skill_content.split("\n"))
    logger.info(f"✅ Generated SKILL.md: {skill_path} ({line_count} lines)")
    # Generate references/ directory structure (moves analysis dirs under
    # references/ — must run after all the existence checks above)
    _generate_references(output_dir)
def _get_language_stats(files: list[dict]) -> dict[str, int]:
"""Count files by language from analysis results."""
stats = {}
for file_data in files:
# files is a list of dicts with 'language' key
lang = file_data.get("language", "Unknown")
if lang != "Unknown":
stats[lang] = stats.get(lang, 0) + 1
return stats
def _format_patterns_section(output_dir: Path) -> str:
"""Format design patterns section from patterns/detected_patterns.json."""
patterns_file = output_dir / "patterns" / "detected_patterns.json"
if not patterns_file.exists():
return ""
try:
with open(patterns_file, encoding="utf-8") as f:
patterns_data = json.load(f)
except Exception:
return ""
if not patterns_data:
return ""
# Count patterns by type (deduplicate by class, keep highest confidence)
pattern_counts = {}
by_class = {}
for pattern_file in patterns_data:
for pattern in pattern_file.get("patterns", []):
ptype = pattern.get("pattern_type", "Unknown")
cls = pattern.get("class_name", "")
confidence = pattern.get("confidence", 0)
# Skip low confidence
if confidence < 0.7:
continue
# Deduplicate by class
key = f"{cls}:{ptype}"
if key not in by_class or by_class[key]["confidence"] < confidence:
by_class[key] = pattern
# Count by type
pattern_counts[ptype] = pattern_counts.get(ptype, 0) + 1
if not pattern_counts:
return ""
content = "### 🎨 Design Patterns Detected\n\n"
content += "*From C3.1 codebase analysis (confidence > 0.7)*\n\n"
# Top 5 pattern types
for ptype, count in sorted(pattern_counts.items(), key=lambda x: x[1], reverse=True)[:5]:
content += f"- **{ptype}**: {count} instances\n"
content += f"\n*Total: {len(by_class)} high-confidence patterns*\n\n"
content += "*See `references/patterns/` for complete pattern analysis*\n\n"
return content
def _format_examples_section(output_dir: Path) -> str:
"""Format code examples section from test_examples/test_examples.json."""
examples_file = output_dir / "test_examples" / "test_examples.json"
if not examples_file.exists():
return ""
try:
with open(examples_file, encoding="utf-8") as f:
examples_data = json.load(f)
except Exception:
return ""
examples = examples_data.get("examples", [])
if not examples:
return ""
# Filter high-value examples (complexity > 0.7)
high_value = [ex for ex in examples if ex.get("complexity_score", 0) > 0.7]
if not high_value:
# If no high complexity, take any examples
high_value = examples[:10]
if not high_value:
return ""
content = "## 📝 Code Examples\n\n"
content += "*High-quality examples extracted from test files (C3.2)*\n\n"
# Top 10 examples
for ex in sorted(high_value, key=lambda x: x.get("complexity_score", 0), reverse=True)[:10]:
desc = ex.get("description", "Example")
lang = ex.get("language", "python").lower()
code = ex.get("code", "")
complexity = ex.get("complexity_score", 0)
content += f"**{desc}** (complexity: {complexity:.2f})\n\n"
content += f"```{lang}\n{code}\n```\n\n"
content += "*See `references/test_examples/` for all extracted examples*\n\n"
return content
def _format_api_section(output_dir: Path) -> str:
"""Format API reference section."""
api_dir = output_dir / "api_reference"
if not api_dir.exists():
return ""
api_md = api_dir / "api_reference.md"
if not api_md.exists():
return ""
try:
api_content = api_md.read_text(encoding="utf-8")
except Exception:
return ""
# Extract first section (up to 500 chars)
preview = api_content[:500]
if len(api_content) > 500:
preview += "..."
content = "## 🔧 API Reference\n\n"
content += "*Extracted from codebase analysis (C2.5)*\n\n"
content += preview + "\n\n"
content += "*See `references/api_reference/` for complete API documentation*\n\n"
return content
def _format_architecture_section(output_dir: Path) -> str:
"""Format architecture section from architecture/architectural_patterns.json."""
arch_file = output_dir / "architecture" / "architectural_patterns.json"
if not arch_file.exists():
return ""
try:
with open(arch_file, encoding="utf-8") as f:
arch_data = json.load(f)
except Exception:
return ""
patterns = arch_data.get("patterns", [])
if not patterns:
return ""
content = "## 🏗️ Architecture Overview\n\n"
content += "*From C3.7 architectural analysis*\n\n"
content += "**Detected Architectural Patterns:**\n\n"
for pattern in patterns[:5]:
name = pattern.get("pattern_name", "Unknown")
confidence = pattern.get("confidence", 0)
indicators = pattern.get("indicators", [])
content += f"- **{name}** (confidence: {confidence:.2f})\n"
if indicators:
content += f" - Indicators: {', '.join(indicators[:3])}\n"
content += f"\n*Total: {len(patterns)} architectural patterns detected*\n\n"
content += "*See `references/architecture/` for complete architectural analysis*\n\n"
return content
def _format_config_section(output_dir: Path) -> str:
"""Format configuration patterns section."""
config_file = output_dir / "config_patterns" / "config_patterns.json"
if not config_file.exists():
return ""
try:
with open(config_file, encoding="utf-8") as f:
config_data = json.load(f)
except Exception:
return ""
config_files = config_data.get("config_files", [])
if not config_files:
return ""
total_settings = sum(len(cf.get("settings", [])) for cf in config_files)
total_patterns = sum(len(cf.get("patterns", [])) for cf in config_files)
content = "## ⚙️ Configuration Patterns\n\n"
content += "*From C3.4 configuration analysis*\n\n"
content += f"**Configuration Files Analyzed:** {len(config_files)}\n"
content += f"**Total Settings:** {total_settings}\n"
content += f"**Patterns Detected:** {total_patterns}\n\n"
# List config file types found
file_types = {}
for cf in config_files:
ctype = cf.get("config_type", "unknown")
file_types[ctype] = file_types.get(ctype, 0) + 1
if file_types:
content += "**Configuration Types:**\n"
for ctype, count in sorted(file_types.items(), key=lambda x: x[1], reverse=True):
content += f"- {ctype}: {count} files\n"
content += "\n"
content += "*See `references/config_patterns/` for detailed configuration analysis*\n\n"
return content
def _format_signal_flow_section(output_dir: Path, results: dict[str, Any]) -> str:
"""Format signal flow analysis section (C3.10 - Godot projects)."""
signal_file = output_dir / "signals" / "signal_flow.json"
if not signal_file.exists():
return ""
try:
with open(signal_file, encoding="utf-8") as f:
signal_data = json.load(f)
except Exception:
return ""
stats = signal_data.get("statistics", {})
patterns = signal_data.get("patterns", {})
# Only show section if there are signals
if stats.get("total_signals", 0) == 0:
return ""
content = "## 📡 Signal Flow Analysis\n\n"
content += "*From C3.10 signal flow analysis (Godot Event System)*\n\n"
# Statistics
content += "**Signal Statistics:**\n"
content += f"- **Total Signals**: {stats.get('total_signals', 0)}\n"
content += f"- **Signal Connections**: {stats.get('total_connections', 0)}\n"
content += f"- **Signal Emissions**: {stats.get('total_emissions', 0)}\n"
content += f"- **Signal Density**: {stats.get('signal_density', 0):.2f} signals per file\n\n"
# Most connected signals
most_connected = stats.get("most_connected_signals", [])
if most_connected:
content += "**Most Connected Signals:**\n"
for sig in most_connected[:5]:
content += f"- `{sig['signal']}`: {sig['connection_count']} connections\n"
content += "\n"
# Detected patterns
if patterns:
content += "**Detected Event Patterns:**\n"
for pattern_name, pattern_data in patterns.items():
if pattern_data.get("detected"):
confidence = pattern_data.get("confidence", 0)
description = pattern_data.get("description", "")
content += f"- **{pattern_name}** (confidence: {confidence:.2f})\n"
content += f" - {description}\n"
content += "\n"
# Test framework detection
test_files = [f for f in results.get("files", []) if f.get("test_framework")]
if test_files:
frameworks = {}
total_tests = 0
for f in test_files:
fw = f.get("test_framework")
test_count = len(f.get("test_functions", []))
frameworks[fw] = frameworks.get(fw, 0) + 1
total_tests += test_count
content += "**Test Framework Detection:**\n"
for fw, count in frameworks.items():
content += f"- **{fw}**: {count} test files, {total_tests} test cases\n"
content += "\n"
content += "*See `references/signals/` for complete signal flow analysis*\n\n"
return content
def _format_documentation_section(_output_dir: Path, docs_data: dict[str, Any]) -> str:
"""Format project documentation section from extracted markdown files.
Note: output_dir parameter is unused but kept for consistency with other _format_* functions.
Documentation data is provided via docs_data parameter.
"""
if not docs_data or docs_data.get("total_files", 0) == 0:
return ""
categories = docs_data.get("categories", {})
files = docs_data.get("files", [])
content = "## 📖 Project Documentation\n\n"
content += "*Extracted from markdown files in the project (C3.9)*\n\n"
content += f"**Total Documentation Files:** {docs_data['total_files']}\n"
content += f"**Categories:** {len(categories)}\n\n"
# List documents by category (most important first)
priority_order = [
"overview",
"architecture",
"guides",
"workflows",
"features",
"api",
"examples",
]
# Sort categories by priority
sorted_categories = []
for cat in priority_order:
if cat in categories:
sorted_categories.append(cat)
for cat in sorted(categories.keys()):
if cat not in sorted_categories:
sorted_categories.append(cat)
for category in sorted_categories[:6]: # Limit to 6 categories in SKILL.md
cat_files = categories[category]
content += f"### {category.title()}\n\n"
# Get file details for this category
cat_docs = [f for f in files if f.get("category") == category]
for doc in cat_docs[:5]: # Limit to 5 docs per category
title = doc.get("title") or doc.get("filename", "Unknown")
path = doc.get("path", "")
# Add summary if available (deep/full depth)
if doc.get("ai_description"):
content += f"- **{title}**: {doc['ai_description']}\n"
elif doc.get("summary"):
# Extract first sentence from summary
summary = doc["summary"].split("\n")[0]
if len(summary) > 100:
summary = summary[:100] + "..."
content += f"- **{title}**: {summary}\n"
else:
content += f"- **{title}** (`{path}`)\n"
if len(cat_files) > 5:
content += f"- *...and {len(cat_files) - 5} more*\n"
content += "\n"
# AI-enhanced topics if available
all_topics = []
for doc in files:
all_topics.extend(doc.get("ai_topics", []))
if all_topics:
# Deduplicate and count
from collections import Counter
topic_counts = Counter(all_topics)
top_topics = [t for t, _ in topic_counts.most_common(10)]
content += f"**Key Topics:** {', '.join(top_topics)}\n\n"
content += "*See `references/documentation/` for all project documentation*\n\n"
return content
def _generate_references(output_dir: Path):
"""
Generate references/ directory structure by symlinking analysis output.
Creates a clean references/ directory that links to all analysis outputs.
"""
references_dir = output_dir / "references"
references_dir.mkdir(exist_ok=True)
# Map analysis directories to reference names
mappings = {
"api_reference": "api_reference",
"dependencies": "dependencies",
"patterns": "patterns",
"test_examples": "test_examples",
"tutorials": "tutorials",
"config_patterns": "config_patterns",
"architecture": "architecture",
"documentation": "documentation",
}
for source, target in mappings.items():
source_dir = output_dir / source
target_dir = references_dir / target
if source_dir.exists() and source_dir.is_dir():
# Copy directory to references/ (not symlink, for portability)
import shutil
if target_dir.exists():
shutil.rmtree(target_dir)
shutil.copytree(source_dir, target_dir)
logger.debug(f"Copied {source} → references/{target}")
# Clean up source directory to avoid duplication (Issue #279)
# SKILL.md only references references/{target}, so source dir is redundant
shutil.rmtree(source_dir)
logger.debug(f"Cleaned up duplicate {source}/ directory")
logger.info(f"✅ Generated references directory: {references_dir}")
def _check_deprecated_flags(args):
"""Check for deprecated flags and show migration warnings."""
warnings = []
# Deprecated: --depth
if hasattr(args, "depth") and args.depth:
preset_map = {
"surface": "quick",
"deep": "standard",
"full": "comprehensive",
}
suggested_preset = preset_map.get(args.depth, "standard")
warnings.append(
f"⚠️ DEPRECATED: --depth {args.depth} → use --preset {suggested_preset} instead"
)
# Deprecated: --ai-mode
if hasattr(args, "ai_mode") and args.ai_mode and args.ai_mode != "auto":
if args.ai_mode == "api":
warnings.append(
"⚠️ DEPRECATED: --ai-mode api → use --enhance-level with ANTHROPIC_API_KEY set instead"
)
elif args.ai_mode == "local":
warnings.append(
"⚠️ DEPRECATED: --ai-mode local → use --enhance-level without API key instead"
)
elif args.ai_mode == "none":
warnings.append("⚠️ DEPRECATED: --ai-mode none → use --enhance-level 0 instead")
# Deprecated: --quick flag
if hasattr(args, "quick") and args.quick:
warnings.append("⚠️ DEPRECATED: --quick → use --preset quick instead")
# Deprecated: --comprehensive flag
if hasattr(args, "comprehensive") and args.comprehensive:
warnings.append("⚠️ DEPRECATED: --comprehensive → use --preset comprehensive instead")
# Show warnings if any found
if warnings:
print("\n" + "=" * 70)
for warning in warnings:
print(warning)
print("\n💡 MIGRATION TIP:")
print(" --preset quick (1-2 min, basic features)")
print(" --preset standard (5-10 min, core features, DEFAULT)")
print(" --preset comprehensive (20-60 min, all features + AI)")
print(" --enhance-level 0-3 (granular AI enhancement control)")
print("\n⚠️ Deprecated flags will be removed in v4.0.0")
print("=" * 70 + "\n")
def main():
    """Command-line interface for codebase analysis.

    Parses CLI arguments, resolves presets and deprecated flags, validates
    the target directory, runs analyze_codebase(), then triggers any
    enhancement workflows and prints a summary.

    Returns:
        int: Process exit code — 0 on success, 1 on validation/analysis
        failure, 130 when interrupted by Ctrl-C.
    """
    from skill_seekers.cli.arguments.analyze import add_analyze_arguments

    parser = argparse.ArgumentParser(
        description="Analyze local codebases and extract code knowledge",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Analyze current directory
  codebase-scraper --directory . --output output/codebase/
  # Deep analysis with API reference and dependency graph
  codebase-scraper --directory /path/to/repo --depth deep --build-api-reference --build-dependency-graph
  # Analyze only Python and JavaScript
  codebase-scraper --directory . --languages Python,JavaScript
  # Use file patterns
  codebase-scraper --directory . --file-patterns "*.py,src/**/*.js"
  # Full analysis with all features (default)
  codebase-scraper --directory . --depth deep
  # Surface analysis (fast, skip all analysis features)
  codebase-scraper --directory . --depth surface --skip-api-reference --skip-dependency-graph --skip-patterns --skip-test-examples
  # Skip specific features
  codebase-scraper --directory . --skip-patterns --skip-test-examples
""",
    )
    # Register all args from the shared definitions module
    add_analyze_arguments(parser)

    # Extra legacy arg only used by standalone CLI (not in arguments/analyze.py)
    parser.add_argument(
        "--ai-mode",
        choices=["auto", "api", "local", "none"],
        default="auto",
        help=(
            "AI enhancement mode for how-to guides: "
            "auto (auto-detect: API if ANTHROPIC_API_KEY set, else LOCAL), "
            "api (Claude API, requires ANTHROPIC_API_KEY), "
            "local (Claude Code Max, FREE, no API key), "
            "none (disable AI enhancement). "
            "💡 TIP: Use --enhance flag instead for simpler UX!"
        ),
    )

    # Scan raw argv for deprecated enable-flags. Done on sys.argv (not parsed
    # args) because these flags may no longer be registered with the parser.
    deprecated_flags = {
        "--build-api-reference": "--skip-api-reference",
        "--build-dependency-graph": "--skip-dependency-graph",
        "--detect-patterns": "--skip-patterns",
        "--extract-test-examples": "--skip-test-examples",
        "--build-how-to-guides": "--skip-how-to-guides",
        "--extract-config-patterns": "--skip-config-patterns",
    }
    for old_flag, new_flag in deprecated_flags.items():
        if old_flag in sys.argv:
            logger.warning(
                f"⚠️ DEPRECATED: {old_flag} is deprecated. "
                f"All features are now enabled by default. "
                f"Use {new_flag} to disable this feature."
            )

    # Handle --preset-list flag BEFORE parse_args() to avoid required --directory validation
    if "--preset-list" in sys.argv:
        from skill_seekers.cli.presets import PresetManager

        print(PresetManager.format_preset_help())
        return 0

    args = parser.parse_args()

    # Check for deprecated flags and show warnings
    _check_deprecated_flags(args)

    # Resolve the effective preset name; explicit --preset wins over the
    # legacy --quick/--comprehensive shorthands, and "standard" is the default.
    preset_name = None
    if hasattr(args, "preset") and args.preset:
        # New --preset flag (recommended)
        preset_name = args.preset
    elif hasattr(args, "quick") and args.quick:
        # Legacy --quick flag (backward compatibility)
        preset_name = "quick"
    elif hasattr(args, "comprehensive") and args.comprehensive:
        # Legacy --comprehensive flag (backward compatibility)
        preset_name = "comprehensive"
    else:
        # Default preset if none specified
        preset_name = "standard"

    # Apply preset using PresetManager; this mutates args in place so later
    # code reads preset-resolved values transparently.
    if preset_name:
        from skill_seekers.cli.presets import PresetManager

        try:
            preset_args = PresetManager.apply_preset(preset_name, vars(args))
            # Update args with preset values
            for key, value in preset_args.items():
                setattr(args, key, value)
            preset = PresetManager.get_preset(preset_name)
            logger.info(f"{preset.icon} {preset.name} analysis mode: {preset.description}")
        except ValueError as e:
            # Unknown preset name — report and bail out.
            logger.error(f"{e}")
            return 1

    # Apply default depth if not set by preset or CLI
    if args.depth is None:
        args.depth = "deep"  # Default depth

    setup_logging(verbose=args.verbose, quiet=getattr(args, "quiet", False))

    # Handle --dry-run: print the resolved configuration and exit without
    # touching the filesystem beyond resolving the directory path.
    if getattr(args, "dry_run", False):
        directory = Path(args.directory)
        print(f"\n{'=' * 60}")
        print(f"DRY RUN: Codebase Analysis")
        print(f"{'=' * 60}")
        print(f"Directory: {directory.resolve()}")
        print(f"Output: {args.output}")
        print(f"Preset: {preset_name}")
        print(f"Depth: {args.depth or 'deep (default)'}")
        print(f"Name: {getattr(args, 'name', None) or directory.name}")
        print(f"Enhance: level {args.enhance_level}")
        print(f"Skip flags: ", end="")
        skips = []
        for flag in [
            "skip_api_reference",
            "skip_dependency_graph",
            "skip_patterns",
            "skip_test_examples",
            "skip_how_to_guides",
            "skip_config_patterns",
            "skip_docs",
        ]:
            if getattr(args, flag, False):
                skips.append(f"--{flag.replace('_', '-')}")
        print(", ".join(skips) if skips else "(none)")
        print(f"\n✅ Dry run complete")
        return 0

    # Validate directory before starting any analysis work.
    directory = Path(args.directory)
    if not directory.exists():
        logger.error(f"Directory not found: {directory}")
        return 1
    if not directory.is_dir():
        logger.error(f"Not a directory: {directory}")
        return 1

    # Parse comma-separated language filter (e.g. "Python,JavaScript").
    languages = None
    if args.languages:
        languages = [lang.strip() for lang in args.languages.split(",")]

    # Parse comma-separated glob file patterns (e.g. "*.py,src/**/*.js").
    file_patterns = None
    if args.file_patterns:
        file_patterns = [p.strip() for p in args.file_patterns.split(",")]

    # Run the analysis; skip_* flags are inverted here into feature toggles.
    try:
        results = analyze_codebase(
            directory=directory,
            output_dir=Path(args.output),
            depth=args.depth,
            languages=languages,
            file_patterns=file_patterns,
            build_api_reference=not args.skip_api_reference,
            extract_comments=not args.no_comments,
            build_dependency_graph=not args.skip_dependency_graph,
            detect_patterns=not args.skip_patterns,
            extract_test_examples=not args.skip_test_examples,
            build_how_to_guides=not args.skip_how_to_guides,
            extract_config_patterns=not args.skip_config_patterns,
            extract_docs=not args.skip_docs,
            enhance_level=args.enhance_level,  # AI enhancement level (0-3)
            skill_name=getattr(args, "name", None),
            skill_description=getattr(args, "description", None),
            doc_version=getattr(args, "doc_version", ""),
        )
        # ============================================================
        # WORKFLOW SYSTEM INTEGRATION (Phase 2)
        # ============================================================
        from skill_seekers.cli.workflow_runner import run_workflows

        workflow_executed, workflow_names = run_workflows(args)

        # Print summary
        print(f"\n{'=' * 60}")
        print("CODEBASE ANALYSIS COMPLETE")
        if workflow_executed:
            print(f" + {len(workflow_names)} ENHANCEMENT WORKFLOW(S) EXECUTED")
        print(f"{'=' * 60}")
        print(f"Files analyzed: {len(results['files'])}")
        print(f"Output directory: {args.output}")
        if not args.skip_api_reference:
            print(f"API reference: {Path(args.output) / 'api_reference'}")
        if workflow_executed:
            print(f"Workflows applied: {', '.join(workflow_names)}")
        print(f"{'=' * 60}\n")
        return 0
    except KeyboardInterrupt:
        # Conventional 128 + SIGINT(2) exit status for Ctrl-C.
        logger.error("\nAnalysis interrupted by user")
        return 130
    except Exception as e:
        logger.error(f"Analysis failed: {e}")
        import traceback

        traceback.print_exc()
        return 1
if __name__ == "__main__":
    # Propagate main()'s integer status code to the shell.
    sys.exit(main())