feat(C3.9): Add project documentation extraction from markdown files
- Scan ALL .md files in project (README, docs/, etc.)
- Smart categorization by folder/filename (overview, architecture, guides, etc.)
- Processing depth: surface=raw copy, deep=parse+summarize, full=AI-enhanced
- AI enhancement at level 2+ adds topic extraction and cross-references
- New "Project Documentation" section in SKILL.md with summaries
- Output to references/documentation/ organized by category
- Default ON, use --skip-docs to disable
- Add skip_docs parameter to MCP scrape_codebase_tool
- Add 15 new tests for markdown documentation features

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
@@ -297,9 +297,17 @@ skill-seekers analyze --directory . --skip-patterns --skip-how-to-guides
```

- Generates 300+ line standalone SKILL.md files from codebases
- All C3.x features integrated (patterns, tests, guides, config, architecture)
- All C3.x features integrated (patterns, tests, guides, config, architecture, docs)
- Complete codebase analysis without documentation scraping

**C3.9 Project Documentation Extraction** (`codebase_scraper.py`):
- Extracts and categorizes all markdown files from the project
- Auto-detects categories: overview, architecture, guides, workflows, features, etc.
- Integrates documentation into SKILL.md with summaries
- AI enhancement (level 2+) adds topic extraction and cross-references
- Controlled by depth: surface=raw copy, deep=parse+summarize, full=AI-enhanced
- Default ON, use `--skip-docs` to disable

**Key Architecture Decision (BREAKING in v2.5.2):**
- Changed from opt-in (`--build-*`) to opt-out (`--skip-*`) flags
- All analysis features now ON by default for maximum value

@@ -75,6 +75,53 @@ LANGUAGE_EXTENSIONS = {
    ".php": "PHP",
}

# Markdown extension mapping
MARKDOWN_EXTENSIONS = {".md", ".markdown", ".mdown", ".mkd"}

# Common documentation folders to scan
DOC_FOLDERS = {"docs", "doc", "documentation", "wiki", ".github"}

# Root-level doc files → category mapping
ROOT_DOC_CATEGORIES = {
    "readme": "overview",
    "contributing": "contributing",
    "changelog": "changelog",
    "history": "changelog",
    "license": "license",
    "authors": "authors",
    "code_of_conduct": "community",
    "security": "security",
    "architecture": "architecture",
    "design": "architecture",
}

# Folder name → category mapping (folder names are normalized: "-" and " " become "_")
FOLDER_CATEGORIES = {
    "architecture": "architecture",
    "arch": "architecture",
    "design": "architecture",
    "guides": "guides",
    "guide": "guides",
    "tutorials": "guides",
    "tutorial": "guides",
    "howto": "guides",
    "how_to": "guides",  # matches a "how-to" folder after hyphen normalization
    "workflows": "workflows",
    "workflow": "workflows",
    "templates": "templates",
    "template": "templates",
    "api": "api",
    "reference": "api",
    "examples": "examples",
    "example": "examples",
    "specs": "specifications",
    "spec": "specifications",
    "rfcs": "specifications",
    "rfc": "specifications",
    "features": "features",
    "feature": "features",
}

# Default directories to exclude
DEFAULT_EXCLUDED_DIRS = {
    "node_modules",
@@ -216,6 +263,469 @@ def walk_directory(
    return sorted(files)


def walk_markdown_files(
    root: Path,
    gitignore_spec: pathspec.PathSpec | None = None,
    excluded_dirs: set | None = None,
) -> list[Path]:
    """
    Walk directory tree and collect markdown documentation files.

    Args:
        root: Root directory to walk
        gitignore_spec: Optional PathSpec object for .gitignore rules
        excluded_dirs: Set of directory names to exclude

    Returns:
        List of markdown file paths
    """
    if excluded_dirs is None:
        excluded_dirs = DEFAULT_EXCLUDED_DIRS

    files = []
    root = Path(root).resolve()

    for dirpath, dirnames, filenames in os.walk(root):
        current_dir = Path(dirpath)

        # Filter out excluded directories (in-place modification)
        dirnames[:] = [d for d in dirnames if not should_exclude_dir(d, excluded_dirs)]

        for filename in filenames:
            file_path = current_dir / filename

            # Check .gitignore rules
            if gitignore_spec:
                try:
                    rel_path = file_path.relative_to(root)
                    if gitignore_spec.match_file(str(rel_path)):
                        logger.debug(f"Skipping (gitignore): {rel_path}")
                        continue
                except ValueError:
                    continue

            # Check if markdown file
            if file_path.suffix.lower() not in MARKDOWN_EXTENSIONS:
                continue

            files.append(file_path)

    return sorted(files)
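
A minimal usage sketch for the walker (temporary directory and file names invented for illustration):

```python
import tempfile
from pathlib import Path

with tempfile.TemporaryDirectory() as tmp:
    root = Path(tmp)
    (root / "README.md").write_text("# Demo")
    (root / "node_modules").mkdir()
    (root / "node_modules" / "pkg.md").write_text("ignored")

    # node_modules is in DEFAULT_EXCLUDED_DIRS, so only README.md survives
    print([p.name for p in walk_markdown_files(root)])  # ['README.md']
```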


def categorize_markdown_file(file_path: Path, root: Path) -> str:
    """
    Categorize a markdown file based on its location and filename.

    Args:
        file_path: Path to the markdown file
        root: Root directory of the project

    Returns:
        Category name (e.g., 'overview', 'guides', 'architecture')
    """
    try:
        rel_path = file_path.relative_to(root)
    except ValueError:
        return "other"

    # Check root-level files by filename
    if len(rel_path.parts) == 1:
        filename_lower = file_path.stem.lower().replace("-", "_").replace(" ", "_")
        for key, category in ROOT_DOC_CATEGORIES.items():
            if key in filename_lower:
                return category
        return "overview"  # Default for root .md files

    # Check folder-based categorization
    for part in rel_path.parts[:-1]:  # Exclude filename
        part_lower = part.lower().replace("-", "_").replace(" ", "_")
        for key, category in FOLDER_CATEGORIES.items():
            if key in part_lower:
                return category

    # Default category
    return "other"


def extract_markdown_structure(content: str) -> dict[str, Any]:
    """
    Extract structure from markdown content (headers, code blocks, links).

    Args:
        content: Markdown file content

    Returns:
        Dictionary with extracted structure
    """
    import re

    structure = {
        "title": None,
        "headers": [],
        "code_blocks": [],
        "links": [],
        "word_count": len(content.split()),
        "line_count": len(content.split("\n")),
    }

    lines = content.split("\n")

    # Extract headers
    for i, line in enumerate(lines):
        header_match = re.match(r"^(#{1,6})\s+(.+)$", line)
        if header_match:
            level = len(header_match.group(1))
            text = header_match.group(2).strip()
            structure["headers"].append({
                "level": level,
                "text": text,
                "line": i + 1,
            })
            # First h1 is the title
            if level == 1 and structure["title"] is None:
                structure["title"] = text

    # Extract code blocks (fenced)
    code_block_pattern = re.compile(r"```(\w*)\n(.*?)```", re.DOTALL)
    for match in code_block_pattern.finditer(content):
        language = match.group(1) or "text"
        code = match.group(2).strip()
        if len(code) > 0:
            structure["code_blocks"].append({
                "language": language,
                "code": code[:500],  # Truncate long code blocks
                "full_length": len(code),
            })

    # Extract links
    link_pattern = re.compile(r"\[([^\]]+)\]\(([^)]+)\)")
    for match in link_pattern.finditer(content):
        structure["links"].append({
            "text": match.group(1),
            "url": match.group(2),
        })

    return structure
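
A short sketch of the structure this returns (sample content invented):

```python
sample = (
    "# Demo\n\n"
    "Intro with a [link](https://example.com).\n\n"
    "## Usage\n\n"
    "```python\nprint('hi')\n```\n"
)
structure = extract_markdown_structure(sample)
print(structure["title"])                         # Demo
print([h["text"] for h in structure["headers"]])  # ['Demo', 'Usage']
print(structure["code_blocks"][0]["language"])    # python
print(structure["links"][0]["url"])               # https://example.com
```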


def generate_markdown_summary(content: str, structure: dict[str, Any], max_length: int = 500) -> str:
    """
    Generate a summary of markdown content.

    Args:
        content: Full markdown content
        structure: Extracted structure from extract_markdown_structure()
        max_length: Maximum summary length

    Returns:
        Summary string
    """
    # Start with title if available
    summary_parts = []

    if structure.get("title"):
        summary_parts.append(f"**{structure['title']}**")

    # Add header outline (first 5 h2/h3 headers)
    h2_h3 = [h for h in structure.get("headers", []) if h["level"] in (2, 3)][:5]
    if h2_h3:
        sections = [h["text"] for h in h2_h3]
        summary_parts.append(f"Sections: {', '.join(sections)}")

    # Extract first paragraph (skip headers and empty lines)
    lines = content.split("\n")
    first_para = []
    in_para = False
    for line in lines:
        stripped = line.strip()
        if stripped.startswith("#") or stripped.startswith("```"):
            if in_para:
                break
            continue
        if stripped:
            in_para = True
            first_para.append(stripped)
        elif in_para:
            break

    if first_para:
        para_text = " ".join(first_para)
        if len(para_text) > 200:
            para_text = para_text[:200] + "..."
        summary_parts.append(para_text)

    # Add stats
    stats = f"({structure.get('word_count', 0)} words, {len(structure.get('code_blocks', []))} code blocks)"
    summary_parts.append(stats)

    summary = "\n".join(summary_parts)
    if len(summary) > max_length:
        summary = summary[:max_length] + "..."

    return summary
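
And the summary it produces for a small document (sample invented; note the word count includes the `#` markers, since it comes from a plain `split()`):

```python
content = "# Guide\n\nThis page explains setup.\n\n## Install\n\nRun it.\n\n## Configure\n\nEdit it.\n"
structure = extract_markdown_structure(content)
print(generate_markdown_summary(content, structure))
# **Guide**
# Sections: Install, Configure
# This page explains setup.
# (14 words, 0 code blocks)
```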


def process_markdown_docs(
    directory: Path,
    output_dir: Path,
    depth: str = "deep",
    gitignore_spec: pathspec.PathSpec | None = None,
    enhance_with_ai: bool = False,
    ai_mode: str = "none",
) -> dict[str, Any]:
    """
    Process all markdown documentation files in a directory.

    Args:
        directory: Root directory to scan
        output_dir: Output directory for processed docs
        depth: Processing depth ('surface', 'deep', 'full')
        gitignore_spec: Optional .gitignore spec
        enhance_with_ai: Whether to use AI enhancement
        ai_mode: AI mode ('none', 'auto', 'api', 'local')

    Returns:
        Dictionary with processed documentation data
    """
    logger.info("Scanning for markdown documentation...")

    # Find all markdown files
    md_files = walk_markdown_files(directory, gitignore_spec)
    logger.info(f"Found {len(md_files)} markdown files")

    if not md_files:
        return {"files": [], "categories": {}, "total_files": 0}

    # Process each file
    processed_docs = []
    categories = {}

    for md_path in md_files:
        try:
            content = md_path.read_text(encoding="utf-8", errors="ignore")
            rel_path = str(md_path.relative_to(directory))
            category = categorize_markdown_file(md_path, directory)

            doc_data = {
                "path": rel_path,
                "filename": md_path.name,
                "category": category,
                "size_bytes": len(content.encode("utf-8")),
            }

            # Surface depth: just path and category
            if depth == "surface":
                processed_docs.append(doc_data)
            else:
                # Deep/Full: extract structure and summary
                structure = extract_markdown_structure(content)
                summary = generate_markdown_summary(content, structure)

                doc_data.update({
                    "title": structure.get("title") or md_path.stem,
                    "structure": structure,
                    "summary": summary,
                    "content": content if depth == "full" else None,
                })
                processed_docs.append(doc_data)

            # Track categories
            if category not in categories:
                categories[category] = []
            categories[category].append(rel_path)

        except Exception as e:
            logger.warning(f"Failed to process {md_path}: {e}")
            continue

    # AI Enhancement (if enabled and enhance_level >= 2)
    if enhance_with_ai and ai_mode != "none" and processed_docs:
        logger.info("🤖 Enhancing documentation analysis with AI...")
        try:
            processed_docs = _enhance_docs_with_ai(processed_docs, ai_mode)
            logger.info("✅ AI documentation enhancement complete")
        except Exception as e:
            logger.warning(f"⚠️ AI enhancement failed: {e}")

    # Save processed docs to output
    docs_output_dir = output_dir / "documentation"
    docs_output_dir.mkdir(parents=True, exist_ok=True)

    # Copy files organized by category
    import shutil

    for doc in processed_docs:
        try:
            src_path = directory / doc["path"]
            category = doc["category"]
            category_dir = docs_output_dir / category
            category_dir.mkdir(parents=True, exist_ok=True)

            # Copy file to category folder
            dest_path = category_dir / doc["filename"]
            shutil.copy2(src_path, dest_path)
        except Exception as e:
            logger.debug(f"Failed to copy {doc['path']}: {e}")

    # Save documentation index
    index_data = {
        "total_files": len(processed_docs),
        "categories": categories,
        "files": processed_docs,
    }

    index_json = docs_output_dir / "documentation_index.json"
    with open(index_json, "w", encoding="utf-8") as f:
        json.dump(index_data, f, indent=2, default=str)

    logger.info(f"✅ Processed {len(processed_docs)} documentation files in {len(categories)} categories")
    logger.info(f"📁 Saved to: {docs_output_dir}")

    return index_data
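
An end-to-end sketch of the pipeline at deep depth (temp project invented; AI enhancement left off):

```python
import tempfile
from pathlib import Path

with tempfile.TemporaryDirectory() as tmp:
    project = Path(tmp) / "project"
    (project / "docs" / "guides").mkdir(parents=True)
    (project / "README.md").write_text("# Demo\n\nA demo project.\n")
    (project / "docs" / "guides" / "usage.md").write_text("# Usage\n\nHow to use it.\n")

    index = process_markdown_docs(
        directory=project,
        output_dir=Path(tmp) / "out",
        depth="deep",  # parse + summarize, but full content stays out of the index
    )
    print(index["total_files"])         # 2
    print(sorted(index["categories"]))  # ['guides', 'overview']
```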


def _enhance_docs_with_ai(docs: list[dict], ai_mode: str) -> list[dict]:
    """
    Enhance documentation analysis with AI.

    Args:
        docs: List of processed document dictionaries
        ai_mode: AI mode ('auto', 'api', or 'local')

    Returns:
        Enhanced document list
    """
    # Try API mode first
    if ai_mode in ("api", "auto"):
        api_key = os.environ.get("ANTHROPIC_API_KEY")
        if api_key:
            return _enhance_docs_api(docs, api_key)

    # Fall back to LOCAL mode
    if ai_mode in ("local", "auto"):
        return _enhance_docs_local(docs)

    return docs


def _enhance_docs_api(docs: list[dict], api_key: str) -> list[dict]:
    """Enhance docs using Claude API."""
    try:
        import anthropic
        client = anthropic.Anthropic(api_key=api_key)

        # Batch documents for efficiency
        batch_size = 10
        for i in range(0, len(docs), batch_size):
            batch = docs[i:i + batch_size]

            # Create prompt for batch
            docs_text = "\n\n".join([
                f"## {d.get('title', d['filename'])}\nCategory: {d['category']}\nSummary: {d.get('summary', 'N/A')}"
                for d in batch if d.get("summary")
            ])

            if not docs_text:
                continue

            prompt = f"""Analyze these documentation files and provide:
1. A brief description of what each document covers
2. Key topics/concepts mentioned
3. How they relate to each other

Documents:
{docs_text}

Return JSON with format:
{{"enhancements": [{{"filename": "...", "description": "...", "key_topics": [...], "related_to": [...]}}]}}"""

            response = client.messages.create(
                model="claude-sonnet-4-20250514",
                max_tokens=2000,
                messages=[{"role": "user", "content": prompt}]
            )

            # Parse response and merge enhancements
            try:
                import re
                json_match = re.search(r"\{.*\}", response.content[0].text, re.DOTALL)
                if json_match:
                    enhancements = json.loads(json_match.group())
                    for enh in enhancements.get("enhancements", []):
                        for doc in batch:
                            if doc["filename"] == enh.get("filename"):
                                doc["ai_description"] = enh.get("description")
                                doc["ai_topics"] = enh.get("key_topics", [])
                                doc["ai_related"] = enh.get("related_to", [])
            except Exception:
                pass

    except Exception as e:
        logger.warning(f"API enhancement failed: {e}")

    return docs


def _enhance_docs_local(docs: list[dict]) -> list[dict]:
    """Enhance docs using Claude Code CLI (LOCAL mode)."""
    import subprocess

    # Prepare batch of docs for enhancement
    docs_with_summary = [d for d in docs if d.get("summary")]
    if not docs_with_summary:
        return docs

    docs_text = "\n\n".join([
        f"## {d.get('title', d['filename'])}\nCategory: {d['category']}\nPath: {d['path']}\nSummary: {d.get('summary', 'N/A')}"
        for d in docs_with_summary[:20]  # Limit to 20 docs
    ])

    prompt = f"""Analyze these documentation files from a codebase and provide insights.

For each document, provide:
1. A brief description of what it covers
2. Key topics/concepts
3. Related documents

Documents:
{docs_text}

Output JSON only:
{{"enhancements": [{{"filename": "...", "description": "...", "key_topics": ["..."], "related_to": ["..."]}}]}}"""

    try:
        # The prompt is passed inline via -p, so no temp file is needed
        result = subprocess.run(
            ["claude", "--dangerously-skip-permissions", "-p", prompt],
            capture_output=True,
            text=True,
            timeout=120,
        )

        if result.returncode == 0 and result.stdout:
            import re
            json_match = re.search(r"\{.*\}", result.stdout, re.DOTALL)
            if json_match:
                enhancements = json.loads(json_match.group())
                for enh in enhancements.get("enhancements", []):
                    for doc in docs:
                        if doc["filename"] == enh.get("filename"):
                            doc["ai_description"] = enh.get("description")
                            doc["ai_topics"] = enh.get("key_topics", [])
                            doc["ai_related"] = enh.get("related_to", [])

    except Exception as e:
        logger.warning(f"LOCAL enhancement failed: {e}")

    return docs
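
Both enhancement paths end the same way: pull the first JSON object out of free-form model output with a greedy `DOTALL` regex and merge matching entries back into the docs by filename. The shared pattern, as a standalone sketch (toy output string invented):

```python
import json
import re

def merge_enhancements(docs: list[dict], raw_output: str) -> None:
    """Merge {"enhancements": [...]} found anywhere in raw_output into docs, keyed by filename."""
    match = re.search(r"\{.*\}", raw_output, re.DOTALL)
    if not match:
        return
    for enh in json.loads(match.group()).get("enhancements", []):
        for doc in docs:
            if doc["filename"] == enh.get("filename"):
                doc["ai_description"] = enh.get("description")
                doc["ai_topics"] = enh.get("key_topics", [])
                doc["ai_related"] = enh.get("related_to", [])

docs = [{"filename": "README.md"}]
raw = 'Sure! {"enhancements": [{"filename": "README.md", "description": "Project intro", "key_topics": ["setup"], "related_to": []}]}'
merge_enhancements(docs, raw)
print(docs[0]["ai_topics"])  # ['setup']
```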


def analyze_codebase(
    directory: Path,
    output_dir: Path,

@@ -229,6 +739,7 @@ def analyze_codebase(
    extract_test_examples: bool = True,
    build_how_to_guides: bool = True,
    extract_config_patterns: bool = True,
    extract_docs: bool = True,
    enhance_level: int = 0,
) -> dict[str, Any]:
    """

@@ -247,7 +758,8 @@ def analyze_codebase(
        extract_test_examples: Extract usage examples from test files
        build_how_to_guides: Build how-to guides from workflow examples (C3.3)
        extract_config_patterns: Extract configuration patterns from config files (C3.4)
        enhance_level: AI enhancement level (0=off, 1=SKILL.md only, 2=+config+arch, 3=full)
        extract_docs: Extract and process markdown documentation files (default: True)
        enhance_level: AI enhancement level (0=off, 1=SKILL.md only, 2=+config+arch+docs, 3=full)

    Returns:
        Analysis results dictionary

@@ -622,6 +1134,33 @@ def analyze_codebase(
    else:
        logger.info("No clear architectural patterns detected")

    # Extract markdown documentation (C3.9)
    docs_data = None
    if extract_docs:
        logger.info("Extracting project documentation...")
        try:
            # Determine AI enhancement for docs (level 2+)
            enhance_docs_ai = enhance_level >= 2
            docs_data = process_markdown_docs(
                directory=directory,
                output_dir=output_dir,
                depth=depth,
                gitignore_spec=gitignore_spec,
                enhance_with_ai=enhance_docs_ai,
                ai_mode=ai_mode,
            )

            if docs_data and docs_data.get("total_files", 0) > 0:
                logger.info(
                    f"✅ Extracted {docs_data['total_files']} documentation files "
                    f"in {len(docs_data.get('categories', {}))} categories"
                )
            else:
                logger.info("No markdown documentation files found")
        except Exception as e:
            logger.warning(f"Documentation extraction failed: {e}")
            docs_data = None

    # Generate SKILL.md and references/ directory
    logger.info("Generating SKILL.md and references...")
    _generate_skill_md(

@@ -634,6 +1173,8 @@ def analyze_codebase(
        detect_patterns=detect_patterns,
        extract_test_examples=extract_test_examples,
        extract_config_patterns=extract_config_patterns,
        extract_docs=extract_docs,
        docs_data=docs_data,
    )

    return results

@@ -649,6 +1190,8 @@ def _generate_skill_md(
    detect_patterns: bool,
    extract_test_examples: bool,
    extract_config_patterns: bool,
    extract_docs: bool = True,
    docs_data: dict[str, Any] | None = None,
):
    """
    Generate rich SKILL.md from codebase analysis results.

@@ -728,7 +1271,10 @@ Use this skill when you need to:
        skill_content += "- ✅ Test Examples (C3.2)\n"
    if extract_config_patterns:
        skill_content += "- ✅ Configuration Patterns (C3.4)\n"
    skill_content += "- ✅ Architectural Analysis (C3.7)\n\n"
    skill_content += "- ✅ Architectural Analysis (C3.7)\n"
    if extract_docs:
        skill_content += "- ✅ Project Documentation (C3.9)\n"
    skill_content += "\n"

    # Add design patterns if available
    if detect_patterns:

@@ -759,6 +1305,12 @@ Use this skill when you need to:
        if config_content:
            skill_content += config_content

    # Add project documentation if available
    if extract_docs and docs_data:
        docs_content = _format_documentation_section(output_dir, docs_data)
        if docs_content:
            skill_content += docs_content

    # Available references
    skill_content += "## 📚 Available References\n\n"
    skill_content += "This skill includes detailed reference documentation:\n\n"

@@ -788,6 +1340,9 @@ Use this skill when you need to:
    if (output_dir / "architecture").exists():
        skill_content += "- **Architecture**: `references/architecture/` - Architectural patterns\n"
        refs_added = True
    if extract_docs and (output_dir / "documentation").exists():
        skill_content += "- **Documentation**: `references/documentation/` - Project documentation\n"
        refs_added = True

    if not refs_added:
        skill_content += "No additional references generated (analysis features disabled).\n"

@@ -1017,6 +1572,75 @@ def _format_config_section(output_dir: Path) -> str:
    return content


def _format_documentation_section(output_dir: Path, docs_data: dict[str, Any]) -> str:
    """Format project documentation section from extracted markdown files."""
    if not docs_data or docs_data.get("total_files", 0) == 0:
        return ""

    categories = docs_data.get("categories", {})
    files = docs_data.get("files", [])

    content = "## 📖 Project Documentation\n\n"
    content += "*Extracted from markdown files in the project (C3.9)*\n\n"
    content += f"**Total Documentation Files:** {docs_data['total_files']}\n"
    content += f"**Categories:** {len(categories)}\n\n"

    # List documents by category (most important first)
    priority_order = ["overview", "architecture", "guides", "workflows", "features", "api", "examples"]

    # Sort categories by priority
    sorted_categories = []
    for cat in priority_order:
        if cat in categories:
            sorted_categories.append(cat)
    for cat in sorted(categories.keys()):
        if cat not in sorted_categories:
            sorted_categories.append(cat)

    for category in sorted_categories[:6]:  # Limit to 6 categories in SKILL.md
        cat_files = categories[category]
        content += f"### {category.title()}\n\n"

        # Get file details for this category
        cat_docs = [f for f in files if f.get("category") == category]

        for doc in cat_docs[:5]:  # Limit to 5 docs per category
            title = doc.get("title") or doc.get("filename", "Unknown")
            path = doc.get("path", "")

            # Add summary if available (deep/full depth)
            if doc.get("ai_description"):
                content += f"- **{title}**: {doc['ai_description']}\n"
            elif doc.get("summary"):
                # Use the first line of the summary
                summary = doc["summary"].split("\n")[0]
                if len(summary) > 100:
                    summary = summary[:100] + "..."
                content += f"- **{title}**: {summary}\n"
            else:
                content += f"- **{title}** (`{path}`)\n"

        if len(cat_files) > 5:
            content += f"- *...and {len(cat_files) - 5} more*\n"

        content += "\n"

    # AI-enhanced topics if available
    all_topics = []
    for doc in files:
        all_topics.extend(doc.get("ai_topics", []))

    if all_topics:
        # Deduplicate and count
        from collections import Counter
        topic_counts = Counter(all_topics)
        top_topics = [t for t, _ in topic_counts.most_common(10)]
        content += f"**Key Topics:** {', '.join(top_topics)}\n\n"

    content += "*See `references/documentation/` for all project documentation*\n\n"
    return content
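
What the section looks like for a tiny index (docs_data invented; output_dir is accepted but not read by this formatter):

```python
from pathlib import Path

docs_data = {
    "total_files": 2,
    "categories": {"overview": ["README.md"], "guides": ["docs/guides/usage.md"]},
    "files": [
        {"filename": "README.md", "path": "README.md", "category": "overview",
         "title": "Demo", "summary": "A demo project."},
        {"filename": "usage.md", "path": "docs/guides/usage.md", "category": "guides",
         "title": "Usage"},
    ],
}
print(_format_documentation_section(Path("."), docs_data))
# Renders "## 📖 Project Documentation" with Overview listed before Guides, per priority_order
```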


def _generate_references(output_dir: Path):
    """
    Generate references/ directory structure by symlinking analysis output.

@@ -1035,6 +1659,7 @@ def _generate_references(output_dir: Path):
        "tutorials": "tutorials",
        "config_patterns": "config_patterns",
        "architecture": "architecture",
        "documentation": "documentation",
    }

    for source, target in mappings.items():

@@ -1144,6 +1769,12 @@ Examples:
        default=False,
        help="Skip configuration pattern extraction from config files (JSON, YAML, TOML, ENV, etc.) (default: enabled)",
    )
    parser.add_argument(
        "--skip-docs",
        action="store_true",
        default=False,
        help="Skip project documentation extraction from markdown files (README, docs/, etc.) (default: enabled)",
    )
    parser.add_argument(
        "--ai-mode",
        choices=["auto", "api", "local", "none"],

@@ -1257,6 +1888,7 @@ Examples:
        extract_test_examples=not args.skip_test_examples,
        build_how_to_guides=not args.skip_how_to_guides,
        extract_config_patterns=not args.skip_config_patterns,
        extract_docs=not args.skip_docs,
        enhance_level=args.enhance_level,  # AI enhancement level (0-3)
    )

@@ -315,6 +315,7 @@ For more information: https://github.com/yusufkaraaslan/Skill_Seekers
    analyze_parser.add_argument("--skip-test-examples", action="store_true", help="Skip test examples")
    analyze_parser.add_argument("--skip-how-to-guides", action="store_true", help="Skip guides")
    analyze_parser.add_argument("--skip-config-patterns", action="store_true", help="Skip config")
    analyze_parser.add_argument("--skip-docs", action="store_true", help="Skip project docs (README, docs/)")
    analyze_parser.add_argument("--no-comments", action="store_true", help="Skip comments")
    analyze_parser.add_argument("--verbose", action="store_true", help="Verbose logging")

@@ -609,6 +610,8 @@ def main(argv: list[str] | None = None) -> int:
        sys.argv.append("--skip-how-to-guides")
    if args.skip_config_patterns:
        sys.argv.append("--skip-config-patterns")
    if args.skip_docs:
        sys.argv.append("--skip-docs")
    if args.no_comments:
        sys.argv.append("--no-comments")
    if args.verbose:

@@ -464,6 +464,7 @@ async def scrape_codebase_tool(args: dict) -> list[TextContent]:
    - skip_test_examples (bool, optional): Skip test example extraction (default: False)
    - skip_how_to_guides (bool, optional): Skip how-to guide generation (default: False)
    - skip_config_patterns (bool, optional): Skip config pattern extraction (default: False)
    - skip_docs (bool, optional): Skip project documentation extraction (default: False)

    Returns:
        List[TextContent]: Tool execution results

@@ -497,6 +498,7 @@ async def scrape_codebase_tool(args: dict) -> list[TextContent]:
    skip_test_examples = args.get("skip_test_examples", False)
    skip_how_to_guides = args.get("skip_how_to_guides", False)
    skip_config_patterns = args.get("skip_config_patterns", False)
    skip_docs = args.get("skip_docs", False)

    # Build command
    cmd = [sys.executable, "-m", "skill_seekers.cli.codebase_scraper"]

@@ -526,6 +528,8 @@ async def scrape_codebase_tool(args: dict) -> list[TextContent]:
        cmd.append("--skip-how-to-guides")
    if skip_config_patterns:
        cmd.append("--skip-config-patterns")
    if skip_docs:
        cmd.append("--skip-docs")

    # Adjust timeout based on enhance_level
    timeout = 600  # 10 minutes base
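
A hedged invocation sketch for the MCP tool: the skip flags are documented in this hunk, while the `directory` key is an assumption about the rest of the tool's schema.

```python
import asyncio

async def demo():
    results = await scrape_codebase_tool({
        "directory": "/path/to/project",  # assumed path argument, not shown in this hunk
        "skip_docs": True,                # forwarded to the CLI as --skip-docs
    })
    for item in results:
        print(item.text)

asyncio.run(demo())
```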

@@ -74,7 +74,8 @@ class TestAnalyzeSubcommand(unittest.TestCase):
            "--skip-patterns",
            "--skip-test-examples",
            "--skip-how-to-guides",
            "--skip-config-patterns"
            "--skip-config-patterns",
            "--skip-docs"
        ])
        self.assertTrue(args.skip_api_reference)
        self.assertTrue(args.skip_dependency_graph)

@@ -82,6 +83,7 @@ class TestAnalyzeSubcommand(unittest.TestCase):
        self.assertTrue(args.skip_test_examples)
        self.assertTrue(args.skip_how_to_guides)
        self.assertTrue(args.skip_config_patterns)
        self.assertTrue(args.skip_docs)

    def test_backward_compatible_depth_flag(self):
        """Test that deprecated --depth flag still works."""

@@ -21,10 +21,17 @@ sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "src"))

from skill_seekers.cli.codebase_scraper import (
    DEFAULT_EXCLUDED_DIRS,
    FOLDER_CATEGORIES,
    MARKDOWN_EXTENSIONS,
    ROOT_DOC_CATEGORIES,
    categorize_markdown_file,
    detect_language,
    extract_markdown_structure,
    generate_markdown_summary,
    load_gitignore,
    should_exclude_dir,
    walk_directory,
    walk_markdown_files,
)


@@ -201,6 +208,191 @@ class TestGitignoreLoading(unittest.TestCase):
        self.assertIsNotNone(spec)


class TestMarkdownDocumentation(unittest.TestCase):
    """Tests for markdown documentation extraction (C3.9)"""

    def setUp(self):
        """Set up test environment"""
        self.temp_dir = tempfile.mkdtemp()
        self.root = Path(self.temp_dir)

    def tearDown(self):
        """Clean up test environment"""
        shutil.rmtree(self.temp_dir, ignore_errors=True)

    def test_markdown_extensions(self):
        """Test that markdown extensions are properly defined."""
        self.assertIn(".md", MARKDOWN_EXTENSIONS)
        self.assertIn(".markdown", MARKDOWN_EXTENSIONS)

    def test_root_doc_categories(self):
        """Test root document category mapping."""
        self.assertEqual(ROOT_DOC_CATEGORIES.get("readme"), "overview")
        self.assertEqual(ROOT_DOC_CATEGORIES.get("changelog"), "changelog")
        self.assertEqual(ROOT_DOC_CATEGORIES.get("architecture"), "architecture")

    def test_folder_categories(self):
        """Test folder category mapping."""
        self.assertEqual(FOLDER_CATEGORIES.get("guides"), "guides")
        self.assertEqual(FOLDER_CATEGORIES.get("tutorials"), "guides")
        self.assertEqual(FOLDER_CATEGORIES.get("workflows"), "workflows")
        self.assertEqual(FOLDER_CATEGORIES.get("architecture"), "architecture")

    def test_walk_markdown_files(self):
        """Test walking directory for markdown files."""
        # Create test markdown files
        (self.root / "README.md").write_text("# Test README")
        (self.root / "test.py").write_text("print('test')")

        docs_dir = self.root / "docs"
        docs_dir.mkdir()
        (docs_dir / "guide.md").write_text("# Guide")

        files = walk_markdown_files(self.root)

        # Should find markdown files only
        self.assertEqual(len(files), 2)
        filenames = [f.name for f in files]
        self.assertIn("README.md", filenames)
        self.assertIn("guide.md", filenames)

    def test_categorize_root_readme(self):
        """Test categorizing root README file."""
        readme_path = self.root / "README.md"
        readme_path.write_text("# Test")

        category = categorize_markdown_file(readme_path, self.root)
        self.assertEqual(category, "overview")

    def test_categorize_changelog(self):
        """Test categorizing CHANGELOG file."""
        changelog_path = self.root / "CHANGELOG.md"
        changelog_path.write_text("# Changelog")

        category = categorize_markdown_file(changelog_path, self.root)
        self.assertEqual(category, "changelog")

    def test_categorize_docs_guide(self):
        """Test categorizing file in docs/guides folder."""
        guides_dir = self.root / "docs" / "guides"
        guides_dir.mkdir(parents=True)
        guide_path = guides_dir / "getting-started.md"
        guide_path.write_text("# Getting Started")

        category = categorize_markdown_file(guide_path, self.root)
        self.assertEqual(category, "guides")

    def test_categorize_architecture(self):
        """Test categorizing architecture documentation."""
        arch_dir = self.root / "docs" / "architecture"
        arch_dir.mkdir(parents=True)
        arch_path = arch_dir / "overview.md"
        arch_path.write_text("# Architecture")

        category = categorize_markdown_file(arch_path, self.root)
        self.assertEqual(category, "architecture")


class TestMarkdownStructureExtraction(unittest.TestCase):
    """Tests for markdown structure extraction"""

    def test_extract_headers(self):
        """Test extracting headers from markdown."""
        content = """# Main Title

## Section 1
Some content

### Subsection
More content

## Section 2
"""
        structure = extract_markdown_structure(content)

        self.assertEqual(structure["title"], "Main Title")
        self.assertEqual(len(structure["headers"]), 4)
        self.assertEqual(structure["headers"][0]["level"], 1)
        self.assertEqual(structure["headers"][1]["level"], 2)

    def test_extract_code_blocks(self):
        """Test extracting code blocks from markdown."""
        content = """# Example

```python
def hello():
    print("Hello")
```

```javascript
console.log("test");
```
"""
        structure = extract_markdown_structure(content)

        self.assertEqual(len(structure["code_blocks"]), 2)
        self.assertEqual(structure["code_blocks"][0]["language"], "python")
        self.assertEqual(structure["code_blocks"][1]["language"], "javascript")

    def test_extract_links(self):
        """Test extracting links from markdown."""
        content = """# Links

Check out [Example](https://example.com) and [Another](./local.md).
"""
        structure = extract_markdown_structure(content)

        self.assertEqual(len(structure["links"]), 2)
        self.assertEqual(structure["links"][0]["text"], "Example")
        self.assertEqual(structure["links"][0]["url"], "https://example.com")

    def test_word_and_line_count(self):
        """Test word and line count."""
        content = "First line\nSecond line\nThird line"
        structure = extract_markdown_structure(content)

        self.assertEqual(structure["line_count"], 3)
        self.assertEqual(structure["word_count"], 6)  # First, line, Second, line, Third, line


class TestMarkdownSummaryGeneration(unittest.TestCase):
    """Tests for markdown summary generation"""

    def test_generate_summary_with_title(self):
        """Test summary includes title."""
        content = "# My Title\n\nSome content here."
        structure = extract_markdown_structure(content)
        summary = generate_markdown_summary(content, structure)

        self.assertIn("**My Title**", summary)

    def test_generate_summary_with_sections(self):
        """Test summary includes section names."""
        content = """# Main

## Getting Started
Content

## Installation
Content

## Usage
Content
"""
        structure = extract_markdown_structure(content)
        summary = generate_markdown_summary(content, structure)

        self.assertIn("Sections:", summary)

    def test_generate_summary_truncation(self):
        """Test summary is truncated to max length."""
        content = "# Title\n\n" + "Long content. " * 100
        structure = extract_markdown_structure(content)
        summary = generate_markdown_summary(content, structure, max_length=200)

        self.assertLessEqual(len(summary), 210)  # Allow some buffer for truncation marker


if __name__ == "__main__":
    # Run tests with verbose output
    unittest.main(verbosity=2)