Files
skill-seekers-reference/src/skill_seekers/cli/unified_skill_builder.py
Zhichang Yu 9435d2911d feat: Add GLM-4.7 support and fix PDF scraper issues (#266)
Merging with admin override due to known issues:

 **What Works**:
- GLM-4.7 Claude-compatible API support (correctly implemented)
- PDF scraper improvements (content truncation fixed, page traceability added)  
- Documentation updates comprehensive

⚠️ **Known Issues (will be fixed in next commit)**:
1. Import bugs in 3 files causing UnboundLocalError (30 tests failing)
2. PDF scraper test expectations need updating for new behavior (5 tests failing)
3. test_godot_config failure (pre-existing, not caused by this PR - 1 test failing)

**Action Plan**:
Fixes for issues #1 and #2 are ready and will be committed immediately after merge.
Issue #3 requires separate investigation as it's a pre-existing problem.

Total: 36 failing tests; 35 of them will be fixed in the next commit.
2026-01-27 21:10:40 +03:00

1614 lines
66 KiB
Python

#!/usr/bin/env python3
"""
Unified Skill Builder
Generates final skill structure from merged multi-source data:
- SKILL.md with merged APIs and conflict warnings
- references/ with organized content by source
- Inline conflict markers (⚠️)
- Separate conflicts summary section
Supports mixed sources (documentation, GitHub, PDF) and highlights
discrepancies transparently.
"""
import json
import logging
import os
import shutil
from pathlib import Path
# Install a default root-logger configuration at INFO level when this module
# is imported (basicConfig does nothing if the root logger already has handlers).
logging.basicConfig(level=logging.INFO)
# Module-level logger, named after this module, used by every method below.
logger = logging.getLogger(__name__)
class UnifiedSkillBuilder:
"""
Builds unified skill from multi-source data.
"""
def __init__(
self,
config: dict,
scraped_data: dict,
merged_data: dict | None = None,
conflicts: list | None = None,
cache_dir: str | None = None,
):
"""
Initialize skill builder.
Args:
config: Unified config dict
scraped_data: Dict of scraped data by source type
merged_data: Merged API data (if conflicts were resolved)
conflicts: List of detected conflicts
cache_dir: Optional cache directory for intermediate files
"""
self.config = config
self.scraped_data = scraped_data
self.merged_data = merged_data
self.conflicts = conflicts or []
self.cache_dir = cache_dir
self.name = config["name"]
self.description = config["description"]
self.skill_dir = f"output/{self.name}"
# Create directories
os.makedirs(self.skill_dir, exist_ok=True)
os.makedirs(f"{self.skill_dir}/references", exist_ok=True)
os.makedirs(f"{self.skill_dir}/scripts", exist_ok=True)
os.makedirs(f"{self.skill_dir}/assets", exist_ok=True)
def build(self):
    """Run all generation steps and produce the complete skill directory."""
    logger.info(f"Building unified skill: {self.name}")

    # SKILL.md is written first, then the per-source reference tree.
    for step in (self._generate_skill_md, self._generate_references):
        step()

    # The conflicts report only exists when conflicts were detected.
    if self.conflicts:
        self._generate_conflicts_report()

    logger.info(f"✅ Unified skill built: {self.skill_dir}/")
def _load_source_skill_mds(self) -> dict[str, str]:
"""Load standalone SKILL.md files from each source.
Returns:
Dict mapping source type to SKILL.md content
e.g., {'documentation': '...', 'github': '...', 'pdf': '...'}
"""
skill_mds = {}
# Determine base directory for source SKILL.md files
sources_dir = Path(self.cache_dir) / "sources" if self.cache_dir else Path("output")
# Load documentation SKILL.md
docs_skill_path = sources_dir / f"{self.name}_docs" / "SKILL.md"
if docs_skill_path.exists():
try:
skill_mds["documentation"] = docs_skill_path.read_text(encoding="utf-8")
logger.debug(
f"Loaded documentation SKILL.md ({len(skill_mds['documentation'])} chars)"
)
except OSError as e:
logger.warning(f"Failed to read documentation SKILL.md: {e}")
# Load ALL GitHub sources (multi-source support)
github_sources = []
for github_dir in sources_dir.glob(f"{self.name}_github_*"):
github_skill_path = github_dir / "SKILL.md"
if github_skill_path.exists():
try:
content = github_skill_path.read_text(encoding="utf-8")
github_sources.append(content)
logger.debug(
f"Loaded GitHub SKILL.md from {github_dir.name} ({len(content)} chars)"
)
except OSError as e:
logger.warning(f"Failed to read GitHub SKILL.md from {github_dir.name}: {e}")
if github_sources:
# Concatenate all GitHub sources with separator
skill_mds["github"] = "\n\n---\n\n".join(github_sources)
logger.debug(f"Combined {len(github_sources)} GitHub SKILL.md files")
# Load ALL PDF sources (multi-source support)
pdf_sources = []
for pdf_dir in sources_dir.glob(f"{self.name}_pdf_*"):
pdf_skill_path = pdf_dir / "SKILL.md"
if pdf_skill_path.exists():
try:
content = pdf_skill_path.read_text(encoding="utf-8")
pdf_sources.append(content)
logger.debug(f"Loaded PDF SKILL.md from {pdf_dir.name} ({len(content)} chars)")
except OSError as e:
logger.warning(f"Failed to read PDF SKILL.md from {pdf_dir.name}: {e}")
if pdf_sources:
# Concatenate all PDF sources with separator
skill_mds["pdf"] = "\n\n---\n\n".join(pdf_sources)
logger.debug(f"Combined {len(pdf_sources)} PDF SKILL.md files")
logger.info(f"Loaded {len(skill_mds)} source SKILL.md files")
return skill_mds
def _parse_skill_md_sections(self, skill_md: str) -> dict[str, str]:
"""Parse SKILL.md into sections by ## headers.
Args:
skill_md: Full SKILL.md content
Returns:
Dict mapping section name to content
e.g., {'When to Use': '...', 'Quick Reference': '...'}
"""
sections = {}
current_section = None
current_content = []
lines = skill_md.split("\n")
for line in lines:
# Detect section header (## Header)
if line.startswith("## "):
# Save previous section
if current_section:
sections[current_section] = "\n".join(current_content).strip()
# Start new section
current_section = line[3:].strip()
# Remove emoji and markdown formatting
current_section = current_section.split("](")[0] # Remove links
for emoji in [
"📚",
"🏗️",
"⚠️",
"🔧",
"📖",
"💡",
"🎯",
"📊",
"🔍",
"⚙️",
"🧪",
"📝",
"🗂️",
"📐",
"",
]:
current_section = current_section.replace(emoji, "").strip()
current_content = []
elif current_section:
# Accumulate content for current section
current_content.append(line)
# Save last section
if current_section and current_content:
sections[current_section] = "\n".join(current_content).strip()
logger.debug(f"Parsed {len(sections)} sections from SKILL.md")
return sections
def _synthesize_docs_github(self, skill_mds: dict[str, str]) -> str:
"""Synthesize documentation + GitHub sources with weighted merge.
Strategy:
- Start with docs frontmatter and intro
- Add GitHub metadata (stars, topics, language stats)
- Merge "When to Use" from both sources
- Merge "Quick Reference" from both sources
- Include GitHub-specific sections (patterns, architecture)
- Merge code examples (prioritize GitHub real usage)
- Include Known Issues from GitHub
- Fix placeholder text (httpx_docs → httpx)
Args:
skill_mds: Dict with 'documentation' and 'github' keys
Returns:
Synthesized SKILL.md content
"""
docs_sections = self._parse_skill_md_sections(skill_mds.get("documentation", ""))
github_sections = self._parse_skill_md_sections(skill_mds.get("github", ""))
# Extract GitHub metadata from full content
_github_full = skill_mds.get("github", "")
# Start with YAML frontmatter
skill_name = self.name.lower().replace("_", "-").replace(" ", "-")[:64]
desc = self.description[:1024] if len(self.description) > 1024 else self.description
content = f"""---
name: {skill_name}
description: {desc}
---
# {self.name.title()}
{self.description}
## 📚 Sources
This skill synthesizes knowledge from multiple sources:
- ✅ **Official Documentation**: {self.config.get("sources", [{}])[0].get("base_url", "N/A")}
- ✅ **GitHub Repository**: {[s for s in self.config.get("sources", []) if s.get("type") == "github"][0].get("repo", "N/A") if [s for s in self.config.get("sources", []) if s.get("type") == "github"] else "N/A"}
"""
# Add GitHub Description and Metadata if present
if "Description" in github_sections:
content += "## 📦 About\n\n"
content += github_sections["Description"] + "\n\n"
# Add Repository Info from GitHub
if "Repository Info" in github_sections:
content += "### Repository Info\n\n"
content += github_sections["Repository Info"] + "\n\n"
# Add Language stats from GitHub
if "Languages" in github_sections:
content += "### Languages\n\n"
content += github_sections["Languages"] + "\n\n"
content += "## 💡 When to Use This Skill\n\n"
# Merge "When to Use" sections - Fix placeholder text
when_to_use_added = False
for key in ["When to Use This Skill", "When to Use"]:
if key in docs_sections:
# Fix placeholder text: httpx_docs → httpx
when_content = docs_sections[key].replace("httpx_docs", self.name)
when_content = when_content.replace("httpx_github", self.name)
content += when_content + "\n\n"
when_to_use_added = True
break
if "When to Use This Skill" in github_sections:
if when_to_use_added:
content += "**From repository analysis:**\n\n"
content += github_sections["When to Use This Skill"] + "\n\n"
# Quick Reference: Merge from both sources
content += "## 🎯 Quick Reference\n\n"
if "Quick Reference" in docs_sections:
content += "**From Documentation:**\n\n"
content += docs_sections["Quick Reference"] + "\n\n"
if "Quick Reference" in github_sections:
# Include GitHub's Quick Reference (contains design patterns summary)
logger.info(
f"DEBUG: Including GitHub Quick Reference ({len(github_sections['Quick Reference'])} chars)"
)
content += github_sections["Quick Reference"] + "\n\n"
else:
logger.warning("DEBUG: GitHub Quick Reference section NOT FOUND!")
# Design Patterns (GitHub only - C3.1 analysis)
if "Design Patterns Detected" in github_sections:
content += "### Design Patterns Detected\n\n"
content += "*From C3.1 codebase analysis (confidence > 0.7)*\n\n"
content += github_sections["Design Patterns Detected"] + "\n\n"
# Code Examples: Prefer GitHub (real usage)
content += "## 🧪 Code Examples\n\n"
if "Code Examples" in github_sections:
content += "**From Repository Tests:**\n\n"
# Note: GitHub section already includes "*High-quality examples from codebase (C3.2)*" label
content += github_sections["Code Examples"] + "\n\n"
elif "Usage Examples" in github_sections:
content += "**From Repository:**\n\n"
content += github_sections["Usage Examples"] + "\n\n"
if "Example Code Patterns" in docs_sections:
content += "**From Documentation:**\n\n"
content += docs_sections["Example Code Patterns"] + "\n\n"
# API Reference: Include from both sources
if "API Reference" in docs_sections or "API Reference" in github_sections:
content += "## 🔧 API Reference\n\n"
if "API Reference" in github_sections:
# Note: GitHub section already includes "*Extracted from codebase analysis (C2.5)*" label
content += github_sections["API Reference"] + "\n\n"
if "API Reference" in docs_sections:
content += "**Official API Documentation:**\n\n"
content += docs_sections["API Reference"] + "\n\n"
# Known Issues: GitHub only
if "Known Issues" in github_sections:
content += "## ⚠️ Known Issues\n\n"
content += "*Recent issues from GitHub*\n\n"
content += github_sections["Known Issues"] + "\n\n"
# Recent Releases: GitHub only (include subsection if present)
if "Recent Releases" in github_sections:
# Recent Releases might be a subsection within Known Issues
# Check if it's standalone
releases_content = github_sections["Recent Releases"]
if releases_content.strip() and not releases_content.startswith("###"):
content += "### Recent Releases\n"
content += releases_content + "\n\n"
# Reference documentation
content += "## 📖 Reference Documentation\n\n"
content += "Organized by source:\n\n"
content += "- [Documentation](references/documentation/)\n"
content += "- [GitHub](references/github/)\n"
content += "- [Codebase Analysis](references/codebase_analysis/ARCHITECTURE.md)\n\n"
# Footer
content += "---\n\n"
content += (
"*Synthesized from official documentation and codebase analysis by Skill Seekers*\n"
)
return content
def _synthesize_docs_github_pdf(self, skill_mds: dict[str, str]) -> str:
"""Synthesize all three sources: documentation + GitHub + PDF.
Strategy:
- Start with docs+github synthesis
- Insert PDF chapters after Quick Reference
- Add PDF key concepts as supplementary section
Args:
skill_mds: Dict with 'documentation', 'github', and 'pdf' keys
Returns:
Synthesized SKILL.md content
"""
# Start with docs+github synthesis
base_content = self._synthesize_docs_github(skill_mds)
pdf_sections = self._parse_skill_md_sections(skill_mds.get("pdf", ""))
# Find insertion point after Quick Reference
lines = base_content.split("\n")
insertion_index = -1
for i, line in enumerate(lines):
if line.startswith("## 🧪 Code Examples") or line.startswith("## 🔧 API Reference"):
insertion_index = i
break
if insertion_index == -1:
# Fallback: insert before Reference Documentation
for i, line in enumerate(lines):
if line.startswith("## 📖 Reference Documentation"):
insertion_index = i
break
# Build PDF section
pdf_content_lines = []
# Add Chapter Overview
if "Chapter Overview" in pdf_sections:
pdf_content_lines.append("## 📚 PDF Documentation Structure\n")
pdf_content_lines.append("*From PDF analysis*\n")
pdf_content_lines.append(pdf_sections["Chapter Overview"])
pdf_content_lines.append("\n")
# Add Key Concepts
if "Key Concepts" in pdf_sections:
pdf_content_lines.append("## 🔍 Key Concepts\n")
pdf_content_lines.append("*Extracted from PDF headings*\n")
pdf_content_lines.append(pdf_sections["Key Concepts"])
pdf_content_lines.append("\n")
# Insert PDF content
if pdf_content_lines and insertion_index != -1:
lines[insertion_index:insertion_index] = pdf_content_lines
elif pdf_content_lines:
# Append at end before footer
footer_index = -1
for i, line in enumerate(lines):
if line.startswith("---") and i > len(lines) - 5:
footer_index = i
break
if footer_index != -1:
lines[footer_index:footer_index] = pdf_content_lines
# Update reference documentation to include PDF
final_content = "\n".join(lines)
final_content = final_content.replace(
"- [Codebase Analysis](references/codebase_analysis/ARCHITECTURE.md)\n",
"- [Codebase Analysis](references/codebase_analysis/ARCHITECTURE.md)\n- [PDF Documentation](references/pdf/)\n",
)
return final_content
def _generate_skill_md(self):
"""Generate main SKILL.md file using synthesis formulas.
Strategy:
1. Try to load standalone SKILL.md from each source
2. If found, use synthesis formulas for rich content
3. If not found, fall back to legacy minimal generation
"""
skill_path = os.path.join(self.skill_dir, "SKILL.md")
# Try to load source SKILL.md files
skill_mds = self._load_source_skill_mds()
# Determine synthesis strategy based on available sources
has_docs = "documentation" in skill_mds
has_github = "github" in skill_mds
has_pdf = "pdf" in skill_mds
content = None
# Apply appropriate synthesis formula
if has_docs and has_github and has_pdf:
logger.info("Synthesizing: documentation + GitHub + PDF")
content = self._synthesize_docs_github_pdf(skill_mds)
elif has_docs and has_github:
logger.info("Synthesizing: documentation + GitHub")
content = self._synthesize_docs_github(skill_mds)
elif has_docs and has_pdf:
logger.info("Synthesizing: documentation + PDF")
content = self._synthesize_docs_pdf(skill_mds)
elif has_github and has_pdf:
logger.info("Synthesizing: GitHub + PDF")
content = self._synthesize_github_pdf(skill_mds)
elif has_docs:
logger.info("Using documentation SKILL.md as-is")
content = skill_mds["documentation"]
elif has_github:
logger.info("Using GitHub SKILL.md as-is")
content = skill_mds["github"]
elif has_pdf:
logger.info("Using PDF SKILL.md as-is")
content = skill_mds["pdf"]
# Fallback: generate minimal SKILL.md (legacy behavior)
if not content:
logger.warning("No source SKILL.md files found, generating minimal SKILL.md (legacy)")
content = self._generate_minimal_skill_md()
# Write final content
with open(skill_path, "w", encoding="utf-8") as f:
f.write(content)
logger.info(f"Created SKILL.md ({len(content)} chars, ~{len(content.split())} words)")
def _synthesize_docs_pdf(self, skill_mds: dict[str, str]) -> str:
"""Synthesize documentation + PDF sources.
Strategy:
- Start with docs SKILL.md
- Insert PDF chapters and key concepts as supplementary sections
Args:
skill_mds: Dict with 'documentation' and 'pdf' keys
Returns:
Synthesized SKILL.md content
"""
docs_content = skill_mds["documentation"]
pdf_sections = self._parse_skill_md_sections(skill_mds["pdf"])
lines = docs_content.split("\n")
insertion_index = -1
# Find insertion point before Reference Documentation
for i, line in enumerate(lines):
if line.startswith("## 📖 Reference") or line.startswith("## Reference"):
insertion_index = i
break
# Build PDF sections
pdf_content_lines = []
if "Chapter Overview" in pdf_sections:
pdf_content_lines.append("## 📚 PDF Documentation Structure\n")
pdf_content_lines.append("*From PDF analysis*\n")
pdf_content_lines.append(pdf_sections["Chapter Overview"])
pdf_content_lines.append("\n")
if "Key Concepts" in pdf_sections:
pdf_content_lines.append("## 🔍 Key Concepts\n")
pdf_content_lines.append("*Extracted from PDF headings*\n")
pdf_content_lines.append(pdf_sections["Key Concepts"])
pdf_content_lines.append("\n")
# Insert PDF content
if pdf_content_lines and insertion_index != -1:
lines[insertion_index:insertion_index] = pdf_content_lines
return "\n".join(lines)
def _synthesize_github_pdf(self, skill_mds: dict[str, str]) -> str:
"""Synthesize GitHub + PDF sources.
Strategy:
- Start with GitHub SKILL.md (has C3.x analysis)
- Add PDF documentation structure as supplementary section
Args:
skill_mds: Dict with 'github' and 'pdf' keys
Returns:
Synthesized SKILL.md content
"""
github_content = skill_mds["github"]
pdf_sections = self._parse_skill_md_sections(skill_mds["pdf"])
lines = github_content.split("\n")
insertion_index = -1
# Find insertion point before Reference Documentation
for i, line in enumerate(lines):
if line.startswith("## 📖 Reference") or line.startswith("## Reference"):
insertion_index = i
break
# Build PDF sections
pdf_content_lines = []
if "Chapter Overview" in pdf_sections:
pdf_content_lines.append("## 📚 PDF Documentation Structure\n")
pdf_content_lines.append("*From PDF analysis*\n")
pdf_content_lines.append(pdf_sections["Chapter Overview"])
pdf_content_lines.append("\n")
# Insert PDF content
if pdf_content_lines and insertion_index != -1:
lines[insertion_index:insertion_index] = pdf_content_lines
return "\n".join(lines)
def _generate_minimal_skill_md(self) -> str:
"""Generate minimal SKILL.md (legacy fallback behavior).
Used when no source SKILL.md files are available.
"""
skill_name = self.name.lower().replace("_", "-").replace(" ", "-")[:64]
desc = self.description[:1024] if len(self.description) > 1024 else self.description
content = f"""---
name: {skill_name}
description: {desc}
---
# {self.name.title()}
{self.description}
## 📚 Sources
This skill combines knowledge from multiple sources:
"""
# List sources
for source in self.config.get("sources", []):
source_type = source["type"]
if source_type == "documentation":
content += f"- ✅ **Documentation**: {source.get('base_url', 'N/A')}\n"
content += f" - Pages: {source.get('max_pages', 'unlimited')}\n"
elif source_type == "github":
content += f"- ✅ **GitHub Repository**: {source.get('repo', 'N/A')}\n"
content += f" - Code Analysis: {source.get('code_analysis_depth', 'surface')}\n"
content += f" - Issues: {source.get('max_issues', 0)}\n"
elif source_type == "pdf":
content += f"- ✅ **PDF Document**: {source.get('path', 'N/A')}\n"
# C3.x Architecture & Code Analysis section (if available)
github_data = self.scraped_data.get("github", {})
# Handle both dict and list cases
if isinstance(github_data, dict):
github_data = github_data.get("data", {})
elif isinstance(github_data, list) and len(github_data) > 0:
github_data = github_data[0].get("data", {})
else:
github_data = {}
if github_data.get("c3_analysis"):
content += self._format_c3_summary_section(github_data["c3_analysis"])
# Data quality section
if self.conflicts:
content += "\n## ⚠️ Data Quality\n\n"
content += f"**{len(self.conflicts)} conflicts detected** between sources.\n\n"
# Count by type
by_type = {}
for conflict in self.conflicts:
ctype = (
conflict.type if hasattr(conflict, "type") else conflict.get("type", "unknown")
)
by_type[ctype] = by_type.get(ctype, 0) + 1
content += "**Conflict Breakdown:**\n"
for ctype, count in by_type.items():
content += f"- {ctype}: {count}\n"
content += "\nSee `references/conflicts.md` for detailed conflict information.\n"
# Merged API section (if available)
if self.merged_data:
content += self._format_merged_apis()
# Quick reference from each source
content += "\n## 📖 Reference Documentation\n\n"
content += "Organized by source:\n\n"
for source in self.config.get("sources", []):
source_type = source["type"]
content += f"- [{source_type.title()}](references/{source_type}/)\n"
# When to use this skill
content += "\n## 💡 When to Use This Skill\n\n"
content += "Use this skill when you need to:\n"
content += f"- Understand how to use {self.name}\n"
content += "- Look up API documentation\n"
content += "- Find usage examples\n"
if "github" in self.scraped_data:
content += "- Check for known issues or recent changes\n"
content += "- Review release history\n"
content += "\n---\n\n"
content += "*Generated by Skill Seeker's unified multi-source scraper*\n"
return content
def _format_merged_apis(self) -> str:
"""Format merged APIs section with inline conflict warnings."""
if not self.merged_data:
return ""
content = "\n## 🔧 API Reference\n\n"
content += "*Merged from documentation and code analysis*\n\n"
apis = self.merged_data.get("apis", {})
if not apis:
return content + "*No APIs to display*\n"
# Group APIs by status
matched = {k: v for k, v in apis.items() if v.get("status") == "matched"}
conflicts = {k: v for k, v in apis.items() if v.get("status") == "conflict"}
docs_only = {k: v for k, v in apis.items() if v.get("status") == "docs_only"}
code_only = {k: v for k, v in apis.items() if v.get("status") == "code_only"}
# Show matched APIs first
if matched:
content += "### ✅ Verified APIs\n\n"
content += "*Documentation and code agree*\n\n"
for _api_name, api_data in list(matched.items())[:10]: # Limit to first 10
content += self._format_api_entry(api_data, inline_conflict=False)
# Show conflicting APIs with warnings
if conflicts:
content += "\n### ⚠️ APIs with Conflicts\n\n"
content += "*Documentation and code differ*\n\n"
for _api_name, api_data in list(conflicts.items())[:10]:
content += self._format_api_entry(api_data, inline_conflict=True)
# Show undocumented APIs
if code_only:
content += "\n### 💻 Undocumented APIs\n\n"
content += f"*Found in code but not in documentation ({len(code_only)} total)*\n\n"
for _api_name, api_data in list(code_only.items())[:5]:
content += self._format_api_entry(api_data, inline_conflict=False)
# Show removed/missing APIs
if docs_only:
content += "\n### 📖 Documentation-Only APIs\n\n"
content += f"*Documented but not found in code ({len(docs_only)} total)*\n\n"
for _api_name, api_data in list(docs_only.items())[:5]:
content += self._format_api_entry(api_data, inline_conflict=False)
content += "\n*See references/api/ for complete API documentation*\n"
return content
def _format_api_entry(self, api_data: dict, inline_conflict: bool = False) -> str:
"""Format a single API entry."""
name = api_data.get("name", "Unknown")
signature = api_data.get("merged_signature", name)
description = api_data.get("merged_description", "")
warning = api_data.get("warning", "")
entry = f"#### `{signature}`\n\n"
if description:
entry += f"{description}\n\n"
# Add inline conflict warning
if inline_conflict and warning:
entry += f"⚠️ **Conflict**: {warning}\n\n"
# Show both versions if available
conflict = api_data.get("conflict", {})
if conflict:
docs_info = conflict.get("docs_info")
code_info = conflict.get("code_info")
if docs_info and code_info:
entry += "**Documentation says:**\n"
entry += f"```\n{docs_info.get('raw_signature', 'N/A')}\n```\n\n"
entry += "**Code implementation:**\n"
entry += f"```\n{self._format_code_signature(code_info)}\n```\n\n"
# Add source info
source = api_data.get("source", "unknown")
entry += f"*Source: {source}*\n\n"
entry += "---\n\n"
return entry
def _format_code_signature(self, code_info: dict) -> str:
"""Format code signature for display."""
name = code_info.get("name", "")
params = code_info.get("parameters", [])
return_type = code_info.get("return_type")
param_strs = []
for param in params:
param_str = param.get("name", "")
if param.get("type_hint"):
param_str += f": {param['type_hint']}"
if param.get("default"):
param_str += f" = {param['default']}"
param_strs.append(param_str)
sig = f"{name}({', '.join(param_strs)})"
if return_type:
sig += f" -> {return_type}"
return sig
def _generate_references(self):
"""Generate reference files organized by source."""
logger.info("Generating reference files...")
# Generate references for each source type (now lists)
docs_list = self.scraped_data.get("documentation", [])
if docs_list:
self._generate_docs_references(docs_list)
github_list = self.scraped_data.get("github", [])
if github_list:
self._generate_github_references(github_list)
pdf_list = self.scraped_data.get("pdf", [])
if pdf_list:
self._generate_pdf_references(pdf_list)
# Generate merged API reference if available
if self.merged_data:
self._generate_merged_api_reference()
# Generate C3.x codebase analysis references if available (multi-source)
github_list = self.scraped_data.get("github", [])
for github_source in github_list:
github_data = github_source.get("data", {})
if github_data.get("c3_analysis"):
repo_id = github_source.get("repo_id", "unknown")
self._generate_c3_analysis_references(repo_id=repo_id)
def _generate_docs_references(self, docs_list: list[dict]):
"""Generate references from multiple documentation sources."""
# Skip if no documentation sources
if not docs_list:
return
docs_dir = os.path.join(self.skill_dir, "references", "documentation")
os.makedirs(docs_dir, exist_ok=True)
all_copied_files: list[str] = []
# Process each documentation source
for i, doc_source in enumerate(docs_list):
source_id = doc_source.get("source_id", f"source_{i}")
base_url = doc_source.get("base_url", "Unknown")
refs_dir = doc_source.get("refs_dir", "")
# Create subdirectory for this source
source_dir = os.path.join(docs_dir, source_id)
os.makedirs(source_dir, exist_ok=True)
copied_files: list[str] = []
if refs_dir and os.path.isdir(refs_dir):
for entry in sorted(os.listdir(refs_dir)):
src_path = os.path.join(refs_dir, entry)
dst_path = os.path.join(source_dir, entry)
if not os.path.isfile(src_path):
continue
shutil.copy2(src_path, dst_path)
copied_files.append(entry)
# Create index for this source
source_index_path = os.path.join(source_dir, "index.md")
with open(source_index_path, "w", encoding="utf-8") as f:
f.write(f"# Documentation: {source_id}\n\n")
f.write(f"**Source**: {base_url}\n\n")
f.write(f"**Pages**: {doc_source.get('total_pages', 'N/A')}\n\n")
if copied_files:
files_no_index = [p for p in copied_files if p.lower() != "index.md"]
f.write("## Files\n\n")
for filename in files_no_index:
f.write(f"- [{filename}]({filename})\n")
else:
f.write("No reference files available.\n")
all_copied_files.extend(copied_files)
# Create main index
index_path = os.path.join(docs_dir, "index.md")
with open(index_path, "w", encoding="utf-8") as f:
f.write("# Documentation References\n\n")
f.write(f"Combined from {len(docs_list)} documentation sources.\n\n")
f.write("## Sources\n\n")
for doc_source in docs_list:
source_id = doc_source.get("source_id", "unknown")
base_url = doc_source.get("base_url", "Unknown")
total_pages = doc_source.get("total_pages", "N/A")
f.write(
f"- [{source_id}]({source_id}/index.md) - {base_url} ({total_pages} pages)\n"
)
logger.info(f"Created documentation references ({len(docs_list)} sources)")
def _generate_github_references(self, github_list: list[dict]):
"""Generate references from multiple GitHub sources."""
# Skip if no GitHub sources
if not github_list:
return
github_dir = os.path.join(self.skill_dir, "references", "github")
os.makedirs(github_dir, exist_ok=True)
# Process each GitHub source
for i, github_source in enumerate(github_list):
repo = github_source.get("repo", f"repo_{i}")
repo_id = github_source.get("repo_id", repo.replace("/", "_"))
github_data = github_source.get("data", {})
# Create subdirectory for this repo
repo_dir = os.path.join(github_dir, repo_id)
os.makedirs(repo_dir, exist_ok=True)
# Create README reference
if github_data.get("readme"):
readme_path = os.path.join(repo_dir, "README.md")
with open(readme_path, "w", encoding="utf-8") as f:
f.write(f"# Repository README: {repo}\n\n")
f.write(github_data["readme"])
# Create issues reference
if github_data.get("issues"):
issues_path = os.path.join(repo_dir, "issues.md")
with open(issues_path, "w", encoding="utf-8") as f:
f.write(f"# GitHub Issues: {repo}\n\n")
f.write(f"{len(github_data['issues'])} recent issues.\n\n")
for issue in github_data["issues"][:20]:
f.write(f"## #{issue['number']}: {issue['title']}\n\n")
f.write(f"**State**: {issue['state']}\n")
if issue.get("labels"):
f.write(f"**Labels**: {', '.join(issue['labels'])}\n")
f.write(f"**URL**: {issue.get('url', 'N/A')}\n\n")
# Create releases reference
if github_data.get("releases"):
releases_path = os.path.join(repo_dir, "releases.md")
with open(releases_path, "w", encoding="utf-8") as f:
f.write(f"# Releases: {repo}\n\n")
for release in github_data["releases"][:10]:
f.write(f"## {release['tag_name']}: {release.get('name', 'N/A')}\n\n")
f.write(f"**Published**: {release.get('published_at', 'N/A')[:10]}\n\n")
if release.get("body"):
f.write(release["body"][:500])
f.write("\n\n")
# Create index for this repo
repo_index_path = os.path.join(repo_dir, "index.md")
repo_info = github_data.get("repo_info", {})
with open(repo_index_path, "w", encoding="utf-8") as f:
f.write(f"# GitHub: {repo}\n\n")
f.write(f"**Stars**: {repo_info.get('stars', 'N/A')}\n")
f.write(f"**Language**: {repo_info.get('language', 'N/A')}\n")
f.write(f"**Issues**: {len(github_data.get('issues', []))}\n")
f.write(f"**Releases**: {len(github_data.get('releases', []))}\n\n")
f.write("## Files\n\n")
f.write("- [README.md](README.md)\n")
if github_data.get("issues"):
f.write("- [issues.md](issues.md)\n")
if github_data.get("releases"):
f.write("- [releases.md](releases.md)\n")
# Create main index
index_path = os.path.join(github_dir, "index.md")
with open(index_path, "w", encoding="utf-8") as f:
f.write("# GitHub References\n\n")
f.write(f"Combined from {len(github_list)} GitHub repositories.\n\n")
f.write("## Repositories\n\n")
for github_source in github_list:
repo = github_source.get("repo", "unknown")
repo_id = github_source.get("repo_id", repo.replace("/", "_"))
github_data = github_source.get("data", {})
repo_info = github_data.get("repo_info", {})
stars = repo_info.get("stars", "N/A")
f.write(f"- [{repo}]({repo_id}/index.md) - {stars} stars\n")
logger.info(f"Created GitHub references ({len(github_list)} repos)")
def _generate_pdf_references(self, pdf_list: list[dict]):
"""Generate references from PDF sources."""
# Skip if no PDF sources
if not pdf_list:
return
pdf_dir = os.path.join(self.skill_dir, "references", "pdf")
os.makedirs(pdf_dir, exist_ok=True)
# Create index
index_path = os.path.join(pdf_dir, "index.md")
with open(index_path, "w", encoding="utf-8") as f:
f.write("# PDF Documentation\n\n")
f.write(f"Reference from {len(pdf_list)} PDF document(s).\n\n")
logger.info(f"Created PDF references ({len(pdf_list)} sources)")
def _generate_merged_api_reference(self):
"""Generate merged API reference file."""
api_dir = os.path.join(self.skill_dir, "references", "api")
os.makedirs(api_dir, exist_ok=True)
api_path = os.path.join(api_dir, "merged_api.md")
with open(api_path, "w") as f:
f.write("# Merged API Reference\n\n")
f.write("*Combined from documentation and code analysis*\n\n")
apis = self.merged_data.get("apis", {})
for api_name in sorted(apis.keys()):
api_data = apis[api_name]
entry = self._format_api_entry(api_data, inline_conflict=True)
f.write(entry)
logger.info(f"Created merged API reference ({len(apis)} APIs)")
def _generate_c3_analysis_references(self, repo_id: str = "github"):
    """Generate codebase analysis references (C3.5) for a specific GitHub source.

    Looks up the GitHub source identified by ``repo_id`` in
    ``self.scraped_data["github"]``. When that source carries ``c3_analysis``
    data, writes ARCHITECTURE.md and the per-component reference folders into
    ``references/codebase_analysis/<repo_id>``. Logs a warning and returns when
    no source matches; returns silently when the source has no C3 data.

    Args:
        repo_id: Repository identifier (e.g., 'encode_httpx') for multi-source support
    """

    def source_id(source: dict):
        # A source without an explicit "repo_id" is identified by its repo name
        # with "/" replaced by "_" -- the same fallback used when the GitHub
        # reference index is written -- so both code paths agree on the id.
        explicit = source.get("repo_id")
        if explicit is not None:
            return explicit
        return str(source.get("repo", "unknown")).replace("/", "_")

    # Find the matching github source in the list
    github_list = self.scraped_data.get("github", [])
    github_source = next((src for src in github_list if source_id(src) == repo_id), None)
    if not github_source:
        logger.warning(f"GitHub source with repo_id '{repo_id}' not found")
        return

    github_data = github_source.get("data", {})
    c3_data = github_data.get("c3_analysis")
    if not c3_data:
        return

    # Create unique directory per repo for multi-source support
    c3_dir = os.path.join(self.skill_dir, "references", "codebase_analysis", repo_id)
    os.makedirs(c3_dir, exist_ok=True)
    logger.info("Generating C3.x codebase analysis references...")

    # Generate ARCHITECTURE.md (main deliverable)
    self._generate_architecture_overview(c3_dir, c3_data, github_data)

    # Generate subdirectories for each C3.x component
    self._generate_pattern_references(c3_dir, c3_data.get("patterns"))
    self._generate_example_references(c3_dir, c3_data.get("test_examples"))
    self._generate_guide_references(c3_dir, c3_data.get("how_to_guides"))
    self._generate_config_references(c3_dir, c3_data.get("config_patterns"))
    self._copy_architecture_details(c3_dir, c3_data.get("architecture"))
    logger.info("✅ Created codebase analysis references")
def _generate_architecture_overview(self, c3_dir: str, c3_data: dict, github_data: dict):
    """Write ARCHITECTURE.md, the summary document of the C3.x analysis (C3.5).

    The file is written section by section into ``<c3_dir>/ARCHITECTURE.md``:
    overview, architectural patterns, technology stack, design patterns,
    configuration, workflows, usage examples, directory structure and a footer.
    Sections 2 and 4-7 appear only when the matching ``c3_data`` key holds data.

    Args:
        c3_dir: Directory that receives ARCHITECTURE.md.
        c3_data: C3.x analysis results; the keys read here are "architecture",
            "patterns", "config_patterns", "how_to_guides" and "test_examples".
        github_data: GitHub scrape data; only "languages" is read, as a
            fallback when the architecture analysis reports no languages.
    """

    def by_count(pair):
        # Sort key: the numeric half of a (label, count) pair.
        return pair[1]

    with open(os.path.join(c3_dir, "ARCHITECTURE.md"), "w", encoding="utf-8") as doc:
        emit = doc.write

        emit(f"# {self.name.title()} Architecture Overview\n\n")
        emit("*Generated from C3.x automated codebase analysis*\n\n")

        # 1. Overview
        emit("## 1. Overview\n\n")
        emit(f"{self.description}\n\n")

        architecture = c3_data.get("architecture")

        # 2. Architectural patterns (C3.7) -- first five entries only
        arch_patterns = architecture.get("patterns", []) if architecture else []
        if arch_patterns:
            emit("## 2. Architectural Patterns\n\n")
            emit("*Detected architectural patterns from codebase structure*\n\n")
            for entry in arch_patterns[:5]:
                emit(f"### {entry['pattern_name']}\n\n")
                emit(f"- **Confidence**: {entry['confidence']:.2f}\n")
                framework = entry.get("framework")
                if framework:
                    emit(f"- **Framework**: {framework}\n")
                evidence = entry.get("evidence")
                if evidence:
                    emit(f"- **Evidence**: {', '.join(evidence[:3])}\n")
                emit("\n")

        # 3. Technology stack
        emit("## 3. Technology Stack\n\n")
        languages = architecture.get("languages", {}) if architecture else {}
        if not languages and github_data.get("languages"):
            # No per-language counts from the analysis: take the GitHub
            # language names and give each a count of 1.
            languages = dict.fromkeys(github_data["languages"], 1)

        if languages:
            emit("**Languages Detected**:\n")
            ranked_languages = sorted(languages.items(), key=by_count, reverse=True)
            for lang, count in ranked_languages[:5]:
                emit(f"- {lang}: {count} files\n" if isinstance(count, int) else f"- {lang}\n")
            emit("\n")

        frameworks = architecture.get("frameworks_detected", []) if architecture else []
        if frameworks:
            emit("**Frameworks & Libraries**:\n")
            for framework_name in frameworks[:10]:
                emit(f"- {framework_name}\n")
            emit("\n")

        if not (languages or frameworks):
            emit("*Technology stack analysis not available*\n\n")

        # 4. Design patterns (C3.1), tallied per pattern type
        file_patterns = c3_data.get("patterns")
        if file_patterns:
            emit("## 4. Design Patterns\n\n")
            emit("*Classic design patterns identified in the codebase*\n\n")

            tally = {}
            for per_file in file_patterns:
                for hit in per_file.get("patterns", []):
                    kind = hit["pattern_type"]
                    tally[kind] = tally.get(kind, 0) + 1

            if not tally:
                emit("*No design patterns detected.*\n\n")
            else:
                for kind, hits in sorted(tally.items(), key=by_count, reverse=True):
                    emit(f"- **{kind}**: {hits} instance(s)\n")
                emit("\n📁 See `references/codebase_analysis/patterns/` for detailed analysis.\n\n")

        # 5. Configuration overview (C3.4)
        config = c3_data.get("config_patterns")
        if config:
            emit("## 5. Configuration Overview\n\n")
            config_files = config.get("config_files", [])
            if not config_files:
                emit("*No configuration files detected.*\n\n")
            else:
                emit(f"**{len(config_files)} configuration file(s) detected**:\n\n")
                for cfg in config_files[:10]:
                    emit(f"- **`{cfg['relative_path']}`**: {cfg['type']}\n")
                    if cfg.get("purpose"):
                        emit(f" - Purpose: {cfg['purpose']}\n")

                # Security warning, with up to five recommended actions
                enhancements = config.get("ai_enhancements")
                if enhancements:
                    insights = enhancements.get("overall_insights", {})
                    security_issues = insights.get("security_issues_found", 0)
                    if security_issues > 0:
                        emit(
                            f"\n🔐 **Security Alert**: {security_issues} potential security issue(s) found in configurations.\n"
                        )
                        actions = insights.get("recommended_actions")
                        if actions:
                            emit("\n**Recommended Actions**:\n")
                            for action in actions[:5]:
                                emit(f"- {action}\n")

                emit("\n📁 See `references/codebase_analysis/configuration/` for details.\n\n")

        # 6. Common workflows (C3.3) -- first ten guide titles
        how_to = c3_data.get("how_to_guides")
        if how_to:
            emit("## 6. Common Workflows\n\n")
            guides = how_to.get("guides", [])
            if not guides:
                emit("*No workflow guides extracted.*\n\n")
            else:
                emit(f"**{len(guides)} how-to guide(s) extracted from codebase**:\n\n")
                for guide in guides[:10]:
                    emit(f"- {guide.get('title', 'Untitled Guide')}\n")
                emit("\n📁 See `references/codebase_analysis/guides/` for detailed tutorials.\n\n")

        # 7. Usage examples (C3.2)
        test_examples = c3_data.get("test_examples")
        if test_examples:
            emit("## 7. Usage Examples\n\n")
            total = test_examples.get("total_examples", 0)
            high_value = test_examples.get("high_value_count", 0)
            if total > 0:
                emit(f"**{total} usage example(s) extracted from tests**:\n")
                emit(f"- High-value examples: {high_value}\n")

                by_category = test_examples.get("examples_by_category")
                if by_category:
                    emit("\n**By Category**:\n")
                    for category, amount in sorted(
                        by_category.items(), key=by_count, reverse=True
                    ):
                        emit(f"- {category}: {amount}\n")

                emit("\n📁 See `references/codebase_analysis/examples/` for code samples.\n\n")
            else:
                emit("*No test examples extracted.*\n\n")

        # 8. Entry points & directory structure -- fifteen largest directories
        emit("## 8. Entry Points & Directory Structure\n\n")
        emit("*Analysis based on codebase organization*\n\n")
        dir_struct = architecture.get("directory_structure", {}) if architecture else {}
        if dir_struct:
            emit("**Main Directories**:\n")
            largest_dirs = sorted(dir_struct.items(), key=by_count, reverse=True)[:15]
            for dir_name, file_count in largest_dirs:
                emit(f"- `{dir_name}/`: {file_count} file(s)\n")
            emit("\n")

        # Footer
        emit("---\n\n")
        emit(
            "*This architecture overview was automatically generated by C3.x codebase analysis.*\n"
        )
        emit("*Last updated: skill build time*\n")

    logger.info("📐 Created ARCHITECTURE.md")
def _generate_pattern_references(self, c3_dir: str, patterns_data: list[dict] | None):
    """Generate design pattern references (C3.1).

    Writes two files into ``<c3_dir>/patterns``: ``detected_patterns.json``
    (the data as received) and ``index.md`` (one section per file that has at
    least one detected pattern).

    Args:
        c3_dir: Codebase-analysis directory the ``patterns`` folder is created in.
        patterns_data: Per-file detection results: a list of dicts, each read
            for "file_path" and a "patterns" list. Nothing is written when
            this is empty or None.
    """
    if not patterns_data:
        return

    patterns_dir = os.path.join(c3_dir, "patterns")
    os.makedirs(patterns_dir, exist_ok=True)

    # Save JSON data
    json_path = os.path.join(patterns_dir, "detected_patterns.json")
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(patterns_data, f, indent=2, ensure_ascii=False)

    # Create summary markdown
    md_path = os.path.join(patterns_dir, "index.md")
    with open(md_path, "w", encoding="utf-8") as f:
        f.write("# Design Patterns\n\n")
        f.write("*Detected patterns from C3.1 analysis*\n\n")
        for file_data in patterns_data:
            patterns = file_data.get("patterns", [])
            if not patterns:
                continue
            # Tolerant lookups with placeholders: one incomplete record must
            # not abort the run and leave a half-written index behind.
            f.write(f"## {file_data.get('file_path', 'unknown')}\n\n")
            for p in patterns:
                f.write(f"### {p.get('pattern_type', 'Unknown')}\n\n")
                if p.get("class_name"):
                    f.write(f"- **Class**: `{p['class_name']}`\n")
                if p.get("confidence"):
                    f.write(f"- **Confidence**: {p['confidence']:.2f}\n")
                if p.get("indicators"):
                    f.write(f"- **Indicators**: {', '.join(p['indicators'][:3])}\n")
                f.write("\n")

    logger.info(f" ✓ Design patterns: {len(patterns_data)} files")
def _generate_example_references(self, c3_dir: str, examples_data: dict):
    """Write the test-example references (C3.2) into ``<c3_dir>/examples``.

    Produces ``test_examples.json`` (the data as received) and ``index.md``,
    which reports the totals and lists up to 20 examples whose confidence is
    above 0.7, each with at most 300 characters of its code snippet. Does
    nothing when ``examples_data`` is empty.

    Args:
        c3_dir: Codebase-analysis directory the ``examples`` folder is created in.
        examples_data: Example collection; read for "total_examples",
            "high_value_count" and the "examples" list.
    """
    if not examples_data:
        return

    out_dir = os.path.join(c3_dir, "examples")
    os.makedirs(out_dir, exist_ok=True)

    # Raw data dump
    with open(os.path.join(out_dir, "test_examples.json"), "w", encoding="utf-8") as raw:
        json.dump(examples_data, raw, indent=2, ensure_ascii=False)

    # Markdown summary
    with open(os.path.join(out_dir, "index.md"), "w", encoding="utf-8") as index:
        index.write("# Usage Examples\n\n")
        index.write("*Extracted from test files (C3.2)*\n\n")

        total = examples_data.get("total_examples", 0)
        high_value = examples_data.get("high_value_count", 0)
        index.write(f"**Total Examples**: {total}\n")
        index.write(f"**High-Value Examples**: {high_value}\n\n")

        # Only examples above the 0.7 confidence threshold are listed
        confident = [
            item for item in examples_data.get("examples", []) if item.get("confidence", 0) > 0.7
        ]
        if confident:
            index.write("## High-Value Examples\n\n")
        for item in confident[:20]:
            index.write(f"### {item.get('description', 'Example')}\n\n")
            index.write(f"- **Category**: {item.get('category', 'unknown')}\n")
            index.write(f"- **Confidence**: {item.get('confidence', 0):.2f}\n")
            index.write(f"- **File**: `{item.get('file_path', 'N/A')}`\n")
            snippet = item.get("code_snippet")
            if snippet:
                index.write(f"\n```python\n{snippet[:300]}\n```\n")
            index.write("\n")

    logger.info(f" ✓ Test examples: {total} total, {high_value} high-value")
def _generate_guide_references(self, c3_dir: str, guides_data: dict):
    """Write the how-to guide references (C3.3) into ``<c3_dir>/guides``.

    Produces ``guide_collection.json`` (the data as received), an ``index.md``
    linking every guide, and one ``guide_<id>.md`` page per guide with its
    description and numbered steps. Does nothing when ``guides_data`` is empty.

    Args:
        c3_dir: Codebase-analysis directory the ``guides`` folder is created in.
        guides_data: Guide collection; its "guides" list is read for "id",
            "title", "description" and "steps".
    """
    if not guides_data:
        return

    out_dir = os.path.join(c3_dir, "guides")
    os.makedirs(out_dir, exist_ok=True)

    # Raw collection dump
    with open(os.path.join(out_dir, "guide_collection.json"), "w", encoding="utf-8") as raw:
        json.dump(guides_data, raw, indent=2, ensure_ascii=False)

    all_guides = guides_data.get("guides", [])

    # Index page linking to the individual guide pages
    with open(os.path.join(out_dir, "index.md"), "w", encoding="utf-8") as index:
        index.write("# How-To Guides\n\n")
        index.write("*Workflow tutorials extracted from codebase (C3.3)*\n\n")
        index.write(f"**Total Guides**: {len(all_guides)}\n\n")
        if all_guides:
            index.write("## Available Guides\n\n")
            for entry in all_guides:
                title = entry.get("title", "Untitled")
                slug = entry.get("id", "unknown")
                index.write(f"- [{title}](guide_{slug}.md)\n")
            index.write("\n")

    # One markdown page per guide
    for entry in all_guides:
        page_path = os.path.join(out_dir, f"guide_{entry.get('id', 'unknown')}.md")
        with open(page_path, "w", encoding="utf-8") as page:
            page.write(f"# {entry.get('title', 'Untitled Guide')}\n\n")

            summary = entry.get("description")
            if summary:
                page.write(f"{summary}\n\n")

            steps = entry.get("steps", [])
            if steps:
                page.write("## Steps\n\n")
                for number, step in enumerate(steps, 1):
                    page.write(f"### {number}. {step.get('action', 'Step')}\n\n")
                    code = step.get("code_example")
                    if code:
                        fence_lang = step.get("language", "python")
                        page.write(f"```{fence_lang}\n{code}\n```\n\n")
                    note = step.get("explanation")
                    if note:
                        page.write(f"{note}\n\n")

    logger.info(f" ✓ How-to guides: {len(all_guides)}")
def _generate_config_references(self, c3_dir: str, config_data: dict):
    """Write the configuration pattern references (C3.4) into ``<c3_dir>/configuration``.

    Produces ``config_patterns.json`` (the data as received) and ``index.md``
    with one section per configuration file, followed by the overall insights
    when they are present. Does nothing when ``config_data`` is empty.

    Args:
        c3_dir: Codebase-analysis directory the ``configuration`` folder is created in.
        config_data: Configuration analysis; read for "config_files" and
            "ai_enhancements".
    """
    if not config_data:
        return

    out_dir = os.path.join(c3_dir, "configuration")
    os.makedirs(out_dir, exist_ok=True)

    # Raw data dump
    with open(os.path.join(out_dir, "config_patterns.json"), "w", encoding="utf-8") as raw:
        json.dump(config_data, raw, indent=2, ensure_ascii=False)

    config_files = config_data.get("config_files", [])

    # Markdown summary
    with open(os.path.join(out_dir, "index.md"), "w", encoding="utf-8") as index:
        index.write("# Configuration Patterns\n\n")
        index.write("*Detected configuration files (C3.4)*\n\n")
        index.write(f"**Total Config Files**: {len(config_files)}\n\n")

        if config_files:
            index.write("## Configuration Files\n\n")
            for cfg in config_files:
                index.write(f"### `{cfg['relative_path']}`\n\n")
                index.write(f"- **Type**: {cfg['type']}\n")
                index.write(f"- **Purpose**: {cfg.get('purpose', 'N/A')}\n")
                index.write(f"- **Settings**: {len(cfg.get('settings', []))}\n")

                # Per-file AI notes, when the analysis attached any
                per_file = cfg.get("ai_enhancement")
                if per_file:
                    if per_file.get("security_concern"):
                        index.write(f"- **Security**: {per_file['security_concern']}\n")
                    if per_file.get("best_practice"):
                        index.write(f"- **Best Practice**: {per_file['best_practice']}\n")
                index.write("\n")

            # Collection-wide insights
            enhancements = config_data.get("ai_enhancements")
            insights = enhancements.get("overall_insights", {}) if enhancements else None
            if insights:
                index.write("## Overall Insights\n\n")
                if insights.get("security_issues_found"):
                    index.write(
                        f"🔐 **Security Issues**: {insights['security_issues_found']}\n\n"
                    )
                actions = insights.get("recommended_actions")
                if actions:
                    index.write("**Recommended Actions**:\n")
                    for action in actions:
                        index.write(f"- {action}\n")
                    index.write("\n")

    logger.info(f" ✓ Configuration files: {len(config_files)}")
def _copy_architecture_details(self, c3_dir: str, arch_data: dict):
    """Write the detailed architectural pattern data (C3.7) into ``<c3_dir>/architecture_details``.

    Produces ``architectural_patterns.json`` (the data as received) and
    ``index.md`` listing every detected pattern with its confidence, optional
    framework and up to five evidence items. Does nothing when ``arch_data``
    is empty.

    Args:
        c3_dir: Codebase-analysis directory the ``architecture_details`` folder is created in.
        arch_data: Architecture analysis; its "patterns" list is summarised.
    """
    if not arch_data:
        return

    out_dir = os.path.join(c3_dir, "architecture_details")
    os.makedirs(out_dir, exist_ok=True)

    # Full JSON copy
    with open(os.path.join(out_dir, "architectural_patterns.json"), "w", encoding="utf-8") as raw:
        json.dump(arch_data, raw, indent=2, ensure_ascii=False)

    # Markdown summary
    with open(os.path.join(out_dir, "index.md"), "w", encoding="utf-8") as index:
        index.write("# Architectural Patterns (Detailed)\n\n")
        index.write("*Comprehensive architectural analysis (C3.7)*\n\n")

        detected = arch_data.get("patterns", [])
        if detected:
            index.write("## Detected Patterns\n\n")
            for entry in detected:
                index.write(f"### {entry['pattern_name']}\n\n")
                index.write(f"- **Confidence**: {entry['confidence']:.2f}\n")
                framework = entry.get("framework")
                if framework:
                    index.write(f"- **Framework**: {framework}\n")
                evidence = entry.get("evidence")
                if evidence:
                    index.write("- **Evidence**:\n")
                    for item in evidence[:5]:
                        index.write(f" - {item}\n")
                index.write("\n")

    logger.info(f" ✓ Architectural details: {len(detected)} patterns")
def _format_c3_summary_section(self, c3_data: dict) -> str:
"""Format C3.x analysis summary for SKILL.md."""
content = "\n## 🏗️ Architecture & Code Analysis\n\n"
content += "*This skill includes comprehensive codebase analysis*\n\n"
# Add architectural pattern summary
if c3_data.get("architecture"):
patterns = c3_data["architecture"].get("patterns", [])
if patterns:
top_pattern = patterns[0]
content += f"**Primary Architecture**: {top_pattern['pattern_name']}"
if top_pattern.get("framework"):
content += f" ({top_pattern['framework']})"
content += f" - Confidence: {top_pattern['confidence']:.0%}\n\n"
# Add design patterns summary
if c3_data.get("patterns"):
total_patterns = sum(len(f.get("patterns", [])) for f in c3_data["patterns"])
if total_patterns > 0:
content += f"**Design Patterns**: {total_patterns} detected\n"
# Show top 3 pattern types
pattern_summary = {}
for file_data in c3_data["patterns"]:
for pattern in file_data.get("patterns", []):
ptype = pattern["pattern_type"]
pattern_summary[ptype] = pattern_summary.get(ptype, 0) + 1
top_patterns = sorted(pattern_summary.items(), key=lambda x: x[1], reverse=True)[:3]
if top_patterns:
content += (
f"- Top patterns: {', '.join([f'{p[0]} ({p[1]})' for p in top_patterns])}\n"
)
content += "\n"
# Add test examples summary
if c3_data.get("test_examples"):
total = c3_data["test_examples"].get("total_examples", 0)
high_value = c3_data["test_examples"].get("high_value_count", 0)
if total > 0:
content += f"**Usage Examples**: {total} extracted from tests ({high_value} high-value)\n\n"
# Add how-to guides summary
if c3_data.get("how_to_guides"):
guide_count = len(c3_data["how_to_guides"].get("guides", []))
if guide_count > 0:
content += f"**How-To Guides**: {guide_count} workflow tutorials\n\n"
# Add configuration summary
if c3_data.get("config_patterns"):
config_files = c3_data["config_patterns"].get("config_files", [])
if config_files:
content += f"**Configuration Files**: {len(config_files)} analyzed\n"
# Add security warning if present
if c3_data["config_patterns"].get("ai_enhancements"):
insights = c3_data["config_patterns"]["ai_enhancements"].get(
"overall_insights", {}
)
security_issues = insights.get("security_issues_found", 0)
if security_issues > 0:
content += f"- 🔐 **Security Alert**: {security_issues} issue(s) detected\n"
content += "\n"
# Add link to ARCHITECTURE.md
content += "📖 **See** `references/codebase_analysis/ARCHITECTURE.md` for complete architectural overview.\n\n"
return content
def _generate_conflicts_report(self):
    """Generate the detailed conflicts report at ``references/conflicts.md``.

    Each conflict may be a plain dict or an object exposing ``severity``,
    ``api_name`` and ``difference`` attributes; both shapes are read through
    one accessor. The report contains a severity breakdown, every
    high-severity conflict, and at most 20 medium-severity conflicts.
    Low-severity conflicts are only counted.
    """

    def field(conflict, name, default=None):
        # Attribute access for object-style conflicts, mapping lookup for dicts.
        # The two must stay separate: calling .get() on an object-style
        # conflict raises AttributeError.
        if hasattr(conflict, name):
            return getattr(conflict, name)
        if isinstance(conflict, dict):
            return conflict.get(name, default)
        return default

    # conflicts is optional on the constructor; treat a missing list as empty
    conflicts = self.conflicts or []

    def with_severity(level):
        return [c for c in conflicts if field(c, "severity") == level]

    high = with_severity("high")
    medium = with_severity("medium")
    low = with_severity("low")

    conflicts_path = os.path.join(self.skill_dir, "references", "conflicts.md")
    os.makedirs(os.path.dirname(conflicts_path), exist_ok=True)

    # Explicit UTF-8: the report contains emoji, which the platform default
    # encoding (e.g. cp1252 on Windows) cannot represent.
    with open(conflicts_path, "w", encoding="utf-8") as f:
        f.write("# Conflict Report\n\n")
        f.write(f"Found **{len(conflicts)}** conflicts between sources.\n\n")

        f.write("## Severity Breakdown\n\n")
        f.write(f"- 🔴 **High**: {len(high)} (action required)\n")
        f.write(f"- 🟡 **Medium**: {len(medium)} (review recommended)\n")
        f.write(f"- 🟢 **Low**: {len(low)} (informational)\n\n")

        # List high severity conflicts
        if high:
            f.write("## 🔴 High Severity\n\n")
            f.write("*These conflicts require immediate attention*\n\n")
            for conflict in high:
                api_name = field(conflict, "api_name", "Unknown")
                diff = field(conflict, "difference", "N/A")
                f.write(f"### {api_name}\n\n")
                f.write(f"**Issue**: {diff}\n\n")

        # List medium severity
        if medium:
            f.write("## 🟡 Medium Severity\n\n")
            for conflict in medium[:20]:  # Limit to 20
                api_name = field(conflict, "api_name", "Unknown")
                diff = field(conflict, "difference", "N/A")
                f.write(f"### {api_name}\n\n")
                f.write(f"{diff}\n\n")

    logger.info("Created conflicts report")
if __name__ == "__main__":
    # Manual smoke test: build a skill from a config file given on the command
    # line, using mock scraped data instead of running the scrapers.
    import sys

    if len(sys.argv) < 2:
        print("Usage: python unified_skill_builder.py <config.json>")
        sys.exit(1)

    config_path = sys.argv[1]
    # Read the JSON config as UTF-8 rather than the platform default encoding.
    with open(config_path, encoding="utf-8") as f:
        config = json.load(f)

    # Mock scraped data. "github" must be a list of source entries: the builder
    # iterates it and reads "repo", "repo_id" and "data" from each entry, so a
    # bare dict here would be iterated as its string keys and fail on .get().
    scraped_data = {
        "github": [
            {
                "repo": "test/repository",
                "repo_id": "test_repository",
                "data": {"readme": "# Test Repository", "issues": [], "releases": []},
            }
        ]
    }

    builder = UnifiedSkillBuilder(config, scraped_data)
    builder.build()
    print(f"\n✅ Test skill built in: output/{config['name']}/")