This commit is contained in:
Pablo Estevez
2026-01-17 17:29:21 +00:00
parent c89f059712
commit 5ed767ff9a
144 changed files with 14142 additions and 16488 deletions

View File

@@ -33,17 +33,17 @@ except ImportError:
# Registry of available adaptors
ADAPTORS: Dict[str, Type[SkillAdaptor]] = {}
ADAPTORS: dict[str, type[SkillAdaptor]] = {}
# Register adaptors that are implemented
if ClaudeAdaptor:
ADAPTORS['claude'] = ClaudeAdaptor
ADAPTORS["claude"] = ClaudeAdaptor
if GeminiAdaptor:
ADAPTORS['gemini'] = GeminiAdaptor
ADAPTORS["gemini"] = GeminiAdaptor
if OpenAIAdaptor:
ADAPTORS['openai'] = OpenAIAdaptor
ADAPTORS["openai"] = OpenAIAdaptor
if MarkdownAdaptor:
ADAPTORS['markdown'] = MarkdownAdaptor
ADAPTORS["markdown"] = MarkdownAdaptor
def get_adaptor(platform: str, config: dict = None) -> SkillAdaptor:
@@ -65,15 +65,11 @@ def get_adaptor(platform: str, config: dict = None) -> SkillAdaptor:
>>> adaptor = get_adaptor('gemini', {'api_version': 'v1beta'})
"""
if platform not in ADAPTORS:
available = ', '.join(ADAPTORS.keys())
available = ", ".join(ADAPTORS.keys())
if not ADAPTORS:
raise ValueError(
f"No adaptors are currently implemented. "
f"Platform '{platform}' is not available."
)
raise ValueError(f"No adaptors are currently implemented. Platform '{platform}' is not available.")
raise ValueError(
f"Platform '{platform}' is not supported or not yet implemented. "
f"Available platforms: {available}"
f"Platform '{platform}' is not supported or not yet implemented. Available platforms: {available}"
)
adaptor_class = ADAPTORS[platform]
@@ -115,10 +111,10 @@ def is_platform_available(platform: str) -> bool:
# Export public interface
__all__ = [
'SkillAdaptor',
'SkillMetadata',
'get_adaptor',
'list_platforms',
'is_platform_available',
'ADAPTORS',
"SkillAdaptor",
"SkillMetadata",
"get_adaptor",
"list_platforms",
"is_platform_available",
"ADAPTORS",
]

View File

@@ -7,18 +7,19 @@ This enables Skill Seekers to generate skills for multiple LLM platforms (Claude
"""
from abc import ABC, abstractmethod
from pathlib import Path
from typing import Dict, Any, Optional
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any
@dataclass
class SkillMetadata:
"""Universal skill metadata used across all platforms"""
name: str
description: str
version: str = "1.0.0"
author: Optional[str] = None
author: str | None = None
tags: list[str] = field(default_factory=list)
@@ -34,11 +35,11 @@ class SkillAdaptor(ABC):
"""
# Platform identifiers (override in subclasses)
PLATFORM: str = "unknown" # e.g., "claude", "gemini", "openai"
PLATFORM_NAME: str = "Unknown" # e.g., "Claude AI (Anthropic)"
DEFAULT_API_ENDPOINT: Optional[str] = None
PLATFORM: str = "unknown" # e.g., "claude", "gemini", "openai"
PLATFORM_NAME: str = "Unknown" # e.g., "Claude AI (Anthropic)"
DEFAULT_API_ENDPOINT: str | None = None
def __init__(self, config: Optional[Dict[str, Any]] = None):
def __init__(self, config: dict[str, Any] | None = None):
"""
Initialize adaptor with optional configuration.
@@ -86,7 +87,7 @@ class SkillAdaptor(ABC):
pass
@abstractmethod
def upload(self, package_path: Path, api_key: str, **kwargs) -> Dict[str, Any]:
def upload(self, package_path: Path, api_key: str, **kwargs) -> dict[str, Any]:
"""
Upload packaged skill to platform.
@@ -168,11 +169,11 @@ class SkillAdaptor(ABC):
if not skill_md_path.exists():
return ""
content = skill_md_path.read_text(encoding='utf-8')
content = skill_md_path.read_text(encoding="utf-8")
# Strip YAML frontmatter if present
if content.startswith('---'):
parts = content.split('---', 2)
if content.startswith("---"):
parts = content.split("---", 2)
if len(parts) >= 3:
return parts[2].strip()
@@ -193,7 +194,7 @@ class SkillAdaptor(ABC):
return "See references/ directory for documentation."
# Read index and extract relevant sections
content = index_path.read_text(encoding='utf-8')
content = index_path.read_text(encoding="utf-8")
return content[:500] + "..." if len(content) > 500 else content
def _generate_toc(self, skill_dir: Path) -> str:
@@ -214,7 +215,7 @@ class SkillAdaptor(ABC):
for ref_file in sorted(refs_dir.glob("*.md")):
if ref_file.name == "index.md":
continue
title = ref_file.stem.replace('_', ' ').title()
title = ref_file.stem.replace("_", " ").title()
toc_lines.append(f"- [{title}](references/{ref_file.name})")
return "\n".join(toc_lines)

View File

@@ -6,10 +6,9 @@ Implements platform-specific handling for Claude AI (Anthropic) skills.
Refactored from upload_skill.py and enhance_skill.py.
"""
import os
import zipfile
from pathlib import Path
from typing import Dict, Any
from typing import Any
from .base import SkillAdaptor, SkillMetadata
@@ -101,16 +100,16 @@ version: {metadata.version}
skill_dir = Path(skill_dir)
# Determine output filename
if output_path.is_dir() or str(output_path).endswith('/'):
if output_path.is_dir() or str(output_path).endswith("/"):
output_path = Path(output_path) / f"{skill_dir.name}.zip"
elif not str(output_path).endswith('.zip'):
output_path = Path(str(output_path) + '.zip')
elif not str(output_path).endswith(".zip"):
output_path = Path(str(output_path) + ".zip")
output_path = Path(output_path)
output_path.parent.mkdir(parents=True, exist_ok=True)
# Create ZIP file
with zipfile.ZipFile(output_path, 'w', zipfile.ZIP_DEFLATED) as zf:
with zipfile.ZipFile(output_path, "w", zipfile.ZIP_DEFLATED) as zf:
# Add SKILL.md (required)
skill_md = skill_dir / "SKILL.md"
if skill_md.exists():
@@ -120,7 +119,7 @@ version: {metadata.version}
refs_dir = skill_dir / "references"
if refs_dir.exists():
for ref_file in refs_dir.rglob("*"):
if ref_file.is_file() and not ref_file.name.startswith('.'):
if ref_file.is_file() and not ref_file.name.startswith("."):
arcname = ref_file.relative_to(skill_dir)
zf.write(ref_file, str(arcname))
@@ -128,7 +127,7 @@ version: {metadata.version}
scripts_dir = skill_dir / "scripts"
if scripts_dir.exists():
for script_file in scripts_dir.rglob("*"):
if script_file.is_file() and not script_file.name.startswith('.'):
if script_file.is_file() and not script_file.name.startswith("."):
arcname = script_file.relative_to(skill_dir)
zf.write(script_file, str(arcname))
@@ -136,13 +135,13 @@ version: {metadata.version}
assets_dir = skill_dir / "assets"
if assets_dir.exists():
for asset_file in assets_dir.rglob("*"):
if asset_file.is_file() and not asset_file.name.startswith('.'):
if asset_file.is_file() and not asset_file.name.startswith("."):
arcname = asset_file.relative_to(skill_dir)
zf.write(asset_file, str(arcname))
return output_path
def upload(self, package_path: Path, api_key: str, **kwargs) -> Dict[str, Any]:
def upload(self, package_path: Path, api_key: str, **kwargs) -> dict[str, Any]:
"""
Upload skill ZIP to Anthropic Skills API.
@@ -159,130 +158,99 @@ version: {metadata.version}
import requests
except ImportError:
return {
'success': False,
'skill_id': None,
'url': None,
'message': 'requests library not installed. Run: pip install requests'
"success": False,
"skill_id": None,
"url": None,
"message": "requests library not installed. Run: pip install requests",
}
# Validate ZIP file
package_path = Path(package_path)
if not package_path.exists():
return {
'success': False,
'skill_id': None,
'url': None,
'message': f'File not found: {package_path}'
}
return {"success": False, "skill_id": None, "url": None, "message": f"File not found: {package_path}"}
if not package_path.suffix == '.zip':
return {
'success': False,
'skill_id': None,
'url': None,
'message': f'Not a ZIP file: {package_path}'
}
if not package_path.suffix == ".zip":
return {"success": False, "skill_id": None, "url": None, "message": f"Not a ZIP file: {package_path}"}
# Prepare API request
api_url = self.DEFAULT_API_ENDPOINT
headers = {
"x-api-key": api_key,
"anthropic-version": "2023-06-01",
"anthropic-beta": "skills-2025-10-02"
}
headers = {"x-api-key": api_key, "anthropic-version": "2023-06-01", "anthropic-beta": "skills-2025-10-02"}
timeout = kwargs.get('timeout', 60)
timeout = kwargs.get("timeout", 60)
try:
# Read ZIP file
with open(package_path, 'rb') as f:
with open(package_path, "rb") as f:
zip_data = f.read()
# Upload skill
files = {
'files[]': (package_path.name, zip_data, 'application/zip')
}
files = {"files[]": (package_path.name, zip_data, "application/zip")}
response = requests.post(
api_url,
headers=headers,
files=files,
timeout=timeout
)
response = requests.post(api_url, headers=headers, files=files, timeout=timeout)
# Check response
if response.status_code == 200:
# Extract skill ID if available
try:
response_data = response.json()
skill_id = response_data.get('id')
skill_id = response_data.get("id")
except:
skill_id = None
return {
'success': True,
'skill_id': skill_id,
'url': 'https://claude.ai/skills',
'message': 'Skill uploaded successfully to Claude AI'
"success": True,
"skill_id": skill_id,
"url": "https://claude.ai/skills",
"message": "Skill uploaded successfully to Claude AI",
}
elif response.status_code == 401:
return {
'success': False,
'skill_id': None,
'url': None,
'message': 'Authentication failed. Check your ANTHROPIC_API_KEY'
"success": False,
"skill_id": None,
"url": None,
"message": "Authentication failed. Check your ANTHROPIC_API_KEY",
}
elif response.status_code == 400:
try:
error_msg = response.json().get('error', {}).get('message', 'Unknown error')
error_msg = response.json().get("error", {}).get("message", "Unknown error")
except:
error_msg = 'Invalid skill format'
error_msg = "Invalid skill format"
return {
'success': False,
'skill_id': None,
'url': None,
'message': f'Invalid skill format: {error_msg}'
"success": False,
"skill_id": None,
"url": None,
"message": f"Invalid skill format: {error_msg}",
}
else:
try:
error_msg = response.json().get('error', {}).get('message', 'Unknown error')
error_msg = response.json().get("error", {}).get("message", "Unknown error")
except:
error_msg = f'HTTP {response.status_code}'
error_msg = f"HTTP {response.status_code}"
return {
'success': False,
'skill_id': None,
'url': None,
'message': f'Upload failed: {error_msg}'
}
return {"success": False, "skill_id": None, "url": None, "message": f"Upload failed: {error_msg}"}
except requests.exceptions.Timeout:
return {
'success': False,
'skill_id': None,
'url': None,
'message': 'Upload timed out. Try again or use manual upload'
"success": False,
"skill_id": None,
"url": None,
"message": "Upload timed out. Try again or use manual upload",
}
except requests.exceptions.ConnectionError:
return {
'success': False,
'skill_id': None,
'url': None,
'message': 'Connection error. Check your internet connection'
"success": False,
"skill_id": None,
"url": None,
"message": "Connection error. Check your internet connection",
}
except Exception as e:
return {
'success': False,
'skill_id': None,
'url': None,
'message': f'Unexpected error: {str(e)}'
}
return {"success": False, "skill_id": None, "url": None, "message": f"Unexpected error: {str(e)}"}
def validate_api_key(self, api_key: str) -> bool:
"""
@@ -294,7 +262,7 @@ version: {metadata.version}
Returns:
True if key starts with 'sk-ant-'
"""
return api_key.strip().startswith('sk-ant-')
return api_key.strip().startswith("sk-ant-")
def get_env_var_name(self) -> str:
"""
@@ -355,17 +323,13 @@ version: {metadata.version}
# Read current SKILL.md
current_skill_md = None
if skill_md_path.exists():
current_skill_md = skill_md_path.read_text(encoding='utf-8')
current_skill_md = skill_md_path.read_text(encoding="utf-8")
print(f" Found existing SKILL.md ({len(current_skill_md)} chars)")
else:
print(f" No existing SKILL.md, will create new one")
print(" No existing SKILL.md, will create new one")
# Build enhancement prompt
prompt = self._build_enhancement_prompt(
skill_dir.name,
references,
current_skill_md
)
prompt = self._build_enhancement_prompt(skill_dir.name, references, current_skill_md)
print("\n🤖 Asking Claude to enhance SKILL.md...")
print(f" Input: {len(prompt):,} characters")
@@ -377,10 +341,7 @@ version: {metadata.version}
model="claude-sonnet-4-20250514",
max_tokens=4096,
temperature=0.3,
messages=[{
"role": "user",
"content": prompt
}]
messages=[{"role": "user", "content": prompt}],
)
enhanced_content = message.content[0].text
@@ -388,13 +349,13 @@ version: {metadata.version}
# Backup original
if skill_md_path.exists():
backup_path = skill_md_path.with_suffix('.md.backup')
backup_path = skill_md_path.with_suffix(".md.backup")
skill_md_path.rename(backup_path)
print(f" 💾 Backed up original to: {backup_path.name}")
# Save enhanced version
skill_md_path.write_text(enhanced_content, encoding='utf-8')
print(f" ✅ Saved enhanced SKILL.md")
skill_md_path.write_text(enhanced_content, encoding="utf-8")
print(" ✅ Saved enhanced SKILL.md")
return True
@@ -402,7 +363,7 @@ version: {metadata.version}
print(f"❌ Error calling Claude API: {e}")
return False
def _read_reference_files(self, references_dir: Path, max_chars: int = 200000) -> Dict[str, str]:
def _read_reference_files(self, references_dir: Path, max_chars: int = 200000) -> dict[str, str]:
"""
Read reference markdown files from skill directory.
@@ -425,7 +386,7 @@ version: {metadata.version}
break
try:
content = ref_file.read_text(encoding='utf-8')
content = ref_file.read_text(encoding="utf-8")
# Limit individual file size
if len(content) > 30000:
content = content[:30000] + "\n\n...(truncated)"
@@ -439,10 +400,7 @@ version: {metadata.version}
return references
def _build_enhancement_prompt(
self,
skill_name: str,
references: Dict[str, str],
current_skill_md: str = None
self, skill_name: str, references: dict[str, str], current_skill_md: str = None
) -> str:
"""
Build Claude API prompt for enhancement.
@@ -460,9 +418,9 @@ version: {metadata.version}
I've scraped documentation and organized it into reference files. Your job is to create an EXCELLENT SKILL.md that will help Claude use this documentation effectively.
CURRENT SKILL.MD:
{'```markdown' if current_skill_md else '(none - create from scratch)'}
{current_skill_md or 'No existing SKILL.md'}
{'```' if current_skill_md else ''}
{"```markdown" if current_skill_md else "(none - create from scratch)"}
{current_skill_md or "No existing SKILL.md"}
{"```" if current_skill_md else ""}
REFERENCE DOCUMENTATION:
"""

View File

@@ -6,11 +6,11 @@ Implements platform-specific handling for Google Gemini skills.
Uses Gemini Files API for grounding and Gemini 2.0 Flash for enhancement.
"""
import json
import os
import tarfile
import json
from pathlib import Path
from typing import Dict, Any
from typing import Any
from .base import SkillAdaptor, SkillMetadata
@@ -105,20 +105,20 @@ See the references directory for complete documentation with examples and best p
skill_dir = Path(skill_dir)
# Determine output filename
if output_path.is_dir() or str(output_path).endswith('/'):
if output_path.is_dir() or str(output_path).endswith("/"):
output_path = Path(output_path) / f"{skill_dir.name}-gemini.tar.gz"
elif not str(output_path).endswith('.tar.gz'):
elif not str(output_path).endswith(".tar.gz"):
# Replace .zip with .tar.gz if needed
output_str = str(output_path).replace('.zip', '.tar.gz')
if not output_str.endswith('.tar.gz'):
output_str += '.tar.gz'
output_str = str(output_path).replace(".zip", ".tar.gz")
if not output_str.endswith(".tar.gz"):
output_str += ".tar.gz"
output_path = Path(output_str)
output_path = Path(output_path)
output_path.parent.mkdir(parents=True, exist_ok=True)
# Create tar.gz file
with tarfile.open(output_path, 'w:gz') as tar:
with tarfile.open(output_path, "w:gz") as tar:
# Add SKILL.md as system_instructions.md
skill_md = skill_dir / "SKILL.md"
if skill_md.exists():
@@ -128,21 +128,22 @@ See the references directory for complete documentation with examples and best p
refs_dir = skill_dir / "references"
if refs_dir.exists():
for ref_file in refs_dir.rglob("*"):
if ref_file.is_file() and not ref_file.name.startswith('.'):
if ref_file.is_file() and not ref_file.name.startswith("."):
arcname = ref_file.relative_to(skill_dir)
tar.add(ref_file, arcname=str(arcname))
# Create and add metadata file
metadata = {
'platform': 'gemini',
'name': skill_dir.name,
'version': '1.0.0',
'created_with': 'skill-seekers'
"platform": "gemini",
"name": skill_dir.name,
"version": "1.0.0",
"created_with": "skill-seekers",
}
# Write metadata to temp file and add to archive
import tempfile
with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as tmp:
with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as tmp:
json.dump(metadata, tmp, indent=2)
tmp_path = tmp.name
@@ -153,7 +154,7 @@ See the references directory for complete documentation with examples and best p
return output_path
def upload(self, package_path: Path, api_key: str, **kwargs) -> Dict[str, Any]:
def upload(self, package_path: Path, api_key: str, **kwargs) -> dict[str, Any]:
"""
Upload skill tar.gz to Gemini Files API.
@@ -168,30 +169,20 @@ See the references directory for complete documentation with examples and best p
# Validate package file FIRST
package_path = Path(package_path)
if not package_path.exists():
return {
'success': False,
'skill_id': None,
'url': None,
'message': f'File not found: {package_path}'
}
return {"success": False, "skill_id": None, "url": None, "message": f"File not found: {package_path}"}
if not package_path.suffix == '.gz':
return {
'success': False,
'skill_id': None,
'url': None,
'message': f'Not a tar.gz file: {package_path}'
}
if not package_path.suffix == ".gz":
return {"success": False, "skill_id": None, "url": None, "message": f"Not a tar.gz file: {package_path}"}
# Check for google-generativeai library
try:
import google.generativeai as genai
except ImportError:
return {
'success': False,
'skill_id': None,
'url': None,
'message': 'google-generativeai library not installed. Run: pip install google-generativeai'
"success": False,
"skill_id": None,
"url": None,
"message": "google-generativeai library not installed. Run: pip install google-generativeai",
}
# Configure Gemini
@@ -200,11 +191,10 @@ See the references directory for complete documentation with examples and best p
# Extract tar.gz to temp directory
import tempfile
import shutil
with tempfile.TemporaryDirectory() as temp_dir:
# Extract archive
with tarfile.open(package_path, 'r:gz') as tar:
with tarfile.open(package_path, "r:gz") as tar:
tar.extractall(temp_dir)
temp_path = Path(temp_dir)
@@ -213,17 +203,14 @@ See the references directory for complete documentation with examples and best p
main_file = temp_path / "system_instructions.md"
if not main_file.exists():
return {
'success': False,
'skill_id': None,
'url': None,
'message': 'Invalid package: system_instructions.md not found'
"success": False,
"skill_id": None,
"url": None,
"message": "Invalid package: system_instructions.md not found",
}
# Upload to Files API
uploaded_file = genai.upload_file(
path=str(main_file),
display_name=f"{package_path.stem}_instructions"
)
uploaded_file = genai.upload_file(path=str(main_file), display_name=f"{package_path.stem}_instructions")
# Upload reference files (if any)
refs_dir = temp_path / "references"
@@ -231,25 +218,19 @@ See the references directory for complete documentation with examples and best p
if refs_dir.exists():
for ref_file in refs_dir.glob("*.md"):
ref_uploaded = genai.upload_file(
path=str(ref_file),
display_name=f"{package_path.stem}_{ref_file.stem}"
path=str(ref_file), display_name=f"{package_path.stem}_{ref_file.stem}"
)
uploaded_refs.append(ref_uploaded.name)
return {
'success': True,
'skill_id': uploaded_file.name,
'url': f"https://aistudio.google.com/app/files/{uploaded_file.name}",
'message': f'Skill uploaded to Google AI Studio ({len(uploaded_refs) + 1} files)'
"success": True,
"skill_id": uploaded_file.name,
"url": f"https://aistudio.google.com/app/files/{uploaded_file.name}",
"message": f"Skill uploaded to Google AI Studio ({len(uploaded_refs) + 1} files)",
}
except Exception as e:
return {
'success': False,
'skill_id': None,
'url': None,
'message': f'Upload failed: {str(e)}'
}
return {"success": False, "skill_id": None, "url": None, "message": f"Upload failed: {str(e)}"}
def validate_api_key(self, api_key: str) -> bool:
"""
@@ -261,7 +242,7 @@ See the references directory for complete documentation with examples and best p
Returns:
True if key starts with 'AIza'
"""
return api_key.strip().startswith('AIza')
return api_key.strip().startswith("AIza")
def get_env_var_name(self) -> str:
"""
@@ -319,17 +300,13 @@ See the references directory for complete documentation with examples and best p
# Read current SKILL.md
current_skill_md = None
if skill_md_path.exists():
current_skill_md = skill_md_path.read_text(encoding='utf-8')
current_skill_md = skill_md_path.read_text(encoding="utf-8")
print(f" Found existing SKILL.md ({len(current_skill_md)} chars)")
else:
print(f" No existing SKILL.md, will create new one")
print(" No existing SKILL.md, will create new one")
# Build enhancement prompt
prompt = self._build_enhancement_prompt(
skill_dir.name,
references,
current_skill_md
)
prompt = self._build_enhancement_prompt(skill_dir.name, references, current_skill_md)
print("\n🤖 Asking Gemini to enhance SKILL.md...")
print(f" Input: {len(prompt):,} characters")
@@ -337,7 +314,7 @@ See the references directory for complete documentation with examples and best p
try:
genai.configure(api_key=api_key)
model = genai.GenerativeModel('gemini-2.0-flash-exp')
model = genai.GenerativeModel("gemini-2.0-flash-exp")
response = model.generate_content(prompt)
@@ -346,13 +323,13 @@ See the references directory for complete documentation with examples and best p
# Backup original
if skill_md_path.exists():
backup_path = skill_md_path.with_suffix('.md.backup')
backup_path = skill_md_path.with_suffix(".md.backup")
skill_md_path.rename(backup_path)
print(f" 💾 Backed up original to: {backup_path.name}")
# Save enhanced version
skill_md_path.write_text(enhanced_content, encoding='utf-8')
print(f" ✅ Saved enhanced SKILL.md")
skill_md_path.write_text(enhanced_content, encoding="utf-8")
print(" ✅ Saved enhanced SKILL.md")
return True
@@ -360,7 +337,7 @@ See the references directory for complete documentation with examples and best p
print(f"❌ Error calling Gemini API: {e}")
return False
def _read_reference_files(self, references_dir: Path, max_chars: int = 200000) -> Dict[str, str]:
def _read_reference_files(self, references_dir: Path, max_chars: int = 200000) -> dict[str, str]:
"""
Read reference markdown files from skill directory.
@@ -383,7 +360,7 @@ See the references directory for complete documentation with examples and best p
break
try:
content = ref_file.read_text(encoding='utf-8')
content = ref_file.read_text(encoding="utf-8")
# Limit individual file size
if len(content) > 30000:
content = content[:30000] + "\n\n...(truncated)"
@@ -397,10 +374,7 @@ See the references directory for complete documentation with examples and best p
return references
def _build_enhancement_prompt(
self,
skill_name: str,
references: Dict[str, str],
current_skill_md: str = None
self, skill_name: str, references: dict[str, str], current_skill_md: str = None
) -> str:
"""
Build Gemini API prompt for enhancement.
@@ -418,9 +392,9 @@ See the references directory for complete documentation with examples and best p
I've scraped documentation and organized it into reference files. Your job is to create an EXCELLENT markdown documentation file that will help Gemini use this documentation effectively.
CURRENT DOCUMENTATION:
{'```markdown' if current_skill_md else '(none - create from scratch)'}
{current_skill_md or 'No existing documentation'}
{'```' if current_skill_md else ''}
{"```markdown" if current_skill_md else "(none - create from scratch)"}
{current_skill_md or "No existing documentation"}
{"```" if current_skill_md else ""}
REFERENCE DOCUMENTATION:
"""

View File

@@ -8,7 +8,7 @@ No platform-specific features, just clean markdown documentation.
import zipfile
from pathlib import Path
from typing import Dict, Any
from typing import Any
from .base import SkillAdaptor, SkillMetadata
@@ -100,33 +100,33 @@ Browse the reference files for detailed information on each topic. All files are
skill_dir = Path(skill_dir)
# Determine output filename
if output_path.is_dir() or str(output_path).endswith('/'):
if output_path.is_dir() or str(output_path).endswith("/"):
output_path = Path(output_path) / f"{skill_dir.name}-markdown.zip"
elif not str(output_path).endswith('.zip'):
elif not str(output_path).endswith(".zip"):
# Replace extension if needed
output_str = str(output_path).replace('.tar.gz', '.zip')
if not output_str.endswith('-markdown.zip'):
output_str = output_str.replace('.zip', '-markdown.zip')
if not output_str.endswith('.zip'):
output_str += '.zip'
output_str = str(output_path).replace(".tar.gz", ".zip")
if not output_str.endswith("-markdown.zip"):
output_str = output_str.replace(".zip", "-markdown.zip")
if not output_str.endswith(".zip"):
output_str += ".zip"
output_path = Path(output_str)
output_path = Path(output_path)
output_path.parent.mkdir(parents=True, exist_ok=True)
# Create ZIP file
with zipfile.ZipFile(output_path, 'w', zipfile.ZIP_DEFLATED) as zf:
with zipfile.ZipFile(output_path, "w", zipfile.ZIP_DEFLATED) as zf:
# Add SKILL.md as README.md
skill_md = skill_dir / "SKILL.md"
if skill_md.exists():
content = skill_md.read_text(encoding='utf-8')
content = skill_md.read_text(encoding="utf-8")
zf.writestr("README.md", content)
# Add individual reference files
refs_dir = skill_dir / "references"
if refs_dir.exists():
for ref_file in refs_dir.rglob("*.md"):
if ref_file.is_file() and not ref_file.name.startswith('.'):
if ref_file.is_file() and not ref_file.name.startswith("."):
# Preserve directory structure under references/
arcname = ref_file.relative_to(skill_dir)
zf.write(ref_file, str(arcname))
@@ -138,20 +138,21 @@ Browse the reference files for detailed information on each topic. All files are
# Add metadata file
import json
metadata = {
'platform': 'markdown',
'name': skill_dir.name,
'version': '1.0.0',
'created_with': 'skill-seekers',
'format': 'universal_markdown',
'usage': 'Use with any LLM or documentation system'
"platform": "markdown",
"name": skill_dir.name,
"version": "1.0.0",
"created_with": "skill-seekers",
"format": "universal_markdown",
"usage": "Use with any LLM or documentation system",
}
zf.writestr("metadata.json", json.dumps(metadata, indent=2))
return output_path
def upload(self, package_path: Path, api_key: str, **kwargs) -> Dict[str, Any]:
def upload(self, package_path: Path, api_key: str, **kwargs) -> dict[str, Any]:
"""
Generic markdown export does not support upload.
@@ -166,13 +167,13 @@ Browse the reference files for detailed information on each topic. All files are
Result indicating no upload capability
"""
return {
'success': False,
'skill_id': None,
'url': str(package_path.absolute()),
'message': (
'Generic markdown export does not support automatic upload. '
f'Your documentation is packaged at: {package_path.absolute()}'
)
"success": False,
"skill_id": None,
"url": str(package_path.absolute()),
"message": (
"Generic markdown export does not support automatic upload. "
f"Your documentation is packaged at: {package_path.absolute()}"
),
}
def validate_api_key(self, api_key: str) -> bool:
@@ -237,10 +238,10 @@ Browse the reference files for detailed information on each topic. All files are
# Add main content
if skill_md.exists():
content = skill_md.read_text(encoding='utf-8')
content = skill_md.read_text(encoding="utf-8")
# Strip YAML frontmatter if present
if content.startswith('---'):
parts = content.split('---', 2)
if content.startswith("---"):
parts = content.split("---", 2)
if len(parts) >= 3:
content = parts[2].strip()
combined_parts.append(content)
@@ -258,7 +259,7 @@ Browse the reference files for detailed information on each topic. All files are
continue # Skip index
try:
ref_content = ref_file.read_text(encoding='utf-8')
ref_content = ref_file.read_text(encoding="utf-8")
combined_parts.append(f"# {ref_file.stem.replace('_', ' ').title()}\n\n")
combined_parts.append(ref_content)
combined_parts.append("\n\n---\n\n")

View File

@@ -6,11 +6,10 @@ Implements platform-specific handling for OpenAI ChatGPT Assistants.
Uses Assistants API with Vector Store for file search.
"""
import os
import zipfile
import json
import zipfile
from pathlib import Path
from typing import Dict, Any
from typing import Any
from .base import SkillAdaptor, SkillMetadata
@@ -123,51 +122,51 @@ Always prioritize accuracy by consulting the attached documentation files before
skill_dir = Path(skill_dir)
# Determine output filename
if output_path.is_dir() or str(output_path).endswith('/'):
if output_path.is_dir() or str(output_path).endswith("/"):
output_path = Path(output_path) / f"{skill_dir.name}-openai.zip"
elif not str(output_path).endswith('.zip'):
elif not str(output_path).endswith(".zip"):
# Keep .zip extension
if not str(output_path).endswith('-openai.zip'):
output_str = str(output_path).replace('.zip', '-openai.zip')
if not output_str.endswith('.zip'):
output_str += '.zip'
if not str(output_path).endswith("-openai.zip"):
output_str = str(output_path).replace(".zip", "-openai.zip")
if not output_str.endswith(".zip"):
output_str += ".zip"
output_path = Path(output_str)
output_path = Path(output_path)
output_path.parent.mkdir(parents=True, exist_ok=True)
# Create ZIP file
with zipfile.ZipFile(output_path, 'w', zipfile.ZIP_DEFLATED) as zf:
with zipfile.ZipFile(output_path, "w", zipfile.ZIP_DEFLATED) as zf:
# Add SKILL.md as assistant_instructions.txt
skill_md = skill_dir / "SKILL.md"
if skill_md.exists():
instructions = skill_md.read_text(encoding='utf-8')
instructions = skill_md.read_text(encoding="utf-8")
zf.writestr("assistant_instructions.txt", instructions)
# Add references directory as vector_store_files/
refs_dir = skill_dir / "references"
if refs_dir.exists():
for ref_file in refs_dir.rglob("*.md"):
if ref_file.is_file() and not ref_file.name.startswith('.'):
if ref_file.is_file() and not ref_file.name.startswith("."):
# Place all reference files in vector_store_files/
arcname = f"vector_store_files/{ref_file.name}"
zf.write(ref_file, arcname)
# Create and add metadata file
metadata = {
'platform': 'openai',
'name': skill_dir.name,
'version': '1.0.0',
'created_with': 'skill-seekers',
'model': 'gpt-4o',
'tools': ['file_search']
"platform": "openai",
"name": skill_dir.name,
"version": "1.0.0",
"created_with": "skill-seekers",
"model": "gpt-4o",
"tools": ["file_search"],
}
zf.writestr("openai_metadata.json", json.dumps(metadata, indent=2))
return output_path
def upload(self, package_path: Path, api_key: str, **kwargs) -> Dict[str, Any]:
def upload(self, package_path: Path, api_key: str, **kwargs) -> dict[str, Any]:
"""
Upload skill ZIP to OpenAI Assistants API.
@@ -186,30 +185,20 @@ Always prioritize accuracy by consulting the attached documentation files before
# Validate package file FIRST
package_path = Path(package_path)
if not package_path.exists():
return {
'success': False,
'skill_id': None,
'url': None,
'message': f'File not found: {package_path}'
}
return {"success": False, "skill_id": None, "url": None, "message": f"File not found: {package_path}"}
if not package_path.suffix == '.zip':
return {
'success': False,
'skill_id': None,
'url': None,
'message': f'Not a ZIP file: {package_path}'
}
if not package_path.suffix == ".zip":
return {"success": False, "skill_id": None, "url": None, "message": f"Not a ZIP file: {package_path}"}
# Check for openai library
try:
from openai import OpenAI
except ImportError:
return {
'success': False,
'skill_id': None,
'url': None,
'message': 'openai library not installed. Run: pip install openai'
"success": False,
"skill_id": None,
"url": None,
"message": "openai library not installed. Run: pip install openai",
}
# Configure OpenAI client
@@ -218,11 +207,10 @@ Always prioritize accuracy by consulting the attached documentation files before
# Extract package to temp directory
import tempfile
import shutil
with tempfile.TemporaryDirectory() as temp_dir:
# Extract ZIP
with zipfile.ZipFile(package_path, 'r') as zf:
with zipfile.ZipFile(package_path, "r") as zf:
zf.extractall(temp_dir)
temp_path = Path(temp_dir)
@@ -231,29 +219,27 @@ Always prioritize accuracy by consulting the attached documentation files before
instructions_file = temp_path / "assistant_instructions.txt"
if not instructions_file.exists():
return {
'success': False,
'skill_id': None,
'url': None,
'message': 'Invalid package: assistant_instructions.txt not found'
"success": False,
"skill_id": None,
"url": None,
"message": "Invalid package: assistant_instructions.txt not found",
}
instructions = instructions_file.read_text(encoding='utf-8')
instructions = instructions_file.read_text(encoding="utf-8")
# Read metadata
metadata_file = temp_path / "openai_metadata.json"
skill_name = package_path.stem
model = kwargs.get('model', 'gpt-4o')
model = kwargs.get("model", "gpt-4o")
if metadata_file.exists():
with open(metadata_file, 'r') as f:
with open(metadata_file) as f:
metadata = json.load(f)
skill_name = metadata.get('name', skill_name)
model = metadata.get('model', model)
skill_name = metadata.get("name", skill_name)
model = metadata.get("model", model)
# Create vector store
vector_store = client.beta.vector_stores.create(
name=f"{skill_name} Documentation"
)
vector_store = client.beta.vector_stores.create(name=f"{skill_name} Documentation")
# Upload reference files to vector store
vector_files_dir = temp_path / "vector_store_files"
@@ -262,19 +248,13 @@ Always prioritize accuracy by consulting the attached documentation files before
if vector_files_dir.exists():
for ref_file in vector_files_dir.glob("*.md"):
# Upload file
with open(ref_file, 'rb') as f:
uploaded_file = client.files.create(
file=f,
purpose='assistants'
)
with open(ref_file, "rb") as f:
uploaded_file = client.files.create(file=f, purpose="assistants")
file_ids.append(uploaded_file.id)
# Attach files to vector store
if file_ids:
client.beta.vector_stores.files.create_batch(
vector_store_id=vector_store.id,
file_ids=file_ids
)
client.beta.vector_stores.files.create_batch(vector_store_id=vector_store.id, file_ids=file_ids)
# Create assistant
assistant = client.beta.assistants.create(
@@ -282,27 +262,18 @@ Always prioritize accuracy by consulting the attached documentation files before
instructions=instructions,
model=model,
tools=[{"type": "file_search"}],
tool_resources={
"file_search": {
"vector_store_ids": [vector_store.id]
}
}
tool_resources={"file_search": {"vector_store_ids": [vector_store.id]}},
)
return {
'success': True,
'skill_id': assistant.id,
'url': f"https://platform.openai.com/assistants/{assistant.id}",
'message': f'Assistant created with {len(file_ids)} knowledge files'
"success": True,
"skill_id": assistant.id,
"url": f"https://platform.openai.com/assistants/{assistant.id}",
"message": f"Assistant created with {len(file_ids)} knowledge files",
}
except Exception as e:
return {
'success': False,
'skill_id': None,
'url': None,
'message': f'Upload failed: {str(e)}'
}
return {"success": False, "skill_id": None, "url": None, "message": f"Upload failed: {str(e)}"}
def validate_api_key(self, api_key: str) -> bool:
    """
    Check that an API key has the expected 'sk-' prefix.

    This is a format check only: the key is inspected locally and no
    request is made with it.

    Args:
        api_key: Candidate API key. Leading and trailing whitespace is
            ignored.

    Returns:
        True if key starts with 'sk-' after stripping whitespace. A value
        that is not a string (for example None when no key was supplied)
        yields False instead of raising.
    """
    # A missing key must fail validation cleanly; None.strip() would raise AttributeError.
    if not isinstance(api_key, str):
        return False
    return api_key.strip().startswith("sk-")
def get_env_var_name(self) -> str:
"""
@@ -372,17 +343,13 @@ Always prioritize accuracy by consulting the attached documentation files before
# Read current SKILL.md
current_skill_md = None
if skill_md_path.exists():
current_skill_md = skill_md_path.read_text(encoding='utf-8')
current_skill_md = skill_md_path.read_text(encoding="utf-8")
print(f" Found existing SKILL.md ({len(current_skill_md)} chars)")
else:
print(f" No existing SKILL.md, will create new one")
print(" No existing SKILL.md, will create new one")
# Build enhancement prompt
prompt = self._build_enhancement_prompt(
skill_dir.name,
references,
current_skill_md
)
prompt = self._build_enhancement_prompt(skill_dir.name, references, current_skill_md)
print("\n🤖 Asking GPT-4o to enhance SKILL.md...")
print(f" Input: {len(prompt):,} characters")
@@ -395,15 +362,12 @@ Always prioritize accuracy by consulting the attached documentation files before
messages=[
{
"role": "system",
"content": "You are an expert technical writer creating Assistant instructions for OpenAI ChatGPT."
"content": "You are an expert technical writer creating Assistant instructions for OpenAI ChatGPT.",
},
{
"role": "user",
"content": prompt
}
{"role": "user", "content": prompt},
],
temperature=0.3,
max_tokens=4096
max_tokens=4096,
)
enhanced_content = response.choices[0].message.content
@@ -411,13 +375,13 @@ Always prioritize accuracy by consulting the attached documentation files before
# Backup original
if skill_md_path.exists():
backup_path = skill_md_path.with_suffix('.md.backup')
backup_path = skill_md_path.with_suffix(".md.backup")
skill_md_path.rename(backup_path)
print(f" 💾 Backed up original to: {backup_path.name}")
# Save enhanced version
skill_md_path.write_text(enhanced_content, encoding='utf-8')
print(f" ✅ Saved enhanced SKILL.md")
skill_md_path.write_text(enhanced_content, encoding="utf-8")
print(" ✅ Saved enhanced SKILL.md")
return True
@@ -425,7 +389,7 @@ Always prioritize accuracy by consulting the attached documentation files before
print(f"❌ Error calling OpenAI API: {e}")
return False
def _read_reference_files(self, references_dir: Path, max_chars: int = 200000) -> Dict[str, str]:
def _read_reference_files(self, references_dir: Path, max_chars: int = 200000) -> dict[str, str]:
"""
Read reference markdown files from skill directory.
@@ -448,7 +412,7 @@ Always prioritize accuracy by consulting the attached documentation files before
break
try:
content = ref_file.read_text(encoding='utf-8')
content = ref_file.read_text(encoding="utf-8")
# Limit individual file size
if len(content) > 30000:
content = content[:30000] + "\n\n...(truncated)"
@@ -462,10 +426,7 @@ Always prioritize accuracy by consulting the attached documentation files before
return references
def _build_enhancement_prompt(
self,
skill_name: str,
references: Dict[str, str],
current_skill_md: str = None
self, skill_name: str, references: dict[str, str], current_skill_md: str = None
) -> str:
"""
Build OpenAI API prompt for enhancement.
@@ -483,9 +444,9 @@ Always prioritize accuracy by consulting the attached documentation files before
I've scraped documentation and organized it into reference files. Your job is to create EXCELLENT Assistant instructions that will help the Assistant use this documentation effectively.
CURRENT INSTRUCTIONS:
{'```' if current_skill_md else '(none - create from scratch)'}
{current_skill_md or 'No existing instructions'}
{'```' if current_skill_md else ''}
{"```" if current_skill_md else "(none - create from scratch)"}
{current_skill_md or "No existing instructions"}
{"```" if current_skill_md else ""}
REFERENCE DOCUMENTATION:
"""

View File

@@ -17,9 +17,8 @@ Credits:
- Graceful degradation if API unavailable
"""
import os
import logging
from typing import List, Dict, Optional, Any
import os
from dataclasses import dataclass
logger = logging.getLogger(__name__)
@@ -28,18 +27,19 @@ logger = logging.getLogger(__name__)
@dataclass
class AIAnalysis:
    """
    AI analysis result for patterns or examples.

    Attributes:
        explanation: Explanation text for the analyzed item.
        issues: Issues listed for the analyzed item.
        recommendations: Recommendations listed for the analyzed item.
        related_items: Related patterns or examples.
        best_practices: Best practices listed for the analyzed item.
        confidence_boost: Adjustment to confidence, intended to lie between
            -0.2 and +0.2. The range is not enforced by this class.
    """

    explanation: str
    issues: list[str]
    recommendations: list[str]
    related_items: list[str]
    best_practices: list[str]
    confidence_boost: float
class AIEnhancer:
"""Base class for AI enhancement"""
def __init__(self, api_key: Optional[str] = None, enabled: bool = True, mode: str = "auto"):
def __init__(self, api_key: str | None = None, enabled: bool = True, mode: str = "auto"):
"""
Initialize AI enhancer.
@@ -53,7 +53,7 @@ class AIEnhancer:
"""
self.enabled = enabled
self.mode = mode
self.api_key = api_key or os.environ.get('ANTHROPIC_API_KEY')
self.api_key = api_key or os.environ.get("ANTHROPIC_API_KEY")
self.client = None
# Determine actual mode
@@ -72,6 +72,7 @@ class AIEnhancer:
if self.mode == "api" and self.enabled:
try:
import anthropic
self.client = anthropic.Anthropic(api_key=self.api_key)
logger.info("✅ AI enhancement enabled (using Claude API)")
except ImportError:
@@ -88,16 +89,14 @@ class AIEnhancer:
logger.info(" Use API mode (set ANTHROPIC_API_KEY) or 'skill-seekers enhance' for SKILL.md")
self.enabled = False
def _call_claude(self, prompt: str, max_tokens: int = 1000) -> Optional[str]:
def _call_claude(self, prompt: str, max_tokens: int = 1000) -> str | None:
"""Call Claude API with error handling"""
if not self.client:
return None
try:
response = self.client.messages.create(
model="claude-sonnet-4-20250514",
max_tokens=max_tokens,
messages=[{"role": "user", "content": prompt}]
model="claude-sonnet-4-20250514", max_tokens=max_tokens, messages=[{"role": "user", "content": prompt}]
)
return response.content[0].text
except Exception as e:
@@ -108,7 +107,7 @@ class AIEnhancer:
class PatternEnhancer(AIEnhancer):
"""Enhance design pattern detection with AI analysis"""
def enhance_patterns(self, patterns: List[Dict]) -> List[Dict]:
def enhance_patterns(self, patterns: list[dict]) -> list[dict]:
"""
Enhance detected patterns with AI analysis.
@@ -128,19 +127,19 @@ class PatternEnhancer(AIEnhancer):
enhanced = []
for i in range(0, len(patterns), batch_size):
batch = patterns[i:i+batch_size]
batch = patterns[i : i + batch_size]
batch_results = self._enhance_pattern_batch(batch)
enhanced.extend(batch_results)
logger.info(f"✅ Enhanced {len(enhanced)} patterns")
return enhanced
def _enhance_pattern_batch(self, patterns: List[Dict]) -> List[Dict]:
def _enhance_pattern_batch(self, patterns: list[dict]) -> list[dict]:
"""Enhance a batch of patterns"""
# Prepare prompt
pattern_descriptions = []
for idx, p in enumerate(patterns):
desc = f"{idx+1}. {p['pattern_type']} in {p.get('class_name', 'unknown')}"
desc = f"{idx + 1}. {p['pattern_type']} in {p.get('class_name', 'unknown')}"
desc += f"\n Evidence: {', '.join(p.get('evidence', []))}"
pattern_descriptions.append(desc)
@@ -166,24 +165,25 @@ Format as JSON array matching input order. Be concise and actionable.
try:
import json
analyses = json.loads(response)
# Merge AI analysis into patterns
for idx, pattern in enumerate(patterns):
if idx < len(analyses):
analysis = analyses[idx]
pattern['ai_analysis'] = {
'explanation': analysis.get('explanation', ''),
'issues': analysis.get('issues', []),
'recommendations': analysis.get('recommendations', []),
'related_patterns': analysis.get('related_patterns', []),
'confidence_boost': analysis.get('confidence_boost', 0.0)
pattern["ai_analysis"] = {
"explanation": analysis.get("explanation", ""),
"issues": analysis.get("issues", []),
"recommendations": analysis.get("recommendations", []),
"related_patterns": analysis.get("related_patterns", []),
"confidence_boost": analysis.get("confidence_boost", 0.0),
}
# Adjust confidence
boost = analysis.get('confidence_boost', 0.0)
boost = analysis.get("confidence_boost", 0.0)
if -0.2 <= boost <= 0.2:
pattern['confidence'] = min(1.0, max(0.0, pattern['confidence'] + boost))
pattern["confidence"] = min(1.0, max(0.0, pattern["confidence"] + boost))
return patterns
@@ -198,7 +198,7 @@ Format as JSON array matching input order. Be concise and actionable.
class TestExampleEnhancer(AIEnhancer):
"""Enhance test examples with AI analysis"""
def enhance_examples(self, examples: List[Dict]) -> List[Dict]:
def enhance_examples(self, examples: list[dict]) -> list[dict]:
"""
Enhance test examples with AI context and explanations.
@@ -218,21 +218,21 @@ class TestExampleEnhancer(AIEnhancer):
enhanced = []
for i in range(0, len(examples), batch_size):
batch = examples[i:i+batch_size]
batch = examples[i : i + batch_size]
batch_results = self._enhance_example_batch(batch)
enhanced.extend(batch_results)
logger.info(f"✅ Enhanced {len(enhanced)} examples")
return enhanced
def _enhance_example_batch(self, examples: List[Dict]) -> List[Dict]:
def _enhance_example_batch(self, examples: list[dict]) -> list[dict]:
"""Enhance a batch of examples"""
# Prepare prompt
example_descriptions = []
for idx, ex in enumerate(examples):
desc = f"{idx+1}. {ex.get('category', 'unknown')} - {ex.get('test_name', 'unknown')}"
desc = f"{idx + 1}. {ex.get('category', 'unknown')} - {ex.get('test_name', 'unknown')}"
desc += f"\n Code: {ex.get('code', '')[:100]}..."
if ex.get('expected_behavior'):
if ex.get("expected_behavior"):
desc += f"\n Expected: {ex['expected_behavior']}"
example_descriptions.append(desc)
@@ -257,18 +257,19 @@ Format as JSON array matching input order. Focus on educational value.
try:
import json
analyses = json.loads(response)
# Merge AI analysis into examples
for idx, example in enumerate(examples):
if idx < len(analyses):
analysis = analyses[idx]
example['ai_analysis'] = {
'explanation': analysis.get('explanation', ''),
'best_practices': analysis.get('best_practices', []),
'common_mistakes': analysis.get('common_mistakes', []),
'related_examples': analysis.get('related_examples', []),
'tutorial_group': analysis.get('tutorial_group', '')
example["ai_analysis"] = {
"explanation": analysis.get("explanation", ""),
"best_practices": analysis.get("best_practices", []),
"common_mistakes": analysis.get("common_mistakes", []),
"related_examples": analysis.get("related_examples", []),
"tutorial_group": analysis.get("tutorial_group", ""),
}
return examples
@@ -280,7 +281,7 @@ Format as JSON array matching input order. Focus on educational value.
logger.warning(f"⚠️ Error processing AI analysis: {e}")
return examples
def generate_tutorials(self, examples: List[Dict]) -> Dict[str, List[Dict]]:
def generate_tutorials(self, examples: list[dict]) -> dict[str, list[dict]]:
"""
Group enhanced examples into tutorial sections.
@@ -293,8 +294,8 @@ Format as JSON array matching input order. Focus on educational value.
tutorials = {}
for example in examples:
ai_analysis = example.get('ai_analysis', {})
group = ai_analysis.get('tutorial_group', 'Miscellaneous')
ai_analysis = example.get("ai_analysis", {})
group = ai_analysis.get("tutorial_group", "Miscellaneous")
if group not in tutorials:
tutorials[group] = []

View File

@@ -17,10 +17,9 @@ Usage:
builder.build_reference(output_dir)
"""
import os
import json
from pathlib import Path
from typing import Dict, List, Any, Optional
from typing import Any
class APIReferenceBuilder:
@@ -31,7 +30,7 @@ class APIReferenceBuilder:
documentation for each analyzed source file.
"""
def __init__(self, code_analysis: Dict[str, Any]):
def __init__(self, code_analysis: dict[str, Any]):
"""
Initialize builder with code analysis results.
@@ -40,9 +39,9 @@ class APIReferenceBuilder:
Expected format: {'files': [{'file': 'path', 'classes': [...], 'functions': [...]}]}
"""
self.code_analysis = code_analysis
self.files_data = code_analysis.get('files', [])
self.files_data = code_analysis.get("files", [])
def build_reference(self, output_dir: Path) -> Dict[str, Path]:
def build_reference(self, output_dir: Path) -> dict[str, Path]:
"""
Generate markdown files for each analyzed source file.
@@ -58,11 +57,11 @@ class APIReferenceBuilder:
generated_files = {}
for file_data in self.files_data:
source_file = file_data.get('file', 'unknown')
language = file_data.get('language', 'Unknown')
source_file = file_data.get("file", "unknown")
language = file_data.get("language", "Unknown")
# Skip files with no analysis
if not file_data.get('classes') and not file_data.get('functions'):
if not file_data.get("classes") and not file_data.get("functions"):
continue
# Generate markdown content
@@ -73,7 +72,7 @@ class APIReferenceBuilder:
output_path = output_dir / output_filename
# Write markdown file
output_path.write_text(markdown_content, encoding='utf-8')
output_path.write_text(markdown_content, encoding="utf-8")
generated_files[source_file] = output_path
return generated_files
@@ -92,11 +91,10 @@ class APIReferenceBuilder:
basename = Path(source_file).name
# Replace extension with .md
name_without_ext = basename.rsplit('.', 1)[0] if '.' in basename else basename
name_without_ext = basename.rsplit(".", 1)[0] if "." in basename else basename
return f"{name_without_ext}.md"
def _generate_file_reference(self, file_data: Dict[str, Any],
source_file: str, language: str) -> str:
def _generate_file_reference(self, file_data: dict[str, Any], source_file: str, language: str) -> str:
"""
Generate complete markdown reference for a single file.
@@ -118,7 +116,7 @@ class APIReferenceBuilder:
lines.append("---\n")
# Classes section
classes = file_data.get('classes', [])
classes = file_data.get("classes", [])
if classes:
lines.append("## Classes\n")
for cls in classes:
@@ -126,16 +124,16 @@ class APIReferenceBuilder:
lines.append("\n")
# Functions section
functions = file_data.get('functions', [])
functions = file_data.get("functions", [])
if functions:
lines.append("## Functions\n")
for func in functions:
lines.append(self._format_function(func))
lines.append("\n")
return '\n'.join(lines)
return "\n".join(lines)
def _format_class(self, class_sig: Dict[str, Any]) -> str:
def _format_class(self, class_sig: dict[str, Any]) -> str:
"""
Format class signature as markdown.
@@ -148,33 +146,33 @@ class APIReferenceBuilder:
lines = []
# Class name
class_name = class_sig.get('name', 'Unknown')
class_name = class_sig.get("name", "Unknown")
lines.append(f"### {class_name}\n")
# Docstring
docstring = class_sig.get('docstring')
docstring = class_sig.get("docstring")
if docstring:
lines.append(f"{docstring}\n")
# Inheritance
base_classes = class_sig.get('base_classes', [])
base_classes = class_sig.get("base_classes", [])
if base_classes:
bases_str = ', '.join(base_classes)
bases_str = ", ".join(base_classes)
lines.append(f"**Inherits from**: {bases_str}\n")
else:
lines.append("**Inherits from**: (none)\n")
# Methods
methods = class_sig.get('methods', [])
methods = class_sig.get("methods", [])
if methods:
lines.append("#### Methods\n")
for method in methods:
lines.append(self._format_method(method))
lines.append("")
return '\n'.join(lines)
return "\n".join(lines)
def _format_method(self, method_sig: Dict[str, Any]) -> str:
def _format_method(self, method_sig: dict[str, Any]) -> str:
"""
Format method signature as markdown.
@@ -191,30 +189,30 @@ class APIReferenceBuilder:
lines.append(f"##### {signature}\n")
# Docstring
docstring = method_sig.get('docstring')
docstring = method_sig.get("docstring")
if docstring:
lines.append(f"{docstring}\n")
# Decorators
decorators = method_sig.get('decorators', [])
decorators = method_sig.get("decorators", [])
if decorators:
dec_str = ', '.join(f"`@{d}`" for d in decorators)
dec_str = ", ".join(f"`@{d}`" for d in decorators)
lines.append(f"**Decorators**: {dec_str}\n")
# Parameters table
params = method_sig.get('parameters', [])
params = method_sig.get("parameters", [])
if params:
lines.append(self._format_parameters(params))
lines.append("")
# Return type
return_type = method_sig.get('return_type')
return_type = method_sig.get("return_type")
if return_type:
lines.append(f"**Returns**: `{return_type}`\n")
return '\n'.join(lines)
return "\n".join(lines)
def _format_function(self, func_sig: Dict[str, Any]) -> str:
def _format_function(self, func_sig: dict[str, Any]) -> str:
"""
Format function signature as markdown.
@@ -231,30 +229,30 @@ class APIReferenceBuilder:
lines.append(f"### {signature}\n")
# Async indicator
if func_sig.get('is_async'):
if func_sig.get("is_async"):
lines.append("**Async function**\n")
# Docstring
docstring = func_sig.get('docstring')
docstring = func_sig.get("docstring")
if docstring:
lines.append(f"{docstring}\n")
# Parameters table
params = func_sig.get('parameters', [])
params = func_sig.get("parameters", [])
if params:
lines.append(self._format_parameters(params))
lines.append("")
# Return type
return_type = func_sig.get('return_type')
return_type = func_sig.get("return_type")
if return_type:
lines.append(f"**Returns**: `{return_type}`\n")
else:
lines.append("**Returns**: (none)\n")
return '\n'.join(lines)
return "\n".join(lines)
def _build_signature(self, sig: Dict[str, Any]) -> str:
def _build_signature(self, sig: dict[str, Any]) -> str:
"""
Build function/method signature string.
@@ -264,28 +262,28 @@ class APIReferenceBuilder:
Returns:
Formatted signature string
"""
name = sig.get('name', 'unknown')
params = sig.get('parameters', [])
return_type = sig.get('return_type')
name = sig.get("name", "unknown")
params = sig.get("parameters", [])
return_type = sig.get("return_type")
# Build parameter list
param_strs = []
for param in params:
param_str = param.get('name', '')
param_str = param.get("name", "")
# Add type hint if available
type_hint = param.get('type_hint')
type_hint = param.get("type_hint")
if type_hint:
param_str += f": {type_hint}"
# Add default value if available
default = param.get('default')
default = param.get("default")
if default:
param_str += f" = {default}"
param_strs.append(param_str)
params_str = ', '.join(param_strs)
params_str = ", ".join(param_strs)
# Build full signature
if return_type:
@@ -293,7 +291,7 @@ class APIReferenceBuilder:
else:
return f"{name}({params_str})"
def _format_parameters(self, params: List[Dict]) -> str:
def _format_parameters(self, params: list[dict]) -> str:
"""
Format parameter list as markdown table.
@@ -313,19 +311,19 @@ class APIReferenceBuilder:
lines.append("|------|------|---------|-------------|")
for param in params:
name = param.get('name', '-')
type_hint = param.get('type_hint', '-')
default = param.get('default')
name = param.get("name", "-")
type_hint = param.get("type_hint", "-")
default = param.get("default")
# Show "-" for parameters without defaults
default_str = default if default is not None else '-'
default_str = default if default is not None else "-"
# For description, use empty for now (would need JSDoc/docstring parsing)
description = "-"
lines.append(f"| {name} | {type_hint} | {default_str} | {description} |")
return '\n'.join(lines)
return "\n".join(lines)
def main():
@@ -336,12 +334,10 @@ def main():
"""
import argparse
parser = argparse.ArgumentParser(
description='Generate API reference from code analysis results'
)
parser = argparse.ArgumentParser(description="Generate API reference from code analysis results")
parser.add_argument('input_file', help='Code analysis JSON file')
parser.add_argument('output_dir', help='Output directory for markdown files')
parser.add_argument("input_file", help="Code analysis JSON file")
parser.add_argument("output_dir", help="Output directory for markdown files")
args = parser.parse_args()
@@ -351,7 +347,7 @@ def main():
print(f"Error: Input file not found: {input_path}")
return 1
with open(input_path, 'r', encoding='utf-8') as f:
with open(input_path, encoding="utf-8") as f:
code_analysis = json.load(f)
# Build API reference
@@ -367,6 +363,7 @@ def main():
return 0
if __name__ == "__main__":
    # Script entry point: hand main()'s return value to the interpreter as the exit status.
    import sys

    exit_status = main()
    sys.exit(exit_status)

View File

@@ -21,11 +21,9 @@ Credits:
"""
import logging
import re
from dataclasses import dataclass, field
from pathlib import Path
from typing import List, Dict, Optional, Set
from collections import defaultdict
from dataclasses import dataclass
from pathlib import Path
logger = logging.getLogger(__name__)
@@ -33,41 +31,43 @@ logger = logging.getLogger(__name__)
@dataclass
class ArchitecturalPattern:
"""Detected architectural pattern"""
pattern_name: str # e.g., "MVC", "MVVM", "Repository"
confidence: float # 0.0-1.0
evidence: List[str] # List of evidence supporting detection
components: Dict[str, List[str]] # Component type -> file paths
framework: Optional[str] = None # Detected framework (Django, Spring, etc.)
evidence: list[str] # List of evidence supporting detection
components: dict[str, list[str]] # Component type -> file paths
framework: str | None = None # Detected framework (Django, Spring, etc.)
description: str = "" # Human-readable description
@dataclass
class ArchitecturalReport:
    """
    Complete architectural analysis report.

    Attributes:
        patterns: Detected architectural patterns.
        directory_structure: Mapping of directory name to file count.
        total_files_analyzed: Number of files covered by the analysis.
        frameworks_detected: Names of the frameworks that were detected.
        ai_analysis: AI enhancement (C3.6 integration); None by default.
    """

    patterns: list[ArchitecturalPattern]
    directory_structure: dict[str, int]
    total_files_analyzed: int
    frameworks_detected: list[str]
    ai_analysis: dict | None = None

    def to_dict(self) -> dict:
        """
        Export to dictionary.

        Each pattern becomes a plain dict of its six fields. Nested lists
        and dicts are referenced as-is, not copied.

        Returns:
            Dictionary with the keys 'patterns', 'directory_structure',
            'total_files_analyzed', 'frameworks_detected' and 'ai_analysis'.
        """
        exported_patterns = []
        for pattern in self.patterns:
            entry = {
                "pattern_name": pattern.pattern_name,
                "confidence": pattern.confidence,
                "evidence": pattern.evidence,
                "components": pattern.components,
                "framework": pattern.framework,
                "description": pattern.description,
            }
            exported_patterns.append(entry)

        report = {
            "patterns": exported_patterns,
            "directory_structure": self.directory_structure,
            "total_files_analyzed": self.total_files_analyzed,
            "frameworks_detected": self.frameworks_detected,
            "ai_analysis": self.ai_analysis,
        }
        return report
@@ -79,25 +79,25 @@ class ArchitecturalPatternDetector:
"""
# Common directory patterns for architectures
MVC_DIRS = {'models', 'views', 'controllers', 'model', 'view', 'controller'}
MVVM_DIRS = {'models', 'views', 'viewmodels', 'viewmodel'}
LAYERED_DIRS = {'presentation', 'business', 'data', 'dal', 'bll', 'ui'}
CLEAN_ARCH_DIRS = {'domain', 'application', 'infrastructure', 'presentation'}
REPO_DIRS = {'repositories', 'repository'}
SERVICE_DIRS = {'services', 'service'}
MVC_DIRS = {"models", "views", "controllers", "model", "view", "controller"}
MVVM_DIRS = {"models", "views", "viewmodels", "viewmodel"}
LAYERED_DIRS = {"presentation", "business", "data", "dal", "bll", "ui"}
CLEAN_ARCH_DIRS = {"domain", "application", "infrastructure", "presentation"}
REPO_DIRS = {"repositories", "repository"}
SERVICE_DIRS = {"services", "service"}
# Framework detection patterns
FRAMEWORK_MARKERS = {
'Django': ['django', 'manage.py', 'settings.py', 'urls.py'],
'Flask': ['flask', 'app.py', 'wsgi.py'],
'Spring': ['springframework', '@Controller', '@Service', '@Repository'],
'ASP.NET': ['Controllers', 'Models', 'Views', '.cshtml', 'Startup.cs'],
'Rails': ['app/models', 'app/views', 'app/controllers', 'config/routes.rb'],
'Angular': ['app.module.ts', '@Component', '@Injectable', 'angular.json'],
'React': ['package.json', 'react', 'components'],
'Vue.js': ['vue', '.vue', 'components'],
'Express': ['express', 'app.js', 'routes'],
'Laravel': ['artisan', 'app/Http/Controllers', 'app/Models']
"Django": ["django", "manage.py", "settings.py", "urls.py"],
"Flask": ["flask", "app.py", "wsgi.py"],
"Spring": ["springframework", "@Controller", "@Service", "@Repository"],
"ASP.NET": ["Controllers", "Models", "Views", ".cshtml", "Startup.cs"],
"Rails": ["app/models", "app/views", "app/controllers", "config/routes.rb"],
"Angular": ["app.module.ts", "@Component", "@Injectable", "angular.json"],
"React": ["package.json", "react", "components"],
"Vue.js": ["vue", ".vue", "components"],
"Express": ["express", "app.js", "routes"],
"Laravel": ["artisan", "app/Http/Controllers", "app/Models"],
}
def __init__(self, enhance_with_ai: bool = True):
@@ -113,12 +113,13 @@ class ArchitecturalPatternDetector:
if self.enhance_with_ai:
try:
from skill_seekers.cli.ai_enhancer import AIEnhancer
self.ai_enhancer = AIEnhancer()
except Exception as e:
logger.warning(f"⚠️ Failed to initialize AI enhancer: {e}")
self.enhance_with_ai = False
def analyze(self, directory: Path, files_analysis: List[Dict]) -> ArchitecturalReport:
def analyze(self, directory: Path, files_analysis: list[dict]) -> ArchitecturalReport:
"""
Analyze codebase for architectural patterns.
@@ -151,7 +152,7 @@ class ArchitecturalPatternDetector:
patterns=patterns,
directory_structure=dir_structure,
total_files_analyzed=len(files_analysis),
frameworks_detected=frameworks
frameworks_detected=frameworks,
)
# Enhance with AI if enabled (C3.6)
@@ -161,11 +162,11 @@ class ArchitecturalPatternDetector:
logger.info(f"✅ Detected {len(patterns)} architectural patterns")
return report
def _analyze_directory_structure(self, directory: Path) -> Dict[str, int]:
def _analyze_directory_structure(self, directory: Path) -> dict[str, int]:
"""Analyze directory structure and count files"""
structure = defaultdict(int)
for path in directory.rglob('*'):
for path in directory.rglob("*"):
if path.is_file():
# Get relative directory path
rel_dir = path.parent.relative_to(directory)
@@ -180,13 +181,13 @@ class ArchitecturalPatternDetector:
return dict(structure)
def _detect_frameworks(self, directory: Path, files: List[Dict]) -> List[str]:
def _detect_frameworks(self, directory: Path, files: list[dict]) -> list[str]:
"""Detect frameworks being used"""
detected = []
# Check file paths and content
all_paths = [str(f.get('file', '')) for f in files]
all_content = ' '.join(all_paths)
all_paths = [str(f.get("file", "")) for f in files]
all_content = " ".join(all_paths)
for framework, markers in self.FRAMEWORK_MARKERS.items():
matches = sum(1 for marker in markers if marker.lower() in all_content.lower())
@@ -196,7 +197,7 @@ class ArchitecturalPatternDetector:
return detected
def _detect_mvc(self, dirs: Dict[str, int], files: List[Dict], frameworks: List[str]) -> List[ArchitecturalPattern]:
def _detect_mvc(self, dirs: dict[str, int], files: list[dict], frameworks: list[str]) -> list[ArchitecturalPattern]:
"""Detect MVC pattern"""
patterns = []
@@ -213,58 +214,62 @@ class ArchitecturalPatternDetector:
# Find MVC files
for file in files:
file_path = str(file.get('file', '')).lower()
file_path = str(file.get("file", "")).lower()
if 'model' in file_path and ('models/' in file_path or '/model/' in file_path):
components['Models'].append(file.get('file', ''))
if len(components['Models']) == 1:
if "model" in file_path and ("models/" in file_path or "/model/" in file_path):
components["Models"].append(file.get("file", ""))
if len(components["Models"]) == 1:
evidence.append("Models directory with model classes")
if 'view' in file_path and ('views/' in file_path or '/view/' in file_path):
components['Views'].append(file.get('file', ''))
if len(components['Views']) == 1:
if "view" in file_path and ("views/" in file_path or "/view/" in file_path):
components["Views"].append(file.get("file", ""))
if len(components["Views"]) == 1:
evidence.append("Views directory with view files")
if 'controller' in file_path and ('controllers/' in file_path or '/controller/' in file_path):
components['Controllers'].append(file.get('file', ''))
if len(components['Controllers']) == 1:
if "controller" in file_path and ("controllers/" in file_path or "/controller/" in file_path):
components["Controllers"].append(file.get("file", ""))
if len(components["Controllers"]) == 1:
evidence.append("Controllers directory with controller classes")
# Calculate confidence
has_models = len(components['Models']) > 0
has_views = len(components['Views']) > 0
has_controllers = len(components['Controllers']) > 0
has_models = len(components["Models"]) > 0
has_views = len(components["Views"]) > 0
has_controllers = len(components["Controllers"]) > 0
if sum([has_models, has_views, has_controllers]) >= 2:
confidence = 0.6 + (sum([has_models, has_views, has_controllers]) * 0.15)
# Boost confidence if framework detected
framework = None
for fw in ['Django', 'Flask', 'Spring', 'ASP.NET', 'Rails', 'Laravel']:
for fw in ["Django", "Flask", "Spring", "ASP.NET", "Rails", "Laravel"]:
if fw in frameworks:
confidence = min(0.95, confidence + 0.1)
framework = fw
evidence.append(f"{fw} framework detected (uses MVC)")
break
patterns.append(ArchitecturalPattern(
pattern_name="MVC (Model-View-Controller)",
confidence=confidence,
evidence=evidence,
components=dict(components),
framework=framework,
description="Separates application into Models (data), Views (UI), and Controllers (logic)"
))
patterns.append(
ArchitecturalPattern(
pattern_name="MVC (Model-View-Controller)",
confidence=confidence,
evidence=evidence,
components=dict(components),
framework=framework,
description="Separates application into Models (data), Views (UI), and Controllers (logic)",
)
)
return patterns
def _detect_mvvm(self, dirs: Dict[str, int], files: List[Dict], frameworks: List[str]) -> List[ArchitecturalPattern]:
def _detect_mvvm(
self, dirs: dict[str, int], files: list[dict], frameworks: list[str]
) -> list[ArchitecturalPattern]:
"""Detect MVVM pattern"""
patterns = []
# Look for ViewModels directory or classes ending with ViewModel
has_viewmodel_dir = 'viewmodels' in dirs or 'viewmodel' in dirs
viewmodel_files = [f for f in files if 'viewmodel' in str(f.get('file', '')).lower()]
has_viewmodel_dir = "viewmodels" in dirs or "viewmodel" in dirs
viewmodel_files = [f for f in files if "viewmodel" in str(f.get("file", "")).lower()]
if not (has_viewmodel_dir or len(viewmodel_files) >= 2):
return patterns
@@ -274,63 +279,68 @@ class ArchitecturalPatternDetector:
# Find MVVM files
for file in files:
file_path = str(file.get('file', '')).lower()
classes = file.get('classes', [])
file_path = str(file.get("file", "")).lower()
classes = file.get("classes", [])
if 'model' in file_path and 'viewmodel' not in file_path:
components['Models'].append(file.get('file', ''))
if "model" in file_path and "viewmodel" not in file_path:
components["Models"].append(file.get("file", ""))
if 'view' in file_path:
components['Views'].append(file.get('file', ''))
if "view" in file_path:
components["Views"].append(file.get("file", ""))
if 'viewmodel' in file_path or any('viewmodel' in c.get('name', '').lower() for c in classes):
components['ViewModels'].append(file.get('file', ''))
if "viewmodel" in file_path or any("viewmodel" in c.get("name", "").lower() for c in classes):
components["ViewModels"].append(file.get("file", ""))
if len(components['ViewModels']) >= 2:
if len(components["ViewModels"]) >= 2:
evidence.append(f"ViewModels directory with {len(components['ViewModels'])} ViewModel classes")
if len(components['Views']) >= 2:
if len(components["Views"]) >= 2:
evidence.append(f"Views directory with {len(components['Views'])} view files")
if len(components['Models']) >= 1:
if len(components["Models"]) >= 1:
evidence.append(f"Models directory with {len(components['Models'])} model files")
# Calculate confidence
has_models = len(components['Models']) > 0
has_views = len(components['Views']) > 0
has_viewmodels = len(components['ViewModels']) >= 2
has_models = len(components["Models"]) > 0
has_views = len(components["Views"]) > 0
has_viewmodels = len(components["ViewModels"]) >= 2
if has_viewmodels and (has_models or has_views):
confidence = 0.7 if (has_models and has_views and has_viewmodels) else 0.6
framework = None
for fw in ['ASP.NET', 'Angular', 'Vue.js']:
for fw in ["ASP.NET", "Angular", "Vue.js"]:
if fw in frameworks:
confidence = min(0.95, confidence + 0.1)
framework = fw
evidence.append(f"{fw} framework detected (supports MVVM)")
break
patterns.append(ArchitecturalPattern(
pattern_name="MVVM (Model-View-ViewModel)",
confidence=confidence,
evidence=evidence,
components=dict(components),
framework=framework,
description="ViewModels provide data-binding between Views and Models"
))
patterns.append(
ArchitecturalPattern(
pattern_name="MVVM (Model-View-ViewModel)",
confidence=confidence,
evidence=evidence,
components=dict(components),
framework=framework,
description="ViewModels provide data-binding between Views and Models",
)
)
return patterns
def _detect_repository(self, dirs: Dict[str, int], files: List[Dict]) -> List[ArchitecturalPattern]:
def _detect_repository(self, dirs: dict[str, int], files: list[dict]) -> list[ArchitecturalPattern]:
"""Detect Repository pattern"""
patterns = []
# Look for repositories directory or classes ending with Repository
has_repo_dir = any(d in dirs for d in self.REPO_DIRS)
repo_files = [f for f in files
if 'repository' in str(f.get('file', '')).lower() or
any('repository' in c.get('name', '').lower() for c in f.get('classes', []))]
repo_files = [
f
for f in files
if "repository" in str(f.get("file", "")).lower()
or any("repository" in c.get("name", "").lower() for c in f.get("classes", []))
]
if not (has_repo_dir or len(repo_files) >= 2):
return patterns
@@ -339,30 +349,35 @@ class ArchitecturalPatternDetector:
components = defaultdict(list)
for file in repo_files:
components['Repositories'].append(file.get('file', ''))
components["Repositories"].append(file.get("file", ""))
if len(components['Repositories']) >= 2:
if len(components["Repositories"]) >= 2:
evidence.append(f"Repository pattern: {len(components['Repositories'])} repository classes")
evidence.append("Repositories abstract data access logic")
patterns.append(ArchitecturalPattern(
pattern_name="Repository Pattern",
confidence=0.75,
evidence=evidence,
components=dict(components),
description="Encapsulates data access logic in repository classes"
))
patterns.append(
ArchitecturalPattern(
pattern_name="Repository Pattern",
confidence=0.75,
evidence=evidence,
components=dict(components),
description="Encapsulates data access logic in repository classes",
)
)
return patterns
def _detect_service_layer(self, dirs: Dict[str, int], files: List[Dict]) -> List[ArchitecturalPattern]:
def _detect_service_layer(self, dirs: dict[str, int], files: list[dict]) -> list[ArchitecturalPattern]:
"""Detect Service Layer pattern"""
patterns = []
has_service_dir = any(d in dirs for d in self.SERVICE_DIRS)
service_files = [f for f in files
if 'service' in str(f.get('file', '')).lower() or
any('service' in c.get('name', '').lower() for c in f.get('classes', []))]
service_files = [
f
for f in files
if "service" in str(f.get("file", "")).lower()
or any("service" in c.get("name", "").lower() for c in f.get("classes", []))
]
if not (has_service_dir or len(service_files) >= 3):
return patterns
@@ -371,23 +386,25 @@ class ArchitecturalPatternDetector:
components = defaultdict(list)
for file in service_files:
components['Services'].append(file.get('file', ''))
components["Services"].append(file.get("file", ""))
if len(components['Services']) >= 3:
if len(components["Services"]) >= 3:
evidence.append(f"Service layer: {len(components['Services'])} service classes")
evidence.append("Services encapsulate business logic")
patterns.append(ArchitecturalPattern(
pattern_name="Service Layer Pattern",
confidence=0.75,
evidence=evidence,
components=dict(components),
description="Encapsulates business logic in service classes"
))
patterns.append(
ArchitecturalPattern(
pattern_name="Service Layer Pattern",
confidence=0.75,
evidence=evidence,
components=dict(components),
description="Encapsulates business logic in service classes",
)
)
return patterns
def _detect_layered_architecture(self, dirs: Dict[str, int], files: List[Dict]) -> List[ArchitecturalPattern]:
def _detect_layered_architecture(self, dirs: dict[str, int], files: list[dict]) -> list[ArchitecturalPattern]:
"""Detect Layered Architecture (3-tier, N-tier)"""
patterns = []
@@ -400,32 +417,34 @@ class ArchitecturalPatternDetector:
components = defaultdict(list)
layers_found = []
if 'presentation' in dirs or 'ui' in dirs:
if "presentation" in dirs or "ui" in dirs:
layers_found.append("Presentation Layer")
evidence.append("Presentation/UI layer detected")
if 'business' in dirs or 'bll' in dirs:
if "business" in dirs or "bll" in dirs:
layers_found.append("Business Logic Layer")
evidence.append("Business logic layer detected")
if 'data' in dirs or 'dal' in dirs:
if "data" in dirs or "dal" in dirs:
layers_found.append("Data Access Layer")
evidence.append("Data access layer detected")
if len(layers_found) >= 2:
confidence = 0.65 + (len(layers_found) * 0.1)
patterns.append(ArchitecturalPattern(
pattern_name=f"Layered Architecture ({len(layers_found)}-tier)",
confidence=min(confidence, 0.9),
evidence=evidence,
components={'Layers': layers_found},
description=f"Separates concerns into {len(layers_found)} distinct layers"
))
patterns.append(
ArchitecturalPattern(
pattern_name=f"Layered Architecture ({len(layers_found)}-tier)",
confidence=min(confidence, 0.9),
evidence=evidence,
components={"Layers": layers_found},
description=f"Separates concerns into {len(layers_found)} distinct layers",
)
)
return patterns
def _detect_clean_architecture(self, dirs: Dict[str, int], files: List[Dict]) -> List[ArchitecturalPattern]:
def _detect_clean_architecture(self, dirs: dict[str, int], files: list[dict]) -> list[ArchitecturalPattern]:
"""Detect Clean Architecture"""
patterns = []
@@ -437,50 +456,52 @@ class ArchitecturalPatternDetector:
evidence = []
components = defaultdict(list)
if 'domain' in dirs:
if "domain" in dirs:
evidence.append("Domain layer (core business logic)")
components['Domain'].append('domain/')
components["Domain"].append("domain/")
if 'application' in dirs:
if "application" in dirs:
evidence.append("Application layer (use cases)")
components['Application'].append('application/')
components["Application"].append("application/")
if 'infrastructure' in dirs:
if "infrastructure" in dirs:
evidence.append("Infrastructure layer (external dependencies)")
components['Infrastructure'].append('infrastructure/')
components["Infrastructure"].append("infrastructure/")
if 'presentation' in dirs:
if "presentation" in dirs:
evidence.append("Presentation layer (UI/API)")
components['Presentation'].append('presentation/')
components["Presentation"].append("presentation/")
if len(components) >= 3:
patterns.append(ArchitecturalPattern(
pattern_name="Clean Architecture",
confidence=0.85,
evidence=evidence,
components=dict(components),
description="Dependency inversion with domain at center, infrastructure at edges"
))
patterns.append(
ArchitecturalPattern(
pattern_name="Clean Architecture",
confidence=0.85,
evidence=evidence,
components=dict(components),
description="Dependency inversion with domain at center, infrastructure at edges",
)
)
return patterns
def _enhance_with_ai(self, report: ArchitecturalReport) -> Dict:
def _enhance_with_ai(self, report: ArchitecturalReport) -> dict:
"""Enhance architectural analysis with AI insights"""
if not self.ai_enhancer:
return {}
# Prepare summary for AI
summary = f"""Detected {len(report.patterns)} architectural patterns:
{chr(10).join(f'- {p.pattern_name} (confidence: {p.confidence:.2f})' for p in report.patterns)}
{chr(10).join(f"- {p.pattern_name} (confidence: {p.confidence:.2f})" for p in report.patterns)}
Frameworks: {', '.join(report.frameworks_detected) if report.frameworks_detected else 'None'}
Frameworks: {", ".join(report.frameworks_detected) if report.frameworks_detected else "None"}
Total files: {report.total_files_analyzed}
Provide brief architectural insights and recommendations."""
try:
response = self.ai_enhancer._call_claude(summary, max_tokens=500)
return {'insights': response} if response else {}
return {"insights": response} if response else {}
except Exception as e:
logger.warning(f"⚠️ AI enhancement failed: {e}")
return {}

File diff suppressed because it is too large Load Diff

View File

@@ -24,65 +24,80 @@ Credits:
- pathspec for .gitignore support: https://pypi.org/project/pathspec/
"""
import argparse
import json
import logging
import os
import sys
import json
import argparse
import logging
from pathlib import Path
from typing import Dict, List, Optional, Any
from typing import Any
# Add parent directory to path for imports
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from skill_seekers.cli.code_analyzer import CodeAnalyzer
from skill_seekers.cli.api_reference_builder import APIReferenceBuilder
from skill_seekers.cli.dependency_analyzer import DependencyAnalyzer
from skill_seekers.cli.code_analyzer import CodeAnalyzer
from skill_seekers.cli.config_extractor import ConfigExtractor
from skill_seekers.cli.dependency_analyzer import DependencyAnalyzer
# pathspec is an optional dependency used for .gitignore support.
# PATHSPEC_AVAILABLE records whether the import succeeded so callers can
# check it before touching the module.
try:
    import pathspec

    PATHSPEC_AVAILABLE = True
except ImportError:
    PATHSPEC_AVAILABLE = False
# Configure logging once for this module: INFO level, with timestamp and
# level name in front of every message.
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)
# Maps a lowercase file suffix (including the leading dot) to the language
# name reported for that file. Suffixes not listed here are looked up with a
# default of "Unknown" by detect_language().
LANGUAGE_EXTENSIONS = {
    ".py": "Python",
    ".js": "JavaScript",
    ".jsx": "JavaScript",
    ".ts": "TypeScript",
    ".tsx": "TypeScript",
    ".cpp": "C++",
    ".cc": "C++",
    ".cxx": "C++",
    # NOTE: plain ".h" headers are attributed to C++, not C.
    ".h": "C++",
    ".hpp": "C++",
    ".hxx": "C++",
    ".c": "C",
    ".cs": "C#",
    ".go": "Go",
    ".rs": "Rust",
    ".java": "Java",
    ".rb": "Ruby",
    ".php": "PHP",
}
# Directory names excluded by default: dependency trees, VCS metadata,
# build output, tool caches, coverage output and editor/IDE folders.
DEFAULT_EXCLUDED_DIRS = {
    "node_modules",
    "venv",
    "__pycache__",
    ".git",
    ".svn",
    ".hg",
    "build",
    "dist",
    "target",
    ".pytest_cache",
    ".tox",
    ".mypy_cache",
    "htmlcov",
    "coverage",
    ".coverage",
    ".eggs",
    # NOTE(review): glob-style entry; it only takes effect if
    # should_exclude_dir() does pattern matching rather than plain set
    # membership - confirm against that function.
    "*.egg-info",
    ".idea",
    ".vscode",
    ".vs",
    "__pypackages__",
}
@@ -97,10 +112,10 @@ def detect_language(file_path: Path) -> str:
Language name or 'Unknown'
"""
extension = file_path.suffix.lower()
return LANGUAGE_EXTENSIONS.get(extension, 'Unknown')
return LANGUAGE_EXTENSIONS.get(extension, "Unknown")
def load_gitignore(directory: Path) -> Optional[pathspec.PathSpec]:
def load_gitignore(directory: Path) -> pathspec.PathSpec | None:
"""
Load .gitignore file and create pathspec matcher.
@@ -115,14 +130,14 @@ def load_gitignore(directory: Path) -> Optional[pathspec.PathSpec]:
logger.warning("Install with: pip install pathspec")
return None
gitignore_path = directory / '.gitignore'
gitignore_path = directory / ".gitignore"
if not gitignore_path.exists():
logger.debug(f"No .gitignore found in {directory}")
return None
try:
with open(gitignore_path, 'r', encoding='utf-8') as f:
spec = pathspec.PathSpec.from_lines('gitwildmatch', f)
with open(gitignore_path, encoding="utf-8") as f:
spec = pathspec.PathSpec.from_lines("gitwildmatch", f)
logger.info(f"Loaded .gitignore from {gitignore_path}")
return spec
except Exception as e:
@@ -146,10 +161,10 @@ def should_exclude_dir(dir_name: str, excluded_dirs: set) -> bool:
def walk_directory(
root: Path,
patterns: Optional[List[str]] = None,
gitignore_spec: Optional[pathspec.PathSpec] = None,
excluded_dirs: Optional[set] = None
) -> List[Path]:
patterns: list[str] | None = None,
gitignore_spec: pathspec.PathSpec | None = None,
excluded_dirs: set | None = None,
) -> list[Path]:
"""
Walk directory tree and collect source files.
@@ -205,9 +220,9 @@ def walk_directory(
def analyze_codebase(
directory: Path,
output_dir: Path,
depth: str = 'deep',
languages: Optional[List[str]] = None,
file_patterns: Optional[List[str]] = None,
depth: str = "deep",
languages: list[str] | None = None,
file_patterns: list[str] | None = None,
build_api_reference: bool = True,
extract_comments: bool = True,
build_dependency_graph: bool = True,
@@ -216,8 +231,8 @@ def analyze_codebase(
build_how_to_guides: bool = True,
extract_config_patterns: bool = True,
enhance_with_ai: bool = True,
ai_mode: str = "auto"
) -> Dict[str, Any]:
ai_mode: str = "auto",
) -> dict[str, Any]:
"""
Analyze local codebase and extract code knowledge.
@@ -255,11 +270,7 @@ def analyze_codebase(
# Walk directory tree
logger.info("Scanning directory tree...")
files = walk_directory(
directory,
patterns=file_patterns,
gitignore_spec=gitignore_spec
)
files = walk_directory(directory, patterns=file_patterns, gitignore_spec=gitignore_spec)
logger.info(f"Found {len(files)} source files")
@@ -273,27 +284,25 @@ def analyze_codebase(
analyzer = CodeAnalyzer(depth=depth)
# Analyze each file
results = {'files': []}
results = {"files": []}
analyzed_count = 0
for file_path in files:
try:
content = file_path.read_text(encoding='utf-8', errors='ignore')
content = file_path.read_text(encoding="utf-8", errors="ignore")
language = detect_language(file_path)
if language == 'Unknown':
if language == "Unknown":
continue
# Analyze file
analysis = analyzer.analyze_file(str(file_path), content, language)
# Only include files with actual analysis results
if analysis and (analysis.get('classes') or analysis.get('functions')):
results['files'].append({
'file': str(file_path.relative_to(directory)),
'language': language,
**analysis
})
if analysis and (analysis.get("classes") or analysis.get("functions")):
results["files"].append(
{"file": str(file_path.relative_to(directory)), "language": language, **analysis}
)
analyzed_count += 1
if analyzed_count % 10 == 0:
@@ -306,17 +315,17 @@ def analyze_codebase(
logger.info(f"✅ Successfully analyzed {analyzed_count} files")
# Save results
output_json = output_dir / 'code_analysis.json'
with open(output_json, 'w', encoding='utf-8') as f:
output_json = output_dir / "code_analysis.json"
with open(output_json, "w", encoding="utf-8") as f:
json.dump(results, f, indent=2)
logger.info(f"📁 Saved analysis to: {output_json}")
# Build API reference if requested
if build_api_reference and results['files']:
if build_api_reference and results["files"]:
logger.info("Building API reference documentation...")
builder = APIReferenceBuilder(results)
api_output_dir = output_dir / 'api_reference'
api_output_dir = output_dir / "api_reference"
generated_files = builder.build_reference(api_output_dir)
logger.info(f"✅ Generated {len(generated_files)} API reference files")
logger.info(f"📁 API reference: {api_output_dir}")
@@ -329,10 +338,10 @@ def analyze_codebase(
# Analyze dependencies for all files
for file_path in files:
try:
content = file_path.read_text(encoding='utf-8', errors='ignore')
content = file_path.read_text(encoding="utf-8", errors="ignore")
language = detect_language(file_path)
if language != 'Unknown':
if language != "Unknown":
# Use relative path from directory for better graph readability
rel_path = str(file_path.relative_to(directory))
dep_analyzer.analyze_file(rel_path, content, language)
@@ -348,7 +357,7 @@ def analyze_codebase(
if cycles:
logger.warning(f"⚠️ Found {len(cycles)} circular dependencies:")
for i, cycle in enumerate(cycles[:5], 1): # Show first 5
cycle_str = ''.join(cycle) + f"{cycle[0]}"
cycle_str = "".join(cycle) + f"{cycle[0]}"
logger.warning(f" {i}. {cycle_str}")
if len(cycles) > 5:
logger.warning(f" ... and {len(cycles) - 5} more")
@@ -356,32 +365,34 @@ def analyze_codebase(
logger.info("✅ No circular dependencies found")
# Save dependency graph data
dep_output_dir = output_dir / 'dependencies'
dep_output_dir = output_dir / "dependencies"
dep_output_dir.mkdir(parents=True, exist_ok=True)
# Export as JSON
dep_json = dep_output_dir / 'dependency_graph.json'
with open(dep_json, 'w', encoding='utf-8') as f:
dep_json = dep_output_dir / "dependency_graph.json"
with open(dep_json, "w", encoding="utf-8") as f:
json.dump(dep_analyzer.export_json(), f, indent=2)
logger.info(f"📁 Saved dependency graph: {dep_json}")
# Export as Mermaid diagram
mermaid_file = dep_output_dir / 'dependency_graph.mmd'
mermaid_file = dep_output_dir / "dependency_graph.mmd"
mermaid_file.write_text(dep_analyzer.export_mermaid())
logger.info(f"📁 Saved Mermaid diagram: {mermaid_file}")
# Save statistics
stats = dep_analyzer.get_statistics()
stats_file = dep_output_dir / 'statistics.json'
with open(stats_file, 'w', encoding='utf-8') as f:
stats_file = dep_output_dir / "statistics.json"
with open(stats_file, "w", encoding="utf-8") as f:
json.dump(stats, f, indent=2)
logger.info(f"📊 Statistics: {stats['total_files']} files, "
f"{stats['total_dependencies']} dependencies, "
f"{stats['circular_dependencies']} cycles")
logger.info(
f"📊 Statistics: {stats['total_files']} files, "
f"{stats['total_dependencies']} dependencies, "
f"{stats['circular_dependencies']} cycles"
)
# Try to export as DOT (requires pydot)
try:
dot_file = dep_output_dir / 'dependency_graph.dot'
dot_file = dep_output_dir / "dependency_graph.dot"
dep_analyzer.export_dot(str(dot_file))
except:
pass # pydot not installed, skip DOT export
@@ -396,13 +407,11 @@ def analyze_codebase(
for file_path in files:
try:
content = file_path.read_text(encoding='utf-8', errors='ignore')
content = file_path.read_text(encoding="utf-8", errors="ignore")
language = detect_language(file_path)
if language != 'Unknown':
report = pattern_recognizer.analyze_file(
str(file_path), content, language
)
if language != "Unknown":
report = pattern_recognizer.analyze_file(str(file_path), content, language)
if report.patterns:
pattern_results.append(report.to_dict())
@@ -412,14 +421,14 @@ def analyze_codebase(
# Save pattern results
if pattern_results:
pattern_output = output_dir / 'patterns'
pattern_output = output_dir / "patterns"
pattern_output.mkdir(parents=True, exist_ok=True)
pattern_json = pattern_output / 'detected_patterns.json'
with open(pattern_json, 'w', encoding='utf-8') as f:
pattern_json = pattern_output / "detected_patterns.json"
with open(pattern_json, "w", encoding="utf-8") as f:
json.dump(pattern_results, f, indent=2)
total_patterns = sum(len(r['patterns']) for r in pattern_results)
total_patterns = sum(len(r["patterns"]) for r in pattern_results)
logger.info(f"✅ Detected {total_patterns} patterns in {len(pattern_results)} files")
logger.info(f"📁 Saved to: {pattern_json}")
else:
@@ -432,35 +441,31 @@ def analyze_codebase(
# Create extractor
test_extractor = TestExampleExtractor(
min_confidence=0.5,
max_per_file=10,
languages=languages,
enhance_with_ai=enhance_with_ai
min_confidence=0.5, max_per_file=10, languages=languages, enhance_with_ai=enhance_with_ai
)
# Extract examples from directory
try:
example_report = test_extractor.extract_from_directory(
directory,
recursive=True
)
example_report = test_extractor.extract_from_directory(directory, recursive=True)
if example_report.total_examples > 0:
# Save results
examples_output = output_dir / 'test_examples'
examples_output = output_dir / "test_examples"
examples_output.mkdir(parents=True, exist_ok=True)
# Save as JSON
examples_json = examples_output / 'test_examples.json'
with open(examples_json, 'w', encoding='utf-8') as f:
examples_json = examples_output / "test_examples.json"
with open(examples_json, "w", encoding="utf-8") as f:
json.dump(example_report.to_dict(), f, indent=2)
# Save as Markdown
examples_md = examples_output / 'test_examples.md'
examples_md.write_text(example_report.to_markdown(), encoding='utf-8')
examples_md = examples_output / "test_examples.md"
examples_md.write_text(example_report.to_markdown(), encoding="utf-8")
logger.info(f"✅ Extracted {example_report.total_examples} test examples "
f"({example_report.high_value_count} high-value)")
logger.info(
f"✅ Extracted {example_report.total_examples} test examples "
f"({example_report.high_value_count} high-value)"
)
logger.info(f"📁 Saved to: {examples_output}")
else:
logger.info("No test examples extracted")
@@ -479,25 +484,25 @@ def analyze_codebase(
guide_builder = HowToGuideBuilder(enhance_with_ai=enhance_with_ai)
# Build guides from workflow examples
tutorials_dir = output_dir / 'tutorials'
tutorials_dir = output_dir / "tutorials"
# Get workflow examples from the example_report if available
if 'example_report' in locals() and example_report and example_report.total_examples > 0:
if "example_report" in locals() and example_report and example_report.total_examples > 0:
# Convert example_report to list of dicts for processing
examples_list = example_report.to_dict().get('examples', [])
examples_list = example_report.to_dict().get("examples", [])
guide_collection = guide_builder.build_guides_from_examples(
examples_list,
grouping_strategy='ai-tutorial-group',
grouping_strategy="ai-tutorial-group",
output_dir=tutorials_dir,
enhance_with_ai=enhance_with_ai,
ai_mode=ai_mode
ai_mode=ai_mode,
)
if guide_collection and guide_collection.total_guides > 0:
# Save collection summary
collection_json = tutorials_dir / 'guide_collection.json'
with open(collection_json, 'w', encoding='utf-8') as f:
collection_json = tutorials_dir / "guide_collection.json"
with open(collection_json, "w", encoding="utf-8") as f:
json.dump(guide_collection.to_dict(), f, indent=2)
logger.info(f"✅ Built {guide_collection.total_guides} how-to guides")
@@ -524,9 +529,10 @@ def analyze_codebase(
result_dict = config_extractor.to_dict(extraction_result)
# AI Enhancement (if enabled)
if enhance_with_ai and ai_mode != 'none':
if enhance_with_ai and ai_mode != "none":
try:
from skill_seekers.cli.config_enhancer import ConfigEnhancer
logger.info(f"🤖 Enhancing config analysis with AI (mode: {ai_mode})...")
enhancer = ConfigEnhancer(mode=ai_mode)
result_dict = enhancer.enhance_config_result(result_dict)
@@ -535,28 +541,30 @@ def analyze_codebase(
logger.warning(f"⚠️ Config AI enhancement failed: {e}")
# Save results
config_output = output_dir / 'config_patterns'
config_output = output_dir / "config_patterns"
config_output.mkdir(parents=True, exist_ok=True)
# Save as JSON
config_json = config_output / 'config_patterns.json'
with open(config_json, 'w', encoding='utf-8') as f:
config_json = config_output / "config_patterns.json"
with open(config_json, "w", encoding="utf-8") as f:
json.dump(result_dict, f, indent=2)
# Save as Markdown (basic - AI enhancements in JSON only for now)
config_md = config_output / 'config_patterns.md'
config_md.write_text(extraction_result.to_markdown(), encoding='utf-8')
config_md = config_output / "config_patterns.md"
config_md.write_text(extraction_result.to_markdown(), encoding="utf-8")
# Count total settings across all files
total_settings = sum(len(cf.settings) for cf in extraction_result.config_files)
total_patterns = sum(len(cf.patterns) for cf in extraction_result.config_files)
logger.info(f"✅ Extracted {len(extraction_result.config_files)} config files "
f"with {total_settings} settings and {total_patterns} detected patterns")
logger.info(
f"✅ Extracted {len(extraction_result.config_files)} config files "
f"with {total_settings} settings and {total_patterns} detected patterns"
)
if 'ai_enhancements' in result_dict:
insights = result_dict['ai_enhancements'].get('overall_insights', {})
if insights.get('security_issues_found'):
if "ai_enhancements" in result_dict:
insights = result_dict["ai_enhancements"].get("overall_insights", {})
if insights.get("security_issues_found"):
logger.info(f"🔐 Security issues found: {insights['security_issues_found']}")
logger.info(f"📁 Saved to: {config_output}")
@@ -572,15 +580,15 @@ def analyze_codebase(
from skill_seekers.cli.architectural_pattern_detector import ArchitecturalPatternDetector
arch_detector = ArchitecturalPatternDetector(enhance_with_ai=enhance_with_ai)
arch_report = arch_detector.analyze(directory, results['files'])
arch_report = arch_detector.analyze(directory, results["files"])
if arch_report.patterns:
arch_output = output_dir / 'architecture'
arch_output = output_dir / "architecture"
arch_output.mkdir(parents=True, exist_ok=True)
# Save as JSON
arch_json = arch_output / 'architectural_patterns.json'
with open(arch_json, 'w', encoding='utf-8') as f:
arch_json = arch_output / "architectural_patterns.json"
with open(arch_json, "w", encoding="utf-8") as f:
json.dump(arch_report.to_dict(), f, indent=2)
logger.info(f"🏗️ Detected {len(arch_report.patterns)} architectural patterns")
@@ -601,7 +609,7 @@ def analyze_codebase(
build_dependency_graph=build_dependency_graph,
detect_patterns=detect_patterns,
extract_test_examples=extract_test_examples,
extract_config_patterns=extract_config_patterns
extract_config_patterns=extract_config_patterns,
)
return results
@@ -610,13 +618,13 @@ def analyze_codebase(
def _generate_skill_md(
output_dir: Path,
directory: Path,
results: Dict[str, Any],
results: dict[str, Any],
depth: str,
build_api_reference: bool,
build_dependency_graph: bool,
detect_patterns: bool,
extract_test_examples: bool,
extract_config_patterns: bool
extract_config_patterns: bool,
):
"""
Generate rich SKILL.md from codebase analysis results.
@@ -635,14 +643,14 @@ def _generate_skill_md(
repo_name = directory.name
# Generate skill name (lowercase, hyphens only, max 64 chars)
skill_name = repo_name.lower().replace('_', '-').replace(' ', '-')[:64]
skill_name = repo_name.lower().replace("_", "-").replace(" ", "-")[:64]
# Generate description
description = f"Local codebase analysis for {repo_name}"
# Count files by language
language_stats = _get_language_stats(results.get('files', []))
total_files = len(results.get('files', []))
language_stats = _get_language_stats(results.get("files", []))
total_files = len(results.get("files", []))
# Start building content
skill_content = f"""---
@@ -658,7 +666,7 @@ Local codebase analysis and documentation generated from code analysis.
**Path:** `{directory}`
**Files Analyzed:** {total_files}
**Languages:** {', '.join(language_stats.keys())}
**Languages:** {", ".join(language_stats.keys())}
**Analysis Depth:** {depth}
## When to Use This Skill
@@ -732,22 +740,22 @@ Use this skill when you need to:
skill_content += "This skill includes detailed reference documentation:\n\n"
refs_added = False
if build_api_reference and (output_dir / 'api_reference').exists():
if build_api_reference and (output_dir / "api_reference").exists():
skill_content += "- **API Reference**: `references/api_reference/` - Complete API documentation\n"
refs_added = True
if build_dependency_graph and (output_dir / 'dependencies').exists():
if build_dependency_graph and (output_dir / "dependencies").exists():
skill_content += "- **Dependencies**: `references/dependencies/` - Dependency graph and analysis\n"
refs_added = True
if detect_patterns and (output_dir / 'patterns').exists():
if detect_patterns and (output_dir / "patterns").exists():
skill_content += "- **Patterns**: `references/patterns/` - Detected design patterns\n"
refs_added = True
if extract_test_examples and (output_dir / 'test_examples').exists():
if extract_test_examples and (output_dir / "test_examples").exists():
skill_content += "- **Examples**: `references/test_examples/` - Usage examples from tests\n"
refs_added = True
if extract_config_patterns and (output_dir / 'config_patterns').exists():
if extract_config_patterns and (output_dir / "config_patterns").exists():
skill_content += "- **Configuration**: `references/config_patterns/` - Configuration patterns\n"
refs_added = True
if (output_dir / 'architecture').exists():
if (output_dir / "architecture").exists():
skill_content += "- **Architecture**: `references/architecture/` - Architectural patterns\n"
refs_added = True
@@ -762,34 +770,34 @@ Use this skill when you need to:
# Write SKILL.md
skill_path = output_dir / "SKILL.md"
skill_path.write_text(skill_content, encoding='utf-8')
skill_path.write_text(skill_content, encoding="utf-8")
line_count = len(skill_content.split('\n'))
line_count = len(skill_content.split("\n"))
logger.info(f"✅ Generated SKILL.md: {skill_path} ({line_count} lines)")
# Generate references/ directory structure
_generate_references(output_dir)
def _get_language_stats(files: List[Dict]) -> Dict[str, int]:
def _get_language_stats(files: list[dict]) -> dict[str, int]:
"""Count files by language from analysis results."""
stats = {}
for file_data in files:
# files is a list of dicts with 'language' key
lang = file_data.get('language', 'Unknown')
if lang != 'Unknown':
lang = file_data.get("language", "Unknown")
if lang != "Unknown":
stats[lang] = stats.get(lang, 0) + 1
return stats
def _format_patterns_section(output_dir: Path) -> str:
"""Format design patterns section from patterns/detected_patterns.json."""
patterns_file = output_dir / 'patterns' / 'detected_patterns.json'
patterns_file = output_dir / "patterns" / "detected_patterns.json"
if not patterns_file.exists():
return ""
try:
with open(patterns_file, 'r', encoding='utf-8') as f:
with open(patterns_file, encoding="utf-8") as f:
patterns_data = json.load(f)
except Exception:
return ""
@@ -802,10 +810,10 @@ def _format_patterns_section(output_dir: Path) -> str:
by_class = {}
for pattern_file in patterns_data:
for pattern in pattern_file.get('patterns', []):
ptype = pattern.get('pattern_type', 'Unknown')
cls = pattern.get('class_name', '')
confidence = pattern.get('confidence', 0)
for pattern in pattern_file.get("patterns", []):
ptype = pattern.get("pattern_type", "Unknown")
cls = pattern.get("class_name", "")
confidence = pattern.get("confidence", 0)
# Skip low confidence
if confidence < 0.7:
@@ -813,7 +821,7 @@ def _format_patterns_section(output_dir: Path) -> str:
# Deduplicate by class
key = f"{cls}:{ptype}"
if key not in by_class or by_class[key]['confidence'] < confidence:
if key not in by_class or by_class[key]["confidence"] < confidence:
by_class[key] = pattern
# Count by type
@@ -836,22 +844,22 @@ def _format_patterns_section(output_dir: Path) -> str:
def _format_examples_section(output_dir: Path) -> str:
"""Format code examples section from test_examples/test_examples.json."""
examples_file = output_dir / 'test_examples' / 'test_examples.json'
examples_file = output_dir / "test_examples" / "test_examples.json"
if not examples_file.exists():
return ""
try:
with open(examples_file, 'r', encoding='utf-8') as f:
with open(examples_file, encoding="utf-8") as f:
examples_data = json.load(f)
except Exception:
return ""
examples = examples_data.get('examples', [])
examples = examples_data.get("examples", [])
if not examples:
return ""
# Filter high-value examples (complexity > 0.7)
high_value = [ex for ex in examples if ex.get('complexity_score', 0) > 0.7]
high_value = [ex for ex in examples if ex.get("complexity_score", 0) > 0.7]
if not high_value:
# If no high complexity, take any examples
@@ -864,11 +872,11 @@ def _format_examples_section(output_dir: Path) -> str:
content += "*High-quality examples extracted from test files (C3.2)*\n\n"
# Top 10 examples
for ex in sorted(high_value, key=lambda x: x.get('complexity_score', 0), reverse=True)[:10]:
desc = ex.get('description', 'Example')
lang = ex.get('language', 'python').lower()
code = ex.get('code', '')
complexity = ex.get('complexity_score', 0)
for ex in sorted(high_value, key=lambda x: x.get("complexity_score", 0), reverse=True)[:10]:
desc = ex.get("description", "Example")
lang = ex.get("language", "python").lower()
code = ex.get("code", "")
complexity = ex.get("complexity_score", 0)
content += f"**{desc}** (complexity: {complexity:.2f})\n\n"
content += f"```{lang}\n{code}\n```\n\n"
@@ -879,16 +887,16 @@ def _format_examples_section(output_dir: Path) -> str:
def _format_api_section(output_dir: Path) -> str:
"""Format API reference section."""
api_dir = output_dir / 'api_reference'
api_dir = output_dir / "api_reference"
if not api_dir.exists():
return ""
api_md = api_dir / 'api_reference.md'
api_md = api_dir / "api_reference.md"
if not api_md.exists():
return ""
try:
api_content = api_md.read_text(encoding='utf-8')
api_content = api_md.read_text(encoding="utf-8")
except Exception:
return ""
@@ -906,17 +914,17 @@ def _format_api_section(output_dir: Path) -> str:
def _format_architecture_section(output_dir: Path) -> str:
"""Format architecture section from architecture/architectural_patterns.json."""
arch_file = output_dir / 'architecture' / 'architectural_patterns.json'
arch_file = output_dir / "architecture" / "architectural_patterns.json"
if not arch_file.exists():
return ""
try:
with open(arch_file, 'r', encoding='utf-8') as f:
with open(arch_file, encoding="utf-8") as f:
arch_data = json.load(f)
except Exception:
return ""
patterns = arch_data.get('patterns', [])
patterns = arch_data.get("patterns", [])
if not patterns:
return ""
@@ -925,9 +933,9 @@ def _format_architecture_section(output_dir: Path) -> str:
content += "**Detected Architectural Patterns:**\n\n"
for pattern in patterns[:5]:
name = pattern.get('pattern_name', 'Unknown')
confidence = pattern.get('confidence', 0)
indicators = pattern.get('indicators', [])
name = pattern.get("pattern_name", "Unknown")
confidence = pattern.get("confidence", 0)
indicators = pattern.get("indicators", [])
content += f"- **{name}** (confidence: {confidence:.2f})\n"
if indicators:
@@ -940,22 +948,22 @@ def _format_architecture_section(output_dir: Path) -> str:
def _format_config_section(output_dir: Path) -> str:
"""Format configuration patterns section."""
config_file = output_dir / 'config_patterns' / 'config_patterns.json'
config_file = output_dir / "config_patterns" / "config_patterns.json"
if not config_file.exists():
return ""
try:
with open(config_file, 'r', encoding='utf-8') as f:
with open(config_file, encoding="utf-8") as f:
config_data = json.load(f)
except Exception:
return ""
config_files = config_data.get('config_files', [])
config_files = config_data.get("config_files", [])
if not config_files:
return ""
total_settings = sum(len(cf.get('settings', [])) for cf in config_files)
total_patterns = sum(len(cf.get('patterns', [])) for cf in config_files)
total_settings = sum(len(cf.get("settings", [])) for cf in config_files)
total_patterns = sum(len(cf.get("patterns", [])) for cf in config_files)
content = "## ⚙️ Configuration Patterns\n\n"
content += "*From C3.4 configuration analysis*\n\n"
@@ -966,7 +974,7 @@ def _format_config_section(output_dir: Path) -> str:
# List config file types found
file_types = {}
for cf in config_files:
ctype = cf.get('config_type', 'unknown')
ctype = cf.get("config_type", "unknown")
file_types[ctype] = file_types.get(ctype, 0) + 1
if file_types:
@@ -985,18 +993,18 @@ def _generate_references(output_dir: Path):
Creates a clean references/ directory that links to all analysis outputs.
"""
references_dir = output_dir / 'references'
references_dir = output_dir / "references"
references_dir.mkdir(exist_ok=True)
# Map analysis directories to reference names
mappings = {
'api_reference': 'api_reference',
'dependencies': 'dependencies',
'patterns': 'patterns',
'test_examples': 'test_examples',
'tutorials': 'tutorials',
'config_patterns': 'config_patterns',
'architecture': 'architecture'
"api_reference": "api_reference",
"dependencies": "dependencies",
"patterns": "patterns",
"test_examples": "test_examples",
"tutorials": "tutorials",
"config_patterns": "config_patterns",
"architecture": "architecture",
}
for source, target in mappings.items():
@@ -1007,9 +1015,11 @@ def _generate_references(output_dir: Path):
# Copy directory to references/ (not symlink, for portability)
if target_dir.exists():
import shutil
shutil.rmtree(target_dir)
import shutil
shutil.copytree(source_dir, target_dir)
logger.debug(f"Copied {source} → references/{target}")
@@ -1019,7 +1029,7 @@ def _generate_references(output_dir: Path):
def main():
"""Command-line interface for codebase analysis."""
parser = argparse.ArgumentParser(
description='Analyze local codebases and extract code knowledge',
description="Analyze local codebases and extract code knowledge",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
Examples:
@@ -1043,101 +1053,78 @@ Examples:
# Skip specific features
codebase-scraper --directory . --skip-patterns --skip-test-examples
"""
""",
)
parser.add_argument("--directory", required=True, help="Directory to analyze")
parser.add_argument("--output", default="output/codebase/", help="Output directory (default: output/codebase/)")
parser.add_argument(
'--directory',
required=True,
help='Directory to analyze'
"--depth", choices=["surface", "deep", "full"], default="deep", help="Analysis depth (default: deep)"
)
parser.add_argument("--languages", help="Comma-separated languages to analyze (e.g., Python,JavaScript,C++)")
parser.add_argument("--file-patterns", help="Comma-separated file patterns (e.g., *.py,src/**/*.js)")
parser.add_argument(
'--output',
default='output/codebase/',
help='Output directory (default: output/codebase/)'
)
parser.add_argument(
'--depth',
choices=['surface', 'deep', 'full'],
default='deep',
help='Analysis depth (default: deep)'
)
parser.add_argument(
'--languages',
help='Comma-separated languages to analyze (e.g., Python,JavaScript,C++)'
)
parser.add_argument(
'--file-patterns',
help='Comma-separated file patterns (e.g., *.py,src/**/*.js)'
)
parser.add_argument(
'--skip-api-reference',
action='store_true',
"--skip-api-reference",
action="store_true",
default=False,
help='Skip API reference markdown documentation generation (default: enabled)'
help="Skip API reference markdown documentation generation (default: enabled)",
)
parser.add_argument(
'--skip-dependency-graph',
action='store_true',
"--skip-dependency-graph",
action="store_true",
default=False,
help='Skip dependency graph and circular dependency detection (default: enabled)'
help="Skip dependency graph and circular dependency detection (default: enabled)",
)
parser.add_argument(
'--skip-patterns',
action='store_true',
"--skip-patterns",
action="store_true",
default=False,
help='Skip design pattern detection (Singleton, Factory, Observer, etc.) (default: enabled)'
help="Skip design pattern detection (Singleton, Factory, Observer, etc.) (default: enabled)",
)
parser.add_argument(
'--skip-test-examples',
action='store_true',
"--skip-test-examples",
action="store_true",
default=False,
help='Skip test example extraction (instantiation, method calls, configs, etc.) (default: enabled)'
help="Skip test example extraction (instantiation, method calls, configs, etc.) (default: enabled)",
)
parser.add_argument(
'--skip-how-to-guides',
action='store_true',
"--skip-how-to-guides",
action="store_true",
default=False,
help='Skip how-to guide generation from workflow examples (default: enabled)'
help="Skip how-to guide generation from workflow examples (default: enabled)",
)
parser.add_argument(
'--skip-config-patterns',
action='store_true',
"--skip-config-patterns",
action="store_true",
default=False,
help='Skip configuration pattern extraction from config files (JSON, YAML, TOML, ENV, etc.) (default: enabled)'
help="Skip configuration pattern extraction from config files (JSON, YAML, TOML, ENV, etc.) (default: enabled)",
)
parser.add_argument(
'--ai-mode',
choices=['auto', 'api', 'local', 'none'],
default='auto',
help='AI enhancement mode for how-to guides: auto (detect best), api (Claude API), local (Claude Code CLI), none (disable) (default: auto)'
)
parser.add_argument(
'--no-comments',
action='store_true',
help='Skip comment extraction'
)
parser.add_argument(
'--verbose',
action='store_true',
help='Enable verbose logging'
"--ai-mode",
choices=["auto", "api", "local", "none"],
default="auto",
help="AI enhancement mode for how-to guides: auto (detect best), api (Claude API), local (Claude Code CLI), none (disable) (default: auto)",
)
parser.add_argument("--no-comments", action="store_true", help="Skip comment extraction")
parser.add_argument("--verbose", action="store_true", help="Enable verbose logging")
# Check for deprecated flags
deprecated_flags = {
'--build-api-reference': '--skip-api-reference',
'--build-dependency-graph': '--skip-dependency-graph',
'--detect-patterns': '--skip-patterns',
'--extract-test-examples': '--skip-test-examples',
'--build-how-to-guides': '--skip-how-to-guides',
'--extract-config-patterns': '--skip-config-patterns'
"--build-api-reference": "--skip-api-reference",
"--build-dependency-graph": "--skip-dependency-graph",
"--detect-patterns": "--skip-patterns",
"--extract-test-examples": "--skip-test-examples",
"--build-how-to-guides": "--skip-how-to-guides",
"--extract-config-patterns": "--skip-config-patterns",
}
for old_flag, new_flag in deprecated_flags.items():
if old_flag in sys.argv:
logger.warning(f"⚠️ DEPRECATED: {old_flag} is deprecated. "
f"All features are now enabled by default. "
f"Use {new_flag} to disable this feature.")
logger.warning(
f"⚠️ DEPRECATED: {old_flag} is deprecated. "
f"All features are now enabled by default. "
f"Use {new_flag} to disable this feature."
)
args = parser.parse_args()
@@ -1158,12 +1145,12 @@ Examples:
# Parse languages
languages = None
if args.languages:
languages = [lang.strip() for lang in args.languages.split(',')]
languages = [lang.strip() for lang in args.languages.split(",")]
# Parse file patterns
file_patterns = None
if args.file_patterns:
file_patterns = [p.strip() for p in args.file_patterns.split(',')]
file_patterns = [p.strip() for p in args.file_patterns.split(",")]
# Analyze codebase
try:
@@ -1181,18 +1168,18 @@ Examples:
build_how_to_guides=not args.skip_how_to_guides,
extract_config_patterns=not args.skip_config_patterns,
enhance_with_ai=True, # Auto-disables if no API key present
ai_mode=args.ai_mode # NEW: AI enhancement mode for how-to guides
ai_mode=args.ai_mode, # NEW: AI enhancement mode for how-to guides
)
# Print summary
print(f"\n{'=' * 60}")
print("CODEBASE ANALYSIS COMPLETE")
print(f"{'=' * 60}")
print(f"Files analyzed: {len(results['files'])}")
print(f"Output directory: {args.output}")
# The parser defines --skip-api-reference, not a build_api_reference option,
# so the report line is gated on the negated skip flag (same convention as the
# `not args.skip_*` arguments passed to the analysis call above).
if not args.skip_api_reference:
    print(f"API reference: {Path(args.output) / 'api_reference'}")
print(f"{'=' * 60}\n")
return 0
@@ -1202,9 +1189,10 @@ Examples:
except Exception as e:
logger.error(f"Analysis failed: {e}")
import traceback
traceback.print_exc()
return 1
if __name__ == '__main__':
if __name__ == "__main__":
sys.exit(main())

View File

@@ -4,9 +4,8 @@ Interactive Configuration Wizard for Skill Seekers
Provides user-friendly setup for GitHub tokens, API keys, and settings.
"""
import sys
import webbrowser
from typing import Optional
from .config_manager import get_config_manager
@@ -46,7 +45,7 @@ Documentation: https://github.com/SkillSeekers/skill-seekers
# Ask if user wants to run setup now
response = input("Would you like to run the configuration wizard now? [y/N]: ").strip().lower()
if response in ['y', 'yes']:
if response in ["y", "yes"]:
main_menu()
else:
print("\nYou can run the configuration wizard anytime with:")
@@ -158,7 +157,7 @@ def add_github_profile():
if name in config.config["github"]["profiles"]:
print(f"❌ Profile '{name}' already exists.")
overwrite = input("Overwrite? [y/N]: ").strip().lower()
if overwrite not in ['y', 'yes']:
if overwrite not in ["y", "yes"]:
continue
break
@@ -175,7 +174,7 @@ def add_github_profile():
print(" 4. Copy the token (ghp_...)\n")
open_now = input("Open GitHub token page in browser? [Y/n]: ").strip().lower()
if open_now not in ['n', 'no']:
if open_now not in ["n", "no"]:
open_github_token_page()
while True:
@@ -186,7 +185,7 @@ def add_github_profile():
if not (token.startswith("ghp_") or token.startswith("github_pat_")):
print("⚠️ Warning: Token doesn't match GitHub format")
proceed = input("Continue anyway? [y/N]: ").strip().lower()
if proceed not in ['y', 'yes']:
if proceed not in ["y", "yes"]:
continue
break
@@ -198,12 +197,7 @@ def add_github_profile():
print(" 4. fail - Fail immediately")
strategy_choice = input("\nSelect strategy [1-4] (default: 1): ").strip() or "1"
strategy_map = {
"1": "prompt",
"2": "wait",
"3": "switch",
"4": "fail"
}
strategy_map = {"1": "prompt", "2": "wait", "3": "switch", "4": "fail"}
strategy = strategy_map.get(strategy_choice, "prompt")
# Timeout
@@ -217,7 +211,7 @@ def add_github_profile():
# Set as default
has_profiles = bool(config.config["github"]["profiles"])
if has_profiles:
set_default = input("\nSet as default profile? [y/N]: ").strip().lower() in ['y', 'yes']
set_default = input("\nSet as default profile? [y/N]: ").strip().lower() in ["y", "yes"]
else:
set_default = True # First profile is always default
@@ -228,7 +222,7 @@ def add_github_profile():
description=description,
rate_limit_strategy=strategy,
timeout_minutes=timeout,
set_as_default=set_default
set_as_default=set_default,
)
print(f"\n✅ GitHub profile '{name}' added successfully!")
@@ -258,7 +252,7 @@ def remove_github_profile():
if 1 <= choice_idx <= len(profiles):
profile_name = profiles[choice_idx - 1]["name"]
confirm = input(f"Really remove profile '{profile_name}'? [y/N]: ").strip().lower()
if confirm in ['y', 'yes']:
if confirm in ["y", "yes"]:
config.remove_github_profile(profile_name)
else:
print("❌ Invalid choice.")
@@ -325,11 +319,10 @@ def api_keys_menu():
source = ""
if key:
import os
env_var = {
"anthropic": "ANTHROPIC_API_KEY",
"google": "GOOGLE_API_KEY",
"openai": "OPENAI_API_KEY"
}[provider]
env_var = {"anthropic": "ANTHROPIC_API_KEY", "google": "GOOGLE_API_KEY", "openai": "OPENAI_API_KEY"}[
provider
]
if os.getenv(env_var):
source = " (from environment)"
else:
@@ -347,7 +340,7 @@ def api_keys_menu():
provider_map = {
"1": ("anthropic", "https://console.anthropic.com/settings/keys"),
"2": ("google", "https://makersuite.google.com/app/apikey"),
"3": ("openai", "https://platform.openai.com/api-keys")
"3": ("openai", "https://platform.openai.com/api-keys"),
}
if choice in provider_map:
@@ -365,7 +358,7 @@ def set_api_key(provider: str, url: str):
print(f"Get your API key at: {url}\n")
open_now = input("Open in browser? [Y/n]: ").strip().lower()
if open_now not in ['n', 'no']:
if open_now not in ["n", "no"]:
try:
webbrowser.open(url)
print("✅ Opened in browser\n")
@@ -390,7 +383,7 @@ def rate_limit_settings():
current = config.config["rate_limit"]
print(f"Current settings:")
print("Current settings:")
print(f" • Default timeout: {current['default_timeout_minutes']} minutes")
print(f" • Auto-switch profiles: {current['auto_switch_profiles']}")
print(f" • Show countdown: {current['show_countdown']}\n")
@@ -404,14 +397,16 @@ def rate_limit_settings():
print("⚠️ Invalid input, keeping current value")
# Auto-switch
auto_switch_input = input(f"Auto-switch to other profiles? [y/n] ({current['auto_switch_profiles']}): ").strip().lower()
auto_switch_input = (
input(f"Auto-switch to other profiles? [y/n] ({current['auto_switch_profiles']}): ").strip().lower()
)
if auto_switch_input:
config.config["rate_limit"]["auto_switch_profiles"] = auto_switch_input in ['y', 'yes']
config.config["rate_limit"]["auto_switch_profiles"] = auto_switch_input in ["y", "yes"]
# Show countdown
countdown_input = input(f"Show countdown timer? [y/n] ({current['show_countdown']}): ").strip().lower()
if countdown_input:
config.config["rate_limit"]["show_countdown"] = countdown_input in ['y', 'yes']
config.config["rate_limit"]["show_countdown"] = countdown_input in ["y", "yes"]
config.save_config()
print("\n✅ Rate limit settings updated")
@@ -427,7 +422,7 @@ def resume_settings():
current = config.config["resume"]
print(f"Current settings:")
print("Current settings:")
print(f" • Auto-save interval: {current['auto_save_interval_seconds']} seconds")
print(f" • Keep progress for: {current['keep_progress_days']} days\n")
@@ -467,13 +462,12 @@ def test_connections():
print(" ⚠️ No GitHub profiles configured")
else:
import requests
for p in profiles:
token = config.config["github"]["profiles"][p["name"]]["token"]
try:
response = requests.get(
"https://api.github.com/rate_limit",
headers={"Authorization": f"token {token}"},
timeout=5
"https://api.github.com/rate_limit", headers={"Authorization": f"token {token}"}, timeout=5
)
if response.status_code == 200:
data = response.json()
@@ -518,34 +512,12 @@ def main():
"""Main entry point for config command."""
import argparse
parser = argparse.ArgumentParser(
description="Configure Skill Seekers settings"
)
parser.add_argument(
"--github",
action="store_true",
help="Go directly to GitHub token setup"
)
parser.add_argument(
"--api-keys",
action="store_true",
help="Go directly to API keys setup"
)
parser.add_argument(
"--show",
action="store_true",
help="Show current configuration and exit"
)
parser.add_argument(
"--test",
action="store_true",
help="Test connections and exit"
)
parser.add_argument(
"--welcome",
action="store_true",
help="Show welcome message"
)
parser = argparse.ArgumentParser(description="Configure Skill Seekers settings")
parser.add_argument("--github", action="store_true", help="Go directly to GitHub token setup")
parser.add_argument("--api-keys", action="store_true", help="Go directly to API keys setup")
parser.add_argument("--show", action="store_true", help="Show current configuration and exit")
parser.add_argument("--test", action="store_true", help="Test connections and exit")
parser.add_argument("--welcome", action="store_true", help="Show welcome message")
args = parser.parse_args()

View File

@@ -12,24 +12,24 @@ Provides dual-mode AI enhancement (API + LOCAL) for configuration analysis:
Similar to GuideEnhancer (C3.3) but for configuration files.
"""
import os
import sys
import json
import logging
import os
import subprocess
import sys
import tempfile
from pathlib import Path
from typing import Dict, List, Optional, Any
from dataclasses import dataclass, field
from pathlib import Path
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(levelname)s - %(message)s')
logging.basicConfig(level=logging.INFO, format="%(levelname)s - %(message)s")
logger = logging.getLogger(__name__)
# Optional anthropic import
ANTHROPIC_AVAILABLE = False
try:
import anthropic
ANTHROPIC_AVAILABLE = True
except ImportError:
pass
@@ -38,6 +38,7 @@ except ImportError:
@dataclass
class ConfigEnhancement:
"""AI-generated enhancement for a configuration"""
explanation: str = "" # What this setting does
best_practice: str = "" # Suggested improvement
security_concern: str = "" # Security issue (if any)
@@ -48,11 +49,12 @@ class ConfigEnhancement:
@dataclass
class EnhancedConfigFile:
    """Configuration file with AI enhancements"""

    file_path: str
    config_type: str
    purpose: str
    # Enhancement attached to the file as a whole.
    enhancement: ConfigEnhancement
    # Enhancements for individual settings; presumably keyed by setting key —
    # TODO confirm against the code that fills this mapping.
    setting_enhancements: dict[str, ConfigEnhancement] = field(default_factory=dict)
class ConfigEnhancer:
@@ -73,7 +75,7 @@ class ConfigEnhancer:
mode: Enhancement mode - "api", "local", or "auto" (default)
"""
self.mode = self._detect_mode(mode)
self.api_key = os.environ.get('ANTHROPIC_API_KEY')
self.api_key = os.environ.get("ANTHROPIC_API_KEY")
self.client = None
if self.mode == "api" and ANTHROPIC_AVAILABLE and self.api_key:
@@ -93,14 +95,14 @@ class ConfigEnhancer:
return requested_mode
# Auto-detect
if os.environ.get('ANTHROPIC_API_KEY') and ANTHROPIC_AVAILABLE:
if os.environ.get("ANTHROPIC_API_KEY") and ANTHROPIC_AVAILABLE:
logger.info("🤖 AI enhancement: API mode (Claude API detected)")
return "api"
else:
logger.info("🤖 AI enhancement: LOCAL mode (using Claude Code CLI)")
return "local"
def enhance_config_result(self, result: Dict) -> Dict:
def enhance_config_result(self, result: dict) -> dict:
"""
Enhance entire configuration extraction result.
@@ -121,7 +123,7 @@ class ConfigEnhancer:
# API MODE - Direct Claude API calls
# =========================================================================
def _enhance_via_api(self, result: Dict) -> Dict:
def _enhance_via_api(self, result: dict) -> dict:
"""Enhance configs using Claude API"""
if not self.client:
logger.error("❌ API mode requested but no API key available")
@@ -134,12 +136,7 @@ class ConfigEnhancer:
# Call Claude API
logger.info("📡 Calling Claude API for config analysis...")
response = self.client.messages.create(
model="claude-sonnet-4-20250514",
max_tokens=8000,
messages=[{
"role": "user",
"content": prompt
}]
model="claude-sonnet-4-20250514", max_tokens=8000, messages=[{"role": "user", "content": prompt}]
)
# Parse response
@@ -151,23 +148,23 @@ class ConfigEnhancer:
logger.error(f"❌ API enhancement failed: {e}")
return result
def _create_enhancement_prompt(self, result: Dict) -> str:
def _create_enhancement_prompt(self, result: dict) -> str:
"""Create prompt for Claude API"""
config_files = result.get('config_files', [])
config_files = result.get("config_files", [])
# Summarize configs for prompt
config_summary = []
for cf in config_files[:10]: # Limit to first 10 files
settings_summary = []
for setting in cf.get('settings', [])[:5]: # First 5 settings per file
for setting in cf.get("settings", [])[:5]: # First 5 settings per file
settings_summary.append(f" - {setting['key']}: {setting['value']} ({setting['value_type']})")
config_summary.append(f"""
File: {cf['relative_path']} ({cf['config_type']})
Purpose: {cf['purpose']}
File: {cf["relative_path"]} ({cf["config_type"]})
Purpose: {cf["purpose"]}
Settings:
{chr(10).join(settings_summary)}
Patterns: {', '.join(cf.get('patterns', []))}
Patterns: {", ".join(cf.get("patterns", []))}
""")
prompt = f"""Analyze these configuration files and provide AI-enhanced insights.
@@ -207,12 +204,13 @@ Focus on actionable insights that help developers understand and improve their c
"""
return prompt
def _parse_api_response(self, response_text: str, original_result: Dict) -> Dict:
def _parse_api_response(self, response_text: str, original_result: dict) -> dict:
"""Parse Claude API response and merge with original result"""
try:
# Extract JSON from response
import re
json_match = re.search(r'\{.*\}', response_text, re.DOTALL)
json_match = re.search(r"\{.*\}", response_text, re.DOTALL)
if not json_match:
logger.warning("⚠️ No JSON found in API response")
return original_result
@@ -220,14 +218,14 @@ Focus on actionable insights that help developers understand and improve their c
enhancements = json.loads(json_match.group())
# Merge enhancements into original result
original_result['ai_enhancements'] = enhancements
original_result["ai_enhancements"] = enhancements
# Add enhancement flags to config files
file_enhancements = {e['file_path']: e for e in enhancements.get('file_enhancements', [])}
for cf in original_result.get('config_files', []):
file_path = cf.get('relative_path', cf.get('file_path'))
file_enhancements = {e["file_path"]: e for e in enhancements.get("file_enhancements", [])}
for cf in original_result.get("config_files", []):
file_path = cf.get("relative_path", cf.get("file_path"))
if file_path in file_enhancements:
cf['ai_enhancement'] = file_enhancements[file_path]
cf["ai_enhancement"] = file_enhancements[file_path]
return original_result
@@ -239,11 +237,11 @@ Focus on actionable insights that help developers understand and improve their c
# LOCAL MODE - Claude Code CLI
# =========================================================================
def _enhance_via_local(self, result: Dict) -> Dict:
def _enhance_via_local(self, result: dict) -> dict:
"""Enhance configs using Claude Code CLI"""
try:
# Create temporary prompt file
with tempfile.NamedTemporaryFile(mode='w', suffix='.md', delete=False) as f:
with tempfile.NamedTemporaryFile(mode="w", suffix=".md", delete=False) as f:
prompt_file = Path(f.name)
f.write(self._create_local_prompt(result))
@@ -263,7 +261,7 @@ Focus on actionable insights that help developers understand and improve their c
if result_data:
# Merge LOCAL enhancements
original_result['ai_enhancements'] = result_data
original_result["ai_enhancements"] = result_data
logger.info("✅ LOCAL enhancement complete")
return original_result
else:
@@ -274,18 +272,18 @@ Focus on actionable insights that help developers understand and improve their c
logger.error(f"❌ LOCAL enhancement failed: {e}")
return result
def _create_local_prompt(self, result: Dict) -> str:
def _create_local_prompt(self, result: dict) -> str:
"""Create prompt file for Claude Code CLI"""
config_files = result.get('config_files', [])
config_files = result.get("config_files", [])
# Format config data for Claude
config_data = []
for cf in config_files[:10]:
config_data.append(f"""
### {cf['relative_path']} ({cf['config_type']})
- Purpose: {cf['purpose']}
- Patterns: {', '.join(cf.get('patterns', []))}
- Settings count: {len(cf.get('settings', []))}
### {cf["relative_path"]} ({cf["config_type"]})
- Purpose: {cf["purpose"]}
- Patterns: {", ".join(cf.get("patterns", []))}
- Settings count: {len(cf.get("settings", []))}
""")
prompt = f"""# Configuration Analysis Task
@@ -332,15 +330,15 @@ Focus on actionable insights:
"""
return prompt
def _run_claude_cli(self, prompt_file: Path, output_file: Path) -> Optional[Dict]:
def _run_claude_cli(self, prompt_file: Path, output_file: Path) -> dict | None:
"""Run Claude Code CLI and wait for completion"""
try:
# Run claude command
result = subprocess.run(
['claude', str(prompt_file)],
["claude", str(prompt_file)],
capture_output=True,
text=True,
timeout=300 # 5 minute timeout
timeout=300, # 5 minute timeout
)
if result.returncode != 0:
@@ -350,6 +348,7 @@ Focus on actionable insights:
# Try to find output file (Claude might save it with different name)
# Look for JSON files created in the last minute
import time
current_time = time.time()
potential_files = []
@@ -360,9 +359,9 @@ Focus on actionable insights:
# Try to load the most recent JSON file
for json_file in sorted(potential_files, key=lambda f: f.stat().st_mtime, reverse=True):
try:
with open(json_file, 'r') as f:
with open(json_file) as f:
data = json.load(f)
if 'file_enhancements' in data or 'overall_insights' in data:
if "file_enhancements" in data or "overall_insights" in data:
logger.info(f"✅ Found enhancement data in {json_file.name}")
return data
except:
@@ -383,29 +382,18 @@ def main():
"""Command-line interface for config enhancement"""
import argparse
parser = argparse.ArgumentParser(
description='AI-enhance configuration extraction results'
)
parser = argparse.ArgumentParser(description="AI-enhance configuration extraction results")
parser.add_argument("result_file", help="Path to config extraction JSON result file")
parser.add_argument(
'result_file',
help='Path to config extraction JSON result file'
)
parser.add_argument(
'--mode',
choices=['auto', 'api', 'local'],
default='auto',
help='Enhancement mode (default: auto)'
)
parser.add_argument(
'--output',
help='Output file for enhanced results (default: <input>_enhanced.json)'
"--mode", choices=["auto", "api", "local"], default="auto", help="Enhancement mode (default: auto)"
)
parser.add_argument("--output", help="Output file for enhanced results (default: <input>_enhanced.json)")
args = parser.parse_args()
# Load result file
try:
with open(args.result_file, 'r') as f:
with open(args.result_file) as f:
result = json.load(f)
except Exception as e:
logger.error(f"❌ Failed to load result file: {e}")
@@ -416,9 +404,9 @@ def main():
enhanced_result = enhancer.enhance_config_result(result)
# Save
output_file = args.output or args.result_file.replace('.json', '_enhanced.json')
output_file = args.output or args.result_file.replace(".json", "_enhanced.json")
try:
with open(output_file, 'w') as f:
with open(output_file, "w") as f:
json.dump(enhanced_result, f, indent=2)
logger.info(f"✅ Enhanced results saved to: {output_file}")
except Exception as e:
@@ -428,5 +416,5 @@ def main():
return 0
if __name__ == '__main__':
if __name__ == "__main__":
sys.exit(main())

View File

@@ -9,19 +9,20 @@ This is different from C3.2 which extracts config examples from test code.
C3.4 focuses on documenting the actual project configuration.
"""
import ast
import json
import logging
import re
from dataclasses import dataclass, field
from pathlib import Path
from typing import Dict, List, Optional, Any, Set, Literal
import ast
from typing import Any, Literal
logger = logging.getLogger(__name__)
# Optional dependencies
try:
import yaml
YAML_AVAILABLE = True
except ImportError:
YAML_AVAILABLE = False
@@ -29,10 +30,12 @@ except ImportError:
try:
import tomli
TOML_AVAILABLE = True
except ImportError:
try:
import toml
TOML_AVAILABLE = True
except ImportError:
TOML_AVAILABLE = False
@@ -42,68 +45,71 @@ except ImportError:
@dataclass
class ConfigSetting:
"""Individual configuration setting"""
key: str
value: Any
value_type: str # 'string', 'integer', 'boolean', 'array', 'object', 'null'
default_value: Optional[Any] = None
default_value: Any | None = None
required: bool = False
env_var: Optional[str] = None
env_var: str | None = None
description: str = ""
validation: Dict[str, Any] = field(default_factory=dict)
nested_path: List[str] = field(default_factory=list) # For nested configs
validation: dict[str, Any] = field(default_factory=dict)
nested_path: list[str] = field(default_factory=list) # For nested configs
@dataclass
class ConfigFile:
    """Represents a configuration file"""

    file_path: str
    relative_path: str
    config_type: Literal["json", "yaml", "toml", "env", "ini", "python", "javascript", "dockerfile", "docker-compose"]
    purpose: str  # Inferred purpose: database, api, logging, etc.
    # Mutable fields use default_factory so instances never share a list.
    settings: list[ConfigSetting] = field(default_factory=list)
    patterns: list[str] = field(default_factory=list)
    raw_content: str | None = None
    parse_errors: list[str] = field(default_factory=list)
@dataclass
class ConfigExtractionResult:
"""Result of config extraction"""
config_files: List[ConfigFile] = field(default_factory=list)
config_files: list[ConfigFile] = field(default_factory=list)
total_files: int = 0
total_settings: int = 0
detected_patterns: Dict[str, List[str]] = field(default_factory=dict) # pattern -> files
errors: List[str] = field(default_factory=list)
detected_patterns: dict[str, list[str]] = field(default_factory=dict) # pattern -> files
errors: list[str] = field(default_factory=list)
def to_dict(self) -> dict:
    """Serialize this result into plain dicts and lists for JSON output.

    Returns:
        A dictionary with the file and setting totals, the detected
        patterns, one entry per config file and the collected errors.
    """
    serialized_files = []
    for config_file in self.config_files:
        # Only key, value, type, env_var and description are exported per setting.
        serialized_settings = []
        for setting in config_file.settings:
            serialized_settings.append(
                {
                    "key": setting.key,
                    "value": setting.value,
                    "type": setting.value_type,
                    "env_var": setting.env_var,
                    "description": setting.description,
                }
            )

        serialized_files.append(
            {
                "file_path": config_file.file_path,
                "relative_path": config_file.relative_path,
                "type": config_file.config_type,
                "purpose": config_file.purpose,
                "patterns": config_file.patterns,
                "settings_count": len(config_file.settings),
                "settings": serialized_settings,
                "parse_errors": config_file.parse_errors,
            }
        )

    return {
        "total_files": self.total_files,
        "total_settings": self.total_settings,
        "detected_patterns": self.detected_patterns,
        "config_files": serialized_files,
        "errors": self.errors,
    }
def to_markdown(self) -> str:
@@ -115,11 +121,11 @@ class ConfigExtractionResult:
# Handle both dict and list formats for detected_patterns
if self.detected_patterns:
if isinstance(self.detected_patterns, dict):
patterns_str = ', '.join(self.detected_patterns.keys())
patterns_str = ", ".join(self.detected_patterns.keys())
else:
patterns_str = ', '.join(self.detected_patterns)
patterns_str = ", ".join(self.detected_patterns)
else:
patterns_str = 'None'
patterns_str = "None"
md += f"**Detected Patterns:** {patterns_str}\n\n"
if self.config_files:
@@ -148,52 +154,64 @@ class ConfigFileDetector:
# Config file patterns by type
CONFIG_PATTERNS = {
'json': {
'patterns': ['*.json', 'package.json', 'tsconfig.json', 'jsconfig.json'],
'names': ['config.json', 'settings.json', 'app.json', '.eslintrc.json', '.prettierrc.json'],
"json": {
"patterns": ["*.json", "package.json", "tsconfig.json", "jsconfig.json"],
"names": ["config.json", "settings.json", "app.json", ".eslintrc.json", ".prettierrc.json"],
},
'yaml': {
'patterns': ['*.yaml', '*.yml'],
'names': ['config.yml', 'settings.yml', '.travis.yml', '.gitlab-ci.yml', 'docker-compose.yml'],
"yaml": {
"patterns": ["*.yaml", "*.yml"],
"names": ["config.yml", "settings.yml", ".travis.yml", ".gitlab-ci.yml", "docker-compose.yml"],
},
'toml': {
'patterns': ['*.toml'],
'names': ['pyproject.toml', 'Cargo.toml', 'config.toml'],
"toml": {
"patterns": ["*.toml"],
"names": ["pyproject.toml", "Cargo.toml", "config.toml"],
},
'env': {
'patterns': ['.env*', '*.env'],
'names': ['.env', '.env.example', '.env.local', '.env.production'],
"env": {
"patterns": [".env*", "*.env"],
"names": [".env", ".env.example", ".env.local", ".env.production"],
},
'ini': {
'patterns': ['*.ini', '*.cfg'],
'names': ['config.ini', 'setup.cfg', 'tox.ini'],
"ini": {
"patterns": ["*.ini", "*.cfg"],
"names": ["config.ini", "setup.cfg", "tox.ini"],
},
'python': {
'patterns': [],
'names': ['settings.py', 'config.py', 'configuration.py', 'constants.py'],
"python": {
"patterns": [],
"names": ["settings.py", "config.py", "configuration.py", "constants.py"],
},
'javascript': {
'patterns': ['*.config.js', '*.config.ts'],
'names': ['config.js', 'next.config.js', 'vue.config.js', 'webpack.config.js'],
"javascript": {
"patterns": ["*.config.js", "*.config.ts"],
"names": ["config.js", "next.config.js", "vue.config.js", "webpack.config.js"],
},
'dockerfile': {
'patterns': ['Dockerfile*'],
'names': ['Dockerfile', 'Dockerfile.dev', 'Dockerfile.prod'],
"dockerfile": {
"patterns": ["Dockerfile*"],
"names": ["Dockerfile", "Dockerfile.dev", "Dockerfile.prod"],
},
'docker-compose': {
'patterns': ['docker-compose*.yml', 'docker-compose*.yaml'],
'names': ['docker-compose.yml', 'docker-compose.yaml'],
"docker-compose": {
"patterns": ["docker-compose*.yml", "docker-compose*.yaml"],
"names": ["docker-compose.yml", "docker-compose.yaml"],
},
}
# Directories to skip
SKIP_DIRS = {
'node_modules', 'venv', 'env', '.venv', '__pycache__', '.git',
'build', 'dist', '.tox', '.mypy_cache', '.pytest_cache',
'htmlcov', 'coverage', '.eggs', '*.egg-info'
"node_modules",
"venv",
"env",
".venv",
"__pycache__",
".git",
"build",
"dist",
".tox",
".mypy_cache",
".pytest_cache",
"htmlcov",
"coverage",
".eggs",
"*.egg-info",
}
def find_config_files(self, directory: Path, max_files: int = 100) -> List[ConfigFile]:
def find_config_files(self, directory: Path, max_files: int = 100) -> list[ConfigFile]:
"""
Find all configuration files in directory.
@@ -219,7 +237,7 @@ class ConfigFileDetector:
file_path=str(file_path),
relative_path=relative_path,
config_type=config_type,
purpose=self._infer_purpose(file_path, config_type)
purpose=self._infer_purpose(file_path, config_type),
)
config_files.append(config_file)
found_count += 1
@@ -230,7 +248,7 @@ class ConfigFileDetector:
def _walk_directory(self, directory: Path):
"""Walk directory, skipping excluded directories"""
for item in directory.rglob('*'):
for item in directory.rglob("*"):
# Skip directories
if item.is_dir():
continue
@@ -241,18 +259,18 @@ class ConfigFileDetector:
yield item
def _detect_config_type(self, file_path: Path) -> Optional[str]:
def _detect_config_type(self, file_path: Path) -> str | None:
"""Detect configuration file type"""
filename = file_path.name.lower()
# Check each config type
for config_type, patterns in self.CONFIG_PATTERNS.items():
# Check exact name matches
if filename in patterns['names']:
if filename in patterns["names"]:
return config_type
# Check pattern matches
for pattern in patterns['patterns']:
for pattern in patterns["patterns"]:
if file_path.match(pattern):
return config_type
@@ -264,43 +282,43 @@ class ConfigFileDetector:
filename = file_path.name.lower()
# Database configs
if any(word in path_lower for word in ['database', 'db', 'postgres', 'mysql', 'mongo']):
return 'database_configuration'
if any(word in path_lower for word in ["database", "db", "postgres", "mysql", "mongo"]):
return "database_configuration"
# API configs
if any(word in path_lower for word in ['api', 'rest', 'graphql', 'endpoint']):
return 'api_configuration'
if any(word in path_lower for word in ["api", "rest", "graphql", "endpoint"]):
return "api_configuration"
# Logging configs
if any(word in path_lower for word in ['log', 'logger', 'logging']):
return 'logging_configuration'
if any(word in path_lower for word in ["log", "logger", "logging"]):
return "logging_configuration"
# Docker configs
if 'docker' in filename:
return 'docker_configuration'
if "docker" in filename:
return "docker_configuration"
# CI/CD configs
if any(word in path_lower for word in ['.travis', '.gitlab', '.github', 'ci', 'cd']):
return 'ci_cd_configuration'
if any(word in path_lower for word in [".travis", ".gitlab", ".github", "ci", "cd"]):
return "ci_cd_configuration"
# Package configs
if filename in ['package.json', 'pyproject.toml', 'cargo.toml']:
return 'package_configuration'
if filename in ["package.json", "pyproject.toml", "cargo.toml"]:
return "package_configuration"
# TypeScript/JavaScript configs
if filename in ['tsconfig.json', 'jsconfig.json']:
return 'typescript_configuration'
if filename in ["tsconfig.json", "jsconfig.json"]:
return "typescript_configuration"
# Framework configs
if 'next.config' in filename or 'vue.config' in filename or 'webpack.config' in filename:
return 'framework_configuration'
if "next.config" in filename or "vue.config" in filename or "webpack.config" in filename:
return "framework_configuration"
# Environment configs
if '.env' in filename:
return 'environment_configuration'
if ".env" in filename:
return "environment_configuration"
# Default
return 'general_configuration'
return "general_configuration"
class ConfigParser:
@@ -318,27 +336,27 @@ class ConfigParser:
"""
try:
# Read file content
with open(config_file.file_path, 'r', encoding='utf-8') as f:
with open(config_file.file_path, encoding="utf-8") as f:
config_file.raw_content = f.read()
# Parse based on type
if config_file.config_type == 'json':
if config_file.config_type == "json":
self._parse_json(config_file)
elif config_file.config_type == 'yaml':
elif config_file.config_type == "yaml":
self._parse_yaml(config_file)
elif config_file.config_type == 'toml':
elif config_file.config_type == "toml":
self._parse_toml(config_file)
elif config_file.config_type == 'env':
elif config_file.config_type == "env":
self._parse_env(config_file)
elif config_file.config_type == 'ini':
elif config_file.config_type == "ini":
self._parse_ini(config_file)
elif config_file.config_type == 'python':
elif config_file.config_type == "python":
self._parse_python_config(config_file)
elif config_file.config_type == 'javascript':
elif config_file.config_type == "javascript":
self._parse_javascript_config(config_file)
elif config_file.config_type == 'dockerfile':
elif config_file.config_type == "dockerfile":
self._parse_dockerfile(config_file)
elif config_file.config_type == 'docker-compose':
elif config_file.config_type == "docker-compose":
self._parse_yaml(config_file) # Docker compose is YAML
except Exception as e:
@@ -376,10 +394,11 @@ class ConfigParser:
return
try:
if 'tomli' in globals():
if "tomli" in globals():
data = tomli.loads(config_file.raw_content)
else:
import toml
data = toml.loads(config_file.raw_content)
self._extract_settings_from_dict(data, config_file)
@@ -388,17 +407,17 @@ class ConfigParser:
def _parse_env(self, config_file: ConfigFile):
"""Parse .env file"""
lines = config_file.raw_content.split('\n')
lines = config_file.raw_content.split("\n")
for line_num, line in enumerate(lines, 1):
line = line.strip()
# Skip comments and empty lines
if not line or line.startswith('#'):
if not line or line.startswith("#"):
continue
# Parse KEY=VALUE
match = re.match(r'([A-Z_][A-Z0-9_]*)\s*=\s*(.+)', line)
match = re.match(r"([A-Z_][A-Z0-9_]*)\s*=\s*(.+)", line)
if match:
key, value = match.groups()
value = value.strip().strip('"').strip("'")
@@ -408,7 +427,7 @@ class ConfigParser:
value=value,
value_type=self._infer_type(value),
env_var=key,
description=self._extract_env_description(lines, line_num - 1)
description=self._extract_env_description(lines, line_num - 1),
)
config_file.settings.append(setting)
@@ -426,7 +445,7 @@ class ConfigParser:
key=f"{section}.{key}",
value=value,
value_type=self._infer_type(value),
nested_path=[section, key]
nested_path=[section, key],
)
config_file.settings.append(setting)
except Exception as e:
@@ -444,7 +463,7 @@ class ConfigParser:
key = node.targets[0].id
# Skip private variables
if key.startswith('_'):
if key.startswith("_"):
continue
# Extract value
@@ -454,7 +473,7 @@ class ConfigParser:
key=key,
value=value,
value_type=self._infer_type(value),
description=self._extract_python_docstring(node)
description=self._extract_python_docstring(node),
)
config_file.settings.append(setting)
except (ValueError, TypeError):
@@ -469,8 +488,8 @@ class ConfigParser:
# Simple regex-based extraction for common patterns
patterns = [
r'(?:const|let|var)\s+(\w+)\s*[:=]\s*(["\'])(.*?)\2', # String values
r'(?:const|let|var)\s+(\w+)\s*[:=]\s*(\d+)', # Number values
r'(?:const|let|var)\s+(\w+)\s*[:=]\s*(true|false)', # Boolean values
r"(?:const|let|var)\s+(\w+)\s*[:=]\s*(\d+)", # Number values
r"(?:const|let|var)\s+(\w+)\s*[:=]\s*(true|false)", # Boolean values
]
for pattern in patterns:
@@ -479,47 +498,36 @@ class ConfigParser:
key = match.group(1)
value = match.group(3) if len(match.groups()) > 2 else match.group(2)
setting = ConfigSetting(
key=key,
value=value,
value_type=self._infer_type(value)
)
setting = ConfigSetting(key=key, value=value, value_type=self._infer_type(value))
config_file.settings.append(setting)
def _parse_dockerfile(self, config_file: ConfigFile):
    """Extract ENV and ARG declarations from a Dockerfile.

    One ``ConfigSetting`` is appended to ``config_file.settings`` for each
    declaration found. Supported forms:

    * ``ENV KEY=value`` - everything after the first ``=`` is the value
    * ``ENV KEY value`` - legacy space-separated form
    * ``ARG NAME`` / ``ARG NAME=default`` - value is ``None`` without a default

    Instruction keywords are matched case-insensitively and may be separated
    from their arguments by any run of whitespace.

    Args:
        config_file: File whose ``raw_content`` has already been read.
    """
    for raw_line in config_file.raw_content.split("\n"):
        # Split "<INSTRUCTION> <arguments>" on the first run of whitespace.
        pieces = raw_line.strip().split(None, 1)
        if len(pieces) != 2:
            continue  # blank line, or an instruction without arguments

        instruction = pieces[0].upper()
        arguments = pieces[1]

        # Extract ENV variables
        if instruction == "ENV":
            first_word = arguments.split(None, 1)[0]
            if "=" in first_word:
                # NOTE: several KEY=VALUE pairs on one line are still stored as a
                # single setting; splitting them needs quote-aware tokenizing.
                key, value = arguments.split("=", 1)
            else:
                # Legacy form: the first word is the key, the rest is the value.
                legacy = arguments.split(None, 1)
                if len(legacy) != 2:
                    continue  # "ENV KEY" carries no value
                key, value = legacy

            name = key.strip()
            config_file.settings.append(
                ConfigSetting(key=name, value=value.strip(), value_type="string", env_var=name)
            )

        # Extract ARG variables
        elif instruction == "ARG":
            parts = arguments.split("=", 1)
            default = parts[1].strip() if len(parts) == 2 else None
            config_file.settings.append(ConfigSetting(key=parts[0].strip(), value=default, value_type="string"))
def _extract_settings_from_dict(self, data: Dict, config_file: ConfigFile, parent_path: List[str] = None):
def _extract_settings_from_dict(self, data: dict, config_file: ConfigFile, parent_path: list[str] = None):
"""Recursively extract settings from dictionary"""
if parent_path is None:
parent_path = []
@@ -530,35 +538,35 @@ class ConfigParser:
self._extract_settings_from_dict(value, config_file, parent_path + [key])
else:
setting = ConfigSetting(
key='.'.join(parent_path + [key]) if parent_path else key,
key=".".join(parent_path + [key]) if parent_path else key,
value=value,
value_type=self._infer_type(value),
nested_path=parent_path + [key]
nested_path=parent_path + [key],
)
config_file.settings.append(setting)
def _infer_type(self, value: Any) -> str:
"""Infer value type"""
if value is None:
return 'null'
return "null"
elif isinstance(value, bool):
return 'boolean'
return "boolean"
elif isinstance(value, int):
return 'integer'
return "integer"
elif isinstance(value, float):
return 'number'
return "number"
elif isinstance(value, (list, tuple)):
return 'array'
return "array"
elif isinstance(value, dict):
return 'object'
return "object"
else:
return 'string'
return "string"
def _extract_env_description(self, lines: List[str], line_index: int) -> str:
def _extract_env_description(self, lines: list[str], line_index: int) -> str:
"""Extract description from comment above env variable"""
if line_index > 0:
prev_line = lines[line_index - 1].strip()
if prev_line.startswith('#'):
if prev_line.startswith("#"):
return prev_line[1:].strip()
return ""
@@ -573,37 +581,37 @@ class ConfigPatternDetector:
# Known configuration patterns
KNOWN_PATTERNS = {
'database_config': {
'keys': ['host', 'port', 'database', 'user', 'username', 'password', 'db_name'],
'min_match': 3,
"database_config": {
"keys": ["host", "port", "database", "user", "username", "password", "db_name"],
"min_match": 3,
},
'api_config': {
'keys': ['base_url', 'api_key', 'api_secret', 'timeout', 'retry', 'endpoint'],
'min_match': 2,
"api_config": {
"keys": ["base_url", "api_key", "api_secret", "timeout", "retry", "endpoint"],
"min_match": 2,
},
'logging_config': {
'keys': ['level', 'format', 'handler', 'file', 'console', 'log_level'],
'min_match': 2,
"logging_config": {
"keys": ["level", "format", "handler", "file", "console", "log_level"],
"min_match": 2,
},
'cache_config': {
'keys': ['backend', 'ttl', 'timeout', 'max_size', 'redis', 'memcached'],
'min_match': 2,
"cache_config": {
"keys": ["backend", "ttl", "timeout", "max_size", "redis", "memcached"],
"min_match": 2,
},
'email_config': {
'keys': ['smtp_host', 'smtp_port', 'email', 'from_email', 'mail_server'],
'min_match': 2,
"email_config": {
"keys": ["smtp_host", "smtp_port", "email", "from_email", "mail_server"],
"min_match": 2,
},
'auth_config': {
'keys': ['secret_key', 'jwt_secret', 'token', 'oauth', 'authentication'],
'min_match': 1,
"auth_config": {
"keys": ["secret_key", "jwt_secret", "token", "oauth", "authentication"],
"min_match": 1,
},
'server_config': {
'keys': ['host', 'port', 'bind', 'workers', 'threads'],
'min_match': 2,
"server_config": {
"keys": ["host", "port", "bind", "workers", "threads"],
"min_match": 2,
},
}
def detect_patterns(self, config_file: ConfigFile) -> List[str]:
def detect_patterns(self, config_file: ConfigFile) -> list[str]:
"""
Detect which patterns this config file matches.
@@ -620,8 +628,8 @@ class ConfigPatternDetector:
# Check against each known pattern
for pattern_name, pattern_def in self.KNOWN_PATTERNS.items():
pattern_keys = {k.lower() for k in pattern_def['keys']}
min_match = pattern_def['min_match']
pattern_keys = {k.lower() for k in pattern_def["keys"]}
min_match = pattern_def["min_match"]
# Count matches
matches = len(setting_keys & pattern_keys)
@@ -641,11 +649,7 @@ class ConfigExtractor:
self.parser = ConfigParser()
self.pattern_detector = ConfigPatternDetector()
def extract_from_directory(
self,
directory: Path,
max_files: int = 100
) -> ConfigExtractionResult:
def extract_from_directory(self, directory: Path, max_files: int = 100) -> ConfigExtractionResult:
"""
Extract configuration patterns from directory.
@@ -696,35 +700,35 @@ class ConfigExtractor:
return result
def to_dict(self, result: ConfigExtractionResult) -> dict:
    """Convert an extraction result to a dictionary for JSON output.

    The result object already knows how to serialize itself, so this method
    delegates to it. The previous body repeated the same nested structure
    line for line, which left two copies that could drift apart.

    Args:
        result: The extraction result to serialize.

    Returns:
        The dictionary produced by ``result.to_dict()``.
    """
    return result.to_dict()
@@ -732,19 +736,29 @@ def main():
"""CLI entry point for config extraction"""
import argparse
parser = argparse.ArgumentParser(description="Extract configuration patterns from codebase with optional AI enhancement")
parser.add_argument('directory', type=Path, help='Directory to analyze')
parser.add_argument('--output', '-o', type=Path, help='Output JSON file')
parser.add_argument('--max-files', type=int, default=100, help='Maximum config files to process')
parser.add_argument('--enhance', action='store_true', help='Enhance with AI analysis (API mode, requires ANTHROPIC_API_KEY)')
parser.add_argument('--enhance-local', action='store_true', help='Enhance with AI analysis (LOCAL mode, uses Claude Code CLI)')
parser.add_argument('--ai-mode', choices=['auto', 'api', 'local', 'none'], default='none',
help='AI enhancement mode: auto (detect), api (Claude API), local (Claude Code CLI), none (disable)')
parser = argparse.ArgumentParser(
description="Extract configuration patterns from codebase with optional AI enhancement"
)
parser.add_argument("directory", type=Path, help="Directory to analyze")
parser.add_argument("--output", "-o", type=Path, help="Output JSON file")
parser.add_argument("--max-files", type=int, default=100, help="Maximum config files to process")
parser.add_argument(
"--enhance", action="store_true", help="Enhance with AI analysis (API mode, requires ANTHROPIC_API_KEY)"
)
parser.add_argument(
"--enhance-local", action="store_true", help="Enhance with AI analysis (LOCAL mode, uses Claude Code CLI)"
)
parser.add_argument(
"--ai-mode",
choices=["auto", "api", "local", "none"],
default="none",
help="AI enhancement mode: auto (detect), api (Claude API), local (Claude Code CLI), none (disable)",
)
args = parser.parse_args()
# Setup logging
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")
# Extract
extractor = ConfigExtractor()
@@ -756,13 +770,14 @@ def main():
# AI Enhancement (if requested)
enhance_mode = args.ai_mode
if args.enhance:
enhance_mode = 'api'
enhance_mode = "api"
elif args.enhance_local:
enhance_mode = 'local'
enhance_mode = "local"
if enhance_mode != 'none':
if enhance_mode != "none":
try:
from skill_seekers.cli.config_enhancer import ConfigEnhancer
logger.info(f"🤖 Starting AI enhancement (mode: {enhance_mode})...")
enhancer = ConfigEnhancer(mode=enhance_mode)
output_dict = enhancer.enhance_config_result(output_dict)
@@ -774,27 +789,27 @@ def main():
# Output
if args.output:
with open(args.output, 'w') as f:
with open(args.output, "w") as f:
json.dump(output_dict, f, indent=2)
print(f"✅ Saved config extraction results to: {args.output}")
else:
print(json.dumps(output_dict, indent=2))
# Summary
print(f"\n📊 Summary:")
print("\n📊 Summary:")
print(f" Config files found: {result.total_files}")
print(f" Total settings: {result.total_settings}")
print(f" Detected patterns: {', '.join(result.detected_patterns.keys()) or 'None'}")
if 'ai_enhancements' in output_dict:
if "ai_enhancements" in output_dict:
print(f" ✨ AI enhancements: Yes ({enhance_mode} mode)")
insights = output_dict['ai_enhancements'].get('overall_insights', {})
if insights.get('security_issues_found'):
insights = output_dict["ai_enhancements"].get("overall_insights", {})
if insights.get("security_issues_found"):
print(f" 🔐 Security issues found: {insights['security_issues_found']}")
if result.errors:
print(f"\n⚠️ Errors: {len(result.errors)}")
if __name__ == '__main__':
if __name__ == "__main__":
main()

View File

@@ -8,10 +8,10 @@ Provides secure storage with file permissions and auto-detection capabilities.
import json
import os
import stat
import sys
from datetime import datetime, timedelta
from pathlib import Path
from typing import Dict, List, Optional, Any
import sys
from typing import Any
class ConfigManager:
@@ -26,28 +26,11 @@ class ConfigManager:
# Default configuration
DEFAULT_CONFIG = {
"version": "1.0",
"github": {
"default_profile": None,
"profiles": {}
},
"rate_limit": {
"default_timeout_minutes": 30,
"auto_switch_profiles": True,
"show_countdown": True
},
"resume": {
"auto_save_interval_seconds": 60,
"keep_progress_days": 7
},
"api_keys": {
"anthropic": None,
"google": None,
"openai": None
},
"first_run": {
"completed": False,
"version": "2.7.0"
}
"github": {"default_profile": None, "profiles": {}},
"rate_limit": {"default_timeout_minutes": 30, "auto_switch_profiles": True, "show_countdown": True},
"resume": {"auto_save_interval_seconds": 60, "keep_progress_days": 7},
"api_keys": {"anthropic": None, "google": None, "openai": None},
"first_run": {"completed": False, "version": "2.7.0"},
}
def __init__(self):
@@ -65,25 +48,26 @@ class ConfigManager:
# Set directory permissions to 700 (rwx------)
directory.chmod(stat.S_IRWXU)
def _load_config(self) -> Dict[str, Any]:
def _load_config(self) -> dict[str, Any]:
"""Load configuration from file or create default."""
if not self.config_file.exists():
return self.DEFAULT_CONFIG.copy()
try:
with open(self.config_file, 'r') as f:
with open(self.config_file) as f:
config = json.load(f)
# Merge with defaults for any missing keys
config = self._merge_with_defaults(config)
return config
except (json.JSONDecodeError, IOError) as e:
except (OSError, json.JSONDecodeError) as e:
print(f"⚠️ Warning: Could not load config file: {e}")
print(f" Using default configuration.")
print(" Using default configuration.")
return self.DEFAULT_CONFIG.copy()
def _merge_with_defaults(self, config: Dict[str, Any]) -> Dict[str, Any]:
def _merge_with_defaults(self, config: dict[str, Any]) -> dict[str, Any]:
"""Merge loaded config with defaults to ensure all keys exist."""
def deep_merge(default: dict, custom: dict) -> dict:
result = default.copy()
for key, value in custom.items():
@@ -98,13 +82,13 @@ class ConfigManager:
def save_config(self):
    """Save configuration to file with secure permissions.

    The configuration stores GitHub tokens and API keys, so the file is
    created with mode 600 from the start. Previously it was created with
    the default mode and restricted only after the data had been written.

    On an ``OSError`` an error message is printed and the process exits
    with status 1.
    """
    owner_only = stat.S_IRUSR | stat.S_IWUSR
    try:
        fd = os.open(self.config_file, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, owner_only)
        with os.fdopen(fd, "w") as f:
            json.dump(self.config, f, indent=2)

        # The mode given to os.open only applies when the file is created;
        # set permissions to 600 (rw-------) for pre-existing files as well.
        self.config_file.chmod(owner_only)

    except OSError as e:
        print(f"❌ Error saving config: {e}")
        sys.exit(1)
@@ -117,7 +101,7 @@ class ConfigManager:
description: str = "",
rate_limit_strategy: str = "prompt",
timeout_minutes: int = 30,
set_as_default: bool = False
set_as_default: bool = False,
):
"""Add a new GitHub profile."""
if not name:
@@ -131,7 +115,7 @@ class ConfigManager:
"description": description,
"rate_limit_strategy": rate_limit_strategy,
"timeout_minutes": timeout_minutes,
"added_at": datetime.now().isoformat()
"added_at": datetime.now().isoformat(),
}
self.config["github"]["profiles"][name] = profile
@@ -142,7 +126,7 @@ class ConfigManager:
self.save_config()
print(f"✅ Added GitHub profile: {name}")
if set_as_default:
print(f"✅ Set as default profile")
print("✅ Set as default profile")
def remove_github_profile(self, name: str):
"""Remove a GitHub profile."""
@@ -159,7 +143,7 @@ class ConfigManager:
self.save_config()
print(f"✅ Removed GitHub profile: {name}")
def list_github_profiles(self) -> List[Dict[str, Any]]:
def list_github_profiles(self) -> list[dict[str, Any]]:
"""List all GitHub profiles."""
profiles = []
default = self.config["github"]["default_profile"]
@@ -171,17 +155,13 @@ class ConfigManager:
"strategy": data.get("rate_limit_strategy", "prompt"),
"timeout": data.get("timeout_minutes", 30),
"is_default": name == default,
"added_at": data.get("added_at", "Unknown")
"added_at": data.get("added_at", "Unknown"),
}
profiles.append(profile_info)
return profiles
def get_github_token(
self,
profile_name: Optional[str] = None,
repo_url: Optional[str] = None
) -> Optional[str]:
def get_github_token(self, profile_name: str | None = None, repo_url: str | None = None) -> str | None:
"""
Get GitHub token with smart fallback chain.
@@ -214,14 +194,14 @@ class ConfigManager:
# 4. No token available
return None
def get_profile_for_token(self, token: str) -> str | None:
    """Get profile name for a given token.

    Args:
        token: The GitHub token to look up.

    Returns:
        The name of the first profile whose stored token equals ``token``,
        or ``None`` when no profile matches.
    """
    for name, profile in self.config["github"]["profiles"].items():
        # The profiles come from a JSON file that can be edited by hand, so
        # skip entries without a token instead of raising KeyError.
        if isinstance(profile, dict) and "token" in profile and profile["token"] == token:
            return name
    return None
def get_next_profile(self, current_token: str) -> Optional[tuple]:
def get_next_profile(self, current_token: str) -> tuple | None:
"""
Get next available profile for rate limit switching.
@@ -248,7 +228,7 @@ class ConfigManager:
name, profile = profiles[next_idx]
return (name, profile["token"])
def get_rate_limit_strategy(self, token: Optional[str] = None) -> str:
def get_rate_limit_strategy(self, token: str | None = None) -> str:
"""Get rate limit strategy for a token (or default)."""
if token:
profile_name = self.get_profile_for_token(token)
@@ -259,7 +239,7 @@ class ConfigManager:
# Default strategy
return "prompt"
def get_timeout_minutes(self, token: Optional[str] = None) -> int:
def get_timeout_minutes(self, token: str | None = None) -> int:
"""Get timeout minutes for a token (or default)."""
if token:
profile_name = self.get_profile_for_token(token)
@@ -280,7 +260,7 @@ class ConfigManager:
self.save_config()
print(f"✅ Set {provider.capitalize()} API key")
def get_api_key(self, provider: str) -> Optional[str]:
def get_api_key(self, provider: str) -> str | None:
"""
Get API key with environment variable fallback.
@@ -289,11 +269,7 @@ class ConfigManager:
2. Config file
"""
# Check environment first
env_map = {
"anthropic": "ANTHROPIC_API_KEY",
"google": "GOOGLE_API_KEY",
"openai": "OPENAI_API_KEY"
}
env_map = {"anthropic": "ANTHROPIC_API_KEY", "google": "GOOGLE_API_KEY", "openai": "OPENAI_API_KEY"}
env_var = env_map.get(provider)
if env_var:
@@ -306,19 +282,19 @@ class ConfigManager:
# Progress Management
def save_progress(self, job_id: str, progress_data: dict[str, Any]):
    """Save progress for a job.

    The data is written as JSON to ``<progress_dir>/<job_id>.json``. The
    file is created with mode 600 from the start; previously it was created
    with the default mode and restricted only after the data was written.

    Args:
        job_id: Identifier used as the progress file's base name.
        progress_data: Progress payload. It is updated in place with a
            ``last_updated`` ISO timestamp before being written.
    """
    progress_file = self.progress_dir / f"{job_id}.json"

    progress_data["last_updated"] = datetime.now().isoformat()

    owner_only = stat.S_IRUSR | stat.S_IWUSR
    fd = os.open(progress_file, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, owner_only)
    with os.fdopen(fd, "w") as f:
        json.dump(progress_data, f, indent=2)

    # The mode given to os.open only applies when the file is created;
    # set permissions to 600 for pre-existing files as well.
    progress_file.chmod(owner_only)
def load_progress(self, job_id: str) -> Optional[Dict[str, Any]]:
def load_progress(self, job_id: str) -> dict[str, Any] | None:
"""Load progress for a job."""
progress_file = self.progress_dir / f"{job_id}.json"
@@ -326,29 +302,31 @@ class ConfigManager:
return None
try:
with open(progress_file, 'r') as f:
with open(progress_file) as f:
return json.load(f)
except (json.JSONDecodeError, IOError):
except (OSError, json.JSONDecodeError):
return None
def list_resumable_jobs(self) -> List[Dict[str, Any]]:
def list_resumable_jobs(self) -> list[dict[str, Any]]:
"""List all resumable jobs."""
jobs = []
for progress_file in self.progress_dir.glob("*.json"):
try:
with open(progress_file, 'r') as f:
with open(progress_file) as f:
data = json.load(f)
if data.get("can_resume", False):
jobs.append({
"job_id": data.get("job_id", progress_file.stem),
"started_at": data.get("started_at"),
"command": data.get("command"),
"progress": data.get("progress", {}),
"last_updated": data.get("last_updated")
})
except (json.JSONDecodeError, IOError):
jobs.append(
{
"job_id": data.get("job_id", progress_file.stem),
"started_at": data.get("started_at"),
"command": data.get("command"),
"progress": data.get("progress", {}),
"last_updated": data.get("last_updated"),
}
)
except (OSError, json.JSONDecodeError):
continue
# Sort by last updated (newest first)
@@ -447,8 +425,8 @@ class ConfigManager:
print(f"\n📦 Resumable Jobs: {len(jobs)}")
for job in jobs[:5]: # Show max 5
print(f"{job['job_id']}")
if job.get('progress'):
phase = job['progress'].get('phase', 'unknown')
if job.get("progress"):
phase = job["progress"].get("phase", "unknown")
print(f" Phase: {phase}, Last: {job['last_updated']}")

View File

@@ -12,8 +12,8 @@ Also provides backward compatibility detection for legacy configs.
import json
import logging
from typing import Dict, Any, List, Optional, Union
from pathlib import Path
from typing import Any
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
@@ -25,18 +25,18 @@ class ConfigValidator:
"""
# Valid source types
VALID_SOURCE_TYPES = {'documentation', 'github', 'pdf'}
VALID_SOURCE_TYPES = {"documentation", "github", "pdf"}
# Valid merge modes
VALID_MERGE_MODES = {'rule-based', 'claude-enhanced'}
VALID_MERGE_MODES = {"rule-based", "claude-enhanced"}
# Valid code analysis depth levels
VALID_DEPTH_LEVELS = {'surface', 'deep', 'full'}
VALID_DEPTH_LEVELS = {"surface", "deep", "full"}
# Valid AI modes for C3.x enhancement
VALID_AI_MODES = {'auto', 'api', 'local', 'none'}
VALID_AI_MODES = {"auto", "api", "local", "none"}
def __init__(self, config_or_path: Union[Dict[str, Any], str]):
def __init__(self, config_or_path: dict[str, Any] | str):
"""
Initialize validator with config dict or file path.
@@ -51,10 +51,10 @@ class ConfigValidator:
self.config = self._load_config()
self.is_unified = self._detect_format()
def _load_config(self) -> Dict[str, Any]:
def _load_config(self) -> dict[str, Any]:
"""Load JSON config file."""
try:
with open(self.config_path, 'r', encoding='utf-8') as f:
with open(self.config_path, encoding="utf-8") as f:
return json.load(f)
except FileNotFoundError:
raise ValueError(f"Config file not found: {self.config_path}")
@@ -69,7 +69,7 @@ class ConfigValidator:
True if unified format (has 'sources' array)
False if legacy format
"""
return 'sources' in self.config and isinstance(self.config['sources'], list)
return "sources" in self.config and isinstance(self.config["sources"], list)
def validate(self) -> bool:
"""
@@ -91,17 +91,17 @@ class ConfigValidator:
logger.info("Validating unified config format...")
# Required top-level fields
if 'name' not in self.config:
if "name" not in self.config:
raise ValueError("Missing required field: 'name'")
if 'description' not in self.config:
if "description" not in self.config:
raise ValueError("Missing required field: 'description'")
if 'sources' not in self.config:
if "sources" not in self.config:
raise ValueError("Missing required field: 'sources'")
# Validate sources array
sources = self.config['sources']
sources = self.config["sources"]
if not isinstance(sources, list):
raise ValueError("'sources' must be an array")
@@ -110,7 +110,7 @@ class ConfigValidator:
raise ValueError("'sources' array cannot be empty")
# Validate merge_mode (optional)
merge_mode = self.config.get('merge_mode', 'rule-based')
merge_mode = self.config.get("merge_mode", "rule-based")
if merge_mode not in self.VALID_MERGE_MODES:
raise ValueError(f"Invalid merge_mode: '{merge_mode}'. Must be one of {self.VALID_MERGE_MODES}")
@@ -121,56 +121,52 @@ class ConfigValidator:
logger.info(f"✅ Unified config valid: {len(sources)} sources")
return True
def _validate_source(self, source: Dict[str, Any], index: int):
def _validate_source(self, source: dict[str, Any], index: int):
"""Validate individual source configuration."""
# Check source has 'type' field
if 'type' not in source:
if "type" not in source:
raise ValueError(f"Source {index}: Missing required field 'type'")
source_type = source['type']
source_type = source["type"]
if source_type not in self.VALID_SOURCE_TYPES:
raise ValueError(
f"Source {index}: Invalid type '{source_type}'. "
f"Must be one of {self.VALID_SOURCE_TYPES}"
)
raise ValueError(f"Source {index}: Invalid type '{source_type}'. Must be one of {self.VALID_SOURCE_TYPES}")
# Type-specific validation
if source_type == 'documentation':
if source_type == "documentation":
self._validate_documentation_source(source, index)
elif source_type == 'github':
elif source_type == "github":
self._validate_github_source(source, index)
elif source_type == 'pdf':
elif source_type == "pdf":
self._validate_pdf_source(source, index)
def _validate_documentation_source(self, source: Dict[str, Any], index: int):
def _validate_documentation_source(self, source: dict[str, Any], index: int):
"""Validate documentation source configuration."""
if 'base_url' not in source:
if "base_url" not in source:
raise ValueError(f"Source {index} (documentation): Missing required field 'base_url'")
# Optional but recommended fields
if 'selectors' not in source:
if "selectors" not in source:
logger.warning(f"Source {index} (documentation): No 'selectors' specified, using defaults")
if 'max_pages' in source and not isinstance(source['max_pages'], int):
if "max_pages" in source and not isinstance(source["max_pages"], int):
raise ValueError(f"Source {index} (documentation): 'max_pages' must be an integer")
def _validate_github_source(self, source: Dict[str, Any], index: int):
def _validate_github_source(self, source: dict[str, Any], index: int):
"""Validate GitHub source configuration."""
if 'repo' not in source:
if "repo" not in source:
raise ValueError(f"Source {index} (github): Missing required field 'repo'")
# Validate repo format (owner/repo)
repo = source['repo']
if '/' not in repo:
repo = source["repo"]
if "/" not in repo:
raise ValueError(
f"Source {index} (github): Invalid repo format '{repo}'. "
f"Must be 'owner/repo' (e.g., 'facebook/react')"
f"Source {index} (github): Invalid repo format '{repo}'. Must be 'owner/repo' (e.g., 'facebook/react')"
)
# Validate code_analysis_depth if specified
if 'code_analysis_depth' in source:
depth = source['code_analysis_depth']
if "code_analysis_depth" in source:
depth = source["code_analysis_depth"]
if depth not in self.VALID_DEPTH_LEVELS:
raise ValueError(
f"Source {index} (github): Invalid code_analysis_depth '{depth}'. "
@@ -178,29 +174,28 @@ class ConfigValidator:
)
# Validate max_issues if specified
if 'max_issues' in source and not isinstance(source['max_issues'], int):
if "max_issues" in source and not isinstance(source["max_issues"], int):
raise ValueError(f"Source {index} (github): 'max_issues' must be an integer")
# Validate enable_codebase_analysis if specified (C3.5)
if 'enable_codebase_analysis' in source and not isinstance(source['enable_codebase_analysis'], bool):
if "enable_codebase_analysis" in source and not isinstance(source["enable_codebase_analysis"], bool):
raise ValueError(f"Source {index} (github): 'enable_codebase_analysis' must be a boolean")
# Validate ai_mode if specified (C3.5)
if 'ai_mode' in source:
ai_mode = source['ai_mode']
if "ai_mode" in source:
ai_mode = source["ai_mode"]
if ai_mode not in self.VALID_AI_MODES:
raise ValueError(
f"Source {index} (github): Invalid ai_mode '{ai_mode}'. "
f"Must be one of {self.VALID_AI_MODES}"
f"Source {index} (github): Invalid ai_mode '{ai_mode}'. Must be one of {self.VALID_AI_MODES}"
)
def _validate_pdf_source(self, source: Dict[str, Any], index: int):
def _validate_pdf_source(self, source: dict[str, Any], index: int):
"""Validate PDF source configuration."""
if 'path' not in source:
if "path" not in source:
raise ValueError(f"Source {index} (pdf): Missing required field 'path'")
# Check if file exists
pdf_path = source['path']
pdf_path = source["path"]
if not Path(pdf_path).exists():
logger.warning(f"Source {index} (pdf): File not found: {pdf_path}")
@@ -213,18 +208,18 @@ class ConfigValidator:
logger.info("Detected legacy config format (backward compatible)")
# Detect which legacy type based on fields
if 'base_url' in self.config:
if "base_url" in self.config:
logger.info("Legacy type: documentation")
elif 'repo' in self.config:
elif "repo" in self.config:
logger.info("Legacy type: github")
elif 'pdf' in self.config or 'path' in self.config:
elif "pdf" in self.config or "path" in self.config:
logger.info("Legacy type: pdf")
else:
raise ValueError("Cannot detect legacy config type (missing base_url, repo, or pdf)")
return True
def convert_legacy_to_unified(self) -> Dict[str, Any]:
def convert_legacy_to_unified(self) -> dict[str, Any]:
"""
Convert legacy config to unified format.
@@ -238,64 +233,50 @@ class ConfigValidator:
logger.info("Converting legacy config to unified format...")
# Detect legacy type and convert
if 'base_url' in self.config:
if "base_url" in self.config:
return self._convert_legacy_documentation()
elif 'repo' in self.config:
elif "repo" in self.config:
return self._convert_legacy_github()
elif 'pdf' in self.config or 'path' in self.config:
elif "pdf" in self.config or "path" in self.config:
return self._convert_legacy_pdf()
else:
raise ValueError("Cannot convert: unknown legacy format")
def _convert_legacy_documentation(self) -> Dict[str, Any]:
def _convert_legacy_documentation(self) -> dict[str, Any]:
"""Convert legacy documentation config to unified."""
unified = {
'name': self.config.get('name', 'unnamed'),
'description': self.config.get('description', 'Documentation skill'),
'merge_mode': 'rule-based',
'sources': [
{
'type': 'documentation',
**{k: v for k, v in self.config.items()
if k not in ['name', 'description']}
}
]
"name": self.config.get("name", "unnamed"),
"description": self.config.get("description", "Documentation skill"),
"merge_mode": "rule-based",
"sources": [
{"type": "documentation", **{k: v for k, v in self.config.items() if k not in ["name", "description"]}}
],
}
return unified
def _convert_legacy_github(self) -> Dict[str, Any]:
def _convert_legacy_github(self) -> dict[str, Any]:
"""Convert legacy GitHub config to unified."""
unified = {
'name': self.config.get('name', 'unnamed'),
'description': self.config.get('description', 'GitHub repository skill'),
'merge_mode': 'rule-based',
'sources': [
{
'type': 'github',
**{k: v for k, v in self.config.items()
if k not in ['name', 'description']}
}
]
"name": self.config.get("name", "unnamed"),
"description": self.config.get("description", "GitHub repository skill"),
"merge_mode": "rule-based",
"sources": [
{"type": "github", **{k: v for k, v in self.config.items() if k not in ["name", "description"]}}
],
}
return unified
def _convert_legacy_pdf(self) -> Dict[str, Any]:
def _convert_legacy_pdf(self) -> dict[str, Any]:
"""Convert legacy PDF config to unified."""
unified = {
'name': self.config.get('name', 'unnamed'),
'description': self.config.get('description', 'PDF document skill'),
'merge_mode': 'rule-based',
'sources': [
{
'type': 'pdf',
**{k: v for k, v in self.config.items()
if k not in ['name', 'description']}
}
]
"name": self.config.get("name", "unnamed"),
"description": self.config.get("description", "PDF document skill"),
"merge_mode": "rule-based",
"sources": [{"type": "pdf", **{k: v for k, v in self.config.items() if k not in ["name", "description"]}}],
}
return unified
def get_sources_by_type(self, source_type: str) -> List[Dict[str, Any]]:
def get_sources_by_type(self, source_type: str) -> list[dict[str, Any]]:
"""
Get all sources of a specific type.
@@ -308,17 +289,17 @@ class ConfigValidator:
if not self.is_unified:
# For legacy, convert and get sources
unified = self.convert_legacy_to_unified()
sources = unified['sources']
sources = unified["sources"]
else:
sources = self.config['sources']
sources = self.config["sources"]
return [s for s in sources if s.get('type') == source_type]
return [s for s in sources if s.get("type") == source_type]
def has_multiple_sources(self) -> bool:
"""Check if config has multiple sources (requires merging)."""
if not self.is_unified:
return False
return len(self.config['sources']) > 1
return len(self.config["sources"]) > 1
def needs_api_merge(self) -> bool:
"""
@@ -331,13 +312,11 @@ class ConfigValidator:
return False
has_docs_api = any(
s.get('type') == 'documentation' and s.get('extract_api', True)
for s in self.config['sources']
s.get("type") == "documentation" and s.get("extract_api", True) for s in self.config["sources"]
)
has_github_code = any(
s.get('type') == 'github' and s.get('include_code', False)
for s in self.config['sources']
s.get("type") == "github" and s.get("include_code", False) for s in self.config["sources"]
)
return has_docs_api and has_github_code
@@ -361,7 +340,7 @@ def validate_config(config_path: str) -> ConfigValidator:
return validator
if __name__ == '__main__':
if __name__ == "__main__":
import sys
if len(sys.argv) < 2:
@@ -373,18 +352,18 @@ if __name__ == '__main__':
try:
validator = validate_config(config_file)
print(f"\n✅ Config valid!")
print("\n✅ Config valid!")
print(f" Format: {'Unified' if validator.is_unified else 'Legacy'}")
print(f" Name: {validator.config.get('name')}")
if validator.is_unified:
sources = validator.config['sources']
sources = validator.config["sources"]
print(f" Sources: {len(sources)}")
for i, source in enumerate(sources):
print(f" {i+1}. {source['type']}")
print(f" {i + 1}. {source['type']}")
if validator.needs_api_merge():
merge_mode = validator.config.get('merge_mode', 'rule-based')
merge_mode = validator.config.get("merge_mode", "rule-based")
print(f" ⚠️ API merge required (mode: {merge_mode})")
except ValueError as e:

View File

@@ -13,9 +13,9 @@ Used by unified scraper to identify discrepancies before merging.
import json
import logging
from typing import Dict, List, Any, Optional, Tuple
from dataclasses import dataclass, asdict
from dataclasses import asdict, dataclass
from difflib import SequenceMatcher
from typing import Any
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
@@ -24,13 +24,14 @@ logger = logging.getLogger(__name__)
@dataclass
class Conflict:
"""Represents a conflict between documentation and code."""
type: str # 'missing_in_docs', 'missing_in_code', 'signature_mismatch', 'description_mismatch'
severity: str # 'low', 'medium', 'high'
api_name: str
docs_info: Optional[Dict[str, Any]] = None
code_info: Optional[Dict[str, Any]] = None
difference: Optional[str] = None
suggestion: Optional[str] = None
docs_info: dict[str, Any] | None = None
code_info: dict[str, Any] | None = None
difference: str | None = None
suggestion: str | None = None
class ConflictDetector:
@@ -38,7 +39,7 @@ class ConflictDetector:
Detects conflicts between documentation and code sources.
"""
def __init__(self, docs_data: Dict[str, Any], github_data: Dict[str, Any]):
def __init__(self, docs_data: dict[str, Any], github_data: dict[str, Any]):
"""
Initialize conflict detector.
@@ -56,7 +57,7 @@ class ConflictDetector:
logger.info(f"Loaded {len(self.docs_apis)} APIs from documentation")
logger.info(f"Loaded {len(self.code_apis)} APIs from code")
def _extract_docs_apis(self) -> Dict[str, Dict[str, Any]]:
def _extract_docs_apis(self) -> dict[str, dict[str, Any]]:
"""
Extract API information from documentation data.
@@ -66,42 +67,43 @@ class ConflictDetector:
apis = {}
# Documentation structure varies, but typically has 'pages' or 'references'
pages = self.docs_data.get('pages', {})
pages = self.docs_data.get("pages", {})
# Handle both dict and list formats
if isinstance(pages, dict):
# Format: {url: page_data, ...}
for url, page_data in pages.items():
content = page_data.get('content', '')
title = page_data.get('title', '')
content = page_data.get("content", "")
title = page_data.get("title", "")
# Simple heuristic: if title or URL contains "api", "reference", "class", "function"
# it might be an API page
if any(keyword in title.lower() or keyword in url.lower()
for keyword in ['api', 'reference', 'class', 'function', 'method']):
if any(
keyword in title.lower() or keyword in url.lower()
for keyword in ["api", "reference", "class", "function", "method"]
):
# Extract API signatures from content (simplified)
extracted_apis = self._parse_doc_content_for_apis(content, url)
apis.update(extracted_apis)
elif isinstance(pages, list):
# Format: [{url: '...', apis: [...]}, ...]
for page in pages:
url = page.get('url', '')
page_apis = page.get('apis', [])
url = page.get("url", "")
page_apis = page.get("apis", [])
# If APIs are already extracted in the page data
for api in page_apis:
api_name = api.get('name', '')
api_name = api.get("name", "")
if api_name:
apis[api_name] = {
'parameters': api.get('parameters', []),
'return_type': api.get('return_type', 'Any'),
'source_url': url
"parameters": api.get("parameters", []),
"return_type": api.get("return_type", "Any"),
"source_url": url,
}
return apis
def _parse_doc_content_for_apis(self, content: str, source_url: str) -> Dict[str, Dict]:
def _parse_doc_content_for_apis(self, content: str, source_url: str) -> dict[str, dict]:
"""
Parse documentation content to extract API signatures.
@@ -121,13 +123,13 @@ class ConflictDetector:
# Pattern for common API signatures
patterns = [
# Python style: def name(params) -> return
r'def\s+(\w+)\s*\(([^)]*)\)(?:\s*->\s*(\w+))?',
r"def\s+(\w+)\s*\(([^)]*)\)(?:\s*->\s*(\w+))?",
# JavaScript style: function name(params)
r'function\s+(\w+)\s*\(([^)]*)\)',
r"function\s+(\w+)\s*\(([^)]*)\)",
# C++ style: return_type name(params)
r'(\w+)\s+(\w+)\s*\(([^)]*)\)',
r"(\w+)\s+(\w+)\s*\(([^)]*)\)",
# Method style: ClassName.method_name(params)
r'(\w+)\.(\w+)\s*\(([^)]*)\)'
r"(\w+)\.(\w+)\s*\(([^)]*)\)",
]
for pattern in patterns:
@@ -135,17 +137,17 @@ class ConflictDetector:
groups = match.groups()
# Parse based on pattern matched
if 'def' in pattern:
if "def" in pattern:
# Python function
name = groups[0]
params_str = groups[1]
return_type = groups[2] if len(groups) > 2 else None
elif 'function' in pattern:
elif "function" in pattern:
# JavaScript function
name = groups[0]
params_str = groups[1]
return_type = None
elif '.' in pattern:
elif "." in pattern:
# Class method
class_name = groups[0]
method_name = groups[1]
@@ -162,54 +164,54 @@ class ConflictDetector:
params = self._parse_param_string(params_str)
apis[name] = {
'name': name,
'parameters': params,
'return_type': return_type,
'source': source_url,
'raw_signature': match.group(0)
"name": name,
"parameters": params,
"return_type": return_type,
"source": source_url,
"raw_signature": match.group(0),
}
return apis
def _parse_param_string(self, params_str: str) -> List[Dict]:
def _parse_param_string(self, params_str: str) -> list[dict]:
"""Parse parameter string into list of parameter dicts."""
if not params_str.strip():
return []
params = []
for param in params_str.split(','):
for param in params_str.split(","):
param = param.strip()
if not param:
continue
# Try to extract name and type
param_info = {'name': param, 'type': None, 'default': None}
param_info = {"name": param, "type": None, "default": None}
# Check for type annotation (: type)
if ':' in param:
parts = param.split(':', 1)
param_info['name'] = parts[0].strip()
if ":" in param:
parts = param.split(":", 1)
param_info["name"] = parts[0].strip()
type_part = parts[1].strip()
# Check for default value (= value)
if '=' in type_part:
type_str, default_str = type_part.split('=', 1)
param_info['type'] = type_str.strip()
param_info['default'] = default_str.strip()
if "=" in type_part:
type_str, default_str = type_part.split("=", 1)
param_info["type"] = type_str.strip()
param_info["default"] = default_str.strip()
else:
param_info['type'] = type_part
param_info["type"] = type_part
# Check for default without type (= value)
elif '=' in param:
parts = param.split('=', 1)
param_info['name'] = parts[0].strip()
param_info['default'] = parts[1].strip()
elif "=" in param:
parts = param.split("=", 1)
param_info["name"] = parts[0].strip()
param_info["default"] = parts[1].strip()
params.append(param_info)
return params
def _extract_code_apis(self) -> Dict[str, Dict[str, Any]]:
def _extract_code_apis(self) -> dict[str, dict[str, Any]]:
"""
Extract API information from GitHub code analysis.
@@ -218,61 +220,61 @@ class ConflictDetector:
"""
apis = {}
code_analysis = self.github_data.get('code_analysis', {})
code_analysis = self.github_data.get("code_analysis", {})
if not code_analysis:
return apis
# Support both 'files' and 'analyzed_files' keys
files = code_analysis.get('files', code_analysis.get('analyzed_files', []))
files = code_analysis.get("files", code_analysis.get("analyzed_files", []))
for file_info in files:
file_path = file_info.get('file', 'unknown')
file_path = file_info.get("file", "unknown")
# Extract classes and their methods
for class_info in file_info.get('classes', []):
class_name = class_info['name']
for class_info in file_info.get("classes", []):
class_name = class_info["name"]
# Add class itself
apis[class_name] = {
'name': class_name,
'type': 'class',
'source': file_path,
'line': class_info.get('line_number'),
'base_classes': class_info.get('base_classes', []),
'docstring': class_info.get('docstring')
"name": class_name,
"type": "class",
"source": file_path,
"line": class_info.get("line_number"),
"base_classes": class_info.get("base_classes", []),
"docstring": class_info.get("docstring"),
}
# Add methods
for method in class_info.get('methods', []):
for method in class_info.get("methods", []):
method_name = f"{class_name}.{method['name']}"
apis[method_name] = {
'name': method_name,
'type': 'method',
'parameters': method.get('parameters', []),
'return_type': method.get('return_type'),
'source': file_path,
'line': method.get('line_number'),
'docstring': method.get('docstring'),
'is_async': method.get('is_async', False)
"name": method_name,
"type": "method",
"parameters": method.get("parameters", []),
"return_type": method.get("return_type"),
"source": file_path,
"line": method.get("line_number"),
"docstring": method.get("docstring"),
"is_async": method.get("is_async", False),
}
# Extract standalone functions
for func_info in file_info.get('functions', []):
func_name = func_info['name']
for func_info in file_info.get("functions", []):
func_name = func_info["name"]
apis[func_name] = {
'name': func_name,
'type': 'function',
'parameters': func_info.get('parameters', []),
'return_type': func_info.get('return_type'),
'source': file_path,
'line': func_info.get('line_number'),
'docstring': func_info.get('docstring'),
'is_async': func_info.get('is_async', False)
"name": func_name,
"type": "function",
"parameters": func_info.get("parameters", []),
"return_type": func_info.get("return_type"),
"source": file_path,
"line": func_info.get("line_number"),
"docstring": func_info.get("docstring"),
"is_async": func_info.get("is_async", False),
}
return apis
def detect_all_conflicts(self) -> List[Conflict]:
def detect_all_conflicts(self) -> list[Conflict]:
"""
Detect all types of conflicts.
@@ -296,7 +298,7 @@ class ConflictDetector:
return conflicts
def _find_missing_in_docs(self) -> List[Conflict]:
def _find_missing_in_docs(self) -> list[Conflict]:
"""Find APIs that exist in code but not in documentation."""
conflicts = []
@@ -304,40 +306,46 @@ class ConflictDetector:
# Simple name matching (can be enhanced with fuzzy matching)
if api_name not in self.docs_apis:
# Check if it's a private/internal API (often not documented)
is_private = api_name.startswith('_') or '__' in api_name
severity = 'low' if is_private else 'medium'
is_private = api_name.startswith("_") or "__" in api_name
severity = "low" if is_private else "medium"
conflicts.append(Conflict(
type='missing_in_docs',
severity=severity,
api_name=api_name,
code_info=code_info,
difference=f"API exists in code ({code_info['source']}) but not found in documentation",
suggestion="Add documentation for this API" if not is_private else "Consider if this internal API should be documented"
))
conflicts.append(
Conflict(
type="missing_in_docs",
severity=severity,
api_name=api_name,
code_info=code_info,
difference=f"API exists in code ({code_info['source']}) but not found in documentation",
suggestion="Add documentation for this API"
if not is_private
else "Consider if this internal API should be documented",
)
)
logger.info(f"Found {len(conflicts)} APIs missing in documentation")
return conflicts
def _find_missing_in_code(self) -> List[Conflict]:
def _find_missing_in_code(self) -> list[Conflict]:
"""Find APIs that are documented but don't exist in code."""
conflicts = []
for api_name, docs_info in self.docs_apis.items():
if api_name not in self.code_apis:
conflicts.append(Conflict(
type='missing_in_code',
severity='high', # This is serious - documented but doesn't exist
api_name=api_name,
docs_info=docs_info,
difference=f"API documented ({docs_info.get('source', 'unknown')}) but not found in code",
suggestion="Update documentation to remove this API, or add it to codebase"
))
conflicts.append(
Conflict(
type="missing_in_code",
severity="high", # This is serious - documented but doesn't exist
api_name=api_name,
docs_info=docs_info,
difference=f"API documented ({docs_info.get('source', 'unknown')}) but not found in code",
suggestion="Update documentation to remove this API, or add it to codebase",
)
)
logger.info(f"Found {len(conflicts)} APIs missing in code")
return conflicts
def _find_signature_mismatches(self) -> List[Conflict]:
def _find_signature_mismatches(self) -> list[Conflict]:
"""Find APIs where signature differs between docs and code."""
conflicts = []
@@ -352,41 +360,43 @@ class ConflictDetector:
mismatch = self._compare_signatures(docs_info, code_info)
if mismatch:
conflicts.append(Conflict(
type='signature_mismatch',
severity=mismatch['severity'],
api_name=api_name,
docs_info=docs_info,
code_info=code_info,
difference=mismatch['difference'],
suggestion=mismatch['suggestion']
))
conflicts.append(
Conflict(
type="signature_mismatch",
severity=mismatch["severity"],
api_name=api_name,
docs_info=docs_info,
code_info=code_info,
difference=mismatch["difference"],
suggestion=mismatch["suggestion"],
)
)
logger.info(f"Found {len(conflicts)} signature mismatches")
return conflicts
def _compare_signatures(self, docs_info: Dict, code_info: Dict) -> Optional[Dict]:
def _compare_signatures(self, docs_info: dict, code_info: dict) -> dict | None:
"""
Compare signatures between docs and code.
Returns:
Dict with mismatch details if conflict found, None otherwise
"""
docs_params = docs_info.get('parameters', [])
code_params = code_info.get('parameters', [])
docs_params = docs_info.get("parameters", [])
code_params = code_info.get("parameters", [])
# Compare parameter counts
if len(docs_params) != len(code_params):
return {
'severity': 'medium',
'difference': f"Parameter count mismatch: docs has {len(docs_params)}, code has {len(code_params)}",
'suggestion': f"Documentation shows {len(docs_params)} parameters, but code has {len(code_params)}"
"severity": "medium",
"difference": f"Parameter count mismatch: docs has {len(docs_params)}, code has {len(code_params)}",
"suggestion": f"Documentation shows {len(docs_params)} parameters, but code has {len(code_params)}",
}
# Compare parameter names and types
for i, (doc_param, code_param) in enumerate(zip(docs_params, code_params)):
doc_name = doc_param.get('name', '')
code_name = code_param.get('name', '')
doc_name = doc_param.get("name", "")
code_name = code_param.get("name", "")
# Parameter name mismatch
if doc_name != code_name:
@@ -394,36 +404,36 @@ class ConflictDetector:
similarity = SequenceMatcher(None, doc_name, code_name).ratio()
if similarity < 0.8: # Not similar enough
return {
'severity': 'medium',
'difference': f"Parameter {i+1} name mismatch: '{doc_name}' in docs vs '{code_name}' in code",
'suggestion': f"Update documentation to use parameter name '{code_name}'"
"severity": "medium",
"difference": f"Parameter {i + 1} name mismatch: '{doc_name}' in docs vs '{code_name}' in code",
"suggestion": f"Update documentation to use parameter name '{code_name}'",
}
# Type mismatch
doc_type = doc_param.get('type')
code_type = code_param.get('type_hint')
doc_type = doc_param.get("type")
code_type = code_param.get("type_hint")
if doc_type and code_type and doc_type != code_type:
return {
'severity': 'low',
'difference': f"Parameter '{doc_name}' type mismatch: '{doc_type}' in docs vs '{code_type}' in code",
'suggestion': f"Verify correct type for parameter '{doc_name}'"
"severity": "low",
"difference": f"Parameter '{doc_name}' type mismatch: '{doc_type}' in docs vs '{code_type}' in code",
"suggestion": f"Verify correct type for parameter '{doc_name}'",
}
# Compare return types if both have them
docs_return = docs_info.get('return_type')
code_return = code_info.get('return_type')
docs_return = docs_info.get("return_type")
code_return = code_info.get("return_type")
if docs_return and code_return and docs_return != code_return:
return {
'severity': 'low',
'difference': f"Return type mismatch: '{docs_return}' in docs vs '{code_return}' in code",
'suggestion': "Verify correct return type"
"severity": "low",
"difference": f"Return type mismatch: '{docs_return}' in docs vs '{code_return}' in code",
"suggestion": "Verify correct return type",
}
return None
def generate_summary(self, conflicts: List[Conflict]) -> Dict[str, Any]:
def generate_summary(self, conflicts: list[Conflict]) -> dict[str, Any]:
"""
Generate summary statistics for conflicts.
@@ -434,25 +444,25 @@ class ConflictDetector:
Summary dict with statistics
"""
summary = {
'total': len(conflicts),
'by_type': {},
'by_severity': {},
'apis_affected': len(set(c.api_name for c in conflicts))
"total": len(conflicts),
"by_type": {},
"by_severity": {},
"apis_affected": len(set(c.api_name for c in conflicts)),
}
# Count by type
for conflict_type in ['missing_in_docs', 'missing_in_code', 'signature_mismatch', 'description_mismatch']:
for conflict_type in ["missing_in_docs", "missing_in_code", "signature_mismatch", "description_mismatch"]:
count = sum(1 for c in conflicts if c.type == conflict_type)
summary['by_type'][conflict_type] = count
summary["by_type"][conflict_type] = count
# Count by severity
for severity in ['low', 'medium', 'high']:
for severity in ["low", "medium", "high"]:
count = sum(1 for c in conflicts if c.severity == severity)
summary['by_severity'][severity] = count
summary["by_severity"][severity] = count
return summary
def save_conflicts(self, conflicts: List[Conflict], output_path: str):
def save_conflicts(self, conflicts: list[Conflict], output_path: str):
"""
Save conflicts to JSON file.
@@ -460,18 +470,15 @@ class ConflictDetector:
conflicts: List of Conflict objects
output_path: Path to output JSON file
"""
data = {
'conflicts': [asdict(c) for c in conflicts],
'summary': self.generate_summary(conflicts)
}
data = {"conflicts": [asdict(c) for c in conflicts], "summary": self.generate_summary(conflicts)}
with open(output_path, 'w', encoding='utf-8') as f:
with open(output_path, "w", encoding="utf-8") as f:
json.dump(data, f, indent=2, ensure_ascii=False)
logger.info(f"Conflicts saved to: {output_path}")
if __name__ == '__main__':
if __name__ == "__main__":
import sys
if len(sys.argv) < 3:
@@ -482,10 +489,10 @@ if __name__ == '__main__':
github_file = sys.argv[2]
# Load data
with open(docs_file, 'r') as f:
with open(docs_file) as f:
docs_data = json.load(f)
with open(github_file, 'r') as f:
with open(github_file) as f:
github_data = json.load(f)
# Detect conflicts
@@ -498,16 +505,16 @@ if __name__ == '__main__':
print(f" Total conflicts: {summary['total']}")
print(f" APIs affected: {summary['apis_affected']}")
print("\n By Type:")
for conflict_type, count in summary['by_type'].items():
for conflict_type, count in summary["by_type"].items():
if count > 0:
print(f" {conflict_type}: {count}")
print("\n By Severity:")
for severity, count in summary['by_severity'].items():
for severity, count in summary["by_severity"].items():
if count > 0:
emoji = '🔴' if severity == 'high' else '🟡' if severity == 'medium' else '🟢'
emoji = "🔴" if severity == "high" else "🟡" if severity == "medium" else "🟢"
print(f" {emoji} {severity}: {count}")
# Save to file
output_file = 'conflicts.json'
output_file = "conflicts.json"
detector.save_conflicts(conflicts, output_file)
print(f"\n✅ Full report saved to: {output_file}")

View File

@@ -8,7 +8,7 @@ across the CLI tools to improve maintainability and clarity.
# Default scraping limits
DEFAULT_RATE_LIMIT = 0.5 # seconds between requests
DEFAULT_MAX_PAGES = 500 # maximum pages to scrape
DEFAULT_MAX_PAGES = 500 # maximum pages to scrape
DEFAULT_CHECKPOINT_INTERVAL = 1000 # pages between checkpoints
DEFAULT_ASYNC_MODE = False # use async mode for parallel scraping (opt-in)
@@ -26,7 +26,7 @@ CONTENT_MATCH_POINTS = 1 # points for content keyword match
# API-based enhancement limits (uses Anthropic API)
API_CONTENT_LIMIT = 100000 # max characters for API enhancement
API_PREVIEW_LIMIT = 40000 # max characters for preview
API_PREVIEW_LIMIT = 40000 # max characters for preview
# Local enhancement limits (uses Claude Code Max)
LOCAL_CONTENT_LIMIT = 50000 # max characters for local enhancement
@@ -36,7 +36,7 @@ LOCAL_PREVIEW_LIMIT = 20000 # max characters for preview
# Estimation and discovery settings
DEFAULT_MAX_DISCOVERY = 1000 # default max pages to discover
DISCOVERY_THRESHOLD = 10000 # threshold for warnings
DISCOVERY_THRESHOLD = 10000 # threshold for warnings
# ===== FILE LIMITS =====
@@ -48,25 +48,25 @@ MAX_CODE_BLOCKS_PER_PAGE = 5 # maximum code blocks to extract per page
__all__ = [
# Scraping
'DEFAULT_RATE_LIMIT',
'DEFAULT_MAX_PAGES',
'DEFAULT_CHECKPOINT_INTERVAL',
'DEFAULT_ASYNC_MODE',
'CONTENT_PREVIEW_LENGTH',
'MAX_PAGES_WARNING_THRESHOLD',
'MIN_CATEGORIZATION_SCORE',
'URL_MATCH_POINTS',
'TITLE_MATCH_POINTS',
'CONTENT_MATCH_POINTS',
"DEFAULT_RATE_LIMIT",
"DEFAULT_MAX_PAGES",
"DEFAULT_CHECKPOINT_INTERVAL",
"DEFAULT_ASYNC_MODE",
"CONTENT_PREVIEW_LENGTH",
"MAX_PAGES_WARNING_THRESHOLD",
"MIN_CATEGORIZATION_SCORE",
"URL_MATCH_POINTS",
"TITLE_MATCH_POINTS",
"CONTENT_MATCH_POINTS",
# Enhancement
'API_CONTENT_LIMIT',
'API_PREVIEW_LIMIT',
'LOCAL_CONTENT_LIMIT',
'LOCAL_PREVIEW_LIMIT',
"API_CONTENT_LIMIT",
"API_PREVIEW_LIMIT",
"LOCAL_CONTENT_LIMIT",
"LOCAL_PREVIEW_LIMIT",
# Estimation
'DEFAULT_MAX_DISCOVERY',
'DISCOVERY_THRESHOLD',
"DEFAULT_MAX_DISCOVERY",
"DISCOVERY_THRESHOLD",
# Limits
'MAX_REFERENCE_FILES',
'MAX_CODE_BLOCKS_PER_PAGE',
"MAX_REFERENCE_FILES",
"MAX_CODE_BLOCKS_PER_PAGE",
]

View File

@@ -37,15 +37,16 @@ Credits:
- NetworkX for graph algorithms: https://networkx.org/
"""
import re
import ast
import logging
from pathlib import Path
from typing import Dict, List, Set, Tuple, Optional, Any
import re
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any
try:
import networkx as nx
NETWORKX_AVAILABLE = True
except ImportError:
NETWORKX_AVAILABLE = False
@@ -56,6 +57,7 @@ logger = logging.getLogger(__name__)
@dataclass
class DependencyInfo:
"""Information about a single dependency relationship."""
source_file: str
imported_module: str
import_type: str # 'import', 'from', 'require', 'include'
@@ -66,10 +68,11 @@ class DependencyInfo:
@dataclass
class FileNode:
"""Represents a file node in the dependency graph."""
file_path: str
language: str
dependencies: List[str] = field(default_factory=list)
imported_by: List[str] = field(default_factory=list)
dependencies: list[str] = field(default_factory=list)
imported_by: list[str] = field(default_factory=list)
class DependencyAnalyzer:
@@ -83,16 +86,13 @@ class DependencyAnalyzer:
def __init__(self):
"""Initialize dependency analyzer."""
if not NETWORKX_AVAILABLE:
raise ImportError(
"NetworkX is required for dependency analysis. "
"Install with: pip install networkx"
)
raise ImportError("NetworkX is required for dependency analysis. Install with: pip install networkx")
self.graph = nx.DiGraph() # Directed graph for dependencies
self.file_dependencies: Dict[str, List[DependencyInfo]] = {}
self.file_nodes: Dict[str, FileNode] = {}
self.file_dependencies: dict[str, list[DependencyInfo]] = {}
self.file_nodes: dict[str, FileNode] = {}
def analyze_file(self, file_path: str, content: str, language: str) -> List[DependencyInfo]:
def analyze_file(self, file_path: str, content: str, language: str) -> list[DependencyInfo]:
"""
Extract dependencies from a source file.
@@ -104,23 +104,23 @@ class DependencyAnalyzer:
Returns:
List of DependencyInfo objects
"""
if language == 'Python':
if language == "Python":
deps = self._extract_python_imports(content, file_path)
elif language in ('JavaScript', 'TypeScript'):
elif language in ("JavaScript", "TypeScript"):
deps = self._extract_js_imports(content, file_path)
elif language in ('C++', 'C'):
elif language in ("C++", "C"):
deps = self._extract_cpp_includes(content, file_path)
elif language == 'C#':
elif language == "C#":
deps = self._extract_csharp_imports(content, file_path)
elif language == 'Go':
elif language == "Go":
deps = self._extract_go_imports(content, file_path)
elif language == 'Rust':
elif language == "Rust":
deps = self._extract_rust_imports(content, file_path)
elif language == 'Java':
elif language == "Java":
deps = self._extract_java_imports(content, file_path)
elif language == 'Ruby':
elif language == "Ruby":
deps = self._extract_ruby_imports(content, file_path)
elif language == 'PHP':
elif language == "PHP":
deps = self._extract_php_imports(content, file_path)
else:
logger.warning(f"Unsupported language: {language}")
@@ -130,15 +130,11 @@ class DependencyAnalyzer:
# Create file node
imported_modules = [dep.imported_module for dep in deps]
self.file_nodes[file_path] = FileNode(
file_path=file_path,
language=language,
dependencies=imported_modules
)
self.file_nodes[file_path] = FileNode(file_path=file_path, language=language, dependencies=imported_modules)
return deps
def _extract_python_imports(self, content: str, file_path: str) -> List[DependencyInfo]:
def _extract_python_imports(self, content: str, file_path: str) -> list[DependencyInfo]:
"""
Extract Python import statements using AST.
@@ -159,33 +155,37 @@ class DependencyAnalyzer:
for node in ast.walk(tree):
if isinstance(node, ast.Import):
for alias in node.names:
deps.append(DependencyInfo(
source_file=file_path,
imported_module=alias.name,
import_type='import',
is_relative=False,
line_number=node.lineno
))
deps.append(
DependencyInfo(
source_file=file_path,
imported_module=alias.name,
import_type="import",
is_relative=False,
line_number=node.lineno,
)
)
elif isinstance(node, ast.ImportFrom):
module = node.module or ''
module = node.module or ""
is_relative = node.level > 0
# Handle relative imports
if is_relative:
module = '.' * node.level + module
module = "." * node.level + module
deps.append(DependencyInfo(
source_file=file_path,
imported_module=module,
import_type='from',
is_relative=is_relative,
line_number=node.lineno
))
deps.append(
DependencyInfo(
source_file=file_path,
imported_module=module,
import_type="from",
is_relative=is_relative,
line_number=node.lineno,
)
)
return deps
def _extract_js_imports(self, content: str, file_path: str) -> List[DependencyInfo]:
def _extract_js_imports(self, content: str, file_path: str) -> list[DependencyInfo]:
"""
Extract JavaScript/TypeScript import statements.
@@ -202,35 +202,39 @@ class DependencyAnalyzer:
import_pattern = r"import\s+(?:[\w\s{},*]+\s+from\s+)?['\"]([^'\"]+)['\"]"
for match in re.finditer(import_pattern, content):
module = match.group(1)
line_num = content[:match.start()].count('\n') + 1
is_relative = module.startswith('.') or module.startswith('/')
line_num = content[: match.start()].count("\n") + 1
is_relative = module.startswith(".") or module.startswith("/")
deps.append(DependencyInfo(
source_file=file_path,
imported_module=module,
import_type='import',
is_relative=is_relative,
line_number=line_num
))
deps.append(
DependencyInfo(
source_file=file_path,
imported_module=module,
import_type="import",
is_relative=is_relative,
line_number=line_num,
)
)
# CommonJS requires: require('module')
require_pattern = r"require\s*\(['\"]([^'\"]+)['\"]\)"
for match in re.finditer(require_pattern, content):
module = match.group(1)
line_num = content[:match.start()].count('\n') + 1
is_relative = module.startswith('.') or module.startswith('/')
line_num = content[: match.start()].count("\n") + 1
is_relative = module.startswith(".") or module.startswith("/")
deps.append(DependencyInfo(
source_file=file_path,
imported_module=module,
import_type='require',
is_relative=is_relative,
line_number=line_num
))
deps.append(
DependencyInfo(
source_file=file_path,
imported_module=module,
import_type="require",
is_relative=is_relative,
line_number=line_num,
)
)
return deps
def _extract_cpp_includes(self, content: str, file_path: str) -> List[DependencyInfo]:
def _extract_cpp_includes(self, content: str, file_path: str) -> list[DependencyInfo]:
"""
Extract C++ #include directives.
@@ -244,22 +248,24 @@ class DependencyAnalyzer:
include_pattern = r'#include\s+[<"]([^>"]+)[>"]'
for match in re.finditer(include_pattern, content):
header = match.group(1)
line_num = content[:match.start()].count('\n') + 1
line_num = content[: match.start()].count("\n") + 1
# Headers with "" are usually local, <> are system headers
is_relative = '"' in match.group(0)
deps.append(DependencyInfo(
source_file=file_path,
imported_module=header,
import_type='include',
is_relative=is_relative,
line_number=line_num
))
deps.append(
DependencyInfo(
source_file=file_path,
imported_module=header,
import_type="include",
is_relative=is_relative,
line_number=line_num,
)
)
return deps
def _extract_csharp_imports(self, content: str, file_path: str) -> List[DependencyInfo]:
def _extract_csharp_imports(self, content: str, file_path: str) -> list[DependencyInfo]:
"""
Extract C# using statements.
@@ -275,27 +281,29 @@ class DependencyAnalyzer:
deps = []
# Match using statements: using [static] Namespace[.Type];
using_pattern = r'using\s+(?:static\s+)?(?:(\w+)\s*=\s*)?([A-Za-z_][\w.]*)\s*;'
using_pattern = r"using\s+(?:static\s+)?(?:(\w+)\s*=\s*)?([A-Za-z_][\w.]*)\s*;"
for match in re.finditer(using_pattern, content):
alias = match.group(1) # Optional alias
namespace = match.group(2)
line_num = content[:match.start()].count('\n') + 1
line_num = content[: match.start()].count("\n") + 1
# Skip 'using' statements for IDisposable (using var x = ...)
if '=' in match.group(0) and not alias:
if "=" in match.group(0) and not alias:
continue
deps.append(DependencyInfo(
source_file=file_path,
imported_module=namespace,
import_type='using',
is_relative=False, # C# uses absolute namespaces
line_number=line_num
))
deps.append(
DependencyInfo(
source_file=file_path,
imported_module=namespace,
import_type="using",
is_relative=False, # C# uses absolute namespaces
line_number=line_num,
)
)
return deps
def _extract_go_imports(self, content: str, file_path: str) -> List[DependencyInfo]:
def _extract_go_imports(self, content: str, file_path: str) -> list[DependencyInfo]:
"""
Extract Go import statements.
@@ -314,21 +322,23 @@ class DependencyAnalyzer:
for match in re.finditer(single_import_pattern, content):
alias = match.group(1) # Optional alias
package = match.group(2)
line_num = content[:match.start()].count('\n') + 1
line_num = content[: match.start()].count("\n") + 1
# Check if relative (starts with ./ or ../)
is_relative = package.startswith('./')
is_relative = package.startswith("./")
deps.append(DependencyInfo(
source_file=file_path,
imported_module=package,
import_type='import',
is_relative=is_relative,
line_number=line_num
))
deps.append(
DependencyInfo(
source_file=file_path,
imported_module=package,
import_type="import",
is_relative=is_relative,
line_number=line_num,
)
)
# Multi-import block: import ( ... )
multi_import_pattern = r'import\s*\((.*?)\)'
multi_import_pattern = r"import\s*\((.*?)\)"
for match in re.finditer(multi_import_pattern, content, re.DOTALL):
block = match.group(1)
block_start = match.start()
@@ -338,21 +348,23 @@ class DependencyAnalyzer:
for line_match in re.finditer(import_line_pattern, block):
alias = line_match.group(1)
package = line_match.group(2)
line_num = content[:block_start + line_match.start()].count('\n') + 1
line_num = content[: block_start + line_match.start()].count("\n") + 1
is_relative = package.startswith('./')
is_relative = package.startswith("./")
deps.append(DependencyInfo(
source_file=file_path,
imported_module=package,
import_type='import',
is_relative=is_relative,
line_number=line_num
))
deps.append(
DependencyInfo(
source_file=file_path,
imported_module=package,
import_type="import",
is_relative=is_relative,
line_number=line_num,
)
)
return deps
def _extract_rust_imports(self, content: str, file_path: str) -> List[DependencyInfo]:
def _extract_rust_imports(self, content: str, file_path: str) -> list[DependencyInfo]:
"""
Extract Rust use statements.
@@ -369,43 +381,47 @@ class DependencyAnalyzer:
# Match use statements: use path::to::item; (including curly braces with spaces)
# This pattern matches: use word::word; or use word::{item, item};
use_pattern = r'use\s+([\w:{}]+(?:\s*,\s*[\w:{}]+)*|[\w:]+::\{[^}]+\})\s*;'
use_pattern = r"use\s+([\w:{}]+(?:\s*,\s*[\w:{}]+)*|[\w:]+::\{[^}]+\})\s*;"
for match in re.finditer(use_pattern, content):
module_path = match.group(1)
line_num = content[:match.start()].count('\n') + 1
line_num = content[: match.start()].count("\n") + 1
# Determine if relative
is_relative = module_path.startswith(('self::', 'super::'))
is_relative = module_path.startswith(("self::", "super::"))
# Handle curly brace imports (use std::{io, fs})
if '{' in module_path:
if "{" in module_path:
# Extract base path
base_path = module_path.split('{')[0].rstrip(':')
base_path = module_path.split("{")[0].rstrip(":")
# Extract items inside braces
items_match = re.search(r'\{([^}]+)\}', module_path)
items_match = re.search(r"\{([^}]+)\}", module_path)
if items_match:
items = [item.strip() for item in items_match.group(1).split(',')]
items = [item.strip() for item in items_match.group(1).split(",")]
for item in items:
full_path = f"{base_path}::{item}" if base_path else item
deps.append(DependencyInfo(
source_file=file_path,
imported_module=full_path,
import_type='use',
is_relative=is_relative,
line_number=line_num
))
deps.append(
DependencyInfo(
source_file=file_path,
imported_module=full_path,
import_type="use",
is_relative=is_relative,
line_number=line_num,
)
)
else:
deps.append(DependencyInfo(
source_file=file_path,
imported_module=module_path,
import_type='use',
is_relative=is_relative,
line_number=line_num
))
deps.append(
DependencyInfo(
source_file=file_path,
imported_module=module_path,
import_type="use",
is_relative=is_relative,
line_number=line_num,
)
)
return deps
def _extract_java_imports(self, content: str, file_path: str) -> List[DependencyInfo]:
def _extract_java_imports(self, content: str, file_path: str) -> list[DependencyInfo]:
"""
Extract Java import statements.
@@ -420,22 +436,24 @@ class DependencyAnalyzer:
deps = []
# Match import statements: import [static] package.Class;
import_pattern = r'import\s+(?:static\s+)?([A-Za-z_][\w.]*(?:\.\*)?)\s*;'
import_pattern = r"import\s+(?:static\s+)?([A-Za-z_][\w.]*(?:\.\*)?)\s*;"
for match in re.finditer(import_pattern, content):
import_path = match.group(1)
line_num = content[:match.start()].count('\n') + 1
line_num = content[: match.start()].count("\n") + 1
deps.append(DependencyInfo(
source_file=file_path,
imported_module=import_path,
import_type='import',
is_relative=False, # Java uses absolute package names
line_number=line_num
))
deps.append(
DependencyInfo(
source_file=file_path,
imported_module=import_path,
import_type="import",
is_relative=False, # Java uses absolute package names
line_number=line_num,
)
)
return deps
def _extract_ruby_imports(self, content: str, file_path: str) -> List[DependencyInfo]:
def _extract_ruby_imports(self, content: str, file_path: str) -> list[DependencyInfo]:
"""
Extract Ruby require/require_relative/load statements.
@@ -453,47 +471,53 @@ class DependencyAnalyzer:
require_pattern = r"require\s+['\"]([^'\"]+)['\"]"
for match in re.finditer(require_pattern, content):
module = match.group(1)
line_num = content[:match.start()].count('\n') + 1
line_num = content[: match.start()].count("\n") + 1
deps.append(DependencyInfo(
source_file=file_path,
imported_module=module,
import_type='require',
is_relative=False, # require looks in load path
line_number=line_num
))
deps.append(
DependencyInfo(
source_file=file_path,
imported_module=module,
import_type="require",
is_relative=False, # require looks in load path
line_number=line_num,
)
)
# Match require_relative: require_relative 'file'
require_relative_pattern = r"require_relative\s+['\"]([^'\"]+)['\"]"
for match in re.finditer(require_relative_pattern, content):
module = match.group(1)
line_num = content[:match.start()].count('\n') + 1
line_num = content[: match.start()].count("\n") + 1
deps.append(DependencyInfo(
source_file=file_path,
imported_module=module,
import_type='require_relative',
is_relative=True,
line_number=line_num
))
deps.append(
DependencyInfo(
source_file=file_path,
imported_module=module,
import_type="require_relative",
is_relative=True,
line_number=line_num,
)
)
# Match load: load 'script.rb'
load_pattern = r"load\s+['\"]([^'\"]+)['\"]"
for match in re.finditer(load_pattern, content):
module = match.group(1)
line_num = content[:match.start()].count('\n') + 1
line_num = content[: match.start()].count("\n") + 1
deps.append(DependencyInfo(
source_file=file_path,
imported_module=module,
import_type='load',
is_relative=True, # load is usually relative
line_number=line_num
))
deps.append(
DependencyInfo(
source_file=file_path,
imported_module=module,
import_type="load",
is_relative=True, # load is usually relative
line_number=line_num,
)
)
return deps
def _extract_php_imports(self, content: str, file_path: str) -> List[DependencyInfo]:
def _extract_php_imports(self, content: str, file_path: str) -> list[DependencyInfo]:
"""
Extract PHP require/include/use statements.
@@ -513,35 +537,39 @@ class DependencyAnalyzer:
require_pattern = r"(?:require|include)(?:_once)?\s+['\"]([^'\"]+)['\"]"
for match in re.finditer(require_pattern, content):
module = match.group(1)
line_num = content[:match.start()].count('\n') + 1
line_num = content[: match.start()].count("\n") + 1
# Determine import type
import_type = 'require' if 'require' in match.group(0) else 'include'
import_type = "require" if "require" in match.group(0) else "include"
# PHP file paths are relative by default
is_relative = not module.startswith(('/', 'http://', 'https://'))
is_relative = not module.startswith(("/", "http://", "https://"))
deps.append(DependencyInfo(
source_file=file_path,
imported_module=module,
import_type=import_type,
is_relative=is_relative,
line_number=line_num
))
deps.append(
DependencyInfo(
source_file=file_path,
imported_module=module,
import_type=import_type,
is_relative=is_relative,
line_number=line_num,
)
)
# Match namespace use: use Namespace\Class;
use_pattern = r'use\s+([A-Za-z_][\w\\]*)\s*(?:as\s+\w+)?\s*;'
use_pattern = r"use\s+([A-Za-z_][\w\\]*)\s*(?:as\s+\w+)?\s*;"
for match in re.finditer(use_pattern, content):
namespace = match.group(1)
line_num = content[:match.start()].count('\n') + 1
line_num = content[: match.start()].count("\n") + 1
deps.append(DependencyInfo(
source_file=file_path,
imported_module=namespace,
import_type='use',
is_relative=False, # Namespaces are absolute
line_number=line_num
))
deps.append(
DependencyInfo(
source_file=file_path,
imported_module=namespace,
import_type="use",
is_relative=False, # Namespaces are absolute
line_number=line_num,
)
)
return deps
@@ -566,12 +594,7 @@ class DependencyAnalyzer:
if target and target in self.file_nodes:
# Add edge from source to dependency
self.graph.add_edge(
file_path,
target,
import_type=dep.import_type,
line_number=dep.line_number
)
self.graph.add_edge(file_path, target, import_type=dep.import_type, line_number=dep.line_number)
# Update imported_by lists
if target in self.file_nodes:
@@ -579,7 +602,7 @@ class DependencyAnalyzer:
return self.graph
def _resolve_import(self, source_file: str, imported_module: str, is_relative: bool) -> Optional[str]:
def _resolve_import(self, source_file: str, imported_module: str, is_relative: bool) -> str | None:
"""
Resolve import statement to actual file path.
@@ -609,7 +632,7 @@ class DependencyAnalyzer:
return None
def detect_cycles(self) -> List[List[str]]:
def detect_cycles(self) -> list[list[str]]:
"""
Detect circular dependencies in the graph.
@@ -627,7 +650,7 @@ class DependencyAnalyzer:
logger.error(f"Error detecting cycles: {e}")
return []
def get_strongly_connected_components(self) -> List[Set[str]]:
def get_strongly_connected_components(self) -> list[set[str]]:
"""
Get strongly connected components (groups of mutually dependent files).
@@ -645,13 +668,14 @@ class DependencyAnalyzer:
"""
try:
from networkx.drawing.nx_pydot import write_dot
write_dot(self.graph, output_path)
logger.info(f"Exported graph to DOT format: {output_path}")
except ImportError:
logger.warning("pydot not installed - cannot export to DOT format")
logger.warning("Install with: pip install pydot")
def export_json(self) -> Dict[str, Any]:
def export_json(self) -> dict[str, Any]:
"""
Export graph as JSON structure.
@@ -659,22 +683,19 @@ class DependencyAnalyzer:
Dictionary with nodes and edges
"""
return {
'nodes': [
{
'file': node,
'language': data.get('language', 'Unknown')
}
"nodes": [
{"file": node, "language": data.get("language", "Unknown")}
for node, data in self.graph.nodes(data=True)
],
'edges': [
"edges": [
{
'source': source,
'target': target,
'import_type': data.get('import_type', 'unknown'),
'line_number': data.get('line_number', 0)
"source": source,
"target": target,
"import_type": data.get("import_type", "unknown"),
"line_number": data.get("line_number", 0),
}
for source, target, data in self.graph.edges(data=True)
]
],
}
def export_mermaid(self) -> str:
@@ -684,7 +705,7 @@ class DependencyAnalyzer:
Returns:
Mermaid diagram as string
"""
lines = ['graph TD']
lines = ["graph TD"]
# Create node labels (shorten file paths for readability)
node_ids = {}
@@ -700,9 +721,9 @@ class DependencyAnalyzer:
target_id = node_ids[target]
lines.append(f" {source_id} --> {target_id}")
return '\n'.join(lines)
return "\n".join(lines)
def get_statistics(self) -> Dict[str, Any]:
def get_statistics(self) -> dict[str, Any]:
"""
Get graph statistics.
@@ -710,20 +731,15 @@ class DependencyAnalyzer:
Dictionary with various statistics
"""
return {
'total_files': self.graph.number_of_nodes(),
'total_dependencies': self.graph.number_of_edges(),
'circular_dependencies': len(self.detect_cycles()),
'strongly_connected_components': len(self.get_strongly_connected_components()),
'avg_dependencies_per_file': (
self.graph.number_of_edges() / self.graph.number_of_nodes()
if self.graph.number_of_nodes() > 0 else 0
"total_files": self.graph.number_of_nodes(),
"total_dependencies": self.graph.number_of_edges(),
"circular_dependencies": len(self.detect_cycles()),
"strongly_connected_components": len(self.get_strongly_connected_components()),
"avg_dependencies_per_file": (
self.graph.number_of_edges() / self.graph.number_of_nodes() if self.graph.number_of_nodes() > 0 else 0
),
'files_with_no_dependencies': len([
node for node in self.graph.nodes()
if self.graph.out_degree(node) == 0
]),
'files_not_imported': len([
node for node in self.graph.nodes()
if self.graph.in_degree(node) == 0
]),
"files_with_no_dependencies": len(
[node for node in self.graph.nodes() if self.graph.out_degree(node) == 0]
),
"files_not_imported": len([node for node in self.graph.nodes() if self.graph.in_degree(node) == 0]),
}

File diff suppressed because it is too large Load Diff

View File

@@ -15,10 +15,9 @@ Usage:
skill-seekers enhance output/react/ --target openai --api-key sk-proj-...
"""
import argparse
import os
import sys
import json
import argparse
from pathlib import Path
# Add parent directory to path for imports when run as script
@@ -42,9 +41,7 @@ class SkillEnhancer:
self.skill_md_path = self.skill_dir / "SKILL.md"
# Get API key - support both ANTHROPIC_API_KEY and ANTHROPIC_AUTH_TOKEN
self.api_key = (api_key or
os.environ.get('ANTHROPIC_API_KEY') or
os.environ.get('ANTHROPIC_AUTH_TOKEN'))
self.api_key = api_key or os.environ.get("ANTHROPIC_API_KEY") or os.environ.get("ANTHROPIC_AUTH_TOKEN")
if not self.api_key:
raise ValueError(
"No API key provided. Set ANTHROPIC_API_KEY or ANTHROPIC_AUTH_TOKEN "
@@ -52,10 +49,10 @@ class SkillEnhancer:
)
# Support custom base URL for alternative API endpoints
base_url = os.environ.get('ANTHROPIC_BASE_URL')
client_kwargs = {'api_key': self.api_key}
base_url = os.environ.get("ANTHROPIC_BASE_URL")
client_kwargs = {"api_key": self.api_key}
if base_url:
client_kwargs['base_url'] = base_url
client_kwargs["base_url"] = base_url
print(f" Using custom API base URL: {base_url}")
self.client = anthropic.Anthropic(**client_kwargs)
@@ -64,7 +61,7 @@ class SkillEnhancer:
"""Read existing SKILL.md"""
if not self.skill_md_path.exists():
return None
return self.skill_md_path.read_text(encoding='utf-8')
return self.skill_md_path.read_text(encoding="utf-8")
def enhance_skill_md(self, references, current_skill_md):
"""Use Claude to enhance SKILL.md"""
@@ -80,17 +77,14 @@ class SkillEnhancer:
model="claude-sonnet-4-20250514",
max_tokens=4096,
temperature=0.3,
messages=[{
"role": "user",
"content": prompt
}]
messages=[{"role": "user", "content": prompt}],
)
# Handle response content - newer SDK versions may include ThinkingBlock
# Find the TextBlock containing the actual response
enhanced_content = None
for block in message.content:
if hasattr(block, 'text'):
if hasattr(block, "text"):
enhanced_content = block.text
break
@@ -113,10 +107,10 @@ class SkillEnhancer:
# Analyze sources
sources_found = set()
for metadata in references.values():
sources_found.add(metadata['source'])
sources_found.add(metadata["source"])
# Analyze conflicts if present
has_conflicts = any('conflicts' in meta['path'] for meta in references.values())
has_conflicts = any("conflicts" in meta["path"] for meta in references.values())
prompt = f"""You are enhancing a Claude skill's SKILL.md file. This skill is about: {skill_name}
@@ -124,14 +118,14 @@ I've scraped documentation from multiple sources and organized it into reference
SKILL OVERVIEW:
- Name: {skill_name}
- Source Types: {', '.join(sorted(sources_found))}
- Multi-Source: {'Yes' if len(sources_found) > 1 else 'No'}
- Conflicts Detected: {'Yes - see conflicts.md in references' if has_conflicts else 'No'}
- Source Types: {", ".join(sorted(sources_found))}
- Multi-Source: {"Yes" if len(sources_found) > 1 else "No"}
- Conflicts Detected: {"Yes - see conflicts.md in references" if has_conflicts else "No"}
CURRENT SKILL.MD:
{'```markdown' if current_skill_md else '(none - create from scratch)'}
{current_skill_md or 'No existing SKILL.md'}
{'```' if current_skill_md else ''}
{"```markdown" if current_skill_md else "(none - create from scratch)"}
{current_skill_md or "No existing SKILL.md"}
{"```" if current_skill_md else ""}
SOURCE ANALYSIS:
This skill combines knowledge from {len(sources_found)} source type(s):
@@ -141,8 +135,8 @@ This skill combines knowledge from {len(sources_found)} source type(s):
# Group references by (source_type, repo_id) for multi-source support
by_source = {}
for filename, metadata in references.items():
source = metadata['source']
repo_id = metadata.get('repo_id') # None for single-source
source = metadata["source"]
repo_id = metadata.get("repo_id") # None for single-source
key = (source, repo_id) if repo_id else (source, None)
if key not in by_source:
@@ -150,7 +144,7 @@ This skill combines knowledge from {len(sources_found)} source type(s):
by_source[key].append((filename, metadata))
# Add source breakdown with repo identity
for (source, repo_id) in sorted(by_source.keys()):
for source, repo_id in sorted(by_source.keys()):
files = by_source[(source, repo_id)]
if repo_id:
prompt += f"\n**{source.upper()} - {repo_id} ({len(files)} file(s))**\n"
@@ -164,14 +158,14 @@ This skill combines knowledge from {len(sources_found)} source type(s):
prompt += "\n\nREFERENCE DOCUMENTATION:\n"
# Add references grouped by (source, repo_id) with metadata
for (source, repo_id) in sorted(by_source.keys()):
for source, repo_id in sorted(by_source.keys()):
if repo_id:
prompt += f"\n### {source.upper()} SOURCES - {repo_id}\n\n"
else:
prompt += f"\n### {source.upper()} SOURCES\n\n"
for filename, metadata in by_source[(source, repo_id)]:
content = metadata['content']
content = metadata["content"]
# Limit per-file to 30K
if len(content) > 30000:
content = content[:30000] + "\n\n[Content truncated for size...]"
@@ -197,12 +191,12 @@ MULTI-REPOSITORY HANDLING:
# Detect multiple repos from same source type
repo_ids = set()
for metadata in references.values():
if metadata.get('repo_id'):
repo_ids.add(metadata['repo_id'])
if metadata.get("repo_id"):
repo_ids.add(metadata["repo_id"])
if len(repo_ids) > 1:
prompt += f"""
⚠️ MULTIPLE REPOSITORIES DETECTED: {', '.join(sorted(repo_ids))}
⚠️ MULTIPLE REPOSITORIES DETECTED: {", ".join(sorted(repo_ids))}
This skill combines codebase analysis from {len(repo_ids)} different repositories.
Each repo has its own ARCHITECTURE.md, patterns, examples, and configuration.
@@ -285,27 +279,23 @@ Return ONLY the complete SKILL.md content, starting with the frontmatter (---).
"""Save the enhanced SKILL.md"""
# Backup original
if self.skill_md_path.exists():
backup_path = self.skill_md_path.with_suffix('.md.backup')
backup_path = self.skill_md_path.with_suffix(".md.backup")
self.skill_md_path.rename(backup_path)
print(f" 💾 Backed up original to: {backup_path.name}")
# Save enhanced version
self.skill_md_path.write_text(content, encoding='utf-8')
print(f" ✅ Saved enhanced SKILL.md")
self.skill_md_path.write_text(content, encoding="utf-8")
print(" ✅ Saved enhanced SKILL.md")
def run(self):
"""Main enhancement workflow"""
print(f"\n{'='*60}")
print(f"\n{'=' * 60}")
print(f"ENHANCING SKILL: {self.skill_dir.name}")
print(f"{'='*60}\n")
print(f"{'=' * 60}\n")
# Read reference files
print("📖 Reading reference documentation...")
references = read_reference_files(
self.skill_dir,
max_chars=API_CONTENT_LIMIT,
preview_limit=API_PREVIEW_LIMIT
)
references = read_reference_files(self.skill_dir, max_chars=API_CONTENT_LIMIT, preview_limit=API_PREVIEW_LIMIT)
if not references:
print("❌ No reference files found to analyze")
@@ -314,11 +304,11 @@ Return ONLY the complete SKILL.md content, starting with the frontmatter (---).
# Analyze sources
sources_found = set()
for metadata in references.values():
sources_found.add(metadata['source'])
sources_found.add(metadata["source"])
print(f" ✓ Read {len(references)} reference files")
print(f" ✓ Sources: {', '.join(sorted(sources_found))}")
total_size = sum(meta['size'] for meta in references.values())
total_size = sum(meta["size"] for meta in references.values())
print(f" ✓ Total size: {total_size:,} characters\n")
# Read current SKILL.md
@@ -326,7 +316,7 @@ Return ONLY the complete SKILL.md content, starting with the frontmatter (---).
if current_skill_md:
print(f" Found existing SKILL.md ({len(current_skill_md)} chars)")
else:
print(f" No existing SKILL.md, will create new one")
print(" No existing SKILL.md, will create new one")
# Enhance with Claude
enhanced = self.enhance_skill_md(references, current_skill_md)
@@ -341,11 +331,11 @@ Return ONLY the complete SKILL.md content, starting with the frontmatter (---).
print("💾 Saving enhanced SKILL.md...")
self.save_enhanced_skill_md(enhanced)
print(f"\n✅ Enhancement complete!")
print(f"\nNext steps:")
print("\n✅ Enhancement complete!")
print("\nNext steps:")
print(f" 1. Review: {self.skill_md_path}")
print(f" 2. If you don't like it, restore backup: {self.skill_md_path.with_suffix('.md.backup')}")
print(f" 3. Package your skill:")
print(" 3. Package your skill:")
print(f" skill-seekers package {self.skill_dir}/")
return True
@@ -353,7 +343,7 @@ Return ONLY the complete SKILL.md content, starting with the frontmatter (---).
def main():
parser = argparse.ArgumentParser(
description='Enhance SKILL.md using platform AI APIs',
description="Enhance SKILL.md using platform AI APIs",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
Examples:
@@ -374,19 +364,18 @@ Examples:
# Dry run
skill-seekers enhance output/godot/ --dry-run
"""
""",
)
parser.add_argument('skill_dir', type=str,
help='Path to skill directory (e.g., output/steam-inventory/)')
parser.add_argument('--api-key', type=str,
help='Platform API key (or set environment variable)')
parser.add_argument('--target',
choices=['claude', 'gemini', 'openai'],
default='claude',
help='Target LLM platform (default: claude)')
parser.add_argument('--dry-run', action='store_true',
help='Show what would be done without calling API')
parser.add_argument("skill_dir", type=str, help="Path to skill directory (e.g., output/steam-inventory/)")
parser.add_argument("--api-key", type=str, help="Platform API key (or set environment variable)")
parser.add_argument(
"--target",
choices=["claude", "gemini", "openai"],
default="claude",
help="Target LLM platform (default: claude)",
)
parser.add_argument("--dry-run", action="store_true", help="Show what would be done without calling API")
args = parser.parse_args()
@@ -402,7 +391,7 @@ Examples:
# Dry run mode
if args.dry_run:
print(f"🔍 DRY RUN MODE")
print("🔍 DRY RUN MODE")
print(f" Would enhance: {skill_dir}")
print(f" References: {skill_dir / 'references'}")
print(f" SKILL.md: {skill_dir / 'SKILL.md'}")
@@ -427,7 +416,7 @@ Examples:
if not adaptor.supports_enhancement():
print(f"❌ Error: {adaptor.PLATFORM_NAME} does not support AI enhancement")
print(f"\nSupported platforms for enhancement:")
print("\nSupported platforms for enhancement:")
print(" - Claude AI (Anthropic)")
print(" - Google Gemini")
print(" - OpenAI ChatGPT")
@@ -436,7 +425,7 @@ Examples:
# Get API key
api_key = args.api_key
if not api_key:
api_key = os.environ.get(adaptor.get_env_var_name(), '').strip()
api_key = os.environ.get(adaptor.get_env_var_name(), "").strip()
if not api_key:
print(f"❌ Error: {adaptor.get_env_var_name()} not set")
@@ -447,19 +436,19 @@ Examples:
sys.exit(1)
# Run enhancement using adaptor
print(f"\n{'='*60}")
print(f"\n{'=' * 60}")
print(f"ENHANCING SKILL: {skill_dir}")
print(f"Platform: {adaptor.PLATFORM_NAME}")
print(f"{'='*60}\n")
print(f"{'=' * 60}\n")
success = adaptor.enhance(Path(skill_dir), api_key)
if success:
print(f"\n✅ Enhancement complete!")
print(f"\nNext steps:")
print("\n✅ Enhancement complete!")
print("\nNext steps:")
print(f" 1. Review: {Path(skill_dir) / 'SKILL.md'}")
print(f" 2. If you don't like it, restore backup: {Path(skill_dir) / 'SKILL.md.backup'}")
print(f" 3. Package your skill:")
print(" 3. Package your skill:")
print(f" skill-seekers package {skill_dir}/ --target {args.target}")
sys.exit(0 if success else 1)
@@ -474,6 +463,7 @@ Examples:
except Exception as e:
print(f"❌ Unexpected error: {e}")
import traceback
traceback.print_exc()
sys.exit(1)

View File

@@ -36,15 +36,15 @@ Terminal Selection:
Supported terminals: Ghostty, iTerm, Terminal, WezTerm
"""
import os
import sys
import time
import subprocess
import tempfile
import json
import os
import subprocess
import sys
import tempfile
import threading
from pathlib import Path
import time
from datetime import datetime
from pathlib import Path
# Add parent directory to path for imports when run as script
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
@@ -77,29 +77,29 @@ def detect_terminal_app():
"""
# Map TERM_PROGRAM values to macOS app names
TERMINAL_MAP = {
'Apple_Terminal': 'Terminal',
'iTerm.app': 'iTerm',
'ghostty': 'Ghostty',
'WezTerm': 'WezTerm',
"Apple_Terminal": "Terminal",
"iTerm.app": "iTerm",
"ghostty": "Ghostty",
"WezTerm": "WezTerm",
}
# Priority 1: Check SKILL_SEEKER_TERMINAL env var (explicit preference)
preferred_terminal = os.environ.get('SKILL_SEEKER_TERMINAL', '').strip()
preferred_terminal = os.environ.get("SKILL_SEEKER_TERMINAL", "").strip()
if preferred_terminal:
return preferred_terminal, 'SKILL_SEEKER_TERMINAL'
return preferred_terminal, "SKILL_SEEKER_TERMINAL"
# Priority 2: Check TERM_PROGRAM (inherit current terminal)
term_program = os.environ.get('TERM_PROGRAM', '').strip()
term_program = os.environ.get("TERM_PROGRAM", "").strip()
if term_program and term_program in TERMINAL_MAP:
return TERMINAL_MAP[term_program], 'TERM_PROGRAM'
return TERMINAL_MAP[term_program], "TERM_PROGRAM"
# Priority 3: Fallback to Terminal.app
if term_program:
# TERM_PROGRAM is set but unknown
return 'Terminal', f'unknown TERM_PROGRAM ({term_program})'
return "Terminal", f"unknown TERM_PROGRAM ({term_program})"
else:
# No TERM_PROGRAM set
return 'Terminal', 'default'
return "Terminal", "default"
class LocalSkillEnhancer:
@@ -132,7 +132,7 @@ class LocalSkillEnhancer:
Returns:
Summarized content
"""
lines = content.split('\n')
lines = content.split("\n")
target_lines = int(len(lines) * target_ratio)
# Priority 1: Keep introduction (first 20%)
@@ -146,7 +146,7 @@ class LocalSkillEnhancer:
block_start_idx = 0
for i, line in enumerate(lines[intro_lines:], start=intro_lines):
if line.strip().startswith('```'):
if line.strip().startswith("```"):
if in_code_block:
# End of code block - add closing ``` and save
current_block.append(line)
@@ -174,9 +174,9 @@ class LocalSkillEnhancer:
headings_added = 0
while i < len(lines) and headings_added < 10:
line = lines[i]
if line.startswith('#'):
if line.startswith("#"):
# Found heading - keep it and next 3 lines
chunk = lines[i:min(i+4, len(lines))]
chunk = lines[i : min(i + 4, len(lines))]
result.extend(chunk)
headings_added += 1
i += 4
@@ -185,7 +185,7 @@ class LocalSkillEnhancer:
result.append("\n\n[Content intelligently summarized - full details in reference files]")
return '\n'.join(result)
return "\n".join(result)
def create_enhancement_prompt(self, use_summarization=False, summarization_ratio=0.3):
"""Create the prompt file for Claude Code
@@ -197,9 +197,7 @@ class LocalSkillEnhancer:
# Read reference files (with enriched metadata)
references = read_reference_files(
self.skill_dir,
max_chars=LOCAL_CONTENT_LIMIT,
preview_limit=LOCAL_PREVIEW_LIMIT
self.skill_dir, max_chars=LOCAL_CONTENT_LIMIT, preview_limit=LOCAL_PREVIEW_LIMIT
)
if not references:
@@ -209,52 +207,54 @@ class LocalSkillEnhancer:
# Analyze sources
sources_found = set()
for metadata in references.values():
sources_found.add(metadata['source'])
sources_found.add(metadata["source"])
# Calculate total size
total_ref_size = sum(meta['size'] for meta in references.values())
total_ref_size = sum(meta["size"] for meta in references.values())
# Apply summarization if requested or if content is too large
if use_summarization or total_ref_size > 30000:
if not use_summarization:
print(f" ⚠️ Large skill detected ({total_ref_size:,} chars)")
print(f" 📊 Applying smart summarization (target: {int(summarization_ratio*100)}% of original)")
print(f" 📊 Applying smart summarization (target: {int(summarization_ratio * 100)}% of original)")
print()
# Summarize each reference
for filename, metadata in references.items():
summarized = self.summarize_reference(metadata['content'], summarization_ratio)
metadata['content'] = summarized
metadata['size'] = len(summarized)
summarized = self.summarize_reference(metadata["content"], summarization_ratio)
metadata["content"] = summarized
metadata["size"] = len(summarized)
new_size = sum(meta['size'] for meta in references.values())
print(f" ✓ Reduced from {total_ref_size:,} to {new_size:,} chars ({int(new_size/total_ref_size*100)}%)")
new_size = sum(meta["size"] for meta in references.values())
print(
f" ✓ Reduced from {total_ref_size:,} to {new_size:,} chars ({int(new_size / total_ref_size * 100)}%)"
)
print()
# Read current SKILL.md
current_skill_md = ""
if self.skill_md_path.exists():
current_skill_md = self.skill_md_path.read_text(encoding='utf-8')
current_skill_md = self.skill_md_path.read_text(encoding="utf-8")
# Analyze conflicts if present
has_conflicts = any('conflicts' in meta['path'] for meta in references.values())
has_conflicts = any("conflicts" in meta["path"] for meta in references.values())
# Build prompt with multi-source awareness
prompt = f"""I need you to enhance the SKILL.md file for the {self.skill_dir.name} skill.
SKILL OVERVIEW:
- Name: {self.skill_dir.name}
- Source Types: {', '.join(sorted(sources_found))}
- Multi-Source: {'Yes' if len(sources_found) > 1 else 'No'}
- Conflicts Detected: {'Yes - see conflicts.md in references' if has_conflicts else 'No'}
- Source Types: {", ".join(sorted(sources_found))}
- Multi-Source: {"Yes" if len(sources_found) > 1 else "No"}
- Conflicts Detected: {"Yes - see conflicts.md in references" if has_conflicts else "No"}
CURRENT SKILL.MD:
{'-'*60}
{current_skill_md if current_skill_md else '(No existing SKILL.md - create from scratch)'}
{'-'*60}
{"-" * 60}
{current_skill_md if current_skill_md else "(No existing SKILL.md - create from scratch)"}
{"-" * 60}
SOURCE ANALYSIS:
{'-'*60}
{"-" * 60}
This skill combines knowledge from {len(sources_found)} source type(s):
"""
@@ -262,8 +262,8 @@ This skill combines knowledge from {len(sources_found)} source type(s):
# Group references by (source_type, repo_id) for multi-source support
by_source = {}
for filename, metadata in references.items():
source = metadata['source']
repo_id = metadata.get('repo_id') # None for single-source
source = metadata["source"]
repo_id = metadata.get("repo_id") # None for single-source
key = (source, repo_id) if repo_id else (source, None)
if key not in by_source:
@@ -271,7 +271,7 @@ This skill combines knowledge from {len(sources_found)} source type(s):
by_source[key].append((filename, metadata))
# Add source breakdown with repo identity
for (source, repo_id) in sorted(by_source.keys()):
for source, repo_id in sorted(by_source.keys()):
files = by_source[(source, repo_id)]
if repo_id:
prompt += f"\n**{source.upper()} - {repo_id} ({len(files)} file(s))**\n"
@@ -283,14 +283,14 @@ This skill combines knowledge from {len(sources_found)} source type(s):
prompt += f"- ... and {len(files) - 5} more\n"
prompt += f"""
{'-'*60}
{"-" * 60}
REFERENCE DOCUMENTATION:
{'-'*60}
{"-" * 60}
"""
# Add references grouped by (source, repo_id) with metadata
for (source, repo_id) in sorted(by_source.keys()):
for source, repo_id in sorted(by_source.keys()):
if repo_id:
prompt += f"\n### {source.upper()} SOURCES - {repo_id}\n\n"
else:
@@ -298,7 +298,7 @@ REFERENCE DOCUMENTATION:
for filename, metadata in by_source[(source, repo_id)]:
# Further limit per-file to 12K to be safe
content = metadata['content']
content = metadata["content"]
max_per_file = 12000
if len(content) > max_per_file:
content = content[:max_per_file] + "\n\n[Content truncated for size...]"
@@ -311,7 +311,7 @@ REFERENCE DOCUMENTATION:
prompt += f"{content}\n"
prompt += f"""
{'-'*60}
{"-" * 60}
REFERENCE PRIORITY (when sources differ):
1. **Code patterns (codebase_analysis)**: Ground truth - what the code actually does
@@ -325,12 +325,12 @@ MULTI-REPOSITORY HANDLING:
# Detect multiple repos from same source type
repo_ids = set()
for metadata in references.values():
if metadata.get('repo_id'):
repo_ids.add(metadata['repo_id'])
if metadata.get("repo_id"):
repo_ids.add(metadata["repo_id"])
if len(repo_ids) > 1:
prompt += f"""
⚠️ MULTIPLE REPOSITORIES DETECTED: {', '.join(sorted(repo_ids))}
⚠️ MULTIPLE REPOSITORIES DETECTED: {", ".join(sorted(repo_ids))}
This skill combines codebase analysis from {len(repo_ids)} different repositories.
Each repo has its own ARCHITECTURE.md, patterns, examples, and configuration.
@@ -435,10 +435,10 @@ After writing, the file SKILL.md should:
"progress": progress,
"timestamp": datetime.now().isoformat(),
"skill_dir": str(self.skill_dir),
"error": error
"error": error,
}
self.status_file.write_text(json.dumps(status_data, indent=2), encoding='utf-8')
self.status_file.write_text(json.dumps(status_data, indent=2), encoding="utf-8")
def read_status(self):
"""Read enhancement status from file.
@@ -450,7 +450,7 @@ After writing, the file SKILL.md should:
return None
try:
return json.loads(self.status_file.read_text(encoding='utf-8'))
return json.loads(self.status_file.read_text(encoding="utf-8"))
except:
return None
@@ -482,9 +482,9 @@ After writing, the file SKILL.md should:
# Daemon mode: Run as persistent process with monitoring
if daemon:
return self._run_daemon(timeout)
print(f"\n{'='*60}")
print(f"\n{'=' * 60}")
print(f"LOCAL ENHANCEMENT: {self.skill_dir.name}")
print(f"{'='*60}\n")
print(f"{'=' * 60}\n")
# Validate
if not self.skill_dir.exists():
@@ -494,9 +494,7 @@ After writing, the file SKILL.md should:
# Read reference files
print("📖 Reading reference documentation...")
references = read_reference_files(
self.skill_dir,
max_chars=LOCAL_CONTENT_LIMIT,
preview_limit=LOCAL_PREVIEW_LIMIT
self.skill_dir, max_chars=LOCAL_CONTENT_LIMIT, preview_limit=LOCAL_PREVIEW_LIMIT
)
if not references:
@@ -504,7 +502,7 @@ After writing, the file SKILL.md should:
return False
print(f" ✓ Read {len(references)} reference files")
total_size = sum(ref['size'] for ref in references.values())
total_size = sum(ref["size"] for ref in references.values())
print(f" ✓ Total size: {total_size:,} characters\n")
# Check if we need smart summarization
@@ -513,7 +511,7 @@ After writing, the file SKILL.md should:
if use_summarization:
print("⚠️ LARGE SKILL DETECTED")
print(f" 📊 Reference content: {total_size:,} characters")
print(f" 💡 Claude CLI limit: ~30,000-40,000 characters")
print(" 💡 Claude CLI limit: ~30,000-40,000 characters")
print()
print(" 🔧 Applying smart summarization to ensure success...")
print(" • Keeping introductions and overviews")
@@ -530,13 +528,13 @@ After writing, the file SKILL.md should:
return False
# Save prompt to temp file
with tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False, encoding='utf-8') as f:
with tempfile.NamedTemporaryFile(mode="w", suffix=".txt", delete=False, encoding="utf-8") as f:
prompt_file = f.name
f.write(prompt)
if use_summarization:
print(f" ✓ Prompt created and optimized ({len(prompt):,} characters)")
print(f" ✓ Ready for Claude CLI (within safe limits)")
print(" ✓ Ready for Claude CLI (within safe limits)")
print()
else:
print(f" ✓ Prompt saved ({len(prompt):,} characters)\n")
@@ -555,49 +553,49 @@ After writing, the file SKILL.md should:
print()
# Create a shell script to run in the terminal
shell_script = f'''#!/bin/bash
shell_script = f"""#!/bin/bash
claude {prompt_file}
echo ""
echo "✅ Enhancement complete!"
echo "Press any key to close..."
read -n 1
rm {prompt_file}
'''
"""
# Save shell script
with tempfile.NamedTemporaryFile(mode='w', suffix='.sh', delete=False) as f:
with tempfile.NamedTemporaryFile(mode="w", suffix=".sh", delete=False) as f:
script_file = f.name
f.write(shell_script)
os.chmod(script_file, 0o755)
# Launch in new terminal (macOS specific)
if sys.platform == 'darwin':
if sys.platform == "darwin":
# Detect which terminal app to use
terminal_app, detection_method = detect_terminal_app()
# Show detection info
if detection_method == 'SKILL_SEEKER_TERMINAL':
if detection_method == "SKILL_SEEKER_TERMINAL":
print(f" Using terminal: {terminal_app} (from SKILL_SEEKER_TERMINAL)")
elif detection_method == 'TERM_PROGRAM':
elif detection_method == "TERM_PROGRAM":
print(f" Using terminal: {terminal_app} (inherited from current terminal)")
elif detection_method.startswith('unknown TERM_PROGRAM'):
elif detection_method.startswith("unknown TERM_PROGRAM"):
print(f"⚠️ {detection_method}")
print(f" → Using Terminal.app as fallback")
print(" → Using Terminal.app as fallback")
else:
print(f" Using terminal: {terminal_app} (default)")
try:
subprocess.Popen(['open', '-a', terminal_app, script_file])
subprocess.Popen(["open", "-a", terminal_app, script_file])
except Exception as e:
print(f"⚠️ Error launching {terminal_app}: {e}")
print(f"\nManually run: {script_file}")
return False
else:
print("⚠️ Auto-launch only works on macOS")
print(f"\nManually run this command in a new terminal:")
print("\nManually run this command in a new terminal:")
print(f" claude '{prompt_file}'")
print(f"\nThen delete the prompt file:")
print("\nThen delete the prompt file:")
print(f" rm '{prompt_file}'")
return False
@@ -614,7 +612,9 @@ rm {prompt_file}
print()
print("💡 When done:")
print(f" 1. Check the enhanced SKILL.md: {self.skill_md_path}")
print(f" 2. If you don't like it, restore: mv {self.skill_md_path.with_suffix('.md.backup')} {self.skill_md_path}")
print(
f" 2. If you don't like it, restore: mv {self.skill_md_path.with_suffix('.md.backup')} {self.skill_md_path}"
)
print(f" 3. Package: skill-seekers package {self.skill_dir}/")
return True
@@ -630,10 +630,9 @@ rm {prompt_file}
bool: True if enhancement succeeded
"""
import time
from pathlib import Path
print("✨ Running Claude Code enhancement (headless mode)...")
print(f" Timeout: {timeout} seconds ({timeout//60} minutes)")
print(f" Timeout: {timeout} seconds ({timeout // 60} minutes)")
print()
# Record initial state
@@ -652,11 +651,11 @@ rm {prompt_file}
print()
result = subprocess.run(
['claude', '--dangerously-skip-permissions', prompt_file],
["claude", "--dangerously-skip-permissions", prompt_file],
capture_output=True,
text=True,
timeout=timeout,
cwd=str(self.skill_dir) # Run from skill directory
cwd=str(self.skill_dir), # Run from skill directory
)
elapsed = time.time() - start_time
@@ -681,21 +680,21 @@ rm {prompt_file}
return True
else:
print(f"⚠️ Claude finished but SKILL.md was not updated")
print("⚠️ Claude finished but SKILL.md was not updated")
print(f" Initial: mtime={initial_mtime}, size={initial_size}")
print(f" Final: mtime={new_mtime}, size={new_size}")
print(f" This might indicate an error during enhancement")
print(" This might indicate an error during enhancement")
print()
# Show last 20 lines of stdout for debugging
if result.stdout:
print(" Last output from Claude:")
lines = result.stdout.strip().split('\n')[-20:]
lines = result.stdout.strip().split("\n")[-20:]
for line in lines:
print(f" | {line}")
print()
return False
else:
print(f"❌ SKILL.md not found after enhancement")
print("❌ SKILL.md not found after enhancement")
return False
else:
print(f"❌ Claude Code returned error (exit code: {result.returncode})")
@@ -750,9 +749,9 @@ rm {prompt_file}
Returns:
bool: True if background task started successfully
"""
print(f"\n{'='*60}")
print(f"\n{'=' * 60}")
print(f"BACKGROUND ENHANCEMENT: {self.skill_dir.name}")
print(f"{'='*60}\n")
print(f"{'=' * 60}\n")
# Write initial status
self.write_status("pending", "Starting background enhancement...")
@@ -764,9 +763,7 @@ rm {prompt_file}
# Read reference files
references = read_reference_files(
self.skill_dir,
max_chars=LOCAL_CONTENT_LIMIT,
preview_limit=LOCAL_PREVIEW_LIMIT
self.skill_dir, max_chars=LOCAL_CONTENT_LIMIT, preview_limit=LOCAL_PREVIEW_LIMIT
)
if not references:
@@ -785,7 +782,7 @@ rm {prompt_file}
return
# Save prompt to temp file
with tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False, encoding='utf-8') as f:
with tempfile.NamedTemporaryFile(mode="w", suffix=".txt", delete=False, encoding="utf-8") as f:
prompt_file = f.name
f.write(prompt)
@@ -794,12 +791,7 @@ rm {prompt_file}
# Run enhancement
if headless:
# Run headless (subprocess.run - blocking in thread)
result = subprocess.run(
['claude', prompt_file],
capture_output=True,
text=True,
timeout=timeout
)
result = subprocess.run(["claude", prompt_file], capture_output=True, text=True, timeout=timeout)
# Clean up
try:
@@ -848,9 +840,9 @@ rm {prompt_file}
Returns:
bool: True if daemon started successfully
"""
print(f"\n{'='*60}")
print(f"\n{'=' * 60}")
print(f"DAEMON MODE: {self.skill_dir.name}")
print(f"{'='*60}\n")
print(f"{'=' * 60}\n")
# Write initial status
self.write_status("pending", "Starting daemon process...")
@@ -939,7 +931,7 @@ except Exception as e:
# Save daemon script
daemon_script_path = self.skill_dir / ".enhancement_daemon.py"
daemon_script_path.write_text(daemon_script, encoding='utf-8')
daemon_script_path.write_text(daemon_script, encoding="utf-8")
daemon_script_path.chmod(0o755)
# Start daemon process (fully detached)
@@ -950,19 +942,16 @@ except Exception as e:
if self.force:
# Force mode: No output, fully silent
subprocess.Popen(
['nohup', 'python3', str(daemon_script_path)],
["nohup", "python3", str(daemon_script_path)],
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
start_new_session=True
start_new_session=True,
)
else:
# Normal mode: Log to file
with open(log_file, 'w') as log:
with open(log_file, "w") as log:
subprocess.Popen(
['nohup', 'python3', str(daemon_script_path)],
stdout=log,
stderr=log,
start_new_session=True
["nohup", "python3", str(daemon_script_path)], stdout=log, stderr=log, start_new_session=True
)
# Give daemon time to start
@@ -971,7 +960,7 @@ except Exception as e:
# Read status to verify it started
status = self.read_status()
if status and status.get('status') in ['pending', 'running']:
if status and status.get("status") in ["pending", "running"]:
print("✅ Daemon process started successfully!")
print()
print("📊 Monitoring:")
@@ -1032,43 +1021,31 @@ Mode Comparison:
Force Mode (Default ON):
By default, all modes skip confirmations (auto-yes).
Use --no-force to enable confirmation prompts.
"""
""",
)
parser.add_argument("skill_directory", help="Path to skill directory (e.g., output/react/)")
parser.add_argument(
"--interactive-enhancement",
action="store_true",
help="Open terminal window for enhancement (default: headless mode)",
)
parser.add_argument(
'skill_directory',
help='Path to skill directory (e.g., output/react/)'
"--background", action="store_true", help="Run in background and return immediately (non-blocking)"
)
parser.add_argument("--daemon", action="store_true", help="Run as persistent daemon process (fully detached)")
parser.add_argument(
"--no-force",
action="store_true",
help="Disable force mode: enable confirmation prompts (default: force mode ON)",
)
parser.add_argument(
'--interactive-enhancement',
action='store_true',
help='Open terminal window for enhancement (default: headless mode)'
)
parser.add_argument(
'--background',
action='store_true',
help='Run in background and return immediately (non-blocking)'
)
parser.add_argument(
'--daemon',
action='store_true',
help='Run as persistent daemon process (fully detached)'
)
parser.add_argument(
'--no-force',
action='store_true',
help='Disable force mode: enable confirmation prompts (default: force mode ON)'
)
parser.add_argument(
'--timeout',
type=int,
default=600,
help='Timeout in seconds for headless mode (default: 600 = 10 minutes)'
"--timeout", type=int, default=600, help="Timeout in seconds for headless mode (default: 600 = 10 minutes)"
)
args = parser.parse_args()
@@ -1084,12 +1061,7 @@ Force Mode (Default ON):
# Force mode is ON by default, use --no-force to disable
enhancer = LocalSkillEnhancer(args.skill_directory, force=not args.no_force)
headless = not args.interactive_enhancement # Invert: default is headless
success = enhancer.run(
headless=headless,
timeout=args.timeout,
background=args.background,
daemon=args.daemon
)
success = enhancer.run(headless=headless, timeout=args.timeout, background=args.background, daemon=args.daemon)
sys.exit(0 if success else 1)

View File

@@ -10,9 +10,8 @@ Usage:
skill-seekers enhance-status output/react/ --json
"""
import os
import sys
import json
import sys
import time
from pathlib import Path
@@ -32,7 +31,7 @@ def read_status(skill_dir):
return None
try:
return json.loads(status_file.read_text(encoding='utf-8'))
return json.loads(status_file.read_text(encoding="utf-8"))
except Exception as e:
return {"error": f"Failed to read status: {e}"}
@@ -53,26 +52,21 @@ def format_status(status):
return f"{status['error']}"
# Status emoji mapping
status_emojis = {
"pending": "",
"running": "🔄",
"completed": "",
"failed": ""
}
status_emojis = {"pending": "", "running": "🔄", "completed": "", "failed": ""}
emoji = status_emojis.get(status.get('status', ''), '')
status_text = status.get('status', 'unknown').upper()
message = status.get('message', '')
progress = status.get('progress', 0.0)
timestamp = status.get('timestamp', 'unknown')
error = status.get('error')
pid = status.get('pid')
emoji = status_emojis.get(status.get("status", ""), "")
status_text = status.get("status", "unknown").upper()
message = status.get("message", "")
progress = status.get("progress", 0.0)
timestamp = status.get("timestamp", "unknown")
error = status.get("error")
pid = status.get("pid")
# Build output
lines = []
lines.append(f"\n{'='*60}")
lines.append(f"\n{'=' * 60}")
lines.append(f"ENHANCEMENT STATUS: {status_text}")
lines.append(f"{'='*60}\n")
lines.append(f"{'=' * 60}\n")
lines.append(f"{emoji} Status: {status_text}")
@@ -81,7 +75,7 @@ def format_status(status):
if progress > 0:
progress_pct = int(progress * 100)
progress_bar = '' * (progress_pct // 5) + '' * (20 - progress_pct // 5)
progress_bar = "" * (progress_pct // 5) + "" * (20 - progress_pct // 5)
lines.append(f" Progress: [{progress_bar}] {progress_pct}%")
if pid:
@@ -94,7 +88,7 @@ def format_status(status):
lines.append("")
return '\n'.join(lines)
return "\n".join(lines)
def watch_status(skill_dir, interval=2):
@@ -106,7 +100,7 @@ def watch_status(skill_dir, interval=2):
"""
print(f"👀 Watching enhancement status for: {skill_dir}")
print(f" Update interval: {interval} seconds")
print(f" Press Ctrl+C to stop\n")
print(" Press Ctrl+C to stop\n")
try:
last_status = None
@@ -123,7 +117,7 @@ def watch_status(skill_dir, interval=2):
last_status = status
# Exit if completed or failed
if status and status.get('status') in ['completed', 'failed']:
if status and status.get("status") in ["completed", "failed"]:
break
time.sleep(interval)
@@ -149,32 +143,18 @@ Examples:
# Get JSON output (for scripts)
skill-seekers enhance-status output/react/ --json
"""
""",
)
parser.add_argument(
'skill_directory',
help='Path to skill directory (e.g., output/react/)'
)
parser.add_argument("skill_directory", help="Path to skill directory (e.g., output/react/)")
parser.add_argument(
'--watch', '-w',
action='store_true',
help='Watch status in real-time (updates every 2 seconds)'
"--watch", "-w", action="store_true", help="Watch status in real-time (updates every 2 seconds)"
)
parser.add_argument(
'--json',
action='store_true',
help='Output raw JSON (for scripting)'
)
parser.add_argument("--json", action="store_true", help="Output raw JSON (for scripting)")
parser.add_argument(
'--interval',
type=int,
default=2,
help='Watch update interval in seconds (default: 2)'
)
parser.add_argument("--interval", type=int, default=2, help="Watch update interval in seconds (default: 2)")
args = parser.parse_args()
@@ -197,9 +177,9 @@ Examples:
# Exit code based on status
if not status:
sys.exit(2) # No status found
elif status.get('status') == 'completed':
elif status.get("status") == "completed":
sys.exit(0) # Success
elif status.get('status') == 'failed':
elif status.get("status") == "failed":
sys.exit(1) # Failed
else:
sys.exit(0) # In progress

View File

@@ -4,23 +4,20 @@ Page Count Estimator for Skill Seeker
Quickly estimates how many pages a config will scrape without downloading content
"""
import sys
import json
import os
import sys
import time
from pathlib import Path
from urllib.parse import urljoin, urlparse
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import time
import json
from pathlib import Path
# Add parent directory to path for imports when run as script
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from skill_seekers.cli.constants import (
DEFAULT_RATE_LIMIT,
DEFAULT_MAX_DISCOVERY,
DISCOVERY_THRESHOLD
)
from skill_seekers.cli.constants import DEFAULT_MAX_DISCOVERY, DEFAULT_RATE_LIMIT, DISCOVERY_THRESHOLD
def estimate_pages(config, max_discovery=DEFAULT_MAX_DISCOVERY, timeout=30):
@@ -35,20 +32,20 @@ def estimate_pages(config, max_discovery=DEFAULT_MAX_DISCOVERY, timeout=30):
Returns:
dict with estimation results
"""
base_url = config['base_url']
start_urls = config.get('start_urls', [base_url])
url_patterns = config.get('url_patterns', {'include': [], 'exclude': []})
rate_limit = config.get('rate_limit', DEFAULT_RATE_LIMIT)
base_url = config["base_url"]
start_urls = config.get("start_urls", [base_url])
url_patterns = config.get("url_patterns", {"include": [], "exclude": []})
rate_limit = config.get("rate_limit", DEFAULT_RATE_LIMIT)
visited = set()
pending = list(start_urls)
discovered = 0
include_patterns = url_patterns.get('include', [])
exclude_patterns = url_patterns.get('exclude', [])
include_patterns = url_patterns.get("include", [])
exclude_patterns = url_patterns.get("exclude", [])
# Handle unlimited mode
unlimited = (max_discovery == -1 or max_discovery is None)
unlimited = max_discovery == -1 or max_discovery is None
print(f"🔍 Estimating pages for: {config['name']}")
print(f"📍 Base URL: {base_url}")
@@ -56,8 +53,8 @@ def estimate_pages(config, max_discovery=DEFAULT_MAX_DISCOVERY, timeout=30):
print(f"⏱️ Rate limit: {rate_limit}s")
if unlimited:
print(f"🔢 Max discovery: UNLIMITED (will discover all pages)")
print(f"⚠️ WARNING: This may take a long time!")
print("🔢 Max discovery: UNLIMITED (will discover all pages)")
print("⚠️ WARNING: This may take a long time!")
else:
print(f"🔢 Max discovery: {max_discovery}")
@@ -80,26 +77,26 @@ def estimate_pages(config, max_discovery=DEFAULT_MAX_DISCOVERY, timeout=30):
if discovered % 10 == 0:
elapsed = time.time() - start_time
rate = discovered / elapsed if elapsed > 0 else 0
print(f"⏳ Discovered: {discovered} pages ({rate:.1f} pages/sec)", end='\r')
print(f"⏳ Discovered: {discovered} pages ({rate:.1f} pages/sec)", end="\r")
try:
# HEAD request first to check if page exists (faster)
head_response = requests.head(url, timeout=timeout, allow_redirects=True)
# Skip non-HTML content
content_type = head_response.headers.get('Content-Type', '')
if 'text/html' not in content_type:
content_type = head_response.headers.get("Content-Type", "")
if "text/html" not in content_type:
continue
# Now GET the page to find links
response = requests.get(url, timeout=timeout)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')
soup = BeautifulSoup(response.content, "html.parser")
# Find all links
for link in soup.find_all('a', href=True):
href = link['href']
for link in soup.find_all("a", href=True):
href = link["href"]
full_url = urljoin(url, href)
# Normalize URL
@@ -117,10 +114,10 @@ def estimate_pages(config, max_discovery=DEFAULT_MAX_DISCOVERY, timeout=30):
# Rate limiting
time.sleep(rate_limit)
except requests.RequestException as e:
except requests.RequestException:
# Silently skip errors during estimation
pass
except Exception as e:
except Exception:
# Silently skip other errors
pass
@@ -128,13 +125,13 @@ def estimate_pages(config, max_discovery=DEFAULT_MAX_DISCOVERY, timeout=30):
# Results
results = {
'discovered': discovered,
'pending': len(pending),
'estimated_total': discovered + len(pending),
'elapsed_seconds': round(elapsed, 2),
'discovery_rate': round(discovered / elapsed if elapsed > 0 else 0, 2),
'hit_limit': (not unlimited) and (discovered >= max_discovery),
'unlimited': unlimited
"discovered": discovered,
"pending": len(pending),
"estimated_total": discovered + len(pending),
"elapsed_seconds": round(elapsed, 2),
"discovery_rate": round(discovered / elapsed if elapsed > 0 else 0, 2),
"hit_limit": (not unlimited) and (discovered >= max_discovery),
"unlimited": unlimited,
}
return results
@@ -143,7 +140,7 @@ def estimate_pages(config, max_discovery=DEFAULT_MAX_DISCOVERY, timeout=30):
def is_valid_url(url, base_url, include_patterns, exclude_patterns):
"""Check if URL should be crawled"""
# Must be same domain
if not url.startswith(base_url.rstrip('/')):
if not url.startswith(base_url.rstrip("/")):
return False
# Check exclude patterns first
@@ -180,11 +177,11 @@ def print_results(results, config):
print(f"⏱️ Time Elapsed: {results['elapsed_seconds']}s")
print(f"⚡ Discovery Rate: {results['discovery_rate']} pages/sec")
if results.get('unlimited', False):
if results.get("unlimited", False):
print()
print("✅ UNLIMITED MODE - Discovered all reachable pages")
print(f" Total pages: {results['estimated_total']}")
elif results['hit_limit']:
elif results["hit_limit"]:
print()
print("⚠️ Hit discovery limit - actual total may be higher")
print(" Increase max_discovery parameter for more accurate estimate")
@@ -195,8 +192,8 @@ def print_results(results, config):
print("=" * 70)
print()
estimated = results['estimated_total']
current_max = config.get('max_pages', 100)
estimated = results["estimated_total"]
current_max = config.get("max_pages", 100)
if estimated <= current_max:
print(f"✅ Current max_pages ({current_max}) is sufficient")
@@ -207,7 +204,7 @@ def print_results(results, config):
print(f" (Estimated {estimated} + 50 buffer)")
# Estimate time for full scrape
rate_limit = config.get('rate_limit', DEFAULT_RATE_LIMIT)
rate_limit = config.get("rate_limit", DEFAULT_RATE_LIMIT)
estimated_time = (estimated * rate_limit) / 60 # in minutes
print()
@@ -220,7 +217,7 @@ def print_results(results, config):
def load_config(config_path):
"""Load configuration from JSON file"""
try:
with open(config_path, 'r') as f:
with open(config_path) as f:
config = json.load(f)
return config
except FileNotFoundError:
@@ -298,7 +295,7 @@ def list_all_configs():
# Try to load the config to get name and description
try:
with open(config_file, 'r') as f:
with open(config_file) as f:
config_data = json.load(f)
name = config_data.get("name", config_file.stem)
@@ -308,20 +305,19 @@ def list_all_configs():
if len(description) > 60:
description = description[:57] + "..."
by_category[category].append({
"file": config_file.name,
"path": str(rel_path),
"name": name,
"description": description
})
by_category[category].append(
{"file": config_file.name, "path": str(rel_path), "name": name, "description": description}
)
except Exception as e:
# If we can't parse the config, just use the filename
by_category[category].append({
"file": config_file.name,
"path": str(rel_path),
"name": config_file.stem,
"description": f"⚠️ Error loading config: {e}"
})
by_category[category].append(
{
"file": config_file.name,
"path": str(rel_path),
"name": config_file.stem,
"description": f"⚠️ Error loading config: {e}",
}
)
# Print configs by category
total = 0
@@ -351,7 +347,7 @@ def main():
import argparse
parser = argparse.ArgumentParser(
description='Estimate page count for Skill Seeker configs',
description="Estimate page count for Skill Seeker configs",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
Examples:
@@ -366,18 +362,25 @@ Examples:
# Quick estimate (stop at 100 pages)
skill-seekers estimate configs/vue.json --max-discovery 100
"""
""",
)
parser.add_argument('config', nargs='?', help='Path to config JSON file')
parser.add_argument('--all', action='store_true',
help='List all available configs from api/configs_repo/official/')
parser.add_argument('--max-discovery', '-m', type=int, default=DEFAULT_MAX_DISCOVERY,
help=f'Maximum pages to discover (default: {DEFAULT_MAX_DISCOVERY}, use -1 for unlimited)')
parser.add_argument('--unlimited', '-u', action='store_true',
help='Remove discovery limit - discover all pages (same as --max-discovery -1)')
parser.add_argument('--timeout', '-t', type=int, default=30,
help='HTTP request timeout in seconds (default: 30)')
parser.add_argument("config", nargs="?", help="Path to config JSON file")
parser.add_argument("--all", action="store_true", help="List all available configs from api/configs_repo/official/")
parser.add_argument(
"--max-discovery",
"-m",
type=int,
default=DEFAULT_MAX_DISCOVERY,
help=f"Maximum pages to discover (default: {DEFAULT_MAX_DISCOVERY}, use -1 for unlimited)",
)
parser.add_argument(
"--unlimited",
"-u",
action="store_true",
help="Remove discovery limit - discover all pages (same as --max-discovery -1)",
)
parser.add_argument("--timeout", "-t", type=int, default=30, help="HTTP request timeout in seconds (default: 30)")
args = parser.parse_args()
@@ -401,7 +404,7 @@ Examples:
print_results(results, config)
# Return exit code based on results
if results['hit_limit']:
if results["hit_limit"]:
return 2 # Warning: hit limit
return 0 # Success
@@ -413,5 +416,5 @@ Examples:
return 1
if __name__ == '__main__':
if __name__ == "__main__":
sys.exit(main())

View File

@@ -12,17 +12,17 @@ Phase 4 enhancements:
- GitHub issue links for context
"""
import argparse
import json
import sys
import argparse
from pathlib import Path
from typing import Dict, List, Any, Tuple, Optional
from typing import Any, Optional
# Import three-stream data classes (Phase 1)
try:
from .github_fetcher import ThreeStreamData, DocsStream, InsightsStream
from .merge_sources import categorize_issues_by_topic
from .github_fetcher import DocsStream, InsightsStream, ThreeStreamData
from .markdown_cleaner import MarkdownCleaner
from .merge_sources import categorize_issues_by_topic
except ImportError:
# Fallback if github_fetcher not available
ThreeStreamData = None
@@ -34,10 +34,9 @@ except ImportError:
class RouterGenerator:
"""Generates router skills that direct to specialized sub-skills with GitHub integration"""
def __init__(self,
config_paths: List[str],
router_name: str = None,
github_streams: Optional['ThreeStreamData'] = None):
def __init__(
self, config_paths: list[str], router_name: str = None, github_streams: Optional["ThreeStreamData"] = None
):
"""
Initialize router generator with optional GitHub streams.
@@ -60,21 +59,21 @@ class RouterGenerator:
if github_streams and github_streams.insights_stream:
self.github_metadata = github_streams.insights_stream.metadata
self.github_issues = {
'common_problems': github_streams.insights_stream.common_problems,
'known_solutions': github_streams.insights_stream.known_solutions,
'top_labels': github_streams.insights_stream.top_labels
"common_problems": github_streams.insights_stream.common_problems,
"known_solutions": github_streams.insights_stream.known_solutions,
"top_labels": github_streams.insights_stream.top_labels,
}
if github_streams and github_streams.docs_stream:
self.github_docs = {
'readme': github_streams.docs_stream.readme,
'contributing': github_streams.docs_stream.contributing
"readme": github_streams.docs_stream.readme,
"contributing": github_streams.docs_stream.contributing,
}
def load_config(self, path: Path) -> Dict[str, Any]:
def load_config(self, path: Path) -> dict[str, Any]:
"""Load a config file"""
try:
with open(path, 'r') as f:
with open(path) as f:
return json.load(f)
except Exception as e:
print(f"❌ Error loading {path}: {e}")
@@ -83,17 +82,17 @@ class RouterGenerator:
def infer_router_name(self) -> str:
"""Infer router name from sub-skill names"""
# Find common prefix
names = [cfg['name'] for cfg in self.configs]
names = [cfg["name"] for cfg in self.configs]
if not names:
return "router"
# Get common prefix before first dash
first_name = names[0]
if '-' in first_name:
return first_name.split('-')[0]
if "-" in first_name:
return first_name.split("-")[0]
return first_name
def extract_routing_keywords(self) -> Dict[str, List[str]]:
def extract_routing_keywords(self) -> dict[str, list[str]]:
"""
Extract keywords for routing to each skill (Phase 4 enhanced).
@@ -103,26 +102,26 @@ class RouterGenerator:
routing = {}
for config in self.configs:
name = config['name']
name = config["name"]
keywords = []
# Extract from categories (base weight: 1x)
if 'categories' in config:
keywords.extend(config['categories'].keys())
if "categories" in config:
keywords.extend(config["categories"].keys())
# Extract from name (part after dash)
if '-' in name:
skill_topic = name.split('-', 1)[1]
if "-" in name:
skill_topic = name.split("-", 1)[1]
keywords.append(skill_topic)
# Phase 4: Add GitHub issue labels (weight 2x by including twice)
if self.github_issues:
# Get top labels related to this skill topic
top_labels = self.github_issues.get('top_labels', [])
top_labels = self.github_issues.get("top_labels", [])
skill_keywords = set(keywords)
for label_info in top_labels[:10]: # Top 10 labels
label = label_info['label'].lower()
label = label_info["label"].lower()
# Check if label relates to any skill keyword
if any(keyword.lower() in label or label in keyword.lower() for keyword in skill_keywords):
@@ -141,7 +140,7 @@ class RouterGenerator:
return routing
def _extract_skill_specific_labels(self, skill_name: str, skill_keywords: set) -> List[str]:
def _extract_skill_specific_labels(self, skill_name: str, skill_keywords: set) -> list[str]:
"""
Extract labels from GitHub issues that match this specific skill.
@@ -159,14 +158,14 @@ class RouterGenerator:
if not self.github_issues:
return []
common_problems = self.github_issues.get('common_problems', [])
known_solutions = self.github_issues.get('known_solutions', [])
common_problems = self.github_issues.get("common_problems", [])
known_solutions = self.github_issues.get("known_solutions", [])
all_issues = common_problems + known_solutions
matching_labels = set()
for issue in all_issues:
issue_labels = issue.get('labels', [])
issue_labels = issue.get("labels", [])
issue_labels_lower = [label.lower() for label in issue_labels]
# Check if this issue relates to the skill
@@ -180,13 +179,20 @@ class RouterGenerator:
# Add ALL labels from this matching issue
for label in issue_labels_lower:
# Skip generic labels that don't add routing value
if label not in ['bug', 'enhancement', 'question', 'help wanted',
'good first issue', 'documentation', 'duplicate']:
if label not in [
"bug",
"enhancement",
"question",
"help wanted",
"good first issue",
"documentation",
"duplicate",
]:
matching_labels.add(label)
return list(matching_labels)
def _generate_frontmatter(self, routing_keywords: Dict[str, List[str]]) -> str:
def _generate_frontmatter(self, routing_keywords: dict[str, list[str]]) -> str:
"""
Generate YAML frontmatter compliant with agentskills.io spec.
@@ -201,16 +207,16 @@ class RouterGenerator:
# Build comprehensive description from all sub-skills
all_topics = []
for config in self.configs:
desc = config.get('description', '')
desc = config.get("description", "")
# Extract key topics from description (simple extraction)
topics = [word.strip() for word in desc.split(',') if word.strip()]
topics = [word.strip() for word in desc.split(",") if word.strip()]
all_topics.extend(topics[:2]) # Max 2 topics per skill
# Create keyword-rich description
unique_topics = list(dict.fromkeys(all_topics))[:7] # Top 7 unique topics
if unique_topics:
topics_str = ', '.join(unique_topics)
topics_str = ", ".join(unique_topics)
description = f"{self.router_name.title()} framework. Use when working with: {topics_str}"
else:
description = f"Use when working with {self.router_name.title()} development and programming"
@@ -225,21 +231,21 @@ class RouterGenerator:
# Try to get language-specific compatibility if GitHub metadata available
if self.github_metadata:
language = self.github_metadata.get('language', '')
language = self.github_metadata.get("language", "")
compatibility_map = {
'Python': f'Python 3.10+, requires {self.router_name} package',
'JavaScript': f'Node.js 18+, requires {self.router_name} package',
'TypeScript': f'Node.js 18+, TypeScript 5+, requires {self.router_name} package',
'Go': f'Go 1.20+, requires {self.router_name} package',
'Rust': f'Rust 1.70+, requires {self.router_name} package',
'Java': f'Java 17+, requires {self.router_name} package',
"Python": f"Python 3.10+, requires {self.router_name} package",
"JavaScript": f"Node.js 18+, requires {self.router_name} package",
"TypeScript": f"Node.js 18+, TypeScript 5+, requires {self.router_name} package",
"Go": f"Go 1.20+, requires {self.router_name} package",
"Rust": f"Rust 1.70+, requires {self.router_name} package",
"Java": f"Java 17+, requires {self.router_name} package",
}
if language in compatibility_map:
compatibility = compatibility_map[language]
# Try to extract license
if isinstance(self.github_metadata.get('license'), dict):
license_info = self.github_metadata['license'].get('name', 'MIT')
if isinstance(self.github_metadata.get("license"), dict):
license_info = self.github_metadata["license"].get("name", "MIT")
frontmatter = f"""---
name: {self.router_name}
@@ -289,27 +295,27 @@ compatibility: {compatibility}
"""
# Remove router name prefix
if skill_name.startswith(f"{self.router_name}-"):
topic = skill_name[len(self.router_name)+1:]
topic = skill_name[len(self.router_name) + 1 :]
else:
topic = skill_name
# Capitalize and add context
topic = topic.replace('-', ' ').title()
topic = topic.replace("-", " ").title()
# Add common suffixes for context
topic_map = {
'oauth': 'OAuth authentication',
'auth': 'authentication',
'async': 'async patterns',
'api': 'API integration',
'orm': 'ORM queries',
'hooks': 'hooks',
'routing': 'routing',
'testing': 'testing',
'2d': '2D development',
'3d': '3D development',
'scripting': 'scripting',
'physics': 'physics',
"oauth": "OAuth authentication",
"auth": "authentication",
"async": "async patterns",
"api": "API integration",
"orm": "ORM queries",
"hooks": "hooks",
"routing": "routing",
"testing": "testing",
"2d": "2D development",
"3d": "3D development",
"scripting": "scripting",
"physics": "physics",
}
topic_lower = topic.lower()
@@ -319,7 +325,7 @@ compatibility: {compatibility}
return topic
def _generate_dynamic_examples(self, routing_keywords: Dict[str, List[str]]) -> str:
def _generate_dynamic_examples(self, routing_keywords: dict[str, list[str]]) -> str:
"""
Generate examples dynamically from actual sub-skill names and keywords.
@@ -351,10 +357,7 @@ compatibility: {compatibility}
topic = self._extract_topic_from_skill(first_skill)
keyword = first_keywords[0] if first_keywords else topic
examples.append(
f'**Q:** "How do I implement {keyword}?"\n'
f'**A:** Activates {first_skill} skill'
)
examples.append(f'**Q:** "How do I implement {keyword}?"\n**A:** Activates {first_skill} skill')
# Example 2: Different skill (second sub-skill if available)
if len(skill_names) >= 2:
@@ -365,8 +368,7 @@ compatibility: {compatibility}
keyword = second_keywords[0] if second_keywords else topic
examples.append(
f'**Q:** "Working with {keyword} in {self.router_name.title()}"\n'
f'**A:** Activates {second_skill} skill'
f'**Q:** "Working with {keyword} in {self.router_name.title()}"\n**A:** Activates {second_skill} skill'
)
# Example 3: Multi-skill activation (if 2+ skills)
@@ -378,13 +380,12 @@ compatibility: {compatibility}
topic_2 = self._extract_topic_from_skill(skill_2)
examples.append(
f'**Q:** "Combining {topic_1} with {topic_2}"\n'
f'**A:** Activates {skill_1} + {skill_2} skills'
f'**Q:** "Combining {topic_1} with {topic_2}"\n**A:** Activates {skill_1} + {skill_2} skills'
)
return '\n\n'.join(examples)
return "\n\n".join(examples)
def _generate_examples_from_github(self, routing_keywords: Dict[str, List[str]]) -> str:
def _generate_examples_from_github(self, routing_keywords: dict[str, list[str]]) -> str:
"""
Generate examples from real GitHub issue titles.
@@ -402,7 +403,7 @@ compatibility: {compatibility}
return self._generate_dynamic_examples(routing_keywords)
examples = []
common_problems = self.github_issues.get('common_problems', [])
common_problems = self.github_issues.get("common_problems", [])
if not common_problems:
return self._generate_dynamic_examples(routing_keywords)
@@ -414,29 +415,26 @@ compatibility: {compatibility}
# Find first issue matching this skill's keywords
for issue in common_problems:
issue_labels = [label.lower() for label in issue.get('labels', [])]
issue_labels = [label.lower() for label in issue.get("labels", [])]
if any(label in skill_keywords_lower for label in issue_labels):
matched_issue = issue
common_problems.remove(issue) # Don't reuse same issue
break
if matched_issue:
title = matched_issue.get('title', '')
title = matched_issue.get("title", "")
question = self._convert_issue_to_question(title)
examples.append(
f'**Q:** "{question}"\n'
f'**A:** Activates {skill_name} skill'
)
examples.append(f'**Q:** "{question}"\n**A:** Activates {skill_name} skill')
else:
# Fallback to keyword-based example for this skill
topic = self._extract_topic_from_skill(skill_name)
keyword = keywords[0] if keywords else topic
examples.append(
f'**Q:** "Working with {keyword} in {self.router_name.title()}"\n'
f'**A:** Activates {skill_name} skill'
f"**A:** Activates {skill_name} skill"
)
return '\n\n'.join(examples) if examples else self._generate_dynamic_examples(routing_keywords)
return "\n\n".join(examples) if examples else self._generate_dynamic_examples(routing_keywords)
def _convert_issue_to_question(self, issue_title: str) -> str:
"""
@@ -456,24 +454,24 @@ compatibility: {compatibility}
title_lower = issue_title.lower()
# Pattern 1: Error/Failure issues
if 'fail' in title_lower or 'error' in title_lower or 'issue' in title_lower:
cleaned = issue_title.replace(' fails', '').replace(' errors', '').replace(' issue', '')
if "fail" in title_lower or "error" in title_lower or "issue" in title_lower:
cleaned = issue_title.replace(" fails", "").replace(" errors", "").replace(" issue", "")
return f"How do I fix {cleaned.lower()}?"
# Pattern 2: Documentation requests
if 'documentation' in title_lower or 'docs' in title_lower:
cleaned = issue_title.replace(' documentation', '').replace(' docs', '')
if "documentation" in title_lower or "docs" in title_lower:
cleaned = issue_title.replace(" documentation", "").replace(" docs", "")
return f"How do I use {cleaned.lower()}?"
# Pattern 3: Feature requests
if title_lower.startswith('add ') or title_lower.startswith('added '):
feature = issue_title.replace('Add ', '').replace('Added ', '')
if title_lower.startswith("add ") or title_lower.startswith("added "):
feature = issue_title.replace("Add ", "").replace("Added ", "")
return f"How do I implement {feature.lower()}?"
# Default: Generic question
return f"How do I handle {issue_title.lower()}?"
def _extract_common_patterns(self) -> List[Dict[str, str]]:
def _extract_common_patterns(self) -> list[dict[str, str]]:
"""
Extract problem-solution patterns from closed GitHub issues.
@@ -487,25 +485,21 @@ compatibility: {compatibility}
if not self.github_issues:
return []
known_solutions = self.github_issues.get('known_solutions', [])
known_solutions = self.github_issues.get("known_solutions", [])
if not known_solutions:
return []
patterns = []
# Top 5 closed issues with most engagement (comments indicate usefulness)
top_solutions = sorted(known_solutions, key=lambda x: x.get('comments', 0), reverse=True)[:5]
top_solutions = sorted(known_solutions, key=lambda x: x.get("comments", 0), reverse=True)[:5]
for issue in top_solutions:
title = issue.get('title', '')
number = issue.get('number', 0)
title = issue.get("title", "")
number = issue.get("number", 0)
problem, solution = self._parse_issue_pattern(title)
patterns.append({
'problem': problem,
'solution': solution,
'issue_number': number
})
patterns.append({"problem": problem, "solution": solution, "issue_number": number})
return patterns
@@ -530,24 +524,24 @@ compatibility: {compatibility}
title_lower = issue_title.lower()
# Pattern 1: "Fixed X" → "X not working" / "See fix"
if title_lower.startswith('fixed ') or title_lower.startswith('fix '):
problem_text = issue_title.replace('Fixed ', '').replace('Fix ', '')
if title_lower.startswith("fixed ") or title_lower.startswith("fix "):
problem_text = issue_title.replace("Fixed ", "").replace("Fix ", "")
return (f"{problem_text} not working", "See fix implementation details")
# Pattern 2: "Resolved X" → "X issue" / "See resolution"
if title_lower.startswith('resolved ') or title_lower.startswith('resolve '):
problem_text = issue_title.replace('Resolved ', '').replace('Resolve ', '')
if title_lower.startswith("resolved ") or title_lower.startswith("resolve "):
problem_text = issue_title.replace("Resolved ", "").replace("Resolve ", "")
return (f"{problem_text} issue", "See resolution approach")
# Pattern 3: "Added X" → "Missing X" / "Use X"
if title_lower.startswith('added ') or title_lower.startswith('add '):
feature_text = issue_title.replace('Added ', '').replace('Add ', '')
if title_lower.startswith("added ") or title_lower.startswith("add "):
feature_text = issue_title.replace("Added ", "").replace("Add ", "")
return (f"Missing {feature_text}", f"Use {feature_text} feature")
# Default: Use title as-is
return (issue_title, "See issue for solution details")
def _detect_framework(self) -> Optional[str]:
def _detect_framework(self) -> str | None:
"""
Detect framework from router name and GitHub metadata.
@@ -561,14 +555,14 @@ compatibility: {compatibility}
router_lower = self.router_name.lower()
framework_keywords = {
'fastapi': 'fastapi',
'django': 'django',
'flask': 'flask',
'react': 'react',
'vue': 'vue',
'express': 'express',
'fastmcp': 'fastmcp',
'mcp': 'fastmcp',
"fastapi": "fastapi",
"django": "django",
"flask": "flask",
"react": "react",
"vue": "vue",
"express": "express",
"fastmcp": "fastmcp",
"mcp": "fastmcp",
}
# Check router name first
@@ -578,7 +572,7 @@ compatibility: {compatibility}
# Check GitHub description if available
if self.github_metadata:
description = self.github_metadata.get('description', '').lower()
description = self.github_metadata.get("description", "").lower()
for keyword, framework in framework_keywords.items():
if keyword in description:
return framework
@@ -599,7 +593,7 @@ compatibility: {compatibility}
Formatted Quick Start section with install + hello world code
"""
templates = {
'fastapi': """## Quick Start
"fastapi": """## Quick Start
```bash
pip install fastapi uvicorn
@@ -617,7 +611,7 @@ def read_root():
# Run: uvicorn main:app --reload
```
""",
'fastmcp': """## Quick Start
"fastmcp": """## Quick Start
```bash
pip install fastmcp
@@ -633,7 +627,7 @@ def greet(name: str) -> str:
return f"Hello, {name}!"
```
""",
'django': """## Quick Start
"django": """## Quick Start
```bash
pip install django
@@ -644,7 +638,7 @@ python manage.py runserver
Visit http://127.0.0.1:8000/ to see your Django app.
""",
'react': """## Quick Start
"react": """## Quick Start
```bash
npx create-react-app my-app
@@ -677,16 +671,16 @@ export default App;
all_topics = []
for config in self.configs:
desc = config.get('description', '')
desc = config.get("description", "")
# Extract key topics from description (simple comma-separated extraction)
topics = [topic.strip() for topic in desc.split(',') if topic.strip()]
topics = [topic.strip() for topic in desc.split(",") if topic.strip()]
all_topics.extend(topics[:2]) # Max 2 topics per skill
# Deduplicate and take top 5-7 topics
unique_topics = list(dict.fromkeys(all_topics))[:7]
if not unique_topics:
return f'Use when working with {self.router_name} development and programming'
return f"Use when working with {self.router_name} development and programming"
# Format as user-friendly bulleted list
description = f"""Use this skill when working with:
@@ -695,8 +689,8 @@ export default App;
for topic in unique_topics:
# Clean up topic text (remove "when working with" prefixes if present)
topic = topic.replace('when working with', '').strip()
topic = topic.replace('Use when', '').strip()
topic = topic.replace("when working with", "").strip()
topic = topic.replace("Use when", "").strip()
if topic:
description += f"- {topic}\n"
@@ -721,7 +715,10 @@ export default App;
# NEW: Generate comprehensive description from all sub-skills
when_to_use = self._generate_comprehensive_description()
skill_md = frontmatter + "\n\n" + f"""# {self.router_name.replace('-', ' ').title()} Documentation
skill_md = (
frontmatter
+ "\n\n"
+ f"""# {self.router_name.replace("-", " ").title()} Documentation
## When to Use This Skill
@@ -730,26 +727,27 @@ export default App;
This is a router skill that directs your questions to specialized sub-skills for efficient, focused assistance.
"""
)
# Phase 4: Add GitHub repository metadata
if self.github_metadata:
# NEW: Use html_url from GitHub metadata instead of base_url from config
repo_url = self.github_metadata.get('html_url', '')
stars = self.github_metadata.get('stars', 0)
language = self.github_metadata.get('language', 'Unknown')
description = self.github_metadata.get('description', '')
repo_url = self.github_metadata.get("html_url", "")
stars = self.github_metadata.get("stars", 0)
language = self.github_metadata.get("language", "Unknown")
description = self.github_metadata.get("description", "")
skill_md += f"""## Repository Info
**Repository:** {repo_url}
**Stars:** ⭐ {stars:,} | **Language:** {language}
{f'**Description:** {description}' if description else ''}
{f"**Description:** {description}" if description else ""}
"""
# Phase 4: Add Quick Start from README
if self.github_docs and self.github_docs.get('readme'):
readme = self.github_docs['readme']
if self.github_docs and self.github_docs.get("readme"):
readme = self.github_docs["readme"]
# NEW: Clean HTML and extract meaningful content
quick_start = self._extract_clean_readme_section(readme)
@@ -768,14 +766,20 @@ This is a router skill that directs your questions to specialized sub-skills for
if framework:
hello_world = self._get_framework_hello_world(framework)
if hello_world:
skill_md += hello_world + "\n*Note: Generic template. See references/getting_started.md for project-specific setup.*\n\n"
skill_md += (
hello_world
+ "\n*Note: Generic template. See references/getting_started.md for project-specific setup.*\n\n"
)
else:
# No README available - try framework fallback
framework = self._detect_framework()
if framework:
hello_world = self._get_framework_hello_world(framework)
if hello_world:
skill_md += hello_world + "\n*Note: Generic template. Check repository for specific installation instructions.*\n\n"
skill_md += (
hello_world
+ "\n*Note: Generic template. Check repository for specific installation instructions.*\n\n"
)
skill_md += """## How It Works
@@ -785,11 +789,11 @@ This skill analyzes your question and activates the appropriate specialized skil
# List sub-skills
for config in self.configs:
name = config['name']
desc = config.get('description', '')
name = config["name"]
desc = config.get("description", "")
# Remove router name prefix from description if present
if desc.startswith(f"{self.router_name.title()} -"):
desc = desc.split(' - ', 1)[1]
desc = desc.split(" - ", 1)[1]
skill_md += f"### {name}\n{desc}\n\n"
@@ -808,7 +812,7 @@ The router analyzes your question for topic keywords and activates relevant skil
skill_md += f"- {keyword_str} → **{skill_name}**\n"
# Quick reference
skill_md += f"""
skill_md += """
## Quick Reference
@@ -839,7 +843,7 @@ For quick answers, this router provides basic overview information. For detailed
# Phase 4: Add Common Issues from GitHub (Summary with Reference)
if self.github_issues:
common_problems = self.github_issues.get('common_problems', [])[:5] # Top 5
common_problems = self.github_issues.get("common_problems", [])[:5] # Top 5
if common_problems:
skill_md += """
@@ -850,9 +854,9 @@ Top 5 GitHub issues from the community:
"""
for i, issue in enumerate(common_problems, 1):
title = issue.get('title', '')
number = issue.get('number', 0)
comments = issue.get('comments', 0)
title = issue.get("title", "")
number = issue.get("number", 0)
comments = issue.get("comments", 0)
skill_md += f"{i}. **{title}** (Issue #{number}, {comments} comments)\n"
@@ -871,9 +875,9 @@ Problem-solution patterns from resolved GitHub issues:
"""
for i, pattern in enumerate(patterns, 1):
problem = pattern['problem']
solution = pattern['solution']
issue_num = pattern['issue_number']
problem = pattern["problem"]
solution = pattern["solution"]
issue_num = pattern["issue_number"]
skill_md += f"**Pattern {i}**: {problem}\n"
skill_md += f"→ **Solution**: {solution} ([Issue #{issue_num}](references/github_issues.md))\n\n"
@@ -888,10 +892,10 @@ Detailed documentation available in:
"""
if self.github_issues:
skill_md += "- `references/github_issues.md` - Community problems and solutions\n"
if self.github_docs and self.github_docs.get('readme'):
if self.github_docs and self.github_docs.get("readme"):
skill_md += "- `references/getting_started.md` - Detailed setup guide\n"
skill_md += f"""
skill_md += """
## Need Help?
@@ -904,7 +908,7 @@ Simply ask your question and mention the topic. The router will find the right s
return skill_md
def generate_subskill_issues_section(self, skill_name: str, topics: List[str]) -> str:
def generate_subskill_issues_section(self, skill_name: str, topics: list[str]) -> str:
"""
Generate "Common Issues" section for a sub-skill (Phase 4).
@@ -918,8 +922,8 @@ Simply ask your question and mention the topic. The router will find the right s
if not self.github_issues or not categorize_issues_by_topic:
return ""
common_problems = self.github_issues.get('common_problems', [])
known_solutions = self.github_issues.get('known_solutions', [])
common_problems = self.github_issues.get("common_problems", [])
known_solutions = self.github_issues.get("known_solutions", [])
# Categorize issues by topic
categorized = categorize_issues_by_topic(common_problems, known_solutions, topics)
@@ -944,11 +948,11 @@ GitHub issues related to this topic:
issues_md += f"\n### {topic.title()}\n\n"
for issue in issues[:3]: # Top 3 per topic
title = issue.get('title', '')
number = issue.get('number', 0)
state = issue.get('state', 'unknown')
comments = issue.get('comments', 0)
labels = issue.get('labels', [])
title = issue.get("title", "")
number = issue.get("number", 0)
state = issue.get("state", "unknown")
comments = issue.get("comments", 0)
labels = issue.get("labels", [])
# Format issue
state_icon = "🔴" if state == "open" else ""
@@ -964,21 +968,24 @@ GitHub issues related to this topic:
return issues_md
def create_router_config(self) -> Dict[str, Any]:
def create_router_config(self) -> dict[str, Any]:
"""Create router configuration"""
routing_keywords = self.extract_routing_keywords()
router_config = {
"name": self.router_name,
"description": self.base_config.get('description', f'Use when working with {self.router_name} documentation (router for multiple sub-skills)'),
"base_url": self.base_config['base_url'],
"selectors": self.base_config.get('selectors', {}),
"url_patterns": self.base_config.get('url_patterns', {}),
"rate_limit": self.base_config.get('rate_limit', 0.5),
"description": self.base_config.get(
"description",
f"Use when working with {self.router_name} documentation (router for multiple sub-skills)",
),
"base_url": self.base_config["base_url"],
"selectors": self.base_config.get("selectors", {}),
"url_patterns": self.base_config.get("url_patterns", {}),
"rate_limit": self.base_config.get("rate_limit", 0.5),
"max_pages": 500, # Router only scrapes overview pages
"_router": True,
"_sub_skills": [cfg['name'] for cfg in self.configs],
"_routing_keywords": routing_keywords
"_sub_skills": [cfg["name"] for cfg in self.configs],
"_routing_keywords": routing_keywords,
}
return router_config
@@ -993,34 +1000,38 @@ GitHub issues related to this topic:
md = "# Common GitHub Issues\n\n"
md += "Top issues reported by the community:\n\n"
common_problems = self.github_issues.get('common_problems', [])[:10] if self.github_issues else []
known_solutions = self.github_issues.get('known_solutions', [])[:10] if self.github_issues else []
common_problems = self.github_issues.get("common_problems", [])[:10] if self.github_issues else []
known_solutions = self.github_issues.get("known_solutions", [])[:10] if self.github_issues else []
if common_problems:
md += "## Open Issues (Common Problems)\n\n"
for i, issue in enumerate(common_problems, 1):
title = issue.get('title', '')
number = issue.get('number', 0)
comments = issue.get('comments', 0)
labels = issue.get('labels', [])
title = issue.get("title", "")
number = issue.get("number", 0)
comments = issue.get("comments", 0)
labels = issue.get("labels", [])
if isinstance(labels, list):
labels_str = ', '.join(str(label) for label in labels)
labels_str = ", ".join(str(label) for label in labels)
else:
labels_str = str(labels) if labels else ''
labels_str = str(labels) if labels else ""
md += f"### {i}. {title}\n\n"
md += f"**Issue**: #{number}\n"
md += f"**Comments**: {comments}\n"
if labels_str:
md += f"**Labels**: {labels_str}\n"
md += f"**Link**: https://github.com/{self.github_metadata.get('html_url', '').replace('https://github.com/', '')}/issues/{number}\n\n" if self.github_metadata else "\n\n"
md += (
f"**Link**: https://github.com/{self.github_metadata.get('html_url', '').replace('https://github.com/', '')}/issues/{number}\n\n"
if self.github_metadata
else "\n\n"
)
if known_solutions:
md += "\n## Closed Issues (Known Solutions)\n\n"
for i, issue in enumerate(known_solutions, 1):
title = issue.get('title', '')
number = issue.get('number', 0)
comments = issue.get('comments', 0)
title = issue.get("title", "")
number = issue.get("number", 0)
comments = issue.get("comments", 0)
md += f"### {i}. {title}\n\n"
md += f"**Issue**: #{number} (Closed)\n"
@@ -1042,8 +1053,8 @@ GitHub issues related to this topic:
md = "# Getting Started\n\n"
md += "*Extracted from project README*\n\n"
if self.github_docs and self.github_docs.get('readme'):
readme = self.github_docs['readme']
if self.github_docs and self.github_docs.get("readme"):
readme = self.github_docs["readme"]
# Clean and extract full quick start section (up to 2000 chars)
cleaner = MarkdownCleaner()
@@ -1069,16 +1080,16 @@ GitHub issues related to this topic:
# 1. GitHub Issues Reference
if self.github_issues:
issues_md = self._generate_github_issues_reference()
with open(references_dir / 'github_issues.md', 'w') as f:
with open(references_dir / "github_issues.md", "w") as f:
f.write(issues_md)
# 2. Getting Started Reference
if self.github_docs and self.github_docs.get('readme'):
if self.github_docs and self.github_docs.get("readme"):
getting_started_md = self._generate_getting_started_reference()
with open(references_dir / 'getting_started.md', 'w') as f:
with open(references_dir / "getting_started.md", "w") as f:
f.write(getting_started_md)
def generate(self, output_dir: Path = None) -> Tuple[Path, Path]:
def generate(self, output_dir: Path = None) -> tuple[Path, Path]:
"""Generate router skill and config with progressive disclosure"""
if output_dir is None:
output_dir = self.config_paths[0].parent
@@ -1090,11 +1101,11 @@ GitHub issues related to this topic:
skill_path = output_dir.parent / f"output/{self.router_name}/SKILL.md"
skill_path.parent.mkdir(parents=True, exist_ok=True)
with open(skill_path, 'w') as f:
with open(skill_path, "w") as f:
f.write(skill_md)
# NEW: Create references/ directory and generate reference files
references_dir = skill_path.parent / 'references'
references_dir = skill_path.parent / "references"
references_dir.mkdir(parents=True, exist_ok=True)
self._generate_reference_files(references_dir)
@@ -1102,7 +1113,7 @@ GitHub issues related to this topic:
router_config = self.create_router_config()
config_path = output_dir / f"{self.router_name}.json"
with open(config_path, 'w') as f:
with open(config_path, "w") as f:
json.dump(router_config, f, indent=2)
return config_path, skill_path
@@ -1125,24 +1136,14 @@ Examples:
# Custom output directory
python3 generate_router.py configs/godot-*.json --output-dir configs/routers/
"""
""",
)
parser.add_argument(
'configs',
nargs='+',
help='Sub-skill config files'
)
parser.add_argument("configs", nargs="+", help="Sub-skill config files")
parser.add_argument(
'--name',
help='Router skill name (default: inferred from sub-skills)'
)
parser.add_argument("--name", help="Router skill name (default: inferred from sub-skills)")
parser.add_argument(
'--output-dir',
help='Output directory (default: same as input configs)'
)
parser.add_argument("--output-dir", help="Output directory (default: same as input configs)")
args = parser.parse_args()
@@ -1150,16 +1151,16 @@ Examples:
config_files = []
for path_str in args.configs:
path = Path(path_str)
if path.exists() and not path.stem.endswith('-router'):
if path.exists() and not path.stem.endswith("-router"):
config_files.append(path_str)
if not config_files:
print("❌ Error: No valid config files provided")
sys.exit(1)
print(f"\n{'='*60}")
print(f"\n{'=' * 60}")
print("ROUTER SKILL GENERATOR")
print(f"{'='*60}")
print(f"{'=' * 60}")
print(f"Sub-skills: {len(config_files)}")
for cfg in config_files:
print(f" - {Path(cfg).stem}")
@@ -1172,11 +1173,11 @@ Examples:
print(f"✅ Router config created: {config_path}")
print(f"✅ Router SKILL.md created: {skill_path}")
print("")
print(f"{'='*60}")
print(f"{'=' * 60}")
print("NEXT STEPS")
print(f"{'='*60}")
print(f"{'=' * 60}")
print(f"1. Review router SKILL.md: {skill_path}")
print(f"2. Optionally scrape router (for overview pages):")
print("2. Optionally scrape router (for overview pages):")
print(f" skill-seekers scrape --config {config_path}")
print("3. Package router skill:")
print(f" skill-seekers package output/{generator.router_name}/")

View File

@@ -12,43 +12,47 @@ This is the foundation of the unified codebase analyzer architecture.
import os
import subprocess
import tempfile
from collections import Counter
from dataclasses import dataclass
from pathlib import Path
from typing import List, Dict, Optional, Tuple
from collections import Counter
import requests
from .rate_limit_handler import RateLimitHandler, RateLimitError, create_github_headers
from .config_manager import get_config_manager
from .rate_limit_handler import RateLimitError, RateLimitHandler, create_github_headers
@dataclass
class CodeStream:
    """Code files for C3.x analysis."""

    # Directory the files live in; the fetcher passes the cloned repository path.
    directory: Path
    # Paths of the files that were classified as code.
    files: list[Path]
@dataclass
class DocsStream:
"""Documentation files from repository."""
readme: Optional[str]
contributing: Optional[str]
docs_files: List[Dict] # [{"path": "docs/oauth.md", "content": "..."}]
readme: str | None
contributing: str | None
docs_files: list[dict] # [{"path": "docs/oauth.md", "content": "..."}]
@dataclass
class InsightsStream:
    """GitHub metadata and issues."""

    metadata: dict  # stars, forks, language, etc.
    # Open issues selected as common problems (dicts with title/number/labels/comments/state).
    common_problems: list[dict]
    # Closed issues selected as known solutions (same dict layout).
    known_solutions: list[dict]
    # Most frequent labels, each as {"label": ..., "count": ...}.
    top_labels: list[dict]
@dataclass
class ThreeStreamData:
    """Complete output from GitHub fetcher.

    Groups the code, documentation and insights streams into one value.
    """

    code_stream: CodeStream
    docs_stream: DocsStream
    insights_stream: InsightsStream
@@ -73,11 +77,7 @@ class GitHubThreeStreamFetcher:
"""
def __init__(
self,
repo_url: str,
github_token: Optional[str] = None,
interactive: bool = True,
profile_name: Optional[str] = None
self, repo_url: str, github_token: str | None = None, interactive: bool = True, profile_name: str | None = None
):
"""
Initialize fetcher.
@@ -89,7 +89,7 @@ class GitHubThreeStreamFetcher:
profile_name: Name of the GitHub profile being used
"""
self.repo_url = repo_url
self.github_token = github_token or os.getenv('GITHUB_TOKEN')
self.github_token = github_token or os.getenv("GITHUB_TOKEN")
self.owner, self.repo = self.parse_repo_url(repo_url)
self.interactive = interactive
@@ -99,12 +99,10 @@ class GitHubThreeStreamFetcher:
profile_name = config.get_profile_for_token(self.github_token)
self.rate_limiter = RateLimitHandler(
token=self.github_token,
interactive=interactive,
profile_name=profile_name
token=self.github_token, interactive=interactive, profile_name=profile_name
)
def parse_repo_url(self, url: str) -> Tuple[str, str]:
def parse_repo_url(self, url: str) -> tuple[str, str]:
"""
Parse GitHub URL to extract owner and repo.
@@ -115,18 +113,18 @@ class GitHubThreeStreamFetcher:
Tuple of (owner, repo)
"""
# Remove .git suffix if present
if url.endswith('.git'):
if url.endswith(".git"):
url = url[:-4] # Remove last 4 characters (.git)
# Handle git@ URLs (SSH format)
if url.startswith('git@github.com:'):
parts = url.replace('git@github.com:', '').split('/')
if url.startswith("git@github.com:"):
parts = url.replace("git@github.com:", "").split("/")
if len(parts) >= 2:
return parts[0], parts[1]
# Handle HTTPS URLs
if 'github.com/' in url:
parts = url.split('github.com/')[-1].split('/')
if "github.com/" in url:
parts = url.split("github.com/")[-1].split("/")
if len(parts) >= 2:
return parts[0], parts[1]
@@ -150,18 +148,18 @@ class GitHubThreeStreamFetcher:
raise RateLimitError("Rate limit check failed during startup")
if output_dir is None:
output_dir = Path(tempfile.mkdtemp(prefix='github_fetch_'))
output_dir = Path(tempfile.mkdtemp(prefix="github_fetch_"))
print(f"📦 Cloning {self.repo_url}...")
local_path = self.clone_repo(output_dir)
print(f"🔍 Fetching GitHub metadata...")
print("🔍 Fetching GitHub metadata...")
metadata = self.fetch_github_metadata()
print(f"🐛 Fetching issues...")
print("🐛 Fetching issues...")
issues = self.fetch_issues(max_issues=100)
print(f"📂 Classifying files...")
print("📂 Classifying files...")
code_files, doc_files = self.classify_files(local_path)
print(f" - Code: {len(code_files)} files")
print(f" - Docs: {len(doc_files)} files")
@@ -171,25 +169,22 @@ class GitHubThreeStreamFetcher:
# Build three streams
return ThreeStreamData(
code_stream=CodeStream(
directory=local_path,
files=code_files
),
code_stream=CodeStream(directory=local_path, files=code_files),
docs_stream=DocsStream(
readme=self.read_file(local_path / 'README.md'),
contributing=self.read_file(local_path / 'CONTRIBUTING.md'),
readme=self.read_file(local_path / "README.md"),
contributing=self.read_file(local_path / "CONTRIBUTING.md"),
docs_files=[
{'path': str(f.relative_to(local_path)), 'content': self.read_file(f)}
{"path": str(f.relative_to(local_path)), "content": self.read_file(f)}
for f in doc_files
if f.name not in ['README.md', 'CONTRIBUTING.md']
]
if f.name not in ["README.md", "CONTRIBUTING.md"]
],
),
insights_stream=InsightsStream(
metadata=metadata,
common_problems=issue_insights['common_problems'],
known_solutions=issue_insights['known_solutions'],
top_labels=issue_insights['top_labels']
)
common_problems=issue_insights["common_problems"],
known_solutions=issue_insights["known_solutions"],
top_labels=issue_insights["top_labels"],
),
)
def clone_repo(self, output_dir: Path) -> Path:
@@ -206,7 +201,7 @@ class GitHubThreeStreamFetcher:
repo_dir.mkdir(parents=True, exist_ok=True)
# Clone with depth 1 for speed
cmd = ['git', 'clone', '--depth', '1', self.repo_url, str(repo_dir)]
cmd = ["git", "clone", "--depth", "1", self.repo_url, str(repo_dir)]
result = subprocess.run(cmd, capture_output=True, text=True)
if result.returncode != 0:
@@ -214,7 +209,7 @@ class GitHubThreeStreamFetcher:
return repo_dir
def fetch_github_metadata(self) -> Dict:
def fetch_github_metadata(self) -> dict:
"""
Fetch repo metadata via GitHub API.
@@ -238,35 +233,35 @@ class GitHubThreeStreamFetcher:
data = response.json()
return {
'stars': data.get('stargazers_count', 0),
'forks': data.get('forks_count', 0),
'open_issues': data.get('open_issues_count', 0),
'language': data.get('language', 'Unknown'),
'description': data.get('description', ''),
'homepage': data.get('homepage', ''),
'created_at': data.get('created_at', ''),
'updated_at': data.get('updated_at', ''),
'html_url': data.get('html_url', ''), # NEW: Repository URL
'license': data.get('license', {}) # NEW: License info
"stars": data.get("stargazers_count", 0),
"forks": data.get("forks_count", 0),
"open_issues": data.get("open_issues_count", 0),
"language": data.get("language", "Unknown"),
"description": data.get("description", ""),
"homepage": data.get("homepage", ""),
"created_at": data.get("created_at", ""),
"updated_at": data.get("updated_at", ""),
"html_url": data.get("html_url", ""), # NEW: Repository URL
"license": data.get("license", {}), # NEW: License info
}
except RateLimitError:
raise
except Exception as e:
print(f"⚠️ Failed to fetch metadata: {e}")
return {
'stars': 0,
'forks': 0,
'open_issues': 0,
'language': 'Unknown',
'description': '',
'homepage': '',
'created_at': '',
'updated_at': '',
'html_url': '', # NEW: Repository URL
'license': {} # NEW: License info
"stars": 0,
"forks": 0,
"open_issues": 0,
"language": "Unknown",
"description": "",
"homepage": "",
"created_at": "",
"updated_at": "",
"html_url": "", # NEW: Repository URL
"license": {}, # NEW: License info
}
def fetch_issues(self, max_issues: int = 100) -> List[Dict]:
def fetch_issues(self, max_issues: int = 100) -> list[dict]:
"""
Fetch GitHub issues (open + closed).
@@ -279,14 +274,14 @@ class GitHubThreeStreamFetcher:
all_issues = []
# Fetch open issues
all_issues.extend(self._fetch_issues_page(state='open', max_count=max_issues // 2))
all_issues.extend(self._fetch_issues_page(state="open", max_count=max_issues // 2))
# Fetch closed issues
all_issues.extend(self._fetch_issues_page(state='closed', max_count=max_issues // 2))
all_issues.extend(self._fetch_issues_page(state="closed", max_count=max_issues // 2))
return all_issues
def _fetch_issues_page(self, state: str, max_count: int) -> List[Dict]:
def _fetch_issues_page(self, state: str, max_count: int) -> list[dict]:
"""
Fetch one page of issues.
@@ -304,10 +299,10 @@ class GitHubThreeStreamFetcher:
headers = create_github_headers(self.github_token)
params = {
'state': state,
'per_page': min(max_count, 100), # GitHub API limit
'sort': 'comments',
'direction': 'desc'
"state": state,
"per_page": min(max_count, 100), # GitHub API limit
"sort": "comments",
"direction": "desc",
}
try:
@@ -321,7 +316,7 @@ class GitHubThreeStreamFetcher:
issues = response.json()
# Filter out pull requests (they appear in issues endpoint)
issues = [issue for issue in issues if 'pull_request' not in issue]
issues = [issue for issue in issues if "pull_request" not in issue]
return issues
except RateLimitError:
@@ -330,7 +325,7 @@ class GitHubThreeStreamFetcher:
print(f"⚠️ Failed to fetch {state} issues: {e}")
return []
def classify_files(self, repo_path: Path) -> Tuple[List[Path], List[Path]]:
def classify_files(self, repo_path: Path) -> tuple[list[Path], list[Path]]:
"""
Split files into code vs documentation.
@@ -354,36 +349,61 @@ class GitHubThreeStreamFetcher:
# Documentation patterns
doc_patterns = [
'**/README.md',
'**/CONTRIBUTING.md',
'**/CHANGELOG.md',
'**/LICENSE.md',
'docs/*.md', # Files directly in docs/
'docs/**/*.md', # Files in subdirectories of docs/
'doc/*.md', # Files directly in doc/
'doc/**/*.md', # Files in subdirectories of doc/
'documentation/*.md', # Files directly in documentation/
'documentation/**/*.md', # Files in subdirectories of documentation/
'**/*.rst',
"**/README.md",
"**/CONTRIBUTING.md",
"**/CHANGELOG.md",
"**/LICENSE.md",
"docs/*.md", # Files directly in docs/
"docs/**/*.md", # Files in subdirectories of docs/
"doc/*.md", # Files directly in doc/
"doc/**/*.md", # Files in subdirectories of doc/
"documentation/*.md", # Files directly in documentation/
"documentation/**/*.md", # Files in subdirectories of documentation/
"**/*.rst",
]
# Code extensions
code_extensions = [
'.py', '.js', '.ts', '.jsx', '.tsx',
'.go', '.rs', '.java', '.kt',
'.c', '.cpp', '.h', '.hpp',
'.rb', '.php', '.swift', '.cs',
'.scala', '.clj', '.cljs'
".py",
".js",
".ts",
".jsx",
".tsx",
".go",
".rs",
".java",
".kt",
".c",
".cpp",
".h",
".hpp",
".rb",
".php",
".swift",
".cs",
".scala",
".clj",
".cljs",
]
# Directories to exclude
exclude_dirs = [
'node_modules', '__pycache__', 'venv', '.venv',
'.git', 'build', 'dist', '.tox', '.pytest_cache',
'htmlcov', '.mypy_cache', '.eggs', '*.egg-info'
"node_modules",
"__pycache__",
"venv",
".venv",
".git",
"build",
"dist",
".tox",
".pytest_cache",
"htmlcov",
".mypy_cache",
".eggs",
"*.egg-info",
]
for file_path in repo_path.rglob('*'):
for file_path in repo_path.rglob("*"):
if not file_path.is_file():
continue
@@ -392,8 +412,8 @@ class GitHubThreeStreamFetcher:
continue
# Skip hidden files (but allow docs in docs/ directories)
is_in_docs_dir = any(pattern in str(file_path) for pattern in ['docs/', 'doc/', 'documentation/'])
if any(part.startswith('.') for part in file_path.parts):
is_in_docs_dir = any(pattern in str(file_path) for pattern in ["docs/", "doc/", "documentation/"])
if any(part.startswith(".") for part in file_path.parts):
if not is_in_docs_dir:
continue
@@ -407,7 +427,7 @@ class GitHubThreeStreamFetcher:
return code_files, doc_files
def analyze_issues(self, issues: List[Dict]) -> Dict:
def analyze_issues(self, issues: list[dict]) -> dict:
"""
Analyze GitHub issues to extract insights.
@@ -446,44 +466,41 @@ class GitHubThreeStreamFetcher:
for issue in issues:
# Handle both string labels and dict labels (GitHub API format)
raw_labels = issue.get('labels', [])
raw_labels = issue.get("labels", [])
labels = []
for label in raw_labels:
if isinstance(label, dict):
labels.append(label.get('name', ''))
labels.append(label.get("name", ""))
else:
labels.append(str(label))
all_labels.extend(labels)
issue_data = {
'title': issue.get('title', ''),
'number': issue.get('number', 0),
'labels': labels,
'comments': issue.get('comments', 0),
'state': issue.get('state', 'unknown')
"title": issue.get("title", ""),
"number": issue.get("number", 0),
"labels": labels,
"comments": issue.get("comments", 0),
"state": issue.get("state", "unknown"),
}
# Open issues with many comments = common problems
if issue['state'] == 'open' and issue.get('comments', 0) >= 5:
if issue["state"] == "open" and issue.get("comments", 0) >= 5:
common_problems.append(issue_data)
# Closed issues with comments = known solutions
elif issue['state'] == 'closed' and issue.get('comments', 0) > 0:
elif issue["state"] == "closed" and issue.get("comments", 0) > 0:
known_solutions.append(issue_data)
# Count label frequency
label_counts = Counter(all_labels)
return {
'common_problems': sorted(common_problems, key=lambda x: x['comments'], reverse=True)[:10],
'known_solutions': sorted(known_solutions, key=lambda x: x['comments'], reverse=True)[:10],
'top_labels': [
{'label': label, 'count': count}
for label, count in label_counts.most_common(10)
]
"common_problems": sorted(common_problems, key=lambda x: x["comments"], reverse=True)[:10],
"known_solutions": sorted(known_solutions, key=lambda x: x["comments"], reverse=True)[:10],
"top_labels": [{"label": label, "count": count} for label, count in label_counts.most_common(10)],
}
def read_file(self, file_path: Path) -> Optional[str]:
def read_file(self, file_path: Path) -> str | None:
"""
Read file content safely.
@@ -497,10 +514,10 @@ class GitHubThreeStreamFetcher:
return None
try:
return file_path.read_text(encoding='utf-8')
return file_path.read_text(encoding="utf-8")
except Exception:
# Try with different encoding
try:
return file_path.read_text(encoding='latin-1')
return file_path.read_text(encoding="latin-1")
except Exception:
return None

File diff suppressed because it is too large Load Diff

View File

@@ -20,7 +20,7 @@ import subprocess
import tempfile
from dataclasses import dataclass, field
from pathlib import Path
from typing import Dict, List, Optional, TYPE_CHECKING
from typing import TYPE_CHECKING
# Avoid circular imports by using TYPE_CHECKING
if TYPE_CHECKING:
@@ -40,15 +40,17 @@ else:
@dataclass
class TroubleshootingItem:
    """One troubleshooting entry: a problem plus its symptoms, diagnosis and fix."""

    problem: str
    # default_factory gives every instance its own list instead of a shared one.
    symptoms: list[str] = field(default_factory=list)
    solution: str = ""
    diagnostic_steps: list[str] = field(default_factory=list)
logger = logging.getLogger(__name__)
# Conditional import for Anthropic API
try:
import anthropic
ANTHROPIC_AVAILABLE = True
except ImportError:
ANTHROPIC_AVAILABLE = False
@@ -58,9 +60,10 @@ except ImportError:
@dataclass
class StepEnhancement:
    """Enhanced step information (internal use only)"""

    step_index: int
    explanation: str  # Natural language explanation
    # default_factory gives every instance its own list instead of a shared one.
    variations: list[str] = field(default_factory=list)  # Alternative approaches
class GuideEnhancer:
@@ -81,7 +84,7 @@ class GuideEnhancer:
mode: Enhancement mode - "api", "local", or "auto"
"""
self.mode = self._detect_mode(mode)
self.api_key = os.environ.get('ANTHROPIC_API_KEY')
self.api_key = os.environ.get("ANTHROPIC_API_KEY")
self.client = None
if self.mode == "api":
@@ -119,7 +122,7 @@ class GuideEnhancer:
"""
if requested_mode == "auto":
# Prefer API if key available, else LOCAL
if os.environ.get('ANTHROPIC_API_KEY') and ANTHROPIC_AVAILABLE:
if os.environ.get("ANTHROPIC_API_KEY") and ANTHROPIC_AVAILABLE:
return "api"
elif self._check_claude_cli():
return "local"
@@ -130,17 +133,12 @@ class GuideEnhancer:
def _check_claude_cli(self) -> bool:
"""Check if Claude Code CLI is available."""
try:
result = subprocess.run(
['claude', '--version'],
capture_output=True,
text=True,
timeout=5
)
result = subprocess.run(["claude", "--version"], capture_output=True, text=True, timeout=5)
return result.returncode == 0
except (FileNotFoundError, subprocess.TimeoutExpired):
return False
def enhance_guide(self, guide_data: Dict) -> Dict:
def enhance_guide(self, guide_data: dict) -> dict:
"""
Apply all 5 enhancements to a guide.
@@ -164,7 +162,7 @@ class GuideEnhancer:
logger.info("📝 Returning original guide without enhancement")
return guide_data
def enhance_step_descriptions(self, steps: List[Dict]) -> List[StepEnhancement]:
def enhance_step_descriptions(self, steps: list[dict]) -> list[StepEnhancement]:
"""
Enhancement 1: Add natural language explanations to steps.
@@ -187,17 +185,17 @@ class GuideEnhancer:
data = json.loads(response)
return [
StepEnhancement(
step_index=item.get('step_index', i),
explanation=item.get('explanation', ''),
variations=item.get('variations', [])
step_index=item.get("step_index", i),
explanation=item.get("explanation", ""),
variations=item.get("variations", []),
)
for i, item in enumerate(data.get('step_descriptions', []))
for i, item in enumerate(data.get("step_descriptions", []))
]
except (json.JSONDecodeError, KeyError) as e:
logger.warning(f"⚠️ Failed to parse step descriptions: {e}")
return []
def enhance_troubleshooting(self, guide_data: Dict) -> List[TroubleshootingItem]:
def enhance_troubleshooting(self, guide_data: dict) -> list[TroubleshootingItem]:
"""
Enhancement 2: Generate diagnostic flows + solutions.
@@ -220,18 +218,18 @@ class GuideEnhancer:
data = json.loads(response)
return [
TroubleshootingItem(
problem=item.get('problem', ''),
symptoms=item.get('symptoms', []),
diagnostic_steps=item.get('diagnostic_steps', []),
solution=item.get('solution', '')
problem=item.get("problem", ""),
symptoms=item.get("symptoms", []),
diagnostic_steps=item.get("diagnostic_steps", []),
solution=item.get("solution", ""),
)
for item in data.get('troubleshooting', [])
for item in data.get("troubleshooting", [])
]
except (json.JSONDecodeError, KeyError) as e:
logger.warning(f"⚠️ Failed to parse troubleshooting items: {e}")
return []
def enhance_prerequisites(self, prereqs: List[str]) -> List[PrerequisiteItem]:
def enhance_prerequisites(self, prereqs: list[str]) -> list[PrerequisiteItem]:
"""
Enhancement 3: Explain why prerequisites are needed.
@@ -253,18 +251,14 @@ class GuideEnhancer:
try:
data = json.loads(response)
return [
PrerequisiteItem(
name=item.get('name', ''),
why=item.get('why', ''),
setup=item.get('setup', '')
)
for item in data.get('prerequisites_detailed', [])
PrerequisiteItem(name=item.get("name", ""), why=item.get("why", ""), setup=item.get("setup", ""))
for item in data.get("prerequisites_detailed", [])
]
except (json.JSONDecodeError, KeyError) as e:
logger.warning(f"⚠️ Failed to parse prerequisites: {e}")
return []
def enhance_next_steps(self, guide_data: Dict) -> List[str]:
def enhance_next_steps(self, guide_data: dict) -> list[str]:
"""
Enhancement 4: Suggest related guides and variations.
@@ -285,12 +279,12 @@ class GuideEnhancer:
try:
data = json.loads(response)
return data.get('next_steps', [])
return data.get("next_steps", [])
except (json.JSONDecodeError, KeyError) as e:
logger.warning(f"⚠️ Failed to parse next steps: {e}")
return []
def enhance_use_cases(self, guide_data: Dict) -> List[str]:
def enhance_use_cases(self, guide_data: dict) -> list[str]:
"""
Enhancement 5: Generate real-world scenario examples.
@@ -311,14 +305,14 @@ class GuideEnhancer:
try:
data = json.loads(response)
return data.get('use_cases', [])
return data.get("use_cases", [])
except (json.JSONDecodeError, KeyError) as e:
logger.warning(f"⚠️ Failed to parse use cases: {e}")
return []
# === AI Call Methods ===
def _call_ai(self, prompt: str, max_tokens: int = 4000) -> Optional[str]:
def _call_ai(self, prompt: str, max_tokens: int = 4000) -> str | None:
"""
Call AI with the given prompt.
@@ -335,7 +329,7 @@ class GuideEnhancer:
return self._call_claude_local(prompt)
return None
def _call_claude_api(self, prompt: str, max_tokens: int = 4000) -> Optional[str]:
def _call_claude_api(self, prompt: str, max_tokens: int = 4000) -> str | None:
"""
Call Claude API.
@@ -351,16 +345,14 @@ class GuideEnhancer:
try:
response = self.client.messages.create(
model="claude-sonnet-4-20250514",
max_tokens=max_tokens,
messages=[{"role": "user", "content": prompt}]
model="claude-sonnet-4-20250514", max_tokens=max_tokens, messages=[{"role": "user", "content": prompt}]
)
return response.content[0].text
except Exception as e:
logger.warning(f"⚠️ Claude API call failed: {e}")
return None
def _call_claude_local(self, prompt: str) -> Optional[str]:
def _call_claude_local(self, prompt: str) -> str | None:
"""
Call Claude Code CLI.
@@ -372,16 +364,16 @@ class GuideEnhancer:
"""
try:
# Create temporary prompt file
with tempfile.NamedTemporaryFile(mode='w', suffix='.md', delete=False) as f:
with tempfile.NamedTemporaryFile(mode="w", suffix=".md", delete=False) as f:
f.write(prompt)
prompt_file = f.name
# Run claude CLI
result = subprocess.run(
['claude', prompt_file],
["claude", prompt_file],
capture_output=True,
text=True,
timeout=300 # 5 min timeout
timeout=300, # 5 min timeout
)
# Clean up prompt file
@@ -399,7 +391,7 @@ class GuideEnhancer:
# === Prompt Creation Methods ===
def _enhance_via_api(self, guide_data: Dict) -> Dict:
def _enhance_via_api(self, guide_data: dict) -> dict:
"""
Enhance guide via API mode.
@@ -417,7 +409,7 @@ class GuideEnhancer:
return self._parse_enhancement_response(response, guide_data)
def _enhance_via_local(self, guide_data: Dict) -> Dict:
def _enhance_via_local(self, guide_data: dict) -> dict:
"""
Enhance guide via LOCAL mode.
@@ -435,7 +427,7 @@ class GuideEnhancer:
return self._parse_enhancement_response(response, guide_data)
def _create_enhancement_prompt(self, guide_data: Dict) -> str:
def _create_enhancement_prompt(self, guide_data: dict) -> str:
"""
Create comprehensive enhancement prompt for all 5 enhancements.
@@ -445,13 +437,13 @@ class GuideEnhancer:
Returns:
Complete prompt text
"""
title = guide_data.get('title', 'Unknown Guide')
steps = guide_data.get('steps', [])
language = guide_data.get('language', 'python')
prerequisites = guide_data.get('prerequisites', [])
title = guide_data.get("title", "Unknown Guide")
steps = guide_data.get("steps", [])
language = guide_data.get("language", "python")
prerequisites = guide_data.get("prerequisites", [])
steps_text = self._format_steps_for_prompt(steps)
prereqs_text = ', '.join(prerequisites) if prerequisites else 'None specified'
prereqs_text = ", ".join(prerequisites) if prerequisites else "None specified"
prompt = f"""I need you to enhance this how-to guide with 5 improvements:
@@ -528,7 +520,7 @@ IMPORTANT: Return ONLY valid JSON, no markdown code blocks or extra text.
"""
return prompt
def _create_step_description_prompt(self, steps: List[Dict]) -> str:
def _create_step_description_prompt(self, steps: list[dict]) -> str:
"""Create prompt for step descriptions only."""
steps_text = self._format_steps_for_prompt(steps)
return f"""Generate natural language explanations for these code steps:
@@ -546,11 +538,11 @@ Return JSON:
IMPORTANT: Return ONLY valid JSON.
"""
def _create_troubleshooting_prompt(self, guide_data: Dict) -> str:
def _create_troubleshooting_prompt(self, guide_data: dict) -> str:
"""Create prompt for troubleshooting items."""
title = guide_data.get('title', 'Unknown')
language = guide_data.get('language', 'python')
steps = guide_data.get('steps', [])
title = guide_data.get("title", "Unknown")
language = guide_data.get("language", "python")
steps = guide_data.get("steps", [])
steps_text = self._format_steps_for_prompt(steps)
return f"""Generate troubleshooting guidance for this {language} workflow:
@@ -575,9 +567,9 @@ Return JSON with 3-5 common errors:
IMPORTANT: Return ONLY valid JSON.
"""
def _create_prerequisites_prompt(self, prereqs: List[str]) -> str:
def _create_prerequisites_prompt(self, prereqs: list[str]) -> str:
"""Create prompt for prerequisites enhancement."""
prereqs_text = ', '.join(prereqs)
prereqs_text = ", ".join(prereqs)
return f"""Explain why these prerequisites are needed and how to install them:
Prerequisites: {prereqs_text}
@@ -593,9 +585,9 @@ Return JSON:
IMPORTANT: Return ONLY valid JSON.
"""
def _create_next_steps_prompt(self, guide_data: Dict) -> str:
def _create_next_steps_prompt(self, guide_data: dict) -> str:
"""Create prompt for next steps suggestions."""
title = guide_data.get('title', 'Unknown')
title = guide_data.get("title", "Unknown")
return f"""Suggest 3-5 related guides and learning paths after completing: {title}
Return JSON:
@@ -610,10 +602,10 @@ Return JSON:
IMPORTANT: Return ONLY valid JSON.
"""
def _create_use_cases_prompt(self, guide_data: Dict) -> str:
def _create_use_cases_prompt(self, guide_data: dict) -> str:
"""Create prompt for use case examples."""
title = guide_data.get('title', 'Unknown')
description = guide_data.get('description', '')
title = guide_data.get("title", "Unknown")
description = guide_data.get("description", "")
return f"""Generate 2-3 real-world use cases for this guide:
@@ -632,23 +624,23 @@ Return JSON:
IMPORTANT: Return ONLY valid JSON.
"""
def _format_steps_for_prompt(self, steps: List[Dict]) -> str:
def _format_steps_for_prompt(self, steps: list[dict]) -> str:
"""Format steps for inclusion in prompts."""
if not steps:
return "No steps provided"
formatted = []
for i, step in enumerate(steps):
desc = step.get('description', '')
code = step.get('code', '')
desc = step.get("description", "")
code = step.get("code", "")
if code:
formatted.append(f"Step {i+1}: {desc}\n```\n{code}\n```")
formatted.append(f"Step {i + 1}: {desc}\n```\n{code}\n```")
else:
formatted.append(f"Step {i+1}: {desc}")
formatted.append(f"Step {i + 1}: {desc}")
return "\n\n".join(formatted)
def _parse_enhancement_response(self, response: str, guide_data: Dict) -> Dict:
def _parse_enhancement_response(self, response: str, guide_data: dict) -> dict:
"""
Parse AI enhancement response.
@@ -661,8 +653,8 @@ IMPORTANT: Return ONLY valid JSON.
"""
try:
# Try to extract JSON from response (in case there's extra text)
json_start = response.find('{')
json_end = response.rfind('}') + 1
json_start = response.find("{")
json_end = response.rfind("}") + 1
if json_start >= 0 and json_end > json_start:
json_text = response[json_start:json_end]
data = json.loads(json_text)
@@ -673,46 +665,42 @@ IMPORTANT: Return ONLY valid JSON.
enhanced = guide_data.copy()
# Step descriptions
if 'step_descriptions' in data:
enhanced['step_enhancements'] = [
if "step_descriptions" in data:
enhanced["step_enhancements"] = [
StepEnhancement(
step_index=item.get('step_index', i),
explanation=item.get('explanation', ''),
variations=item.get('variations', [])
step_index=item.get("step_index", i),
explanation=item.get("explanation", ""),
variations=item.get("variations", []),
)
for i, item in enumerate(data['step_descriptions'])
for i, item in enumerate(data["step_descriptions"])
]
# Troubleshooting
if 'troubleshooting' in data:
enhanced['troubleshooting_detailed'] = [
if "troubleshooting" in data:
enhanced["troubleshooting_detailed"] = [
TroubleshootingItem(
problem=item.get('problem', ''),
symptoms=item.get('symptoms', []),
diagnostic_steps=item.get('diagnostic_steps', []),
solution=item.get('solution', '')
problem=item.get("problem", ""),
symptoms=item.get("symptoms", []),
diagnostic_steps=item.get("diagnostic_steps", []),
solution=item.get("solution", ""),
)
for item in data['troubleshooting']
for item in data["troubleshooting"]
]
# Prerequisites
if 'prerequisites_detailed' in data:
enhanced['prerequisites_detailed'] = [
PrerequisiteItem(
name=item.get('name', ''),
why=item.get('why', ''),
setup=item.get('setup', '')
)
for item in data['prerequisites_detailed']
if "prerequisites_detailed" in data:
enhanced["prerequisites_detailed"] = [
PrerequisiteItem(name=item.get("name", ""), why=item.get("why", ""), setup=item.get("setup", ""))
for item in data["prerequisites_detailed"]
]
# Next steps
if 'next_steps' in data:
enhanced['next_steps_detailed'] = data['next_steps']
if "next_steps" in data:
enhanced["next_steps_detailed"] = data["next_steps"]
# Use cases
if 'use_cases' in data:
enhanced['use_cases'] = data['use_cases']
if "use_cases" in data:
enhanced["use_cases"] = data["use_cases"]
logger.info("✅ Successfully enhanced guide with all 5 improvements")
return enhanced

File diff suppressed because it is too large Load Diff

View File

@@ -26,30 +26,28 @@ Examples:
import argparse
import shutil
import sys
from pathlib import Path
from typing import Dict, Optional, Tuple, Union
from difflib import get_close_matches
from pathlib import Path
# Agent installation paths, keyed by lowercase agent name.
# Global paths (install to home directory): Use ~/.{agent}/skills/
# Project paths (install to current directory): Use .{agent}/skills/
#
# Each agent is listed exactly once. Repeating a key inside a dict literal is
# legal Python but the earlier entry is silently discarded, which hides stale
# or mistyped paths.
AGENT_PATHS: dict[str, str] = {
    "claude": "~/.claude/skills/",  # Global (home)
    "cursor": ".cursor/skills/",  # Project-relative
    "vscode": ".github/skills/",  # Project-relative
    "copilot": ".github/skills/",  # Same as VSCode
    "amp": "~/.amp/skills/",  # Global
    "goose": "~/.config/goose/skills/",  # Global
    "opencode": "~/.opencode/skills/",  # Global
    "letta": "~/.letta/skills/",  # Global
    "aide": "~/.aide/skills/",  # Global
    "windsurf": "~/.windsurf/skills/",  # Global
    "neovate": "~/.neovate/skills/",  # Global
}
def get_agent_path(agent_name: str, project_root: Optional[Path] = None) -> Path:
def get_agent_path(agent_name: str, project_root: Path | None = None) -> Path:
"""
Resolve the installation path for a given agent.
@@ -75,7 +73,7 @@ def get_agent_path(agent_name: str, project_root: Optional[Path] = None) -> Path
path_template = AGENT_PATHS[agent_name]
# Handle home directory expansion (~)
if path_template.startswith('~'):
if path_template.startswith("~"):
return Path(path_template).expanduser()
# Handle project-relative paths
@@ -95,7 +93,7 @@ def get_available_agents() -> list:
return sorted(AGENT_PATHS.keys())
def validate_agent_name(agent_name: str) -> Tuple[bool, Optional[str]]:
def validate_agent_name(agent_name: str) -> tuple[bool, str | None]:
"""
Validate an agent name and provide suggestions if invalid.
@@ -111,7 +109,7 @@ def validate_agent_name(agent_name: str) -> Tuple[bool, Optional[str]]:
- error_message: None if valid, error message with suggestions if invalid
"""
# Special case: 'all' is valid for installing to all agents
if agent_name.lower() == 'all':
if agent_name.lower() == "all":
return True, None
# Case-insensitive check
@@ -130,13 +128,13 @@ def validate_agent_name(agent_name: str) -> Tuple[bool, Optional[str]]:
error_msg += f"Did you mean: {suggestions[0]}?\n\n"
error_msg += "Available agents:\n "
error_msg += ", ".join(available + ['all'])
error_msg += ", ".join(available + ["all"])
error_msg += f"\n\nUsage:\n skill-seekers install-agent <skill_directory> --agent {suggestions[0] if suggestions else 'claude'}"
return False, error_msg
def validate_skill_directory(skill_dir: Path) -> Tuple[bool, Optional[str]]:
def validate_skill_directory(skill_dir: Path) -> tuple[bool, str | None]:
"""
Validate that a directory is a valid skill directory.
@@ -165,11 +163,8 @@ def validate_skill_directory(skill_dir: Path) -> Tuple[bool, Optional[str]]:
def install_to_agent(
skill_dir: Union[str, Path],
agent_name: str,
force: bool = False,
dry_run: bool = False
) -> Tuple[bool, str]:
skill_dir: str | Path, agent_name: str, force: bool = False, dry_run: bool = False
) -> tuple[bool, str]:
"""
Install a skill to a specific agent's directory.
@@ -212,7 +207,7 @@ def install_to_agent(
# Check if already exists
if target_path.exists() and not force:
error_msg = f"❌ Skill already installed\n\n"
error_msg = "❌ Skill already installed\n\n"
error_msg += f"Location: {target_path}\n\n"
error_msg += "Options:\n"
error_msg += f" 1. Overwrite: skill-seekers install-agent {skill_dir} --agent {agent_name} --force\n"
@@ -222,34 +217,34 @@ def install_to_agent(
# Dry run mode - just preview
if dry_run:
msg = f"🔍 DRY RUN - No changes will be made\n\n"
msg = "🔍 DRY RUN - No changes will be made\n\n"
msg += f"Would install skill: {skill_name}\n"
msg += f" Source: {skill_dir}\n"
msg += f" Target: {target_path}\n\n"
# Calculate total size
total_size = sum(f.stat().st_size for f in skill_dir.rglob('*') if f.is_file())
total_size = sum(f.stat().st_size for f in skill_dir.rglob("*") if f.is_file())
msg += f"Files to copy:\n"
msg += "Files to copy:\n"
msg += f" SKILL.md ({(skill_dir / 'SKILL.md').stat().st_size / 1024:.1f} KB)\n"
references_dir = skill_dir / 'references'
references_dir = skill_dir / "references"
if references_dir.exists():
ref_files = list(references_dir.rglob('*.md'))
ref_files = list(references_dir.rglob("*.md"))
ref_size = sum(f.stat().st_size for f in ref_files)
msg += f" references/ ({len(ref_files)} files, {ref_size / 1024:.1f} KB)\n"
for subdir in ['scripts', 'assets']:
for subdir in ["scripts", "assets"]:
subdir_path = skill_dir / subdir
if subdir_path.exists():
files = list(subdir_path.rglob('*'))
files = list(subdir_path.rglob("*"))
if files:
msg += f" {subdir}/ ({len(files)} files)\n"
else:
msg += f" {subdir}/ (empty)\n"
msg += f"\nTotal size: {total_size / 1024:.1f} KB\n\n"
msg += f"To actually install, run:\n"
msg += "To actually install, run:\n"
msg += f" skill-seekers install-agent {skill_dir} --agent {agent_name}"
return True, msg
@@ -258,7 +253,10 @@ def install_to_agent(
try:
agent_base_path.mkdir(parents=True, exist_ok=True)
except PermissionError:
return False, f"❌ Permission denied: {agent_base_path}\n\nTry: sudo mkdir -p {agent_base_path} && sudo chown -R $USER {agent_base_path}"
return (
False,
f"❌ Permission denied: {agent_base_path}\n\nTry: sudo mkdir -p {agent_base_path} && sudo chown -R $USER {agent_base_path}",
)
# Copy skill directory
def ignore_files(directory, files):
@@ -266,16 +264,13 @@ def install_to_agent(
ignored = []
for f in files:
# Exclude backup files
if f.endswith('.backup'):
ignored.append(f)
# Exclude Python cache
elif f == '__pycache__':
ignored.append(f)
# Exclude macOS metadata
elif f == '.DS_Store':
ignored.append(f)
# Exclude hidden files (except .github for vscode)
elif f.startswith('.') and f not in ['.github', '.cursor']:
if (
f.endswith(".backup")
or f == "__pycache__"
or f == ".DS_Store"
or f.startswith(".")
and f not in [".github", ".cursor"]
):
ignored.append(f)
return ignored
@@ -288,16 +283,16 @@ def install_to_agent(
shutil.copytree(skill_dir, target_path, ignore=ignore_files)
# Success message
msg = f"✅ Installation complete!\n\n"
msg = "✅ Installation complete!\n\n"
msg += f"Skill '{skill_name}' installed to {agent_name}\n"
msg += f"Location: {target_path}\n\n"
# Agent-specific restart instructions
if agent_name.lower() == 'claude':
if agent_name.lower() == "claude":
msg += "Restart Claude Code to load the new skill."
elif agent_name.lower() == 'cursor':
elif agent_name.lower() == "cursor":
msg += "Restart Cursor to load the new skill."
elif agent_name.lower() in ['vscode', 'copilot']:
elif agent_name.lower() in ["vscode", "copilot"]:
msg += "Restart VS Code to load the new skill."
else:
msg += f"Restart {agent_name.capitalize()} to load the new skill."
@@ -305,16 +300,17 @@ def install_to_agent(
return True, msg
except PermissionError as e:
return False, f"❌ Permission denied: {e}\n\nTry: sudo mkdir -p {agent_base_path} && sudo chown -R $USER {agent_base_path}"
return (
False,
f"❌ Permission denied: {e}\n\nTry: sudo mkdir -p {agent_base_path} && sudo chown -R $USER {agent_base_path}",
)
except Exception as e:
return False, f"❌ Installation failed: {e}"
def install_to_all_agents(
skill_dir: Union[str, Path],
force: bool = False,
dry_run: bool = False
) -> Dict[str, Tuple[bool, str]]:
skill_dir: str | Path, force: bool = False, dry_run: bool = False
) -> dict[str, tuple[bool, str]]:
"""
Install a skill to all available agents.
@@ -365,31 +361,16 @@ Examples:
Supported agents:
claude, cursor, vscode, copilot, amp, goose, opencode, letta, aide, windsurf, neovate, all
"""
""",
)
parser.add_argument(
"skill_directory",
help="Path to skill directory (e.g., output/react/)"
)
parser.add_argument("skill_directory", help="Path to skill directory (e.g., output/react/)")
parser.add_argument(
"--agent",
required=True,
help="Agent name (use 'all' to install to all agents)"
)
parser.add_argument("--agent", required=True, help="Agent name (use 'all' to install to all agents)")
parser.add_argument(
"--force",
action="store_true",
help="Overwrite existing installation without asking"
)
parser.add_argument("--force", action="store_true", help="Overwrite existing installation without asking")
parser.add_argument(
"--dry-run",
action="store_true",
help="Preview installation without making changes"
)
parser.add_argument("--dry-run", action="store_true", help="Preview installation without making changes")
args = parser.parse_args()
@@ -398,7 +379,7 @@ Supported agents:
skill_name = skill_dir.name
# Handle 'all' agent
if args.agent.lower() == 'all':
if args.agent.lower() == "all":
print(f"\n📋 Installing skill to all agents: {skill_name}\n")
if args.dry_run:
@@ -433,7 +414,7 @@ Supported agents:
skipped_count += 1
# Summary
print(f"\n📊 Summary:")
print("\n📊 Summary:")
if args.dry_run:
print(f" Would install: {installed_count} agents")
else:

View File

@@ -26,8 +26,8 @@ Examples:
skill-seekers install --config react --dry-run
"""
import asyncio
import argparse
import asyncio
import sys
from pathlib import Path
@@ -78,51 +78,35 @@ Phases:
3. AI Enhancement (MANDATORY - no skip option)
4. Package for target platform (ZIP or tar.gz)
5. Upload to target platform (optional)
"""
""",
)
parser.add_argument(
"--config",
required=True,
help="Config name (e.g., 'react') or path (e.g., 'configs/custom.json')"
"--config", required=True, help="Config name (e.g., 'react') or path (e.g., 'configs/custom.json')"
)
parser.add_argument(
"--destination",
default="output",
help="Output directory for skill files (default: output/)"
)
parser.add_argument("--destination", default="output", help="Output directory for skill files (default: output/)")
parser.add_argument("--no-upload", action="store_true", help="Skip automatic upload to Claude")
parser.add_argument(
"--no-upload",
action="store_true",
help="Skip automatic upload to Claude"
"--unlimited", action="store_true", help="Remove page limits during scraping (WARNING: Can take hours)"
)
parser.add_argument(
"--unlimited",
action="store_true",
help="Remove page limits during scraping (WARNING: Can take hours)"
)
parser.add_argument(
"--dry-run",
action="store_true",
help="Preview workflow without executing"
)
parser.add_argument("--dry-run", action="store_true", help="Preview workflow without executing")
parser.add_argument(
"--target",
choices=['claude', 'gemini', 'openai', 'markdown'],
default='claude',
help="Target LLM platform (default: claude)"
choices=["claude", "gemini", "openai", "markdown"],
default="claude",
help="Target LLM platform (default: claude)",
)
args = parser.parse_args()
# Determine if config is a name or path
config_arg = args.config
if config_arg.endswith('.json') or '/' in config_arg or '\\' in config_arg:
if config_arg.endswith(".json") or "/" in config_arg or "\\" in config_arg:
# It's a path
config_path = config_arg
config_name = None
@@ -139,7 +123,7 @@ Phases:
"auto_upload": not args.no_upload,
"unlimited": args.unlimited,
"dry_run": args.dry_run,
"target": args.target
"target": args.target,
}
# Run async tool

View File

@@ -8,9 +8,8 @@ Supports 20+ programming languages with weighted pattern matching.
Author: Skill Seekers Project
"""
import re
import logging
from typing import Optional, Tuple, Dict, List
import re
logger = logging.getLogger(__name__)
@@ -18,19 +17,11 @@ logger = logging.getLogger(__name__)
# Swift patterns are defined in a separate module. If that module cannot be
# loaded, fall back to an empty mapping so importing this module still works;
# Swift detection is then simply disabled.
try:
    from skill_seekers.cli.swift_patterns import SWIFT_PATTERNS
except ImportError as e:
    # Log once and assign once per failure path.
    logger.warning("Swift language detection patterns unavailable. Swift code detection will be disabled. Error: %s", e)
    SWIFT_PATTERNS: dict[str, list[tuple[str, int]]] = {}
except Exception as e:
    # Deliberately broad: any error raised while the patterns module executes
    # must not prevent this module from loading.
    logger.error("Failed to load Swift patterns due to unexpected error: %s. Swift detection disabled.", e)
    SWIFT_PATTERNS: dict[str, list[tuple[str, int]]] = {}
# Verify Swift patterns were loaded correctly
if not SWIFT_PATTERNS:
@@ -38,15 +29,13 @@ if not SWIFT_PATTERNS:
"Swift pattern dictionary is empty. Swift detection is disabled. "
"This may indicate swift_patterns.py has no patterns defined."
)
elif 'swift' not in SWIFT_PATTERNS:
elif "swift" not in SWIFT_PATTERNS:
logger.error(
"Swift patterns loaded but 'swift' key is missing. "
"Swift detection is broken. Please file a bug report."
"Swift patterns loaded but 'swift' key is missing. Swift detection is broken. Please file a bug report."
)
else:
logger.info(
"Swift patterns loaded successfully: %d patterns for language detection",
len(SWIFT_PATTERNS.get('swift', []))
"Swift patterns loaded successfully: %d patterns for language detection", len(SWIFT_PATTERNS.get("swift", []))
)
# Comprehensive language patterns with weighted confidence scoring
@@ -56,355 +45,325 @@ else:
# Weight 2: Moderate indicators
# Weight 1: Weak indicators
# Maps a language identifier to a list of (regex source, weight) pairs.
# The regexes are stored as raw strings; weights in this table range from
# 1 (weak indicator) to 5 (strongest indicator).
# Each language key and each pattern is declared exactly once.
LANGUAGE_PATTERNS: dict[str, list[tuple[str, int]]] = {
    # ===== PRIORITY 1: Unity C# (Critical - User's Primary Issue) =====
    "csharp": [
        # Unity-specific patterns (weight 3-5, CRITICAL)
        (r"\busing\s+UnityEngine", 5),
        (r"\bMonoBehaviour\b", 5),
        (r"\bGameObject\b", 4),
        (r"\bTransform\b", 4),
        (r"\bVector[23]\b", 3),
        (r"\bQuaternion\b", 3),
        (r"\bvoid\s+Start\s*\(\)", 4),
        (r"\bvoid\s+Update\s*\(\)", 4),
        (r"\bvoid\s+Awake\s*\(\)", 4),
        (r"\bvoid\s+OnEnable\s*\(\)", 3),
        (r"\bvoid\s+OnDisable\s*\(\)", 3),
        (r"\bvoid\s+FixedUpdate\s*\(\)", 4),
        (r"\bvoid\s+LateUpdate\s*\(\)", 4),
        (r"\bvoid\s+OnCollisionEnter", 4),
        (r"\bvoid\s+OnTriggerEnter", 4),
        (r"\bIEnumerator\b", 4),
        (r"\bStartCoroutine\s*\(", 4),
        (r"\byield\s+return\s+new\s+WaitForSeconds", 4),
        (r"\byield\s+return\s+null", 3),
        (r"\byield\s+return", 4),
        (r"\[SerializeField\]", 4),
        (r"\[RequireComponent", 4),
        (r"\[Header\(", 3),
        (r"\[Range\(", 3),
        (r"\bTime\.deltaTime\b", 4),
        (r"\bInput\.Get", 4),
        (r"\bRigidbody\b", 3),
        (r"\bCollider\b", 3),
        (r"\bRenderer\b", 3),
        (r"\bGetComponent<", 3),
        # Basic C# patterns (weight 2-4)
        (r"\bnamespace\s+\w+", 3),
        (r"\busing\s+System", 3),
        (r"\bConsole\.WriteLine", 4),  # C#-specific output
        (r"\bConsole\.Write", 3),
        (r"\bpublic\s+class\s+\w+", 4),  # Increased to match Java weight
        (r"\bprivate\s+class\s+\w+", 3),
        (r"\binternal\s+class\s+\w+", 4),  # C#-specific modifier
        (r"\bstring\s+\w+\s*[;=]", 2),  # C#-specific lowercase string
        (r"\bprivate\s+\w+\s+\w+\s*;", 2),  # Private fields (common in both C# and Java)
        (r"\{\s*get;\s*set;\s*\}", 3),  # Auto properties
        (r"\{\s*get;\s*private\s+set;\s*\}", 3),
        (r"\{\s*get\s*=>\s*", 2),  # Expression properties
        (r"\bpublic\s+static\s+void\s+", 2),
        # Modern C# patterns (weight 1-2)
        (r"\bfrom\s+\w+\s+in\s+", 2),  # LINQ
        (r"\.Where\s*\(", 2),
        (r"\.Select\s*\(", 2),
        (r"\basync\s+Task", 2),
        (r"\bawait\s+", 2),
        (r"\bvar\s+\w+\s*=", 1),
    ],
    # ===== PRIORITY 2: Frontend Languages =====
    "typescript": [
        # TypeScript-specific (weight 2-5)
        (r"\binterface\s+\w+\s*\{", 5),
        (r"\btype\s+\w+\s*=", 4),
        (r":\s*\w+\s*=", 3),  # Type annotation
        (r":\s*\w+\[\]", 3),  # Array type
        (r"<[\w,\s]+>", 2),  # Generic type
        (r"\bas\s+\w+", 2),  # Type assertion
        (r"\benum\s+\w+\s*\{", 4),
        (r"\bimplements\s+\w+", 3),
        (r"\bexport\s+interface", 4),
        (r"\bexport\s+type", 4),
        # Also has JS patterns (weight 1)
        (r"\bconst\s+\w+\s*=", 1),
        (r"\blet\s+\w+\s*=", 1),
        (r"=>", 1),
    ],
    "javascript": [
        (r"\bfunction\s+\w+\s*\(", 3),
        (r"\bconst\s+\w+\s*=", 2),
        (r"\blet\s+\w+\s*=", 2),
        (r"=>", 2),  # Arrow function
        (r"\bconsole\.log", 2),
        (r"\bvar\s+\w+\s*=", 1),
        (r"\.then\s*\(", 2),  # Promise
        (r"\.catch\s*\(", 2),  # Promise
        (r"\basync\s+function", 3),
        (r"\bawait\s+", 2),
        (r"require\s*\(", 2),  # CommonJS
        (r"\bexport\s+default", 2),  # ES6
        (r"\bexport\s+const", 2),
    ],
    "jsx": [
        # JSX patterns (weight 3-4)
        (r"<\w+\s+[^>]*>", 4),  # JSX tag with attributes
        (r"<\w+\s*/>", 4),  # Self-closing tag
        (r"className=", 3),  # React className
        (r"onClick=", 3),  # React event
        (r"\brender\s*\(\s*\)\s*\{", 4),  # React render
        (r"\buseState\s*\(", 4),  # React hook
        (r"\buseEffect\s*\(", 4),  # React hook
        (r"\buseRef\s*\(", 3),
        (r"\buseCallback\s*\(", 3),
        (r"\buseMemo\s*\(", 3),
        # Also has JS patterns
        (r"\bconst\s+\w+\s*=", 1),
        (r"=>", 1),
    ],
    "tsx": [
        # TSX = TypeScript + JSX (weight 3-5)
        (r"<\w+\s+[^>]*>", 3),  # JSX tag
        (r":\s*React\.\w+", 5),  # React types
        (r"interface\s+\w+Props", 5),  # Props interface
        (r"\bFunctionComponent<", 4),
        (r"\bReact\.FC<", 4),
        (r"\buseState<", 4),  # Typed hook
        (r"\buseRef<", 3),
        # Also has TS patterns
        (r"\binterface\s+\w+", 2),
        (r"\btype\s+\w+\s*=", 2),
    ],
    "vue": [
        # Vue SFC patterns (weight 3-5)
        (r"<template>", 5),
        (r"<script>", 3),
        (r"<style\s+scoped>", 4),
        (r"\bexport\s+default\s*\{", 3),
        (r"\bdata\s*\(\s*\)\s*\{", 4),  # Vue 2
        (r"\bcomputed\s*:", 3),
        (r"\bmethods\s*:", 3),
        (r"\bsetup\s*\(", 4),  # Vue 3 Composition
        (r"\bref\s*\(", 4),  # Vue 3
        (r"\breactive\s*\(", 4),  # Vue 3
        (r"v-bind:", 3),
        (r"v-for=", 3),
        (r"v-if=", 3),
        (r"v-model=", 3),
    ],
    # ===== PRIORITY 3: Backend Languages =====
    "java": [
        (r"\bpublic\s+class\s+\w+", 4),
        (r"\bprivate\s+\w+\s+\w+", 2),
        (r"\bSystem\.out\.println", 3),
        (r"\bpublic\s+static\s+void\s+main", 4),
        (r"\bpublic\s+\w+\s+\w+\s*\(", 2),
        (r"@Override", 3),
        (r"@Autowired", 3),  # Spring
        (r"@Service", 3),  # Spring
        (r"@RestController", 3),  # Spring
        (r"@GetMapping", 3),  # Spring
        (r"@PostMapping", 3),  # Spring
        (r"\bimport\s+java\.", 2),
        (r"\bextends\s+\w+", 2),
    ],
    "go": [
        (r"\bfunc\s+\w+\s*\(", 3),
        (r"\bpackage\s+\w+", 4),
        (r":=", 3),  # Short declaration
        (r"\bfmt\.Print", 2),
        (r"\bfunc\s+\(.*\)\s+\w+\s*\(", 4),  # Method
        (r"\bdefer\s+", 3),
        (r"\bgo\s+\w+\s*\(", 3),  # Goroutine
        (r"\bchan\s+", 3),  # Channel
        (r"\binterface\{\}", 2),  # Empty interface
        (r"\bfunc\s+main\s*\(\)", 4),
    ],
    "rust": [
        (r"\bfn\s+\w+\s*\(", 4),
        (r"\blet\s+mut\s+\w+", 3),
        (r"\bprintln!", 3),
        (r"\bimpl\s+\w+", 3),
        (r"\buse\s+\w+::", 3),
        (r"\bpub\s+fn\s+", 3),
        (r"\bmatch\s+\w+\s*\{", 3),
        (r"\bSome\(", 2),
        (r"\bNone\b", 2),
        (r"\bResult<", 3),
        (r"\bOption<", 3),
        (r"&str\b", 2),
        (r"\bfn\s+main\s*\(\)", 4),
    ],
    "php": [
        (r"<\?php", 5),
        (r"\$\w+\s*=", 2),
        (r"\bfunction\s+\w+\s*\(", 2),
        (r"\bpublic\s+function", 3),
        (r"\bprivate\s+function", 3),
        (r"\bclass\s+\w+", 3),
        (r"\bnamespace\s+\w+", 3),
        (r"\buse\s+\w+\\", 2),
        (r"->", 2),  # Object operator
        (r"::", 1),  # Static operator
    ],
    # ===== PRIORITY 4: System/Data Languages =====
    "python": [
        (r"\bdef\s+\w+\s*\(", 3),
        (r"\bimport\s+\w+", 2),
        (r"\bclass\s+\w+:", 3),
        (r"\bfrom\s+\w+\s+import", 2),
        (r":\s*$", 1),  # Lines ending with :
        (r"@\w+", 2),  # Decorator
        (r"\bself\.\w+", 2),
        (r"\b__init__\s*\(", 3),
        (r"\basync\s+def\s+", 3),
        (r"\bawait\s+", 2),
        (r"\bprint\s*\(", 1),
    ],
    "r": [
        (r"<-", 4),  # Assignment operator
        (r"\bfunction\s*\(", 2),
        (r"\blibrary\s*\(", 3),
        (r"\bggplot\s*\(", 4),  # ggplot2
        (r"\bdata\.frame\s*\(", 3),
        (r"\%>\%", 4),  # Pipe operator
        (r"\bsummary\s*\(", 2),
        (r"\bread\.csv\s*\(", 3),
    ],
    "julia": [
        (r"\bfunction\s+\w+\s*\(", 3),
        (r"\bend\b", 2),
        (r"\busing\s+\w+", 3),
        (r"::", 2),  # Type annotation
        (r"\bmodule\s+\w+", 3),
        (r"\babstract\s+type", 3),
        (r"\bstruct\s+\w+", 3),
    ],
    "sql": [
        (r"\bSELECT\s+", 4),
        (r"\bFROM\s+", 3),
        (r"\bWHERE\s+", 2),
        (r"\bINSERT\s+INTO", 4),
        (r"\bCREATE\s+TABLE", 4),
        (r"\bJOIN\s+", 3),
        (r"\bGROUP\s+BY", 3),
        (r"\bORDER\s+BY", 3),
        (r"\bUPDATE\s+", 3),
        (r"\bDELETE\s+FROM", 3),
    ],
    # ===== Additional Languages =====
    "cpp": [
        (r"#include\s*<", 4),
        (r"\bstd::", 3),
        (r"\bnamespace\s+\w+", 3),
        (r"\bcout\s*<<", 3),
        (r"\bvoid\s+\w+\s*\(", 2),
        (r"\bint\s+main\s*\(", 4),
        (r"->", 2),  # Pointer
    ],
    "c": [
        (r"#include\s*<", 4),
        (r"\bprintf\s*\(", 3),
        (r"\bint\s+main\s*\(", 4),
        (r"\bvoid\s+\w+\s*\(", 2),
        (r"\bstruct\s+\w+", 3),
    ],
    "gdscript": [
        (r"\bfunc\s+\w+\s*\(", 3),
        (r"\bvar\s+\w+\s*=", 3),
        (r"\bextends\s+\w+", 4),
        (r"\b_ready\s*\(", 4),
        (r"\b_process\s*\(", 4),
    ],
    # ===== Markup/Config Languages =====
    "html": [
        (r"<!DOCTYPE\s+html>", 5),
        (r"<html", 4),
        (r"<head>", 3),
        (r"<body>", 3),
        (r"<div", 2),
        (r"<span", 2),
        (r"<script", 2),
    ],
    "css": [
        (r"\{\s*[\w-]+\s*:", 3),
        (r"@media", 3),
        (r"\.[\w-]+\s*\{", 2),
        (r"#[\w-]+\s*\{", 2),
        (r"@import", 2),
    ],
    "json": [
        (r"^\s*\{", 3),
        (r"^\s*\[", 3),
        (r'"\w+"\s*:', 3),
        (r':\s*["\d\[\{]', 2),
    ],
    "yaml": [
        (r"^\w+:", 3),
        (r"^\s+-\s+\w+", 2),
        (r"---", 2),
        (r"^\s+\w+:", 2),
    ],
    "xml": [
        (r"<\?xml", 5),
        (r"<\w+\s+\w+=", 2),
        (r"<\w+>", 1),
        (r"</\w+>", 1),
    ],
    "markdown": [
        (r"^#+\s+", 3),
        (r"^\*\*\w+\*\*", 2),
        (r"^\s*[-*]\s+", 2),
        (r"\[.*\]\(.*\)", 2),
    ],
    "bash": [
        (r"#!/bin/bash", 5),
        (r"#!/bin/sh", 5),
        (r"\becho\s+", 2),
        (r"\$\{?\w+\}?", 2),
        (r"\bif\s+\[", 2),
        (r"\bfor\s+\w+\s+in", 2),
    ],
    "shell": [
        (r"#!/bin/bash", 5),
        (r"#!/bin/sh", 5),
        (r"\becho\s+", 2),
        (r"\$\{?\w+\}?", 2),
    ],
    "powershell": [
        (r"\$\w+\s*=", 2),
        (r"Get-\w+", 3),
        (r"Set-\w+", 3),
        (r"\bWrite-Host\s+", 2),
    ],
}
@@ -414,11 +373,42 @@ LANGUAGE_PATTERNS.update(SWIFT_PATTERNS)
# Known language list for CSS class detection.
# Candidate names taken from CSS classes are accepted only if they appear here.
# Each name is listed exactly once; repeated entries add nothing to an
# `in` membership test and only lengthen the scan.
KNOWN_LANGUAGES: list[str] = [
    "javascript",
    "java",
    "xml",
    "html",
    "python",
    "bash",
    "cpp",
    "typescript",
    "go",
    "rust",
    "php",
    "ruby",
    "swift",
    "kotlin",
    "csharp",
    "c",
    "sql",
    "yaml",
    "json",
    "markdown",
    "css",
    "scss",
    "sass",
    "jsx",
    "tsx",
    "vue",
    "shell",
    "powershell",
    "r",
    "scala",
    "dart",
    "perl",
    "lua",
    "elixir",
    "julia",
    "gdscript",
]
@@ -452,7 +442,7 @@ class LanguageDetector:
0.3 = low, 0.5 = medium, 0.7 = high
"""
self.min_confidence = min_confidence
self._pattern_cache: Dict[str, List[Tuple[re.Pattern, int]]] = {}
self._pattern_cache: dict[str, list[tuple[re.Pattern, int]]] = {}
self._compile_patterns()
def _compile_patterns(self) -> None:
@@ -465,27 +455,28 @@ class LanguageDetector:
compiled_patterns.append((compiled, weight))
except re.error as e:
logger.error(
"Invalid regex pattern for language '%s' at index %d: '%s'. "
"Error: %s. Pattern skipped.",
lang, i, pattern[:50], e
"Invalid regex pattern for language '%s' at index %d: '%s'. Error: %s. Pattern skipped.",
lang,
i,
pattern[:50],
e,
)
except TypeError as e:
except TypeError:
logger.error(
"Pattern for language '%s' at index %d is not a string: %s. "
"Pattern skipped.",
lang, i, type(pattern).__name__
"Pattern for language '%s' at index %d is not a string: %s. Pattern skipped.",
lang,
i,
type(pattern).__name__,
)
if compiled_patterns:
self._pattern_cache[lang] = compiled_patterns
else:
logger.warning(
"No valid patterns compiled for language '%s'. "
"Detection for this language is disabled.",
lang
"No valid patterns compiled for language '%s'. Detection for this language is disabled.", lang
)
def detect_from_html(self, elem, code: str) -> Tuple[str, float]:
def detect_from_html(self, elem, code: str) -> tuple[str, float]:
"""
Detect language from HTML element with CSS classes + code content.
@@ -498,21 +489,21 @@ class LanguageDetector:
"""
# Tier 1: CSS classes (confidence 1.0)
if elem:
css_lang = self.extract_language_from_classes(elem.get('class', []))
css_lang = self.extract_language_from_classes(elem.get("class", []))
if css_lang:
return css_lang, 1.0
# Check parent pre element
parent = elem.parent
if parent and parent.name == 'pre':
css_lang = self.extract_language_from_classes(parent.get('class', []))
if parent and parent.name == "pre":
css_lang = self.extract_language_from_classes(parent.get("class", []))
if css_lang:
return css_lang, 1.0
# Tier 2: Pattern matching
return self.detect_from_code(code)
def detect_from_code(self, code: str) -> Tuple[str, float]:
def detect_from_code(self, code: str) -> tuple[str, float]:
"""
Detect language from code content only (for PDFs, GitHub files).
@@ -524,13 +515,13 @@ class LanguageDetector:
"""
# Edge case: code too short
if len(code.strip()) < 10:
return 'unknown', 0.0
return "unknown", 0.0
# Calculate confidence scores for all languages
scores = self._calculate_confidence(code)
if not scores:
return 'unknown', 0.0
return "unknown", 0.0
# Get language with highest score
best_lang = max(scores.items(), key=lambda x: x[1])
@@ -538,11 +529,11 @@ class LanguageDetector:
# Apply minimum confidence threshold
if confidence < self.min_confidence:
return 'unknown', 0.0
return "unknown", 0.0
return lang, confidence
def extract_language_from_classes(self, classes: List[str]) -> Optional[str]:
def extract_language_from_classes(self, classes: list[str]) -> str | None:
"""
Extract language from CSS class list.
@@ -563,21 +554,21 @@ class LanguageDetector:
for cls in classes:
# Handle brush: pattern
if 'brush:' in cls:
parts = cls.split('brush:')
if "brush:" in cls:
parts = cls.split("brush:")
if len(parts) > 1:
lang = parts[1].strip().lower()
if lang in KNOWN_LANGUAGES:
return lang
# Handle language- prefix
if cls.startswith('language-'):
if cls.startswith("language-"):
lang = cls[9:].lower()
if lang in KNOWN_LANGUAGES:
return lang
# Handle lang- prefix
if cls.startswith('lang-'):
if cls.startswith("lang-"):
lang = cls[5:].lower()
if lang in KNOWN_LANGUAGES:
return lang
@@ -588,7 +579,7 @@ class LanguageDetector:
return None
def _calculate_confidence(self, code: str) -> Dict[str, float]:
def _calculate_confidence(self, code: str) -> dict[str, float]:
"""
Calculate weighted confidence scores for all languages.
@@ -598,7 +589,7 @@ class LanguageDetector:
Returns:
Dictionary mapping language names to confidence scores (0.0-1.0)
"""
scores: Dict[str, float] = {}
scores: dict[str, float] = {}
for lang, compiled_patterns in self._pattern_cache.items():
total_score = 0

View File

@@ -1,23 +1,20 @@
# ABOUTME: Detects and validates llms.txt file availability at documentation URLs
# ABOUTME: Supports llms-full.txt, llms.txt, and llms-small.txt variants
import requests
from typing import Optional, Dict, List
from urllib.parse import urlparse
import requests
class LlmsTxtDetector:
"""Detect llms.txt files at documentation URLs"""
VARIANTS = [
('llms-full.txt', 'full'),
('llms.txt', 'standard'),
('llms-small.txt', 'small')
]
VARIANTS = [("llms-full.txt", "full"), ("llms.txt", "standard"), ("llms-small.txt", "small")]
def __init__(self, base_url: str):
self.base_url = base_url.rstrip('/')
self.base_url = base_url.rstrip("/")
def detect(self) -> Optional[Dict[str, str]]:
def detect(self) -> dict[str, str] | None:
"""
Detect available llms.txt variant.
@@ -31,11 +28,11 @@ class LlmsTxtDetector:
url = f"{root_url}/{filename}"
if self._check_url_exists(url):
return {'url': url, 'variant': variant}
return {"url": url, "variant": variant}
return None
def detect_all(self) -> List[Dict[str, str]]:
def detect_all(self) -> list[dict[str, str]]:
"""
Detect all available llms.txt variants.
@@ -50,10 +47,7 @@ class LlmsTxtDetector:
url = f"{root_url}/{filename}"
if self._check_url_exists(url):
found_variants.append({
'url': url,
'variant': variant
})
found_variants.append({"url": url, "variant": variant})
return found_variants

View File

@@ -1,9 +1,11 @@
"""ABOUTME: Downloads llms.txt files from documentation URLs with retry logic"""
"""ABOUTME: Validates markdown content and handles timeouts with exponential backoff"""
import requests
import time
from typing import Optional
import requests
class LlmsTxtDownloader:
"""Download llms.txt content from URLs with retry logic"""
@@ -27,12 +29,13 @@ class LlmsTxtDownloader:
"""
# Extract filename from URL
from urllib.parse import urlparse
parsed = urlparse(self.url)
filename = parsed.path.split('/')[-1]
filename = parsed.path.split("/")[-1]
# Replace .txt with .md
if filename.endswith('.txt'):
filename = filename[:-4] + '.md'
if filename.endswith(".txt"):
filename = filename[:-4] + ".md"
return filename
@@ -46,37 +49,31 @@ class LlmsTxtDownloader:
# First, reject HTML content (common redirect trap)
content_start = content.strip()[:500].lower()
html_indicators = [
'<!doctype html',
'<html',
'<!doctype',
'<head>',
'<meta charset',
"<!doctype html",
"<html",
"<!doctype",
"<head>",
"<meta charset",
]
if any(indicator in content_start for indicator in html_indicators):
return False
# Then check for markdown patterns
markdown_patterns = ['# ', '## ', '```', '- ', '* ', '`']
markdown_patterns = ["# ", "## ", "```", "- ", "* ", "`"]
return any(pattern in content for pattern in markdown_patterns)
def download(self) -> Optional[str]:
def download(self) -> str | None:
"""
Download llms.txt content with retry logic.
Returns:
String content or None if download fails
"""
headers = {
'User-Agent': 'Skill-Seekers-llms.txt-Reader/1.0'
}
headers = {"User-Agent": "Skill-Seekers-llms.txt-Reader/1.0"}
for attempt in range(self.max_retries):
try:
response = requests.get(
self.url,
headers=headers,
timeout=self.timeout
)
response = requests.get(self.url, headers=headers, timeout=self.timeout)
response.raise_for_status()
content = response.text
@@ -88,7 +85,7 @@ class LlmsTxtDownloader:
# Validate content looks like markdown
if not self._is_markdown(content):
print(f"⚠️ Content doesn't look like markdown")
print("⚠️ Content doesn't look like markdown")
return None
return content
@@ -96,7 +93,7 @@ class LlmsTxtDownloader:
except requests.RequestException as e:
if attempt < self.max_retries - 1:
# Calculate exponential backoff delay: 1s, 2s, 4s, etc.
delay = 2 ** attempt
delay = 2**attempt
print(f"⚠️ Attempt {attempt + 1}/{self.max_retries} failed: {e}")
print(f" Retrying in {delay}s...")
time.sleep(delay)

View File

@@ -1,10 +1,11 @@
"""ABOUTME: Parses llms.txt markdown content into structured page data"""
"""ABOUTME: Extracts titles, content, code samples, and headings from markdown"""
import re
from typing import List, Dict
from urllib.parse import urljoin
class LlmsTxtParser:
"""Parse llms.txt markdown content into page structures"""
@@ -12,7 +13,7 @@ class LlmsTxtParser:
self.content = content
self.base_url = base_url
def extract_urls(self) -> List[str]:
def extract_urls(self) -> list[str]:
"""
Extract all URLs from the llms.txt content.
@@ -33,13 +34,13 @@ class LlmsTxtParser:
urls = set()
# Match markdown links: [text](url)
md_links = re.findall(r'\[([^\]]*)\]\(([^)]+)\)', self.content)
md_links = re.findall(r"\[([^\]]*)\]\(([^)]+)\)", self.content)
for _, url in md_links:
if url.startswith('http'):
if url.startswith("http"):
clean_url = self._clean_url(url)
if clean_url:
urls.add(clean_url)
elif self.base_url and not url.startswith('#'):
elif self.base_url and not url.startswith("#"):
clean_url = self._clean_url(urljoin(self.base_url, url))
if clean_url:
urls.add(clean_url)
@@ -48,7 +49,7 @@ class LlmsTxtParser:
bare_urls = re.findall(r'https?://[^\s\)\]<>"\']+', self.content)
for url in bare_urls:
# Clean trailing punctuation
url = url.rstrip('.,;:')
url = url.rstrip(".,;:")
clean_url = self._clean_url(url)
if clean_url:
urls.add(clean_url)
@@ -79,16 +80,16 @@ class LlmsTxtParser:
"""
# Skip URLs with path after anchor (e.g., #section/index.html.md)
# These are malformed and return duplicate HTML content
if '#' in url:
anchor_pos = url.index('#')
after_anchor = url[anchor_pos + 1:]
if "#" in url:
anchor_pos = url.index("#")
after_anchor = url[anchor_pos + 1 :]
# If there's a path separator after anchor, it's invalid
if '/' in after_anchor:
if "/" in after_anchor:
# Extract the base URL without the malformed anchor
return url[:anchor_pos]
return url
def parse(self) -> list[dict]:
    """
    Parse markdown content into page structures.

    The content is cut into one section per top-level heading (a line that
    starts with ``# ``). Text before the first such heading forms a section
    of its own whose first line is used as the title. Lines inside a fenced
    code block are never treated as headings, so a ``# comment`` in a code
    sample stays part of its page instead of starting a new one.

    Returns:
        List of page dicts, one per non-blank section, in document order.
        Each dict is whatever ``self._parse_section`` returns for that section.
    """
    lines = self.content.split("\n")

    # Pair up fence lines. An unterminated trailing fence is ignored so that a
    # single stray fence cannot swallow every heading that follows it.
    fence_rows = [row for row, line in enumerate(lines) if line.lstrip().startswith("```")]
    if len(fence_rows) % 2:
        fence_rows.pop()
    fenced_rows = set()
    for opening, closing in zip(fence_rows[::2], fence_rows[1::2]):
        fenced_rows.update(range(opening, closing + 1))

    # Row 0 always belongs to the first section (it has no preceding newline
    # to split on). Later heading rows start a new section and lose their
    # "# " prefix, exactly as splitting on "\n# " would do.
    sections = [[]]
    for row, line in enumerate(lines):
        if row > 0 and row not in fenced_rows and line.startswith("# "):
            sections.append([line[2:]])
        else:
            sections[-1].append(line)

    pages = []
    for section_lines in sections:
        if not "\n".join(section_lines).strip():
            continue

        # First line is the title; leftover '#' markers are trimmed.
        title = section_lines[0].strip("#").strip()
        pages.append(self._parse_section("\n".join(section_lines[1:]), title))

    return pages
def _parse_section(self, content: str, title: str) -> Dict:
def _parse_section(self, content: str, title: str) -> dict:
"""Parse a single section into page structure"""
page = {
'title': title,
'content': '',
'code_samples': [],
'headings': [],
'url': f'llms-txt#{title.lower().replace(" ", "-")}',
'links': []
"title": title,
"content": "",
"code_samples": [],
"headings": [],
"url": f"llms-txt#{title.lower().replace(' ', '-')}",
"links": [],
}
# Extract code blocks
code_blocks = re.findall(r'```(\w+)?\n(.*?)```', content, re.DOTALL)
code_blocks = re.findall(r"```(\w+)?\n(.*?)```", content, re.DOTALL)
for lang, code in code_blocks:
page['code_samples'].append({
'code': code.strip(),
'language': lang or 'unknown'
})
page["code_samples"].append({"code": code.strip(), "language": lang or "unknown"})
# Extract h2/h3 headings
headings = re.findall(r'^(#{2,3})\s+(.+)$', content, re.MULTILINE)
headings = re.findall(r"^(#{2,3})\s+(.+)$", content, re.MULTILINE)
for level_markers, text in headings:
page['headings'].append({
'level': f'h{len(level_markers)}',
'text': text.strip(),
'id': text.lower().replace(' ', '-')
})
page["headings"].append(
{"level": f"h{len(level_markers)}", "text": text.strip(), "id": text.lower().replace(" ", "-")}
)
# Remove code blocks from content for plain text
content_no_code = re.sub(r'```.*?```', '', content, flags=re.DOTALL)
content_no_code = re.sub(r"```.*?```", "", content, flags=re.DOTALL)
# Extract paragraphs
paragraphs = [p.strip() for p in content_no_code.split('\n\n') if len(p.strip()) > 20]
page['content'] = '\n\n'.join(paragraphs)
paragraphs = [p.strip() for p in content_no_code.split("\n\n") if len(p.strip()) > 20]
page["content"] = "\n\n".join(paragraphs)
return page

View File

@@ -31,9 +31,8 @@ Examples:
skill-seekers install-agent output/react/ --agent cursor
"""
import sys
import argparse
from typing import List, Optional
import sys
def create_parser() -> argparse.ArgumentParser:
@@ -61,54 +60,27 @@ Examples:
skill-seekers upload output/react.zip
For more information: https://github.com/yusufkaraaslan/Skill_Seekers
"""
""",
)
parser.add_argument(
"--version",
action="version",
version="%(prog)s 2.7.0"
)
parser.add_argument("--version", action="version", version="%(prog)s 2.7.0")
subparsers = parser.add_subparsers(
dest="command",
title="commands",
description="Available Skill Seekers commands",
help="Command to run"
dest="command", title="commands", description="Available Skill Seekers commands", help="Command to run"
)
# === config subcommand ===
config_parser = subparsers.add_parser(
"config",
help="Configure GitHub tokens, API keys, and settings",
description="Interactive configuration wizard"
)
config_parser.add_argument(
"--github",
action="store_true",
help="Go directly to GitHub token setup"
)
config_parser.add_argument(
"--api-keys",
action="store_true",
help="Go directly to API keys setup"
)
config_parser.add_argument(
"--show",
action="store_true",
help="Show current configuration and exit"
)
config_parser.add_argument(
"--test",
action="store_true",
help="Test connections and exit"
"config", help="Configure GitHub tokens, API keys, and settings", description="Interactive configuration wizard"
)
config_parser.add_argument("--github", action="store_true", help="Go directly to GitHub token setup")
config_parser.add_argument("--api-keys", action="store_true", help="Go directly to API keys setup")
config_parser.add_argument("--show", action="store_true", help="Show current configuration and exit")
config_parser.add_argument("--test", action="store_true", help="Test connections and exit")
# === scrape subcommand ===
scrape_parser = subparsers.add_parser(
"scrape",
help="Scrape documentation website",
description="Scrape documentation website and generate skill"
"scrape", help="Scrape documentation website", description="Scrape documentation website and generate skill"
)
scrape_parser.add_argument("--config", help="Config JSON file")
scrape_parser.add_argument("--name", help="Skill name")
@@ -123,9 +95,7 @@ For more information: https://github.com/yusufkaraaslan/Skill_Seekers
# === github subcommand ===
github_parser = subparsers.add_parser(
"github",
help="Scrape GitHub repository",
description="Scrape GitHub repository and generate skill"
"github", help="Scrape GitHub repository", description="Scrape GitHub repository and generate skill"
)
github_parser.add_argument("--config", help="Config JSON file")
github_parser.add_argument("--repo", help="GitHub repo (owner/repo)")
@@ -134,14 +104,14 @@ For more information: https://github.com/yusufkaraaslan/Skill_Seekers
github_parser.add_argument("--enhance", action="store_true", help="AI enhancement (API)")
github_parser.add_argument("--enhance-local", action="store_true", help="AI enhancement (local)")
github_parser.add_argument("--api-key", type=str, help="Anthropic API key for --enhance")
github_parser.add_argument("--non-interactive", action="store_true", help="Non-interactive mode (fail fast on rate limits)")
github_parser.add_argument(
"--non-interactive", action="store_true", help="Non-interactive mode (fail fast on rate limits)"
)
github_parser.add_argument("--profile", type=str, help="GitHub profile name from config")
# === pdf subcommand ===
pdf_parser = subparsers.add_parser(
"pdf",
help="Extract from PDF file",
description="Extract content from PDF and generate skill"
"pdf", help="Extract from PDF file", description="Extract content from PDF and generate skill"
)
pdf_parser.add_argument("--config", help="Config JSON file")
pdf_parser.add_argument("--pdf", help="PDF file path")
@@ -153,7 +123,7 @@ For more information: https://github.com/yusufkaraaslan/Skill_Seekers
unified_parser = subparsers.add_parser(
"unified",
help="Multi-source scraping (docs + GitHub + PDF)",
description="Combine multiple sources into one skill"
description="Combine multiple sources into one skill",
)
unified_parser.add_argument("--config", required=True, help="Unified config JSON file")
unified_parser.add_argument("--merge-mode", help="Merge mode (rule-based, claude-enhanced)")
@@ -163,7 +133,7 @@ For more information: https://github.com/yusufkaraaslan/Skill_Seekers
enhance_parser = subparsers.add_parser(
"enhance",
help="AI-powered enhancement (local, no API key)",
description="Enhance SKILL.md using Claude Code (local)"
description="Enhance SKILL.md using Claude Code (local)",
)
enhance_parser.add_argument("skill_directory", help="Skill directory path")
enhance_parser.add_argument("--background", action="store_true", help="Run in background")
@@ -175,7 +145,7 @@ For more information: https://github.com/yusufkaraaslan/Skill_Seekers
enhance_status_parser = subparsers.add_parser(
"enhance-status",
help="Check enhancement status (for background/daemon modes)",
description="Monitor background enhancement processes"
description="Monitor background enhancement processes",
)
enhance_status_parser.add_argument("skill_directory", help="Skill directory path")
enhance_status_parser.add_argument("--watch", "-w", action="store_true", help="Watch in real-time")
@@ -184,9 +154,7 @@ For more information: https://github.com/yusufkaraaslan/Skill_Seekers
# === package subcommand ===
package_parser = subparsers.add_parser(
"package",
help="Package skill into .zip file",
description="Package skill directory into uploadable .zip"
"package", help="Package skill into .zip file", description="Package skill directory into uploadable .zip"
)
package_parser.add_argument("skill_directory", help="Skill directory path")
package_parser.add_argument("--no-open", action="store_true", help="Don't open output folder")
@@ -194,9 +162,7 @@ For more information: https://github.com/yusufkaraaslan/Skill_Seekers
# === upload subcommand ===
upload_parser = subparsers.add_parser(
"upload",
help="Upload skill to Claude",
description="Upload .zip file to Claude via Anthropic API"
"upload", help="Upload skill to Claude", description="Upload .zip file to Claude via Anthropic API"
)
upload_parser.add_argument("zip_file", help=".zip file to upload")
upload_parser.add_argument("--api-key", help="Anthropic API key")
@@ -205,7 +171,7 @@ For more information: https://github.com/yusufkaraaslan/Skill_Seekers
estimate_parser = subparsers.add_parser(
"estimate",
help="Estimate page count before scraping",
description="Estimate total pages for documentation scraping"
description="Estimate total pages for documentation scraping",
)
estimate_parser.add_argument("config", nargs="?", help="Config JSON file")
estimate_parser.add_argument("--all", action="store_true", help="List all available configs")
@@ -215,128 +181,63 @@ For more information: https://github.com/yusufkaraaslan/Skill_Seekers
test_examples_parser = subparsers.add_parser(
"extract-test-examples",
help="Extract usage examples from test files",
description="Analyze test files to extract real API usage patterns"
description="Analyze test files to extract real API usage patterns",
)
test_examples_parser.add_argument("directory", nargs="?", help="Directory containing test files")
test_examples_parser.add_argument("--file", help="Single test file to analyze")
test_examples_parser.add_argument("--language", help="Filter by programming language (python, javascript, etc.)")
test_examples_parser.add_argument(
"--min-confidence", type=float, default=0.5, help="Minimum confidence threshold (0.0-1.0, default: 0.5)"
)
test_examples_parser.add_argument(
"directory",
nargs="?",
help="Directory containing test files"
)
test_examples_parser.add_argument(
"--file",
help="Single test file to analyze"
)
test_examples_parser.add_argument(
"--language",
help="Filter by programming language (python, javascript, etc.)"
)
test_examples_parser.add_argument(
"--min-confidence",
type=float,
default=0.5,
help="Minimum confidence threshold (0.0-1.0, default: 0.5)"
)
test_examples_parser.add_argument(
"--max-per-file",
type=int,
default=10,
help="Maximum examples per file (default: 10)"
)
test_examples_parser.add_argument(
"--json",
action="store_true",
help="Output JSON format"
)
test_examples_parser.add_argument(
"--markdown",
action="store_true",
help="Output Markdown format"
"--max-per-file", type=int, default=10, help="Maximum examples per file (default: 10)"
)
test_examples_parser.add_argument("--json", action="store_true", help="Output JSON format")
test_examples_parser.add_argument("--markdown", action="store_true", help="Output Markdown format")
# === install-agent subcommand ===
install_agent_parser = subparsers.add_parser(
"install-agent",
help="Install skill to AI agent directories",
description="Copy skill to agent-specific installation directories"
description="Copy skill to agent-specific installation directories",
)
install_agent_parser.add_argument("skill_directory", help="Skill directory path (e.g., output/react/)")
install_agent_parser.add_argument(
"--agent", required=True, help="Agent name (claude, cursor, vscode, amp, goose, opencode, all)"
)
install_agent_parser.add_argument(
"skill_directory",
help="Skill directory path (e.g., output/react/)"
"--force", action="store_true", help="Overwrite existing installation without asking"
)
install_agent_parser.add_argument(
"--agent",
required=True,
help="Agent name (claude, cursor, vscode, amp, goose, opencode, all)"
)
install_agent_parser.add_argument(
"--force",
action="store_true",
help="Overwrite existing installation without asking"
)
install_agent_parser.add_argument(
"--dry-run",
action="store_true",
help="Preview installation without making changes"
"--dry-run", action="store_true", help="Preview installation without making changes"
)
# === install subcommand ===
install_parser = subparsers.add_parser(
"install",
help="Complete workflow: fetch → scrape → enhance → package → upload",
description="One-command skill installation (AI enhancement MANDATORY)"
description="One-command skill installation (AI enhancement MANDATORY)",
)
install_parser.add_argument(
"--config",
required=True,
help="Config name (e.g., 'react') or path (e.g., 'configs/custom.json')"
)
install_parser.add_argument(
"--destination",
default="output",
help="Output directory (default: output/)"
)
install_parser.add_argument(
"--no-upload",
action="store_true",
help="Skip automatic upload to Claude"
)
install_parser.add_argument(
"--unlimited",
action="store_true",
help="Remove page limits during scraping"
)
install_parser.add_argument(
"--dry-run",
action="store_true",
help="Preview workflow without executing"
"--config", required=True, help="Config name (e.g., 'react') or path (e.g., 'configs/custom.json')"
)
install_parser.add_argument("--destination", default="output", help="Output directory (default: output/)")
install_parser.add_argument("--no-upload", action="store_true", help="Skip automatic upload to Claude")
install_parser.add_argument("--unlimited", action="store_true", help="Remove page limits during scraping")
install_parser.add_argument("--dry-run", action="store_true", help="Preview workflow without executing")
# === resume subcommand ===
resume_parser = subparsers.add_parser(
"resume",
help="Resume interrupted scraping job",
description="Continue from saved progress checkpoint"
)
resume_parser.add_argument(
"job_id",
nargs="?",
help="Job ID to resume (or use --list to see available jobs)"
)
resume_parser.add_argument(
"--list",
action="store_true",
help="List all resumable jobs"
)
resume_parser.add_argument(
"--clean",
action="store_true",
help="Clean up old progress files"
"resume", help="Resume interrupted scraping job", description="Continue from saved progress checkpoint"
)
resume_parser.add_argument("job_id", nargs="?", help="Job ID to resume (or use --list to see available jobs)")
resume_parser.add_argument("--list", action="store_true", help="List all resumable jobs")
resume_parser.add_argument("--clean", action="store_true", help="Clean up old progress files")
return parser
def main(argv: Optional[List[str]] = None) -> int:
def main(argv: list[str] | None = None) -> int:
"""Main entry point for the unified CLI.
Args:
@@ -356,6 +257,7 @@ def main(argv: Optional[List[str]] = None) -> int:
try:
if args.command == "config":
from skill_seekers.cli.config_command import main as config_main
sys.argv = ["config_command.py"]
if args.github:
sys.argv.append("--github")
@@ -369,6 +271,7 @@ def main(argv: Optional[List[str]] = None) -> int:
elif args.command == "scrape":
from skill_seekers.cli.doc_scraper import main as scrape_main
# Convert args namespace to sys.argv format for doc_scraper
sys.argv = ["doc_scraper.py"]
if args.config:
@@ -395,6 +298,7 @@ def main(argv: Optional[List[str]] = None) -> int:
elif args.command == "github":
from skill_seekers.cli.github_scraper import main as github_main
sys.argv = ["github_scraper.py"]
if args.config:
sys.argv.extend(["--config", args.config])
@@ -418,6 +322,7 @@ def main(argv: Optional[List[str]] = None) -> int:
elif args.command == "pdf":
from skill_seekers.cli.pdf_scraper import main as pdf_main
sys.argv = ["pdf_scraper.py"]
if args.config:
sys.argv.extend(["--config", args.config])
@@ -433,6 +338,7 @@ def main(argv: Optional[List[str]] = None) -> int:
elif args.command == "unified":
from skill_seekers.cli.unified_scraper import main as unified_main
sys.argv = ["unified_scraper.py", "--config", args.config]
if args.merge_mode:
sys.argv.extend(["--merge-mode", args.merge_mode])
@@ -442,6 +348,7 @@ def main(argv: Optional[List[str]] = None) -> int:
elif args.command == "enhance":
from skill_seekers.cli.enhance_skill_local import main as enhance_main
sys.argv = ["enhance_skill_local.py", args.skill_directory]
if args.background:
sys.argv.append("--background")
@@ -455,6 +362,7 @@ def main(argv: Optional[List[str]] = None) -> int:
elif args.command == "enhance-status":
from skill_seekers.cli.enhance_status import main as enhance_status_main
sys.argv = ["enhance_status.py", args.skill_directory]
if args.watch:
sys.argv.append("--watch")
@@ -466,6 +374,7 @@ def main(argv: Optional[List[str]] = None) -> int:
elif args.command == "package":
from skill_seekers.cli.package_skill import main as package_main
sys.argv = ["package_skill.py", args.skill_directory]
if args.no_open:
sys.argv.append("--no-open")
@@ -475,6 +384,7 @@ def main(argv: Optional[List[str]] = None) -> int:
elif args.command == "upload":
from skill_seekers.cli.upload_skill import main as upload_main
sys.argv = ["upload_skill.py", args.zip_file]
if args.api_key:
sys.argv.extend(["--api-key", args.api_key])
@@ -482,6 +392,7 @@ def main(argv: Optional[List[str]] = None) -> int:
elif args.command == "estimate":
from skill_seekers.cli.estimate_pages import main as estimate_main
sys.argv = ["estimate_pages.py"]
if args.all:
sys.argv.append("--all")
@@ -493,6 +404,7 @@ def main(argv: Optional[List[str]] = None) -> int:
elif args.command == "extract-test-examples":
from skill_seekers.cli.test_example_extractor import main as test_examples_main
sys.argv = ["test_example_extractor.py"]
if args.directory:
sys.argv.append(args.directory)
@@ -512,6 +424,7 @@ def main(argv: Optional[List[str]] = None) -> int:
elif args.command == "install-agent":
from skill_seekers.cli.install_agent import main as install_agent_main
sys.argv = ["install_agent.py", args.skill_directory, "--agent", args.agent]
if args.force:
sys.argv.append("--force")
@@ -521,6 +434,7 @@ def main(argv: Optional[List[str]] = None) -> int:
elif args.command == "install":
from skill_seekers.cli.install_skill import main as install_main
sys.argv = ["install_skill.py"]
if args.config:
sys.argv.extend(["--config", args.config])
@@ -536,6 +450,7 @@ def main(argv: Optional[List[str]] = None) -> int:
elif args.command == "resume":
from skill_seekers.cli.resume_command import main as resume_main
sys.argv = ["resume_command.py"]
if args.job_id:
sys.argv.append(args.job_id)

View File

@@ -24,13 +24,13 @@ class MarkdownCleaner:
Cleaned markdown with HTML tags removed
"""
# Remove HTML comments
text = re.sub(r'<!--.*?-->', '', text, flags=re.DOTALL)
text = re.sub(r"<!--.*?-->", "", text, flags=re.DOTALL)
# Remove HTML tags but keep content
text = re.sub(r'<[^>]+>', '', text)
text = re.sub(r"<[^>]+>", "", text)
# Remove empty lines created by HTML removal
text = re.sub(r'\n\s*\n\s*\n+', '\n\n', text)
text = re.sub(r"\n\s*\n\s*\n+", "\n\n", text)
return text.strip()
@@ -58,7 +58,7 @@ class MarkdownCleaner:
return text.strip()
# For longer text, extract smartly
lines = text.split('\n')
lines = text.split("\n")
content_lines = []
char_count = 0
section_count = 0
@@ -66,11 +66,11 @@ class MarkdownCleaner:
for line in lines:
# Check for code fence (```)
if line.strip().startswith('```'):
if line.strip().startswith("```"):
in_code_block = not in_code_block
# Check for any heading (H1-H6)
is_heading = re.match(r'^#{1,6}\s+', line)
is_heading = re.match(r"^#{1,6}\s+", line)
if is_heading:
section_count += 1
@@ -91,7 +91,7 @@ class MarkdownCleaner:
if char_count >= max_chars and not in_code_block:
break
result = '\n'.join(content_lines).strip()
result = "\n".join(content_lines).strip()
# If we truncated, ensure we don't break markdown (only if not in code block)
if char_count >= max_chars and not in_code_block:
@@ -119,17 +119,13 @@ class MarkdownCleaner:
truncated = text[:max_chars]
# Look for last period, exclamation, or question mark
last_sentence = max(
truncated.rfind('. '),
truncated.rfind('! '),
truncated.rfind('? ')
)
last_sentence = max(truncated.rfind(". "), truncated.rfind("! "), truncated.rfind("? "))
if last_sentence > max_chars // 2: # At least half the content
return truncated[:last_sentence + 1]
return truncated[: last_sentence + 1]
# Fall back to word boundary
last_space = truncated.rfind(' ')
last_space = truncated.rfind(" ")
if last_space > 0:
return truncated[:last_space] + "..."

View File

@@ -17,16 +17,16 @@ Multi-layer architecture (Phase 3):
import json
import logging
import os
import subprocess
import tempfile
import os
from pathlib import Path
from typing import Dict, List, Any, Optional
from typing import Any, Optional
from .conflict_detector import Conflict, ConflictDetector
# Import three-stream data classes (Phase 1)
try:
from .github_fetcher import ThreeStreamData, CodeStream, DocsStream, InsightsStream
from .github_fetcher import CodeStream, DocsStream, InsightsStream, ThreeStreamData
except ImportError:
# Fallback if github_fetcher not available
ThreeStreamData = None
@@ -38,11 +38,7 @@ logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def categorize_issues_by_topic(
problems: List[Dict],
solutions: List[Dict],
topics: List[str]
) -> Dict[str, List[Dict]]:
def categorize_issues_by_topic(problems: list[dict], solutions: list[dict], topics: list[str]) -> dict[str, list[dict]]:
"""
Categorize GitHub issues by topic keywords.
@@ -55,14 +51,14 @@ def categorize_issues_by_topic(
Dict mapping topic to relevant issues
"""
categorized = {topic: [] for topic in topics}
categorized['other'] = []
categorized["other"] = []
all_issues = problems + solutions
for issue in all_issues:
# Get searchable text
title = issue.get('title', '').lower()
labels = [label.lower() for label in issue.get('labels', [])]
title = issue.get("title", "").lower()
labels = [label.lower() for label in issue.get("labels", [])]
text = f"{title} {' '.join(labels)}"
# Find best matching topic
@@ -82,18 +78,15 @@ def categorize_issues_by_topic(
if matched_topic and max_matches > 0:
categorized[matched_topic].append(issue)
else:
categorized['other'].append(issue)
categorized["other"].append(issue)
# Remove empty categories
return {k: v for k, v in categorized.items() if v}
def generate_hybrid_content(
    api_data: dict, github_docs: dict | None, github_insights: dict | None, conflicts: list[Conflict]
) -> dict[str, Any]:
    """
    Generate hybrid content combining API data with GitHub context.

    Args:
        api_data: API reference data; stored as-is under "api_reference" and
            its "apis" entry is used for issue matching.
        github_docs: Optional dict read for "readme", "contributing" and
            "docs_files"; skipped when falsy.
        github_insights: Optional dict read for "metadata", "common_problems",
            "known_solutions" and "top_labels"; skipped when falsy.
        conflicts: Detected conflicts; each one's ``type`` and ``severity``
            attributes are tallied.

    Returns:
        Hybrid content dict with enriched API reference: "api_reference",
        "github_context", "conflict_summary", plus "issue_links" when
        github_insights is provided.
    """
    context: dict[str, Any] = {}

    # GitHub documentation layer
    if github_docs:
        context["docs"] = {
            "readme": github_docs.get("readme"),
            "contributing": github_docs.get("contributing"),
            "docs_files_count": len(github_docs.get("docs_files", [])),
        }

    # GitHub insights layer: repository metadata, issue digest, labels
    if github_insights:
        repo_meta = github_insights.get("metadata", {})
        context["metadata"] = {
            "stars": repo_meta.get("stars", 0),
            "forks": repo_meta.get("forks", 0),
            "language": repo_meta.get("language", "Unknown"),
            "description": repo_meta.get("description", ""),
        }

        problems = github_insights.get("common_problems", [])
        fixes = github_insights.get("known_solutions", [])
        context["issues"] = {
            "common_problems_count": len(problems),
            "known_solutions_count": len(fixes),
            "top_problems": problems[:5],  # first five entries of each list
            "top_solutions": fixes[:5],
        }

        context["top_labels"] = github_insights.get("top_labels", [])

    # Conflict summary: tally occurrences per type and per severity
    type_counts: dict = {}
    severity_counts: dict = {}
    total_conflicts = len(conflicts)
    for item in conflicts:
        kind = item.type
        type_counts[kind] = type_counts.get(kind, 0) + 1
        level = item.severity
        severity_counts[level] = severity_counts.get(level, 0) + 1

    # Key order matches the order in which the layers are produced.
    hybrid: dict[str, Any] = {
        "api_reference": api_data,
        "github_context": context,
        "conflict_summary": {
            "total_conflicts": total_conflicts,
            "by_type": type_counts,
            "by_severity": severity_counts,
        },
    }

    # GitHub issue links for relevant APIs
    if github_insights:
        hybrid["issue_links"] = _match_issues_to_apis(
            api_data.get("apis", {}),
            github_insights.get("common_problems", []),
            github_insights.get("known_solutions", []),
        )

    return hybrid
def _match_issues_to_apis(
apis: Dict[str, Dict],
problems: List[Dict],
solutions: List[Dict]
) -> Dict[str, List[Dict]]:
def _match_issues_to_apis(apis: dict[str, dict], problems: list[dict], solutions: list[dict]) -> dict[str, list[dict]]:
"""
Match GitHub issues to specific APIs by keyword matching.
@@ -190,24 +174,26 @@ def _match_issues_to_apis(
issue_links = {}
all_issues = problems + solutions
for api_name in apis.keys():
for api_name in apis:
# Extract searchable keywords from API name
api_keywords = api_name.lower().replace('_', ' ').split('.')
api_keywords = api_name.lower().replace("_", " ").split(".")
matched_issues = []
for issue in all_issues:
title = issue.get('title', '').lower()
labels = [label.lower() for label in issue.get('labels', [])]
title = issue.get("title", "").lower()
labels = [label.lower() for label in issue.get("labels", [])]
text = f"{title} {' '.join(labels)}"
# Check if any API keyword appears in issue
if any(keyword in text for keyword in api_keywords):
matched_issues.append({
'number': issue.get('number'),
'title': issue.get('title'),
'state': issue.get('state'),
'comments': issue.get('comments')
})
matched_issues.append(
{
"number": issue.get("number"),
"title": issue.get("title"),
"state": issue.get("state"),
"comments": issue.get("comments"),
}
)
if matched_issues:
issue_links[api_name] = matched_issues
@@ -232,11 +218,13 @@ class RuleBasedMerger:
4. If conflict → Include both versions with [CONFLICT] tag, prefer code signature
"""
def __init__(self,
docs_data: Dict,
github_data: Dict,
conflicts: List[Conflict],
github_streams: Optional['ThreeStreamData'] = None):
def __init__(
self,
docs_data: dict,
github_data: dict,
conflicts: list[Conflict],
github_streams: Optional["ThreeStreamData"] = None,
):
"""
Initialize rule-based merger with GitHub streams support.
@@ -266,21 +254,21 @@ class RuleBasedMerger:
# Layer 3: GitHub docs
if github_streams.docs_stream:
self.github_docs = {
'readme': github_streams.docs_stream.readme,
'contributing': github_streams.docs_stream.contributing,
'docs_files': github_streams.docs_stream.docs_files
"readme": github_streams.docs_stream.readme,
"contributing": github_streams.docs_stream.contributing,
"docs_files": github_streams.docs_stream.docs_files,
}
# Layer 4: GitHub insights
if github_streams.insights_stream:
self.github_insights = {
'metadata': github_streams.insights_stream.metadata,
'common_problems': github_streams.insights_stream.common_problems,
'known_solutions': github_streams.insights_stream.known_solutions,
'top_labels': github_streams.insights_stream.top_labels
"metadata": github_streams.insights_stream.metadata,
"common_problems": github_streams.insights_stream.common_problems,
"known_solutions": github_streams.insights_stream.known_solutions,
"top_labels": github_streams.insights_stream.top_labels,
}
def merge_all(self) -> Dict[str, Any]:
def merge_all(self) -> dict[str, Any]:
"""
Merge all APIs using rule-based logic with GitHub insights (Phase 3).
@@ -302,15 +290,15 @@ class RuleBasedMerger:
# Build base result
merged_data = {
'merge_mode': 'rule-based',
'apis': merged_apis,
'summary': {
'total_apis': len(merged_apis),
'docs_only': sum(1 for api in merged_apis.values() if api['status'] == 'docs_only'),
'code_only': sum(1 for api in merged_apis.values() if api['status'] == 'code_only'),
'matched': sum(1 for api in merged_apis.values() if api['status'] == 'matched'),
'conflict': sum(1 for api in merged_apis.values() if api['status'] == 'conflict')
}
"merge_mode": "rule-based",
"apis": merged_apis,
"summary": {
"total_apis": len(merged_apis),
"docs_only": sum(1 for api in merged_apis.values() if api["status"] == "docs_only"),
"code_only": sum(1 for api in merged_apis.values() if api["status"] == "code_only"),
"matched": sum(1 for api in merged_apis.values() if api["status"] == "matched"),
"conflict": sum(1 for api in merged_apis.values() if api["status"] == "conflict"),
},
}
# Generate hybrid content if GitHub streams available (Phase 3)
@@ -320,20 +308,22 @@ class RuleBasedMerger:
api_data=merged_data,
github_docs=self.github_docs,
github_insights=self.github_insights,
conflicts=self.conflicts
conflicts=self.conflicts,
)
# Merge hybrid content into result
merged_data['github_context'] = hybrid_content.get('github_context', {})
merged_data['conflict_summary'] = hybrid_content.get('conflict_summary', {})
merged_data['issue_links'] = hybrid_content.get('issue_links', {})
merged_data["github_context"] = hybrid_content.get("github_context", {})
merged_data["conflict_summary"] = hybrid_content.get("conflict_summary", {})
merged_data["issue_links"] = hybrid_content.get("issue_links", {})
logger.info(f"Added GitHub context: {len(self.github_insights.get('common_problems', []))} problems, "
f"{len(self.github_insights.get('known_solutions', []))} solutions")
logger.info(
f"Added GitHub context: {len(self.github_insights.get('common_problems', []))} problems, "
f"{len(self.github_insights.get('known_solutions', []))} solutions"
)
return merged_data
def _merge_single_api(self, api_name: str) -> Dict[str, Any]:
def _merge_single_api(self, api_name: str) -> dict[str, Any]:
"""
Merge a single API using rules.
@@ -351,25 +341,27 @@ class RuleBasedMerger:
if in_docs and not in_code:
conflict = self.conflict_index.get(api_name)
return {
'name': api_name,
'status': 'docs_only',
'source': 'documentation',
'data': self.docs_apis[api_name],
'warning': 'This API is documented but not found in codebase',
'conflict': conflict.__dict__ if conflict else None
"name": api_name,
"status": "docs_only",
"source": "documentation",
"data": self.docs_apis[api_name],
"warning": "This API is documented but not found in codebase",
"conflict": conflict.__dict__ if conflict else None,
}
# Rule 2: Only in code
if in_code and not in_docs:
is_private = api_name.startswith('_')
is_private = api_name.startswith("_")
conflict = self.conflict_index.get(api_name)
return {
'name': api_name,
'status': 'code_only',
'source': 'code',
'data': self.code_apis[api_name],
'warning': 'This API exists in code but is not documented' if not is_private else 'Internal/private API',
'conflict': conflict.__dict__ if conflict else None
"name": api_name,
"status": "code_only",
"source": "code",
"data": self.code_apis[api_name],
"warning": "This API exists in code but is not documented"
if not is_private
else "Internal/private API",
"conflict": conflict.__dict__ if conflict else None,
}
# Both exist - check for conflicts
@@ -379,32 +371,32 @@ class RuleBasedMerger:
# Rule 3: Both match perfectly (no conflict)
if not has_conflict:
return {
'name': api_name,
'status': 'matched',
'source': 'both',
'docs_data': docs_info,
'code_data': code_info,
'merged_signature': self._create_merged_signature(code_info, docs_info),
'merged_description': docs_info.get('docstring') or code_info.get('docstring')
"name": api_name,
"status": "matched",
"source": "both",
"docs_data": docs_info,
"code_data": code_info,
"merged_signature": self._create_merged_signature(code_info, docs_info),
"merged_description": docs_info.get("docstring") or code_info.get("docstring"),
}
# Rule 4: Conflict exists - prefer code signature, keep docs description
conflict = self.conflict_index[api_name]
return {
'name': api_name,
'status': 'conflict',
'source': 'both',
'docs_data': docs_info,
'code_data': code_info,
'conflict': conflict.__dict__,
'resolution': 'prefer_code_signature',
'merged_signature': self._create_merged_signature(code_info, docs_info),
'merged_description': docs_info.get('docstring') or code_info.get('docstring'),
'warning': conflict.difference
"name": api_name,
"status": "conflict",
"source": "both",
"docs_data": docs_info,
"code_data": code_info,
"conflict": conflict.__dict__,
"resolution": "prefer_code_signature",
"merged_signature": self._create_merged_signature(code_info, docs_info),
"merged_description": docs_info.get("docstring") or code_info.get("docstring"),
"warning": conflict.difference,
}
def _create_merged_signature(self, code_info: Dict, docs_info: Dict) -> str:
def _create_merged_signature(self, code_info: dict, docs_info: dict) -> str:
"""
Create merged signature preferring code data.
@@ -415,17 +407,17 @@ class RuleBasedMerger:
Returns:
Merged signature string
"""
name = code_info.get('name', docs_info.get('name'))
params = code_info.get('parameters', docs_info.get('parameters', []))
return_type = code_info.get('return_type', docs_info.get('return_type'))
name = code_info.get("name", docs_info.get("name"))
params = code_info.get("parameters", docs_info.get("parameters", []))
return_type = code_info.get("return_type", docs_info.get("return_type"))
# Build parameter string
param_strs = []
for param in params:
param_str = param['name']
if param.get('type_hint'):
param_str = param["name"]
if param.get("type_hint"):
param_str += f": {param['type_hint']}"
if param.get('default'):
if param.get("default"):
param_str += f" = {param['default']}"
param_strs.append(param_str)
@@ -451,11 +443,13 @@ class ClaudeEnhancedMerger:
- Layer 4: GitHub insights (issues)
"""
def __init__(self,
docs_data: Dict,
github_data: Dict,
conflicts: List[Conflict],
github_streams: Optional['ThreeStreamData'] = None):
def __init__(
self,
docs_data: dict,
github_data: dict,
conflicts: list[Conflict],
github_streams: Optional["ThreeStreamData"] = None,
):
"""
Initialize Claude-enhanced merger with GitHub streams support.
@@ -473,7 +467,7 @@ class ClaudeEnhancedMerger:
# First do rule-based merge as baseline
self.rule_merger = RuleBasedMerger(docs_data, github_data, conflicts, github_streams)
def merge_all(self) -> Dict[str, Any]:
def merge_all(self) -> dict[str, Any]:
"""
Merge all APIs using Claude enhancement.
@@ -510,7 +504,7 @@ class ClaudeEnhancedMerger:
Returns:
Path to workspace directory
"""
workspace = tempfile.mkdtemp(prefix='skill_merge_')
workspace = tempfile.mkdtemp(prefix="skill_merge_")
logger.info(f"Created merge workspace: {workspace}")
# Write context files for Claude
@@ -522,26 +516,30 @@ class ClaudeEnhancedMerger:
"""Write context files for Claude to analyze."""
# 1. Write conflicts summary
conflicts_file = os.path.join(workspace, 'conflicts.json')
with open(conflicts_file, 'w') as f:
json.dump({
'conflicts': [c.__dict__ for c in self.conflicts],
'summary': {
'total': len(self.conflicts),
'by_type': self._count_by_field('type'),
'by_severity': self._count_by_field('severity')
}
}, f, indent=2)
conflicts_file = os.path.join(workspace, "conflicts.json")
with open(conflicts_file, "w") as f:
json.dump(
{
"conflicts": [c.__dict__ for c in self.conflicts],
"summary": {
"total": len(self.conflicts),
"by_type": self._count_by_field("type"),
"by_severity": self._count_by_field("severity"),
},
},
f,
indent=2,
)
# 2. Write documentation APIs
docs_apis_file = os.path.join(workspace, 'docs_apis.json')
docs_apis_file = os.path.join(workspace, "docs_apis.json")
detector = ConflictDetector(self.docs_data, self.github_data)
with open(docs_apis_file, 'w') as f:
with open(docs_apis_file, "w") as f:
json.dump(detector.docs_apis, f, indent=2)
# 3. Write code APIs
code_apis_file = os.path.join(workspace, 'code_apis.json')
with open(code_apis_file, 'w') as f:
code_apis_file = os.path.join(workspace, "code_apis.json")
with open(code_apis_file, "w") as f:
json.dump(detector.code_apis, f, indent=2)
# 4. Write merge instructions for Claude
@@ -602,13 +600,13 @@ Create `merged_apis.json` with this structure:
Take your time to analyze each conflict carefully. The goal is to create the most accurate and helpful API reference possible.
"""
instructions_file = os.path.join(workspace, 'MERGE_INSTRUCTIONS.md')
with open(instructions_file, 'w') as f:
instructions_file = os.path.join(workspace, "MERGE_INSTRUCTIONS.md")
with open(instructions_file, "w") as f:
f.write(instructions)
logger.info(f"Wrote context files to {workspace}")
def _count_by_field(self, field: str) -> Dict[str, int]:
def _count_by_field(self, field: str) -> dict[str, int]:
"""Count conflicts by a specific field."""
counts = {}
for conflict in self.conflicts:
@@ -623,7 +621,7 @@ Take your time to analyze each conflict carefully. The goal is to create the mos
Similar to enhance_skill_local.py approach.
"""
# Create a script that Claude will execute
script_path = os.path.join(workspace, 'merge_script.sh')
script_path = os.path.join(workspace, "merge_script.sh")
script_content = f"""#!/bin/bash
# Automatic merge script for Claude Code
@@ -646,23 +644,18 @@ echo "When done, save merged_apis.json and close this terminal."
read -p "Press Enter when merge is complete..."
"""
with open(script_path, 'w') as f:
with open(script_path, "w") as f:
f.write(script_content)
os.chmod(script_path, 0o755)
# Open new terminal with Claude Code
# Try different terminal emulators
terminals = [
['x-terminal-emulator', '-e'],
['gnome-terminal', '--'],
['xterm', '-e'],
['konsole', '-e']
]
terminals = [["x-terminal-emulator", "-e"], ["gnome-terminal", "--"], ["xterm", "-e"], ["konsole", "-e"]]
for terminal_cmd in terminals:
try:
cmd = terminal_cmd + ['bash', script_path]
cmd = terminal_cmd + ["bash", script_path]
subprocess.Popen(cmd)
logger.info(f"Opened terminal with {terminal_cmd[0]}")
break
@@ -670,12 +663,13 @@ read -p "Press Enter when merge is complete..."
continue
# Wait for merge to complete
merged_file = os.path.join(workspace, 'merged_apis.json')
merged_file = os.path.join(workspace, "merged_apis.json")
logger.info(f"Waiting for merged results at: {merged_file}")
logger.info("Close the terminal when done to continue...")
# Poll for file existence
import time
timeout = 3600 # 1 hour max
elapsed = 0
while not os.path.exists(merged_file) and elapsed < timeout:
@@ -685,27 +679,26 @@ read -p "Press Enter when merge is complete..."
if not os.path.exists(merged_file):
raise TimeoutError("Claude merge timed out after 1 hour")
def _read_merged_results(self, workspace: str) -> dict[str, Any]:
    """
    Read merged results from workspace.

    Args:
        workspace: Path to the workspace directory that is expected to
            contain ``merged_apis.json``.

    Returns:
        Dict containing ``"merge_mode": "claude-enhanced"`` plus every
        top-level key found in ``merged_apis.json``.

    Raises:
        FileNotFoundError: If ``merged_apis.json`` does not exist in
            *workspace*.
    """
    merged_file = os.path.join(workspace, "merged_apis.json")

    # Open directly instead of exists()-then-open: the file is produced by an
    # external process, so a separate existence check can race with it.
    try:
        # Decode explicitly as UTF-8 rather than relying on the platform's
        # locale default, which can mis-decode non-ASCII API descriptions.
        with open(merged_file, encoding="utf-8") as f:
            merged_data = json.load(f)
    except FileNotFoundError:
        raise FileNotFoundError(f"Merged results not found: {merged_file}") from None

    # NOTE(review): the unpacking comes last, so a "merge_mode" key inside the
    # file overrides the "claude-enhanced" default — confirm this is intended.
    return {"merge_mode": "claude-enhanced", **merged_data}
def merge_sources(docs_data_path: str,
github_data_path: str,
output_path: str,
mode: str = 'rule-based',
github_streams: Optional['ThreeStreamData'] = None) -> Dict[str, Any]:
def merge_sources(
docs_data_path: str,
github_data_path: str,
output_path: str,
mode: str = "rule-based",
github_streams: Optional["ThreeStreamData"] = None,
) -> dict[str, Any]:
"""
Merge documentation and GitHub data with optional GitHub streams (Phase 3).
@@ -726,10 +719,10 @@ def merge_sources(docs_data_path: str,
Merged data dict with hybrid content
"""
# Load data
with open(docs_data_path, 'r') as f:
with open(docs_data_path) as f:
docs_data = json.load(f)
with open(github_data_path, 'r') as f:
with open(github_data_path) as f:
github_data = json.load(f)
# Detect conflicts
@@ -749,7 +742,7 @@ def merge_sources(docs_data_path: str,
logger.info(f" - Insights stream: {problems} problems, {solutions} solutions")
# Merge based on mode
if mode == 'claude-enhanced':
if mode == "claude-enhanced":
merger = ClaudeEnhancedMerger(docs_data, github_data, conflicts, github_streams)
else:
merger = RuleBasedMerger(docs_data, github_data, conflicts, github_streams)
@@ -757,7 +750,7 @@ def merge_sources(docs_data_path: str,
merged_data = merger.merge_all()
# Save merged data
with open(output_path, 'w') as f:
with open(output_path, "w") as f:
json.dump(merged_data, f, indent=2, ensure_ascii=False)
logger.info(f"Merged data saved to: {output_path}")
@@ -765,22 +758,23 @@ def merge_sources(docs_data_path: str,
return merged_data
if __name__ == '__main__':
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser(description='Merge documentation and code sources')
parser.add_argument('docs_data', help='Path to documentation data JSON')
parser.add_argument('github_data', help='Path to GitHub data JSON')
parser.add_argument('--output', '-o', default='merged_data.json', help='Output file path')
parser.add_argument('--mode', '-m', choices=['rule-based', 'claude-enhanced'],
default='rule-based', help='Merge mode')
parser = argparse.ArgumentParser(description="Merge documentation and code sources")
parser.add_argument("docs_data", help="Path to documentation data JSON")
parser.add_argument("github_data", help="Path to GitHub data JSON")
parser.add_argument("--output", "-o", default="merged_data.json", help="Output file path")
parser.add_argument(
"--mode", "-m", choices=["rule-based", "claude-enhanced"], default="rule-based", help="Merge mode"
)
args = parser.parse_args()
merged = merge_sources(args.docs_data, args.github_data, args.output, args.mode)
# Print summary
summary = merged.get('summary', {})
summary = merged.get("summary", {})
print(f"\n✅ Merge complete ({merged.get('merge_mode')})")
print(f" Total APIs: {summary.get('total_apis', 0)}")
print(f" Matched: {summary.get('matched', 0)}")

View File

@@ -5,10 +5,10 @@ Multi-Skill Packager
Package multiple skills at once. Useful for packaging router + sub-skills together.
"""
import sys
import argparse
from pathlib import Path
import subprocess
import sys
from pathlib import Path
def package_skill(skill_dir: Path) -> bool:
@@ -17,7 +17,7 @@ def package_skill(skill_dir: Path) -> bool:
result = subprocess.run(
[sys.executable, str(Path(__file__).parent / "package_skill.py"), str(skill_dir)],
capture_output=True,
text=True
text=True,
)
return result.returncode == 0
except Exception as e:
@@ -36,20 +36,16 @@ Examples:
# Package specific skills
python3 package_multi.py output/godot-2d/ output/godot-3d/ output/godot-scripting/
"""
""",
)
parser.add_argument(
'skill_dirs',
nargs='+',
help='Skill directories to package'
)
parser.add_argument("skill_dirs", nargs="+", help="Skill directories to package")
args = parser.parse_args()
print(f"\n{'='*60}")
print(f"MULTI-SKILL PACKAGER")
print(f"{'='*60}\n")
print(f"\n{'=' * 60}")
print("MULTI-SKILL PACKAGER")
print(f"{'=' * 60}\n")
skill_dirs = [Path(d) for d in args.skill_dirs]
success_count = 0
@@ -67,14 +63,14 @@ Examples:
print(f"📦 Packaging: {skill_dir.name}")
if package_skill(skill_dir):
success_count += 1
print(f" ✅ Success")
print(" ✅ Success")
else:
print(f" ❌ Failed")
print(" ❌ Failed")
print("")
print(f"{'='*60}")
print(f"{'=' * 60}")
print(f"SUMMARY: {success_count}/{total_count} skills packaged")
print(f"{'='*60}\n")
print(f"{'=' * 60}\n")
if __name__ == "__main__":

View File

@@ -9,34 +9,23 @@ Usage:
skill-seekers package output/react/ --no-open # Don't open folder
"""
import argparse
import os
import sys
import zipfile
import argparse
from pathlib import Path
# Import utilities
try:
from utils import (
open_folder,
print_upload_instructions,
format_file_size,
validate_skill_directory
)
from quality_checker import SkillQualityChecker, print_report
from utils import format_file_size, open_folder, print_upload_instructions, validate_skill_directory
except ImportError:
# If running from different directory, add cli to path
sys.path.insert(0, str(Path(__file__).parent))
from utils import (
open_folder,
print_upload_instructions,
format_file_size,
validate_skill_directory
)
from quality_checker import SkillQualityChecker, print_report
from utils import format_file_size, open_folder, print_upload_instructions, validate_skill_directory
def package_skill(skill_dir, open_folder_after=True, skip_quality_check=False, target='claude'):
def package_skill(skill_dir, open_folder_after=True, skip_quality_check=False, target="claude"):
"""
Package a skill directory into platform-specific format
@@ -73,7 +62,7 @@ def package_skill(skill_dir, open_folder_after=True, skip_quality_check=False, t
if report.has_errors or report.has_warnings:
print("=" * 60)
response = input("\nContinue with packaging? (y/n): ").strip().lower()
if response != 'y':
if response != "y":
print("\n❌ Packaging cancelled by user")
return False, None
print()
@@ -84,6 +73,7 @@ def package_skill(skill_dir, open_folder_after=True, skip_quality_check=False, t
# Get platform-specific adaptor
try:
from skill_seekers.cli.adaptors import get_adaptor
adaptor = get_adaptor(target)
except (ImportError, ValueError) as e:
print(f"❌ Error: {e}")
@@ -140,37 +130,24 @@ Examples:
# Get help
skill-seekers package --help
"""
""",
)
parser.add_argument("skill_dir", help="Path to skill directory (e.g., output/react/)")
parser.add_argument("--no-open", action="store_true", help="Do not open the output folder after packaging")
parser.add_argument("--skip-quality-check", action="store_true", help="Skip quality checks before packaging")
parser.add_argument(
"--target",
choices=["claude", "gemini", "openai", "markdown"],
default="claude",
help="Target LLM platform (default: claude)",
)
parser.add_argument(
'skill_dir',
help='Path to skill directory (e.g., output/react/)'
)
parser.add_argument(
'--no-open',
action='store_true',
help='Do not open the output folder after packaging'
)
parser.add_argument(
'--skip-quality-check',
action='store_true',
help='Skip quality checks before packaging'
)
parser.add_argument(
'--target',
choices=['claude', 'gemini', 'openai', 'markdown'],
default='claude',
help='Target LLM platform (default: claude)'
)
parser.add_argument(
'--upload',
action='store_true',
help='Automatically upload after packaging (requires platform API key)'
"--upload", action="store_true", help="Automatically upload after packaging (requires platform API key)"
)
args = parser.parse_args()
@@ -179,7 +156,7 @@ Examples:
args.skill_dir,
open_folder_after=not args.no_open,
skip_quality_check=args.skip_quality_check,
target=args.target
target=args.target,
)
if not success:
@@ -194,42 +171,42 @@ Examples:
adaptor = get_adaptor(args.target)
# Get API key from environment
api_key = os.environ.get(adaptor.get_env_var_name(), '').strip()
api_key = os.environ.get(adaptor.get_env_var_name(), "").strip()
if not api_key:
# No API key - show helpful message but DON'T fail
print("\n" + "="*60)
print("\n" + "=" * 60)
print("💡 Automatic Upload")
print("="*60)
print("=" * 60)
print()
print(f"To enable automatic upload to {adaptor.PLATFORM_NAME}:")
print(f" 1. Get API key from the platform")
print(" 1. Get API key from the platform")
print(f" 2. Set: export {adaptor.get_env_var_name()}=...")
print(f" 3. Run package command with --upload flag")
print(" 3. Run package command with --upload flag")
print()
print("For now, use manual upload (instructions above) ☝️")
print("="*60)
print("=" * 60)
# Exit successfully - packaging worked!
sys.exit(0)
# API key exists - try upload
print("\n" + "="*60)
print("\n" + "=" * 60)
print(f"📤 Uploading to {adaptor.PLATFORM_NAME}...")
print("="*60)
print("=" * 60)
result = adaptor.upload(package_path, api_key)
if result['success']:
if result["success"]:
print(f"\n{result['message']}")
if result['url']:
if result["url"]:
print(f" View at: {result['url']}")
print("="*60)
print("=" * 60)
sys.exit(0)
else:
print(f"\n❌ Upload failed: {result['message']}")
print()
print("💡 Try manual upload instead (instructions above) ☝️")
print("="*60)
print("=" * 60)
# Exit successfully - packaging worked even if upload failed
sys.exit(0)

File diff suppressed because it is too large Load Diff

View File

@@ -48,11 +48,11 @@ Example:
--extract-tables --parallel
"""
import os
import sys
import json
import re
import argparse
import json
import os
import re
import sys
from pathlib import Path
# Import unified language detector
@@ -70,12 +70,14 @@ except ImportError:
try:
import pytesseract
from PIL import Image
TESSERACT_AVAILABLE = True
except ImportError:
TESSERACT_AVAILABLE = False
try:
import concurrent.futures
CONCURRENT_AVAILABLE = True
except ImportError:
CONCURRENT_AVAILABLE = False
@@ -84,10 +86,22 @@ except ImportError:
class PDFExtractor:
"""Extract text and code from PDF documentation"""
def __init__(self, pdf_path, verbose=False, chunk_size=10, min_quality=0.0,
extract_images=False, image_dir=None, min_image_size=100,
use_ocr=False, password=None, extract_tables=False,
parallel=False, max_workers=None, use_cache=True):
def __init__(
self,
pdf_path,
verbose=False,
chunk_size=10,
min_quality=0.0,
extract_images=False,
image_dir=None,
min_image_size=100,
use_ocr=False,
password=None,
extract_tables=False,
parallel=False,
max_workers=None,
use_cache=True,
):
self.pdf_path = pdf_path
self.verbose = verbose
self.chunk_size = chunk_size # Pages per chunk (0 = no chunking)
@@ -175,11 +189,11 @@ class PDFExtractor:
tabs = page.find_tables()
for idx, tab in enumerate(tabs.tables):
table_data = {
'table_index': idx,
'rows': tab.extract(),
'bbox': tab.bbox,
'row_count': len(tab.extract()),
'col_count': len(tab.extract()[0]) if tab.extract() else 0
"table_index": idx,
"rows": tab.extract(),
"bbox": tab.bbox,
"row_count": len(tab.extract()),
"col_count": len(tab.extract()[0]) if tab.extract() else 0,
}
tables.append(table_data)
self.log(f" Found table {idx}: {table_data['row_count']}x{table_data['col_count']}")
@@ -236,54 +250,54 @@ class PDFExtractor:
# Common syntax checks
if not code.strip():
return False, ['Empty code block']
return False, ["Empty code block"]
# Language-specific validation
if language == 'python':
if language == "python":
# Check indentation consistency
lines = code.split('\n')
lines = code.split("\n")
indent_chars = set()
for line in lines:
if line.startswith(' '):
indent_chars.add('space')
elif line.startswith('\t'):
indent_chars.add('tab')
if line.startswith(" "):
indent_chars.add("space")
elif line.startswith("\t"):
indent_chars.add("tab")
if len(indent_chars) > 1:
issues.append('Mixed tabs and spaces')
issues.append("Mixed tabs and spaces")
# Check for unclosed brackets/parens
open_count = code.count('(') + code.count('[') + code.count('{')
close_count = code.count(')') + code.count(']') + code.count('}')
open_count = code.count("(") + code.count("[") + code.count("{")
close_count = code.count(")") + code.count("]") + code.count("}")
if abs(open_count - close_count) > 2: # Allow small mismatch
issues.append('Unbalanced brackets')
issues.append("Unbalanced brackets")
elif language in ['javascript', 'java', 'cpp', 'c', 'csharp', 'go']:
elif language in ["javascript", "java", "cpp", "c", "csharp", "go"]:
# Check for balanced braces
open_braces = code.count('{')
close_braces = code.count('}')
open_braces = code.count("{")
close_braces = code.count("}")
if abs(open_braces - close_braces) > 1:
issues.append('Unbalanced braces')
issues.append("Unbalanced braces")
elif language == 'json':
elif language == "json":
# Try to parse JSON
try:
json.loads(code)
except (json.JSONDecodeError, ValueError) as e:
issues.append(f'Invalid JSON syntax: {str(e)[:50]}')
issues.append(f"Invalid JSON syntax: {str(e)[:50]}")
# General checks
# Check if code looks like natural language (too many common words)
common_words = ['the', 'and', 'for', 'with', 'this', 'that', 'have', 'from']
common_words = ["the", "and", "for", "with", "this", "that", "have", "from"]
word_count = sum(1 for word in common_words if word in code.lower())
if word_count > 5 and len(code.split()) < 50:
issues.append('May be natural language, not code')
issues.append("May be natural language, not code")
# Check code/comment ratio
comment_lines = sum(1 for line in code.split('\n') if line.strip().startswith(('#', '//', '/*', '*', '--')))
total_lines = len([l for l in code.split('\n') if l.strip()])
comment_lines = sum(1 for line in code.split("\n") if line.strip().startswith(("#", "//", "/*", "*", "--")))
total_lines = len([l for l in code.split("\n") if l.strip()])
if total_lines > 0 and comment_lines / total_lines > 0.7:
issues.append('Mostly comments')
issues.append("Mostly comments")
return len(issues) == 0, issues
@@ -309,18 +323,18 @@ class PDFExtractor:
score -= 2.0
# Factor 3: Number of lines
lines = [l for l in code.split('\n') if l.strip()]
lines = [l for l in code.split("\n") if l.strip()]
if 2 <= len(lines) <= 50:
score += 1.0
elif len(lines) > 100:
score -= 1.0
# Factor 4: Has function/class definitions
if re.search(r'\b(def|function|class|func|fn|public class)\b', code):
if re.search(r"\b(def|function|class|func|fn|public class)\b", code):
score += 1.5
# Factor 5: Has meaningful variable names (not just x, y, i)
meaningful_vars = re.findall(r'\b[a-z_][a-z0-9_]{3,}\b', code.lower())
meaningful_vars = re.findall(r"\b[a-z_][a-z0-9_]{3,}\b", code.lower())
if len(meaningful_vars) >= 2:
score += 1.0
@@ -344,19 +358,19 @@ class PDFExtractor:
code_blocks = []
blocks = page.get_text("dict")["blocks"]
monospace_fonts = ['courier', 'mono', 'consolas', 'menlo', 'monaco', 'dejavu']
monospace_fonts = ["courier", "mono", "consolas", "menlo", "monaco", "dejavu"]
current_code = []
current_font = None
for block in blocks:
if 'lines' not in block:
if "lines" not in block:
continue
for line in block['lines']:
for span in line['spans']:
font = span['font'].lower()
text = span['text']
for line in block["lines"]:
for span in line["spans"]:
font = span["font"].lower()
text = span["text"]
# Check if font is monospace
is_monospace = any(mf in font for mf in monospace_fonts)
@@ -364,47 +378,51 @@ class PDFExtractor:
if is_monospace:
# Accumulate code text
current_code.append(text)
current_font = span['font']
current_font = span["font"]
else:
# End of code block
if current_code:
code_text = ''.join(current_code).strip()
code_text = "".join(current_code).strip()
if len(code_text) > 10: # Minimum code length
lang, confidence = self.detect_language_from_code(code_text)
quality = self.score_code_quality(code_text, lang, confidence)
is_valid, issues = self.validate_code_syntax(code_text, lang)
code_blocks.append({
'code': code_text,
'language': lang,
'confidence': confidence,
'quality_score': quality,
'is_valid': is_valid,
'validation_issues': issues if not is_valid else [],
'font': current_font,
'detection_method': 'font'
})
code_blocks.append(
{
"code": code_text,
"language": lang,
"confidence": confidence,
"quality_score": quality,
"is_valid": is_valid,
"validation_issues": issues if not is_valid else [],
"font": current_font,
"detection_method": "font",
}
)
current_code = []
current_font = None
# Handle final code block
if current_code:
code_text = ''.join(current_code).strip()
code_text = "".join(current_code).strip()
if len(code_text) > 10:
lang, confidence = self.detect_language_from_code(code_text)
quality = self.score_code_quality(code_text, lang, confidence)
is_valid, issues = self.validate_code_syntax(code_text, lang)
code_blocks.append({
'code': code_text,
'language': lang,
'confidence': confidence,
'quality_score': quality,
'is_valid': is_valid,
'validation_issues': issues if not is_valid else [],
'font': current_font,
'detection_method': 'font'
})
code_blocks.append(
{
"code": code_text,
"language": lang,
"confidence": confidence,
"quality_score": quality,
"is_valid": is_valid,
"validation_issues": issues if not is_valid else [],
"font": current_font,
"detection_method": "font",
}
)
return code_blocks
@@ -416,55 +434,59 @@ class PDFExtractor:
Returns list of detected code blocks.
"""
code_blocks = []
lines = text.split('\n')
lines = text.split("\n")
current_block = []
indent_pattern = None
for line in lines:
# Check for indentation (4 spaces or tab)
if line.startswith(' ') or line.startswith('\t'):
if line.startswith(" ") or line.startswith("\t"):
# Start or continue code block
if not indent_pattern:
indent_pattern = line[:4] if line.startswith(' ') else '\t'
indent_pattern = line[:4] if line.startswith(" ") else "\t"
current_block.append(line)
else:
# End of code block
if current_block and len(current_block) >= 2: # At least 2 lines
code_text = '\n'.join(current_block).strip()
code_text = "\n".join(current_block).strip()
if len(code_text) > 20: # Minimum code length
lang, confidence = self.detect_language_from_code(code_text)
quality = self.score_code_quality(code_text, lang, confidence)
is_valid, issues = self.validate_code_syntax(code_text, lang)
code_blocks.append({
'code': code_text,
'language': lang,
'confidence': confidence,
'quality_score': quality,
'is_valid': is_valid,
'validation_issues': issues if not is_valid else [],
'detection_method': 'indent'
})
code_blocks.append(
{
"code": code_text,
"language": lang,
"confidence": confidence,
"quality_score": quality,
"is_valid": is_valid,
"validation_issues": issues if not is_valid else [],
"detection_method": "indent",
}
)
current_block = []
indent_pattern = None
# Handle final block
if current_block and len(current_block) >= 2:
code_text = '\n'.join(current_block).strip()
code_text = "\n".join(current_block).strip()
if len(code_text) > 20:
lang, confidence = self.detect_language_from_code(code_text)
quality = self.score_code_quality(code_text, lang, confidence)
is_valid, issues = self.validate_code_syntax(code_text, lang)
code_blocks.append({
'code': code_text,
'language': lang,
'confidence': confidence,
'quality_score': quality,
'is_valid': is_valid,
'validation_issues': issues if not is_valid else [],
'detection_method': 'indent'
})
code_blocks.append(
{
"code": code_text,
"language": lang,
"confidence": confidence,
"quality_score": quality,
"is_valid": is_valid,
"validation_issues": issues if not is_valid else [],
"detection_method": "indent",
}
)
return code_blocks
@@ -479,11 +501,11 @@ class PDFExtractor:
# Common code patterns that span multiple lines
patterns = [
# Function definitions
(r'((?:def|function|func|fn|public|private)\s+\w+\s*\([^)]*\)\s*[{:]?[^}]*[}]?)', 'function'),
(r"((?:def|function|func|fn|public|private)\s+\w+\s*\([^)]*\)\s*[{:]?[^}]*[}]?)", "function"),
# Class definitions
(r'(class\s+\w+[^{]*\{[^}]*\})', 'class'),
(r"(class\s+\w+[^{]*\{[^}]*\})", "class"),
# Import statements block
(r'((?:import|require|use|include)[^\n]+(?:\n(?:import|require|use|include)[^\n]+)*)', 'imports'),
(r"((?:import|require|use|include)[^\n]+(?:\n(?:import|require|use|include)[^\n]+)*)", "imports"),
]
for pattern, block_type in patterns:
@@ -495,16 +517,18 @@ class PDFExtractor:
quality = self.score_code_quality(code_text, lang, confidence)
is_valid, issues = self.validate_code_syntax(code_text, lang)
code_blocks.append({
'code': code_text,
'language': lang,
'confidence': confidence,
'quality_score': quality,
'is_valid': is_valid,
'validation_issues': issues if not is_valid else [],
'detection_method': 'pattern',
'pattern_type': block_type
})
code_blocks.append(
{
"code": code_text,
"language": lang,
"confidence": confidence,
"quality_score": quality,
"is_valid": is_valid,
"validation_issues": issues if not is_valid else [],
"detection_method": "pattern",
"pattern_type": block_type,
}
)
return code_blocks
@@ -514,24 +538,24 @@ class PDFExtractor:
Returns (is_chapter_start, chapter_title) tuple.
"""
headings = page_data.get('headings', [])
headings = page_data.get("headings", [])
# Check for h1 or h2 at start of page
if headings:
first_heading = headings[0]
# H1 headings are strong indicators of chapters
if first_heading['level'] in ['h1', 'h2']:
return True, first_heading['text']
if first_heading["level"] in ["h1", "h2"]:
return True, first_heading["text"]
# Check for specific chapter markers in text
text = page_data.get('text', '')
first_line = text.split('\n')[0] if text else ''
text = page_data.get("text", "")
first_line = text.split("\n")[0] if text else ""
chapter_patterns = [
r'^Chapter\s+\d+',
r'^Part\s+\d+',
r'^Section\s+\d+',
r'^\d+\.\s+[A-Z]', # "1. Introduction"
r"^Chapter\s+\d+",
r"^Part\s+\d+",
r"^Section\s+\d+",
r"^\d+\.\s+[A-Z]", # "1. Introduction"
]
for pattern in chapter_patterns:
@@ -552,42 +576,43 @@ class PDFExtractor:
next_page = pages[i + 1]
# Check if current page has code blocks
if not current_page['code_samples']:
if not current_page["code_samples"]:
continue
# Get last code block of current page
last_code = current_page['code_samples'][-1]
last_code = current_page["code_samples"][-1]
# Check if next page starts with code
if not next_page['code_samples']:
if not next_page["code_samples"]:
continue
first_next_code = next_page['code_samples'][0]
first_next_code = next_page["code_samples"][0]
# Same language and detection method = likely continuation
if (last_code['language'] == first_next_code['language'] and
last_code['detection_method'] == first_next_code['detection_method']):
if (
last_code["language"] == first_next_code["language"]
and last_code["detection_method"] == first_next_code["detection_method"]
):
# Check if last code block looks incomplete (doesn't end with closing brace/etc)
last_code_text = last_code['code'].rstrip()
last_code_text = last_code["code"].rstrip()
continuation_indicators = [
not last_code_text.endswith('}'),
not last_code_text.endswith(';'),
last_code_text.endswith(','),
last_code_text.endswith('\\'),
not last_code_text.endswith("}"),
not last_code_text.endswith(";"),
last_code_text.endswith(","),
last_code_text.endswith("\\"),
]
if any(continuation_indicators):
# Merge the code blocks
merged_code = last_code['code'] + '\n' + first_next_code['code']
last_code['code'] = merged_code
last_code['merged_from_next_page'] = True
merged_code = last_code["code"] + "\n" + first_next_code["code"]
last_code["code"] = merged_code
last_code["merged_from_next_page"] = True
# Remove the first code block from next page
next_page['code_samples'].pop(0)
next_page['code_blocks_count'] -= 1
next_page["code_samples"].pop(0)
next_page["code_blocks_count"] -= 1
self.log(f" Merged code block from page {i+1} to {i+2}")
self.log(f" Merged code block from page {i + 1} to {i + 2}")
return pages
@@ -603,13 +628,7 @@ class PDFExtractor:
"""
if self.chunk_size == 0:
# No chunking - return all pages as one chunk
return [{
'chunk_number': 1,
'start_page': 1,
'end_page': len(pages),
'pages': pages,
'chapter_title': None
}]
return [{"chunk_number": 1, "start_page": 1, "end_page": len(pages), "pages": pages, "chapter_title": None}]
chunks = []
current_chunk = []
@@ -622,13 +641,15 @@ class PDFExtractor:
if is_chapter and current_chunk:
# Save current chunk before starting new one
chunks.append({
'chunk_number': len(chunks) + 1,
'start_page': chunk_start + 1,
'end_page': i,
'pages': current_chunk,
'chapter_title': current_chapter
})
chunks.append(
{
"chunk_number": len(chunks) + 1,
"start_page": chunk_start + 1,
"end_page": i,
"pages": current_chunk,
"chapter_title": current_chapter,
}
)
current_chunk = []
chunk_start = i
current_chapter = chapter_title
@@ -640,26 +661,30 @@ class PDFExtractor:
# Check if chunk size reached (but don't break chapters)
if not is_chapter and len(current_chunk) >= self.chunk_size:
chunks.append({
'chunk_number': len(chunks) + 1,
'start_page': chunk_start + 1,
'end_page': i + 1,
'pages': current_chunk,
'chapter_title': current_chapter
})
chunks.append(
{
"chunk_number": len(chunks) + 1,
"start_page": chunk_start + 1,
"end_page": i + 1,
"pages": current_chunk,
"chapter_title": current_chapter,
}
)
current_chunk = []
chunk_start = i + 1
current_chapter = None
# Add remaining pages as final chunk
if current_chunk:
chunks.append({
'chunk_number': len(chunks) + 1,
'start_page': chunk_start + 1,
'end_page': len(pages),
'pages': current_chunk,
'chapter_title': current_chapter
})
chunks.append(
{
"chunk_number": len(chunks) + 1,
"start_page": chunk_start + 1,
"end_page": len(pages),
"pages": current_chunk,
"chapter_title": current_chapter,
}
)
return chunks
@@ -696,7 +721,7 @@ class PDFExtractor:
# Generate filename
pdf_basename = Path(self.pdf_path).stem
image_filename = f"{pdf_basename}_page{page_num+1}_img{img_index+1}.{image_ext}"
image_filename = f"{pdf_basename}_page{page_num + 1}_img{img_index + 1}.{image_ext}"
# Save image
image_path = Path(self.image_dir) / image_filename
@@ -707,14 +732,14 @@ class PDFExtractor:
# Store metadata
image_info = {
'filename': image_filename,
'path': str(image_path),
'page_number': page_num + 1,
'width': width,
'height': height,
'format': image_ext,
'size_bytes': len(image_bytes),
'xref': xref
"filename": image_filename,
"path": str(image_path),
"page_number": page_num + 1,
"width": width,
"height": height,
"format": image_ext,
"size_bytes": len(image_bytes),
"xref": xref,
}
extracted.append(image_info)
@@ -771,12 +796,12 @@ class PDFExtractor:
# Simple deduplication by code content
unique_code = {}
for block in all_code_blocks:
code_hash = hash(block['code'])
code_hash = hash(block["code"])
if code_hash not in unique_code:
unique_code[code_hash] = block
else:
# Keep the one with higher quality score
if block['quality_score'] > unique_code[code_hash]['quality_score']:
if block["quality_score"] > unique_code[code_hash]["quality_score"]:
unique_code[code_hash] = block
code_samples = list(unique_code.values())
@@ -784,44 +809,43 @@ class PDFExtractor:
# Filter by minimum quality (NEW in B1.4)
if self.min_quality > 0:
code_samples_before = len(code_samples)
code_samples = [c for c in code_samples if c['quality_score'] >= self.min_quality]
code_samples = [c for c in code_samples if c["quality_score"] >= self.min_quality]
filtered_count = code_samples_before - len(code_samples)
if filtered_count > 0:
self.log(f" Filtered out {filtered_count} low-quality code blocks (min_quality={self.min_quality})")
# Sort by quality score (highest first)
code_samples.sort(key=lambda x: x['quality_score'], reverse=True)
code_samples.sort(key=lambda x: x["quality_score"], reverse=True)
# Extract headings from markdown
headings = []
for line in markdown.split('\n'):
if line.startswith('#'):
level = len(line) - len(line.lstrip('#'))
text = line.lstrip('#').strip()
for line in markdown.split("\n"):
if line.startswith("#"):
level = len(line) - len(line.lstrip("#"))
text = line.lstrip("#").strip()
if text:
headings.append({
'level': f'h{level}',
'text': text
})
headings.append({"level": f"h{level}", "text": text})
page_data = {
'page_number': page_num + 1, # 1-indexed for humans
'text': text.strip(),
'markdown': markdown.strip(),
'headings': headings,
'code_samples': code_samples,
'images_count': len(images),
'extracted_images': extracted_images, # NEW in B1.5
'tables': tables, # NEW in Priority 2
'char_count': len(text),
'code_blocks_count': len(code_samples),
'tables_count': len(tables) # NEW in Priority 2
"page_number": page_num + 1, # 1-indexed for humans
"text": text.strip(),
"markdown": markdown.strip(),
"headings": headings,
"code_samples": code_samples,
"images_count": len(images),
"extracted_images": extracted_images, # NEW in B1.5
"tables": tables, # NEW in Priority 2
"char_count": len(text),
"code_blocks_count": len(code_samples),
"tables_count": len(tables), # NEW in Priority 2
}
# Cache the result (Priority 3)
self.set_cached(cache_key, page_data)
self.log(f" Page {page_num + 1}: {len(text)} chars, {len(code_samples)} code blocks, {len(headings)} headings, {len(extracted_images)} images, {len(tables)} tables")
self.log(
f" Page {page_num + 1}: {len(text)} chars, {len(code_samples)} code blocks, {len(headings)} headings, {len(extracted_images)} images, {len(tables)} tables"
)
return page_data
@@ -841,15 +865,15 @@ class PDFExtractor:
# Handle encrypted PDFs (Priority 2)
if self.doc.is_encrypted:
if self.password:
print(f" 🔐 PDF is encrypted, trying password...")
print(" 🔐 PDF is encrypted, trying password...")
if self.doc.authenticate(self.password):
print(f" ✅ Password accepted")
print(" ✅ Password accepted")
else:
print(f" ❌ Invalid password")
print(" ❌ Invalid password")
return None
else:
print(f" ❌ PDF is encrypted but no password provided")
print(f" Use --password option to provide password")
print(" ❌ PDF is encrypted but no password provided")
print(" Use --password option to provide password")
return None
except Exception as e:
@@ -870,12 +894,12 @@ class PDFExtractor:
status = "✅ enabled" if TESSERACT_AVAILABLE else "⚠️ not available (install pytesseract)"
print(f" OCR: {status}")
if self.extract_tables:
print(f" Table extraction: ✅ enabled")
print(" Table extraction: ✅ enabled")
if self.parallel:
status = "✅ enabled" if CONCURRENT_AVAILABLE else "⚠️ not available"
print(f" Parallel processing: {status} ({self.max_workers} workers)")
if self.use_cache:
print(f" Caching: ✅ enabled")
print(" Caching: ✅ enabled")
print("")
@@ -900,73 +924,71 @@ class PDFExtractor:
chunks = self.create_chunks(self.pages)
# Build summary
total_chars = sum(p['char_count'] for p in self.pages)
total_code_blocks = sum(p['code_blocks_count'] for p in self.pages)
total_headings = sum(len(p['headings']) for p in self.pages)
total_images = sum(p['images_count'] for p in self.pages)
total_tables = sum(p['tables_count'] for p in self.pages) # NEW in Priority 2
total_chars = sum(p["char_count"] for p in self.pages)
total_code_blocks = sum(p["code_blocks_count"] for p in self.pages)
total_headings = sum(len(p["headings"]) for p in self.pages)
total_images = sum(p["images_count"] for p in self.pages)
total_tables = sum(p["tables_count"] for p in self.pages) # NEW in Priority 2
# Detect languages used
languages = {}
all_code_blocks_list = []
for page in self.pages:
for code in page['code_samples']:
lang = code['language']
for code in page["code_samples"]:
lang = code["language"]
languages[lang] = languages.get(lang, 0) + 1
all_code_blocks_list.append(code)
# Calculate quality statistics (NEW in B1.4)
quality_stats = {}
if all_code_blocks_list:
quality_scores = [c['quality_score'] for c in all_code_blocks_list]
confidences = [c['confidence'] for c in all_code_blocks_list]
valid_count = sum(1 for c in all_code_blocks_list if c['is_valid'])
quality_scores = [c["quality_score"] for c in all_code_blocks_list]
confidences = [c["confidence"] for c in all_code_blocks_list]
valid_count = sum(1 for c in all_code_blocks_list if c["is_valid"])
quality_stats = {
'average_quality': sum(quality_scores) / len(quality_scores),
'average_confidence': sum(confidences) / len(confidences),
'valid_code_blocks': valid_count,
'invalid_code_blocks': total_code_blocks - valid_count,
'validation_rate': valid_count / total_code_blocks if total_code_blocks > 0 else 0,
'high_quality_blocks': sum(1 for s in quality_scores if s >= 7.0),
'medium_quality_blocks': sum(1 for s in quality_scores if 4.0 <= s < 7.0),
'low_quality_blocks': sum(1 for s in quality_scores if s < 4.0),
"average_quality": sum(quality_scores) / len(quality_scores),
"average_confidence": sum(confidences) / len(confidences),
"valid_code_blocks": valid_count,
"invalid_code_blocks": total_code_blocks - valid_count,
"validation_rate": valid_count / total_code_blocks if total_code_blocks > 0 else 0,
"high_quality_blocks": sum(1 for s in quality_scores if s >= 7.0),
"medium_quality_blocks": sum(1 for s in quality_scores if 4.0 <= s < 7.0),
"low_quality_blocks": sum(1 for s in quality_scores if s < 4.0),
}
# Extract chapter information
chapters = []
for chunk in chunks:
if chunk['chapter_title']:
chapters.append({
'title': chunk['chapter_title'],
'start_page': chunk['start_page'],
'end_page': chunk['end_page']
})
if chunk["chapter_title"]:
chapters.append(
{"title": chunk["chapter_title"], "start_page": chunk["start_page"], "end_page": chunk["end_page"]}
)
result = {
'source_file': self.pdf_path,
'metadata': self.doc.metadata,
'total_pages': len(self.doc),
'total_chars': total_chars,
'total_code_blocks': total_code_blocks,
'total_headings': total_headings,
'total_images': total_images,
'total_extracted_images': len(self.extracted_images), # NEW in B1.5
'total_tables': total_tables, # NEW in Priority 2
'image_directory': self.image_dir if self.extract_images else None, # NEW in B1.5
'extracted_images': self.extracted_images, # NEW in B1.5
'total_chunks': len(chunks),
'chapters': chapters,
'languages_detected': languages,
'quality_statistics': quality_stats, # NEW in B1.4
'chunks': chunks,
'pages': self.pages # Still include all pages for compatibility
"source_file": self.pdf_path,
"metadata": self.doc.metadata,
"total_pages": len(self.doc),
"total_chars": total_chars,
"total_code_blocks": total_code_blocks,
"total_headings": total_headings,
"total_images": total_images,
"total_extracted_images": len(self.extracted_images), # NEW in B1.5
"total_tables": total_tables, # NEW in Priority 2
"image_directory": self.image_dir if self.extract_images else None, # NEW in B1.5
"extracted_images": self.extracted_images, # NEW in B1.5
"total_chunks": len(chunks),
"chapters": chapters,
"languages_detected": languages,
"quality_statistics": quality_stats, # NEW in B1.4
"chunks": chunks,
"pages": self.pages, # Still include all pages for compatibility
}
# Close document
self.doc.close()
print(f"\n✅ Extraction complete:")
print("\n✅ Extraction complete:")
print(f" Total characters: {total_chars:,}")
print(f" Code blocks found: {total_code_blocks}")
print(f" Headings found: {total_headings}")
@@ -983,10 +1005,12 @@ class PDFExtractor:
# Print quality statistics (NEW in B1.4)
if quality_stats:
print(f"\n📊 Code Quality Statistics:")
print("\n📊 Code Quality Statistics:")
print(f" Average quality: {quality_stats['average_quality']:.1f}/10")
print(f" Average confidence: {quality_stats['average_confidence']:.1%}")
print(f" Valid code blocks: {quality_stats['valid_code_blocks']}/{total_code_blocks} ({quality_stats['validation_rate']:.1%})")
print(
f" Valid code blocks: {quality_stats['valid_code_blocks']}/{total_code_blocks} ({quality_stats['validation_rate']:.1%})"
)
print(f" High quality (7+): {quality_stats['high_quality_blocks']}")
print(f" Medium quality (4-7): {quality_stats['medium_quality_blocks']}")
print(f" Low quality (<4): {quality_stats['low_quality_blocks']}")
@@ -996,7 +1020,7 @@ class PDFExtractor:
def main():
parser = argparse.ArgumentParser(
description='Extract text and code blocks from PDF documentation',
description="Extract text and code blocks from PDF documentation",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
Examples:
@@ -1011,39 +1035,39 @@ Examples:
# Extract and save
python3 pdf_extractor_poc.py docs/python.pdf -o python_extracted.json -v
"""
""",
)
parser.add_argument('pdf_file', help='Path to PDF file to extract')
parser.add_argument('-o', '--output', help='Output JSON file path (default: print to stdout)')
parser.add_argument('-v', '--verbose', action='store_true', help='Verbose output')
parser.add_argument('--pretty', action='store_true', help='Pretty-print JSON output')
parser.add_argument('--chunk-size', type=int, default=10,
help='Pages per chunk (0 = no chunking, default: 10)')
parser.add_argument('--no-merge', action='store_true',
help='Disable merging code blocks across pages')
parser.add_argument('--min-quality', type=float, default=0.0,
help='Minimum code quality score (0-10, default: 0 = no filtering)')
parser.add_argument('--extract-images', action='store_true',
help='Extract images to files (NEW in B1.5)')
parser.add_argument('--image-dir', type=str, default=None,
help='Directory to save extracted images (default: output/{pdf_name}_images)')
parser.add_argument('--min-image-size', type=int, default=100,
help='Minimum image dimension in pixels (filters icons, default: 100)')
parser.add_argument("pdf_file", help="Path to PDF file to extract")
parser.add_argument("-o", "--output", help="Output JSON file path (default: print to stdout)")
parser.add_argument("-v", "--verbose", action="store_true", help="Verbose output")
parser.add_argument("--pretty", action="store_true", help="Pretty-print JSON output")
parser.add_argument("--chunk-size", type=int, default=10, help="Pages per chunk (0 = no chunking, default: 10)")
parser.add_argument("--no-merge", action="store_true", help="Disable merging code blocks across pages")
parser.add_argument(
"--min-quality", type=float, default=0.0, help="Minimum code quality score (0-10, default: 0 = no filtering)"
)
parser.add_argument("--extract-images", action="store_true", help="Extract images to files (NEW in B1.5)")
parser.add_argument(
"--image-dir",
type=str,
default=None,
help="Directory to save extracted images (default: output/{pdf_name}_images)",
)
parser.add_argument(
"--min-image-size",
type=int,
default=100,
help="Minimum image dimension in pixels (filters icons, default: 100)",
)
# Advanced features (Priority 2 & 3)
parser.add_argument('--ocr', action='store_true',
help='Use OCR for scanned PDFs (requires pytesseract)')
parser.add_argument('--password', type=str, default=None,
help='Password for encrypted PDF')
parser.add_argument('--extract-tables', action='store_true',
help='Extract tables from PDF (Priority 2)')
parser.add_argument('--parallel', action='store_true',
help='Process pages in parallel (Priority 3)')
parser.add_argument('--workers', type=int, default=None,
help='Number of parallel workers (default: CPU count)')
parser.add_argument('--no-cache', action='store_true',
help='Disable caching of expensive operations')
parser.add_argument("--ocr", action="store_true", help="Use OCR for scanned PDFs (requires pytesseract)")
parser.add_argument("--password", type=str, default=None, help="Password for encrypted PDF")
parser.add_argument("--extract-tables", action="store_true", help="Extract tables from PDF (Priority 2)")
parser.add_argument("--parallel", action="store_true", help="Process pages in parallel (Priority 3)")
parser.add_argument("--workers", type=int, default=None, help="Number of parallel workers (default: CPU count)")
parser.add_argument("--no-cache", action="store_true", help="Disable caching of expensive operations")
args = parser.parse_args()
@@ -1052,8 +1076,8 @@ Examples:
print(f"❌ Error: File not found: {args.pdf_file}")
sys.exit(1)
if not args.pdf_file.lower().endswith('.pdf'):
print(f"⚠️ Warning: File does not have .pdf extension")
if not args.pdf_file.lower().endswith(".pdf"):
print("⚠️ Warning: File does not have .pdf extension")
# Extract
extractor = PDFExtractor(
@@ -1070,7 +1094,7 @@ Examples:
extract_tables=args.extract_tables,
parallel=args.parallel,
max_workers=args.workers,
use_cache=not args.no_cache
use_cache=not args.no_cache,
)
result = extractor.extract_all()
@@ -1080,7 +1104,7 @@ Examples:
# Output
if args.output:
# Save to file
with open(args.output, 'w', encoding='utf-8') as f:
with open(args.output, "w", encoding="utf-8") as f:
if args.pretty:
json.dump(result, f, indent=2, ensure_ascii=False)
else:
@@ -1094,5 +1118,5 @@ Examples:
print(json.dumps(result, ensure_ascii=False))
if __name__ == '__main__':
if __name__ == "__main__":
main()

View File

@@ -11,18 +11,18 @@ Usage:
python3 pdf_scraper.py --from-json manual_extracted.json
"""
import os
import sys
import json
import re
import argparse
import json
import os
import re
import sys
from pathlib import Path
# Import the PDF extractor
from .pdf_extractor_poc import PDFExtractor
def infer_description_from_pdf(pdf_metadata: dict = None, name: str = '') -> str:
def infer_description_from_pdf(pdf_metadata: dict = None, name: str = "") -> str:
"""
Infer skill description from PDF metadata or document properties.
@@ -39,22 +39,22 @@ def infer_description_from_pdf(pdf_metadata: dict = None, name: str = '') -> str
"""
if pdf_metadata:
# Try to use subject field (often contains description)
if 'subject' in pdf_metadata and pdf_metadata['subject']:
desc = str(pdf_metadata['subject']).strip()
if "subject" in pdf_metadata and pdf_metadata["subject"]:
desc = str(pdf_metadata["subject"]).strip()
if len(desc) > 20:
if len(desc) > 150:
desc = desc[:147] + '...'
return f'Use when {desc.lower()}'
desc = desc[:147] + "..."
return f"Use when {desc.lower()}"
# Try title field if meaningful
if 'title' in pdf_metadata and pdf_metadata['title']:
title = str(pdf_metadata['title']).strip()
if "title" in pdf_metadata and pdf_metadata["title"]:
title = str(pdf_metadata["title"]).strip()
# Skip if it's just the filename
if len(title) > 10 and not title.endswith('.pdf'):
return f'Use when working with {title.lower()}'
if len(title) > 10 and not title.endswith(".pdf"):
return f"Use when working with {title.lower()}"
# Improved fallback
return f'Use when referencing {name} documentation' if name else 'Use when referencing this documentation'
return f"Use when referencing {name} documentation" if name else "Use when referencing this documentation"
class PDFToSkillConverter:
@@ -62,20 +62,20 @@ class PDFToSkillConverter:
def __init__(self, config):
self.config = config
self.name = config['name']
self.pdf_path = config.get('pdf_path', '')
self.name = config["name"]
self.pdf_path = config.get("pdf_path", "")
# Set initial description (will be improved after extraction if metadata available)
self.description = config.get('description', f'Use when referencing {self.name} documentation')
self.description = config.get("description", f"Use when referencing {self.name} documentation")
# Paths
self.skill_dir = f"output/{self.name}"
self.data_file = f"output/{self.name}_extracted.json"
# Extraction options
self.extract_options = config.get('extract_options', {})
self.extract_options = config.get("extract_options", {})
# Categories
self.categories = config.get('categories', {})
self.categories = config.get("categories", {})
# Extracted data
self.extracted_data = None
@@ -88,11 +88,11 @@ class PDFToSkillConverter:
extractor = PDFExtractor(
self.pdf_path,
verbose=True,
chunk_size=self.extract_options.get('chunk_size', 10),
min_quality=self.extract_options.get('min_quality', 5.0),
extract_images=self.extract_options.get('extract_images', True),
chunk_size=self.extract_options.get("chunk_size", 10),
min_quality=self.extract_options.get("min_quality", 5.0),
extract_images=self.extract_options.get("extract_images", True),
image_dir=f"{self.skill_dir}/assets/images",
min_image_size=self.extract_options.get('min_image_size', 100)
min_image_size=self.extract_options.get("min_image_size", 100),
)
# Extract
@@ -103,7 +103,7 @@ class PDFToSkillConverter:
raise RuntimeError(f"Failed to extract PDF: {self.pdf_path}")
# Save extracted data
with open(self.data_file, 'w', encoding='utf-8') as f:
with open(self.data_file, "w", encoding="utf-8") as f:
json.dump(result, f, indent=2, ensure_ascii=False)
print(f"\n💾 Saved extracted data to: {self.data_file}")
@@ -114,7 +114,7 @@ class PDFToSkillConverter:
"""Load previously extracted data from JSON"""
print(f"\n📂 Loading extracted data from: {json_path}")
with open(json_path, 'r', encoding='utf-8') as f:
with open(json_path, encoding="utf-8") as f:
self.extracted_data = json.load(f)
print(f"✅ Loaded {self.extracted_data['total_pages']} pages")
@@ -122,28 +122,25 @@ class PDFToSkillConverter:
def categorize_content(self):
"""Categorize pages based on chapters or keywords"""
print(f"\n📋 Categorizing content...")
print("\n📋 Categorizing content...")
categorized = {}
# Use chapters if available
if self.extracted_data.get('chapters'):
for chapter in self.extracted_data['chapters']:
category_key = self._sanitize_filename(chapter['title'])
categorized[category_key] = {
'title': chapter['title'],
'pages': []
}
if self.extracted_data.get("chapters"):
for chapter in self.extracted_data["chapters"]:
category_key = self._sanitize_filename(chapter["title"])
categorized[category_key] = {"title": chapter["title"], "pages": []}
# Assign pages to chapters
for page in self.extracted_data['pages']:
page_num = page['page_number']
for page in self.extracted_data["pages"]:
page_num = page["page_number"]
# Find which chapter this page belongs to
for chapter in self.extracted_data['chapters']:
if chapter['start_page'] <= page_num <= chapter['end_page']:
category_key = self._sanitize_filename(chapter['title'])
categorized[category_key]['pages'].append(page)
for chapter in self.extracted_data["chapters"]:
if chapter["start_page"] <= page_num <= chapter["end_page"]:
category_key = self._sanitize_filename(chapter["title"])
categorized[category_key]["pages"].append(page)
break
# Fall back to keyword-based categorization
@@ -154,31 +151,28 @@ class PDFToSkillConverter:
if isinstance(first_value, list) and first_value and isinstance(first_value[0], dict):
# Already categorized - convert to expected format
for cat_key, pages in self.categories.items():
categorized[cat_key] = {
'title': cat_key.replace('_', ' ').title(),
'pages': pages
}
categorized[cat_key] = {"title": cat_key.replace("_", " ").title(), "pages": pages}
else:
# Keyword-based categorization
# Initialize categories
for cat_key, keywords in self.categories.items():
categorized[cat_key] = {
'title': cat_key.replace('_', ' ').title(),
'pages': []
}
categorized[cat_key] = {"title": cat_key.replace("_", " ").title(), "pages": []}
# Categorize by keywords
for page in self.extracted_data['pages']:
text = page.get('text', '').lower()
headings_text = ' '.join([h['text'] for h in page.get('headings', [])]).lower()
for page in self.extracted_data["pages"]:
text = page.get("text", "").lower()
headings_text = " ".join([h["text"] for h in page.get("headings", [])]).lower()
# Score against each category
scores = {}
for cat_key, keywords in self.categories.items():
# Handle both string keywords and dict keywords (shouldn't happen, but be safe)
if isinstance(keywords, list):
score = sum(1 for kw in keywords
if isinstance(kw, str) and (kw.lower() in text or kw.lower() in headings_text))
score = sum(
1
for kw in keywords
if isinstance(kw, str) and (kw.lower() in text or kw.lower() in headings_text)
)
else:
score = 0
if score > 0:
@@ -187,19 +181,16 @@ class PDFToSkillConverter:
# Assign to highest scoring category
if scores:
best_cat = max(scores, key=scores.get)
categorized[best_cat]['pages'].append(page)
categorized[best_cat]["pages"].append(page)
else:
# Default category
if 'other' not in categorized:
categorized['other'] = {'title': 'Other', 'pages': []}
categorized['other']['pages'].append(page)
if "other" not in categorized:
categorized["other"] = {"title": "Other", "pages": []}
categorized["other"]["pages"].append(page)
else:
# No categorization - use single category
categorized['content'] = {
'title': 'Content',
'pages': self.extracted_data['pages']
}
categorized["content"] = {"title": "Content", "pages": self.extracted_data["pages"]}
print(f"✅ Created {len(categorized)} categories")
for cat_key, cat_data in categorized.items():
@@ -220,7 +211,7 @@ class PDFToSkillConverter:
categorized = self.categorize_content()
# Generate reference files
print(f"\n📝 Generating reference files...")
print("\n📝 Generating reference files...")
for cat_key, cat_data in categorized.items():
self._generate_reference_file(cat_key, cat_data)
@@ -237,42 +228,42 @@ class PDFToSkillConverter:
"""Generate a reference markdown file for a category"""
filename = f"{self.skill_dir}/references/{cat_key}.md"
with open(filename, 'w', encoding='utf-8') as f:
with open(filename, "w", encoding="utf-8") as f:
f.write(f"# {cat_data['title']}\n\n")
for page in cat_data['pages']:
for page in cat_data["pages"]:
# Add headings as section markers
if page.get('headings'):
if page.get("headings"):
f.write(f"## {page['headings'][0]['text']}\n\n")
# Add text content
if page.get('text'):
if page.get("text"):
# Limit to first 1000 chars per page to avoid huge files
text = page['text'][:1000]
text = page["text"][:1000]
f.write(f"{text}\n\n")
# Add code samples (check both 'code_samples' and 'code_blocks' for compatibility)
code_list = page.get('code_samples') or page.get('code_blocks')
code_list = page.get("code_samples") or page.get("code_blocks")
if code_list:
f.write("### Code Examples\n\n")
for code in code_list[:3]: # Limit to top 3
lang = code.get('language', '')
lang = code.get("language", "")
f.write(f"```{lang}\n{code['code']}\n```\n\n")
# Add images
if page.get('images'):
if page.get("images"):
# Create assets directory if needed
assets_dir = os.path.join(self.skill_dir, 'assets')
assets_dir = os.path.join(self.skill_dir, "assets")
os.makedirs(assets_dir, exist_ok=True)
f.write("### Images\n\n")
for img in page['images']:
for img in page["images"]:
# Save image to assets
img_filename = f"page_{page['page_number']}_img_{img['index']}.png"
img_path = os.path.join(assets_dir, img_filename)
with open(img_path, 'wb') as img_file:
img_file.write(img['data'])
with open(img_path, "wb") as img_file:
img_file.write(img["data"])
# Add markdown image reference
f.write(f"![Image {img['index']}](../assets/{img_filename})\n\n")
@@ -285,16 +276,16 @@ class PDFToSkillConverter:
"""Generate reference index"""
filename = f"{self.skill_dir}/references/index.md"
with open(filename, 'w', encoding='utf-8') as f:
with open(filename, "w", encoding="utf-8") as f:
f.write(f"# {self.name.title()} Documentation Reference\n\n")
f.write("## Categories\n\n")
for cat_key, cat_data in categorized.items():
page_count = len(cat_data['pages'])
page_count = len(cat_data["pages"])
f.write(f"- [{cat_data['title']}]({cat_key}.md) ({page_count} pages)\n")
f.write("\n## Statistics\n\n")
stats = self.extracted_data.get('quality_statistics', {})
stats = self.extracted_data.get("quality_statistics", {})
f.write(f"- Total pages: {self.extracted_data.get('total_pages', 0)}\n")
f.write(f"- Code blocks: {self.extracted_data.get('total_code_blocks', 0)}\n")
f.write(f"- Images: {self.extracted_data.get('total_images', 0)}\n")
@@ -309,37 +300,37 @@ class PDFToSkillConverter:
filename = f"{self.skill_dir}/SKILL.md"
# Generate skill name (lowercase, hyphens only, max 64 chars)
skill_name = self.name.lower().replace('_', '-').replace(' ', '-')[:64]
skill_name = self.name.lower().replace("_", "-").replace(" ", "-")[:64]
# Truncate description to 1024 chars if needed
desc = self.description[:1024] if len(self.description) > 1024 else self.description
with open(filename, 'w', encoding='utf-8') as f:
with open(filename, "w", encoding="utf-8") as f:
# Write YAML frontmatter
f.write(f"---\n")
f.write("---\n")
f.write(f"name: {skill_name}\n")
f.write(f"description: {desc}\n")
f.write(f"---\n\n")
f.write("---\n\n")
f.write(f"# {self.name.title()} Documentation Skill\n\n")
f.write(f"{self.description}\n\n")
# Enhanced "When to Use" section
f.write("## 💡 When to Use This Skill\n\n")
f.write(f"Use this skill when you need to:\n")
f.write("Use this skill when you need to:\n")
f.write(f"- Understand {self.name} concepts and fundamentals\n")
f.write(f"- Look up API references and technical specifications\n")
f.write(f"- Find code examples and implementation patterns\n")
f.write(f"- Review tutorials, guides, and best practices\n")
f.write(f"- Explore the complete documentation structure\n\n")
f.write("- Look up API references and technical specifications\n")
f.write("- Find code examples and implementation patterns\n")
f.write("- Review tutorials, guides, and best practices\n")
f.write("- Explore the complete documentation structure\n\n")
# Chapter Overview (PDF structure)
f.write("## 📖 Chapter Overview\n\n")
total_pages = self.extracted_data.get('total_pages', 0)
total_pages = self.extracted_data.get("total_pages", 0)
f.write(f"**Total Pages:** {total_pages}\n\n")
f.write("**Content Breakdown:**\n\n")
for cat_key, cat_data in categorized.items():
page_count = len(cat_data['pages'])
page_count = len(cat_data["pages"])
f.write(f"- **{cat_data['title']}**: {page_count} pages\n")
f.write("\n")
@@ -352,11 +343,11 @@ class PDFToSkillConverter:
# Enhanced code examples section (top 15, grouped by language)
all_code = []
for page in self.extracted_data['pages']:
all_code.extend(page.get('code_samples', []))
for page in self.extracted_data["pages"]:
all_code.extend(page.get("code_samples", []))
# Sort by quality and get top 15
all_code.sort(key=lambda x: x.get('quality_score', 0), reverse=True)
all_code.sort(key=lambda x: x.get("quality_score", 0), reverse=True)
top_code = all_code[:15]
if top_code:
@@ -366,7 +357,7 @@ class PDFToSkillConverter:
# Group by language
by_lang = {}
for code in top_code:
lang = code.get('language', 'unknown')
lang = code.get("language", "unknown")
if lang not in by_lang:
by_lang[lang] = []
by_lang[lang].append(code)
@@ -377,8 +368,8 @@ class PDFToSkillConverter:
f.write(f"### {lang.title()} Examples ({len(examples)})\n\n")
for i, code in enumerate(examples[:5], 1): # Top 5 per language
quality = code.get('quality_score', 0)
code_text = code.get('code', '')
quality = code.get("quality_score", 0)
code_text = code.get("code", "")
f.write(f"**Example {i}** (Quality: {quality:.1f}/10):\n\n")
f.write(f"```{lang}\n")
@@ -394,13 +385,13 @@ class PDFToSkillConverter:
# Statistics
f.write("## 📊 Documentation Statistics\n\n")
f.write(f"- **Total Pages**: {total_pages}\n")
total_code_blocks = self.extracted_data.get('total_code_blocks', 0)
total_code_blocks = self.extracted_data.get("total_code_blocks", 0)
f.write(f"- **Code Blocks**: {total_code_blocks}\n")
total_images = self.extracted_data.get('total_images', 0)
total_images = self.extracted_data.get("total_images", 0)
f.write(f"- **Images/Diagrams**: {total_images}\n")
# Language statistics
langs = self.extracted_data.get('languages_detected', {})
langs = self.extracted_data.get("languages_detected", {})
if langs:
f.write(f"- **Programming Languages**: {len(langs)}\n\n")
f.write("**Language Breakdown:**\n\n")
@@ -409,11 +400,11 @@ class PDFToSkillConverter:
f.write("\n")
# Quality metrics
quality_stats = self.extracted_data.get('quality_statistics', {})
quality_stats = self.extracted_data.get("quality_statistics", {})
if quality_stats:
avg_quality = quality_stats.get('average_quality', 0)
valid_blocks = quality_stats.get('valid_code_blocks', 0)
f.write(f"**Code Quality:**\n\n")
avg_quality = quality_stats.get("average_quality", 0)
valid_blocks = quality_stats.get("valid_code_blocks", 0)
f.write("**Code Quality:**\n\n")
f.write(f"- Average Quality Score: {avg_quality:.1f}/10\n")
f.write(f"- Valid Code Blocks: {valid_blocks}\n\n")
@@ -421,7 +412,7 @@ class PDFToSkillConverter:
f.write("## 🗺️ Navigation\n\n")
f.write("**Reference Files:**\n\n")
for cat_key, cat_data in categorized.items():
cat_file = self._sanitize_filename(cat_data['title'])
cat_file = self._sanitize_filename(cat_data["title"])
f.write(f"- `references/{cat_file}.md` - {cat_data['title']}\n")
f.write("\n")
f.write("See `references/index.md` for complete documentation structure.\n\n")
@@ -430,18 +421,18 @@ class PDFToSkillConverter:
f.write("---\n\n")
f.write("**Generated by Skill Seeker** | PDF Documentation Scraper\n")
line_count = len(open(filename, 'r', encoding='utf-8').read().split('\n'))
line_count = len(open(filename, encoding="utf-8").read().split("\n"))
print(f" Generated: {filename} ({line_count} lines)")
def _format_key_concepts(self) -> str:
"""Extract key concepts from headings across all pages."""
all_headings = []
for page in self.extracted_data.get('pages', []):
headings = page.get('headings', [])
for page in self.extracted_data.get("pages", []):
headings = page.get("headings", [])
for heading in headings:
text = heading.get('text', '').strip()
level = heading.get('level', 'h1')
text = heading.get("text", "").strip()
level = heading.get("level", "h1")
if text and len(text) > 3: # Skip very short headings
all_headings.append((level, text))
@@ -452,8 +443,8 @@ class PDFToSkillConverter:
content += "*Main topics covered in this documentation*\n\n"
# Group by level and show top concepts
h1_headings = [text for level, text in all_headings if level == 'h1']
h2_headings = [text for level, text in all_headings if level == 'h2']
h1_headings = [text for level, text in all_headings if level == "h1"]
h2_headings = [text for level, text in all_headings if level == "h2"]
if h1_headings:
content += "**Major Topics:**\n\n"
@@ -475,27 +466,31 @@ class PDFToSkillConverter:
patterns = []
# Simple pattern extraction from headings and emphasized text
for page in self.extracted_data.get('pages', []):
text = page.get('text', '')
headings = page.get('headings', [])
for page in self.extracted_data.get("pages", []):
text = page.get("text", "")
headings = page.get("headings", [])
# Look for common pattern keywords in headings
pattern_keywords = [
'getting started', 'installation', 'configuration',
'usage', 'api', 'examples', 'tutorial', 'guide',
'best practices', 'troubleshooting', 'faq'
"getting started",
"installation",
"configuration",
"usage",
"api",
"examples",
"tutorial",
"guide",
"best practices",
"troubleshooting",
"faq",
]
for heading in headings:
heading_text = heading.get('text', '').lower()
heading_text = heading.get("text", "").lower()
for keyword in pattern_keywords:
if keyword in heading_text:
page_num = page.get('page_number', 0)
patterns.append({
'type': keyword.title(),
'heading': heading.get('text', ''),
'page': page_num
})
page_num = page.get("page_number", 0)
patterns.append({"type": keyword.title(), "heading": heading.get("text", ""), "page": page_num})
break # Only add once per heading
if not patterns:
@@ -506,7 +501,7 @@ class PDFToSkillConverter:
# Group by type
by_type = {}
for pattern in patterns:
ptype = pattern['type']
ptype = pattern["type"]
if ptype not in by_type:
by_type[ptype] = []
by_type[ptype].append(pattern)
@@ -524,22 +519,21 @@ class PDFToSkillConverter:
def _sanitize_filename(self, name):
"""Convert string to safe filename"""
# Remove special chars, replace spaces with underscores
safe = re.sub(r'[^\w\s-]', '', name.lower())
safe = re.sub(r'[-\s]+', '_', safe)
safe = re.sub(r"[^\w\s-]", "", name.lower())
safe = re.sub(r"[-\s]+", "_", safe)
return safe
def main():
parser = argparse.ArgumentParser(
description='Convert PDF documentation to Claude skill',
formatter_class=argparse.RawDescriptionHelpFormatter
description="Convert PDF documentation to Claude skill", formatter_class=argparse.RawDescriptionHelpFormatter
)
parser.add_argument('--config', help='PDF config JSON file')
parser.add_argument('--pdf', help='Direct PDF file path')
parser.add_argument('--name', help='Skill name (with --pdf)')
parser.add_argument('--from-json', help='Build skill from extracted JSON')
parser.add_argument('--description', help='Skill description')
parser.add_argument("--config", help="PDF config JSON file")
parser.add_argument("--pdf", help="Direct PDF file path")
parser.add_argument("--name", help="Skill name (with --pdf)")
parser.add_argument("--from-json", help="Build skill from extracted JSON")
parser.add_argument("--description", help="Skill description")
args = parser.parse_args()
@@ -549,15 +543,12 @@ def main():
# Load or create config
if args.config:
with open(args.config, 'r') as f:
with open(args.config) as f:
config = json.load(f)
elif args.from_json:
# Build from extracted JSON
name = Path(args.from_json).stem.replace('_extracted', '')
config = {
'name': name,
'description': args.description or f'Use when referencing {name} documentation'
}
name = Path(args.from_json).stem.replace("_extracted", "")
config = {"name": name, "description": args.description or f"Use when referencing {name} documentation"}
converter = PDFToSkillConverter(config)
converter.load_extracted_data(args.from_json)
converter.build_skill()
@@ -567,22 +558,17 @@ def main():
if not args.name:
parser.error("Must specify --name with --pdf")
config = {
'name': args.name,
'pdf_path': args.pdf,
'description': args.description or f'Use when referencing {args.name} documentation',
'extract_options': {
'chunk_size': 10,
'min_quality': 5.0,
'extract_images': True,
'min_image_size': 100
}
"name": args.name,
"pdf_path": args.pdf,
"description": args.description or f"Use when referencing {args.name} documentation",
"extract_options": {"chunk_size": 10, "min_quality": 5.0, "extract_images": True, "min_image_size": 100},
}
# Create converter
converter = PDFToSkillConverter(config)
# Extract if needed
if config.get('pdf_path'):
if config.get("pdf_path"):
if not converter.extract_pdf():
sys.exit(1)
@@ -590,5 +576,5 @@ def main():
converter.build_skill()
if __name__ == '__main__':
if __name__ == "__main__":
main()

View File

@@ -8,44 +8,44 @@ Usage:
python3 quality_checker.py output/godot/ --verbose
"""
import os
import re
import sys
from pathlib import Path
from typing import Dict, List, Tuple, Optional
from dataclasses import dataclass, field
from pathlib import Path
@dataclass
class QualityIssue:
"""Represents a quality issue found during validation."""
level: str # 'error', 'warning', 'info'
category: str # 'enhancement', 'content', 'links', 'structure'
message: str
file: Optional[str] = None
line: Optional[int] = None
file: str | None = None
line: int | None = None
@dataclass
class QualityReport:
"""Complete quality report for a skill."""
skill_name: str
skill_path: Path
errors: List[QualityIssue] = field(default_factory=list)
warnings: List[QualityIssue] = field(default_factory=list)
info: List[QualityIssue] = field(default_factory=list)
errors: list[QualityIssue] = field(default_factory=list)
warnings: list[QualityIssue] = field(default_factory=list)
info: list[QualityIssue] = field(default_factory=list)
def add_error(self, category: str, message: str, file: str | None = None, line: int | None = None) -> None:
    """Add an error to the report.

    Args:
        category: Issue category label (e.g. 'structure', 'content').
        message: Human-readable description of the problem.
        file: Optional file the issue refers to.
        line: Optional line number within ``file``.
    """
    # Append exactly once; each call must yield a single entry in ``errors``.
    self.errors.append(QualityIssue("error", category, message, file, line))
def add_warning(self, category: str, message: str, file: str | None = None, line: int | None = None) -> None:
    """Add a warning to the report.

    Args:
        category: Issue category label (e.g. 'structure', 'content').
        message: Human-readable description of the concern.
        file: Optional file the issue refers to.
        line: Optional line number within ``file``.
    """
    # Append exactly once; each call must yield a single entry in ``warnings``.
    self.warnings.append(QualityIssue("warning", category, message, file, line))
def add_info(self, category: str, message: str, file: str | None = None, line: int | None = None) -> None:
    """Add info to the report.

    Args:
        category: Issue category label (e.g. 'enhancement', 'completeness').
        message: Human-readable informational note.
        file: Optional file the note refers to.
        line: Optional line number within ``file``.
    """
    # Append exactly once; each call must yield a single entry in ``info``.
    self.info.append(QualityIssue("info", category, message, file, line))
@property
def has_errors(self) -> bool:
@@ -80,15 +80,15 @@ class QualityReport:
"""Get quality grade (A-F)."""
score = self.quality_score
if score >= 90:
return 'A'
return "A"
elif score >= 80:
return 'B'
return "B"
elif score >= 70:
return 'C'
return "C"
elif score >= 60:
return 'D'
return "D"
else:
return 'F'
return "F"
class SkillQualityChecker:
@@ -103,10 +103,7 @@ class SkillQualityChecker:
self.skill_dir = Path(skill_dir)
self.skill_md_path = self.skill_dir / "SKILL.md"
self.references_dir = self.skill_dir / "references"
self.report = QualityReport(
skill_name=self.skill_dir.name,
skill_path=self.skill_dir
)
self.report = QualityReport(skill_name=self.skill_dir.name, skill_path=self.skill_dir)
def check_all(self) -> QualityReport:
"""Run all quality checks and return report.
@@ -135,25 +132,19 @@ class SkillQualityChecker:
"""Check basic skill structure."""
# Check SKILL.md exists
if not self.skill_md_path.exists():
self.report.add_error(
'structure',
'SKILL.md file not found',
str(self.skill_md_path)
)
self.report.add_error("structure", "SKILL.md file not found", str(self.skill_md_path))
return
# Check references directory exists
if not self.references_dir.exists():
self.report.add_warning(
'structure',
'references/ directory not found - skill may be incomplete',
str(self.references_dir)
"structure", "references/ directory not found - skill may be incomplete", str(self.references_dir)
)
elif not list(self.references_dir.rglob('*.md')):
elif not list(self.references_dir.rglob("*.md")):
self.report.add_warning(
'structure',
'references/ directory is empty - no reference documentation found',
str(self.references_dir)
"structure",
"references/ directory is empty - no reference documentation found",
str(self.references_dir),
)
def _check_enhancement_quality(self):
@@ -161,7 +152,7 @@ class SkillQualityChecker:
if not self.skill_md_path.exists():
return
content = self.skill_md_path.read_text(encoding='utf-8')
content = self.skill_md_path.read_text(encoding="utf-8")
# Check for template indicators (signs it wasn't enhanced)
template_indicators = [
@@ -174,140 +165,90 @@ class SkillQualityChecker:
for indicator in template_indicators:
if indicator.lower() in content.lower():
self.report.add_warning(
'enhancement',
"enhancement",
f'Found template placeholder: "{indicator}" - SKILL.md may not be enhanced',
'SKILL.md'
"SKILL.md",
)
# Check for good signs of enhancement
enhancement_indicators = {
'code_examples': re.compile(r'```[\w-]+\n', re.MULTILINE),
'real_examples': re.compile(r'Example:', re.IGNORECASE),
'sections': re.compile(r'^## .+', re.MULTILINE),
"code_examples": re.compile(r"```[\w-]+\n", re.MULTILINE),
"real_examples": re.compile(r"Example:", re.IGNORECASE),
"sections": re.compile(r"^## .+", re.MULTILINE),
}
code_blocks = len(enhancement_indicators['code_examples'].findall(content))
real_examples = len(enhancement_indicators['real_examples'].findall(content))
sections = len(enhancement_indicators['sections'].findall(content))
code_blocks = len(enhancement_indicators["code_examples"].findall(content))
real_examples = len(enhancement_indicators["real_examples"].findall(content))
sections = len(enhancement_indicators["sections"].findall(content))
# Quality thresholds
if code_blocks == 0:
self.report.add_warning(
'enhancement',
'No code examples found in SKILL.md - consider enhancing',
'SKILL.md'
"enhancement", "No code examples found in SKILL.md - consider enhancing", "SKILL.md"
)
elif code_blocks < 3:
self.report.add_info(
'enhancement',
f'Only {code_blocks} code examples found - more examples would improve quality',
'SKILL.md'
"enhancement",
f"Only {code_blocks} code examples found - more examples would improve quality",
"SKILL.md",
)
else:
self.report.add_info(
'enhancement',
f'✓ Found {code_blocks} code examples',
'SKILL.md'
)
self.report.add_info("enhancement", f"✓ Found {code_blocks} code examples", "SKILL.md")
if sections < 4:
self.report.add_warning(
'enhancement',
f'Only {sections} sections found - SKILL.md may be too basic',
'SKILL.md'
"enhancement", f"Only {sections} sections found - SKILL.md may be too basic", "SKILL.md"
)
else:
self.report.add_info(
'enhancement',
f'✓ Found {sections} sections',
'SKILL.md'
)
self.report.add_info("enhancement", f"✓ Found {sections} sections", "SKILL.md")
def _check_content_quality(self):
"""Check content quality."""
if not self.skill_md_path.exists():
return
content = self.skill_md_path.read_text(encoding='utf-8')
content = self.skill_md_path.read_text(encoding="utf-8")
# Check YAML frontmatter
if not content.startswith('---'):
self.report.add_error(
'content',
'Missing YAML frontmatter - SKILL.md must start with ---',
'SKILL.md',
1
)
if not content.startswith("---"):
self.report.add_error("content", "Missing YAML frontmatter - SKILL.md must start with ---", "SKILL.md", 1)
else:
# Extract frontmatter
try:
frontmatter_match = re.match(r'^---\n(.*?)\n---', content, re.DOTALL)
frontmatter_match = re.match(r"^---\n(.*?)\n---", content, re.DOTALL)
if frontmatter_match:
frontmatter = frontmatter_match.group(1)
# Check for required fields
if 'name:' not in frontmatter:
self.report.add_error(
'content',
'Missing "name:" field in YAML frontmatter',
'SKILL.md',
2
)
if "name:" not in frontmatter:
self.report.add_error("content", 'Missing "name:" field in YAML frontmatter', "SKILL.md", 2)
# Check for description
if 'description:' in frontmatter:
self.report.add_info(
'content',
'✓ YAML frontmatter includes description',
'SKILL.md'
)
if "description:" in frontmatter:
self.report.add_info("content", "✓ YAML frontmatter includes description", "SKILL.md")
else:
self.report.add_error(
'content',
'Invalid YAML frontmatter format',
'SKILL.md',
1
)
self.report.add_error("content", "Invalid YAML frontmatter format", "SKILL.md", 1)
except Exception as e:
self.report.add_error(
'content',
f'Error parsing YAML frontmatter: {e}',
'SKILL.md',
1
)
self.report.add_error("content", f"Error parsing YAML frontmatter: {e}", "SKILL.md", 1)
# Check code block language tags
code_blocks_without_lang = re.findall(r'```\n[^`]', content)
code_blocks_without_lang = re.findall(r"```\n[^`]", content)
if code_blocks_without_lang:
self.report.add_warning(
'content',
f'Found {len(code_blocks_without_lang)} code blocks without language tags',
'SKILL.md'
"content", f"Found {len(code_blocks_without_lang)} code blocks without language tags", "SKILL.md"
)
# Check for "When to Use" section
if 'when to use' not in content.lower():
self.report.add_warning(
'content',
'Missing "When to Use This Skill" section',
'SKILL.md'
)
if "when to use" not in content.lower():
self.report.add_warning("content", 'Missing "When to Use This Skill" section', "SKILL.md")
else:
self.report.add_info(
'content',
'✓ Found "When to Use" section',
'SKILL.md'
)
self.report.add_info("content", '✓ Found "When to Use" section', "SKILL.md")
# Check reference files
if self.references_dir.exists():
ref_files = list(self.references_dir.rglob('*.md'))
ref_files = list(self.references_dir.rglob("*.md"))
if ref_files:
self.report.add_info(
'content',
f'✓ Found {len(ref_files)} reference files',
'references/'
)
self.report.add_info("content", f"✓ Found {len(ref_files)} reference files", "references/")
# Check if references are mentioned in SKILL.md
mentioned_refs = 0
@@ -317,9 +258,7 @@ class SkillQualityChecker:
if mentioned_refs == 0:
self.report.add_warning(
'content',
'Reference files exist but none are mentioned in SKILL.md',
'SKILL.md'
"content", "Reference files exist but none are mentioned in SKILL.md", "SKILL.md"
)
def _check_links(self):
@@ -327,21 +266,21 @@ class SkillQualityChecker:
if not self.skill_md_path.exists():
return
content = self.skill_md_path.read_text(encoding='utf-8')
content = self.skill_md_path.read_text(encoding="utf-8")
# Find all markdown links [text](path)
link_pattern = re.compile(r'\[([^\]]+)\]\(([^)]+)\)')
link_pattern = re.compile(r"\[([^\]]+)\]\(([^)]+)\)")
links = link_pattern.findall(content)
broken_links = []
for text, link in links:
# Skip external links (http/https)
if link.startswith('http://') or link.startswith('https://'):
if link.startswith("http://") or link.startswith("https://"):
continue
# Skip anchor links
if link.startswith('#'):
if link.startswith("#"):
continue
# Check if file exists (relative to SKILL.md)
@@ -351,20 +290,12 @@ class SkillQualityChecker:
if broken_links:
for text, link in broken_links:
self.report.add_warning(
'links',
f'Broken link: [{text}]({link})',
'SKILL.md'
)
self.report.add_warning("links", f"Broken link: [{text}]({link})", "SKILL.md")
else:
if links:
internal_links = [l for t, l in links if not l.startswith('http')]
internal_links = [l for t, l in links if not l.startswith("http")]
if internal_links:
self.report.add_info(
'links',
f'✓ All {len(internal_links)} internal links are valid',
'SKILL.md'
)
self.report.add_info("links", f"✓ All {len(internal_links)} internal links are valid", "SKILL.md")
def _check_skill_completeness(self):
"""Check skill completeness based on best practices.
@@ -375,83 +306,61 @@ class SkillQualityChecker:
if not self.skill_md_path.exists():
return
content = self.skill_md_path.read_text(encoding='utf-8')
content = self.skill_md_path.read_text(encoding="utf-8")
# Check for grounding/verification section (prerequisites)
grounding_patterns = [
r'before\s+(executing|running|proceeding|you\s+start)',
r'verify\s+that',
r'prerequisites?',
r'requirements?:',
r'make\s+sure\s+you\s+have',
r"before\s+(executing|running|proceeding|you\s+start)",
r"verify\s+that",
r"prerequisites?",
r"requirements?:",
r"make\s+sure\s+you\s+have",
]
has_grounding = any(
re.search(pattern, content, re.IGNORECASE)
for pattern in grounding_patterns
)
has_grounding = any(re.search(pattern, content, re.IGNORECASE) for pattern in grounding_patterns)
if has_grounding:
self.report.add_info(
'completeness',
'✓ Found verification/prerequisites section',
'SKILL.md'
)
self.report.add_info("completeness", "✓ Found verification/prerequisites section", "SKILL.md")
else:
self.report.add_info(
'completeness',
'Consider adding prerequisites section - helps Claude verify conditions first',
'SKILL.md'
"completeness",
"Consider adding prerequisites section - helps Claude verify conditions first",
"SKILL.md",
)
# Check for error handling/troubleshooting guidance
error_patterns = [
r'if\s+.*\s+(fails?|errors?)',
r'troubleshoot',
r'common\s+(issues?|problems?)',
r'error\s+handling',
r'when\s+things\s+go\s+wrong',
r"if\s+.*\s+(fails?|errors?)",
r"troubleshoot",
r"common\s+(issues?|problems?)",
r"error\s+handling",
r"when\s+things\s+go\s+wrong",
]
has_error_handling = any(
re.search(pattern, content, re.IGNORECASE)
for pattern in error_patterns
)
has_error_handling = any(re.search(pattern, content, re.IGNORECASE) for pattern in error_patterns)
if has_error_handling:
self.report.add_info(
'completeness',
'✓ Found error handling/troubleshooting guidance',
'SKILL.md'
)
self.report.add_info("completeness", "✓ Found error handling/troubleshooting guidance", "SKILL.md")
else:
self.report.add_info(
'completeness',
'Consider adding troubleshooting section for common issues',
'SKILL.md'
"completeness", "Consider adding troubleshooting section for common issues", "SKILL.md"
)
# Check for workflow steps (numbered or sequential indicators)
step_patterns = [
r'step\s+\d',
r'##\s+\d\.',
r'first,?\s+',
r'then,?\s+',
r'finally,?\s+',
r'next,?\s+',
r"step\s+\d",
r"##\s+\d\.",
r"first,?\s+",
r"then,?\s+",
r"finally,?\s+",
r"next,?\s+",
]
steps_found = sum(
1 for pattern in step_patterns
if re.search(pattern, content, re.IGNORECASE)
)
steps_found = sum(1 for pattern in step_patterns if re.search(pattern, content, re.IGNORECASE))
if steps_found >= 3:
self.report.add_info(
'completeness',
f'✓ Found clear workflow indicators ({steps_found} step markers)',
'SKILL.md'
"completeness", f"✓ Found clear workflow indicators ({steps_found} step markers)", "SKILL.md"
)
elif steps_found > 0:
self.report.add_info(
'completeness',
f'Some workflow guidance found ({steps_found} markers) - '
'consider adding numbered steps for clarity',
'SKILL.md'
"completeness",
f"Some workflow guidance found ({steps_found} markers) - consider adding numbered steps for clarity",
"SKILL.md",
)
@@ -475,7 +384,13 @@ def print_report(report: QualityReport, verbose: bool = False):
if report.errors:
print(f"❌ ERRORS ({len(report.errors)}):")
for issue in report.errors:
location = f" ({issue.file}:{issue.line})" if issue.file and issue.line else f" ({issue.file})" if issue.file else ""
location = (
f" ({issue.file}:{issue.line})"
if issue.file and issue.line
else f" ({issue.file})"
if issue.file
else ""
)
print(f" [{issue.category}] {issue.message}{location}")
print()
@@ -483,7 +398,13 @@ def print_report(report: QualityReport, verbose: bool = False):
if report.warnings:
print(f"⚠️ WARNINGS ({len(report.warnings)}):")
for issue in report.warnings:
location = f" ({issue.file}:{issue.line})" if issue.file and issue.line else f" ({issue.file})" if issue.file else ""
location = (
f" ({issue.file}:{issue.line})"
if issue.file and issue.line
else f" ({issue.file})"
if issue.file
else ""
)
print(f" [{issue.category}] {issue.message}{location}")
print()
@@ -523,25 +444,14 @@ Examples:
# Exit with error code if issues found
python3 quality_checker.py output/django/ --strict
"""
""",
)
parser.add_argument(
'skill_directory',
help='Path to skill directory (e.g., output/react/)'
)
parser.add_argument("skill_directory", help="Path to skill directory (e.g., output/react/)")
parser.add_argument(
'--verbose', '-v',
action='store_true',
help='Show all info messages'
)
parser.add_argument("--verbose", "-v", action="store_true", help="Show all info messages")
parser.add_argument(
'--strict',
action='store_true',
help='Exit with error code if any warnings or errors found'
)
parser.add_argument("--strict", action="store_true", help="Exit with error code if any warnings or errors found")
args = parser.parse_args()
@@ -559,9 +469,7 @@ Examples:
print_report(report, verbose=args.verbose)
# Exit code
if args.strict and (report.has_errors or report.has_warnings):
sys.exit(1)
elif report.has_errors:
if args.strict and (report.has_errors or report.has_warnings) or report.has_errors:
sys.exit(1)
else:
sys.exit(0)

View File

@@ -9,16 +9,19 @@ Handles GitHub API rate limits with smart strategies:
- Non-interactive mode for CI/CD
"""
import time
import sys
from datetime import datetime, timedelta
from typing import Optional, Dict, Any
import time
from datetime import datetime
from typing import Any
import requests
from .config_manager import get_config_manager
class RateLimitError(Exception):
    """Raised when rate limit is exceeded and cannot be handled."""
@@ -43,10 +46,10 @@ class RateLimitHandler:
def __init__(
self,
token: Optional[str] = None,
token: str | None = None,
interactive: bool = True,
profile_name: Optional[str] = None,
auto_switch: bool = True
profile_name: str | None = None,
auto_switch: bool = True,
):
"""
Initialize rate limit handler.
@@ -91,7 +94,7 @@ class RateLimitHandler:
if self.interactive:
response = input("Continue without token? [Y/n]: ").strip().lower()
if response in ['n', 'no']:
if response in ["n", "no"]:
print("\n✅ Run 'skill-seekers config --github' to set up a token.\n")
return False
@@ -100,12 +103,12 @@ class RateLimitHandler:
# Check current rate limit status
try:
rate_info = self.get_rate_limit_info()
remaining = rate_info.get('remaining', 0)
limit = rate_info.get('limit', 5000)
remaining = rate_info.get("remaining", 0)
limit = rate_info.get("limit", 5000)
if remaining == 0:
print(f"\n⚠️ Warning: GitHub rate limit already exhausted (0/{limit})")
reset_time = rate_info.get('reset_time')
reset_time = rate_info.get("reset_time")
if reset_time:
wait_minutes = (reset_time - datetime.now()).total_seconds() / 60
print(f" Resets in {int(wait_minutes)} minutes")
@@ -146,9 +149,9 @@ class RateLimitHandler:
if response.status_code == 403:
try:
error_data = response.json()
message = error_data.get('message', '')
message = error_data.get("message", "")
if 'rate limit' in message.lower() or 'api rate limit exceeded' in message.lower():
if "rate limit" in message.lower() or "api rate limit exceeded" in message.lower():
# Extract rate limit info from headers
rate_info = self.extract_rate_limit_info(response)
return self.handle_rate_limit(rate_info)
@@ -158,7 +161,7 @@ class RateLimitHandler:
return True
def extract_rate_limit_info(self, response: requests.Response) -> Dict[str, Any]:
def extract_rate_limit_info(self, response: requests.Response) -> dict[str, Any]:
"""
Extract rate limit information from response headers.
@@ -170,20 +173,15 @@ class RateLimitHandler:
"""
headers = response.headers
limit = int(headers.get('X-RateLimit-Limit', 0))
remaining = int(headers.get('X-RateLimit-Remaining', 0))
reset_timestamp = int(headers.get('X-RateLimit-Reset', 0))
limit = int(headers.get("X-RateLimit-Limit", 0))
remaining = int(headers.get("X-RateLimit-Remaining", 0))
reset_timestamp = int(headers.get("X-RateLimit-Reset", 0))
reset_time = datetime.fromtimestamp(reset_timestamp) if reset_timestamp else None
return {
'limit': limit,
'remaining': remaining,
'reset_timestamp': reset_timestamp,
'reset_time': reset_time
}
return {"limit": limit, "remaining": remaining, "reset_timestamp": reset_timestamp, "reset_time": reset_time}
def get_rate_limit_info(self) -> Dict[str, Any]:
def get_rate_limit_info(self) -> dict[str, Any]:
"""
Get current rate limit status from GitHub API.
@@ -193,25 +191,25 @@ class RateLimitHandler:
url = "https://api.github.com/rate_limit"
headers = {}
if self.token:
headers['Authorization'] = f'token {self.token}'
headers["Authorization"] = f"token {self.token}"
response = requests.get(url, headers=headers, timeout=5)
response.raise_for_status()
data = response.json()
core = data.get('rate', {})
core = data.get("rate", {})
reset_timestamp = core.get('reset', 0)
reset_timestamp = core.get("reset", 0)
reset_time = datetime.fromtimestamp(reset_timestamp) if reset_timestamp else None
return {
'limit': core.get('limit', 0),
'remaining': core.get('remaining', 0),
'reset_timestamp': reset_timestamp,
'reset_time': reset_time
"limit": core.get("limit", 0),
"remaining": core.get("remaining", 0),
"reset_timestamp": reset_timestamp,
"reset_time": reset_time,
}
def handle_rate_limit(self, rate_info: Dict[str, Any]) -> bool:
def handle_rate_limit(self, rate_info: dict[str, Any]) -> bool:
"""
Handle rate limit based on strategy.
@@ -224,11 +222,11 @@ class RateLimitHandler:
Raises:
RateLimitError: If cannot handle in non-interactive mode
"""
reset_time = rate_info.get('reset_time')
remaining = rate_info.get('remaining', 0)
limit = rate_info.get('limit', 0)
reset_time = rate_info.get("reset_time")
remaining = rate_info.get("remaining", 0)
limit = rate_info.get("limit", 0)
print(f"\n⚠️ GitHub Rate Limit Reached")
print("\n⚠️ GitHub Rate Limit Reached")
print(f" Profile: {self.profile_name or 'default'}")
print(f" Limit: {remaining}/{limit} requests")
@@ -294,8 +292,8 @@ class RateLimitHandler:
self.token = next_token
rate_info = self.get_rate_limit_info()
remaining = rate_info.get('remaining', 0)
limit = rate_info.get('limit', 0)
remaining = rate_info.get("remaining", 0)
limit = rate_info.get("limit", 0)
if remaining > 0:
print(f"✅ Profile '{next_name}' has {remaining}/{limit} requests available")
@@ -394,24 +392,24 @@ class RateLimitHandler:
while True:
choice = input("Select an option [w/s/t/c]: ").strip().lower()
if choice == 'w':
if choice == "w":
return self.wait_for_reset(wait_seconds, wait_minutes)
elif choice == 's':
elif choice == "s":
if self.try_switch_profile():
return True
else:
print("⚠️ Profile switching failed. Choose another option.")
continue
elif choice == 't':
elif choice == "t":
print("\n💡 Opening GitHub token setup...")
print(" Run this command in another terminal:")
print(" $ skill-seekers config --github\n")
print(" Then restart your scraping job.\n")
return False
elif choice == 'c':
elif choice == "c":
print("\n⏸️ Operation cancelled by user\n")
return False
@@ -419,7 +417,7 @@ class RateLimitHandler:
print("❌ Invalid choice. Please enter w, s, t, or c.")
def create_github_headers(token: Optional[str] = None) -> Dict[str, str]:
def create_github_headers(token: str | None = None) -> dict[str, str]:
"""
Create GitHub API headers with optional token.
@@ -431,5 +429,5 @@ def create_github_headers(token: Optional[str] = None) -> Dict[str, str]:
"""
headers = {}
if token:
headers['Authorization'] = f'token {token}'
headers["Authorization"] = f"token {token}"
return headers

View File

@@ -4,9 +4,9 @@ Resume Command for Skill Seekers
Allows users to resume interrupted scraping jobs from saved progress.
"""
import sys
import argparse
from typing import Optional
import sys
from .config_manager import get_config_manager
@@ -132,24 +132,10 @@ def clean_old_jobs():
def main():
"""Main entry point for resume command."""
parser = argparse.ArgumentParser(
description="Resume interrupted Skill Seekers jobs"
)
parser.add_argument(
"job_id",
nargs="?",
help="Job ID to resume"
)
parser.add_argument(
"--list",
action="store_true",
help="List all resumable jobs"
)
parser.add_argument(
"--clean",
action="store_true",
help="Clean up old progress files"
)
parser = argparse.ArgumentParser(description="Resume interrupted Skill Seekers jobs")
parser.add_argument("job_id", nargs="?", help="Job ID to resume")
parser.add_argument("--list", action="store_true", help="List all resumable jobs")
parser.add_argument("--clean", action="store_true", help="Clean up old progress files")
args = parser.parse_args()

View File

@@ -6,21 +6,18 @@ Runs all test suites and generates a comprehensive test report
import sys
import unittest
import os
from io import StringIO
from pathlib import Path
class ColoredTextTestResult(unittest.TextTestResult):
"""Custom test result class with colored output"""
# ANSI color codes
GREEN = '\033[92m'
RED = '\033[91m'
YELLOW = '\033[93m'
BLUE = '\033[94m'
RESET = '\033[0m'
BOLD = '\033[1m'
GREEN = "\033[92m"
RED = "\033[91m"
YELLOW = "\033[93m"
BLUE = "\033[94m"
RESET = "\033[0m"
BOLD = "\033[1m"
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
@@ -28,7 +25,7 @@ class ColoredTextTestResult(unittest.TextTestResult):
def addSuccess(self, test):
super().addSuccess(test)
self.test_results.append(('PASS', test))
self.test_results.append(("PASS", test))
if self.showAll:
self.stream.write(f"{self.GREEN}✓ PASS{self.RESET}\n")
elif self.dots:
@@ -37,7 +34,7 @@ class ColoredTextTestResult(unittest.TextTestResult):
def addError(self, test, err):
super().addError(test, err)
self.test_results.append(('ERROR', test))
self.test_results.append(("ERROR", test))
if self.showAll:
self.stream.write(f"{self.RED}✗ ERROR{self.RESET}\n")
elif self.dots:
@@ -46,7 +43,7 @@ class ColoredTextTestResult(unittest.TextTestResult):
def addFailure(self, test, err):
super().addFailure(test, err)
self.test_results.append(('FAIL', test))
self.test_results.append(("FAIL", test))
if self.showAll:
self.stream.write(f"{self.RED}✗ FAIL{self.RESET}\n")
elif self.dots:
@@ -55,7 +52,7 @@ class ColoredTextTestResult(unittest.TextTestResult):
def addSkip(self, test, reason):
super().addSkip(test, reason)
self.test_results.append(('SKIP', test))
self.test_results.append(("SKIP", test))
if self.showAll:
self.stream.write(f"{self.YELLOW}⊘ SKIP{self.RESET}\n")
elif self.dots:
@@ -65,14 +62,15 @@ class ColoredTextTestResult(unittest.TextTestResult):
class ColoredTextTestRunner(unittest.TextTestRunner):
"""Custom test runner with colored output"""
resultclass = ColoredTextTestResult
def discover_tests(test_dir='tests'):
def discover_tests(test_dir="tests"):
"""Discover all test files in the tests directory"""
loader = unittest.TestLoader()
start_dir = test_dir
pattern = 'test_*.py'
pattern = "test_*.py"
suite = loader.discover(start_dir, pattern=pattern)
return suite
@@ -83,9 +81,9 @@ def run_specific_suite(suite_name):
loader = unittest.TestLoader()
suite_map = {
'config': 'tests.test_config_validation',
'features': 'tests.test_scraper_features',
'integration': 'tests.test_integration'
"config": "tests.test_config_validation",
"features": "tests.test_scraper_features",
"integration": "tests.test_integration",
}
if suite_name not in suite_map:
@@ -110,9 +108,9 @@ def print_summary(result):
errors = len(result.errors)
skipped = len(result.skipped)
print("\n" + "="*70)
print("\n" + "=" * 70)
print("TEST SUMMARY")
print("="*70)
print("=" * 70)
# Overall stats
print(f"\n{ColoredTextTestResult.BOLD}Total Tests:{ColoredTextTestResult.RESET} {total}")
@@ -127,31 +125,35 @@ def print_summary(result):
# Success rate
if total > 0:
success_rate = (passed / total) * 100
color = ColoredTextTestResult.GREEN if success_rate == 100 else \
ColoredTextTestResult.YELLOW if success_rate >= 80 else \
ColoredTextTestResult.RED
color = (
ColoredTextTestResult.GREEN
if success_rate == 100
else ColoredTextTestResult.YELLOW
if success_rate >= 80
else ColoredTextTestResult.RED
)
print(f"\n{color}Success Rate: {success_rate:.1f}%{ColoredTextTestResult.RESET}")
# Category breakdown
if hasattr(result, 'test_results'):
if hasattr(result, "test_results"):
print(f"\n{ColoredTextTestResult.BOLD}Test Breakdown by Category:{ColoredTextTestResult.RESET}")
categories = {}
for status, test in result.test_results:
test_name = str(test)
# Extract test class name
if '.' in test_name:
class_name = test_name.split('.')[0].split()[-1]
if "." in test_name:
class_name = test_name.split(".")[0].split()[-1]
if class_name not in categories:
categories[class_name] = {'PASS': 0, 'FAIL': 0, 'ERROR': 0, 'SKIP': 0}
categories[class_name] = {"PASS": 0, "FAIL": 0, "ERROR": 0, "SKIP": 0}
categories[class_name][status] += 1
for category, stats in sorted(categories.items()):
total_cat = sum(stats.values())
passed_cat = stats['PASS']
passed_cat = stats["PASS"]
print(f" {category}: {passed_cat}/{total_cat} passed")
print("\n" + "="*70)
print("\n" + "=" * 70)
# Return status
return failed == 0 and errors == 0
@@ -162,20 +164,14 @@ def main():
import argparse
parser = argparse.ArgumentParser(
description='Run tests for Skill Seeker',
formatter_class=argparse.RawDescriptionHelpFormatter
description="Run tests for Skill Seeker", formatter_class=argparse.RawDescriptionHelpFormatter
)
parser.add_argument('--suite', '-s', type=str,
help='Run specific test suite (config, features, integration)')
parser.add_argument('--verbose', '-v', action='store_true',
help='Verbose output (show each test)')
parser.add_argument('--quiet', '-q', action='store_true',
help='Quiet output (minimal output)')
parser.add_argument('--failfast', '-f', action='store_true',
help='Stop on first failure')
parser.add_argument('--list', '-l', action='store_true',
help='List all available tests')
parser.add_argument("--suite", "-s", type=str, help="Run specific test suite (config, features, integration)")
parser.add_argument("--verbose", "-v", action="store_true", help="Verbose output (show each test)")
parser.add_argument("--quiet", "-q", action="store_true", help="Quiet output (minimal output)")
parser.add_argument("--failfast", "-f", action="store_true", help="Stop on first failure")
parser.add_argument("--list", "-l", action="store_true", help="List all available tests")
args = parser.parse_args()
@@ -186,9 +182,9 @@ def main():
elif args.quiet:
verbosity = 0
print(f"\n{ColoredTextTestResult.BOLD}{'='*70}{ColoredTextTestResult.RESET}")
print(f"\n{ColoredTextTestResult.BOLD}{'=' * 70}{ColoredTextTestResult.RESET}")
print(f"{ColoredTextTestResult.BOLD}SKILL SEEKER TEST SUITE{ColoredTextTestResult.RESET}")
print(f"{ColoredTextTestResult.BOLD}{'='*70}{ColoredTextTestResult.RESET}\n")
print(f"{ColoredTextTestResult.BOLD}{'=' * 70}{ColoredTextTestResult.RESET}\n")
# Discover or load specific suite
if args.suite:
@@ -210,10 +206,7 @@ def main():
return 0
# Run tests
runner = ColoredTextTestRunner(
verbosity=verbosity,
failfast=args.failfast
)
runner = ColoredTextTestRunner(verbosity=verbosity, failfast=args.failfast)
result = runner.run(suite)
@@ -224,5 +217,5 @@ def main():
return 0 if success else 1
if __name__ == '__main__':
if __name__ == "__main__":
sys.exit(main())

View File

@@ -6,12 +6,12 @@ Splits large documentation configs into multiple smaller, focused skill configs.
Supports multiple splitting strategies: category-based, size-based, and automatic.
"""
import argparse
import json
import sys
import argparse
from pathlib import Path
from typing import Dict, List, Any, Tuple
from collections import defaultdict
from pathlib import Path
from typing import Any
class ConfigSplitter:
@@ -22,12 +22,12 @@ class ConfigSplitter:
self.strategy = strategy
self.target_pages = target_pages
self.config = self.load_config()
self.base_name = self.config['name']
self.base_name = self.config["name"]
def load_config(self) -> Dict[str, Any]:
def load_config(self) -> dict[str, Any]:
"""Load configuration from file"""
try:
with open(self.config_path, 'r') as f:
with open(self.config_path) as f:
return json.load(f)
except FileNotFoundError:
print(f"❌ Error: Config file not found: {self.config_path}")
@@ -38,45 +38,45 @@ class ConfigSplitter:
def is_unified_config(self) -> bool:
"""Check if this is a unified multi-source config"""
return 'sources' in self.config
return "sources" in self.config
def get_split_strategy(self) -> str:
"""Determine split strategy"""
# For unified configs, default to source-based splitting
if self.is_unified_config():
if self.strategy == "auto":
num_sources = len(self.config.get('sources', []))
num_sources = len(self.config.get("sources", []))
if num_sources <= 1:
print(f" Single source unified config - no splitting needed")
print(" Single source unified config - no splitting needed")
return "none"
else:
print(f" Multi-source unified config ({num_sources} sources) - source split recommended")
return "source"
# For unified configs, only 'source' and 'none' strategies are valid
elif self.strategy in ['source', 'none']:
elif self.strategy in ["source", "none"]:
return self.strategy
else:
print(f"⚠️ Warning: Strategy '{self.strategy}' not supported for unified configs")
print(f" Using 'source' strategy instead")
print(" Using 'source' strategy instead")
return "source"
# Check if strategy is defined in config (documentation configs)
if 'split_strategy' in self.config:
config_strategy = self.config['split_strategy']
if "split_strategy" in self.config:
config_strategy = self.config["split_strategy"]
if config_strategy != "none":
return config_strategy
# Use provided strategy or auto-detect (documentation configs)
if self.strategy == "auto":
max_pages = self.config.get('max_pages', 500)
max_pages = self.config.get("max_pages", 500)
if max_pages < 5000:
print(f" Small documentation ({max_pages} pages) - no splitting needed")
return "none"
elif max_pages < 10000 and 'categories' in self.config:
elif max_pages < 10000 and "categories" in self.config:
print(f" Medium documentation ({max_pages} pages) - category split recommended")
return "category"
elif 'categories' in self.config and len(self.config['categories']) >= 3:
elif "categories" in self.config and len(self.config["categories"]) >= 3:
print(f" Large documentation ({max_pages} pages) - router + categories recommended")
return "router"
else:
@@ -85,14 +85,14 @@ class ConfigSplitter:
return self.strategy
def split_by_category(self, create_router: bool = False) -> List[Dict[str, Any]]:
def split_by_category(self, create_router: bool = False) -> list[dict[str, Any]]:
"""Split config by categories"""
if 'categories' not in self.config:
if "categories" not in self.config:
print("❌ Error: No categories defined in config")
sys.exit(1)
categories = self.config['categories']
split_categories = self.config.get('split_config', {}).get('split_by_categories')
categories = self.config["categories"]
split_categories = self.config.get("split_config", {}).get("split_by_categories")
# If specific categories specified, use only those
if split_categories:
@@ -103,34 +103,36 @@ class ConfigSplitter:
for category_name, keywords in categories.items():
# Create new config for this category
new_config = self.config.copy()
new_config['name'] = f"{self.base_name}-{category_name}"
new_config['description'] = f"{self.base_name.capitalize()} - {category_name.replace('_', ' ').title()}. {self.config.get('description', '')}"
new_config["name"] = f"{self.base_name}-{category_name}"
new_config["description"] = (
f"{self.base_name.capitalize()} - {category_name.replace('_', ' ').title()}. {self.config.get('description', '')}"
)
# Update URL patterns to focus on this category
url_patterns = new_config.get('url_patterns', {})
url_patterns = new_config.get("url_patterns", {})
# Add category keywords to includes
includes = url_patterns.get('include', [])
includes = url_patterns.get("include", [])
for keyword in keywords:
if keyword.startswith('/'):
if keyword.startswith("/"):
includes.append(keyword)
if includes:
url_patterns['include'] = list(set(includes))
new_config['url_patterns'] = url_patterns
url_patterns["include"] = list(set(includes))
new_config["url_patterns"] = url_patterns
# Keep only this category
new_config['categories'] = {category_name: keywords}
new_config["categories"] = {category_name: keywords}
# Remove split config from child
if 'split_strategy' in new_config:
del new_config['split_strategy']
if 'split_config' in new_config:
del new_config['split_config']
if "split_strategy" in new_config:
del new_config["split_strategy"]
if "split_config" in new_config:
del new_config["split_config"]
# Adjust max_pages estimate
if 'max_pages' in new_config:
new_config['max_pages'] = self.target_pages
if "max_pages" in new_config:
new_config["max_pages"] = self.target_pages
configs.append(new_config)
@@ -144,9 +146,9 @@ class ConfigSplitter:
return configs
def split_by_size(self) -> List[Dict[str, Any]]:
def split_by_size(self) -> list[dict[str, Any]]:
"""Split config by size (page count)"""
max_pages = self.config.get('max_pages', 500)
max_pages = self.config.get("max_pages", 500)
num_splits = (max_pages + self.target_pages - 1) // self.target_pages
configs = []
@@ -154,28 +156,30 @@ class ConfigSplitter:
for i in range(num_splits):
new_config = self.config.copy()
part_num = i + 1
new_config['name'] = f"{self.base_name}-part{part_num}"
new_config['description'] = f"{self.base_name.capitalize()} - Part {part_num}. {self.config.get('description', '')}"
new_config['max_pages'] = self.target_pages
new_config["name"] = f"{self.base_name}-part{part_num}"
new_config["description"] = (
f"{self.base_name.capitalize()} - Part {part_num}. {self.config.get('description', '')}"
)
new_config["max_pages"] = self.target_pages
# Remove split config from child
if 'split_strategy' in new_config:
del new_config['split_strategy']
if 'split_config' in new_config:
del new_config['split_config']
if "split_strategy" in new_config:
del new_config["split_strategy"]
if "split_config" in new_config:
del new_config["split_config"]
configs.append(new_config)
print(f"✅ Created {len(configs)} size-based configs ({self.target_pages} pages each)")
return configs
def split_by_source(self) -> List[Dict[str, Any]]:
def split_by_source(self) -> list[dict[str, Any]]:
"""Split unified config by source type"""
if not self.is_unified_config():
print("❌ Error: Config is not a unified config (missing 'sources' key)")
sys.exit(1)
sources = self.config.get('sources', [])
sources = self.config.get("sources", [])
if not sources:
print("❌ Error: No sources defined in unified config")
sys.exit(1)
@@ -184,20 +188,20 @@ class ConfigSplitter:
source_type_counts = defaultdict(int)
for source in sources:
source_type = source.get('type', 'unknown')
source_type = source.get("type", "unknown")
source_type_counts[source_type] += 1
count = source_type_counts[source_type]
# Create new config for this source
new_config = {
'name': f"{self.base_name}-{source_type}" + (f"-{count}" if count > 1 else ""),
'description': f"{self.base_name.capitalize()} - {source_type.title()} source. {self.config.get('description', '')}",
'sources': [source] # Single source per config
"name": f"{self.base_name}-{source_type}" + (f"-{count}" if count > 1 else ""),
"description": f"{self.base_name.capitalize()} - {source_type.title()} source. {self.config.get('description', '')}",
"sources": [source], # Single source per config
}
# Copy merge_mode if it exists
if 'merge_mode' in self.config:
new_config['merge_mode'] = self.config['merge_mode']
if "merge_mode" in self.config:
new_config["merge_mode"] = self.config["merge_mode"]
configs.append(new_config)
@@ -209,36 +213,33 @@ class ConfigSplitter:
return configs
def create_router_config(self, sub_configs: List[Dict[str, Any]]) -> Dict[str, Any]:
def create_router_config(self, sub_configs: list[dict[str, Any]]) -> dict[str, Any]:
"""Create a router config that references sub-skills"""
router_name = self.config.get('split_config', {}).get('router_name', self.base_name)
router_name = self.config.get("split_config", {}).get("router_name", self.base_name)
router_config = {
"name": router_name,
"description": self.config.get('description', ''),
"base_url": self.config['base_url'],
"selectors": self.config['selectors'],
"url_patterns": self.config.get('url_patterns', {}),
"rate_limit": self.config.get('rate_limit', 0.5),
"description": self.config.get("description", ""),
"base_url": self.config["base_url"],
"selectors": self.config["selectors"],
"url_patterns": self.config.get("url_patterns", {}),
"rate_limit": self.config.get("rate_limit", 0.5),
"max_pages": 500, # Router only needs overview pages
"_router": True,
"_sub_skills": [cfg['name'] for cfg in sub_configs],
"_routing_keywords": {
cfg['name']: list(cfg.get('categories', {}).keys())
for cfg in sub_configs
}
"_sub_skills": [cfg["name"] for cfg in sub_configs],
"_routing_keywords": {cfg["name"]: list(cfg.get("categories", {}).keys()) for cfg in sub_configs},
}
return router_config
def split(self) -> List[Dict[str, Any]]:
def split(self) -> list[dict[str, Any]]:
"""Execute split based on strategy"""
strategy = self.get_split_strategy()
config_type = "UNIFIED" if self.is_unified_config() else "DOCUMENTATION"
print(f"\n{'='*60}")
print(f"\n{'=' * 60}")
print(f"CONFIG SPLITTER: {self.base_name} ({config_type})")
print(f"{'='*60}")
print(f"{'=' * 60}")
print(f"Strategy: {strategy}")
if not self.is_unified_config():
print(f"Target pages per skill: {self.target_pages}")
@@ -255,7 +256,7 @@ class ConfigSplitter:
return self.split_by_category(create_router=False)
elif strategy == "router":
create_router = self.config.get('split_config', {}).get('create_router', True)
create_router = self.config.get("split_config", {}).get("create_router", True)
return self.split_by_category(create_router=create_router)
elif strategy == "size":
@@ -265,7 +266,7 @@ class ConfigSplitter:
print(f"❌ Error: Unknown strategy: {strategy}")
sys.exit(1)
def save_configs(self, configs: List[Dict[str, Any]], output_dir: Path = None) -> List[Path]:
def save_configs(self, configs: list[dict[str, Any]], output_dir: Path = None) -> list[Path]:
"""Save configs to files"""
if output_dir is None:
output_dir = self.config_path.parent
@@ -279,7 +280,7 @@ class ConfigSplitter:
filename = f"{config['name']}.json"
filepath = output_dir / filename
with open(filepath, 'w') as f:
with open(filepath, "w") as f:
json.dump(config, f, indent=2)
saved_files.append(filepath)
@@ -320,38 +321,23 @@ Split Strategies:
Config Types:
Documentation - Single base_url config (supports: category, router, size)
Unified - Multi-source config (supports: source)
"""
""",
)
parser.add_argument(
'config',
help='Path to config file (e.g., configs/godot.json)'
)
parser.add_argument("config", help="Path to config file (e.g., configs/godot.json)")
parser.add_argument(
'--strategy',
choices=['auto', 'none', 'source', 'category', 'router', 'size'],
default='auto',
help='Splitting strategy (default: auto)'
"--strategy",
choices=["auto", "none", "source", "category", "router", "size"],
default="auto",
help="Splitting strategy (default: auto)",
)
parser.add_argument(
'--target-pages',
type=int,
default=5000,
help='Target pages per skill (default: 5000)'
)
parser.add_argument("--target-pages", type=int, default=5000, help="Target pages per skill (default: 5000)")
parser.add_argument(
'--output-dir',
help='Output directory for configs (default: same as input)'
)
parser.add_argument("--output-dir", help="Output directory for configs (default: same as input)")
parser.add_argument(
'--dry-run',
action='store_true',
help='Show what would be created without saving files'
)
parser.add_argument("--dry-run", action="store_true", help="Show what would be created without saving files")
args = parser.parse_args()
@@ -362,23 +348,23 @@ Config Types:
configs = splitter.split()
if args.dry_run:
print(f"\n{'='*60}")
print(f"\n{'=' * 60}")
print("DRY RUN - No files saved")
print(f"{'='*60}")
print(f"{'=' * 60}")
print(f"Would create {len(configs)} config files:")
for cfg in configs:
is_router = cfg.get('_router', False)
is_router = cfg.get("_router", False)
router_marker = " (ROUTER)" if is_router else ""
print(f" 📄 {cfg['name']}.json{router_marker}")
else:
print(f"\n{'='*60}")
print(f"\n{'=' * 60}")
print("SAVING CONFIGS")
print(f"{'='*60}")
print(f"{'=' * 60}")
saved_files = splitter.save_configs(configs, args.output_dir)
print(f"\n{'='*60}")
print(f"\n{'=' * 60}")
print("NEXT STEPS")
print(f"{'='*60}")
print(f"{'=' * 60}")
print("1. Review generated configs")
print("2. Scrape each config:")
for filepath in saved_files:

File diff suppressed because it is too large Load Diff

View File

@@ -27,19 +27,18 @@ Example usage:
python test_example_extractor.py tests/ --min-confidence 0.7
"""
from dataclasses import dataclass, field, asdict
from typing import List, Dict, Optional, Literal, Set
from pathlib import Path
import ast
import re
import hashlib
import logging
import argparse
import ast
import hashlib
import json
import sys
import logging
import re
from dataclasses import asdict, dataclass, field
from pathlib import Path
from typing import Literal
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")
logger = logging.getLogger(__name__)
@@ -47,22 +46,23 @@ logger = logging.getLogger(__name__)
# DATA MODELS
# ============================================================================
@dataclass
class TestExample:
"""Single extracted usage example from test code"""
# Identity
example_id: str # Unique hash of example
test_name: str # Test function/method name
example_id: str # Unique hash of example
test_name: str # Test function/method name
category: Literal["instantiation", "method_call", "config", "setup", "workflow"]
# Code
code: str # Actual example code
language: str # Programming language
code: str # Actual example code
language: str # Programming language
# Context
description: str # What this demonstrates
expected_behavior: str # Expected outcome from assertions
description: str # What this demonstrates
expected_behavior: str # Expected outcome from assertions
# Source
file_path: str
@@ -71,13 +71,13 @@ class TestExample:
# Quality
complexity_score: float # 0-1 scale (higher = more complex/valuable)
confidence: float # 0-1 scale (higher = more confident extraction)
confidence: float # 0-1 scale (higher = more confident extraction)
# Optional fields (must come after required fields)
setup_code: Optional[str] = None # Required setup code
tags: List[str] = field(default_factory=list) # ["pytest", "mock", "async"]
dependencies: List[str] = field(default_factory=list) # Imported modules
ai_analysis: Optional[Dict] = None # AI-generated analysis (C3.6)
setup_code: str | None = None # Required setup code
tags: list[str] = field(default_factory=list) # ["pytest", "mock", "async"]
dependencies: list[str] = field(default_factory=list) # Imported modules
ai_analysis: dict | None = None # AI-generated analysis (C3.6)
def to_dict(self) -> dict:
"""Convert to dictionary for JSON serialization"""
@@ -96,12 +96,12 @@ class TestExample:
# Add AI analysis if available (C3.6)
if self.ai_analysis:
md += f"\n**🤖 AI Analysis:** \n"
if self.ai_analysis.get('explanation'):
md += "\n**🤖 AI Analysis:** \n"
if self.ai_analysis.get("explanation"):
md += f"*{self.ai_analysis['explanation']}* \n"
if self.ai_analysis.get('best_practices'):
if self.ai_analysis.get("best_practices"):
md += f"**Best Practices:** {', '.join(self.ai_analysis['best_practices'])} \n"
if self.ai_analysis.get('tutorial_group'):
if self.ai_analysis.get("tutorial_group"):
md += f"**Tutorial Group:** {self.ai_analysis['tutorial_group']} \n"
md += f"\n```{self.language.lower()}\n"
@@ -117,13 +117,13 @@ class ExampleReport:
"""Summary of test example extraction results"""
total_examples: int
examples_by_category: Dict[str, int]
examples_by_language: Dict[str, int]
examples: List[TestExample]
examples_by_category: dict[str, int]
examples_by_language: dict[str, int]
examples: list[TestExample]
avg_complexity: float
high_value_count: int # confidence > 0.7
file_path: Optional[str] = None # If single file
directory: Optional[str] = None # If directory
file_path: str | None = None # If single file
directory: str | None = None # If directory
def to_dict(self) -> dict:
"""Convert to dictionary for JSON serialization"""
@@ -135,7 +135,7 @@ class ExampleReport:
"high_value_count": self.high_value_count,
"file_path": self.file_path,
"directory": self.directory,
"examples": [ex.to_dict() for ex in self.examples]
"examples": [ex.to_dict() for ex in self.examples],
}
def to_markdown(self) -> str:
@@ -164,19 +164,20 @@ class ExampleReport:
# PYTHON TEST ANALYZER (AST-based)
# ============================================================================
class PythonTestAnalyzer:
"""Deep AST-based test example extraction for Python"""
def __init__(self):
self.trivial_patterns = {
'assertTrue(True)',
'assertFalse(False)',
'assertEqual(1, 1)',
'assertIsNone(None)',
'assertIsNotNone(None)',
"assertTrue(True)",
"assertFalse(False)",
"assertEqual(1, 1)",
"assertIsNone(None)",
"assertIsNotNone(None)",
}
def extract(self, file_path: str, code: str) -> List[TestExample]:
def extract(self, file_path: str, code: str) -> list[TestExample]:
"""Extract examples from Python test file"""
examples = []
@@ -193,20 +194,16 @@ class PythonTestAnalyzer:
for node in ast.walk(tree):
if isinstance(node, ast.ClassDef):
if self._is_test_class(node):
examples.extend(self._extract_from_test_class(
node, file_path, imports
))
examples.extend(self._extract_from_test_class(node, file_path, imports))
# Find test functions (pytest)
elif isinstance(node, ast.FunctionDef):
if self._is_test_function(node):
examples.extend(self._extract_from_test_function(
node, file_path, imports
))
examples.extend(self._extract_from_test_function(node, file_path, imports))
return examples
def _extract_imports(self, tree: ast.AST) -> List[str]:
def _extract_imports(self, tree: ast.AST) -> list[str]:
"""Extract imported modules"""
imports = []
for node in ast.walk(tree):
@@ -221,30 +218,30 @@ class PythonTestAnalyzer:
"""Check if class is a test class"""
# unittest.TestCase pattern
for base in node.bases:
if isinstance(base, ast.Name) and 'Test' in base.id:
return True
elif isinstance(base, ast.Attribute) and base.attr == 'TestCase':
if (
isinstance(base, ast.Name)
and "Test" in base.id
or isinstance(base, ast.Attribute)
and base.attr == "TestCase"
):
return True
return False
def _is_test_function(self, node: ast.FunctionDef) -> bool:
"""Check if function is a test function"""
# pytest pattern: starts with test_
if node.name.startswith('test_'):
if node.name.startswith("test_"):
return True
# Has @pytest.mark decorator
for decorator in node.decorator_list:
if isinstance(decorator, ast.Attribute):
if 'pytest' in ast.unparse(decorator):
if "pytest" in ast.unparse(decorator):
return True
return False
def _extract_from_test_class(
self,
class_node: ast.ClassDef,
file_path: str,
imports: List[str]
) -> List[TestExample]:
self, class_node: ast.ClassDef, file_path: str, imports: list[str]
) -> list[TestExample]:
"""Extract examples from unittest.TestCase class"""
examples = []
@@ -253,63 +250,46 @@ class PythonTestAnalyzer:
# Process each test method
for node in class_node.body:
if isinstance(node, ast.FunctionDef) and node.name.startswith('test_'):
examples.extend(self._analyze_test_body(
node,
file_path,
imports,
setup_code=setup_code
))
if isinstance(node, ast.FunctionDef) and node.name.startswith("test_"):
examples.extend(self._analyze_test_body(node, file_path, imports, setup_code=setup_code))
return examples
def _extract_from_test_function(
self,
func_node: ast.FunctionDef,
file_path: str,
imports: List[str]
) -> List[TestExample]:
self, func_node: ast.FunctionDef, file_path: str, imports: list[str]
) -> list[TestExample]:
"""Extract examples from pytest test function"""
# Check for fixture parameters
fixture_setup = self._extract_fixtures(func_node)
return self._analyze_test_body(
func_node,
file_path,
imports,
setup_code=fixture_setup
)
return self._analyze_test_body(func_node, file_path, imports, setup_code=fixture_setup)
def _extract_setup_method(self, class_node: ast.ClassDef) -> Optional[str]:
def _extract_setup_method(self, class_node: ast.ClassDef) -> str | None:
"""Extract setUp method code"""
for node in class_node.body:
if isinstance(node, ast.FunctionDef) and node.name == 'setUp':
if isinstance(node, ast.FunctionDef) and node.name == "setUp":
return ast.unparse(node.body)
return None
def _extract_fixtures(self, func_node: ast.FunctionDef) -> Optional[str]:
def _extract_fixtures(self, func_node: ast.FunctionDef) -> str | None:
"""Extract pytest fixture parameters"""
if not func_node.args.args:
return None
# Skip 'self' parameter
params = [arg.arg for arg in func_node.args.args if arg.arg != 'self']
params = [arg.arg for arg in func_node.args.args if arg.arg != "self"]
if params:
return f"# Fixtures: {', '.join(params)}"
return None
def _analyze_test_body(
self,
func_node: ast.FunctionDef,
file_path: str,
imports: List[str],
setup_code: Optional[str] = None
) -> List[TestExample]:
self, func_node: ast.FunctionDef, file_path: str, imports: list[str], setup_code: str | None = None
) -> list[TestExample]:
"""Analyze test function body for extractable patterns"""
examples = []
# Get docstring for description
docstring = ast.get_docstring(func_node) or func_node.name.replace('_', ' ')
docstring = ast.get_docstring(func_node) or func_node.name.replace("_", " ")
# Detect tags
tags = self._detect_tags(func_node, imports)
@@ -321,7 +301,9 @@ class PythonTestAnalyzer:
examples.extend(instantiations)
# 2. Method calls with assertions
method_calls = self._find_method_calls_with_assertions(func_node, file_path, docstring, setup_code, tags, imports)
method_calls = self._find_method_calls_with_assertions(
func_node, file_path, docstring, setup_code, tags, imports
)
examples.extend(method_calls)
# 3. Configuration dictionaries
@@ -334,28 +316,28 @@ class PythonTestAnalyzer:
return examples
def _detect_tags(self, func_node: ast.FunctionDef, imports: List[str]) -> List[str]:
def _detect_tags(self, func_node: ast.FunctionDef, imports: list[str]) -> list[str]:
"""Detect test tags (pytest, mock, async, etc.)"""
tags = []
# Check decorators
for decorator in func_node.decorator_list:
decorator_str = ast.unparse(decorator).lower()
if 'pytest' in decorator_str:
tags.append('pytest')
if 'mock' in decorator_str:
tags.append('mock')
if 'async' in decorator_str or func_node.name.startswith('test_async'):
tags.append('async')
if "pytest" in decorator_str:
tags.append("pytest")
if "mock" in decorator_str:
tags.append("mock")
if "async" in decorator_str or func_node.name.startswith("test_async"):
tags.append("async")
# Check if using unittest
if 'unittest' in imports:
tags.append('unittest')
if "unittest" in imports:
tags.append("unittest")
# Check function body for mock usage
func_str = ast.unparse(func_node).lower()
if 'mock' in func_str or 'patch' in func_str:
tags.append('mock')
if "mock" in func_str or "patch" in func_str:
tags.append("mock")
return list(set(tags))
@@ -364,10 +346,10 @@ class PythonTestAnalyzer:
func_node: ast.FunctionDef,
file_path: str,
description: str,
setup_code: Optional[str],
tags: List[str],
imports: List[str]
) -> List[TestExample]:
setup_code: str | None,
tags: list[str],
imports: list[str],
) -> list[TestExample]:
"""Find object instantiation patterns: obj = ClassName(...)"""
examples = []
@@ -379,7 +361,7 @@ class PythonTestAnalyzer:
code = ast.unparse(node)
# Skip trivial or mock-only
if len(code) < 20 or 'Mock()' in code:
if len(code) < 20 or "Mock()" in code:
continue
# Get class name
@@ -400,7 +382,7 @@ class PythonTestAnalyzer:
complexity_score=self._calculate_complexity(code),
confidence=0.8,
tags=tags,
dependencies=imports
dependencies=imports,
)
examples.append(example)
@@ -411,10 +393,10 @@ class PythonTestAnalyzer:
func_node: ast.FunctionDef,
file_path: str,
description: str,
setup_code: Optional[str],
tags: List[str],
imports: List[str]
) -> List[TestExample]:
setup_code: str | None,
tags: list[str],
imports: list[str],
) -> list[TestExample]:
"""Find method calls followed by assertions"""
examples = []
@@ -450,7 +432,7 @@ class PythonTestAnalyzer:
complexity_score=self._calculate_complexity(code),
confidence=0.85,
tags=tags,
dependencies=imports
dependencies=imports,
)
examples.append(example)
@@ -461,10 +443,10 @@ class PythonTestAnalyzer:
func_node: ast.FunctionDef,
file_path: str,
description: str,
setup_code: Optional[str],
tags: List[str],
imports: List[str]
) -> List[TestExample]:
setup_code: str | None,
tags: list[str],
imports: list[str],
) -> list[TestExample]:
"""Find configuration dictionary patterns"""
examples = []
@@ -491,7 +473,7 @@ class PythonTestAnalyzer:
complexity_score=self._calculate_complexity(code),
confidence=0.75,
tags=tags,
dependencies=imports
dependencies=imports,
)
examples.append(example)
@@ -502,10 +484,10 @@ class PythonTestAnalyzer:
func_node: ast.FunctionDef,
file_path: str,
description: str,
setup_code: Optional[str],
tags: List[str],
imports: List[str]
) -> List[TestExample]:
setup_code: str | None,
tags: list[str],
imports: list[str],
) -> list[TestExample]:
"""Find multi-step workflow patterns (integration tests)"""
examples = []
@@ -515,7 +497,7 @@ class PythonTestAnalyzer:
code = ast.unparse(func_node.body)
# Skip if too long (> 30 lines)
if code.count('\n') > 30:
if code.count("\n") > 30:
return examples
example = TestExample(
@@ -532,8 +514,8 @@ class PythonTestAnalyzer:
line_end=func_node.end_lineno or func_node.lineno,
complexity_score=min(1.0, len(func_node.body) / 10),
confidence=0.9,
tags=tags + ['workflow', 'integration'],
dependencies=imports
tags=tags + ["workflow", "integration"],
dependencies=imports,
)
examples.append(example)
@@ -568,7 +550,7 @@ class PythonTestAnalyzer:
if isinstance(node, ast.Expr) and isinstance(node.value, ast.Call):
call_str = ast.unparse(node.value).lower()
assertion_methods = ['assert', 'expect', 'should']
assertion_methods = ["assert", "expect", "should"]
return any(method in call_str for method in assertion_methods)
return False
@@ -584,7 +566,7 @@ class PythonTestAnalyzer:
def _is_integration_test(self, func_node: ast.FunctionDef) -> bool:
"""Check if test looks like an integration test"""
test_name = func_node.name.lower()
integration_keywords = ['workflow', 'integration', 'end_to_end', 'e2e', 'full']
integration_keywords = ["workflow", "integration", "end_to_end", "e2e", "full"]
return any(keyword in test_name for keyword in integration_keywords)
def _extract_assertion_after(self, func_node: ast.FunctionDef, target_node: ast.AST) -> str:
@@ -608,8 +590,8 @@ class PythonTestAnalyzer:
def _calculate_complexity(self, code: str) -> float:
"""Calculate code complexity score (0-1)"""
# Simple heuristic: more lines + more parameters = more complex
lines = code.count('\n') + 1
params = code.count(',') + 1
lines = code.count("\n") + 1
params = code.count(",") + 1
complexity = min(1.0, (lines * 0.1) + (params * 0.05))
return round(complexity, 2)
@@ -623,57 +605,58 @@ class PythonTestAnalyzer:
# GENERIC TEST ANALYZER (Regex-based for non-Python languages)
# ============================================================================
class GenericTestAnalyzer:
"""Regex-based test example extraction for non-Python languages"""
# Language-specific regex patterns
PATTERNS = {
"javascript": {
"instantiation": r'(?:const|let|var)\s+(\w+)\s*=\s*new\s+(\w+)\(([^)]*)\)',
"assertion": r'expect\(([^)]+)\)\.to(?:Equal|Be|Match)\(([^)]+)\)',
"instantiation": r"(?:const|let|var)\s+(\w+)\s*=\s*new\s+(\w+)\(([^)]*)\)",
"assertion": r"expect\(([^)]+)\)\.to(?:Equal|Be|Match)\(([^)]+)\)",
"test_function": r'(?:test|it)\(["\']([^"\']+)["\']',
"config": r'(?:const|let)\s+config\s*=\s*\{[\s\S]{20,500}?\}',
"config": r"(?:const|let)\s+config\s*=\s*\{[\s\S]{20,500}?\}",
},
"typescript": {
"instantiation": r'(?:const|let|var)\s+(\w+):\s*\w+\s*=\s*new\s+(\w+)\(([^)]*)\)',
"assertion": r'expect\(([^)]+)\)\.to(?:Equal|Be|Match)\(([^)]+)\)',
"instantiation": r"(?:const|let|var)\s+(\w+):\s*\w+\s*=\s*new\s+(\w+)\(([^)]*)\)",
"assertion": r"expect\(([^)]+)\)\.to(?:Equal|Be|Match)\(([^)]+)\)",
"test_function": r'(?:test|it)\(["\']([^"\']+)["\']',
"config": r'(?:const|let)\s+config:\s*\w+\s*=\s*\{[\s\S]{20,500}?\}',
"config": r"(?:const|let)\s+config:\s*\w+\s*=\s*\{[\s\S]{20,500}?\}",
},
"go": {
"instantiation": r'(\w+)\s*:=\s*(\w+)\{([^}]+)\}',
"instantiation": r"(\w+)\s*:=\s*(\w+)\{([^}]+)\}",
"assertion": r't\.(?:Error|Fatal)(?:f)?\(["\']([^"\']+)["\']',
"test_function": r'func\s+(Test\w+)\(t\s+\*testing\.T\)',
"table_test": r'tests\s*:=\s*\[\]struct\s*\{[\s\S]{50,1000}?\}',
"test_function": r"func\s+(Test\w+)\(t\s+\*testing\.T\)",
"table_test": r"tests\s*:=\s*\[\]struct\s*\{[\s\S]{50,1000}?\}",
},
"rust": {
"instantiation": r'let\s+(\w+)\s*=\s*(\w+)::new\(([^)]*)\)',
"assertion": r'assert(?:_eq)?!\(([^)]+)\)',
"test_function": r'#\[test\]\s*fn\s+(\w+)\(\)',
"instantiation": r"let\s+(\w+)\s*=\s*(\w+)::new\(([^)]*)\)",
"assertion": r"assert(?:_eq)?!\(([^)]+)\)",
"test_function": r"#\[test\]\s*fn\s+(\w+)\(\)",
},
"java": {
"instantiation": r'(\w+)\s+(\w+)\s*=\s*new\s+(\w+)\(([^)]*)\)',
"assertion": r'assert(?:Equals|True|False|NotNull)\(([^)]+)\)',
"test_function": r'@Test\s+public\s+void\s+(\w+)\(\)',
"instantiation": r"(\w+)\s+(\w+)\s*=\s*new\s+(\w+)\(([^)]*)\)",
"assertion": r"assert(?:Equals|True|False|NotNull)\(([^)]+)\)",
"test_function": r"@Test\s+public\s+void\s+(\w+)\(\)",
},
"csharp": {
"instantiation": r'var\s+(\w+)\s*=\s*new\s+(\w+)\(([^)]*)\)',
"assertion": r'Assert\.(?:AreEqual|IsTrue|IsFalse|IsNotNull)\(([^)]+)\)',
"test_function": r'\[Test\]\s+public\s+void\s+(\w+)\(\)',
"instantiation": r"var\s+(\w+)\s*=\s*new\s+(\w+)\(([^)]*)\)",
"assertion": r"Assert\.(?:AreEqual|IsTrue|IsFalse|IsNotNull)\(([^)]+)\)",
"test_function": r"\[Test\]\s+public\s+void\s+(\w+)\(\)",
},
"php": {
"instantiation": r'\$(\w+)\s*=\s*new\s+(\w+)\(([^)]*)\)',
"assertion": r'\$this->assert(?:Equals|True|False|NotNull)\(([^)]+)\)',
"test_function": r'public\s+function\s+(test\w+)\(\)',
"instantiation": r"\$(\w+)\s*=\s*new\s+(\w+)\(([^)]*)\)",
"assertion": r"\$this->assert(?:Equals|True|False|NotNull)\(([^)]+)\)",
"test_function": r"public\s+function\s+(test\w+)\(\)",
},
"ruby": {
"instantiation": r'(\w+)\s*=\s*(\w+)\.new\(([^)]*)\)',
"assertion": r'expect\(([^)]+)\)\.to\s+(?:eq|be|match)\(([^)]+)\)',
"instantiation": r"(\w+)\s*=\s*(\w+)\.new\(([^)]*)\)",
"assertion": r"expect\(([^)]+)\)\.to\s+(?:eq|be|match)\(([^)]+)\)",
"test_function": r'(?:test|it)\s+["\']([^"\']+)["\']',
}
},
}
def extract(self, file_path: str, code: str, language: str) -> List[TestExample]:
def extract(self, file_path: str, code: str, language: str) -> list[TestExample]:
"""Extract examples from test file using regex patterns"""
examples = []
@@ -704,7 +687,7 @@ class GenericTestAnalyzer:
code=inst_match.group(0),
language=language,
file_path=file_path,
line_number=code[:start_pos + inst_match.start()].count('\n') + 1
line_number=code[: start_pos + inst_match.start()].count("\n") + 1,
)
examples.append(example)
@@ -717,20 +700,14 @@ class GenericTestAnalyzer:
code=config_match.group(0),
language=language,
file_path=file_path,
line_number=code[:start_pos + config_match.start()].count('\n') + 1
line_number=code[: start_pos + config_match.start()].count("\n") + 1,
)
examples.append(example)
return examples
def _create_example(
self,
test_name: str,
category: str,
code: str,
language: str,
file_path: str,
line_number: int
self, test_name: str, category: str, code: str, language: str, file_path: str, line_number: int
) -> TestExample:
"""Create TestExample from regex match"""
return TestExample(
@@ -743,11 +720,11 @@ class GenericTestAnalyzer:
expected_behavior="",
file_path=file_path,
line_start=line_number,
line_end=line_number + code.count('\n'),
complexity_score=min(1.0, (code.count('\n') + 1) * 0.1),
line_end=line_number + code.count("\n"),
complexity_score=min(1.0, (code.count("\n") + 1) * 0.1),
confidence=0.6, # Lower confidence for regex extraction
tags=[],
dependencies=[]
dependencies=[],
)
@@ -755,6 +732,7 @@ class GenericTestAnalyzer:
# EXAMPLE QUALITY FILTER
# ============================================================================
class ExampleQualityFilter:
"""Filter out trivial or low-quality examples"""
@@ -764,16 +742,16 @@ class ExampleQualityFilter:
# Trivial patterns to exclude
self.trivial_patterns = [
'Mock()',
'MagicMock()',
'assertTrue(True)',
'assertFalse(False)',
'assertEqual(1, 1)',
'pass',
'...',
"Mock()",
"MagicMock()",
"assertTrue(True)",
"assertFalse(False)",
"assertEqual(1, 1)",
"pass",
"...",
]
def filter(self, examples: List[TestExample]) -> List[TestExample]:
def filter(self, examples: list[TestExample]) -> list[TestExample]:
"""Filter examples by quality criteria"""
filtered = []
@@ -803,42 +781,43 @@ class ExampleQualityFilter:
# TEST EXAMPLE EXTRACTOR (Main Orchestrator)
# ============================================================================
class TestExampleExtractor:
"""Main orchestrator for test example extraction"""
# Test file patterns
TEST_PATTERNS = [
'test_*.py',
'*_test.py',
'test*.js',
'*test.js',
'*_test.go',
'*_test.rs',
'Test*.java',
'Test*.cs',
'*Test.php',
'*_spec.rb',
"test_*.py",
"*_test.py",
"test*.js",
"*test.js",
"*_test.go",
"*_test.rs",
"Test*.java",
"Test*.cs",
"*Test.php",
"*_spec.rb",
]
# Language detection by extension
LANGUAGE_MAP = {
'.py': 'Python',
'.js': 'JavaScript',
'.ts': 'TypeScript',
'.go': 'Go',
'.rs': 'Rust',
'.java': 'Java',
'.cs': 'C#',
'.php': 'PHP',
'.rb': 'Ruby',
".py": "Python",
".js": "JavaScript",
".ts": "TypeScript",
".go": "Go",
".rs": "Rust",
".java": "Java",
".cs": "C#",
".php": "PHP",
".rb": "Ruby",
}
def __init__(
self,
min_confidence: float = 0.7,
max_per_file: int = 10,
languages: Optional[List[str]] = None,
enhance_with_ai: bool = True
languages: list[str] | None = None,
enhance_with_ai: bool = True,
):
self.python_analyzer = PythonTestAnalyzer()
self.generic_analyzer = GenericTestAnalyzer()
@@ -852,16 +831,13 @@ class TestExampleExtractor:
if self.enhance_with_ai:
try:
from skill_seekers.cli.ai_enhancer import TestExampleEnhancer
self.ai_enhancer = TestExampleEnhancer()
except Exception as e:
logger.warning(f"⚠️ Failed to initialize AI enhancer: {e}")
self.enhance_with_ai = False
def extract_from_directory(
self,
directory: Path,
recursive: bool = True
) -> ExampleReport:
def extract_from_directory(self, directory: Path, recursive: bool = True) -> ExampleReport:
"""Extract examples from all test files in directory"""
directory = Path(directory)
@@ -882,7 +858,7 @@ class TestExampleExtractor:
# Generate report
return self._create_report(all_examples, directory=str(directory))
def extract_from_file(self, file_path: Path) -> List[TestExample]:
def extract_from_file(self, file_path: Path) -> list[TestExample]:
"""Extract examples from single test file"""
file_path = Path(file_path)
@@ -898,13 +874,13 @@ class TestExampleExtractor:
# Read file
try:
code = file_path.read_text(encoding='utf-8')
code = file_path.read_text(encoding="utf-8")
except UnicodeDecodeError:
logger.warning(f"Failed to read {file_path} (encoding error)")
return []
# Extract examples based on language
if language == 'Python':
if language == "Python":
examples = self.python_analyzer.extract(str(file_path), code)
else:
examples = self.generic_analyzer.extract(str(file_path), code, language)
@@ -915,17 +891,13 @@ class TestExampleExtractor:
# Limit per file
if len(filtered_examples) > self.max_per_file:
# Sort by confidence and take top N
filtered_examples = sorted(
filtered_examples,
key=lambda x: x.confidence,
reverse=True
)[:self.max_per_file]
filtered_examples = sorted(filtered_examples, key=lambda x: x.confidence, reverse=True)[: self.max_per_file]
logger.info(f"Extracted {len(filtered_examples)} examples from {file_path.name}")
return filtered_examples
def _find_test_files(self, directory: Path, recursive: bool) -> List[Path]:
def _find_test_files(self, directory: Path, recursive: bool) -> list[Path]:
"""Find test files in directory"""
test_files = []
@@ -940,13 +912,10 @@ class TestExampleExtractor:
def _detect_language(self, file_path: Path) -> str:
"""Detect programming language from file extension"""
suffix = file_path.suffix.lower()
return self.LANGUAGE_MAP.get(suffix, 'Unknown')
return self.LANGUAGE_MAP.get(suffix, "Unknown")
def _create_report(
self,
examples: List[TestExample],
file_path: Optional[str] = None,
directory: Optional[str] = None
self, examples: list[TestExample], file_path: str | None = None, directory: str | None = None
) -> ExampleReport:
"""Create summary report from examples"""
# Enhance examples with AI analysis (C3.6)
@@ -957,20 +926,18 @@ class TestExampleExtractor:
# Update examples with AI analysis
for i, example in enumerate(examples):
if i < len(enhanced_dicts) and 'ai_analysis' in enhanced_dicts[i]:
example.ai_analysis = enhanced_dicts[i]['ai_analysis']
if i < len(enhanced_dicts) and "ai_analysis" in enhanced_dicts[i]:
example.ai_analysis = enhanced_dicts[i]["ai_analysis"]
# Count by category
examples_by_category = {}
for example in examples:
examples_by_category[example.category] = \
examples_by_category.get(example.category, 0) + 1
examples_by_category[example.category] = examples_by_category.get(example.category, 0) + 1
# Count by language
examples_by_language = {}
for example in examples:
examples_by_language[example.language] = \
examples_by_language.get(example.language, 0) + 1
examples_by_language[example.language] = examples_by_language.get(example.language, 0) + 1
# Calculate averages
avg_complexity = sum(ex.complexity_score for ex in examples) / len(examples) if examples else 0.0
@@ -984,7 +951,7 @@ class TestExampleExtractor:
avg_complexity=round(avg_complexity, 2),
high_value_count=high_value_count,
file_path=file_path,
directory=directory
directory=directory,
)
@@ -992,10 +959,11 @@ class TestExampleExtractor:
# COMMAND-LINE INTERFACE
# ============================================================================
def main():
"""Main entry point for CLI"""
parser = argparse.ArgumentParser(
description='Extract usage examples from test files',
description="Extract usage examples from test files",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
Examples:
@@ -1010,49 +978,20 @@ Examples:
# Filter by confidence
%(prog)s tests/ --min-confidence 0.7
"""
""",
)
parser.add_argument("directory", nargs="?", help="Directory containing test files")
parser.add_argument("--file", help="Single test file to analyze")
parser.add_argument("--language", help="Filter by programming language (python, javascript, etc.)")
parser.add_argument(
'directory',
nargs='?',
help='Directory containing test files'
"--min-confidence", type=float, default=0.5, help="Minimum confidence threshold (0.0-1.0, default: 0.5)"
)
parser.add_argument("--max-per-file", type=int, default=10, help="Maximum examples per file (default: 10)")
parser.add_argument("--json", action="store_true", help="Output JSON format")
parser.add_argument("--markdown", action="store_true", help="Output Markdown format")
parser.add_argument(
'--file',
help='Single test file to analyze'
)
parser.add_argument(
'--language',
help='Filter by programming language (python, javascript, etc.)'
)
parser.add_argument(
'--min-confidence',
type=float,
default=0.5,
help='Minimum confidence threshold (0.0-1.0, default: 0.5)'
)
parser.add_argument(
'--max-per-file',
type=int,
default=10,
help='Maximum examples per file (default: 10)'
)
parser.add_argument(
'--json',
action='store_true',
help='Output JSON format'
)
parser.add_argument(
'--markdown',
action='store_true',
help='Output Markdown format'
)
parser.add_argument(
'--recursive',
action='store_true',
default=True,
help='Search directory recursively (default: True)'
"--recursive", action="store_true", default=True, help="Search directory recursively (default: True)"
)
args = parser.parse_args()
@@ -1064,9 +1003,7 @@ Examples:
# Create extractor
languages = [args.language] if args.language else None
extractor = TestExampleExtractor(
min_confidence=args.min_confidence,
max_per_file=args.max_per_file,
languages=languages
min_confidence=args.min_confidence, max_per_file=args.max_per_file, languages=languages
)
# Extract examples
@@ -1074,10 +1011,7 @@ Examples:
examples = extractor.extract_from_file(Path(args.file))
report = extractor._create_report(examples, file_path=args.file)
else:
report = extractor.extract_from_directory(
Path(args.directory),
recursive=args.recursive
)
report = extractor.extract_from_directory(Path(args.directory), recursive=args.recursive)
# Output results
if args.json:
@@ -1086,19 +1020,19 @@ Examples:
print(report.to_markdown())
else:
# Human-readable summary
print(f"\nTest Example Extraction Results")
print(f"=" * 50)
print("\nTest Example Extraction Results")
print("=" * 50)
print(f"Total Examples: {report.total_examples}")
print(f"High Value (confidence > 0.7): {report.high_value_count}")
print(f"Average Complexity: {report.avg_complexity:.2f}")
print(f"\nExamples by Category:")
print("\nExamples by Category:")
for category, count in sorted(report.examples_by_category.items()):
print(f" {category}: {count}")
print(f"\nExamples by Language:")
print("\nExamples by Language:")
for language, count in sorted(report.examples_by_language.items()):
print(f" {language}: {count}")
print(f"\nUse --json or --markdown for detailed output")
print("\nUse --json or --markdown for detailed output")
if __name__ == '__main__':
if __name__ == "__main__":
main()

View File

@@ -5,9 +5,9 @@ Simple Integration Tests for Unified Multi-Source Scraper
Focuses on real-world usage patterns rather than unit tests.
"""
import json
import os
import sys
import json
import tempfile
from pathlib import Path
@@ -16,16 +16,12 @@ sys.path.insert(0, str(Path(__file__).parent))
from .config_validator import validate_config
def test_validate_existing_unified_configs():
"""Test that all existing unified configs are valid"""
configs_dir = Path(__file__).parent.parent / 'configs'
configs_dir = Path(__file__).parent.parent / "configs"
unified_configs = [
'godot_unified.json',
'react_unified.json',
'django_unified.json',
'fastapi_unified.json'
]
unified_configs = ["godot_unified.json", "react_unified.json", "django_unified.json", "fastapi_unified.json"]
for config_name in unified_configs:
config_path = configs_dir / config_name
@@ -40,13 +36,9 @@ def test_validate_existing_unified_configs():
def test_backward_compatibility():
"""Test that legacy configs still work"""
configs_dir = Path(__file__).parent.parent / 'configs'
configs_dir = Path(__file__).parent.parent / "configs"
legacy_configs = [
'react.json',
'godot.json',
'django.json'
]
legacy_configs = ["react.json", "godot.json", "django.json"]
for config_name in legacy_configs:
config_path = configs_dir / config_name
@@ -54,7 +46,7 @@ def test_backward_compatibility():
print(f"\n✓ Validating legacy {config_name}...")
validator = validate_config(str(config_path))
assert not validator.is_unified, f"{config_name} should be legacy format"
print(f" Format: Legacy")
print(" Format: Legacy")
def test_create_temp_unified_config():
@@ -64,22 +56,12 @@ def test_create_temp_unified_config():
"description": "Test unified config",
"merge_mode": "rule-based",
"sources": [
{
"type": "documentation",
"base_url": "https://example.com/docs",
"extract_api": True,
"max_pages": 50
},
{
"type": "github",
"repo": "test/repo",
"include_code": True,
"code_analysis_depth": "surface"
}
]
{"type": "documentation", "base_url": "https://example.com/docs", "extract_api": True, "max_pages": 50},
{"type": "github", "repo": "test/repo", "include_code": True, "code_analysis_depth": "surface"},
],
}
with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as f:
with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f:
json.dump(config, f)
config_path = f.name
@@ -88,7 +70,7 @@ def test_create_temp_unified_config():
validator = validate_config(config_path)
assert validator.is_unified
assert validator.needs_api_merge()
assert len(validator.config['sources']) == 2
assert len(validator.config["sources"]) == 2
print(" ✓ Config is valid unified format")
print(f" Sources: {len(validator.config['sources'])}")
finally:
@@ -102,22 +84,13 @@ def test_mixed_source_types():
"description": "Test mixed sources",
"merge_mode": "rule-based",
"sources": [
{
"type": "documentation",
"base_url": "https://example.com"
},
{
"type": "github",
"repo": "test/repo"
},
{
"type": "pdf",
"path": "/path/to/manual.pdf"
}
]
{"type": "documentation", "base_url": "https://example.com"},
{"type": "github", "repo": "test/repo"},
{"type": "pdf", "path": "/path/to/manual.pdf"},
],
}
with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as f:
with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f:
json.dump(config, f)
config_path = f.name
@@ -125,13 +98,13 @@ def test_mixed_source_types():
print("\n✓ Validating mixed source types...")
validator = validate_config(config_path)
assert validator.is_unified
assert len(validator.config['sources']) == 3
assert len(validator.config["sources"]) == 3
# Check each source type
source_types = [s['type'] for s in validator.config['sources']]
assert 'documentation' in source_types
assert 'github' in source_types
assert 'pdf' in source_types
source_types = [s["type"] for s in validator.config["sources"]]
assert "documentation" in source_types
assert "github" in source_types
assert "pdf" in source_types
print(" ✓ All 3 source types validated")
finally:
os.unlink(config_path)
@@ -143,12 +116,10 @@ def test_config_validation_errors():
config = {
"name": "test",
"description": "Test",
"sources": [
{"type": "invalid_type", "url": "https://example.com"}
]
"sources": [{"type": "invalid_type", "url": "https://example.com"}],
}
with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as f:
with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f:
json.dump(config, f)
config_path = f.name
@@ -166,7 +137,7 @@ def test_config_validation_errors():
# Run tests
if __name__ == '__main__':
if __name__ == "__main__":
print("=" * 60)
print("Running Unified Scraper Integration Tests")
print("=" * 60)
@@ -188,5 +159,6 @@ if __name__ == '__main__':
except Exception as e:
print(f"\n❌ Unexpected error: {e}")
import traceback
traceback.print_exc()
sys.exit(1)

View File

@@ -13,21 +13,21 @@ Analysis modes:
"""
import os
from pathlib import Path
from typing import Dict, Optional, List
from dataclasses import dataclass
from pathlib import Path
from skill_seekers.cli.github_fetcher import GitHubThreeStreamFetcher, ThreeStreamData
from skill_seekers.cli.github_fetcher import GitHubThreeStreamFetcher
@dataclass
class AnalysisResult:
"""Unified analysis result from any codebase source."""
code_analysis: Dict
github_docs: Optional[Dict] = None
github_insights: Optional[Dict] = None
source_type: str = 'local' # 'local' or 'github'
analysis_depth: str = 'basic' # 'basic' or 'c3x'
code_analysis: dict
github_docs: dict | None = None
github_insights: dict | None = None
source_type: str = "local" # 'local' or 'github'
analysis_depth: str = "basic" # 'basic' or 'c3x'
class UnifiedCodebaseAnalyzer:
@@ -59,21 +59,17 @@ class UnifiedCodebaseAnalyzer:
)
"""
def __init__(self, github_token: Optional[str] = None):
def __init__(self, github_token: str | None = None):
"""
Initialize analyzer.
Args:
github_token: Optional GitHub API token for higher rate limits
"""
self.github_token = github_token or os.getenv('GITHUB_TOKEN')
self.github_token = github_token or os.getenv("GITHUB_TOKEN")
def analyze(
self,
source: str,
depth: str = 'c3x',
fetch_github_metadata: bool = True,
output_dir: Optional[Path] = None
self, source: str, depth: str = "c3x", fetch_github_metadata: bool = True, output_dir: Path | None = None
) -> AnalysisResult:
"""
Analyze codebase with specified depth.
@@ -92,18 +88,14 @@ class UnifiedCodebaseAnalyzer:
# Step 1: Acquire source
if self.is_github_url(source):
print(f"📦 Source type: GitHub repository")
print("📦 Source type: GitHub repository")
return self._analyze_github(source, depth, fetch_github_metadata, output_dir)
else:
print(f"📁 Source type: Local directory")
print("📁 Source type: Local directory")
return self._analyze_local(source, depth)
def _analyze_github(
self,
repo_url: str,
depth: str,
fetch_metadata: bool,
output_dir: Optional[Path]
self, repo_url: str, depth: str, fetch_metadata: bool, output_dir: Path | None
) -> AnalysisResult:
"""
Analyze GitHub repository with three-stream fetcher.
@@ -123,32 +115,28 @@ class UnifiedCodebaseAnalyzer:
# Analyze code with specified depth
code_directory = three_streams.code_stream.directory
if depth == 'basic':
if depth == "basic":
code_analysis = self.basic_analysis(code_directory)
elif depth == 'c3x':
elif depth == "c3x":
code_analysis = self.c3x_analysis(code_directory)
else:
raise ValueError(f"Unknown depth: {depth}. Use 'basic' or 'c3x'")
# Build result with all streams
result = AnalysisResult(
code_analysis=code_analysis,
source_type='github',
analysis_depth=depth
)
result = AnalysisResult(code_analysis=code_analysis, source_type="github", analysis_depth=depth)
# Add GitHub-specific data if available
if fetch_metadata:
result.github_docs = {
'readme': three_streams.docs_stream.readme,
'contributing': three_streams.docs_stream.contributing,
'docs_files': three_streams.docs_stream.docs_files
"readme": three_streams.docs_stream.readme,
"contributing": three_streams.docs_stream.contributing,
"docs_files": three_streams.docs_stream.docs_files,
}
result.github_insights = {
'metadata': three_streams.insights_stream.metadata,
'common_problems': three_streams.insights_stream.common_problems,
'known_solutions': three_streams.insights_stream.known_solutions,
'top_labels': three_streams.insights_stream.top_labels
"metadata": three_streams.insights_stream.metadata,
"common_problems": three_streams.insights_stream.common_problems,
"known_solutions": three_streams.insights_stream.known_solutions,
"top_labels": three_streams.insights_stream.top_labels,
}
return result
@@ -173,20 +161,16 @@ class UnifiedCodebaseAnalyzer:
raise NotADirectoryError(f"Not a directory: {directory}")
# Analyze code with specified depth
if depth == 'basic':
if depth == "basic":
code_analysis = self.basic_analysis(code_directory)
elif depth == 'c3x':
elif depth == "c3x":
code_analysis = self.c3x_analysis(code_directory)
else:
raise ValueError(f"Unknown depth: {depth}. Use 'basic' or 'c3x'")
return AnalysisResult(
code_analysis=code_analysis,
source_type='local',
analysis_depth=depth
)
return AnalysisResult(code_analysis=code_analysis, source_type="local", analysis_depth=depth)
def basic_analysis(self, directory: Path) -> Dict:
def basic_analysis(self, directory: Path) -> dict:
"""
Fast, shallow analysis (1-2 min).
@@ -205,19 +189,19 @@ class UnifiedCodebaseAnalyzer:
print("📊 Running basic analysis (1-2 min)...")
analysis = {
'directory': str(directory),
'analysis_type': 'basic',
'files': self.list_files(directory),
'structure': self.get_directory_structure(directory),
'imports': self.extract_imports(directory),
'entry_points': self.find_entry_points(directory),
'statistics': self.compute_statistics(directory)
"directory": str(directory),
"analysis_type": "basic",
"files": self.list_files(directory),
"structure": self.get_directory_structure(directory),
"imports": self.extract_imports(directory),
"entry_points": self.find_entry_points(directory),
"statistics": self.compute_statistics(directory),
}
print(f"✅ Basic analysis complete: {len(analysis['files'])} files analyzed")
return analysis
def c3x_analysis(self, directory: Path) -> Dict:
def c3x_analysis(self, directory: Path) -> dict:
"""
Deep C3.x analysis (20-60 min).
@@ -245,17 +229,18 @@ class UnifiedCodebaseAnalyzer:
try:
# Import codebase analyzer
from .codebase_scraper import analyze_codebase
import tempfile
from .codebase_scraper import analyze_codebase
# Create temporary output directory for C3.x analysis
temp_output = Path(tempfile.mkdtemp(prefix='c3x_analysis_'))
temp_output = Path(tempfile.mkdtemp(prefix="c3x_analysis_"))
# Run full C3.x analysis
analyze_codebase(
directory=directory,
output_dir=temp_output,
depth='deep',
depth="deep",
languages=None, # All languages
file_patterns=None, # All files
build_api_reference=True,
@@ -265,20 +250,16 @@ class UnifiedCodebaseAnalyzer:
build_how_to_guides=True,
extract_config_patterns=True,
enhance_with_ai=False, # Disable AI for speed
ai_mode='none'
ai_mode="none",
)
# Load C3.x results from output files
c3x_data = self._load_c3x_results(temp_output)
# Merge with basic analysis
c3x = {
**basic,
'analysis_type': 'c3x',
**c3x_data
}
c3x = {**basic, "analysis_type": "c3x", **c3x_data}
print(f"✅ C3.x analysis complete!")
print("✅ C3.x analysis complete!")
print(f" - {len(c3x_data.get('c3_1_patterns', []))} design patterns detected")
print(f" - {c3x_data.get('c3_2_examples_count', 0)} test examples extracted")
print(f" - {len(c3x_data.get('c3_3_guides', []))} how-to guides generated")
@@ -289,24 +270,24 @@ class UnifiedCodebaseAnalyzer:
except Exception as e:
print(f"⚠️ C3.x analysis failed: {e}")
print(f" Falling back to basic analysis with placeholders")
print(" Falling back to basic analysis with placeholders")
# Fall back to placeholders
c3x = {
**basic,
'analysis_type': 'c3x',
'c3_1_patterns': [],
'c3_2_examples': [],
'c3_2_examples_count': 0,
'c3_3_guides': [],
'c3_4_configs': [],
'c3_7_architecture': [],
'error': str(e)
"analysis_type": "c3x",
"c3_1_patterns": [],
"c3_2_examples": [],
"c3_2_examples_count": 0,
"c3_3_guides": [],
"c3_4_configs": [],
"c3_7_architecture": [],
"error": str(e),
}
return c3x
def _load_c3x_results(self, output_dir: Path) -> Dict:
def _load_c3x_results(self, output_dir: Path) -> dict:
"""
Load C3.x analysis results from output directory.
@@ -321,65 +302,65 @@ class UnifiedCodebaseAnalyzer:
c3x_data = {}
# C3.1: Design Patterns
patterns_file = output_dir / 'patterns' / 'design_patterns.json'
patterns_file = output_dir / "patterns" / "design_patterns.json"
if patterns_file.exists():
with open(patterns_file, 'r') as f:
with open(patterns_file) as f:
patterns_data = json.load(f)
c3x_data['c3_1_patterns'] = patterns_data.get('patterns', [])
c3x_data["c3_1_patterns"] = patterns_data.get("patterns", [])
else:
c3x_data['c3_1_patterns'] = []
c3x_data["c3_1_patterns"] = []
# C3.2: Test Examples
examples_file = output_dir / 'test_examples' / 'test_examples.json'
examples_file = output_dir / "test_examples" / "test_examples.json"
if examples_file.exists():
with open(examples_file, 'r') as f:
with open(examples_file) as f:
examples_data = json.load(f)
c3x_data['c3_2_examples'] = examples_data.get('examples', [])
c3x_data['c3_2_examples_count'] = examples_data.get('total_examples', 0)
c3x_data["c3_2_examples"] = examples_data.get("examples", [])
c3x_data["c3_2_examples_count"] = examples_data.get("total_examples", 0)
else:
c3x_data['c3_2_examples'] = []
c3x_data['c3_2_examples_count'] = 0
c3x_data["c3_2_examples"] = []
c3x_data["c3_2_examples_count"] = 0
# C3.3: How-to Guides
guides_file = output_dir / 'tutorials' / 'guide_collection.json'
guides_file = output_dir / "tutorials" / "guide_collection.json"
if guides_file.exists():
with open(guides_file, 'r') as f:
with open(guides_file) as f:
guides_data = json.load(f)
c3x_data['c3_3_guides'] = guides_data.get('guides', [])
c3x_data["c3_3_guides"] = guides_data.get("guides", [])
else:
c3x_data['c3_3_guides'] = []
c3x_data["c3_3_guides"] = []
# C3.4: Config Patterns
config_file = output_dir / 'config_patterns' / 'config_patterns.json'
config_file = output_dir / "config_patterns" / "config_patterns.json"
if config_file.exists():
with open(config_file, 'r') as f:
with open(config_file) as f:
config_data = json.load(f)
c3x_data['c3_4_configs'] = config_data.get('config_files', [])
c3x_data["c3_4_configs"] = config_data.get("config_files", [])
else:
c3x_data['c3_4_configs'] = []
c3x_data["c3_4_configs"] = []
# C3.7: Architecture
arch_file = output_dir / 'architecture' / 'architectural_patterns.json'
arch_file = output_dir / "architecture" / "architectural_patterns.json"
if arch_file.exists():
with open(arch_file, 'r') as f:
with open(arch_file) as f:
arch_data = json.load(f)
c3x_data['c3_7_architecture'] = arch_data.get('patterns', [])
c3x_data["c3_7_architecture"] = arch_data.get("patterns", [])
else:
c3x_data['c3_7_architecture'] = []
c3x_data["c3_7_architecture"] = []
# Add dependency graph data
dep_file = output_dir / 'dependencies' / 'dependency_graph.json'
dep_file = output_dir / "dependencies" / "dependency_graph.json"
if dep_file.exists():
with open(dep_file, 'r') as f:
with open(dep_file) as f:
dep_data = json.load(f)
c3x_data['dependency_graph'] = dep_data
c3x_data["dependency_graph"] = dep_data
# Add API reference data
api_file = output_dir / 'code_analysis.json'
api_file = output_dir / "code_analysis.json"
if api_file.exists():
with open(api_file, 'r') as f:
with open(api_file) as f:
api_data = json.load(f)
c3x_data['api_reference'] = api_data
c3x_data["api_reference"] = api_data
return c3x_data
@@ -393,9 +374,9 @@ class UnifiedCodebaseAnalyzer:
Returns:
True if GitHub URL, False otherwise
"""
return 'github.com' in source
return "github.com" in source
def list_files(self, directory: Path) -> List[Dict]:
def list_files(self, directory: Path) -> list[dict]:
"""
List all files in directory with metadata.
@@ -406,20 +387,22 @@ class UnifiedCodebaseAnalyzer:
List of file info dicts
"""
files = []
for file_path in directory.rglob('*'):
for file_path in directory.rglob("*"):
if file_path.is_file():
try:
files.append({
'path': str(file_path.relative_to(directory)),
'size': file_path.stat().st_size,
'extension': file_path.suffix
})
files.append(
{
"path": str(file_path.relative_to(directory)),
"size": file_path.stat().st_size,
"extension": file_path.suffix,
}
)
except Exception:
# Skip files we can't access
continue
return files
def get_directory_structure(self, directory: Path) -> Dict:
def get_directory_structure(self, directory: Path) -> dict:
"""
Get directory structure tree.
@@ -429,35 +412,24 @@ class UnifiedCodebaseAnalyzer:
Returns:
Dict representing directory structure
"""
structure = {
'name': directory.name,
'type': 'directory',
'children': []
}
structure = {"name": directory.name, "type": "directory", "children": []}
try:
for item in sorted(directory.iterdir()):
if item.name.startswith('.'):
if item.name.startswith("."):
continue # Skip hidden files
if item.is_dir():
# Only include immediate subdirectories
structure['children'].append({
'name': item.name,
'type': 'directory'
})
structure["children"].append({"name": item.name, "type": "directory"})
elif item.is_file():
structure['children'].append({
'name': item.name,
'type': 'file',
'extension': item.suffix
})
structure["children"].append({"name": item.name, "type": "file", "extension": item.suffix})
except Exception:
pass
return structure
def extract_imports(self, directory: Path) -> Dict[str, List[str]]:
def extract_imports(self, directory: Path) -> dict[str, list[str]]:
"""
Extract import statements from code files.
@@ -467,27 +439,23 @@ class UnifiedCodebaseAnalyzer:
Returns:
Dict mapping file extensions to import lists
"""
imports = {
'.py': [],
'.js': [],
'.ts': []
}
imports = {".py": [], ".js": [], ".ts": []}
# Sample up to 10 files per extension
for ext in imports.keys():
files = list(directory.rglob(f'*{ext}'))[:10]
for ext in imports:
files = list(directory.rglob(f"*{ext}"))[:10]
for file_path in files:
try:
content = file_path.read_text(encoding='utf-8')
if ext == '.py':
content = file_path.read_text(encoding="utf-8")
if ext == ".py":
# Extract Python imports
for line in content.split('\n')[:50]: # Check first 50 lines
if line.strip().startswith(('import ', 'from ')):
for line in content.split("\n")[:50]: # Check first 50 lines
if line.strip().startswith(("import ", "from ")):
imports[ext].append(line.strip())
elif ext in ['.js', '.ts']:
elif ext in [".js", ".ts"]:
# Extract JS/TS imports
for line in content.split('\n')[:50]:
if line.strip().startswith(('import ', 'require(')):
for line in content.split("\n")[:50]:
if line.strip().startswith(("import ", "require(")):
imports[ext].append(line.strip())
except Exception:
continue
@@ -495,7 +463,7 @@ class UnifiedCodebaseAnalyzer:
# Remove empty lists
return {k: v for k, v in imports.items() if v}
def find_entry_points(self, directory: Path) -> List[str]:
def find_entry_points(self, directory: Path) -> list[str]:
"""
Find potential entry points (main files, setup files, etc.).
@@ -509,10 +477,20 @@ class UnifiedCodebaseAnalyzer:
# Common entry point patterns
entry_patterns = [
'main.py', '__main__.py', 'app.py', 'server.py',
'index.js', 'index.ts', 'main.js', 'main.ts',
'setup.py', 'pyproject.toml', 'package.json',
'Makefile', 'docker-compose.yml', 'Dockerfile'
"main.py",
"__main__.py",
"app.py",
"server.py",
"index.js",
"index.ts",
"main.js",
"main.ts",
"setup.py",
"pyproject.toml",
"package.json",
"Makefile",
"docker-compose.yml",
"Dockerfile",
]
for pattern in entry_patterns:
@@ -525,7 +503,7 @@ class UnifiedCodebaseAnalyzer:
return entry_points
def compute_statistics(self, directory: Path) -> Dict:
def compute_statistics(self, directory: Path) -> dict:
"""
Compute basic statistics about the codebase.
@@ -535,39 +513,34 @@ class UnifiedCodebaseAnalyzer:
Returns:
Dict with statistics
"""
stats = {
'total_files': 0,
'total_size_bytes': 0,
'file_types': {},
'languages': {}
}
stats = {"total_files": 0, "total_size_bytes": 0, "file_types": {}, "languages": {}}
for file_path in directory.rglob('*'):
for file_path in directory.rglob("*"):
if not file_path.is_file():
continue
try:
stats['total_files'] += 1
stats['total_size_bytes'] += file_path.stat().st_size
stats["total_files"] += 1
stats["total_size_bytes"] += file_path.stat().st_size
ext = file_path.suffix
if ext:
stats['file_types'][ext] = stats['file_types'].get(ext, 0) + 1
stats["file_types"][ext] = stats["file_types"].get(ext, 0) + 1
# Map extensions to languages
language_map = {
'.py': 'Python',
'.js': 'JavaScript',
'.ts': 'TypeScript',
'.go': 'Go',
'.rs': 'Rust',
'.java': 'Java',
'.rb': 'Ruby',
'.php': 'PHP'
".py": "Python",
".js": "JavaScript",
".ts": "TypeScript",
".go": "Go",
".rs": "Rust",
".java": "Java",
".rb": "Ruby",
".php": "PHP",
}
if ext in language_map:
lang = language_map[ext]
stats['languages'][lang] = stats['languages'].get(lang, 0) + 1
stats["languages"][lang] = stats["languages"].get(lang, 0) + 1
except Exception:
continue

View File

@@ -12,31 +12,28 @@ Usage:
skill-seekers unified --config configs/react_unified.json --merge-mode claude-enhanced
"""
import os
import sys
import argparse
import json
import logging
import argparse
import subprocess
import os
import shutil
import subprocess
import sys
from pathlib import Path
from typing import Dict, List, Any, Optional
from typing import Any
# Import validators and scrapers
try:
from skill_seekers.cli.config_validator import ConfigValidator, validate_config
from skill_seekers.cli.conflict_detector import ConflictDetector
from skill_seekers.cli.merge_sources import RuleBasedMerger, ClaudeEnhancedMerger
from skill_seekers.cli.merge_sources import ClaudeEnhancedMerger, RuleBasedMerger
from skill_seekers.cli.unified_skill_builder import UnifiedSkillBuilder
except ImportError as e:
print(f"Error importing modules: {e}")
print("Make sure you're running from the project root directory")
sys.exit(1)
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s'
)
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)
@@ -52,7 +49,7 @@ class UnifiedScraper:
5. Build unified skill
"""
def __init__(self, config_path: str, merge_mode: Optional[str] = None):
def __init__(self, config_path: str, merge_mode: str | None = None):
"""
Initialize unified scraper.
@@ -68,21 +65,21 @@ class UnifiedScraper:
self.config = self.validator.config
# Determine merge mode
self.merge_mode = merge_mode or self.config.get('merge_mode', 'rule-based')
self.merge_mode = merge_mode or self.config.get("merge_mode", "rule-based")
logger.info(f"Merge mode: {self.merge_mode}")
# Storage for scraped data - use lists to support multiple sources of same type
self.scraped_data = {
'documentation': [], # List of doc sources
'github': [], # List of github sources
'pdf': [] # List of pdf sources
"documentation": [], # List of doc sources
"github": [], # List of github sources
"pdf": [], # List of pdf sources
}
# Track source index for unique naming (multi-source support)
self._source_counters = {'documentation': 0, 'github': 0, 'pdf': 0}
self._source_counters = {"documentation": 0, "github": 0, "pdf": 0}
# Output paths - cleaner organization
self.name = self.config['name']
self.name = self.config["name"]
self.output_dir = f"output/{self.name}" # Final skill only
# Use hidden cache directory for intermediate files
@@ -107,17 +104,16 @@ class UnifiedScraper:
from datetime import datetime
# Create log filename with timestamp
timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
log_file = f"{self.logs_dir}/unified_{timestamp}.log"
# Add file handler to root logger
file_handler = logging.FileHandler(log_file, encoding='utf-8')
file_handler = logging.FileHandler(log_file, encoding="utf-8")
file_handler.setLevel(logging.DEBUG)
# Create formatter
formatter = logging.Formatter(
'%(asctime)s - %(name)s - %(levelname)s - %(message)s',
datefmt='%Y-%m-%d %H:%M:%S'
"%(asctime)s - %(name)s - %(levelname)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S"
)
file_handler.setFormatter(formatter)
@@ -141,18 +137,18 @@ class UnifiedScraper:
logger.warning("Config is not unified format, converting...")
self.config = self.validator.convert_legacy_to_unified()
sources = self.config.get('sources', [])
sources = self.config.get("sources", [])
for i, source in enumerate(sources):
source_type = source['type']
logger.info(f"\n[{i+1}/{len(sources)}] Scraping {source_type} source...")
source_type = source["type"]
logger.info(f"\n[{i + 1}/{len(sources)}] Scraping {source_type} source...")
try:
if source_type == 'documentation':
if source_type == "documentation":
self._scrape_documentation(source)
elif source_type == 'github':
elif source_type == "github":
self._scrape_github(source)
elif source_type == 'pdf':
elif source_type == "pdf":
self._scrape_pdf(source)
else:
logger.warning(f"Unknown source type: {source_type}")
@@ -162,40 +158,40 @@ class UnifiedScraper:
logger.info(f"\n✅ Scraped {len(self.scraped_data)} sources successfully")
def _scrape_documentation(self, source: Dict[str, Any]):
def _scrape_documentation(self, source: dict[str, Any]):
"""Scrape documentation website."""
# Create temporary config for doc scraper
doc_config = {
'name': f"{self.name}_docs",
'base_url': source['base_url'],
'selectors': source.get('selectors', {}),
'url_patterns': source.get('url_patterns', {}),
'categories': source.get('categories', {}),
'rate_limit': source.get('rate_limit', 0.5),
'max_pages': source.get('max_pages', 100)
"name": f"{self.name}_docs",
"base_url": source["base_url"],
"selectors": source.get("selectors", {}),
"url_patterns": source.get("url_patterns", {}),
"categories": source.get("categories", {}),
"rate_limit": source.get("rate_limit", 0.5),
"max_pages": source.get("max_pages", 100),
}
# Pass through llms.txt settings (so unified configs behave the same as doc_scraper configs)
if 'llms_txt_url' in source:
doc_config['llms_txt_url'] = source.get('llms_txt_url')
if "llms_txt_url" in source:
doc_config["llms_txt_url"] = source.get("llms_txt_url")
if 'skip_llms_txt' in source:
doc_config['skip_llms_txt'] = source.get('skip_llms_txt')
if "skip_llms_txt" in source:
doc_config["skip_llms_txt"] = source.get("skip_llms_txt")
# Optional: support overriding start URLs
if 'start_urls' in source:
doc_config['start_urls'] = source.get('start_urls')
if "start_urls" in source:
doc_config["start_urls"] = source.get("start_urls")
# Write temporary config
temp_config_path = os.path.join(self.data_dir, 'temp_docs_config.json')
with open(temp_config_path, 'w', encoding='utf-8') as f:
temp_config_path = os.path.join(self.data_dir, "temp_docs_config.json")
with open(temp_config_path, "w", encoding="utf-8") as f:
json.dump(doc_config, f, indent=2)
# Run doc_scraper as subprocess
logger.info(f"Scraping documentation from {source['base_url']}")
doc_scraper_path = Path(__file__).parent / "doc_scraper.py"
cmd = [sys.executable, str(doc_scraper_path), '--config', temp_config_path, '--fresh']
cmd = [sys.executable, str(doc_scraper_path), "--config", temp_config_path, "--fresh"]
result = subprocess.run(cmd, capture_output=True, text=True, stdin=subprocess.DEVNULL)
@@ -213,18 +209,20 @@ class UnifiedScraper:
docs_data_file = f"output/{doc_config['name']}_data/summary.json"
if os.path.exists(docs_data_file):
with open(docs_data_file, 'r', encoding='utf-8') as f:
with open(docs_data_file, encoding="utf-8") as f:
summary = json.load(f)
# Append to documentation list (multi-source support)
self.scraped_data['documentation'].append({
'source_id': doc_config['name'],
'base_url': source['base_url'],
'pages': summary.get('pages', []),
'total_pages': summary.get('total_pages', 0),
'data_file': docs_data_file,
'refs_dir': '' # Will be set after moving to cache
})
self.scraped_data["documentation"].append(
{
"source_id": doc_config["name"],
"base_url": source["base_url"],
"pages": summary.get("pages", []),
"total_pages": summary.get("total_pages", 0),
"data_file": docs_data_file,
"refs_dir": "", # Will be set after moving to cache
}
)
logger.info(f"✅ Documentation: {summary.get('total_pages', 0)} pages scraped")
else:
@@ -246,9 +244,9 @@ class UnifiedScraper:
logger.info(f"📦 Moved docs output to cache: {cache_docs_dir}")
# Update refs_dir in scraped_data with cache location
refs_dir_path = os.path.join(cache_docs_dir, 'references')
if self.scraped_data['documentation']:
self.scraped_data['documentation'][-1]['refs_dir'] = refs_dir_path
refs_dir_path = os.path.join(cache_docs_dir, "references")
if self.scraped_data["documentation"]:
self.scraped_data["documentation"][-1]["refs_dir"] = refs_dir_path
if os.path.exists(docs_data_dir):
cache_data_dir = os.path.join(self.data_dir, f"{doc_config['name']}_data")
@@ -257,7 +255,7 @@ class UnifiedScraper:
shutil.move(docs_data_dir, cache_data_dir)
logger.info(f"📦 Moved docs data to cache: {cache_data_dir}")
def _clone_github_repo(self, repo_name: str, idx: int = 0) -> Optional[str]:
def _clone_github_repo(self, repo_name: str, idx: int = 0) -> str | None:
"""
Clone GitHub repository to cache directory for C3.x analysis.
Reuses existing clone if already present.
@@ -274,9 +272,9 @@ class UnifiedScraper:
clone_path = os.path.join(self.repos_dir, repo_dir_name)
# Check if already cloned
if os.path.exists(clone_path) and os.path.isdir(os.path.join(clone_path, '.git')):
if os.path.exists(clone_path) and os.path.isdir(os.path.join(clone_path, ".git")):
logger.info(f"♻️ Found existing repository clone: {clone_path}")
logger.info(f" Reusing for C3.x analysis (skip re-cloning)")
logger.info(" Reusing for C3.x analysis (skip re-cloning)")
return clone_path
# repos_dir already created in __init__
@@ -285,18 +283,18 @@ class UnifiedScraper:
repo_url = f"https://github.com/{repo_name}.git"
logger.info(f"🔄 Cloning repository for C3.x analysis: {repo_url}")
logger.info(f"{clone_path}")
logger.info(f" 💾 Clone will be saved for future reuse")
logger.info(" 💾 Clone will be saved for future reuse")
try:
result = subprocess.run(
['git', 'clone', repo_url, clone_path],
["git", "clone", repo_url, clone_path],
capture_output=True,
text=True,
timeout=600 # 10 minute timeout for full clone
timeout=600, # 10 minute timeout for full clone
)
if result.returncode == 0:
logger.info(f"✅ Repository cloned successfully")
logger.info("✅ Repository cloned successfully")
logger.info(f" 📁 Saved to: {clone_path}")
return clone_path
else:
@@ -307,7 +305,7 @@ class UnifiedScraper:
return None
except subprocess.TimeoutExpired:
logger.error(f"❌ Git clone timed out after 10 minutes")
logger.error("❌ Git clone timed out after 10 minutes")
if os.path.exists(clone_path):
shutil.rmtree(clone_path)
return None
@@ -317,7 +315,7 @@ class UnifiedScraper:
shutil.rmtree(clone_path)
return None
def _scrape_github(self, source: Dict[str, Any]):
def _scrape_github(self, source: dict[str, Any]):
"""Scrape GitHub repository."""
try:
from skill_seekers.cli.github_scraper import GitHubScraper
@@ -326,16 +324,16 @@ class UnifiedScraper:
return
# Multi-source support: Get unique index for this GitHub source
idx = self._source_counters['github']
self._source_counters['github'] += 1
idx = self._source_counters["github"]
self._source_counters["github"] += 1
# Extract repo identifier for unique naming
repo = source['repo']
repo_id = repo.replace('/', '_')
repo = source["repo"]
repo_id = repo.replace("/", "_")
# Check if we need to clone for C3.x analysis
enable_codebase_analysis = source.get('enable_codebase_analysis', True)
local_repo_path = source.get('local_repo_path')
enable_codebase_analysis = source.get("enable_codebase_analysis", True)
local_repo_path = source.get("local_repo_path")
cloned_repo_path = None
# Auto-clone if C3.x analysis is enabled but no local path provided
@@ -351,24 +349,24 @@ class UnifiedScraper:
# Create config for GitHub scraper
github_config = {
'repo': repo,
'name': f"{self.name}_github_{idx}_{repo_id}",
'github_token': source.get('github_token'),
'include_issues': source.get('include_issues', True),
'max_issues': source.get('max_issues', 100),
'include_changelog': source.get('include_changelog', True),
'include_releases': source.get('include_releases', True),
'include_code': source.get('include_code', True),
'code_analysis_depth': source.get('code_analysis_depth', 'surface'),
'file_patterns': source.get('file_patterns', []),
'local_repo_path': local_repo_path # Use cloned path if available
"repo": repo,
"name": f"{self.name}_github_{idx}_{repo_id}",
"github_token": source.get("github_token"),
"include_issues": source.get("include_issues", True),
"max_issues": source.get("max_issues", 100),
"include_changelog": source.get("include_changelog", True),
"include_releases": source.get("include_releases", True),
"include_code": source.get("include_code", True),
"code_analysis_depth": source.get("code_analysis_depth", "surface"),
"file_patterns": source.get("file_patterns", []),
"local_repo_path": local_repo_path, # Use cloned path if available
}
# Pass directory exclusions if specified (optional)
if 'exclude_dirs' in source:
github_config['exclude_dirs'] = source['exclude_dirs']
if 'exclude_dirs_additional' in source:
github_config['exclude_dirs_additional'] = source['exclude_dirs_additional']
if "exclude_dirs" in source:
github_config["exclude_dirs"] = source["exclude_dirs"]
if "exclude_dirs_additional" in source:
github_config["exclude_dirs_additional"] = source["exclude_dirs_additional"]
# Scrape
logger.info(f"Scraping GitHub repository: {source['repo']}")
@@ -381,13 +379,14 @@ class UnifiedScraper:
try:
c3_data = self._run_c3_analysis(local_repo_path, source)
if c3_data:
github_data['c3_analysis'] = c3_data
github_data["c3_analysis"] = c3_data
logger.info("✅ C3.x analysis complete")
else:
logger.warning("⚠️ C3.x analysis returned no data")
except Exception as e:
logger.warning(f"⚠️ C3.x analysis failed: {e}")
import traceback
logger.debug(f"Traceback: {traceback.format_exc()}")
# Continue without C3.x data - graceful degradation
@@ -396,32 +395,29 @@ class UnifiedScraper:
logger.info(f"📁 Repository clone saved for future use: {cloned_repo_path}")
# Save data to unified location with unique filename
github_data_file = os.path.join(self.data_dir, f'github_data_{idx}_{repo_id}.json')
with open(github_data_file, 'w', encoding='utf-8') as f:
github_data_file = os.path.join(self.data_dir, f"github_data_{idx}_{repo_id}.json")
with open(github_data_file, "w", encoding="utf-8") as f:
json.dump(github_data, f, indent=2, ensure_ascii=False)
# ALSO save to the location GitHubToSkillConverter expects (with C3.x data!)
converter_data_file = f"output/{github_config['name']}_github_data.json"
with open(converter_data_file, 'w', encoding='utf-8') as f:
with open(converter_data_file, "w", encoding="utf-8") as f:
json.dump(github_data, f, indent=2, ensure_ascii=False)
# Append to list instead of overwriting (multi-source support)
self.scraped_data['github'].append({
'repo': repo,
'repo_id': repo_id,
'idx': idx,
'data': github_data,
'data_file': github_data_file
})
self.scraped_data["github"].append(
{"repo": repo, "repo_id": repo_id, "idx": idx, "data": github_data, "data_file": github_data_file}
)
# Build standalone SKILL.md for synthesis using GitHubToSkillConverter
try:
from skill_seekers.cli.github_scraper import GitHubToSkillConverter
# Use github_config which has the correct name field
# Converter will load from output/{name}_github_data.json which now has C3.x data
converter = GitHubToSkillConverter(config=github_config)
converter.build_skill()
logger.info(f"✅ GitHub: Standalone SKILL.md created")
logger.info("✅ GitHub: Standalone SKILL.md created")
except Exception as e:
logger.warning(f"⚠️ Failed to build standalone GitHub SKILL.md: {e}")
@@ -430,7 +426,7 @@ class UnifiedScraper:
github_data_file_path = f"output/{github_config['name']}_github_data.json"
if os.path.exists(github_output_dir):
cache_github_dir = os.path.join(self.sources_dir, github_config['name'])
cache_github_dir = os.path.join(self.sources_dir, github_config["name"])
if os.path.exists(cache_github_dir):
shutil.rmtree(cache_github_dir)
shutil.move(github_output_dir, cache_github_dir)
@@ -443,9 +439,9 @@ class UnifiedScraper:
shutil.move(github_data_file_path, cache_github_data)
logger.info(f"📦 Moved GitHub data to cache: {cache_github_data}")
logger.info(f"✅ GitHub: Repository scraped successfully")
logger.info("✅ GitHub: Repository scraped successfully")
def _scrape_pdf(self, source: Dict[str, Any]):
def _scrape_pdf(self, source: dict[str, Any]):
"""Scrape PDF document."""
try:
from skill_seekers.cli.pdf_scraper import PDFToSkillConverter
@@ -454,20 +450,20 @@ class UnifiedScraper:
return
# Multi-source support: Get unique index for this PDF source
idx = self._source_counters['pdf']
self._source_counters['pdf'] += 1
idx = self._source_counters["pdf"]
self._source_counters["pdf"] += 1
# Extract PDF identifier for unique naming (filename without extension)
pdf_path = source['path']
pdf_path = source["path"]
pdf_id = os.path.splitext(os.path.basename(pdf_path))[0]
# Create config for PDF scraper
pdf_config = {
'name': f"{self.name}_pdf_{idx}_{pdf_id}",
'pdf': source['path'],
'extract_tables': source.get('extract_tables', False),
'ocr': source.get('ocr', False),
'password': source.get('password')
"name": f"{self.name}_pdf_{idx}_{pdf_id}",
"pdf": source["path"],
"extract_tables": source.get("extract_tables", False),
"ocr": source.get("ocr", False),
"password": source.get("password"),
}
# Scrape
@@ -476,29 +472,25 @@ class UnifiedScraper:
pdf_data = converter.extract_all()
# Save data
pdf_data_file = os.path.join(self.data_dir, f'pdf_data_{idx}_{pdf_id}.json')
with open(pdf_data_file, 'w', encoding='utf-8') as f:
pdf_data_file = os.path.join(self.data_dir, f"pdf_data_{idx}_{pdf_id}.json")
with open(pdf_data_file, "w", encoding="utf-8") as f:
json.dump(pdf_data, f, indent=2, ensure_ascii=False)
# Append to list instead of overwriting
self.scraped_data['pdf'].append({
'pdf_path': pdf_path,
'pdf_id': pdf_id,
'idx': idx,
'data': pdf_data,
'data_file': pdf_data_file
})
self.scraped_data["pdf"].append(
{"pdf_path": pdf_path, "pdf_id": pdf_id, "idx": idx, "data": pdf_data, "data_file": pdf_data_file}
)
# Build standalone SKILL.md for synthesis
try:
converter.build_skill()
logger.info(f"✅ PDF: Standalone SKILL.md created")
logger.info("✅ PDF: Standalone SKILL.md created")
except Exception as e:
logger.warning(f"⚠️ Failed to build standalone PDF SKILL.md: {e}")
logger.info(f"✅ PDF: {len(pdf_data.get('pages', []))} pages extracted")
def _load_json(self, file_path: Path) -> Dict:
def _load_json(self, file_path: Path) -> dict:
"""
Load JSON file safely.
@@ -513,13 +505,13 @@ class UnifiedScraper:
return {}
try:
with open(file_path, 'r', encoding='utf-8') as f:
with open(file_path, encoding="utf-8") as f:
return json.load(f)
except (json.JSONDecodeError, IOError) as e:
except (OSError, json.JSONDecodeError) as e:
logger.warning(f"Failed to load JSON {file_path}: {e}")
return {}
def _load_guide_collection(self, tutorials_dir: Path) -> Dict:
def _load_guide_collection(self, tutorials_dir: Path) -> dict:
"""
Load how-to guide collection from tutorials directory.
@@ -531,22 +523,22 @@ class UnifiedScraper:
"""
if not tutorials_dir.exists():
logger.warning(f"Tutorials directory not found: {tutorials_dir}")
return {'guides': []}
return {"guides": []}
collection_file = tutorials_dir / 'guide_collection.json'
collection_file = tutorials_dir / "guide_collection.json"
if collection_file.exists():
return self._load_json(collection_file)
# Fallback: scan for individual guide JSON files
guides = []
for guide_file in tutorials_dir.glob('guide_*.json'):
for guide_file in tutorials_dir.glob("guide_*.json"):
guide_data = self._load_json(guide_file)
if guide_data:
guides.append(guide_data)
return {'guides': guides, 'total_count': len(guides)}
return {"guides": guides, "total_count": len(guides)}
def _load_api_reference(self, api_dir: Path) -> Dict[str, Any]:
def _load_api_reference(self, api_dir: Path) -> dict[str, Any]:
"""
Load API reference markdown files from api_reference directory.
@@ -561,16 +553,16 @@ class UnifiedScraper:
return {}
api_refs = {}
for md_file in api_dir.glob('*.md'):
for md_file in api_dir.glob("*.md"):
try:
module_name = md_file.stem
api_refs[module_name] = md_file.read_text(encoding='utf-8')
except IOError as e:
api_refs[module_name] = md_file.read_text(encoding="utf-8")
except OSError as e:
logger.warning(f"Failed to read API reference {md_file}: {e}")
return api_refs
def _run_c3_analysis(self, local_repo_path: str, source: Dict[str, Any]) -> Dict[str, Any]:
def _run_c3_analysis(self, local_repo_path: str, source: dict[str, Any]) -> dict[str, Any]:
"""
Run comprehensive C3.x codebase analysis.
@@ -592,7 +584,7 @@ class UnifiedScraper:
return {}
# Create temp output dir for C3.x analysis
temp_output = Path(self.data_dir) / 'c3_analysis_temp'
temp_output = Path(self.data_dir) / "c3_analysis_temp"
temp_output.mkdir(parents=True, exist_ok=True)
logger.info(f" Analyzing codebase: {local_repo_path}")
@@ -602,37 +594,37 @@ class UnifiedScraper:
results = analyze_codebase(
directory=Path(local_repo_path),
output_dir=temp_output,
depth='deep',
depth="deep",
languages=None, # Analyze all languages
file_patterns=source.get('file_patterns'),
build_api_reference=True, # C2.5: API Reference
extract_comments=False, # Not needed
file_patterns=source.get("file_patterns"),
build_api_reference=True, # C2.5: API Reference
extract_comments=False, # Not needed
build_dependency_graph=True, # C2.6: Dependency Graph
detect_patterns=True, # C3.1: Design patterns
extract_test_examples=True, # C3.2: Test examples
build_how_to_guides=True, # C3.3: How-to guides
detect_patterns=True, # C3.1: Design patterns
extract_test_examples=True, # C3.2: Test examples
build_how_to_guides=True, # C3.3: How-to guides
extract_config_patterns=True, # C3.4: Config patterns
enhance_with_ai=source.get('ai_mode', 'auto') != 'none',
ai_mode=source.get('ai_mode', 'auto')
enhance_with_ai=source.get("ai_mode", "auto") != "none",
ai_mode=source.get("ai_mode", "auto"),
)
# Load C3.x outputs into memory
c3_data = {
'patterns': self._load_json(temp_output / 'patterns' / 'detected_patterns.json'),
'test_examples': self._load_json(temp_output / 'test_examples' / 'test_examples.json'),
'how_to_guides': self._load_guide_collection(temp_output / 'tutorials'),
'config_patterns': self._load_json(temp_output / 'config_patterns' / 'config_patterns.json'),
'architecture': self._load_json(temp_output / 'architecture' / 'architectural_patterns.json'),
'api_reference': self._load_api_reference(temp_output / 'api_reference'), # C2.5
'dependency_graph': self._load_json(temp_output / 'dependencies' / 'dependency_graph.json') # C2.6
"patterns": self._load_json(temp_output / "patterns" / "detected_patterns.json"),
"test_examples": self._load_json(temp_output / "test_examples" / "test_examples.json"),
"how_to_guides": self._load_guide_collection(temp_output / "tutorials"),
"config_patterns": self._load_json(temp_output / "config_patterns" / "config_patterns.json"),
"architecture": self._load_json(temp_output / "architecture" / "architectural_patterns.json"),
"api_reference": self._load_api_reference(temp_output / "api_reference"), # C2.5
"dependency_graph": self._load_json(temp_output / "dependencies" / "dependency_graph.json"), # C2.6
}
# Log summary
total_patterns = sum(len(f.get('patterns', [])) for f in c3_data.get('patterns', []))
total_examples = c3_data.get('test_examples', {}).get('total_examples', 0)
total_guides = len(c3_data.get('how_to_guides', {}).get('guides', []))
total_configs = len(c3_data.get('config_patterns', {}).get('config_files', []))
arch_patterns = len(c3_data.get('architecture', {}).get('patterns', []))
total_patterns = sum(len(f.get("patterns", [])) for f in c3_data.get("patterns", []))
total_examples = c3_data.get("test_examples", {}).get("total_examples", 0)
total_guides = len(c3_data.get("how_to_guides", {}).get("guides", []))
total_configs = len(c3_data.get("config_patterns", {}).get("config_files", []))
arch_patterns = len(c3_data.get("architecture", {}).get("patterns", []))
logger.info(f" ✓ Design Patterns: {total_patterns}")
logger.info(f" ✓ Test Examples: {total_examples}")
@@ -645,6 +637,7 @@ class UnifiedScraper:
except Exception as e:
logger.error(f"C3.x analysis failed: {e}")
import traceback
traceback.print_exc()
return {}
@@ -656,7 +649,7 @@ class UnifiedScraper:
except Exception as e:
logger.warning(f"Failed to clean up temp directory: {e}")
def detect_conflicts(self) -> List:
def detect_conflicts(self) -> list:
"""
Detect conflicts between documentation and code.
@@ -674,18 +667,18 @@ class UnifiedScraper:
return []
# Get documentation and GitHub data
docs_data = self.scraped_data.get('documentation', {})
github_data = self.scraped_data.get('github', {})
docs_data = self.scraped_data.get("documentation", {})
github_data = self.scraped_data.get("github", {})
if not docs_data or not github_data:
logger.warning("Missing documentation or GitHub data for conflict detection")
return []
# Load data files
with open(docs_data['data_file'], 'r', encoding='utf-8') as f:
with open(docs_data["data_file"], encoding="utf-8") as f:
docs_json = json.load(f)
with open(github_data['data_file'], 'r', encoding='utf-8') as f:
with open(github_data["data_file"], encoding="utf-8") as f:
github_json = json.load(f)
# Detect conflicts
@@ -693,26 +686,26 @@ class UnifiedScraper:
conflicts = detector.detect_all_conflicts()
# Save conflicts
conflicts_file = os.path.join(self.data_dir, 'conflicts.json')
conflicts_file = os.path.join(self.data_dir, "conflicts.json")
detector.save_conflicts(conflicts, conflicts_file)
# Print summary
summary = detector.generate_summary(conflicts)
logger.info(f"\n📊 Conflict Summary:")
logger.info("\n📊 Conflict Summary:")
logger.info(f" Total: {summary['total']}")
logger.info(f" By Type:")
for ctype, count in summary['by_type'].items():
logger.info(" By Type:")
for ctype, count in summary["by_type"].items():
if count > 0:
logger.info(f" - {ctype}: {count}")
logger.info(f" By Severity:")
for severity, count in summary['by_severity'].items():
logger.info(" By Severity:")
for severity, count in summary["by_severity"].items():
if count > 0:
emoji = '🔴' if severity == 'high' else '🟡' if severity == 'medium' else '🟢'
emoji = "🔴" if severity == "high" else "🟡" if severity == "medium" else "🟢"
logger.info(f" {emoji} {severity}: {count}")
return conflicts
def merge_sources(self, conflicts: List):
def merge_sources(self, conflicts: list):
"""
Merge data from multiple sources.
@@ -728,18 +721,18 @@ class UnifiedScraper:
return None
# Get data files
docs_data = self.scraped_data.get('documentation', {})
github_data = self.scraped_data.get('github', {})
docs_data = self.scraped_data.get("documentation", {})
github_data = self.scraped_data.get("github", {})
# Load data
with open(docs_data['data_file'], 'r', encoding='utf-8') as f:
with open(docs_data["data_file"], encoding="utf-8") as f:
docs_json = json.load(f)
with open(github_data['data_file'], 'r', encoding='utf-8') as f:
with open(github_data["data_file"], encoding="utf-8") as f:
github_json = json.load(f)
# Choose merger
if self.merge_mode == 'claude-enhanced':
if self.merge_mode == "claude-enhanced":
merger = ClaudeEnhancedMerger(docs_json, github_json, conflicts)
else:
merger = RuleBasedMerger(docs_json, github_json, conflicts)
@@ -748,15 +741,15 @@ class UnifiedScraper:
merged_data = merger.merge_all()
# Save merged data
merged_file = os.path.join(self.data_dir, 'merged_data.json')
with open(merged_file, 'w', encoding='utf-8') as f:
merged_file = os.path.join(self.data_dir, "merged_data.json")
with open(merged_file, "w", encoding="utf-8") as f:
json.dump(merged_data, f, indent=2, ensure_ascii=False)
logger.info(f"✅ Merged data saved: {merged_file}")
return merged_data
def build_skill(self, merged_data: Optional[Dict] = None):
def build_skill(self, merged_data: dict | None = None):
"""
Build final unified skill.
@@ -769,20 +762,14 @@ class UnifiedScraper:
# Load conflicts if they exist
conflicts = []
conflicts_file = os.path.join(self.data_dir, 'conflicts.json')
conflicts_file = os.path.join(self.data_dir, "conflicts.json")
if os.path.exists(conflicts_file):
with open(conflicts_file, 'r', encoding='utf-8') as f:
with open(conflicts_file, encoding="utf-8") as f:
conflicts_data = json.load(f)
conflicts = conflicts_data.get('conflicts', [])
conflicts = conflicts_data.get("conflicts", [])
# Build skill
builder = UnifiedSkillBuilder(
self.config,
self.scraped_data,
merged_data,
conflicts,
cache_dir=self.cache_dir
)
builder = UnifiedSkillBuilder(self.config, self.scraped_data, merged_data, conflicts, cache_dir=self.cache_dir)
builder.build()
@@ -824,6 +811,7 @@ class UnifiedScraper:
except Exception as e:
logger.error(f"\n\n❌ Error during scraping: {e}")
import traceback
traceback.print_exc()
sys.exit(1)
@@ -831,7 +819,7 @@ class UnifiedScraper:
def main():
"""Main entry point."""
parser = argparse.ArgumentParser(
description='Unified multi-source scraper',
description="Unified multi-source scraper",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
Examples:
@@ -843,17 +831,18 @@ Examples:
# Backward compatible with legacy configs
skill-seekers unified --config configs/react.json
"""
""",
)
parser.add_argument('--config', '-c', required=True,
help='Path to unified config JSON file')
parser.add_argument('--merge-mode', '-m',
choices=['rule-based', 'claude-enhanced'],
help='Override config merge mode')
parser.add_argument('--skip-codebase-analysis',
action='store_true',
help='Skip C3.x codebase analysis for GitHub sources (default: enabled)')
parser.add_argument("--config", "-c", required=True, help="Path to unified config JSON file")
parser.add_argument(
"--merge-mode", "-m", choices=["rule-based", "claude-enhanced"], help="Override config merge mode"
)
parser.add_argument(
"--skip-codebase-analysis",
action="store_true",
help="Skip C3.x codebase analysis for GitHub sources (default: enabled)",
)
args = parser.parse_args()
@@ -862,14 +851,14 @@ Examples:
# Disable codebase analysis if requested
if args.skip_codebase_analysis:
for source in scraper.config.get('sources', []):
if source['type'] == 'github':
source['enable_codebase_analysis'] = False
for source in scraper.config.get("sources", []):
if source["type"] == "github":
source["enable_codebase_analysis"] = False
logger.info(f"⏭️ Skipping codebase analysis for GitHub source: {source.get('repo', 'unknown')}")
# Run scraper
scraper.run()
if __name__ == '__main__':
if __name__ == "__main__":
main()

File diff suppressed because it is too large Load Diff

View File

@@ -17,27 +17,20 @@ Usage:
skill-seekers upload output/react-openai.zip --target openai
"""
import argparse
import os
import sys
import json
import argparse
from pathlib import Path
# Import utilities
try:
from utils import (
print_upload_instructions,
validate_zip_file
)
from utils import print_upload_instructions, validate_zip_file
except ImportError:
sys.path.insert(0, str(Path(__file__).parent))
from utils import (
print_upload_instructions,
validate_zip_file
)
from utils import print_upload_instructions
def upload_skill_api(package_path, target='claude', api_key=None):
def upload_skill_api(package_path, target="claude", api_key=None):
"""
Upload skill package to LLM platform
@@ -62,7 +55,7 @@ def upload_skill_api(package_path, target='claude', api_key=None):
# Get API key
if not api_key:
api_key = os.environ.get(adaptor.get_env_var_name(), '').strip()
api_key = os.environ.get(adaptor.get_env_var_name(), "").strip()
if not api_key:
return False, f"{adaptor.get_env_var_name()} not set. Export your API key first."
@@ -91,19 +84,19 @@ def upload_skill_api(package_path, target='claude', api_key=None):
try:
result = adaptor.upload(package_path, api_key)
if result['success']:
if result["success"]:
print()
print(f"{result['message']}")
print()
if result['url']:
if result["url"]:
print("Your skill is now available at:")
print(f" {result['url']}")
if result['skill_id']:
if result["skill_id"]:
print(f" Skill ID: {result['skill_id']}")
print()
return True, "Upload successful"
else:
return False, result['message']
return False, result["message"]
except Exception as e:
return False, f"Unexpected error: {str(e)}"
@@ -136,25 +129,19 @@ Examples:
# Upload with explicit API key
skill-seekers upload output/react.zip --api-key sk-ant-...
"""
""",
)
parser.add_argument(
'package_file',
help='Path to skill package file (e.g., output/react.zip)'
)
parser.add_argument("package_file", help="Path to skill package file (e.g., output/react.zip)")
parser.add_argument(
'--target',
choices=['claude', 'gemini', 'openai'],
default='claude',
help='Target LLM platform (default: claude)'
"--target",
choices=["claude", "gemini", "openai"],
default="claude",
help="Target LLM platform (default: claude)",
)
parser.add_argument(
'--api-key',
help='Platform API key (or set environment variable)'
)
parser.add_argument("--api-key", help="Platform API key (or set environment variable)")
args = parser.parse_args()

View File

@@ -3,21 +3,21 @@
Utility functions for Skill Seeker CLI tools
"""
import os
import sys
import subprocess
import platform
import time
import logging
import os
import platform
import subprocess
import time
from collections.abc import Callable
from pathlib import Path
from typing import Optional, Tuple, Dict, Union, TypeVar, Callable
from typing import TypeVar
logger = logging.getLogger(__name__)
T = TypeVar('T')
T = TypeVar("T")
def open_folder(folder_path: Union[str, Path]) -> bool:
def open_folder(folder_path: str | Path) -> bool:
"""
Open a folder in the system file browser
@@ -50,10 +50,10 @@ def open_folder(folder_path: Union[str, Path]) -> bool:
return True
except subprocess.CalledProcessError:
print(f"⚠️ Could not open folder automatically")
print("⚠️ Could not open folder automatically")
return False
except FileNotFoundError:
print(f"⚠️ File browser not found on system")
print("⚠️ File browser not found on system")
return False
@@ -64,18 +64,18 @@ def has_api_key() -> bool:
Returns:
bool: True if API key is set, False otherwise
"""
api_key = os.environ.get('ANTHROPIC_API_KEY', '').strip()
api_key = os.environ.get("ANTHROPIC_API_KEY", "").strip()
return len(api_key) > 0
def get_api_key() -> Optional[str]:
def get_api_key() -> str | None:
"""
Get ANTHROPIC_API_KEY from environment
Returns:
str: API key or None if not set
"""
api_key = os.environ.get('ANTHROPIC_API_KEY', '').strip()
api_key = os.environ.get("ANTHROPIC_API_KEY", "").strip()
return api_key if api_key else None
@@ -89,7 +89,7 @@ def get_upload_url() -> str:
return "https://claude.ai/skills"
def print_upload_instructions(zip_path: Union[str, Path]) -> None:
def print_upload_instructions(zip_path: str | Path) -> None:
"""
Print clear upload instructions for manual upload
@@ -106,7 +106,7 @@ def print_upload_instructions(zip_path: Union[str, Path]) -> None:
print(f"📤 Upload to Claude: {get_upload_url()}")
print()
print(f"1. Go to {get_upload_url()}")
print("2. Click \"Upload Skill\"")
print('2. Click "Upload Skill"')
print(f"3. Select: {zip_path}")
print("4. Done! ✅")
print()
@@ -130,7 +130,7 @@ def format_file_size(size_bytes: int) -> str:
return f"{size_bytes / (1024 * 1024):.1f} MB"
def validate_skill_directory(skill_dir: Union[str, Path]) -> Tuple[bool, Optional[str]]:
def validate_skill_directory(skill_dir: str | Path) -> tuple[bool, str | None]:
"""
Validate that a directory is a valid skill directory
@@ -155,7 +155,7 @@ def validate_skill_directory(skill_dir: Union[str, Path]) -> Tuple[bool, Optiona
return True, None
def validate_zip_file(zip_path: Union[str, Path]) -> Tuple[bool, Optional[str]]:
def validate_zip_file(zip_path: str | Path) -> tuple[bool, str | None]:
"""
Validate that a file is a valid skill .zip file
@@ -173,13 +173,13 @@ def validate_zip_file(zip_path: Union[str, Path]) -> Tuple[bool, Optional[str]]:
if not zip_path.is_file():
return False, f"Not a file: {zip_path}"
if not zip_path.suffix == '.zip':
if not zip_path.suffix == ".zip":
return False, f"Not a .zip file: {zip_path}"
return True, None
def read_reference_files(skill_dir: Union[str, Path], max_chars: int = 100000, preview_limit: int = 40000) -> Dict[str, Dict]:
def read_reference_files(skill_dir: str | Path, max_chars: int = 100000, preview_limit: int = 40000) -> dict[str, dict]:
"""Read reference files from a skill directory with enriched metadata.
This function reads markdown files from the references/ subdirectory
@@ -210,13 +210,13 @@ def read_reference_files(skill_dir: Union[str, Path], max_chars: int = 100000, p
skill_path = Path(skill_dir)
references_dir = skill_path / "references"
references: Dict[str, Dict] = {}
references: dict[str, dict] = {}
if not references_dir.exists():
print(f"⚠ No references directory found at {references_dir}")
return references
def _determine_source_metadata(relative_path: Path) -> Tuple[str, str, Optional[str]]:
def _determine_source_metadata(relative_path: Path) -> tuple[str, str, str | None]:
"""Determine source type, confidence level, and repo_id from path.
For multi-source support, extracts repo_id from paths like:
@@ -230,54 +230,54 @@ def read_reference_files(skill_dir: Union[str, Path], max_chars: int = 100000, p
repo_id = None # Default: no repo identity
# Documentation sources (official docs)
if path_str.startswith('documentation/'):
return 'documentation', 'high', None
if path_str.startswith("documentation/"):
return "documentation", "high", None
# GitHub sources
elif path_str.startswith('github/'):
elif path_str.startswith("github/"):
# README and releases are medium confidence
if 'README' in path_str or 'releases' in path_str:
return 'github', 'medium', None
if "README" in path_str or "releases" in path_str:
return "github", "medium", None
# Issues are low confidence (user reports)
elif 'issues' in path_str:
return 'github', 'low', None
elif "issues" in path_str:
return "github", "low", None
else:
return 'github', 'medium', None
return "github", "medium", None
# PDF sources (books, manuals)
elif path_str.startswith('pdf/'):
return 'pdf', 'high', None
elif path_str.startswith("pdf/"):
return "pdf", "high", None
# Merged API (synthesized from multiple sources)
elif path_str.startswith('api/'):
return 'api', 'high', None
elif path_str.startswith("api/"):
return "api", "high", None
# Codebase analysis (C3.x automated analysis)
elif path_str.startswith('codebase_analysis/'):
elif path_str.startswith("codebase_analysis/"):
# Extract repo_id from path: codebase_analysis/{repo_id}/...
parts = Path(path_str).parts
if len(parts) >= 2:
repo_id = parts[1] # e.g., 'encode_httpx', 'encode_httpcore'
# ARCHITECTURE.md is high confidence (comprehensive)
if 'ARCHITECTURE' in path_str:
return 'codebase_analysis', 'high', repo_id
if "ARCHITECTURE" in path_str:
return "codebase_analysis", "high", repo_id
# Patterns and examples are medium (heuristic-based)
elif 'patterns' in path_str or 'examples' in path_str:
return 'codebase_analysis', 'medium', repo_id
elif "patterns" in path_str or "examples" in path_str:
return "codebase_analysis", "medium", repo_id
# Configuration is high (direct extraction)
elif 'configuration' in path_str:
return 'codebase_analysis', 'high', repo_id
elif "configuration" in path_str:
return "codebase_analysis", "high", repo_id
else:
return 'codebase_analysis', 'medium', repo_id
return "codebase_analysis", "medium", repo_id
# Conflicts report (discrepancy detection)
elif 'conflicts' in path_str:
return 'conflicts', 'medium', None
elif "conflicts" in path_str:
return "conflicts", "medium", None
# Fallback
else:
return 'unknown', 'medium', None
return "unknown", "medium", None
total_chars = 0
# Search recursively for all .md files (including subdirectories like github/README.md)
@@ -285,7 +285,7 @@ def read_reference_files(skill_dir: Union[str, Path], max_chars: int = 100000, p
# Note: We now include index.md files as they contain important content
# (patterns, examples, configuration analysis)
content = ref_file.read_text(encoding='utf-8')
content = ref_file.read_text(encoding="utf-8")
# Limit size per file
truncated = False
@@ -299,13 +299,13 @@ def read_reference_files(skill_dir: Union[str, Path], max_chars: int = 100000, p
# Build enriched metadata (with repo_id for multi-source support)
references[str(relative_path)] = {
'content': content,
'source': source_type,
'confidence': confidence,
'path': str(relative_path),
'truncated': truncated,
'size': len(content),
'repo_id': repo_id # None for single-source, repo identifier for multi-source
"content": content,
"source": source_type,
"confidence": confidence,
"path": str(relative_path),
"truncated": truncated,
"size": len(content),
"repo_id": repo_id, # None for single-source, repo identifier for multi-source
}
total_chars += len(content)
@@ -319,10 +319,7 @@ def read_reference_files(skill_dir: Union[str, Path], max_chars: int = 100000, p
def retry_with_backoff(
operation: Callable[[], T],
max_attempts: int = 3,
base_delay: float = 1.0,
operation_name: str = "operation"
operation: Callable[[], T], max_attempts: int = 3, base_delay: float = 1.0, operation_name: str = "operation"
) -> T:
"""Retry an operation with exponential backoff.
@@ -348,7 +345,7 @@ def retry_with_backoff(
... return response.text
>>> content = retry_with_backoff(fetch_page, max_attempts=3, operation_name=f"fetch {url}")
"""
last_exception: Optional[Exception] = None
last_exception: Exception | None = None
for attempt in range(1, max_attempts + 1):
try:
@@ -358,15 +355,11 @@ def retry_with_backoff(
if attempt < max_attempts:
delay = base_delay * (2 ** (attempt - 1))
logger.warning(
"%s failed (attempt %d/%d), retrying in %.1fs: %s",
operation_name, attempt, max_attempts, delay, e
"%s failed (attempt %d/%d), retrying in %.1fs: %s", operation_name, attempt, max_attempts, delay, e
)
time.sleep(delay)
else:
logger.error(
"%s failed after %d attempts: %s",
operation_name, max_attempts, e
)
logger.error("%s failed after %d attempts: %s", operation_name, max_attempts, e)
# This should always have a value, but mypy doesn't know that
if last_exception is not None:
@@ -375,10 +368,7 @@ def retry_with_backoff(
async def retry_with_backoff_async(
operation: Callable[[], T],
max_attempts: int = 3,
base_delay: float = 1.0,
operation_name: str = "operation"
operation: Callable[[], T], max_attempts: int = 3, base_delay: float = 1.0, operation_name: str = "operation"
) -> T:
"""Async version of retry_with_backoff for async operations.
@@ -403,7 +393,7 @@ async def retry_with_backoff_async(
"""
import asyncio
last_exception: Optional[Exception] = None
last_exception: Exception | None = None
for attempt in range(1, max_attempts + 1):
try:
@@ -413,15 +403,11 @@ async def retry_with_backoff_async(
if attempt < max_attempts:
delay = base_delay * (2 ** (attempt - 1))
logger.warning(
"%s failed (attempt %d/%d), retrying in %.1fs: %s",
operation_name, attempt, max_attempts, delay, e
"%s failed (attempt %d/%d), retrying in %.1fs: %s", operation_name, attempt, max_attempts, delay, e
)
await asyncio.sleep(delay)
else:
logger.error(
"%s failed after %d attempts: %s",
operation_name, max_attempts, e
)
logger.error("%s failed after %d attempts: %s", operation_name, max_attempts, e)
if last_exception is not None:
raise last_exception