run ruff
This commit is contained in:
@@ -33,17 +33,17 @@ except ImportError:
|
||||
|
||||
|
||||
# Registry of available adaptors
|
||||
ADAPTORS: Dict[str, Type[SkillAdaptor]] = {}
|
||||
ADAPTORS: dict[str, type[SkillAdaptor]] = {}
|
||||
|
||||
# Register adaptors that are implemented
|
||||
if ClaudeAdaptor:
|
||||
ADAPTORS['claude'] = ClaudeAdaptor
|
||||
ADAPTORS["claude"] = ClaudeAdaptor
|
||||
if GeminiAdaptor:
|
||||
ADAPTORS['gemini'] = GeminiAdaptor
|
||||
ADAPTORS["gemini"] = GeminiAdaptor
|
||||
if OpenAIAdaptor:
|
||||
ADAPTORS['openai'] = OpenAIAdaptor
|
||||
ADAPTORS["openai"] = OpenAIAdaptor
|
||||
if MarkdownAdaptor:
|
||||
ADAPTORS['markdown'] = MarkdownAdaptor
|
||||
ADAPTORS["markdown"] = MarkdownAdaptor
|
||||
|
||||
|
||||
def get_adaptor(platform: str, config: dict = None) -> SkillAdaptor:
|
||||
@@ -65,15 +65,11 @@ def get_adaptor(platform: str, config: dict = None) -> SkillAdaptor:
|
||||
>>> adaptor = get_adaptor('gemini', {'api_version': 'v1beta'})
|
||||
"""
|
||||
if platform not in ADAPTORS:
|
||||
available = ', '.join(ADAPTORS.keys())
|
||||
available = ", ".join(ADAPTORS.keys())
|
||||
if not ADAPTORS:
|
||||
raise ValueError(
|
||||
f"No adaptors are currently implemented. "
|
||||
f"Platform '{platform}' is not available."
|
||||
)
|
||||
raise ValueError(f"No adaptors are currently implemented. Platform '{platform}' is not available.")
|
||||
raise ValueError(
|
||||
f"Platform '{platform}' is not supported or not yet implemented. "
|
||||
f"Available platforms: {available}"
|
||||
f"Platform '{platform}' is not supported or not yet implemented. Available platforms: {available}"
|
||||
)
|
||||
|
||||
adaptor_class = ADAPTORS[platform]
|
||||
@@ -115,10 +111,10 @@ def is_platform_available(platform: str) -> bool:
|
||||
|
||||
# Export public interface
|
||||
__all__ = [
|
||||
'SkillAdaptor',
|
||||
'SkillMetadata',
|
||||
'get_adaptor',
|
||||
'list_platforms',
|
||||
'is_platform_available',
|
||||
'ADAPTORS',
|
||||
"SkillAdaptor",
|
||||
"SkillMetadata",
|
||||
"get_adaptor",
|
||||
"list_platforms",
|
||||
"is_platform_available",
|
||||
"ADAPTORS",
|
||||
]
|
||||
|
||||
@@ -7,18 +7,19 @@ This enables Skill Seekers to generate skills for multiple LLM platforms (Claude
|
||||
"""
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from pathlib import Path
|
||||
from typing import Dict, Any, Optional
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
|
||||
@dataclass
|
||||
class SkillMetadata:
|
||||
"""Universal skill metadata used across all platforms"""
|
||||
|
||||
name: str
|
||||
description: str
|
||||
version: str = "1.0.0"
|
||||
author: Optional[str] = None
|
||||
author: str | None = None
|
||||
tags: list[str] = field(default_factory=list)
|
||||
|
||||
|
||||
@@ -34,11 +35,11 @@ class SkillAdaptor(ABC):
|
||||
"""
|
||||
|
||||
# Platform identifiers (override in subclasses)
|
||||
PLATFORM: str = "unknown" # e.g., "claude", "gemini", "openai"
|
||||
PLATFORM_NAME: str = "Unknown" # e.g., "Claude AI (Anthropic)"
|
||||
DEFAULT_API_ENDPOINT: Optional[str] = None
|
||||
PLATFORM: str = "unknown" # e.g., "claude", "gemini", "openai"
|
||||
PLATFORM_NAME: str = "Unknown" # e.g., "Claude AI (Anthropic)"
|
||||
DEFAULT_API_ENDPOINT: str | None = None
|
||||
|
||||
def __init__(self, config: Optional[Dict[str, Any]] = None):
|
||||
def __init__(self, config: dict[str, Any] | None = None):
|
||||
"""
|
||||
Initialize adaptor with optional configuration.
|
||||
|
||||
@@ -86,7 +87,7 @@ class SkillAdaptor(ABC):
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def upload(self, package_path: Path, api_key: str, **kwargs) -> Dict[str, Any]:
|
||||
def upload(self, package_path: Path, api_key: str, **kwargs) -> dict[str, Any]:
|
||||
"""
|
||||
Upload packaged skill to platform.
|
||||
|
||||
@@ -168,11 +169,11 @@ class SkillAdaptor(ABC):
|
||||
if not skill_md_path.exists():
|
||||
return ""
|
||||
|
||||
content = skill_md_path.read_text(encoding='utf-8')
|
||||
content = skill_md_path.read_text(encoding="utf-8")
|
||||
|
||||
# Strip YAML frontmatter if present
|
||||
if content.startswith('---'):
|
||||
parts = content.split('---', 2)
|
||||
if content.startswith("---"):
|
||||
parts = content.split("---", 2)
|
||||
if len(parts) >= 3:
|
||||
return parts[2].strip()
|
||||
|
||||
@@ -193,7 +194,7 @@ class SkillAdaptor(ABC):
|
||||
return "See references/ directory for documentation."
|
||||
|
||||
# Read index and extract relevant sections
|
||||
content = index_path.read_text(encoding='utf-8')
|
||||
content = index_path.read_text(encoding="utf-8")
|
||||
return content[:500] + "..." if len(content) > 500 else content
|
||||
|
||||
def _generate_toc(self, skill_dir: Path) -> str:
|
||||
@@ -214,7 +215,7 @@ class SkillAdaptor(ABC):
|
||||
for ref_file in sorted(refs_dir.glob("*.md")):
|
||||
if ref_file.name == "index.md":
|
||||
continue
|
||||
title = ref_file.stem.replace('_', ' ').title()
|
||||
title = ref_file.stem.replace("_", " ").title()
|
||||
toc_lines.append(f"- [{title}](references/{ref_file.name})")
|
||||
|
||||
return "\n".join(toc_lines)
|
||||
|
||||
@@ -6,10 +6,9 @@ Implements platform-specific handling for Claude AI (Anthropic) skills.
|
||||
Refactored from upload_skill.py and enhance_skill.py.
|
||||
"""
|
||||
|
||||
import os
|
||||
import zipfile
|
||||
from pathlib import Path
|
||||
from typing import Dict, Any
|
||||
from typing import Any
|
||||
|
||||
from .base import SkillAdaptor, SkillMetadata
|
||||
|
||||
@@ -101,16 +100,16 @@ version: {metadata.version}
|
||||
skill_dir = Path(skill_dir)
|
||||
|
||||
# Determine output filename
|
||||
if output_path.is_dir() or str(output_path).endswith('/'):
|
||||
if output_path.is_dir() or str(output_path).endswith("/"):
|
||||
output_path = Path(output_path) / f"{skill_dir.name}.zip"
|
||||
elif not str(output_path).endswith('.zip'):
|
||||
output_path = Path(str(output_path) + '.zip')
|
||||
elif not str(output_path).endswith(".zip"):
|
||||
output_path = Path(str(output_path) + ".zip")
|
||||
|
||||
output_path = Path(output_path)
|
||||
output_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Create ZIP file
|
||||
with zipfile.ZipFile(output_path, 'w', zipfile.ZIP_DEFLATED) as zf:
|
||||
with zipfile.ZipFile(output_path, "w", zipfile.ZIP_DEFLATED) as zf:
|
||||
# Add SKILL.md (required)
|
||||
skill_md = skill_dir / "SKILL.md"
|
||||
if skill_md.exists():
|
||||
@@ -120,7 +119,7 @@ version: {metadata.version}
|
||||
refs_dir = skill_dir / "references"
|
||||
if refs_dir.exists():
|
||||
for ref_file in refs_dir.rglob("*"):
|
||||
if ref_file.is_file() and not ref_file.name.startswith('.'):
|
||||
if ref_file.is_file() and not ref_file.name.startswith("."):
|
||||
arcname = ref_file.relative_to(skill_dir)
|
||||
zf.write(ref_file, str(arcname))
|
||||
|
||||
@@ -128,7 +127,7 @@ version: {metadata.version}
|
||||
scripts_dir = skill_dir / "scripts"
|
||||
if scripts_dir.exists():
|
||||
for script_file in scripts_dir.rglob("*"):
|
||||
if script_file.is_file() and not script_file.name.startswith('.'):
|
||||
if script_file.is_file() and not script_file.name.startswith("."):
|
||||
arcname = script_file.relative_to(skill_dir)
|
||||
zf.write(script_file, str(arcname))
|
||||
|
||||
@@ -136,13 +135,13 @@ version: {metadata.version}
|
||||
assets_dir = skill_dir / "assets"
|
||||
if assets_dir.exists():
|
||||
for asset_file in assets_dir.rglob("*"):
|
||||
if asset_file.is_file() and not asset_file.name.startswith('.'):
|
||||
if asset_file.is_file() and not asset_file.name.startswith("."):
|
||||
arcname = asset_file.relative_to(skill_dir)
|
||||
zf.write(asset_file, str(arcname))
|
||||
|
||||
return output_path
|
||||
|
||||
def upload(self, package_path: Path, api_key: str, **kwargs) -> Dict[str, Any]:
|
||||
def upload(self, package_path: Path, api_key: str, **kwargs) -> dict[str, Any]:
|
||||
"""
|
||||
Upload skill ZIP to Anthropic Skills API.
|
||||
|
||||
@@ -159,130 +158,99 @@ version: {metadata.version}
|
||||
import requests
|
||||
except ImportError:
|
||||
return {
|
||||
'success': False,
|
||||
'skill_id': None,
|
||||
'url': None,
|
||||
'message': 'requests library not installed. Run: pip install requests'
|
||||
"success": False,
|
||||
"skill_id": None,
|
||||
"url": None,
|
||||
"message": "requests library not installed. Run: pip install requests",
|
||||
}
|
||||
|
||||
# Validate ZIP file
|
||||
package_path = Path(package_path)
|
||||
if not package_path.exists():
|
||||
return {
|
||||
'success': False,
|
||||
'skill_id': None,
|
||||
'url': None,
|
||||
'message': f'File not found: {package_path}'
|
||||
}
|
||||
return {"success": False, "skill_id": None, "url": None, "message": f"File not found: {package_path}"}
|
||||
|
||||
if not package_path.suffix == '.zip':
|
||||
return {
|
||||
'success': False,
|
||||
'skill_id': None,
|
||||
'url': None,
|
||||
'message': f'Not a ZIP file: {package_path}'
|
||||
}
|
||||
if not package_path.suffix == ".zip":
|
||||
return {"success": False, "skill_id": None, "url": None, "message": f"Not a ZIP file: {package_path}"}
|
||||
|
||||
# Prepare API request
|
||||
api_url = self.DEFAULT_API_ENDPOINT
|
||||
headers = {
|
||||
"x-api-key": api_key,
|
||||
"anthropic-version": "2023-06-01",
|
||||
"anthropic-beta": "skills-2025-10-02"
|
||||
}
|
||||
headers = {"x-api-key": api_key, "anthropic-version": "2023-06-01", "anthropic-beta": "skills-2025-10-02"}
|
||||
|
||||
timeout = kwargs.get('timeout', 60)
|
||||
timeout = kwargs.get("timeout", 60)
|
||||
|
||||
try:
|
||||
# Read ZIP file
|
||||
with open(package_path, 'rb') as f:
|
||||
with open(package_path, "rb") as f:
|
||||
zip_data = f.read()
|
||||
|
||||
# Upload skill
|
||||
files = {
|
||||
'files[]': (package_path.name, zip_data, 'application/zip')
|
||||
}
|
||||
files = {"files[]": (package_path.name, zip_data, "application/zip")}
|
||||
|
||||
response = requests.post(
|
||||
api_url,
|
||||
headers=headers,
|
||||
files=files,
|
||||
timeout=timeout
|
||||
)
|
||||
response = requests.post(api_url, headers=headers, files=files, timeout=timeout)
|
||||
|
||||
# Check response
|
||||
if response.status_code == 200:
|
||||
# Extract skill ID if available
|
||||
try:
|
||||
response_data = response.json()
|
||||
skill_id = response_data.get('id')
|
||||
skill_id = response_data.get("id")
|
||||
except:
|
||||
skill_id = None
|
||||
|
||||
return {
|
||||
'success': True,
|
||||
'skill_id': skill_id,
|
||||
'url': 'https://claude.ai/skills',
|
||||
'message': 'Skill uploaded successfully to Claude AI'
|
||||
"success": True,
|
||||
"skill_id": skill_id,
|
||||
"url": "https://claude.ai/skills",
|
||||
"message": "Skill uploaded successfully to Claude AI",
|
||||
}
|
||||
|
||||
elif response.status_code == 401:
|
||||
return {
|
||||
'success': False,
|
||||
'skill_id': None,
|
||||
'url': None,
|
||||
'message': 'Authentication failed. Check your ANTHROPIC_API_KEY'
|
||||
"success": False,
|
||||
"skill_id": None,
|
||||
"url": None,
|
||||
"message": "Authentication failed. Check your ANTHROPIC_API_KEY",
|
||||
}
|
||||
|
||||
elif response.status_code == 400:
|
||||
try:
|
||||
error_msg = response.json().get('error', {}).get('message', 'Unknown error')
|
||||
error_msg = response.json().get("error", {}).get("message", "Unknown error")
|
||||
except:
|
||||
error_msg = 'Invalid skill format'
|
||||
error_msg = "Invalid skill format"
|
||||
|
||||
return {
|
||||
'success': False,
|
||||
'skill_id': None,
|
||||
'url': None,
|
||||
'message': f'Invalid skill format: {error_msg}'
|
||||
"success": False,
|
||||
"skill_id": None,
|
||||
"url": None,
|
||||
"message": f"Invalid skill format: {error_msg}",
|
||||
}
|
||||
|
||||
else:
|
||||
try:
|
||||
error_msg = response.json().get('error', {}).get('message', 'Unknown error')
|
||||
error_msg = response.json().get("error", {}).get("message", "Unknown error")
|
||||
except:
|
||||
error_msg = f'HTTP {response.status_code}'
|
||||
error_msg = f"HTTP {response.status_code}"
|
||||
|
||||
return {
|
||||
'success': False,
|
||||
'skill_id': None,
|
||||
'url': None,
|
||||
'message': f'Upload failed: {error_msg}'
|
||||
}
|
||||
return {"success": False, "skill_id": None, "url": None, "message": f"Upload failed: {error_msg}"}
|
||||
|
||||
except requests.exceptions.Timeout:
|
||||
return {
|
||||
'success': False,
|
||||
'skill_id': None,
|
||||
'url': None,
|
||||
'message': 'Upload timed out. Try again or use manual upload'
|
||||
"success": False,
|
||||
"skill_id": None,
|
||||
"url": None,
|
||||
"message": "Upload timed out. Try again or use manual upload",
|
||||
}
|
||||
|
||||
except requests.exceptions.ConnectionError:
|
||||
return {
|
||||
'success': False,
|
||||
'skill_id': None,
|
||||
'url': None,
|
||||
'message': 'Connection error. Check your internet connection'
|
||||
"success": False,
|
||||
"skill_id": None,
|
||||
"url": None,
|
||||
"message": "Connection error. Check your internet connection",
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
return {
|
||||
'success': False,
|
||||
'skill_id': None,
|
||||
'url': None,
|
||||
'message': f'Unexpected error: {str(e)}'
|
||||
}
|
||||
return {"success": False, "skill_id": None, "url": None, "message": f"Unexpected error: {str(e)}"}
|
||||
|
||||
def validate_api_key(self, api_key: str) -> bool:
|
||||
"""
|
||||
@@ -294,7 +262,7 @@ version: {metadata.version}
|
||||
Returns:
|
||||
True if key starts with 'sk-ant-'
|
||||
"""
|
||||
return api_key.strip().startswith('sk-ant-')
|
||||
return api_key.strip().startswith("sk-ant-")
|
||||
|
||||
def get_env_var_name(self) -> str:
|
||||
"""
|
||||
@@ -355,17 +323,13 @@ version: {metadata.version}
|
||||
# Read current SKILL.md
|
||||
current_skill_md = None
|
||||
if skill_md_path.exists():
|
||||
current_skill_md = skill_md_path.read_text(encoding='utf-8')
|
||||
current_skill_md = skill_md_path.read_text(encoding="utf-8")
|
||||
print(f" ℹ Found existing SKILL.md ({len(current_skill_md)} chars)")
|
||||
else:
|
||||
print(f" ℹ No existing SKILL.md, will create new one")
|
||||
print(" ℹ No existing SKILL.md, will create new one")
|
||||
|
||||
# Build enhancement prompt
|
||||
prompt = self._build_enhancement_prompt(
|
||||
skill_dir.name,
|
||||
references,
|
||||
current_skill_md
|
||||
)
|
||||
prompt = self._build_enhancement_prompt(skill_dir.name, references, current_skill_md)
|
||||
|
||||
print("\n🤖 Asking Claude to enhance SKILL.md...")
|
||||
print(f" Input: {len(prompt):,} characters")
|
||||
@@ -377,10 +341,7 @@ version: {metadata.version}
|
||||
model="claude-sonnet-4-20250514",
|
||||
max_tokens=4096,
|
||||
temperature=0.3,
|
||||
messages=[{
|
||||
"role": "user",
|
||||
"content": prompt
|
||||
}]
|
||||
messages=[{"role": "user", "content": prompt}],
|
||||
)
|
||||
|
||||
enhanced_content = message.content[0].text
|
||||
@@ -388,13 +349,13 @@ version: {metadata.version}
|
||||
|
||||
# Backup original
|
||||
if skill_md_path.exists():
|
||||
backup_path = skill_md_path.with_suffix('.md.backup')
|
||||
backup_path = skill_md_path.with_suffix(".md.backup")
|
||||
skill_md_path.rename(backup_path)
|
||||
print(f" 💾 Backed up original to: {backup_path.name}")
|
||||
|
||||
# Save enhanced version
|
||||
skill_md_path.write_text(enhanced_content, encoding='utf-8')
|
||||
print(f" ✅ Saved enhanced SKILL.md")
|
||||
skill_md_path.write_text(enhanced_content, encoding="utf-8")
|
||||
print(" ✅ Saved enhanced SKILL.md")
|
||||
|
||||
return True
|
||||
|
||||
@@ -402,7 +363,7 @@ version: {metadata.version}
|
||||
print(f"❌ Error calling Claude API: {e}")
|
||||
return False
|
||||
|
||||
def _read_reference_files(self, references_dir: Path, max_chars: int = 200000) -> Dict[str, str]:
|
||||
def _read_reference_files(self, references_dir: Path, max_chars: int = 200000) -> dict[str, str]:
|
||||
"""
|
||||
Read reference markdown files from skill directory.
|
||||
|
||||
@@ -425,7 +386,7 @@ version: {metadata.version}
|
||||
break
|
||||
|
||||
try:
|
||||
content = ref_file.read_text(encoding='utf-8')
|
||||
content = ref_file.read_text(encoding="utf-8")
|
||||
# Limit individual file size
|
||||
if len(content) > 30000:
|
||||
content = content[:30000] + "\n\n...(truncated)"
|
||||
@@ -439,10 +400,7 @@ version: {metadata.version}
|
||||
return references
|
||||
|
||||
def _build_enhancement_prompt(
|
||||
self,
|
||||
skill_name: str,
|
||||
references: Dict[str, str],
|
||||
current_skill_md: str = None
|
||||
self, skill_name: str, references: dict[str, str], current_skill_md: str = None
|
||||
) -> str:
|
||||
"""
|
||||
Build Claude API prompt for enhancement.
|
||||
@@ -460,9 +418,9 @@ version: {metadata.version}
|
||||
I've scraped documentation and organized it into reference files. Your job is to create an EXCELLENT SKILL.md that will help Claude use this documentation effectively.
|
||||
|
||||
CURRENT SKILL.MD:
|
||||
{'```markdown' if current_skill_md else '(none - create from scratch)'}
|
||||
{current_skill_md or 'No existing SKILL.md'}
|
||||
{'```' if current_skill_md else ''}
|
||||
{"```markdown" if current_skill_md else "(none - create from scratch)"}
|
||||
{current_skill_md or "No existing SKILL.md"}
|
||||
{"```" if current_skill_md else ""}
|
||||
|
||||
REFERENCE DOCUMENTATION:
|
||||
"""
|
||||
|
||||
@@ -6,11 +6,11 @@ Implements platform-specific handling for Google Gemini skills.
|
||||
Uses Gemini Files API for grounding and Gemini 2.0 Flash for enhancement.
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import tarfile
|
||||
import json
|
||||
from pathlib import Path
|
||||
from typing import Dict, Any
|
||||
from typing import Any
|
||||
|
||||
from .base import SkillAdaptor, SkillMetadata
|
||||
|
||||
@@ -105,20 +105,20 @@ See the references directory for complete documentation with examples and best p
|
||||
skill_dir = Path(skill_dir)
|
||||
|
||||
# Determine output filename
|
||||
if output_path.is_dir() or str(output_path).endswith('/'):
|
||||
if output_path.is_dir() or str(output_path).endswith("/"):
|
||||
output_path = Path(output_path) / f"{skill_dir.name}-gemini.tar.gz"
|
||||
elif not str(output_path).endswith('.tar.gz'):
|
||||
elif not str(output_path).endswith(".tar.gz"):
|
||||
# Replace .zip with .tar.gz if needed
|
||||
output_str = str(output_path).replace('.zip', '.tar.gz')
|
||||
if not output_str.endswith('.tar.gz'):
|
||||
output_str += '.tar.gz'
|
||||
output_str = str(output_path).replace(".zip", ".tar.gz")
|
||||
if not output_str.endswith(".tar.gz"):
|
||||
output_str += ".tar.gz"
|
||||
output_path = Path(output_str)
|
||||
|
||||
output_path = Path(output_path)
|
||||
output_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Create tar.gz file
|
||||
with tarfile.open(output_path, 'w:gz') as tar:
|
||||
with tarfile.open(output_path, "w:gz") as tar:
|
||||
# Add SKILL.md as system_instructions.md
|
||||
skill_md = skill_dir / "SKILL.md"
|
||||
if skill_md.exists():
|
||||
@@ -128,21 +128,22 @@ See the references directory for complete documentation with examples and best p
|
||||
refs_dir = skill_dir / "references"
|
||||
if refs_dir.exists():
|
||||
for ref_file in refs_dir.rglob("*"):
|
||||
if ref_file.is_file() and not ref_file.name.startswith('.'):
|
||||
if ref_file.is_file() and not ref_file.name.startswith("."):
|
||||
arcname = ref_file.relative_to(skill_dir)
|
||||
tar.add(ref_file, arcname=str(arcname))
|
||||
|
||||
# Create and add metadata file
|
||||
metadata = {
|
||||
'platform': 'gemini',
|
||||
'name': skill_dir.name,
|
||||
'version': '1.0.0',
|
||||
'created_with': 'skill-seekers'
|
||||
"platform": "gemini",
|
||||
"name": skill_dir.name,
|
||||
"version": "1.0.0",
|
||||
"created_with": "skill-seekers",
|
||||
}
|
||||
|
||||
# Write metadata to temp file and add to archive
|
||||
import tempfile
|
||||
with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as tmp:
|
||||
|
||||
with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as tmp:
|
||||
json.dump(metadata, tmp, indent=2)
|
||||
tmp_path = tmp.name
|
||||
|
||||
@@ -153,7 +154,7 @@ See the references directory for complete documentation with examples and best p
|
||||
|
||||
return output_path
|
||||
|
||||
def upload(self, package_path: Path, api_key: str, **kwargs) -> Dict[str, Any]:
|
||||
def upload(self, package_path: Path, api_key: str, **kwargs) -> dict[str, Any]:
|
||||
"""
|
||||
Upload skill tar.gz to Gemini Files API.
|
||||
|
||||
@@ -168,30 +169,20 @@ See the references directory for complete documentation with examples and best p
|
||||
# Validate package file FIRST
|
||||
package_path = Path(package_path)
|
||||
if not package_path.exists():
|
||||
return {
|
||||
'success': False,
|
||||
'skill_id': None,
|
||||
'url': None,
|
||||
'message': f'File not found: {package_path}'
|
||||
}
|
||||
return {"success": False, "skill_id": None, "url": None, "message": f"File not found: {package_path}"}
|
||||
|
||||
if not package_path.suffix == '.gz':
|
||||
return {
|
||||
'success': False,
|
||||
'skill_id': None,
|
||||
'url': None,
|
||||
'message': f'Not a tar.gz file: {package_path}'
|
||||
}
|
||||
if not package_path.suffix == ".gz":
|
||||
return {"success": False, "skill_id": None, "url": None, "message": f"Not a tar.gz file: {package_path}"}
|
||||
|
||||
# Check for google-generativeai library
|
||||
try:
|
||||
import google.generativeai as genai
|
||||
except ImportError:
|
||||
return {
|
||||
'success': False,
|
||||
'skill_id': None,
|
||||
'url': None,
|
||||
'message': 'google-generativeai library not installed. Run: pip install google-generativeai'
|
||||
"success": False,
|
||||
"skill_id": None,
|
||||
"url": None,
|
||||
"message": "google-generativeai library not installed. Run: pip install google-generativeai",
|
||||
}
|
||||
|
||||
# Configure Gemini
|
||||
@@ -200,11 +191,10 @@ See the references directory for complete documentation with examples and best p
|
||||
|
||||
# Extract tar.gz to temp directory
|
||||
import tempfile
|
||||
import shutil
|
||||
|
||||
with tempfile.TemporaryDirectory() as temp_dir:
|
||||
# Extract archive
|
||||
with tarfile.open(package_path, 'r:gz') as tar:
|
||||
with tarfile.open(package_path, "r:gz") as tar:
|
||||
tar.extractall(temp_dir)
|
||||
|
||||
temp_path = Path(temp_dir)
|
||||
@@ -213,17 +203,14 @@ See the references directory for complete documentation with examples and best p
|
||||
main_file = temp_path / "system_instructions.md"
|
||||
if not main_file.exists():
|
||||
return {
|
||||
'success': False,
|
||||
'skill_id': None,
|
||||
'url': None,
|
||||
'message': 'Invalid package: system_instructions.md not found'
|
||||
"success": False,
|
||||
"skill_id": None,
|
||||
"url": None,
|
||||
"message": "Invalid package: system_instructions.md not found",
|
||||
}
|
||||
|
||||
# Upload to Files API
|
||||
uploaded_file = genai.upload_file(
|
||||
path=str(main_file),
|
||||
display_name=f"{package_path.stem}_instructions"
|
||||
)
|
||||
uploaded_file = genai.upload_file(path=str(main_file), display_name=f"{package_path.stem}_instructions")
|
||||
|
||||
# Upload reference files (if any)
|
||||
refs_dir = temp_path / "references"
|
||||
@@ -231,25 +218,19 @@ See the references directory for complete documentation with examples and best p
|
||||
if refs_dir.exists():
|
||||
for ref_file in refs_dir.glob("*.md"):
|
||||
ref_uploaded = genai.upload_file(
|
||||
path=str(ref_file),
|
||||
display_name=f"{package_path.stem}_{ref_file.stem}"
|
||||
path=str(ref_file), display_name=f"{package_path.stem}_{ref_file.stem}"
|
||||
)
|
||||
uploaded_refs.append(ref_uploaded.name)
|
||||
|
||||
return {
|
||||
'success': True,
|
||||
'skill_id': uploaded_file.name,
|
||||
'url': f"https://aistudio.google.com/app/files/{uploaded_file.name}",
|
||||
'message': f'Skill uploaded to Google AI Studio ({len(uploaded_refs) + 1} files)'
|
||||
"success": True,
|
||||
"skill_id": uploaded_file.name,
|
||||
"url": f"https://aistudio.google.com/app/files/{uploaded_file.name}",
|
||||
"message": f"Skill uploaded to Google AI Studio ({len(uploaded_refs) + 1} files)",
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
return {
|
||||
'success': False,
|
||||
'skill_id': None,
|
||||
'url': None,
|
||||
'message': f'Upload failed: {str(e)}'
|
||||
}
|
||||
return {"success": False, "skill_id": None, "url": None, "message": f"Upload failed: {str(e)}"}
|
||||
|
||||
def validate_api_key(self, api_key: str) -> bool:
|
||||
"""
|
||||
@@ -261,7 +242,7 @@ See the references directory for complete documentation with examples and best p
|
||||
Returns:
|
||||
True if key starts with 'AIza'
|
||||
"""
|
||||
return api_key.strip().startswith('AIza')
|
||||
return api_key.strip().startswith("AIza")
|
||||
|
||||
def get_env_var_name(self) -> str:
|
||||
"""
|
||||
@@ -319,17 +300,13 @@ See the references directory for complete documentation with examples and best p
|
||||
# Read current SKILL.md
|
||||
current_skill_md = None
|
||||
if skill_md_path.exists():
|
||||
current_skill_md = skill_md_path.read_text(encoding='utf-8')
|
||||
current_skill_md = skill_md_path.read_text(encoding="utf-8")
|
||||
print(f" ℹ Found existing SKILL.md ({len(current_skill_md)} chars)")
|
||||
else:
|
||||
print(f" ℹ No existing SKILL.md, will create new one")
|
||||
print(" ℹ No existing SKILL.md, will create new one")
|
||||
|
||||
# Build enhancement prompt
|
||||
prompt = self._build_enhancement_prompt(
|
||||
skill_dir.name,
|
||||
references,
|
||||
current_skill_md
|
||||
)
|
||||
prompt = self._build_enhancement_prompt(skill_dir.name, references, current_skill_md)
|
||||
|
||||
print("\n🤖 Asking Gemini to enhance SKILL.md...")
|
||||
print(f" Input: {len(prompt):,} characters")
|
||||
@@ -337,7 +314,7 @@ See the references directory for complete documentation with examples and best p
|
||||
try:
|
||||
genai.configure(api_key=api_key)
|
||||
|
||||
model = genai.GenerativeModel('gemini-2.0-flash-exp')
|
||||
model = genai.GenerativeModel("gemini-2.0-flash-exp")
|
||||
|
||||
response = model.generate_content(prompt)
|
||||
|
||||
@@ -346,13 +323,13 @@ See the references directory for complete documentation with examples and best p
|
||||
|
||||
# Backup original
|
||||
if skill_md_path.exists():
|
||||
backup_path = skill_md_path.with_suffix('.md.backup')
|
||||
backup_path = skill_md_path.with_suffix(".md.backup")
|
||||
skill_md_path.rename(backup_path)
|
||||
print(f" 💾 Backed up original to: {backup_path.name}")
|
||||
|
||||
# Save enhanced version
|
||||
skill_md_path.write_text(enhanced_content, encoding='utf-8')
|
||||
print(f" ✅ Saved enhanced SKILL.md")
|
||||
skill_md_path.write_text(enhanced_content, encoding="utf-8")
|
||||
print(" ✅ Saved enhanced SKILL.md")
|
||||
|
||||
return True
|
||||
|
||||
@@ -360,7 +337,7 @@ See the references directory for complete documentation with examples and best p
|
||||
print(f"❌ Error calling Gemini API: {e}")
|
||||
return False
|
||||
|
||||
def _read_reference_files(self, references_dir: Path, max_chars: int = 200000) -> Dict[str, str]:
|
||||
def _read_reference_files(self, references_dir: Path, max_chars: int = 200000) -> dict[str, str]:
|
||||
"""
|
||||
Read reference markdown files from skill directory.
|
||||
|
||||
@@ -383,7 +360,7 @@ See the references directory for complete documentation with examples and best p
|
||||
break
|
||||
|
||||
try:
|
||||
content = ref_file.read_text(encoding='utf-8')
|
||||
content = ref_file.read_text(encoding="utf-8")
|
||||
# Limit individual file size
|
||||
if len(content) > 30000:
|
||||
content = content[:30000] + "\n\n...(truncated)"
|
||||
@@ -397,10 +374,7 @@ See the references directory for complete documentation with examples and best p
|
||||
return references
|
||||
|
||||
def _build_enhancement_prompt(
|
||||
self,
|
||||
skill_name: str,
|
||||
references: Dict[str, str],
|
||||
current_skill_md: str = None
|
||||
self, skill_name: str, references: dict[str, str], current_skill_md: str = None
|
||||
) -> str:
|
||||
"""
|
||||
Build Gemini API prompt for enhancement.
|
||||
@@ -418,9 +392,9 @@ See the references directory for complete documentation with examples and best p
|
||||
I've scraped documentation and organized it into reference files. Your job is to create an EXCELLENT markdown documentation file that will help Gemini use this documentation effectively.
|
||||
|
||||
CURRENT DOCUMENTATION:
|
||||
{'```markdown' if current_skill_md else '(none - create from scratch)'}
|
||||
{current_skill_md or 'No existing documentation'}
|
||||
{'```' if current_skill_md else ''}
|
||||
{"```markdown" if current_skill_md else "(none - create from scratch)"}
|
||||
{current_skill_md or "No existing documentation"}
|
||||
{"```" if current_skill_md else ""}
|
||||
|
||||
REFERENCE DOCUMENTATION:
|
||||
"""
|
||||
|
||||
@@ -8,7 +8,7 @@ No platform-specific features, just clean markdown documentation.
|
||||
|
||||
import zipfile
|
||||
from pathlib import Path
|
||||
from typing import Dict, Any
|
||||
from typing import Any
|
||||
|
||||
from .base import SkillAdaptor, SkillMetadata
|
||||
|
||||
@@ -100,33 +100,33 @@ Browse the reference files for detailed information on each topic. All files are
|
||||
skill_dir = Path(skill_dir)
|
||||
|
||||
# Determine output filename
|
||||
if output_path.is_dir() or str(output_path).endswith('/'):
|
||||
if output_path.is_dir() or str(output_path).endswith("/"):
|
||||
output_path = Path(output_path) / f"{skill_dir.name}-markdown.zip"
|
||||
elif not str(output_path).endswith('.zip'):
|
||||
elif not str(output_path).endswith(".zip"):
|
||||
# Replace extension if needed
|
||||
output_str = str(output_path).replace('.tar.gz', '.zip')
|
||||
if not output_str.endswith('-markdown.zip'):
|
||||
output_str = output_str.replace('.zip', '-markdown.zip')
|
||||
if not output_str.endswith('.zip'):
|
||||
output_str += '.zip'
|
||||
output_str = str(output_path).replace(".tar.gz", ".zip")
|
||||
if not output_str.endswith("-markdown.zip"):
|
||||
output_str = output_str.replace(".zip", "-markdown.zip")
|
||||
if not output_str.endswith(".zip"):
|
||||
output_str += ".zip"
|
||||
output_path = Path(output_str)
|
||||
|
||||
output_path = Path(output_path)
|
||||
output_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Create ZIP file
|
||||
with zipfile.ZipFile(output_path, 'w', zipfile.ZIP_DEFLATED) as zf:
|
||||
with zipfile.ZipFile(output_path, "w", zipfile.ZIP_DEFLATED) as zf:
|
||||
# Add SKILL.md as README.md
|
||||
skill_md = skill_dir / "SKILL.md"
|
||||
if skill_md.exists():
|
||||
content = skill_md.read_text(encoding='utf-8')
|
||||
content = skill_md.read_text(encoding="utf-8")
|
||||
zf.writestr("README.md", content)
|
||||
|
||||
# Add individual reference files
|
||||
refs_dir = skill_dir / "references"
|
||||
if refs_dir.exists():
|
||||
for ref_file in refs_dir.rglob("*.md"):
|
||||
if ref_file.is_file() and not ref_file.name.startswith('.'):
|
||||
if ref_file.is_file() and not ref_file.name.startswith("."):
|
||||
# Preserve directory structure under references/
|
||||
arcname = ref_file.relative_to(skill_dir)
|
||||
zf.write(ref_file, str(arcname))
|
||||
@@ -138,20 +138,21 @@ Browse the reference files for detailed information on each topic. All files are
|
||||
|
||||
# Add metadata file
|
||||
import json
|
||||
|
||||
metadata = {
|
||||
'platform': 'markdown',
|
||||
'name': skill_dir.name,
|
||||
'version': '1.0.0',
|
||||
'created_with': 'skill-seekers',
|
||||
'format': 'universal_markdown',
|
||||
'usage': 'Use with any LLM or documentation system'
|
||||
"platform": "markdown",
|
||||
"name": skill_dir.name,
|
||||
"version": "1.0.0",
|
||||
"created_with": "skill-seekers",
|
||||
"format": "universal_markdown",
|
||||
"usage": "Use with any LLM or documentation system",
|
||||
}
|
||||
|
||||
zf.writestr("metadata.json", json.dumps(metadata, indent=2))
|
||||
|
||||
return output_path
|
||||
|
||||
def upload(self, package_path: Path, api_key: str, **kwargs) -> Dict[str, Any]:
|
||||
def upload(self, package_path: Path, api_key: str, **kwargs) -> dict[str, Any]:
|
||||
"""
|
||||
Generic markdown export does not support upload.
|
||||
|
||||
@@ -166,13 +167,13 @@ Browse the reference files for detailed information on each topic. All files are
|
||||
Result indicating no upload capability
|
||||
"""
|
||||
return {
|
||||
'success': False,
|
||||
'skill_id': None,
|
||||
'url': str(package_path.absolute()),
|
||||
'message': (
|
||||
'Generic markdown export does not support automatic upload. '
|
||||
f'Your documentation is packaged at: {package_path.absolute()}'
|
||||
)
|
||||
"success": False,
|
||||
"skill_id": None,
|
||||
"url": str(package_path.absolute()),
|
||||
"message": (
|
||||
"Generic markdown export does not support automatic upload. "
|
||||
f"Your documentation is packaged at: {package_path.absolute()}"
|
||||
),
|
||||
}
|
||||
|
||||
def validate_api_key(self, api_key: str) -> bool:
|
||||
@@ -237,10 +238,10 @@ Browse the reference files for detailed information on each topic. All files are
|
||||
|
||||
# Add main content
|
||||
if skill_md.exists():
|
||||
content = skill_md.read_text(encoding='utf-8')
|
||||
content = skill_md.read_text(encoding="utf-8")
|
||||
# Strip YAML frontmatter if present
|
||||
if content.startswith('---'):
|
||||
parts = content.split('---', 2)
|
||||
if content.startswith("---"):
|
||||
parts = content.split("---", 2)
|
||||
if len(parts) >= 3:
|
||||
content = parts[2].strip()
|
||||
combined_parts.append(content)
|
||||
@@ -258,7 +259,7 @@ Browse the reference files for detailed information on each topic. All files are
|
||||
continue # Skip index
|
||||
|
||||
try:
|
||||
ref_content = ref_file.read_text(encoding='utf-8')
|
||||
ref_content = ref_file.read_text(encoding="utf-8")
|
||||
combined_parts.append(f"# {ref_file.stem.replace('_', ' ').title()}\n\n")
|
||||
combined_parts.append(ref_content)
|
||||
combined_parts.append("\n\n---\n\n")
|
||||
|
||||
@@ -6,11 +6,10 @@ Implements platform-specific handling for OpenAI ChatGPT Assistants.
|
||||
Uses Assistants API with Vector Store for file search.
|
||||
"""
|
||||
|
||||
import os
|
||||
import zipfile
|
||||
import json
|
||||
import zipfile
|
||||
from pathlib import Path
|
||||
from typing import Dict, Any
|
||||
from typing import Any
|
||||
|
||||
from .base import SkillAdaptor, SkillMetadata
|
||||
|
||||
@@ -123,51 +122,51 @@ Always prioritize accuracy by consulting the attached documentation files before
|
||||
skill_dir = Path(skill_dir)
|
||||
|
||||
# Determine output filename
|
||||
if output_path.is_dir() or str(output_path).endswith('/'):
|
||||
if output_path.is_dir() or str(output_path).endswith("/"):
|
||||
output_path = Path(output_path) / f"{skill_dir.name}-openai.zip"
|
||||
elif not str(output_path).endswith('.zip'):
|
||||
elif not str(output_path).endswith(".zip"):
|
||||
# Keep .zip extension
|
||||
if not str(output_path).endswith('-openai.zip'):
|
||||
output_str = str(output_path).replace('.zip', '-openai.zip')
|
||||
if not output_str.endswith('.zip'):
|
||||
output_str += '.zip'
|
||||
if not str(output_path).endswith("-openai.zip"):
|
||||
output_str = str(output_path).replace(".zip", "-openai.zip")
|
||||
if not output_str.endswith(".zip"):
|
||||
output_str += ".zip"
|
||||
output_path = Path(output_str)
|
||||
|
||||
output_path = Path(output_path)
|
||||
output_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Create ZIP file
|
||||
with zipfile.ZipFile(output_path, 'w', zipfile.ZIP_DEFLATED) as zf:
|
||||
with zipfile.ZipFile(output_path, "w", zipfile.ZIP_DEFLATED) as zf:
|
||||
# Add SKILL.md as assistant_instructions.txt
|
||||
skill_md = skill_dir / "SKILL.md"
|
||||
if skill_md.exists():
|
||||
instructions = skill_md.read_text(encoding='utf-8')
|
||||
instructions = skill_md.read_text(encoding="utf-8")
|
||||
zf.writestr("assistant_instructions.txt", instructions)
|
||||
|
||||
# Add references directory as vector_store_files/
|
||||
refs_dir = skill_dir / "references"
|
||||
if refs_dir.exists():
|
||||
for ref_file in refs_dir.rglob("*.md"):
|
||||
if ref_file.is_file() and not ref_file.name.startswith('.'):
|
||||
if ref_file.is_file() and not ref_file.name.startswith("."):
|
||||
# Place all reference files in vector_store_files/
|
||||
arcname = f"vector_store_files/{ref_file.name}"
|
||||
zf.write(ref_file, arcname)
|
||||
|
||||
# Create and add metadata file
|
||||
metadata = {
|
||||
'platform': 'openai',
|
||||
'name': skill_dir.name,
|
||||
'version': '1.0.0',
|
||||
'created_with': 'skill-seekers',
|
||||
'model': 'gpt-4o',
|
||||
'tools': ['file_search']
|
||||
"platform": "openai",
|
||||
"name": skill_dir.name,
|
||||
"version": "1.0.0",
|
||||
"created_with": "skill-seekers",
|
||||
"model": "gpt-4o",
|
||||
"tools": ["file_search"],
|
||||
}
|
||||
|
||||
zf.writestr("openai_metadata.json", json.dumps(metadata, indent=2))
|
||||
|
||||
return output_path
|
||||
|
||||
def upload(self, package_path: Path, api_key: str, **kwargs) -> Dict[str, Any]:
|
||||
def upload(self, package_path: Path, api_key: str, **kwargs) -> dict[str, Any]:
|
||||
"""
|
||||
Upload skill ZIP to OpenAI Assistants API.
|
||||
|
||||
@@ -186,30 +185,20 @@ Always prioritize accuracy by consulting the attached documentation files before
|
||||
# Validate package file FIRST
|
||||
package_path = Path(package_path)
|
||||
if not package_path.exists():
|
||||
return {
|
||||
'success': False,
|
||||
'skill_id': None,
|
||||
'url': None,
|
||||
'message': f'File not found: {package_path}'
|
||||
}
|
||||
return {"success": False, "skill_id": None, "url": None, "message": f"File not found: {package_path}"}
|
||||
|
||||
if not package_path.suffix == '.zip':
|
||||
return {
|
||||
'success': False,
|
||||
'skill_id': None,
|
||||
'url': None,
|
||||
'message': f'Not a ZIP file: {package_path}'
|
||||
}
|
||||
if not package_path.suffix == ".zip":
|
||||
return {"success": False, "skill_id": None, "url": None, "message": f"Not a ZIP file: {package_path}"}
|
||||
|
||||
# Check for openai library
|
||||
try:
|
||||
from openai import OpenAI
|
||||
except ImportError:
|
||||
return {
|
||||
'success': False,
|
||||
'skill_id': None,
|
||||
'url': None,
|
||||
'message': 'openai library not installed. Run: pip install openai'
|
||||
"success": False,
|
||||
"skill_id": None,
|
||||
"url": None,
|
||||
"message": "openai library not installed. Run: pip install openai",
|
||||
}
|
||||
|
||||
# Configure OpenAI client
|
||||
@@ -218,11 +207,10 @@ Always prioritize accuracy by consulting the attached documentation files before
|
||||
|
||||
# Extract package to temp directory
|
||||
import tempfile
|
||||
import shutil
|
||||
|
||||
with tempfile.TemporaryDirectory() as temp_dir:
|
||||
# Extract ZIP
|
||||
with zipfile.ZipFile(package_path, 'r') as zf:
|
||||
with zipfile.ZipFile(package_path, "r") as zf:
|
||||
zf.extractall(temp_dir)
|
||||
|
||||
temp_path = Path(temp_dir)
|
||||
@@ -231,29 +219,27 @@ Always prioritize accuracy by consulting the attached documentation files before
|
||||
instructions_file = temp_path / "assistant_instructions.txt"
|
||||
if not instructions_file.exists():
|
||||
return {
|
||||
'success': False,
|
||||
'skill_id': None,
|
||||
'url': None,
|
||||
'message': 'Invalid package: assistant_instructions.txt not found'
|
||||
"success": False,
|
||||
"skill_id": None,
|
||||
"url": None,
|
||||
"message": "Invalid package: assistant_instructions.txt not found",
|
||||
}
|
||||
|
||||
instructions = instructions_file.read_text(encoding='utf-8')
|
||||
instructions = instructions_file.read_text(encoding="utf-8")
|
||||
|
||||
# Read metadata
|
||||
metadata_file = temp_path / "openai_metadata.json"
|
||||
skill_name = package_path.stem
|
||||
model = kwargs.get('model', 'gpt-4o')
|
||||
model = kwargs.get("model", "gpt-4o")
|
||||
|
||||
if metadata_file.exists():
|
||||
with open(metadata_file, 'r') as f:
|
||||
with open(metadata_file) as f:
|
||||
metadata = json.load(f)
|
||||
skill_name = metadata.get('name', skill_name)
|
||||
model = metadata.get('model', model)
|
||||
skill_name = metadata.get("name", skill_name)
|
||||
model = metadata.get("model", model)
|
||||
|
||||
# Create vector store
|
||||
vector_store = client.beta.vector_stores.create(
|
||||
name=f"{skill_name} Documentation"
|
||||
)
|
||||
vector_store = client.beta.vector_stores.create(name=f"{skill_name} Documentation")
|
||||
|
||||
# Upload reference files to vector store
|
||||
vector_files_dir = temp_path / "vector_store_files"
|
||||
@@ -262,19 +248,13 @@ Always prioritize accuracy by consulting the attached documentation files before
|
||||
if vector_files_dir.exists():
|
||||
for ref_file in vector_files_dir.glob("*.md"):
|
||||
# Upload file
|
||||
with open(ref_file, 'rb') as f:
|
||||
uploaded_file = client.files.create(
|
||||
file=f,
|
||||
purpose='assistants'
|
||||
)
|
||||
with open(ref_file, "rb") as f:
|
||||
uploaded_file = client.files.create(file=f, purpose="assistants")
|
||||
file_ids.append(uploaded_file.id)
|
||||
|
||||
# Attach files to vector store
|
||||
if file_ids:
|
||||
client.beta.vector_stores.files.create_batch(
|
||||
vector_store_id=vector_store.id,
|
||||
file_ids=file_ids
|
||||
)
|
||||
client.beta.vector_stores.files.create_batch(vector_store_id=vector_store.id, file_ids=file_ids)
|
||||
|
||||
# Create assistant
|
||||
assistant = client.beta.assistants.create(
|
||||
@@ -282,27 +262,18 @@ Always prioritize accuracy by consulting the attached documentation files before
|
||||
instructions=instructions,
|
||||
model=model,
|
||||
tools=[{"type": "file_search"}],
|
||||
tool_resources={
|
||||
"file_search": {
|
||||
"vector_store_ids": [vector_store.id]
|
||||
}
|
||||
}
|
||||
tool_resources={"file_search": {"vector_store_ids": [vector_store.id]}},
|
||||
)
|
||||
|
||||
return {
|
||||
'success': True,
|
||||
'skill_id': assistant.id,
|
||||
'url': f"https://platform.openai.com/assistants/{assistant.id}",
|
||||
'message': f'Assistant created with {len(file_ids)} knowledge files'
|
||||
"success": True,
|
||||
"skill_id": assistant.id,
|
||||
"url": f"https://platform.openai.com/assistants/{assistant.id}",
|
||||
"message": f"Assistant created with {len(file_ids)} knowledge files",
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
return {
|
||||
'success': False,
|
||||
'skill_id': None,
|
||||
'url': None,
|
||||
'message': f'Upload failed: {str(e)}'
|
||||
}
|
||||
return {"success": False, "skill_id": None, "url": None, "message": f"Upload failed: {str(e)}"}
|
||||
|
||||
def validate_api_key(self, api_key: str) -> bool:
|
||||
"""
|
||||
@@ -314,7 +285,7 @@ Always prioritize accuracy by consulting the attached documentation files before
|
||||
Returns:
|
||||
True if key starts with 'sk-'
|
||||
"""
|
||||
return api_key.strip().startswith('sk-')
|
||||
return api_key.strip().startswith("sk-")
|
||||
|
||||
def get_env_var_name(self) -> str:
|
||||
"""
|
||||
@@ -372,17 +343,13 @@ Always prioritize accuracy by consulting the attached documentation files before
|
||||
# Read current SKILL.md
|
||||
current_skill_md = None
|
||||
if skill_md_path.exists():
|
||||
current_skill_md = skill_md_path.read_text(encoding='utf-8')
|
||||
current_skill_md = skill_md_path.read_text(encoding="utf-8")
|
||||
print(f" ℹ Found existing SKILL.md ({len(current_skill_md)} chars)")
|
||||
else:
|
||||
print(f" ℹ No existing SKILL.md, will create new one")
|
||||
print(" ℹ No existing SKILL.md, will create new one")
|
||||
|
||||
# Build enhancement prompt
|
||||
prompt = self._build_enhancement_prompt(
|
||||
skill_dir.name,
|
||||
references,
|
||||
current_skill_md
|
||||
)
|
||||
prompt = self._build_enhancement_prompt(skill_dir.name, references, current_skill_md)
|
||||
|
||||
print("\n🤖 Asking GPT-4o to enhance SKILL.md...")
|
||||
print(f" Input: {len(prompt):,} characters")
|
||||
@@ -395,15 +362,12 @@ Always prioritize accuracy by consulting the attached documentation files before
|
||||
messages=[
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are an expert technical writer creating Assistant instructions for OpenAI ChatGPT."
|
||||
"content": "You are an expert technical writer creating Assistant instructions for OpenAI ChatGPT.",
|
||||
},
|
||||
{
|
||||
"role": "user",
|
||||
"content": prompt
|
||||
}
|
||||
{"role": "user", "content": prompt},
|
||||
],
|
||||
temperature=0.3,
|
||||
max_tokens=4096
|
||||
max_tokens=4096,
|
||||
)
|
||||
|
||||
enhanced_content = response.choices[0].message.content
|
||||
@@ -411,13 +375,13 @@ Always prioritize accuracy by consulting the attached documentation files before
|
||||
|
||||
# Backup original
|
||||
if skill_md_path.exists():
|
||||
backup_path = skill_md_path.with_suffix('.md.backup')
|
||||
backup_path = skill_md_path.with_suffix(".md.backup")
|
||||
skill_md_path.rename(backup_path)
|
||||
print(f" 💾 Backed up original to: {backup_path.name}")
|
||||
|
||||
# Save enhanced version
|
||||
skill_md_path.write_text(enhanced_content, encoding='utf-8')
|
||||
print(f" ✅ Saved enhanced SKILL.md")
|
||||
skill_md_path.write_text(enhanced_content, encoding="utf-8")
|
||||
print(" ✅ Saved enhanced SKILL.md")
|
||||
|
||||
return True
|
||||
|
||||
@@ -425,7 +389,7 @@ Always prioritize accuracy by consulting the attached documentation files before
|
||||
print(f"❌ Error calling OpenAI API: {e}")
|
||||
return False
|
||||
|
||||
def _read_reference_files(self, references_dir: Path, max_chars: int = 200000) -> Dict[str, str]:
|
||||
def _read_reference_files(self, references_dir: Path, max_chars: int = 200000) -> dict[str, str]:
|
||||
"""
|
||||
Read reference markdown files from skill directory.
|
||||
|
||||
@@ -448,7 +412,7 @@ Always prioritize accuracy by consulting the attached documentation files before
|
||||
break
|
||||
|
||||
try:
|
||||
content = ref_file.read_text(encoding='utf-8')
|
||||
content = ref_file.read_text(encoding="utf-8")
|
||||
# Limit individual file size
|
||||
if len(content) > 30000:
|
||||
content = content[:30000] + "\n\n...(truncated)"
|
||||
@@ -462,10 +426,7 @@ Always prioritize accuracy by consulting the attached documentation files before
|
||||
return references
|
||||
|
||||
def _build_enhancement_prompt(
|
||||
self,
|
||||
skill_name: str,
|
||||
references: Dict[str, str],
|
||||
current_skill_md: str = None
|
||||
self, skill_name: str, references: dict[str, str], current_skill_md: str = None
|
||||
) -> str:
|
||||
"""
|
||||
Build OpenAI API prompt for enhancement.
|
||||
@@ -483,9 +444,9 @@ Always prioritize accuracy by consulting the attached documentation files before
|
||||
I've scraped documentation and organized it into reference files. Your job is to create EXCELLENT Assistant instructions that will help the Assistant use this documentation effectively.
|
||||
|
||||
CURRENT INSTRUCTIONS:
|
||||
{'```' if current_skill_md else '(none - create from scratch)'}
|
||||
{current_skill_md or 'No existing instructions'}
|
||||
{'```' if current_skill_md else ''}
|
||||
{"```" if current_skill_md else "(none - create from scratch)"}
|
||||
{current_skill_md or "No existing instructions"}
|
||||
{"```" if current_skill_md else ""}
|
||||
|
||||
REFERENCE DOCUMENTATION:
|
||||
"""
|
||||
|
||||
@@ -17,9 +17,8 @@ Credits:
|
||||
- Graceful degradation if API unavailable
|
||||
"""
|
||||
|
||||
import os
|
||||
import logging
|
||||
from typing import List, Dict, Optional, Any
|
||||
import os
|
||||
from dataclasses import dataclass
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
@@ -28,18 +27,19 @@ logger = logging.getLogger(__name__)
|
||||
@dataclass
|
||||
class AIAnalysis:
|
||||
"""AI analysis result for patterns or examples"""
|
||||
|
||||
explanation: str
|
||||
issues: List[str]
|
||||
recommendations: List[str]
|
||||
related_items: List[str] # Related patterns or examples
|
||||
best_practices: List[str]
|
||||
issues: list[str]
|
||||
recommendations: list[str]
|
||||
related_items: list[str] # Related patterns or examples
|
||||
best_practices: list[str]
|
||||
confidence_boost: float # -0.2 to +0.2 adjustment to confidence
|
||||
|
||||
|
||||
class AIEnhancer:
|
||||
"""Base class for AI enhancement"""
|
||||
|
||||
def __init__(self, api_key: Optional[str] = None, enabled: bool = True, mode: str = "auto"):
|
||||
def __init__(self, api_key: str | None = None, enabled: bool = True, mode: str = "auto"):
|
||||
"""
|
||||
Initialize AI enhancer.
|
||||
|
||||
@@ -53,7 +53,7 @@ class AIEnhancer:
|
||||
"""
|
||||
self.enabled = enabled
|
||||
self.mode = mode
|
||||
self.api_key = api_key or os.environ.get('ANTHROPIC_API_KEY')
|
||||
self.api_key = api_key or os.environ.get("ANTHROPIC_API_KEY")
|
||||
self.client = None
|
||||
|
||||
# Determine actual mode
|
||||
@@ -72,6 +72,7 @@ class AIEnhancer:
|
||||
if self.mode == "api" and self.enabled:
|
||||
try:
|
||||
import anthropic
|
||||
|
||||
self.client = anthropic.Anthropic(api_key=self.api_key)
|
||||
logger.info("✅ AI enhancement enabled (using Claude API)")
|
||||
except ImportError:
|
||||
@@ -88,16 +89,14 @@ class AIEnhancer:
|
||||
logger.info(" Use API mode (set ANTHROPIC_API_KEY) or 'skill-seekers enhance' for SKILL.md")
|
||||
self.enabled = False
|
||||
|
||||
def _call_claude(self, prompt: str, max_tokens: int = 1000) -> Optional[str]:
|
||||
def _call_claude(self, prompt: str, max_tokens: int = 1000) -> str | None:
|
||||
"""Call Claude API with error handling"""
|
||||
if not self.client:
|
||||
return None
|
||||
|
||||
try:
|
||||
response = self.client.messages.create(
|
||||
model="claude-sonnet-4-20250514",
|
||||
max_tokens=max_tokens,
|
||||
messages=[{"role": "user", "content": prompt}]
|
||||
model="claude-sonnet-4-20250514", max_tokens=max_tokens, messages=[{"role": "user", "content": prompt}]
|
||||
)
|
||||
return response.content[0].text
|
||||
except Exception as e:
|
||||
@@ -108,7 +107,7 @@ class AIEnhancer:
|
||||
class PatternEnhancer(AIEnhancer):
|
||||
"""Enhance design pattern detection with AI analysis"""
|
||||
|
||||
def enhance_patterns(self, patterns: List[Dict]) -> List[Dict]:
|
||||
def enhance_patterns(self, patterns: list[dict]) -> list[dict]:
|
||||
"""
|
||||
Enhance detected patterns with AI analysis.
|
||||
|
||||
@@ -128,19 +127,19 @@ class PatternEnhancer(AIEnhancer):
|
||||
enhanced = []
|
||||
|
||||
for i in range(0, len(patterns), batch_size):
|
||||
batch = patterns[i:i+batch_size]
|
||||
batch = patterns[i : i + batch_size]
|
||||
batch_results = self._enhance_pattern_batch(batch)
|
||||
enhanced.extend(batch_results)
|
||||
|
||||
logger.info(f"✅ Enhanced {len(enhanced)} patterns")
|
||||
return enhanced
|
||||
|
||||
def _enhance_pattern_batch(self, patterns: List[Dict]) -> List[Dict]:
|
||||
def _enhance_pattern_batch(self, patterns: list[dict]) -> list[dict]:
|
||||
"""Enhance a batch of patterns"""
|
||||
# Prepare prompt
|
||||
pattern_descriptions = []
|
||||
for idx, p in enumerate(patterns):
|
||||
desc = f"{idx+1}. {p['pattern_type']} in {p.get('class_name', 'unknown')}"
|
||||
desc = f"{idx + 1}. {p['pattern_type']} in {p.get('class_name', 'unknown')}"
|
||||
desc += f"\n Evidence: {', '.join(p.get('evidence', []))}"
|
||||
pattern_descriptions.append(desc)
|
||||
|
||||
@@ -166,24 +165,25 @@ Format as JSON array matching input order. Be concise and actionable.
|
||||
|
||||
try:
|
||||
import json
|
||||
|
||||
analyses = json.loads(response)
|
||||
|
||||
# Merge AI analysis into patterns
|
||||
for idx, pattern in enumerate(patterns):
|
||||
if idx < len(analyses):
|
||||
analysis = analyses[idx]
|
||||
pattern['ai_analysis'] = {
|
||||
'explanation': analysis.get('explanation', ''),
|
||||
'issues': analysis.get('issues', []),
|
||||
'recommendations': analysis.get('recommendations', []),
|
||||
'related_patterns': analysis.get('related_patterns', []),
|
||||
'confidence_boost': analysis.get('confidence_boost', 0.0)
|
||||
pattern["ai_analysis"] = {
|
||||
"explanation": analysis.get("explanation", ""),
|
||||
"issues": analysis.get("issues", []),
|
||||
"recommendations": analysis.get("recommendations", []),
|
||||
"related_patterns": analysis.get("related_patterns", []),
|
||||
"confidence_boost": analysis.get("confidence_boost", 0.0),
|
||||
}
|
||||
|
||||
# Adjust confidence
|
||||
boost = analysis.get('confidence_boost', 0.0)
|
||||
boost = analysis.get("confidence_boost", 0.0)
|
||||
if -0.2 <= boost <= 0.2:
|
||||
pattern['confidence'] = min(1.0, max(0.0, pattern['confidence'] + boost))
|
||||
pattern["confidence"] = min(1.0, max(0.0, pattern["confidence"] + boost))
|
||||
|
||||
return patterns
|
||||
|
||||
@@ -198,7 +198,7 @@ Format as JSON array matching input order. Be concise and actionable.
|
||||
class TestExampleEnhancer(AIEnhancer):
|
||||
"""Enhance test examples with AI analysis"""
|
||||
|
||||
def enhance_examples(self, examples: List[Dict]) -> List[Dict]:
|
||||
def enhance_examples(self, examples: list[dict]) -> list[dict]:
|
||||
"""
|
||||
Enhance test examples with AI context and explanations.
|
||||
|
||||
@@ -218,21 +218,21 @@ class TestExampleEnhancer(AIEnhancer):
|
||||
enhanced = []
|
||||
|
||||
for i in range(0, len(examples), batch_size):
|
||||
batch = examples[i:i+batch_size]
|
||||
batch = examples[i : i + batch_size]
|
||||
batch_results = self._enhance_example_batch(batch)
|
||||
enhanced.extend(batch_results)
|
||||
|
||||
logger.info(f"✅ Enhanced {len(enhanced)} examples")
|
||||
return enhanced
|
||||
|
||||
def _enhance_example_batch(self, examples: List[Dict]) -> List[Dict]:
|
||||
def _enhance_example_batch(self, examples: list[dict]) -> list[dict]:
|
||||
"""Enhance a batch of examples"""
|
||||
# Prepare prompt
|
||||
example_descriptions = []
|
||||
for idx, ex in enumerate(examples):
|
||||
desc = f"{idx+1}. {ex.get('category', 'unknown')} - {ex.get('test_name', 'unknown')}"
|
||||
desc = f"{idx + 1}. {ex.get('category', 'unknown')} - {ex.get('test_name', 'unknown')}"
|
||||
desc += f"\n Code: {ex.get('code', '')[:100]}..."
|
||||
if ex.get('expected_behavior'):
|
||||
if ex.get("expected_behavior"):
|
||||
desc += f"\n Expected: {ex['expected_behavior']}"
|
||||
example_descriptions.append(desc)
|
||||
|
||||
@@ -257,18 +257,19 @@ Format as JSON array matching input order. Focus on educational value.
|
||||
|
||||
try:
|
||||
import json
|
||||
|
||||
analyses = json.loads(response)
|
||||
|
||||
# Merge AI analysis into examples
|
||||
for idx, example in enumerate(examples):
|
||||
if idx < len(analyses):
|
||||
analysis = analyses[idx]
|
||||
example['ai_analysis'] = {
|
||||
'explanation': analysis.get('explanation', ''),
|
||||
'best_practices': analysis.get('best_practices', []),
|
||||
'common_mistakes': analysis.get('common_mistakes', []),
|
||||
'related_examples': analysis.get('related_examples', []),
|
||||
'tutorial_group': analysis.get('tutorial_group', '')
|
||||
example["ai_analysis"] = {
|
||||
"explanation": analysis.get("explanation", ""),
|
||||
"best_practices": analysis.get("best_practices", []),
|
||||
"common_mistakes": analysis.get("common_mistakes", []),
|
||||
"related_examples": analysis.get("related_examples", []),
|
||||
"tutorial_group": analysis.get("tutorial_group", ""),
|
||||
}
|
||||
|
||||
return examples
|
||||
@@ -280,7 +281,7 @@ Format as JSON array matching input order. Focus on educational value.
|
||||
logger.warning(f"⚠️ Error processing AI analysis: {e}")
|
||||
return examples
|
||||
|
||||
def generate_tutorials(self, examples: List[Dict]) -> Dict[str, List[Dict]]:
|
||||
def generate_tutorials(self, examples: list[dict]) -> dict[str, list[dict]]:
|
||||
"""
|
||||
Group enhanced examples into tutorial sections.
|
||||
|
||||
@@ -293,8 +294,8 @@ Format as JSON array matching input order. Focus on educational value.
|
||||
tutorials = {}
|
||||
|
||||
for example in examples:
|
||||
ai_analysis = example.get('ai_analysis', {})
|
||||
group = ai_analysis.get('tutorial_group', 'Miscellaneous')
|
||||
ai_analysis = example.get("ai_analysis", {})
|
||||
group = ai_analysis.get("tutorial_group", "Miscellaneous")
|
||||
|
||||
if group not in tutorials:
|
||||
tutorials[group] = []
|
||||
|
||||
@@ -17,10 +17,9 @@ Usage:
|
||||
builder.build_reference(output_dir)
|
||||
"""
|
||||
|
||||
import os
|
||||
import json
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Any, Optional
|
||||
from typing import Any
|
||||
|
||||
|
||||
class APIReferenceBuilder:
|
||||
@@ -31,7 +30,7 @@ class APIReferenceBuilder:
|
||||
documentation for each analyzed source file.
|
||||
"""
|
||||
|
||||
def __init__(self, code_analysis: Dict[str, Any]):
|
||||
def __init__(self, code_analysis: dict[str, Any]):
|
||||
"""
|
||||
Initialize builder with code analysis results.
|
||||
|
||||
@@ -40,9 +39,9 @@ class APIReferenceBuilder:
|
||||
Expected format: {'files': [{'file': 'path', 'classes': [...], 'functions': [...]}]}
|
||||
"""
|
||||
self.code_analysis = code_analysis
|
||||
self.files_data = code_analysis.get('files', [])
|
||||
self.files_data = code_analysis.get("files", [])
|
||||
|
||||
def build_reference(self, output_dir: Path) -> Dict[str, Path]:
|
||||
def build_reference(self, output_dir: Path) -> dict[str, Path]:
|
||||
"""
|
||||
Generate markdown files for each analyzed source file.
|
||||
|
||||
@@ -58,11 +57,11 @@ class APIReferenceBuilder:
|
||||
generated_files = {}
|
||||
|
||||
for file_data in self.files_data:
|
||||
source_file = file_data.get('file', 'unknown')
|
||||
language = file_data.get('language', 'Unknown')
|
||||
source_file = file_data.get("file", "unknown")
|
||||
language = file_data.get("language", "Unknown")
|
||||
|
||||
# Skip files with no analysis
|
||||
if not file_data.get('classes') and not file_data.get('functions'):
|
||||
if not file_data.get("classes") and not file_data.get("functions"):
|
||||
continue
|
||||
|
||||
# Generate markdown content
|
||||
@@ -73,7 +72,7 @@ class APIReferenceBuilder:
|
||||
output_path = output_dir / output_filename
|
||||
|
||||
# Write markdown file
|
||||
output_path.write_text(markdown_content, encoding='utf-8')
|
||||
output_path.write_text(markdown_content, encoding="utf-8")
|
||||
generated_files[source_file] = output_path
|
||||
|
||||
return generated_files
|
||||
@@ -92,11 +91,10 @@ class APIReferenceBuilder:
|
||||
basename = Path(source_file).name
|
||||
|
||||
# Replace extension with .md
|
||||
name_without_ext = basename.rsplit('.', 1)[0] if '.' in basename else basename
|
||||
name_without_ext = basename.rsplit(".", 1)[0] if "." in basename else basename
|
||||
return f"{name_without_ext}.md"
|
||||
|
||||
def _generate_file_reference(self, file_data: Dict[str, Any],
|
||||
source_file: str, language: str) -> str:
|
||||
def _generate_file_reference(self, file_data: dict[str, Any], source_file: str, language: str) -> str:
|
||||
"""
|
||||
Generate complete markdown reference for a single file.
|
||||
|
||||
@@ -118,7 +116,7 @@ class APIReferenceBuilder:
|
||||
lines.append("---\n")
|
||||
|
||||
# Classes section
|
||||
classes = file_data.get('classes', [])
|
||||
classes = file_data.get("classes", [])
|
||||
if classes:
|
||||
lines.append("## Classes\n")
|
||||
for cls in classes:
|
||||
@@ -126,16 +124,16 @@ class APIReferenceBuilder:
|
||||
lines.append("\n")
|
||||
|
||||
# Functions section
|
||||
functions = file_data.get('functions', [])
|
||||
functions = file_data.get("functions", [])
|
||||
if functions:
|
||||
lines.append("## Functions\n")
|
||||
for func in functions:
|
||||
lines.append(self._format_function(func))
|
||||
lines.append("\n")
|
||||
|
||||
return '\n'.join(lines)
|
||||
return "\n".join(lines)
|
||||
|
||||
def _format_class(self, class_sig: Dict[str, Any]) -> str:
|
||||
def _format_class(self, class_sig: dict[str, Any]) -> str:
|
||||
"""
|
||||
Format class signature as markdown.
|
||||
|
||||
@@ -148,33 +146,33 @@ class APIReferenceBuilder:
|
||||
lines = []
|
||||
|
||||
# Class name
|
||||
class_name = class_sig.get('name', 'Unknown')
|
||||
class_name = class_sig.get("name", "Unknown")
|
||||
lines.append(f"### {class_name}\n")
|
||||
|
||||
# Docstring
|
||||
docstring = class_sig.get('docstring')
|
||||
docstring = class_sig.get("docstring")
|
||||
if docstring:
|
||||
lines.append(f"{docstring}\n")
|
||||
|
||||
# Inheritance
|
||||
base_classes = class_sig.get('base_classes', [])
|
||||
base_classes = class_sig.get("base_classes", [])
|
||||
if base_classes:
|
||||
bases_str = ', '.join(base_classes)
|
||||
bases_str = ", ".join(base_classes)
|
||||
lines.append(f"**Inherits from**: {bases_str}\n")
|
||||
else:
|
||||
lines.append("**Inherits from**: (none)\n")
|
||||
|
||||
# Methods
|
||||
methods = class_sig.get('methods', [])
|
||||
methods = class_sig.get("methods", [])
|
||||
if methods:
|
||||
lines.append("#### Methods\n")
|
||||
for method in methods:
|
||||
lines.append(self._format_method(method))
|
||||
lines.append("")
|
||||
|
||||
return '\n'.join(lines)
|
||||
return "\n".join(lines)
|
||||
|
||||
def _format_method(self, method_sig: Dict[str, Any]) -> str:
|
||||
def _format_method(self, method_sig: dict[str, Any]) -> str:
|
||||
"""
|
||||
Format method signature as markdown.
|
||||
|
||||
@@ -191,30 +189,30 @@ class APIReferenceBuilder:
|
||||
lines.append(f"##### {signature}\n")
|
||||
|
||||
# Docstring
|
||||
docstring = method_sig.get('docstring')
|
||||
docstring = method_sig.get("docstring")
|
||||
if docstring:
|
||||
lines.append(f"{docstring}\n")
|
||||
|
||||
# Decorators
|
||||
decorators = method_sig.get('decorators', [])
|
||||
decorators = method_sig.get("decorators", [])
|
||||
if decorators:
|
||||
dec_str = ', '.join(f"`@{d}`" for d in decorators)
|
||||
dec_str = ", ".join(f"`@{d}`" for d in decorators)
|
||||
lines.append(f"**Decorators**: {dec_str}\n")
|
||||
|
||||
# Parameters table
|
||||
params = method_sig.get('parameters', [])
|
||||
params = method_sig.get("parameters", [])
|
||||
if params:
|
||||
lines.append(self._format_parameters(params))
|
||||
lines.append("")
|
||||
|
||||
# Return type
|
||||
return_type = method_sig.get('return_type')
|
||||
return_type = method_sig.get("return_type")
|
||||
if return_type:
|
||||
lines.append(f"**Returns**: `{return_type}`\n")
|
||||
|
||||
return '\n'.join(lines)
|
||||
return "\n".join(lines)
|
||||
|
||||
def _format_function(self, func_sig: Dict[str, Any]) -> str:
|
||||
def _format_function(self, func_sig: dict[str, Any]) -> str:
|
||||
"""
|
||||
Format function signature as markdown.
|
||||
|
||||
@@ -231,30 +229,30 @@ class APIReferenceBuilder:
|
||||
lines.append(f"### {signature}\n")
|
||||
|
||||
# Async indicator
|
||||
if func_sig.get('is_async'):
|
||||
if func_sig.get("is_async"):
|
||||
lines.append("**Async function**\n")
|
||||
|
||||
# Docstring
|
||||
docstring = func_sig.get('docstring')
|
||||
docstring = func_sig.get("docstring")
|
||||
if docstring:
|
||||
lines.append(f"{docstring}\n")
|
||||
|
||||
# Parameters table
|
||||
params = func_sig.get('parameters', [])
|
||||
params = func_sig.get("parameters", [])
|
||||
if params:
|
||||
lines.append(self._format_parameters(params))
|
||||
lines.append("")
|
||||
|
||||
# Return type
|
||||
return_type = func_sig.get('return_type')
|
||||
return_type = func_sig.get("return_type")
|
||||
if return_type:
|
||||
lines.append(f"**Returns**: `{return_type}`\n")
|
||||
else:
|
||||
lines.append("**Returns**: (none)\n")
|
||||
|
||||
return '\n'.join(lines)
|
||||
return "\n".join(lines)
|
||||
|
||||
def _build_signature(self, sig: Dict[str, Any]) -> str:
|
||||
def _build_signature(self, sig: dict[str, Any]) -> str:
|
||||
"""
|
||||
Build function/method signature string.
|
||||
|
||||
@@ -264,28 +262,28 @@ class APIReferenceBuilder:
|
||||
Returns:
|
||||
Formatted signature string
|
||||
"""
|
||||
name = sig.get('name', 'unknown')
|
||||
params = sig.get('parameters', [])
|
||||
return_type = sig.get('return_type')
|
||||
name = sig.get("name", "unknown")
|
||||
params = sig.get("parameters", [])
|
||||
return_type = sig.get("return_type")
|
||||
|
||||
# Build parameter list
|
||||
param_strs = []
|
||||
for param in params:
|
||||
param_str = param.get('name', '')
|
||||
param_str = param.get("name", "")
|
||||
|
||||
# Add type hint if available
|
||||
type_hint = param.get('type_hint')
|
||||
type_hint = param.get("type_hint")
|
||||
if type_hint:
|
||||
param_str += f": {type_hint}"
|
||||
|
||||
# Add default value if available
|
||||
default = param.get('default')
|
||||
default = param.get("default")
|
||||
if default:
|
||||
param_str += f" = {default}"
|
||||
|
||||
param_strs.append(param_str)
|
||||
|
||||
params_str = ', '.join(param_strs)
|
||||
params_str = ", ".join(param_strs)
|
||||
|
||||
# Build full signature
|
||||
if return_type:
|
||||
@@ -293,7 +291,7 @@ class APIReferenceBuilder:
|
||||
else:
|
||||
return f"{name}({params_str})"
|
||||
|
||||
def _format_parameters(self, params: List[Dict]) -> str:
|
||||
def _format_parameters(self, params: list[dict]) -> str:
|
||||
"""
|
||||
Format parameter list as markdown table.
|
||||
|
||||
@@ -313,19 +311,19 @@ class APIReferenceBuilder:
|
||||
lines.append("|------|------|---------|-------------|")
|
||||
|
||||
for param in params:
|
||||
name = param.get('name', '-')
|
||||
type_hint = param.get('type_hint', '-')
|
||||
default = param.get('default')
|
||||
name = param.get("name", "-")
|
||||
type_hint = param.get("type_hint", "-")
|
||||
default = param.get("default")
|
||||
|
||||
# Show "-" for parameters without defaults
|
||||
default_str = default if default is not None else '-'
|
||||
default_str = default if default is not None else "-"
|
||||
|
||||
# For description, use empty for now (would need JSDoc/docstring parsing)
|
||||
description = "-"
|
||||
|
||||
lines.append(f"| {name} | {type_hint} | {default_str} | {description} |")
|
||||
|
||||
return '\n'.join(lines)
|
||||
return "\n".join(lines)
|
||||
|
||||
|
||||
def main():
|
||||
@@ -336,12 +334,10 @@ def main():
|
||||
"""
|
||||
import argparse
|
||||
|
||||
parser = argparse.ArgumentParser(
|
||||
description='Generate API reference from code analysis results'
|
||||
)
|
||||
parser = argparse.ArgumentParser(description="Generate API reference from code analysis results")
|
||||
|
||||
parser.add_argument('input_file', help='Code analysis JSON file')
|
||||
parser.add_argument('output_dir', help='Output directory for markdown files')
|
||||
parser.add_argument("input_file", help="Code analysis JSON file")
|
||||
parser.add_argument("output_dir", help="Output directory for markdown files")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
@@ -351,7 +347,7 @@ def main():
|
||||
print(f"Error: Input file not found: {input_path}")
|
||||
return 1
|
||||
|
||||
with open(input_path, 'r', encoding='utf-8') as f:
|
||||
with open(input_path, encoding="utf-8") as f:
|
||||
code_analysis = json.load(f)
|
||||
|
||||
# Build API reference
|
||||
@@ -367,6 +363,7 @@ def main():
|
||||
return 0
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
import sys
|
||||
|
||||
sys.exit(main())
|
||||
|
||||
@@ -21,11 +21,9 @@ Credits:
|
||||
"""
|
||||
|
||||
import logging
|
||||
import re
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import List, Dict, Optional, Set
|
||||
from collections import defaultdict
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -33,41 +31,43 @@ logger = logging.getLogger(__name__)
|
||||
@dataclass
|
||||
class ArchitecturalPattern:
|
||||
"""Detected architectural pattern"""
|
||||
|
||||
pattern_name: str # e.g., "MVC", "MVVM", "Repository"
|
||||
confidence: float # 0.0-1.0
|
||||
evidence: List[str] # List of evidence supporting detection
|
||||
components: Dict[str, List[str]] # Component type -> file paths
|
||||
framework: Optional[str] = None # Detected framework (Django, Spring, etc.)
|
||||
evidence: list[str] # List of evidence supporting detection
|
||||
components: dict[str, list[str]] # Component type -> file paths
|
||||
framework: str | None = None # Detected framework (Django, Spring, etc.)
|
||||
description: str = "" # Human-readable description
|
||||
|
||||
|
||||
@dataclass
|
||||
class ArchitecturalReport:
|
||||
"""Complete architectural analysis report"""
|
||||
patterns: List[ArchitecturalPattern]
|
||||
directory_structure: Dict[str, int] # Directory name -> file count
|
||||
total_files_analyzed: int
|
||||
frameworks_detected: List[str]
|
||||
ai_analysis: Optional[Dict] = None # AI enhancement (C3.6 integration)
|
||||
|
||||
def to_dict(self) -> Dict:
|
||||
patterns: list[ArchitecturalPattern]
|
||||
directory_structure: dict[str, int] # Directory name -> file count
|
||||
total_files_analyzed: int
|
||||
frameworks_detected: list[str]
|
||||
ai_analysis: dict | None = None # AI enhancement (C3.6 integration)
|
||||
|
||||
def to_dict(self) -> dict:
|
||||
"""Export to dictionary"""
|
||||
return {
|
||||
'patterns': [
|
||||
"patterns": [
|
||||
{
|
||||
'pattern_name': p.pattern_name,
|
||||
'confidence': p.confidence,
|
||||
'evidence': p.evidence,
|
||||
'components': p.components,
|
||||
'framework': p.framework,
|
||||
'description': p.description
|
||||
"pattern_name": p.pattern_name,
|
||||
"confidence": p.confidence,
|
||||
"evidence": p.evidence,
|
||||
"components": p.components,
|
||||
"framework": p.framework,
|
||||
"description": p.description,
|
||||
}
|
||||
for p in self.patterns
|
||||
],
|
||||
'directory_structure': self.directory_structure,
|
||||
'total_files_analyzed': self.total_files_analyzed,
|
||||
'frameworks_detected': self.frameworks_detected,
|
||||
'ai_analysis': self.ai_analysis
|
||||
"directory_structure": self.directory_structure,
|
||||
"total_files_analyzed": self.total_files_analyzed,
|
||||
"frameworks_detected": self.frameworks_detected,
|
||||
"ai_analysis": self.ai_analysis,
|
||||
}
|
||||
|
||||
|
||||
@@ -79,25 +79,25 @@ class ArchitecturalPatternDetector:
|
||||
"""
|
||||
|
||||
# Common directory patterns for architectures
|
||||
MVC_DIRS = {'models', 'views', 'controllers', 'model', 'view', 'controller'}
|
||||
MVVM_DIRS = {'models', 'views', 'viewmodels', 'viewmodel'}
|
||||
LAYERED_DIRS = {'presentation', 'business', 'data', 'dal', 'bll', 'ui'}
|
||||
CLEAN_ARCH_DIRS = {'domain', 'application', 'infrastructure', 'presentation'}
|
||||
REPO_DIRS = {'repositories', 'repository'}
|
||||
SERVICE_DIRS = {'services', 'service'}
|
||||
MVC_DIRS = {"models", "views", "controllers", "model", "view", "controller"}
|
||||
MVVM_DIRS = {"models", "views", "viewmodels", "viewmodel"}
|
||||
LAYERED_DIRS = {"presentation", "business", "data", "dal", "bll", "ui"}
|
||||
CLEAN_ARCH_DIRS = {"domain", "application", "infrastructure", "presentation"}
|
||||
REPO_DIRS = {"repositories", "repository"}
|
||||
SERVICE_DIRS = {"services", "service"}
|
||||
|
||||
# Framework detection patterns
|
||||
FRAMEWORK_MARKERS = {
|
||||
'Django': ['django', 'manage.py', 'settings.py', 'urls.py'],
|
||||
'Flask': ['flask', 'app.py', 'wsgi.py'],
|
||||
'Spring': ['springframework', '@Controller', '@Service', '@Repository'],
|
||||
'ASP.NET': ['Controllers', 'Models', 'Views', '.cshtml', 'Startup.cs'],
|
||||
'Rails': ['app/models', 'app/views', 'app/controllers', 'config/routes.rb'],
|
||||
'Angular': ['app.module.ts', '@Component', '@Injectable', 'angular.json'],
|
||||
'React': ['package.json', 'react', 'components'],
|
||||
'Vue.js': ['vue', '.vue', 'components'],
|
||||
'Express': ['express', 'app.js', 'routes'],
|
||||
'Laravel': ['artisan', 'app/Http/Controllers', 'app/Models']
|
||||
"Django": ["django", "manage.py", "settings.py", "urls.py"],
|
||||
"Flask": ["flask", "app.py", "wsgi.py"],
|
||||
"Spring": ["springframework", "@Controller", "@Service", "@Repository"],
|
||||
"ASP.NET": ["Controllers", "Models", "Views", ".cshtml", "Startup.cs"],
|
||||
"Rails": ["app/models", "app/views", "app/controllers", "config/routes.rb"],
|
||||
"Angular": ["app.module.ts", "@Component", "@Injectable", "angular.json"],
|
||||
"React": ["package.json", "react", "components"],
|
||||
"Vue.js": ["vue", ".vue", "components"],
|
||||
"Express": ["express", "app.js", "routes"],
|
||||
"Laravel": ["artisan", "app/Http/Controllers", "app/Models"],
|
||||
}
|
||||
|
||||
def __init__(self, enhance_with_ai: bool = True):
|
||||
@@ -113,12 +113,13 @@ class ArchitecturalPatternDetector:
|
||||
if self.enhance_with_ai:
|
||||
try:
|
||||
from skill_seekers.cli.ai_enhancer import AIEnhancer
|
||||
|
||||
self.ai_enhancer = AIEnhancer()
|
||||
except Exception as e:
|
||||
logger.warning(f"⚠️ Failed to initialize AI enhancer: {e}")
|
||||
self.enhance_with_ai = False
|
||||
|
||||
def analyze(self, directory: Path, files_analysis: List[Dict]) -> ArchitecturalReport:
|
||||
def analyze(self, directory: Path, files_analysis: list[dict]) -> ArchitecturalReport:
|
||||
"""
|
||||
Analyze codebase for architectural patterns.
|
||||
|
||||
@@ -151,7 +152,7 @@ class ArchitecturalPatternDetector:
|
||||
patterns=patterns,
|
||||
directory_structure=dir_structure,
|
||||
total_files_analyzed=len(files_analysis),
|
||||
frameworks_detected=frameworks
|
||||
frameworks_detected=frameworks,
|
||||
)
|
||||
|
||||
# Enhance with AI if enabled (C3.6)
|
||||
@@ -161,11 +162,11 @@ class ArchitecturalPatternDetector:
|
||||
logger.info(f"✅ Detected {len(patterns)} architectural patterns")
|
||||
return report
|
||||
|
||||
def _analyze_directory_structure(self, directory: Path) -> Dict[str, int]:
|
||||
def _analyze_directory_structure(self, directory: Path) -> dict[str, int]:
|
||||
"""Analyze directory structure and count files"""
|
||||
structure = defaultdict(int)
|
||||
|
||||
for path in directory.rglob('*'):
|
||||
for path in directory.rglob("*"):
|
||||
if path.is_file():
|
||||
# Get relative directory path
|
||||
rel_dir = path.parent.relative_to(directory)
|
||||
@@ -180,13 +181,13 @@ class ArchitecturalPatternDetector:
|
||||
|
||||
return dict(structure)
|
||||
|
||||
def _detect_frameworks(self, directory: Path, files: List[Dict]) -> List[str]:
|
||||
def _detect_frameworks(self, directory: Path, files: list[dict]) -> list[str]:
|
||||
"""Detect frameworks being used"""
|
||||
detected = []
|
||||
|
||||
# Check file paths and content
|
||||
all_paths = [str(f.get('file', '')) for f in files]
|
||||
all_content = ' '.join(all_paths)
|
||||
all_paths = [str(f.get("file", "")) for f in files]
|
||||
all_content = " ".join(all_paths)
|
||||
|
||||
for framework, markers in self.FRAMEWORK_MARKERS.items():
|
||||
matches = sum(1 for marker in markers if marker.lower() in all_content.lower())
|
||||
@@ -196,7 +197,7 @@ class ArchitecturalPatternDetector:
|
||||
|
||||
return detected
|
||||
|
||||
def _detect_mvc(self, dirs: Dict[str, int], files: List[Dict], frameworks: List[str]) -> List[ArchitecturalPattern]:
|
||||
def _detect_mvc(self, dirs: dict[str, int], files: list[dict], frameworks: list[str]) -> list[ArchitecturalPattern]:
|
||||
"""Detect MVC pattern"""
|
||||
patterns = []
|
||||
|
||||
@@ -213,58 +214,62 @@ class ArchitecturalPatternDetector:
|
||||
|
||||
# Find MVC files
|
||||
for file in files:
|
||||
file_path = str(file.get('file', '')).lower()
|
||||
file_path = str(file.get("file", "")).lower()
|
||||
|
||||
if 'model' in file_path and ('models/' in file_path or '/model/' in file_path):
|
||||
components['Models'].append(file.get('file', ''))
|
||||
if len(components['Models']) == 1:
|
||||
if "model" in file_path and ("models/" in file_path or "/model/" in file_path):
|
||||
components["Models"].append(file.get("file", ""))
|
||||
if len(components["Models"]) == 1:
|
||||
evidence.append("Models directory with model classes")
|
||||
|
||||
if 'view' in file_path and ('views/' in file_path or '/view/' in file_path):
|
||||
components['Views'].append(file.get('file', ''))
|
||||
if len(components['Views']) == 1:
|
||||
if "view" in file_path and ("views/" in file_path or "/view/" in file_path):
|
||||
components["Views"].append(file.get("file", ""))
|
||||
if len(components["Views"]) == 1:
|
||||
evidence.append("Views directory with view files")
|
||||
|
||||
if 'controller' in file_path and ('controllers/' in file_path or '/controller/' in file_path):
|
||||
components['Controllers'].append(file.get('file', ''))
|
||||
if len(components['Controllers']) == 1:
|
||||
if "controller" in file_path and ("controllers/" in file_path or "/controller/" in file_path):
|
||||
components["Controllers"].append(file.get("file", ""))
|
||||
if len(components["Controllers"]) == 1:
|
||||
evidence.append("Controllers directory with controller classes")
|
||||
|
||||
# Calculate confidence
|
||||
has_models = len(components['Models']) > 0
|
||||
has_views = len(components['Views']) > 0
|
||||
has_controllers = len(components['Controllers']) > 0
|
||||
has_models = len(components["Models"]) > 0
|
||||
has_views = len(components["Views"]) > 0
|
||||
has_controllers = len(components["Controllers"]) > 0
|
||||
|
||||
if sum([has_models, has_views, has_controllers]) >= 2:
|
||||
confidence = 0.6 + (sum([has_models, has_views, has_controllers]) * 0.15)
|
||||
|
||||
# Boost confidence if framework detected
|
||||
framework = None
|
||||
for fw in ['Django', 'Flask', 'Spring', 'ASP.NET', 'Rails', 'Laravel']:
|
||||
for fw in ["Django", "Flask", "Spring", "ASP.NET", "Rails", "Laravel"]:
|
||||
if fw in frameworks:
|
||||
confidence = min(0.95, confidence + 0.1)
|
||||
framework = fw
|
||||
evidence.append(f"{fw} framework detected (uses MVC)")
|
||||
break
|
||||
|
||||
patterns.append(ArchitecturalPattern(
|
||||
pattern_name="MVC (Model-View-Controller)",
|
||||
confidence=confidence,
|
||||
evidence=evidence,
|
||||
components=dict(components),
|
||||
framework=framework,
|
||||
description="Separates application into Models (data), Views (UI), and Controllers (logic)"
|
||||
))
|
||||
patterns.append(
|
||||
ArchitecturalPattern(
|
||||
pattern_name="MVC (Model-View-Controller)",
|
||||
confidence=confidence,
|
||||
evidence=evidence,
|
||||
components=dict(components),
|
||||
framework=framework,
|
||||
description="Separates application into Models (data), Views (UI), and Controllers (logic)",
|
||||
)
|
||||
)
|
||||
|
||||
return patterns
|
||||
|
||||
def _detect_mvvm(self, dirs: Dict[str, int], files: List[Dict], frameworks: List[str]) -> List[ArchitecturalPattern]:
|
||||
def _detect_mvvm(
|
||||
self, dirs: dict[str, int], files: list[dict], frameworks: list[str]
|
||||
) -> list[ArchitecturalPattern]:
|
||||
"""Detect MVVM pattern"""
|
||||
patterns = []
|
||||
|
||||
# Look for ViewModels directory or classes ending with ViewModel
|
||||
has_viewmodel_dir = 'viewmodels' in dirs or 'viewmodel' in dirs
|
||||
viewmodel_files = [f for f in files if 'viewmodel' in str(f.get('file', '')).lower()]
|
||||
has_viewmodel_dir = "viewmodels" in dirs or "viewmodel" in dirs
|
||||
viewmodel_files = [f for f in files if "viewmodel" in str(f.get("file", "")).lower()]
|
||||
|
||||
if not (has_viewmodel_dir or len(viewmodel_files) >= 2):
|
||||
return patterns
|
||||
@@ -274,63 +279,68 @@ class ArchitecturalPatternDetector:
|
||||
|
||||
# Find MVVM files
|
||||
for file in files:
|
||||
file_path = str(file.get('file', '')).lower()
|
||||
classes = file.get('classes', [])
|
||||
file_path = str(file.get("file", "")).lower()
|
||||
classes = file.get("classes", [])
|
||||
|
||||
if 'model' in file_path and 'viewmodel' not in file_path:
|
||||
components['Models'].append(file.get('file', ''))
|
||||
if "model" in file_path and "viewmodel" not in file_path:
|
||||
components["Models"].append(file.get("file", ""))
|
||||
|
||||
if 'view' in file_path:
|
||||
components['Views'].append(file.get('file', ''))
|
||||
if "view" in file_path:
|
||||
components["Views"].append(file.get("file", ""))
|
||||
|
||||
if 'viewmodel' in file_path or any('viewmodel' in c.get('name', '').lower() for c in classes):
|
||||
components['ViewModels'].append(file.get('file', ''))
|
||||
if "viewmodel" in file_path or any("viewmodel" in c.get("name", "").lower() for c in classes):
|
||||
components["ViewModels"].append(file.get("file", ""))
|
||||
|
||||
if len(components['ViewModels']) >= 2:
|
||||
if len(components["ViewModels"]) >= 2:
|
||||
evidence.append(f"ViewModels directory with {len(components['ViewModels'])} ViewModel classes")
|
||||
|
||||
if len(components['Views']) >= 2:
|
||||
if len(components["Views"]) >= 2:
|
||||
evidence.append(f"Views directory with {len(components['Views'])} view files")
|
||||
|
||||
if len(components['Models']) >= 1:
|
||||
if len(components["Models"]) >= 1:
|
||||
evidence.append(f"Models directory with {len(components['Models'])} model files")
|
||||
|
||||
# Calculate confidence
|
||||
has_models = len(components['Models']) > 0
|
||||
has_views = len(components['Views']) > 0
|
||||
has_viewmodels = len(components['ViewModels']) >= 2
|
||||
has_models = len(components["Models"]) > 0
|
||||
has_views = len(components["Views"]) > 0
|
||||
has_viewmodels = len(components["ViewModels"]) >= 2
|
||||
|
||||
if has_viewmodels and (has_models or has_views):
|
||||
confidence = 0.7 if (has_models and has_views and has_viewmodels) else 0.6
|
||||
|
||||
framework = None
|
||||
for fw in ['ASP.NET', 'Angular', 'Vue.js']:
|
||||
for fw in ["ASP.NET", "Angular", "Vue.js"]:
|
||||
if fw in frameworks:
|
||||
confidence = min(0.95, confidence + 0.1)
|
||||
framework = fw
|
||||
evidence.append(f"{fw} framework detected (supports MVVM)")
|
||||
break
|
||||
|
||||
patterns.append(ArchitecturalPattern(
|
||||
pattern_name="MVVM (Model-View-ViewModel)",
|
||||
confidence=confidence,
|
||||
evidence=evidence,
|
||||
components=dict(components),
|
||||
framework=framework,
|
||||
description="ViewModels provide data-binding between Views and Models"
|
||||
))
|
||||
patterns.append(
|
||||
ArchitecturalPattern(
|
||||
pattern_name="MVVM (Model-View-ViewModel)",
|
||||
confidence=confidence,
|
||||
evidence=evidence,
|
||||
components=dict(components),
|
||||
framework=framework,
|
||||
description="ViewModels provide data-binding between Views and Models",
|
||||
)
|
||||
)
|
||||
|
||||
return patterns
|
||||
|
||||
def _detect_repository(self, dirs: Dict[str, int], files: List[Dict]) -> List[ArchitecturalPattern]:
|
||||
def _detect_repository(self, dirs: dict[str, int], files: list[dict]) -> list[ArchitecturalPattern]:
|
||||
"""Detect Repository pattern"""
|
||||
patterns = []
|
||||
|
||||
# Look for repositories directory or classes ending with Repository
|
||||
has_repo_dir = any(d in dirs for d in self.REPO_DIRS)
|
||||
repo_files = [f for f in files
|
||||
if 'repository' in str(f.get('file', '')).lower() or
|
||||
any('repository' in c.get('name', '').lower() for c in f.get('classes', []))]
|
||||
repo_files = [
|
||||
f
|
||||
for f in files
|
||||
if "repository" in str(f.get("file", "")).lower()
|
||||
or any("repository" in c.get("name", "").lower() for c in f.get("classes", []))
|
||||
]
|
||||
|
||||
if not (has_repo_dir or len(repo_files) >= 2):
|
||||
return patterns
|
||||
@@ -339,30 +349,35 @@ class ArchitecturalPatternDetector:
|
||||
components = defaultdict(list)
|
||||
|
||||
for file in repo_files:
|
||||
components['Repositories'].append(file.get('file', ''))
|
||||
components["Repositories"].append(file.get("file", ""))
|
||||
|
||||
if len(components['Repositories']) >= 2:
|
||||
if len(components["Repositories"]) >= 2:
|
||||
evidence.append(f"Repository pattern: {len(components['Repositories'])} repository classes")
|
||||
evidence.append("Repositories abstract data access logic")
|
||||
|
||||
patterns.append(ArchitecturalPattern(
|
||||
pattern_name="Repository Pattern",
|
||||
confidence=0.75,
|
||||
evidence=evidence,
|
||||
components=dict(components),
|
||||
description="Encapsulates data access logic in repository classes"
|
||||
))
|
||||
patterns.append(
|
||||
ArchitecturalPattern(
|
||||
pattern_name="Repository Pattern",
|
||||
confidence=0.75,
|
||||
evidence=evidence,
|
||||
components=dict(components),
|
||||
description="Encapsulates data access logic in repository classes",
|
||||
)
|
||||
)
|
||||
|
||||
return patterns
|
||||
|
||||
def _detect_service_layer(self, dirs: Dict[str, int], files: List[Dict]) -> List[ArchitecturalPattern]:
|
||||
def _detect_service_layer(self, dirs: dict[str, int], files: list[dict]) -> list[ArchitecturalPattern]:
|
||||
"""Detect Service Layer pattern"""
|
||||
patterns = []
|
||||
|
||||
has_service_dir = any(d in dirs for d in self.SERVICE_DIRS)
|
||||
service_files = [f for f in files
|
||||
if 'service' in str(f.get('file', '')).lower() or
|
||||
any('service' in c.get('name', '').lower() for c in f.get('classes', []))]
|
||||
service_files = [
|
||||
f
|
||||
for f in files
|
||||
if "service" in str(f.get("file", "")).lower()
|
||||
or any("service" in c.get("name", "").lower() for c in f.get("classes", []))
|
||||
]
|
||||
|
||||
if not (has_service_dir or len(service_files) >= 3):
|
||||
return patterns
|
||||
@@ -371,23 +386,25 @@ class ArchitecturalPatternDetector:
|
||||
components = defaultdict(list)
|
||||
|
||||
for file in service_files:
|
||||
components['Services'].append(file.get('file', ''))
|
||||
components["Services"].append(file.get("file", ""))
|
||||
|
||||
if len(components['Services']) >= 3:
|
||||
if len(components["Services"]) >= 3:
|
||||
evidence.append(f"Service layer: {len(components['Services'])} service classes")
|
||||
evidence.append("Services encapsulate business logic")
|
||||
|
||||
patterns.append(ArchitecturalPattern(
|
||||
pattern_name="Service Layer Pattern",
|
||||
confidence=0.75,
|
||||
evidence=evidence,
|
||||
components=dict(components),
|
||||
description="Encapsulates business logic in service classes"
|
||||
))
|
||||
patterns.append(
|
||||
ArchitecturalPattern(
|
||||
pattern_name="Service Layer Pattern",
|
||||
confidence=0.75,
|
||||
evidence=evidence,
|
||||
components=dict(components),
|
||||
description="Encapsulates business logic in service classes",
|
||||
)
|
||||
)
|
||||
|
||||
return patterns
|
||||
|
||||
def _detect_layered_architecture(self, dirs: Dict[str, int], files: List[Dict]) -> List[ArchitecturalPattern]:
|
||||
def _detect_layered_architecture(self, dirs: dict[str, int], files: list[dict]) -> list[ArchitecturalPattern]:
|
||||
"""Detect Layered Architecture (3-tier, N-tier)"""
|
||||
patterns = []
|
||||
|
||||
@@ -400,32 +417,34 @@ class ArchitecturalPatternDetector:
|
||||
components = defaultdict(list)
|
||||
layers_found = []
|
||||
|
||||
if 'presentation' in dirs or 'ui' in dirs:
|
||||
if "presentation" in dirs or "ui" in dirs:
|
||||
layers_found.append("Presentation Layer")
|
||||
evidence.append("Presentation/UI layer detected")
|
||||
|
||||
if 'business' in dirs or 'bll' in dirs:
|
||||
if "business" in dirs or "bll" in dirs:
|
||||
layers_found.append("Business Logic Layer")
|
||||
evidence.append("Business logic layer detected")
|
||||
|
||||
if 'data' in dirs or 'dal' in dirs:
|
||||
if "data" in dirs or "dal" in dirs:
|
||||
layers_found.append("Data Access Layer")
|
||||
evidence.append("Data access layer detected")
|
||||
|
||||
if len(layers_found) >= 2:
|
||||
confidence = 0.65 + (len(layers_found) * 0.1)
|
||||
|
||||
patterns.append(ArchitecturalPattern(
|
||||
pattern_name=f"Layered Architecture ({len(layers_found)}-tier)",
|
||||
confidence=min(confidence, 0.9),
|
||||
evidence=evidence,
|
||||
components={'Layers': layers_found},
|
||||
description=f"Separates concerns into {len(layers_found)} distinct layers"
|
||||
))
|
||||
patterns.append(
|
||||
ArchitecturalPattern(
|
||||
pattern_name=f"Layered Architecture ({len(layers_found)}-tier)",
|
||||
confidence=min(confidence, 0.9),
|
||||
evidence=evidence,
|
||||
components={"Layers": layers_found},
|
||||
description=f"Separates concerns into {len(layers_found)} distinct layers",
|
||||
)
|
||||
)
|
||||
|
||||
return patterns
|
||||
|
||||
def _detect_clean_architecture(self, dirs: Dict[str, int], files: List[Dict]) -> List[ArchitecturalPattern]:
|
||||
def _detect_clean_architecture(self, dirs: dict[str, int], files: list[dict]) -> list[ArchitecturalPattern]:
|
||||
"""Detect Clean Architecture"""
|
||||
patterns = []
|
||||
|
||||
@@ -437,50 +456,52 @@ class ArchitecturalPatternDetector:
|
||||
evidence = []
|
||||
components = defaultdict(list)
|
||||
|
||||
if 'domain' in dirs:
|
||||
if "domain" in dirs:
|
||||
evidence.append("Domain layer (core business logic)")
|
||||
components['Domain'].append('domain/')
|
||||
components["Domain"].append("domain/")
|
||||
|
||||
if 'application' in dirs:
|
||||
if "application" in dirs:
|
||||
evidence.append("Application layer (use cases)")
|
||||
components['Application'].append('application/')
|
||||
components["Application"].append("application/")
|
||||
|
||||
if 'infrastructure' in dirs:
|
||||
if "infrastructure" in dirs:
|
||||
evidence.append("Infrastructure layer (external dependencies)")
|
||||
components['Infrastructure'].append('infrastructure/')
|
||||
components["Infrastructure"].append("infrastructure/")
|
||||
|
||||
if 'presentation' in dirs:
|
||||
if "presentation" in dirs:
|
||||
evidence.append("Presentation layer (UI/API)")
|
||||
components['Presentation'].append('presentation/')
|
||||
components["Presentation"].append("presentation/")
|
||||
|
||||
if len(components) >= 3:
|
||||
patterns.append(ArchitecturalPattern(
|
||||
pattern_name="Clean Architecture",
|
||||
confidence=0.85,
|
||||
evidence=evidence,
|
||||
components=dict(components),
|
||||
description="Dependency inversion with domain at center, infrastructure at edges"
|
||||
))
|
||||
patterns.append(
|
||||
ArchitecturalPattern(
|
||||
pattern_name="Clean Architecture",
|
||||
confidence=0.85,
|
||||
evidence=evidence,
|
||||
components=dict(components),
|
||||
description="Dependency inversion with domain at center, infrastructure at edges",
|
||||
)
|
||||
)
|
||||
|
||||
return patterns
|
||||
|
||||
def _enhance_with_ai(self, report: ArchitecturalReport) -> Dict:
|
||||
def _enhance_with_ai(self, report: ArchitecturalReport) -> dict:
|
||||
"""Enhance architectural analysis with AI insights"""
|
||||
if not self.ai_enhancer:
|
||||
return {}
|
||||
|
||||
# Prepare summary for AI
|
||||
summary = f"""Detected {len(report.patterns)} architectural patterns:
|
||||
{chr(10).join(f'- {p.pattern_name} (confidence: {p.confidence:.2f})' for p in report.patterns)}
|
||||
{chr(10).join(f"- {p.pattern_name} (confidence: {p.confidence:.2f})" for p in report.patterns)}
|
||||
|
||||
Frameworks: {', '.join(report.frameworks_detected) if report.frameworks_detected else 'None'}
|
||||
Frameworks: {", ".join(report.frameworks_detected) if report.frameworks_detected else "None"}
|
||||
Total files: {report.total_files_analyzed}
|
||||
|
||||
Provide brief architectural insights and recommendations."""
|
||||
|
||||
try:
|
||||
response = self.ai_enhancer._call_claude(summary, max_tokens=500)
|
||||
return {'insights': response} if response else {}
|
||||
return {"insights": response} if response else {}
|
||||
except Exception as e:
|
||||
logger.warning(f"⚠️ AI enhancement failed: {e}")
|
||||
return {}
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -24,65 +24,80 @@ Credits:
|
||||
- pathspec for .gitignore support: https://pypi.org/project/pathspec/
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import argparse
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional, Any
|
||||
from typing import Any
|
||||
|
||||
# Add parent directory to path for imports
|
||||
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
|
||||
from skill_seekers.cli.code_analyzer import CodeAnalyzer
|
||||
from skill_seekers.cli.api_reference_builder import APIReferenceBuilder
|
||||
from skill_seekers.cli.dependency_analyzer import DependencyAnalyzer
|
||||
from skill_seekers.cli.code_analyzer import CodeAnalyzer
|
||||
from skill_seekers.cli.config_extractor import ConfigExtractor
|
||||
from skill_seekers.cli.dependency_analyzer import DependencyAnalyzer
|
||||
|
||||
# Try to import pathspec for .gitignore support
|
||||
try:
|
||||
import pathspec
|
||||
|
||||
PATHSPEC_AVAILABLE = True
|
||||
except ImportError:
|
||||
PATHSPEC_AVAILABLE = False
|
||||
|
||||
# Configure logging
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format='%(asctime)s - %(levelname)s - %(message)s'
|
||||
)
|
||||
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# Language extension mapping
|
||||
LANGUAGE_EXTENSIONS = {
|
||||
'.py': 'Python',
|
||||
'.js': 'JavaScript',
|
||||
'.jsx': 'JavaScript',
|
||||
'.ts': 'TypeScript',
|
||||
'.tsx': 'TypeScript',
|
||||
'.cpp': 'C++',
|
||||
'.cc': 'C++',
|
||||
'.cxx': 'C++',
|
||||
'.h': 'C++',
|
||||
'.hpp': 'C++',
|
||||
'.hxx': 'C++',
|
||||
'.c': 'C',
|
||||
'.cs': 'C#',
|
||||
'.go': 'Go',
|
||||
'.rs': 'Rust',
|
||||
'.java': 'Java',
|
||||
'.rb': 'Ruby',
|
||||
'.php': 'PHP',
|
||||
".py": "Python",
|
||||
".js": "JavaScript",
|
||||
".jsx": "JavaScript",
|
||||
".ts": "TypeScript",
|
||||
".tsx": "TypeScript",
|
||||
".cpp": "C++",
|
||||
".cc": "C++",
|
||||
".cxx": "C++",
|
||||
".h": "C++",
|
||||
".hpp": "C++",
|
||||
".hxx": "C++",
|
||||
".c": "C",
|
||||
".cs": "C#",
|
||||
".go": "Go",
|
||||
".rs": "Rust",
|
||||
".java": "Java",
|
||||
".rb": "Ruby",
|
||||
".php": "PHP",
|
||||
}
|
||||
|
||||
# Default directories to exclude
|
||||
DEFAULT_EXCLUDED_DIRS = {
|
||||
'node_modules', 'venv', '__pycache__', '.git', '.svn', '.hg',
|
||||
'build', 'dist', 'target', '.pytest_cache', '.tox', '.mypy_cache',
|
||||
'htmlcov', 'coverage', '.coverage', '.eggs', '*.egg-info',
|
||||
'.idea', '.vscode', '.vs', '__pypackages__'
|
||||
"node_modules",
|
||||
"venv",
|
||||
"__pycache__",
|
||||
".git",
|
||||
".svn",
|
||||
".hg",
|
||||
"build",
|
||||
"dist",
|
||||
"target",
|
||||
".pytest_cache",
|
||||
".tox",
|
||||
".mypy_cache",
|
||||
"htmlcov",
|
||||
"coverage",
|
||||
".coverage",
|
||||
".eggs",
|
||||
"*.egg-info",
|
||||
".idea",
|
||||
".vscode",
|
||||
".vs",
|
||||
"__pypackages__",
|
||||
}
|
||||
|
||||
|
||||
@@ -97,10 +112,10 @@ def detect_language(file_path: Path) -> str:
|
||||
Language name or 'Unknown'
|
||||
"""
|
||||
extension = file_path.suffix.lower()
|
||||
return LANGUAGE_EXTENSIONS.get(extension, 'Unknown')
|
||||
return LANGUAGE_EXTENSIONS.get(extension, "Unknown")
|
||||
|
||||
|
||||
def load_gitignore(directory: Path) -> Optional[pathspec.PathSpec]:
|
||||
def load_gitignore(directory: Path) -> pathspec.PathSpec | None:
|
||||
"""
|
||||
Load .gitignore file and create pathspec matcher.
|
||||
|
||||
@@ -115,14 +130,14 @@ def load_gitignore(directory: Path) -> Optional[pathspec.PathSpec]:
|
||||
logger.warning("Install with: pip install pathspec")
|
||||
return None
|
||||
|
||||
gitignore_path = directory / '.gitignore'
|
||||
gitignore_path = directory / ".gitignore"
|
||||
if not gitignore_path.exists():
|
||||
logger.debug(f"No .gitignore found in {directory}")
|
||||
return None
|
||||
|
||||
try:
|
||||
with open(gitignore_path, 'r', encoding='utf-8') as f:
|
||||
spec = pathspec.PathSpec.from_lines('gitwildmatch', f)
|
||||
with open(gitignore_path, encoding="utf-8") as f:
|
||||
spec = pathspec.PathSpec.from_lines("gitwildmatch", f)
|
||||
logger.info(f"Loaded .gitignore from {gitignore_path}")
|
||||
return spec
|
||||
except Exception as e:
|
||||
@@ -146,10 +161,10 @@ def should_exclude_dir(dir_name: str, excluded_dirs: set) -> bool:
|
||||
|
||||
def walk_directory(
|
||||
root: Path,
|
||||
patterns: Optional[List[str]] = None,
|
||||
gitignore_spec: Optional[pathspec.PathSpec] = None,
|
||||
excluded_dirs: Optional[set] = None
|
||||
) -> List[Path]:
|
||||
patterns: list[str] | None = None,
|
||||
gitignore_spec: pathspec.PathSpec | None = None,
|
||||
excluded_dirs: set | None = None,
|
||||
) -> list[Path]:
|
||||
"""
|
||||
Walk directory tree and collect source files.
|
||||
|
||||
@@ -205,9 +220,9 @@ def walk_directory(
|
||||
def analyze_codebase(
|
||||
directory: Path,
|
||||
output_dir: Path,
|
||||
depth: str = 'deep',
|
||||
languages: Optional[List[str]] = None,
|
||||
file_patterns: Optional[List[str]] = None,
|
||||
depth: str = "deep",
|
||||
languages: list[str] | None = None,
|
||||
file_patterns: list[str] | None = None,
|
||||
build_api_reference: bool = True,
|
||||
extract_comments: bool = True,
|
||||
build_dependency_graph: bool = True,
|
||||
@@ -216,8 +231,8 @@ def analyze_codebase(
|
||||
build_how_to_guides: bool = True,
|
||||
extract_config_patterns: bool = True,
|
||||
enhance_with_ai: bool = True,
|
||||
ai_mode: str = "auto"
|
||||
) -> Dict[str, Any]:
|
||||
ai_mode: str = "auto",
|
||||
) -> dict[str, Any]:
|
||||
"""
|
||||
Analyze local codebase and extract code knowledge.
|
||||
|
||||
@@ -255,11 +270,7 @@ def analyze_codebase(
|
||||
|
||||
# Walk directory tree
|
||||
logger.info("Scanning directory tree...")
|
||||
files = walk_directory(
|
||||
directory,
|
||||
patterns=file_patterns,
|
||||
gitignore_spec=gitignore_spec
|
||||
)
|
||||
files = walk_directory(directory, patterns=file_patterns, gitignore_spec=gitignore_spec)
|
||||
|
||||
logger.info(f"Found {len(files)} source files")
|
||||
|
||||
@@ -273,27 +284,25 @@ def analyze_codebase(
|
||||
analyzer = CodeAnalyzer(depth=depth)
|
||||
|
||||
# Analyze each file
|
||||
results = {'files': []}
|
||||
results = {"files": []}
|
||||
analyzed_count = 0
|
||||
|
||||
for file_path in files:
|
||||
try:
|
||||
content = file_path.read_text(encoding='utf-8', errors='ignore')
|
||||
content = file_path.read_text(encoding="utf-8", errors="ignore")
|
||||
language = detect_language(file_path)
|
||||
|
||||
if language == 'Unknown':
|
||||
if language == "Unknown":
|
||||
continue
|
||||
|
||||
# Analyze file
|
||||
analysis = analyzer.analyze_file(str(file_path), content, language)
|
||||
|
||||
# Only include files with actual analysis results
|
||||
if analysis and (analysis.get('classes') or analysis.get('functions')):
|
||||
results['files'].append({
|
||||
'file': str(file_path.relative_to(directory)),
|
||||
'language': language,
|
||||
**analysis
|
||||
})
|
||||
if analysis and (analysis.get("classes") or analysis.get("functions")):
|
||||
results["files"].append(
|
||||
{"file": str(file_path.relative_to(directory)), "language": language, **analysis}
|
||||
)
|
||||
analyzed_count += 1
|
||||
|
||||
if analyzed_count % 10 == 0:
|
||||
@@ -306,17 +315,17 @@ def analyze_codebase(
|
||||
logger.info(f"✅ Successfully analyzed {analyzed_count} files")
|
||||
|
||||
# Save results
|
||||
output_json = output_dir / 'code_analysis.json'
|
||||
with open(output_json, 'w', encoding='utf-8') as f:
|
||||
output_json = output_dir / "code_analysis.json"
|
||||
with open(output_json, "w", encoding="utf-8") as f:
|
||||
json.dump(results, f, indent=2)
|
||||
|
||||
logger.info(f"📁 Saved analysis to: {output_json}")
|
||||
|
||||
# Build API reference if requested
|
||||
if build_api_reference and results['files']:
|
||||
if build_api_reference and results["files"]:
|
||||
logger.info("Building API reference documentation...")
|
||||
builder = APIReferenceBuilder(results)
|
||||
api_output_dir = output_dir / 'api_reference'
|
||||
api_output_dir = output_dir / "api_reference"
|
||||
generated_files = builder.build_reference(api_output_dir)
|
||||
logger.info(f"✅ Generated {len(generated_files)} API reference files")
|
||||
logger.info(f"📁 API reference: {api_output_dir}")
|
||||
@@ -329,10 +338,10 @@ def analyze_codebase(
|
||||
# Analyze dependencies for all files
|
||||
for file_path in files:
|
||||
try:
|
||||
content = file_path.read_text(encoding='utf-8', errors='ignore')
|
||||
content = file_path.read_text(encoding="utf-8", errors="ignore")
|
||||
language = detect_language(file_path)
|
||||
|
||||
if language != 'Unknown':
|
||||
if language != "Unknown":
|
||||
# Use relative path from directory for better graph readability
|
||||
rel_path = str(file_path.relative_to(directory))
|
||||
dep_analyzer.analyze_file(rel_path, content, language)
|
||||
@@ -348,7 +357,7 @@ def analyze_codebase(
|
||||
if cycles:
|
||||
logger.warning(f"⚠️ Found {len(cycles)} circular dependencies:")
|
||||
for i, cycle in enumerate(cycles[:5], 1): # Show first 5
|
||||
cycle_str = ' → '.join(cycle) + f" → {cycle[0]}"
|
||||
cycle_str = " → ".join(cycle) + f" → {cycle[0]}"
|
||||
logger.warning(f" {i}. {cycle_str}")
|
||||
if len(cycles) > 5:
|
||||
logger.warning(f" ... and {len(cycles) - 5} more")
|
||||
@@ -356,32 +365,34 @@ def analyze_codebase(
|
||||
logger.info("✅ No circular dependencies found")
|
||||
|
||||
# Save dependency graph data
|
||||
dep_output_dir = output_dir / 'dependencies'
|
||||
dep_output_dir = output_dir / "dependencies"
|
||||
dep_output_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Export as JSON
|
||||
dep_json = dep_output_dir / 'dependency_graph.json'
|
||||
with open(dep_json, 'w', encoding='utf-8') as f:
|
||||
dep_json = dep_output_dir / "dependency_graph.json"
|
||||
with open(dep_json, "w", encoding="utf-8") as f:
|
||||
json.dump(dep_analyzer.export_json(), f, indent=2)
|
||||
logger.info(f"📁 Saved dependency graph: {dep_json}")
|
||||
|
||||
# Export as Mermaid diagram
|
||||
mermaid_file = dep_output_dir / 'dependency_graph.mmd'
|
||||
mermaid_file = dep_output_dir / "dependency_graph.mmd"
|
||||
mermaid_file.write_text(dep_analyzer.export_mermaid())
|
||||
logger.info(f"📁 Saved Mermaid diagram: {mermaid_file}")
|
||||
|
||||
# Save statistics
|
||||
stats = dep_analyzer.get_statistics()
|
||||
stats_file = dep_output_dir / 'statistics.json'
|
||||
with open(stats_file, 'w', encoding='utf-8') as f:
|
||||
stats_file = dep_output_dir / "statistics.json"
|
||||
with open(stats_file, "w", encoding="utf-8") as f:
|
||||
json.dump(stats, f, indent=2)
|
||||
logger.info(f"📊 Statistics: {stats['total_files']} files, "
|
||||
f"{stats['total_dependencies']} dependencies, "
|
||||
f"{stats['circular_dependencies']} cycles")
|
||||
logger.info(
|
||||
f"📊 Statistics: {stats['total_files']} files, "
|
||||
f"{stats['total_dependencies']} dependencies, "
|
||||
f"{stats['circular_dependencies']} cycles"
|
||||
)
|
||||
|
||||
# Try to export as DOT (requires pydot)
|
||||
try:
|
||||
dot_file = dep_output_dir / 'dependency_graph.dot'
|
||||
dot_file = dep_output_dir / "dependency_graph.dot"
|
||||
dep_analyzer.export_dot(str(dot_file))
|
||||
except:
|
||||
pass # pydot not installed, skip DOT export
|
||||
@@ -396,13 +407,11 @@ def analyze_codebase(
|
||||
|
||||
for file_path in files:
|
||||
try:
|
||||
content = file_path.read_text(encoding='utf-8', errors='ignore')
|
||||
content = file_path.read_text(encoding="utf-8", errors="ignore")
|
||||
language = detect_language(file_path)
|
||||
|
||||
if language != 'Unknown':
|
||||
report = pattern_recognizer.analyze_file(
|
||||
str(file_path), content, language
|
||||
)
|
||||
if language != "Unknown":
|
||||
report = pattern_recognizer.analyze_file(str(file_path), content, language)
|
||||
|
||||
if report.patterns:
|
||||
pattern_results.append(report.to_dict())
|
||||
@@ -412,14 +421,14 @@ def analyze_codebase(
|
||||
|
||||
# Save pattern results
|
||||
if pattern_results:
|
||||
pattern_output = output_dir / 'patterns'
|
||||
pattern_output = output_dir / "patterns"
|
||||
pattern_output.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
pattern_json = pattern_output / 'detected_patterns.json'
|
||||
with open(pattern_json, 'w', encoding='utf-8') as f:
|
||||
pattern_json = pattern_output / "detected_patterns.json"
|
||||
with open(pattern_json, "w", encoding="utf-8") as f:
|
||||
json.dump(pattern_results, f, indent=2)
|
||||
|
||||
total_patterns = sum(len(r['patterns']) for r in pattern_results)
|
||||
total_patterns = sum(len(r["patterns"]) for r in pattern_results)
|
||||
logger.info(f"✅ Detected {total_patterns} patterns in {len(pattern_results)} files")
|
||||
logger.info(f"📁 Saved to: {pattern_json}")
|
||||
else:
|
||||
@@ -432,35 +441,31 @@ def analyze_codebase(
|
||||
|
||||
# Create extractor
|
||||
test_extractor = TestExampleExtractor(
|
||||
min_confidence=0.5,
|
||||
max_per_file=10,
|
||||
languages=languages,
|
||||
enhance_with_ai=enhance_with_ai
|
||||
min_confidence=0.5, max_per_file=10, languages=languages, enhance_with_ai=enhance_with_ai
|
||||
)
|
||||
|
||||
# Extract examples from directory
|
||||
try:
|
||||
example_report = test_extractor.extract_from_directory(
|
||||
directory,
|
||||
recursive=True
|
||||
)
|
||||
example_report = test_extractor.extract_from_directory(directory, recursive=True)
|
||||
|
||||
if example_report.total_examples > 0:
|
||||
# Save results
|
||||
examples_output = output_dir / 'test_examples'
|
||||
examples_output = output_dir / "test_examples"
|
||||
examples_output.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Save as JSON
|
||||
examples_json = examples_output / 'test_examples.json'
|
||||
with open(examples_json, 'w', encoding='utf-8') as f:
|
||||
examples_json = examples_output / "test_examples.json"
|
||||
with open(examples_json, "w", encoding="utf-8") as f:
|
||||
json.dump(example_report.to_dict(), f, indent=2)
|
||||
|
||||
# Save as Markdown
|
||||
examples_md = examples_output / 'test_examples.md'
|
||||
examples_md.write_text(example_report.to_markdown(), encoding='utf-8')
|
||||
examples_md = examples_output / "test_examples.md"
|
||||
examples_md.write_text(example_report.to_markdown(), encoding="utf-8")
|
||||
|
||||
logger.info(f"✅ Extracted {example_report.total_examples} test examples "
|
||||
f"({example_report.high_value_count} high-value)")
|
||||
logger.info(
|
||||
f"✅ Extracted {example_report.total_examples} test examples "
|
||||
f"({example_report.high_value_count} high-value)"
|
||||
)
|
||||
logger.info(f"📁 Saved to: {examples_output}")
|
||||
else:
|
||||
logger.info("No test examples extracted")
|
||||
@@ -479,25 +484,25 @@ def analyze_codebase(
|
||||
guide_builder = HowToGuideBuilder(enhance_with_ai=enhance_with_ai)
|
||||
|
||||
# Build guides from workflow examples
|
||||
tutorials_dir = output_dir / 'tutorials'
|
||||
tutorials_dir = output_dir / "tutorials"
|
||||
|
||||
# Get workflow examples from the example_report if available
|
||||
if 'example_report' in locals() and example_report and example_report.total_examples > 0:
|
||||
if "example_report" in locals() and example_report and example_report.total_examples > 0:
|
||||
# Convert example_report to list of dicts for processing
|
||||
examples_list = example_report.to_dict().get('examples', [])
|
||||
examples_list = example_report.to_dict().get("examples", [])
|
||||
|
||||
guide_collection = guide_builder.build_guides_from_examples(
|
||||
examples_list,
|
||||
grouping_strategy='ai-tutorial-group',
|
||||
grouping_strategy="ai-tutorial-group",
|
||||
output_dir=tutorials_dir,
|
||||
enhance_with_ai=enhance_with_ai,
|
||||
ai_mode=ai_mode
|
||||
ai_mode=ai_mode,
|
||||
)
|
||||
|
||||
if guide_collection and guide_collection.total_guides > 0:
|
||||
# Save collection summary
|
||||
collection_json = tutorials_dir / 'guide_collection.json'
|
||||
with open(collection_json, 'w', encoding='utf-8') as f:
|
||||
collection_json = tutorials_dir / "guide_collection.json"
|
||||
with open(collection_json, "w", encoding="utf-8") as f:
|
||||
json.dump(guide_collection.to_dict(), f, indent=2)
|
||||
|
||||
logger.info(f"✅ Built {guide_collection.total_guides} how-to guides")
|
||||
@@ -524,9 +529,10 @@ def analyze_codebase(
|
||||
result_dict = config_extractor.to_dict(extraction_result)
|
||||
|
||||
# AI Enhancement (if enabled)
|
||||
if enhance_with_ai and ai_mode != 'none':
|
||||
if enhance_with_ai and ai_mode != "none":
|
||||
try:
|
||||
from skill_seekers.cli.config_enhancer import ConfigEnhancer
|
||||
|
||||
logger.info(f"🤖 Enhancing config analysis with AI (mode: {ai_mode})...")
|
||||
enhancer = ConfigEnhancer(mode=ai_mode)
|
||||
result_dict = enhancer.enhance_config_result(result_dict)
|
||||
@@ -535,28 +541,30 @@ def analyze_codebase(
|
||||
logger.warning(f"⚠️ Config AI enhancement failed: {e}")
|
||||
|
||||
# Save results
|
||||
config_output = output_dir / 'config_patterns'
|
||||
config_output = output_dir / "config_patterns"
|
||||
config_output.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Save as JSON
|
||||
config_json = config_output / 'config_patterns.json'
|
||||
with open(config_json, 'w', encoding='utf-8') as f:
|
||||
config_json = config_output / "config_patterns.json"
|
||||
with open(config_json, "w", encoding="utf-8") as f:
|
||||
json.dump(result_dict, f, indent=2)
|
||||
|
||||
# Save as Markdown (basic - AI enhancements in JSON only for now)
|
||||
config_md = config_output / 'config_patterns.md'
|
||||
config_md.write_text(extraction_result.to_markdown(), encoding='utf-8')
|
||||
config_md = config_output / "config_patterns.md"
|
||||
config_md.write_text(extraction_result.to_markdown(), encoding="utf-8")
|
||||
|
||||
# Count total settings across all files
|
||||
total_settings = sum(len(cf.settings) for cf in extraction_result.config_files)
|
||||
total_patterns = sum(len(cf.patterns) for cf in extraction_result.config_files)
|
||||
|
||||
logger.info(f"✅ Extracted {len(extraction_result.config_files)} config files "
|
||||
f"with {total_settings} settings and {total_patterns} detected patterns")
|
||||
logger.info(
|
||||
f"✅ Extracted {len(extraction_result.config_files)} config files "
|
||||
f"with {total_settings} settings and {total_patterns} detected patterns"
|
||||
)
|
||||
|
||||
if 'ai_enhancements' in result_dict:
|
||||
insights = result_dict['ai_enhancements'].get('overall_insights', {})
|
||||
if insights.get('security_issues_found'):
|
||||
if "ai_enhancements" in result_dict:
|
||||
insights = result_dict["ai_enhancements"].get("overall_insights", {})
|
||||
if insights.get("security_issues_found"):
|
||||
logger.info(f"🔐 Security issues found: {insights['security_issues_found']}")
|
||||
|
||||
logger.info(f"📁 Saved to: {config_output}")
|
||||
@@ -572,15 +580,15 @@ def analyze_codebase(
|
||||
from skill_seekers.cli.architectural_pattern_detector import ArchitecturalPatternDetector
|
||||
|
||||
arch_detector = ArchitecturalPatternDetector(enhance_with_ai=enhance_with_ai)
|
||||
arch_report = arch_detector.analyze(directory, results['files'])
|
||||
arch_report = arch_detector.analyze(directory, results["files"])
|
||||
|
||||
if arch_report.patterns:
|
||||
arch_output = output_dir / 'architecture'
|
||||
arch_output = output_dir / "architecture"
|
||||
arch_output.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Save as JSON
|
||||
arch_json = arch_output / 'architectural_patterns.json'
|
||||
with open(arch_json, 'w', encoding='utf-8') as f:
|
||||
arch_json = arch_output / "architectural_patterns.json"
|
||||
with open(arch_json, "w", encoding="utf-8") as f:
|
||||
json.dump(arch_report.to_dict(), f, indent=2)
|
||||
|
||||
logger.info(f"🏗️ Detected {len(arch_report.patterns)} architectural patterns")
|
||||
@@ -601,7 +609,7 @@ def analyze_codebase(
|
||||
build_dependency_graph=build_dependency_graph,
|
||||
detect_patterns=detect_patterns,
|
||||
extract_test_examples=extract_test_examples,
|
||||
extract_config_patterns=extract_config_patterns
|
||||
extract_config_patterns=extract_config_patterns,
|
||||
)
|
||||
|
||||
return results
|
||||
@@ -610,13 +618,13 @@ def analyze_codebase(
|
||||
def _generate_skill_md(
|
||||
output_dir: Path,
|
||||
directory: Path,
|
||||
results: Dict[str, Any],
|
||||
results: dict[str, Any],
|
||||
depth: str,
|
||||
build_api_reference: bool,
|
||||
build_dependency_graph: bool,
|
||||
detect_patterns: bool,
|
||||
extract_test_examples: bool,
|
||||
extract_config_patterns: bool
|
||||
extract_config_patterns: bool,
|
||||
):
|
||||
"""
|
||||
Generate rich SKILL.md from codebase analysis results.
|
||||
@@ -635,14 +643,14 @@ def _generate_skill_md(
|
||||
repo_name = directory.name
|
||||
|
||||
# Generate skill name (lowercase, hyphens only, max 64 chars)
|
||||
skill_name = repo_name.lower().replace('_', '-').replace(' ', '-')[:64]
|
||||
skill_name = repo_name.lower().replace("_", "-").replace(" ", "-")[:64]
|
||||
|
||||
# Generate description
|
||||
description = f"Local codebase analysis for {repo_name}"
|
||||
|
||||
# Count files by language
|
||||
language_stats = _get_language_stats(results.get('files', []))
|
||||
total_files = len(results.get('files', []))
|
||||
language_stats = _get_language_stats(results.get("files", []))
|
||||
total_files = len(results.get("files", []))
|
||||
|
||||
# Start building content
|
||||
skill_content = f"""---
|
||||
@@ -658,7 +666,7 @@ Local codebase analysis and documentation generated from code analysis.
|
||||
|
||||
**Path:** `{directory}`
|
||||
**Files Analyzed:** {total_files}
|
||||
**Languages:** {', '.join(language_stats.keys())}
|
||||
**Languages:** {", ".join(language_stats.keys())}
|
||||
**Analysis Depth:** {depth}
|
||||
|
||||
## When to Use This Skill
|
||||
@@ -732,22 +740,22 @@ Use this skill when you need to:
|
||||
skill_content += "This skill includes detailed reference documentation:\n\n"
|
||||
|
||||
refs_added = False
|
||||
if build_api_reference and (output_dir / 'api_reference').exists():
|
||||
if build_api_reference and (output_dir / "api_reference").exists():
|
||||
skill_content += "- **API Reference**: `references/api_reference/` - Complete API documentation\n"
|
||||
refs_added = True
|
||||
if build_dependency_graph and (output_dir / 'dependencies').exists():
|
||||
if build_dependency_graph and (output_dir / "dependencies").exists():
|
||||
skill_content += "- **Dependencies**: `references/dependencies/` - Dependency graph and analysis\n"
|
||||
refs_added = True
|
||||
if detect_patterns and (output_dir / 'patterns').exists():
|
||||
if detect_patterns and (output_dir / "patterns").exists():
|
||||
skill_content += "- **Patterns**: `references/patterns/` - Detected design patterns\n"
|
||||
refs_added = True
|
||||
if extract_test_examples and (output_dir / 'test_examples').exists():
|
||||
if extract_test_examples and (output_dir / "test_examples").exists():
|
||||
skill_content += "- **Examples**: `references/test_examples/` - Usage examples from tests\n"
|
||||
refs_added = True
|
||||
if extract_config_patterns and (output_dir / 'config_patterns').exists():
|
||||
if extract_config_patterns and (output_dir / "config_patterns").exists():
|
||||
skill_content += "- **Configuration**: `references/config_patterns/` - Configuration patterns\n"
|
||||
refs_added = True
|
||||
if (output_dir / 'architecture').exists():
|
||||
if (output_dir / "architecture").exists():
|
||||
skill_content += "- **Architecture**: `references/architecture/` - Architectural patterns\n"
|
||||
refs_added = True
|
||||
|
||||
@@ -762,34 +770,34 @@ Use this skill when you need to:
|
||||
|
||||
# Write SKILL.md
|
||||
skill_path = output_dir / "SKILL.md"
|
||||
skill_path.write_text(skill_content, encoding='utf-8')
|
||||
skill_path.write_text(skill_content, encoding="utf-8")
|
||||
|
||||
line_count = len(skill_content.split('\n'))
|
||||
line_count = len(skill_content.split("\n"))
|
||||
logger.info(f"✅ Generated SKILL.md: {skill_path} ({line_count} lines)")
|
||||
|
||||
# Generate references/ directory structure
|
||||
_generate_references(output_dir)
|
||||
|
||||
|
||||
def _get_language_stats(files: List[Dict]) -> Dict[str, int]:
|
||||
def _get_language_stats(files: list[dict]) -> dict[str, int]:
|
||||
"""Count files by language from analysis results."""
|
||||
stats = {}
|
||||
for file_data in files:
|
||||
# files is a list of dicts with 'language' key
|
||||
lang = file_data.get('language', 'Unknown')
|
||||
if lang != 'Unknown':
|
||||
lang = file_data.get("language", "Unknown")
|
||||
if lang != "Unknown":
|
||||
stats[lang] = stats.get(lang, 0) + 1
|
||||
return stats
|
||||
|
||||
|
||||
def _format_patterns_section(output_dir: Path) -> str:
|
||||
"""Format design patterns section from patterns/detected_patterns.json."""
|
||||
patterns_file = output_dir / 'patterns' / 'detected_patterns.json'
|
||||
patterns_file = output_dir / "patterns" / "detected_patterns.json"
|
||||
if not patterns_file.exists():
|
||||
return ""
|
||||
|
||||
try:
|
||||
with open(patterns_file, 'r', encoding='utf-8') as f:
|
||||
with open(patterns_file, encoding="utf-8") as f:
|
||||
patterns_data = json.load(f)
|
||||
except Exception:
|
||||
return ""
|
||||
@@ -802,10 +810,10 @@ def _format_patterns_section(output_dir: Path) -> str:
|
||||
by_class = {}
|
||||
|
||||
for pattern_file in patterns_data:
|
||||
for pattern in pattern_file.get('patterns', []):
|
||||
ptype = pattern.get('pattern_type', 'Unknown')
|
||||
cls = pattern.get('class_name', '')
|
||||
confidence = pattern.get('confidence', 0)
|
||||
for pattern in pattern_file.get("patterns", []):
|
||||
ptype = pattern.get("pattern_type", "Unknown")
|
||||
cls = pattern.get("class_name", "")
|
||||
confidence = pattern.get("confidence", 0)
|
||||
|
||||
# Skip low confidence
|
||||
if confidence < 0.7:
|
||||
@@ -813,7 +821,7 @@ def _format_patterns_section(output_dir: Path) -> str:
|
||||
|
||||
# Deduplicate by class
|
||||
key = f"{cls}:{ptype}"
|
||||
if key not in by_class or by_class[key]['confidence'] < confidence:
|
||||
if key not in by_class or by_class[key]["confidence"] < confidence:
|
||||
by_class[key] = pattern
|
||||
|
||||
# Count by type
|
||||
@@ -836,22 +844,22 @@ def _format_patterns_section(output_dir: Path) -> str:
|
||||
|
||||
def _format_examples_section(output_dir: Path) -> str:
|
||||
"""Format code examples section from test_examples/test_examples.json."""
|
||||
examples_file = output_dir / 'test_examples' / 'test_examples.json'
|
||||
examples_file = output_dir / "test_examples" / "test_examples.json"
|
||||
if not examples_file.exists():
|
||||
return ""
|
||||
|
||||
try:
|
||||
with open(examples_file, 'r', encoding='utf-8') as f:
|
||||
with open(examples_file, encoding="utf-8") as f:
|
||||
examples_data = json.load(f)
|
||||
except Exception:
|
||||
return ""
|
||||
|
||||
examples = examples_data.get('examples', [])
|
||||
examples = examples_data.get("examples", [])
|
||||
if not examples:
|
||||
return ""
|
||||
|
||||
# Filter high-value examples (complexity > 0.7)
|
||||
high_value = [ex for ex in examples if ex.get('complexity_score', 0) > 0.7]
|
||||
high_value = [ex for ex in examples if ex.get("complexity_score", 0) > 0.7]
|
||||
|
||||
if not high_value:
|
||||
# If no high complexity, take any examples
|
||||
@@ -864,11 +872,11 @@ def _format_examples_section(output_dir: Path) -> str:
|
||||
content += "*High-quality examples extracted from test files (C3.2)*\n\n"
|
||||
|
||||
# Top 10 examples
|
||||
for ex in sorted(high_value, key=lambda x: x.get('complexity_score', 0), reverse=True)[:10]:
|
||||
desc = ex.get('description', 'Example')
|
||||
lang = ex.get('language', 'python').lower()
|
||||
code = ex.get('code', '')
|
||||
complexity = ex.get('complexity_score', 0)
|
||||
for ex in sorted(high_value, key=lambda x: x.get("complexity_score", 0), reverse=True)[:10]:
|
||||
desc = ex.get("description", "Example")
|
||||
lang = ex.get("language", "python").lower()
|
||||
code = ex.get("code", "")
|
||||
complexity = ex.get("complexity_score", 0)
|
||||
|
||||
content += f"**{desc}** (complexity: {complexity:.2f})\n\n"
|
||||
content += f"```{lang}\n{code}\n```\n\n"
|
||||
@@ -879,16 +887,16 @@ def _format_examples_section(output_dir: Path) -> str:
|
||||
|
||||
def _format_api_section(output_dir: Path) -> str:
|
||||
"""Format API reference section."""
|
||||
api_dir = output_dir / 'api_reference'
|
||||
api_dir = output_dir / "api_reference"
|
||||
if not api_dir.exists():
|
||||
return ""
|
||||
|
||||
api_md = api_dir / 'api_reference.md'
|
||||
api_md = api_dir / "api_reference.md"
|
||||
if not api_md.exists():
|
||||
return ""
|
||||
|
||||
try:
|
||||
api_content = api_md.read_text(encoding='utf-8')
|
||||
api_content = api_md.read_text(encoding="utf-8")
|
||||
except Exception:
|
||||
return ""
|
||||
|
||||
@@ -906,17 +914,17 @@ def _format_api_section(output_dir: Path) -> str:
|
||||
|
||||
def _format_architecture_section(output_dir: Path) -> str:
|
||||
"""Format architecture section from architecture/architectural_patterns.json."""
|
||||
arch_file = output_dir / 'architecture' / 'architectural_patterns.json'
|
||||
arch_file = output_dir / "architecture" / "architectural_patterns.json"
|
||||
if not arch_file.exists():
|
||||
return ""
|
||||
|
||||
try:
|
||||
with open(arch_file, 'r', encoding='utf-8') as f:
|
||||
with open(arch_file, encoding="utf-8") as f:
|
||||
arch_data = json.load(f)
|
||||
except Exception:
|
||||
return ""
|
||||
|
||||
patterns = arch_data.get('patterns', [])
|
||||
patterns = arch_data.get("patterns", [])
|
||||
if not patterns:
|
||||
return ""
|
||||
|
||||
@@ -925,9 +933,9 @@ def _format_architecture_section(output_dir: Path) -> str:
|
||||
|
||||
content += "**Detected Architectural Patterns:**\n\n"
|
||||
for pattern in patterns[:5]:
|
||||
name = pattern.get('pattern_name', 'Unknown')
|
||||
confidence = pattern.get('confidence', 0)
|
||||
indicators = pattern.get('indicators', [])
|
||||
name = pattern.get("pattern_name", "Unknown")
|
||||
confidence = pattern.get("confidence", 0)
|
||||
indicators = pattern.get("indicators", [])
|
||||
|
||||
content += f"- **{name}** (confidence: {confidence:.2f})\n"
|
||||
if indicators:
|
||||
@@ -940,22 +948,22 @@ def _format_architecture_section(output_dir: Path) -> str:
|
||||
|
||||
def _format_config_section(output_dir: Path) -> str:
|
||||
"""Format configuration patterns section."""
|
||||
config_file = output_dir / 'config_patterns' / 'config_patterns.json'
|
||||
config_file = output_dir / "config_patterns" / "config_patterns.json"
|
||||
if not config_file.exists():
|
||||
return ""
|
||||
|
||||
try:
|
||||
with open(config_file, 'r', encoding='utf-8') as f:
|
||||
with open(config_file, encoding="utf-8") as f:
|
||||
config_data = json.load(f)
|
||||
except Exception:
|
||||
return ""
|
||||
|
||||
config_files = config_data.get('config_files', [])
|
||||
config_files = config_data.get("config_files", [])
|
||||
if not config_files:
|
||||
return ""
|
||||
|
||||
total_settings = sum(len(cf.get('settings', [])) for cf in config_files)
|
||||
total_patterns = sum(len(cf.get('patterns', [])) for cf in config_files)
|
||||
total_settings = sum(len(cf.get("settings", [])) for cf in config_files)
|
||||
total_patterns = sum(len(cf.get("patterns", [])) for cf in config_files)
|
||||
|
||||
content = "## ⚙️ Configuration Patterns\n\n"
|
||||
content += "*From C3.4 configuration analysis*\n\n"
|
||||
@@ -966,7 +974,7 @@ def _format_config_section(output_dir: Path) -> str:
|
||||
# List config file types found
|
||||
file_types = {}
|
||||
for cf in config_files:
|
||||
ctype = cf.get('config_type', 'unknown')
|
||||
ctype = cf.get("config_type", "unknown")
|
||||
file_types[ctype] = file_types.get(ctype, 0) + 1
|
||||
|
||||
if file_types:
|
||||
@@ -985,18 +993,18 @@ def _generate_references(output_dir: Path):
|
||||
|
||||
Creates a clean references/ directory that links to all analysis outputs.
|
||||
"""
|
||||
references_dir = output_dir / 'references'
|
||||
references_dir = output_dir / "references"
|
||||
references_dir.mkdir(exist_ok=True)
|
||||
|
||||
# Map analysis directories to reference names
|
||||
mappings = {
|
||||
'api_reference': 'api_reference',
|
||||
'dependencies': 'dependencies',
|
||||
'patterns': 'patterns',
|
||||
'test_examples': 'test_examples',
|
||||
'tutorials': 'tutorials',
|
||||
'config_patterns': 'config_patterns',
|
||||
'architecture': 'architecture'
|
||||
"api_reference": "api_reference",
|
||||
"dependencies": "dependencies",
|
||||
"patterns": "patterns",
|
||||
"test_examples": "test_examples",
|
||||
"tutorials": "tutorials",
|
||||
"config_patterns": "config_patterns",
|
||||
"architecture": "architecture",
|
||||
}
|
||||
|
||||
for source, target in mappings.items():
|
||||
@@ -1007,9 +1015,11 @@ def _generate_references(output_dir: Path):
|
||||
# Copy directory to references/ (not symlink, for portability)
|
||||
if target_dir.exists():
|
||||
import shutil
|
||||
|
||||
shutil.rmtree(target_dir)
|
||||
|
||||
import shutil
|
||||
|
||||
shutil.copytree(source_dir, target_dir)
|
||||
logger.debug(f"Copied {source} → references/{target}")
|
||||
|
||||
@@ -1019,7 +1029,7 @@ def _generate_references(output_dir: Path):
|
||||
def main():
|
||||
"""Command-line interface for codebase analysis."""
|
||||
parser = argparse.ArgumentParser(
|
||||
description='Analyze local codebases and extract code knowledge',
|
||||
description="Analyze local codebases and extract code knowledge",
|
||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||
epilog="""
|
||||
Examples:
|
||||
@@ -1043,101 +1053,78 @@ Examples:
|
||||
|
||||
# Skip specific features
|
||||
codebase-scraper --directory . --skip-patterns --skip-test-examples
|
||||
"""
|
||||
""",
|
||||
)
|
||||
|
||||
parser.add_argument("--directory", required=True, help="Directory to analyze")
|
||||
parser.add_argument("--output", default="output/codebase/", help="Output directory (default: output/codebase/)")
|
||||
parser.add_argument(
|
||||
'--directory',
|
||||
required=True,
|
||||
help='Directory to analyze'
|
||||
"--depth", choices=["surface", "deep", "full"], default="deep", help="Analysis depth (default: deep)"
|
||||
)
|
||||
parser.add_argument("--languages", help="Comma-separated languages to analyze (e.g., Python,JavaScript,C++)")
|
||||
parser.add_argument("--file-patterns", help="Comma-separated file patterns (e.g., *.py,src/**/*.js)")
|
||||
parser.add_argument(
|
||||
'--output',
|
||||
default='output/codebase/',
|
||||
help='Output directory (default: output/codebase/)'
|
||||
)
|
||||
parser.add_argument(
|
||||
'--depth',
|
||||
choices=['surface', 'deep', 'full'],
|
||||
default='deep',
|
||||
help='Analysis depth (default: deep)'
|
||||
)
|
||||
parser.add_argument(
|
||||
'--languages',
|
||||
help='Comma-separated languages to analyze (e.g., Python,JavaScript,C++)'
|
||||
)
|
||||
parser.add_argument(
|
||||
'--file-patterns',
|
||||
help='Comma-separated file patterns (e.g., *.py,src/**/*.js)'
|
||||
)
|
||||
parser.add_argument(
|
||||
'--skip-api-reference',
|
||||
action='store_true',
|
||||
"--skip-api-reference",
|
||||
action="store_true",
|
||||
default=False,
|
||||
help='Skip API reference markdown documentation generation (default: enabled)'
|
||||
help="Skip API reference markdown documentation generation (default: enabled)",
|
||||
)
|
||||
parser.add_argument(
|
||||
'--skip-dependency-graph',
|
||||
action='store_true',
|
||||
"--skip-dependency-graph",
|
||||
action="store_true",
|
||||
default=False,
|
||||
help='Skip dependency graph and circular dependency detection (default: enabled)'
|
||||
help="Skip dependency graph and circular dependency detection (default: enabled)",
|
||||
)
|
||||
parser.add_argument(
|
||||
'--skip-patterns',
|
||||
action='store_true',
|
||||
"--skip-patterns",
|
||||
action="store_true",
|
||||
default=False,
|
||||
help='Skip design pattern detection (Singleton, Factory, Observer, etc.) (default: enabled)'
|
||||
help="Skip design pattern detection (Singleton, Factory, Observer, etc.) (default: enabled)",
|
||||
)
|
||||
parser.add_argument(
|
||||
'--skip-test-examples',
|
||||
action='store_true',
|
||||
"--skip-test-examples",
|
||||
action="store_true",
|
||||
default=False,
|
||||
help='Skip test example extraction (instantiation, method calls, configs, etc.) (default: enabled)'
|
||||
help="Skip test example extraction (instantiation, method calls, configs, etc.) (default: enabled)",
|
||||
)
|
||||
parser.add_argument(
|
||||
'--skip-how-to-guides',
|
||||
action='store_true',
|
||||
"--skip-how-to-guides",
|
||||
action="store_true",
|
||||
default=False,
|
||||
help='Skip how-to guide generation from workflow examples (default: enabled)'
|
||||
help="Skip how-to guide generation from workflow examples (default: enabled)",
|
||||
)
|
||||
parser.add_argument(
|
||||
'--skip-config-patterns',
|
||||
action='store_true',
|
||||
"--skip-config-patterns",
|
||||
action="store_true",
|
||||
default=False,
|
||||
help='Skip configuration pattern extraction from config files (JSON, YAML, TOML, ENV, etc.) (default: enabled)'
|
||||
help="Skip configuration pattern extraction from config files (JSON, YAML, TOML, ENV, etc.) (default: enabled)",
|
||||
)
|
||||
parser.add_argument(
|
||||
'--ai-mode',
|
||||
choices=['auto', 'api', 'local', 'none'],
|
||||
default='auto',
|
||||
help='AI enhancement mode for how-to guides: auto (detect best), api (Claude API), local (Claude Code CLI), none (disable) (default: auto)'
|
||||
)
|
||||
parser.add_argument(
|
||||
'--no-comments',
|
||||
action='store_true',
|
||||
help='Skip comment extraction'
|
||||
)
|
||||
parser.add_argument(
|
||||
'--verbose',
|
||||
action='store_true',
|
||||
help='Enable verbose logging'
|
||||
"--ai-mode",
|
||||
choices=["auto", "api", "local", "none"],
|
||||
default="auto",
|
||||
help="AI enhancement mode for how-to guides: auto (detect best), api (Claude API), local (Claude Code CLI), none (disable) (default: auto)",
|
||||
)
|
||||
parser.add_argument("--no-comments", action="store_true", help="Skip comment extraction")
|
||||
parser.add_argument("--verbose", action="store_true", help="Enable verbose logging")
|
||||
|
||||
# Check for deprecated flags
|
||||
deprecated_flags = {
|
||||
'--build-api-reference': '--skip-api-reference',
|
||||
'--build-dependency-graph': '--skip-dependency-graph',
|
||||
'--detect-patterns': '--skip-patterns',
|
||||
'--extract-test-examples': '--skip-test-examples',
|
||||
'--build-how-to-guides': '--skip-how-to-guides',
|
||||
'--extract-config-patterns': '--skip-config-patterns'
|
||||
"--build-api-reference": "--skip-api-reference",
|
||||
"--build-dependency-graph": "--skip-dependency-graph",
|
||||
"--detect-patterns": "--skip-patterns",
|
||||
"--extract-test-examples": "--skip-test-examples",
|
||||
"--build-how-to-guides": "--skip-how-to-guides",
|
||||
"--extract-config-patterns": "--skip-config-patterns",
|
||||
}
|
||||
|
||||
for old_flag, new_flag in deprecated_flags.items():
|
||||
if old_flag in sys.argv:
|
||||
logger.warning(f"⚠️ DEPRECATED: {old_flag} is deprecated. "
|
||||
f"All features are now enabled by default. "
|
||||
f"Use {new_flag} to disable this feature.")
|
||||
logger.warning(
|
||||
f"⚠️ DEPRECATED: {old_flag} is deprecated. "
|
||||
f"All features are now enabled by default. "
|
||||
f"Use {new_flag} to disable this feature."
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
@@ -1158,12 +1145,12 @@ Examples:
|
||||
# Parse languages
|
||||
languages = None
|
||||
if args.languages:
|
||||
languages = [lang.strip() for lang in args.languages.split(',')]
|
||||
languages = [lang.strip() for lang in args.languages.split(",")]
|
||||
|
||||
# Parse file patterns
|
||||
file_patterns = None
|
||||
if args.file_patterns:
|
||||
file_patterns = [p.strip() for p in args.file_patterns.split(',')]
|
||||
file_patterns = [p.strip() for p in args.file_patterns.split(",")]
|
||||
|
||||
# Analyze codebase
|
||||
try:
|
||||
@@ -1181,18 +1168,18 @@ Examples:
|
||||
build_how_to_guides=not args.skip_how_to_guides,
|
||||
extract_config_patterns=not args.skip_config_patterns,
|
||||
enhance_with_ai=True, # Auto-disables if no API key present
|
||||
ai_mode=args.ai_mode # NEW: AI enhancement mode for how-to guides
|
||||
ai_mode=args.ai_mode, # NEW: AI enhancement mode for how-to guides
|
||||
)
|
||||
|
||||
# Print summary
|
||||
print(f"\n{'='*60}")
|
||||
print(f"CODEBASE ANALYSIS COMPLETE")
|
||||
print(f"{'='*60}")
|
||||
print(f"\n{'=' * 60}")
|
||||
print("CODEBASE ANALYSIS COMPLETE")
|
||||
print(f"{'=' * 60}")
|
||||
print(f"Files analyzed: {len(results['files'])}")
|
||||
print(f"Output directory: {args.output}")
|
||||
if args.build_api_reference:
|
||||
print(f"API reference: {Path(args.output) / 'api_reference'}")
|
||||
print(f"{'='*60}\n")
|
||||
print(f"{'=' * 60}\n")
|
||||
|
||||
return 0
|
||||
|
||||
@@ -1202,9 +1189,10 @@ Examples:
|
||||
except Exception as e:
|
||||
logger.error(f"Analysis failed: {e}")
|
||||
import traceback
|
||||
|
||||
traceback.print_exc()
|
||||
return 1
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main())
|
||||
|
||||
@@ -4,9 +4,8 @@ Interactive Configuration Wizard for Skill Seekers
|
||||
Provides user-friendly setup for GitHub tokens, API keys, and settings.
|
||||
"""
|
||||
|
||||
import sys
|
||||
import webbrowser
|
||||
from typing import Optional
|
||||
|
||||
from .config_manager import get_config_manager
|
||||
|
||||
|
||||
@@ -46,7 +45,7 @@ Documentation: https://github.com/SkillSeekers/skill-seekers
|
||||
# Ask if user wants to run setup now
|
||||
response = input("Would you like to run the configuration wizard now? [y/N]: ").strip().lower()
|
||||
|
||||
if response in ['y', 'yes']:
|
||||
if response in ["y", "yes"]:
|
||||
main_menu()
|
||||
else:
|
||||
print("\nYou can run the configuration wizard anytime with:")
|
||||
@@ -158,7 +157,7 @@ def add_github_profile():
|
||||
if name in config.config["github"]["profiles"]:
|
||||
print(f"❌ Profile '{name}' already exists.")
|
||||
overwrite = input("Overwrite? [y/N]: ").strip().lower()
|
||||
if overwrite not in ['y', 'yes']:
|
||||
if overwrite not in ["y", "yes"]:
|
||||
continue
|
||||
break
|
||||
|
||||
@@ -175,7 +174,7 @@ def add_github_profile():
|
||||
print(" 4. Copy the token (ghp_...)\n")
|
||||
|
||||
open_now = input("Open GitHub token page in browser? [Y/n]: ").strip().lower()
|
||||
if open_now not in ['n', 'no']:
|
||||
if open_now not in ["n", "no"]:
|
||||
open_github_token_page()
|
||||
|
||||
while True:
|
||||
@@ -186,7 +185,7 @@ def add_github_profile():
|
||||
if not (token.startswith("ghp_") or token.startswith("github_pat_")):
|
||||
print("⚠️ Warning: Token doesn't match GitHub format")
|
||||
proceed = input("Continue anyway? [y/N]: ").strip().lower()
|
||||
if proceed not in ['y', 'yes']:
|
||||
if proceed not in ["y", "yes"]:
|
||||
continue
|
||||
break
|
||||
|
||||
@@ -198,12 +197,7 @@ def add_github_profile():
|
||||
print(" 4. fail - Fail immediately")
|
||||
|
||||
strategy_choice = input("\nSelect strategy [1-4] (default: 1): ").strip() or "1"
|
||||
strategy_map = {
|
||||
"1": "prompt",
|
||||
"2": "wait",
|
||||
"3": "switch",
|
||||
"4": "fail"
|
||||
}
|
||||
strategy_map = {"1": "prompt", "2": "wait", "3": "switch", "4": "fail"}
|
||||
strategy = strategy_map.get(strategy_choice, "prompt")
|
||||
|
||||
# Timeout
|
||||
@@ -217,7 +211,7 @@ def add_github_profile():
|
||||
# Set as default
|
||||
has_profiles = bool(config.config["github"]["profiles"])
|
||||
if has_profiles:
|
||||
set_default = input("\nSet as default profile? [y/N]: ").strip().lower() in ['y', 'yes']
|
||||
set_default = input("\nSet as default profile? [y/N]: ").strip().lower() in ["y", "yes"]
|
||||
else:
|
||||
set_default = True # First profile is always default
|
||||
|
||||
@@ -228,7 +222,7 @@ def add_github_profile():
|
||||
description=description,
|
||||
rate_limit_strategy=strategy,
|
||||
timeout_minutes=timeout,
|
||||
set_as_default=set_default
|
||||
set_as_default=set_default,
|
||||
)
|
||||
|
||||
print(f"\n✅ GitHub profile '{name}' added successfully!")
|
||||
@@ -258,7 +252,7 @@ def remove_github_profile():
|
||||
if 1 <= choice_idx <= len(profiles):
|
||||
profile_name = profiles[choice_idx - 1]["name"]
|
||||
confirm = input(f"Really remove profile '{profile_name}'? [y/N]: ").strip().lower()
|
||||
if confirm in ['y', 'yes']:
|
||||
if confirm in ["y", "yes"]:
|
||||
config.remove_github_profile(profile_name)
|
||||
else:
|
||||
print("❌ Invalid choice.")
|
||||
@@ -325,11 +319,10 @@ def api_keys_menu():
|
||||
source = ""
|
||||
if key:
|
||||
import os
|
||||
env_var = {
|
||||
"anthropic": "ANTHROPIC_API_KEY",
|
||||
"google": "GOOGLE_API_KEY",
|
||||
"openai": "OPENAI_API_KEY"
|
||||
}[provider]
|
||||
|
||||
env_var = {"anthropic": "ANTHROPIC_API_KEY", "google": "GOOGLE_API_KEY", "openai": "OPENAI_API_KEY"}[
|
||||
provider
|
||||
]
|
||||
if os.getenv(env_var):
|
||||
source = " (from environment)"
|
||||
else:
|
||||
@@ -347,7 +340,7 @@ def api_keys_menu():
|
||||
provider_map = {
|
||||
"1": ("anthropic", "https://console.anthropic.com/settings/keys"),
|
||||
"2": ("google", "https://makersuite.google.com/app/apikey"),
|
||||
"3": ("openai", "https://platform.openai.com/api-keys")
|
||||
"3": ("openai", "https://platform.openai.com/api-keys"),
|
||||
}
|
||||
|
||||
if choice in provider_map:
|
||||
@@ -365,7 +358,7 @@ def set_api_key(provider: str, url: str):
|
||||
print(f"Get your API key at: {url}\n")
|
||||
|
||||
open_now = input("Open in browser? [Y/n]: ").strip().lower()
|
||||
if open_now not in ['n', 'no']:
|
||||
if open_now not in ["n", "no"]:
|
||||
try:
|
||||
webbrowser.open(url)
|
||||
print("✅ Opened in browser\n")
|
||||
@@ -390,7 +383,7 @@ def rate_limit_settings():
|
||||
|
||||
current = config.config["rate_limit"]
|
||||
|
||||
print(f"Current settings:")
|
||||
print("Current settings:")
|
||||
print(f" • Default timeout: {current['default_timeout_minutes']} minutes")
|
||||
print(f" • Auto-switch profiles: {current['auto_switch_profiles']}")
|
||||
print(f" • Show countdown: {current['show_countdown']}\n")
|
||||
@@ -404,14 +397,16 @@ def rate_limit_settings():
|
||||
print("⚠️ Invalid input, keeping current value")
|
||||
|
||||
# Auto-switch
|
||||
auto_switch_input = input(f"Auto-switch to other profiles? [y/n] ({current['auto_switch_profiles']}): ").strip().lower()
|
||||
auto_switch_input = (
|
||||
input(f"Auto-switch to other profiles? [y/n] ({current['auto_switch_profiles']}): ").strip().lower()
|
||||
)
|
||||
if auto_switch_input:
|
||||
config.config["rate_limit"]["auto_switch_profiles"] = auto_switch_input in ['y', 'yes']
|
||||
config.config["rate_limit"]["auto_switch_profiles"] = auto_switch_input in ["y", "yes"]
|
||||
|
||||
# Show countdown
|
||||
countdown_input = input(f"Show countdown timer? [y/n] ({current['show_countdown']}): ").strip().lower()
|
||||
if countdown_input:
|
||||
config.config["rate_limit"]["show_countdown"] = countdown_input in ['y', 'yes']
|
||||
config.config["rate_limit"]["show_countdown"] = countdown_input in ["y", "yes"]
|
||||
|
||||
config.save_config()
|
||||
print("\n✅ Rate limit settings updated")
|
||||
@@ -427,7 +422,7 @@ def resume_settings():
|
||||
|
||||
current = config.config["resume"]
|
||||
|
||||
print(f"Current settings:")
|
||||
print("Current settings:")
|
||||
print(f" • Auto-save interval: {current['auto_save_interval_seconds']} seconds")
|
||||
print(f" • Keep progress for: {current['keep_progress_days']} days\n")
|
||||
|
||||
@@ -467,13 +462,12 @@ def test_connections():
|
||||
print(" ⚠️ No GitHub profiles configured")
|
||||
else:
|
||||
import requests
|
||||
|
||||
for p in profiles:
|
||||
token = config.config["github"]["profiles"][p["name"]]["token"]
|
||||
try:
|
||||
response = requests.get(
|
||||
"https://api.github.com/rate_limit",
|
||||
headers={"Authorization": f"token {token}"},
|
||||
timeout=5
|
||||
"https://api.github.com/rate_limit", headers={"Authorization": f"token {token}"}, timeout=5
|
||||
)
|
||||
if response.status_code == 200:
|
||||
data = response.json()
|
||||
@@ -518,34 +512,12 @@ def main():
|
||||
"""Main entry point for config command."""
|
||||
import argparse
|
||||
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Configure Skill Seekers settings"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--github",
|
||||
action="store_true",
|
||||
help="Go directly to GitHub token setup"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--api-keys",
|
||||
action="store_true",
|
||||
help="Go directly to API keys setup"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--show",
|
||||
action="store_true",
|
||||
help="Show current configuration and exit"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--test",
|
||||
action="store_true",
|
||||
help="Test connections and exit"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--welcome",
|
||||
action="store_true",
|
||||
help="Show welcome message"
|
||||
)
|
||||
parser = argparse.ArgumentParser(description="Configure Skill Seekers settings")
|
||||
parser.add_argument("--github", action="store_true", help="Go directly to GitHub token setup")
|
||||
parser.add_argument("--api-keys", action="store_true", help="Go directly to API keys setup")
|
||||
parser.add_argument("--show", action="store_true", help="Show current configuration and exit")
|
||||
parser.add_argument("--test", action="store_true", help="Test connections and exit")
|
||||
parser.add_argument("--welcome", action="store_true", help="Show welcome message")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
|
||||
@@ -12,24 +12,24 @@ Provides dual-mode AI enhancement (API + LOCAL) for configuration analysis:
|
||||
Similar to GuideEnhancer (C3.3) but for configuration files.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional, Any
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
|
||||
# Configure logging
|
||||
logging.basicConfig(level=logging.INFO, format='%(levelname)s - %(message)s')
|
||||
logging.basicConfig(level=logging.INFO, format="%(levelname)s - %(message)s")
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Optional anthropic import
|
||||
ANTHROPIC_AVAILABLE = False
|
||||
try:
|
||||
import anthropic
|
||||
|
||||
ANTHROPIC_AVAILABLE = True
|
||||
except ImportError:
|
||||
pass
|
||||
@@ -38,6 +38,7 @@ except ImportError:
|
||||
@dataclass
|
||||
class ConfigEnhancement:
|
||||
"""AI-generated enhancement for a configuration"""
|
||||
|
||||
explanation: str = "" # What this setting does
|
||||
best_practice: str = "" # Suggested improvement
|
||||
security_concern: str = "" # Security issue (if any)
|
||||
@@ -48,11 +49,12 @@ class ConfigEnhancement:
|
||||
@dataclass
|
||||
class EnhancedConfigFile:
|
||||
"""Configuration file with AI enhancements"""
|
||||
|
||||
file_path: str
|
||||
config_type: str
|
||||
purpose: str
|
||||
enhancement: ConfigEnhancement
|
||||
setting_enhancements: Dict[str, ConfigEnhancement] = field(default_factory=dict)
|
||||
setting_enhancements: dict[str, ConfigEnhancement] = field(default_factory=dict)
|
||||
|
||||
|
||||
class ConfigEnhancer:
|
||||
@@ -73,7 +75,7 @@ class ConfigEnhancer:
|
||||
mode: Enhancement mode - "api", "local", or "auto" (default)
|
||||
"""
|
||||
self.mode = self._detect_mode(mode)
|
||||
self.api_key = os.environ.get('ANTHROPIC_API_KEY')
|
||||
self.api_key = os.environ.get("ANTHROPIC_API_KEY")
|
||||
self.client = None
|
||||
|
||||
if self.mode == "api" and ANTHROPIC_AVAILABLE and self.api_key:
|
||||
@@ -93,14 +95,14 @@ class ConfigEnhancer:
|
||||
return requested_mode
|
||||
|
||||
# Auto-detect
|
||||
if os.environ.get('ANTHROPIC_API_KEY') and ANTHROPIC_AVAILABLE:
|
||||
if os.environ.get("ANTHROPIC_API_KEY") and ANTHROPIC_AVAILABLE:
|
||||
logger.info("🤖 AI enhancement: API mode (Claude API detected)")
|
||||
return "api"
|
||||
else:
|
||||
logger.info("🤖 AI enhancement: LOCAL mode (using Claude Code CLI)")
|
||||
return "local"
|
||||
|
||||
def enhance_config_result(self, result: Dict) -> Dict:
|
||||
def enhance_config_result(self, result: dict) -> dict:
|
||||
"""
|
||||
Enhance entire configuration extraction result.
|
||||
|
||||
@@ -121,7 +123,7 @@ class ConfigEnhancer:
|
||||
# API MODE - Direct Claude API calls
|
||||
# =========================================================================
|
||||
|
||||
def _enhance_via_api(self, result: Dict) -> Dict:
|
||||
def _enhance_via_api(self, result: dict) -> dict:
|
||||
"""Enhance configs using Claude API"""
|
||||
if not self.client:
|
||||
logger.error("❌ API mode requested but no API key available")
|
||||
@@ -134,12 +136,7 @@ class ConfigEnhancer:
|
||||
# Call Claude API
|
||||
logger.info("📡 Calling Claude API for config analysis...")
|
||||
response = self.client.messages.create(
|
||||
model="claude-sonnet-4-20250514",
|
||||
max_tokens=8000,
|
||||
messages=[{
|
||||
"role": "user",
|
||||
"content": prompt
|
||||
}]
|
||||
model="claude-sonnet-4-20250514", max_tokens=8000, messages=[{"role": "user", "content": prompt}]
|
||||
)
|
||||
|
||||
# Parse response
|
||||
@@ -151,23 +148,23 @@ class ConfigEnhancer:
|
||||
logger.error(f"❌ API enhancement failed: {e}")
|
||||
return result
|
||||
|
||||
def _create_enhancement_prompt(self, result: Dict) -> str:
|
||||
def _create_enhancement_prompt(self, result: dict) -> str:
|
||||
"""Create prompt for Claude API"""
|
||||
config_files = result.get('config_files', [])
|
||||
config_files = result.get("config_files", [])
|
||||
|
||||
# Summarize configs for prompt
|
||||
config_summary = []
|
||||
for cf in config_files[:10]: # Limit to first 10 files
|
||||
settings_summary = []
|
||||
for setting in cf.get('settings', [])[:5]: # First 5 settings per file
|
||||
for setting in cf.get("settings", [])[:5]: # First 5 settings per file
|
||||
settings_summary.append(f" - {setting['key']}: {setting['value']} ({setting['value_type']})")
|
||||
|
||||
config_summary.append(f"""
|
||||
File: {cf['relative_path']} ({cf['config_type']})
|
||||
Purpose: {cf['purpose']}
|
||||
File: {cf["relative_path"]} ({cf["config_type"]})
|
||||
Purpose: {cf["purpose"]}
|
||||
Settings:
|
||||
{chr(10).join(settings_summary)}
|
||||
Patterns: {', '.join(cf.get('patterns', []))}
|
||||
Patterns: {", ".join(cf.get("patterns", []))}
|
||||
""")
|
||||
|
||||
prompt = f"""Analyze these configuration files and provide AI-enhanced insights.
|
||||
@@ -207,12 +204,13 @@ Focus on actionable insights that help developers understand and improve their c
|
||||
"""
|
||||
return prompt
|
||||
|
||||
def _parse_api_response(self, response_text: str, original_result: Dict) -> Dict:
|
||||
def _parse_api_response(self, response_text: str, original_result: dict) -> dict:
|
||||
"""Parse Claude API response and merge with original result"""
|
||||
try:
|
||||
# Extract JSON from response
|
||||
import re
|
||||
json_match = re.search(r'\{.*\}', response_text, re.DOTALL)
|
||||
|
||||
json_match = re.search(r"\{.*\}", response_text, re.DOTALL)
|
||||
if not json_match:
|
||||
logger.warning("⚠️ No JSON found in API response")
|
||||
return original_result
|
||||
@@ -220,14 +218,14 @@ Focus on actionable insights that help developers understand and improve their c
|
||||
enhancements = json.loads(json_match.group())
|
||||
|
||||
# Merge enhancements into original result
|
||||
original_result['ai_enhancements'] = enhancements
|
||||
original_result["ai_enhancements"] = enhancements
|
||||
|
||||
# Add enhancement flags to config files
|
||||
file_enhancements = {e['file_path']: e for e in enhancements.get('file_enhancements', [])}
|
||||
for cf in original_result.get('config_files', []):
|
||||
file_path = cf.get('relative_path', cf.get('file_path'))
|
||||
file_enhancements = {e["file_path"]: e for e in enhancements.get("file_enhancements", [])}
|
||||
for cf in original_result.get("config_files", []):
|
||||
file_path = cf.get("relative_path", cf.get("file_path"))
|
||||
if file_path in file_enhancements:
|
||||
cf['ai_enhancement'] = file_enhancements[file_path]
|
||||
cf["ai_enhancement"] = file_enhancements[file_path]
|
||||
|
||||
return original_result
|
||||
|
||||
@@ -239,11 +237,11 @@ Focus on actionable insights that help developers understand and improve their c
|
||||
# LOCAL MODE - Claude Code CLI
|
||||
# =========================================================================
|
||||
|
||||
def _enhance_via_local(self, result: Dict) -> Dict:
|
||||
def _enhance_via_local(self, result: dict) -> dict:
|
||||
"""Enhance configs using Claude Code CLI"""
|
||||
try:
|
||||
# Create temporary prompt file
|
||||
with tempfile.NamedTemporaryFile(mode='w', suffix='.md', delete=False) as f:
|
||||
with tempfile.NamedTemporaryFile(mode="w", suffix=".md", delete=False) as f:
|
||||
prompt_file = Path(f.name)
|
||||
f.write(self._create_local_prompt(result))
|
||||
|
||||
@@ -263,7 +261,7 @@ Focus on actionable insights that help developers understand and improve their c
|
||||
|
||||
if result_data:
|
||||
# Merge LOCAL enhancements
|
||||
original_result['ai_enhancements'] = result_data
|
||||
original_result["ai_enhancements"] = result_data
|
||||
logger.info("✅ LOCAL enhancement complete")
|
||||
return original_result
|
||||
else:
|
||||
@@ -274,18 +272,18 @@ Focus on actionable insights that help developers understand and improve their c
|
||||
logger.error(f"❌ LOCAL enhancement failed: {e}")
|
||||
return result
|
||||
|
||||
def _create_local_prompt(self, result: Dict) -> str:
|
||||
def _create_local_prompt(self, result: dict) -> str:
|
||||
"""Create prompt file for Claude Code CLI"""
|
||||
config_files = result.get('config_files', [])
|
||||
config_files = result.get("config_files", [])
|
||||
|
||||
# Format config data for Claude
|
||||
config_data = []
|
||||
for cf in config_files[:10]:
|
||||
config_data.append(f"""
|
||||
### {cf['relative_path']} ({cf['config_type']})
|
||||
- Purpose: {cf['purpose']}
|
||||
- Patterns: {', '.join(cf.get('patterns', []))}
|
||||
- Settings count: {len(cf.get('settings', []))}
|
||||
### {cf["relative_path"]} ({cf["config_type"]})
|
||||
- Purpose: {cf["purpose"]}
|
||||
- Patterns: {", ".join(cf.get("patterns", []))}
|
||||
- Settings count: {len(cf.get("settings", []))}
|
||||
""")
|
||||
|
||||
prompt = f"""# Configuration Analysis Task
|
||||
@@ -332,15 +330,15 @@ Focus on actionable insights:
|
||||
"""
|
||||
return prompt
|
||||
|
||||
def _run_claude_cli(self, prompt_file: Path, output_file: Path) -> Optional[Dict]:
|
||||
def _run_claude_cli(self, prompt_file: Path, output_file: Path) -> dict | None:
|
||||
"""Run Claude Code CLI and wait for completion"""
|
||||
try:
|
||||
# Run claude command
|
||||
result = subprocess.run(
|
||||
['claude', str(prompt_file)],
|
||||
["claude", str(prompt_file)],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=300 # 5 minute timeout
|
||||
timeout=300, # 5 minute timeout
|
||||
)
|
||||
|
||||
if result.returncode != 0:
|
||||
@@ -350,6 +348,7 @@ Focus on actionable insights:
|
||||
# Try to find output file (Claude might save it with different name)
|
||||
# Look for JSON files created in the last minute
|
||||
import time
|
||||
|
||||
current_time = time.time()
|
||||
potential_files = []
|
||||
|
||||
@@ -360,9 +359,9 @@ Focus on actionable insights:
|
||||
# Try to load the most recent JSON file
|
||||
for json_file in sorted(potential_files, key=lambda f: f.stat().st_mtime, reverse=True):
|
||||
try:
|
||||
with open(json_file, 'r') as f:
|
||||
with open(json_file) as f:
|
||||
data = json.load(f)
|
||||
if 'file_enhancements' in data or 'overall_insights' in data:
|
||||
if "file_enhancements" in data or "overall_insights" in data:
|
||||
logger.info(f"✅ Found enhancement data in {json_file.name}")
|
||||
return data
|
||||
except:
|
||||
@@ -383,29 +382,18 @@ def main():
|
||||
"""Command-line interface for config enhancement"""
|
||||
import argparse
|
||||
|
||||
parser = argparse.ArgumentParser(
|
||||
description='AI-enhance configuration extraction results'
|
||||
)
|
||||
parser = argparse.ArgumentParser(description="AI-enhance configuration extraction results")
|
||||
parser.add_argument("result_file", help="Path to config extraction JSON result file")
|
||||
parser.add_argument(
|
||||
'result_file',
|
||||
help='Path to config extraction JSON result file'
|
||||
)
|
||||
parser.add_argument(
|
||||
'--mode',
|
||||
choices=['auto', 'api', 'local'],
|
||||
default='auto',
|
||||
help='Enhancement mode (default: auto)'
|
||||
)
|
||||
parser.add_argument(
|
||||
'--output',
|
||||
help='Output file for enhanced results (default: <input>_enhanced.json)'
|
||||
"--mode", choices=["auto", "api", "local"], default="auto", help="Enhancement mode (default: auto)"
|
||||
)
|
||||
parser.add_argument("--output", help="Output file for enhanced results (default: <input>_enhanced.json)")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
# Load result file
|
||||
try:
|
||||
with open(args.result_file, 'r') as f:
|
||||
with open(args.result_file) as f:
|
||||
result = json.load(f)
|
||||
except Exception as e:
|
||||
logger.error(f"❌ Failed to load result file: {e}")
|
||||
@@ -416,9 +404,9 @@ def main():
|
||||
enhanced_result = enhancer.enhance_config_result(result)
|
||||
|
||||
# Save
|
||||
output_file = args.output or args.result_file.replace('.json', '_enhanced.json')
|
||||
output_file = args.output or args.result_file.replace(".json", "_enhanced.json")
|
||||
try:
|
||||
with open(output_file, 'w') as f:
|
||||
with open(output_file, "w") as f:
|
||||
json.dump(enhanced_result, f, indent=2)
|
||||
logger.info(f"✅ Enhanced results saved to: {output_file}")
|
||||
except Exception as e:
|
||||
@@ -428,5 +416,5 @@ def main():
|
||||
return 0
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main())
|
||||
|
||||
@@ -9,19 +9,20 @@ This is different from C3.2 which extracts config examples from test code.
|
||||
C3.4 focuses on documenting the actual project configuration.
|
||||
"""
|
||||
|
||||
import ast
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional, Any, Set, Literal
|
||||
import ast
|
||||
from typing import Any, Literal
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Optional dependencies
|
||||
try:
|
||||
import yaml
|
||||
|
||||
YAML_AVAILABLE = True
|
||||
except ImportError:
|
||||
YAML_AVAILABLE = False
|
||||
@@ -29,10 +30,12 @@ except ImportError:
|
||||
|
||||
try:
|
||||
import tomli
|
||||
|
||||
TOML_AVAILABLE = True
|
||||
except ImportError:
|
||||
try:
|
||||
import toml
|
||||
|
||||
TOML_AVAILABLE = True
|
||||
except ImportError:
|
||||
TOML_AVAILABLE = False
|
||||
@@ -42,68 +45,71 @@ except ImportError:
|
||||
@dataclass
|
||||
class ConfigSetting:
|
||||
"""Individual configuration setting"""
|
||||
|
||||
key: str
|
||||
value: Any
|
||||
value_type: str # 'string', 'integer', 'boolean', 'array', 'object', 'null'
|
||||
default_value: Optional[Any] = None
|
||||
default_value: Any | None = None
|
||||
required: bool = False
|
||||
env_var: Optional[str] = None
|
||||
env_var: str | None = None
|
||||
description: str = ""
|
||||
validation: Dict[str, Any] = field(default_factory=dict)
|
||||
nested_path: List[str] = field(default_factory=list) # For nested configs
|
||||
validation: dict[str, Any] = field(default_factory=dict)
|
||||
nested_path: list[str] = field(default_factory=list) # For nested configs
|
||||
|
||||
|
||||
@dataclass
|
||||
class ConfigFile:
|
||||
"""Represents a configuration file"""
|
||||
|
||||
file_path: str
|
||||
relative_path: str
|
||||
config_type: Literal["json", "yaml", "toml", "env", "ini", "python", "javascript", "dockerfile", "docker-compose"]
|
||||
purpose: str # Inferred purpose: database, api, logging, etc.
|
||||
settings: List[ConfigSetting] = field(default_factory=list)
|
||||
patterns: List[str] = field(default_factory=list)
|
||||
raw_content: Optional[str] = None
|
||||
parse_errors: List[str] = field(default_factory=list)
|
||||
settings: list[ConfigSetting] = field(default_factory=list)
|
||||
patterns: list[str] = field(default_factory=list)
|
||||
raw_content: str | None = None
|
||||
parse_errors: list[str] = field(default_factory=list)
|
||||
|
||||
|
||||
@dataclass
|
||||
class ConfigExtractionResult:
|
||||
"""Result of config extraction"""
|
||||
config_files: List[ConfigFile] = field(default_factory=list)
|
||||
|
||||
config_files: list[ConfigFile] = field(default_factory=list)
|
||||
total_files: int = 0
|
||||
total_settings: int = 0
|
||||
detected_patterns: Dict[str, List[str]] = field(default_factory=dict) # pattern -> files
|
||||
errors: List[str] = field(default_factory=list)
|
||||
detected_patterns: dict[str, list[str]] = field(default_factory=dict) # pattern -> files
|
||||
errors: list[str] = field(default_factory=list)
|
||||
|
||||
def to_dict(self) -> Dict:
|
||||
def to_dict(self) -> dict:
|
||||
"""Convert result to dictionary for JSON output"""
|
||||
return {
|
||||
'total_files': self.total_files,
|
||||
'total_settings': self.total_settings,
|
||||
'detected_patterns': self.detected_patterns,
|
||||
'config_files': [
|
||||
"total_files": self.total_files,
|
||||
"total_settings": self.total_settings,
|
||||
"detected_patterns": self.detected_patterns,
|
||||
"config_files": [
|
||||
{
|
||||
'file_path': cf.file_path,
|
||||
'relative_path': cf.relative_path,
|
||||
'type': cf.config_type,
|
||||
'purpose': cf.purpose,
|
||||
'patterns': cf.patterns,
|
||||
'settings_count': len(cf.settings),
|
||||
'settings': [
|
||||
"file_path": cf.file_path,
|
||||
"relative_path": cf.relative_path,
|
||||
"type": cf.config_type,
|
||||
"purpose": cf.purpose,
|
||||
"patterns": cf.patterns,
|
||||
"settings_count": len(cf.settings),
|
||||
"settings": [
|
||||
{
|
||||
'key': s.key,
|
||||
'value': s.value,
|
||||
'type': s.value_type,
|
||||
'env_var': s.env_var,
|
||||
'description': s.description,
|
||||
"key": s.key,
|
||||
"value": s.value,
|
||||
"type": s.value_type,
|
||||
"env_var": s.env_var,
|
||||
"description": s.description,
|
||||
}
|
||||
for s in cf.settings
|
||||
],
|
||||
'parse_errors': cf.parse_errors,
|
||||
"parse_errors": cf.parse_errors,
|
||||
}
|
||||
for cf in self.config_files
|
||||
],
|
||||
'errors': self.errors,
|
||||
"errors": self.errors,
|
||||
}
|
||||
|
||||
def to_markdown(self) -> str:
|
||||
@@ -115,11 +121,11 @@ class ConfigExtractionResult:
|
||||
# Handle both dict and list formats for detected_patterns
|
||||
if self.detected_patterns:
|
||||
if isinstance(self.detected_patterns, dict):
|
||||
patterns_str = ', '.join(self.detected_patterns.keys())
|
||||
patterns_str = ", ".join(self.detected_patterns.keys())
|
||||
else:
|
||||
patterns_str = ', '.join(self.detected_patterns)
|
||||
patterns_str = ", ".join(self.detected_patterns)
|
||||
else:
|
||||
patterns_str = 'None'
|
||||
patterns_str = "None"
|
||||
md += f"**Detected Patterns:** {patterns_str}\n\n"
|
||||
|
||||
if self.config_files:
|
||||
@@ -148,52 +154,64 @@ class ConfigFileDetector:
|
||||
|
||||
# Config file patterns by type
|
||||
CONFIG_PATTERNS = {
|
||||
'json': {
|
||||
'patterns': ['*.json', 'package.json', 'tsconfig.json', 'jsconfig.json'],
|
||||
'names': ['config.json', 'settings.json', 'app.json', '.eslintrc.json', '.prettierrc.json'],
|
||||
"json": {
|
||||
"patterns": ["*.json", "package.json", "tsconfig.json", "jsconfig.json"],
|
||||
"names": ["config.json", "settings.json", "app.json", ".eslintrc.json", ".prettierrc.json"],
|
||||
},
|
||||
'yaml': {
|
||||
'patterns': ['*.yaml', '*.yml'],
|
||||
'names': ['config.yml', 'settings.yml', '.travis.yml', '.gitlab-ci.yml', 'docker-compose.yml'],
|
||||
"yaml": {
|
||||
"patterns": ["*.yaml", "*.yml"],
|
||||
"names": ["config.yml", "settings.yml", ".travis.yml", ".gitlab-ci.yml", "docker-compose.yml"],
|
||||
},
|
||||
'toml': {
|
||||
'patterns': ['*.toml'],
|
||||
'names': ['pyproject.toml', 'Cargo.toml', 'config.toml'],
|
||||
"toml": {
|
||||
"patterns": ["*.toml"],
|
||||
"names": ["pyproject.toml", "Cargo.toml", "config.toml"],
|
||||
},
|
||||
'env': {
|
||||
'patterns': ['.env*', '*.env'],
|
||||
'names': ['.env', '.env.example', '.env.local', '.env.production'],
|
||||
"env": {
|
||||
"patterns": [".env*", "*.env"],
|
||||
"names": [".env", ".env.example", ".env.local", ".env.production"],
|
||||
},
|
||||
'ini': {
|
||||
'patterns': ['*.ini', '*.cfg'],
|
||||
'names': ['config.ini', 'setup.cfg', 'tox.ini'],
|
||||
"ini": {
|
||||
"patterns": ["*.ini", "*.cfg"],
|
||||
"names": ["config.ini", "setup.cfg", "tox.ini"],
|
||||
},
|
||||
'python': {
|
||||
'patterns': [],
|
||||
'names': ['settings.py', 'config.py', 'configuration.py', 'constants.py'],
|
||||
"python": {
|
||||
"patterns": [],
|
||||
"names": ["settings.py", "config.py", "configuration.py", "constants.py"],
|
||||
},
|
||||
'javascript': {
|
||||
'patterns': ['*.config.js', '*.config.ts'],
|
||||
'names': ['config.js', 'next.config.js', 'vue.config.js', 'webpack.config.js'],
|
||||
"javascript": {
|
||||
"patterns": ["*.config.js", "*.config.ts"],
|
||||
"names": ["config.js", "next.config.js", "vue.config.js", "webpack.config.js"],
|
||||
},
|
||||
'dockerfile': {
|
||||
'patterns': ['Dockerfile*'],
|
||||
'names': ['Dockerfile', 'Dockerfile.dev', 'Dockerfile.prod'],
|
||||
"dockerfile": {
|
||||
"patterns": ["Dockerfile*"],
|
||||
"names": ["Dockerfile", "Dockerfile.dev", "Dockerfile.prod"],
|
||||
},
|
||||
'docker-compose': {
|
||||
'patterns': ['docker-compose*.yml', 'docker-compose*.yaml'],
|
||||
'names': ['docker-compose.yml', 'docker-compose.yaml'],
|
||||
"docker-compose": {
|
||||
"patterns": ["docker-compose*.yml", "docker-compose*.yaml"],
|
||||
"names": ["docker-compose.yml", "docker-compose.yaml"],
|
||||
},
|
||||
}
|
||||
|
||||
# Directories to skip
|
||||
SKIP_DIRS = {
|
||||
'node_modules', 'venv', 'env', '.venv', '__pycache__', '.git',
|
||||
'build', 'dist', '.tox', '.mypy_cache', '.pytest_cache',
|
||||
'htmlcov', 'coverage', '.eggs', '*.egg-info'
|
||||
"node_modules",
|
||||
"venv",
|
||||
"env",
|
||||
".venv",
|
||||
"__pycache__",
|
||||
".git",
|
||||
"build",
|
||||
"dist",
|
||||
".tox",
|
||||
".mypy_cache",
|
||||
".pytest_cache",
|
||||
"htmlcov",
|
||||
"coverage",
|
||||
".eggs",
|
||||
"*.egg-info",
|
||||
}
|
||||
|
||||
def find_config_files(self, directory: Path, max_files: int = 100) -> List[ConfigFile]:
|
||||
def find_config_files(self, directory: Path, max_files: int = 100) -> list[ConfigFile]:
|
||||
"""
|
||||
Find all configuration files in directory.
|
||||
|
||||
@@ -219,7 +237,7 @@ class ConfigFileDetector:
|
||||
file_path=str(file_path),
|
||||
relative_path=relative_path,
|
||||
config_type=config_type,
|
||||
purpose=self._infer_purpose(file_path, config_type)
|
||||
purpose=self._infer_purpose(file_path, config_type),
|
||||
)
|
||||
config_files.append(config_file)
|
||||
found_count += 1
|
||||
@@ -230,7 +248,7 @@ class ConfigFileDetector:
|
||||
|
||||
def _walk_directory(self, directory: Path):
|
||||
"""Walk directory, skipping excluded directories"""
|
||||
for item in directory.rglob('*'):
|
||||
for item in directory.rglob("*"):
|
||||
# Skip directories
|
||||
if item.is_dir():
|
||||
continue
|
||||
@@ -241,18 +259,18 @@ class ConfigFileDetector:
|
||||
|
||||
yield item
|
||||
|
||||
def _detect_config_type(self, file_path: Path) -> Optional[str]:
|
||||
def _detect_config_type(self, file_path: Path) -> str | None:
|
||||
"""Detect configuration file type"""
|
||||
filename = file_path.name.lower()
|
||||
|
||||
# Check each config type
|
||||
for config_type, patterns in self.CONFIG_PATTERNS.items():
|
||||
# Check exact name matches
|
||||
if filename in patterns['names']:
|
||||
if filename in patterns["names"]:
|
||||
return config_type
|
||||
|
||||
# Check pattern matches
|
||||
for pattern in patterns['patterns']:
|
||||
for pattern in patterns["patterns"]:
|
||||
if file_path.match(pattern):
|
||||
return config_type
|
||||
|
||||
@@ -264,43 +282,43 @@ class ConfigFileDetector:
|
||||
filename = file_path.name.lower()
|
||||
|
||||
# Database configs
|
||||
if any(word in path_lower for word in ['database', 'db', 'postgres', 'mysql', 'mongo']):
|
||||
return 'database_configuration'
|
||||
if any(word in path_lower for word in ["database", "db", "postgres", "mysql", "mongo"]):
|
||||
return "database_configuration"
|
||||
|
||||
# API configs
|
||||
if any(word in path_lower for word in ['api', 'rest', 'graphql', 'endpoint']):
|
||||
return 'api_configuration'
|
||||
if any(word in path_lower for word in ["api", "rest", "graphql", "endpoint"]):
|
||||
return "api_configuration"
|
||||
|
||||
# Logging configs
|
||||
if any(word in path_lower for word in ['log', 'logger', 'logging']):
|
||||
return 'logging_configuration'
|
||||
if any(word in path_lower for word in ["log", "logger", "logging"]):
|
||||
return "logging_configuration"
|
||||
|
||||
# Docker configs
|
||||
if 'docker' in filename:
|
||||
return 'docker_configuration'
|
||||
if "docker" in filename:
|
||||
return "docker_configuration"
|
||||
|
||||
# CI/CD configs
|
||||
if any(word in path_lower for word in ['.travis', '.gitlab', '.github', 'ci', 'cd']):
|
||||
return 'ci_cd_configuration'
|
||||
if any(word in path_lower for word in [".travis", ".gitlab", ".github", "ci", "cd"]):
|
||||
return "ci_cd_configuration"
|
||||
|
||||
# Package configs
|
||||
if filename in ['package.json', 'pyproject.toml', 'cargo.toml']:
|
||||
return 'package_configuration'
|
||||
if filename in ["package.json", "pyproject.toml", "cargo.toml"]:
|
||||
return "package_configuration"
|
||||
|
||||
# TypeScript/JavaScript configs
|
||||
if filename in ['tsconfig.json', 'jsconfig.json']:
|
||||
return 'typescript_configuration'
|
||||
if filename in ["tsconfig.json", "jsconfig.json"]:
|
||||
return "typescript_configuration"
|
||||
|
||||
# Framework configs
|
||||
if 'next.config' in filename or 'vue.config' in filename or 'webpack.config' in filename:
|
||||
return 'framework_configuration'
|
||||
if "next.config" in filename or "vue.config" in filename or "webpack.config" in filename:
|
||||
return "framework_configuration"
|
||||
|
||||
# Environment configs
|
||||
if '.env' in filename:
|
||||
return 'environment_configuration'
|
||||
if ".env" in filename:
|
||||
return "environment_configuration"
|
||||
|
||||
# Default
|
||||
return 'general_configuration'
|
||||
return "general_configuration"
|
||||
|
||||
|
||||
class ConfigParser:
|
||||
@@ -318,27 +336,27 @@ class ConfigParser:
|
||||
"""
|
||||
try:
|
||||
# Read file content
|
||||
with open(config_file.file_path, 'r', encoding='utf-8') as f:
|
||||
with open(config_file.file_path, encoding="utf-8") as f:
|
||||
config_file.raw_content = f.read()
|
||||
|
||||
# Parse based on type
|
||||
if config_file.config_type == 'json':
|
||||
if config_file.config_type == "json":
|
||||
self._parse_json(config_file)
|
||||
elif config_file.config_type == 'yaml':
|
||||
elif config_file.config_type == "yaml":
|
||||
self._parse_yaml(config_file)
|
||||
elif config_file.config_type == 'toml':
|
||||
elif config_file.config_type == "toml":
|
||||
self._parse_toml(config_file)
|
||||
elif config_file.config_type == 'env':
|
||||
elif config_file.config_type == "env":
|
||||
self._parse_env(config_file)
|
||||
elif config_file.config_type == 'ini':
|
||||
elif config_file.config_type == "ini":
|
||||
self._parse_ini(config_file)
|
||||
elif config_file.config_type == 'python':
|
||||
elif config_file.config_type == "python":
|
||||
self._parse_python_config(config_file)
|
||||
elif config_file.config_type == 'javascript':
|
||||
elif config_file.config_type == "javascript":
|
||||
self._parse_javascript_config(config_file)
|
||||
elif config_file.config_type == 'dockerfile':
|
||||
elif config_file.config_type == "dockerfile":
|
||||
self._parse_dockerfile(config_file)
|
||||
elif config_file.config_type == 'docker-compose':
|
||||
elif config_file.config_type == "docker-compose":
|
||||
self._parse_yaml(config_file) # Docker compose is YAML
|
||||
|
||||
except Exception as e:
|
||||
@@ -376,10 +394,11 @@ class ConfigParser:
|
||||
return
|
||||
|
||||
try:
|
||||
if 'tomli' in globals():
|
||||
if "tomli" in globals():
|
||||
data = tomli.loads(config_file.raw_content)
|
||||
else:
|
||||
import toml
|
||||
|
||||
data = toml.loads(config_file.raw_content)
|
||||
|
||||
self._extract_settings_from_dict(data, config_file)
|
||||
@@ -388,17 +407,17 @@ class ConfigParser:
|
||||
|
||||
def _parse_env(self, config_file: ConfigFile):
|
||||
"""Parse .env file"""
|
||||
lines = config_file.raw_content.split('\n')
|
||||
lines = config_file.raw_content.split("\n")
|
||||
|
||||
for line_num, line in enumerate(lines, 1):
|
||||
line = line.strip()
|
||||
|
||||
# Skip comments and empty lines
|
||||
if not line or line.startswith('#'):
|
||||
if not line or line.startswith("#"):
|
||||
continue
|
||||
|
||||
# Parse KEY=VALUE
|
||||
match = re.match(r'([A-Z_][A-Z0-9_]*)\s*=\s*(.+)', line)
|
||||
match = re.match(r"([A-Z_][A-Z0-9_]*)\s*=\s*(.+)", line)
|
||||
if match:
|
||||
key, value = match.groups()
|
||||
value = value.strip().strip('"').strip("'")
|
||||
@@ -408,7 +427,7 @@ class ConfigParser:
|
||||
value=value,
|
||||
value_type=self._infer_type(value),
|
||||
env_var=key,
|
||||
description=self._extract_env_description(lines, line_num - 1)
|
||||
description=self._extract_env_description(lines, line_num - 1),
|
||||
)
|
||||
config_file.settings.append(setting)
|
||||
|
||||
@@ -426,7 +445,7 @@ class ConfigParser:
|
||||
key=f"{section}.{key}",
|
||||
value=value,
|
||||
value_type=self._infer_type(value),
|
||||
nested_path=[section, key]
|
||||
nested_path=[section, key],
|
||||
)
|
||||
config_file.settings.append(setting)
|
||||
except Exception as e:
|
||||
@@ -444,7 +463,7 @@ class ConfigParser:
|
||||
key = node.targets[0].id
|
||||
|
||||
# Skip private variables
|
||||
if key.startswith('_'):
|
||||
if key.startswith("_"):
|
||||
continue
|
||||
|
||||
# Extract value
|
||||
@@ -454,7 +473,7 @@ class ConfigParser:
|
||||
key=key,
|
||||
value=value,
|
||||
value_type=self._infer_type(value),
|
||||
description=self._extract_python_docstring(node)
|
||||
description=self._extract_python_docstring(node),
|
||||
)
|
||||
config_file.settings.append(setting)
|
||||
except (ValueError, TypeError):
|
||||
@@ -469,8 +488,8 @@ class ConfigParser:
|
||||
# Simple regex-based extraction for common patterns
|
||||
patterns = [
|
||||
r'(?:const|let|var)\s+(\w+)\s*[:=]\s*(["\'])(.*?)\2', # String values
|
||||
r'(?:const|let|var)\s+(\w+)\s*[:=]\s*(\d+)', # Number values
|
||||
r'(?:const|let|var)\s+(\w+)\s*[:=]\s*(true|false)', # Boolean values
|
||||
r"(?:const|let|var)\s+(\w+)\s*[:=]\s*(\d+)", # Number values
|
||||
r"(?:const|let|var)\s+(\w+)\s*[:=]\s*(true|false)", # Boolean values
|
||||
]
|
||||
|
||||
for pattern in patterns:
|
||||
@@ -479,47 +498,36 @@ class ConfigParser:
|
||||
key = match.group(1)
|
||||
value = match.group(3) if len(match.groups()) > 2 else match.group(2)
|
||||
|
||||
setting = ConfigSetting(
|
||||
key=key,
|
||||
value=value,
|
||||
value_type=self._infer_type(value)
|
||||
)
|
||||
setting = ConfigSetting(key=key, value=value, value_type=self._infer_type(value))
|
||||
config_file.settings.append(setting)
|
||||
|
||||
def _parse_dockerfile(self, config_file: ConfigFile):
|
||||
"""Parse Dockerfile configuration"""
|
||||
lines = config_file.raw_content.split('\n')
|
||||
lines = config_file.raw_content.split("\n")
|
||||
|
||||
for line in lines:
|
||||
line = line.strip()
|
||||
|
||||
# Extract ENV variables
|
||||
if line.startswith('ENV '):
|
||||
parts = line[4:].split('=', 1)
|
||||
if line.startswith("ENV "):
|
||||
parts = line[4:].split("=", 1)
|
||||
if len(parts) == 2:
|
||||
key, value = parts
|
||||
setting = ConfigSetting(
|
||||
key=key.strip(),
|
||||
value=value.strip(),
|
||||
value_type='string',
|
||||
env_var=key.strip()
|
||||
key=key.strip(), value=value.strip(), value_type="string", env_var=key.strip()
|
||||
)
|
||||
config_file.settings.append(setting)
|
||||
|
||||
# Extract ARG variables
|
||||
elif line.startswith('ARG '):
|
||||
parts = line[4:].split('=', 1)
|
||||
elif line.startswith("ARG "):
|
||||
parts = line[4:].split("=", 1)
|
||||
key = parts[0].strip()
|
||||
value = parts[1].strip() if len(parts) == 2 else None
|
||||
|
||||
setting = ConfigSetting(
|
||||
key=key,
|
||||
value=value,
|
||||
value_type='string'
|
||||
)
|
||||
setting = ConfigSetting(key=key, value=value, value_type="string")
|
||||
config_file.settings.append(setting)
|
||||
|
||||
def _extract_settings_from_dict(self, data: Dict, config_file: ConfigFile, parent_path: List[str] = None):
|
||||
def _extract_settings_from_dict(self, data: dict, config_file: ConfigFile, parent_path: list[str] = None):
|
||||
"""Recursively extract settings from dictionary"""
|
||||
if parent_path is None:
|
||||
parent_path = []
|
||||
@@ -530,35 +538,35 @@ class ConfigParser:
|
||||
self._extract_settings_from_dict(value, config_file, parent_path + [key])
|
||||
else:
|
||||
setting = ConfigSetting(
|
||||
key='.'.join(parent_path + [key]) if parent_path else key,
|
||||
key=".".join(parent_path + [key]) if parent_path else key,
|
||||
value=value,
|
||||
value_type=self._infer_type(value),
|
||||
nested_path=parent_path + [key]
|
||||
nested_path=parent_path + [key],
|
||||
)
|
||||
config_file.settings.append(setting)
|
||||
|
||||
def _infer_type(self, value: Any) -> str:
|
||||
"""Infer value type"""
|
||||
if value is None:
|
||||
return 'null'
|
||||
return "null"
|
||||
elif isinstance(value, bool):
|
||||
return 'boolean'
|
||||
return "boolean"
|
||||
elif isinstance(value, int):
|
||||
return 'integer'
|
||||
return "integer"
|
||||
elif isinstance(value, float):
|
||||
return 'number'
|
||||
return "number"
|
||||
elif isinstance(value, (list, tuple)):
|
||||
return 'array'
|
||||
return "array"
|
||||
elif isinstance(value, dict):
|
||||
return 'object'
|
||||
return "object"
|
||||
else:
|
||||
return 'string'
|
||||
return "string"
|
||||
|
||||
def _extract_env_description(self, lines: List[str], line_index: int) -> str:
|
||||
def _extract_env_description(self, lines: list[str], line_index: int) -> str:
|
||||
"""Extract description from comment above env variable"""
|
||||
if line_index > 0:
|
||||
prev_line = lines[line_index - 1].strip()
|
||||
if prev_line.startswith('#'):
|
||||
if prev_line.startswith("#"):
|
||||
return prev_line[1:].strip()
|
||||
return ""
|
||||
|
||||
@@ -573,37 +581,37 @@ class ConfigPatternDetector:
|
||||
|
||||
# Known configuration patterns
|
||||
KNOWN_PATTERNS = {
|
||||
'database_config': {
|
||||
'keys': ['host', 'port', 'database', 'user', 'username', 'password', 'db_name'],
|
||||
'min_match': 3,
|
||||
"database_config": {
|
||||
"keys": ["host", "port", "database", "user", "username", "password", "db_name"],
|
||||
"min_match": 3,
|
||||
},
|
||||
'api_config': {
|
||||
'keys': ['base_url', 'api_key', 'api_secret', 'timeout', 'retry', 'endpoint'],
|
||||
'min_match': 2,
|
||||
"api_config": {
|
||||
"keys": ["base_url", "api_key", "api_secret", "timeout", "retry", "endpoint"],
|
||||
"min_match": 2,
|
||||
},
|
||||
'logging_config': {
|
||||
'keys': ['level', 'format', 'handler', 'file', 'console', 'log_level'],
|
||||
'min_match': 2,
|
||||
"logging_config": {
|
||||
"keys": ["level", "format", "handler", "file", "console", "log_level"],
|
||||
"min_match": 2,
|
||||
},
|
||||
'cache_config': {
|
||||
'keys': ['backend', 'ttl', 'timeout', 'max_size', 'redis', 'memcached'],
|
||||
'min_match': 2,
|
||||
"cache_config": {
|
||||
"keys": ["backend", "ttl", "timeout", "max_size", "redis", "memcached"],
|
||||
"min_match": 2,
|
||||
},
|
||||
'email_config': {
|
||||
'keys': ['smtp_host', 'smtp_port', 'email', 'from_email', 'mail_server'],
|
||||
'min_match': 2,
|
||||
"email_config": {
|
||||
"keys": ["smtp_host", "smtp_port", "email", "from_email", "mail_server"],
|
||||
"min_match": 2,
|
||||
},
|
||||
'auth_config': {
|
||||
'keys': ['secret_key', 'jwt_secret', 'token', 'oauth', 'authentication'],
|
||||
'min_match': 1,
|
||||
"auth_config": {
|
||||
"keys": ["secret_key", "jwt_secret", "token", "oauth", "authentication"],
|
||||
"min_match": 1,
|
||||
},
|
||||
'server_config': {
|
||||
'keys': ['host', 'port', 'bind', 'workers', 'threads'],
|
||||
'min_match': 2,
|
||||
"server_config": {
|
||||
"keys": ["host", "port", "bind", "workers", "threads"],
|
||||
"min_match": 2,
|
||||
},
|
||||
}
|
||||
|
||||
def detect_patterns(self, config_file: ConfigFile) -> List[str]:
|
||||
def detect_patterns(self, config_file: ConfigFile) -> list[str]:
|
||||
"""
|
||||
Detect which patterns this config file matches.
|
||||
|
||||
@@ -620,8 +628,8 @@ class ConfigPatternDetector:
|
||||
|
||||
# Check against each known pattern
|
||||
for pattern_name, pattern_def in self.KNOWN_PATTERNS.items():
|
||||
pattern_keys = {k.lower() for k in pattern_def['keys']}
|
||||
min_match = pattern_def['min_match']
|
||||
pattern_keys = {k.lower() for k in pattern_def["keys"]}
|
||||
min_match = pattern_def["min_match"]
|
||||
|
||||
# Count matches
|
||||
matches = len(setting_keys & pattern_keys)
|
||||
@@ -641,11 +649,7 @@ class ConfigExtractor:
|
||||
self.parser = ConfigParser()
|
||||
self.pattern_detector = ConfigPatternDetector()
|
||||
|
||||
def extract_from_directory(
|
||||
self,
|
||||
directory: Path,
|
||||
max_files: int = 100
|
||||
) -> ConfigExtractionResult:
|
||||
def extract_from_directory(self, directory: Path, max_files: int = 100) -> ConfigExtractionResult:
|
||||
"""
|
||||
Extract configuration patterns from directory.
|
||||
|
||||
@@ -696,35 +700,35 @@ class ConfigExtractor:
|
||||
|
||||
return result
|
||||
|
||||
def to_dict(self, result: ConfigExtractionResult) -> Dict:
|
||||
def to_dict(self, result: ConfigExtractionResult) -> dict:
|
||||
"""Convert result to dictionary for JSON output"""
|
||||
return {
|
||||
'total_files': result.total_files,
|
||||
'total_settings': result.total_settings,
|
||||
'detected_patterns': result.detected_patterns,
|
||||
'config_files': [
|
||||
"total_files": result.total_files,
|
||||
"total_settings": result.total_settings,
|
||||
"detected_patterns": result.detected_patterns,
|
||||
"config_files": [
|
||||
{
|
||||
'file_path': cf.file_path,
|
||||
'relative_path': cf.relative_path,
|
||||
'type': cf.config_type,
|
||||
'purpose': cf.purpose,
|
||||
'patterns': cf.patterns,
|
||||
'settings_count': len(cf.settings),
|
||||
'settings': [
|
||||
"file_path": cf.file_path,
|
||||
"relative_path": cf.relative_path,
|
||||
"type": cf.config_type,
|
||||
"purpose": cf.purpose,
|
||||
"patterns": cf.patterns,
|
||||
"settings_count": len(cf.settings),
|
||||
"settings": [
|
||||
{
|
||||
'key': s.key,
|
||||
'value': s.value,
|
||||
'type': s.value_type,
|
||||
'env_var': s.env_var,
|
||||
'description': s.description,
|
||||
"key": s.key,
|
||||
"value": s.value,
|
||||
"type": s.value_type,
|
||||
"env_var": s.env_var,
|
||||
"description": s.description,
|
||||
}
|
||||
for s in cf.settings
|
||||
],
|
||||
'parse_errors': cf.parse_errors,
|
||||
"parse_errors": cf.parse_errors,
|
||||
}
|
||||
for cf in result.config_files
|
||||
],
|
||||
'errors': result.errors,
|
||||
"errors": result.errors,
|
||||
}
|
||||
|
||||
|
||||
@@ -732,19 +736,29 @@ def main():
|
||||
"""CLI entry point for config extraction"""
|
||||
import argparse
|
||||
|
||||
parser = argparse.ArgumentParser(description="Extract configuration patterns from codebase with optional AI enhancement")
|
||||
parser.add_argument('directory', type=Path, help='Directory to analyze')
|
||||
parser.add_argument('--output', '-o', type=Path, help='Output JSON file')
|
||||
parser.add_argument('--max-files', type=int, default=100, help='Maximum config files to process')
|
||||
parser.add_argument('--enhance', action='store_true', help='Enhance with AI analysis (API mode, requires ANTHROPIC_API_KEY)')
|
||||
parser.add_argument('--enhance-local', action='store_true', help='Enhance with AI analysis (LOCAL mode, uses Claude Code CLI)')
|
||||
parser.add_argument('--ai-mode', choices=['auto', 'api', 'local', 'none'], default='none',
|
||||
help='AI enhancement mode: auto (detect), api (Claude API), local (Claude Code CLI), none (disable)')
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Extract configuration patterns from codebase with optional AI enhancement"
|
||||
)
|
||||
parser.add_argument("directory", type=Path, help="Directory to analyze")
|
||||
parser.add_argument("--output", "-o", type=Path, help="Output JSON file")
|
||||
parser.add_argument("--max-files", type=int, default=100, help="Maximum config files to process")
|
||||
parser.add_argument(
|
||||
"--enhance", action="store_true", help="Enhance with AI analysis (API mode, requires ANTHROPIC_API_KEY)"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--enhance-local", action="store_true", help="Enhance with AI analysis (LOCAL mode, uses Claude Code CLI)"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--ai-mode",
|
||||
choices=["auto", "api", "local", "none"],
|
||||
default="none",
|
||||
help="AI enhancement mode: auto (detect), api (Claude API), local (Claude Code CLI), none (disable)",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
# Setup logging
|
||||
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
|
||||
logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")
|
||||
|
||||
# Extract
|
||||
extractor = ConfigExtractor()
|
||||
@@ -756,13 +770,14 @@ def main():
|
||||
# AI Enhancement (if requested)
|
||||
enhance_mode = args.ai_mode
|
||||
if args.enhance:
|
||||
enhance_mode = 'api'
|
||||
enhance_mode = "api"
|
||||
elif args.enhance_local:
|
||||
enhance_mode = 'local'
|
||||
enhance_mode = "local"
|
||||
|
||||
if enhance_mode != 'none':
|
||||
if enhance_mode != "none":
|
||||
try:
|
||||
from skill_seekers.cli.config_enhancer import ConfigEnhancer
|
||||
|
||||
logger.info(f"🤖 Starting AI enhancement (mode: {enhance_mode})...")
|
||||
enhancer = ConfigEnhancer(mode=enhance_mode)
|
||||
output_dict = enhancer.enhance_config_result(output_dict)
|
||||
@@ -774,27 +789,27 @@ def main():
|
||||
|
||||
# Output
|
||||
if args.output:
|
||||
with open(args.output, 'w') as f:
|
||||
with open(args.output, "w") as f:
|
||||
json.dump(output_dict, f, indent=2)
|
||||
print(f"✅ Saved config extraction results to: {args.output}")
|
||||
else:
|
||||
print(json.dumps(output_dict, indent=2))
|
||||
|
||||
# Summary
|
||||
print(f"\n📊 Summary:")
|
||||
print("\n📊 Summary:")
|
||||
print(f" Config files found: {result.total_files}")
|
||||
print(f" Total settings: {result.total_settings}")
|
||||
print(f" Detected patterns: {', '.join(result.detected_patterns.keys()) or 'None'}")
|
||||
|
||||
if 'ai_enhancements' in output_dict:
|
||||
if "ai_enhancements" in output_dict:
|
||||
print(f" ✨ AI enhancements: Yes ({enhance_mode} mode)")
|
||||
insights = output_dict['ai_enhancements'].get('overall_insights', {})
|
||||
if insights.get('security_issues_found'):
|
||||
insights = output_dict["ai_enhancements"].get("overall_insights", {})
|
||||
if insights.get("security_issues_found"):
|
||||
print(f" 🔐 Security issues found: {insights['security_issues_found']}")
|
||||
|
||||
if result.errors:
|
||||
print(f"\n⚠️ Errors: {len(result.errors)}")
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -8,10 +8,10 @@ Provides secure storage with file permissions and auto-detection capabilities.
|
||||
import json
|
||||
import os
|
||||
import stat
|
||||
import sys
|
||||
from datetime import datetime, timedelta
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional, Any
|
||||
import sys
|
||||
from typing import Any
|
||||
|
||||
|
||||
class ConfigManager:
|
||||
@@ -26,28 +26,11 @@ class ConfigManager:
|
||||
# Default configuration
|
||||
DEFAULT_CONFIG = {
|
||||
"version": "1.0",
|
||||
"github": {
|
||||
"default_profile": None,
|
||||
"profiles": {}
|
||||
},
|
||||
"rate_limit": {
|
||||
"default_timeout_minutes": 30,
|
||||
"auto_switch_profiles": True,
|
||||
"show_countdown": True
|
||||
},
|
||||
"resume": {
|
||||
"auto_save_interval_seconds": 60,
|
||||
"keep_progress_days": 7
|
||||
},
|
||||
"api_keys": {
|
||||
"anthropic": None,
|
||||
"google": None,
|
||||
"openai": None
|
||||
},
|
||||
"first_run": {
|
||||
"completed": False,
|
||||
"version": "2.7.0"
|
||||
}
|
||||
"github": {"default_profile": None, "profiles": {}},
|
||||
"rate_limit": {"default_timeout_minutes": 30, "auto_switch_profiles": True, "show_countdown": True},
|
||||
"resume": {"auto_save_interval_seconds": 60, "keep_progress_days": 7},
|
||||
"api_keys": {"anthropic": None, "google": None, "openai": None},
|
||||
"first_run": {"completed": False, "version": "2.7.0"},
|
||||
}
|
||||
|
||||
def __init__(self):
|
||||
@@ -65,25 +48,26 @@ class ConfigManager:
|
||||
# Set directory permissions to 700 (rwx------)
|
||||
directory.chmod(stat.S_IRWXU)
|
||||
|
||||
def _load_config(self) -> Dict[str, Any]:
|
||||
def _load_config(self) -> dict[str, Any]:
|
||||
"""Load configuration from file or create default."""
|
||||
if not self.config_file.exists():
|
||||
return self.DEFAULT_CONFIG.copy()
|
||||
|
||||
try:
|
||||
with open(self.config_file, 'r') as f:
|
||||
with open(self.config_file) as f:
|
||||
config = json.load(f)
|
||||
|
||||
# Merge with defaults for any missing keys
|
||||
config = self._merge_with_defaults(config)
|
||||
return config
|
||||
except (json.JSONDecodeError, IOError) as e:
|
||||
except (OSError, json.JSONDecodeError) as e:
|
||||
print(f"⚠️ Warning: Could not load config file: {e}")
|
||||
print(f" Using default configuration.")
|
||||
print(" Using default configuration.")
|
||||
return self.DEFAULT_CONFIG.copy()
|
||||
|
||||
def _merge_with_defaults(self, config: Dict[str, Any]) -> Dict[str, Any]:
|
||||
def _merge_with_defaults(self, config: dict[str, Any]) -> dict[str, Any]:
|
||||
"""Merge loaded config with defaults to ensure all keys exist."""
|
||||
|
||||
def deep_merge(default: dict, custom: dict) -> dict:
|
||||
result = default.copy()
|
||||
for key, value in custom.items():
|
||||
@@ -98,13 +82,13 @@ class ConfigManager:
|
||||
def save_config(self):
|
||||
"""Save configuration to file with secure permissions."""
|
||||
try:
|
||||
with open(self.config_file, 'w') as f:
|
||||
with open(self.config_file, "w") as f:
|
||||
json.dump(self.config, f, indent=2)
|
||||
|
||||
# Set file permissions to 600 (rw-------)
|
||||
self.config_file.chmod(stat.S_IRUSR | stat.S_IWUSR)
|
||||
|
||||
except IOError as e:
|
||||
except OSError as e:
|
||||
print(f"❌ Error saving config: {e}")
|
||||
sys.exit(1)
|
||||
|
||||
@@ -117,7 +101,7 @@ class ConfigManager:
|
||||
description: str = "",
|
||||
rate_limit_strategy: str = "prompt",
|
||||
timeout_minutes: int = 30,
|
||||
set_as_default: bool = False
|
||||
set_as_default: bool = False,
|
||||
):
|
||||
"""Add a new GitHub profile."""
|
||||
if not name:
|
||||
@@ -131,7 +115,7 @@ class ConfigManager:
|
||||
"description": description,
|
||||
"rate_limit_strategy": rate_limit_strategy,
|
||||
"timeout_minutes": timeout_minutes,
|
||||
"added_at": datetime.now().isoformat()
|
||||
"added_at": datetime.now().isoformat(),
|
||||
}
|
||||
|
||||
self.config["github"]["profiles"][name] = profile
|
||||
@@ -142,7 +126,7 @@ class ConfigManager:
|
||||
self.save_config()
|
||||
print(f"✅ Added GitHub profile: {name}")
|
||||
if set_as_default:
|
||||
print(f"✅ Set as default profile")
|
||||
print("✅ Set as default profile")
|
||||
|
||||
def remove_github_profile(self, name: str):
|
||||
"""Remove a GitHub profile."""
|
||||
@@ -159,7 +143,7 @@ class ConfigManager:
|
||||
self.save_config()
|
||||
print(f"✅ Removed GitHub profile: {name}")
|
||||
|
||||
def list_github_profiles(self) -> List[Dict[str, Any]]:
|
||||
def list_github_profiles(self) -> list[dict[str, Any]]:
|
||||
"""List all GitHub profiles."""
|
||||
profiles = []
|
||||
default = self.config["github"]["default_profile"]
|
||||
@@ -171,17 +155,13 @@ class ConfigManager:
|
||||
"strategy": data.get("rate_limit_strategy", "prompt"),
|
||||
"timeout": data.get("timeout_minutes", 30),
|
||||
"is_default": name == default,
|
||||
"added_at": data.get("added_at", "Unknown")
|
||||
"added_at": data.get("added_at", "Unknown"),
|
||||
}
|
||||
profiles.append(profile_info)
|
||||
|
||||
return profiles
|
||||
|
||||
def get_github_token(
|
||||
self,
|
||||
profile_name: Optional[str] = None,
|
||||
repo_url: Optional[str] = None
|
||||
) -> Optional[str]:
|
||||
def get_github_token(self, profile_name: str | None = None, repo_url: str | None = None) -> str | None:
|
||||
"""
|
||||
Get GitHub token with smart fallback chain.
|
||||
|
||||
@@ -214,14 +194,14 @@ class ConfigManager:
|
||||
# 4. No token available
|
||||
return None
|
||||
|
||||
def get_profile_for_token(self, token: str) -> Optional[str]:
|
||||
def get_profile_for_token(self, token: str) -> str | None:
|
||||
"""Get profile name for a given token."""
|
||||
for name, profile in self.config["github"]["profiles"].items():
|
||||
if profile["token"] == token:
|
||||
return name
|
||||
return None
|
||||
|
||||
def get_next_profile(self, current_token: str) -> Optional[tuple]:
|
||||
def get_next_profile(self, current_token: str) -> tuple | None:
|
||||
"""
|
||||
Get next available profile for rate limit switching.
|
||||
|
||||
@@ -248,7 +228,7 @@ class ConfigManager:
|
||||
name, profile = profiles[next_idx]
|
||||
return (name, profile["token"])
|
||||
|
||||
def get_rate_limit_strategy(self, token: Optional[str] = None) -> str:
|
||||
def get_rate_limit_strategy(self, token: str | None = None) -> str:
|
||||
"""Get rate limit strategy for a token (or default)."""
|
||||
if token:
|
||||
profile_name = self.get_profile_for_token(token)
|
||||
@@ -259,7 +239,7 @@ class ConfigManager:
|
||||
# Default strategy
|
||||
return "prompt"
|
||||
|
||||
def get_timeout_minutes(self, token: Optional[str] = None) -> int:
|
||||
def get_timeout_minutes(self, token: str | None = None) -> int:
|
||||
"""Get timeout minutes for a token (or default)."""
|
||||
if token:
|
||||
profile_name = self.get_profile_for_token(token)
|
||||
@@ -280,7 +260,7 @@ class ConfigManager:
|
||||
self.save_config()
|
||||
print(f"✅ Set {provider.capitalize()} API key")
|
||||
|
||||
def get_api_key(self, provider: str) -> Optional[str]:
|
||||
def get_api_key(self, provider: str) -> str | None:
|
||||
"""
|
||||
Get API key with environment variable fallback.
|
||||
|
||||
@@ -289,11 +269,7 @@ class ConfigManager:
|
||||
2. Config file
|
||||
"""
|
||||
# Check environment first
|
||||
env_map = {
|
||||
"anthropic": "ANTHROPIC_API_KEY",
|
||||
"google": "GOOGLE_API_KEY",
|
||||
"openai": "OPENAI_API_KEY"
|
||||
}
|
||||
env_map = {"anthropic": "ANTHROPIC_API_KEY", "google": "GOOGLE_API_KEY", "openai": "OPENAI_API_KEY"}
|
||||
|
||||
env_var = env_map.get(provider)
|
||||
if env_var:
|
||||
@@ -306,19 +282,19 @@ class ConfigManager:
|
||||
|
||||
# Progress Management
|
||||
|
||||
def save_progress(self, job_id: str, progress_data: Dict[str, Any]):
|
||||
def save_progress(self, job_id: str, progress_data: dict[str, Any]):
|
||||
"""Save progress for a job."""
|
||||
progress_file = self.progress_dir / f"{job_id}.json"
|
||||
|
||||
progress_data["last_updated"] = datetime.now().isoformat()
|
||||
|
||||
with open(progress_file, 'w') as f:
|
||||
with open(progress_file, "w") as f:
|
||||
json.dump(progress_data, f, indent=2)
|
||||
|
||||
# Set file permissions to 600
|
||||
progress_file.chmod(stat.S_IRUSR | stat.S_IWUSR)
|
||||
|
||||
def load_progress(self, job_id: str) -> Optional[Dict[str, Any]]:
|
||||
def load_progress(self, job_id: str) -> dict[str, Any] | None:
|
||||
"""Load progress for a job."""
|
||||
progress_file = self.progress_dir / f"{job_id}.json"
|
||||
|
||||
@@ -326,29 +302,31 @@ class ConfigManager:
|
||||
return None
|
||||
|
||||
try:
|
||||
with open(progress_file, 'r') as f:
|
||||
with open(progress_file) as f:
|
||||
return json.load(f)
|
||||
except (json.JSONDecodeError, IOError):
|
||||
except (OSError, json.JSONDecodeError):
|
||||
return None
|
||||
|
||||
def list_resumable_jobs(self) -> List[Dict[str, Any]]:
|
||||
def list_resumable_jobs(self) -> list[dict[str, Any]]:
|
||||
"""List all resumable jobs."""
|
||||
jobs = []
|
||||
|
||||
for progress_file in self.progress_dir.glob("*.json"):
|
||||
try:
|
||||
with open(progress_file, 'r') as f:
|
||||
with open(progress_file) as f:
|
||||
data = json.load(f)
|
||||
|
||||
if data.get("can_resume", False):
|
||||
jobs.append({
|
||||
"job_id": data.get("job_id", progress_file.stem),
|
||||
"started_at": data.get("started_at"),
|
||||
"command": data.get("command"),
|
||||
"progress": data.get("progress", {}),
|
||||
"last_updated": data.get("last_updated")
|
||||
})
|
||||
except (json.JSONDecodeError, IOError):
|
||||
jobs.append(
|
||||
{
|
||||
"job_id": data.get("job_id", progress_file.stem),
|
||||
"started_at": data.get("started_at"),
|
||||
"command": data.get("command"),
|
||||
"progress": data.get("progress", {}),
|
||||
"last_updated": data.get("last_updated"),
|
||||
}
|
||||
)
|
||||
except (OSError, json.JSONDecodeError):
|
||||
continue
|
||||
|
||||
# Sort by last updated (newest first)
|
||||
@@ -447,8 +425,8 @@ class ConfigManager:
|
||||
print(f"\n📦 Resumable Jobs: {len(jobs)}")
|
||||
for job in jobs[:5]: # Show max 5
|
||||
print(f" • {job['job_id']}")
|
||||
if job.get('progress'):
|
||||
phase = job['progress'].get('phase', 'unknown')
|
||||
if job.get("progress"):
|
||||
phase = job["progress"].get("phase", "unknown")
|
||||
print(f" Phase: {phase}, Last: {job['last_updated']}")
|
||||
|
||||
|
||||
|
||||
@@ -12,8 +12,8 @@ Also provides backward compatibility detection for legacy configs.
|
||||
|
||||
import json
|
||||
import logging
|
||||
from typing import Dict, Any, List, Optional, Union
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
logger = logging.getLogger(__name__)
|
||||
@@ -25,18 +25,18 @@ class ConfigValidator:
|
||||
"""
|
||||
|
||||
# Valid source types
|
||||
VALID_SOURCE_TYPES = {'documentation', 'github', 'pdf'}
|
||||
VALID_SOURCE_TYPES = {"documentation", "github", "pdf"}
|
||||
|
||||
# Valid merge modes
|
||||
VALID_MERGE_MODES = {'rule-based', 'claude-enhanced'}
|
||||
VALID_MERGE_MODES = {"rule-based", "claude-enhanced"}
|
||||
|
||||
# Valid code analysis depth levels
|
||||
VALID_DEPTH_LEVELS = {'surface', 'deep', 'full'}
|
||||
VALID_DEPTH_LEVELS = {"surface", "deep", "full"}
|
||||
|
||||
# Valid AI modes for C3.x enhancement
|
||||
VALID_AI_MODES = {'auto', 'api', 'local', 'none'}
|
||||
VALID_AI_MODES = {"auto", "api", "local", "none"}
|
||||
|
||||
def __init__(self, config_or_path: Union[Dict[str, Any], str]):
|
||||
def __init__(self, config_or_path: dict[str, Any] | str):
|
||||
"""
|
||||
Initialize validator with config dict or file path.
|
||||
|
||||
@@ -51,10 +51,10 @@ class ConfigValidator:
|
||||
self.config = self._load_config()
|
||||
self.is_unified = self._detect_format()
|
||||
|
||||
def _load_config(self) -> Dict[str, Any]:
|
||||
def _load_config(self) -> dict[str, Any]:
|
||||
"""Load JSON config file."""
|
||||
try:
|
||||
with open(self.config_path, 'r', encoding='utf-8') as f:
|
||||
with open(self.config_path, encoding="utf-8") as f:
|
||||
return json.load(f)
|
||||
except FileNotFoundError:
|
||||
raise ValueError(f"Config file not found: {self.config_path}")
|
||||
@@ -69,7 +69,7 @@ class ConfigValidator:
|
||||
True if unified format (has 'sources' array)
|
||||
False if legacy format
|
||||
"""
|
||||
return 'sources' in self.config and isinstance(self.config['sources'], list)
|
||||
return "sources" in self.config and isinstance(self.config["sources"], list)
|
||||
|
||||
def validate(self) -> bool:
|
||||
"""
|
||||
@@ -91,17 +91,17 @@ class ConfigValidator:
|
||||
logger.info("Validating unified config format...")
|
||||
|
||||
# Required top-level fields
|
||||
if 'name' not in self.config:
|
||||
if "name" not in self.config:
|
||||
raise ValueError("Missing required field: 'name'")
|
||||
|
||||
if 'description' not in self.config:
|
||||
if "description" not in self.config:
|
||||
raise ValueError("Missing required field: 'description'")
|
||||
|
||||
if 'sources' not in self.config:
|
||||
if "sources" not in self.config:
|
||||
raise ValueError("Missing required field: 'sources'")
|
||||
|
||||
# Validate sources array
|
||||
sources = self.config['sources']
|
||||
sources = self.config["sources"]
|
||||
|
||||
if not isinstance(sources, list):
|
||||
raise ValueError("'sources' must be an array")
|
||||
@@ -110,7 +110,7 @@ class ConfigValidator:
|
||||
raise ValueError("'sources' array cannot be empty")
|
||||
|
||||
# Validate merge_mode (optional)
|
||||
merge_mode = self.config.get('merge_mode', 'rule-based')
|
||||
merge_mode = self.config.get("merge_mode", "rule-based")
|
||||
if merge_mode not in self.VALID_MERGE_MODES:
|
||||
raise ValueError(f"Invalid merge_mode: '{merge_mode}'. Must be one of {self.VALID_MERGE_MODES}")
|
||||
|
||||
@@ -121,56 +121,52 @@ class ConfigValidator:
|
||||
logger.info(f"✅ Unified config valid: {len(sources)} sources")
|
||||
return True
|
||||
|
||||
def _validate_source(self, source: Dict[str, Any], index: int):
|
||||
def _validate_source(self, source: dict[str, Any], index: int):
|
||||
"""Validate individual source configuration."""
|
||||
# Check source has 'type' field
|
||||
if 'type' not in source:
|
||||
if "type" not in source:
|
||||
raise ValueError(f"Source {index}: Missing required field 'type'")
|
||||
|
||||
source_type = source['type']
|
||||
source_type = source["type"]
|
||||
|
||||
if source_type not in self.VALID_SOURCE_TYPES:
|
||||
raise ValueError(
|
||||
f"Source {index}: Invalid type '{source_type}'. "
|
||||
f"Must be one of {self.VALID_SOURCE_TYPES}"
|
||||
)
|
||||
raise ValueError(f"Source {index}: Invalid type '{source_type}'. Must be one of {self.VALID_SOURCE_TYPES}")
|
||||
|
||||
# Type-specific validation
|
||||
if source_type == 'documentation':
|
||||
if source_type == "documentation":
|
||||
self._validate_documentation_source(source, index)
|
||||
elif source_type == 'github':
|
||||
elif source_type == "github":
|
||||
self._validate_github_source(source, index)
|
||||
elif source_type == 'pdf':
|
||||
elif source_type == "pdf":
|
||||
self._validate_pdf_source(source, index)
|
||||
|
||||
def _validate_documentation_source(self, source: Dict[str, Any], index: int):
|
||||
def _validate_documentation_source(self, source: dict[str, Any], index: int):
|
||||
"""Validate documentation source configuration."""
|
||||
if 'base_url' not in source:
|
||||
if "base_url" not in source:
|
||||
raise ValueError(f"Source {index} (documentation): Missing required field 'base_url'")
|
||||
|
||||
# Optional but recommended fields
|
||||
if 'selectors' not in source:
|
||||
if "selectors" not in source:
|
||||
logger.warning(f"Source {index} (documentation): No 'selectors' specified, using defaults")
|
||||
|
||||
if 'max_pages' in source and not isinstance(source['max_pages'], int):
|
||||
if "max_pages" in source and not isinstance(source["max_pages"], int):
|
||||
raise ValueError(f"Source {index} (documentation): 'max_pages' must be an integer")
|
||||
|
||||
def _validate_github_source(self, source: Dict[str, Any], index: int):
|
||||
def _validate_github_source(self, source: dict[str, Any], index: int):
|
||||
"""Validate GitHub source configuration."""
|
||||
if 'repo' not in source:
|
||||
if "repo" not in source:
|
||||
raise ValueError(f"Source {index} (github): Missing required field 'repo'")
|
||||
|
||||
# Validate repo format (owner/repo)
|
||||
repo = source['repo']
|
||||
if '/' not in repo:
|
||||
repo = source["repo"]
|
||||
if "/" not in repo:
|
||||
raise ValueError(
|
||||
f"Source {index} (github): Invalid repo format '{repo}'. "
|
||||
f"Must be 'owner/repo' (e.g., 'facebook/react')"
|
||||
f"Source {index} (github): Invalid repo format '{repo}'. Must be 'owner/repo' (e.g., 'facebook/react')"
|
||||
)
|
||||
|
||||
# Validate code_analysis_depth if specified
|
||||
if 'code_analysis_depth' in source:
|
||||
depth = source['code_analysis_depth']
|
||||
if "code_analysis_depth" in source:
|
||||
depth = source["code_analysis_depth"]
|
||||
if depth not in self.VALID_DEPTH_LEVELS:
|
||||
raise ValueError(
|
||||
f"Source {index} (github): Invalid code_analysis_depth '{depth}'. "
|
||||
@@ -178,29 +174,28 @@ class ConfigValidator:
|
||||
)
|
||||
|
||||
# Validate max_issues if specified
|
||||
if 'max_issues' in source and not isinstance(source['max_issues'], int):
|
||||
if "max_issues" in source and not isinstance(source["max_issues"], int):
|
||||
raise ValueError(f"Source {index} (github): 'max_issues' must be an integer")
|
||||
|
||||
# Validate enable_codebase_analysis if specified (C3.5)
|
||||
if 'enable_codebase_analysis' in source and not isinstance(source['enable_codebase_analysis'], bool):
|
||||
if "enable_codebase_analysis" in source and not isinstance(source["enable_codebase_analysis"], bool):
|
||||
raise ValueError(f"Source {index} (github): 'enable_codebase_analysis' must be a boolean")
|
||||
|
||||
# Validate ai_mode if specified (C3.5)
|
||||
if 'ai_mode' in source:
|
||||
ai_mode = source['ai_mode']
|
||||
if "ai_mode" in source:
|
||||
ai_mode = source["ai_mode"]
|
||||
if ai_mode not in self.VALID_AI_MODES:
|
||||
raise ValueError(
|
||||
f"Source {index} (github): Invalid ai_mode '{ai_mode}'. "
|
||||
f"Must be one of {self.VALID_AI_MODES}"
|
||||
f"Source {index} (github): Invalid ai_mode '{ai_mode}'. Must be one of {self.VALID_AI_MODES}"
|
||||
)
|
||||
|
||||
def _validate_pdf_source(self, source: Dict[str, Any], index: int):
|
||||
def _validate_pdf_source(self, source: dict[str, Any], index: int):
|
||||
"""Validate PDF source configuration."""
|
||||
if 'path' not in source:
|
||||
if "path" not in source:
|
||||
raise ValueError(f"Source {index} (pdf): Missing required field 'path'")
|
||||
|
||||
# Check if file exists
|
||||
pdf_path = source['path']
|
||||
pdf_path = source["path"]
|
||||
if not Path(pdf_path).exists():
|
||||
logger.warning(f"Source {index} (pdf): File not found: {pdf_path}")
|
||||
|
||||
@@ -213,18 +208,18 @@ class ConfigValidator:
|
||||
logger.info("Detected legacy config format (backward compatible)")
|
||||
|
||||
# Detect which legacy type based on fields
|
||||
if 'base_url' in self.config:
|
||||
if "base_url" in self.config:
|
||||
logger.info("Legacy type: documentation")
|
||||
elif 'repo' in self.config:
|
||||
elif "repo" in self.config:
|
||||
logger.info("Legacy type: github")
|
||||
elif 'pdf' in self.config or 'path' in self.config:
|
||||
elif "pdf" in self.config or "path" in self.config:
|
||||
logger.info("Legacy type: pdf")
|
||||
else:
|
||||
raise ValueError("Cannot detect legacy config type (missing base_url, repo, or pdf)")
|
||||
|
||||
return True
|
||||
|
||||
def convert_legacy_to_unified(self) -> Dict[str, Any]:
|
||||
def convert_legacy_to_unified(self) -> dict[str, Any]:
|
||||
"""
|
||||
Convert legacy config to unified format.
|
||||
|
||||
@@ -238,64 +233,50 @@ class ConfigValidator:
|
||||
logger.info("Converting legacy config to unified format...")
|
||||
|
||||
# Detect legacy type and convert
|
||||
if 'base_url' in self.config:
|
||||
if "base_url" in self.config:
|
||||
return self._convert_legacy_documentation()
|
||||
elif 'repo' in self.config:
|
||||
elif "repo" in self.config:
|
||||
return self._convert_legacy_github()
|
||||
elif 'pdf' in self.config or 'path' in self.config:
|
||||
elif "pdf" in self.config or "path" in self.config:
|
||||
return self._convert_legacy_pdf()
|
||||
else:
|
||||
raise ValueError("Cannot convert: unknown legacy format")
|
||||
|
||||
def _convert_legacy_documentation(self) -> Dict[str, Any]:
|
||||
def _convert_legacy_documentation(self) -> dict[str, Any]:
|
||||
"""Convert legacy documentation config to unified."""
|
||||
unified = {
|
||||
'name': self.config.get('name', 'unnamed'),
|
||||
'description': self.config.get('description', 'Documentation skill'),
|
||||
'merge_mode': 'rule-based',
|
||||
'sources': [
|
||||
{
|
||||
'type': 'documentation',
|
||||
**{k: v for k, v in self.config.items()
|
||||
if k not in ['name', 'description']}
|
||||
}
|
||||
]
|
||||
"name": self.config.get("name", "unnamed"),
|
||||
"description": self.config.get("description", "Documentation skill"),
|
||||
"merge_mode": "rule-based",
|
||||
"sources": [
|
||||
{"type": "documentation", **{k: v for k, v in self.config.items() if k not in ["name", "description"]}}
|
||||
],
|
||||
}
|
||||
return unified
|
||||
|
||||
def _convert_legacy_github(self) -> Dict[str, Any]:
|
||||
def _convert_legacy_github(self) -> dict[str, Any]:
|
||||
"""Convert legacy GitHub config to unified."""
|
||||
unified = {
|
||||
'name': self.config.get('name', 'unnamed'),
|
||||
'description': self.config.get('description', 'GitHub repository skill'),
|
||||
'merge_mode': 'rule-based',
|
||||
'sources': [
|
||||
{
|
||||
'type': 'github',
|
||||
**{k: v for k, v in self.config.items()
|
||||
if k not in ['name', 'description']}
|
||||
}
|
||||
]
|
||||
"name": self.config.get("name", "unnamed"),
|
||||
"description": self.config.get("description", "GitHub repository skill"),
|
||||
"merge_mode": "rule-based",
|
||||
"sources": [
|
||||
{"type": "github", **{k: v for k, v in self.config.items() if k not in ["name", "description"]}}
|
||||
],
|
||||
}
|
||||
return unified
|
||||
|
||||
def _convert_legacy_pdf(self) -> Dict[str, Any]:
|
||||
def _convert_legacy_pdf(self) -> dict[str, Any]:
|
||||
"""Convert legacy PDF config to unified."""
|
||||
unified = {
|
||||
'name': self.config.get('name', 'unnamed'),
|
||||
'description': self.config.get('description', 'PDF document skill'),
|
||||
'merge_mode': 'rule-based',
|
||||
'sources': [
|
||||
{
|
||||
'type': 'pdf',
|
||||
**{k: v for k, v in self.config.items()
|
||||
if k not in ['name', 'description']}
|
||||
}
|
||||
]
|
||||
"name": self.config.get("name", "unnamed"),
|
||||
"description": self.config.get("description", "PDF document skill"),
|
||||
"merge_mode": "rule-based",
|
||||
"sources": [{"type": "pdf", **{k: v for k, v in self.config.items() if k not in ["name", "description"]}}],
|
||||
}
|
||||
return unified
|
||||
|
||||
def get_sources_by_type(self, source_type: str) -> List[Dict[str, Any]]:
|
||||
def get_sources_by_type(self, source_type: str) -> list[dict[str, Any]]:
|
||||
"""
|
||||
Get all sources of a specific type.
|
||||
|
||||
@@ -308,17 +289,17 @@ class ConfigValidator:
|
||||
if not self.is_unified:
|
||||
# For legacy, convert and get sources
|
||||
unified = self.convert_legacy_to_unified()
|
||||
sources = unified['sources']
|
||||
sources = unified["sources"]
|
||||
else:
|
||||
sources = self.config['sources']
|
||||
sources = self.config["sources"]
|
||||
|
||||
return [s for s in sources if s.get('type') == source_type]
|
||||
return [s for s in sources if s.get("type") == source_type]
|
||||
|
||||
def has_multiple_sources(self) -> bool:
|
||||
"""Check if config has multiple sources (requires merging)."""
|
||||
if not self.is_unified:
|
||||
return False
|
||||
return len(self.config['sources']) > 1
|
||||
return len(self.config["sources"]) > 1
|
||||
|
||||
def needs_api_merge(self) -> bool:
|
||||
"""
|
||||
@@ -331,13 +312,11 @@ class ConfigValidator:
|
||||
return False
|
||||
|
||||
has_docs_api = any(
|
||||
s.get('type') == 'documentation' and s.get('extract_api', True)
|
||||
for s in self.config['sources']
|
||||
s.get("type") == "documentation" and s.get("extract_api", True) for s in self.config["sources"]
|
||||
)
|
||||
|
||||
has_github_code = any(
|
||||
s.get('type') == 'github' and s.get('include_code', False)
|
||||
for s in self.config['sources']
|
||||
s.get("type") == "github" and s.get("include_code", False) for s in self.config["sources"]
|
||||
)
|
||||
|
||||
return has_docs_api and has_github_code
|
||||
@@ -361,7 +340,7 @@ def validate_config(config_path: str) -> ConfigValidator:
|
||||
return validator
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
import sys
|
||||
|
||||
if len(sys.argv) < 2:
|
||||
@@ -373,18 +352,18 @@ if __name__ == '__main__':
|
||||
try:
|
||||
validator = validate_config(config_file)
|
||||
|
||||
print(f"\n✅ Config valid!")
|
||||
print("\n✅ Config valid!")
|
||||
print(f" Format: {'Unified' if validator.is_unified else 'Legacy'}")
|
||||
print(f" Name: {validator.config.get('name')}")
|
||||
|
||||
if validator.is_unified:
|
||||
sources = validator.config['sources']
|
||||
sources = validator.config["sources"]
|
||||
print(f" Sources: {len(sources)}")
|
||||
for i, source in enumerate(sources):
|
||||
print(f" {i+1}. {source['type']}")
|
||||
print(f" {i + 1}. {source['type']}")
|
||||
|
||||
if validator.needs_api_merge():
|
||||
merge_mode = validator.config.get('merge_mode', 'rule-based')
|
||||
merge_mode = validator.config.get("merge_mode", "rule-based")
|
||||
print(f" ⚠️ API merge required (mode: {merge_mode})")
|
||||
|
||||
except ValueError as e:
|
||||
|
||||
@@ -13,9 +13,9 @@ Used by unified scraper to identify discrepancies before merging.
|
||||
|
||||
import json
|
||||
import logging
|
||||
from typing import Dict, List, Any, Optional, Tuple
|
||||
from dataclasses import dataclass, asdict
|
||||
from dataclasses import asdict, dataclass
|
||||
from difflib import SequenceMatcher
|
||||
from typing import Any
|
||||
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
logger = logging.getLogger(__name__)
|
||||
@@ -24,13 +24,14 @@ logger = logging.getLogger(__name__)
|
||||
@dataclass
|
||||
class Conflict:
|
||||
"""Represents a conflict between documentation and code."""
|
||||
|
||||
type: str # 'missing_in_docs', 'missing_in_code', 'signature_mismatch', 'description_mismatch'
|
||||
severity: str # 'low', 'medium', 'high'
|
||||
api_name: str
|
||||
docs_info: Optional[Dict[str, Any]] = None
|
||||
code_info: Optional[Dict[str, Any]] = None
|
||||
difference: Optional[str] = None
|
||||
suggestion: Optional[str] = None
|
||||
docs_info: dict[str, Any] | None = None
|
||||
code_info: dict[str, Any] | None = None
|
||||
difference: str | None = None
|
||||
suggestion: str | None = None
|
||||
|
||||
|
||||
class ConflictDetector:
|
||||
@@ -38,7 +39,7 @@ class ConflictDetector:
|
||||
Detects conflicts between documentation and code sources.
|
||||
"""
|
||||
|
||||
def __init__(self, docs_data: Dict[str, Any], github_data: Dict[str, Any]):
|
||||
def __init__(self, docs_data: dict[str, Any], github_data: dict[str, Any]):
|
||||
"""
|
||||
Initialize conflict detector.
|
||||
|
||||
@@ -56,7 +57,7 @@ class ConflictDetector:
|
||||
logger.info(f"Loaded {len(self.docs_apis)} APIs from documentation")
|
||||
logger.info(f"Loaded {len(self.code_apis)} APIs from code")
|
||||
|
||||
def _extract_docs_apis(self) -> Dict[str, Dict[str, Any]]:
|
||||
def _extract_docs_apis(self) -> dict[str, dict[str, Any]]:
|
||||
"""
|
||||
Extract API information from documentation data.
|
||||
|
||||
@@ -66,42 +67,43 @@ class ConflictDetector:
|
||||
apis = {}
|
||||
|
||||
# Documentation structure varies, but typically has 'pages' or 'references'
|
||||
pages = self.docs_data.get('pages', {})
|
||||
pages = self.docs_data.get("pages", {})
|
||||
|
||||
# Handle both dict and list formats
|
||||
if isinstance(pages, dict):
|
||||
# Format: {url: page_data, ...}
|
||||
for url, page_data in pages.items():
|
||||
content = page_data.get('content', '')
|
||||
title = page_data.get('title', '')
|
||||
content = page_data.get("content", "")
|
||||
title = page_data.get("title", "")
|
||||
|
||||
# Simple heuristic: if title or URL contains "api", "reference", "class", "function"
|
||||
# it might be an API page
|
||||
if any(keyword in title.lower() or keyword in url.lower()
|
||||
for keyword in ['api', 'reference', 'class', 'function', 'method']):
|
||||
|
||||
if any(
|
||||
keyword in title.lower() or keyword in url.lower()
|
||||
for keyword in ["api", "reference", "class", "function", "method"]
|
||||
):
|
||||
# Extract API signatures from content (simplified)
|
||||
extracted_apis = self._parse_doc_content_for_apis(content, url)
|
||||
apis.update(extracted_apis)
|
||||
elif isinstance(pages, list):
|
||||
# Format: [{url: '...', apis: [...]}, ...]
|
||||
for page in pages:
|
||||
url = page.get('url', '')
|
||||
page_apis = page.get('apis', [])
|
||||
url = page.get("url", "")
|
||||
page_apis = page.get("apis", [])
|
||||
|
||||
# If APIs are already extracted in the page data
|
||||
for api in page_apis:
|
||||
api_name = api.get('name', '')
|
||||
api_name = api.get("name", "")
|
||||
if api_name:
|
||||
apis[api_name] = {
|
||||
'parameters': api.get('parameters', []),
|
||||
'return_type': api.get('return_type', 'Any'),
|
||||
'source_url': url
|
||||
"parameters": api.get("parameters", []),
|
||||
"return_type": api.get("return_type", "Any"),
|
||||
"source_url": url,
|
||||
}
|
||||
|
||||
return apis
|
||||
|
||||
def _parse_doc_content_for_apis(self, content: str, source_url: str) -> Dict[str, Dict]:
|
||||
def _parse_doc_content_for_apis(self, content: str, source_url: str) -> dict[str, dict]:
|
||||
"""
|
||||
Parse documentation content to extract API signatures.
|
||||
|
||||
@@ -121,13 +123,13 @@ class ConflictDetector:
|
||||
# Pattern for common API signatures
|
||||
patterns = [
|
||||
# Python style: def name(params) -> return
|
||||
r'def\s+(\w+)\s*\(([^)]*)\)(?:\s*->\s*(\w+))?',
|
||||
r"def\s+(\w+)\s*\(([^)]*)\)(?:\s*->\s*(\w+))?",
|
||||
# JavaScript style: function name(params)
|
||||
r'function\s+(\w+)\s*\(([^)]*)\)',
|
||||
r"function\s+(\w+)\s*\(([^)]*)\)",
|
||||
# C++ style: return_type name(params)
|
||||
r'(\w+)\s+(\w+)\s*\(([^)]*)\)',
|
||||
r"(\w+)\s+(\w+)\s*\(([^)]*)\)",
|
||||
# Method style: ClassName.method_name(params)
|
||||
r'(\w+)\.(\w+)\s*\(([^)]*)\)'
|
||||
r"(\w+)\.(\w+)\s*\(([^)]*)\)",
|
||||
]
|
||||
|
||||
for pattern in patterns:
|
||||
@@ -135,17 +137,17 @@ class ConflictDetector:
|
||||
groups = match.groups()
|
||||
|
||||
# Parse based on pattern matched
|
||||
if 'def' in pattern:
|
||||
if "def" in pattern:
|
||||
# Python function
|
||||
name = groups[0]
|
||||
params_str = groups[1]
|
||||
return_type = groups[2] if len(groups) > 2 else None
|
||||
elif 'function' in pattern:
|
||||
elif "function" in pattern:
|
||||
# JavaScript function
|
||||
name = groups[0]
|
||||
params_str = groups[1]
|
||||
return_type = None
|
||||
elif '.' in pattern:
|
||||
elif "." in pattern:
|
||||
# Class method
|
||||
class_name = groups[0]
|
||||
method_name = groups[1]
|
||||
@@ -162,54 +164,54 @@ class ConflictDetector:
|
||||
params = self._parse_param_string(params_str)
|
||||
|
||||
apis[name] = {
|
||||
'name': name,
|
||||
'parameters': params,
|
||||
'return_type': return_type,
|
||||
'source': source_url,
|
||||
'raw_signature': match.group(0)
|
||||
"name": name,
|
||||
"parameters": params,
|
||||
"return_type": return_type,
|
||||
"source": source_url,
|
||||
"raw_signature": match.group(0),
|
||||
}
|
||||
|
||||
return apis
|
||||
|
||||
def _parse_param_string(self, params_str: str) -> List[Dict]:
|
||||
def _parse_param_string(self, params_str: str) -> list[dict]:
|
||||
"""Parse parameter string into list of parameter dicts."""
|
||||
if not params_str.strip():
|
||||
return []
|
||||
|
||||
params = []
|
||||
for param in params_str.split(','):
|
||||
for param in params_str.split(","):
|
||||
param = param.strip()
|
||||
if not param:
|
||||
continue
|
||||
|
||||
# Try to extract name and type
|
||||
param_info = {'name': param, 'type': None, 'default': None}
|
||||
param_info = {"name": param, "type": None, "default": None}
|
||||
|
||||
# Check for type annotation (: type)
|
||||
if ':' in param:
|
||||
parts = param.split(':', 1)
|
||||
param_info['name'] = parts[0].strip()
|
||||
if ":" in param:
|
||||
parts = param.split(":", 1)
|
||||
param_info["name"] = parts[0].strip()
|
||||
type_part = parts[1].strip()
|
||||
|
||||
# Check for default value (= value)
|
||||
if '=' in type_part:
|
||||
type_str, default_str = type_part.split('=', 1)
|
||||
param_info['type'] = type_str.strip()
|
||||
param_info['default'] = default_str.strip()
|
||||
if "=" in type_part:
|
||||
type_str, default_str = type_part.split("=", 1)
|
||||
param_info["type"] = type_str.strip()
|
||||
param_info["default"] = default_str.strip()
|
||||
else:
|
||||
param_info['type'] = type_part
|
||||
param_info["type"] = type_part
|
||||
|
||||
# Check for default without type (= value)
|
||||
elif '=' in param:
|
||||
parts = param.split('=', 1)
|
||||
param_info['name'] = parts[0].strip()
|
||||
param_info['default'] = parts[1].strip()
|
||||
elif "=" in param:
|
||||
parts = param.split("=", 1)
|
||||
param_info["name"] = parts[0].strip()
|
||||
param_info["default"] = parts[1].strip()
|
||||
|
||||
params.append(param_info)
|
||||
|
||||
return params
|
||||
|
||||
def _extract_code_apis(self) -> Dict[str, Dict[str, Any]]:
|
||||
def _extract_code_apis(self) -> dict[str, dict[str, Any]]:
|
||||
"""
|
||||
Extract API information from GitHub code analysis.
|
||||
|
||||
@@ -218,61 +220,61 @@ class ConflictDetector:
|
||||
"""
|
||||
apis = {}
|
||||
|
||||
code_analysis = self.github_data.get('code_analysis', {})
|
||||
code_analysis = self.github_data.get("code_analysis", {})
|
||||
if not code_analysis:
|
||||
return apis
|
||||
|
||||
# Support both 'files' and 'analyzed_files' keys
|
||||
files = code_analysis.get('files', code_analysis.get('analyzed_files', []))
|
||||
files = code_analysis.get("files", code_analysis.get("analyzed_files", []))
|
||||
|
||||
for file_info in files:
|
||||
file_path = file_info.get('file', 'unknown')
|
||||
file_path = file_info.get("file", "unknown")
|
||||
|
||||
# Extract classes and their methods
|
||||
for class_info in file_info.get('classes', []):
|
||||
class_name = class_info['name']
|
||||
for class_info in file_info.get("classes", []):
|
||||
class_name = class_info["name"]
|
||||
|
||||
# Add class itself
|
||||
apis[class_name] = {
|
||||
'name': class_name,
|
||||
'type': 'class',
|
||||
'source': file_path,
|
||||
'line': class_info.get('line_number'),
|
||||
'base_classes': class_info.get('base_classes', []),
|
||||
'docstring': class_info.get('docstring')
|
||||
"name": class_name,
|
||||
"type": "class",
|
||||
"source": file_path,
|
||||
"line": class_info.get("line_number"),
|
||||
"base_classes": class_info.get("base_classes", []),
|
||||
"docstring": class_info.get("docstring"),
|
||||
}
|
||||
|
||||
# Add methods
|
||||
for method in class_info.get('methods', []):
|
||||
for method in class_info.get("methods", []):
|
||||
method_name = f"{class_name}.{method['name']}"
|
||||
apis[method_name] = {
|
||||
'name': method_name,
|
||||
'type': 'method',
|
||||
'parameters': method.get('parameters', []),
|
||||
'return_type': method.get('return_type'),
|
||||
'source': file_path,
|
||||
'line': method.get('line_number'),
|
||||
'docstring': method.get('docstring'),
|
||||
'is_async': method.get('is_async', False)
|
||||
"name": method_name,
|
||||
"type": "method",
|
||||
"parameters": method.get("parameters", []),
|
||||
"return_type": method.get("return_type"),
|
||||
"source": file_path,
|
||||
"line": method.get("line_number"),
|
||||
"docstring": method.get("docstring"),
|
||||
"is_async": method.get("is_async", False),
|
||||
}
|
||||
|
||||
# Extract standalone functions
|
||||
for func_info in file_info.get('functions', []):
|
||||
func_name = func_info['name']
|
||||
for func_info in file_info.get("functions", []):
|
||||
func_name = func_info["name"]
|
||||
apis[func_name] = {
|
||||
'name': func_name,
|
||||
'type': 'function',
|
||||
'parameters': func_info.get('parameters', []),
|
||||
'return_type': func_info.get('return_type'),
|
||||
'source': file_path,
|
||||
'line': func_info.get('line_number'),
|
||||
'docstring': func_info.get('docstring'),
|
||||
'is_async': func_info.get('is_async', False)
|
||||
"name": func_name,
|
||||
"type": "function",
|
||||
"parameters": func_info.get("parameters", []),
|
||||
"return_type": func_info.get("return_type"),
|
||||
"source": file_path,
|
||||
"line": func_info.get("line_number"),
|
||||
"docstring": func_info.get("docstring"),
|
||||
"is_async": func_info.get("is_async", False),
|
||||
}
|
||||
|
||||
return apis
|
||||
|
||||
def detect_all_conflicts(self) -> List[Conflict]:
|
||||
def detect_all_conflicts(self) -> list[Conflict]:
|
||||
"""
|
||||
Detect all types of conflicts.
|
||||
|
||||
@@ -296,7 +298,7 @@ class ConflictDetector:
|
||||
|
||||
return conflicts
|
||||
|
||||
def _find_missing_in_docs(self) -> List[Conflict]:
|
||||
def _find_missing_in_docs(self) -> list[Conflict]:
|
||||
"""Find APIs that exist in code but not in documentation."""
|
||||
conflicts = []
|
||||
|
||||
@@ -304,40 +306,46 @@ class ConflictDetector:
|
||||
# Simple name matching (can be enhanced with fuzzy matching)
|
||||
if api_name not in self.docs_apis:
|
||||
# Check if it's a private/internal API (often not documented)
|
||||
is_private = api_name.startswith('_') or '__' in api_name
|
||||
severity = 'low' if is_private else 'medium'
|
||||
is_private = api_name.startswith("_") or "__" in api_name
|
||||
severity = "low" if is_private else "medium"
|
||||
|
||||
conflicts.append(Conflict(
|
||||
type='missing_in_docs',
|
||||
severity=severity,
|
||||
api_name=api_name,
|
||||
code_info=code_info,
|
||||
difference=f"API exists in code ({code_info['source']}) but not found in documentation",
|
||||
suggestion="Add documentation for this API" if not is_private else "Consider if this internal API should be documented"
|
||||
))
|
||||
conflicts.append(
|
||||
Conflict(
|
||||
type="missing_in_docs",
|
||||
severity=severity,
|
||||
api_name=api_name,
|
||||
code_info=code_info,
|
||||
difference=f"API exists in code ({code_info['source']}) but not found in documentation",
|
||||
suggestion="Add documentation for this API"
|
||||
if not is_private
|
||||
else "Consider if this internal API should be documented",
|
||||
)
|
||||
)
|
||||
|
||||
logger.info(f"Found {len(conflicts)} APIs missing in documentation")
|
||||
return conflicts
|
||||
|
||||
def _find_missing_in_code(self) -> List[Conflict]:
|
||||
def _find_missing_in_code(self) -> list[Conflict]:
|
||||
"""Find APIs that are documented but don't exist in code."""
|
||||
conflicts = []
|
||||
|
||||
for api_name, docs_info in self.docs_apis.items():
|
||||
if api_name not in self.code_apis:
|
||||
conflicts.append(Conflict(
|
||||
type='missing_in_code',
|
||||
severity='high', # This is serious - documented but doesn't exist
|
||||
api_name=api_name,
|
||||
docs_info=docs_info,
|
||||
difference=f"API documented ({docs_info.get('source', 'unknown')}) but not found in code",
|
||||
suggestion="Update documentation to remove this API, or add it to codebase"
|
||||
))
|
||||
conflicts.append(
|
||||
Conflict(
|
||||
type="missing_in_code",
|
||||
severity="high", # This is serious - documented but doesn't exist
|
||||
api_name=api_name,
|
||||
docs_info=docs_info,
|
||||
difference=f"API documented ({docs_info.get('source', 'unknown')}) but not found in code",
|
||||
suggestion="Update documentation to remove this API, or add it to codebase",
|
||||
)
|
||||
)
|
||||
|
||||
logger.info(f"Found {len(conflicts)} APIs missing in code")
|
||||
return conflicts
|
||||
|
||||
def _find_signature_mismatches(self) -> List[Conflict]:
|
||||
def _find_signature_mismatches(self) -> list[Conflict]:
|
||||
"""Find APIs where signature differs between docs and code."""
|
||||
conflicts = []
|
||||
|
||||
@@ -352,41 +360,43 @@ class ConflictDetector:
|
||||
mismatch = self._compare_signatures(docs_info, code_info)
|
||||
|
||||
if mismatch:
|
||||
conflicts.append(Conflict(
|
||||
type='signature_mismatch',
|
||||
severity=mismatch['severity'],
|
||||
api_name=api_name,
|
||||
docs_info=docs_info,
|
||||
code_info=code_info,
|
||||
difference=mismatch['difference'],
|
||||
suggestion=mismatch['suggestion']
|
||||
))
|
||||
conflicts.append(
|
||||
Conflict(
|
||||
type="signature_mismatch",
|
||||
severity=mismatch["severity"],
|
||||
api_name=api_name,
|
||||
docs_info=docs_info,
|
||||
code_info=code_info,
|
||||
difference=mismatch["difference"],
|
||||
suggestion=mismatch["suggestion"],
|
||||
)
|
||||
)
|
||||
|
||||
logger.info(f"Found {len(conflicts)} signature mismatches")
|
||||
return conflicts
|
||||
|
||||
def _compare_signatures(self, docs_info: Dict, code_info: Dict) -> Optional[Dict]:
|
||||
def _compare_signatures(self, docs_info: dict, code_info: dict) -> dict | None:
|
||||
"""
|
||||
Compare signatures between docs and code.
|
||||
|
||||
Returns:
|
||||
Dict with mismatch details if conflict found, None otherwise
|
||||
"""
|
||||
docs_params = docs_info.get('parameters', [])
|
||||
code_params = code_info.get('parameters', [])
|
||||
docs_params = docs_info.get("parameters", [])
|
||||
code_params = code_info.get("parameters", [])
|
||||
|
||||
# Compare parameter counts
|
||||
if len(docs_params) != len(code_params):
|
||||
return {
|
||||
'severity': 'medium',
|
||||
'difference': f"Parameter count mismatch: docs has {len(docs_params)}, code has {len(code_params)}",
|
||||
'suggestion': f"Documentation shows {len(docs_params)} parameters, but code has {len(code_params)}"
|
||||
"severity": "medium",
|
||||
"difference": f"Parameter count mismatch: docs has {len(docs_params)}, code has {len(code_params)}",
|
||||
"suggestion": f"Documentation shows {len(docs_params)} parameters, but code has {len(code_params)}",
|
||||
}
|
||||
|
||||
# Compare parameter names and types
|
||||
for i, (doc_param, code_param) in enumerate(zip(docs_params, code_params)):
|
||||
doc_name = doc_param.get('name', '')
|
||||
code_name = code_param.get('name', '')
|
||||
doc_name = doc_param.get("name", "")
|
||||
code_name = code_param.get("name", "")
|
||||
|
||||
# Parameter name mismatch
|
||||
if doc_name != code_name:
|
||||
@@ -394,36 +404,36 @@ class ConflictDetector:
|
||||
similarity = SequenceMatcher(None, doc_name, code_name).ratio()
|
||||
if similarity < 0.8: # Not similar enough
|
||||
return {
|
||||
'severity': 'medium',
|
||||
'difference': f"Parameter {i+1} name mismatch: '{doc_name}' in docs vs '{code_name}' in code",
|
||||
'suggestion': f"Update documentation to use parameter name '{code_name}'"
|
||||
"severity": "medium",
|
||||
"difference": f"Parameter {i + 1} name mismatch: '{doc_name}' in docs vs '{code_name}' in code",
|
||||
"suggestion": f"Update documentation to use parameter name '{code_name}'",
|
||||
}
|
||||
|
||||
# Type mismatch
|
||||
doc_type = doc_param.get('type')
|
||||
code_type = code_param.get('type_hint')
|
||||
doc_type = doc_param.get("type")
|
||||
code_type = code_param.get("type_hint")
|
||||
|
||||
if doc_type and code_type and doc_type != code_type:
|
||||
return {
|
||||
'severity': 'low',
|
||||
'difference': f"Parameter '{doc_name}' type mismatch: '{doc_type}' in docs vs '{code_type}' in code",
|
||||
'suggestion': f"Verify correct type for parameter '{doc_name}'"
|
||||
"severity": "low",
|
||||
"difference": f"Parameter '{doc_name}' type mismatch: '{doc_type}' in docs vs '{code_type}' in code",
|
||||
"suggestion": f"Verify correct type for parameter '{doc_name}'",
|
||||
}
|
||||
|
||||
# Compare return types if both have them
|
||||
docs_return = docs_info.get('return_type')
|
||||
code_return = code_info.get('return_type')
|
||||
docs_return = docs_info.get("return_type")
|
||||
code_return = code_info.get("return_type")
|
||||
|
||||
if docs_return and code_return and docs_return != code_return:
|
||||
return {
|
||||
'severity': 'low',
|
||||
'difference': f"Return type mismatch: '{docs_return}' in docs vs '{code_return}' in code",
|
||||
'suggestion': "Verify correct return type"
|
||||
"severity": "low",
|
||||
"difference": f"Return type mismatch: '{docs_return}' in docs vs '{code_return}' in code",
|
||||
"suggestion": "Verify correct return type",
|
||||
}
|
||||
|
||||
return None
|
||||
|
||||
def generate_summary(self, conflicts: List[Conflict]) -> Dict[str, Any]:
|
||||
def generate_summary(self, conflicts: list[Conflict]) -> dict[str, Any]:
|
||||
"""
|
||||
Generate summary statistics for conflicts.
|
||||
|
||||
@@ -434,25 +444,25 @@ class ConflictDetector:
|
||||
Summary dict with statistics
|
||||
"""
|
||||
summary = {
|
||||
'total': len(conflicts),
|
||||
'by_type': {},
|
||||
'by_severity': {},
|
||||
'apis_affected': len(set(c.api_name for c in conflicts))
|
||||
"total": len(conflicts),
|
||||
"by_type": {},
|
||||
"by_severity": {},
|
||||
"apis_affected": len(set(c.api_name for c in conflicts)),
|
||||
}
|
||||
|
||||
# Count by type
|
||||
for conflict_type in ['missing_in_docs', 'missing_in_code', 'signature_mismatch', 'description_mismatch']:
|
||||
for conflict_type in ["missing_in_docs", "missing_in_code", "signature_mismatch", "description_mismatch"]:
|
||||
count = sum(1 for c in conflicts if c.type == conflict_type)
|
||||
summary['by_type'][conflict_type] = count
|
||||
summary["by_type"][conflict_type] = count
|
||||
|
||||
# Count by severity
|
||||
for severity in ['low', 'medium', 'high']:
|
||||
for severity in ["low", "medium", "high"]:
|
||||
count = sum(1 for c in conflicts if c.severity == severity)
|
||||
summary['by_severity'][severity] = count
|
||||
summary["by_severity"][severity] = count
|
||||
|
||||
return summary
|
||||
|
||||
def save_conflicts(self, conflicts: List[Conflict], output_path: str):
|
||||
def save_conflicts(self, conflicts: list[Conflict], output_path: str):
|
||||
"""
|
||||
Save conflicts to JSON file.
|
||||
|
||||
@@ -460,18 +470,15 @@ class ConflictDetector:
|
||||
conflicts: List of Conflict objects
|
||||
output_path: Path to output JSON file
|
||||
"""
|
||||
data = {
|
||||
'conflicts': [asdict(c) for c in conflicts],
|
||||
'summary': self.generate_summary(conflicts)
|
||||
}
|
||||
data = {"conflicts": [asdict(c) for c in conflicts], "summary": self.generate_summary(conflicts)}
|
||||
|
||||
with open(output_path, 'w', encoding='utf-8') as f:
|
||||
with open(output_path, "w", encoding="utf-8") as f:
|
||||
json.dump(data, f, indent=2, ensure_ascii=False)
|
||||
|
||||
logger.info(f"Conflicts saved to: {output_path}")
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
import sys
|
||||
|
||||
if len(sys.argv) < 3:
|
||||
@@ -482,10 +489,10 @@ if __name__ == '__main__':
|
||||
github_file = sys.argv[2]
|
||||
|
||||
# Load data
|
||||
with open(docs_file, 'r') as f:
|
||||
with open(docs_file) as f:
|
||||
docs_data = json.load(f)
|
||||
|
||||
with open(github_file, 'r') as f:
|
||||
with open(github_file) as f:
|
||||
github_data = json.load(f)
|
||||
|
||||
# Detect conflicts
|
||||
@@ -498,16 +505,16 @@ if __name__ == '__main__':
|
||||
print(f" Total conflicts: {summary['total']}")
|
||||
print(f" APIs affected: {summary['apis_affected']}")
|
||||
print("\n By Type:")
|
||||
for conflict_type, count in summary['by_type'].items():
|
||||
for conflict_type, count in summary["by_type"].items():
|
||||
if count > 0:
|
||||
print(f" {conflict_type}: {count}")
|
||||
print("\n By Severity:")
|
||||
for severity, count in summary['by_severity'].items():
|
||||
for severity, count in summary["by_severity"].items():
|
||||
if count > 0:
|
||||
emoji = '🔴' if severity == 'high' else '🟡' if severity == 'medium' else '🟢'
|
||||
emoji = "🔴" if severity == "high" else "🟡" if severity == "medium" else "🟢"
|
||||
print(f" {emoji} {severity}: {count}")
|
||||
|
||||
# Save to file
|
||||
output_file = 'conflicts.json'
|
||||
output_file = "conflicts.json"
|
||||
detector.save_conflicts(conflicts, output_file)
|
||||
print(f"\n✅ Full report saved to: {output_file}")
|
||||
|
||||
@@ -8,7 +8,7 @@ across the CLI tools to improve maintainability and clarity.
|
||||
|
||||
# Default scraping limits
|
||||
DEFAULT_RATE_LIMIT = 0.5 # seconds between requests
|
||||
DEFAULT_MAX_PAGES = 500 # maximum pages to scrape
|
||||
DEFAULT_MAX_PAGES = 500 # maximum pages to scrape
|
||||
DEFAULT_CHECKPOINT_INTERVAL = 1000 # pages between checkpoints
|
||||
DEFAULT_ASYNC_MODE = False # use async mode for parallel scraping (opt-in)
|
||||
|
||||
@@ -26,7 +26,7 @@ CONTENT_MATCH_POINTS = 1 # points for content keyword match
|
||||
|
||||
# API-based enhancement limits (uses Anthropic API)
|
||||
API_CONTENT_LIMIT = 100000 # max characters for API enhancement
|
||||
API_PREVIEW_LIMIT = 40000 # max characters for preview
|
||||
API_PREVIEW_LIMIT = 40000 # max characters for preview
|
||||
|
||||
# Local enhancement limits (uses Claude Code Max)
|
||||
LOCAL_CONTENT_LIMIT = 50000 # max characters for local enhancement
|
||||
@@ -36,7 +36,7 @@ LOCAL_PREVIEW_LIMIT = 20000 # max characters for preview
|
||||
|
||||
# Estimation and discovery settings
|
||||
DEFAULT_MAX_DISCOVERY = 1000 # default max pages to discover
|
||||
DISCOVERY_THRESHOLD = 10000 # threshold for warnings
|
||||
DISCOVERY_THRESHOLD = 10000 # threshold for warnings
|
||||
|
||||
# ===== FILE LIMITS =====
|
||||
|
||||
@@ -48,25 +48,25 @@ MAX_CODE_BLOCKS_PER_PAGE = 5 # maximum code blocks to extract per page
|
||||
|
||||
__all__ = [
|
||||
# Scraping
|
||||
'DEFAULT_RATE_LIMIT',
|
||||
'DEFAULT_MAX_PAGES',
|
||||
'DEFAULT_CHECKPOINT_INTERVAL',
|
||||
'DEFAULT_ASYNC_MODE',
|
||||
'CONTENT_PREVIEW_LENGTH',
|
||||
'MAX_PAGES_WARNING_THRESHOLD',
|
||||
'MIN_CATEGORIZATION_SCORE',
|
||||
'URL_MATCH_POINTS',
|
||||
'TITLE_MATCH_POINTS',
|
||||
'CONTENT_MATCH_POINTS',
|
||||
"DEFAULT_RATE_LIMIT",
|
||||
"DEFAULT_MAX_PAGES",
|
||||
"DEFAULT_CHECKPOINT_INTERVAL",
|
||||
"DEFAULT_ASYNC_MODE",
|
||||
"CONTENT_PREVIEW_LENGTH",
|
||||
"MAX_PAGES_WARNING_THRESHOLD",
|
||||
"MIN_CATEGORIZATION_SCORE",
|
||||
"URL_MATCH_POINTS",
|
||||
"TITLE_MATCH_POINTS",
|
||||
"CONTENT_MATCH_POINTS",
|
||||
# Enhancement
|
||||
'API_CONTENT_LIMIT',
|
||||
'API_PREVIEW_LIMIT',
|
||||
'LOCAL_CONTENT_LIMIT',
|
||||
'LOCAL_PREVIEW_LIMIT',
|
||||
"API_CONTENT_LIMIT",
|
||||
"API_PREVIEW_LIMIT",
|
||||
"LOCAL_CONTENT_LIMIT",
|
||||
"LOCAL_PREVIEW_LIMIT",
|
||||
# Estimation
|
||||
'DEFAULT_MAX_DISCOVERY',
|
||||
'DISCOVERY_THRESHOLD',
|
||||
"DEFAULT_MAX_DISCOVERY",
|
||||
"DISCOVERY_THRESHOLD",
|
||||
# Limits
|
||||
'MAX_REFERENCE_FILES',
|
||||
'MAX_CODE_BLOCKS_PER_PAGE',
|
||||
"MAX_REFERENCE_FILES",
|
||||
"MAX_CODE_BLOCKS_PER_PAGE",
|
||||
]
|
||||
|
||||
@@ -37,15 +37,16 @@ Credits:
|
||||
- NetworkX for graph algorithms: https://networkx.org/
|
||||
"""
|
||||
|
||||
import re
|
||||
import ast
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Set, Tuple, Optional, Any
|
||||
import re
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
try:
|
||||
import networkx as nx
|
||||
|
||||
NETWORKX_AVAILABLE = True
|
||||
except ImportError:
|
||||
NETWORKX_AVAILABLE = False
|
||||
@@ -56,6 +57,7 @@ logger = logging.getLogger(__name__)
|
||||
@dataclass
|
||||
class DependencyInfo:
|
||||
"""Information about a single dependency relationship."""
|
||||
|
||||
source_file: str
|
||||
imported_module: str
|
||||
import_type: str # 'import', 'from', 'require', 'include'
|
||||
@@ -66,10 +68,11 @@ class DependencyInfo:
|
||||
@dataclass
|
||||
class FileNode:
|
||||
"""Represents a file node in the dependency graph."""
|
||||
|
||||
file_path: str
|
||||
language: str
|
||||
dependencies: List[str] = field(default_factory=list)
|
||||
imported_by: List[str] = field(default_factory=list)
|
||||
dependencies: list[str] = field(default_factory=list)
|
||||
imported_by: list[str] = field(default_factory=list)
|
||||
|
||||
|
||||
class DependencyAnalyzer:
|
||||
@@ -83,16 +86,13 @@ class DependencyAnalyzer:
|
||||
def __init__(self):
|
||||
"""Initialize dependency analyzer."""
|
||||
if not NETWORKX_AVAILABLE:
|
||||
raise ImportError(
|
||||
"NetworkX is required for dependency analysis. "
|
||||
"Install with: pip install networkx"
|
||||
)
|
||||
raise ImportError("NetworkX is required for dependency analysis. Install with: pip install networkx")
|
||||
|
||||
self.graph = nx.DiGraph() # Directed graph for dependencies
|
||||
self.file_dependencies: Dict[str, List[DependencyInfo]] = {}
|
||||
self.file_nodes: Dict[str, FileNode] = {}
|
||||
self.file_dependencies: dict[str, list[DependencyInfo]] = {}
|
||||
self.file_nodes: dict[str, FileNode] = {}
|
||||
|
||||
def analyze_file(self, file_path: str, content: str, language: str) -> List[DependencyInfo]:
|
||||
def analyze_file(self, file_path: str, content: str, language: str) -> list[DependencyInfo]:
|
||||
"""
|
||||
Extract dependencies from a source file.
|
||||
|
||||
@@ -104,23 +104,23 @@ class DependencyAnalyzer:
|
||||
Returns:
|
||||
List of DependencyInfo objects
|
||||
"""
|
||||
if language == 'Python':
|
||||
if language == "Python":
|
||||
deps = self._extract_python_imports(content, file_path)
|
||||
elif language in ('JavaScript', 'TypeScript'):
|
||||
elif language in ("JavaScript", "TypeScript"):
|
||||
deps = self._extract_js_imports(content, file_path)
|
||||
elif language in ('C++', 'C'):
|
||||
elif language in ("C++", "C"):
|
||||
deps = self._extract_cpp_includes(content, file_path)
|
||||
elif language == 'C#':
|
||||
elif language == "C#":
|
||||
deps = self._extract_csharp_imports(content, file_path)
|
||||
elif language == 'Go':
|
||||
elif language == "Go":
|
||||
deps = self._extract_go_imports(content, file_path)
|
||||
elif language == 'Rust':
|
||||
elif language == "Rust":
|
||||
deps = self._extract_rust_imports(content, file_path)
|
||||
elif language == 'Java':
|
||||
elif language == "Java":
|
||||
deps = self._extract_java_imports(content, file_path)
|
||||
elif language == 'Ruby':
|
||||
elif language == "Ruby":
|
||||
deps = self._extract_ruby_imports(content, file_path)
|
||||
elif language == 'PHP':
|
||||
elif language == "PHP":
|
||||
deps = self._extract_php_imports(content, file_path)
|
||||
else:
|
||||
logger.warning(f"Unsupported language: {language}")
|
||||
@@ -130,15 +130,11 @@ class DependencyAnalyzer:
|
||||
|
||||
# Create file node
|
||||
imported_modules = [dep.imported_module for dep in deps]
|
||||
self.file_nodes[file_path] = FileNode(
|
||||
file_path=file_path,
|
||||
language=language,
|
||||
dependencies=imported_modules
|
||||
)
|
||||
self.file_nodes[file_path] = FileNode(file_path=file_path, language=language, dependencies=imported_modules)
|
||||
|
||||
return deps
|
||||
|
||||
def _extract_python_imports(self, content: str, file_path: str) -> List[DependencyInfo]:
|
||||
def _extract_python_imports(self, content: str, file_path: str) -> list[DependencyInfo]:
|
||||
"""
|
||||
Extract Python import statements using AST.
|
||||
|
||||
@@ -159,33 +155,37 @@ class DependencyAnalyzer:
|
||||
for node in ast.walk(tree):
|
||||
if isinstance(node, ast.Import):
|
||||
for alias in node.names:
|
||||
deps.append(DependencyInfo(
|
||||
source_file=file_path,
|
||||
imported_module=alias.name,
|
||||
import_type='import',
|
||||
is_relative=False,
|
||||
line_number=node.lineno
|
||||
))
|
||||
deps.append(
|
||||
DependencyInfo(
|
||||
source_file=file_path,
|
||||
imported_module=alias.name,
|
||||
import_type="import",
|
||||
is_relative=False,
|
||||
line_number=node.lineno,
|
||||
)
|
||||
)
|
||||
|
||||
elif isinstance(node, ast.ImportFrom):
|
||||
module = node.module or ''
|
||||
module = node.module or ""
|
||||
is_relative = node.level > 0
|
||||
|
||||
# Handle relative imports
|
||||
if is_relative:
|
||||
module = '.' * node.level + module
|
||||
module = "." * node.level + module
|
||||
|
||||
deps.append(DependencyInfo(
|
||||
source_file=file_path,
|
||||
imported_module=module,
|
||||
import_type='from',
|
||||
is_relative=is_relative,
|
||||
line_number=node.lineno
|
||||
))
|
||||
deps.append(
|
||||
DependencyInfo(
|
||||
source_file=file_path,
|
||||
imported_module=module,
|
||||
import_type="from",
|
||||
is_relative=is_relative,
|
||||
line_number=node.lineno,
|
||||
)
|
||||
)
|
||||
|
||||
return deps
|
||||
|
||||
def _extract_js_imports(self, content: str, file_path: str) -> List[DependencyInfo]:
|
||||
def _extract_js_imports(self, content: str, file_path: str) -> list[DependencyInfo]:
|
||||
"""
|
||||
Extract JavaScript/TypeScript import statements.
|
||||
|
||||
@@ -202,35 +202,39 @@ class DependencyAnalyzer:
|
||||
import_pattern = r"import\s+(?:[\w\s{},*]+\s+from\s+)?['\"]([^'\"]+)['\"]"
|
||||
for match in re.finditer(import_pattern, content):
|
||||
module = match.group(1)
|
||||
line_num = content[:match.start()].count('\n') + 1
|
||||
is_relative = module.startswith('.') or module.startswith('/')
|
||||
line_num = content[: match.start()].count("\n") + 1
|
||||
is_relative = module.startswith(".") or module.startswith("/")
|
||||
|
||||
deps.append(DependencyInfo(
|
||||
source_file=file_path,
|
||||
imported_module=module,
|
||||
import_type='import',
|
||||
is_relative=is_relative,
|
||||
line_number=line_num
|
||||
))
|
||||
deps.append(
|
||||
DependencyInfo(
|
||||
source_file=file_path,
|
||||
imported_module=module,
|
||||
import_type="import",
|
||||
is_relative=is_relative,
|
||||
line_number=line_num,
|
||||
)
|
||||
)
|
||||
|
||||
# CommonJS requires: require('module')
|
||||
require_pattern = r"require\s*\(['\"]([^'\"]+)['\"]\)"
|
||||
for match in re.finditer(require_pattern, content):
|
||||
module = match.group(1)
|
||||
line_num = content[:match.start()].count('\n') + 1
|
||||
is_relative = module.startswith('.') or module.startswith('/')
|
||||
line_num = content[: match.start()].count("\n") + 1
|
||||
is_relative = module.startswith(".") or module.startswith("/")
|
||||
|
||||
deps.append(DependencyInfo(
|
||||
source_file=file_path,
|
||||
imported_module=module,
|
||||
import_type='require',
|
||||
is_relative=is_relative,
|
||||
line_number=line_num
|
||||
))
|
||||
deps.append(
|
||||
DependencyInfo(
|
||||
source_file=file_path,
|
||||
imported_module=module,
|
||||
import_type="require",
|
||||
is_relative=is_relative,
|
||||
line_number=line_num,
|
||||
)
|
||||
)
|
||||
|
||||
return deps
|
||||
|
||||
def _extract_cpp_includes(self, content: str, file_path: str) -> List[DependencyInfo]:
|
||||
def _extract_cpp_includes(self, content: str, file_path: str) -> list[DependencyInfo]:
|
||||
"""
|
||||
Extract C++ #include directives.
|
||||
|
||||
@@ -244,22 +248,24 @@ class DependencyAnalyzer:
|
||||
include_pattern = r'#include\s+[<"]([^>"]+)[>"]'
|
||||
for match in re.finditer(include_pattern, content):
|
||||
header = match.group(1)
|
||||
line_num = content[:match.start()].count('\n') + 1
|
||||
line_num = content[: match.start()].count("\n") + 1
|
||||
|
||||
# Headers with "" are usually local, <> are system headers
|
||||
is_relative = '"' in match.group(0)
|
||||
|
||||
deps.append(DependencyInfo(
|
||||
source_file=file_path,
|
||||
imported_module=header,
|
||||
import_type='include',
|
||||
is_relative=is_relative,
|
||||
line_number=line_num
|
||||
))
|
||||
deps.append(
|
||||
DependencyInfo(
|
||||
source_file=file_path,
|
||||
imported_module=header,
|
||||
import_type="include",
|
||||
is_relative=is_relative,
|
||||
line_number=line_num,
|
||||
)
|
||||
)
|
||||
|
||||
return deps
|
||||
|
||||
def _extract_csharp_imports(self, content: str, file_path: str) -> List[DependencyInfo]:
|
||||
def _extract_csharp_imports(self, content: str, file_path: str) -> list[DependencyInfo]:
|
||||
"""
|
||||
Extract C# using statements.
|
||||
|
||||
@@ -275,27 +281,29 @@ class DependencyAnalyzer:
|
||||
deps = []
|
||||
|
||||
# Match using statements: using [static] Namespace[.Type];
|
||||
using_pattern = r'using\s+(?:static\s+)?(?:(\w+)\s*=\s*)?([A-Za-z_][\w.]*)\s*;'
|
||||
using_pattern = r"using\s+(?:static\s+)?(?:(\w+)\s*=\s*)?([A-Za-z_][\w.]*)\s*;"
|
||||
for match in re.finditer(using_pattern, content):
|
||||
alias = match.group(1) # Optional alias
|
||||
namespace = match.group(2)
|
||||
line_num = content[:match.start()].count('\n') + 1
|
||||
line_num = content[: match.start()].count("\n") + 1
|
||||
|
||||
# Skip 'using' statements for IDisposable (using var x = ...)
|
||||
if '=' in match.group(0) and not alias:
|
||||
if "=" in match.group(0) and not alias:
|
||||
continue
|
||||
|
||||
deps.append(DependencyInfo(
|
||||
source_file=file_path,
|
||||
imported_module=namespace,
|
||||
import_type='using',
|
||||
is_relative=False, # C# uses absolute namespaces
|
||||
line_number=line_num
|
||||
))
|
||||
deps.append(
|
||||
DependencyInfo(
|
||||
source_file=file_path,
|
||||
imported_module=namespace,
|
||||
import_type="using",
|
||||
is_relative=False, # C# uses absolute namespaces
|
||||
line_number=line_num,
|
||||
)
|
||||
)
|
||||
|
||||
return deps
|
||||
|
||||
def _extract_go_imports(self, content: str, file_path: str) -> List[DependencyInfo]:
|
||||
def _extract_go_imports(self, content: str, file_path: str) -> list[DependencyInfo]:
|
||||
"""
|
||||
Extract Go import statements.
|
||||
|
||||
@@ -314,21 +322,23 @@ class DependencyAnalyzer:
|
||||
for match in re.finditer(single_import_pattern, content):
|
||||
alias = match.group(1) # Optional alias
|
||||
package = match.group(2)
|
||||
line_num = content[:match.start()].count('\n') + 1
|
||||
line_num = content[: match.start()].count("\n") + 1
|
||||
|
||||
# Check if relative (starts with ./ or ../)
|
||||
is_relative = package.startswith('./')
|
||||
is_relative = package.startswith("./")
|
||||
|
||||
deps.append(DependencyInfo(
|
||||
source_file=file_path,
|
||||
imported_module=package,
|
||||
import_type='import',
|
||||
is_relative=is_relative,
|
||||
line_number=line_num
|
||||
))
|
||||
deps.append(
|
||||
DependencyInfo(
|
||||
source_file=file_path,
|
||||
imported_module=package,
|
||||
import_type="import",
|
||||
is_relative=is_relative,
|
||||
line_number=line_num,
|
||||
)
|
||||
)
|
||||
|
||||
# Multi-import block: import ( ... )
|
||||
multi_import_pattern = r'import\s*\((.*?)\)'
|
||||
multi_import_pattern = r"import\s*\((.*?)\)"
|
||||
for match in re.finditer(multi_import_pattern, content, re.DOTALL):
|
||||
block = match.group(1)
|
||||
block_start = match.start()
|
||||
@@ -338,21 +348,23 @@ class DependencyAnalyzer:
|
||||
for line_match in re.finditer(import_line_pattern, block):
|
||||
alias = line_match.group(1)
|
||||
package = line_match.group(2)
|
||||
line_num = content[:block_start + line_match.start()].count('\n') + 1
|
||||
line_num = content[: block_start + line_match.start()].count("\n") + 1
|
||||
|
||||
is_relative = package.startswith('./')
|
||||
is_relative = package.startswith("./")
|
||||
|
||||
deps.append(DependencyInfo(
|
||||
source_file=file_path,
|
||||
imported_module=package,
|
||||
import_type='import',
|
||||
is_relative=is_relative,
|
||||
line_number=line_num
|
||||
))
|
||||
deps.append(
|
||||
DependencyInfo(
|
||||
source_file=file_path,
|
||||
imported_module=package,
|
||||
import_type="import",
|
||||
is_relative=is_relative,
|
||||
line_number=line_num,
|
||||
)
|
||||
)
|
||||
|
||||
return deps
|
||||
|
||||
def _extract_rust_imports(self, content: str, file_path: str) -> List[DependencyInfo]:
|
||||
def _extract_rust_imports(self, content: str, file_path: str) -> list[DependencyInfo]:
|
||||
"""
|
||||
Extract Rust use statements.
|
||||
|
||||
@@ -369,43 +381,47 @@ class DependencyAnalyzer:
|
||||
|
||||
# Match use statements: use path::to::item; (including curly braces with spaces)
|
||||
# This pattern matches: use word::word; or use word::{item, item};
|
||||
use_pattern = r'use\s+([\w:{}]+(?:\s*,\s*[\w:{}]+)*|[\w:]+::\{[^}]+\})\s*;'
|
||||
use_pattern = r"use\s+([\w:{}]+(?:\s*,\s*[\w:{}]+)*|[\w:]+::\{[^}]+\})\s*;"
|
||||
for match in re.finditer(use_pattern, content):
|
||||
module_path = match.group(1)
|
||||
line_num = content[:match.start()].count('\n') + 1
|
||||
line_num = content[: match.start()].count("\n") + 1
|
||||
|
||||
# Determine if relative
|
||||
is_relative = module_path.startswith(('self::', 'super::'))
|
||||
is_relative = module_path.startswith(("self::", "super::"))
|
||||
|
||||
# Handle curly brace imports (use std::{io, fs})
|
||||
if '{' in module_path:
|
||||
if "{" in module_path:
|
||||
# Extract base path
|
||||
base_path = module_path.split('{')[0].rstrip(':')
|
||||
base_path = module_path.split("{")[0].rstrip(":")
|
||||
# Extract items inside braces
|
||||
items_match = re.search(r'\{([^}]+)\}', module_path)
|
||||
items_match = re.search(r"\{([^}]+)\}", module_path)
|
||||
if items_match:
|
||||
items = [item.strip() for item in items_match.group(1).split(',')]
|
||||
items = [item.strip() for item in items_match.group(1).split(",")]
|
||||
for item in items:
|
||||
full_path = f"{base_path}::{item}" if base_path else item
|
||||
deps.append(DependencyInfo(
|
||||
source_file=file_path,
|
||||
imported_module=full_path,
|
||||
import_type='use',
|
||||
is_relative=is_relative,
|
||||
line_number=line_num
|
||||
))
|
||||
deps.append(
|
||||
DependencyInfo(
|
||||
source_file=file_path,
|
||||
imported_module=full_path,
|
||||
import_type="use",
|
||||
is_relative=is_relative,
|
||||
line_number=line_num,
|
||||
)
|
||||
)
|
||||
else:
|
||||
deps.append(DependencyInfo(
|
||||
source_file=file_path,
|
||||
imported_module=module_path,
|
||||
import_type='use',
|
||||
is_relative=is_relative,
|
||||
line_number=line_num
|
||||
))
|
||||
deps.append(
|
||||
DependencyInfo(
|
||||
source_file=file_path,
|
||||
imported_module=module_path,
|
||||
import_type="use",
|
||||
is_relative=is_relative,
|
||||
line_number=line_num,
|
||||
)
|
||||
)
|
||||
|
||||
return deps
|
||||
|
||||
def _extract_java_imports(self, content: str, file_path: str) -> List[DependencyInfo]:
|
||||
def _extract_java_imports(self, content: str, file_path: str) -> list[DependencyInfo]:
|
||||
"""
|
||||
Extract Java import statements.
|
||||
|
||||
@@ -420,22 +436,24 @@ class DependencyAnalyzer:
|
||||
deps = []
|
||||
|
||||
# Match import statements: import [static] package.Class;
|
||||
import_pattern = r'import\s+(?:static\s+)?([A-Za-z_][\w.]*(?:\.\*)?)\s*;'
|
||||
import_pattern = r"import\s+(?:static\s+)?([A-Za-z_][\w.]*(?:\.\*)?)\s*;"
|
||||
for match in re.finditer(import_pattern, content):
|
||||
import_path = match.group(1)
|
||||
line_num = content[:match.start()].count('\n') + 1
|
||||
line_num = content[: match.start()].count("\n") + 1
|
||||
|
||||
deps.append(DependencyInfo(
|
||||
source_file=file_path,
|
||||
imported_module=import_path,
|
||||
import_type='import',
|
||||
is_relative=False, # Java uses absolute package names
|
||||
line_number=line_num
|
||||
))
|
||||
deps.append(
|
||||
DependencyInfo(
|
||||
source_file=file_path,
|
||||
imported_module=import_path,
|
||||
import_type="import",
|
||||
is_relative=False, # Java uses absolute package names
|
||||
line_number=line_num,
|
||||
)
|
||||
)
|
||||
|
||||
return deps
|
||||
|
||||
def _extract_ruby_imports(self, content: str, file_path: str) -> List[DependencyInfo]:
|
||||
def _extract_ruby_imports(self, content: str, file_path: str) -> list[DependencyInfo]:
|
||||
"""
|
||||
Extract Ruby require/require_relative/load statements.
|
||||
|
||||
@@ -453,47 +471,53 @@ class DependencyAnalyzer:
|
||||
require_pattern = r"require\s+['\"]([^'\"]+)['\"]"
|
||||
for match in re.finditer(require_pattern, content):
|
||||
module = match.group(1)
|
||||
line_num = content[:match.start()].count('\n') + 1
|
||||
line_num = content[: match.start()].count("\n") + 1
|
||||
|
||||
deps.append(DependencyInfo(
|
||||
source_file=file_path,
|
||||
imported_module=module,
|
||||
import_type='require',
|
||||
is_relative=False, # require looks in load path
|
||||
line_number=line_num
|
||||
))
|
||||
deps.append(
|
||||
DependencyInfo(
|
||||
source_file=file_path,
|
||||
imported_module=module,
|
||||
import_type="require",
|
||||
is_relative=False, # require looks in load path
|
||||
line_number=line_num,
|
||||
)
|
||||
)
|
||||
|
||||
# Match require_relative: require_relative 'file'
|
||||
require_relative_pattern = r"require_relative\s+['\"]([^'\"]+)['\"]"
|
||||
for match in re.finditer(require_relative_pattern, content):
|
||||
module = match.group(1)
|
||||
line_num = content[:match.start()].count('\n') + 1
|
||||
line_num = content[: match.start()].count("\n") + 1
|
||||
|
||||
deps.append(DependencyInfo(
|
||||
source_file=file_path,
|
||||
imported_module=module,
|
||||
import_type='require_relative',
|
||||
is_relative=True,
|
||||
line_number=line_num
|
||||
))
|
||||
deps.append(
|
||||
DependencyInfo(
|
||||
source_file=file_path,
|
||||
imported_module=module,
|
||||
import_type="require_relative",
|
||||
is_relative=True,
|
||||
line_number=line_num,
|
||||
)
|
||||
)
|
||||
|
||||
# Match load: load 'script.rb'
|
||||
load_pattern = r"load\s+['\"]([^'\"]+)['\"]"
|
||||
for match in re.finditer(load_pattern, content):
|
||||
module = match.group(1)
|
||||
line_num = content[:match.start()].count('\n') + 1
|
||||
line_num = content[: match.start()].count("\n") + 1
|
||||
|
||||
deps.append(DependencyInfo(
|
||||
source_file=file_path,
|
||||
imported_module=module,
|
||||
import_type='load',
|
||||
is_relative=True, # load is usually relative
|
||||
line_number=line_num
|
||||
))
|
||||
deps.append(
|
||||
DependencyInfo(
|
||||
source_file=file_path,
|
||||
imported_module=module,
|
||||
import_type="load",
|
||||
is_relative=True, # load is usually relative
|
||||
line_number=line_num,
|
||||
)
|
||||
)
|
||||
|
||||
return deps
|
||||
|
||||
def _extract_php_imports(self, content: str, file_path: str) -> List[DependencyInfo]:
|
||||
def _extract_php_imports(self, content: str, file_path: str) -> list[DependencyInfo]:
|
||||
"""
|
||||
Extract PHP require/include/use statements.
|
||||
|
||||
@@ -513,35 +537,39 @@ class DependencyAnalyzer:
|
||||
require_pattern = r"(?:require|include)(?:_once)?\s+['\"]([^'\"]+)['\"]"
|
||||
for match in re.finditer(require_pattern, content):
|
||||
module = match.group(1)
|
||||
line_num = content[:match.start()].count('\n') + 1
|
||||
line_num = content[: match.start()].count("\n") + 1
|
||||
|
||||
# Determine import type
|
||||
import_type = 'require' if 'require' in match.group(0) else 'include'
|
||||
import_type = "require" if "require" in match.group(0) else "include"
|
||||
|
||||
# PHP file paths are relative by default
|
||||
is_relative = not module.startswith(('/', 'http://', 'https://'))
|
||||
is_relative = not module.startswith(("/", "http://", "https://"))
|
||||
|
||||
deps.append(DependencyInfo(
|
||||
source_file=file_path,
|
||||
imported_module=module,
|
||||
import_type=import_type,
|
||||
is_relative=is_relative,
|
||||
line_number=line_num
|
||||
))
|
||||
deps.append(
|
||||
DependencyInfo(
|
||||
source_file=file_path,
|
||||
imported_module=module,
|
||||
import_type=import_type,
|
||||
is_relative=is_relative,
|
||||
line_number=line_num,
|
||||
)
|
||||
)
|
||||
|
||||
# Match namespace use: use Namespace\Class;
|
||||
use_pattern = r'use\s+([A-Za-z_][\w\\]*)\s*(?:as\s+\w+)?\s*;'
|
||||
use_pattern = r"use\s+([A-Za-z_][\w\\]*)\s*(?:as\s+\w+)?\s*;"
|
||||
for match in re.finditer(use_pattern, content):
|
||||
namespace = match.group(1)
|
||||
line_num = content[:match.start()].count('\n') + 1
|
||||
line_num = content[: match.start()].count("\n") + 1
|
||||
|
||||
deps.append(DependencyInfo(
|
||||
source_file=file_path,
|
||||
imported_module=namespace,
|
||||
import_type='use',
|
||||
is_relative=False, # Namespaces are absolute
|
||||
line_number=line_num
|
||||
))
|
||||
deps.append(
|
||||
DependencyInfo(
|
||||
source_file=file_path,
|
||||
imported_module=namespace,
|
||||
import_type="use",
|
||||
is_relative=False, # Namespaces are absolute
|
||||
line_number=line_num,
|
||||
)
|
||||
)
|
||||
|
||||
return deps
|
||||
|
||||
@@ -566,12 +594,7 @@ class DependencyAnalyzer:
|
||||
|
||||
if target and target in self.file_nodes:
|
||||
# Add edge from source to dependency
|
||||
self.graph.add_edge(
|
||||
file_path,
|
||||
target,
|
||||
import_type=dep.import_type,
|
||||
line_number=dep.line_number
|
||||
)
|
||||
self.graph.add_edge(file_path, target, import_type=dep.import_type, line_number=dep.line_number)
|
||||
|
||||
# Update imported_by lists
|
||||
if target in self.file_nodes:
|
||||
@@ -579,7 +602,7 @@ class DependencyAnalyzer:
|
||||
|
||||
return self.graph
|
||||
|
||||
def _resolve_import(self, source_file: str, imported_module: str, is_relative: bool) -> Optional[str]:
|
||||
def _resolve_import(self, source_file: str, imported_module: str, is_relative: bool) -> str | None:
|
||||
"""
|
||||
Resolve import statement to actual file path.
|
||||
|
||||
@@ -609,7 +632,7 @@ class DependencyAnalyzer:
|
||||
|
||||
return None
|
||||
|
||||
def detect_cycles(self) -> List[List[str]]:
|
||||
def detect_cycles(self) -> list[list[str]]:
|
||||
"""
|
||||
Detect circular dependencies in the graph.
|
||||
|
||||
@@ -627,7 +650,7 @@ class DependencyAnalyzer:
|
||||
logger.error(f"Error detecting cycles: {e}")
|
||||
return []
|
||||
|
||||
def get_strongly_connected_components(self) -> List[Set[str]]:
|
||||
def get_strongly_connected_components(self) -> list[set[str]]:
|
||||
"""
|
||||
Get strongly connected components (groups of mutually dependent files).
|
||||
|
||||
@@ -645,13 +668,14 @@ class DependencyAnalyzer:
|
||||
"""
|
||||
try:
|
||||
from networkx.drawing.nx_pydot import write_dot
|
||||
|
||||
write_dot(self.graph, output_path)
|
||||
logger.info(f"Exported graph to DOT format: {output_path}")
|
||||
except ImportError:
|
||||
logger.warning("pydot not installed - cannot export to DOT format")
|
||||
logger.warning("Install with: pip install pydot")
|
||||
|
||||
def export_json(self) -> Dict[str, Any]:
|
||||
def export_json(self) -> dict[str, Any]:
|
||||
"""
|
||||
Export graph as JSON structure.
|
||||
|
||||
@@ -659,22 +683,19 @@ class DependencyAnalyzer:
|
||||
Dictionary with nodes and edges
|
||||
"""
|
||||
return {
|
||||
'nodes': [
|
||||
{
|
||||
'file': node,
|
||||
'language': data.get('language', 'Unknown')
|
||||
}
|
||||
"nodes": [
|
||||
{"file": node, "language": data.get("language", "Unknown")}
|
||||
for node, data in self.graph.nodes(data=True)
|
||||
],
|
||||
'edges': [
|
||||
"edges": [
|
||||
{
|
||||
'source': source,
|
||||
'target': target,
|
||||
'import_type': data.get('import_type', 'unknown'),
|
||||
'line_number': data.get('line_number', 0)
|
||||
"source": source,
|
||||
"target": target,
|
||||
"import_type": data.get("import_type", "unknown"),
|
||||
"line_number": data.get("line_number", 0),
|
||||
}
|
||||
for source, target, data in self.graph.edges(data=True)
|
||||
]
|
||||
],
|
||||
}
|
||||
|
||||
def export_mermaid(self) -> str:
|
||||
@@ -684,7 +705,7 @@ class DependencyAnalyzer:
|
||||
Returns:
|
||||
Mermaid diagram as string
|
||||
"""
|
||||
lines = ['graph TD']
|
||||
lines = ["graph TD"]
|
||||
|
||||
# Create node labels (shorten file paths for readability)
|
||||
node_ids = {}
|
||||
@@ -700,9 +721,9 @@ class DependencyAnalyzer:
|
||||
target_id = node_ids[target]
|
||||
lines.append(f" {source_id} --> {target_id}")
|
||||
|
||||
return '\n'.join(lines)
|
||||
return "\n".join(lines)
|
||||
|
||||
def get_statistics(self) -> Dict[str, Any]:
|
||||
def get_statistics(self) -> dict[str, Any]:
|
||||
"""
|
||||
Get graph statistics.
|
||||
|
||||
@@ -710,20 +731,15 @@ class DependencyAnalyzer:
|
||||
Dictionary with various statistics
|
||||
"""
|
||||
return {
|
||||
'total_files': self.graph.number_of_nodes(),
|
||||
'total_dependencies': self.graph.number_of_edges(),
|
||||
'circular_dependencies': len(self.detect_cycles()),
|
||||
'strongly_connected_components': len(self.get_strongly_connected_components()),
|
||||
'avg_dependencies_per_file': (
|
||||
self.graph.number_of_edges() / self.graph.number_of_nodes()
|
||||
if self.graph.number_of_nodes() > 0 else 0
|
||||
"total_files": self.graph.number_of_nodes(),
|
||||
"total_dependencies": self.graph.number_of_edges(),
|
||||
"circular_dependencies": len(self.detect_cycles()),
|
||||
"strongly_connected_components": len(self.get_strongly_connected_components()),
|
||||
"avg_dependencies_per_file": (
|
||||
self.graph.number_of_edges() / self.graph.number_of_nodes() if self.graph.number_of_nodes() > 0 else 0
|
||||
),
|
||||
'files_with_no_dependencies': len([
|
||||
node for node in self.graph.nodes()
|
||||
if self.graph.out_degree(node) == 0
|
||||
]),
|
||||
'files_not_imported': len([
|
||||
node for node in self.graph.nodes()
|
||||
if self.graph.in_degree(node) == 0
|
||||
]),
|
||||
"files_with_no_dependencies": len(
|
||||
[node for node in self.graph.nodes() if self.graph.out_degree(node) == 0]
|
||||
),
|
||||
"files_not_imported": len([node for node in self.graph.nodes() if self.graph.in_degree(node) == 0]),
|
||||
}
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -15,10 +15,9 @@ Usage:
|
||||
skill-seekers enhance output/react/ --target openai --api-key sk-proj-...
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
|
||||
# Add parent directory to path for imports when run as script
|
||||
@@ -42,9 +41,7 @@ class SkillEnhancer:
|
||||
self.skill_md_path = self.skill_dir / "SKILL.md"
|
||||
|
||||
# Get API key - support both ANTHROPIC_API_KEY and ANTHROPIC_AUTH_TOKEN
|
||||
self.api_key = (api_key or
|
||||
os.environ.get('ANTHROPIC_API_KEY') or
|
||||
os.environ.get('ANTHROPIC_AUTH_TOKEN'))
|
||||
self.api_key = api_key or os.environ.get("ANTHROPIC_API_KEY") or os.environ.get("ANTHROPIC_AUTH_TOKEN")
|
||||
if not self.api_key:
|
||||
raise ValueError(
|
||||
"No API key provided. Set ANTHROPIC_API_KEY or ANTHROPIC_AUTH_TOKEN "
|
||||
@@ -52,10 +49,10 @@ class SkillEnhancer:
|
||||
)
|
||||
|
||||
# Support custom base URL for alternative API endpoints
|
||||
base_url = os.environ.get('ANTHROPIC_BASE_URL')
|
||||
client_kwargs = {'api_key': self.api_key}
|
||||
base_url = os.environ.get("ANTHROPIC_BASE_URL")
|
||||
client_kwargs = {"api_key": self.api_key}
|
||||
if base_url:
|
||||
client_kwargs['base_url'] = base_url
|
||||
client_kwargs["base_url"] = base_url
|
||||
print(f"ℹ️ Using custom API base URL: {base_url}")
|
||||
|
||||
self.client = anthropic.Anthropic(**client_kwargs)
|
||||
@@ -64,7 +61,7 @@ class SkillEnhancer:
|
||||
"""Read existing SKILL.md"""
|
||||
if not self.skill_md_path.exists():
|
||||
return None
|
||||
return self.skill_md_path.read_text(encoding='utf-8')
|
||||
return self.skill_md_path.read_text(encoding="utf-8")
|
||||
|
||||
def enhance_skill_md(self, references, current_skill_md):
|
||||
"""Use Claude to enhance SKILL.md"""
|
||||
@@ -80,17 +77,14 @@ class SkillEnhancer:
|
||||
model="claude-sonnet-4-20250514",
|
||||
max_tokens=4096,
|
||||
temperature=0.3,
|
||||
messages=[{
|
||||
"role": "user",
|
||||
"content": prompt
|
||||
}]
|
||||
messages=[{"role": "user", "content": prompt}],
|
||||
)
|
||||
|
||||
# Handle response content - newer SDK versions may include ThinkingBlock
|
||||
# Find the TextBlock containing the actual response
|
||||
enhanced_content = None
|
||||
for block in message.content:
|
||||
if hasattr(block, 'text'):
|
||||
if hasattr(block, "text"):
|
||||
enhanced_content = block.text
|
||||
break
|
||||
|
||||
@@ -113,10 +107,10 @@ class SkillEnhancer:
|
||||
# Analyze sources
|
||||
sources_found = set()
|
||||
for metadata in references.values():
|
||||
sources_found.add(metadata['source'])
|
||||
sources_found.add(metadata["source"])
|
||||
|
||||
# Analyze conflicts if present
|
||||
has_conflicts = any('conflicts' in meta['path'] for meta in references.values())
|
||||
has_conflicts = any("conflicts" in meta["path"] for meta in references.values())
|
||||
|
||||
prompt = f"""You are enhancing a Claude skill's SKILL.md file. This skill is about: {skill_name}
|
||||
|
||||
@@ -124,14 +118,14 @@ I've scraped documentation from multiple sources and organized it into reference
|
||||
|
||||
SKILL OVERVIEW:
|
||||
- Name: {skill_name}
|
||||
- Source Types: {', '.join(sorted(sources_found))}
|
||||
- Multi-Source: {'Yes' if len(sources_found) > 1 else 'No'}
|
||||
- Conflicts Detected: {'Yes - see conflicts.md in references' if has_conflicts else 'No'}
|
||||
- Source Types: {", ".join(sorted(sources_found))}
|
||||
- Multi-Source: {"Yes" if len(sources_found) > 1 else "No"}
|
||||
- Conflicts Detected: {"Yes - see conflicts.md in references" if has_conflicts else "No"}
|
||||
|
||||
CURRENT SKILL.MD:
|
||||
{'```markdown' if current_skill_md else '(none - create from scratch)'}
|
||||
{current_skill_md or 'No existing SKILL.md'}
|
||||
{'```' if current_skill_md else ''}
|
||||
{"```markdown" if current_skill_md else "(none - create from scratch)"}
|
||||
{current_skill_md or "No existing SKILL.md"}
|
||||
{"```" if current_skill_md else ""}
|
||||
|
||||
SOURCE ANALYSIS:
|
||||
This skill combines knowledge from {len(sources_found)} source type(s):
|
||||
@@ -141,8 +135,8 @@ This skill combines knowledge from {len(sources_found)} source type(s):
|
||||
# Group references by (source_type, repo_id) for multi-source support
|
||||
by_source = {}
|
||||
for filename, metadata in references.items():
|
||||
source = metadata['source']
|
||||
repo_id = metadata.get('repo_id') # None for single-source
|
||||
source = metadata["source"]
|
||||
repo_id = metadata.get("repo_id") # None for single-source
|
||||
key = (source, repo_id) if repo_id else (source, None)
|
||||
|
||||
if key not in by_source:
|
||||
@@ -150,7 +144,7 @@ This skill combines knowledge from {len(sources_found)} source type(s):
|
||||
by_source[key].append((filename, metadata))
|
||||
|
||||
# Add source breakdown with repo identity
|
||||
for (source, repo_id) in sorted(by_source.keys()):
|
||||
for source, repo_id in sorted(by_source.keys()):
|
||||
files = by_source[(source, repo_id)]
|
||||
if repo_id:
|
||||
prompt += f"\n**{source.upper()} - {repo_id} ({len(files)} file(s))**\n"
|
||||
@@ -164,14 +158,14 @@ This skill combines knowledge from {len(sources_found)} source type(s):
|
||||
prompt += "\n\nREFERENCE DOCUMENTATION:\n"
|
||||
|
||||
# Add references grouped by (source, repo_id) with metadata
|
||||
for (source, repo_id) in sorted(by_source.keys()):
|
||||
for source, repo_id in sorted(by_source.keys()):
|
||||
if repo_id:
|
||||
prompt += f"\n### {source.upper()} SOURCES - {repo_id}\n\n"
|
||||
else:
|
||||
prompt += f"\n### {source.upper()} SOURCES\n\n"
|
||||
|
||||
for filename, metadata in by_source[(source, repo_id)]:
|
||||
content = metadata['content']
|
||||
content = metadata["content"]
|
||||
# Limit per-file to 30K
|
||||
if len(content) > 30000:
|
||||
content = content[:30000] + "\n\n[Content truncated for size...]"
|
||||
@@ -197,12 +191,12 @@ MULTI-REPOSITORY HANDLING:
|
||||
# Detect multiple repos from same source type
|
||||
repo_ids = set()
|
||||
for metadata in references.values():
|
||||
if metadata.get('repo_id'):
|
||||
repo_ids.add(metadata['repo_id'])
|
||||
if metadata.get("repo_id"):
|
||||
repo_ids.add(metadata["repo_id"])
|
||||
|
||||
if len(repo_ids) > 1:
|
||||
prompt += f"""
|
||||
⚠️ MULTIPLE REPOSITORIES DETECTED: {', '.join(sorted(repo_ids))}
|
||||
⚠️ MULTIPLE REPOSITORIES DETECTED: {", ".join(sorted(repo_ids))}
|
||||
|
||||
This skill combines codebase analysis from {len(repo_ids)} different repositories.
|
||||
Each repo has its own ARCHITECTURE.md, patterns, examples, and configuration.
|
||||
@@ -285,27 +279,23 @@ Return ONLY the complete SKILL.md content, starting with the frontmatter (---).
|
||||
"""Save the enhanced SKILL.md"""
|
||||
# Backup original
|
||||
if self.skill_md_path.exists():
|
||||
backup_path = self.skill_md_path.with_suffix('.md.backup')
|
||||
backup_path = self.skill_md_path.with_suffix(".md.backup")
|
||||
self.skill_md_path.rename(backup_path)
|
||||
print(f" 💾 Backed up original to: {backup_path.name}")
|
||||
|
||||
# Save enhanced version
|
||||
self.skill_md_path.write_text(content, encoding='utf-8')
|
||||
print(f" ✅ Saved enhanced SKILL.md")
|
||||
self.skill_md_path.write_text(content, encoding="utf-8")
|
||||
print(" ✅ Saved enhanced SKILL.md")
|
||||
|
||||
def run(self):
|
||||
"""Main enhancement workflow"""
|
||||
print(f"\n{'='*60}")
|
||||
print(f"\n{'=' * 60}")
|
||||
print(f"ENHANCING SKILL: {self.skill_dir.name}")
|
||||
print(f"{'='*60}\n")
|
||||
print(f"{'=' * 60}\n")
|
||||
|
||||
# Read reference files
|
||||
print("📖 Reading reference documentation...")
|
||||
references = read_reference_files(
|
||||
self.skill_dir,
|
||||
max_chars=API_CONTENT_LIMIT,
|
||||
preview_limit=API_PREVIEW_LIMIT
|
||||
)
|
||||
references = read_reference_files(self.skill_dir, max_chars=API_CONTENT_LIMIT, preview_limit=API_PREVIEW_LIMIT)
|
||||
|
||||
if not references:
|
||||
print("❌ No reference files found to analyze")
|
||||
@@ -314,11 +304,11 @@ Return ONLY the complete SKILL.md content, starting with the frontmatter (---).
|
||||
# Analyze sources
|
||||
sources_found = set()
|
||||
for metadata in references.values():
|
||||
sources_found.add(metadata['source'])
|
||||
sources_found.add(metadata["source"])
|
||||
|
||||
print(f" ✓ Read {len(references)} reference files")
|
||||
print(f" ✓ Sources: {', '.join(sorted(sources_found))}")
|
||||
total_size = sum(meta['size'] for meta in references.values())
|
||||
total_size = sum(meta["size"] for meta in references.values())
|
||||
print(f" ✓ Total size: {total_size:,} characters\n")
|
||||
|
||||
# Read current SKILL.md
|
||||
@@ -326,7 +316,7 @@ Return ONLY the complete SKILL.md content, starting with the frontmatter (---).
|
||||
if current_skill_md:
|
||||
print(f" ℹ Found existing SKILL.md ({len(current_skill_md)} chars)")
|
||||
else:
|
||||
print(f" ℹ No existing SKILL.md, will create new one")
|
||||
print(" ℹ No existing SKILL.md, will create new one")
|
||||
|
||||
# Enhance with Claude
|
||||
enhanced = self.enhance_skill_md(references, current_skill_md)
|
||||
@@ -341,11 +331,11 @@ Return ONLY the complete SKILL.md content, starting with the frontmatter (---).
|
||||
print("💾 Saving enhanced SKILL.md...")
|
||||
self.save_enhanced_skill_md(enhanced)
|
||||
|
||||
print(f"\n✅ Enhancement complete!")
|
||||
print(f"\nNext steps:")
|
||||
print("\n✅ Enhancement complete!")
|
||||
print("\nNext steps:")
|
||||
print(f" 1. Review: {self.skill_md_path}")
|
||||
print(f" 2. If you don't like it, restore backup: {self.skill_md_path.with_suffix('.md.backup')}")
|
||||
print(f" 3. Package your skill:")
|
||||
print(" 3. Package your skill:")
|
||||
print(f" skill-seekers package {self.skill_dir}/")
|
||||
|
||||
return True
|
||||
@@ -353,7 +343,7 @@ Return ONLY the complete SKILL.md content, starting with the frontmatter (---).
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(
|
||||
description='Enhance SKILL.md using platform AI APIs',
|
||||
description="Enhance SKILL.md using platform AI APIs",
|
||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||
epilog="""
|
||||
Examples:
|
||||
@@ -374,19 +364,18 @@ Examples:
|
||||
|
||||
# Dry run
|
||||
skill-seekers enhance output/godot/ --dry-run
|
||||
"""
|
||||
""",
|
||||
)
|
||||
|
||||
parser.add_argument('skill_dir', type=str,
|
||||
help='Path to skill directory (e.g., output/steam-inventory/)')
|
||||
parser.add_argument('--api-key', type=str,
|
||||
help='Platform API key (or set environment variable)')
|
||||
parser.add_argument('--target',
|
||||
choices=['claude', 'gemini', 'openai'],
|
||||
default='claude',
|
||||
help='Target LLM platform (default: claude)')
|
||||
parser.add_argument('--dry-run', action='store_true',
|
||||
help='Show what would be done without calling API')
|
||||
parser.add_argument("skill_dir", type=str, help="Path to skill directory (e.g., output/steam-inventory/)")
|
||||
parser.add_argument("--api-key", type=str, help="Platform API key (or set environment variable)")
|
||||
parser.add_argument(
|
||||
"--target",
|
||||
choices=["claude", "gemini", "openai"],
|
||||
default="claude",
|
||||
help="Target LLM platform (default: claude)",
|
||||
)
|
||||
parser.add_argument("--dry-run", action="store_true", help="Show what would be done without calling API")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
@@ -402,7 +391,7 @@ Examples:
|
||||
|
||||
# Dry run mode
|
||||
if args.dry_run:
|
||||
print(f"🔍 DRY RUN MODE")
|
||||
print("🔍 DRY RUN MODE")
|
||||
print(f" Would enhance: {skill_dir}")
|
||||
print(f" References: {skill_dir / 'references'}")
|
||||
print(f" SKILL.md: {skill_dir / 'SKILL.md'}")
|
||||
@@ -427,7 +416,7 @@ Examples:
|
||||
|
||||
if not adaptor.supports_enhancement():
|
||||
print(f"❌ Error: {adaptor.PLATFORM_NAME} does not support AI enhancement")
|
||||
print(f"\nSupported platforms for enhancement:")
|
||||
print("\nSupported platforms for enhancement:")
|
||||
print(" - Claude AI (Anthropic)")
|
||||
print(" - Google Gemini")
|
||||
print(" - OpenAI ChatGPT")
|
||||
@@ -436,7 +425,7 @@ Examples:
|
||||
# Get API key
|
||||
api_key = args.api_key
|
||||
if not api_key:
|
||||
api_key = os.environ.get(adaptor.get_env_var_name(), '').strip()
|
||||
api_key = os.environ.get(adaptor.get_env_var_name(), "").strip()
|
||||
|
||||
if not api_key:
|
||||
print(f"❌ Error: {adaptor.get_env_var_name()} not set")
|
||||
@@ -447,19 +436,19 @@ Examples:
|
||||
sys.exit(1)
|
||||
|
||||
# Run enhancement using adaptor
|
||||
print(f"\n{'='*60}")
|
||||
print(f"\n{'=' * 60}")
|
||||
print(f"ENHANCING SKILL: {skill_dir}")
|
||||
print(f"Platform: {adaptor.PLATFORM_NAME}")
|
||||
print(f"{'='*60}\n")
|
||||
print(f"{'=' * 60}\n")
|
||||
|
||||
success = adaptor.enhance(Path(skill_dir), api_key)
|
||||
|
||||
if success:
|
||||
print(f"\n✅ Enhancement complete!")
|
||||
print(f"\nNext steps:")
|
||||
print("\n✅ Enhancement complete!")
|
||||
print("\nNext steps:")
|
||||
print(f" 1. Review: {Path(skill_dir) / 'SKILL.md'}")
|
||||
print(f" 2. If you don't like it, restore backup: {Path(skill_dir) / 'SKILL.md.backup'}")
|
||||
print(f" 3. Package your skill:")
|
||||
print(" 3. Package your skill:")
|
||||
print(f" skill-seekers package {skill_dir}/ --target {args.target}")
|
||||
|
||||
sys.exit(0 if success else 1)
|
||||
@@ -474,6 +463,7 @@ Examples:
|
||||
except Exception as e:
|
||||
print(f"❌ Unexpected error: {e}")
|
||||
import traceback
|
||||
|
||||
traceback.print_exc()
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
@@ -36,15 +36,15 @@ Terminal Selection:
|
||||
Supported terminals: Ghostty, iTerm, Terminal, WezTerm
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import subprocess
|
||||
import tempfile
|
||||
import json
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
import threading
|
||||
from pathlib import Path
|
||||
import time
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
# Add parent directory to path for imports when run as script
|
||||
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
@@ -77,29 +77,29 @@ def detect_terminal_app():
|
||||
"""
|
||||
# Map TERM_PROGRAM values to macOS app names
|
||||
TERMINAL_MAP = {
|
||||
'Apple_Terminal': 'Terminal',
|
||||
'iTerm.app': 'iTerm',
|
||||
'ghostty': 'Ghostty',
|
||||
'WezTerm': 'WezTerm',
|
||||
"Apple_Terminal": "Terminal",
|
||||
"iTerm.app": "iTerm",
|
||||
"ghostty": "Ghostty",
|
||||
"WezTerm": "WezTerm",
|
||||
}
|
||||
|
||||
# Priority 1: Check SKILL_SEEKER_TERMINAL env var (explicit preference)
|
||||
preferred_terminal = os.environ.get('SKILL_SEEKER_TERMINAL', '').strip()
|
||||
preferred_terminal = os.environ.get("SKILL_SEEKER_TERMINAL", "").strip()
|
||||
if preferred_terminal:
|
||||
return preferred_terminal, 'SKILL_SEEKER_TERMINAL'
|
||||
return preferred_terminal, "SKILL_SEEKER_TERMINAL"
|
||||
|
||||
# Priority 2: Check TERM_PROGRAM (inherit current terminal)
|
||||
term_program = os.environ.get('TERM_PROGRAM', '').strip()
|
||||
term_program = os.environ.get("TERM_PROGRAM", "").strip()
|
||||
if term_program and term_program in TERMINAL_MAP:
|
||||
return TERMINAL_MAP[term_program], 'TERM_PROGRAM'
|
||||
return TERMINAL_MAP[term_program], "TERM_PROGRAM"
|
||||
|
||||
# Priority 3: Fallback to Terminal.app
|
||||
if term_program:
|
||||
# TERM_PROGRAM is set but unknown
|
||||
return 'Terminal', f'unknown TERM_PROGRAM ({term_program})'
|
||||
return "Terminal", f"unknown TERM_PROGRAM ({term_program})"
|
||||
else:
|
||||
# No TERM_PROGRAM set
|
||||
return 'Terminal', 'default'
|
||||
return "Terminal", "default"
|
||||
|
||||
|
||||
class LocalSkillEnhancer:
|
||||
@@ -132,7 +132,7 @@ class LocalSkillEnhancer:
|
||||
Returns:
|
||||
Summarized content
|
||||
"""
|
||||
lines = content.split('\n')
|
||||
lines = content.split("\n")
|
||||
target_lines = int(len(lines) * target_ratio)
|
||||
|
||||
# Priority 1: Keep introduction (first 20%)
|
||||
@@ -146,7 +146,7 @@ class LocalSkillEnhancer:
|
||||
block_start_idx = 0
|
||||
|
||||
for i, line in enumerate(lines[intro_lines:], start=intro_lines):
|
||||
if line.strip().startswith('```'):
|
||||
if line.strip().startswith("```"):
|
||||
if in_code_block:
|
||||
# End of code block - add closing ``` and save
|
||||
current_block.append(line)
|
||||
@@ -174,9 +174,9 @@ class LocalSkillEnhancer:
|
||||
headings_added = 0
|
||||
while i < len(lines) and headings_added < 10:
|
||||
line = lines[i]
|
||||
if line.startswith('#'):
|
||||
if line.startswith("#"):
|
||||
# Found heading - keep it and next 3 lines
|
||||
chunk = lines[i:min(i+4, len(lines))]
|
||||
chunk = lines[i : min(i + 4, len(lines))]
|
||||
result.extend(chunk)
|
||||
headings_added += 1
|
||||
i += 4
|
||||
@@ -185,7 +185,7 @@ class LocalSkillEnhancer:
|
||||
|
||||
result.append("\n\n[Content intelligently summarized - full details in reference files]")
|
||||
|
||||
return '\n'.join(result)
|
||||
return "\n".join(result)
|
||||
|
||||
def create_enhancement_prompt(self, use_summarization=False, summarization_ratio=0.3):
|
||||
"""Create the prompt file for Claude Code
|
||||
@@ -197,9 +197,7 @@ class LocalSkillEnhancer:
|
||||
|
||||
# Read reference files (with enriched metadata)
|
||||
references = read_reference_files(
|
||||
self.skill_dir,
|
||||
max_chars=LOCAL_CONTENT_LIMIT,
|
||||
preview_limit=LOCAL_PREVIEW_LIMIT
|
||||
self.skill_dir, max_chars=LOCAL_CONTENT_LIMIT, preview_limit=LOCAL_PREVIEW_LIMIT
|
||||
)
|
||||
|
||||
if not references:
|
||||
@@ -209,52 +207,54 @@ class LocalSkillEnhancer:
|
||||
# Analyze sources
|
||||
sources_found = set()
|
||||
for metadata in references.values():
|
||||
sources_found.add(metadata['source'])
|
||||
sources_found.add(metadata["source"])
|
||||
|
||||
# Calculate total size
|
||||
total_ref_size = sum(meta['size'] for meta in references.values())
|
||||
total_ref_size = sum(meta["size"] for meta in references.values())
|
||||
|
||||
# Apply summarization if requested or if content is too large
|
||||
if use_summarization or total_ref_size > 30000:
|
||||
if not use_summarization:
|
||||
print(f" ⚠️ Large skill detected ({total_ref_size:,} chars)")
|
||||
print(f" 📊 Applying smart summarization (target: {int(summarization_ratio*100)}% of original)")
|
||||
print(f" 📊 Applying smart summarization (target: {int(summarization_ratio * 100)}% of original)")
|
||||
print()
|
||||
|
||||
# Summarize each reference
|
||||
for filename, metadata in references.items():
|
||||
summarized = self.summarize_reference(metadata['content'], summarization_ratio)
|
||||
metadata['content'] = summarized
|
||||
metadata['size'] = len(summarized)
|
||||
summarized = self.summarize_reference(metadata["content"], summarization_ratio)
|
||||
metadata["content"] = summarized
|
||||
metadata["size"] = len(summarized)
|
||||
|
||||
new_size = sum(meta['size'] for meta in references.values())
|
||||
print(f" ✓ Reduced from {total_ref_size:,} to {new_size:,} chars ({int(new_size/total_ref_size*100)}%)")
|
||||
new_size = sum(meta["size"] for meta in references.values())
|
||||
print(
|
||||
f" ✓ Reduced from {total_ref_size:,} to {new_size:,} chars ({int(new_size / total_ref_size * 100)}%)"
|
||||
)
|
||||
print()
|
||||
|
||||
# Read current SKILL.md
|
||||
current_skill_md = ""
|
||||
if self.skill_md_path.exists():
|
||||
current_skill_md = self.skill_md_path.read_text(encoding='utf-8')
|
||||
current_skill_md = self.skill_md_path.read_text(encoding="utf-8")
|
||||
|
||||
# Analyze conflicts if present
|
||||
has_conflicts = any('conflicts' in meta['path'] for meta in references.values())
|
||||
has_conflicts = any("conflicts" in meta["path"] for meta in references.values())
|
||||
|
||||
# Build prompt with multi-source awareness
|
||||
prompt = f"""I need you to enhance the SKILL.md file for the {self.skill_dir.name} skill.
|
||||
|
||||
SKILL OVERVIEW:
|
||||
- Name: {self.skill_dir.name}
|
||||
- Source Types: {', '.join(sorted(sources_found))}
|
||||
- Multi-Source: {'Yes' if len(sources_found) > 1 else 'No'}
|
||||
- Conflicts Detected: {'Yes - see conflicts.md in references' if has_conflicts else 'No'}
|
||||
- Source Types: {", ".join(sorted(sources_found))}
|
||||
- Multi-Source: {"Yes" if len(sources_found) > 1 else "No"}
|
||||
- Conflicts Detected: {"Yes - see conflicts.md in references" if has_conflicts else "No"}
|
||||
|
||||
CURRENT SKILL.MD:
|
||||
{'-'*60}
|
||||
{current_skill_md if current_skill_md else '(No existing SKILL.md - create from scratch)'}
|
||||
{'-'*60}
|
||||
{"-" * 60}
|
||||
{current_skill_md if current_skill_md else "(No existing SKILL.md - create from scratch)"}
|
||||
{"-" * 60}
|
||||
|
||||
SOURCE ANALYSIS:
|
||||
{'-'*60}
|
||||
{"-" * 60}
|
||||
This skill combines knowledge from {len(sources_found)} source type(s):
|
||||
|
||||
"""
|
||||
@@ -262,8 +262,8 @@ This skill combines knowledge from {len(sources_found)} source type(s):
|
||||
# Group references by (source_type, repo_id) for multi-source support
|
||||
by_source = {}
|
||||
for filename, metadata in references.items():
|
||||
source = metadata['source']
|
||||
repo_id = metadata.get('repo_id') # None for single-source
|
||||
source = metadata["source"]
|
||||
repo_id = metadata.get("repo_id") # None for single-source
|
||||
key = (source, repo_id) if repo_id else (source, None)
|
||||
|
||||
if key not in by_source:
|
||||
@@ -271,7 +271,7 @@ This skill combines knowledge from {len(sources_found)} source type(s):
|
||||
by_source[key].append((filename, metadata))
|
||||
|
||||
# Add source breakdown with repo identity
|
||||
for (source, repo_id) in sorted(by_source.keys()):
|
||||
for source, repo_id in sorted(by_source.keys()):
|
||||
files = by_source[(source, repo_id)]
|
||||
if repo_id:
|
||||
prompt += f"\n**{source.upper()} - {repo_id} ({len(files)} file(s))**\n"
|
||||
@@ -283,14 +283,14 @@ This skill combines knowledge from {len(sources_found)} source type(s):
|
||||
prompt += f"- ... and {len(files) - 5} more\n"
|
||||
|
||||
prompt += f"""
|
||||
{'-'*60}
|
||||
{"-" * 60}
|
||||
|
||||
REFERENCE DOCUMENTATION:
|
||||
{'-'*60}
|
||||
{"-" * 60}
|
||||
"""
|
||||
|
||||
# Add references grouped by (source, repo_id) with metadata
|
||||
for (source, repo_id) in sorted(by_source.keys()):
|
||||
for source, repo_id in sorted(by_source.keys()):
|
||||
if repo_id:
|
||||
prompt += f"\n### {source.upper()} SOURCES - {repo_id}\n\n"
|
||||
else:
|
||||
@@ -298,7 +298,7 @@ REFERENCE DOCUMENTATION:
|
||||
|
||||
for filename, metadata in by_source[(source, repo_id)]:
|
||||
# Further limit per-file to 12K to be safe
|
||||
content = metadata['content']
|
||||
content = metadata["content"]
|
||||
max_per_file = 12000
|
||||
if len(content) > max_per_file:
|
||||
content = content[:max_per_file] + "\n\n[Content truncated for size...]"
|
||||
@@ -311,7 +311,7 @@ REFERENCE DOCUMENTATION:
|
||||
prompt += f"{content}\n"
|
||||
|
||||
prompt += f"""
|
||||
{'-'*60}
|
||||
{"-" * 60}
|
||||
|
||||
REFERENCE PRIORITY (when sources differ):
|
||||
1. **Code patterns (codebase_analysis)**: Ground truth - what the code actually does
|
||||
@@ -325,12 +325,12 @@ MULTI-REPOSITORY HANDLING:
|
||||
# Detect multiple repos from same source type
|
||||
repo_ids = set()
|
||||
for metadata in references.values():
|
||||
if metadata.get('repo_id'):
|
||||
repo_ids.add(metadata['repo_id'])
|
||||
if metadata.get("repo_id"):
|
||||
repo_ids.add(metadata["repo_id"])
|
||||
|
||||
if len(repo_ids) > 1:
|
||||
prompt += f"""
|
||||
⚠️ MULTIPLE REPOSITORIES DETECTED: {', '.join(sorted(repo_ids))}
|
||||
⚠️ MULTIPLE REPOSITORIES DETECTED: {", ".join(sorted(repo_ids))}
|
||||
|
||||
This skill combines codebase analysis from {len(repo_ids)} different repositories.
|
||||
Each repo has its own ARCHITECTURE.md, patterns, examples, and configuration.
|
||||
@@ -435,10 +435,10 @@ After writing, the file SKILL.md should:
|
||||
"progress": progress,
|
||||
"timestamp": datetime.now().isoformat(),
|
||||
"skill_dir": str(self.skill_dir),
|
||||
"error": error
|
||||
"error": error,
|
||||
}
|
||||
|
||||
self.status_file.write_text(json.dumps(status_data, indent=2), encoding='utf-8')
|
||||
self.status_file.write_text(json.dumps(status_data, indent=2), encoding="utf-8")
|
||||
|
||||
def read_status(self):
|
||||
"""Read enhancement status from file.
|
||||
@@ -450,7 +450,7 @@ After writing, the file SKILL.md should:
|
||||
return None
|
||||
|
||||
try:
|
||||
return json.loads(self.status_file.read_text(encoding='utf-8'))
|
||||
return json.loads(self.status_file.read_text(encoding="utf-8"))
|
||||
except:
|
||||
return None
|
||||
|
||||
@@ -482,9 +482,9 @@ After writing, the file SKILL.md should:
|
||||
# Daemon mode: Run as persistent process with monitoring
|
||||
if daemon:
|
||||
return self._run_daemon(timeout)
|
||||
print(f"\n{'='*60}")
|
||||
print(f"\n{'=' * 60}")
|
||||
print(f"LOCAL ENHANCEMENT: {self.skill_dir.name}")
|
||||
print(f"{'='*60}\n")
|
||||
print(f"{'=' * 60}\n")
|
||||
|
||||
# Validate
|
||||
if not self.skill_dir.exists():
|
||||
@@ -494,9 +494,7 @@ After writing, the file SKILL.md should:
|
||||
# Read reference files
|
||||
print("📖 Reading reference documentation...")
|
||||
references = read_reference_files(
|
||||
self.skill_dir,
|
||||
max_chars=LOCAL_CONTENT_LIMIT,
|
||||
preview_limit=LOCAL_PREVIEW_LIMIT
|
||||
self.skill_dir, max_chars=LOCAL_CONTENT_LIMIT, preview_limit=LOCAL_PREVIEW_LIMIT
|
||||
)
|
||||
|
||||
if not references:
|
||||
@@ -504,7 +502,7 @@ After writing, the file SKILL.md should:
|
||||
return False
|
||||
|
||||
print(f" ✓ Read {len(references)} reference files")
|
||||
total_size = sum(ref['size'] for ref in references.values())
|
||||
total_size = sum(ref["size"] for ref in references.values())
|
||||
print(f" ✓ Total size: {total_size:,} characters\n")
|
||||
|
||||
# Check if we need smart summarization
|
||||
@@ -513,7 +511,7 @@ After writing, the file SKILL.md should:
|
||||
if use_summarization:
|
||||
print("⚠️ LARGE SKILL DETECTED")
|
||||
print(f" 📊 Reference content: {total_size:,} characters")
|
||||
print(f" 💡 Claude CLI limit: ~30,000-40,000 characters")
|
||||
print(" 💡 Claude CLI limit: ~30,000-40,000 characters")
|
||||
print()
|
||||
print(" 🔧 Applying smart summarization to ensure success...")
|
||||
print(" • Keeping introductions and overviews")
|
||||
@@ -530,13 +528,13 @@ After writing, the file SKILL.md should:
|
||||
return False
|
||||
|
||||
# Save prompt to temp file
|
||||
with tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False, encoding='utf-8') as f:
|
||||
with tempfile.NamedTemporaryFile(mode="w", suffix=".txt", delete=False, encoding="utf-8") as f:
|
||||
prompt_file = f.name
|
||||
f.write(prompt)
|
||||
|
||||
if use_summarization:
|
||||
print(f" ✓ Prompt created and optimized ({len(prompt):,} characters)")
|
||||
print(f" ✓ Ready for Claude CLI (within safe limits)")
|
||||
print(" ✓ Ready for Claude CLI (within safe limits)")
|
||||
print()
|
||||
else:
|
||||
print(f" ✓ Prompt saved ({len(prompt):,} characters)\n")
|
||||
@@ -555,49 +553,49 @@ After writing, the file SKILL.md should:
|
||||
print()
|
||||
|
||||
# Create a shell script to run in the terminal
|
||||
shell_script = f'''#!/bin/bash
|
||||
shell_script = f"""#!/bin/bash
|
||||
claude {prompt_file}
|
||||
echo ""
|
||||
echo "✅ Enhancement complete!"
|
||||
echo "Press any key to close..."
|
||||
read -n 1
|
||||
rm {prompt_file}
|
||||
'''
|
||||
"""
|
||||
|
||||
# Save shell script
|
||||
with tempfile.NamedTemporaryFile(mode='w', suffix='.sh', delete=False) as f:
|
||||
with tempfile.NamedTemporaryFile(mode="w", suffix=".sh", delete=False) as f:
|
||||
script_file = f.name
|
||||
f.write(shell_script)
|
||||
|
||||
os.chmod(script_file, 0o755)
|
||||
|
||||
# Launch in new terminal (macOS specific)
|
||||
if sys.platform == 'darwin':
|
||||
if sys.platform == "darwin":
|
||||
# Detect which terminal app to use
|
||||
terminal_app, detection_method = detect_terminal_app()
|
||||
|
||||
# Show detection info
|
||||
if detection_method == 'SKILL_SEEKER_TERMINAL':
|
||||
if detection_method == "SKILL_SEEKER_TERMINAL":
|
||||
print(f" Using terminal: {terminal_app} (from SKILL_SEEKER_TERMINAL)")
|
||||
elif detection_method == 'TERM_PROGRAM':
|
||||
elif detection_method == "TERM_PROGRAM":
|
||||
print(f" Using terminal: {terminal_app} (inherited from current terminal)")
|
||||
elif detection_method.startswith('unknown TERM_PROGRAM'):
|
||||
elif detection_method.startswith("unknown TERM_PROGRAM"):
|
||||
print(f"⚠️ {detection_method}")
|
||||
print(f" → Using Terminal.app as fallback")
|
||||
print(" → Using Terminal.app as fallback")
|
||||
else:
|
||||
print(f" Using terminal: {terminal_app} (default)")
|
||||
|
||||
try:
|
||||
subprocess.Popen(['open', '-a', terminal_app, script_file])
|
||||
subprocess.Popen(["open", "-a", terminal_app, script_file])
|
||||
except Exception as e:
|
||||
print(f"⚠️ Error launching {terminal_app}: {e}")
|
||||
print(f"\nManually run: {script_file}")
|
||||
return False
|
||||
else:
|
||||
print("⚠️ Auto-launch only works on macOS")
|
||||
print(f"\nManually run this command in a new terminal:")
|
||||
print("\nManually run this command in a new terminal:")
|
||||
print(f" claude '{prompt_file}'")
|
||||
print(f"\nThen delete the prompt file:")
|
||||
print("\nThen delete the prompt file:")
|
||||
print(f" rm '{prompt_file}'")
|
||||
return False
|
||||
|
||||
@@ -614,7 +612,9 @@ rm {prompt_file}
|
||||
print()
|
||||
print("💡 When done:")
|
||||
print(f" 1. Check the enhanced SKILL.md: {self.skill_md_path}")
|
||||
print(f" 2. If you don't like it, restore: mv {self.skill_md_path.with_suffix('.md.backup')} {self.skill_md_path}")
|
||||
print(
|
||||
f" 2. If you don't like it, restore: mv {self.skill_md_path.with_suffix('.md.backup')} {self.skill_md_path}"
|
||||
)
|
||||
print(f" 3. Package: skill-seekers package {self.skill_dir}/")
|
||||
|
||||
return True
|
||||
@@ -630,10 +630,9 @@ rm {prompt_file}
|
||||
bool: True if enhancement succeeded
|
||||
"""
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
print("✨ Running Claude Code enhancement (headless mode)...")
|
||||
print(f" Timeout: {timeout} seconds ({timeout//60} minutes)")
|
||||
print(f" Timeout: {timeout} seconds ({timeout // 60} minutes)")
|
||||
print()
|
||||
|
||||
# Record initial state
|
||||
@@ -652,11 +651,11 @@ rm {prompt_file}
|
||||
print()
|
||||
|
||||
result = subprocess.run(
|
||||
['claude', '--dangerously-skip-permissions', prompt_file],
|
||||
["claude", "--dangerously-skip-permissions", prompt_file],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=timeout,
|
||||
cwd=str(self.skill_dir) # Run from skill directory
|
||||
cwd=str(self.skill_dir), # Run from skill directory
|
||||
)
|
||||
|
||||
elapsed = time.time() - start_time
|
||||
@@ -681,21 +680,21 @@ rm {prompt_file}
|
||||
|
||||
return True
|
||||
else:
|
||||
print(f"⚠️ Claude finished but SKILL.md was not updated")
|
||||
print("⚠️ Claude finished but SKILL.md was not updated")
|
||||
print(f" Initial: mtime={initial_mtime}, size={initial_size}")
|
||||
print(f" Final: mtime={new_mtime}, size={new_size}")
|
||||
print(f" This might indicate an error during enhancement")
|
||||
print(" This might indicate an error during enhancement")
|
||||
print()
|
||||
# Show last 20 lines of stdout for debugging
|
||||
if result.stdout:
|
||||
print(" Last output from Claude:")
|
||||
lines = result.stdout.strip().split('\n')[-20:]
|
||||
lines = result.stdout.strip().split("\n")[-20:]
|
||||
for line in lines:
|
||||
print(f" | {line}")
|
||||
print()
|
||||
return False
|
||||
else:
|
||||
print(f"❌ SKILL.md not found after enhancement")
|
||||
print("❌ SKILL.md not found after enhancement")
|
||||
return False
|
||||
else:
|
||||
print(f"❌ Claude Code returned error (exit code: {result.returncode})")
|
||||
@@ -750,9 +749,9 @@ rm {prompt_file}
|
||||
Returns:
|
||||
bool: True if background task started successfully
|
||||
"""
|
||||
print(f"\n{'='*60}")
|
||||
print(f"\n{'=' * 60}")
|
||||
print(f"BACKGROUND ENHANCEMENT: {self.skill_dir.name}")
|
||||
print(f"{'='*60}\n")
|
||||
print(f"{'=' * 60}\n")
|
||||
|
||||
# Write initial status
|
||||
self.write_status("pending", "Starting background enhancement...")
|
||||
@@ -764,9 +763,7 @@ rm {prompt_file}
|
||||
|
||||
# Read reference files
|
||||
references = read_reference_files(
|
||||
self.skill_dir,
|
||||
max_chars=LOCAL_CONTENT_LIMIT,
|
||||
preview_limit=LOCAL_PREVIEW_LIMIT
|
||||
self.skill_dir, max_chars=LOCAL_CONTENT_LIMIT, preview_limit=LOCAL_PREVIEW_LIMIT
|
||||
)
|
||||
|
||||
if not references:
|
||||
@@ -785,7 +782,7 @@ rm {prompt_file}
|
||||
return
|
||||
|
||||
# Save prompt to temp file
|
||||
with tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False, encoding='utf-8') as f:
|
||||
with tempfile.NamedTemporaryFile(mode="w", suffix=".txt", delete=False, encoding="utf-8") as f:
|
||||
prompt_file = f.name
|
||||
f.write(prompt)
|
||||
|
||||
@@ -794,12 +791,7 @@ rm {prompt_file}
|
||||
# Run enhancement
|
||||
if headless:
|
||||
# Run headless (subprocess.run - blocking in thread)
|
||||
result = subprocess.run(
|
||||
['claude', prompt_file],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=timeout
|
||||
)
|
||||
result = subprocess.run(["claude", prompt_file], capture_output=True, text=True, timeout=timeout)
|
||||
|
||||
# Clean up
|
||||
try:
|
||||
@@ -848,9 +840,9 @@ rm {prompt_file}
|
||||
Returns:
|
||||
bool: True if daemon started successfully
|
||||
"""
|
||||
print(f"\n{'='*60}")
|
||||
print(f"\n{'=' * 60}")
|
||||
print(f"DAEMON MODE: {self.skill_dir.name}")
|
||||
print(f"{'='*60}\n")
|
||||
print(f"{'=' * 60}\n")
|
||||
|
||||
# Write initial status
|
||||
self.write_status("pending", "Starting daemon process...")
|
||||
@@ -939,7 +931,7 @@ except Exception as e:
|
||||
|
||||
# Save daemon script
|
||||
daemon_script_path = self.skill_dir / ".enhancement_daemon.py"
|
||||
daemon_script_path.write_text(daemon_script, encoding='utf-8')
|
||||
daemon_script_path.write_text(daemon_script, encoding="utf-8")
|
||||
daemon_script_path.chmod(0o755)
|
||||
|
||||
# Start daemon process (fully detached)
|
||||
@@ -950,19 +942,16 @@ except Exception as e:
|
||||
if self.force:
|
||||
# Force mode: No output, fully silent
|
||||
subprocess.Popen(
|
||||
['nohup', 'python3', str(daemon_script_path)],
|
||||
["nohup", "python3", str(daemon_script_path)],
|
||||
stdout=subprocess.DEVNULL,
|
||||
stderr=subprocess.DEVNULL,
|
||||
start_new_session=True
|
||||
start_new_session=True,
|
||||
)
|
||||
else:
|
||||
# Normal mode: Log to file
|
||||
with open(log_file, 'w') as log:
|
||||
with open(log_file, "w") as log:
|
||||
subprocess.Popen(
|
||||
['nohup', 'python3', str(daemon_script_path)],
|
||||
stdout=log,
|
||||
stderr=log,
|
||||
start_new_session=True
|
||||
["nohup", "python3", str(daemon_script_path)], stdout=log, stderr=log, start_new_session=True
|
||||
)
|
||||
|
||||
# Give daemon time to start
|
||||
@@ -971,7 +960,7 @@ except Exception as e:
|
||||
# Read status to verify it started
|
||||
status = self.read_status()
|
||||
|
||||
if status and status.get('status') in ['pending', 'running']:
|
||||
if status and status.get("status") in ["pending", "running"]:
|
||||
print("✅ Daemon process started successfully!")
|
||||
print()
|
||||
print("📊 Monitoring:")
|
||||
@@ -1032,43 +1021,31 @@ Mode Comparison:
|
||||
Force Mode (Default ON):
|
||||
By default, all modes skip confirmations (auto-yes).
|
||||
Use --no-force to enable confirmation prompts.
|
||||
"""
|
||||
""",
|
||||
)
|
||||
|
||||
parser.add_argument("skill_directory", help="Path to skill directory (e.g., output/react/)")
|
||||
|
||||
parser.add_argument(
|
||||
"--interactive-enhancement",
|
||||
action="store_true",
|
||||
help="Open terminal window for enhancement (default: headless mode)",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'skill_directory',
|
||||
help='Path to skill directory (e.g., output/react/)'
|
||||
"--background", action="store_true", help="Run in background and return immediately (non-blocking)"
|
||||
)
|
||||
|
||||
parser.add_argument("--daemon", action="store_true", help="Run as persistent daemon process (fully detached)")
|
||||
|
||||
parser.add_argument(
|
||||
"--no-force",
|
||||
action="store_true",
|
||||
help="Disable force mode: enable confirmation prompts (default: force mode ON)",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'--interactive-enhancement',
|
||||
action='store_true',
|
||||
help='Open terminal window for enhancement (default: headless mode)'
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'--background',
|
||||
action='store_true',
|
||||
help='Run in background and return immediately (non-blocking)'
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'--daemon',
|
||||
action='store_true',
|
||||
help='Run as persistent daemon process (fully detached)'
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'--no-force',
|
||||
action='store_true',
|
||||
help='Disable force mode: enable confirmation prompts (default: force mode ON)'
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'--timeout',
|
||||
type=int,
|
||||
default=600,
|
||||
help='Timeout in seconds for headless mode (default: 600 = 10 minutes)'
|
||||
"--timeout", type=int, default=600, help="Timeout in seconds for headless mode (default: 600 = 10 minutes)"
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
@@ -1084,12 +1061,7 @@ Force Mode (Default ON):
|
||||
# Force mode is ON by default, use --no-force to disable
|
||||
enhancer = LocalSkillEnhancer(args.skill_directory, force=not args.no_force)
|
||||
headless = not args.interactive_enhancement # Invert: default is headless
|
||||
success = enhancer.run(
|
||||
headless=headless,
|
||||
timeout=args.timeout,
|
||||
background=args.background,
|
||||
daemon=args.daemon
|
||||
)
|
||||
success = enhancer.run(headless=headless, timeout=args.timeout, background=args.background, daemon=args.daemon)
|
||||
|
||||
sys.exit(0 if success else 1)
|
||||
|
||||
|
||||
@@ -10,9 +10,8 @@ Usage:
|
||||
skill-seekers enhance-status output/react/ --json
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
@@ -32,7 +31,7 @@ def read_status(skill_dir):
|
||||
return None
|
||||
|
||||
try:
|
||||
return json.loads(status_file.read_text(encoding='utf-8'))
|
||||
return json.loads(status_file.read_text(encoding="utf-8"))
|
||||
except Exception as e:
|
||||
return {"error": f"Failed to read status: {e}"}
|
||||
|
||||
@@ -53,26 +52,21 @@ def format_status(status):
|
||||
return f"❌ {status['error']}"
|
||||
|
||||
# Status emoji mapping
|
||||
status_emojis = {
|
||||
"pending": "⏳",
|
||||
"running": "🔄",
|
||||
"completed": "✅",
|
||||
"failed": "❌"
|
||||
}
|
||||
status_emojis = {"pending": "⏳", "running": "🔄", "completed": "✅", "failed": "❌"}
|
||||
|
||||
emoji = status_emojis.get(status.get('status', ''), '❓')
|
||||
status_text = status.get('status', 'unknown').upper()
|
||||
message = status.get('message', '')
|
||||
progress = status.get('progress', 0.0)
|
||||
timestamp = status.get('timestamp', 'unknown')
|
||||
error = status.get('error')
|
||||
pid = status.get('pid')
|
||||
emoji = status_emojis.get(status.get("status", ""), "❓")
|
||||
status_text = status.get("status", "unknown").upper()
|
||||
message = status.get("message", "")
|
||||
progress = status.get("progress", 0.0)
|
||||
timestamp = status.get("timestamp", "unknown")
|
||||
error = status.get("error")
|
||||
pid = status.get("pid")
|
||||
|
||||
# Build output
|
||||
lines = []
|
||||
lines.append(f"\n{'='*60}")
|
||||
lines.append(f"\n{'=' * 60}")
|
||||
lines.append(f"ENHANCEMENT STATUS: {status_text}")
|
||||
lines.append(f"{'='*60}\n")
|
||||
lines.append(f"{'=' * 60}\n")
|
||||
|
||||
lines.append(f"{emoji} Status: {status_text}")
|
||||
|
||||
@@ -81,7 +75,7 @@ def format_status(status):
|
||||
|
||||
if progress > 0:
|
||||
progress_pct = int(progress * 100)
|
||||
progress_bar = '█' * (progress_pct // 5) + '░' * (20 - progress_pct // 5)
|
||||
progress_bar = "█" * (progress_pct // 5) + "░" * (20 - progress_pct // 5)
|
||||
lines.append(f" Progress: [{progress_bar}] {progress_pct}%")
|
||||
|
||||
if pid:
|
||||
@@ -94,7 +88,7 @@ def format_status(status):
|
||||
|
||||
lines.append("")
|
||||
|
||||
return '\n'.join(lines)
|
||||
return "\n".join(lines)
|
||||
|
||||
|
||||
def watch_status(skill_dir, interval=2):
|
||||
@@ -106,7 +100,7 @@ def watch_status(skill_dir, interval=2):
|
||||
"""
|
||||
print(f"👀 Watching enhancement status for: {skill_dir}")
|
||||
print(f" Update interval: {interval} seconds")
|
||||
print(f" Press Ctrl+C to stop\n")
|
||||
print(" Press Ctrl+C to stop\n")
|
||||
|
||||
try:
|
||||
last_status = None
|
||||
@@ -123,7 +117,7 @@ def watch_status(skill_dir, interval=2):
|
||||
last_status = status
|
||||
|
||||
# Exit if completed or failed
|
||||
if status and status.get('status') in ['completed', 'failed']:
|
||||
if status and status.get("status") in ["completed", "failed"]:
|
||||
break
|
||||
|
||||
time.sleep(interval)
|
||||
@@ -149,32 +143,18 @@ Examples:
|
||||
|
||||
# Get JSON output (for scripts)
|
||||
skill-seekers enhance-status output/react/ --json
|
||||
"""
|
||||
""",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'skill_directory',
|
||||
help='Path to skill directory (e.g., output/react/)'
|
||||
)
|
||||
parser.add_argument("skill_directory", help="Path to skill directory (e.g., output/react/)")
|
||||
|
||||
parser.add_argument(
|
||||
'--watch', '-w',
|
||||
action='store_true',
|
||||
help='Watch status in real-time (updates every 2 seconds)'
|
||||
"--watch", "-w", action="store_true", help="Watch status in real-time (updates every 2 seconds)"
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'--json',
|
||||
action='store_true',
|
||||
help='Output raw JSON (for scripting)'
|
||||
)
|
||||
parser.add_argument("--json", action="store_true", help="Output raw JSON (for scripting)")
|
||||
|
||||
parser.add_argument(
|
||||
'--interval',
|
||||
type=int,
|
||||
default=2,
|
||||
help='Watch update interval in seconds (default: 2)'
|
||||
)
|
||||
parser.add_argument("--interval", type=int, default=2, help="Watch update interval in seconds (default: 2)")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
@@ -197,9 +177,9 @@ Examples:
|
||||
# Exit code based on status
|
||||
if not status:
|
||||
sys.exit(2) # No status found
|
||||
elif status.get('status') == 'completed':
|
||||
elif status.get("status") == "completed":
|
||||
sys.exit(0) # Success
|
||||
elif status.get('status') == 'failed':
|
||||
elif status.get("status") == "failed":
|
||||
sys.exit(1) # Failed
|
||||
else:
|
||||
sys.exit(0) # In progress
|
||||
|
||||
@@ -4,23 +4,20 @@ Page Count Estimator for Skill Seeker
|
||||
Quickly estimates how many pages a config will scrape without downloading content
|
||||
"""
|
||||
|
||||
import sys
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
from urllib.parse import urljoin, urlparse
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
from urllib.parse import urljoin, urlparse
|
||||
import time
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
# Add parent directory to path for imports when run as script
|
||||
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
|
||||
from skill_seekers.cli.constants import (
|
||||
DEFAULT_RATE_LIMIT,
|
||||
DEFAULT_MAX_DISCOVERY,
|
||||
DISCOVERY_THRESHOLD
|
||||
)
|
||||
from skill_seekers.cli.constants import DEFAULT_MAX_DISCOVERY, DEFAULT_RATE_LIMIT, DISCOVERY_THRESHOLD
|
||||
|
||||
|
||||
def estimate_pages(config, max_discovery=DEFAULT_MAX_DISCOVERY, timeout=30):
|
||||
@@ -35,20 +32,20 @@ def estimate_pages(config, max_discovery=DEFAULT_MAX_DISCOVERY, timeout=30):
|
||||
Returns:
|
||||
dict with estimation results
|
||||
"""
|
||||
base_url = config['base_url']
|
||||
start_urls = config.get('start_urls', [base_url])
|
||||
url_patterns = config.get('url_patterns', {'include': [], 'exclude': []})
|
||||
rate_limit = config.get('rate_limit', DEFAULT_RATE_LIMIT)
|
||||
base_url = config["base_url"]
|
||||
start_urls = config.get("start_urls", [base_url])
|
||||
url_patterns = config.get("url_patterns", {"include": [], "exclude": []})
|
||||
rate_limit = config.get("rate_limit", DEFAULT_RATE_LIMIT)
|
||||
|
||||
visited = set()
|
||||
pending = list(start_urls)
|
||||
discovered = 0
|
||||
|
||||
include_patterns = url_patterns.get('include', [])
|
||||
exclude_patterns = url_patterns.get('exclude', [])
|
||||
include_patterns = url_patterns.get("include", [])
|
||||
exclude_patterns = url_patterns.get("exclude", [])
|
||||
|
||||
# Handle unlimited mode
|
||||
unlimited = (max_discovery == -1 or max_discovery is None)
|
||||
unlimited = max_discovery == -1 or max_discovery is None
|
||||
|
||||
print(f"🔍 Estimating pages for: {config['name']}")
|
||||
print(f"📍 Base URL: {base_url}")
|
||||
@@ -56,8 +53,8 @@ def estimate_pages(config, max_discovery=DEFAULT_MAX_DISCOVERY, timeout=30):
|
||||
print(f"⏱️ Rate limit: {rate_limit}s")
|
||||
|
||||
if unlimited:
|
||||
print(f"🔢 Max discovery: UNLIMITED (will discover all pages)")
|
||||
print(f"⚠️ WARNING: This may take a long time!")
|
||||
print("🔢 Max discovery: UNLIMITED (will discover all pages)")
|
||||
print("⚠️ WARNING: This may take a long time!")
|
||||
else:
|
||||
print(f"🔢 Max discovery: {max_discovery}")
|
||||
|
||||
@@ -80,26 +77,26 @@ def estimate_pages(config, max_discovery=DEFAULT_MAX_DISCOVERY, timeout=30):
|
||||
if discovered % 10 == 0:
|
||||
elapsed = time.time() - start_time
|
||||
rate = discovered / elapsed if elapsed > 0 else 0
|
||||
print(f"⏳ Discovered: {discovered} pages ({rate:.1f} pages/sec)", end='\r')
|
||||
print(f"⏳ Discovered: {discovered} pages ({rate:.1f} pages/sec)", end="\r")
|
||||
|
||||
try:
|
||||
# HEAD request first to check if page exists (faster)
|
||||
head_response = requests.head(url, timeout=timeout, allow_redirects=True)
|
||||
|
||||
# Skip non-HTML content
|
||||
content_type = head_response.headers.get('Content-Type', '')
|
||||
if 'text/html' not in content_type:
|
||||
content_type = head_response.headers.get("Content-Type", "")
|
||||
if "text/html" not in content_type:
|
||||
continue
|
||||
|
||||
# Now GET the page to find links
|
||||
response = requests.get(url, timeout=timeout)
|
||||
response.raise_for_status()
|
||||
|
||||
soup = BeautifulSoup(response.content, 'html.parser')
|
||||
soup = BeautifulSoup(response.content, "html.parser")
|
||||
|
||||
# Find all links
|
||||
for link in soup.find_all('a', href=True):
|
||||
href = link['href']
|
||||
for link in soup.find_all("a", href=True):
|
||||
href = link["href"]
|
||||
full_url = urljoin(url, href)
|
||||
|
||||
# Normalize URL
|
||||
@@ -117,10 +114,10 @@ def estimate_pages(config, max_discovery=DEFAULT_MAX_DISCOVERY, timeout=30):
|
||||
# Rate limiting
|
||||
time.sleep(rate_limit)
|
||||
|
||||
except requests.RequestException as e:
|
||||
except requests.RequestException:
|
||||
# Silently skip errors during estimation
|
||||
pass
|
||||
except Exception as e:
|
||||
except Exception:
|
||||
# Silently skip other errors
|
||||
pass
|
||||
|
||||
@@ -128,13 +125,13 @@ def estimate_pages(config, max_discovery=DEFAULT_MAX_DISCOVERY, timeout=30):
|
||||
|
||||
# Results
|
||||
results = {
|
||||
'discovered': discovered,
|
||||
'pending': len(pending),
|
||||
'estimated_total': discovered + len(pending),
|
||||
'elapsed_seconds': round(elapsed, 2),
|
||||
'discovery_rate': round(discovered / elapsed if elapsed > 0 else 0, 2),
|
||||
'hit_limit': (not unlimited) and (discovered >= max_discovery),
|
||||
'unlimited': unlimited
|
||||
"discovered": discovered,
|
||||
"pending": len(pending),
|
||||
"estimated_total": discovered + len(pending),
|
||||
"elapsed_seconds": round(elapsed, 2),
|
||||
"discovery_rate": round(discovered / elapsed if elapsed > 0 else 0, 2),
|
||||
"hit_limit": (not unlimited) and (discovered >= max_discovery),
|
||||
"unlimited": unlimited,
|
||||
}
|
||||
|
||||
return results
|
||||
@@ -143,7 +140,7 @@ def estimate_pages(config, max_discovery=DEFAULT_MAX_DISCOVERY, timeout=30):
|
||||
def is_valid_url(url, base_url, include_patterns, exclude_patterns):
|
||||
"""Check if URL should be crawled"""
|
||||
# Must be same domain
|
||||
if not url.startswith(base_url.rstrip('/')):
|
||||
if not url.startswith(base_url.rstrip("/")):
|
||||
return False
|
||||
|
||||
# Check exclude patterns first
|
||||
@@ -180,11 +177,11 @@ def print_results(results, config):
|
||||
print(f"⏱️ Time Elapsed: {results['elapsed_seconds']}s")
|
||||
print(f"⚡ Discovery Rate: {results['discovery_rate']} pages/sec")
|
||||
|
||||
if results.get('unlimited', False):
|
||||
if results.get("unlimited", False):
|
||||
print()
|
||||
print("✅ UNLIMITED MODE - Discovered all reachable pages")
|
||||
print(f" Total pages: {results['estimated_total']}")
|
||||
elif results['hit_limit']:
|
||||
elif results["hit_limit"]:
|
||||
print()
|
||||
print("⚠️ Hit discovery limit - actual total may be higher")
|
||||
print(" Increase max_discovery parameter for more accurate estimate")
|
||||
@@ -195,8 +192,8 @@ def print_results(results, config):
|
||||
print("=" * 70)
|
||||
print()
|
||||
|
||||
estimated = results['estimated_total']
|
||||
current_max = config.get('max_pages', 100)
|
||||
estimated = results["estimated_total"]
|
||||
current_max = config.get("max_pages", 100)
|
||||
|
||||
if estimated <= current_max:
|
||||
print(f"✅ Current max_pages ({current_max}) is sufficient")
|
||||
@@ -207,7 +204,7 @@ def print_results(results, config):
|
||||
print(f" (Estimated {estimated} + 50 buffer)")
|
||||
|
||||
# Estimate time for full scrape
|
||||
rate_limit = config.get('rate_limit', DEFAULT_RATE_LIMIT)
|
||||
rate_limit = config.get("rate_limit", DEFAULT_RATE_LIMIT)
|
||||
estimated_time = (estimated * rate_limit) / 60 # in minutes
|
||||
|
||||
print()
|
||||
@@ -220,7 +217,7 @@ def print_results(results, config):
|
||||
def load_config(config_path):
|
||||
"""Load configuration from JSON file"""
|
||||
try:
|
||||
with open(config_path, 'r') as f:
|
||||
with open(config_path) as f:
|
||||
config = json.load(f)
|
||||
return config
|
||||
except FileNotFoundError:
|
||||
@@ -298,7 +295,7 @@ def list_all_configs():
|
||||
|
||||
# Try to load the config to get name and description
|
||||
try:
|
||||
with open(config_file, 'r') as f:
|
||||
with open(config_file) as f:
|
||||
config_data = json.load(f)
|
||||
|
||||
name = config_data.get("name", config_file.stem)
|
||||
@@ -308,20 +305,19 @@ def list_all_configs():
|
||||
if len(description) > 60:
|
||||
description = description[:57] + "..."
|
||||
|
||||
by_category[category].append({
|
||||
"file": config_file.name,
|
||||
"path": str(rel_path),
|
||||
"name": name,
|
||||
"description": description
|
||||
})
|
||||
by_category[category].append(
|
||||
{"file": config_file.name, "path": str(rel_path), "name": name, "description": description}
|
||||
)
|
||||
except Exception as e:
|
||||
# If we can't parse the config, just use the filename
|
||||
by_category[category].append({
|
||||
"file": config_file.name,
|
||||
"path": str(rel_path),
|
||||
"name": config_file.stem,
|
||||
"description": f"⚠️ Error loading config: {e}"
|
||||
})
|
||||
by_category[category].append(
|
||||
{
|
||||
"file": config_file.name,
|
||||
"path": str(rel_path),
|
||||
"name": config_file.stem,
|
||||
"description": f"⚠️ Error loading config: {e}",
|
||||
}
|
||||
)
|
||||
|
||||
# Print configs by category
|
||||
total = 0
|
||||
@@ -351,7 +347,7 @@ def main():
|
||||
import argparse
|
||||
|
||||
parser = argparse.ArgumentParser(
|
||||
description='Estimate page count for Skill Seeker configs',
|
||||
description="Estimate page count for Skill Seeker configs",
|
||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||
epilog="""
|
||||
Examples:
|
||||
@@ -366,18 +362,25 @@ Examples:
|
||||
|
||||
# Quick estimate (stop at 100 pages)
|
||||
skill-seekers estimate configs/vue.json --max-discovery 100
|
||||
"""
|
||||
""",
|
||||
)
|
||||
|
||||
parser.add_argument('config', nargs='?', help='Path to config JSON file')
|
||||
parser.add_argument('--all', action='store_true',
|
||||
help='List all available configs from api/configs_repo/official/')
|
||||
parser.add_argument('--max-discovery', '-m', type=int, default=DEFAULT_MAX_DISCOVERY,
|
||||
help=f'Maximum pages to discover (default: {DEFAULT_MAX_DISCOVERY}, use -1 for unlimited)')
|
||||
parser.add_argument('--unlimited', '-u', action='store_true',
|
||||
help='Remove discovery limit - discover all pages (same as --max-discovery -1)')
|
||||
parser.add_argument('--timeout', '-t', type=int, default=30,
|
||||
help='HTTP request timeout in seconds (default: 30)')
|
||||
parser.add_argument("config", nargs="?", help="Path to config JSON file")
|
||||
parser.add_argument("--all", action="store_true", help="List all available configs from api/configs_repo/official/")
|
||||
parser.add_argument(
|
||||
"--max-discovery",
|
||||
"-m",
|
||||
type=int,
|
||||
default=DEFAULT_MAX_DISCOVERY,
|
||||
help=f"Maximum pages to discover (default: {DEFAULT_MAX_DISCOVERY}, use -1 for unlimited)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--unlimited",
|
||||
"-u",
|
||||
action="store_true",
|
||||
help="Remove discovery limit - discover all pages (same as --max-discovery -1)",
|
||||
)
|
||||
parser.add_argument("--timeout", "-t", type=int, default=30, help="HTTP request timeout in seconds (default: 30)")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
@@ -401,7 +404,7 @@ Examples:
|
||||
print_results(results, config)
|
||||
|
||||
# Return exit code based on results
|
||||
if results['hit_limit']:
|
||||
if results["hit_limit"]:
|
||||
return 2 # Warning: hit limit
|
||||
return 0 # Success
|
||||
|
||||
@@ -413,5 +416,5 @@ Examples:
|
||||
return 1
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main())
|
||||
|
||||
@@ -12,17 +12,17 @@ Phase 4 enhancements:
|
||||
- GitHub issue links for context
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import sys
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Any, Tuple, Optional
|
||||
from typing import Any, Optional
|
||||
|
||||
# Import three-stream data classes (Phase 1)
|
||||
try:
|
||||
from .github_fetcher import ThreeStreamData, DocsStream, InsightsStream
|
||||
from .merge_sources import categorize_issues_by_topic
|
||||
from .github_fetcher import DocsStream, InsightsStream, ThreeStreamData
|
||||
from .markdown_cleaner import MarkdownCleaner
|
||||
from .merge_sources import categorize_issues_by_topic
|
||||
except ImportError:
|
||||
# Fallback if github_fetcher not available
|
||||
ThreeStreamData = None
|
||||
@@ -34,10 +34,9 @@ except ImportError:
|
||||
class RouterGenerator:
|
||||
"""Generates router skills that direct to specialized sub-skills with GitHub integration"""
|
||||
|
||||
def __init__(self,
|
||||
config_paths: List[str],
|
||||
router_name: str = None,
|
||||
github_streams: Optional['ThreeStreamData'] = None):
|
||||
def __init__(
|
||||
self, config_paths: list[str], router_name: str = None, github_streams: Optional["ThreeStreamData"] = None
|
||||
):
|
||||
"""
|
||||
Initialize router generator with optional GitHub streams.
|
||||
|
||||
@@ -60,21 +59,21 @@ class RouterGenerator:
|
||||
if github_streams and github_streams.insights_stream:
|
||||
self.github_metadata = github_streams.insights_stream.metadata
|
||||
self.github_issues = {
|
||||
'common_problems': github_streams.insights_stream.common_problems,
|
||||
'known_solutions': github_streams.insights_stream.known_solutions,
|
||||
'top_labels': github_streams.insights_stream.top_labels
|
||||
"common_problems": github_streams.insights_stream.common_problems,
|
||||
"known_solutions": github_streams.insights_stream.known_solutions,
|
||||
"top_labels": github_streams.insights_stream.top_labels,
|
||||
}
|
||||
|
||||
if github_streams and github_streams.docs_stream:
|
||||
self.github_docs = {
|
||||
'readme': github_streams.docs_stream.readme,
|
||||
'contributing': github_streams.docs_stream.contributing
|
||||
"readme": github_streams.docs_stream.readme,
|
||||
"contributing": github_streams.docs_stream.contributing,
|
||||
}
|
||||
|
||||
def load_config(self, path: Path) -> Dict[str, Any]:
|
||||
def load_config(self, path: Path) -> dict[str, Any]:
|
||||
"""Load a config file"""
|
||||
try:
|
||||
with open(path, 'r') as f:
|
||||
with open(path) as f:
|
||||
return json.load(f)
|
||||
except Exception as e:
|
||||
print(f"❌ Error loading {path}: {e}")
|
||||
@@ -83,17 +82,17 @@ class RouterGenerator:
|
||||
def infer_router_name(self) -> str:
|
||||
"""Infer router name from sub-skill names"""
|
||||
# Find common prefix
|
||||
names = [cfg['name'] for cfg in self.configs]
|
||||
names = [cfg["name"] for cfg in self.configs]
|
||||
if not names:
|
||||
return "router"
|
||||
|
||||
# Get common prefix before first dash
|
||||
first_name = names[0]
|
||||
if '-' in first_name:
|
||||
return first_name.split('-')[0]
|
||||
if "-" in first_name:
|
||||
return first_name.split("-")[0]
|
||||
return first_name
|
||||
|
||||
def extract_routing_keywords(self) -> Dict[str, List[str]]:
|
||||
def extract_routing_keywords(self) -> dict[str, list[str]]:
|
||||
"""
|
||||
Extract keywords for routing to each skill (Phase 4 enhanced).
|
||||
|
||||
@@ -103,26 +102,26 @@ class RouterGenerator:
|
||||
routing = {}
|
||||
|
||||
for config in self.configs:
|
||||
name = config['name']
|
||||
name = config["name"]
|
||||
keywords = []
|
||||
|
||||
# Extract from categories (base weight: 1x)
|
||||
if 'categories' in config:
|
||||
keywords.extend(config['categories'].keys())
|
||||
if "categories" in config:
|
||||
keywords.extend(config["categories"].keys())
|
||||
|
||||
# Extract from name (part after dash)
|
||||
if '-' in name:
|
||||
skill_topic = name.split('-', 1)[1]
|
||||
if "-" in name:
|
||||
skill_topic = name.split("-", 1)[1]
|
||||
keywords.append(skill_topic)
|
||||
|
||||
# Phase 4: Add GitHub issue labels (weight 2x by including twice)
|
||||
if self.github_issues:
|
||||
# Get top labels related to this skill topic
|
||||
top_labels = self.github_issues.get('top_labels', [])
|
||||
top_labels = self.github_issues.get("top_labels", [])
|
||||
skill_keywords = set(keywords)
|
||||
|
||||
for label_info in top_labels[:10]: # Top 10 labels
|
||||
label = label_info['label'].lower()
|
||||
label = label_info["label"].lower()
|
||||
|
||||
# Check if label relates to any skill keyword
|
||||
if any(keyword.lower() in label or label in keyword.lower() for keyword in skill_keywords):
|
||||
@@ -141,7 +140,7 @@ class RouterGenerator:
|
||||
|
||||
return routing
|
||||
|
||||
def _extract_skill_specific_labels(self, skill_name: str, skill_keywords: set) -> List[str]:
|
||||
def _extract_skill_specific_labels(self, skill_name: str, skill_keywords: set) -> list[str]:
|
||||
"""
|
||||
Extract labels from GitHub issues that match this specific skill.
|
||||
|
||||
@@ -159,14 +158,14 @@ class RouterGenerator:
|
||||
if not self.github_issues:
|
||||
return []
|
||||
|
||||
common_problems = self.github_issues.get('common_problems', [])
|
||||
known_solutions = self.github_issues.get('known_solutions', [])
|
||||
common_problems = self.github_issues.get("common_problems", [])
|
||||
known_solutions = self.github_issues.get("known_solutions", [])
|
||||
all_issues = common_problems + known_solutions
|
||||
|
||||
matching_labels = set()
|
||||
|
||||
for issue in all_issues:
|
||||
issue_labels = issue.get('labels', [])
|
||||
issue_labels = issue.get("labels", [])
|
||||
issue_labels_lower = [label.lower() for label in issue_labels]
|
||||
|
||||
# Check if this issue relates to the skill
|
||||
@@ -180,13 +179,20 @@ class RouterGenerator:
|
||||
# Add ALL labels from this matching issue
|
||||
for label in issue_labels_lower:
|
||||
# Skip generic labels that don't add routing value
|
||||
if label not in ['bug', 'enhancement', 'question', 'help wanted',
|
||||
'good first issue', 'documentation', 'duplicate']:
|
||||
if label not in [
|
||||
"bug",
|
||||
"enhancement",
|
||||
"question",
|
||||
"help wanted",
|
||||
"good first issue",
|
||||
"documentation",
|
||||
"duplicate",
|
||||
]:
|
||||
matching_labels.add(label)
|
||||
|
||||
return list(matching_labels)
|
||||
|
||||
def _generate_frontmatter(self, routing_keywords: Dict[str, List[str]]) -> str:
|
||||
def _generate_frontmatter(self, routing_keywords: dict[str, list[str]]) -> str:
|
||||
"""
|
||||
Generate YAML frontmatter compliant with agentskills.io spec.
|
||||
|
||||
@@ -201,16 +207,16 @@ class RouterGenerator:
|
||||
# Build comprehensive description from all sub-skills
|
||||
all_topics = []
|
||||
for config in self.configs:
|
||||
desc = config.get('description', '')
|
||||
desc = config.get("description", "")
|
||||
# Extract key topics from description (simple extraction)
|
||||
topics = [word.strip() for word in desc.split(',') if word.strip()]
|
||||
topics = [word.strip() for word in desc.split(",") if word.strip()]
|
||||
all_topics.extend(topics[:2]) # Max 2 topics per skill
|
||||
|
||||
# Create keyword-rich description
|
||||
unique_topics = list(dict.fromkeys(all_topics))[:7] # Top 7 unique topics
|
||||
|
||||
if unique_topics:
|
||||
topics_str = ', '.join(unique_topics)
|
||||
topics_str = ", ".join(unique_topics)
|
||||
description = f"{self.router_name.title()} framework. Use when working with: {topics_str}"
|
||||
else:
|
||||
description = f"Use when working with {self.router_name.title()} development and programming"
|
||||
@@ -225,21 +231,21 @@ class RouterGenerator:
|
||||
|
||||
# Try to get language-specific compatibility if GitHub metadata available
|
||||
if self.github_metadata:
|
||||
language = self.github_metadata.get('language', '')
|
||||
language = self.github_metadata.get("language", "")
|
||||
compatibility_map = {
|
||||
'Python': f'Python 3.10+, requires {self.router_name} package',
|
||||
'JavaScript': f'Node.js 18+, requires {self.router_name} package',
|
||||
'TypeScript': f'Node.js 18+, TypeScript 5+, requires {self.router_name} package',
|
||||
'Go': f'Go 1.20+, requires {self.router_name} package',
|
||||
'Rust': f'Rust 1.70+, requires {self.router_name} package',
|
||||
'Java': f'Java 17+, requires {self.router_name} package',
|
||||
"Python": f"Python 3.10+, requires {self.router_name} package",
|
||||
"JavaScript": f"Node.js 18+, requires {self.router_name} package",
|
||||
"TypeScript": f"Node.js 18+, TypeScript 5+, requires {self.router_name} package",
|
||||
"Go": f"Go 1.20+, requires {self.router_name} package",
|
||||
"Rust": f"Rust 1.70+, requires {self.router_name} package",
|
||||
"Java": f"Java 17+, requires {self.router_name} package",
|
||||
}
|
||||
if language in compatibility_map:
|
||||
compatibility = compatibility_map[language]
|
||||
|
||||
# Try to extract license
|
||||
if isinstance(self.github_metadata.get('license'), dict):
|
||||
license_info = self.github_metadata['license'].get('name', 'MIT')
|
||||
if isinstance(self.github_metadata.get("license"), dict):
|
||||
license_info = self.github_metadata["license"].get("name", "MIT")
|
||||
|
||||
frontmatter = f"""---
|
||||
name: {self.router_name}
|
||||
@@ -289,27 +295,27 @@ compatibility: {compatibility}
|
||||
"""
|
||||
# Remove router name prefix
|
||||
if skill_name.startswith(f"{self.router_name}-"):
|
||||
topic = skill_name[len(self.router_name)+1:]
|
||||
topic = skill_name[len(self.router_name) + 1 :]
|
||||
else:
|
||||
topic = skill_name
|
||||
|
||||
# Capitalize and add context
|
||||
topic = topic.replace('-', ' ').title()
|
||||
topic = topic.replace("-", " ").title()
|
||||
|
||||
# Add common suffixes for context
|
||||
topic_map = {
|
||||
'oauth': 'OAuth authentication',
|
||||
'auth': 'authentication',
|
||||
'async': 'async patterns',
|
||||
'api': 'API integration',
|
||||
'orm': 'ORM queries',
|
||||
'hooks': 'hooks',
|
||||
'routing': 'routing',
|
||||
'testing': 'testing',
|
||||
'2d': '2D development',
|
||||
'3d': '3D development',
|
||||
'scripting': 'scripting',
|
||||
'physics': 'physics',
|
||||
"oauth": "OAuth authentication",
|
||||
"auth": "authentication",
|
||||
"async": "async patterns",
|
||||
"api": "API integration",
|
||||
"orm": "ORM queries",
|
||||
"hooks": "hooks",
|
||||
"routing": "routing",
|
||||
"testing": "testing",
|
||||
"2d": "2D development",
|
||||
"3d": "3D development",
|
||||
"scripting": "scripting",
|
||||
"physics": "physics",
|
||||
}
|
||||
|
||||
topic_lower = topic.lower()
|
||||
@@ -319,7 +325,7 @@ compatibility: {compatibility}
|
||||
|
||||
return topic
|
||||
|
||||
def _generate_dynamic_examples(self, routing_keywords: Dict[str, List[str]]) -> str:
|
||||
def _generate_dynamic_examples(self, routing_keywords: dict[str, list[str]]) -> str:
|
||||
"""
|
||||
Generate examples dynamically from actual sub-skill names and keywords.
|
||||
|
||||
@@ -351,10 +357,7 @@ compatibility: {compatibility}
|
||||
topic = self._extract_topic_from_skill(first_skill)
|
||||
keyword = first_keywords[0] if first_keywords else topic
|
||||
|
||||
examples.append(
|
||||
f'**Q:** "How do I implement {keyword}?"\n'
|
||||
f'**A:** Activates {first_skill} skill'
|
||||
)
|
||||
examples.append(f'**Q:** "How do I implement {keyword}?"\n**A:** Activates {first_skill} skill')
|
||||
|
||||
# Example 2: Different skill (second sub-skill if available)
|
||||
if len(skill_names) >= 2:
|
||||
@@ -365,8 +368,7 @@ compatibility: {compatibility}
|
||||
keyword = second_keywords[0] if second_keywords else topic
|
||||
|
||||
examples.append(
|
||||
f'**Q:** "Working with {keyword} in {self.router_name.title()}"\n'
|
||||
f'**A:** Activates {second_skill} skill'
|
||||
f'**Q:** "Working with {keyword} in {self.router_name.title()}"\n**A:** Activates {second_skill} skill'
|
||||
)
|
||||
|
||||
# Example 3: Multi-skill activation (if 2+ skills)
|
||||
@@ -378,13 +380,12 @@ compatibility: {compatibility}
|
||||
topic_2 = self._extract_topic_from_skill(skill_2)
|
||||
|
||||
examples.append(
|
||||
f'**Q:** "Combining {topic_1} with {topic_2}"\n'
|
||||
f'**A:** Activates {skill_1} + {skill_2} skills'
|
||||
f'**Q:** "Combining {topic_1} with {topic_2}"\n**A:** Activates {skill_1} + {skill_2} skills'
|
||||
)
|
||||
|
||||
return '\n\n'.join(examples)
|
||||
return "\n\n".join(examples)
|
||||
|
||||
def _generate_examples_from_github(self, routing_keywords: Dict[str, List[str]]) -> str:
|
||||
def _generate_examples_from_github(self, routing_keywords: dict[str, list[str]]) -> str:
|
||||
"""
|
||||
Generate examples from real GitHub issue titles.
|
||||
|
||||
@@ -402,7 +403,7 @@ compatibility: {compatibility}
|
||||
return self._generate_dynamic_examples(routing_keywords)
|
||||
|
||||
examples = []
|
||||
common_problems = self.github_issues.get('common_problems', [])
|
||||
common_problems = self.github_issues.get("common_problems", [])
|
||||
|
||||
if not common_problems:
|
||||
return self._generate_dynamic_examples(routing_keywords)
|
||||
@@ -414,29 +415,26 @@ compatibility: {compatibility}
|
||||
|
||||
# Find first issue matching this skill's keywords
|
||||
for issue in common_problems:
|
||||
issue_labels = [label.lower() for label in issue.get('labels', [])]
|
||||
issue_labels = [label.lower() for label in issue.get("labels", [])]
|
||||
if any(label in skill_keywords_lower for label in issue_labels):
|
||||
matched_issue = issue
|
||||
common_problems.remove(issue) # Don't reuse same issue
|
||||
break
|
||||
|
||||
if matched_issue:
|
||||
title = matched_issue.get('title', '')
|
||||
title = matched_issue.get("title", "")
|
||||
question = self._convert_issue_to_question(title)
|
||||
examples.append(
|
||||
f'**Q:** "{question}"\n'
|
||||
f'**A:** Activates {skill_name} skill'
|
||||
)
|
||||
examples.append(f'**Q:** "{question}"\n**A:** Activates {skill_name} skill')
|
||||
else:
|
||||
# Fallback to keyword-based example for this skill
|
||||
topic = self._extract_topic_from_skill(skill_name)
|
||||
keyword = keywords[0] if keywords else topic
|
||||
examples.append(
|
||||
f'**Q:** "Working with {keyword} in {self.router_name.title()}"\n'
|
||||
f'**A:** Activates {skill_name} skill'
|
||||
f"**A:** Activates {skill_name} skill"
|
||||
)
|
||||
|
||||
return '\n\n'.join(examples) if examples else self._generate_dynamic_examples(routing_keywords)
|
||||
return "\n\n".join(examples) if examples else self._generate_dynamic_examples(routing_keywords)
|
||||
|
||||
def _convert_issue_to_question(self, issue_title: str) -> str:
|
||||
"""
|
||||
@@ -456,24 +454,24 @@ compatibility: {compatibility}
|
||||
title_lower = issue_title.lower()
|
||||
|
||||
# Pattern 1: Error/Failure issues
|
||||
if 'fail' in title_lower or 'error' in title_lower or 'issue' in title_lower:
|
||||
cleaned = issue_title.replace(' fails', '').replace(' errors', '').replace(' issue', '')
|
||||
if "fail" in title_lower or "error" in title_lower or "issue" in title_lower:
|
||||
cleaned = issue_title.replace(" fails", "").replace(" errors", "").replace(" issue", "")
|
||||
return f"How do I fix {cleaned.lower()}?"
|
||||
|
||||
# Pattern 2: Documentation requests
|
||||
if 'documentation' in title_lower or 'docs' in title_lower:
|
||||
cleaned = issue_title.replace(' documentation', '').replace(' docs', '')
|
||||
if "documentation" in title_lower or "docs" in title_lower:
|
||||
cleaned = issue_title.replace(" documentation", "").replace(" docs", "")
|
||||
return f"How do I use {cleaned.lower()}?"
|
||||
|
||||
# Pattern 3: Feature requests
|
||||
if title_lower.startswith('add ') or title_lower.startswith('added '):
|
||||
feature = issue_title.replace('Add ', '').replace('Added ', '')
|
||||
if title_lower.startswith("add ") or title_lower.startswith("added "):
|
||||
feature = issue_title.replace("Add ", "").replace("Added ", "")
|
||||
return f"How do I implement {feature.lower()}?"
|
||||
|
||||
# Default: Generic question
|
||||
return f"How do I handle {issue_title.lower()}?"
|
||||
|
||||
def _extract_common_patterns(self) -> List[Dict[str, str]]:
|
||||
def _extract_common_patterns(self) -> list[dict[str, str]]:
|
||||
"""
|
||||
Extract problem-solution patterns from closed GitHub issues.
|
||||
|
||||
@@ -487,25 +485,21 @@ compatibility: {compatibility}
|
||||
if not self.github_issues:
|
||||
return []
|
||||
|
||||
known_solutions = self.github_issues.get('known_solutions', [])
|
||||
known_solutions = self.github_issues.get("known_solutions", [])
|
||||
if not known_solutions:
|
||||
return []
|
||||
|
||||
patterns = []
|
||||
|
||||
# Top 5 closed issues with most engagement (comments indicate usefulness)
|
||||
top_solutions = sorted(known_solutions, key=lambda x: x.get('comments', 0), reverse=True)[:5]
|
||||
top_solutions = sorted(known_solutions, key=lambda x: x.get("comments", 0), reverse=True)[:5]
|
||||
|
||||
for issue in top_solutions:
|
||||
title = issue.get('title', '')
|
||||
number = issue.get('number', 0)
|
||||
title = issue.get("title", "")
|
||||
number = issue.get("number", 0)
|
||||
problem, solution = self._parse_issue_pattern(title)
|
||||
|
||||
patterns.append({
|
||||
'problem': problem,
|
||||
'solution': solution,
|
||||
'issue_number': number
|
||||
})
|
||||
patterns.append({"problem": problem, "solution": solution, "issue_number": number})
|
||||
|
||||
return patterns
|
||||
|
||||
@@ -530,24 +524,24 @@ compatibility: {compatibility}
|
||||
title_lower = issue_title.lower()
|
||||
|
||||
# Pattern 1: "Fixed X" → "X not working" / "See fix"
|
||||
if title_lower.startswith('fixed ') or title_lower.startswith('fix '):
|
||||
problem_text = issue_title.replace('Fixed ', '').replace('Fix ', '')
|
||||
if title_lower.startswith("fixed ") or title_lower.startswith("fix "):
|
||||
problem_text = issue_title.replace("Fixed ", "").replace("Fix ", "")
|
||||
return (f"{problem_text} not working", "See fix implementation details")
|
||||
|
||||
# Pattern 2: "Resolved X" → "X issue" / "See resolution"
|
||||
if title_lower.startswith('resolved ') or title_lower.startswith('resolve '):
|
||||
problem_text = issue_title.replace('Resolved ', '').replace('Resolve ', '')
|
||||
if title_lower.startswith("resolved ") or title_lower.startswith("resolve "):
|
||||
problem_text = issue_title.replace("Resolved ", "").replace("Resolve ", "")
|
||||
return (f"{problem_text} issue", "See resolution approach")
|
||||
|
||||
# Pattern 3: "Added X" → "Missing X" / "Use X"
|
||||
if title_lower.startswith('added ') or title_lower.startswith('add '):
|
||||
feature_text = issue_title.replace('Added ', '').replace('Add ', '')
|
||||
if title_lower.startswith("added ") or title_lower.startswith("add "):
|
||||
feature_text = issue_title.replace("Added ", "").replace("Add ", "")
|
||||
return (f"Missing {feature_text}", f"Use {feature_text} feature")
|
||||
|
||||
# Default: Use title as-is
|
||||
return (issue_title, "See issue for solution details")
|
||||
|
||||
def _detect_framework(self) -> Optional[str]:
|
||||
def _detect_framework(self) -> str | None:
|
||||
"""
|
||||
Detect framework from router name and GitHub metadata.
|
||||
|
||||
@@ -561,14 +555,14 @@ compatibility: {compatibility}
|
||||
router_lower = self.router_name.lower()
|
||||
|
||||
framework_keywords = {
|
||||
'fastapi': 'fastapi',
|
||||
'django': 'django',
|
||||
'flask': 'flask',
|
||||
'react': 'react',
|
||||
'vue': 'vue',
|
||||
'express': 'express',
|
||||
'fastmcp': 'fastmcp',
|
||||
'mcp': 'fastmcp',
|
||||
"fastapi": "fastapi",
|
||||
"django": "django",
|
||||
"flask": "flask",
|
||||
"react": "react",
|
||||
"vue": "vue",
|
||||
"express": "express",
|
||||
"fastmcp": "fastmcp",
|
||||
"mcp": "fastmcp",
|
||||
}
|
||||
|
||||
# Check router name first
|
||||
@@ -578,7 +572,7 @@ compatibility: {compatibility}
|
||||
|
||||
# Check GitHub description if available
|
||||
if self.github_metadata:
|
||||
description = self.github_metadata.get('description', '').lower()
|
||||
description = self.github_metadata.get("description", "").lower()
|
||||
for keyword, framework in framework_keywords.items():
|
||||
if keyword in description:
|
||||
return framework
|
||||
@@ -599,7 +593,7 @@ compatibility: {compatibility}
|
||||
Formatted Quick Start section with install + hello world code
|
||||
"""
|
||||
templates = {
|
||||
'fastapi': """## Quick Start
|
||||
"fastapi": """## Quick Start
|
||||
|
||||
```bash
|
||||
pip install fastapi uvicorn
|
||||
@@ -617,7 +611,7 @@ def read_root():
|
||||
# Run: uvicorn main:app --reload
|
||||
```
|
||||
""",
|
||||
'fastmcp': """## Quick Start
|
||||
"fastmcp": """## Quick Start
|
||||
|
||||
```bash
|
||||
pip install fastmcp
|
||||
@@ -633,7 +627,7 @@ def greet(name: str) -> str:
|
||||
return f"Hello, {name}!"
|
||||
```
|
||||
""",
|
||||
'django': """## Quick Start
|
||||
"django": """## Quick Start
|
||||
|
||||
```bash
|
||||
pip install django
|
||||
@@ -644,7 +638,7 @@ python manage.py runserver
|
||||
|
||||
Visit http://127.0.0.1:8000/ to see your Django app.
|
||||
""",
|
||||
'react': """## Quick Start
|
||||
"react": """## Quick Start
|
||||
|
||||
```bash
|
||||
npx create-react-app my-app
|
||||
@@ -677,16 +671,16 @@ export default App;
|
||||
all_topics = []
|
||||
|
||||
for config in self.configs:
|
||||
desc = config.get('description', '')
|
||||
desc = config.get("description", "")
|
||||
# Extract key topics from description (simple comma-separated extraction)
|
||||
topics = [topic.strip() for topic in desc.split(',') if topic.strip()]
|
||||
topics = [topic.strip() for topic in desc.split(",") if topic.strip()]
|
||||
all_topics.extend(topics[:2]) # Max 2 topics per skill
|
||||
|
||||
# Deduplicate and take top 5-7 topics
|
||||
unique_topics = list(dict.fromkeys(all_topics))[:7]
|
||||
|
||||
if not unique_topics:
|
||||
return f'Use when working with {self.router_name} development and programming'
|
||||
return f"Use when working with {self.router_name} development and programming"
|
||||
|
||||
# Format as user-friendly bulleted list
|
||||
description = f"""Use this skill when working with:
|
||||
@@ -695,8 +689,8 @@ export default App;
|
||||
|
||||
for topic in unique_topics:
|
||||
# Clean up topic text (remove "when working with" prefixes if present)
|
||||
topic = topic.replace('when working with', '').strip()
|
||||
topic = topic.replace('Use when', '').strip()
|
||||
topic = topic.replace("when working with", "").strip()
|
||||
topic = topic.replace("Use when", "").strip()
|
||||
if topic:
|
||||
description += f"- {topic}\n"
|
||||
|
||||
@@ -721,7 +715,10 @@ export default App;
|
||||
# NEW: Generate comprehensive description from all sub-skills
|
||||
when_to_use = self._generate_comprehensive_description()
|
||||
|
||||
skill_md = frontmatter + "\n\n" + f"""# {self.router_name.replace('-', ' ').title()} Documentation
|
||||
skill_md = (
|
||||
frontmatter
|
||||
+ "\n\n"
|
||||
+ f"""# {self.router_name.replace("-", " ").title()} Documentation
|
||||
|
||||
## When to Use This Skill
|
||||
|
||||
@@ -730,26 +727,27 @@ export default App;
|
||||
This is a router skill that directs your questions to specialized sub-skills for efficient, focused assistance.
|
||||
|
||||
"""
|
||||
)
|
||||
|
||||
# Phase 4: Add GitHub repository metadata
|
||||
if self.github_metadata:
|
||||
# NEW: Use html_url from GitHub metadata instead of base_url from config
|
||||
repo_url = self.github_metadata.get('html_url', '')
|
||||
stars = self.github_metadata.get('stars', 0)
|
||||
language = self.github_metadata.get('language', 'Unknown')
|
||||
description = self.github_metadata.get('description', '')
|
||||
repo_url = self.github_metadata.get("html_url", "")
|
||||
stars = self.github_metadata.get("stars", 0)
|
||||
language = self.github_metadata.get("language", "Unknown")
|
||||
description = self.github_metadata.get("description", "")
|
||||
|
||||
skill_md += f"""## Repository Info
|
||||
|
||||
**Repository:** {repo_url}
|
||||
**Stars:** ⭐ {stars:,} | **Language:** {language}
|
||||
{f'**Description:** {description}' if description else ''}
|
||||
{f"**Description:** {description}" if description else ""}
|
||||
|
||||
"""
|
||||
|
||||
# Phase 4: Add Quick Start from README
|
||||
if self.github_docs and self.github_docs.get('readme'):
|
||||
readme = self.github_docs['readme']
|
||||
if self.github_docs and self.github_docs.get("readme"):
|
||||
readme = self.github_docs["readme"]
|
||||
|
||||
# NEW: Clean HTML and extract meaningful content
|
||||
quick_start = self._extract_clean_readme_section(readme)
|
||||
@@ -768,14 +766,20 @@ This is a router skill that directs your questions to specialized sub-skills for
|
||||
if framework:
|
||||
hello_world = self._get_framework_hello_world(framework)
|
||||
if hello_world:
|
||||
skill_md += hello_world + "\n*Note: Generic template. See references/getting_started.md for project-specific setup.*\n\n"
|
||||
skill_md += (
|
||||
hello_world
|
||||
+ "\n*Note: Generic template. See references/getting_started.md for project-specific setup.*\n\n"
|
||||
)
|
||||
else:
|
||||
# No README available - try framework fallback
|
||||
framework = self._detect_framework()
|
||||
if framework:
|
||||
hello_world = self._get_framework_hello_world(framework)
|
||||
if hello_world:
|
||||
skill_md += hello_world + "\n*Note: Generic template. Check repository for specific installation instructions.*\n\n"
|
||||
skill_md += (
|
||||
hello_world
|
||||
+ "\n*Note: Generic template. Check repository for specific installation instructions.*\n\n"
|
||||
)
|
||||
|
||||
skill_md += """## How It Works
|
||||
|
||||
@@ -785,11 +789,11 @@ This skill analyzes your question and activates the appropriate specialized skil
|
||||
|
||||
# List sub-skills
|
||||
for config in self.configs:
|
||||
name = config['name']
|
||||
desc = config.get('description', '')
|
||||
name = config["name"]
|
||||
desc = config.get("description", "")
|
||||
# Remove router name prefix from description if present
|
||||
if desc.startswith(f"{self.router_name.title()} -"):
|
||||
desc = desc.split(' - ', 1)[1]
|
||||
desc = desc.split(" - ", 1)[1]
|
||||
|
||||
skill_md += f"### {name}\n{desc}\n\n"
|
||||
|
||||
@@ -808,7 +812,7 @@ The router analyzes your question for topic keywords and activates relevant skil
|
||||
skill_md += f"- {keyword_str} → **{skill_name}**\n"
|
||||
|
||||
# Quick reference
|
||||
skill_md += f"""
|
||||
skill_md += """
|
||||
|
||||
## Quick Reference
|
||||
|
||||
@@ -839,7 +843,7 @@ For quick answers, this router provides basic overview information. For detailed
|
||||
|
||||
# Phase 4: Add Common Issues from GitHub (Summary with Reference)
|
||||
if self.github_issues:
|
||||
common_problems = self.github_issues.get('common_problems', [])[:5] # Top 5
|
||||
common_problems = self.github_issues.get("common_problems", [])[:5] # Top 5
|
||||
|
||||
if common_problems:
|
||||
skill_md += """
|
||||
@@ -850,9 +854,9 @@ Top 5 GitHub issues from the community:
|
||||
|
||||
"""
|
||||
for i, issue in enumerate(common_problems, 1):
|
||||
title = issue.get('title', '')
|
||||
number = issue.get('number', 0)
|
||||
comments = issue.get('comments', 0)
|
||||
title = issue.get("title", "")
|
||||
number = issue.get("number", 0)
|
||||
comments = issue.get("comments", 0)
|
||||
|
||||
skill_md += f"{i}. **{title}** (Issue #{number}, {comments} comments)\n"
|
||||
|
||||
@@ -871,9 +875,9 @@ Problem-solution patterns from resolved GitHub issues:
|
||||
|
||||
"""
|
||||
for i, pattern in enumerate(patterns, 1):
|
||||
problem = pattern['problem']
|
||||
solution = pattern['solution']
|
||||
issue_num = pattern['issue_number']
|
||||
problem = pattern["problem"]
|
||||
solution = pattern["solution"]
|
||||
issue_num = pattern["issue_number"]
|
||||
|
||||
skill_md += f"**Pattern {i}**: {problem}\n"
|
||||
skill_md += f"→ **Solution**: {solution} ([Issue #{issue_num}](references/github_issues.md))\n\n"
|
||||
@@ -888,10 +892,10 @@ Detailed documentation available in:
|
||||
"""
|
||||
if self.github_issues:
|
||||
skill_md += "- `references/github_issues.md` - Community problems and solutions\n"
|
||||
if self.github_docs and self.github_docs.get('readme'):
|
||||
if self.github_docs and self.github_docs.get("readme"):
|
||||
skill_md += "- `references/getting_started.md` - Detailed setup guide\n"
|
||||
|
||||
skill_md += f"""
|
||||
skill_md += """
|
||||
|
||||
## Need Help?
|
||||
|
||||
@@ -904,7 +908,7 @@ Simply ask your question and mention the topic. The router will find the right s
|
||||
|
||||
return skill_md
|
||||
|
||||
def generate_subskill_issues_section(self, skill_name: str, topics: List[str]) -> str:
|
||||
def generate_subskill_issues_section(self, skill_name: str, topics: list[str]) -> str:
|
||||
"""
|
||||
Generate "Common Issues" section for a sub-skill (Phase 4).
|
||||
|
||||
@@ -918,8 +922,8 @@ Simply ask your question and mention the topic. The router will find the right s
|
||||
if not self.github_issues or not categorize_issues_by_topic:
|
||||
return ""
|
||||
|
||||
common_problems = self.github_issues.get('common_problems', [])
|
||||
known_solutions = self.github_issues.get('known_solutions', [])
|
||||
common_problems = self.github_issues.get("common_problems", [])
|
||||
known_solutions = self.github_issues.get("known_solutions", [])
|
||||
|
||||
# Categorize issues by topic
|
||||
categorized = categorize_issues_by_topic(common_problems, known_solutions, topics)
|
||||
@@ -944,11 +948,11 @@ GitHub issues related to this topic:
|
||||
issues_md += f"\n### {topic.title()}\n\n"
|
||||
|
||||
for issue in issues[:3]: # Top 3 per topic
|
||||
title = issue.get('title', '')
|
||||
number = issue.get('number', 0)
|
||||
state = issue.get('state', 'unknown')
|
||||
comments = issue.get('comments', 0)
|
||||
labels = issue.get('labels', [])
|
||||
title = issue.get("title", "")
|
||||
number = issue.get("number", 0)
|
||||
state = issue.get("state", "unknown")
|
||||
comments = issue.get("comments", 0)
|
||||
labels = issue.get("labels", [])
|
||||
|
||||
# Format issue
|
||||
state_icon = "🔴" if state == "open" else "✅"
|
||||
@@ -964,21 +968,24 @@ GitHub issues related to this topic:
|
||||
|
||||
return issues_md
|
||||
|
||||
def create_router_config(self) -> Dict[str, Any]:
|
||||
def create_router_config(self) -> dict[str, Any]:
|
||||
"""Create router configuration"""
|
||||
routing_keywords = self.extract_routing_keywords()
|
||||
|
||||
router_config = {
|
||||
"name": self.router_name,
|
||||
"description": self.base_config.get('description', f'Use when working with {self.router_name} documentation (router for multiple sub-skills)'),
|
||||
"base_url": self.base_config['base_url'],
|
||||
"selectors": self.base_config.get('selectors', {}),
|
||||
"url_patterns": self.base_config.get('url_patterns', {}),
|
||||
"rate_limit": self.base_config.get('rate_limit', 0.5),
|
||||
"description": self.base_config.get(
|
||||
"description",
|
||||
f"Use when working with {self.router_name} documentation (router for multiple sub-skills)",
|
||||
),
|
||||
"base_url": self.base_config["base_url"],
|
||||
"selectors": self.base_config.get("selectors", {}),
|
||||
"url_patterns": self.base_config.get("url_patterns", {}),
|
||||
"rate_limit": self.base_config.get("rate_limit", 0.5),
|
||||
"max_pages": 500, # Router only scrapes overview pages
|
||||
"_router": True,
|
||||
"_sub_skills": [cfg['name'] for cfg in self.configs],
|
||||
"_routing_keywords": routing_keywords
|
||||
"_sub_skills": [cfg["name"] for cfg in self.configs],
|
||||
"_routing_keywords": routing_keywords,
|
||||
}
|
||||
|
||||
return router_config
|
||||
@@ -993,34 +1000,38 @@ GitHub issues related to this topic:
|
||||
md = "# Common GitHub Issues\n\n"
|
||||
md += "Top issues reported by the community:\n\n"
|
||||
|
||||
common_problems = self.github_issues.get('common_problems', [])[:10] if self.github_issues else []
|
||||
known_solutions = self.github_issues.get('known_solutions', [])[:10] if self.github_issues else []
|
||||
common_problems = self.github_issues.get("common_problems", [])[:10] if self.github_issues else []
|
||||
known_solutions = self.github_issues.get("known_solutions", [])[:10] if self.github_issues else []
|
||||
|
||||
if common_problems:
|
||||
md += "## Open Issues (Common Problems)\n\n"
|
||||
for i, issue in enumerate(common_problems, 1):
|
||||
title = issue.get('title', '')
|
||||
number = issue.get('number', 0)
|
||||
comments = issue.get('comments', 0)
|
||||
labels = issue.get('labels', [])
|
||||
title = issue.get("title", "")
|
||||
number = issue.get("number", 0)
|
||||
comments = issue.get("comments", 0)
|
||||
labels = issue.get("labels", [])
|
||||
if isinstance(labels, list):
|
||||
labels_str = ', '.join(str(label) for label in labels)
|
||||
labels_str = ", ".join(str(label) for label in labels)
|
||||
else:
|
||||
labels_str = str(labels) if labels else ''
|
||||
labels_str = str(labels) if labels else ""
|
||||
|
||||
md += f"### {i}. {title}\n\n"
|
||||
md += f"**Issue**: #{number}\n"
|
||||
md += f"**Comments**: {comments}\n"
|
||||
if labels_str:
|
||||
md += f"**Labels**: {labels_str}\n"
|
||||
md += f"**Link**: https://github.com/{self.github_metadata.get('html_url', '').replace('https://github.com/', '')}/issues/{number}\n\n" if self.github_metadata else "\n\n"
|
||||
md += (
|
||||
f"**Link**: https://github.com/{self.github_metadata.get('html_url', '').replace('https://github.com/', '')}/issues/{number}\n\n"
|
||||
if self.github_metadata
|
||||
else "\n\n"
|
||||
)
|
||||
|
||||
if known_solutions:
|
||||
md += "\n## Closed Issues (Known Solutions)\n\n"
|
||||
for i, issue in enumerate(known_solutions, 1):
|
||||
title = issue.get('title', '')
|
||||
number = issue.get('number', 0)
|
||||
comments = issue.get('comments', 0)
|
||||
title = issue.get("title", "")
|
||||
number = issue.get("number", 0)
|
||||
comments = issue.get("comments", 0)
|
||||
|
||||
md += f"### {i}. {title}\n\n"
|
||||
md += f"**Issue**: #{number} (Closed)\n"
|
||||
@@ -1042,8 +1053,8 @@ GitHub issues related to this topic:
|
||||
md = "# Getting Started\n\n"
|
||||
md += "*Extracted from project README*\n\n"
|
||||
|
||||
if self.github_docs and self.github_docs.get('readme'):
|
||||
readme = self.github_docs['readme']
|
||||
if self.github_docs and self.github_docs.get("readme"):
|
||||
readme = self.github_docs["readme"]
|
||||
|
||||
# Clean and extract full quick start section (up to 2000 chars)
|
||||
cleaner = MarkdownCleaner()
|
||||
@@ -1069,16 +1080,16 @@ GitHub issues related to this topic:
|
||||
# 1. GitHub Issues Reference
|
||||
if self.github_issues:
|
||||
issues_md = self._generate_github_issues_reference()
|
||||
with open(references_dir / 'github_issues.md', 'w') as f:
|
||||
with open(references_dir / "github_issues.md", "w") as f:
|
||||
f.write(issues_md)
|
||||
|
||||
# 2. Getting Started Reference
|
||||
if self.github_docs and self.github_docs.get('readme'):
|
||||
if self.github_docs and self.github_docs.get("readme"):
|
||||
getting_started_md = self._generate_getting_started_reference()
|
||||
with open(references_dir / 'getting_started.md', 'w') as f:
|
||||
with open(references_dir / "getting_started.md", "w") as f:
|
||||
f.write(getting_started_md)
|
||||
|
||||
def generate(self, output_dir: Path = None) -> Tuple[Path, Path]:
|
||||
def generate(self, output_dir: Path = None) -> tuple[Path, Path]:
|
||||
"""Generate router skill and config with progressive disclosure"""
|
||||
if output_dir is None:
|
||||
output_dir = self.config_paths[0].parent
|
||||
@@ -1090,11 +1101,11 @@ GitHub issues related to this topic:
|
||||
skill_path = output_dir.parent / f"output/{self.router_name}/SKILL.md"
|
||||
skill_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
with open(skill_path, 'w') as f:
|
||||
with open(skill_path, "w") as f:
|
||||
f.write(skill_md)
|
||||
|
||||
# NEW: Create references/ directory and generate reference files
|
||||
references_dir = skill_path.parent / 'references'
|
||||
references_dir = skill_path.parent / "references"
|
||||
references_dir.mkdir(parents=True, exist_ok=True)
|
||||
self._generate_reference_files(references_dir)
|
||||
|
||||
@@ -1102,7 +1113,7 @@ GitHub issues related to this topic:
|
||||
router_config = self.create_router_config()
|
||||
config_path = output_dir / f"{self.router_name}.json"
|
||||
|
||||
with open(config_path, 'w') as f:
|
||||
with open(config_path, "w") as f:
|
||||
json.dump(router_config, f, indent=2)
|
||||
|
||||
return config_path, skill_path
|
||||
@@ -1125,24 +1136,14 @@ Examples:
|
||||
|
||||
# Custom output directory
|
||||
python3 generate_router.py configs/godot-*.json --output-dir configs/routers/
|
||||
"""
|
||||
""",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'configs',
|
||||
nargs='+',
|
||||
help='Sub-skill config files'
|
||||
)
|
||||
parser.add_argument("configs", nargs="+", help="Sub-skill config files")
|
||||
|
||||
parser.add_argument(
|
||||
'--name',
|
||||
help='Router skill name (default: inferred from sub-skills)'
|
||||
)
|
||||
parser.add_argument("--name", help="Router skill name (default: inferred from sub-skills)")
|
||||
|
||||
parser.add_argument(
|
||||
'--output-dir',
|
||||
help='Output directory (default: same as input configs)'
|
||||
)
|
||||
parser.add_argument("--output-dir", help="Output directory (default: same as input configs)")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
@@ -1150,16 +1151,16 @@ Examples:
|
||||
config_files = []
|
||||
for path_str in args.configs:
|
||||
path = Path(path_str)
|
||||
if path.exists() and not path.stem.endswith('-router'):
|
||||
if path.exists() and not path.stem.endswith("-router"):
|
||||
config_files.append(path_str)
|
||||
|
||||
if not config_files:
|
||||
print("❌ Error: No valid config files provided")
|
||||
sys.exit(1)
|
||||
|
||||
print(f"\n{'='*60}")
|
||||
print(f"\n{'=' * 60}")
|
||||
print("ROUTER SKILL GENERATOR")
|
||||
print(f"{'='*60}")
|
||||
print(f"{'=' * 60}")
|
||||
print(f"Sub-skills: {len(config_files)}")
|
||||
for cfg in config_files:
|
||||
print(f" - {Path(cfg).stem}")
|
||||
@@ -1172,11 +1173,11 @@ Examples:
|
||||
print(f"✅ Router config created: {config_path}")
|
||||
print(f"✅ Router SKILL.md created: {skill_path}")
|
||||
print("")
|
||||
print(f"{'='*60}")
|
||||
print(f"{'=' * 60}")
|
||||
print("NEXT STEPS")
|
||||
print(f"{'='*60}")
|
||||
print(f"{'=' * 60}")
|
||||
print(f"1. Review router SKILL.md: {skill_path}")
|
||||
print(f"2. Optionally scrape router (for overview pages):")
|
||||
print("2. Optionally scrape router (for overview pages):")
|
||||
print(f" skill-seekers scrape --config {config_path}")
|
||||
print("3. Package router skill:")
|
||||
print(f" skill-seekers package output/{generator.router_name}/")
|
||||
|
||||
@@ -12,43 +12,47 @@ This is the foundation of the unified codebase analyzer architecture.
|
||||
import os
|
||||
import subprocess
|
||||
import tempfile
|
||||
from collections import Counter
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import List, Dict, Optional, Tuple
|
||||
from collections import Counter
|
||||
|
||||
import requests
|
||||
|
||||
from .rate_limit_handler import RateLimitHandler, RateLimitError, create_github_headers
|
||||
from .config_manager import get_config_manager
|
||||
from .rate_limit_handler import RateLimitError, RateLimitHandler, create_github_headers
|
||||
|
||||
|
||||
@dataclass
|
||||
class CodeStream:
|
||||
"""Code files for C3.x analysis."""
|
||||
|
||||
directory: Path
|
||||
files: List[Path]
|
||||
files: list[Path]
|
||||
|
||||
|
||||
@dataclass
|
||||
class DocsStream:
|
||||
"""Documentation files from repository."""
|
||||
readme: Optional[str]
|
||||
contributing: Optional[str]
|
||||
docs_files: List[Dict] # [{"path": "docs/oauth.md", "content": "..."}]
|
||||
|
||||
readme: str | None
|
||||
contributing: str | None
|
||||
docs_files: list[dict] # [{"path": "docs/oauth.md", "content": "..."}]
|
||||
|
||||
|
||||
@dataclass
|
||||
class InsightsStream:
|
||||
"""GitHub metadata and issues."""
|
||||
metadata: Dict # stars, forks, language, etc.
|
||||
common_problems: List[Dict]
|
||||
known_solutions: List[Dict]
|
||||
top_labels: List[Dict]
|
||||
|
||||
metadata: dict # stars, forks, language, etc.
|
||||
common_problems: list[dict]
|
||||
known_solutions: list[dict]
|
||||
top_labels: list[dict]
|
||||
|
||||
|
||||
@dataclass
|
||||
class ThreeStreamData:
|
||||
"""Complete output from GitHub fetcher."""
|
||||
|
||||
code_stream: CodeStream
|
||||
docs_stream: DocsStream
|
||||
insights_stream: InsightsStream
|
||||
@@ -73,11 +77,7 @@ class GitHubThreeStreamFetcher:
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
repo_url: str,
|
||||
github_token: Optional[str] = None,
|
||||
interactive: bool = True,
|
||||
profile_name: Optional[str] = None
|
||||
self, repo_url: str, github_token: str | None = None, interactive: bool = True, profile_name: str | None = None
|
||||
):
|
||||
"""
|
||||
Initialize fetcher.
|
||||
@@ -89,7 +89,7 @@ class GitHubThreeStreamFetcher:
|
||||
profile_name: Name of the GitHub profile being used
|
||||
"""
|
||||
self.repo_url = repo_url
|
||||
self.github_token = github_token or os.getenv('GITHUB_TOKEN')
|
||||
self.github_token = github_token or os.getenv("GITHUB_TOKEN")
|
||||
self.owner, self.repo = self.parse_repo_url(repo_url)
|
||||
self.interactive = interactive
|
||||
|
||||
@@ -99,12 +99,10 @@ class GitHubThreeStreamFetcher:
|
||||
profile_name = config.get_profile_for_token(self.github_token)
|
||||
|
||||
self.rate_limiter = RateLimitHandler(
|
||||
token=self.github_token,
|
||||
interactive=interactive,
|
||||
profile_name=profile_name
|
||||
token=self.github_token, interactive=interactive, profile_name=profile_name
|
||||
)
|
||||
|
||||
def parse_repo_url(self, url: str) -> Tuple[str, str]:
|
||||
def parse_repo_url(self, url: str) -> tuple[str, str]:
|
||||
"""
|
||||
Parse GitHub URL to extract owner and repo.
|
||||
|
||||
@@ -115,18 +113,18 @@ class GitHubThreeStreamFetcher:
|
||||
Tuple of (owner, repo)
|
||||
"""
|
||||
# Remove .git suffix if present
|
||||
if url.endswith('.git'):
|
||||
if url.endswith(".git"):
|
||||
url = url[:-4] # Remove last 4 characters (.git)
|
||||
|
||||
# Handle git@ URLs (SSH format)
|
||||
if url.startswith('git@github.com:'):
|
||||
parts = url.replace('git@github.com:', '').split('/')
|
||||
if url.startswith("git@github.com:"):
|
||||
parts = url.replace("git@github.com:", "").split("/")
|
||||
if len(parts) >= 2:
|
||||
return parts[0], parts[1]
|
||||
|
||||
# Handle HTTPS URLs
|
||||
if 'github.com/' in url:
|
||||
parts = url.split('github.com/')[-1].split('/')
|
||||
if "github.com/" in url:
|
||||
parts = url.split("github.com/")[-1].split("/")
|
||||
if len(parts) >= 2:
|
||||
return parts[0], parts[1]
|
||||
|
||||
@@ -150,18 +148,18 @@ class GitHubThreeStreamFetcher:
|
||||
raise RateLimitError("Rate limit check failed during startup")
|
||||
|
||||
if output_dir is None:
|
||||
output_dir = Path(tempfile.mkdtemp(prefix='github_fetch_'))
|
||||
output_dir = Path(tempfile.mkdtemp(prefix="github_fetch_"))
|
||||
|
||||
print(f"📦 Cloning {self.repo_url}...")
|
||||
local_path = self.clone_repo(output_dir)
|
||||
|
||||
print(f"🔍 Fetching GitHub metadata...")
|
||||
print("🔍 Fetching GitHub metadata...")
|
||||
metadata = self.fetch_github_metadata()
|
||||
|
||||
print(f"🐛 Fetching issues...")
|
||||
print("🐛 Fetching issues...")
|
||||
issues = self.fetch_issues(max_issues=100)
|
||||
|
||||
print(f"📂 Classifying files...")
|
||||
print("📂 Classifying files...")
|
||||
code_files, doc_files = self.classify_files(local_path)
|
||||
print(f" - Code: {len(code_files)} files")
|
||||
print(f" - Docs: {len(doc_files)} files")
|
||||
@@ -171,25 +169,22 @@ class GitHubThreeStreamFetcher:
|
||||
|
||||
# Build three streams
|
||||
return ThreeStreamData(
|
||||
code_stream=CodeStream(
|
||||
directory=local_path,
|
||||
files=code_files
|
||||
),
|
||||
code_stream=CodeStream(directory=local_path, files=code_files),
|
||||
docs_stream=DocsStream(
|
||||
readme=self.read_file(local_path / 'README.md'),
|
||||
contributing=self.read_file(local_path / 'CONTRIBUTING.md'),
|
||||
readme=self.read_file(local_path / "README.md"),
|
||||
contributing=self.read_file(local_path / "CONTRIBUTING.md"),
|
||||
docs_files=[
|
||||
{'path': str(f.relative_to(local_path)), 'content': self.read_file(f)}
|
||||
{"path": str(f.relative_to(local_path)), "content": self.read_file(f)}
|
||||
for f in doc_files
|
||||
if f.name not in ['README.md', 'CONTRIBUTING.md']
|
||||
]
|
||||
if f.name not in ["README.md", "CONTRIBUTING.md"]
|
||||
],
|
||||
),
|
||||
insights_stream=InsightsStream(
|
||||
metadata=metadata,
|
||||
common_problems=issue_insights['common_problems'],
|
||||
known_solutions=issue_insights['known_solutions'],
|
||||
top_labels=issue_insights['top_labels']
|
||||
)
|
||||
common_problems=issue_insights["common_problems"],
|
||||
known_solutions=issue_insights["known_solutions"],
|
||||
top_labels=issue_insights["top_labels"],
|
||||
),
|
||||
)
|
||||
|
||||
def clone_repo(self, output_dir: Path) -> Path:
|
||||
@@ -206,7 +201,7 @@ class GitHubThreeStreamFetcher:
|
||||
repo_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Clone with depth 1 for speed
|
||||
cmd = ['git', 'clone', '--depth', '1', self.repo_url, str(repo_dir)]
|
||||
cmd = ["git", "clone", "--depth", "1", self.repo_url, str(repo_dir)]
|
||||
result = subprocess.run(cmd, capture_output=True, text=True)
|
||||
|
||||
if result.returncode != 0:
|
||||
@@ -214,7 +209,7 @@ class GitHubThreeStreamFetcher:
|
||||
|
||||
return repo_dir
|
||||
|
||||
def fetch_github_metadata(self) -> Dict:
|
||||
def fetch_github_metadata(self) -> dict:
|
||||
"""
|
||||
Fetch repo metadata via GitHub API.
|
||||
|
||||
@@ -238,35 +233,35 @@ class GitHubThreeStreamFetcher:
|
||||
data = response.json()
|
||||
|
||||
return {
|
||||
'stars': data.get('stargazers_count', 0),
|
||||
'forks': data.get('forks_count', 0),
|
||||
'open_issues': data.get('open_issues_count', 0),
|
||||
'language': data.get('language', 'Unknown'),
|
||||
'description': data.get('description', ''),
|
||||
'homepage': data.get('homepage', ''),
|
||||
'created_at': data.get('created_at', ''),
|
||||
'updated_at': data.get('updated_at', ''),
|
||||
'html_url': data.get('html_url', ''), # NEW: Repository URL
|
||||
'license': data.get('license', {}) # NEW: License info
|
||||
"stars": data.get("stargazers_count", 0),
|
||||
"forks": data.get("forks_count", 0),
|
||||
"open_issues": data.get("open_issues_count", 0),
|
||||
"language": data.get("language", "Unknown"),
|
||||
"description": data.get("description", ""),
|
||||
"homepage": data.get("homepage", ""),
|
||||
"created_at": data.get("created_at", ""),
|
||||
"updated_at": data.get("updated_at", ""),
|
||||
"html_url": data.get("html_url", ""), # NEW: Repository URL
|
||||
"license": data.get("license", {}), # NEW: License info
|
||||
}
|
||||
except RateLimitError:
|
||||
raise
|
||||
except Exception as e:
|
||||
print(f"⚠️ Failed to fetch metadata: {e}")
|
||||
return {
|
||||
'stars': 0,
|
||||
'forks': 0,
|
||||
'open_issues': 0,
|
||||
'language': 'Unknown',
|
||||
'description': '',
|
||||
'homepage': '',
|
||||
'created_at': '',
|
||||
'updated_at': '',
|
||||
'html_url': '', # NEW: Repository URL
|
||||
'license': {} # NEW: License info
|
||||
"stars": 0,
|
||||
"forks": 0,
|
||||
"open_issues": 0,
|
||||
"language": "Unknown",
|
||||
"description": "",
|
||||
"homepage": "",
|
||||
"created_at": "",
|
||||
"updated_at": "",
|
||||
"html_url": "", # NEW: Repository URL
|
||||
"license": {}, # NEW: License info
|
||||
}
|
||||
|
||||
def fetch_issues(self, max_issues: int = 100) -> List[Dict]:
|
||||
def fetch_issues(self, max_issues: int = 100) -> list[dict]:
|
||||
"""
|
||||
Fetch GitHub issues (open + closed).
|
||||
|
||||
@@ -279,14 +274,14 @@ class GitHubThreeStreamFetcher:
|
||||
all_issues = []
|
||||
|
||||
# Fetch open issues
|
||||
all_issues.extend(self._fetch_issues_page(state='open', max_count=max_issues // 2))
|
||||
all_issues.extend(self._fetch_issues_page(state="open", max_count=max_issues // 2))
|
||||
|
||||
# Fetch closed issues
|
||||
all_issues.extend(self._fetch_issues_page(state='closed', max_count=max_issues // 2))
|
||||
all_issues.extend(self._fetch_issues_page(state="closed", max_count=max_issues // 2))
|
||||
|
||||
return all_issues
|
||||
|
||||
def _fetch_issues_page(self, state: str, max_count: int) -> List[Dict]:
|
||||
def _fetch_issues_page(self, state: str, max_count: int) -> list[dict]:
|
||||
"""
|
||||
Fetch one page of issues.
|
||||
|
||||
@@ -304,10 +299,10 @@ class GitHubThreeStreamFetcher:
|
||||
headers = create_github_headers(self.github_token)
|
||||
|
||||
params = {
|
||||
'state': state,
|
||||
'per_page': min(max_count, 100), # GitHub API limit
|
||||
'sort': 'comments',
|
||||
'direction': 'desc'
|
||||
"state": state,
|
||||
"per_page": min(max_count, 100), # GitHub API limit
|
||||
"sort": "comments",
|
||||
"direction": "desc",
|
||||
}
|
||||
|
||||
try:
|
||||
@@ -321,7 +316,7 @@ class GitHubThreeStreamFetcher:
|
||||
issues = response.json()
|
||||
|
||||
# Filter out pull requests (they appear in issues endpoint)
|
||||
issues = [issue for issue in issues if 'pull_request' not in issue]
|
||||
issues = [issue for issue in issues if "pull_request" not in issue]
|
||||
|
||||
return issues
|
||||
except RateLimitError:
|
||||
@@ -330,7 +325,7 @@ class GitHubThreeStreamFetcher:
|
||||
print(f"⚠️ Failed to fetch {state} issues: {e}")
|
||||
return []
|
||||
|
||||
def classify_files(self, repo_path: Path) -> Tuple[List[Path], List[Path]]:
|
||||
def classify_files(self, repo_path: Path) -> tuple[list[Path], list[Path]]:
|
||||
"""
|
||||
Split files into code vs documentation.
|
||||
|
||||
@@ -354,36 +349,61 @@ class GitHubThreeStreamFetcher:
|
||||
|
||||
# Documentation patterns
|
||||
doc_patterns = [
|
||||
'**/README.md',
|
||||
'**/CONTRIBUTING.md',
|
||||
'**/CHANGELOG.md',
|
||||
'**/LICENSE.md',
|
||||
'docs/*.md', # Files directly in docs/
|
||||
'docs/**/*.md', # Files in subdirectories of docs/
|
||||
'doc/*.md', # Files directly in doc/
|
||||
'doc/**/*.md', # Files in subdirectories of doc/
|
||||
'documentation/*.md', # Files directly in documentation/
|
||||
'documentation/**/*.md', # Files in subdirectories of documentation/
|
||||
'**/*.rst',
|
||||
"**/README.md",
|
||||
"**/CONTRIBUTING.md",
|
||||
"**/CHANGELOG.md",
|
||||
"**/LICENSE.md",
|
||||
"docs/*.md", # Files directly in docs/
|
||||
"docs/**/*.md", # Files in subdirectories of docs/
|
||||
"doc/*.md", # Files directly in doc/
|
||||
"doc/**/*.md", # Files in subdirectories of doc/
|
||||
"documentation/*.md", # Files directly in documentation/
|
||||
"documentation/**/*.md", # Files in subdirectories of documentation/
|
||||
"**/*.rst",
|
||||
]
|
||||
|
||||
# Code extensions
|
||||
code_extensions = [
|
||||
'.py', '.js', '.ts', '.jsx', '.tsx',
|
||||
'.go', '.rs', '.java', '.kt',
|
||||
'.c', '.cpp', '.h', '.hpp',
|
||||
'.rb', '.php', '.swift', '.cs',
|
||||
'.scala', '.clj', '.cljs'
|
||||
".py",
|
||||
".js",
|
||||
".ts",
|
||||
".jsx",
|
||||
".tsx",
|
||||
".go",
|
||||
".rs",
|
||||
".java",
|
||||
".kt",
|
||||
".c",
|
||||
".cpp",
|
||||
".h",
|
||||
".hpp",
|
||||
".rb",
|
||||
".php",
|
||||
".swift",
|
||||
".cs",
|
||||
".scala",
|
||||
".clj",
|
||||
".cljs",
|
||||
]
|
||||
|
||||
# Directories to exclude
|
||||
exclude_dirs = [
|
||||
'node_modules', '__pycache__', 'venv', '.venv',
|
||||
'.git', 'build', 'dist', '.tox', '.pytest_cache',
|
||||
'htmlcov', '.mypy_cache', '.eggs', '*.egg-info'
|
||||
"node_modules",
|
||||
"__pycache__",
|
||||
"venv",
|
||||
".venv",
|
||||
".git",
|
||||
"build",
|
||||
"dist",
|
||||
".tox",
|
||||
".pytest_cache",
|
||||
"htmlcov",
|
||||
".mypy_cache",
|
||||
".eggs",
|
||||
"*.egg-info",
|
||||
]
|
||||
|
||||
for file_path in repo_path.rglob('*'):
|
||||
for file_path in repo_path.rglob("*"):
|
||||
if not file_path.is_file():
|
||||
continue
|
||||
|
||||
@@ -392,8 +412,8 @@ class GitHubThreeStreamFetcher:
|
||||
continue
|
||||
|
||||
# Skip hidden files (but allow docs in docs/ directories)
|
||||
is_in_docs_dir = any(pattern in str(file_path) for pattern in ['docs/', 'doc/', 'documentation/'])
|
||||
if any(part.startswith('.') for part in file_path.parts):
|
||||
is_in_docs_dir = any(pattern in str(file_path) for pattern in ["docs/", "doc/", "documentation/"])
|
||||
if any(part.startswith(".") for part in file_path.parts):
|
||||
if not is_in_docs_dir:
|
||||
continue
|
||||
|
||||
@@ -407,7 +427,7 @@ class GitHubThreeStreamFetcher:
|
||||
|
||||
return code_files, doc_files
|
||||
|
||||
def analyze_issues(self, issues: List[Dict]) -> Dict:
|
||||
def analyze_issues(self, issues: list[dict]) -> dict:
|
||||
"""
|
||||
Analyze GitHub issues to extract insights.
|
||||
|
||||
@@ -446,44 +466,41 @@ class GitHubThreeStreamFetcher:
|
||||
|
||||
for issue in issues:
|
||||
# Handle both string labels and dict labels (GitHub API format)
|
||||
raw_labels = issue.get('labels', [])
|
||||
raw_labels = issue.get("labels", [])
|
||||
labels = []
|
||||
for label in raw_labels:
|
||||
if isinstance(label, dict):
|
||||
labels.append(label.get('name', ''))
|
||||
labels.append(label.get("name", ""))
|
||||
else:
|
||||
labels.append(str(label))
|
||||
all_labels.extend(labels)
|
||||
|
||||
issue_data = {
|
||||
'title': issue.get('title', ''),
|
||||
'number': issue.get('number', 0),
|
||||
'labels': labels,
|
||||
'comments': issue.get('comments', 0),
|
||||
'state': issue.get('state', 'unknown')
|
||||
"title": issue.get("title", ""),
|
||||
"number": issue.get("number", 0),
|
||||
"labels": labels,
|
||||
"comments": issue.get("comments", 0),
|
||||
"state": issue.get("state", "unknown"),
|
||||
}
|
||||
|
||||
# Open issues with many comments = common problems
|
||||
if issue['state'] == 'open' and issue.get('comments', 0) >= 5:
|
||||
if issue["state"] == "open" and issue.get("comments", 0) >= 5:
|
||||
common_problems.append(issue_data)
|
||||
|
||||
# Closed issues with comments = known solutions
|
||||
elif issue['state'] == 'closed' and issue.get('comments', 0) > 0:
|
||||
elif issue["state"] == "closed" and issue.get("comments", 0) > 0:
|
||||
known_solutions.append(issue_data)
|
||||
|
||||
# Count label frequency
|
||||
label_counts = Counter(all_labels)
|
||||
|
||||
return {
|
||||
'common_problems': sorted(common_problems, key=lambda x: x['comments'], reverse=True)[:10],
|
||||
'known_solutions': sorted(known_solutions, key=lambda x: x['comments'], reverse=True)[:10],
|
||||
'top_labels': [
|
||||
{'label': label, 'count': count}
|
||||
for label, count in label_counts.most_common(10)
|
||||
]
|
||||
"common_problems": sorted(common_problems, key=lambda x: x["comments"], reverse=True)[:10],
|
||||
"known_solutions": sorted(known_solutions, key=lambda x: x["comments"], reverse=True)[:10],
|
||||
"top_labels": [{"label": label, "count": count} for label, count in label_counts.most_common(10)],
|
||||
}
|
||||
|
||||
def read_file(self, file_path: Path) -> Optional[str]:
|
||||
def read_file(self, file_path: Path) -> str | None:
|
||||
"""
|
||||
Read file content safely.
|
||||
|
||||
@@ -497,10 +514,10 @@ class GitHubThreeStreamFetcher:
|
||||
return None
|
||||
|
||||
try:
|
||||
return file_path.read_text(encoding='utf-8')
|
||||
return file_path.read_text(encoding="utf-8")
|
||||
except Exception:
|
||||
# Try with different encoding
|
||||
try:
|
||||
return file_path.read_text(encoding='latin-1')
|
||||
return file_path.read_text(encoding="latin-1")
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -20,7 +20,7 @@ import subprocess
|
||||
import tempfile
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional, TYPE_CHECKING
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
# Avoid circular imports by using TYPE_CHECKING
|
||||
if TYPE_CHECKING:
|
||||
@@ -40,15 +40,17 @@ else:
|
||||
@dataclass
|
||||
class TroubleshootingItem:
|
||||
problem: str
|
||||
symptoms: List[str] = field(default_factory=list)
|
||||
symptoms: list[str] = field(default_factory=list)
|
||||
solution: str = ""
|
||||
diagnostic_steps: List[str] = field(default_factory=list)
|
||||
diagnostic_steps: list[str] = field(default_factory=list)
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Conditional import for Anthropic API
|
||||
try:
|
||||
import anthropic
|
||||
|
||||
ANTHROPIC_AVAILABLE = True
|
||||
except ImportError:
|
||||
ANTHROPIC_AVAILABLE = False
|
||||
@@ -58,9 +60,10 @@ except ImportError:
|
||||
@dataclass
|
||||
class StepEnhancement:
|
||||
"""Enhanced step information (internal use only)"""
|
||||
|
||||
step_index: int
|
||||
explanation: str # Natural language explanation
|
||||
variations: List[str] = field(default_factory=list) # Alternative approaches
|
||||
variations: list[str] = field(default_factory=list) # Alternative approaches
|
||||
|
||||
|
||||
class GuideEnhancer:
|
||||
@@ -81,7 +84,7 @@ class GuideEnhancer:
|
||||
mode: Enhancement mode - "api", "local", or "auto"
|
||||
"""
|
||||
self.mode = self._detect_mode(mode)
|
||||
self.api_key = os.environ.get('ANTHROPIC_API_KEY')
|
||||
self.api_key = os.environ.get("ANTHROPIC_API_KEY")
|
||||
self.client = None
|
||||
|
||||
if self.mode == "api":
|
||||
@@ -119,7 +122,7 @@ class GuideEnhancer:
|
||||
"""
|
||||
if requested_mode == "auto":
|
||||
# Prefer API if key available, else LOCAL
|
||||
if os.environ.get('ANTHROPIC_API_KEY') and ANTHROPIC_AVAILABLE:
|
||||
if os.environ.get("ANTHROPIC_API_KEY") and ANTHROPIC_AVAILABLE:
|
||||
return "api"
|
||||
elif self._check_claude_cli():
|
||||
return "local"
|
||||
@@ -130,17 +133,12 @@ class GuideEnhancer:
|
||||
def _check_claude_cli(self) -> bool:
|
||||
"""Check if Claude Code CLI is available."""
|
||||
try:
|
||||
result = subprocess.run(
|
||||
['claude', '--version'],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=5
|
||||
)
|
||||
result = subprocess.run(["claude", "--version"], capture_output=True, text=True, timeout=5)
|
||||
return result.returncode == 0
|
||||
except (FileNotFoundError, subprocess.TimeoutExpired):
|
||||
return False
|
||||
|
||||
def enhance_guide(self, guide_data: Dict) -> Dict:
|
||||
def enhance_guide(self, guide_data: dict) -> dict:
|
||||
"""
|
||||
Apply all 5 enhancements to a guide.
|
||||
|
||||
@@ -164,7 +162,7 @@ class GuideEnhancer:
|
||||
logger.info("📝 Returning original guide without enhancement")
|
||||
return guide_data
|
||||
|
||||
def enhance_step_descriptions(self, steps: List[Dict]) -> List[StepEnhancement]:
|
||||
def enhance_step_descriptions(self, steps: list[dict]) -> list[StepEnhancement]:
|
||||
"""
|
||||
Enhancement 1: Add natural language explanations to steps.
|
||||
|
||||
@@ -187,17 +185,17 @@ class GuideEnhancer:
|
||||
data = json.loads(response)
|
||||
return [
|
||||
StepEnhancement(
|
||||
step_index=item.get('step_index', i),
|
||||
explanation=item.get('explanation', ''),
|
||||
variations=item.get('variations', [])
|
||||
step_index=item.get("step_index", i),
|
||||
explanation=item.get("explanation", ""),
|
||||
variations=item.get("variations", []),
|
||||
)
|
||||
for i, item in enumerate(data.get('step_descriptions', []))
|
||||
for i, item in enumerate(data.get("step_descriptions", []))
|
||||
]
|
||||
except (json.JSONDecodeError, KeyError) as e:
|
||||
logger.warning(f"⚠️ Failed to parse step descriptions: {e}")
|
||||
return []
|
||||
|
||||
def enhance_troubleshooting(self, guide_data: Dict) -> List[TroubleshootingItem]:
|
||||
def enhance_troubleshooting(self, guide_data: dict) -> list[TroubleshootingItem]:
|
||||
"""
|
||||
Enhancement 2: Generate diagnostic flows + solutions.
|
||||
|
||||
@@ -220,18 +218,18 @@ class GuideEnhancer:
|
||||
data = json.loads(response)
|
||||
return [
|
||||
TroubleshootingItem(
|
||||
problem=item.get('problem', ''),
|
||||
symptoms=item.get('symptoms', []),
|
||||
diagnostic_steps=item.get('diagnostic_steps', []),
|
||||
solution=item.get('solution', '')
|
||||
problem=item.get("problem", ""),
|
||||
symptoms=item.get("symptoms", []),
|
||||
diagnostic_steps=item.get("diagnostic_steps", []),
|
||||
solution=item.get("solution", ""),
|
||||
)
|
||||
for item in data.get('troubleshooting', [])
|
||||
for item in data.get("troubleshooting", [])
|
||||
]
|
||||
except (json.JSONDecodeError, KeyError) as e:
|
||||
logger.warning(f"⚠️ Failed to parse troubleshooting items: {e}")
|
||||
return []
|
||||
|
||||
def enhance_prerequisites(self, prereqs: List[str]) -> List[PrerequisiteItem]:
|
||||
def enhance_prerequisites(self, prereqs: list[str]) -> list[PrerequisiteItem]:
|
||||
"""
|
||||
Enhancement 3: Explain why prerequisites are needed.
|
||||
|
||||
@@ -253,18 +251,14 @@ class GuideEnhancer:
|
||||
try:
|
||||
data = json.loads(response)
|
||||
return [
|
||||
PrerequisiteItem(
|
||||
name=item.get('name', ''),
|
||||
why=item.get('why', ''),
|
||||
setup=item.get('setup', '')
|
||||
)
|
||||
for item in data.get('prerequisites_detailed', [])
|
||||
PrerequisiteItem(name=item.get("name", ""), why=item.get("why", ""), setup=item.get("setup", ""))
|
||||
for item in data.get("prerequisites_detailed", [])
|
||||
]
|
||||
except (json.JSONDecodeError, KeyError) as e:
|
||||
logger.warning(f"⚠️ Failed to parse prerequisites: {e}")
|
||||
return []
|
||||
|
||||
def enhance_next_steps(self, guide_data: Dict) -> List[str]:
|
||||
def enhance_next_steps(self, guide_data: dict) -> list[str]:
|
||||
"""
|
||||
Enhancement 4: Suggest related guides and variations.
|
||||
|
||||
@@ -285,12 +279,12 @@ class GuideEnhancer:
|
||||
|
||||
try:
|
||||
data = json.loads(response)
|
||||
return data.get('next_steps', [])
|
||||
return data.get("next_steps", [])
|
||||
except (json.JSONDecodeError, KeyError) as e:
|
||||
logger.warning(f"⚠️ Failed to parse next steps: {e}")
|
||||
return []
|
||||
|
||||
def enhance_use_cases(self, guide_data: Dict) -> List[str]:
|
||||
def enhance_use_cases(self, guide_data: dict) -> list[str]:
|
||||
"""
|
||||
Enhancement 5: Generate real-world scenario examples.
|
||||
|
||||
@@ -311,14 +305,14 @@ class GuideEnhancer:
|
||||
|
||||
try:
|
||||
data = json.loads(response)
|
||||
return data.get('use_cases', [])
|
||||
return data.get("use_cases", [])
|
||||
except (json.JSONDecodeError, KeyError) as e:
|
||||
logger.warning(f"⚠️ Failed to parse use cases: {e}")
|
||||
return []
|
||||
|
||||
# === AI Call Methods ===
|
||||
|
||||
def _call_ai(self, prompt: str, max_tokens: int = 4000) -> Optional[str]:
|
||||
def _call_ai(self, prompt: str, max_tokens: int = 4000) -> str | None:
|
||||
"""
|
||||
Call AI with the given prompt.
|
||||
|
||||
@@ -335,7 +329,7 @@ class GuideEnhancer:
|
||||
return self._call_claude_local(prompt)
|
||||
return None
|
||||
|
||||
def _call_claude_api(self, prompt: str, max_tokens: int = 4000) -> Optional[str]:
|
||||
def _call_claude_api(self, prompt: str, max_tokens: int = 4000) -> str | None:
|
||||
"""
|
||||
Call Claude API.
|
||||
|
||||
@@ -351,16 +345,14 @@ class GuideEnhancer:
|
||||
|
||||
try:
|
||||
response = self.client.messages.create(
|
||||
model="claude-sonnet-4-20250514",
|
||||
max_tokens=max_tokens,
|
||||
messages=[{"role": "user", "content": prompt}]
|
||||
model="claude-sonnet-4-20250514", max_tokens=max_tokens, messages=[{"role": "user", "content": prompt}]
|
||||
)
|
||||
return response.content[0].text
|
||||
except Exception as e:
|
||||
logger.warning(f"⚠️ Claude API call failed: {e}")
|
||||
return None
|
||||
|
||||
def _call_claude_local(self, prompt: str) -> Optional[str]:
|
||||
def _call_claude_local(self, prompt: str) -> str | None:
|
||||
"""
|
||||
Call Claude Code CLI.
|
||||
|
||||
@@ -372,16 +364,16 @@ class GuideEnhancer:
|
||||
"""
|
||||
try:
|
||||
# Create temporary prompt file
|
||||
with tempfile.NamedTemporaryFile(mode='w', suffix='.md', delete=False) as f:
|
||||
with tempfile.NamedTemporaryFile(mode="w", suffix=".md", delete=False) as f:
|
||||
f.write(prompt)
|
||||
prompt_file = f.name
|
||||
|
||||
# Run claude CLI
|
||||
result = subprocess.run(
|
||||
['claude', prompt_file],
|
||||
["claude", prompt_file],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=300 # 5 min timeout
|
||||
timeout=300, # 5 min timeout
|
||||
)
|
||||
|
||||
# Clean up prompt file
|
||||
@@ -399,7 +391,7 @@ class GuideEnhancer:
|
||||
|
||||
# === Prompt Creation Methods ===
|
||||
|
||||
def _enhance_via_api(self, guide_data: Dict) -> Dict:
|
||||
def _enhance_via_api(self, guide_data: dict) -> dict:
|
||||
"""
|
||||
Enhance guide via API mode.
|
||||
|
||||
@@ -417,7 +409,7 @@ class GuideEnhancer:
|
||||
|
||||
return self._parse_enhancement_response(response, guide_data)
|
||||
|
||||
def _enhance_via_local(self, guide_data: Dict) -> Dict:
|
||||
def _enhance_via_local(self, guide_data: dict) -> dict:
|
||||
"""
|
||||
Enhance guide via LOCAL mode.
|
||||
|
||||
@@ -435,7 +427,7 @@ class GuideEnhancer:
|
||||
|
||||
return self._parse_enhancement_response(response, guide_data)
|
||||
|
||||
def _create_enhancement_prompt(self, guide_data: Dict) -> str:
|
||||
def _create_enhancement_prompt(self, guide_data: dict) -> str:
|
||||
"""
|
||||
Create comprehensive enhancement prompt for all 5 enhancements.
|
||||
|
||||
@@ -445,13 +437,13 @@ class GuideEnhancer:
|
||||
Returns:
|
||||
Complete prompt text
|
||||
"""
|
||||
title = guide_data.get('title', 'Unknown Guide')
|
||||
steps = guide_data.get('steps', [])
|
||||
language = guide_data.get('language', 'python')
|
||||
prerequisites = guide_data.get('prerequisites', [])
|
||||
title = guide_data.get("title", "Unknown Guide")
|
||||
steps = guide_data.get("steps", [])
|
||||
language = guide_data.get("language", "python")
|
||||
prerequisites = guide_data.get("prerequisites", [])
|
||||
|
||||
steps_text = self._format_steps_for_prompt(steps)
|
||||
prereqs_text = ', '.join(prerequisites) if prerequisites else 'None specified'
|
||||
prereqs_text = ", ".join(prerequisites) if prerequisites else "None specified"
|
||||
|
||||
prompt = f"""I need you to enhance this how-to guide with 5 improvements:
|
||||
|
||||
@@ -528,7 +520,7 @@ IMPORTANT: Return ONLY valid JSON, no markdown code blocks or extra text.
|
||||
"""
|
||||
return prompt
|
||||
|
||||
def _create_step_description_prompt(self, steps: List[Dict]) -> str:
|
||||
def _create_step_description_prompt(self, steps: list[dict]) -> str:
|
||||
"""Create prompt for step descriptions only."""
|
||||
steps_text = self._format_steps_for_prompt(steps)
|
||||
return f"""Generate natural language explanations for these code steps:
|
||||
@@ -546,11 +538,11 @@ Return JSON:
|
||||
IMPORTANT: Return ONLY valid JSON.
|
||||
"""
|
||||
|
||||
def _create_troubleshooting_prompt(self, guide_data: Dict) -> str:
|
||||
def _create_troubleshooting_prompt(self, guide_data: dict) -> str:
|
||||
"""Create prompt for troubleshooting items."""
|
||||
title = guide_data.get('title', 'Unknown')
|
||||
language = guide_data.get('language', 'python')
|
||||
steps = guide_data.get('steps', [])
|
||||
title = guide_data.get("title", "Unknown")
|
||||
language = guide_data.get("language", "python")
|
||||
steps = guide_data.get("steps", [])
|
||||
steps_text = self._format_steps_for_prompt(steps)
|
||||
|
||||
return f"""Generate troubleshooting guidance for this {language} workflow:
|
||||
@@ -575,9 +567,9 @@ Return JSON with 3-5 common errors:
|
||||
IMPORTANT: Return ONLY valid JSON.
|
||||
"""
|
||||
|
||||
def _create_prerequisites_prompt(self, prereqs: List[str]) -> str:
|
||||
def _create_prerequisites_prompt(self, prereqs: list[str]) -> str:
|
||||
"""Create prompt for prerequisites enhancement."""
|
||||
prereqs_text = ', '.join(prereqs)
|
||||
prereqs_text = ", ".join(prereqs)
|
||||
return f"""Explain why these prerequisites are needed and how to install them:
|
||||
|
||||
Prerequisites: {prereqs_text}
|
||||
@@ -593,9 +585,9 @@ Return JSON:
|
||||
IMPORTANT: Return ONLY valid JSON.
|
||||
"""
|
||||
|
||||
def _create_next_steps_prompt(self, guide_data: Dict) -> str:
|
||||
def _create_next_steps_prompt(self, guide_data: dict) -> str:
|
||||
"""Create prompt for next steps suggestions."""
|
||||
title = guide_data.get('title', 'Unknown')
|
||||
title = guide_data.get("title", "Unknown")
|
||||
return f"""Suggest 3-5 related guides and learning paths after completing: {title}
|
||||
|
||||
Return JSON:
|
||||
@@ -610,10 +602,10 @@ Return JSON:
|
||||
IMPORTANT: Return ONLY valid JSON.
|
||||
"""
|
||||
|
||||
def _create_use_cases_prompt(self, guide_data: Dict) -> str:
|
||||
def _create_use_cases_prompt(self, guide_data: dict) -> str:
|
||||
"""Create prompt for use case examples."""
|
||||
title = guide_data.get('title', 'Unknown')
|
||||
description = guide_data.get('description', '')
|
||||
title = guide_data.get("title", "Unknown")
|
||||
description = guide_data.get("description", "")
|
||||
|
||||
return f"""Generate 2-3 real-world use cases for this guide:
|
||||
|
||||
@@ -632,23 +624,23 @@ Return JSON:
|
||||
IMPORTANT: Return ONLY valid JSON.
|
||||
"""
|
||||
|
||||
def _format_steps_for_prompt(self, steps: List[Dict]) -> str:
|
||||
def _format_steps_for_prompt(self, steps: list[dict]) -> str:
|
||||
"""Format steps for inclusion in prompts."""
|
||||
if not steps:
|
||||
return "No steps provided"
|
||||
|
||||
formatted = []
|
||||
for i, step in enumerate(steps):
|
||||
desc = step.get('description', '')
|
||||
code = step.get('code', '')
|
||||
desc = step.get("description", "")
|
||||
code = step.get("code", "")
|
||||
if code:
|
||||
formatted.append(f"Step {i+1}: {desc}\n```\n{code}\n```")
|
||||
formatted.append(f"Step {i + 1}: {desc}\n```\n{code}\n```")
|
||||
else:
|
||||
formatted.append(f"Step {i+1}: {desc}")
|
||||
formatted.append(f"Step {i + 1}: {desc}")
|
||||
|
||||
return "\n\n".join(formatted)
|
||||
|
||||
def _parse_enhancement_response(self, response: str, guide_data: Dict) -> Dict:
|
||||
def _parse_enhancement_response(self, response: str, guide_data: dict) -> dict:
|
||||
"""
|
||||
Parse AI enhancement response.
|
||||
|
||||
@@ -661,8 +653,8 @@ IMPORTANT: Return ONLY valid JSON.
|
||||
"""
|
||||
try:
|
||||
# Try to extract JSON from response (in case there's extra text)
|
||||
json_start = response.find('{')
|
||||
json_end = response.rfind('}') + 1
|
||||
json_start = response.find("{")
|
||||
json_end = response.rfind("}") + 1
|
||||
if json_start >= 0 and json_end > json_start:
|
||||
json_text = response[json_start:json_end]
|
||||
data = json.loads(json_text)
|
||||
@@ -673,46 +665,42 @@ IMPORTANT: Return ONLY valid JSON.
|
||||
enhanced = guide_data.copy()
|
||||
|
||||
# Step descriptions
|
||||
if 'step_descriptions' in data:
|
||||
enhanced['step_enhancements'] = [
|
||||
if "step_descriptions" in data:
|
||||
enhanced["step_enhancements"] = [
|
||||
StepEnhancement(
|
||||
step_index=item.get('step_index', i),
|
||||
explanation=item.get('explanation', ''),
|
||||
variations=item.get('variations', [])
|
||||
step_index=item.get("step_index", i),
|
||||
explanation=item.get("explanation", ""),
|
||||
variations=item.get("variations", []),
|
||||
)
|
||||
for i, item in enumerate(data['step_descriptions'])
|
||||
for i, item in enumerate(data["step_descriptions"])
|
||||
]
|
||||
|
||||
# Troubleshooting
|
||||
if 'troubleshooting' in data:
|
||||
enhanced['troubleshooting_detailed'] = [
|
||||
if "troubleshooting" in data:
|
||||
enhanced["troubleshooting_detailed"] = [
|
||||
TroubleshootingItem(
|
||||
problem=item.get('problem', ''),
|
||||
symptoms=item.get('symptoms', []),
|
||||
diagnostic_steps=item.get('diagnostic_steps', []),
|
||||
solution=item.get('solution', '')
|
||||
problem=item.get("problem", ""),
|
||||
symptoms=item.get("symptoms", []),
|
||||
diagnostic_steps=item.get("diagnostic_steps", []),
|
||||
solution=item.get("solution", ""),
|
||||
)
|
||||
for item in data['troubleshooting']
|
||||
for item in data["troubleshooting"]
|
||||
]
|
||||
|
||||
# Prerequisites
|
||||
if 'prerequisites_detailed' in data:
|
||||
enhanced['prerequisites_detailed'] = [
|
||||
PrerequisiteItem(
|
||||
name=item.get('name', ''),
|
||||
why=item.get('why', ''),
|
||||
setup=item.get('setup', '')
|
||||
)
|
||||
for item in data['prerequisites_detailed']
|
||||
if "prerequisites_detailed" in data:
|
||||
enhanced["prerequisites_detailed"] = [
|
||||
PrerequisiteItem(name=item.get("name", ""), why=item.get("why", ""), setup=item.get("setup", ""))
|
||||
for item in data["prerequisites_detailed"]
|
||||
]
|
||||
|
||||
# Next steps
|
||||
if 'next_steps' in data:
|
||||
enhanced['next_steps_detailed'] = data['next_steps']
|
||||
if "next_steps" in data:
|
||||
enhanced["next_steps_detailed"] = data["next_steps"]
|
||||
|
||||
# Use cases
|
||||
if 'use_cases' in data:
|
||||
enhanced['use_cases'] = data['use_cases']
|
||||
if "use_cases" in data:
|
||||
enhanced["use_cases"] = data["use_cases"]
|
||||
|
||||
logger.info("✅ Successfully enhanced guide with all 5 improvements")
|
||||
return enhanced
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -26,30 +26,28 @@ Examples:
|
||||
import argparse
|
||||
import shutil
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Dict, Optional, Tuple, Union
|
||||
from difflib import get_close_matches
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
# Agent installation paths
|
||||
# Global paths (install to home directory): Use ~/.{agent}/skills/
|
||||
# Project paths (install to current directory): Use .{agent}/skills/
|
||||
AGENT_PATHS = {
|
||||
'claude': '~/.claude/skills/', # Global (home)
|
||||
'cursor': '.cursor/skills/', # Project-relative
|
||||
'vscode': '.github/skills/', # Project-relative
|
||||
'copilot': '.github/skills/', # Same as VSCode
|
||||
'amp': '~/.amp/skills/', # Global
|
||||
'goose': '~/.config/goose/skills/', # Global
|
||||
'opencode': '~/.opencode/skills/', # Global
|
||||
'letta': '~/.letta/skills/', # Global
|
||||
'aide': '~/.aide/skills/', # Global
|
||||
'windsurf': '~/.windsurf/skills/', # Global
|
||||
'neovate': '~/.neovate/skills/', # Global
|
||||
"claude": "~/.claude/skills/", # Global (home)
|
||||
"cursor": ".cursor/skills/", # Project-relative
|
||||
"vscode": ".github/skills/", # Project-relative
|
||||
"copilot": ".github/skills/", # Same as VSCode
|
||||
"amp": "~/.amp/skills/", # Global
|
||||
"goose": "~/.config/goose/skills/", # Global
|
||||
"opencode": "~/.opencode/skills/", # Global
|
||||
"letta": "~/.letta/skills/", # Global
|
||||
"aide": "~/.aide/skills/", # Global
|
||||
"windsurf": "~/.windsurf/skills/", # Global
|
||||
"neovate": "~/.neovate/skills/", # Global
|
||||
}
|
||||
|
||||
|
||||
def get_agent_path(agent_name: str, project_root: Optional[Path] = None) -> Path:
|
||||
def get_agent_path(agent_name: str, project_root: Path | None = None) -> Path:
|
||||
"""
|
||||
Resolve the installation path for a given agent.
|
||||
|
||||
@@ -75,7 +73,7 @@ def get_agent_path(agent_name: str, project_root: Optional[Path] = None) -> Path
|
||||
path_template = AGENT_PATHS[agent_name]
|
||||
|
||||
# Handle home directory expansion (~)
|
||||
if path_template.startswith('~'):
|
||||
if path_template.startswith("~"):
|
||||
return Path(path_template).expanduser()
|
||||
|
||||
# Handle project-relative paths
|
||||
@@ -95,7 +93,7 @@ def get_available_agents() -> list:
|
||||
return sorted(AGENT_PATHS.keys())
|
||||
|
||||
|
||||
def validate_agent_name(agent_name: str) -> Tuple[bool, Optional[str]]:
|
||||
def validate_agent_name(agent_name: str) -> tuple[bool, str | None]:
|
||||
"""
|
||||
Validate an agent name and provide suggestions if invalid.
|
||||
|
||||
@@ -111,7 +109,7 @@ def validate_agent_name(agent_name: str) -> Tuple[bool, Optional[str]]:
|
||||
- error_message: None if valid, error message with suggestions if invalid
|
||||
"""
|
||||
# Special case: 'all' is valid for installing to all agents
|
||||
if agent_name.lower() == 'all':
|
||||
if agent_name.lower() == "all":
|
||||
return True, None
|
||||
|
||||
# Case-insensitive check
|
||||
@@ -130,13 +128,13 @@ def validate_agent_name(agent_name: str) -> Tuple[bool, Optional[str]]:
|
||||
error_msg += f"Did you mean: {suggestions[0]}?\n\n"
|
||||
|
||||
error_msg += "Available agents:\n "
|
||||
error_msg += ", ".join(available + ['all'])
|
||||
error_msg += ", ".join(available + ["all"])
|
||||
error_msg += f"\n\nUsage:\n skill-seekers install-agent <skill_directory> --agent {suggestions[0] if suggestions else 'claude'}"
|
||||
|
||||
return False, error_msg
|
||||
|
||||
|
||||
def validate_skill_directory(skill_dir: Path) -> Tuple[bool, Optional[str]]:
|
||||
def validate_skill_directory(skill_dir: Path) -> tuple[bool, str | None]:
|
||||
"""
|
||||
Validate that a directory is a valid skill directory.
|
||||
|
||||
@@ -165,11 +163,8 @@ def validate_skill_directory(skill_dir: Path) -> Tuple[bool, Optional[str]]:
|
||||
|
||||
|
||||
def install_to_agent(
|
||||
skill_dir: Union[str, Path],
|
||||
agent_name: str,
|
||||
force: bool = False,
|
||||
dry_run: bool = False
|
||||
) -> Tuple[bool, str]:
|
||||
skill_dir: str | Path, agent_name: str, force: bool = False, dry_run: bool = False
|
||||
) -> tuple[bool, str]:
|
||||
"""
|
||||
Install a skill to a specific agent's directory.
|
||||
|
||||
@@ -212,7 +207,7 @@ def install_to_agent(
|
||||
|
||||
# Check if already exists
|
||||
if target_path.exists() and not force:
|
||||
error_msg = f"❌ Skill already installed\n\n"
|
||||
error_msg = "❌ Skill already installed\n\n"
|
||||
error_msg += f"Location: {target_path}\n\n"
|
||||
error_msg += "Options:\n"
|
||||
error_msg += f" 1. Overwrite: skill-seekers install-agent {skill_dir} --agent {agent_name} --force\n"
|
||||
@@ -222,34 +217,34 @@ def install_to_agent(
|
||||
|
||||
# Dry run mode - just preview
|
||||
if dry_run:
|
||||
msg = f"🔍 DRY RUN - No changes will be made\n\n"
|
||||
msg = "🔍 DRY RUN - No changes will be made\n\n"
|
||||
msg += f"Would install skill: {skill_name}\n"
|
||||
msg += f" Source: {skill_dir}\n"
|
||||
msg += f" Target: {target_path}\n\n"
|
||||
|
||||
# Calculate total size
|
||||
total_size = sum(f.stat().st_size for f in skill_dir.rglob('*') if f.is_file())
|
||||
total_size = sum(f.stat().st_size for f in skill_dir.rglob("*") if f.is_file())
|
||||
|
||||
msg += f"Files to copy:\n"
|
||||
msg += "Files to copy:\n"
|
||||
msg += f" SKILL.md ({(skill_dir / 'SKILL.md').stat().st_size / 1024:.1f} KB)\n"
|
||||
|
||||
references_dir = skill_dir / 'references'
|
||||
references_dir = skill_dir / "references"
|
||||
if references_dir.exists():
|
||||
ref_files = list(references_dir.rglob('*.md'))
|
||||
ref_files = list(references_dir.rglob("*.md"))
|
||||
ref_size = sum(f.stat().st_size for f in ref_files)
|
||||
msg += f" references/ ({len(ref_files)} files, {ref_size / 1024:.1f} KB)\n"
|
||||
|
||||
for subdir in ['scripts', 'assets']:
|
||||
for subdir in ["scripts", "assets"]:
|
||||
subdir_path = skill_dir / subdir
|
||||
if subdir_path.exists():
|
||||
files = list(subdir_path.rglob('*'))
|
||||
files = list(subdir_path.rglob("*"))
|
||||
if files:
|
||||
msg += f" {subdir}/ ({len(files)} files)\n"
|
||||
else:
|
||||
msg += f" {subdir}/ (empty)\n"
|
||||
|
||||
msg += f"\nTotal size: {total_size / 1024:.1f} KB\n\n"
|
||||
msg += f"To actually install, run:\n"
|
||||
msg += "To actually install, run:\n"
|
||||
msg += f" skill-seekers install-agent {skill_dir} --agent {agent_name}"
|
||||
|
||||
return True, msg
|
||||
@@ -258,7 +253,10 @@ def install_to_agent(
|
||||
try:
|
||||
agent_base_path.mkdir(parents=True, exist_ok=True)
|
||||
except PermissionError:
|
||||
return False, f"❌ Permission denied: {agent_base_path}\n\nTry: sudo mkdir -p {agent_base_path} && sudo chown -R $USER {agent_base_path}"
|
||||
return (
|
||||
False,
|
||||
f"❌ Permission denied: {agent_base_path}\n\nTry: sudo mkdir -p {agent_base_path} && sudo chown -R $USER {agent_base_path}",
|
||||
)
|
||||
|
||||
# Copy skill directory
|
||||
def ignore_files(directory, files):
|
||||
@@ -266,16 +264,13 @@ def install_to_agent(
|
||||
ignored = []
|
||||
for f in files:
|
||||
# Exclude backup files
|
||||
if f.endswith('.backup'):
|
||||
ignored.append(f)
|
||||
# Exclude Python cache
|
||||
elif f == '__pycache__':
|
||||
ignored.append(f)
|
||||
# Exclude macOS metadata
|
||||
elif f == '.DS_Store':
|
||||
ignored.append(f)
|
||||
# Exclude hidden files (except .github for vscode)
|
||||
elif f.startswith('.') and f not in ['.github', '.cursor']:
|
||||
if (
|
||||
f.endswith(".backup")
|
||||
or f == "__pycache__"
|
||||
or f == ".DS_Store"
|
||||
or f.startswith(".")
|
||||
and f not in [".github", ".cursor"]
|
||||
):
|
||||
ignored.append(f)
|
||||
return ignored
|
||||
|
||||
@@ -288,16 +283,16 @@ def install_to_agent(
|
||||
shutil.copytree(skill_dir, target_path, ignore=ignore_files)
|
||||
|
||||
# Success message
|
||||
msg = f"✅ Installation complete!\n\n"
|
||||
msg = "✅ Installation complete!\n\n"
|
||||
msg += f"Skill '{skill_name}' installed to {agent_name}\n"
|
||||
msg += f"Location: {target_path}\n\n"
|
||||
|
||||
# Agent-specific restart instructions
|
||||
if agent_name.lower() == 'claude':
|
||||
if agent_name.lower() == "claude":
|
||||
msg += "Restart Claude Code to load the new skill."
|
||||
elif agent_name.lower() == 'cursor':
|
||||
elif agent_name.lower() == "cursor":
|
||||
msg += "Restart Cursor to load the new skill."
|
||||
elif agent_name.lower() in ['vscode', 'copilot']:
|
||||
elif agent_name.lower() in ["vscode", "copilot"]:
|
||||
msg += "Restart VS Code to load the new skill."
|
||||
else:
|
||||
msg += f"Restart {agent_name.capitalize()} to load the new skill."
|
||||
@@ -305,16 +300,17 @@ def install_to_agent(
|
||||
return True, msg
|
||||
|
||||
except PermissionError as e:
|
||||
return False, f"❌ Permission denied: {e}\n\nTry: sudo mkdir -p {agent_base_path} && sudo chown -R $USER {agent_base_path}"
|
||||
return (
|
||||
False,
|
||||
f"❌ Permission denied: {e}\n\nTry: sudo mkdir -p {agent_base_path} && sudo chown -R $USER {agent_base_path}",
|
||||
)
|
||||
except Exception as e:
|
||||
return False, f"❌ Installation failed: {e}"
|
||||
|
||||
|
||||
def install_to_all_agents(
|
||||
skill_dir: Union[str, Path],
|
||||
force: bool = False,
|
||||
dry_run: bool = False
|
||||
) -> Dict[str, Tuple[bool, str]]:
|
||||
skill_dir: str | Path, force: bool = False, dry_run: bool = False
|
||||
) -> dict[str, tuple[bool, str]]:
|
||||
"""
|
||||
Install a skill to all available agents.
|
||||
|
||||
@@ -365,31 +361,16 @@ Examples:
|
||||
|
||||
Supported agents:
|
||||
claude, cursor, vscode, copilot, amp, goose, opencode, letta, aide, windsurf, neovate, all
|
||||
"""
|
||||
""",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"skill_directory",
|
||||
help="Path to skill directory (e.g., output/react/)"
|
||||
)
|
||||
parser.add_argument("skill_directory", help="Path to skill directory (e.g., output/react/)")
|
||||
|
||||
parser.add_argument(
|
||||
"--agent",
|
||||
required=True,
|
||||
help="Agent name (use 'all' to install to all agents)"
|
||||
)
|
||||
parser.add_argument("--agent", required=True, help="Agent name (use 'all' to install to all agents)")
|
||||
|
||||
parser.add_argument(
|
||||
"--force",
|
||||
action="store_true",
|
||||
help="Overwrite existing installation without asking"
|
||||
)
|
||||
parser.add_argument("--force", action="store_true", help="Overwrite existing installation without asking")
|
||||
|
||||
parser.add_argument(
|
||||
"--dry-run",
|
||||
action="store_true",
|
||||
help="Preview installation without making changes"
|
||||
)
|
||||
parser.add_argument("--dry-run", action="store_true", help="Preview installation without making changes")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
@@ -398,7 +379,7 @@ Supported agents:
|
||||
skill_name = skill_dir.name
|
||||
|
||||
# Handle 'all' agent
|
||||
if args.agent.lower() == 'all':
|
||||
if args.agent.lower() == "all":
|
||||
print(f"\n📋 Installing skill to all agents: {skill_name}\n")
|
||||
|
||||
if args.dry_run:
|
||||
@@ -433,7 +414,7 @@ Supported agents:
|
||||
skipped_count += 1
|
||||
|
||||
# Summary
|
||||
print(f"\n📊 Summary:")
|
||||
print("\n📊 Summary:")
|
||||
if args.dry_run:
|
||||
print(f" Would install: {installed_count} agents")
|
||||
else:
|
||||
|
||||
@@ -26,8 +26,8 @@ Examples:
|
||||
skill-seekers install --config react --dry-run
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import argparse
|
||||
import asyncio
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
@@ -78,51 +78,35 @@ Phases:
|
||||
3. AI Enhancement (MANDATORY - no skip option)
|
||||
4. Package for target platform (ZIP or tar.gz)
|
||||
5. Upload to target platform (optional)
|
||||
"""
|
||||
""",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--config",
|
||||
required=True,
|
||||
help="Config name (e.g., 'react') or path (e.g., 'configs/custom.json')"
|
||||
"--config", required=True, help="Config name (e.g., 'react') or path (e.g., 'configs/custom.json')"
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--destination",
|
||||
default="output",
|
||||
help="Output directory for skill files (default: output/)"
|
||||
)
|
||||
parser.add_argument("--destination", default="output", help="Output directory for skill files (default: output/)")
|
||||
|
||||
parser.add_argument("--no-upload", action="store_true", help="Skip automatic upload to Claude")
|
||||
|
||||
parser.add_argument(
|
||||
"--no-upload",
|
||||
action="store_true",
|
||||
help="Skip automatic upload to Claude"
|
||||
"--unlimited", action="store_true", help="Remove page limits during scraping (WARNING: Can take hours)"
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--unlimited",
|
||||
action="store_true",
|
||||
help="Remove page limits during scraping (WARNING: Can take hours)"
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--dry-run",
|
||||
action="store_true",
|
||||
help="Preview workflow without executing"
|
||||
)
|
||||
parser.add_argument("--dry-run", action="store_true", help="Preview workflow without executing")
|
||||
|
||||
parser.add_argument(
|
||||
"--target",
|
||||
choices=['claude', 'gemini', 'openai', 'markdown'],
|
||||
default='claude',
|
||||
help="Target LLM platform (default: claude)"
|
||||
choices=["claude", "gemini", "openai", "markdown"],
|
||||
default="claude",
|
||||
help="Target LLM platform (default: claude)",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
# Determine if config is a name or path
|
||||
config_arg = args.config
|
||||
if config_arg.endswith('.json') or '/' in config_arg or '\\' in config_arg:
|
||||
if config_arg.endswith(".json") or "/" in config_arg or "\\" in config_arg:
|
||||
# It's a path
|
||||
config_path = config_arg
|
||||
config_name = None
|
||||
@@ -139,7 +123,7 @@ Phases:
|
||||
"auto_upload": not args.no_upload,
|
||||
"unlimited": args.unlimited,
|
||||
"dry_run": args.dry_run,
|
||||
"target": args.target
|
||||
"target": args.target,
|
||||
}
|
||||
|
||||
# Run async tool
|
||||
|
||||
@@ -8,9 +8,8 @@ Supports 20+ programming languages with weighted pattern matching.
|
||||
Author: Skill Seekers Project
|
||||
"""
|
||||
|
||||
import re
|
||||
import logging
|
||||
from typing import Optional, Tuple, Dict, List
|
||||
import re
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -18,19 +17,11 @@ logger = logging.getLogger(__name__)
|
||||
try:
|
||||
from skill_seekers.cli.swift_patterns import SWIFT_PATTERNS
|
||||
except ImportError as e:
|
||||
logger.warning(
|
||||
"Swift language detection patterns unavailable. "
|
||||
"Swift code detection will be disabled. Error: %s",
|
||||
e
|
||||
)
|
||||
SWIFT_PATTERNS: Dict[str, List[Tuple[str, int]]] = {}
|
||||
logger.warning("Swift language detection patterns unavailable. Swift code detection will be disabled. Error: %s", e)
|
||||
SWIFT_PATTERNS: dict[str, list[tuple[str, int]]] = {}
|
||||
except Exception as e:
|
||||
logger.error(
|
||||
"Failed to load Swift patterns due to unexpected error: %s. "
|
||||
"Swift detection disabled.",
|
||||
e
|
||||
)
|
||||
SWIFT_PATTERNS: Dict[str, List[Tuple[str, int]]] = {}
|
||||
logger.error("Failed to load Swift patterns due to unexpected error: %s. Swift detection disabled.", e)
|
||||
SWIFT_PATTERNS: dict[str, list[tuple[str, int]]] = {}
|
||||
|
||||
# Verify Swift patterns were loaded correctly
|
||||
if not SWIFT_PATTERNS:
|
||||
@@ -38,15 +29,13 @@ if not SWIFT_PATTERNS:
|
||||
"Swift pattern dictionary is empty. Swift detection is disabled. "
|
||||
"This may indicate swift_patterns.py has no patterns defined."
|
||||
)
|
||||
elif 'swift' not in SWIFT_PATTERNS:
|
||||
elif "swift" not in SWIFT_PATTERNS:
|
||||
logger.error(
|
||||
"Swift patterns loaded but 'swift' key is missing. "
|
||||
"Swift detection is broken. Please file a bug report."
|
||||
"Swift patterns loaded but 'swift' key is missing. Swift detection is broken. Please file a bug report."
|
||||
)
|
||||
else:
|
||||
logger.info(
|
||||
"Swift patterns loaded successfully: %d patterns for language detection",
|
||||
len(SWIFT_PATTERNS.get('swift', []))
|
||||
"Swift patterns loaded successfully: %d patterns for language detection", len(SWIFT_PATTERNS.get("swift", []))
|
||||
)
|
||||
|
||||
# Comprehensive language patterns with weighted confidence scoring
|
||||
@@ -56,355 +45,325 @@ else:
|
||||
# Weight 2: Moderate indicators
|
||||
# Weight 1: Weak indicators
|
||||
|
||||
LANGUAGE_PATTERNS: Dict[str, List[Tuple[str, int]]] = {
|
||||
LANGUAGE_PATTERNS: dict[str, list[tuple[str, int]]] = {
|
||||
# ===== PRIORITY 1: Unity C# (Critical - User's Primary Issue) =====
|
||||
'csharp': [
|
||||
"csharp": [
|
||||
# Unity-specific patterns (weight 4-5, CRITICAL)
|
||||
(r'\busing\s+UnityEngine', 5),
|
||||
(r'\bMonoBehaviour\b', 5),
|
||||
(r'\bGameObject\b', 4),
|
||||
(r'\bTransform\b', 4),
|
||||
(r'\bVector[23]\b', 3),
|
||||
(r'\bQuaternion\b', 3),
|
||||
(r'\bvoid\s+Start\s*\(\)', 4),
|
||||
(r'\bvoid\s+Update\s*\(\)', 4),
|
||||
(r'\bvoid\s+Awake\s*\(\)', 4),
|
||||
(r'\bvoid\s+OnEnable\s*\(\)', 3),
|
||||
(r'\bvoid\s+OnDisable\s*\(\)', 3),
|
||||
(r'\bvoid\s+FixedUpdate\s*\(\)', 4),
|
||||
(r'\bvoid\s+LateUpdate\s*\(\)', 4),
|
||||
(r'\bvoid\s+OnCollisionEnter', 4),
|
||||
(r'\bvoid\s+OnTriggerEnter', 4),
|
||||
(r'\bIEnumerator\b', 4),
|
||||
(r'\bStartCoroutine\s*\(', 4),
|
||||
(r'\byield\s+return\s+new\s+WaitForSeconds', 4),
|
||||
(r'\byield\s+return\s+null', 3),
|
||||
(r'\byield\s+return', 4),
|
||||
(r'\[SerializeField\]', 4),
|
||||
(r'\[RequireComponent', 4),
|
||||
(r'\[Header\(', 3),
|
||||
(r'\[Range\(', 3),
|
||||
(r'\bTime\.deltaTime\b', 4),
|
||||
(r'\bInput\.Get', 4),
|
||||
(r'\bRigidbody\b', 3),
|
||||
(r'\bCollider\b', 3),
|
||||
(r'\bRenderer\b', 3),
|
||||
(r'\bGetComponent<', 3),
|
||||
|
||||
(r"\busing\s+UnityEngine", 5),
|
||||
(r"\bMonoBehaviour\b", 5),
|
||||
(r"\bGameObject\b", 4),
|
||||
(r"\bTransform\b", 4),
|
||||
(r"\bVector[23]\b", 3),
|
||||
(r"\bQuaternion\b", 3),
|
||||
(r"\bvoid\s+Start\s*\(\)", 4),
|
||||
(r"\bvoid\s+Update\s*\(\)", 4),
|
||||
(r"\bvoid\s+Awake\s*\(\)", 4),
|
||||
(r"\bvoid\s+OnEnable\s*\(\)", 3),
|
||||
(r"\bvoid\s+OnDisable\s*\(\)", 3),
|
||||
(r"\bvoid\s+FixedUpdate\s*\(\)", 4),
|
||||
(r"\bvoid\s+LateUpdate\s*\(\)", 4),
|
||||
(r"\bvoid\s+OnCollisionEnter", 4),
|
||||
(r"\bvoid\s+OnTriggerEnter", 4),
|
||||
(r"\bIEnumerator\b", 4),
|
||||
(r"\bStartCoroutine\s*\(", 4),
|
||||
(r"\byield\s+return\s+new\s+WaitForSeconds", 4),
|
||||
(r"\byield\s+return\s+null", 3),
|
||||
(r"\byield\s+return", 4),
|
||||
(r"\[SerializeField\]", 4),
|
||||
(r"\[RequireComponent", 4),
|
||||
(r"\[Header\(", 3),
|
||||
(r"\[Range\(", 3),
|
||||
(r"\bTime\.deltaTime\b", 4),
|
||||
(r"\bInput\.Get", 4),
|
||||
(r"\bRigidbody\b", 3),
|
||||
(r"\bCollider\b", 3),
|
||||
(r"\bRenderer\b", 3),
|
||||
(r"\bGetComponent<", 3),
|
||||
# Basic C# patterns (weight 2-4)
|
||||
(r'\bnamespace\s+\w+', 3),
|
||||
(r'\busing\s+System', 3),
|
||||
(r'\bConsole\.WriteLine', 4), # C#-specific output
|
||||
(r'\bConsole\.Write', 3),
|
||||
(r'\bpublic\s+class\s+\w+', 4), # Increased to match Java weight
|
||||
(r'\bprivate\s+class\s+\w+', 3),
|
||||
(r'\binternal\s+class\s+\w+', 4), # C#-specific modifier
|
||||
(r'\bstring\s+\w+\s*[;=]', 2), # C#-specific lowercase string
|
||||
(r'\bprivate\s+\w+\s+\w+\s*;', 2), # Private fields (common in both C# and Java)
|
||||
(r'\{\s*get;\s*set;\s*\}', 3), # Auto properties
|
||||
(r'\{\s*get;\s*private\s+set;\s*\}', 3),
|
||||
(r'\{\s*get\s*=>\s*', 2), # Expression properties
|
||||
(r'\bpublic\s+static\s+void\s+', 2),
|
||||
|
||||
(r"\bnamespace\s+\w+", 3),
|
||||
(r"\busing\s+System", 3),
|
||||
(r"\bConsole\.WriteLine", 4), # C#-specific output
|
||||
(r"\bConsole\.Write", 3),
|
||||
(r"\bpublic\s+class\s+\w+", 4), # Increased to match Java weight
|
||||
(r"\bprivate\s+class\s+\w+", 3),
|
||||
(r"\binternal\s+class\s+\w+", 4), # C#-specific modifier
|
||||
(r"\bstring\s+\w+\s*[;=]", 2), # C#-specific lowercase string
|
||||
(r"\bprivate\s+\w+\s+\w+\s*;", 2), # Private fields (common in both C# and Java)
|
||||
(r"\{\s*get;\s*set;\s*\}", 3), # Auto properties
|
||||
(r"\{\s*get;\s*private\s+set;\s*\}", 3),
|
||||
(r"\{\s*get\s*=>\s*", 2), # Expression properties
|
||||
(r"\bpublic\s+static\s+void\s+", 2),
|
||||
# Modern C# patterns (weight 2)
|
||||
(r'\bfrom\s+\w+\s+in\s+', 2), # LINQ
|
||||
(r'\.Where\s*\(', 2),
|
||||
(r'\.Select\s*\(', 2),
|
||||
(r'\basync\s+Task', 2),
|
||||
(r'\bawait\s+', 2),
|
||||
(r'\bvar\s+\w+\s*=', 1),
|
||||
(r"\bfrom\s+\w+\s+in\s+", 2), # LINQ
|
||||
(r"\.Where\s*\(", 2),
|
||||
(r"\.Select\s*\(", 2),
|
||||
(r"\basync\s+Task", 2),
|
||||
(r"\bawait\s+", 2),
|
||||
(r"\bvar\s+\w+\s*=", 1),
|
||||
],
|
||||
|
||||
# ===== PRIORITY 2: Frontend Languages =====
|
||||
'typescript': [
|
||||
"typescript": [
|
||||
# TypeScript-specific (weight 4-5)
|
||||
(r'\binterface\s+\w+\s*\{', 5),
|
||||
(r'\btype\s+\w+\s*=', 4),
|
||||
(r':\s*\w+\s*=', 3), # Type annotation
|
||||
(r':\s*\w+\[\]', 3), # Array type
|
||||
(r'<[\w,\s]+>', 2), # Generic type
|
||||
(r'\bas\s+\w+', 2), # Type assertion
|
||||
(r'\benum\s+\w+\s*\{', 4),
|
||||
(r'\bimplements\s+\w+', 3),
|
||||
(r'\bexport\s+interface', 4),
|
||||
(r'\bexport\s+type', 4),
|
||||
|
||||
(r"\binterface\s+\w+\s*\{", 5),
|
||||
(r"\btype\s+\w+\s*=", 4),
|
||||
(r":\s*\w+\s*=", 3), # Type annotation
|
||||
(r":\s*\w+\[\]", 3), # Array type
|
||||
(r"<[\w,\s]+>", 2), # Generic type
|
||||
(r"\bas\s+\w+", 2), # Type assertion
|
||||
(r"\benum\s+\w+\s*\{", 4),
|
||||
(r"\bimplements\s+\w+", 3),
|
||||
(r"\bexport\s+interface", 4),
|
||||
(r"\bexport\s+type", 4),
|
||||
# Also has JS patterns (weight 1)
|
||||
(r'\bconst\s+\w+\s*=', 1),
|
||||
(r'\blet\s+\w+\s*=', 1),
|
||||
(r'=>', 1),
|
||||
(r"\bconst\s+\w+\s*=", 1),
|
||||
(r"\blet\s+\w+\s*=", 1),
|
||||
(r"=>", 1),
|
||||
],
|
||||
|
||||
'javascript': [
|
||||
(r'\bfunction\s+\w+\s*\(', 3),
|
||||
(r'\bconst\s+\w+\s*=', 2),
|
||||
(r'\blet\s+\w+\s*=', 2),
|
||||
(r'=>', 2), # Arrow function
|
||||
(r'\bconsole\.log', 2),
|
||||
(r'\bvar\s+\w+\s*=', 1),
|
||||
(r'\.then\s*\(', 2), # Promise
|
||||
(r'\.catch\s*\(', 2), # Promise
|
||||
(r'\basync\s+function', 3),
|
||||
(r'\bawait\s+', 2),
|
||||
(r'require\s*\(', 2), # CommonJS
|
||||
(r'\bexport\s+default', 2), # ES6
|
||||
(r'\bexport\s+const', 2),
|
||||
"javascript": [
|
||||
(r"\bfunction\s+\w+\s*\(", 3),
|
||||
(r"\bconst\s+\w+\s*=", 2),
|
||||
(r"\blet\s+\w+\s*=", 2),
|
||||
(r"=>", 2), # Arrow function
|
||||
(r"\bconsole\.log", 2),
|
||||
(r"\bvar\s+\w+\s*=", 1),
|
||||
(r"\.then\s*\(", 2), # Promise
|
||||
(r"\.catch\s*\(", 2), # Promise
|
||||
(r"\basync\s+function", 3),
|
||||
(r"\bawait\s+", 2),
|
||||
(r"require\s*\(", 2), # CommonJS
|
||||
(r"\bexport\s+default", 2), # ES6
|
||||
(r"\bexport\s+const", 2),
|
||||
],
|
||||
|
||||
'jsx': [
|
||||
"jsx": [
|
||||
# JSX patterns (weight 4-5)
|
||||
(r'<\w+\s+[^>]*>', 4), # JSX tag with attributes
|
||||
(r'<\w+\s*/>', 4), # Self-closing tag
|
||||
(r'className=', 3), # React className
|
||||
(r'onClick=', 3), # React event
|
||||
(r'\brender\s*\(\s*\)\s*\{', 4), # React render
|
||||
(r'\buseState\s*\(', 4), # React hook
|
||||
(r'\buseEffect\s*\(', 4), # React hook
|
||||
(r'\buseRef\s*\(', 3),
|
||||
(r'\buseCallback\s*\(', 3),
|
||||
(r'\buseMemo\s*\(', 3),
|
||||
|
||||
(r"<\w+\s+[^>]*>", 4), # JSX tag with attributes
|
||||
(r"<\w+\s*/>", 4), # Self-closing tag
|
||||
(r"className=", 3), # React className
|
||||
(r"onClick=", 3), # React event
|
||||
(r"\brender\s*\(\s*\)\s*\{", 4), # React render
|
||||
(r"\buseState\s*\(", 4), # React hook
|
||||
(r"\buseEffect\s*\(", 4), # React hook
|
||||
(r"\buseRef\s*\(", 3),
|
||||
(r"\buseCallback\s*\(", 3),
|
||||
(r"\buseMemo\s*\(", 3),
|
||||
# Also has JS patterns
|
||||
(r'\bconst\s+\w+\s*=', 1),
|
||||
(r'=>', 1),
|
||||
(r"\bconst\s+\w+\s*=", 1),
|
||||
(r"=>", 1),
|
||||
],
|
||||
|
||||
'tsx': [
|
||||
"tsx": [
|
||||
# TSX = TypeScript + JSX (weight 5)
|
||||
(r'<\w+\s+[^>]*>', 3), # JSX tag
|
||||
(r':\s*React\.\w+', 5), # React types
|
||||
(r'interface\s+\w+Props', 5), # Props interface
|
||||
(r'\bFunctionComponent<', 4),
|
||||
(r'\bReact\.FC<', 4),
|
||||
(r'\buseState<', 4), # Typed hook
|
||||
(r'\buseRef<', 3),
|
||||
|
||||
(r"<\w+\s+[^>]*>", 3), # JSX tag
|
||||
(r":\s*React\.\w+", 5), # React types
|
||||
(r"interface\s+\w+Props", 5), # Props interface
|
||||
(r"\bFunctionComponent<", 4),
|
||||
(r"\bReact\.FC<", 4),
|
||||
(r"\buseState<", 4), # Typed hook
|
||||
(r"\buseRef<", 3),
|
||||
# Also has TS patterns
|
||||
(r'\binterface\s+\w+', 2),
|
||||
(r'\btype\s+\w+\s*=', 2),
|
||||
(r"\binterface\s+\w+", 2),
|
||||
(r"\btype\s+\w+\s*=", 2),
|
||||
],
|
||||
|
||||
'vue': [
|
||||
"vue": [
|
||||
# Vue SFC patterns (weight 4-5)
|
||||
(r'<template>', 5),
|
||||
(r'<script>', 3),
|
||||
(r'<style\s+scoped>', 4),
|
||||
(r'\bexport\s+default\s*\{', 3),
|
||||
(r'\bdata\s*\(\s*\)\s*\{', 4), # Vue 2
|
||||
(r'\bcomputed\s*:', 3),
|
||||
(r'\bmethods\s*:', 3),
|
||||
(r'\bsetup\s*\(', 4), # Vue 3 Composition
|
||||
(r'\bref\s*\(', 4), # Vue 3
|
||||
(r'\breactive\s*\(', 4), # Vue 3
|
||||
(r'v-bind:', 3),
|
||||
(r'v-for=', 3),
|
||||
(r'v-if=', 3),
|
||||
(r'v-model=', 3),
|
||||
(r"<template>", 5),
|
||||
(r"<script>", 3),
|
||||
(r"<style\s+scoped>", 4),
|
||||
(r"\bexport\s+default\s*\{", 3),
|
||||
(r"\bdata\s*\(\s*\)\s*\{", 4), # Vue 2
|
||||
(r"\bcomputed\s*:", 3),
|
||||
(r"\bmethods\s*:", 3),
|
||||
(r"\bsetup\s*\(", 4), # Vue 3 Composition
|
||||
(r"\bref\s*\(", 4), # Vue 3
|
||||
(r"\breactive\s*\(", 4), # Vue 3
|
||||
(r"v-bind:", 3),
|
||||
(r"v-for=", 3),
|
||||
(r"v-if=", 3),
|
||||
(r"v-model=", 3),
|
||||
],
|
||||
|
||||
# ===== PRIORITY 3: Backend Languages =====
|
||||
'java': [
|
||||
(r'\bpublic\s+class\s+\w+', 4),
|
||||
(r'\bprivate\s+\w+\s+\w+', 2),
|
||||
(r'\bSystem\.out\.println', 3),
|
||||
(r'\bpublic\s+static\s+void\s+main', 4),
|
||||
(r'\bpublic\s+\w+\s+\w+\s*\(', 2),
|
||||
(r'@Override', 3),
|
||||
(r'@Autowired', 3), # Spring
|
||||
(r'@Service', 3), # Spring
|
||||
(r'@RestController', 3), # Spring
|
||||
(r'@GetMapping', 3), # Spring
|
||||
(r'@PostMapping', 3), # Spring
|
||||
(r'\bimport\s+java\.', 2),
|
||||
(r'\bextends\s+\w+', 2),
|
||||
"java": [
|
||||
(r"\bpublic\s+class\s+\w+", 4),
|
||||
(r"\bprivate\s+\w+\s+\w+", 2),
|
||||
(r"\bSystem\.out\.println", 3),
|
||||
(r"\bpublic\s+static\s+void\s+main", 4),
|
||||
(r"\bpublic\s+\w+\s+\w+\s*\(", 2),
|
||||
(r"@Override", 3),
|
||||
(r"@Autowired", 3), # Spring
|
||||
(r"@Service", 3), # Spring
|
||||
(r"@RestController", 3), # Spring
|
||||
(r"@GetMapping", 3), # Spring
|
||||
(r"@PostMapping", 3), # Spring
|
||||
(r"\bimport\s+java\.", 2),
|
||||
(r"\bextends\s+\w+", 2),
|
||||
],
|
||||
|
||||
'go': [
|
||||
(r'\bfunc\s+\w+\s*\(', 3),
|
||||
(r'\bpackage\s+\w+', 4),
|
||||
(r':=', 3), # Short declaration
|
||||
(r'\bfmt\.Print', 2),
|
||||
(r'\bfunc\s+\(.*\)\s+\w+\s*\(', 4), # Method
|
||||
(r'\bdefer\s+', 3),
|
||||
(r'\bgo\s+\w+\s*\(', 3), # Goroutine
|
||||
(r'\bchan\s+', 3), # Channel
|
||||
(r'\binterface\{\}', 2), # Empty interface
|
||||
(r'\bfunc\s+main\s*\(\)', 4),
|
||||
"go": [
|
||||
(r"\bfunc\s+\w+\s*\(", 3),
|
||||
(r"\bpackage\s+\w+", 4),
|
||||
(r":=", 3), # Short declaration
|
||||
(r"\bfmt\.Print", 2),
|
||||
(r"\bfunc\s+\(.*\)\s+\w+\s*\(", 4), # Method
|
||||
(r"\bdefer\s+", 3),
|
||||
(r"\bgo\s+\w+\s*\(", 3), # Goroutine
|
||||
(r"\bchan\s+", 3), # Channel
|
||||
(r"\binterface\{\}", 2), # Empty interface
|
||||
(r"\bfunc\s+main\s*\(\)", 4),
|
||||
],
|
||||
|
||||
'rust': [
|
||||
(r'\bfn\s+\w+\s*\(', 4),
|
||||
(r'\blet\s+mut\s+\w+', 3),
|
||||
(r'\bprintln!', 3),
|
||||
(r'\bimpl\s+\w+', 3),
|
||||
(r'\buse\s+\w+::', 3),
|
||||
(r'\bpub\s+fn\s+', 3),
|
||||
(r'\bmatch\s+\w+\s*\{', 3),
|
||||
(r'\bSome\(', 2),
|
||||
(r'\bNone\b', 2),
|
||||
(r'\bResult<', 3),
|
||||
(r'\bOption<', 3),
|
||||
(r'&str\b', 2),
|
||||
(r'\bfn\s+main\s*\(\)', 4),
|
||||
"rust": [
|
||||
(r"\bfn\s+\w+\s*\(", 4),
|
||||
(r"\blet\s+mut\s+\w+", 3),
|
||||
(r"\bprintln!", 3),
|
||||
(r"\bimpl\s+\w+", 3),
|
||||
(r"\buse\s+\w+::", 3),
|
||||
(r"\bpub\s+fn\s+", 3),
|
||||
(r"\bmatch\s+\w+\s*\{", 3),
|
||||
(r"\bSome\(", 2),
|
||||
(r"\bNone\b", 2),
|
||||
(r"\bResult<", 3),
|
||||
(r"\bOption<", 3),
|
||||
(r"&str\b", 2),
|
||||
(r"\bfn\s+main\s*\(\)", 4),
|
||||
],
|
||||
|
||||
'php': [
|
||||
(r'<\?php', 5),
|
||||
(r'\$\w+\s*=', 2),
|
||||
(r'\bfunction\s+\w+\s*\(', 2),
|
||||
(r'\bpublic\s+function', 3),
|
||||
(r'\bprivate\s+function', 3),
|
||||
(r'\bclass\s+\w+', 3),
|
||||
(r'\bnamespace\s+\w+', 3),
|
||||
(r'\buse\s+\w+\\', 2),
|
||||
(r'->', 2), # Object operator
|
||||
(r'::', 1), # Static operator
|
||||
"php": [
|
||||
(r"<\?php", 5),
|
||||
(r"\$\w+\s*=", 2),
|
||||
(r"\bfunction\s+\w+\s*\(", 2),
|
||||
(r"\bpublic\s+function", 3),
|
||||
(r"\bprivate\s+function", 3),
|
||||
(r"\bclass\s+\w+", 3),
|
||||
(r"\bnamespace\s+\w+", 3),
|
||||
(r"\buse\s+\w+\\", 2),
|
||||
(r"->", 2), # Object operator
|
||||
(r"::", 1), # Static operator
|
||||
],
|
||||
|
||||
# ===== PRIORITY 4: System/Data Languages =====
|
||||
'python': [
|
||||
(r'\bdef\s+\w+\s*\(', 3),
|
||||
(r'\bimport\s+\w+', 2),
|
||||
(r'\bclass\s+\w+:', 3),
|
||||
(r'\bfrom\s+\w+\s+import', 2),
|
||||
(r':\s*$', 1), # Lines ending with :
|
||||
(r'@\w+', 2), # Decorator
|
||||
(r'\bself\.\w+', 2),
|
||||
(r'\b__init__\s*\(', 3),
|
||||
(r'\basync\s+def\s+', 3),
|
||||
(r'\bawait\s+', 2),
|
||||
(r'\bprint\s*\(', 1),
|
||||
"python": [
|
||||
(r"\bdef\s+\w+\s*\(", 3),
|
||||
(r"\bimport\s+\w+", 2),
|
||||
(r"\bclass\s+\w+:", 3),
|
||||
(r"\bfrom\s+\w+\s+import", 2),
|
||||
(r":\s*$", 1), # Lines ending with :
|
||||
(r"@\w+", 2), # Decorator
|
||||
(r"\bself\.\w+", 2),
|
||||
(r"\b__init__\s*\(", 3),
|
||||
(r"\basync\s+def\s+", 3),
|
||||
(r"\bawait\s+", 2),
|
||||
(r"\bprint\s*\(", 1),
|
||||
],
|
||||
|
||||
'r': [
|
||||
(r'<-', 4), # Assignment operator
|
||||
(r'\bfunction\s*\(', 2),
|
||||
(r'\blibrary\s*\(', 3),
|
||||
(r'\bggplot\s*\(', 4), # ggplot2
|
||||
(r'\bdata\.frame\s*\(', 3),
|
||||
(r'\%>\%', 4), # Pipe operator
|
||||
(r'\bsummary\s*\(', 2),
|
||||
(r'\bread\.csv\s*\(', 3),
|
||||
"r": [
|
||||
(r"<-", 4), # Assignment operator
|
||||
(r"\bfunction\s*\(", 2),
|
||||
(r"\blibrary\s*\(", 3),
|
||||
(r"\bggplot\s*\(", 4), # ggplot2
|
||||
(r"\bdata\.frame\s*\(", 3),
|
||||
(r"\%>\%", 4), # Pipe operator
|
||||
(r"\bsummary\s*\(", 2),
|
||||
(r"\bread\.csv\s*\(", 3),
|
||||
],
|
||||
|
||||
'julia': [
|
||||
(r'\bfunction\s+\w+\s*\(', 3),
|
||||
(r'\bend\b', 2),
|
||||
(r'\busing\s+\w+', 3),
|
||||
(r'::', 2), # Type annotation
|
||||
(r'\bmodule\s+\w+', 3),
|
||||
(r'\babstract\s+type', 3),
|
||||
(r'\bstruct\s+\w+', 3),
|
||||
"julia": [
|
||||
(r"\bfunction\s+\w+\s*\(", 3),
|
||||
(r"\bend\b", 2),
|
||||
(r"\busing\s+\w+", 3),
|
||||
(r"::", 2), # Type annotation
|
||||
(r"\bmodule\s+\w+", 3),
|
||||
(r"\babstract\s+type", 3),
|
||||
(r"\bstruct\s+\w+", 3),
|
||||
],
|
||||
|
||||
'sql': [
|
||||
(r'\bSELECT\s+', 4),
|
||||
(r'\bFROM\s+', 3),
|
||||
(r'\bWHERE\s+', 2),
|
||||
(r'\bINSERT\s+INTO', 4),
|
||||
(r'\bCREATE\s+TABLE', 4),
|
||||
(r'\bJOIN\s+', 3),
|
||||
(r'\bGROUP\s+BY', 3),
|
||||
(r'\bORDER\s+BY', 3),
|
||||
(r'\bUPDATE\s+', 3),
|
||||
(r'\bDELETE\s+FROM', 3),
|
||||
"sql": [
|
||||
(r"\bSELECT\s+", 4),
|
||||
(r"\bFROM\s+", 3),
|
||||
(r"\bWHERE\s+", 2),
|
||||
(r"\bINSERT\s+INTO", 4),
|
||||
(r"\bCREATE\s+TABLE", 4),
|
||||
(r"\bJOIN\s+", 3),
|
||||
(r"\bGROUP\s+BY", 3),
|
||||
(r"\bORDER\s+BY", 3),
|
||||
(r"\bUPDATE\s+", 3),
|
||||
(r"\bDELETE\s+FROM", 3),
|
||||
],
|
||||
|
||||
# ===== Additional Languages =====
|
||||
'cpp': [
|
||||
(r'#include\s*<', 4),
|
||||
(r'\bstd::', 3),
|
||||
(r'\bnamespace\s+\w+', 3),
|
||||
(r'\bcout\s*<<', 3),
|
||||
(r'\bvoid\s+\w+\s*\(', 2),
|
||||
(r'\bint\s+main\s*\(', 4),
|
||||
(r'->', 2), # Pointer
|
||||
"cpp": [
|
||||
(r"#include\s*<", 4),
|
||||
(r"\bstd::", 3),
|
||||
(r"\bnamespace\s+\w+", 3),
|
||||
(r"\bcout\s*<<", 3),
|
||||
(r"\bvoid\s+\w+\s*\(", 2),
|
||||
(r"\bint\s+main\s*\(", 4),
|
||||
(r"->", 2), # Pointer
|
||||
],
|
||||
|
||||
'c': [
|
||||
(r'#include\s*<', 4),
|
||||
(r'\bprintf\s*\(', 3),
|
||||
(r'\bint\s+main\s*\(', 4),
|
||||
(r'\bvoid\s+\w+\s*\(', 2),
|
||||
(r'\bstruct\s+\w+', 3),
|
||||
"c": [
|
||||
(r"#include\s*<", 4),
|
||||
(r"\bprintf\s*\(", 3),
|
||||
(r"\bint\s+main\s*\(", 4),
|
||||
(r"\bvoid\s+\w+\s*\(", 2),
|
||||
(r"\bstruct\s+\w+", 3),
|
||||
],
|
||||
|
||||
'gdscript': [
|
||||
(r'\bfunc\s+\w+\s*\(', 3),
|
||||
(r'\bvar\s+\w+\s*=', 3),
|
||||
(r'\bextends\s+\w+', 4),
|
||||
(r'\b_ready\s*\(', 4),
|
||||
(r'\b_process\s*\(', 4),
|
||||
"gdscript": [
|
||||
(r"\bfunc\s+\w+\s*\(", 3),
|
||||
(r"\bvar\s+\w+\s*=", 3),
|
||||
(r"\bextends\s+\w+", 4),
|
||||
(r"\b_ready\s*\(", 4),
|
||||
(r"\b_process\s*\(", 4),
|
||||
],
|
||||
|
||||
# ===== Markup/Config Languages =====
|
||||
'html': [
|
||||
(r'<!DOCTYPE\s+html>', 5),
|
||||
(r'<html', 4),
|
||||
(r'<head>', 3),
|
||||
(r'<body>', 3),
|
||||
(r'<div', 2),
|
||||
(r'<span', 2),
|
||||
(r'<script', 2),
|
||||
"html": [
|
||||
(r"<!DOCTYPE\s+html>", 5),
|
||||
(r"<html", 4),
|
||||
(r"<head>", 3),
|
||||
(r"<body>", 3),
|
||||
(r"<div", 2),
|
||||
(r"<span", 2),
|
||||
(r"<script", 2),
|
||||
],
|
||||
|
||||
'css': [
|
||||
(r'\{\s*[\w-]+\s*:', 3),
|
||||
(r'@media', 3),
|
||||
(r'\.[\w-]+\s*\{', 2),
|
||||
(r'#[\w-]+\s*\{', 2),
|
||||
(r'@import', 2),
|
||||
"css": [
|
||||
(r"\{\s*[\w-]+\s*:", 3),
|
||||
(r"@media", 3),
|
||||
(r"\.[\w-]+\s*\{", 2),
|
||||
(r"#[\w-]+\s*\{", 2),
|
||||
(r"@import", 2),
|
||||
],
|
||||
|
||||
'json': [
|
||||
(r'^\s*\{', 3),
|
||||
(r'^\s*\[', 3),
|
||||
"json": [
|
||||
(r"^\s*\{", 3),
|
||||
(r"^\s*\[", 3),
|
||||
(r'"\w+"\s*:', 3),
|
||||
(r':\s*["\d\[\{]', 2),
|
||||
],
|
||||
|
||||
'yaml': [
|
||||
(r'^\w+:', 3),
|
||||
(r'^\s+-\s+\w+', 2),
|
||||
(r'---', 2),
|
||||
(r'^\s+\w+:', 2),
|
||||
"yaml": [
|
||||
(r"^\w+:", 3),
|
||||
(r"^\s+-\s+\w+", 2),
|
||||
(r"---", 2),
|
||||
(r"^\s+\w+:", 2),
|
||||
],
|
||||
|
||||
'xml': [
|
||||
(r'<\?xml', 5),
|
||||
(r'<\w+\s+\w+=', 2),
|
||||
(r'<\w+>', 1),
|
||||
(r'</\w+>', 1),
|
||||
"xml": [
|
||||
(r"<\?xml", 5),
|
||||
(r"<\w+\s+\w+=", 2),
|
||||
(r"<\w+>", 1),
|
||||
(r"</\w+>", 1),
|
||||
],
|
||||
|
||||
'markdown': [
|
||||
(r'^#+\s+', 3),
|
||||
(r'^\*\*\w+\*\*', 2),
|
||||
(r'^\s*[-*]\s+', 2),
|
||||
(r'\[.*\]\(.*\)', 2),
|
||||
"markdown": [
|
||||
(r"^#+\s+", 3),
|
||||
(r"^\*\*\w+\*\*", 2),
|
||||
(r"^\s*[-*]\s+", 2),
|
||||
(r"\[.*\]\(.*\)", 2),
|
||||
],
|
||||
|
||||
'bash': [
|
||||
(r'#!/bin/bash', 5),
|
||||
(r'#!/bin/sh', 5),
|
||||
(r'\becho\s+', 2),
|
||||
(r'\$\{?\w+\}?', 2),
|
||||
(r'\bif\s+\[', 2),
|
||||
(r'\bfor\s+\w+\s+in', 2),
|
||||
"bash": [
|
||||
(r"#!/bin/bash", 5),
|
||||
(r"#!/bin/sh", 5),
|
||||
(r"\becho\s+", 2),
|
||||
(r"\$\{?\w+\}?", 2),
|
||||
(r"\bif\s+\[", 2),
|
||||
(r"\bfor\s+\w+\s+in", 2),
|
||||
],
|
||||
|
||||
'shell': [
|
||||
(r'#!/bin/bash', 5),
|
||||
(r'#!/bin/sh', 5),
|
||||
(r'\becho\s+', 2),
|
||||
(r'\$\{?\w+\}?', 2),
|
||||
"shell": [
|
||||
(r"#!/bin/bash", 5),
|
||||
(r"#!/bin/sh", 5),
|
||||
(r"\becho\s+", 2),
|
||||
(r"\$\{?\w+\}?", 2),
|
||||
],
|
||||
|
||||
'powershell': [
|
||||
(r'\$\w+\s*=', 2),
|
||||
(r'Get-\w+', 3),
|
||||
(r'Set-\w+', 3),
|
||||
(r'\bWrite-Host\s+', 2),
|
||||
"powershell": [
|
||||
(r"\$\w+\s*=", 2),
|
||||
(r"Get-\w+", 3),
|
||||
(r"Set-\w+", 3),
|
||||
(r"\bWrite-Host\s+", 2),
|
||||
],
|
||||
}
|
||||
|
||||
@@ -414,11 +373,42 @@ LANGUAGE_PATTERNS.update(SWIFT_PATTERNS)
|
||||
|
||||
# Known language list for CSS class detection
|
||||
KNOWN_LANGUAGES = [
|
||||
"javascript", "java", "xml", "html", "python", "bash", "cpp", "typescript",
|
||||
"go", "rust", "php", "ruby", "swift", "kotlin", "csharp", "c", "sql",
|
||||
"yaml", "json", "markdown", "css", "scss", "sass", "jsx", "tsx", "vue",
|
||||
"shell", "powershell", "r", "scala", "dart", "perl", "lua", "elixir",
|
||||
"julia", "gdscript",
|
||||
"javascript",
|
||||
"java",
|
||||
"xml",
|
||||
"html",
|
||||
"python",
|
||||
"bash",
|
||||
"cpp",
|
||||
"typescript",
|
||||
"go",
|
||||
"rust",
|
||||
"php",
|
||||
"ruby",
|
||||
"swift",
|
||||
"kotlin",
|
||||
"csharp",
|
||||
"c",
|
||||
"sql",
|
||||
"yaml",
|
||||
"json",
|
||||
"markdown",
|
||||
"css",
|
||||
"scss",
|
||||
"sass",
|
||||
"jsx",
|
||||
"tsx",
|
||||
"vue",
|
||||
"shell",
|
||||
"powershell",
|
||||
"r",
|
||||
"scala",
|
||||
"dart",
|
||||
"perl",
|
||||
"lua",
|
||||
"elixir",
|
||||
"julia",
|
||||
"gdscript",
|
||||
]
|
||||
|
||||
|
||||
@@ -452,7 +442,7 @@ class LanguageDetector:
|
||||
0.3 = low, 0.5 = medium, 0.7 = high
|
||||
"""
|
||||
self.min_confidence = min_confidence
|
||||
self._pattern_cache: Dict[str, List[Tuple[re.Pattern, int]]] = {}
|
||||
self._pattern_cache: dict[str, list[tuple[re.Pattern, int]]] = {}
|
||||
self._compile_patterns()
|
||||
|
||||
def _compile_patterns(self) -> None:
|
||||
@@ -465,27 +455,28 @@ class LanguageDetector:
|
||||
compiled_patterns.append((compiled, weight))
|
||||
except re.error as e:
|
||||
logger.error(
|
||||
"Invalid regex pattern for language '%s' at index %d: '%s'. "
|
||||
"Error: %s. Pattern skipped.",
|
||||
lang, i, pattern[:50], e
|
||||
"Invalid regex pattern for language '%s' at index %d: '%s'. Error: %s. Pattern skipped.",
|
||||
lang,
|
||||
i,
|
||||
pattern[:50],
|
||||
e,
|
||||
)
|
||||
except TypeError as e:
|
||||
except TypeError:
|
||||
logger.error(
|
||||
"Pattern for language '%s' at index %d is not a string: %s. "
|
||||
"Pattern skipped.",
|
||||
lang, i, type(pattern).__name__
|
||||
"Pattern for language '%s' at index %d is not a string: %s. Pattern skipped.",
|
||||
lang,
|
||||
i,
|
||||
type(pattern).__name__,
|
||||
)
|
||||
|
||||
if compiled_patterns:
|
||||
self._pattern_cache[lang] = compiled_patterns
|
||||
else:
|
||||
logger.warning(
|
||||
"No valid patterns compiled for language '%s'. "
|
||||
"Detection for this language is disabled.",
|
||||
lang
|
||||
"No valid patterns compiled for language '%s'. Detection for this language is disabled.", lang
|
||||
)
|
||||
|
||||
def detect_from_html(self, elem, code: str) -> Tuple[str, float]:
|
||||
def detect_from_html(self, elem, code: str) -> tuple[str, float]:
|
||||
"""
|
||||
Detect language from HTML element with CSS classes + code content.
|
||||
|
||||
@@ -498,21 +489,21 @@ class LanguageDetector:
|
||||
"""
|
||||
# Tier 1: CSS classes (confidence 1.0)
|
||||
if elem:
|
||||
css_lang = self.extract_language_from_classes(elem.get('class', []))
|
||||
css_lang = self.extract_language_from_classes(elem.get("class", []))
|
||||
if css_lang:
|
||||
return css_lang, 1.0
|
||||
|
||||
# Check parent pre element
|
||||
parent = elem.parent
|
||||
if parent and parent.name == 'pre':
|
||||
css_lang = self.extract_language_from_classes(parent.get('class', []))
|
||||
if parent and parent.name == "pre":
|
||||
css_lang = self.extract_language_from_classes(parent.get("class", []))
|
||||
if css_lang:
|
||||
return css_lang, 1.0
|
||||
|
||||
# Tier 2: Pattern matching
|
||||
return self.detect_from_code(code)
|
||||
|
||||
def detect_from_code(self, code: str) -> Tuple[str, float]:
|
||||
def detect_from_code(self, code: str) -> tuple[str, float]:
|
||||
"""
|
||||
Detect language from code content only (for PDFs, GitHub files).
|
||||
|
||||
@@ -524,13 +515,13 @@ class LanguageDetector:
|
||||
"""
|
||||
# Edge case: code too short
|
||||
if len(code.strip()) < 10:
|
||||
return 'unknown', 0.0
|
||||
return "unknown", 0.0
|
||||
|
||||
# Calculate confidence scores for all languages
|
||||
scores = self._calculate_confidence(code)
|
||||
|
||||
if not scores:
|
||||
return 'unknown', 0.0
|
||||
return "unknown", 0.0
|
||||
|
||||
# Get language with highest score
|
||||
best_lang = max(scores.items(), key=lambda x: x[1])
|
||||
@@ -538,11 +529,11 @@ class LanguageDetector:
|
||||
|
||||
# Apply minimum confidence threshold
|
||||
if confidence < self.min_confidence:
|
||||
return 'unknown', 0.0
|
||||
return "unknown", 0.0
|
||||
|
||||
return lang, confidence
|
||||
|
||||
def extract_language_from_classes(self, classes: List[str]) -> Optional[str]:
|
||||
def extract_language_from_classes(self, classes: list[str]) -> str | None:
|
||||
"""
|
||||
Extract language from CSS class list.
|
||||
|
||||
@@ -563,21 +554,21 @@ class LanguageDetector:
|
||||
|
||||
for cls in classes:
|
||||
# Handle brush: pattern
|
||||
if 'brush:' in cls:
|
||||
parts = cls.split('brush:')
|
||||
if "brush:" in cls:
|
||||
parts = cls.split("brush:")
|
||||
if len(parts) > 1:
|
||||
lang = parts[1].strip().lower()
|
||||
if lang in KNOWN_LANGUAGES:
|
||||
return lang
|
||||
|
||||
# Handle language- prefix
|
||||
if cls.startswith('language-'):
|
||||
if cls.startswith("language-"):
|
||||
lang = cls[9:].lower()
|
||||
if lang in KNOWN_LANGUAGES:
|
||||
return lang
|
||||
|
||||
# Handle lang- prefix
|
||||
if cls.startswith('lang-'):
|
||||
if cls.startswith("lang-"):
|
||||
lang = cls[5:].lower()
|
||||
if lang in KNOWN_LANGUAGES:
|
||||
return lang
|
||||
@@ -588,7 +579,7 @@ class LanguageDetector:
|
||||
|
||||
return None
|
||||
|
||||
def _calculate_confidence(self, code: str) -> Dict[str, float]:
|
||||
def _calculate_confidence(self, code: str) -> dict[str, float]:
|
||||
"""
|
||||
Calculate weighted confidence scores for all languages.
|
||||
|
||||
@@ -598,7 +589,7 @@ class LanguageDetector:
|
||||
Returns:
|
||||
Dictionary mapping language names to confidence scores (0.0-1.0)
|
||||
"""
|
||||
scores: Dict[str, float] = {}
|
||||
scores: dict[str, float] = {}
|
||||
|
||||
for lang, compiled_patterns in self._pattern_cache.items():
|
||||
total_score = 0
|
||||
|
||||
@@ -1,23 +1,20 @@
|
||||
# ABOUTME: Detects and validates llms.txt file availability at documentation URLs
|
||||
# ABOUTME: Supports llms-full.txt, llms.txt, and llms-small.txt variants
|
||||
|
||||
import requests
|
||||
from typing import Optional, Dict, List
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import requests
|
||||
|
||||
|
||||
class LlmsTxtDetector:
|
||||
"""Detect llms.txt files at documentation URLs"""
|
||||
|
||||
VARIANTS = [
|
||||
('llms-full.txt', 'full'),
|
||||
('llms.txt', 'standard'),
|
||||
('llms-small.txt', 'small')
|
||||
]
|
||||
VARIANTS = [("llms-full.txt", "full"), ("llms.txt", "standard"), ("llms-small.txt", "small")]
|
||||
|
||||
def __init__(self, base_url: str):
|
||||
self.base_url = base_url.rstrip('/')
|
||||
self.base_url = base_url.rstrip("/")
|
||||
|
||||
def detect(self) -> Optional[Dict[str, str]]:
|
||||
def detect(self) -> dict[str, str] | None:
|
||||
"""
|
||||
Detect available llms.txt variant.
|
||||
|
||||
@@ -31,11 +28,11 @@ class LlmsTxtDetector:
|
||||
url = f"{root_url}/{filename}"
|
||||
|
||||
if self._check_url_exists(url):
|
||||
return {'url': url, 'variant': variant}
|
||||
return {"url": url, "variant": variant}
|
||||
|
||||
return None
|
||||
|
||||
def detect_all(self) -> List[Dict[str, str]]:
|
||||
def detect_all(self) -> list[dict[str, str]]:
|
||||
"""
|
||||
Detect all available llms.txt variants.
|
||||
|
||||
@@ -50,10 +47,7 @@ class LlmsTxtDetector:
|
||||
url = f"{root_url}/{filename}"
|
||||
|
||||
if self._check_url_exists(url):
|
||||
found_variants.append({
|
||||
'url': url,
|
||||
'variant': variant
|
||||
})
|
||||
found_variants.append({"url": url, "variant": variant})
|
||||
|
||||
return found_variants
|
||||
|
||||
|
||||
@@ -1,9 +1,11 @@
|
||||
"""ABOUTME: Downloads llms.txt files from documentation URLs with retry logic"""
|
||||
|
||||
"""ABOUTME: Validates markdown content and handles timeouts with exponential backoff"""
|
||||
|
||||
import requests
|
||||
import time
|
||||
from typing import Optional
|
||||
|
||||
import requests
|
||||
|
||||
|
||||
class LlmsTxtDownloader:
|
||||
"""Download llms.txt content from URLs with retry logic"""
|
||||
@@ -27,12 +29,13 @@ class LlmsTxtDownloader:
|
||||
"""
|
||||
# Extract filename from URL
|
||||
from urllib.parse import urlparse
|
||||
|
||||
parsed = urlparse(self.url)
|
||||
filename = parsed.path.split('/')[-1]
|
||||
filename = parsed.path.split("/")[-1]
|
||||
|
||||
# Replace .txt with .md
|
||||
if filename.endswith('.txt'):
|
||||
filename = filename[:-4] + '.md'
|
||||
if filename.endswith(".txt"):
|
||||
filename = filename[:-4] + ".md"
|
||||
|
||||
return filename
|
||||
|
||||
@@ -46,37 +49,31 @@ class LlmsTxtDownloader:
|
||||
# First, reject HTML content (common redirect trap)
|
||||
content_start = content.strip()[:500].lower()
|
||||
html_indicators = [
|
||||
'<!doctype html',
|
||||
'<html',
|
||||
'<!doctype',
|
||||
'<head>',
|
||||
'<meta charset',
|
||||
"<!doctype html",
|
||||
"<html",
|
||||
"<!doctype",
|
||||
"<head>",
|
||||
"<meta charset",
|
||||
]
|
||||
if any(indicator in content_start for indicator in html_indicators):
|
||||
return False
|
||||
|
||||
# Then check for markdown patterns
|
||||
markdown_patterns = ['# ', '## ', '```', '- ', '* ', '`']
|
||||
markdown_patterns = ["# ", "## ", "```", "- ", "* ", "`"]
|
||||
return any(pattern in content for pattern in markdown_patterns)
|
||||
|
||||
def download(self) -> Optional[str]:
|
||||
def download(self) -> str | None:
|
||||
"""
|
||||
Download llms.txt content with retry logic.
|
||||
|
||||
Returns:
|
||||
String content or None if download fails
|
||||
"""
|
||||
headers = {
|
||||
'User-Agent': 'Skill-Seekers-llms.txt-Reader/1.0'
|
||||
}
|
||||
headers = {"User-Agent": "Skill-Seekers-llms.txt-Reader/1.0"}
|
||||
|
||||
for attempt in range(self.max_retries):
|
||||
try:
|
||||
response = requests.get(
|
||||
self.url,
|
||||
headers=headers,
|
||||
timeout=self.timeout
|
||||
)
|
||||
response = requests.get(self.url, headers=headers, timeout=self.timeout)
|
||||
response.raise_for_status()
|
||||
|
||||
content = response.text
|
||||
@@ -88,7 +85,7 @@ class LlmsTxtDownloader:
|
||||
|
||||
# Validate content looks like markdown
|
||||
if not self._is_markdown(content):
|
||||
print(f"⚠️ Content doesn't look like markdown")
|
||||
print("⚠️ Content doesn't look like markdown")
|
||||
return None
|
||||
|
||||
return content
|
||||
@@ -96,7 +93,7 @@ class LlmsTxtDownloader:
|
||||
except requests.RequestException as e:
|
||||
if attempt < self.max_retries - 1:
|
||||
# Calculate exponential backoff delay: 1s, 2s, 4s, etc.
|
||||
delay = 2 ** attempt
|
||||
delay = 2**attempt
|
||||
print(f"⚠️ Attempt {attempt + 1}/{self.max_retries} failed: {e}")
|
||||
print(f" Retrying in {delay}s...")
|
||||
time.sleep(delay)
|
||||
|
||||
@@ -1,10 +1,11 @@
|
||||
"""ABOUTME: Parses llms.txt markdown content into structured page data"""
|
||||
|
||||
"""ABOUTME: Extracts titles, content, code samples, and headings from markdown"""
|
||||
|
||||
import re
|
||||
from typing import List, Dict
|
||||
from urllib.parse import urljoin
|
||||
|
||||
|
||||
class LlmsTxtParser:
|
||||
"""Parse llms.txt markdown content into page structures"""
|
||||
|
||||
@@ -12,7 +13,7 @@ class LlmsTxtParser:
|
||||
self.content = content
|
||||
self.base_url = base_url
|
||||
|
||||
def extract_urls(self) -> List[str]:
|
||||
def extract_urls(self) -> list[str]:
|
||||
"""
|
||||
Extract all URLs from the llms.txt content.
|
||||
|
||||
@@ -33,13 +34,13 @@ class LlmsTxtParser:
|
||||
urls = set()
|
||||
|
||||
# Match markdown links: [text](url)
|
||||
md_links = re.findall(r'\[([^\]]*)\]\(([^)]+)\)', self.content)
|
||||
md_links = re.findall(r"\[([^\]]*)\]\(([^)]+)\)", self.content)
|
||||
for _, url in md_links:
|
||||
if url.startswith('http'):
|
||||
if url.startswith("http"):
|
||||
clean_url = self._clean_url(url)
|
||||
if clean_url:
|
||||
urls.add(clean_url)
|
||||
elif self.base_url and not url.startswith('#'):
|
||||
elif self.base_url and not url.startswith("#"):
|
||||
clean_url = self._clean_url(urljoin(self.base_url, url))
|
||||
if clean_url:
|
||||
urls.add(clean_url)
|
||||
@@ -48,7 +49,7 @@ class LlmsTxtParser:
|
||||
bare_urls = re.findall(r'https?://[^\s\)\]<>"\']+', self.content)
|
||||
for url in bare_urls:
|
||||
# Clean trailing punctuation
|
||||
url = url.rstrip('.,;:')
|
||||
url = url.rstrip(".,;:")
|
||||
clean_url = self._clean_url(url)
|
||||
if clean_url:
|
||||
urls.add(clean_url)
|
||||
@@ -79,16 +80,16 @@ class LlmsTxtParser:
|
||||
"""
|
||||
# Skip URLs with path after anchor (e.g., #section/index.html.md)
|
||||
# These are malformed and return duplicate HTML content
|
||||
if '#' in url:
|
||||
anchor_pos = url.index('#')
|
||||
after_anchor = url[anchor_pos + 1:]
|
||||
if "#" in url:
|
||||
anchor_pos = url.index("#")
|
||||
after_anchor = url[anchor_pos + 1 :]
|
||||
# If there's a path separator after anchor, it's invalid
|
||||
if '/' in after_anchor:
|
||||
if "/" in after_anchor:
|
||||
# Extract the base URL without the malformed anchor
|
||||
return url[:anchor_pos]
|
||||
return url
|
||||
|
||||
def parse(self) -> List[Dict]:
|
||||
def parse(self) -> list[dict]:
|
||||
"""
|
||||
Parse markdown content into page structures.
|
||||
|
||||
@@ -98,55 +99,50 @@ class LlmsTxtParser:
|
||||
pages = []
|
||||
|
||||
# Split by h1 headers (# Title)
|
||||
sections = re.split(r'\n# ', self.content)
|
||||
sections = re.split(r"\n# ", self.content)
|
||||
|
||||
for section in sections:
|
||||
if not section.strip():
|
||||
continue
|
||||
|
||||
# First line is title
|
||||
lines = section.split('\n')
|
||||
title = lines[0].strip('#').strip()
|
||||
lines = section.split("\n")
|
||||
title = lines[0].strip("#").strip()
|
||||
|
||||
# Parse content
|
||||
page = self._parse_section('\n'.join(lines[1:]), title)
|
||||
page = self._parse_section("\n".join(lines[1:]), title)
|
||||
pages.append(page)
|
||||
|
||||
return pages
|
||||
|
||||
def _parse_section(self, content: str, title: str) -> Dict:
|
||||
def _parse_section(self, content: str, title: str) -> dict:
|
||||
"""Parse a single section into page structure"""
|
||||
page = {
|
||||
'title': title,
|
||||
'content': '',
|
||||
'code_samples': [],
|
||||
'headings': [],
|
||||
'url': f'llms-txt#{title.lower().replace(" ", "-")}',
|
||||
'links': []
|
||||
"title": title,
|
||||
"content": "",
|
||||
"code_samples": [],
|
||||
"headings": [],
|
||||
"url": f"llms-txt#{title.lower().replace(' ', '-')}",
|
||||
"links": [],
|
||||
}
|
||||
|
||||
# Extract code blocks
|
||||
code_blocks = re.findall(r'```(\w+)?\n(.*?)```', content, re.DOTALL)
|
||||
code_blocks = re.findall(r"```(\w+)?\n(.*?)```", content, re.DOTALL)
|
||||
for lang, code in code_blocks:
|
||||
page['code_samples'].append({
|
||||
'code': code.strip(),
|
||||
'language': lang or 'unknown'
|
||||
})
|
||||
page["code_samples"].append({"code": code.strip(), "language": lang or "unknown"})
|
||||
|
||||
# Extract h2/h3 headings
|
||||
headings = re.findall(r'^(#{2,3})\s+(.+)$', content, re.MULTILINE)
|
||||
headings = re.findall(r"^(#{2,3})\s+(.+)$", content, re.MULTILINE)
|
||||
for level_markers, text in headings:
|
||||
page['headings'].append({
|
||||
'level': f'h{len(level_markers)}',
|
||||
'text': text.strip(),
|
||||
'id': text.lower().replace(' ', '-')
|
||||
})
|
||||
page["headings"].append(
|
||||
{"level": f"h{len(level_markers)}", "text": text.strip(), "id": text.lower().replace(" ", "-")}
|
||||
)
|
||||
|
||||
# Remove code blocks from content for plain text
|
||||
content_no_code = re.sub(r'```.*?```', '', content, flags=re.DOTALL)
|
||||
content_no_code = re.sub(r"```.*?```", "", content, flags=re.DOTALL)
|
||||
|
||||
# Extract paragraphs
|
||||
paragraphs = [p.strip() for p in content_no_code.split('\n\n') if len(p.strip()) > 20]
|
||||
page['content'] = '\n\n'.join(paragraphs)
|
||||
paragraphs = [p.strip() for p in content_no_code.split("\n\n") if len(p.strip()) > 20]
|
||||
page["content"] = "\n\n".join(paragraphs)
|
||||
|
||||
return page
|
||||
|
||||
@@ -31,9 +31,8 @@ Examples:
|
||||
skill-seekers install-agent output/react/ --agent cursor
|
||||
"""
|
||||
|
||||
import sys
|
||||
import argparse
|
||||
from typing import List, Optional
|
||||
import sys
|
||||
|
||||
|
||||
def create_parser() -> argparse.ArgumentParser:
|
||||
@@ -61,54 +60,27 @@ Examples:
|
||||
skill-seekers upload output/react.zip
|
||||
|
||||
For more information: https://github.com/yusufkaraaslan/Skill_Seekers
|
||||
"""
|
||||
""",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--version",
|
||||
action="version",
|
||||
version="%(prog)s 2.7.0"
|
||||
)
|
||||
parser.add_argument("--version", action="version", version="%(prog)s 2.7.0")
|
||||
|
||||
subparsers = parser.add_subparsers(
|
||||
dest="command",
|
||||
title="commands",
|
||||
description="Available Skill Seekers commands",
|
||||
help="Command to run"
|
||||
dest="command", title="commands", description="Available Skill Seekers commands", help="Command to run"
|
||||
)
|
||||
|
||||
# === config subcommand ===
|
||||
config_parser = subparsers.add_parser(
|
||||
"config",
|
||||
help="Configure GitHub tokens, API keys, and settings",
|
||||
description="Interactive configuration wizard"
|
||||
)
|
||||
config_parser.add_argument(
|
||||
"--github",
|
||||
action="store_true",
|
||||
help="Go directly to GitHub token setup"
|
||||
)
|
||||
config_parser.add_argument(
|
||||
"--api-keys",
|
||||
action="store_true",
|
||||
help="Go directly to API keys setup"
|
||||
)
|
||||
config_parser.add_argument(
|
||||
"--show",
|
||||
action="store_true",
|
||||
help="Show current configuration and exit"
|
||||
)
|
||||
config_parser.add_argument(
|
||||
"--test",
|
||||
action="store_true",
|
||||
help="Test connections and exit"
|
||||
"config", help="Configure GitHub tokens, API keys, and settings", description="Interactive configuration wizard"
|
||||
)
|
||||
config_parser.add_argument("--github", action="store_true", help="Go directly to GitHub token setup")
|
||||
config_parser.add_argument("--api-keys", action="store_true", help="Go directly to API keys setup")
|
||||
config_parser.add_argument("--show", action="store_true", help="Show current configuration and exit")
|
||||
config_parser.add_argument("--test", action="store_true", help="Test connections and exit")
|
||||
|
||||
# === scrape subcommand ===
|
||||
scrape_parser = subparsers.add_parser(
|
||||
"scrape",
|
||||
help="Scrape documentation website",
|
||||
description="Scrape documentation website and generate skill"
|
||||
"scrape", help="Scrape documentation website", description="Scrape documentation website and generate skill"
|
||||
)
|
||||
scrape_parser.add_argument("--config", help="Config JSON file")
|
||||
scrape_parser.add_argument("--name", help="Skill name")
|
||||
@@ -123,9 +95,7 @@ For more information: https://github.com/yusufkaraaslan/Skill_Seekers
|
||||
|
||||
# === github subcommand ===
|
||||
github_parser = subparsers.add_parser(
|
||||
"github",
|
||||
help="Scrape GitHub repository",
|
||||
description="Scrape GitHub repository and generate skill"
|
||||
"github", help="Scrape GitHub repository", description="Scrape GitHub repository and generate skill"
|
||||
)
|
||||
github_parser.add_argument("--config", help="Config JSON file")
|
||||
github_parser.add_argument("--repo", help="GitHub repo (owner/repo)")
|
||||
@@ -134,14 +104,14 @@ For more information: https://github.com/yusufkaraaslan/Skill_Seekers
|
||||
github_parser.add_argument("--enhance", action="store_true", help="AI enhancement (API)")
|
||||
github_parser.add_argument("--enhance-local", action="store_true", help="AI enhancement (local)")
|
||||
github_parser.add_argument("--api-key", type=str, help="Anthropic API key for --enhance")
|
||||
github_parser.add_argument("--non-interactive", action="store_true", help="Non-interactive mode (fail fast on rate limits)")
|
||||
github_parser.add_argument(
|
||||
"--non-interactive", action="store_true", help="Non-interactive mode (fail fast on rate limits)"
|
||||
)
|
||||
github_parser.add_argument("--profile", type=str, help="GitHub profile name from config")
|
||||
|
||||
# === pdf subcommand ===
|
||||
pdf_parser = subparsers.add_parser(
|
||||
"pdf",
|
||||
help="Extract from PDF file",
|
||||
description="Extract content from PDF and generate skill"
|
||||
"pdf", help="Extract from PDF file", description="Extract content from PDF and generate skill"
|
||||
)
|
||||
pdf_parser.add_argument("--config", help="Config JSON file")
|
||||
pdf_parser.add_argument("--pdf", help="PDF file path")
|
||||
@@ -153,7 +123,7 @@ For more information: https://github.com/yusufkaraaslan/Skill_Seekers
|
||||
unified_parser = subparsers.add_parser(
|
||||
"unified",
|
||||
help="Multi-source scraping (docs + GitHub + PDF)",
|
||||
description="Combine multiple sources into one skill"
|
||||
description="Combine multiple sources into one skill",
|
||||
)
|
||||
unified_parser.add_argument("--config", required=True, help="Unified config JSON file")
|
||||
unified_parser.add_argument("--merge-mode", help="Merge mode (rule-based, claude-enhanced)")
|
||||
@@ -163,7 +133,7 @@ For more information: https://github.com/yusufkaraaslan/Skill_Seekers
|
||||
enhance_parser = subparsers.add_parser(
|
||||
"enhance",
|
||||
help="AI-powered enhancement (local, no API key)",
|
||||
description="Enhance SKILL.md using Claude Code (local)"
|
||||
description="Enhance SKILL.md using Claude Code (local)",
|
||||
)
|
||||
enhance_parser.add_argument("skill_directory", help="Skill directory path")
|
||||
enhance_parser.add_argument("--background", action="store_true", help="Run in background")
|
||||
@@ -175,7 +145,7 @@ For more information: https://github.com/yusufkaraaslan/Skill_Seekers
|
||||
enhance_status_parser = subparsers.add_parser(
|
||||
"enhance-status",
|
||||
help="Check enhancement status (for background/daemon modes)",
|
||||
description="Monitor background enhancement processes"
|
||||
description="Monitor background enhancement processes",
|
||||
)
|
||||
enhance_status_parser.add_argument("skill_directory", help="Skill directory path")
|
||||
enhance_status_parser.add_argument("--watch", "-w", action="store_true", help="Watch in real-time")
|
||||
@@ -184,9 +154,7 @@ For more information: https://github.com/yusufkaraaslan/Skill_Seekers
|
||||
|
||||
# === package subcommand ===
|
||||
package_parser = subparsers.add_parser(
|
||||
"package",
|
||||
help="Package skill into .zip file",
|
||||
description="Package skill directory into uploadable .zip"
|
||||
"package", help="Package skill into .zip file", description="Package skill directory into uploadable .zip"
|
||||
)
|
||||
package_parser.add_argument("skill_directory", help="Skill directory path")
|
||||
package_parser.add_argument("--no-open", action="store_true", help="Don't open output folder")
|
||||
@@ -194,9 +162,7 @@ For more information: https://github.com/yusufkaraaslan/Skill_Seekers
|
||||
|
||||
# === upload subcommand ===
|
||||
upload_parser = subparsers.add_parser(
|
||||
"upload",
|
||||
help="Upload skill to Claude",
|
||||
description="Upload .zip file to Claude via Anthropic API"
|
||||
"upload", help="Upload skill to Claude", description="Upload .zip file to Claude via Anthropic API"
|
||||
)
|
||||
upload_parser.add_argument("zip_file", help=".zip file to upload")
|
||||
upload_parser.add_argument("--api-key", help="Anthropic API key")
|
||||
@@ -205,7 +171,7 @@ For more information: https://github.com/yusufkaraaslan/Skill_Seekers
|
||||
estimate_parser = subparsers.add_parser(
|
||||
"estimate",
|
||||
help="Estimate page count before scraping",
|
||||
description="Estimate total pages for documentation scraping"
|
||||
description="Estimate total pages for documentation scraping",
|
||||
)
|
||||
estimate_parser.add_argument("config", nargs="?", help="Config JSON file")
|
||||
estimate_parser.add_argument("--all", action="store_true", help="List all available configs")
|
||||
@@ -215,128 +181,63 @@ For more information: https://github.com/yusufkaraaslan/Skill_Seekers
|
||||
test_examples_parser = subparsers.add_parser(
|
||||
"extract-test-examples",
|
||||
help="Extract usage examples from test files",
|
||||
description="Analyze test files to extract real API usage patterns"
|
||||
description="Analyze test files to extract real API usage patterns",
|
||||
)
|
||||
test_examples_parser.add_argument("directory", nargs="?", help="Directory containing test files")
|
||||
test_examples_parser.add_argument("--file", help="Single test file to analyze")
|
||||
test_examples_parser.add_argument("--language", help="Filter by programming language (python, javascript, etc.)")
|
||||
test_examples_parser.add_argument(
|
||||
"--min-confidence", type=float, default=0.5, help="Minimum confidence threshold (0.0-1.0, default: 0.5)"
|
||||
)
|
||||
test_examples_parser.add_argument(
|
||||
"directory",
|
||||
nargs="?",
|
||||
help="Directory containing test files"
|
||||
)
|
||||
test_examples_parser.add_argument(
|
||||
"--file",
|
||||
help="Single test file to analyze"
|
||||
)
|
||||
test_examples_parser.add_argument(
|
||||
"--language",
|
||||
help="Filter by programming language (python, javascript, etc.)"
|
||||
)
|
||||
test_examples_parser.add_argument(
|
||||
"--min-confidence",
|
||||
type=float,
|
||||
default=0.5,
|
||||
help="Minimum confidence threshold (0.0-1.0, default: 0.5)"
|
||||
)
|
||||
test_examples_parser.add_argument(
|
||||
"--max-per-file",
|
||||
type=int,
|
||||
default=10,
|
||||
help="Maximum examples per file (default: 10)"
|
||||
)
|
||||
test_examples_parser.add_argument(
|
||||
"--json",
|
||||
action="store_true",
|
||||
help="Output JSON format"
|
||||
)
|
||||
test_examples_parser.add_argument(
|
||||
"--markdown",
|
||||
action="store_true",
|
||||
help="Output Markdown format"
|
||||
"--max-per-file", type=int, default=10, help="Maximum examples per file (default: 10)"
|
||||
)
|
||||
test_examples_parser.add_argument("--json", action="store_true", help="Output JSON format")
|
||||
test_examples_parser.add_argument("--markdown", action="store_true", help="Output Markdown format")
|
||||
|
||||
# === install-agent subcommand ===
|
||||
install_agent_parser = subparsers.add_parser(
|
||||
"install-agent",
|
||||
help="Install skill to AI agent directories",
|
||||
description="Copy skill to agent-specific installation directories"
|
||||
description="Copy skill to agent-specific installation directories",
|
||||
)
|
||||
install_agent_parser.add_argument("skill_directory", help="Skill directory path (e.g., output/react/)")
|
||||
install_agent_parser.add_argument(
|
||||
"--agent", required=True, help="Agent name (claude, cursor, vscode, amp, goose, opencode, all)"
|
||||
)
|
||||
install_agent_parser.add_argument(
|
||||
"skill_directory",
|
||||
help="Skill directory path (e.g., output/react/)"
|
||||
"--force", action="store_true", help="Overwrite existing installation without asking"
|
||||
)
|
||||
install_agent_parser.add_argument(
|
||||
"--agent",
|
||||
required=True,
|
||||
help="Agent name (claude, cursor, vscode, amp, goose, opencode, all)"
|
||||
)
|
||||
install_agent_parser.add_argument(
|
||||
"--force",
|
||||
action="store_true",
|
||||
help="Overwrite existing installation without asking"
|
||||
)
|
||||
install_agent_parser.add_argument(
|
||||
"--dry-run",
|
||||
action="store_true",
|
||||
help="Preview installation without making changes"
|
||||
"--dry-run", action="store_true", help="Preview installation without making changes"
|
||||
)
|
||||
|
||||
# === install subcommand ===
|
||||
install_parser = subparsers.add_parser(
|
||||
"install",
|
||||
help="Complete workflow: fetch → scrape → enhance → package → upload",
|
||||
description="One-command skill installation (AI enhancement MANDATORY)"
|
||||
description="One-command skill installation (AI enhancement MANDATORY)",
|
||||
)
|
||||
install_parser.add_argument(
|
||||
"--config",
|
||||
required=True,
|
||||
help="Config name (e.g., 'react') or path (e.g., 'configs/custom.json')"
|
||||
)
|
||||
install_parser.add_argument(
|
||||
"--destination",
|
||||
default="output",
|
||||
help="Output directory (default: output/)"
|
||||
)
|
||||
install_parser.add_argument(
|
||||
"--no-upload",
|
||||
action="store_true",
|
||||
help="Skip automatic upload to Claude"
|
||||
)
|
||||
install_parser.add_argument(
|
||||
"--unlimited",
|
||||
action="store_true",
|
||||
help="Remove page limits during scraping"
|
||||
)
|
||||
install_parser.add_argument(
|
||||
"--dry-run",
|
||||
action="store_true",
|
||||
help="Preview workflow without executing"
|
||||
"--config", required=True, help="Config name (e.g., 'react') or path (e.g., 'configs/custom.json')"
|
||||
)
|
||||
install_parser.add_argument("--destination", default="output", help="Output directory (default: output/)")
|
||||
install_parser.add_argument("--no-upload", action="store_true", help="Skip automatic upload to Claude")
|
||||
install_parser.add_argument("--unlimited", action="store_true", help="Remove page limits during scraping")
|
||||
install_parser.add_argument("--dry-run", action="store_true", help="Preview workflow without executing")
|
||||
|
||||
# === resume subcommand ===
|
||||
resume_parser = subparsers.add_parser(
|
||||
"resume",
|
||||
help="Resume interrupted scraping job",
|
||||
description="Continue from saved progress checkpoint"
|
||||
)
|
||||
resume_parser.add_argument(
|
||||
"job_id",
|
||||
nargs="?",
|
||||
help="Job ID to resume (or use --list to see available jobs)"
|
||||
)
|
||||
resume_parser.add_argument(
|
||||
"--list",
|
||||
action="store_true",
|
||||
help="List all resumable jobs"
|
||||
)
|
||||
resume_parser.add_argument(
|
||||
"--clean",
|
||||
action="store_true",
|
||||
help="Clean up old progress files"
|
||||
"resume", help="Resume interrupted scraping job", description="Continue from saved progress checkpoint"
|
||||
)
|
||||
resume_parser.add_argument("job_id", nargs="?", help="Job ID to resume (or use --list to see available jobs)")
|
||||
resume_parser.add_argument("--list", action="store_true", help="List all resumable jobs")
|
||||
resume_parser.add_argument("--clean", action="store_true", help="Clean up old progress files")
|
||||
|
||||
return parser
|
||||
|
||||
|
||||
def main(argv: Optional[List[str]] = None) -> int:
|
||||
def main(argv: list[str] | None = None) -> int:
|
||||
"""Main entry point for the unified CLI.
|
||||
|
||||
Args:
|
||||
@@ -356,6 +257,7 @@ def main(argv: Optional[List[str]] = None) -> int:
|
||||
try:
|
||||
if args.command == "config":
|
||||
from skill_seekers.cli.config_command import main as config_main
|
||||
|
||||
sys.argv = ["config_command.py"]
|
||||
if args.github:
|
||||
sys.argv.append("--github")
|
||||
@@ -369,6 +271,7 @@ def main(argv: Optional[List[str]] = None) -> int:
|
||||
|
||||
elif args.command == "scrape":
|
||||
from skill_seekers.cli.doc_scraper import main as scrape_main
|
||||
|
||||
# Convert args namespace to sys.argv format for doc_scraper
|
||||
sys.argv = ["doc_scraper.py"]
|
||||
if args.config:
|
||||
@@ -395,6 +298,7 @@ def main(argv: Optional[List[str]] = None) -> int:
|
||||
|
||||
elif args.command == "github":
|
||||
from skill_seekers.cli.github_scraper import main as github_main
|
||||
|
||||
sys.argv = ["github_scraper.py"]
|
||||
if args.config:
|
||||
sys.argv.extend(["--config", args.config])
|
||||
@@ -418,6 +322,7 @@ def main(argv: Optional[List[str]] = None) -> int:
|
||||
|
||||
elif args.command == "pdf":
|
||||
from skill_seekers.cli.pdf_scraper import main as pdf_main
|
||||
|
||||
sys.argv = ["pdf_scraper.py"]
|
||||
if args.config:
|
||||
sys.argv.extend(["--config", args.config])
|
||||
@@ -433,6 +338,7 @@ def main(argv: Optional[List[str]] = None) -> int:
|
||||
|
||||
elif args.command == "unified":
|
||||
from skill_seekers.cli.unified_scraper import main as unified_main
|
||||
|
||||
sys.argv = ["unified_scraper.py", "--config", args.config]
|
||||
if args.merge_mode:
|
||||
sys.argv.extend(["--merge-mode", args.merge_mode])
|
||||
@@ -442,6 +348,7 @@ def main(argv: Optional[List[str]] = None) -> int:
|
||||
|
||||
elif args.command == "enhance":
|
||||
from skill_seekers.cli.enhance_skill_local import main as enhance_main
|
||||
|
||||
sys.argv = ["enhance_skill_local.py", args.skill_directory]
|
||||
if args.background:
|
||||
sys.argv.append("--background")
|
||||
@@ -455,6 +362,7 @@ def main(argv: Optional[List[str]] = None) -> int:
|
||||
|
||||
elif args.command == "enhance-status":
|
||||
from skill_seekers.cli.enhance_status import main as enhance_status_main
|
||||
|
||||
sys.argv = ["enhance_status.py", args.skill_directory]
|
||||
if args.watch:
|
||||
sys.argv.append("--watch")
|
||||
@@ -466,6 +374,7 @@ def main(argv: Optional[List[str]] = None) -> int:
|
||||
|
||||
elif args.command == "package":
|
||||
from skill_seekers.cli.package_skill import main as package_main
|
||||
|
||||
sys.argv = ["package_skill.py", args.skill_directory]
|
||||
if args.no_open:
|
||||
sys.argv.append("--no-open")
|
||||
@@ -475,6 +384,7 @@ def main(argv: Optional[List[str]] = None) -> int:
|
||||
|
||||
elif args.command == "upload":
|
||||
from skill_seekers.cli.upload_skill import main as upload_main
|
||||
|
||||
sys.argv = ["upload_skill.py", args.zip_file]
|
||||
if args.api_key:
|
||||
sys.argv.extend(["--api-key", args.api_key])
|
||||
@@ -482,6 +392,7 @@ def main(argv: Optional[List[str]] = None) -> int:
|
||||
|
||||
elif args.command == "estimate":
|
||||
from skill_seekers.cli.estimate_pages import main as estimate_main
|
||||
|
||||
sys.argv = ["estimate_pages.py"]
|
||||
if args.all:
|
||||
sys.argv.append("--all")
|
||||
@@ -493,6 +404,7 @@ def main(argv: Optional[List[str]] = None) -> int:
|
||||
|
||||
elif args.command == "extract-test-examples":
|
||||
from skill_seekers.cli.test_example_extractor import main as test_examples_main
|
||||
|
||||
sys.argv = ["test_example_extractor.py"]
|
||||
if args.directory:
|
||||
sys.argv.append(args.directory)
|
||||
@@ -512,6 +424,7 @@ def main(argv: Optional[List[str]] = None) -> int:
|
||||
|
||||
elif args.command == "install-agent":
|
||||
from skill_seekers.cli.install_agent import main as install_agent_main
|
||||
|
||||
sys.argv = ["install_agent.py", args.skill_directory, "--agent", args.agent]
|
||||
if args.force:
|
||||
sys.argv.append("--force")
|
||||
@@ -521,6 +434,7 @@ def main(argv: Optional[List[str]] = None) -> int:
|
||||
|
||||
elif args.command == "install":
|
||||
from skill_seekers.cli.install_skill import main as install_main
|
||||
|
||||
sys.argv = ["install_skill.py"]
|
||||
if args.config:
|
||||
sys.argv.extend(["--config", args.config])
|
||||
@@ -536,6 +450,7 @@ def main(argv: Optional[List[str]] = None) -> int:
|
||||
|
||||
elif args.command == "resume":
|
||||
from skill_seekers.cli.resume_command import main as resume_main
|
||||
|
||||
sys.argv = ["resume_command.py"]
|
||||
if args.job_id:
|
||||
sys.argv.append(args.job_id)
|
||||
|
||||
@@ -24,13 +24,13 @@ class MarkdownCleaner:
|
||||
Cleaned markdown with HTML tags removed
|
||||
"""
|
||||
# Remove HTML comments
|
||||
text = re.sub(r'<!--.*?-->', '', text, flags=re.DOTALL)
|
||||
text = re.sub(r"<!--.*?-->", "", text, flags=re.DOTALL)
|
||||
|
||||
# Remove HTML tags but keep content
|
||||
text = re.sub(r'<[^>]+>', '', text)
|
||||
text = re.sub(r"<[^>]+>", "", text)
|
||||
|
||||
# Remove empty lines created by HTML removal
|
||||
text = re.sub(r'\n\s*\n\s*\n+', '\n\n', text)
|
||||
text = re.sub(r"\n\s*\n\s*\n+", "\n\n", text)
|
||||
|
||||
return text.strip()
|
||||
|
||||
@@ -58,7 +58,7 @@ class MarkdownCleaner:
|
||||
return text.strip()
|
||||
|
||||
# For longer text, extract smartly
|
||||
lines = text.split('\n')
|
||||
lines = text.split("\n")
|
||||
content_lines = []
|
||||
char_count = 0
|
||||
section_count = 0
|
||||
@@ -66,11 +66,11 @@ class MarkdownCleaner:
|
||||
|
||||
for line in lines:
|
||||
# Check for code fence (```)
|
||||
if line.strip().startswith('```'):
|
||||
if line.strip().startswith("```"):
|
||||
in_code_block = not in_code_block
|
||||
|
||||
# Check for any heading (H1-H6)
|
||||
is_heading = re.match(r'^#{1,6}\s+', line)
|
||||
is_heading = re.match(r"^#{1,6}\s+", line)
|
||||
|
||||
if is_heading:
|
||||
section_count += 1
|
||||
@@ -91,7 +91,7 @@ class MarkdownCleaner:
|
||||
if char_count >= max_chars and not in_code_block:
|
||||
break
|
||||
|
||||
result = '\n'.join(content_lines).strip()
|
||||
result = "\n".join(content_lines).strip()
|
||||
|
||||
# If we truncated, ensure we don't break markdown (only if not in code block)
|
||||
if char_count >= max_chars and not in_code_block:
|
||||
@@ -119,17 +119,13 @@ class MarkdownCleaner:
|
||||
truncated = text[:max_chars]
|
||||
|
||||
# Look for last period, exclamation, or question mark
|
||||
last_sentence = max(
|
||||
truncated.rfind('. '),
|
||||
truncated.rfind('! '),
|
||||
truncated.rfind('? ')
|
||||
)
|
||||
last_sentence = max(truncated.rfind(". "), truncated.rfind("! "), truncated.rfind("? "))
|
||||
|
||||
if last_sentence > max_chars // 2: # At least half the content
|
||||
return truncated[:last_sentence + 1]
|
||||
return truncated[: last_sentence + 1]
|
||||
|
||||
# Fall back to word boundary
|
||||
last_space = truncated.rfind(' ')
|
||||
last_space = truncated.rfind(" ")
|
||||
if last_space > 0:
|
||||
return truncated[:last_space] + "..."
|
||||
|
||||
|
||||
@@ -17,16 +17,16 @@ Multi-layer architecture (Phase 3):
|
||||
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import subprocess
|
||||
import tempfile
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Any, Optional
|
||||
from typing import Any, Optional
|
||||
|
||||
from .conflict_detector import Conflict, ConflictDetector
|
||||
|
||||
# Import three-stream data classes (Phase 1)
|
||||
try:
|
||||
from .github_fetcher import ThreeStreamData, CodeStream, DocsStream, InsightsStream
|
||||
from .github_fetcher import CodeStream, DocsStream, InsightsStream, ThreeStreamData
|
||||
except ImportError:
|
||||
# Fallback if github_fetcher not available
|
||||
ThreeStreamData = None
|
||||
@@ -38,11 +38,7 @@ logging.basicConfig(level=logging.INFO)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def categorize_issues_by_topic(
|
||||
problems: List[Dict],
|
||||
solutions: List[Dict],
|
||||
topics: List[str]
|
||||
) -> Dict[str, List[Dict]]:
|
||||
def categorize_issues_by_topic(problems: list[dict], solutions: list[dict], topics: list[str]) -> dict[str, list[dict]]:
|
||||
"""
|
||||
Categorize GitHub issues by topic keywords.
|
||||
|
||||
@@ -55,14 +51,14 @@ def categorize_issues_by_topic(
|
||||
Dict mapping topic to relevant issues
|
||||
"""
|
||||
categorized = {topic: [] for topic in topics}
|
||||
categorized['other'] = []
|
||||
categorized["other"] = []
|
||||
|
||||
all_issues = problems + solutions
|
||||
|
||||
for issue in all_issues:
|
||||
# Get searchable text
|
||||
title = issue.get('title', '').lower()
|
||||
labels = [label.lower() for label in issue.get('labels', [])]
|
||||
title = issue.get("title", "").lower()
|
||||
labels = [label.lower() for label in issue.get("labels", [])]
|
||||
text = f"{title} {' '.join(labels)}"
|
||||
|
||||
# Find best matching topic
|
||||
@@ -82,18 +78,15 @@ def categorize_issues_by_topic(
|
||||
if matched_topic and max_matches > 0:
|
||||
categorized[matched_topic].append(issue)
|
||||
else:
|
||||
categorized['other'].append(issue)
|
||||
categorized["other"].append(issue)
|
||||
|
||||
# Remove empty categories
|
||||
return {k: v for k, v in categorized.items() if v}
|
||||
|
||||
|
||||
def generate_hybrid_content(
|
||||
api_data: Dict,
|
||||
github_docs: Optional[Dict],
|
||||
github_insights: Optional[Dict],
|
||||
conflicts: List[Conflict]
|
||||
) -> Dict[str, Any]:
|
||||
api_data: dict, github_docs: dict | None, github_insights: dict | None, conflicts: list[Conflict]
|
||||
) -> dict[str, Any]:
|
||||
"""
|
||||
Generate hybrid content combining API data with GitHub context.
|
||||
|
||||
@@ -106,76 +99,67 @@ def generate_hybrid_content(
|
||||
Returns:
|
||||
Hybrid content dict with enriched API reference
|
||||
"""
|
||||
hybrid = {
|
||||
'api_reference': api_data,
|
||||
'github_context': {}
|
||||
}
|
||||
hybrid = {"api_reference": api_data, "github_context": {}}
|
||||
|
||||
# Add GitHub documentation layer
|
||||
if github_docs:
|
||||
hybrid['github_context']['docs'] = {
|
||||
'readme': github_docs.get('readme'),
|
||||
'contributing': github_docs.get('contributing'),
|
||||
'docs_files_count': len(github_docs.get('docs_files', []))
|
||||
hybrid["github_context"]["docs"] = {
|
||||
"readme": github_docs.get("readme"),
|
||||
"contributing": github_docs.get("contributing"),
|
||||
"docs_files_count": len(github_docs.get("docs_files", [])),
|
||||
}
|
||||
|
||||
# Add GitHub insights layer
|
||||
if github_insights:
|
||||
metadata = github_insights.get('metadata', {})
|
||||
hybrid['github_context']['metadata'] = {
|
||||
'stars': metadata.get('stars', 0),
|
||||
'forks': metadata.get('forks', 0),
|
||||
'language': metadata.get('language', 'Unknown'),
|
||||
'description': metadata.get('description', '')
|
||||
metadata = github_insights.get("metadata", {})
|
||||
hybrid["github_context"]["metadata"] = {
|
||||
"stars": metadata.get("stars", 0),
|
||||
"forks": metadata.get("forks", 0),
|
||||
"language": metadata.get("language", "Unknown"),
|
||||
"description": metadata.get("description", ""),
|
||||
}
|
||||
|
||||
# Add issue insights
|
||||
common_problems = github_insights.get('common_problems', [])
|
||||
known_solutions = github_insights.get('known_solutions', [])
|
||||
common_problems = github_insights.get("common_problems", [])
|
||||
known_solutions = github_insights.get("known_solutions", [])
|
||||
|
||||
hybrid['github_context']['issues'] = {
|
||||
'common_problems_count': len(common_problems),
|
||||
'known_solutions_count': len(known_solutions),
|
||||
'top_problems': common_problems[:5], # Top 5 most-discussed
|
||||
'top_solutions': known_solutions[:5]
|
||||
hybrid["github_context"]["issues"] = {
|
||||
"common_problems_count": len(common_problems),
|
||||
"known_solutions_count": len(known_solutions),
|
||||
"top_problems": common_problems[:5], # Top 5 most-discussed
|
||||
"top_solutions": known_solutions[:5],
|
||||
}
|
||||
|
||||
hybrid['github_context']['top_labels'] = github_insights.get('top_labels', [])
|
||||
hybrid["github_context"]["top_labels"] = github_insights.get("top_labels", [])
|
||||
|
||||
# Add conflict summary
|
||||
hybrid['conflict_summary'] = {
|
||||
'total_conflicts': len(conflicts),
|
||||
'by_type': {},
|
||||
'by_severity': {}
|
||||
}
|
||||
hybrid["conflict_summary"] = {"total_conflicts": len(conflicts), "by_type": {}, "by_severity": {}}
|
||||
|
||||
for conflict in conflicts:
|
||||
# Count by type
|
||||
conflict_type = conflict.type
|
||||
hybrid['conflict_summary']['by_type'][conflict_type] = \
|
||||
hybrid['conflict_summary']['by_type'].get(conflict_type, 0) + 1
|
||||
hybrid["conflict_summary"]["by_type"][conflict_type] = (
|
||||
hybrid["conflict_summary"]["by_type"].get(conflict_type, 0) + 1
|
||||
)
|
||||
|
||||
# Count by severity
|
||||
severity = conflict.severity
|
||||
hybrid['conflict_summary']['by_severity'][severity] = \
|
||||
hybrid['conflict_summary']['by_severity'].get(severity, 0) + 1
|
||||
hybrid["conflict_summary"]["by_severity"][severity] = (
|
||||
hybrid["conflict_summary"]["by_severity"].get(severity, 0) + 1
|
||||
)
|
||||
|
||||
# Add GitHub issue links for relevant APIs
|
||||
if github_insights:
|
||||
hybrid['issue_links'] = _match_issues_to_apis(
|
||||
api_data.get('apis', {}),
|
||||
github_insights.get('common_problems', []),
|
||||
github_insights.get('known_solutions', [])
|
||||
hybrid["issue_links"] = _match_issues_to_apis(
|
||||
api_data.get("apis", {}),
|
||||
github_insights.get("common_problems", []),
|
||||
github_insights.get("known_solutions", []),
|
||||
)
|
||||
|
||||
return hybrid
|
||||
|
||||
|
||||
def _match_issues_to_apis(
|
||||
apis: Dict[str, Dict],
|
||||
problems: List[Dict],
|
||||
solutions: List[Dict]
|
||||
) -> Dict[str, List[Dict]]:
|
||||
def _match_issues_to_apis(apis: dict[str, dict], problems: list[dict], solutions: list[dict]) -> dict[str, list[dict]]:
|
||||
"""
|
||||
Match GitHub issues to specific APIs by keyword matching.
|
||||
|
||||
@@ -190,24 +174,26 @@ def _match_issues_to_apis(
|
||||
issue_links = {}
|
||||
all_issues = problems + solutions
|
||||
|
||||
for api_name in apis.keys():
|
||||
for api_name in apis:
|
||||
# Extract searchable keywords from API name
|
||||
api_keywords = api_name.lower().replace('_', ' ').split('.')
|
||||
api_keywords = api_name.lower().replace("_", " ").split(".")
|
||||
|
||||
matched_issues = []
|
||||
for issue in all_issues:
|
||||
title = issue.get('title', '').lower()
|
||||
labels = [label.lower() for label in issue.get('labels', [])]
|
||||
title = issue.get("title", "").lower()
|
||||
labels = [label.lower() for label in issue.get("labels", [])]
|
||||
text = f"{title} {' '.join(labels)}"
|
||||
|
||||
# Check if any API keyword appears in issue
|
||||
if any(keyword in text for keyword in api_keywords):
|
||||
matched_issues.append({
|
||||
'number': issue.get('number'),
|
||||
'title': issue.get('title'),
|
||||
'state': issue.get('state'),
|
||||
'comments': issue.get('comments')
|
||||
})
|
||||
matched_issues.append(
|
||||
{
|
||||
"number": issue.get("number"),
|
||||
"title": issue.get("title"),
|
||||
"state": issue.get("state"),
|
||||
"comments": issue.get("comments"),
|
||||
}
|
||||
)
|
||||
|
||||
if matched_issues:
|
||||
issue_links[api_name] = matched_issues
|
||||
@@ -232,11 +218,13 @@ class RuleBasedMerger:
|
||||
4. If conflict → Include both versions with [CONFLICT] tag, prefer code signature
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
docs_data: Dict,
|
||||
github_data: Dict,
|
||||
conflicts: List[Conflict],
|
||||
github_streams: Optional['ThreeStreamData'] = None):
|
||||
def __init__(
|
||||
self,
|
||||
docs_data: dict,
|
||||
github_data: dict,
|
||||
conflicts: list[Conflict],
|
||||
github_streams: Optional["ThreeStreamData"] = None,
|
||||
):
|
||||
"""
|
||||
Initialize rule-based merger with GitHub streams support.
|
||||
|
||||
@@ -266,21 +254,21 @@ class RuleBasedMerger:
|
||||
# Layer 3: GitHub docs
|
||||
if github_streams.docs_stream:
|
||||
self.github_docs = {
|
||||
'readme': github_streams.docs_stream.readme,
|
||||
'contributing': github_streams.docs_stream.contributing,
|
||||
'docs_files': github_streams.docs_stream.docs_files
|
||||
"readme": github_streams.docs_stream.readme,
|
||||
"contributing": github_streams.docs_stream.contributing,
|
||||
"docs_files": github_streams.docs_stream.docs_files,
|
||||
}
|
||||
|
||||
# Layer 4: GitHub insights
|
||||
if github_streams.insights_stream:
|
||||
self.github_insights = {
|
||||
'metadata': github_streams.insights_stream.metadata,
|
||||
'common_problems': github_streams.insights_stream.common_problems,
|
||||
'known_solutions': github_streams.insights_stream.known_solutions,
|
||||
'top_labels': github_streams.insights_stream.top_labels
|
||||
"metadata": github_streams.insights_stream.metadata,
|
||||
"common_problems": github_streams.insights_stream.common_problems,
|
||||
"known_solutions": github_streams.insights_stream.known_solutions,
|
||||
"top_labels": github_streams.insights_stream.top_labels,
|
||||
}
|
||||
|
||||
def merge_all(self) -> Dict[str, Any]:
|
||||
def merge_all(self) -> dict[str, Any]:
|
||||
"""
|
||||
Merge all APIs using rule-based logic with GitHub insights (Phase 3).
|
||||
|
||||
@@ -302,15 +290,15 @@ class RuleBasedMerger:
|
||||
|
||||
# Build base result
|
||||
merged_data = {
|
||||
'merge_mode': 'rule-based',
|
||||
'apis': merged_apis,
|
||||
'summary': {
|
||||
'total_apis': len(merged_apis),
|
||||
'docs_only': sum(1 for api in merged_apis.values() if api['status'] == 'docs_only'),
|
||||
'code_only': sum(1 for api in merged_apis.values() if api['status'] == 'code_only'),
|
||||
'matched': sum(1 for api in merged_apis.values() if api['status'] == 'matched'),
|
||||
'conflict': sum(1 for api in merged_apis.values() if api['status'] == 'conflict')
|
||||
}
|
||||
"merge_mode": "rule-based",
|
||||
"apis": merged_apis,
|
||||
"summary": {
|
||||
"total_apis": len(merged_apis),
|
||||
"docs_only": sum(1 for api in merged_apis.values() if api["status"] == "docs_only"),
|
||||
"code_only": sum(1 for api in merged_apis.values() if api["status"] == "code_only"),
|
||||
"matched": sum(1 for api in merged_apis.values() if api["status"] == "matched"),
|
||||
"conflict": sum(1 for api in merged_apis.values() if api["status"] == "conflict"),
|
||||
},
|
||||
}
|
||||
|
||||
# Generate hybrid content if GitHub streams available (Phase 3)
|
||||
@@ -320,20 +308,22 @@ class RuleBasedMerger:
|
||||
api_data=merged_data,
|
||||
github_docs=self.github_docs,
|
||||
github_insights=self.github_insights,
|
||||
conflicts=self.conflicts
|
||||
conflicts=self.conflicts,
|
||||
)
|
||||
|
||||
# Merge hybrid content into result
|
||||
merged_data['github_context'] = hybrid_content.get('github_context', {})
|
||||
merged_data['conflict_summary'] = hybrid_content.get('conflict_summary', {})
|
||||
merged_data['issue_links'] = hybrid_content.get('issue_links', {})
|
||||
merged_data["github_context"] = hybrid_content.get("github_context", {})
|
||||
merged_data["conflict_summary"] = hybrid_content.get("conflict_summary", {})
|
||||
merged_data["issue_links"] = hybrid_content.get("issue_links", {})
|
||||
|
||||
logger.info(f"Added GitHub context: {len(self.github_insights.get('common_problems', []))} problems, "
|
||||
f"{len(self.github_insights.get('known_solutions', []))} solutions")
|
||||
logger.info(
|
||||
f"Added GitHub context: {len(self.github_insights.get('common_problems', []))} problems, "
|
||||
f"{len(self.github_insights.get('known_solutions', []))} solutions"
|
||||
)
|
||||
|
||||
return merged_data
|
||||
|
||||
def _merge_single_api(self, api_name: str) -> Dict[str, Any]:
|
||||
def _merge_single_api(self, api_name: str) -> dict[str, Any]:
|
||||
"""
|
||||
Merge a single API using rules.
|
||||
|
||||
@@ -351,25 +341,27 @@ class RuleBasedMerger:
|
||||
if in_docs and not in_code:
|
||||
conflict = self.conflict_index.get(api_name)
|
||||
return {
|
||||
'name': api_name,
|
||||
'status': 'docs_only',
|
||||
'source': 'documentation',
|
||||
'data': self.docs_apis[api_name],
|
||||
'warning': 'This API is documented but not found in codebase',
|
||||
'conflict': conflict.__dict__ if conflict else None
|
||||
"name": api_name,
|
||||
"status": "docs_only",
|
||||
"source": "documentation",
|
||||
"data": self.docs_apis[api_name],
|
||||
"warning": "This API is documented but not found in codebase",
|
||||
"conflict": conflict.__dict__ if conflict else None,
|
||||
}
|
||||
|
||||
# Rule 2: Only in code
|
||||
if in_code and not in_docs:
|
||||
is_private = api_name.startswith('_')
|
||||
is_private = api_name.startswith("_")
|
||||
conflict = self.conflict_index.get(api_name)
|
||||
return {
|
||||
'name': api_name,
|
||||
'status': 'code_only',
|
||||
'source': 'code',
|
||||
'data': self.code_apis[api_name],
|
||||
'warning': 'This API exists in code but is not documented' if not is_private else 'Internal/private API',
|
||||
'conflict': conflict.__dict__ if conflict else None
|
||||
"name": api_name,
|
||||
"status": "code_only",
|
||||
"source": "code",
|
||||
"data": self.code_apis[api_name],
|
||||
"warning": "This API exists in code but is not documented"
|
||||
if not is_private
|
||||
else "Internal/private API",
|
||||
"conflict": conflict.__dict__ if conflict else None,
|
||||
}
|
||||
|
||||
# Both exist - check for conflicts
|
||||
@@ -379,32 +371,32 @@ class RuleBasedMerger:
|
||||
# Rule 3: Both match perfectly (no conflict)
|
||||
if not has_conflict:
|
||||
return {
|
||||
'name': api_name,
|
||||
'status': 'matched',
|
||||
'source': 'both',
|
||||
'docs_data': docs_info,
|
||||
'code_data': code_info,
|
||||
'merged_signature': self._create_merged_signature(code_info, docs_info),
|
||||
'merged_description': docs_info.get('docstring') or code_info.get('docstring')
|
||||
"name": api_name,
|
||||
"status": "matched",
|
||||
"source": "both",
|
||||
"docs_data": docs_info,
|
||||
"code_data": code_info,
|
||||
"merged_signature": self._create_merged_signature(code_info, docs_info),
|
||||
"merged_description": docs_info.get("docstring") or code_info.get("docstring"),
|
||||
}
|
||||
|
||||
# Rule 4: Conflict exists - prefer code signature, keep docs description
|
||||
conflict = self.conflict_index[api_name]
|
||||
|
||||
return {
|
||||
'name': api_name,
|
||||
'status': 'conflict',
|
||||
'source': 'both',
|
||||
'docs_data': docs_info,
|
||||
'code_data': code_info,
|
||||
'conflict': conflict.__dict__,
|
||||
'resolution': 'prefer_code_signature',
|
||||
'merged_signature': self._create_merged_signature(code_info, docs_info),
|
||||
'merged_description': docs_info.get('docstring') or code_info.get('docstring'),
|
||||
'warning': conflict.difference
|
||||
"name": api_name,
|
||||
"status": "conflict",
|
||||
"source": "both",
|
||||
"docs_data": docs_info,
|
||||
"code_data": code_info,
|
||||
"conflict": conflict.__dict__,
|
||||
"resolution": "prefer_code_signature",
|
||||
"merged_signature": self._create_merged_signature(code_info, docs_info),
|
||||
"merged_description": docs_info.get("docstring") or code_info.get("docstring"),
|
||||
"warning": conflict.difference,
|
||||
}
|
||||
|
||||
def _create_merged_signature(self, code_info: Dict, docs_info: Dict) -> str:
|
||||
def _create_merged_signature(self, code_info: dict, docs_info: dict) -> str:
|
||||
"""
|
||||
Create merged signature preferring code data.
|
||||
|
||||
@@ -415,17 +407,17 @@ class RuleBasedMerger:
|
||||
Returns:
|
||||
Merged signature string
|
||||
"""
|
||||
name = code_info.get('name', docs_info.get('name'))
|
||||
params = code_info.get('parameters', docs_info.get('parameters', []))
|
||||
return_type = code_info.get('return_type', docs_info.get('return_type'))
|
||||
name = code_info.get("name", docs_info.get("name"))
|
||||
params = code_info.get("parameters", docs_info.get("parameters", []))
|
||||
return_type = code_info.get("return_type", docs_info.get("return_type"))
|
||||
|
||||
# Build parameter string
|
||||
param_strs = []
|
||||
for param in params:
|
||||
param_str = param['name']
|
||||
if param.get('type_hint'):
|
||||
param_str = param["name"]
|
||||
if param.get("type_hint"):
|
||||
param_str += f": {param['type_hint']}"
|
||||
if param.get('default'):
|
||||
if param.get("default"):
|
||||
param_str += f" = {param['default']}"
|
||||
param_strs.append(param_str)
|
||||
|
||||
@@ -451,11 +443,13 @@ class ClaudeEnhancedMerger:
|
||||
- Layer 4: GitHub insights (issues)
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
docs_data: Dict,
|
||||
github_data: Dict,
|
||||
conflicts: List[Conflict],
|
||||
github_streams: Optional['ThreeStreamData'] = None):
|
||||
def __init__(
|
||||
self,
|
||||
docs_data: dict,
|
||||
github_data: dict,
|
||||
conflicts: list[Conflict],
|
||||
github_streams: Optional["ThreeStreamData"] = None,
|
||||
):
|
||||
"""
|
||||
Initialize Claude-enhanced merger with GitHub streams support.
|
||||
|
||||
@@ -473,7 +467,7 @@ class ClaudeEnhancedMerger:
|
||||
# First do rule-based merge as baseline
|
||||
self.rule_merger = RuleBasedMerger(docs_data, github_data, conflicts, github_streams)
|
||||
|
||||
def merge_all(self) -> Dict[str, Any]:
|
||||
def merge_all(self) -> dict[str, Any]:
|
||||
"""
|
||||
Merge all APIs using Claude enhancement.
|
||||
|
||||
@@ -510,7 +504,7 @@ class ClaudeEnhancedMerger:
|
||||
Returns:
|
||||
Path to workspace directory
|
||||
"""
|
||||
workspace = tempfile.mkdtemp(prefix='skill_merge_')
|
||||
workspace = tempfile.mkdtemp(prefix="skill_merge_")
|
||||
logger.info(f"Created merge workspace: {workspace}")
|
||||
|
||||
# Write context files for Claude
|
||||
@@ -522,26 +516,30 @@ class ClaudeEnhancedMerger:
|
||||
"""Write context files for Claude to analyze."""
|
||||
|
||||
# 1. Write conflicts summary
|
||||
conflicts_file = os.path.join(workspace, 'conflicts.json')
|
||||
with open(conflicts_file, 'w') as f:
|
||||
json.dump({
|
||||
'conflicts': [c.__dict__ for c in self.conflicts],
|
||||
'summary': {
|
||||
'total': len(self.conflicts),
|
||||
'by_type': self._count_by_field('type'),
|
||||
'by_severity': self._count_by_field('severity')
|
||||
}
|
||||
}, f, indent=2)
|
||||
conflicts_file = os.path.join(workspace, "conflicts.json")
|
||||
with open(conflicts_file, "w") as f:
|
||||
json.dump(
|
||||
{
|
||||
"conflicts": [c.__dict__ for c in self.conflicts],
|
||||
"summary": {
|
||||
"total": len(self.conflicts),
|
||||
"by_type": self._count_by_field("type"),
|
||||
"by_severity": self._count_by_field("severity"),
|
||||
},
|
||||
},
|
||||
f,
|
||||
indent=2,
|
||||
)
|
||||
|
||||
# 2. Write documentation APIs
|
||||
docs_apis_file = os.path.join(workspace, 'docs_apis.json')
|
||||
docs_apis_file = os.path.join(workspace, "docs_apis.json")
|
||||
detector = ConflictDetector(self.docs_data, self.github_data)
|
||||
with open(docs_apis_file, 'w') as f:
|
||||
with open(docs_apis_file, "w") as f:
|
||||
json.dump(detector.docs_apis, f, indent=2)
|
||||
|
||||
# 3. Write code APIs
|
||||
code_apis_file = os.path.join(workspace, 'code_apis.json')
|
||||
with open(code_apis_file, 'w') as f:
|
||||
code_apis_file = os.path.join(workspace, "code_apis.json")
|
||||
with open(code_apis_file, "w") as f:
|
||||
json.dump(detector.code_apis, f, indent=2)
|
||||
|
||||
# 4. Write merge instructions for Claude
|
||||
@@ -602,13 +600,13 @@ Create `merged_apis.json` with this structure:
|
||||
Take your time to analyze each conflict carefully. The goal is to create the most accurate and helpful API reference possible.
|
||||
"""
|
||||
|
||||
instructions_file = os.path.join(workspace, 'MERGE_INSTRUCTIONS.md')
|
||||
with open(instructions_file, 'w') as f:
|
||||
instructions_file = os.path.join(workspace, "MERGE_INSTRUCTIONS.md")
|
||||
with open(instructions_file, "w") as f:
|
||||
f.write(instructions)
|
||||
|
||||
logger.info(f"Wrote context files to {workspace}")
|
||||
|
||||
def _count_by_field(self, field: str) -> Dict[str, int]:
|
||||
def _count_by_field(self, field: str) -> dict[str, int]:
|
||||
"""Count conflicts by a specific field."""
|
||||
counts = {}
|
||||
for conflict in self.conflicts:
|
||||
@@ -623,7 +621,7 @@ Take your time to analyze each conflict carefully. The goal is to create the mos
|
||||
Similar to enhance_skill_local.py approach.
|
||||
"""
|
||||
# Create a script that Claude will execute
|
||||
script_path = os.path.join(workspace, 'merge_script.sh')
|
||||
script_path = os.path.join(workspace, "merge_script.sh")
|
||||
|
||||
script_content = f"""#!/bin/bash
|
||||
# Automatic merge script for Claude Code
|
||||
@@ -646,23 +644,18 @@ echo "When done, save merged_apis.json and close this terminal."
|
||||
read -p "Press Enter when merge is complete..."
|
||||
"""
|
||||
|
||||
with open(script_path, 'w') as f:
|
||||
with open(script_path, "w") as f:
|
||||
f.write(script_content)
|
||||
|
||||
os.chmod(script_path, 0o755)
|
||||
|
||||
# Open new terminal with Claude Code
|
||||
# Try different terminal emulators
|
||||
terminals = [
|
||||
['x-terminal-emulator', '-e'],
|
||||
['gnome-terminal', '--'],
|
||||
['xterm', '-e'],
|
||||
['konsole', '-e']
|
||||
]
|
||||
terminals = [["x-terminal-emulator", "-e"], ["gnome-terminal", "--"], ["xterm", "-e"], ["konsole", "-e"]]
|
||||
|
||||
for terminal_cmd in terminals:
|
||||
try:
|
||||
cmd = terminal_cmd + ['bash', script_path]
|
||||
cmd = terminal_cmd + ["bash", script_path]
|
||||
subprocess.Popen(cmd)
|
||||
logger.info(f"Opened terminal with {terminal_cmd[0]}")
|
||||
break
|
||||
@@ -670,12 +663,13 @@ read -p "Press Enter when merge is complete..."
|
||||
continue
|
||||
|
||||
# Wait for merge to complete
|
||||
merged_file = os.path.join(workspace, 'merged_apis.json')
|
||||
merged_file = os.path.join(workspace, "merged_apis.json")
|
||||
logger.info(f"Waiting for merged results at: {merged_file}")
|
||||
logger.info("Close the terminal when done to continue...")
|
||||
|
||||
# Poll for file existence
|
||||
import time
|
||||
|
||||
timeout = 3600 # 1 hour max
|
||||
elapsed = 0
|
||||
while not os.path.exists(merged_file) and elapsed < timeout:
|
||||
@@ -685,27 +679,26 @@ read -p "Press Enter when merge is complete..."
|
||||
if not os.path.exists(merged_file):
|
||||
raise TimeoutError("Claude merge timed out after 1 hour")
|
||||
|
||||
def _read_merged_results(self, workspace: str) -> Dict[str, Any]:
|
||||
def _read_merged_results(self, workspace: str) -> dict[str, Any]:
|
||||
"""Read merged results from workspace."""
|
||||
merged_file = os.path.join(workspace, 'merged_apis.json')
|
||||
merged_file = os.path.join(workspace, "merged_apis.json")
|
||||
|
||||
if not os.path.exists(merged_file):
|
||||
raise FileNotFoundError(f"Merged results not found: {merged_file}")
|
||||
|
||||
with open(merged_file, 'r') as f:
|
||||
with open(merged_file) as f:
|
||||
merged_data = json.load(f)
|
||||
|
||||
return {
|
||||
'merge_mode': 'claude-enhanced',
|
||||
**merged_data
|
||||
}
|
||||
return {"merge_mode": "claude-enhanced", **merged_data}
|
||||
|
||||
|
||||
def merge_sources(docs_data_path: str,
|
||||
github_data_path: str,
|
||||
output_path: str,
|
||||
mode: str = 'rule-based',
|
||||
github_streams: Optional['ThreeStreamData'] = None) -> Dict[str, Any]:
|
||||
def merge_sources(
|
||||
docs_data_path: str,
|
||||
github_data_path: str,
|
||||
output_path: str,
|
||||
mode: str = "rule-based",
|
||||
github_streams: Optional["ThreeStreamData"] = None,
|
||||
) -> dict[str, Any]:
|
||||
"""
|
||||
Merge documentation and GitHub data with optional GitHub streams (Phase 3).
|
||||
|
||||
@@ -726,10 +719,10 @@ def merge_sources(docs_data_path: str,
|
||||
Merged data dict with hybrid content
|
||||
"""
|
||||
# Load data
|
||||
with open(docs_data_path, 'r') as f:
|
||||
with open(docs_data_path) as f:
|
||||
docs_data = json.load(f)
|
||||
|
||||
with open(github_data_path, 'r') as f:
|
||||
with open(github_data_path) as f:
|
||||
github_data = json.load(f)
|
||||
|
||||
# Detect conflicts
|
||||
@@ -749,7 +742,7 @@ def merge_sources(docs_data_path: str,
|
||||
logger.info(f" - Insights stream: {problems} problems, {solutions} solutions")
|
||||
|
||||
# Merge based on mode
|
||||
if mode == 'claude-enhanced':
|
||||
if mode == "claude-enhanced":
|
||||
merger = ClaudeEnhancedMerger(docs_data, github_data, conflicts, github_streams)
|
||||
else:
|
||||
merger = RuleBasedMerger(docs_data, github_data, conflicts, github_streams)
|
||||
@@ -757,7 +750,7 @@ def merge_sources(docs_data_path: str,
|
||||
merged_data = merger.merge_all()
|
||||
|
||||
# Save merged data
|
||||
with open(output_path, 'w') as f:
|
||||
with open(output_path, "w") as f:
|
||||
json.dump(merged_data, f, indent=2, ensure_ascii=False)
|
||||
|
||||
logger.info(f"Merged data saved to: {output_path}")
|
||||
@@ -765,22 +758,23 @@ def merge_sources(docs_data_path: str,
|
||||
return merged_data
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
import argparse
|
||||
|
||||
parser = argparse.ArgumentParser(description='Merge documentation and code sources')
|
||||
parser.add_argument('docs_data', help='Path to documentation data JSON')
|
||||
parser.add_argument('github_data', help='Path to GitHub data JSON')
|
||||
parser.add_argument('--output', '-o', default='merged_data.json', help='Output file path')
|
||||
parser.add_argument('--mode', '-m', choices=['rule-based', 'claude-enhanced'],
|
||||
default='rule-based', help='Merge mode')
|
||||
parser = argparse.ArgumentParser(description="Merge documentation and code sources")
|
||||
parser.add_argument("docs_data", help="Path to documentation data JSON")
|
||||
parser.add_argument("github_data", help="Path to GitHub data JSON")
|
||||
parser.add_argument("--output", "-o", default="merged_data.json", help="Output file path")
|
||||
parser.add_argument(
|
||||
"--mode", "-m", choices=["rule-based", "claude-enhanced"], default="rule-based", help="Merge mode"
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
merged = merge_sources(args.docs_data, args.github_data, args.output, args.mode)
|
||||
|
||||
# Print summary
|
||||
summary = merged.get('summary', {})
|
||||
summary = merged.get("summary", {})
|
||||
print(f"\n✅ Merge complete ({merged.get('merge_mode')})")
|
||||
print(f" Total APIs: {summary.get('total_apis', 0)}")
|
||||
print(f" Matched: {summary.get('matched', 0)}")
|
||||
|
||||
@@ -5,10 +5,10 @@ Multi-Skill Packager
|
||||
Package multiple skills at once. Useful for packaging router + sub-skills together.
|
||||
"""
|
||||
|
||||
import sys
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
import subprocess
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def package_skill(skill_dir: Path) -> bool:
|
||||
@@ -17,7 +17,7 @@ def package_skill(skill_dir: Path) -> bool:
|
||||
result = subprocess.run(
|
||||
[sys.executable, str(Path(__file__).parent / "package_skill.py"), str(skill_dir)],
|
||||
capture_output=True,
|
||||
text=True
|
||||
text=True,
|
||||
)
|
||||
return result.returncode == 0
|
||||
except Exception as e:
|
||||
@@ -36,20 +36,16 @@ Examples:
|
||||
|
||||
# Package specific skills
|
||||
python3 package_multi.py output/godot-2d/ output/godot-3d/ output/godot-scripting/
|
||||
"""
|
||||
""",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'skill_dirs',
|
||||
nargs='+',
|
||||
help='Skill directories to package'
|
||||
)
|
||||
parser.add_argument("skill_dirs", nargs="+", help="Skill directories to package")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
print(f"\n{'='*60}")
|
||||
print(f"MULTI-SKILL PACKAGER")
|
||||
print(f"{'='*60}\n")
|
||||
print(f"\n{'=' * 60}")
|
||||
print("MULTI-SKILL PACKAGER")
|
||||
print(f"{'=' * 60}\n")
|
||||
|
||||
skill_dirs = [Path(d) for d in args.skill_dirs]
|
||||
success_count = 0
|
||||
@@ -67,14 +63,14 @@ Examples:
|
||||
print(f"📦 Packaging: {skill_dir.name}")
|
||||
if package_skill(skill_dir):
|
||||
success_count += 1
|
||||
print(f" ✅ Success")
|
||||
print(" ✅ Success")
|
||||
else:
|
||||
print(f" ❌ Failed")
|
||||
print(" ❌ Failed")
|
||||
print("")
|
||||
|
||||
print(f"{'='*60}")
|
||||
print(f"{'=' * 60}")
|
||||
print(f"SUMMARY: {success_count}/{total_count} skills packaged")
|
||||
print(f"{'='*60}\n")
|
||||
print(f"{'=' * 60}\n")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@@ -9,34 +9,23 @@ Usage:
|
||||
skill-seekers package output/react/ --no-open # Don't open folder
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import sys
|
||||
import zipfile
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
|
||||
# Import utilities
|
||||
try:
|
||||
from utils import (
|
||||
open_folder,
|
||||
print_upload_instructions,
|
||||
format_file_size,
|
||||
validate_skill_directory
|
||||
)
|
||||
from quality_checker import SkillQualityChecker, print_report
|
||||
from utils import format_file_size, open_folder, print_upload_instructions, validate_skill_directory
|
||||
except ImportError:
|
||||
# If running from different directory, add cli to path
|
||||
sys.path.insert(0, str(Path(__file__).parent))
|
||||
from utils import (
|
||||
open_folder,
|
||||
print_upload_instructions,
|
||||
format_file_size,
|
||||
validate_skill_directory
|
||||
)
|
||||
from quality_checker import SkillQualityChecker, print_report
|
||||
from utils import format_file_size, open_folder, print_upload_instructions, validate_skill_directory
|
||||
|
||||
|
||||
def package_skill(skill_dir, open_folder_after=True, skip_quality_check=False, target='claude'):
|
||||
def package_skill(skill_dir, open_folder_after=True, skip_quality_check=False, target="claude"):
|
||||
"""
|
||||
Package a skill directory into platform-specific format
|
||||
|
||||
@@ -73,7 +62,7 @@ def package_skill(skill_dir, open_folder_after=True, skip_quality_check=False, t
|
||||
if report.has_errors or report.has_warnings:
|
||||
print("=" * 60)
|
||||
response = input("\nContinue with packaging? (y/n): ").strip().lower()
|
||||
if response != 'y':
|
||||
if response != "y":
|
||||
print("\n❌ Packaging cancelled by user")
|
||||
return False, None
|
||||
print()
|
||||
@@ -84,6 +73,7 @@ def package_skill(skill_dir, open_folder_after=True, skip_quality_check=False, t
|
||||
# Get platform-specific adaptor
|
||||
try:
|
||||
from skill_seekers.cli.adaptors import get_adaptor
|
||||
|
||||
adaptor = get_adaptor(target)
|
||||
except (ImportError, ValueError) as e:
|
||||
print(f"❌ Error: {e}")
|
||||
@@ -140,37 +130,24 @@ Examples:
|
||||
|
||||
# Get help
|
||||
skill-seekers package --help
|
||||
"""
|
||||
""",
|
||||
)
|
||||
|
||||
parser.add_argument("skill_dir", help="Path to skill directory (e.g., output/react/)")
|
||||
|
||||
parser.add_argument("--no-open", action="store_true", help="Do not open the output folder after packaging")
|
||||
|
||||
parser.add_argument("--skip-quality-check", action="store_true", help="Skip quality checks before packaging")
|
||||
|
||||
parser.add_argument(
|
||||
"--target",
|
||||
choices=["claude", "gemini", "openai", "markdown"],
|
||||
default="claude",
|
||||
help="Target LLM platform (default: claude)",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'skill_dir',
|
||||
help='Path to skill directory (e.g., output/react/)'
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'--no-open',
|
||||
action='store_true',
|
||||
help='Do not open the output folder after packaging'
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'--skip-quality-check',
|
||||
action='store_true',
|
||||
help='Skip quality checks before packaging'
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'--target',
|
||||
choices=['claude', 'gemini', 'openai', 'markdown'],
|
||||
default='claude',
|
||||
help='Target LLM platform (default: claude)'
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'--upload',
|
||||
action='store_true',
|
||||
help='Automatically upload after packaging (requires platform API key)'
|
||||
"--upload", action="store_true", help="Automatically upload after packaging (requires platform API key)"
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
@@ -179,7 +156,7 @@ Examples:
|
||||
args.skill_dir,
|
||||
open_folder_after=not args.no_open,
|
||||
skip_quality_check=args.skip_quality_check,
|
||||
target=args.target
|
||||
target=args.target,
|
||||
)
|
||||
|
||||
if not success:
|
||||
@@ -194,42 +171,42 @@ Examples:
|
||||
adaptor = get_adaptor(args.target)
|
||||
|
||||
# Get API key from environment
|
||||
api_key = os.environ.get(adaptor.get_env_var_name(), '').strip()
|
||||
api_key = os.environ.get(adaptor.get_env_var_name(), "").strip()
|
||||
|
||||
if not api_key:
|
||||
# No API key - show helpful message but DON'T fail
|
||||
print("\n" + "="*60)
|
||||
print("\n" + "=" * 60)
|
||||
print("💡 Automatic Upload")
|
||||
print("="*60)
|
||||
print("=" * 60)
|
||||
print()
|
||||
print(f"To enable automatic upload to {adaptor.PLATFORM_NAME}:")
|
||||
print(f" 1. Get API key from the platform")
|
||||
print(" 1. Get API key from the platform")
|
||||
print(f" 2. Set: export {adaptor.get_env_var_name()}=...")
|
||||
print(f" 3. Run package command with --upload flag")
|
||||
print(" 3. Run package command with --upload flag")
|
||||
print()
|
||||
print("For now, use manual upload (instructions above) ☝️")
|
||||
print("="*60)
|
||||
print("=" * 60)
|
||||
# Exit successfully - packaging worked!
|
||||
sys.exit(0)
|
||||
|
||||
# API key exists - try upload
|
||||
print("\n" + "="*60)
|
||||
print("\n" + "=" * 60)
|
||||
print(f"📤 Uploading to {adaptor.PLATFORM_NAME}...")
|
||||
print("="*60)
|
||||
print("=" * 60)
|
||||
|
||||
result = adaptor.upload(package_path, api_key)
|
||||
|
||||
if result['success']:
|
||||
if result["success"]:
|
||||
print(f"\n✅ {result['message']}")
|
||||
if result['url']:
|
||||
if result["url"]:
|
||||
print(f" View at: {result['url']}")
|
||||
print("="*60)
|
||||
print("=" * 60)
|
||||
sys.exit(0)
|
||||
else:
|
||||
print(f"\n❌ Upload failed: {result['message']}")
|
||||
print()
|
||||
print("💡 Try manual upload instead (instructions above) ☝️")
|
||||
print("="*60)
|
||||
print("=" * 60)
|
||||
# Exit successfully - packaging worked even if upload failed
|
||||
sys.exit(0)
|
||||
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -48,11 +48,11 @@ Example:
|
||||
--extract-tables --parallel
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import re
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Import unified language detector
|
||||
@@ -70,12 +70,14 @@ except ImportError:
|
||||
try:
|
||||
import pytesseract
|
||||
from PIL import Image
|
||||
|
||||
TESSERACT_AVAILABLE = True
|
||||
except ImportError:
|
||||
TESSERACT_AVAILABLE = False
|
||||
|
||||
try:
|
||||
import concurrent.futures
|
||||
|
||||
CONCURRENT_AVAILABLE = True
|
||||
except ImportError:
|
||||
CONCURRENT_AVAILABLE = False
|
||||
@@ -84,10 +86,22 @@ except ImportError:
|
||||
class PDFExtractor:
|
||||
"""Extract text and code from PDF documentation"""
|
||||
|
||||
def __init__(self, pdf_path, verbose=False, chunk_size=10, min_quality=0.0,
|
||||
extract_images=False, image_dir=None, min_image_size=100,
|
||||
use_ocr=False, password=None, extract_tables=False,
|
||||
parallel=False, max_workers=None, use_cache=True):
|
||||
def __init__(
|
||||
self,
|
||||
pdf_path,
|
||||
verbose=False,
|
||||
chunk_size=10,
|
||||
min_quality=0.0,
|
||||
extract_images=False,
|
||||
image_dir=None,
|
||||
min_image_size=100,
|
||||
use_ocr=False,
|
||||
password=None,
|
||||
extract_tables=False,
|
||||
parallel=False,
|
||||
max_workers=None,
|
||||
use_cache=True,
|
||||
):
|
||||
self.pdf_path = pdf_path
|
||||
self.verbose = verbose
|
||||
self.chunk_size = chunk_size # Pages per chunk (0 = no chunking)
|
||||
@@ -175,11 +189,11 @@ class PDFExtractor:
|
||||
tabs = page.find_tables()
|
||||
for idx, tab in enumerate(tabs.tables):
|
||||
table_data = {
|
||||
'table_index': idx,
|
||||
'rows': tab.extract(),
|
||||
'bbox': tab.bbox,
|
||||
'row_count': len(tab.extract()),
|
||||
'col_count': len(tab.extract()[0]) if tab.extract() else 0
|
||||
"table_index": idx,
|
||||
"rows": tab.extract(),
|
||||
"bbox": tab.bbox,
|
||||
"row_count": len(tab.extract()),
|
||||
"col_count": len(tab.extract()[0]) if tab.extract() else 0,
|
||||
}
|
||||
tables.append(table_data)
|
||||
self.log(f" Found table {idx}: {table_data['row_count']}x{table_data['col_count']}")
|
||||
@@ -236,54 +250,54 @@ class PDFExtractor:
|
||||
|
||||
# Common syntax checks
|
||||
if not code.strip():
|
||||
return False, ['Empty code block']
|
||||
return False, ["Empty code block"]
|
||||
|
||||
# Language-specific validation
|
||||
if language == 'python':
|
||||
if language == "python":
|
||||
# Check indentation consistency
|
||||
lines = code.split('\n')
|
||||
lines = code.split("\n")
|
||||
indent_chars = set()
|
||||
for line in lines:
|
||||
if line.startswith(' '):
|
||||
indent_chars.add('space')
|
||||
elif line.startswith('\t'):
|
||||
indent_chars.add('tab')
|
||||
if line.startswith(" "):
|
||||
indent_chars.add("space")
|
||||
elif line.startswith("\t"):
|
||||
indent_chars.add("tab")
|
||||
|
||||
if len(indent_chars) > 1:
|
||||
issues.append('Mixed tabs and spaces')
|
||||
issues.append("Mixed tabs and spaces")
|
||||
|
||||
# Check for unclosed brackets/parens
|
||||
open_count = code.count('(') + code.count('[') + code.count('{')
|
||||
close_count = code.count(')') + code.count(']') + code.count('}')
|
||||
open_count = code.count("(") + code.count("[") + code.count("{")
|
||||
close_count = code.count(")") + code.count("]") + code.count("}")
|
||||
if abs(open_count - close_count) > 2: # Allow small mismatch
|
||||
issues.append('Unbalanced brackets')
|
||||
issues.append("Unbalanced brackets")
|
||||
|
||||
elif language in ['javascript', 'java', 'cpp', 'c', 'csharp', 'go']:
|
||||
elif language in ["javascript", "java", "cpp", "c", "csharp", "go"]:
|
||||
# Check for balanced braces
|
||||
open_braces = code.count('{')
|
||||
close_braces = code.count('}')
|
||||
open_braces = code.count("{")
|
||||
close_braces = code.count("}")
|
||||
if abs(open_braces - close_braces) > 1:
|
||||
issues.append('Unbalanced braces')
|
||||
issues.append("Unbalanced braces")
|
||||
|
||||
elif language == 'json':
|
||||
elif language == "json":
|
||||
# Try to parse JSON
|
||||
try:
|
||||
json.loads(code)
|
||||
except (json.JSONDecodeError, ValueError) as e:
|
||||
issues.append(f'Invalid JSON syntax: {str(e)[:50]}')
|
||||
issues.append(f"Invalid JSON syntax: {str(e)[:50]}")
|
||||
|
||||
# General checks
|
||||
# Check if code looks like natural language (too many common words)
|
||||
common_words = ['the', 'and', 'for', 'with', 'this', 'that', 'have', 'from']
|
||||
common_words = ["the", "and", "for", "with", "this", "that", "have", "from"]
|
||||
word_count = sum(1 for word in common_words if word in code.lower())
|
||||
if word_count > 5 and len(code.split()) < 50:
|
||||
issues.append('May be natural language, not code')
|
||||
issues.append("May be natural language, not code")
|
||||
|
||||
# Check code/comment ratio
|
||||
comment_lines = sum(1 for line in code.split('\n') if line.strip().startswith(('#', '//', '/*', '*', '--')))
|
||||
total_lines = len([l for l in code.split('\n') if l.strip()])
|
||||
comment_lines = sum(1 for line in code.split("\n") if line.strip().startswith(("#", "//", "/*", "*", "--")))
|
||||
total_lines = len([l for l in code.split("\n") if l.strip()])
|
||||
if total_lines > 0 and comment_lines / total_lines > 0.7:
|
||||
issues.append('Mostly comments')
|
||||
issues.append("Mostly comments")
|
||||
|
||||
return len(issues) == 0, issues
|
||||
|
||||
@@ -309,18 +323,18 @@ class PDFExtractor:
|
||||
score -= 2.0
|
||||
|
||||
# Factor 3: Number of lines
|
||||
lines = [l for l in code.split('\n') if l.strip()]
|
||||
lines = [l for l in code.split("\n") if l.strip()]
|
||||
if 2 <= len(lines) <= 50:
|
||||
score += 1.0
|
||||
elif len(lines) > 100:
|
||||
score -= 1.0
|
||||
|
||||
# Factor 4: Has function/class definitions
|
||||
if re.search(r'\b(def|function|class|func|fn|public class)\b', code):
|
||||
if re.search(r"\b(def|function|class|func|fn|public class)\b", code):
|
||||
score += 1.5
|
||||
|
||||
# Factor 5: Has meaningful variable names (not just x, y, i)
|
||||
meaningful_vars = re.findall(r'\b[a-z_][a-z0-9_]{3,}\b', code.lower())
|
||||
meaningful_vars = re.findall(r"\b[a-z_][a-z0-9_]{3,}\b", code.lower())
|
||||
if len(meaningful_vars) >= 2:
|
||||
score += 1.0
|
||||
|
||||
@@ -344,19 +358,19 @@ class PDFExtractor:
|
||||
code_blocks = []
|
||||
blocks = page.get_text("dict")["blocks"]
|
||||
|
||||
monospace_fonts = ['courier', 'mono', 'consolas', 'menlo', 'monaco', 'dejavu']
|
||||
monospace_fonts = ["courier", "mono", "consolas", "menlo", "monaco", "dejavu"]
|
||||
|
||||
current_code = []
|
||||
current_font = None
|
||||
|
||||
for block in blocks:
|
||||
if 'lines' not in block:
|
||||
if "lines" not in block:
|
||||
continue
|
||||
|
||||
for line in block['lines']:
|
||||
for span in line['spans']:
|
||||
font = span['font'].lower()
|
||||
text = span['text']
|
||||
for line in block["lines"]:
|
||||
for span in line["spans"]:
|
||||
font = span["font"].lower()
|
||||
text = span["text"]
|
||||
|
||||
# Check if font is monospace
|
||||
is_monospace = any(mf in font for mf in monospace_fonts)
|
||||
@@ -364,47 +378,51 @@ class PDFExtractor:
|
||||
if is_monospace:
|
||||
# Accumulate code text
|
||||
current_code.append(text)
|
||||
current_font = span['font']
|
||||
current_font = span["font"]
|
||||
else:
|
||||
# End of code block
|
||||
if current_code:
|
||||
code_text = ''.join(current_code).strip()
|
||||
code_text = "".join(current_code).strip()
|
||||
if len(code_text) > 10: # Minimum code length
|
||||
lang, confidence = self.detect_language_from_code(code_text)
|
||||
quality = self.score_code_quality(code_text, lang, confidence)
|
||||
is_valid, issues = self.validate_code_syntax(code_text, lang)
|
||||
|
||||
code_blocks.append({
|
||||
'code': code_text,
|
||||
'language': lang,
|
||||
'confidence': confidence,
|
||||
'quality_score': quality,
|
||||
'is_valid': is_valid,
|
||||
'validation_issues': issues if not is_valid else [],
|
||||
'font': current_font,
|
||||
'detection_method': 'font'
|
||||
})
|
||||
code_blocks.append(
|
||||
{
|
||||
"code": code_text,
|
||||
"language": lang,
|
||||
"confidence": confidence,
|
||||
"quality_score": quality,
|
||||
"is_valid": is_valid,
|
||||
"validation_issues": issues if not is_valid else [],
|
||||
"font": current_font,
|
||||
"detection_method": "font",
|
||||
}
|
||||
)
|
||||
current_code = []
|
||||
current_font = None
|
||||
|
||||
# Handle final code block
|
||||
if current_code:
|
||||
code_text = ''.join(current_code).strip()
|
||||
code_text = "".join(current_code).strip()
|
||||
if len(code_text) > 10:
|
||||
lang, confidence = self.detect_language_from_code(code_text)
|
||||
quality = self.score_code_quality(code_text, lang, confidence)
|
||||
is_valid, issues = self.validate_code_syntax(code_text, lang)
|
||||
|
||||
code_blocks.append({
|
||||
'code': code_text,
|
||||
'language': lang,
|
||||
'confidence': confidence,
|
||||
'quality_score': quality,
|
||||
'is_valid': is_valid,
|
||||
'validation_issues': issues if not is_valid else [],
|
||||
'font': current_font,
|
||||
'detection_method': 'font'
|
||||
})
|
||||
code_blocks.append(
|
||||
{
|
||||
"code": code_text,
|
||||
"language": lang,
|
||||
"confidence": confidence,
|
||||
"quality_score": quality,
|
||||
"is_valid": is_valid,
|
||||
"validation_issues": issues if not is_valid else [],
|
||||
"font": current_font,
|
||||
"detection_method": "font",
|
||||
}
|
||||
)
|
||||
|
||||
return code_blocks
|
||||
|
||||
@@ -416,55 +434,59 @@ class PDFExtractor:
|
||||
Returns list of detected code blocks.
|
||||
"""
|
||||
code_blocks = []
|
||||
lines = text.split('\n')
|
||||
lines = text.split("\n")
|
||||
current_block = []
|
||||
indent_pattern = None
|
||||
|
||||
for line in lines:
|
||||
# Check for indentation (4 spaces or tab)
|
||||
if line.startswith(' ') or line.startswith('\t'):
|
||||
if line.startswith(" ") or line.startswith("\t"):
|
||||
# Start or continue code block
|
||||
if not indent_pattern:
|
||||
indent_pattern = line[:4] if line.startswith(' ') else '\t'
|
||||
indent_pattern = line[:4] if line.startswith(" ") else "\t"
|
||||
current_block.append(line)
|
||||
else:
|
||||
# End of code block
|
||||
if current_block and len(current_block) >= 2: # At least 2 lines
|
||||
code_text = '\n'.join(current_block).strip()
|
||||
code_text = "\n".join(current_block).strip()
|
||||
if len(code_text) > 20: # Minimum code length
|
||||
lang, confidence = self.detect_language_from_code(code_text)
|
||||
quality = self.score_code_quality(code_text, lang, confidence)
|
||||
is_valid, issues = self.validate_code_syntax(code_text, lang)
|
||||
|
||||
code_blocks.append({
|
||||
'code': code_text,
|
||||
'language': lang,
|
||||
'confidence': confidence,
|
||||
'quality_score': quality,
|
||||
'is_valid': is_valid,
|
||||
'validation_issues': issues if not is_valid else [],
|
||||
'detection_method': 'indent'
|
||||
})
|
||||
code_blocks.append(
|
||||
{
|
||||
"code": code_text,
|
||||
"language": lang,
|
||||
"confidence": confidence,
|
||||
"quality_score": quality,
|
||||
"is_valid": is_valid,
|
||||
"validation_issues": issues if not is_valid else [],
|
||||
"detection_method": "indent",
|
||||
}
|
||||
)
|
||||
current_block = []
|
||||
indent_pattern = None
|
||||
|
||||
# Handle final block
|
||||
if current_block and len(current_block) >= 2:
|
||||
code_text = '\n'.join(current_block).strip()
|
||||
code_text = "\n".join(current_block).strip()
|
||||
if len(code_text) > 20:
|
||||
lang, confidence = self.detect_language_from_code(code_text)
|
||||
quality = self.score_code_quality(code_text, lang, confidence)
|
||||
is_valid, issues = self.validate_code_syntax(code_text, lang)
|
||||
|
||||
code_blocks.append({
|
||||
'code': code_text,
|
||||
'language': lang,
|
||||
'confidence': confidence,
|
||||
'quality_score': quality,
|
||||
'is_valid': is_valid,
|
||||
'validation_issues': issues if not is_valid else [],
|
||||
'detection_method': 'indent'
|
||||
})
|
||||
code_blocks.append(
|
||||
{
|
||||
"code": code_text,
|
||||
"language": lang,
|
||||
"confidence": confidence,
|
||||
"quality_score": quality,
|
||||
"is_valid": is_valid,
|
||||
"validation_issues": issues if not is_valid else [],
|
||||
"detection_method": "indent",
|
||||
}
|
||||
)
|
||||
|
||||
return code_blocks
|
||||
|
||||
@@ -479,11 +501,11 @@ class PDFExtractor:
|
||||
# Common code patterns that span multiple lines
|
||||
patterns = [
|
||||
# Function definitions
|
||||
(r'((?:def|function|func|fn|public|private)\s+\w+\s*\([^)]*\)\s*[{:]?[^}]*[}]?)', 'function'),
|
||||
(r"((?:def|function|func|fn|public|private)\s+\w+\s*\([^)]*\)\s*[{:]?[^}]*[}]?)", "function"),
|
||||
# Class definitions
|
||||
(r'(class\s+\w+[^{]*\{[^}]*\})', 'class'),
|
||||
(r"(class\s+\w+[^{]*\{[^}]*\})", "class"),
|
||||
# Import statements block
|
||||
(r'((?:import|require|use|include)[^\n]+(?:\n(?:import|require|use|include)[^\n]+)*)', 'imports'),
|
||||
(r"((?:import|require|use|include)[^\n]+(?:\n(?:import|require|use|include)[^\n]+)*)", "imports"),
|
||||
]
|
||||
|
||||
for pattern, block_type in patterns:
|
||||
@@ -495,16 +517,18 @@ class PDFExtractor:
|
||||
quality = self.score_code_quality(code_text, lang, confidence)
|
||||
is_valid, issues = self.validate_code_syntax(code_text, lang)
|
||||
|
||||
code_blocks.append({
|
||||
'code': code_text,
|
||||
'language': lang,
|
||||
'confidence': confidence,
|
||||
'quality_score': quality,
|
||||
'is_valid': is_valid,
|
||||
'validation_issues': issues if not is_valid else [],
|
||||
'detection_method': 'pattern',
|
||||
'pattern_type': block_type
|
||||
})
|
||||
code_blocks.append(
|
||||
{
|
||||
"code": code_text,
|
||||
"language": lang,
|
||||
"confidence": confidence,
|
||||
"quality_score": quality,
|
||||
"is_valid": is_valid,
|
||||
"validation_issues": issues if not is_valid else [],
|
||||
"detection_method": "pattern",
|
||||
"pattern_type": block_type,
|
||||
}
|
||||
)
|
||||
|
||||
return code_blocks
|
||||
|
||||
@@ -514,24 +538,24 @@ class PDFExtractor:
|
||||
|
||||
Returns (is_chapter_start, chapter_title) tuple.
|
||||
"""
|
||||
headings = page_data.get('headings', [])
|
||||
headings = page_data.get("headings", [])
|
||||
|
||||
# Check for h1 or h2 at start of page
|
||||
if headings:
|
||||
first_heading = headings[0]
|
||||
# H1 headings are strong indicators of chapters
|
||||
if first_heading['level'] in ['h1', 'h2']:
|
||||
return True, first_heading['text']
|
||||
if first_heading["level"] in ["h1", "h2"]:
|
||||
return True, first_heading["text"]
|
||||
|
||||
# Check for specific chapter markers in text
|
||||
text = page_data.get('text', '')
|
||||
first_line = text.split('\n')[0] if text else ''
|
||||
text = page_data.get("text", "")
|
||||
first_line = text.split("\n")[0] if text else ""
|
||||
|
||||
chapter_patterns = [
|
||||
r'^Chapter\s+\d+',
|
||||
r'^Part\s+\d+',
|
||||
r'^Section\s+\d+',
|
||||
r'^\d+\.\s+[A-Z]', # "1. Introduction"
|
||||
r"^Chapter\s+\d+",
|
||||
r"^Part\s+\d+",
|
||||
r"^Section\s+\d+",
|
||||
r"^\d+\.\s+[A-Z]", # "1. Introduction"
|
||||
]
|
||||
|
||||
for pattern in chapter_patterns:
|
||||
@@ -552,42 +576,43 @@ class PDFExtractor:
|
||||
next_page = pages[i + 1]
|
||||
|
||||
# Check if current page has code blocks
|
||||
if not current_page['code_samples']:
|
||||
if not current_page["code_samples"]:
|
||||
continue
|
||||
|
||||
# Get last code block of current page
|
||||
last_code = current_page['code_samples'][-1]
|
||||
last_code = current_page["code_samples"][-1]
|
||||
|
||||
# Check if next page starts with code
|
||||
if not next_page['code_samples']:
|
||||
if not next_page["code_samples"]:
|
||||
continue
|
||||
|
||||
first_next_code = next_page['code_samples'][0]
|
||||
first_next_code = next_page["code_samples"][0]
|
||||
|
||||
# Same language and detection method = likely continuation
|
||||
if (last_code['language'] == first_next_code['language'] and
|
||||
last_code['detection_method'] == first_next_code['detection_method']):
|
||||
|
||||
if (
|
||||
last_code["language"] == first_next_code["language"]
|
||||
and last_code["detection_method"] == first_next_code["detection_method"]
|
||||
):
|
||||
# Check if last code block looks incomplete (doesn't end with closing brace/etc)
|
||||
last_code_text = last_code['code'].rstrip()
|
||||
last_code_text = last_code["code"].rstrip()
|
||||
continuation_indicators = [
|
||||
not last_code_text.endswith('}'),
|
||||
not last_code_text.endswith(';'),
|
||||
last_code_text.endswith(','),
|
||||
last_code_text.endswith('\\'),
|
||||
not last_code_text.endswith("}"),
|
||||
not last_code_text.endswith(";"),
|
||||
last_code_text.endswith(","),
|
||||
last_code_text.endswith("\\"),
|
||||
]
|
||||
|
||||
if any(continuation_indicators):
|
||||
# Merge the code blocks
|
||||
merged_code = last_code['code'] + '\n' + first_next_code['code']
|
||||
last_code['code'] = merged_code
|
||||
last_code['merged_from_next_page'] = True
|
||||
merged_code = last_code["code"] + "\n" + first_next_code["code"]
|
||||
last_code["code"] = merged_code
|
||||
last_code["merged_from_next_page"] = True
|
||||
|
||||
# Remove the first code block from next page
|
||||
next_page['code_samples'].pop(0)
|
||||
next_page['code_blocks_count'] -= 1
|
||||
next_page["code_samples"].pop(0)
|
||||
next_page["code_blocks_count"] -= 1
|
||||
|
||||
self.log(f" Merged code block from page {i+1} to {i+2}")
|
||||
self.log(f" Merged code block from page {i + 1} to {i + 2}")
|
||||
|
||||
return pages
|
||||
|
||||
@@ -603,13 +628,7 @@ class PDFExtractor:
|
||||
"""
|
||||
if self.chunk_size == 0:
|
||||
# No chunking - return all pages as one chunk
|
||||
return [{
|
||||
'chunk_number': 1,
|
||||
'start_page': 1,
|
||||
'end_page': len(pages),
|
||||
'pages': pages,
|
||||
'chapter_title': None
|
||||
}]
|
||||
return [{"chunk_number": 1, "start_page": 1, "end_page": len(pages), "pages": pages, "chapter_title": None}]
|
||||
|
||||
chunks = []
|
||||
current_chunk = []
|
||||
@@ -622,13 +641,15 @@ class PDFExtractor:
|
||||
|
||||
if is_chapter and current_chunk:
|
||||
# Save current chunk before starting new one
|
||||
chunks.append({
|
||||
'chunk_number': len(chunks) + 1,
|
||||
'start_page': chunk_start + 1,
|
||||
'end_page': i,
|
||||
'pages': current_chunk,
|
||||
'chapter_title': current_chapter
|
||||
})
|
||||
chunks.append(
|
||||
{
|
||||
"chunk_number": len(chunks) + 1,
|
||||
"start_page": chunk_start + 1,
|
||||
"end_page": i,
|
||||
"pages": current_chunk,
|
||||
"chapter_title": current_chapter,
|
||||
}
|
||||
)
|
||||
current_chunk = []
|
||||
chunk_start = i
|
||||
current_chapter = chapter_title
|
||||
@@ -640,26 +661,30 @@ class PDFExtractor:
|
||||
|
||||
# Check if chunk size reached (but don't break chapters)
|
||||
if not is_chapter and len(current_chunk) >= self.chunk_size:
|
||||
chunks.append({
|
||||
'chunk_number': len(chunks) + 1,
|
||||
'start_page': chunk_start + 1,
|
||||
'end_page': i + 1,
|
||||
'pages': current_chunk,
|
||||
'chapter_title': current_chapter
|
||||
})
|
||||
chunks.append(
|
||||
{
|
||||
"chunk_number": len(chunks) + 1,
|
||||
"start_page": chunk_start + 1,
|
||||
"end_page": i + 1,
|
||||
"pages": current_chunk,
|
||||
"chapter_title": current_chapter,
|
||||
}
|
||||
)
|
||||
current_chunk = []
|
||||
chunk_start = i + 1
|
||||
current_chapter = None
|
||||
|
||||
# Add remaining pages as final chunk
|
||||
if current_chunk:
|
||||
chunks.append({
|
||||
'chunk_number': len(chunks) + 1,
|
||||
'start_page': chunk_start + 1,
|
||||
'end_page': len(pages),
|
||||
'pages': current_chunk,
|
||||
'chapter_title': current_chapter
|
||||
})
|
||||
chunks.append(
|
||||
{
|
||||
"chunk_number": len(chunks) + 1,
|
||||
"start_page": chunk_start + 1,
|
||||
"end_page": len(pages),
|
||||
"pages": current_chunk,
|
||||
"chapter_title": current_chapter,
|
||||
}
|
||||
)
|
||||
|
||||
return chunks
|
||||
|
||||
@@ -696,7 +721,7 @@ class PDFExtractor:
|
||||
|
||||
# Generate filename
|
||||
pdf_basename = Path(self.pdf_path).stem
|
||||
image_filename = f"{pdf_basename}_page{page_num+1}_img{img_index+1}.{image_ext}"
|
||||
image_filename = f"{pdf_basename}_page{page_num + 1}_img{img_index + 1}.{image_ext}"
|
||||
|
||||
# Save image
|
||||
image_path = Path(self.image_dir) / image_filename
|
||||
@@ -707,14 +732,14 @@ class PDFExtractor:
|
||||
|
||||
# Store metadata
|
||||
image_info = {
|
||||
'filename': image_filename,
|
||||
'path': str(image_path),
|
||||
'page_number': page_num + 1,
|
||||
'width': width,
|
||||
'height': height,
|
||||
'format': image_ext,
|
||||
'size_bytes': len(image_bytes),
|
||||
'xref': xref
|
||||
"filename": image_filename,
|
||||
"path": str(image_path),
|
||||
"page_number": page_num + 1,
|
||||
"width": width,
|
||||
"height": height,
|
||||
"format": image_ext,
|
||||
"size_bytes": len(image_bytes),
|
||||
"xref": xref,
|
||||
}
|
||||
|
||||
extracted.append(image_info)
|
||||
@@ -771,12 +796,12 @@ class PDFExtractor:
|
||||
# Simple deduplication by code content
|
||||
unique_code = {}
|
||||
for block in all_code_blocks:
|
||||
code_hash = hash(block['code'])
|
||||
code_hash = hash(block["code"])
|
||||
if code_hash not in unique_code:
|
||||
unique_code[code_hash] = block
|
||||
else:
|
||||
# Keep the one with higher quality score
|
||||
if block['quality_score'] > unique_code[code_hash]['quality_score']:
|
||||
if block["quality_score"] > unique_code[code_hash]["quality_score"]:
|
||||
unique_code[code_hash] = block
|
||||
|
||||
code_samples = list(unique_code.values())
|
||||
@@ -784,44 +809,43 @@ class PDFExtractor:
|
||||
# Filter by minimum quality (NEW in B1.4)
|
||||
if self.min_quality > 0:
|
||||
code_samples_before = len(code_samples)
|
||||
code_samples = [c for c in code_samples if c['quality_score'] >= self.min_quality]
|
||||
code_samples = [c for c in code_samples if c["quality_score"] >= self.min_quality]
|
||||
filtered_count = code_samples_before - len(code_samples)
|
||||
if filtered_count > 0:
|
||||
self.log(f" Filtered out {filtered_count} low-quality code blocks (min_quality={self.min_quality})")
|
||||
|
||||
# Sort by quality score (highest first)
|
||||
code_samples.sort(key=lambda x: x['quality_score'], reverse=True)
|
||||
code_samples.sort(key=lambda x: x["quality_score"], reverse=True)
|
||||
|
||||
# Extract headings from markdown
|
||||
headings = []
|
||||
for line in markdown.split('\n'):
|
||||
if line.startswith('#'):
|
||||
level = len(line) - len(line.lstrip('#'))
|
||||
text = line.lstrip('#').strip()
|
||||
for line in markdown.split("\n"):
|
||||
if line.startswith("#"):
|
||||
level = len(line) - len(line.lstrip("#"))
|
||||
text = line.lstrip("#").strip()
|
||||
if text:
|
||||
headings.append({
|
||||
'level': f'h{level}',
|
||||
'text': text
|
||||
})
|
||||
headings.append({"level": f"h{level}", "text": text})
|
||||
|
||||
page_data = {
|
||||
'page_number': page_num + 1, # 1-indexed for humans
|
||||
'text': text.strip(),
|
||||
'markdown': markdown.strip(),
|
||||
'headings': headings,
|
||||
'code_samples': code_samples,
|
||||
'images_count': len(images),
|
||||
'extracted_images': extracted_images, # NEW in B1.5
|
||||
'tables': tables, # NEW in Priority 2
|
||||
'char_count': len(text),
|
||||
'code_blocks_count': len(code_samples),
|
||||
'tables_count': len(tables) # NEW in Priority 2
|
||||
"page_number": page_num + 1, # 1-indexed for humans
|
||||
"text": text.strip(),
|
||||
"markdown": markdown.strip(),
|
||||
"headings": headings,
|
||||
"code_samples": code_samples,
|
||||
"images_count": len(images),
|
||||
"extracted_images": extracted_images, # NEW in B1.5
|
||||
"tables": tables, # NEW in Priority 2
|
||||
"char_count": len(text),
|
||||
"code_blocks_count": len(code_samples),
|
||||
"tables_count": len(tables), # NEW in Priority 2
|
||||
}
|
||||
|
||||
# Cache the result (Priority 3)
|
||||
self.set_cached(cache_key, page_data)
|
||||
|
||||
self.log(f" Page {page_num + 1}: {len(text)} chars, {len(code_samples)} code blocks, {len(headings)} headings, {len(extracted_images)} images, {len(tables)} tables")
|
||||
self.log(
|
||||
f" Page {page_num + 1}: {len(text)} chars, {len(code_samples)} code blocks, {len(headings)} headings, {len(extracted_images)} images, {len(tables)} tables"
|
||||
)
|
||||
|
||||
return page_data
|
||||
|
||||
@@ -841,15 +865,15 @@ class PDFExtractor:
|
||||
# Handle encrypted PDFs (Priority 2)
|
||||
if self.doc.is_encrypted:
|
||||
if self.password:
|
||||
print(f" 🔐 PDF is encrypted, trying password...")
|
||||
print(" 🔐 PDF is encrypted, trying password...")
|
||||
if self.doc.authenticate(self.password):
|
||||
print(f" ✅ Password accepted")
|
||||
print(" ✅ Password accepted")
|
||||
else:
|
||||
print(f" ❌ Invalid password")
|
||||
print(" ❌ Invalid password")
|
||||
return None
|
||||
else:
|
||||
print(f" ❌ PDF is encrypted but no password provided")
|
||||
print(f" Use --password option to provide password")
|
||||
print(" ❌ PDF is encrypted but no password provided")
|
||||
print(" Use --password option to provide password")
|
||||
return None
|
||||
|
||||
except Exception as e:
|
||||
@@ -870,12 +894,12 @@ class PDFExtractor:
|
||||
status = "✅ enabled" if TESSERACT_AVAILABLE else "⚠️ not available (install pytesseract)"
|
||||
print(f" OCR: {status}")
|
||||
if self.extract_tables:
|
||||
print(f" Table extraction: ✅ enabled")
|
||||
print(" Table extraction: ✅ enabled")
|
||||
if self.parallel:
|
||||
status = "✅ enabled" if CONCURRENT_AVAILABLE else "⚠️ not available"
|
||||
print(f" Parallel processing: {status} ({self.max_workers} workers)")
|
||||
if self.use_cache:
|
||||
print(f" Caching: ✅ enabled")
|
||||
print(" Caching: ✅ enabled")
|
||||
|
||||
print("")
|
||||
|
||||
@@ -900,73 +924,71 @@ class PDFExtractor:
|
||||
chunks = self.create_chunks(self.pages)
|
||||
|
||||
# Build summary
|
||||
total_chars = sum(p['char_count'] for p in self.pages)
|
||||
total_code_blocks = sum(p['code_blocks_count'] for p in self.pages)
|
||||
total_headings = sum(len(p['headings']) for p in self.pages)
|
||||
total_images = sum(p['images_count'] for p in self.pages)
|
||||
total_tables = sum(p['tables_count'] for p in self.pages) # NEW in Priority 2
|
||||
total_chars = sum(p["char_count"] for p in self.pages)
|
||||
total_code_blocks = sum(p["code_blocks_count"] for p in self.pages)
|
||||
total_headings = sum(len(p["headings"]) for p in self.pages)
|
||||
total_images = sum(p["images_count"] for p in self.pages)
|
||||
total_tables = sum(p["tables_count"] for p in self.pages) # NEW in Priority 2
|
||||
|
||||
# Detect languages used
|
||||
languages = {}
|
||||
all_code_blocks_list = []
|
||||
for page in self.pages:
|
||||
for code in page['code_samples']:
|
||||
lang = code['language']
|
||||
for code in page["code_samples"]:
|
||||
lang = code["language"]
|
||||
languages[lang] = languages.get(lang, 0) + 1
|
||||
all_code_blocks_list.append(code)
|
||||
|
||||
# Calculate quality statistics (NEW in B1.4)
|
||||
quality_stats = {}
|
||||
if all_code_blocks_list:
|
||||
quality_scores = [c['quality_score'] for c in all_code_blocks_list]
|
||||
confidences = [c['confidence'] for c in all_code_blocks_list]
|
||||
valid_count = sum(1 for c in all_code_blocks_list if c['is_valid'])
|
||||
quality_scores = [c["quality_score"] for c in all_code_blocks_list]
|
||||
confidences = [c["confidence"] for c in all_code_blocks_list]
|
||||
valid_count = sum(1 for c in all_code_blocks_list if c["is_valid"])
|
||||
|
||||
quality_stats = {
|
||||
'average_quality': sum(quality_scores) / len(quality_scores),
|
||||
'average_confidence': sum(confidences) / len(confidences),
|
||||
'valid_code_blocks': valid_count,
|
||||
'invalid_code_blocks': total_code_blocks - valid_count,
|
||||
'validation_rate': valid_count / total_code_blocks if total_code_blocks > 0 else 0,
|
||||
'high_quality_blocks': sum(1 for s in quality_scores if s >= 7.0),
|
||||
'medium_quality_blocks': sum(1 for s in quality_scores if 4.0 <= s < 7.0),
|
||||
'low_quality_blocks': sum(1 for s in quality_scores if s < 4.0),
|
||||
"average_quality": sum(quality_scores) / len(quality_scores),
|
||||
"average_confidence": sum(confidences) / len(confidences),
|
||||
"valid_code_blocks": valid_count,
|
||||
"invalid_code_blocks": total_code_blocks - valid_count,
|
||||
"validation_rate": valid_count / total_code_blocks if total_code_blocks > 0 else 0,
|
||||
"high_quality_blocks": sum(1 for s in quality_scores if s >= 7.0),
|
||||
"medium_quality_blocks": sum(1 for s in quality_scores if 4.0 <= s < 7.0),
|
||||
"low_quality_blocks": sum(1 for s in quality_scores if s < 4.0),
|
||||
}
|
||||
|
||||
# Extract chapter information
|
||||
chapters = []
|
||||
for chunk in chunks:
|
||||
if chunk['chapter_title']:
|
||||
chapters.append({
|
||||
'title': chunk['chapter_title'],
|
||||
'start_page': chunk['start_page'],
|
||||
'end_page': chunk['end_page']
|
||||
})
|
||||
if chunk["chapter_title"]:
|
||||
chapters.append(
|
||||
{"title": chunk["chapter_title"], "start_page": chunk["start_page"], "end_page": chunk["end_page"]}
|
||||
)
|
||||
|
||||
result = {
|
||||
'source_file': self.pdf_path,
|
||||
'metadata': self.doc.metadata,
|
||||
'total_pages': len(self.doc),
|
||||
'total_chars': total_chars,
|
||||
'total_code_blocks': total_code_blocks,
|
||||
'total_headings': total_headings,
|
||||
'total_images': total_images,
|
||||
'total_extracted_images': len(self.extracted_images), # NEW in B1.5
|
||||
'total_tables': total_tables, # NEW in Priority 2
|
||||
'image_directory': self.image_dir if self.extract_images else None, # NEW in B1.5
|
||||
'extracted_images': self.extracted_images, # NEW in B1.5
|
||||
'total_chunks': len(chunks),
|
||||
'chapters': chapters,
|
||||
'languages_detected': languages,
|
||||
'quality_statistics': quality_stats, # NEW in B1.4
|
||||
'chunks': chunks,
|
||||
'pages': self.pages # Still include all pages for compatibility
|
||||
"source_file": self.pdf_path,
|
||||
"metadata": self.doc.metadata,
|
||||
"total_pages": len(self.doc),
|
||||
"total_chars": total_chars,
|
||||
"total_code_blocks": total_code_blocks,
|
||||
"total_headings": total_headings,
|
||||
"total_images": total_images,
|
||||
"total_extracted_images": len(self.extracted_images), # NEW in B1.5
|
||||
"total_tables": total_tables, # NEW in Priority 2
|
||||
"image_directory": self.image_dir if self.extract_images else None, # NEW in B1.5
|
||||
"extracted_images": self.extracted_images, # NEW in B1.5
|
||||
"total_chunks": len(chunks),
|
||||
"chapters": chapters,
|
||||
"languages_detected": languages,
|
||||
"quality_statistics": quality_stats, # NEW in B1.4
|
||||
"chunks": chunks,
|
||||
"pages": self.pages, # Still include all pages for compatibility
|
||||
}
|
||||
|
||||
# Close document
|
||||
self.doc.close()
|
||||
|
||||
print(f"\n✅ Extraction complete:")
|
||||
print("\n✅ Extraction complete:")
|
||||
print(f" Total characters: {total_chars:,}")
|
||||
print(f" Code blocks found: {total_code_blocks}")
|
||||
print(f" Headings found: {total_headings}")
|
||||
@@ -983,10 +1005,12 @@ class PDFExtractor:
|
||||
|
||||
# Print quality statistics (NEW in B1.4)
|
||||
if quality_stats:
|
||||
print(f"\n📊 Code Quality Statistics:")
|
||||
print("\n📊 Code Quality Statistics:")
|
||||
print(f" Average quality: {quality_stats['average_quality']:.1f}/10")
|
||||
print(f" Average confidence: {quality_stats['average_confidence']:.1%}")
|
||||
print(f" Valid code blocks: {quality_stats['valid_code_blocks']}/{total_code_blocks} ({quality_stats['validation_rate']:.1%})")
|
||||
print(
|
||||
f" Valid code blocks: {quality_stats['valid_code_blocks']}/{total_code_blocks} ({quality_stats['validation_rate']:.1%})"
|
||||
)
|
||||
print(f" High quality (7+): {quality_stats['high_quality_blocks']}")
|
||||
print(f" Medium quality (4-7): {quality_stats['medium_quality_blocks']}")
|
||||
print(f" Low quality (<4): {quality_stats['low_quality_blocks']}")
|
||||
@@ -996,7 +1020,7 @@ class PDFExtractor:
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(
|
||||
description='Extract text and code blocks from PDF documentation',
|
||||
description="Extract text and code blocks from PDF documentation",
|
||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||
epilog="""
|
||||
Examples:
|
||||
@@ -1011,39 +1035,39 @@ Examples:
|
||||
|
||||
# Extract and save
|
||||
python3 pdf_extractor_poc.py docs/python.pdf -o python_extracted.json -v
|
||||
"""
|
||||
""",
|
||||
)
|
||||
|
||||
parser.add_argument('pdf_file', help='Path to PDF file to extract')
|
||||
parser.add_argument('-o', '--output', help='Output JSON file path (default: print to stdout)')
|
||||
parser.add_argument('-v', '--verbose', action='store_true', help='Verbose output')
|
||||
parser.add_argument('--pretty', action='store_true', help='Pretty-print JSON output')
|
||||
parser.add_argument('--chunk-size', type=int, default=10,
|
||||
help='Pages per chunk (0 = no chunking, default: 10)')
|
||||
parser.add_argument('--no-merge', action='store_true',
|
||||
help='Disable merging code blocks across pages')
|
||||
parser.add_argument('--min-quality', type=float, default=0.0,
|
||||
help='Minimum code quality score (0-10, default: 0 = no filtering)')
|
||||
parser.add_argument('--extract-images', action='store_true',
|
||||
help='Extract images to files (NEW in B1.5)')
|
||||
parser.add_argument('--image-dir', type=str, default=None,
|
||||
help='Directory to save extracted images (default: output/{pdf_name}_images)')
|
||||
parser.add_argument('--min-image-size', type=int, default=100,
|
||||
help='Minimum image dimension in pixels (filters icons, default: 100)')
|
||||
parser.add_argument("pdf_file", help="Path to PDF file to extract")
|
||||
parser.add_argument("-o", "--output", help="Output JSON file path (default: print to stdout)")
|
||||
parser.add_argument("-v", "--verbose", action="store_true", help="Verbose output")
|
||||
parser.add_argument("--pretty", action="store_true", help="Pretty-print JSON output")
|
||||
parser.add_argument("--chunk-size", type=int, default=10, help="Pages per chunk (0 = no chunking, default: 10)")
|
||||
parser.add_argument("--no-merge", action="store_true", help="Disable merging code blocks across pages")
|
||||
parser.add_argument(
|
||||
"--min-quality", type=float, default=0.0, help="Minimum code quality score (0-10, default: 0 = no filtering)"
|
||||
)
|
||||
parser.add_argument("--extract-images", action="store_true", help="Extract images to files (NEW in B1.5)")
|
||||
parser.add_argument(
|
||||
"--image-dir",
|
||||
type=str,
|
||||
default=None,
|
||||
help="Directory to save extracted images (default: output/{pdf_name}_images)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--min-image-size",
|
||||
type=int,
|
||||
default=100,
|
||||
help="Minimum image dimension in pixels (filters icons, default: 100)",
|
||||
)
|
||||
|
||||
# Advanced features (Priority 2 & 3)
|
||||
parser.add_argument('--ocr', action='store_true',
|
||||
help='Use OCR for scanned PDFs (requires pytesseract)')
|
||||
parser.add_argument('--password', type=str, default=None,
|
||||
help='Password for encrypted PDF')
|
||||
parser.add_argument('--extract-tables', action='store_true',
|
||||
help='Extract tables from PDF (Priority 2)')
|
||||
parser.add_argument('--parallel', action='store_true',
|
||||
help='Process pages in parallel (Priority 3)')
|
||||
parser.add_argument('--workers', type=int, default=None,
|
||||
help='Number of parallel workers (default: CPU count)')
|
||||
parser.add_argument('--no-cache', action='store_true',
|
||||
help='Disable caching of expensive operations')
|
||||
parser.add_argument("--ocr", action="store_true", help="Use OCR for scanned PDFs (requires pytesseract)")
|
||||
parser.add_argument("--password", type=str, default=None, help="Password for encrypted PDF")
|
||||
parser.add_argument("--extract-tables", action="store_true", help="Extract tables from PDF (Priority 2)")
|
||||
parser.add_argument("--parallel", action="store_true", help="Process pages in parallel (Priority 3)")
|
||||
parser.add_argument("--workers", type=int, default=None, help="Number of parallel workers (default: CPU count)")
|
||||
parser.add_argument("--no-cache", action="store_true", help="Disable caching of expensive operations")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
@@ -1052,8 +1076,8 @@ Examples:
|
||||
print(f"❌ Error: File not found: {args.pdf_file}")
|
||||
sys.exit(1)
|
||||
|
||||
if not args.pdf_file.lower().endswith('.pdf'):
|
||||
print(f"⚠️ Warning: File does not have .pdf extension")
|
||||
if not args.pdf_file.lower().endswith(".pdf"):
|
||||
print("⚠️ Warning: File does not have .pdf extension")
|
||||
|
||||
# Extract
|
||||
extractor = PDFExtractor(
|
||||
@@ -1070,7 +1094,7 @@ Examples:
|
||||
extract_tables=args.extract_tables,
|
||||
parallel=args.parallel,
|
||||
max_workers=args.workers,
|
||||
use_cache=not args.no_cache
|
||||
use_cache=not args.no_cache,
|
||||
)
|
||||
result = extractor.extract_all()
|
||||
|
||||
@@ -1080,7 +1104,7 @@ Examples:
|
||||
# Output
|
||||
if args.output:
|
||||
# Save to file
|
||||
with open(args.output, 'w', encoding='utf-8') as f:
|
||||
with open(args.output, "w", encoding="utf-8") as f:
|
||||
if args.pretty:
|
||||
json.dump(result, f, indent=2, ensure_ascii=False)
|
||||
else:
|
||||
@@ -1094,5 +1118,5 @@ Examples:
|
||||
print(json.dumps(result, ensure_ascii=False))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -11,18 +11,18 @@ Usage:
|
||||
python3 pdf_scraper.py --from-json manual_extracted.json
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import re
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Import the PDF extractor
|
||||
from .pdf_extractor_poc import PDFExtractor
|
||||
|
||||
|
||||
def infer_description_from_pdf(pdf_metadata: dict = None, name: str = '') -> str:
|
||||
def infer_description_from_pdf(pdf_metadata: dict = None, name: str = "") -> str:
|
||||
"""
|
||||
Infer skill description from PDF metadata or document properties.
|
||||
|
||||
@@ -39,22 +39,22 @@ def infer_description_from_pdf(pdf_metadata: dict = None, name: str = '') -> str
|
||||
"""
|
||||
if pdf_metadata:
|
||||
# Try to use subject field (often contains description)
|
||||
if 'subject' in pdf_metadata and pdf_metadata['subject']:
|
||||
desc = str(pdf_metadata['subject']).strip()
|
||||
if "subject" in pdf_metadata and pdf_metadata["subject"]:
|
||||
desc = str(pdf_metadata["subject"]).strip()
|
||||
if len(desc) > 20:
|
||||
if len(desc) > 150:
|
||||
desc = desc[:147] + '...'
|
||||
return f'Use when {desc.lower()}'
|
||||
desc = desc[:147] + "..."
|
||||
return f"Use when {desc.lower()}"
|
||||
|
||||
# Try title field if meaningful
|
||||
if 'title' in pdf_metadata and pdf_metadata['title']:
|
||||
title = str(pdf_metadata['title']).strip()
|
||||
if "title" in pdf_metadata and pdf_metadata["title"]:
|
||||
title = str(pdf_metadata["title"]).strip()
|
||||
# Skip if it's just the filename
|
||||
if len(title) > 10 and not title.endswith('.pdf'):
|
||||
return f'Use when working with {title.lower()}'
|
||||
if len(title) > 10 and not title.endswith(".pdf"):
|
||||
return f"Use when working with {title.lower()}"
|
||||
|
||||
# Improved fallback
|
||||
return f'Use when referencing {name} documentation' if name else 'Use when referencing this documentation'
|
||||
return f"Use when referencing {name} documentation" if name else "Use when referencing this documentation"
|
||||
|
||||
|
||||
class PDFToSkillConverter:
|
||||
@@ -62,20 +62,20 @@ class PDFToSkillConverter:
|
||||
|
||||
def __init__(self, config):
|
||||
self.config = config
|
||||
self.name = config['name']
|
||||
self.pdf_path = config.get('pdf_path', '')
|
||||
self.name = config["name"]
|
||||
self.pdf_path = config.get("pdf_path", "")
|
||||
# Set initial description (will be improved after extraction if metadata available)
|
||||
self.description = config.get('description', f'Use when referencing {self.name} documentation')
|
||||
self.description = config.get("description", f"Use when referencing {self.name} documentation")
|
||||
|
||||
# Paths
|
||||
self.skill_dir = f"output/{self.name}"
|
||||
self.data_file = f"output/{self.name}_extracted.json"
|
||||
|
||||
# Extraction options
|
||||
self.extract_options = config.get('extract_options', {})
|
||||
self.extract_options = config.get("extract_options", {})
|
||||
|
||||
# Categories
|
||||
self.categories = config.get('categories', {})
|
||||
self.categories = config.get("categories", {})
|
||||
|
||||
# Extracted data
|
||||
self.extracted_data = None
|
||||
@@ -88,11 +88,11 @@ class PDFToSkillConverter:
|
||||
extractor = PDFExtractor(
|
||||
self.pdf_path,
|
||||
verbose=True,
|
||||
chunk_size=self.extract_options.get('chunk_size', 10),
|
||||
min_quality=self.extract_options.get('min_quality', 5.0),
|
||||
extract_images=self.extract_options.get('extract_images', True),
|
||||
chunk_size=self.extract_options.get("chunk_size", 10),
|
||||
min_quality=self.extract_options.get("min_quality", 5.0),
|
||||
extract_images=self.extract_options.get("extract_images", True),
|
||||
image_dir=f"{self.skill_dir}/assets/images",
|
||||
min_image_size=self.extract_options.get('min_image_size', 100)
|
||||
min_image_size=self.extract_options.get("min_image_size", 100),
|
||||
)
|
||||
|
||||
# Extract
|
||||
@@ -103,7 +103,7 @@ class PDFToSkillConverter:
|
||||
raise RuntimeError(f"Failed to extract PDF: {self.pdf_path}")
|
||||
|
||||
# Save extracted data
|
||||
with open(self.data_file, 'w', encoding='utf-8') as f:
|
||||
with open(self.data_file, "w", encoding="utf-8") as f:
|
||||
json.dump(result, f, indent=2, ensure_ascii=False)
|
||||
|
||||
print(f"\n💾 Saved extracted data to: {self.data_file}")
|
||||
@@ -114,7 +114,7 @@ class PDFToSkillConverter:
|
||||
"""Load previously extracted data from JSON"""
|
||||
print(f"\n📂 Loading extracted data from: {json_path}")
|
||||
|
||||
with open(json_path, 'r', encoding='utf-8') as f:
|
||||
with open(json_path, encoding="utf-8") as f:
|
||||
self.extracted_data = json.load(f)
|
||||
|
||||
print(f"✅ Loaded {self.extracted_data['total_pages']} pages")
|
||||
@@ -122,28 +122,25 @@ class PDFToSkillConverter:
|
||||
|
||||
def categorize_content(self):
|
||||
"""Categorize pages based on chapters or keywords"""
|
||||
print(f"\n📋 Categorizing content...")
|
||||
print("\n📋 Categorizing content...")
|
||||
|
||||
categorized = {}
|
||||
|
||||
# Use chapters if available
|
||||
if self.extracted_data.get('chapters'):
|
||||
for chapter in self.extracted_data['chapters']:
|
||||
category_key = self._sanitize_filename(chapter['title'])
|
||||
categorized[category_key] = {
|
||||
'title': chapter['title'],
|
||||
'pages': []
|
||||
}
|
||||
if self.extracted_data.get("chapters"):
|
||||
for chapter in self.extracted_data["chapters"]:
|
||||
category_key = self._sanitize_filename(chapter["title"])
|
||||
categorized[category_key] = {"title": chapter["title"], "pages": []}
|
||||
|
||||
# Assign pages to chapters
|
||||
for page in self.extracted_data['pages']:
|
||||
page_num = page['page_number']
|
||||
for page in self.extracted_data["pages"]:
|
||||
page_num = page["page_number"]
|
||||
|
||||
# Find which chapter this page belongs to
|
||||
for chapter in self.extracted_data['chapters']:
|
||||
if chapter['start_page'] <= page_num <= chapter['end_page']:
|
||||
category_key = self._sanitize_filename(chapter['title'])
|
||||
categorized[category_key]['pages'].append(page)
|
||||
for chapter in self.extracted_data["chapters"]:
|
||||
if chapter["start_page"] <= page_num <= chapter["end_page"]:
|
||||
category_key = self._sanitize_filename(chapter["title"])
|
||||
categorized[category_key]["pages"].append(page)
|
||||
break
|
||||
|
||||
# Fall back to keyword-based categorization
|
||||
@@ -154,31 +151,28 @@ class PDFToSkillConverter:
|
||||
if isinstance(first_value, list) and first_value and isinstance(first_value[0], dict):
|
||||
# Already categorized - convert to expected format
|
||||
for cat_key, pages in self.categories.items():
|
||||
categorized[cat_key] = {
|
||||
'title': cat_key.replace('_', ' ').title(),
|
||||
'pages': pages
|
||||
}
|
||||
categorized[cat_key] = {"title": cat_key.replace("_", " ").title(), "pages": pages}
|
||||
else:
|
||||
# Keyword-based categorization
|
||||
# Initialize categories
|
||||
for cat_key, keywords in self.categories.items():
|
||||
categorized[cat_key] = {
|
||||
'title': cat_key.replace('_', ' ').title(),
|
||||
'pages': []
|
||||
}
|
||||
categorized[cat_key] = {"title": cat_key.replace("_", " ").title(), "pages": []}
|
||||
|
||||
# Categorize by keywords
|
||||
for page in self.extracted_data['pages']:
|
||||
text = page.get('text', '').lower()
|
||||
headings_text = ' '.join([h['text'] for h in page.get('headings', [])]).lower()
|
||||
for page in self.extracted_data["pages"]:
|
||||
text = page.get("text", "").lower()
|
||||
headings_text = " ".join([h["text"] for h in page.get("headings", [])]).lower()
|
||||
|
||||
# Score against each category
|
||||
scores = {}
|
||||
for cat_key, keywords in self.categories.items():
|
||||
# Handle both string keywords and dict keywords (shouldn't happen, but be safe)
|
||||
if isinstance(keywords, list):
|
||||
score = sum(1 for kw in keywords
|
||||
if isinstance(kw, str) and (kw.lower() in text or kw.lower() in headings_text))
|
||||
score = sum(
|
||||
1
|
||||
for kw in keywords
|
||||
if isinstance(kw, str) and (kw.lower() in text or kw.lower() in headings_text)
|
||||
)
|
||||
else:
|
||||
score = 0
|
||||
if score > 0:
|
||||
@@ -187,19 +181,16 @@ class PDFToSkillConverter:
|
||||
# Assign to highest scoring category
|
||||
if scores:
|
||||
best_cat = max(scores, key=scores.get)
|
||||
categorized[best_cat]['pages'].append(page)
|
||||
categorized[best_cat]["pages"].append(page)
|
||||
else:
|
||||
# Default category
|
||||
if 'other' not in categorized:
|
||||
categorized['other'] = {'title': 'Other', 'pages': []}
|
||||
categorized['other']['pages'].append(page)
|
||||
if "other" not in categorized:
|
||||
categorized["other"] = {"title": "Other", "pages": []}
|
||||
categorized["other"]["pages"].append(page)
|
||||
|
||||
else:
|
||||
# No categorization - use single category
|
||||
categorized['content'] = {
|
||||
'title': 'Content',
|
||||
'pages': self.extracted_data['pages']
|
||||
}
|
||||
categorized["content"] = {"title": "Content", "pages": self.extracted_data["pages"]}
|
||||
|
||||
print(f"✅ Created {len(categorized)} categories")
|
||||
for cat_key, cat_data in categorized.items():
|
||||
@@ -220,7 +211,7 @@ class PDFToSkillConverter:
|
||||
categorized = self.categorize_content()
|
||||
|
||||
# Generate reference files
|
||||
print(f"\n📝 Generating reference files...")
|
||||
print("\n📝 Generating reference files...")
|
||||
for cat_key, cat_data in categorized.items():
|
||||
self._generate_reference_file(cat_key, cat_data)
|
||||
|
||||
@@ -237,42 +228,42 @@ class PDFToSkillConverter:
|
||||
"""Generate a reference markdown file for a category"""
|
||||
filename = f"{self.skill_dir}/references/{cat_key}.md"
|
||||
|
||||
with open(filename, 'w', encoding='utf-8') as f:
|
||||
with open(filename, "w", encoding="utf-8") as f:
|
||||
f.write(f"# {cat_data['title']}\n\n")
|
||||
|
||||
for page in cat_data['pages']:
|
||||
for page in cat_data["pages"]:
|
||||
# Add headings as section markers
|
||||
if page.get('headings'):
|
||||
if page.get("headings"):
|
||||
f.write(f"## {page['headings'][0]['text']}\n\n")
|
||||
|
||||
# Add text content
|
||||
if page.get('text'):
|
||||
if page.get("text"):
|
||||
# Limit to first 1000 chars per page to avoid huge files
|
||||
text = page['text'][:1000]
|
||||
text = page["text"][:1000]
|
||||
f.write(f"{text}\n\n")
|
||||
|
||||
# Add code samples (check both 'code_samples' and 'code_blocks' for compatibility)
|
||||
code_list = page.get('code_samples') or page.get('code_blocks')
|
||||
code_list = page.get("code_samples") or page.get("code_blocks")
|
||||
if code_list:
|
||||
f.write("### Code Examples\n\n")
|
||||
for code in code_list[:3]: # Limit to top 3
|
||||
lang = code.get('language', '')
|
||||
lang = code.get("language", "")
|
||||
f.write(f"```{lang}\n{code['code']}\n```\n\n")
|
||||
|
||||
# Add images
|
||||
if page.get('images'):
|
||||
if page.get("images"):
|
||||
# Create assets directory if needed
|
||||
assets_dir = os.path.join(self.skill_dir, 'assets')
|
||||
assets_dir = os.path.join(self.skill_dir, "assets")
|
||||
os.makedirs(assets_dir, exist_ok=True)
|
||||
|
||||
f.write("### Images\n\n")
|
||||
for img in page['images']:
|
||||
for img in page["images"]:
|
||||
# Save image to assets
|
||||
img_filename = f"page_{page['page_number']}_img_{img['index']}.png"
|
||||
img_path = os.path.join(assets_dir, img_filename)
|
||||
|
||||
with open(img_path, 'wb') as img_file:
|
||||
img_file.write(img['data'])
|
||||
with open(img_path, "wb") as img_file:
|
||||
img_file.write(img["data"])
|
||||
|
||||
# Add markdown image reference
|
||||
f.write(f"![Image {img['index']}](../assets/{img_filename})\n\n")
|
||||
@@ -285,16 +276,16 @@ class PDFToSkillConverter:
|
||||
"""Generate reference index"""
|
||||
filename = f"{self.skill_dir}/references/index.md"
|
||||
|
||||
with open(filename, 'w', encoding='utf-8') as f:
|
||||
with open(filename, "w", encoding="utf-8") as f:
|
||||
f.write(f"# {self.name.title()} Documentation Reference\n\n")
|
||||
f.write("## Categories\n\n")
|
||||
|
||||
for cat_key, cat_data in categorized.items():
|
||||
page_count = len(cat_data['pages'])
|
||||
page_count = len(cat_data["pages"])
|
||||
f.write(f"- [{cat_data['title']}]({cat_key}.md) ({page_count} pages)\n")
|
||||
|
||||
f.write("\n## Statistics\n\n")
|
||||
stats = self.extracted_data.get('quality_statistics', {})
|
||||
stats = self.extracted_data.get("quality_statistics", {})
|
||||
f.write(f"- Total pages: {self.extracted_data.get('total_pages', 0)}\n")
|
||||
f.write(f"- Code blocks: {self.extracted_data.get('total_code_blocks', 0)}\n")
|
||||
f.write(f"- Images: {self.extracted_data.get('total_images', 0)}\n")
|
||||
@@ -309,37 +300,37 @@ class PDFToSkillConverter:
|
||||
filename = f"{self.skill_dir}/SKILL.md"
|
||||
|
||||
# Generate skill name (lowercase, hyphens only, max 64 chars)
|
||||
skill_name = self.name.lower().replace('_', '-').replace(' ', '-')[:64]
|
||||
skill_name = self.name.lower().replace("_", "-").replace(" ", "-")[:64]
|
||||
|
||||
# Truncate description to 1024 chars if needed
|
||||
desc = self.description[:1024] if len(self.description) > 1024 else self.description
|
||||
|
||||
with open(filename, 'w', encoding='utf-8') as f:
|
||||
with open(filename, "w", encoding="utf-8") as f:
|
||||
# Write YAML frontmatter
|
||||
f.write(f"---\n")
|
||||
f.write("---\n")
|
||||
f.write(f"name: {skill_name}\n")
|
||||
f.write(f"description: {desc}\n")
|
||||
f.write(f"---\n\n")
|
||||
f.write("---\n\n")
|
||||
|
||||
f.write(f"# {self.name.title()} Documentation Skill\n\n")
|
||||
f.write(f"{self.description}\n\n")
|
||||
|
||||
# Enhanced "When to Use" section
|
||||
f.write("## 💡 When to Use This Skill\n\n")
|
||||
f.write(f"Use this skill when you need to:\n")
|
||||
f.write("Use this skill when you need to:\n")
|
||||
f.write(f"- Understand {self.name} concepts and fundamentals\n")
|
||||
f.write(f"- Look up API references and technical specifications\n")
|
||||
f.write(f"- Find code examples and implementation patterns\n")
|
||||
f.write(f"- Review tutorials, guides, and best practices\n")
|
||||
f.write(f"- Explore the complete documentation structure\n\n")
|
||||
f.write("- Look up API references and technical specifications\n")
|
||||
f.write("- Find code examples and implementation patterns\n")
|
||||
f.write("- Review tutorials, guides, and best practices\n")
|
||||
f.write("- Explore the complete documentation structure\n\n")
|
||||
|
||||
# Chapter Overview (PDF structure)
|
||||
f.write("## 📖 Chapter Overview\n\n")
|
||||
total_pages = self.extracted_data.get('total_pages', 0)
|
||||
total_pages = self.extracted_data.get("total_pages", 0)
|
||||
f.write(f"**Total Pages:** {total_pages}\n\n")
|
||||
f.write("**Content Breakdown:**\n\n")
|
||||
for cat_key, cat_data in categorized.items():
|
||||
page_count = len(cat_data['pages'])
|
||||
page_count = len(cat_data["pages"])
|
||||
f.write(f"- **{cat_data['title']}**: {page_count} pages\n")
|
||||
f.write("\n")
|
||||
|
||||
@@ -352,11 +343,11 @@ class PDFToSkillConverter:
|
||||
|
||||
# Enhanced code examples section (top 15, grouped by language)
|
||||
all_code = []
|
||||
for page in self.extracted_data['pages']:
|
||||
all_code.extend(page.get('code_samples', []))
|
||||
for page in self.extracted_data["pages"]:
|
||||
all_code.extend(page.get("code_samples", []))
|
||||
|
||||
# Sort by quality and get top 15
|
||||
all_code.sort(key=lambda x: x.get('quality_score', 0), reverse=True)
|
||||
all_code.sort(key=lambda x: x.get("quality_score", 0), reverse=True)
|
||||
top_code = all_code[:15]
|
||||
|
||||
if top_code:
|
||||
@@ -366,7 +357,7 @@ class PDFToSkillConverter:
|
||||
# Group by language
|
||||
by_lang = {}
|
||||
for code in top_code:
|
||||
lang = code.get('language', 'unknown')
|
||||
lang = code.get("language", "unknown")
|
||||
if lang not in by_lang:
|
||||
by_lang[lang] = []
|
||||
by_lang[lang].append(code)
|
||||
@@ -377,8 +368,8 @@ class PDFToSkillConverter:
|
||||
f.write(f"### {lang.title()} Examples ({len(examples)})\n\n")
|
||||
|
||||
for i, code in enumerate(examples[:5], 1): # Top 5 per language
|
||||
quality = code.get('quality_score', 0)
|
||||
code_text = code.get('code', '')
|
||||
quality = code.get("quality_score", 0)
|
||||
code_text = code.get("code", "")
|
||||
|
||||
f.write(f"**Example {i}** (Quality: {quality:.1f}/10):\n\n")
|
||||
f.write(f"```{lang}\n")
|
||||
@@ -394,13 +385,13 @@ class PDFToSkillConverter:
|
||||
# Statistics
|
||||
f.write("## 📊 Documentation Statistics\n\n")
|
||||
f.write(f"- **Total Pages**: {total_pages}\n")
|
||||
total_code_blocks = self.extracted_data.get('total_code_blocks', 0)
|
||||
total_code_blocks = self.extracted_data.get("total_code_blocks", 0)
|
||||
f.write(f"- **Code Blocks**: {total_code_blocks}\n")
|
||||
total_images = self.extracted_data.get('total_images', 0)
|
||||
total_images = self.extracted_data.get("total_images", 0)
|
||||
f.write(f"- **Images/Diagrams**: {total_images}\n")
|
||||
|
||||
# Language statistics
|
||||
langs = self.extracted_data.get('languages_detected', {})
|
||||
langs = self.extracted_data.get("languages_detected", {})
|
||||
if langs:
|
||||
f.write(f"- **Programming Languages**: {len(langs)}\n\n")
|
||||
f.write("**Language Breakdown:**\n\n")
|
||||
@@ -409,11 +400,11 @@ class PDFToSkillConverter:
|
||||
f.write("\n")
|
||||
|
||||
# Quality metrics
|
||||
quality_stats = self.extracted_data.get('quality_statistics', {})
|
||||
quality_stats = self.extracted_data.get("quality_statistics", {})
|
||||
if quality_stats:
|
||||
avg_quality = quality_stats.get('average_quality', 0)
|
||||
valid_blocks = quality_stats.get('valid_code_blocks', 0)
|
||||
f.write(f"**Code Quality:**\n\n")
|
||||
avg_quality = quality_stats.get("average_quality", 0)
|
||||
valid_blocks = quality_stats.get("valid_code_blocks", 0)
|
||||
f.write("**Code Quality:**\n\n")
|
||||
f.write(f"- Average Quality Score: {avg_quality:.1f}/10\n")
|
||||
f.write(f"- Valid Code Blocks: {valid_blocks}\n\n")
|
||||
|
||||
@@ -421,7 +412,7 @@ class PDFToSkillConverter:
|
||||
f.write("## 🗺️ Navigation\n\n")
|
||||
f.write("**Reference Files:**\n\n")
|
||||
for cat_key, cat_data in categorized.items():
|
||||
cat_file = self._sanitize_filename(cat_data['title'])
|
||||
cat_file = self._sanitize_filename(cat_data["title"])
|
||||
f.write(f"- `references/{cat_file}.md` - {cat_data['title']}\n")
|
||||
f.write("\n")
|
||||
f.write("See `references/index.md` for complete documentation structure.\n\n")
|
||||
@@ -430,18 +421,18 @@ class PDFToSkillConverter:
|
||||
f.write("---\n\n")
|
||||
f.write("**Generated by Skill Seeker** | PDF Documentation Scraper\n")
|
||||
|
||||
line_count = len(open(filename, 'r', encoding='utf-8').read().split('\n'))
|
||||
line_count = len(open(filename, encoding="utf-8").read().split("\n"))
|
||||
print(f" Generated: {filename} ({line_count} lines)")
|
||||
|
||||
def _format_key_concepts(self) -> str:
|
||||
"""Extract key concepts from headings across all pages."""
|
||||
all_headings = []
|
||||
|
||||
for page in self.extracted_data.get('pages', []):
|
||||
headings = page.get('headings', [])
|
||||
for page in self.extracted_data.get("pages", []):
|
||||
headings = page.get("headings", [])
|
||||
for heading in headings:
|
||||
text = heading.get('text', '').strip()
|
||||
level = heading.get('level', 'h1')
|
||||
text = heading.get("text", "").strip()
|
||||
level = heading.get("level", "h1")
|
||||
if text and len(text) > 3: # Skip very short headings
|
||||
all_headings.append((level, text))
|
||||
|
||||
@@ -452,8 +443,8 @@ class PDFToSkillConverter:
|
||||
content += "*Main topics covered in this documentation*\n\n"
|
||||
|
||||
# Group by level and show top concepts
|
||||
h1_headings = [text for level, text in all_headings if level == 'h1']
|
||||
h2_headings = [text for level, text in all_headings if level == 'h2']
|
||||
h1_headings = [text for level, text in all_headings if level == "h1"]
|
||||
h2_headings = [text for level, text in all_headings if level == "h2"]
|
||||
|
||||
if h1_headings:
|
||||
content += "**Major Topics:**\n\n"
|
||||
@@ -475,27 +466,31 @@ class PDFToSkillConverter:
|
||||
patterns = []
|
||||
|
||||
# Simple pattern extraction from headings and emphasized text
|
||||
for page in self.extracted_data.get('pages', []):
|
||||
text = page.get('text', '')
|
||||
headings = page.get('headings', [])
|
||||
for page in self.extracted_data.get("pages", []):
|
||||
text = page.get("text", "")
|
||||
headings = page.get("headings", [])
|
||||
|
||||
# Look for common pattern keywords in headings
|
||||
pattern_keywords = [
|
||||
'getting started', 'installation', 'configuration',
|
||||
'usage', 'api', 'examples', 'tutorial', 'guide',
|
||||
'best practices', 'troubleshooting', 'faq'
|
||||
"getting started",
|
||||
"installation",
|
||||
"configuration",
|
||||
"usage",
|
||||
"api",
|
||||
"examples",
|
||||
"tutorial",
|
||||
"guide",
|
||||
"best practices",
|
||||
"troubleshooting",
|
||||
"faq",
|
||||
]
|
||||
|
||||
for heading in headings:
|
||||
heading_text = heading.get('text', '').lower()
|
||||
heading_text = heading.get("text", "").lower()
|
||||
for keyword in pattern_keywords:
|
||||
if keyword in heading_text:
|
||||
page_num = page.get('page_number', 0)
|
||||
patterns.append({
|
||||
'type': keyword.title(),
|
||||
'heading': heading.get('text', ''),
|
||||
'page': page_num
|
||||
})
|
||||
page_num = page.get("page_number", 0)
|
||||
patterns.append({"type": keyword.title(), "heading": heading.get("text", ""), "page": page_num})
|
||||
break # Only add once per heading
|
||||
|
||||
if not patterns:
|
||||
@@ -506,7 +501,7 @@ class PDFToSkillConverter:
|
||||
# Group by type
|
||||
by_type = {}
|
||||
for pattern in patterns:
|
||||
ptype = pattern['type']
|
||||
ptype = pattern["type"]
|
||||
if ptype not in by_type:
|
||||
by_type[ptype] = []
|
||||
by_type[ptype].append(pattern)
|
||||
@@ -524,22 +519,21 @@ class PDFToSkillConverter:
|
||||
def _sanitize_filename(self, name):
|
||||
"""Convert string to safe filename"""
|
||||
# Remove special chars, replace spaces with underscores
|
||||
safe = re.sub(r'[^\w\s-]', '', name.lower())
|
||||
safe = re.sub(r'[-\s]+', '_', safe)
|
||||
safe = re.sub(r"[^\w\s-]", "", name.lower())
|
||||
safe = re.sub(r"[-\s]+", "_", safe)
|
||||
return safe
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(
|
||||
description='Convert PDF documentation to Claude skill',
|
||||
formatter_class=argparse.RawDescriptionHelpFormatter
|
||||
description="Convert PDF documentation to Claude skill", formatter_class=argparse.RawDescriptionHelpFormatter
|
||||
)
|
||||
|
||||
parser.add_argument('--config', help='PDF config JSON file')
|
||||
parser.add_argument('--pdf', help='Direct PDF file path')
|
||||
parser.add_argument('--name', help='Skill name (with --pdf)')
|
||||
parser.add_argument('--from-json', help='Build skill from extracted JSON')
|
||||
parser.add_argument('--description', help='Skill description')
|
||||
parser.add_argument("--config", help="PDF config JSON file")
|
||||
parser.add_argument("--pdf", help="Direct PDF file path")
|
||||
parser.add_argument("--name", help="Skill name (with --pdf)")
|
||||
parser.add_argument("--from-json", help="Build skill from extracted JSON")
|
||||
parser.add_argument("--description", help="Skill description")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
@@ -549,15 +543,12 @@ def main():
|
||||
|
||||
# Load or create config
|
||||
if args.config:
|
||||
with open(args.config, 'r') as f:
|
||||
with open(args.config) as f:
|
||||
config = json.load(f)
|
||||
elif args.from_json:
|
||||
# Build from extracted JSON
|
||||
name = Path(args.from_json).stem.replace('_extracted', '')
|
||||
config = {
|
||||
'name': name,
|
||||
'description': args.description or f'Use when referencing {name} documentation'
|
||||
}
|
||||
name = Path(args.from_json).stem.replace("_extracted", "")
|
||||
config = {"name": name, "description": args.description or f"Use when referencing {name} documentation"}
|
||||
converter = PDFToSkillConverter(config)
|
||||
converter.load_extracted_data(args.from_json)
|
||||
converter.build_skill()
|
||||
@@ -567,22 +558,17 @@ def main():
|
||||
if not args.name:
|
||||
parser.error("Must specify --name with --pdf")
|
||||
config = {
|
||||
'name': args.name,
|
||||
'pdf_path': args.pdf,
|
||||
'description': args.description or f'Use when referencing {args.name} documentation',
|
||||
'extract_options': {
|
||||
'chunk_size': 10,
|
||||
'min_quality': 5.0,
|
||||
'extract_images': True,
|
||||
'min_image_size': 100
|
||||
}
|
||||
"name": args.name,
|
||||
"pdf_path": args.pdf,
|
||||
"description": args.description or f"Use when referencing {args.name} documentation",
|
||||
"extract_options": {"chunk_size": 10, "min_quality": 5.0, "extract_images": True, "min_image_size": 100},
|
||||
}
|
||||
|
||||
# Create converter
|
||||
converter = PDFToSkillConverter(config)
|
||||
|
||||
# Extract if needed
|
||||
if config.get('pdf_path'):
|
||||
if config.get("pdf_path"):
|
||||
if not converter.extract_pdf():
|
||||
sys.exit(1)
|
||||
|
||||
@@ -590,5 +576,5 @@ def main():
|
||||
converter.build_skill()
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -8,44 +8,44 @@ Usage:
|
||||
python3 quality_checker.py output/godot/ --verbose
|
||||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Tuple, Optional
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
@dataclass
|
||||
class QualityIssue:
|
||||
"""Represents a quality issue found during validation."""
|
||||
|
||||
level: str # 'error', 'warning', 'info'
|
||||
category: str # 'enhancement', 'content', 'links', 'structure'
|
||||
message: str
|
||||
file: Optional[str] = None
|
||||
line: Optional[int] = None
|
||||
file: str | None = None
|
||||
line: int | None = None
|
||||
|
||||
|
||||
@dataclass
|
||||
class QualityReport:
|
||||
"""Complete quality report for a skill."""
|
||||
|
||||
skill_name: str
|
||||
skill_path: Path
|
||||
errors: List[QualityIssue] = field(default_factory=list)
|
||||
warnings: List[QualityIssue] = field(default_factory=list)
|
||||
info: List[QualityIssue] = field(default_factory=list)
|
||||
errors: list[QualityIssue] = field(default_factory=list)
|
||||
warnings: list[QualityIssue] = field(default_factory=list)
|
||||
info: list[QualityIssue] = field(default_factory=list)
|
||||
|
||||
def add_error(self, category: str, message: str, file: str = None, line: int = None):
|
||||
"""Add an error to the report."""
|
||||
self.errors.append(QualityIssue('error', category, message, file, line))
|
||||
self.errors.append(QualityIssue("error", category, message, file, line))
|
||||
|
||||
def add_warning(self, category: str, message: str, file: str = None, line: int = None):
|
||||
"""Add a warning to the report."""
|
||||
self.warnings.append(QualityIssue('warning', category, message, file, line))
|
||||
self.warnings.append(QualityIssue("warning", category, message, file, line))
|
||||
|
||||
def add_info(self, category: str, message: str, file: str = None, line: int = None):
|
||||
"""Add info to the report."""
|
||||
self.info.append(QualityIssue('info', category, message, file, line))
|
||||
self.info.append(QualityIssue("info", category, message, file, line))
|
||||
|
||||
@property
|
||||
def has_errors(self) -> bool:
|
||||
@@ -80,15 +80,15 @@ class QualityReport:
|
||||
"""Get quality grade (A-F)."""
|
||||
score = self.quality_score
|
||||
if score >= 90:
|
||||
return 'A'
|
||||
return "A"
|
||||
elif score >= 80:
|
||||
return 'B'
|
||||
return "B"
|
||||
elif score >= 70:
|
||||
return 'C'
|
||||
return "C"
|
||||
elif score >= 60:
|
||||
return 'D'
|
||||
return "D"
|
||||
else:
|
||||
return 'F'
|
||||
return "F"
|
||||
|
||||
|
||||
class SkillQualityChecker:
|
||||
@@ -103,10 +103,7 @@ class SkillQualityChecker:
|
||||
self.skill_dir = Path(skill_dir)
|
||||
self.skill_md_path = self.skill_dir / "SKILL.md"
|
||||
self.references_dir = self.skill_dir / "references"
|
||||
self.report = QualityReport(
|
||||
skill_name=self.skill_dir.name,
|
||||
skill_path=self.skill_dir
|
||||
)
|
||||
self.report = QualityReport(skill_name=self.skill_dir.name, skill_path=self.skill_dir)
|
||||
|
||||
def check_all(self) -> QualityReport:
|
||||
"""Run all quality checks and return report.
|
||||
@@ -135,25 +132,19 @@ class SkillQualityChecker:
|
||||
"""Check basic skill structure."""
|
||||
# Check SKILL.md exists
|
||||
if not self.skill_md_path.exists():
|
||||
self.report.add_error(
|
||||
'structure',
|
||||
'SKILL.md file not found',
|
||||
str(self.skill_md_path)
|
||||
)
|
||||
self.report.add_error("structure", "SKILL.md file not found", str(self.skill_md_path))
|
||||
return
|
||||
|
||||
# Check references directory exists
|
||||
if not self.references_dir.exists():
|
||||
self.report.add_warning(
|
||||
'structure',
|
||||
'references/ directory not found - skill may be incomplete',
|
||||
str(self.references_dir)
|
||||
"structure", "references/ directory not found - skill may be incomplete", str(self.references_dir)
|
||||
)
|
||||
elif not list(self.references_dir.rglob('*.md')):
|
||||
elif not list(self.references_dir.rglob("*.md")):
|
||||
self.report.add_warning(
|
||||
'structure',
|
||||
'references/ directory is empty - no reference documentation found',
|
||||
str(self.references_dir)
|
||||
"structure",
|
||||
"references/ directory is empty - no reference documentation found",
|
||||
str(self.references_dir),
|
||||
)
|
||||
|
||||
def _check_enhancement_quality(self):
|
||||
@@ -161,7 +152,7 @@ class SkillQualityChecker:
|
||||
if not self.skill_md_path.exists():
|
||||
return
|
||||
|
||||
content = self.skill_md_path.read_text(encoding='utf-8')
|
||||
content = self.skill_md_path.read_text(encoding="utf-8")
|
||||
|
||||
# Check for template indicators (signs it wasn't enhanced)
|
||||
template_indicators = [
|
||||
@@ -174,140 +165,90 @@ class SkillQualityChecker:
|
||||
for indicator in template_indicators:
|
||||
if indicator.lower() in content.lower():
|
||||
self.report.add_warning(
|
||||
'enhancement',
|
||||
"enhancement",
|
||||
f'Found template placeholder: "{indicator}" - SKILL.md may not be enhanced',
|
||||
'SKILL.md'
|
||||
"SKILL.md",
|
||||
)
|
||||
|
||||
# Check for good signs of enhancement
|
||||
enhancement_indicators = {
|
||||
'code_examples': re.compile(r'```[\w-]+\n', re.MULTILINE),
|
||||
'real_examples': re.compile(r'Example:', re.IGNORECASE),
|
||||
'sections': re.compile(r'^## .+', re.MULTILINE),
|
||||
"code_examples": re.compile(r"```[\w-]+\n", re.MULTILINE),
|
||||
"real_examples": re.compile(r"Example:", re.IGNORECASE),
|
||||
"sections": re.compile(r"^## .+", re.MULTILINE),
|
||||
}
|
||||
|
||||
code_blocks = len(enhancement_indicators['code_examples'].findall(content))
|
||||
real_examples = len(enhancement_indicators['real_examples'].findall(content))
|
||||
sections = len(enhancement_indicators['sections'].findall(content))
|
||||
code_blocks = len(enhancement_indicators["code_examples"].findall(content))
|
||||
real_examples = len(enhancement_indicators["real_examples"].findall(content))
|
||||
sections = len(enhancement_indicators["sections"].findall(content))
|
||||
|
||||
# Quality thresholds
|
||||
if code_blocks == 0:
|
||||
self.report.add_warning(
|
||||
'enhancement',
|
||||
'No code examples found in SKILL.md - consider enhancing',
|
||||
'SKILL.md'
|
||||
"enhancement", "No code examples found in SKILL.md - consider enhancing", "SKILL.md"
|
||||
)
|
||||
elif code_blocks < 3:
|
||||
self.report.add_info(
|
||||
'enhancement',
|
||||
f'Only {code_blocks} code examples found - more examples would improve quality',
|
||||
'SKILL.md'
|
||||
"enhancement",
|
||||
f"Only {code_blocks} code examples found - more examples would improve quality",
|
||||
"SKILL.md",
|
||||
)
|
||||
else:
|
||||
self.report.add_info(
|
||||
'enhancement',
|
||||
f'✓ Found {code_blocks} code examples',
|
||||
'SKILL.md'
|
||||
)
|
||||
self.report.add_info("enhancement", f"✓ Found {code_blocks} code examples", "SKILL.md")
|
||||
|
||||
if sections < 4:
|
||||
self.report.add_warning(
|
||||
'enhancement',
|
||||
f'Only {sections} sections found - SKILL.md may be too basic',
|
||||
'SKILL.md'
|
||||
"enhancement", f"Only {sections} sections found - SKILL.md may be too basic", "SKILL.md"
|
||||
)
|
||||
else:
|
||||
self.report.add_info(
|
||||
'enhancement',
|
||||
f'✓ Found {sections} sections',
|
||||
'SKILL.md'
|
||||
)
|
||||
self.report.add_info("enhancement", f"✓ Found {sections} sections", "SKILL.md")
|
||||
|
||||
def _check_content_quality(self):
|
||||
"""Check content quality."""
|
||||
if not self.skill_md_path.exists():
|
||||
return
|
||||
|
||||
content = self.skill_md_path.read_text(encoding='utf-8')
|
||||
content = self.skill_md_path.read_text(encoding="utf-8")
|
||||
|
||||
# Check YAML frontmatter
|
||||
if not content.startswith('---'):
|
||||
self.report.add_error(
|
||||
'content',
|
||||
'Missing YAML frontmatter - SKILL.md must start with ---',
|
||||
'SKILL.md',
|
||||
1
|
||||
)
|
||||
if not content.startswith("---"):
|
||||
self.report.add_error("content", "Missing YAML frontmatter - SKILL.md must start with ---", "SKILL.md", 1)
|
||||
else:
|
||||
# Extract frontmatter
|
||||
try:
|
||||
frontmatter_match = re.match(r'^---\n(.*?)\n---', content, re.DOTALL)
|
||||
frontmatter_match = re.match(r"^---\n(.*?)\n---", content, re.DOTALL)
|
||||
if frontmatter_match:
|
||||
frontmatter = frontmatter_match.group(1)
|
||||
|
||||
# Check for required fields
|
||||
if 'name:' not in frontmatter:
|
||||
self.report.add_error(
|
||||
'content',
|
||||
'Missing "name:" field in YAML frontmatter',
|
||||
'SKILL.md',
|
||||
2
|
||||
)
|
||||
if "name:" not in frontmatter:
|
||||
self.report.add_error("content", 'Missing "name:" field in YAML frontmatter', "SKILL.md", 2)
|
||||
|
||||
# Check for description
|
||||
if 'description:' in frontmatter:
|
||||
self.report.add_info(
|
||||
'content',
|
||||
'✓ YAML frontmatter includes description',
|
||||
'SKILL.md'
|
||||
)
|
||||
if "description:" in frontmatter:
|
||||
self.report.add_info("content", "✓ YAML frontmatter includes description", "SKILL.md")
|
||||
else:
|
||||
self.report.add_error(
|
||||
'content',
|
||||
'Invalid YAML frontmatter format',
|
||||
'SKILL.md',
|
||||
1
|
||||
)
|
||||
self.report.add_error("content", "Invalid YAML frontmatter format", "SKILL.md", 1)
|
||||
except Exception as e:
|
||||
self.report.add_error(
|
||||
'content',
|
||||
f'Error parsing YAML frontmatter: {e}',
|
||||
'SKILL.md',
|
||||
1
|
||||
)
|
||||
self.report.add_error("content", f"Error parsing YAML frontmatter: {e}", "SKILL.md", 1)
|
||||
|
||||
# Check code block language tags
|
||||
code_blocks_without_lang = re.findall(r'```\n[^`]', content)
|
||||
code_blocks_without_lang = re.findall(r"```\n[^`]", content)
|
||||
if code_blocks_without_lang:
|
||||
self.report.add_warning(
|
||||
'content',
|
||||
f'Found {len(code_blocks_without_lang)} code blocks without language tags',
|
||||
'SKILL.md'
|
||||
"content", f"Found {len(code_blocks_without_lang)} code blocks without language tags", "SKILL.md"
|
||||
)
|
||||
|
||||
# Check for "When to Use" section
|
||||
if 'when to use' not in content.lower():
|
||||
self.report.add_warning(
|
||||
'content',
|
||||
'Missing "When to Use This Skill" section',
|
||||
'SKILL.md'
|
||||
)
|
||||
if "when to use" not in content.lower():
|
||||
self.report.add_warning("content", 'Missing "When to Use This Skill" section', "SKILL.md")
|
||||
else:
|
||||
self.report.add_info(
|
||||
'content',
|
||||
'✓ Found "When to Use" section',
|
||||
'SKILL.md'
|
||||
)
|
||||
self.report.add_info("content", '✓ Found "When to Use" section', "SKILL.md")
|
||||
|
||||
# Check reference files
|
||||
if self.references_dir.exists():
|
||||
ref_files = list(self.references_dir.rglob('*.md'))
|
||||
ref_files = list(self.references_dir.rglob("*.md"))
|
||||
if ref_files:
|
||||
self.report.add_info(
|
||||
'content',
|
||||
f'✓ Found {len(ref_files)} reference files',
|
||||
'references/'
|
||||
)
|
||||
self.report.add_info("content", f"✓ Found {len(ref_files)} reference files", "references/")
|
||||
|
||||
# Check if references are mentioned in SKILL.md
|
||||
mentioned_refs = 0
|
||||
@@ -317,9 +258,7 @@ class SkillQualityChecker:
|
||||
|
||||
if mentioned_refs == 0:
|
||||
self.report.add_warning(
|
||||
'content',
|
||||
'Reference files exist but none are mentioned in SKILL.md',
|
||||
'SKILL.md'
|
||||
"content", "Reference files exist but none are mentioned in SKILL.md", "SKILL.md"
|
||||
)
|
||||
|
||||
def _check_links(self):
|
||||
@@ -327,21 +266,21 @@ class SkillQualityChecker:
|
||||
if not self.skill_md_path.exists():
|
||||
return
|
||||
|
||||
content = self.skill_md_path.read_text(encoding='utf-8')
|
||||
content = self.skill_md_path.read_text(encoding="utf-8")
|
||||
|
||||
# Find all markdown links [text](path)
|
||||
link_pattern = re.compile(r'\[([^\]]+)\]\(([^)]+)\)')
|
||||
link_pattern = re.compile(r"\[([^\]]+)\]\(([^)]+)\)")
|
||||
links = link_pattern.findall(content)
|
||||
|
||||
broken_links = []
|
||||
|
||||
for text, link in links:
|
||||
# Skip external links (http/https)
|
||||
if link.startswith('http://') or link.startswith('https://'):
|
||||
if link.startswith("http://") or link.startswith("https://"):
|
||||
continue
|
||||
|
||||
# Skip anchor links
|
||||
if link.startswith('#'):
|
||||
if link.startswith("#"):
|
||||
continue
|
||||
|
||||
# Check if file exists (relative to SKILL.md)
|
||||
@@ -351,20 +290,12 @@ class SkillQualityChecker:
|
||||
|
||||
if broken_links:
|
||||
for text, link in broken_links:
|
||||
self.report.add_warning(
|
||||
'links',
|
||||
f'Broken link: [{text}]({link})',
|
||||
'SKILL.md'
|
||||
)
|
||||
self.report.add_warning("links", f"Broken link: [{text}]({link})", "SKILL.md")
|
||||
else:
|
||||
if links:
|
||||
internal_links = [l for t, l in links if not l.startswith('http')]
|
||||
internal_links = [l for t, l in links if not l.startswith("http")]
|
||||
if internal_links:
|
||||
self.report.add_info(
|
||||
'links',
|
||||
f'✓ All {len(internal_links)} internal links are valid',
|
||||
'SKILL.md'
|
||||
)
|
||||
self.report.add_info("links", f"✓ All {len(internal_links)} internal links are valid", "SKILL.md")
|
||||
|
||||
def _check_skill_completeness(self):
|
||||
"""Check skill completeness based on best practices.
|
||||
@@ -375,83 +306,61 @@ class SkillQualityChecker:
|
||||
if not self.skill_md_path.exists():
|
||||
return
|
||||
|
||||
content = self.skill_md_path.read_text(encoding='utf-8')
|
||||
content = self.skill_md_path.read_text(encoding="utf-8")
|
||||
|
||||
# Check for grounding/verification section (prerequisites)
|
||||
grounding_patterns = [
|
||||
r'before\s+(executing|running|proceeding|you\s+start)',
|
||||
r'verify\s+that',
|
||||
r'prerequisites?',
|
||||
r'requirements?:',
|
||||
r'make\s+sure\s+you\s+have',
|
||||
r"before\s+(executing|running|proceeding|you\s+start)",
|
||||
r"verify\s+that",
|
||||
r"prerequisites?",
|
||||
r"requirements?:",
|
||||
r"make\s+sure\s+you\s+have",
|
||||
]
|
||||
has_grounding = any(
|
||||
re.search(pattern, content, re.IGNORECASE)
|
||||
for pattern in grounding_patterns
|
||||
)
|
||||
has_grounding = any(re.search(pattern, content, re.IGNORECASE) for pattern in grounding_patterns)
|
||||
if has_grounding:
|
||||
self.report.add_info(
|
||||
'completeness',
|
||||
'✓ Found verification/prerequisites section',
|
||||
'SKILL.md'
|
||||
)
|
||||
self.report.add_info("completeness", "✓ Found verification/prerequisites section", "SKILL.md")
|
||||
else:
|
||||
self.report.add_info(
|
||||
'completeness',
|
||||
'Consider adding prerequisites section - helps Claude verify conditions first',
|
||||
'SKILL.md'
|
||||
"completeness",
|
||||
"Consider adding prerequisites section - helps Claude verify conditions first",
|
||||
"SKILL.md",
|
||||
)
|
||||
|
||||
# Check for error handling/troubleshooting guidance
|
||||
error_patterns = [
|
||||
r'if\s+.*\s+(fails?|errors?)',
|
||||
r'troubleshoot',
|
||||
r'common\s+(issues?|problems?)',
|
||||
r'error\s+handling',
|
||||
r'when\s+things\s+go\s+wrong',
|
||||
r"if\s+.*\s+(fails?|errors?)",
|
||||
r"troubleshoot",
|
||||
r"common\s+(issues?|problems?)",
|
||||
r"error\s+handling",
|
||||
r"when\s+things\s+go\s+wrong",
|
||||
]
|
||||
has_error_handling = any(
|
||||
re.search(pattern, content, re.IGNORECASE)
|
||||
for pattern in error_patterns
|
||||
)
|
||||
has_error_handling = any(re.search(pattern, content, re.IGNORECASE) for pattern in error_patterns)
|
||||
if has_error_handling:
|
||||
self.report.add_info(
|
||||
'completeness',
|
||||
'✓ Found error handling/troubleshooting guidance',
|
||||
'SKILL.md'
|
||||
)
|
||||
self.report.add_info("completeness", "✓ Found error handling/troubleshooting guidance", "SKILL.md")
|
||||
else:
|
||||
self.report.add_info(
|
||||
'completeness',
|
||||
'Consider adding troubleshooting section for common issues',
|
||||
'SKILL.md'
|
||||
"completeness", "Consider adding troubleshooting section for common issues", "SKILL.md"
|
||||
)
|
||||
|
||||
# Check for workflow steps (numbered or sequential indicators)
|
||||
step_patterns = [
|
||||
r'step\s+\d',
|
||||
r'##\s+\d\.',
|
||||
r'first,?\s+',
|
||||
r'then,?\s+',
|
||||
r'finally,?\s+',
|
||||
r'next,?\s+',
|
||||
r"step\s+\d",
|
||||
r"##\s+\d\.",
|
||||
r"first,?\s+",
|
||||
r"then,?\s+",
|
||||
r"finally,?\s+",
|
||||
r"next,?\s+",
|
||||
]
|
||||
steps_found = sum(
|
||||
1 for pattern in step_patterns
|
||||
if re.search(pattern, content, re.IGNORECASE)
|
||||
)
|
||||
steps_found = sum(1 for pattern in step_patterns if re.search(pattern, content, re.IGNORECASE))
|
||||
if steps_found >= 3:
|
||||
self.report.add_info(
|
||||
'completeness',
|
||||
f'✓ Found clear workflow indicators ({steps_found} step markers)',
|
||||
'SKILL.md'
|
||||
"completeness", f"✓ Found clear workflow indicators ({steps_found} step markers)", "SKILL.md"
|
||||
)
|
||||
elif steps_found > 0:
|
||||
self.report.add_info(
|
||||
'completeness',
|
||||
f'Some workflow guidance found ({steps_found} markers) - '
|
||||
'consider adding numbered steps for clarity',
|
||||
'SKILL.md'
|
||||
"completeness",
|
||||
f"Some workflow guidance found ({steps_found} markers) - consider adding numbered steps for clarity",
|
||||
"SKILL.md",
|
||||
)
|
||||
|
||||
|
||||
@@ -475,7 +384,13 @@ def print_report(report: QualityReport, verbose: bool = False):
|
||||
if report.errors:
|
||||
print(f"❌ ERRORS ({len(report.errors)}):")
|
||||
for issue in report.errors:
|
||||
location = f" ({issue.file}:{issue.line})" if issue.file and issue.line else f" ({issue.file})" if issue.file else ""
|
||||
location = (
|
||||
f" ({issue.file}:{issue.line})"
|
||||
if issue.file and issue.line
|
||||
else f" ({issue.file})"
|
||||
if issue.file
|
||||
else ""
|
||||
)
|
||||
print(f" [{issue.category}] {issue.message}{location}")
|
||||
print()
|
||||
|
||||
@@ -483,7 +398,13 @@ def print_report(report: QualityReport, verbose: bool = False):
|
||||
if report.warnings:
|
||||
print(f"⚠️ WARNINGS ({len(report.warnings)}):")
|
||||
for issue in report.warnings:
|
||||
location = f" ({issue.file}:{issue.line})" if issue.file and issue.line else f" ({issue.file})" if issue.file else ""
|
||||
location = (
|
||||
f" ({issue.file}:{issue.line})"
|
||||
if issue.file and issue.line
|
||||
else f" ({issue.file})"
|
||||
if issue.file
|
||||
else ""
|
||||
)
|
||||
print(f" [{issue.category}] {issue.message}{location}")
|
||||
print()
|
||||
|
||||
@@ -523,25 +444,14 @@ Examples:
|
||||
|
||||
# Exit with error code if issues found
|
||||
python3 quality_checker.py output/django/ --strict
|
||||
"""
|
||||
""",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'skill_directory',
|
||||
help='Path to skill directory (e.g., output/react/)'
|
||||
)
|
||||
parser.add_argument("skill_directory", help="Path to skill directory (e.g., output/react/)")
|
||||
|
||||
parser.add_argument(
|
||||
'--verbose', '-v',
|
||||
action='store_true',
|
||||
help='Show all info messages'
|
||||
)
|
||||
parser.add_argument("--verbose", "-v", action="store_true", help="Show all info messages")
|
||||
|
||||
parser.add_argument(
|
||||
'--strict',
|
||||
action='store_true',
|
||||
help='Exit with error code if any warnings or errors found'
|
||||
)
|
||||
parser.add_argument("--strict", action="store_true", help="Exit with error code if any warnings or errors found")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
@@ -559,9 +469,7 @@ Examples:
|
||||
print_report(report, verbose=args.verbose)
|
||||
|
||||
# Exit code
|
||||
if args.strict and (report.has_errors or report.has_warnings):
|
||||
sys.exit(1)
|
||||
elif report.has_errors:
|
||||
if args.strict and (report.has_errors or report.has_warnings) or report.has_errors:
|
||||
sys.exit(1)
|
||||
else:
|
||||
sys.exit(0)
|
||||
|
||||
@@ -9,16 +9,19 @@ Handles GitHub API rate limits with smart strategies:
|
||||
- Non-interactive mode for CI/CD
|
||||
"""
|
||||
|
||||
import time
|
||||
import sys
|
||||
from datetime import datetime, timedelta
|
||||
from typing import Optional, Dict, Any
|
||||
import time
|
||||
from datetime import datetime
|
||||
from typing import Any
|
||||
|
||||
import requests
|
||||
|
||||
from .config_manager import get_config_manager
|
||||
|
||||
|
||||
class RateLimitError(Exception):
|
||||
"""Raised when rate limit is exceeded and cannot be handled."""
|
||||
|
||||
pass
|
||||
|
||||
|
||||
@@ -43,10 +46,10 @@ class RateLimitHandler:
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
token: Optional[str] = None,
|
||||
token: str | None = None,
|
||||
interactive: bool = True,
|
||||
profile_name: Optional[str] = None,
|
||||
auto_switch: bool = True
|
||||
profile_name: str | None = None,
|
||||
auto_switch: bool = True,
|
||||
):
|
||||
"""
|
||||
Initialize rate limit handler.
|
||||
@@ -91,7 +94,7 @@ class RateLimitHandler:
|
||||
|
||||
if self.interactive:
|
||||
response = input("Continue without token? [Y/n]: ").strip().lower()
|
||||
if response in ['n', 'no']:
|
||||
if response in ["n", "no"]:
|
||||
print("\n✅ Run 'skill-seekers config --github' to set up a token.\n")
|
||||
return False
|
||||
|
||||
@@ -100,12 +103,12 @@ class RateLimitHandler:
|
||||
# Check current rate limit status
|
||||
try:
|
||||
rate_info = self.get_rate_limit_info()
|
||||
remaining = rate_info.get('remaining', 0)
|
||||
limit = rate_info.get('limit', 5000)
|
||||
remaining = rate_info.get("remaining", 0)
|
||||
limit = rate_info.get("limit", 5000)
|
||||
|
||||
if remaining == 0:
|
||||
print(f"\n⚠️ Warning: GitHub rate limit already exhausted (0/{limit})")
|
||||
reset_time = rate_info.get('reset_time')
|
||||
reset_time = rate_info.get("reset_time")
|
||||
if reset_time:
|
||||
wait_minutes = (reset_time - datetime.now()).total_seconds() / 60
|
||||
print(f" Resets in {int(wait_minutes)} minutes")
|
||||
@@ -146,9 +149,9 @@ class RateLimitHandler:
|
||||
if response.status_code == 403:
|
||||
try:
|
||||
error_data = response.json()
|
||||
message = error_data.get('message', '')
|
||||
message = error_data.get("message", "")
|
||||
|
||||
if 'rate limit' in message.lower() or 'api rate limit exceeded' in message.lower():
|
||||
if "rate limit" in message.lower() or "api rate limit exceeded" in message.lower():
|
||||
# Extract rate limit info from headers
|
||||
rate_info = self.extract_rate_limit_info(response)
|
||||
return self.handle_rate_limit(rate_info)
|
||||
@@ -158,7 +161,7 @@ class RateLimitHandler:
|
||||
|
||||
return True
|
||||
|
||||
def extract_rate_limit_info(self, response: requests.Response) -> Dict[str, Any]:
|
||||
def extract_rate_limit_info(self, response: requests.Response) -> dict[str, Any]:
|
||||
"""
|
||||
Extract rate limit information from response headers.
|
||||
|
||||
@@ -170,20 +173,15 @@ class RateLimitHandler:
|
||||
"""
|
||||
headers = response.headers
|
||||
|
||||
limit = int(headers.get('X-RateLimit-Limit', 0))
|
||||
remaining = int(headers.get('X-RateLimit-Remaining', 0))
|
||||
reset_timestamp = int(headers.get('X-RateLimit-Reset', 0))
|
||||
limit = int(headers.get("X-RateLimit-Limit", 0))
|
||||
remaining = int(headers.get("X-RateLimit-Remaining", 0))
|
||||
reset_timestamp = int(headers.get("X-RateLimit-Reset", 0))
|
||||
|
||||
reset_time = datetime.fromtimestamp(reset_timestamp) if reset_timestamp else None
|
||||
|
||||
return {
|
||||
'limit': limit,
|
||||
'remaining': remaining,
|
||||
'reset_timestamp': reset_timestamp,
|
||||
'reset_time': reset_time
|
||||
}
|
||||
return {"limit": limit, "remaining": remaining, "reset_timestamp": reset_timestamp, "reset_time": reset_time}
|
||||
|
||||
def get_rate_limit_info(self) -> Dict[str, Any]:
|
||||
def get_rate_limit_info(self) -> dict[str, Any]:
|
||||
"""
|
||||
Get current rate limit status from GitHub API.
|
||||
|
||||
@@ -193,25 +191,25 @@ class RateLimitHandler:
|
||||
url = "https://api.github.com/rate_limit"
|
||||
headers = {}
|
||||
if self.token:
|
||||
headers['Authorization'] = f'token {self.token}'
|
||||
headers["Authorization"] = f"token {self.token}"
|
||||
|
||||
response = requests.get(url, headers=headers, timeout=5)
|
||||
response.raise_for_status()
|
||||
|
||||
data = response.json()
|
||||
core = data.get('rate', {})
|
||||
core = data.get("rate", {})
|
||||
|
||||
reset_timestamp = core.get('reset', 0)
|
||||
reset_timestamp = core.get("reset", 0)
|
||||
reset_time = datetime.fromtimestamp(reset_timestamp) if reset_timestamp else None
|
||||
|
||||
return {
|
||||
'limit': core.get('limit', 0),
|
||||
'remaining': core.get('remaining', 0),
|
||||
'reset_timestamp': reset_timestamp,
|
||||
'reset_time': reset_time
|
||||
"limit": core.get("limit", 0),
|
||||
"remaining": core.get("remaining", 0),
|
||||
"reset_timestamp": reset_timestamp,
|
||||
"reset_time": reset_time,
|
||||
}
|
||||
|
||||
def handle_rate_limit(self, rate_info: Dict[str, Any]) -> bool:
|
||||
def handle_rate_limit(self, rate_info: dict[str, Any]) -> bool:
|
||||
"""
|
||||
Handle rate limit based on strategy.
|
||||
|
||||
@@ -224,11 +222,11 @@ class RateLimitHandler:
|
||||
Raises:
|
||||
RateLimitError: If cannot handle in non-interactive mode
|
||||
"""
|
||||
reset_time = rate_info.get('reset_time')
|
||||
remaining = rate_info.get('remaining', 0)
|
||||
limit = rate_info.get('limit', 0)
|
||||
reset_time = rate_info.get("reset_time")
|
||||
remaining = rate_info.get("remaining", 0)
|
||||
limit = rate_info.get("limit", 0)
|
||||
|
||||
print(f"\n⚠️ GitHub Rate Limit Reached")
|
||||
print("\n⚠️ GitHub Rate Limit Reached")
|
||||
print(f" Profile: {self.profile_name or 'default'}")
|
||||
print(f" Limit: {remaining}/{limit} requests")
|
||||
|
||||
@@ -294,8 +292,8 @@ class RateLimitHandler:
|
||||
self.token = next_token
|
||||
|
||||
rate_info = self.get_rate_limit_info()
|
||||
remaining = rate_info.get('remaining', 0)
|
||||
limit = rate_info.get('limit', 0)
|
||||
remaining = rate_info.get("remaining", 0)
|
||||
limit = rate_info.get("limit", 0)
|
||||
|
||||
if remaining > 0:
|
||||
print(f"✅ Profile '{next_name}' has {remaining}/{limit} requests available")
|
||||
@@ -394,24 +392,24 @@ class RateLimitHandler:
|
||||
while True:
|
||||
choice = input("Select an option [w/s/t/c]: ").strip().lower()
|
||||
|
||||
if choice == 'w':
|
||||
if choice == "w":
|
||||
return self.wait_for_reset(wait_seconds, wait_minutes)
|
||||
|
||||
elif choice == 's':
|
||||
elif choice == "s":
|
||||
if self.try_switch_profile():
|
||||
return True
|
||||
else:
|
||||
print("⚠️ Profile switching failed. Choose another option.")
|
||||
continue
|
||||
|
||||
elif choice == 't':
|
||||
elif choice == "t":
|
||||
print("\n💡 Opening GitHub token setup...")
|
||||
print(" Run this command in another terminal:")
|
||||
print(" $ skill-seekers config --github\n")
|
||||
print(" Then restart your scraping job.\n")
|
||||
return False
|
||||
|
||||
elif choice == 'c':
|
||||
elif choice == "c":
|
||||
print("\n⏸️ Operation cancelled by user\n")
|
||||
return False
|
||||
|
||||
@@ -419,7 +417,7 @@ class RateLimitHandler:
|
||||
print("❌ Invalid choice. Please enter w, s, t, or c.")
|
||||
|
||||
|
||||
def create_github_headers(token: Optional[str] = None) -> Dict[str, str]:
|
||||
def create_github_headers(token: str | None = None) -> dict[str, str]:
|
||||
"""
|
||||
Create GitHub API headers with optional token.
|
||||
|
||||
@@ -431,5 +429,5 @@ def create_github_headers(token: Optional[str] = None) -> Dict[str, str]:
|
||||
"""
|
||||
headers = {}
|
||||
if token:
|
||||
headers['Authorization'] = f'token {token}'
|
||||
headers["Authorization"] = f"token {token}"
|
||||
return headers
|
||||
|
||||
@@ -4,9 +4,9 @@ Resume Command for Skill Seekers
|
||||
Allows users to resume interrupted scraping jobs from saved progress.
|
||||
"""
|
||||
|
||||
import sys
|
||||
import argparse
|
||||
from typing import Optional
|
||||
import sys
|
||||
|
||||
from .config_manager import get_config_manager
|
||||
|
||||
|
||||
@@ -132,24 +132,10 @@ def clean_old_jobs():
|
||||
|
||||
def main():
|
||||
"""Main entry point for resume command."""
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Resume interrupted Skill Seekers jobs"
|
||||
)
|
||||
parser.add_argument(
|
||||
"job_id",
|
||||
nargs="?",
|
||||
help="Job ID to resume"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--list",
|
||||
action="store_true",
|
||||
help="List all resumable jobs"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--clean",
|
||||
action="store_true",
|
||||
help="Clean up old progress files"
|
||||
)
|
||||
parser = argparse.ArgumentParser(description="Resume interrupted Skill Seekers jobs")
|
||||
parser.add_argument("job_id", nargs="?", help="Job ID to resume")
|
||||
parser.add_argument("--list", action="store_true", help="List all resumable jobs")
|
||||
parser.add_argument("--clean", action="store_true", help="Clean up old progress files")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
|
||||
@@ -6,21 +6,18 @@ Runs all test suites and generates a comprehensive test report
|
||||
|
||||
import sys
|
||||
import unittest
|
||||
import os
|
||||
from io import StringIO
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
class ColoredTextTestResult(unittest.TextTestResult):
|
||||
"""Custom test result class with colored output"""
|
||||
|
||||
# ANSI color codes
|
||||
GREEN = '\033[92m'
|
||||
RED = '\033[91m'
|
||||
YELLOW = '\033[93m'
|
||||
BLUE = '\033[94m'
|
||||
RESET = '\033[0m'
|
||||
BOLD = '\033[1m'
|
||||
GREEN = "\033[92m"
|
||||
RED = "\033[91m"
|
||||
YELLOW = "\033[93m"
|
||||
BLUE = "\033[94m"
|
||||
RESET = "\033[0m"
|
||||
BOLD = "\033[1m"
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
@@ -28,7 +25,7 @@ class ColoredTextTestResult(unittest.TextTestResult):
|
||||
|
||||
def addSuccess(self, test):
|
||||
super().addSuccess(test)
|
||||
self.test_results.append(('PASS', test))
|
||||
self.test_results.append(("PASS", test))
|
||||
if self.showAll:
|
||||
self.stream.write(f"{self.GREEN}✓ PASS{self.RESET}\n")
|
||||
elif self.dots:
|
||||
@@ -37,7 +34,7 @@ class ColoredTextTestResult(unittest.TextTestResult):
|
||||
|
||||
def addError(self, test, err):
|
||||
super().addError(test, err)
|
||||
self.test_results.append(('ERROR', test))
|
||||
self.test_results.append(("ERROR", test))
|
||||
if self.showAll:
|
||||
self.stream.write(f"{self.RED}✗ ERROR{self.RESET}\n")
|
||||
elif self.dots:
|
||||
@@ -46,7 +43,7 @@ class ColoredTextTestResult(unittest.TextTestResult):
|
||||
|
||||
def addFailure(self, test, err):
|
||||
super().addFailure(test, err)
|
||||
self.test_results.append(('FAIL', test))
|
||||
self.test_results.append(("FAIL", test))
|
||||
if self.showAll:
|
||||
self.stream.write(f"{self.RED}✗ FAIL{self.RESET}\n")
|
||||
elif self.dots:
|
||||
@@ -55,7 +52,7 @@ class ColoredTextTestResult(unittest.TextTestResult):
|
||||
|
||||
def addSkip(self, test, reason):
|
||||
super().addSkip(test, reason)
|
||||
self.test_results.append(('SKIP', test))
|
||||
self.test_results.append(("SKIP", test))
|
||||
if self.showAll:
|
||||
self.stream.write(f"{self.YELLOW}⊘ SKIP{self.RESET}\n")
|
||||
elif self.dots:
|
||||
@@ -65,14 +62,15 @@ class ColoredTextTestResult(unittest.TextTestResult):
|
||||
|
||||
class ColoredTextTestRunner(unittest.TextTestRunner):
|
||||
"""Custom test runner with colored output"""
|
||||
|
||||
resultclass = ColoredTextTestResult
|
||||
|
||||
|
||||
def discover_tests(test_dir='tests'):
|
||||
def discover_tests(test_dir="tests"):
|
||||
"""Discover all test files in the tests directory"""
|
||||
loader = unittest.TestLoader()
|
||||
start_dir = test_dir
|
||||
pattern = 'test_*.py'
|
||||
pattern = "test_*.py"
|
||||
|
||||
suite = loader.discover(start_dir, pattern=pattern)
|
||||
return suite
|
||||
@@ -83,9 +81,9 @@ def run_specific_suite(suite_name):
|
||||
loader = unittest.TestLoader()
|
||||
|
||||
suite_map = {
|
||||
'config': 'tests.test_config_validation',
|
||||
'features': 'tests.test_scraper_features',
|
||||
'integration': 'tests.test_integration'
|
||||
"config": "tests.test_config_validation",
|
||||
"features": "tests.test_scraper_features",
|
||||
"integration": "tests.test_integration",
|
||||
}
|
||||
|
||||
if suite_name not in suite_map:
|
||||
@@ -110,9 +108,9 @@ def print_summary(result):
|
||||
errors = len(result.errors)
|
||||
skipped = len(result.skipped)
|
||||
|
||||
print("\n" + "="*70)
|
||||
print("\n" + "=" * 70)
|
||||
print("TEST SUMMARY")
|
||||
print("="*70)
|
||||
print("=" * 70)
|
||||
|
||||
# Overall stats
|
||||
print(f"\n{ColoredTextTestResult.BOLD}Total Tests:{ColoredTextTestResult.RESET} {total}")
|
||||
@@ -127,31 +125,35 @@ def print_summary(result):
|
||||
# Success rate
|
||||
if total > 0:
|
||||
success_rate = (passed / total) * 100
|
||||
color = ColoredTextTestResult.GREEN if success_rate == 100 else \
|
||||
ColoredTextTestResult.YELLOW if success_rate >= 80 else \
|
||||
ColoredTextTestResult.RED
|
||||
color = (
|
||||
ColoredTextTestResult.GREEN
|
||||
if success_rate == 100
|
||||
else ColoredTextTestResult.YELLOW
|
||||
if success_rate >= 80
|
||||
else ColoredTextTestResult.RED
|
||||
)
|
||||
print(f"\n{color}Success Rate: {success_rate:.1f}%{ColoredTextTestResult.RESET}")
|
||||
|
||||
# Category breakdown
|
||||
if hasattr(result, 'test_results'):
|
||||
if hasattr(result, "test_results"):
|
||||
print(f"\n{ColoredTextTestResult.BOLD}Test Breakdown by Category:{ColoredTextTestResult.RESET}")
|
||||
|
||||
categories = {}
|
||||
for status, test in result.test_results:
|
||||
test_name = str(test)
|
||||
# Extract test class name
|
||||
if '.' in test_name:
|
||||
class_name = test_name.split('.')[0].split()[-1]
|
||||
if "." in test_name:
|
||||
class_name = test_name.split(".")[0].split()[-1]
|
||||
if class_name not in categories:
|
||||
categories[class_name] = {'PASS': 0, 'FAIL': 0, 'ERROR': 0, 'SKIP': 0}
|
||||
categories[class_name] = {"PASS": 0, "FAIL": 0, "ERROR": 0, "SKIP": 0}
|
||||
categories[class_name][status] += 1
|
||||
|
||||
for category, stats in sorted(categories.items()):
|
||||
total_cat = sum(stats.values())
|
||||
passed_cat = stats['PASS']
|
||||
passed_cat = stats["PASS"]
|
||||
print(f" {category}: {passed_cat}/{total_cat} passed")
|
||||
|
||||
print("\n" + "="*70)
|
||||
print("\n" + "=" * 70)
|
||||
|
||||
# Return status
|
||||
return failed == 0 and errors == 0
|
||||
@@ -162,20 +164,14 @@ def main():
|
||||
import argparse
|
||||
|
||||
parser = argparse.ArgumentParser(
|
||||
description='Run tests for Skill Seeker',
|
||||
formatter_class=argparse.RawDescriptionHelpFormatter
|
||||
description="Run tests for Skill Seeker", formatter_class=argparse.RawDescriptionHelpFormatter
|
||||
)
|
||||
|
||||
parser.add_argument('--suite', '-s', type=str,
|
||||
help='Run specific test suite (config, features, integration)')
|
||||
parser.add_argument('--verbose', '-v', action='store_true',
|
||||
help='Verbose output (show each test)')
|
||||
parser.add_argument('--quiet', '-q', action='store_true',
|
||||
help='Quiet output (minimal output)')
|
||||
parser.add_argument('--failfast', '-f', action='store_true',
|
||||
help='Stop on first failure')
|
||||
parser.add_argument('--list', '-l', action='store_true',
|
||||
help='List all available tests')
|
||||
parser.add_argument("--suite", "-s", type=str, help="Run specific test suite (config, features, integration)")
|
||||
parser.add_argument("--verbose", "-v", action="store_true", help="Verbose output (show each test)")
|
||||
parser.add_argument("--quiet", "-q", action="store_true", help="Quiet output (minimal output)")
|
||||
parser.add_argument("--failfast", "-f", action="store_true", help="Stop on first failure")
|
||||
parser.add_argument("--list", "-l", action="store_true", help="List all available tests")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
@@ -186,9 +182,9 @@ def main():
|
||||
elif args.quiet:
|
||||
verbosity = 0
|
||||
|
||||
print(f"\n{ColoredTextTestResult.BOLD}{'='*70}{ColoredTextTestResult.RESET}")
|
||||
print(f"\n{ColoredTextTestResult.BOLD}{'=' * 70}{ColoredTextTestResult.RESET}")
|
||||
print(f"{ColoredTextTestResult.BOLD}SKILL SEEKER TEST SUITE{ColoredTextTestResult.RESET}")
|
||||
print(f"{ColoredTextTestResult.BOLD}{'='*70}{ColoredTextTestResult.RESET}\n")
|
||||
print(f"{ColoredTextTestResult.BOLD}{'=' * 70}{ColoredTextTestResult.RESET}\n")
|
||||
|
||||
# Discover or load specific suite
|
||||
if args.suite:
|
||||
@@ -210,10 +206,7 @@ def main():
|
||||
return 0
|
||||
|
||||
# Run tests
|
||||
runner = ColoredTextTestRunner(
|
||||
verbosity=verbosity,
|
||||
failfast=args.failfast
|
||||
)
|
||||
runner = ColoredTextTestRunner(verbosity=verbosity, failfast=args.failfast)
|
||||
|
||||
result = runner.run(suite)
|
||||
|
||||
@@ -224,5 +217,5 @@ def main():
|
||||
return 0 if success else 1
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main())
|
||||
|
||||
@@ -6,12 +6,12 @@ Splits large documentation configs into multiple smaller, focused skill configs.
|
||||
Supports multiple splitting strategies: category-based, size-based, and automatic.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import sys
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Any, Tuple
|
||||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
|
||||
class ConfigSplitter:
|
||||
@@ -22,12 +22,12 @@ class ConfigSplitter:
|
||||
self.strategy = strategy
|
||||
self.target_pages = target_pages
|
||||
self.config = self.load_config()
|
||||
self.base_name = self.config['name']
|
||||
self.base_name = self.config["name"]
|
||||
|
||||
def load_config(self) -> Dict[str, Any]:
|
||||
def load_config(self) -> dict[str, Any]:
|
||||
"""Load configuration from file"""
|
||||
try:
|
||||
with open(self.config_path, 'r') as f:
|
||||
with open(self.config_path) as f:
|
||||
return json.load(f)
|
||||
except FileNotFoundError:
|
||||
print(f"❌ Error: Config file not found: {self.config_path}")
|
||||
@@ -38,45 +38,45 @@ class ConfigSplitter:
|
||||
|
||||
def is_unified_config(self) -> bool:
|
||||
"""Check if this is a unified multi-source config"""
|
||||
return 'sources' in self.config
|
||||
return "sources" in self.config
|
||||
|
||||
def get_split_strategy(self) -> str:
|
||||
"""Determine split strategy"""
|
||||
# For unified configs, default to source-based splitting
|
||||
if self.is_unified_config():
|
||||
if self.strategy == "auto":
|
||||
num_sources = len(self.config.get('sources', []))
|
||||
num_sources = len(self.config.get("sources", []))
|
||||
if num_sources <= 1:
|
||||
print(f"ℹ️ Single source unified config - no splitting needed")
|
||||
print("ℹ️ Single source unified config - no splitting needed")
|
||||
return "none"
|
||||
else:
|
||||
print(f"ℹ️ Multi-source unified config ({num_sources} sources) - source split recommended")
|
||||
return "source"
|
||||
# For unified configs, only 'source' and 'none' strategies are valid
|
||||
elif self.strategy in ['source', 'none']:
|
||||
elif self.strategy in ["source", "none"]:
|
||||
return self.strategy
|
||||
else:
|
||||
print(f"⚠️ Warning: Strategy '{self.strategy}' not supported for unified configs")
|
||||
print(f"ℹ️ Using 'source' strategy instead")
|
||||
print("ℹ️ Using 'source' strategy instead")
|
||||
return "source"
|
||||
|
||||
# Check if strategy is defined in config (documentation configs)
|
||||
if 'split_strategy' in self.config:
|
||||
config_strategy = self.config['split_strategy']
|
||||
if "split_strategy" in self.config:
|
||||
config_strategy = self.config["split_strategy"]
|
||||
if config_strategy != "none":
|
||||
return config_strategy
|
||||
|
||||
# Use provided strategy or auto-detect (documentation configs)
|
||||
if self.strategy == "auto":
|
||||
max_pages = self.config.get('max_pages', 500)
|
||||
max_pages = self.config.get("max_pages", 500)
|
||||
|
||||
if max_pages < 5000:
|
||||
print(f"ℹ️ Small documentation ({max_pages} pages) - no splitting needed")
|
||||
return "none"
|
||||
elif max_pages < 10000 and 'categories' in self.config:
|
||||
elif max_pages < 10000 and "categories" in self.config:
|
||||
print(f"ℹ️ Medium documentation ({max_pages} pages) - category split recommended")
|
||||
return "category"
|
||||
elif 'categories' in self.config and len(self.config['categories']) >= 3:
|
||||
elif "categories" in self.config and len(self.config["categories"]) >= 3:
|
||||
print(f"ℹ️ Large documentation ({max_pages} pages) - router + categories recommended")
|
||||
return "router"
|
||||
else:
|
||||
@@ -85,14 +85,14 @@ class ConfigSplitter:
|
||||
|
||||
return self.strategy
|
||||
|
||||
def split_by_category(self, create_router: bool = False) -> List[Dict[str, Any]]:
|
||||
def split_by_category(self, create_router: bool = False) -> list[dict[str, Any]]:
|
||||
"""Split config by categories"""
|
||||
if 'categories' not in self.config:
|
||||
if "categories" not in self.config:
|
||||
print("❌ Error: No categories defined in config")
|
||||
sys.exit(1)
|
||||
|
||||
categories = self.config['categories']
|
||||
split_categories = self.config.get('split_config', {}).get('split_by_categories')
|
||||
categories = self.config["categories"]
|
||||
split_categories = self.config.get("split_config", {}).get("split_by_categories")
|
||||
|
||||
# If specific categories specified, use only those
|
||||
if split_categories:
|
||||
@@ -103,34 +103,36 @@ class ConfigSplitter:
|
||||
for category_name, keywords in categories.items():
|
||||
# Create new config for this category
|
||||
new_config = self.config.copy()
|
||||
new_config['name'] = f"{self.base_name}-{category_name}"
|
||||
new_config['description'] = f"{self.base_name.capitalize()} - {category_name.replace('_', ' ').title()}. {self.config.get('description', '')}"
|
||||
new_config["name"] = f"{self.base_name}-{category_name}"
|
||||
new_config["description"] = (
|
||||
f"{self.base_name.capitalize()} - {category_name.replace('_', ' ').title()}. {self.config.get('description', '')}"
|
||||
)
|
||||
|
||||
# Update URL patterns to focus on this category
|
||||
url_patterns = new_config.get('url_patterns', {})
|
||||
url_patterns = new_config.get("url_patterns", {})
|
||||
|
||||
# Add category keywords to includes
|
||||
includes = url_patterns.get('include', [])
|
||||
includes = url_patterns.get("include", [])
|
||||
for keyword in keywords:
|
||||
if keyword.startswith('/'):
|
||||
if keyword.startswith("/"):
|
||||
includes.append(keyword)
|
||||
|
||||
if includes:
|
||||
url_patterns['include'] = list(set(includes))
|
||||
new_config['url_patterns'] = url_patterns
|
||||
url_patterns["include"] = list(set(includes))
|
||||
new_config["url_patterns"] = url_patterns
|
||||
|
||||
# Keep only this category
|
||||
new_config['categories'] = {category_name: keywords}
|
||||
new_config["categories"] = {category_name: keywords}
|
||||
|
||||
# Remove split config from child
|
||||
if 'split_strategy' in new_config:
|
||||
del new_config['split_strategy']
|
||||
if 'split_config' in new_config:
|
||||
del new_config['split_config']
|
||||
if "split_strategy" in new_config:
|
||||
del new_config["split_strategy"]
|
||||
if "split_config" in new_config:
|
||||
del new_config["split_config"]
|
||||
|
||||
# Adjust max_pages estimate
|
||||
if 'max_pages' in new_config:
|
||||
new_config['max_pages'] = self.target_pages
|
||||
if "max_pages" in new_config:
|
||||
new_config["max_pages"] = self.target_pages
|
||||
|
||||
configs.append(new_config)
|
||||
|
||||
@@ -144,9 +146,9 @@ class ConfigSplitter:
|
||||
|
||||
return configs
|
||||
|
||||
def split_by_size(self) -> List[Dict[str, Any]]:
|
||||
def split_by_size(self) -> list[dict[str, Any]]:
|
||||
"""Split config by size (page count)"""
|
||||
max_pages = self.config.get('max_pages', 500)
|
||||
max_pages = self.config.get("max_pages", 500)
|
||||
num_splits = (max_pages + self.target_pages - 1) // self.target_pages
|
||||
|
||||
configs = []
|
||||
@@ -154,28 +156,30 @@ class ConfigSplitter:
|
||||
for i in range(num_splits):
|
||||
new_config = self.config.copy()
|
||||
part_num = i + 1
|
||||
new_config['name'] = f"{self.base_name}-part{part_num}"
|
||||
new_config['description'] = f"{self.base_name.capitalize()} - Part {part_num}. {self.config.get('description', '')}"
|
||||
new_config['max_pages'] = self.target_pages
|
||||
new_config["name"] = f"{self.base_name}-part{part_num}"
|
||||
new_config["description"] = (
|
||||
f"{self.base_name.capitalize()} - Part {part_num}. {self.config.get('description', '')}"
|
||||
)
|
||||
new_config["max_pages"] = self.target_pages
|
||||
|
||||
# Remove split config from child
|
||||
if 'split_strategy' in new_config:
|
||||
del new_config['split_strategy']
|
||||
if 'split_config' in new_config:
|
||||
del new_config['split_config']
|
||||
if "split_strategy" in new_config:
|
||||
del new_config["split_strategy"]
|
||||
if "split_config" in new_config:
|
||||
del new_config["split_config"]
|
||||
|
||||
configs.append(new_config)
|
||||
|
||||
print(f"✅ Created {len(configs)} size-based configs ({self.target_pages} pages each)")
|
||||
return configs
|
||||
|
||||
def split_by_source(self) -> List[Dict[str, Any]]:
|
||||
def split_by_source(self) -> list[dict[str, Any]]:
|
||||
"""Split unified config by source type"""
|
||||
if not self.is_unified_config():
|
||||
print("❌ Error: Config is not a unified config (missing 'sources' key)")
|
||||
sys.exit(1)
|
||||
|
||||
sources = self.config.get('sources', [])
|
||||
sources = self.config.get("sources", [])
|
||||
if not sources:
|
||||
print("❌ Error: No sources defined in unified config")
|
||||
sys.exit(1)
|
||||
@@ -184,20 +188,20 @@ class ConfigSplitter:
|
||||
source_type_counts = defaultdict(int)
|
||||
|
||||
for source in sources:
|
||||
source_type = source.get('type', 'unknown')
|
||||
source_type = source.get("type", "unknown")
|
||||
source_type_counts[source_type] += 1
|
||||
count = source_type_counts[source_type]
|
||||
|
||||
# Create new config for this source
|
||||
new_config = {
|
||||
'name': f"{self.base_name}-{source_type}" + (f"-{count}" if count > 1 else ""),
|
||||
'description': f"{self.base_name.capitalize()} - {source_type.title()} source. {self.config.get('description', '')}",
|
||||
'sources': [source] # Single source per config
|
||||
"name": f"{self.base_name}-{source_type}" + (f"-{count}" if count > 1 else ""),
|
||||
"description": f"{self.base_name.capitalize()} - {source_type.title()} source. {self.config.get('description', '')}",
|
||||
"sources": [source], # Single source per config
|
||||
}
|
||||
|
||||
# Copy merge_mode if it exists
|
||||
if 'merge_mode' in self.config:
|
||||
new_config['merge_mode'] = self.config['merge_mode']
|
||||
if "merge_mode" in self.config:
|
||||
new_config["merge_mode"] = self.config["merge_mode"]
|
||||
|
||||
configs.append(new_config)
|
||||
|
||||
@@ -209,36 +213,33 @@ class ConfigSplitter:
|
||||
|
||||
return configs
|
||||
|
||||
def create_router_config(self, sub_configs: List[Dict[str, Any]]) -> Dict[str, Any]:
|
||||
def create_router_config(self, sub_configs: list[dict[str, Any]]) -> dict[str, Any]:
|
||||
"""Create a router config that references sub-skills"""
|
||||
router_name = self.config.get('split_config', {}).get('router_name', self.base_name)
|
||||
router_name = self.config.get("split_config", {}).get("router_name", self.base_name)
|
||||
|
||||
router_config = {
|
||||
"name": router_name,
|
||||
"description": self.config.get('description', ''),
|
||||
"base_url": self.config['base_url'],
|
||||
"selectors": self.config['selectors'],
|
||||
"url_patterns": self.config.get('url_patterns', {}),
|
||||
"rate_limit": self.config.get('rate_limit', 0.5),
|
||||
"description": self.config.get("description", ""),
|
||||
"base_url": self.config["base_url"],
|
||||
"selectors": self.config["selectors"],
|
||||
"url_patterns": self.config.get("url_patterns", {}),
|
||||
"rate_limit": self.config.get("rate_limit", 0.5),
|
||||
"max_pages": 500, # Router only needs overview pages
|
||||
"_router": True,
|
||||
"_sub_skills": [cfg['name'] for cfg in sub_configs],
|
||||
"_routing_keywords": {
|
||||
cfg['name']: list(cfg.get('categories', {}).keys())
|
||||
for cfg in sub_configs
|
||||
}
|
||||
"_sub_skills": [cfg["name"] for cfg in sub_configs],
|
||||
"_routing_keywords": {cfg["name"]: list(cfg.get("categories", {}).keys()) for cfg in sub_configs},
|
||||
}
|
||||
|
||||
return router_config
|
||||
|
||||
def split(self) -> List[Dict[str, Any]]:
|
||||
def split(self) -> list[dict[str, Any]]:
|
||||
"""Execute split based on strategy"""
|
||||
strategy = self.get_split_strategy()
|
||||
|
||||
config_type = "UNIFIED" if self.is_unified_config() else "DOCUMENTATION"
|
||||
print(f"\n{'='*60}")
|
||||
print(f"\n{'=' * 60}")
|
||||
print(f"CONFIG SPLITTER: {self.base_name} ({config_type})")
|
||||
print(f"{'='*60}")
|
||||
print(f"{'=' * 60}")
|
||||
print(f"Strategy: {strategy}")
|
||||
if not self.is_unified_config():
|
||||
print(f"Target pages per skill: {self.target_pages}")
|
||||
@@ -255,7 +256,7 @@ class ConfigSplitter:
|
||||
return self.split_by_category(create_router=False)
|
||||
|
||||
elif strategy == "router":
|
||||
create_router = self.config.get('split_config', {}).get('create_router', True)
|
||||
create_router = self.config.get("split_config", {}).get("create_router", True)
|
||||
return self.split_by_category(create_router=create_router)
|
||||
|
||||
elif strategy == "size":
|
||||
@@ -265,7 +266,7 @@ class ConfigSplitter:
|
||||
print(f"❌ Error: Unknown strategy: {strategy}")
|
||||
sys.exit(1)
|
||||
|
||||
def save_configs(self, configs: List[Dict[str, Any]], output_dir: Path = None) -> List[Path]:
|
||||
def save_configs(self, configs: list[dict[str, Any]], output_dir: Path = None) -> list[Path]:
|
||||
"""Save configs to files"""
|
||||
if output_dir is None:
|
||||
output_dir = self.config_path.parent
|
||||
@@ -279,7 +280,7 @@ class ConfigSplitter:
|
||||
filename = f"{config['name']}.json"
|
||||
filepath = output_dir / filename
|
||||
|
||||
with open(filepath, 'w') as f:
|
||||
with open(filepath, "w") as f:
|
||||
json.dump(config, f, indent=2)
|
||||
|
||||
saved_files.append(filepath)
|
||||
@@ -320,38 +321,23 @@ Split Strategies:
|
||||
Config Types:
|
||||
Documentation - Single base_url config (supports: category, router, size)
|
||||
Unified - Multi-source config (supports: source)
|
||||
"""
|
||||
""",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'config',
|
||||
help='Path to config file (e.g., configs/godot.json)'
|
||||
)
|
||||
parser.add_argument("config", help="Path to config file (e.g., configs/godot.json)")
|
||||
|
||||
parser.add_argument(
|
||||
'--strategy',
|
||||
choices=['auto', 'none', 'source', 'category', 'router', 'size'],
|
||||
default='auto',
|
||||
help='Splitting strategy (default: auto)'
|
||||
"--strategy",
|
||||
choices=["auto", "none", "source", "category", "router", "size"],
|
||||
default="auto",
|
||||
help="Splitting strategy (default: auto)",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'--target-pages',
|
||||
type=int,
|
||||
default=5000,
|
||||
help='Target pages per skill (default: 5000)'
|
||||
)
|
||||
parser.add_argument("--target-pages", type=int, default=5000, help="Target pages per skill (default: 5000)")
|
||||
|
||||
parser.add_argument(
|
||||
'--output-dir',
|
||||
help='Output directory for configs (default: same as input)'
|
||||
)
|
||||
parser.add_argument("--output-dir", help="Output directory for configs (default: same as input)")
|
||||
|
||||
parser.add_argument(
|
||||
'--dry-run',
|
||||
action='store_true',
|
||||
help='Show what would be created without saving files'
|
||||
)
|
||||
parser.add_argument("--dry-run", action="store_true", help="Show what would be created without saving files")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
@@ -362,23 +348,23 @@ Config Types:
|
||||
configs = splitter.split()
|
||||
|
||||
if args.dry_run:
|
||||
print(f"\n{'='*60}")
|
||||
print(f"\n{'=' * 60}")
|
||||
print("DRY RUN - No files saved")
|
||||
print(f"{'='*60}")
|
||||
print(f"{'=' * 60}")
|
||||
print(f"Would create {len(configs)} config files:")
|
||||
for cfg in configs:
|
||||
is_router = cfg.get('_router', False)
|
||||
is_router = cfg.get("_router", False)
|
||||
router_marker = " (ROUTER)" if is_router else ""
|
||||
print(f" 📄 {cfg['name']}.json{router_marker}")
|
||||
else:
|
||||
print(f"\n{'='*60}")
|
||||
print(f"\n{'=' * 60}")
|
||||
print("SAVING CONFIGS")
|
||||
print(f"{'='*60}")
|
||||
print(f"{'=' * 60}")
|
||||
saved_files = splitter.save_configs(configs, args.output_dir)
|
||||
|
||||
print(f"\n{'='*60}")
|
||||
print(f"\n{'=' * 60}")
|
||||
print("NEXT STEPS")
|
||||
print(f"{'='*60}")
|
||||
print(f"{'=' * 60}")
|
||||
print("1. Review generated configs")
|
||||
print("2. Scrape each config:")
|
||||
for filepath in saved_files:
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -27,19 +27,18 @@ Example usage:
|
||||
python test_example_extractor.py tests/ --min-confidence 0.7
|
||||
"""
|
||||
|
||||
from dataclasses import dataclass, field, asdict
|
||||
from typing import List, Dict, Optional, Literal, Set
|
||||
from pathlib import Path
|
||||
import ast
|
||||
import re
|
||||
import hashlib
|
||||
import logging
|
||||
import argparse
|
||||
import ast
|
||||
import hashlib
|
||||
import json
|
||||
import sys
|
||||
import logging
|
||||
import re
|
||||
from dataclasses import asdict, dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import Literal
|
||||
|
||||
# Configure logging
|
||||
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
|
||||
logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@@ -47,22 +46,23 @@ logger = logging.getLogger(__name__)
|
||||
# DATA MODELS
|
||||
# ============================================================================
|
||||
|
||||
|
||||
@dataclass
|
||||
class TestExample:
|
||||
"""Single extracted usage example from test code"""
|
||||
|
||||
# Identity
|
||||
example_id: str # Unique hash of example
|
||||
test_name: str # Test function/method name
|
||||
example_id: str # Unique hash of example
|
||||
test_name: str # Test function/method name
|
||||
category: Literal["instantiation", "method_call", "config", "setup", "workflow"]
|
||||
|
||||
# Code
|
||||
code: str # Actual example code
|
||||
language: str # Programming language
|
||||
code: str # Actual example code
|
||||
language: str # Programming language
|
||||
|
||||
# Context
|
||||
description: str # What this demonstrates
|
||||
expected_behavior: str # Expected outcome from assertions
|
||||
description: str # What this demonstrates
|
||||
expected_behavior: str # Expected outcome from assertions
|
||||
|
||||
# Source
|
||||
file_path: str
|
||||
@@ -71,13 +71,13 @@ class TestExample:
|
||||
|
||||
# Quality
|
||||
complexity_score: float # 0-1 scale (higher = more complex/valuable)
|
||||
confidence: float # 0-1 scale (higher = more confident extraction)
|
||||
confidence: float # 0-1 scale (higher = more confident extraction)
|
||||
|
||||
# Optional fields (must come after required fields)
|
||||
setup_code: Optional[str] = None # Required setup code
|
||||
tags: List[str] = field(default_factory=list) # ["pytest", "mock", "async"]
|
||||
dependencies: List[str] = field(default_factory=list) # Imported modules
|
||||
ai_analysis: Optional[Dict] = None # AI-generated analysis (C3.6)
|
||||
setup_code: str | None = None # Required setup code
|
||||
tags: list[str] = field(default_factory=list) # ["pytest", "mock", "async"]
|
||||
dependencies: list[str] = field(default_factory=list) # Imported modules
|
||||
ai_analysis: dict | None = None # AI-generated analysis (C3.6)
|
||||
|
||||
def to_dict(self) -> dict:
|
||||
"""Convert to dictionary for JSON serialization"""
|
||||
@@ -96,12 +96,12 @@ class TestExample:
|
||||
|
||||
# Add AI analysis if available (C3.6)
|
||||
if self.ai_analysis:
|
||||
md += f"\n**🤖 AI Analysis:** \n"
|
||||
if self.ai_analysis.get('explanation'):
|
||||
md += "\n**🤖 AI Analysis:** \n"
|
||||
if self.ai_analysis.get("explanation"):
|
||||
md += f"*{self.ai_analysis['explanation']}* \n"
|
||||
if self.ai_analysis.get('best_practices'):
|
||||
if self.ai_analysis.get("best_practices"):
|
||||
md += f"**Best Practices:** {', '.join(self.ai_analysis['best_practices'])} \n"
|
||||
if self.ai_analysis.get('tutorial_group'):
|
||||
if self.ai_analysis.get("tutorial_group"):
|
||||
md += f"**Tutorial Group:** {self.ai_analysis['tutorial_group']} \n"
|
||||
|
||||
md += f"\n```{self.language.lower()}\n"
|
||||
@@ -117,13 +117,13 @@ class ExampleReport:
|
||||
"""Summary of test example extraction results"""
|
||||
|
||||
total_examples: int
|
||||
examples_by_category: Dict[str, int]
|
||||
examples_by_language: Dict[str, int]
|
||||
examples: List[TestExample]
|
||||
examples_by_category: dict[str, int]
|
||||
examples_by_language: dict[str, int]
|
||||
examples: list[TestExample]
|
||||
avg_complexity: float
|
||||
high_value_count: int # confidence > 0.7
|
||||
file_path: Optional[str] = None # If single file
|
||||
directory: Optional[str] = None # If directory
|
||||
file_path: str | None = None # If single file
|
||||
directory: str | None = None # If directory
|
||||
|
||||
def to_dict(self) -> dict:
|
||||
"""Convert to dictionary for JSON serialization"""
|
||||
@@ -135,7 +135,7 @@ class ExampleReport:
|
||||
"high_value_count": self.high_value_count,
|
||||
"file_path": self.file_path,
|
||||
"directory": self.directory,
|
||||
"examples": [ex.to_dict() for ex in self.examples]
|
||||
"examples": [ex.to_dict() for ex in self.examples],
|
||||
}
|
||||
|
||||
def to_markdown(self) -> str:
|
||||
@@ -164,19 +164,20 @@ class ExampleReport:
|
||||
# PYTHON TEST ANALYZER (AST-based)
|
||||
# ============================================================================
|
||||
|
||||
|
||||
class PythonTestAnalyzer:
|
||||
"""Deep AST-based test example extraction for Python"""
|
||||
|
||||
def __init__(self):
|
||||
self.trivial_patterns = {
|
||||
'assertTrue(True)',
|
||||
'assertFalse(False)',
|
||||
'assertEqual(1, 1)',
|
||||
'assertIsNone(None)',
|
||||
'assertIsNotNone(None)',
|
||||
"assertTrue(True)",
|
||||
"assertFalse(False)",
|
||||
"assertEqual(1, 1)",
|
||||
"assertIsNone(None)",
|
||||
"assertIsNotNone(None)",
|
||||
}
|
||||
|
||||
def extract(self, file_path: str, code: str) -> List[TestExample]:
|
||||
def extract(self, file_path: str, code: str) -> list[TestExample]:
|
||||
"""Extract examples from Python test file"""
|
||||
examples = []
|
||||
|
||||
@@ -193,20 +194,16 @@ class PythonTestAnalyzer:
|
||||
for node in ast.walk(tree):
|
||||
if isinstance(node, ast.ClassDef):
|
||||
if self._is_test_class(node):
|
||||
examples.extend(self._extract_from_test_class(
|
||||
node, file_path, imports
|
||||
))
|
||||
examples.extend(self._extract_from_test_class(node, file_path, imports))
|
||||
|
||||
# Find test functions (pytest)
|
||||
elif isinstance(node, ast.FunctionDef):
|
||||
if self._is_test_function(node):
|
||||
examples.extend(self._extract_from_test_function(
|
||||
node, file_path, imports
|
||||
))
|
||||
examples.extend(self._extract_from_test_function(node, file_path, imports))
|
||||
|
||||
return examples
|
||||
|
||||
def _extract_imports(self, tree: ast.AST) -> List[str]:
|
||||
def _extract_imports(self, tree: ast.AST) -> list[str]:
|
||||
"""Extract imported modules"""
|
||||
imports = []
|
||||
for node in ast.walk(tree):
|
||||
@@ -221,30 +218,30 @@ class PythonTestAnalyzer:
|
||||
"""Check if class is a test class"""
|
||||
# unittest.TestCase pattern
|
||||
for base in node.bases:
|
||||
if isinstance(base, ast.Name) and 'Test' in base.id:
|
||||
return True
|
||||
elif isinstance(base, ast.Attribute) and base.attr == 'TestCase':
|
||||
if (
|
||||
isinstance(base, ast.Name)
|
||||
and "Test" in base.id
|
||||
or isinstance(base, ast.Attribute)
|
||||
and base.attr == "TestCase"
|
||||
):
|
||||
return True
|
||||
return False
|
||||
|
||||
def _is_test_function(self, node: ast.FunctionDef) -> bool:
|
||||
"""Check if function is a test function"""
|
||||
# pytest pattern: starts with test_
|
||||
if node.name.startswith('test_'):
|
||||
if node.name.startswith("test_"):
|
||||
return True
|
||||
# Has @pytest.mark decorator
|
||||
for decorator in node.decorator_list:
|
||||
if isinstance(decorator, ast.Attribute):
|
||||
if 'pytest' in ast.unparse(decorator):
|
||||
if "pytest" in ast.unparse(decorator):
|
||||
return True
|
||||
return False
|
||||
|
||||
def _extract_from_test_class(
|
||||
self,
|
||||
class_node: ast.ClassDef,
|
||||
file_path: str,
|
||||
imports: List[str]
|
||||
) -> List[TestExample]:
|
||||
self, class_node: ast.ClassDef, file_path: str, imports: list[str]
|
||||
) -> list[TestExample]:
|
||||
"""Extract examples from unittest.TestCase class"""
|
||||
examples = []
|
||||
|
||||
@@ -253,63 +250,46 @@ class PythonTestAnalyzer:
|
||||
|
||||
# Process each test method
|
||||
for node in class_node.body:
|
||||
if isinstance(node, ast.FunctionDef) and node.name.startswith('test_'):
|
||||
examples.extend(self._analyze_test_body(
|
||||
node,
|
||||
file_path,
|
||||
imports,
|
||||
setup_code=setup_code
|
||||
))
|
||||
if isinstance(node, ast.FunctionDef) and node.name.startswith("test_"):
|
||||
examples.extend(self._analyze_test_body(node, file_path, imports, setup_code=setup_code))
|
||||
|
||||
return examples
|
||||
|
||||
def _extract_from_test_function(
|
||||
self,
|
||||
func_node: ast.FunctionDef,
|
||||
file_path: str,
|
||||
imports: List[str]
|
||||
) -> List[TestExample]:
|
||||
self, func_node: ast.FunctionDef, file_path: str, imports: list[str]
|
||||
) -> list[TestExample]:
|
||||
"""Extract examples from pytest test function"""
|
||||
# Check for fixture parameters
|
||||
fixture_setup = self._extract_fixtures(func_node)
|
||||
|
||||
return self._analyze_test_body(
|
||||
func_node,
|
||||
file_path,
|
||||
imports,
|
||||
setup_code=fixture_setup
|
||||
)
|
||||
return self._analyze_test_body(func_node, file_path, imports, setup_code=fixture_setup)
|
||||
|
||||
def _extract_setup_method(self, class_node: ast.ClassDef) -> Optional[str]:
|
||||
def _extract_setup_method(self, class_node: ast.ClassDef) -> str | None:
|
||||
"""Extract setUp method code"""
|
||||
for node in class_node.body:
|
||||
if isinstance(node, ast.FunctionDef) and node.name == 'setUp':
|
||||
if isinstance(node, ast.FunctionDef) and node.name == "setUp":
|
||||
return ast.unparse(node.body)
|
||||
return None
|
||||
|
||||
def _extract_fixtures(self, func_node: ast.FunctionDef) -> Optional[str]:
|
||||
def _extract_fixtures(self, func_node: ast.FunctionDef) -> str | None:
|
||||
"""Extract pytest fixture parameters"""
|
||||
if not func_node.args.args:
|
||||
return None
|
||||
|
||||
# Skip 'self' parameter
|
||||
params = [arg.arg for arg in func_node.args.args if arg.arg != 'self']
|
||||
params = [arg.arg for arg in func_node.args.args if arg.arg != "self"]
|
||||
if params:
|
||||
return f"# Fixtures: {', '.join(params)}"
|
||||
return None
|
||||
|
||||
def _analyze_test_body(
|
||||
self,
|
||||
func_node: ast.FunctionDef,
|
||||
file_path: str,
|
||||
imports: List[str],
|
||||
setup_code: Optional[str] = None
|
||||
) -> List[TestExample]:
|
||||
self, func_node: ast.FunctionDef, file_path: str, imports: list[str], setup_code: str | None = None
|
||||
) -> list[TestExample]:
|
||||
"""Analyze test function body for extractable patterns"""
|
||||
examples = []
|
||||
|
||||
# Get docstring for description
|
||||
docstring = ast.get_docstring(func_node) or func_node.name.replace('_', ' ')
|
||||
docstring = ast.get_docstring(func_node) or func_node.name.replace("_", " ")
|
||||
|
||||
# Detect tags
|
||||
tags = self._detect_tags(func_node, imports)
|
||||
@@ -321,7 +301,9 @@ class PythonTestAnalyzer:
|
||||
examples.extend(instantiations)
|
||||
|
||||
# 2. Method calls with assertions
|
||||
method_calls = self._find_method_calls_with_assertions(func_node, file_path, docstring, setup_code, tags, imports)
|
||||
method_calls = self._find_method_calls_with_assertions(
|
||||
func_node, file_path, docstring, setup_code, tags, imports
|
||||
)
|
||||
examples.extend(method_calls)
|
||||
|
||||
# 3. Configuration dictionaries
|
||||
@@ -334,28 +316,28 @@ class PythonTestAnalyzer:
|
||||
|
||||
return examples
|
||||
|
||||
def _detect_tags(self, func_node: ast.FunctionDef, imports: List[str]) -> List[str]:
|
||||
def _detect_tags(self, func_node: ast.FunctionDef, imports: list[str]) -> list[str]:
|
||||
"""Detect test tags (pytest, mock, async, etc.)"""
|
||||
tags = []
|
||||
|
||||
# Check decorators
|
||||
for decorator in func_node.decorator_list:
|
||||
decorator_str = ast.unparse(decorator).lower()
|
||||
if 'pytest' in decorator_str:
|
||||
tags.append('pytest')
|
||||
if 'mock' in decorator_str:
|
||||
tags.append('mock')
|
||||
if 'async' in decorator_str or func_node.name.startswith('test_async'):
|
||||
tags.append('async')
|
||||
if "pytest" in decorator_str:
|
||||
tags.append("pytest")
|
||||
if "mock" in decorator_str:
|
||||
tags.append("mock")
|
||||
if "async" in decorator_str or func_node.name.startswith("test_async"):
|
||||
tags.append("async")
|
||||
|
||||
# Check if using unittest
|
||||
if 'unittest' in imports:
|
||||
tags.append('unittest')
|
||||
if "unittest" in imports:
|
||||
tags.append("unittest")
|
||||
|
||||
# Check function body for mock usage
|
||||
func_str = ast.unparse(func_node).lower()
|
||||
if 'mock' in func_str or 'patch' in func_str:
|
||||
tags.append('mock')
|
||||
if "mock" in func_str or "patch" in func_str:
|
||||
tags.append("mock")
|
||||
|
||||
return list(set(tags))
|
||||
|
||||
@@ -364,10 +346,10 @@ class PythonTestAnalyzer:
|
||||
func_node: ast.FunctionDef,
|
||||
file_path: str,
|
||||
description: str,
|
||||
setup_code: Optional[str],
|
||||
tags: List[str],
|
||||
imports: List[str]
|
||||
) -> List[TestExample]:
|
||||
setup_code: str | None,
|
||||
tags: list[str],
|
||||
imports: list[str],
|
||||
) -> list[TestExample]:
|
||||
"""Find object instantiation patterns: obj = ClassName(...)"""
|
||||
examples = []
|
||||
|
||||
@@ -379,7 +361,7 @@ class PythonTestAnalyzer:
|
||||
code = ast.unparse(node)
|
||||
|
||||
# Skip trivial or mock-only
|
||||
if len(code) < 20 or 'Mock()' in code:
|
||||
if len(code) < 20 or "Mock()" in code:
|
||||
continue
|
||||
|
||||
# Get class name
|
||||
@@ -400,7 +382,7 @@ class PythonTestAnalyzer:
|
||||
complexity_score=self._calculate_complexity(code),
|
||||
confidence=0.8,
|
||||
tags=tags,
|
||||
dependencies=imports
|
||||
dependencies=imports,
|
||||
)
|
||||
examples.append(example)
|
||||
|
||||
@@ -411,10 +393,10 @@ class PythonTestAnalyzer:
|
||||
func_node: ast.FunctionDef,
|
||||
file_path: str,
|
||||
description: str,
|
||||
setup_code: Optional[str],
|
||||
tags: List[str],
|
||||
imports: List[str]
|
||||
) -> List[TestExample]:
|
||||
setup_code: str | None,
|
||||
tags: list[str],
|
||||
imports: list[str],
|
||||
) -> list[TestExample]:
|
||||
"""Find method calls followed by assertions"""
|
||||
examples = []
|
||||
|
||||
@@ -450,7 +432,7 @@ class PythonTestAnalyzer:
|
||||
complexity_score=self._calculate_complexity(code),
|
||||
confidence=0.85,
|
||||
tags=tags,
|
||||
dependencies=imports
|
||||
dependencies=imports,
|
||||
)
|
||||
examples.append(example)
|
||||
|
||||
@@ -461,10 +443,10 @@ class PythonTestAnalyzer:
|
||||
func_node: ast.FunctionDef,
|
||||
file_path: str,
|
||||
description: str,
|
||||
setup_code: Optional[str],
|
||||
tags: List[str],
|
||||
imports: List[str]
|
||||
) -> List[TestExample]:
|
||||
setup_code: str | None,
|
||||
tags: list[str],
|
||||
imports: list[str],
|
||||
) -> list[TestExample]:
|
||||
"""Find configuration dictionary patterns"""
|
||||
examples = []
|
||||
|
||||
@@ -491,7 +473,7 @@ class PythonTestAnalyzer:
|
||||
complexity_score=self._calculate_complexity(code),
|
||||
confidence=0.75,
|
||||
tags=tags,
|
||||
dependencies=imports
|
||||
dependencies=imports,
|
||||
)
|
||||
examples.append(example)
|
||||
|
||||
@@ -502,10 +484,10 @@ class PythonTestAnalyzer:
|
||||
func_node: ast.FunctionDef,
|
||||
file_path: str,
|
||||
description: str,
|
||||
setup_code: Optional[str],
|
||||
tags: List[str],
|
||||
imports: List[str]
|
||||
) -> List[TestExample]:
|
||||
setup_code: str | None,
|
||||
tags: list[str],
|
||||
imports: list[str],
|
||||
) -> list[TestExample]:
|
||||
"""Find multi-step workflow patterns (integration tests)"""
|
||||
examples = []
|
||||
|
||||
@@ -515,7 +497,7 @@ class PythonTestAnalyzer:
|
||||
code = ast.unparse(func_node.body)
|
||||
|
||||
# Skip if too long (> 30 lines)
|
||||
if code.count('\n') > 30:
|
||||
if code.count("\n") > 30:
|
||||
return examples
|
||||
|
||||
example = TestExample(
|
||||
@@ -532,8 +514,8 @@ class PythonTestAnalyzer:
|
||||
line_end=func_node.end_lineno or func_node.lineno,
|
||||
complexity_score=min(1.0, len(func_node.body) / 10),
|
||||
confidence=0.9,
|
||||
tags=tags + ['workflow', 'integration'],
|
||||
dependencies=imports
|
||||
tags=tags + ["workflow", "integration"],
|
||||
dependencies=imports,
|
||||
)
|
||||
examples.append(example)
|
||||
|
||||
@@ -568,7 +550,7 @@ class PythonTestAnalyzer:
|
||||
|
||||
if isinstance(node, ast.Expr) and isinstance(node.value, ast.Call):
|
||||
call_str = ast.unparse(node.value).lower()
|
||||
assertion_methods = ['assert', 'expect', 'should']
|
||||
assertion_methods = ["assert", "expect", "should"]
|
||||
return any(method in call_str for method in assertion_methods)
|
||||
|
||||
return False
|
||||
@@ -584,7 +566,7 @@ class PythonTestAnalyzer:
|
||||
def _is_integration_test(self, func_node: ast.FunctionDef) -> bool:
|
||||
"""Check if test looks like an integration test"""
|
||||
test_name = func_node.name.lower()
|
||||
integration_keywords = ['workflow', 'integration', 'end_to_end', 'e2e', 'full']
|
||||
integration_keywords = ["workflow", "integration", "end_to_end", "e2e", "full"]
|
||||
return any(keyword in test_name for keyword in integration_keywords)
|
||||
|
||||
def _extract_assertion_after(self, func_node: ast.FunctionDef, target_node: ast.AST) -> str:
|
||||
@@ -608,8 +590,8 @@ class PythonTestAnalyzer:
|
||||
def _calculate_complexity(self, code: str) -> float:
|
||||
"""Calculate code complexity score (0-1)"""
|
||||
# Simple heuristic: more lines + more parameters = more complex
|
||||
lines = code.count('\n') + 1
|
||||
params = code.count(',') + 1
|
||||
lines = code.count("\n") + 1
|
||||
params = code.count(",") + 1
|
||||
|
||||
complexity = min(1.0, (lines * 0.1) + (params * 0.05))
|
||||
return round(complexity, 2)
|
||||
@@ -623,57 +605,58 @@ class PythonTestAnalyzer:
|
||||
# GENERIC TEST ANALYZER (Regex-based for non-Python languages)
|
||||
# ============================================================================
|
||||
|
||||
|
||||
class GenericTestAnalyzer:
|
||||
"""Regex-based test example extraction for non-Python languages"""
|
||||
|
||||
# Language-specific regex patterns
|
||||
PATTERNS = {
|
||||
"javascript": {
|
||||
"instantiation": r'(?:const|let|var)\s+(\w+)\s*=\s*new\s+(\w+)\(([^)]*)\)',
|
||||
"assertion": r'expect\(([^)]+)\)\.to(?:Equal|Be|Match)\(([^)]+)\)',
|
||||
"instantiation": r"(?:const|let|var)\s+(\w+)\s*=\s*new\s+(\w+)\(([^)]*)\)",
|
||||
"assertion": r"expect\(([^)]+)\)\.to(?:Equal|Be|Match)\(([^)]+)\)",
|
||||
"test_function": r'(?:test|it)\(["\']([^"\']+)["\']',
|
||||
"config": r'(?:const|let)\s+config\s*=\s*\{[\s\S]{20,500}?\}',
|
||||
"config": r"(?:const|let)\s+config\s*=\s*\{[\s\S]{20,500}?\}",
|
||||
},
|
||||
"typescript": {
|
||||
"instantiation": r'(?:const|let|var)\s+(\w+):\s*\w+\s*=\s*new\s+(\w+)\(([^)]*)\)',
|
||||
"assertion": r'expect\(([^)]+)\)\.to(?:Equal|Be|Match)\(([^)]+)\)',
|
||||
"instantiation": r"(?:const|let|var)\s+(\w+):\s*\w+\s*=\s*new\s+(\w+)\(([^)]*)\)",
|
||||
"assertion": r"expect\(([^)]+)\)\.to(?:Equal|Be|Match)\(([^)]+)\)",
|
||||
"test_function": r'(?:test|it)\(["\']([^"\']+)["\']',
|
||||
"config": r'(?:const|let)\s+config:\s*\w+\s*=\s*\{[\s\S]{20,500}?\}',
|
||||
"config": r"(?:const|let)\s+config:\s*\w+\s*=\s*\{[\s\S]{20,500}?\}",
|
||||
},
|
||||
"go": {
|
||||
"instantiation": r'(\w+)\s*:=\s*(\w+)\{([^}]+)\}',
|
||||
"instantiation": r"(\w+)\s*:=\s*(\w+)\{([^}]+)\}",
|
||||
"assertion": r't\.(?:Error|Fatal)(?:f)?\(["\']([^"\']+)["\']',
|
||||
"test_function": r'func\s+(Test\w+)\(t\s+\*testing\.T\)',
|
||||
"table_test": r'tests\s*:=\s*\[\]struct\s*\{[\s\S]{50,1000}?\}',
|
||||
"test_function": r"func\s+(Test\w+)\(t\s+\*testing\.T\)",
|
||||
"table_test": r"tests\s*:=\s*\[\]struct\s*\{[\s\S]{50,1000}?\}",
|
||||
},
|
||||
"rust": {
|
||||
"instantiation": r'let\s+(\w+)\s*=\s*(\w+)::new\(([^)]*)\)',
|
||||
"assertion": r'assert(?:_eq)?!\(([^)]+)\)',
|
||||
"test_function": r'#\[test\]\s*fn\s+(\w+)\(\)',
|
||||
"instantiation": r"let\s+(\w+)\s*=\s*(\w+)::new\(([^)]*)\)",
|
||||
"assertion": r"assert(?:_eq)?!\(([^)]+)\)",
|
||||
"test_function": r"#\[test\]\s*fn\s+(\w+)\(\)",
|
||||
},
|
||||
"java": {
|
||||
"instantiation": r'(\w+)\s+(\w+)\s*=\s*new\s+(\w+)\(([^)]*)\)',
|
||||
"assertion": r'assert(?:Equals|True|False|NotNull)\(([^)]+)\)',
|
||||
"test_function": r'@Test\s+public\s+void\s+(\w+)\(\)',
|
||||
"instantiation": r"(\w+)\s+(\w+)\s*=\s*new\s+(\w+)\(([^)]*)\)",
|
||||
"assertion": r"assert(?:Equals|True|False|NotNull)\(([^)]+)\)",
|
||||
"test_function": r"@Test\s+public\s+void\s+(\w+)\(\)",
|
||||
},
|
||||
"csharp": {
|
||||
"instantiation": r'var\s+(\w+)\s*=\s*new\s+(\w+)\(([^)]*)\)',
|
||||
"assertion": r'Assert\.(?:AreEqual|IsTrue|IsFalse|IsNotNull)\(([^)]+)\)',
|
||||
"test_function": r'\[Test\]\s+public\s+void\s+(\w+)\(\)',
|
||||
"instantiation": r"var\s+(\w+)\s*=\s*new\s+(\w+)\(([^)]*)\)",
|
||||
"assertion": r"Assert\.(?:AreEqual|IsTrue|IsFalse|IsNotNull)\(([^)]+)\)",
|
||||
"test_function": r"\[Test\]\s+public\s+void\s+(\w+)\(\)",
|
||||
},
|
||||
"php": {
|
||||
"instantiation": r'\$(\w+)\s*=\s*new\s+(\w+)\(([^)]*)\)',
|
||||
"assertion": r'\$this->assert(?:Equals|True|False|NotNull)\(([^)]+)\)',
|
||||
"test_function": r'public\s+function\s+(test\w+)\(\)',
|
||||
"instantiation": r"\$(\w+)\s*=\s*new\s+(\w+)\(([^)]*)\)",
|
||||
"assertion": r"\$this->assert(?:Equals|True|False|NotNull)\(([^)]+)\)",
|
||||
"test_function": r"public\s+function\s+(test\w+)\(\)",
|
||||
},
|
||||
"ruby": {
|
||||
"instantiation": r'(\w+)\s*=\s*(\w+)\.new\(([^)]*)\)',
|
||||
"assertion": r'expect\(([^)]+)\)\.to\s+(?:eq|be|match)\(([^)]+)\)',
|
||||
"instantiation": r"(\w+)\s*=\s*(\w+)\.new\(([^)]*)\)",
|
||||
"assertion": r"expect\(([^)]+)\)\.to\s+(?:eq|be|match)\(([^)]+)\)",
|
||||
"test_function": r'(?:test|it)\s+["\']([^"\']+)["\']',
|
||||
}
|
||||
},
|
||||
}
|
||||
|
||||
def extract(self, file_path: str, code: str, language: str) -> List[TestExample]:
|
||||
def extract(self, file_path: str, code: str, language: str) -> list[TestExample]:
|
||||
"""Extract examples from test file using regex patterns"""
|
||||
examples = []
|
||||
|
||||
@@ -704,7 +687,7 @@ class GenericTestAnalyzer:
|
||||
code=inst_match.group(0),
|
||||
language=language,
|
||||
file_path=file_path,
|
||||
line_number=code[:start_pos + inst_match.start()].count('\n') + 1
|
||||
line_number=code[: start_pos + inst_match.start()].count("\n") + 1,
|
||||
)
|
||||
examples.append(example)
|
||||
|
||||
@@ -717,20 +700,14 @@ class GenericTestAnalyzer:
|
||||
code=config_match.group(0),
|
||||
language=language,
|
||||
file_path=file_path,
|
||||
line_number=code[:start_pos + config_match.start()].count('\n') + 1
|
||||
line_number=code[: start_pos + config_match.start()].count("\n") + 1,
|
||||
)
|
||||
examples.append(example)
|
||||
|
||||
return examples
|
||||
|
||||
def _create_example(
|
||||
self,
|
||||
test_name: str,
|
||||
category: str,
|
||||
code: str,
|
||||
language: str,
|
||||
file_path: str,
|
||||
line_number: int
|
||||
self, test_name: str, category: str, code: str, language: str, file_path: str, line_number: int
|
||||
) -> TestExample:
|
||||
"""Create TestExample from regex match"""
|
||||
return TestExample(
|
||||
@@ -743,11 +720,11 @@ class GenericTestAnalyzer:
|
||||
expected_behavior="",
|
||||
file_path=file_path,
|
||||
line_start=line_number,
|
||||
line_end=line_number + code.count('\n'),
|
||||
complexity_score=min(1.0, (code.count('\n') + 1) * 0.1),
|
||||
line_end=line_number + code.count("\n"),
|
||||
complexity_score=min(1.0, (code.count("\n") + 1) * 0.1),
|
||||
confidence=0.6, # Lower confidence for regex extraction
|
||||
tags=[],
|
||||
dependencies=[]
|
||||
dependencies=[],
|
||||
)
|
||||
|
||||
|
||||
@@ -755,6 +732,7 @@ class GenericTestAnalyzer:
|
||||
# EXAMPLE QUALITY FILTER
|
||||
# ============================================================================
|
||||
|
||||
|
||||
class ExampleQualityFilter:
|
||||
"""Filter out trivial or low-quality examples"""
|
||||
|
||||
@@ -764,16 +742,16 @@ class ExampleQualityFilter:
|
||||
|
||||
# Trivial patterns to exclude
|
||||
self.trivial_patterns = [
|
||||
'Mock()',
|
||||
'MagicMock()',
|
||||
'assertTrue(True)',
|
||||
'assertFalse(False)',
|
||||
'assertEqual(1, 1)',
|
||||
'pass',
|
||||
'...',
|
||||
"Mock()",
|
||||
"MagicMock()",
|
||||
"assertTrue(True)",
|
||||
"assertFalse(False)",
|
||||
"assertEqual(1, 1)",
|
||||
"pass",
|
||||
"...",
|
||||
]
|
||||
|
||||
def filter(self, examples: List[TestExample]) -> List[TestExample]:
|
||||
def filter(self, examples: list[TestExample]) -> list[TestExample]:
|
||||
"""Filter examples by quality criteria"""
|
||||
filtered = []
|
||||
|
||||
@@ -803,42 +781,43 @@ class ExampleQualityFilter:
|
||||
# TEST EXAMPLE EXTRACTOR (Main Orchestrator)
|
||||
# ============================================================================
|
||||
|
||||
|
||||
class TestExampleExtractor:
|
||||
"""Main orchestrator for test example extraction"""
|
||||
|
||||
# Test file patterns
|
||||
TEST_PATTERNS = [
|
||||
'test_*.py',
|
||||
'*_test.py',
|
||||
'test*.js',
|
||||
'*test.js',
|
||||
'*_test.go',
|
||||
'*_test.rs',
|
||||
'Test*.java',
|
||||
'Test*.cs',
|
||||
'*Test.php',
|
||||
'*_spec.rb',
|
||||
"test_*.py",
|
||||
"*_test.py",
|
||||
"test*.js",
|
||||
"*test.js",
|
||||
"*_test.go",
|
||||
"*_test.rs",
|
||||
"Test*.java",
|
||||
"Test*.cs",
|
||||
"*Test.php",
|
||||
"*_spec.rb",
|
||||
]
|
||||
|
||||
# Language detection by extension
|
||||
LANGUAGE_MAP = {
|
||||
'.py': 'Python',
|
||||
'.js': 'JavaScript',
|
||||
'.ts': 'TypeScript',
|
||||
'.go': 'Go',
|
||||
'.rs': 'Rust',
|
||||
'.java': 'Java',
|
||||
'.cs': 'C#',
|
||||
'.php': 'PHP',
|
||||
'.rb': 'Ruby',
|
||||
".py": "Python",
|
||||
".js": "JavaScript",
|
||||
".ts": "TypeScript",
|
||||
".go": "Go",
|
||||
".rs": "Rust",
|
||||
".java": "Java",
|
||||
".cs": "C#",
|
||||
".php": "PHP",
|
||||
".rb": "Ruby",
|
||||
}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
min_confidence: float = 0.7,
|
||||
max_per_file: int = 10,
|
||||
languages: Optional[List[str]] = None,
|
||||
enhance_with_ai: bool = True
|
||||
languages: list[str] | None = None,
|
||||
enhance_with_ai: bool = True,
|
||||
):
|
||||
self.python_analyzer = PythonTestAnalyzer()
|
||||
self.generic_analyzer = GenericTestAnalyzer()
|
||||
@@ -852,16 +831,13 @@ class TestExampleExtractor:
|
||||
if self.enhance_with_ai:
|
||||
try:
|
||||
from skill_seekers.cli.ai_enhancer import TestExampleEnhancer
|
||||
|
||||
self.ai_enhancer = TestExampleEnhancer()
|
||||
except Exception as e:
|
||||
logger.warning(f"⚠️ Failed to initialize AI enhancer: {e}")
|
||||
self.enhance_with_ai = False
|
||||
|
||||
def extract_from_directory(
|
||||
self,
|
||||
directory: Path,
|
||||
recursive: bool = True
|
||||
) -> ExampleReport:
|
||||
def extract_from_directory(self, directory: Path, recursive: bool = True) -> ExampleReport:
|
||||
"""Extract examples from all test files in directory"""
|
||||
directory = Path(directory)
|
||||
|
||||
@@ -882,7 +858,7 @@ class TestExampleExtractor:
|
||||
# Generate report
|
||||
return self._create_report(all_examples, directory=str(directory))
|
||||
|
||||
def extract_from_file(self, file_path: Path) -> List[TestExample]:
|
||||
def extract_from_file(self, file_path: Path) -> list[TestExample]:
|
||||
"""Extract examples from single test file"""
|
||||
file_path = Path(file_path)
|
||||
|
||||
@@ -898,13 +874,13 @@ class TestExampleExtractor:
|
||||
|
||||
# Read file
|
||||
try:
|
||||
code = file_path.read_text(encoding='utf-8')
|
||||
code = file_path.read_text(encoding="utf-8")
|
||||
except UnicodeDecodeError:
|
||||
logger.warning(f"Failed to read {file_path} (encoding error)")
|
||||
return []
|
||||
|
||||
# Extract examples based on language
|
||||
if language == 'Python':
|
||||
if language == "Python":
|
||||
examples = self.python_analyzer.extract(str(file_path), code)
|
||||
else:
|
||||
examples = self.generic_analyzer.extract(str(file_path), code, language)
|
||||
@@ -915,17 +891,13 @@ class TestExampleExtractor:
|
||||
# Limit per file
|
||||
if len(filtered_examples) > self.max_per_file:
|
||||
# Sort by confidence and take top N
|
||||
filtered_examples = sorted(
|
||||
filtered_examples,
|
||||
key=lambda x: x.confidence,
|
||||
reverse=True
|
||||
)[:self.max_per_file]
|
||||
filtered_examples = sorted(filtered_examples, key=lambda x: x.confidence, reverse=True)[: self.max_per_file]
|
||||
|
||||
logger.info(f"Extracted {len(filtered_examples)} examples from {file_path.name}")
|
||||
|
||||
return filtered_examples
|
||||
|
||||
def _find_test_files(self, directory: Path, recursive: bool) -> List[Path]:
|
||||
def _find_test_files(self, directory: Path, recursive: bool) -> list[Path]:
|
||||
"""Find test files in directory"""
|
||||
test_files = []
|
||||
|
||||
@@ -940,13 +912,10 @@ class TestExampleExtractor:
|
||||
def _detect_language(self, file_path: Path) -> str:
|
||||
"""Detect programming language from file extension"""
|
||||
suffix = file_path.suffix.lower()
|
||||
return self.LANGUAGE_MAP.get(suffix, 'Unknown')
|
||||
return self.LANGUAGE_MAP.get(suffix, "Unknown")
|
||||
|
||||
def _create_report(
|
||||
self,
|
||||
examples: List[TestExample],
|
||||
file_path: Optional[str] = None,
|
||||
directory: Optional[str] = None
|
||||
self, examples: list[TestExample], file_path: str | None = None, directory: str | None = None
|
||||
) -> ExampleReport:
|
||||
"""Create summary report from examples"""
|
||||
# Enhance examples with AI analysis (C3.6)
|
||||
@@ -957,20 +926,18 @@ class TestExampleExtractor:
|
||||
|
||||
# Update examples with AI analysis
|
||||
for i, example in enumerate(examples):
|
||||
if i < len(enhanced_dicts) and 'ai_analysis' in enhanced_dicts[i]:
|
||||
example.ai_analysis = enhanced_dicts[i]['ai_analysis']
|
||||
if i < len(enhanced_dicts) and "ai_analysis" in enhanced_dicts[i]:
|
||||
example.ai_analysis = enhanced_dicts[i]["ai_analysis"]
|
||||
|
||||
# Count by category
|
||||
examples_by_category = {}
|
||||
for example in examples:
|
||||
examples_by_category[example.category] = \
|
||||
examples_by_category.get(example.category, 0) + 1
|
||||
examples_by_category[example.category] = examples_by_category.get(example.category, 0) + 1
|
||||
|
||||
# Count by language
|
||||
examples_by_language = {}
|
||||
for example in examples:
|
||||
examples_by_language[example.language] = \
|
||||
examples_by_language.get(example.language, 0) + 1
|
||||
examples_by_language[example.language] = examples_by_language.get(example.language, 0) + 1
|
||||
|
||||
# Calculate averages
|
||||
avg_complexity = sum(ex.complexity_score for ex in examples) / len(examples) if examples else 0.0
|
||||
@@ -984,7 +951,7 @@ class TestExampleExtractor:
|
||||
avg_complexity=round(avg_complexity, 2),
|
||||
high_value_count=high_value_count,
|
||||
file_path=file_path,
|
||||
directory=directory
|
||||
directory=directory,
|
||||
)
|
||||
|
||||
|
||||
@@ -992,10 +959,11 @@ class TestExampleExtractor:
|
||||
# COMMAND-LINE INTERFACE
|
||||
# ============================================================================
|
||||
|
||||
|
||||
def main():
|
||||
"""Main entry point for CLI"""
|
||||
parser = argparse.ArgumentParser(
|
||||
description='Extract usage examples from test files',
|
||||
description="Extract usage examples from test files",
|
||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||
epilog="""
|
||||
Examples:
|
||||
@@ -1010,49 +978,20 @@ Examples:
|
||||
|
||||
# Filter by confidence
|
||||
%(prog)s tests/ --min-confidence 0.7
|
||||
"""
|
||||
""",
|
||||
)
|
||||
|
||||
parser.add_argument("directory", nargs="?", help="Directory containing test files")
|
||||
parser.add_argument("--file", help="Single test file to analyze")
|
||||
parser.add_argument("--language", help="Filter by programming language (python, javascript, etc.)")
|
||||
parser.add_argument(
|
||||
'directory',
|
||||
nargs='?',
|
||||
help='Directory containing test files'
|
||||
"--min-confidence", type=float, default=0.5, help="Minimum confidence threshold (0.0-1.0, default: 0.5)"
|
||||
)
|
||||
parser.add_argument("--max-per-file", type=int, default=10, help="Maximum examples per file (default: 10)")
|
||||
parser.add_argument("--json", action="store_true", help="Output JSON format")
|
||||
parser.add_argument("--markdown", action="store_true", help="Output Markdown format")
|
||||
parser.add_argument(
|
||||
'--file',
|
||||
help='Single test file to analyze'
|
||||
)
|
||||
parser.add_argument(
|
||||
'--language',
|
||||
help='Filter by programming language (python, javascript, etc.)'
|
||||
)
|
||||
parser.add_argument(
|
||||
'--min-confidence',
|
||||
type=float,
|
||||
default=0.5,
|
||||
help='Minimum confidence threshold (0.0-1.0, default: 0.5)'
|
||||
)
|
||||
parser.add_argument(
|
||||
'--max-per-file',
|
||||
type=int,
|
||||
default=10,
|
||||
help='Maximum examples per file (default: 10)'
|
||||
)
|
||||
parser.add_argument(
|
||||
'--json',
|
||||
action='store_true',
|
||||
help='Output JSON format'
|
||||
)
|
||||
parser.add_argument(
|
||||
'--markdown',
|
||||
action='store_true',
|
||||
help='Output Markdown format'
|
||||
)
|
||||
parser.add_argument(
|
||||
'--recursive',
|
||||
action='store_true',
|
||||
default=True,
|
||||
help='Search directory recursively (default: True)'
|
||||
"--recursive", action="store_true", default=True, help="Search directory recursively (default: True)"
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
@@ -1064,9 +1003,7 @@ Examples:
|
||||
# Create extractor
|
||||
languages = [args.language] if args.language else None
|
||||
extractor = TestExampleExtractor(
|
||||
min_confidence=args.min_confidence,
|
||||
max_per_file=args.max_per_file,
|
||||
languages=languages
|
||||
min_confidence=args.min_confidence, max_per_file=args.max_per_file, languages=languages
|
||||
)
|
||||
|
||||
# Extract examples
|
||||
@@ -1074,10 +1011,7 @@ Examples:
|
||||
examples = extractor.extract_from_file(Path(args.file))
|
||||
report = extractor._create_report(examples, file_path=args.file)
|
||||
else:
|
||||
report = extractor.extract_from_directory(
|
||||
Path(args.directory),
|
||||
recursive=args.recursive
|
||||
)
|
||||
report = extractor.extract_from_directory(Path(args.directory), recursive=args.recursive)
|
||||
|
||||
# Output results
|
||||
if args.json:
|
||||
@@ -1086,19 +1020,19 @@ Examples:
|
||||
print(report.to_markdown())
|
||||
else:
|
||||
# Human-readable summary
|
||||
print(f"\nTest Example Extraction Results")
|
||||
print(f"=" * 50)
|
||||
print("\nTest Example Extraction Results")
|
||||
print("=" * 50)
|
||||
print(f"Total Examples: {report.total_examples}")
|
||||
print(f"High Value (confidence > 0.7): {report.high_value_count}")
|
||||
print(f"Average Complexity: {report.avg_complexity:.2f}")
|
||||
print(f"\nExamples by Category:")
|
||||
print("\nExamples by Category:")
|
||||
for category, count in sorted(report.examples_by_category.items()):
|
||||
print(f" {category}: {count}")
|
||||
print(f"\nExamples by Language:")
|
||||
print("\nExamples by Language:")
|
||||
for language, count in sorted(report.examples_by_language.items()):
|
||||
print(f" {language}: {count}")
|
||||
print(f"\nUse --json or --markdown for detailed output")
|
||||
print("\nUse --json or --markdown for detailed output")
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -5,9 +5,9 @@ Simple Integration Tests for Unified Multi-Source Scraper
|
||||
Focuses on real-world usage patterns rather than unit tests.
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
@@ -16,16 +16,12 @@ sys.path.insert(0, str(Path(__file__).parent))
|
||||
|
||||
from .config_validator import validate_config
|
||||
|
||||
|
||||
def test_validate_existing_unified_configs():
|
||||
"""Test that all existing unified configs are valid"""
|
||||
configs_dir = Path(__file__).parent.parent / 'configs'
|
||||
configs_dir = Path(__file__).parent.parent / "configs"
|
||||
|
||||
unified_configs = [
|
||||
'godot_unified.json',
|
||||
'react_unified.json',
|
||||
'django_unified.json',
|
||||
'fastapi_unified.json'
|
||||
]
|
||||
unified_configs = ["godot_unified.json", "react_unified.json", "django_unified.json", "fastapi_unified.json"]
|
||||
|
||||
for config_name in unified_configs:
|
||||
config_path = configs_dir / config_name
|
||||
@@ -40,13 +36,9 @@ def test_validate_existing_unified_configs():
|
||||
|
||||
def test_backward_compatibility():
|
||||
"""Test that legacy configs still work"""
|
||||
configs_dir = Path(__file__).parent.parent / 'configs'
|
||||
configs_dir = Path(__file__).parent.parent / "configs"
|
||||
|
||||
legacy_configs = [
|
||||
'react.json',
|
||||
'godot.json',
|
||||
'django.json'
|
||||
]
|
||||
legacy_configs = ["react.json", "godot.json", "django.json"]
|
||||
|
||||
for config_name in legacy_configs:
|
||||
config_path = configs_dir / config_name
|
||||
@@ -54,7 +46,7 @@ def test_backward_compatibility():
|
||||
print(f"\n✓ Validating legacy {config_name}...")
|
||||
validator = validate_config(str(config_path))
|
||||
assert not validator.is_unified, f"{config_name} should be legacy format"
|
||||
print(f" Format: Legacy")
|
||||
print(" Format: Legacy")
|
||||
|
||||
|
||||
def test_create_temp_unified_config():
|
||||
@@ -64,22 +56,12 @@ def test_create_temp_unified_config():
|
||||
"description": "Test unified config",
|
||||
"merge_mode": "rule-based",
|
||||
"sources": [
|
||||
{
|
||||
"type": "documentation",
|
||||
"base_url": "https://example.com/docs",
|
||||
"extract_api": True,
|
||||
"max_pages": 50
|
||||
},
|
||||
{
|
||||
"type": "github",
|
||||
"repo": "test/repo",
|
||||
"include_code": True,
|
||||
"code_analysis_depth": "surface"
|
||||
}
|
||||
]
|
||||
{"type": "documentation", "base_url": "https://example.com/docs", "extract_api": True, "max_pages": 50},
|
||||
{"type": "github", "repo": "test/repo", "include_code": True, "code_analysis_depth": "surface"},
|
||||
],
|
||||
}
|
||||
|
||||
with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as f:
|
||||
with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f:
|
||||
json.dump(config, f)
|
||||
config_path = f.name
|
||||
|
||||
@@ -88,7 +70,7 @@ def test_create_temp_unified_config():
|
||||
validator = validate_config(config_path)
|
||||
assert validator.is_unified
|
||||
assert validator.needs_api_merge()
|
||||
assert len(validator.config['sources']) == 2
|
||||
assert len(validator.config["sources"]) == 2
|
||||
print(" ✓ Config is valid unified format")
|
||||
print(f" Sources: {len(validator.config['sources'])}")
|
||||
finally:
|
||||
@@ -102,22 +84,13 @@ def test_mixed_source_types():
|
||||
"description": "Test mixed sources",
|
||||
"merge_mode": "rule-based",
|
||||
"sources": [
|
||||
{
|
||||
"type": "documentation",
|
||||
"base_url": "https://example.com"
|
||||
},
|
||||
{
|
||||
"type": "github",
|
||||
"repo": "test/repo"
|
||||
},
|
||||
{
|
||||
"type": "pdf",
|
||||
"path": "/path/to/manual.pdf"
|
||||
}
|
||||
]
|
||||
{"type": "documentation", "base_url": "https://example.com"},
|
||||
{"type": "github", "repo": "test/repo"},
|
||||
{"type": "pdf", "path": "/path/to/manual.pdf"},
|
||||
],
|
||||
}
|
||||
|
||||
with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as f:
|
||||
with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f:
|
||||
json.dump(config, f)
|
||||
config_path = f.name
|
||||
|
||||
@@ -125,13 +98,13 @@ def test_mixed_source_types():
|
||||
print("\n✓ Validating mixed source types...")
|
||||
validator = validate_config(config_path)
|
||||
assert validator.is_unified
|
||||
assert len(validator.config['sources']) == 3
|
||||
assert len(validator.config["sources"]) == 3
|
||||
|
||||
# Check each source type
|
||||
source_types = [s['type'] for s in validator.config['sources']]
|
||||
assert 'documentation' in source_types
|
||||
assert 'github' in source_types
|
||||
assert 'pdf' in source_types
|
||||
source_types = [s["type"] for s in validator.config["sources"]]
|
||||
assert "documentation" in source_types
|
||||
assert "github" in source_types
|
||||
assert "pdf" in source_types
|
||||
print(" ✓ All 3 source types validated")
|
||||
finally:
|
||||
os.unlink(config_path)
|
||||
@@ -143,12 +116,10 @@ def test_config_validation_errors():
|
||||
config = {
|
||||
"name": "test",
|
||||
"description": "Test",
|
||||
"sources": [
|
||||
{"type": "invalid_type", "url": "https://example.com"}
|
||||
]
|
||||
"sources": [{"type": "invalid_type", "url": "https://example.com"}],
|
||||
}
|
||||
|
||||
with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as f:
|
||||
with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f:
|
||||
json.dump(config, f)
|
||||
config_path = f.name
|
||||
|
||||
@@ -166,7 +137,7 @@ def test_config_validation_errors():
|
||||
|
||||
|
||||
# Run tests
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
print("=" * 60)
|
||||
print("Running Unified Scraper Integration Tests")
|
||||
print("=" * 60)
|
||||
@@ -188,5 +159,6 @@ if __name__ == '__main__':
|
||||
except Exception as e:
|
||||
print(f"\n❌ Unexpected error: {e}")
|
||||
import traceback
|
||||
|
||||
traceback.print_exc()
|
||||
sys.exit(1)
|
||||
|
||||
@@ -13,21 +13,21 @@ Analysis modes:
|
||||
"""
|
||||
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import Dict, Optional, List
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
|
||||
from skill_seekers.cli.github_fetcher import GitHubThreeStreamFetcher, ThreeStreamData
|
||||
from skill_seekers.cli.github_fetcher import GitHubThreeStreamFetcher
|
||||
|
||||
|
||||
@dataclass
|
||||
class AnalysisResult:
|
||||
"""Unified analysis result from any codebase source."""
|
||||
code_analysis: Dict
|
||||
github_docs: Optional[Dict] = None
|
||||
github_insights: Optional[Dict] = None
|
||||
source_type: str = 'local' # 'local' or 'github'
|
||||
analysis_depth: str = 'basic' # 'basic' or 'c3x'
|
||||
|
||||
code_analysis: dict
|
||||
github_docs: dict | None = None
|
||||
github_insights: dict | None = None
|
||||
source_type: str = "local" # 'local' or 'github'
|
||||
analysis_depth: str = "basic" # 'basic' or 'c3x'
|
||||
|
||||
|
||||
class UnifiedCodebaseAnalyzer:
|
||||
@@ -59,21 +59,17 @@ class UnifiedCodebaseAnalyzer:
|
||||
)
|
||||
"""
|
||||
|
||||
def __init__(self, github_token: Optional[str] = None):
|
||||
def __init__(self, github_token: str | None = None):
|
||||
"""
|
||||
Initialize analyzer.
|
||||
|
||||
Args:
|
||||
github_token: Optional GitHub API token for higher rate limits
|
||||
"""
|
||||
self.github_token = github_token or os.getenv('GITHUB_TOKEN')
|
||||
self.github_token = github_token or os.getenv("GITHUB_TOKEN")
|
||||
|
||||
def analyze(
|
||||
self,
|
||||
source: str,
|
||||
depth: str = 'c3x',
|
||||
fetch_github_metadata: bool = True,
|
||||
output_dir: Optional[Path] = None
|
||||
self, source: str, depth: str = "c3x", fetch_github_metadata: bool = True, output_dir: Path | None = None
|
||||
) -> AnalysisResult:
|
||||
"""
|
||||
Analyze codebase with specified depth.
|
||||
@@ -92,18 +88,14 @@ class UnifiedCodebaseAnalyzer:
|
||||
|
||||
# Step 1: Acquire source
|
||||
if self.is_github_url(source):
|
||||
print(f"📦 Source type: GitHub repository")
|
||||
print("📦 Source type: GitHub repository")
|
||||
return self._analyze_github(source, depth, fetch_github_metadata, output_dir)
|
||||
else:
|
||||
print(f"📁 Source type: Local directory")
|
||||
print("📁 Source type: Local directory")
|
||||
return self._analyze_local(source, depth)
|
||||
|
||||
def _analyze_github(
|
||||
self,
|
||||
repo_url: str,
|
||||
depth: str,
|
||||
fetch_metadata: bool,
|
||||
output_dir: Optional[Path]
|
||||
self, repo_url: str, depth: str, fetch_metadata: bool, output_dir: Path | None
|
||||
) -> AnalysisResult:
|
||||
"""
|
||||
Analyze GitHub repository with three-stream fetcher.
|
||||
@@ -123,32 +115,28 @@ class UnifiedCodebaseAnalyzer:
|
||||
|
||||
# Analyze code with specified depth
|
||||
code_directory = three_streams.code_stream.directory
|
||||
if depth == 'basic':
|
||||
if depth == "basic":
|
||||
code_analysis = self.basic_analysis(code_directory)
|
||||
elif depth == 'c3x':
|
||||
elif depth == "c3x":
|
||||
code_analysis = self.c3x_analysis(code_directory)
|
||||
else:
|
||||
raise ValueError(f"Unknown depth: {depth}. Use 'basic' or 'c3x'")
|
||||
|
||||
# Build result with all streams
|
||||
result = AnalysisResult(
|
||||
code_analysis=code_analysis,
|
||||
source_type='github',
|
||||
analysis_depth=depth
|
||||
)
|
||||
result = AnalysisResult(code_analysis=code_analysis, source_type="github", analysis_depth=depth)
|
||||
|
||||
# Add GitHub-specific data if available
|
||||
if fetch_metadata:
|
||||
result.github_docs = {
|
||||
'readme': three_streams.docs_stream.readme,
|
||||
'contributing': three_streams.docs_stream.contributing,
|
||||
'docs_files': three_streams.docs_stream.docs_files
|
||||
"readme": three_streams.docs_stream.readme,
|
||||
"contributing": three_streams.docs_stream.contributing,
|
||||
"docs_files": three_streams.docs_stream.docs_files,
|
||||
}
|
||||
result.github_insights = {
|
||||
'metadata': three_streams.insights_stream.metadata,
|
||||
'common_problems': three_streams.insights_stream.common_problems,
|
||||
'known_solutions': three_streams.insights_stream.known_solutions,
|
||||
'top_labels': three_streams.insights_stream.top_labels
|
||||
"metadata": three_streams.insights_stream.metadata,
|
||||
"common_problems": three_streams.insights_stream.common_problems,
|
||||
"known_solutions": three_streams.insights_stream.known_solutions,
|
||||
"top_labels": three_streams.insights_stream.top_labels,
|
||||
}
|
||||
|
||||
return result
|
||||
@@ -173,20 +161,16 @@ class UnifiedCodebaseAnalyzer:
|
||||
raise NotADirectoryError(f"Not a directory: {directory}")
|
||||
|
||||
# Analyze code with specified depth
|
||||
if depth == 'basic':
|
||||
if depth == "basic":
|
||||
code_analysis = self.basic_analysis(code_directory)
|
||||
elif depth == 'c3x':
|
||||
elif depth == "c3x":
|
||||
code_analysis = self.c3x_analysis(code_directory)
|
||||
else:
|
||||
raise ValueError(f"Unknown depth: {depth}. Use 'basic' or 'c3x'")
|
||||
|
||||
return AnalysisResult(
|
||||
code_analysis=code_analysis,
|
||||
source_type='local',
|
||||
analysis_depth=depth
|
||||
)
|
||||
return AnalysisResult(code_analysis=code_analysis, source_type="local", analysis_depth=depth)
|
||||
|
||||
def basic_analysis(self, directory: Path) -> Dict:
|
||||
def basic_analysis(self, directory: Path) -> dict:
|
||||
"""
|
||||
Fast, shallow analysis (1-2 min).
|
||||
|
||||
@@ -205,19 +189,19 @@ class UnifiedCodebaseAnalyzer:
|
||||
print("📊 Running basic analysis (1-2 min)...")
|
||||
|
||||
analysis = {
|
||||
'directory': str(directory),
|
||||
'analysis_type': 'basic',
|
||||
'files': self.list_files(directory),
|
||||
'structure': self.get_directory_structure(directory),
|
||||
'imports': self.extract_imports(directory),
|
||||
'entry_points': self.find_entry_points(directory),
|
||||
'statistics': self.compute_statistics(directory)
|
||||
"directory": str(directory),
|
||||
"analysis_type": "basic",
|
||||
"files": self.list_files(directory),
|
||||
"structure": self.get_directory_structure(directory),
|
||||
"imports": self.extract_imports(directory),
|
||||
"entry_points": self.find_entry_points(directory),
|
||||
"statistics": self.compute_statistics(directory),
|
||||
}
|
||||
|
||||
print(f"✅ Basic analysis complete: {len(analysis['files'])} files analyzed")
|
||||
return analysis
|
||||
|
||||
def c3x_analysis(self, directory: Path) -> Dict:
|
||||
def c3x_analysis(self, directory: Path) -> dict:
|
||||
"""
|
||||
Deep C3.x analysis (20-60 min).
|
||||
|
||||
@@ -245,17 +229,18 @@ class UnifiedCodebaseAnalyzer:
|
||||
|
||||
try:
|
||||
# Import codebase analyzer
|
||||
from .codebase_scraper import analyze_codebase
|
||||
import tempfile
|
||||
|
||||
from .codebase_scraper import analyze_codebase
|
||||
|
||||
# Create temporary output directory for C3.x analysis
|
||||
temp_output = Path(tempfile.mkdtemp(prefix='c3x_analysis_'))
|
||||
temp_output = Path(tempfile.mkdtemp(prefix="c3x_analysis_"))
|
||||
|
||||
# Run full C3.x analysis
|
||||
analyze_codebase(
|
||||
directory=directory,
|
||||
output_dir=temp_output,
|
||||
depth='deep',
|
||||
depth="deep",
|
||||
languages=None, # All languages
|
||||
file_patterns=None, # All files
|
||||
build_api_reference=True,
|
||||
@@ -265,20 +250,16 @@ class UnifiedCodebaseAnalyzer:
|
||||
build_how_to_guides=True,
|
||||
extract_config_patterns=True,
|
||||
enhance_with_ai=False, # Disable AI for speed
|
||||
ai_mode='none'
|
||||
ai_mode="none",
|
||||
)
|
||||
|
||||
# Load C3.x results from output files
|
||||
c3x_data = self._load_c3x_results(temp_output)
|
||||
|
||||
# Merge with basic analysis
|
||||
c3x = {
|
||||
**basic,
|
||||
'analysis_type': 'c3x',
|
||||
**c3x_data
|
||||
}
|
||||
c3x = {**basic, "analysis_type": "c3x", **c3x_data}
|
||||
|
||||
print(f"✅ C3.x analysis complete!")
|
||||
print("✅ C3.x analysis complete!")
|
||||
print(f" - {len(c3x_data.get('c3_1_patterns', []))} design patterns detected")
|
||||
print(f" - {c3x_data.get('c3_2_examples_count', 0)} test examples extracted")
|
||||
print(f" - {len(c3x_data.get('c3_3_guides', []))} how-to guides generated")
|
||||
@@ -289,24 +270,24 @@ class UnifiedCodebaseAnalyzer:
|
||||
|
||||
except Exception as e:
|
||||
print(f"⚠️ C3.x analysis failed: {e}")
|
||||
print(f" Falling back to basic analysis with placeholders")
|
||||
print(" Falling back to basic analysis with placeholders")
|
||||
|
||||
# Fall back to placeholders
|
||||
c3x = {
|
||||
**basic,
|
||||
'analysis_type': 'c3x',
|
||||
'c3_1_patterns': [],
|
||||
'c3_2_examples': [],
|
||||
'c3_2_examples_count': 0,
|
||||
'c3_3_guides': [],
|
||||
'c3_4_configs': [],
|
||||
'c3_7_architecture': [],
|
||||
'error': str(e)
|
||||
"analysis_type": "c3x",
|
||||
"c3_1_patterns": [],
|
||||
"c3_2_examples": [],
|
||||
"c3_2_examples_count": 0,
|
||||
"c3_3_guides": [],
|
||||
"c3_4_configs": [],
|
||||
"c3_7_architecture": [],
|
||||
"error": str(e),
|
||||
}
|
||||
|
||||
return c3x
|
||||
|
||||
def _load_c3x_results(self, output_dir: Path) -> Dict:
|
||||
def _load_c3x_results(self, output_dir: Path) -> dict:
|
||||
"""
|
||||
Load C3.x analysis results from output directory.
|
||||
|
||||
@@ -321,65 +302,65 @@ class UnifiedCodebaseAnalyzer:
|
||||
c3x_data = {}
|
||||
|
||||
# C3.1: Design Patterns
|
||||
patterns_file = output_dir / 'patterns' / 'design_patterns.json'
|
||||
patterns_file = output_dir / "patterns" / "design_patterns.json"
|
||||
if patterns_file.exists():
|
||||
with open(patterns_file, 'r') as f:
|
||||
with open(patterns_file) as f:
|
||||
patterns_data = json.load(f)
|
||||
c3x_data['c3_1_patterns'] = patterns_data.get('patterns', [])
|
||||
c3x_data["c3_1_patterns"] = patterns_data.get("patterns", [])
|
||||
else:
|
||||
c3x_data['c3_1_patterns'] = []
|
||||
c3x_data["c3_1_patterns"] = []
|
||||
|
||||
# C3.2: Test Examples
|
||||
examples_file = output_dir / 'test_examples' / 'test_examples.json'
|
||||
examples_file = output_dir / "test_examples" / "test_examples.json"
|
||||
if examples_file.exists():
|
||||
with open(examples_file, 'r') as f:
|
||||
with open(examples_file) as f:
|
||||
examples_data = json.load(f)
|
||||
c3x_data['c3_2_examples'] = examples_data.get('examples', [])
|
||||
c3x_data['c3_2_examples_count'] = examples_data.get('total_examples', 0)
|
||||
c3x_data["c3_2_examples"] = examples_data.get("examples", [])
|
||||
c3x_data["c3_2_examples_count"] = examples_data.get("total_examples", 0)
|
||||
else:
|
||||
c3x_data['c3_2_examples'] = []
|
||||
c3x_data['c3_2_examples_count'] = 0
|
||||
c3x_data["c3_2_examples"] = []
|
||||
c3x_data["c3_2_examples_count"] = 0
|
||||
|
||||
# C3.3: How-to Guides
|
||||
guides_file = output_dir / 'tutorials' / 'guide_collection.json'
|
||||
guides_file = output_dir / "tutorials" / "guide_collection.json"
|
||||
if guides_file.exists():
|
||||
with open(guides_file, 'r') as f:
|
||||
with open(guides_file) as f:
|
||||
guides_data = json.load(f)
|
||||
c3x_data['c3_3_guides'] = guides_data.get('guides', [])
|
||||
c3x_data["c3_3_guides"] = guides_data.get("guides", [])
|
||||
else:
|
||||
c3x_data['c3_3_guides'] = []
|
||||
c3x_data["c3_3_guides"] = []
|
||||
|
||||
# C3.4: Config Patterns
|
||||
config_file = output_dir / 'config_patterns' / 'config_patterns.json'
|
||||
config_file = output_dir / "config_patterns" / "config_patterns.json"
|
||||
if config_file.exists():
|
||||
with open(config_file, 'r') as f:
|
||||
with open(config_file) as f:
|
||||
config_data = json.load(f)
|
||||
c3x_data['c3_4_configs'] = config_data.get('config_files', [])
|
||||
c3x_data["c3_4_configs"] = config_data.get("config_files", [])
|
||||
else:
|
||||
c3x_data['c3_4_configs'] = []
|
||||
c3x_data["c3_4_configs"] = []
|
||||
|
||||
# C3.7: Architecture
|
||||
arch_file = output_dir / 'architecture' / 'architectural_patterns.json'
|
||||
arch_file = output_dir / "architecture" / "architectural_patterns.json"
|
||||
if arch_file.exists():
|
||||
with open(arch_file, 'r') as f:
|
||||
with open(arch_file) as f:
|
||||
arch_data = json.load(f)
|
||||
c3x_data['c3_7_architecture'] = arch_data.get('patterns', [])
|
||||
c3x_data["c3_7_architecture"] = arch_data.get("patterns", [])
|
||||
else:
|
||||
c3x_data['c3_7_architecture'] = []
|
||||
c3x_data["c3_7_architecture"] = []
|
||||
|
||||
# Add dependency graph data
|
||||
dep_file = output_dir / 'dependencies' / 'dependency_graph.json'
|
||||
dep_file = output_dir / "dependencies" / "dependency_graph.json"
|
||||
if dep_file.exists():
|
||||
with open(dep_file, 'r') as f:
|
||||
with open(dep_file) as f:
|
||||
dep_data = json.load(f)
|
||||
c3x_data['dependency_graph'] = dep_data
|
||||
c3x_data["dependency_graph"] = dep_data
|
||||
|
||||
# Add API reference data
|
||||
api_file = output_dir / 'code_analysis.json'
|
||||
api_file = output_dir / "code_analysis.json"
|
||||
if api_file.exists():
|
||||
with open(api_file, 'r') as f:
|
||||
with open(api_file) as f:
|
||||
api_data = json.load(f)
|
||||
c3x_data['api_reference'] = api_data
|
||||
c3x_data["api_reference"] = api_data
|
||||
|
||||
return c3x_data
|
||||
|
||||
@@ -393,9 +374,9 @@ class UnifiedCodebaseAnalyzer:
|
||||
Returns:
|
||||
True if GitHub URL, False otherwise
|
||||
"""
|
||||
return 'github.com' in source
|
||||
return "github.com" in source
|
||||
|
||||
def list_files(self, directory: Path) -> List[Dict]:
|
||||
def list_files(self, directory: Path) -> list[dict]:
|
||||
"""
|
||||
List all files in directory with metadata.
|
||||
|
||||
@@ -406,20 +387,22 @@ class UnifiedCodebaseAnalyzer:
|
||||
List of file info dicts
|
||||
"""
|
||||
files = []
|
||||
for file_path in directory.rglob('*'):
|
||||
for file_path in directory.rglob("*"):
|
||||
if file_path.is_file():
|
||||
try:
|
||||
files.append({
|
||||
'path': str(file_path.relative_to(directory)),
|
||||
'size': file_path.stat().st_size,
|
||||
'extension': file_path.suffix
|
||||
})
|
||||
files.append(
|
||||
{
|
||||
"path": str(file_path.relative_to(directory)),
|
||||
"size": file_path.stat().st_size,
|
||||
"extension": file_path.suffix,
|
||||
}
|
||||
)
|
||||
except Exception:
|
||||
# Skip files we can't access
|
||||
continue
|
||||
return files
|
||||
|
||||
def get_directory_structure(self, directory: Path) -> Dict:
|
||||
def get_directory_structure(self, directory: Path) -> dict:
|
||||
"""
|
||||
Get directory structure tree.
|
||||
|
||||
@@ -429,35 +412,24 @@ class UnifiedCodebaseAnalyzer:
|
||||
Returns:
|
||||
Dict representing directory structure
|
||||
"""
|
||||
structure = {
|
||||
'name': directory.name,
|
||||
'type': 'directory',
|
||||
'children': []
|
||||
}
|
||||
structure = {"name": directory.name, "type": "directory", "children": []}
|
||||
|
||||
try:
|
||||
for item in sorted(directory.iterdir()):
|
||||
if item.name.startswith('.'):
|
||||
if item.name.startswith("."):
|
||||
continue # Skip hidden files
|
||||
|
||||
if item.is_dir():
|
||||
# Only include immediate subdirectories
|
||||
structure['children'].append({
|
||||
'name': item.name,
|
||||
'type': 'directory'
|
||||
})
|
||||
structure["children"].append({"name": item.name, "type": "directory"})
|
||||
elif item.is_file():
|
||||
structure['children'].append({
|
||||
'name': item.name,
|
||||
'type': 'file',
|
||||
'extension': item.suffix
|
||||
})
|
||||
structure["children"].append({"name": item.name, "type": "file", "extension": item.suffix})
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return structure
|
||||
|
||||
def extract_imports(self, directory: Path) -> Dict[str, List[str]]:
|
||||
def extract_imports(self, directory: Path) -> dict[str, list[str]]:
|
||||
"""
|
||||
Extract import statements from code files.
|
||||
|
||||
@@ -467,27 +439,23 @@ class UnifiedCodebaseAnalyzer:
|
||||
Returns:
|
||||
Dict mapping file extensions to import lists
|
||||
"""
|
||||
imports = {
|
||||
'.py': [],
|
||||
'.js': [],
|
||||
'.ts': []
|
||||
}
|
||||
imports = {".py": [], ".js": [], ".ts": []}
|
||||
|
||||
# Sample up to 10 files per extension
|
||||
for ext in imports.keys():
|
||||
files = list(directory.rglob(f'*{ext}'))[:10]
|
||||
for ext in imports:
|
||||
files = list(directory.rglob(f"*{ext}"))[:10]
|
||||
for file_path in files:
|
||||
try:
|
||||
content = file_path.read_text(encoding='utf-8')
|
||||
if ext == '.py':
|
||||
content = file_path.read_text(encoding="utf-8")
|
||||
if ext == ".py":
|
||||
# Extract Python imports
|
||||
for line in content.split('\n')[:50]: # Check first 50 lines
|
||||
if line.strip().startswith(('import ', 'from ')):
|
||||
for line in content.split("\n")[:50]: # Check first 50 lines
|
||||
if line.strip().startswith(("import ", "from ")):
|
||||
imports[ext].append(line.strip())
|
||||
elif ext in ['.js', '.ts']:
|
||||
elif ext in [".js", ".ts"]:
|
||||
# Extract JS/TS imports
|
||||
for line in content.split('\n')[:50]:
|
||||
if line.strip().startswith(('import ', 'require(')):
|
||||
for line in content.split("\n")[:50]:
|
||||
if line.strip().startswith(("import ", "require(")):
|
||||
imports[ext].append(line.strip())
|
||||
except Exception:
|
||||
continue
|
||||
@@ -495,7 +463,7 @@ class UnifiedCodebaseAnalyzer:
|
||||
# Remove empty lists
|
||||
return {k: v for k, v in imports.items() if v}
|
||||
|
||||
def find_entry_points(self, directory: Path) -> List[str]:
|
||||
def find_entry_points(self, directory: Path) -> list[str]:
|
||||
"""
|
||||
Find potential entry points (main files, setup files, etc.).
|
||||
|
||||
@@ -509,10 +477,20 @@ class UnifiedCodebaseAnalyzer:
|
||||
|
||||
# Common entry point patterns
|
||||
entry_patterns = [
|
||||
'main.py', '__main__.py', 'app.py', 'server.py',
|
||||
'index.js', 'index.ts', 'main.js', 'main.ts',
|
||||
'setup.py', 'pyproject.toml', 'package.json',
|
||||
'Makefile', 'docker-compose.yml', 'Dockerfile'
|
||||
"main.py",
|
||||
"__main__.py",
|
||||
"app.py",
|
||||
"server.py",
|
||||
"index.js",
|
||||
"index.ts",
|
||||
"main.js",
|
||||
"main.ts",
|
||||
"setup.py",
|
||||
"pyproject.toml",
|
||||
"package.json",
|
||||
"Makefile",
|
||||
"docker-compose.yml",
|
||||
"Dockerfile",
|
||||
]
|
||||
|
||||
for pattern in entry_patterns:
|
||||
@@ -525,7 +503,7 @@ class UnifiedCodebaseAnalyzer:
|
||||
|
||||
return entry_points
|
||||
|
||||
def compute_statistics(self, directory: Path) -> Dict:
|
||||
def compute_statistics(self, directory: Path) -> dict:
|
||||
"""
|
||||
Compute basic statistics about the codebase.
|
||||
|
||||
@@ -535,39 +513,34 @@ class UnifiedCodebaseAnalyzer:
|
||||
Returns:
|
||||
Dict with statistics
|
||||
"""
|
||||
stats = {
|
||||
'total_files': 0,
|
||||
'total_size_bytes': 0,
|
||||
'file_types': {},
|
||||
'languages': {}
|
||||
}
|
||||
stats = {"total_files": 0, "total_size_bytes": 0, "file_types": {}, "languages": {}}
|
||||
|
||||
for file_path in directory.rglob('*'):
|
||||
for file_path in directory.rglob("*"):
|
||||
if not file_path.is_file():
|
||||
continue
|
||||
|
||||
try:
|
||||
stats['total_files'] += 1
|
||||
stats['total_size_bytes'] += file_path.stat().st_size
|
||||
stats["total_files"] += 1
|
||||
stats["total_size_bytes"] += file_path.stat().st_size
|
||||
|
||||
ext = file_path.suffix
|
||||
if ext:
|
||||
stats['file_types'][ext] = stats['file_types'].get(ext, 0) + 1
|
||||
stats["file_types"][ext] = stats["file_types"].get(ext, 0) + 1
|
||||
|
||||
# Map extensions to languages
|
||||
language_map = {
|
||||
'.py': 'Python',
|
||||
'.js': 'JavaScript',
|
||||
'.ts': 'TypeScript',
|
||||
'.go': 'Go',
|
||||
'.rs': 'Rust',
|
||||
'.java': 'Java',
|
||||
'.rb': 'Ruby',
|
||||
'.php': 'PHP'
|
||||
".py": "Python",
|
||||
".js": "JavaScript",
|
||||
".ts": "TypeScript",
|
||||
".go": "Go",
|
||||
".rs": "Rust",
|
||||
".java": "Java",
|
||||
".rb": "Ruby",
|
||||
".php": "PHP",
|
||||
}
|
||||
if ext in language_map:
|
||||
lang = language_map[ext]
|
||||
stats['languages'][lang] = stats['languages'].get(lang, 0) + 1
|
||||
stats["languages"][lang] = stats["languages"].get(lang, 0) + 1
|
||||
except Exception:
|
||||
continue
|
||||
|
||||
|
||||
@@ -12,31 +12,28 @@ Usage:
|
||||
skill-seekers unified --config configs/react_unified.json --merge-mode claude-enhanced
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import argparse
|
||||
import json
|
||||
import logging
|
||||
import argparse
|
||||
import subprocess
|
||||
import os
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Any, Optional
|
||||
from typing import Any
|
||||
|
||||
# Import validators and scrapers
|
||||
try:
|
||||
from skill_seekers.cli.config_validator import ConfigValidator, validate_config
|
||||
from skill_seekers.cli.conflict_detector import ConflictDetector
|
||||
from skill_seekers.cli.merge_sources import RuleBasedMerger, ClaudeEnhancedMerger
|
||||
from skill_seekers.cli.merge_sources import ClaudeEnhancedMerger, RuleBasedMerger
|
||||
from skill_seekers.cli.unified_skill_builder import UnifiedSkillBuilder
|
||||
except ImportError as e:
|
||||
print(f"Error importing modules: {e}")
|
||||
print("Make sure you're running from the project root directory")
|
||||
sys.exit(1)
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format='%(asctime)s - %(levelname)s - %(message)s'
|
||||
)
|
||||
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@@ -52,7 +49,7 @@ class UnifiedScraper:
|
||||
5. Build unified skill
|
||||
"""
|
||||
|
||||
def __init__(self, config_path: str, merge_mode: Optional[str] = None):
|
||||
def __init__(self, config_path: str, merge_mode: str | None = None):
|
||||
"""
|
||||
Initialize unified scraper.
|
||||
|
||||
@@ -68,21 +65,21 @@ class UnifiedScraper:
|
||||
self.config = self.validator.config
|
||||
|
||||
# Determine merge mode
|
||||
self.merge_mode = merge_mode or self.config.get('merge_mode', 'rule-based')
|
||||
self.merge_mode = merge_mode or self.config.get("merge_mode", "rule-based")
|
||||
logger.info(f"Merge mode: {self.merge_mode}")
|
||||
|
||||
# Storage for scraped data - use lists to support multiple sources of same type
|
||||
self.scraped_data = {
|
||||
'documentation': [], # List of doc sources
|
||||
'github': [], # List of github sources
|
||||
'pdf': [] # List of pdf sources
|
||||
"documentation": [], # List of doc sources
|
||||
"github": [], # List of github sources
|
||||
"pdf": [], # List of pdf sources
|
||||
}
|
||||
|
||||
# Track source index for unique naming (multi-source support)
|
||||
self._source_counters = {'documentation': 0, 'github': 0, 'pdf': 0}
|
||||
self._source_counters = {"documentation": 0, "github": 0, "pdf": 0}
|
||||
|
||||
# Output paths - cleaner organization
|
||||
self.name = self.config['name']
|
||||
self.name = self.config["name"]
|
||||
self.output_dir = f"output/{self.name}" # Final skill only
|
||||
|
||||
# Use hidden cache directory for intermediate files
|
||||
@@ -107,17 +104,16 @@ class UnifiedScraper:
|
||||
from datetime import datetime
|
||||
|
||||
# Create log filename with timestamp
|
||||
timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
|
||||
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
|
||||
log_file = f"{self.logs_dir}/unified_{timestamp}.log"
|
||||
|
||||
# Add file handler to root logger
|
||||
file_handler = logging.FileHandler(log_file, encoding='utf-8')
|
||||
file_handler = logging.FileHandler(log_file, encoding="utf-8")
|
||||
file_handler.setLevel(logging.DEBUG)
|
||||
|
||||
# Create formatter
|
||||
formatter = logging.Formatter(
|
||||
'%(asctime)s - %(name)s - %(levelname)s - %(message)s',
|
||||
datefmt='%Y-%m-%d %H:%M:%S'
|
||||
"%(asctime)s - %(name)s - %(levelname)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S"
|
||||
)
|
||||
file_handler.setFormatter(formatter)
|
||||
|
||||
@@ -141,18 +137,18 @@ class UnifiedScraper:
|
||||
logger.warning("Config is not unified format, converting...")
|
||||
self.config = self.validator.convert_legacy_to_unified()
|
||||
|
||||
sources = self.config.get('sources', [])
|
||||
sources = self.config.get("sources", [])
|
||||
|
||||
for i, source in enumerate(sources):
|
||||
source_type = source['type']
|
||||
logger.info(f"\n[{i+1}/{len(sources)}] Scraping {source_type} source...")
|
||||
source_type = source["type"]
|
||||
logger.info(f"\n[{i + 1}/{len(sources)}] Scraping {source_type} source...")
|
||||
|
||||
try:
|
||||
if source_type == 'documentation':
|
||||
if source_type == "documentation":
|
||||
self._scrape_documentation(source)
|
||||
elif source_type == 'github':
|
||||
elif source_type == "github":
|
||||
self._scrape_github(source)
|
||||
elif source_type == 'pdf':
|
||||
elif source_type == "pdf":
|
||||
self._scrape_pdf(source)
|
||||
else:
|
||||
logger.warning(f"Unknown source type: {source_type}")
|
||||
@@ -162,40 +158,40 @@ class UnifiedScraper:
|
||||
|
||||
logger.info(f"\n✅ Scraped {len(self.scraped_data)} sources successfully")
|
||||
|
||||
def _scrape_documentation(self, source: Dict[str, Any]):
|
||||
def _scrape_documentation(self, source: dict[str, Any]):
|
||||
"""Scrape documentation website."""
|
||||
# Create temporary config for doc scraper
|
||||
doc_config = {
|
||||
'name': f"{self.name}_docs",
|
||||
'base_url': source['base_url'],
|
||||
'selectors': source.get('selectors', {}),
|
||||
'url_patterns': source.get('url_patterns', {}),
|
||||
'categories': source.get('categories', {}),
|
||||
'rate_limit': source.get('rate_limit', 0.5),
|
||||
'max_pages': source.get('max_pages', 100)
|
||||
"name": f"{self.name}_docs",
|
||||
"base_url": source["base_url"],
|
||||
"selectors": source.get("selectors", {}),
|
||||
"url_patterns": source.get("url_patterns", {}),
|
||||
"categories": source.get("categories", {}),
|
||||
"rate_limit": source.get("rate_limit", 0.5),
|
||||
"max_pages": source.get("max_pages", 100),
|
||||
}
|
||||
|
||||
# Pass through llms.txt settings (so unified configs behave the same as doc_scraper configs)
|
||||
if 'llms_txt_url' in source:
|
||||
doc_config['llms_txt_url'] = source.get('llms_txt_url')
|
||||
if "llms_txt_url" in source:
|
||||
doc_config["llms_txt_url"] = source.get("llms_txt_url")
|
||||
|
||||
if 'skip_llms_txt' in source:
|
||||
doc_config['skip_llms_txt'] = source.get('skip_llms_txt')
|
||||
if "skip_llms_txt" in source:
|
||||
doc_config["skip_llms_txt"] = source.get("skip_llms_txt")
|
||||
|
||||
# Optional: support overriding start URLs
|
||||
if 'start_urls' in source:
|
||||
doc_config['start_urls'] = source.get('start_urls')
|
||||
if "start_urls" in source:
|
||||
doc_config["start_urls"] = source.get("start_urls")
|
||||
|
||||
# Write temporary config
|
||||
temp_config_path = os.path.join(self.data_dir, 'temp_docs_config.json')
|
||||
with open(temp_config_path, 'w', encoding='utf-8') as f:
|
||||
temp_config_path = os.path.join(self.data_dir, "temp_docs_config.json")
|
||||
with open(temp_config_path, "w", encoding="utf-8") as f:
|
||||
json.dump(doc_config, f, indent=2)
|
||||
|
||||
# Run doc_scraper as subprocess
|
||||
logger.info(f"Scraping documentation from {source['base_url']}")
|
||||
|
||||
doc_scraper_path = Path(__file__).parent / "doc_scraper.py"
|
||||
cmd = [sys.executable, str(doc_scraper_path), '--config', temp_config_path, '--fresh']
|
||||
cmd = [sys.executable, str(doc_scraper_path), "--config", temp_config_path, "--fresh"]
|
||||
|
||||
result = subprocess.run(cmd, capture_output=True, text=True, stdin=subprocess.DEVNULL)
|
||||
|
||||
@@ -213,18 +209,20 @@ class UnifiedScraper:
|
||||
docs_data_file = f"output/{doc_config['name']}_data/summary.json"
|
||||
|
||||
if os.path.exists(docs_data_file):
|
||||
with open(docs_data_file, 'r', encoding='utf-8') as f:
|
||||
with open(docs_data_file, encoding="utf-8") as f:
|
||||
summary = json.load(f)
|
||||
|
||||
# Append to documentation list (multi-source support)
|
||||
self.scraped_data['documentation'].append({
|
||||
'source_id': doc_config['name'],
|
||||
'base_url': source['base_url'],
|
||||
'pages': summary.get('pages', []),
|
||||
'total_pages': summary.get('total_pages', 0),
|
||||
'data_file': docs_data_file,
|
||||
'refs_dir': '' # Will be set after moving to cache
|
||||
})
|
||||
self.scraped_data["documentation"].append(
|
||||
{
|
||||
"source_id": doc_config["name"],
|
||||
"base_url": source["base_url"],
|
||||
"pages": summary.get("pages", []),
|
||||
"total_pages": summary.get("total_pages", 0),
|
||||
"data_file": docs_data_file,
|
||||
"refs_dir": "", # Will be set after moving to cache
|
||||
}
|
||||
)
|
||||
|
||||
logger.info(f"✅ Documentation: {summary.get('total_pages', 0)} pages scraped")
|
||||
else:
|
||||
@@ -246,9 +244,9 @@ class UnifiedScraper:
|
||||
logger.info(f"📦 Moved docs output to cache: {cache_docs_dir}")
|
||||
|
||||
# Update refs_dir in scraped_data with cache location
|
||||
refs_dir_path = os.path.join(cache_docs_dir, 'references')
|
||||
if self.scraped_data['documentation']:
|
||||
self.scraped_data['documentation'][-1]['refs_dir'] = refs_dir_path
|
||||
refs_dir_path = os.path.join(cache_docs_dir, "references")
|
||||
if self.scraped_data["documentation"]:
|
||||
self.scraped_data["documentation"][-1]["refs_dir"] = refs_dir_path
|
||||
|
||||
if os.path.exists(docs_data_dir):
|
||||
cache_data_dir = os.path.join(self.data_dir, f"{doc_config['name']}_data")
|
||||
@@ -257,7 +255,7 @@ class UnifiedScraper:
|
||||
shutil.move(docs_data_dir, cache_data_dir)
|
||||
logger.info(f"📦 Moved docs data to cache: {cache_data_dir}")
|
||||
|
||||
def _clone_github_repo(self, repo_name: str, idx: int = 0) -> Optional[str]:
|
||||
def _clone_github_repo(self, repo_name: str, idx: int = 0) -> str | None:
|
||||
"""
|
||||
Clone GitHub repository to cache directory for C3.x analysis.
|
||||
Reuses existing clone if already present.
|
||||
@@ -274,9 +272,9 @@ class UnifiedScraper:
|
||||
clone_path = os.path.join(self.repos_dir, repo_dir_name)
|
||||
|
||||
# Check if already cloned
|
||||
if os.path.exists(clone_path) and os.path.isdir(os.path.join(clone_path, '.git')):
|
||||
if os.path.exists(clone_path) and os.path.isdir(os.path.join(clone_path, ".git")):
|
||||
logger.info(f"♻️ Found existing repository clone: {clone_path}")
|
||||
logger.info(f" Reusing for C3.x analysis (skip re-cloning)")
|
||||
logger.info(" Reusing for C3.x analysis (skip re-cloning)")
|
||||
return clone_path
|
||||
|
||||
# repos_dir already created in __init__
|
||||
@@ -285,18 +283,18 @@ class UnifiedScraper:
|
||||
repo_url = f"https://github.com/{repo_name}.git"
|
||||
logger.info(f"🔄 Cloning repository for C3.x analysis: {repo_url}")
|
||||
logger.info(f" → {clone_path}")
|
||||
logger.info(f" 💾 Clone will be saved for future reuse")
|
||||
logger.info(" 💾 Clone will be saved for future reuse")
|
||||
|
||||
try:
|
||||
result = subprocess.run(
|
||||
['git', 'clone', repo_url, clone_path],
|
||||
["git", "clone", repo_url, clone_path],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=600 # 10 minute timeout for full clone
|
||||
timeout=600, # 10 minute timeout for full clone
|
||||
)
|
||||
|
||||
if result.returncode == 0:
|
||||
logger.info(f"✅ Repository cloned successfully")
|
||||
logger.info("✅ Repository cloned successfully")
|
||||
logger.info(f" 📁 Saved to: {clone_path}")
|
||||
return clone_path
|
||||
else:
|
||||
@@ -307,7 +305,7 @@ class UnifiedScraper:
|
||||
return None
|
||||
|
||||
except subprocess.TimeoutExpired:
|
||||
logger.error(f"❌ Git clone timed out after 10 minutes")
|
||||
logger.error("❌ Git clone timed out after 10 minutes")
|
||||
if os.path.exists(clone_path):
|
||||
shutil.rmtree(clone_path)
|
||||
return None
|
||||
@@ -317,7 +315,7 @@ class UnifiedScraper:
|
||||
shutil.rmtree(clone_path)
|
||||
return None
|
||||
|
||||
def _scrape_github(self, source: Dict[str, Any]):
|
||||
def _scrape_github(self, source: dict[str, Any]):
|
||||
"""Scrape GitHub repository."""
|
||||
try:
|
||||
from skill_seekers.cli.github_scraper import GitHubScraper
|
||||
@@ -326,16 +324,16 @@ class UnifiedScraper:
|
||||
return
|
||||
|
||||
# Multi-source support: Get unique index for this GitHub source
|
||||
idx = self._source_counters['github']
|
||||
self._source_counters['github'] += 1
|
||||
idx = self._source_counters["github"]
|
||||
self._source_counters["github"] += 1
|
||||
|
||||
# Extract repo identifier for unique naming
|
||||
repo = source['repo']
|
||||
repo_id = repo.replace('/', '_')
|
||||
repo = source["repo"]
|
||||
repo_id = repo.replace("/", "_")
|
||||
|
||||
# Check if we need to clone for C3.x analysis
|
||||
enable_codebase_analysis = source.get('enable_codebase_analysis', True)
|
||||
local_repo_path = source.get('local_repo_path')
|
||||
enable_codebase_analysis = source.get("enable_codebase_analysis", True)
|
||||
local_repo_path = source.get("local_repo_path")
|
||||
cloned_repo_path = None
|
||||
|
||||
# Auto-clone if C3.x analysis is enabled but no local path provided
|
||||
@@ -351,24 +349,24 @@ class UnifiedScraper:
|
||||
|
||||
# Create config for GitHub scraper
|
||||
github_config = {
|
||||
'repo': repo,
|
||||
'name': f"{self.name}_github_{idx}_{repo_id}",
|
||||
'github_token': source.get('github_token'),
|
||||
'include_issues': source.get('include_issues', True),
|
||||
'max_issues': source.get('max_issues', 100),
|
||||
'include_changelog': source.get('include_changelog', True),
|
||||
'include_releases': source.get('include_releases', True),
|
||||
'include_code': source.get('include_code', True),
|
||||
'code_analysis_depth': source.get('code_analysis_depth', 'surface'),
|
||||
'file_patterns': source.get('file_patterns', []),
|
||||
'local_repo_path': local_repo_path # Use cloned path if available
|
||||
"repo": repo,
|
||||
"name": f"{self.name}_github_{idx}_{repo_id}",
|
||||
"github_token": source.get("github_token"),
|
||||
"include_issues": source.get("include_issues", True),
|
||||
"max_issues": source.get("max_issues", 100),
|
||||
"include_changelog": source.get("include_changelog", True),
|
||||
"include_releases": source.get("include_releases", True),
|
||||
"include_code": source.get("include_code", True),
|
||||
"code_analysis_depth": source.get("code_analysis_depth", "surface"),
|
||||
"file_patterns": source.get("file_patterns", []),
|
||||
"local_repo_path": local_repo_path, # Use cloned path if available
|
||||
}
|
||||
|
||||
# Pass directory exclusions if specified (optional)
|
||||
if 'exclude_dirs' in source:
|
||||
github_config['exclude_dirs'] = source['exclude_dirs']
|
||||
if 'exclude_dirs_additional' in source:
|
||||
github_config['exclude_dirs_additional'] = source['exclude_dirs_additional']
|
||||
if "exclude_dirs" in source:
|
||||
github_config["exclude_dirs"] = source["exclude_dirs"]
|
||||
if "exclude_dirs_additional" in source:
|
||||
github_config["exclude_dirs_additional"] = source["exclude_dirs_additional"]
|
||||
|
||||
# Scrape
|
||||
logger.info(f"Scraping GitHub repository: {source['repo']}")
|
||||
@@ -381,13 +379,14 @@ class UnifiedScraper:
|
||||
try:
|
||||
c3_data = self._run_c3_analysis(local_repo_path, source)
|
||||
if c3_data:
|
||||
github_data['c3_analysis'] = c3_data
|
||||
github_data["c3_analysis"] = c3_data
|
||||
logger.info("✅ C3.x analysis complete")
|
||||
else:
|
||||
logger.warning("⚠️ C3.x analysis returned no data")
|
||||
except Exception as e:
|
||||
logger.warning(f"⚠️ C3.x analysis failed: {e}")
|
||||
import traceback
|
||||
|
||||
logger.debug(f"Traceback: {traceback.format_exc()}")
|
||||
# Continue without C3.x data - graceful degradation
|
||||
|
||||
@@ -396,32 +395,29 @@ class UnifiedScraper:
|
||||
logger.info(f"📁 Repository clone saved for future use: {cloned_repo_path}")
|
||||
|
||||
# Save data to unified location with unique filename
|
||||
github_data_file = os.path.join(self.data_dir, f'github_data_{idx}_{repo_id}.json')
|
||||
with open(github_data_file, 'w', encoding='utf-8') as f:
|
||||
github_data_file = os.path.join(self.data_dir, f"github_data_{idx}_{repo_id}.json")
|
||||
with open(github_data_file, "w", encoding="utf-8") as f:
|
||||
json.dump(github_data, f, indent=2, ensure_ascii=False)
|
||||
|
||||
# ALSO save to the location GitHubToSkillConverter expects (with C3.x data!)
|
||||
converter_data_file = f"output/{github_config['name']}_github_data.json"
|
||||
with open(converter_data_file, 'w', encoding='utf-8') as f:
|
||||
with open(converter_data_file, "w", encoding="utf-8") as f:
|
||||
json.dump(github_data, f, indent=2, ensure_ascii=False)
|
||||
|
||||
# Append to list instead of overwriting (multi-source support)
|
||||
self.scraped_data['github'].append({
|
||||
'repo': repo,
|
||||
'repo_id': repo_id,
|
||||
'idx': idx,
|
||||
'data': github_data,
|
||||
'data_file': github_data_file
|
||||
})
|
||||
self.scraped_data["github"].append(
|
||||
{"repo": repo, "repo_id": repo_id, "idx": idx, "data": github_data, "data_file": github_data_file}
|
||||
)
|
||||
|
||||
# Build standalone SKILL.md for synthesis using GitHubToSkillConverter
|
||||
try:
|
||||
from skill_seekers.cli.github_scraper import GitHubToSkillConverter
|
||||
|
||||
# Use github_config which has the correct name field
|
||||
# Converter will load from output/{name}_github_data.json which now has C3.x data
|
||||
converter = GitHubToSkillConverter(config=github_config)
|
||||
converter.build_skill()
|
||||
logger.info(f"✅ GitHub: Standalone SKILL.md created")
|
||||
logger.info("✅ GitHub: Standalone SKILL.md created")
|
||||
except Exception as e:
|
||||
logger.warning(f"⚠️ Failed to build standalone GitHub SKILL.md: {e}")
|
||||
|
||||
@@ -430,7 +426,7 @@ class UnifiedScraper:
|
||||
github_data_file_path = f"output/{github_config['name']}_github_data.json"
|
||||
|
||||
if os.path.exists(github_output_dir):
|
||||
cache_github_dir = os.path.join(self.sources_dir, github_config['name'])
|
||||
cache_github_dir = os.path.join(self.sources_dir, github_config["name"])
|
||||
if os.path.exists(cache_github_dir):
|
||||
shutil.rmtree(cache_github_dir)
|
||||
shutil.move(github_output_dir, cache_github_dir)
|
||||
@@ -443,9 +439,9 @@ class UnifiedScraper:
|
||||
shutil.move(github_data_file_path, cache_github_data)
|
||||
logger.info(f"📦 Moved GitHub data to cache: {cache_github_data}")
|
||||
|
||||
logger.info(f"✅ GitHub: Repository scraped successfully")
|
||||
logger.info("✅ GitHub: Repository scraped successfully")
|
||||
|
||||
def _scrape_pdf(self, source: Dict[str, Any]):
|
||||
def _scrape_pdf(self, source: dict[str, Any]):
|
||||
"""Scrape PDF document."""
|
||||
try:
|
||||
from skill_seekers.cli.pdf_scraper import PDFToSkillConverter
|
||||
@@ -454,20 +450,20 @@ class UnifiedScraper:
|
||||
return
|
||||
|
||||
# Multi-source support: Get unique index for this PDF source
|
||||
idx = self._source_counters['pdf']
|
||||
self._source_counters['pdf'] += 1
|
||||
idx = self._source_counters["pdf"]
|
||||
self._source_counters["pdf"] += 1
|
||||
|
||||
# Extract PDF identifier for unique naming (filename without extension)
|
||||
pdf_path = source['path']
|
||||
pdf_path = source["path"]
|
||||
pdf_id = os.path.splitext(os.path.basename(pdf_path))[0]
|
||||
|
||||
# Create config for PDF scraper
|
||||
pdf_config = {
|
||||
'name': f"{self.name}_pdf_{idx}_{pdf_id}",
|
||||
'pdf': source['path'],
|
||||
'extract_tables': source.get('extract_tables', False),
|
||||
'ocr': source.get('ocr', False),
|
||||
'password': source.get('password')
|
||||
"name": f"{self.name}_pdf_{idx}_{pdf_id}",
|
||||
"pdf": source["path"],
|
||||
"extract_tables": source.get("extract_tables", False),
|
||||
"ocr": source.get("ocr", False),
|
||||
"password": source.get("password"),
|
||||
}
|
||||
|
||||
# Scrape
|
||||
@@ -476,29 +472,25 @@ class UnifiedScraper:
|
||||
pdf_data = converter.extract_all()
|
||||
|
||||
# Save data
|
||||
pdf_data_file = os.path.join(self.data_dir, f'pdf_data_{idx}_{pdf_id}.json')
|
||||
with open(pdf_data_file, 'w', encoding='utf-8') as f:
|
||||
pdf_data_file = os.path.join(self.data_dir, f"pdf_data_{idx}_{pdf_id}.json")
|
||||
with open(pdf_data_file, "w", encoding="utf-8") as f:
|
||||
json.dump(pdf_data, f, indent=2, ensure_ascii=False)
|
||||
|
||||
# Append to list instead of overwriting
|
||||
self.scraped_data['pdf'].append({
|
||||
'pdf_path': pdf_path,
|
||||
'pdf_id': pdf_id,
|
||||
'idx': idx,
|
||||
'data': pdf_data,
|
||||
'data_file': pdf_data_file
|
||||
})
|
||||
self.scraped_data["pdf"].append(
|
||||
{"pdf_path": pdf_path, "pdf_id": pdf_id, "idx": idx, "data": pdf_data, "data_file": pdf_data_file}
|
||||
)
|
||||
|
||||
# Build standalone SKILL.md for synthesis
|
||||
try:
|
||||
converter.build_skill()
|
||||
logger.info(f"✅ PDF: Standalone SKILL.md created")
|
||||
logger.info("✅ PDF: Standalone SKILL.md created")
|
||||
except Exception as e:
|
||||
logger.warning(f"⚠️ Failed to build standalone PDF SKILL.md: {e}")
|
||||
|
||||
logger.info(f"✅ PDF: {len(pdf_data.get('pages', []))} pages extracted")
|
||||
|
||||
def _load_json(self, file_path: Path) -> Dict:
|
||||
def _load_json(self, file_path: Path) -> dict:
|
||||
"""
|
||||
Load JSON file safely.
|
||||
|
||||
@@ -513,13 +505,13 @@ class UnifiedScraper:
|
||||
return {}
|
||||
|
||||
try:
|
||||
with open(file_path, 'r', encoding='utf-8') as f:
|
||||
with open(file_path, encoding="utf-8") as f:
|
||||
return json.load(f)
|
||||
except (json.JSONDecodeError, IOError) as e:
|
||||
except (OSError, json.JSONDecodeError) as e:
|
||||
logger.warning(f"Failed to load JSON {file_path}: {e}")
|
||||
return {}
|
||||
|
||||
def _load_guide_collection(self, tutorials_dir: Path) -> Dict:
|
||||
def _load_guide_collection(self, tutorials_dir: Path) -> dict:
|
||||
"""
|
||||
Load how-to guide collection from tutorials directory.
|
||||
|
||||
@@ -531,22 +523,22 @@ class UnifiedScraper:
|
||||
"""
|
||||
if not tutorials_dir.exists():
|
||||
logger.warning(f"Tutorials directory not found: {tutorials_dir}")
|
||||
return {'guides': []}
|
||||
return {"guides": []}
|
||||
|
||||
collection_file = tutorials_dir / 'guide_collection.json'
|
||||
collection_file = tutorials_dir / "guide_collection.json"
|
||||
if collection_file.exists():
|
||||
return self._load_json(collection_file)
|
||||
|
||||
# Fallback: scan for individual guide JSON files
|
||||
guides = []
|
||||
for guide_file in tutorials_dir.glob('guide_*.json'):
|
||||
for guide_file in tutorials_dir.glob("guide_*.json"):
|
||||
guide_data = self._load_json(guide_file)
|
||||
if guide_data:
|
||||
guides.append(guide_data)
|
||||
|
||||
return {'guides': guides, 'total_count': len(guides)}
|
||||
return {"guides": guides, "total_count": len(guides)}
|
||||
|
||||
def _load_api_reference(self, api_dir: Path) -> Dict[str, Any]:
|
||||
def _load_api_reference(self, api_dir: Path) -> dict[str, Any]:
|
||||
"""
|
||||
Load API reference markdown files from api_reference directory.
|
||||
|
||||
@@ -561,16 +553,16 @@ class UnifiedScraper:
|
||||
return {}
|
||||
|
||||
api_refs = {}
|
||||
for md_file in api_dir.glob('*.md'):
|
||||
for md_file in api_dir.glob("*.md"):
|
||||
try:
|
||||
module_name = md_file.stem
|
||||
api_refs[module_name] = md_file.read_text(encoding='utf-8')
|
||||
except IOError as e:
|
||||
api_refs[module_name] = md_file.read_text(encoding="utf-8")
|
||||
except OSError as e:
|
||||
logger.warning(f"Failed to read API reference {md_file}: {e}")
|
||||
|
||||
return api_refs
|
||||
|
||||
def _run_c3_analysis(self, local_repo_path: str, source: Dict[str, Any]) -> Dict[str, Any]:
|
||||
def _run_c3_analysis(self, local_repo_path: str, source: dict[str, Any]) -> dict[str, Any]:
|
||||
"""
|
||||
Run comprehensive C3.x codebase analysis.
|
||||
|
||||
@@ -592,7 +584,7 @@ class UnifiedScraper:
|
||||
return {}
|
||||
|
||||
# Create temp output dir for C3.x analysis
|
||||
temp_output = Path(self.data_dir) / 'c3_analysis_temp'
|
||||
temp_output = Path(self.data_dir) / "c3_analysis_temp"
|
||||
temp_output.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
logger.info(f" Analyzing codebase: {local_repo_path}")
|
||||
@@ -602,37 +594,37 @@ class UnifiedScraper:
|
||||
results = analyze_codebase(
|
||||
directory=Path(local_repo_path),
|
||||
output_dir=temp_output,
|
||||
depth='deep',
|
||||
depth="deep",
|
||||
languages=None, # Analyze all languages
|
||||
file_patterns=source.get('file_patterns'),
|
||||
build_api_reference=True, # C2.5: API Reference
|
||||
extract_comments=False, # Not needed
|
||||
file_patterns=source.get("file_patterns"),
|
||||
build_api_reference=True, # C2.5: API Reference
|
||||
extract_comments=False, # Not needed
|
||||
build_dependency_graph=True, # C2.6: Dependency Graph
|
||||
detect_patterns=True, # C3.1: Design patterns
|
||||
extract_test_examples=True, # C3.2: Test examples
|
||||
build_how_to_guides=True, # C3.3: How-to guides
|
||||
detect_patterns=True, # C3.1: Design patterns
|
||||
extract_test_examples=True, # C3.2: Test examples
|
||||
build_how_to_guides=True, # C3.3: How-to guides
|
||||
extract_config_patterns=True, # C3.4: Config patterns
|
||||
enhance_with_ai=source.get('ai_mode', 'auto') != 'none',
|
||||
ai_mode=source.get('ai_mode', 'auto')
|
||||
enhance_with_ai=source.get("ai_mode", "auto") != "none",
|
||||
ai_mode=source.get("ai_mode", "auto"),
|
||||
)
|
||||
|
||||
# Load C3.x outputs into memory
|
||||
c3_data = {
|
||||
'patterns': self._load_json(temp_output / 'patterns' / 'detected_patterns.json'),
|
||||
'test_examples': self._load_json(temp_output / 'test_examples' / 'test_examples.json'),
|
||||
'how_to_guides': self._load_guide_collection(temp_output / 'tutorials'),
|
||||
'config_patterns': self._load_json(temp_output / 'config_patterns' / 'config_patterns.json'),
|
||||
'architecture': self._load_json(temp_output / 'architecture' / 'architectural_patterns.json'),
|
||||
'api_reference': self._load_api_reference(temp_output / 'api_reference'), # C2.5
|
||||
'dependency_graph': self._load_json(temp_output / 'dependencies' / 'dependency_graph.json') # C2.6
|
||||
"patterns": self._load_json(temp_output / "patterns" / "detected_patterns.json"),
|
||||
"test_examples": self._load_json(temp_output / "test_examples" / "test_examples.json"),
|
||||
"how_to_guides": self._load_guide_collection(temp_output / "tutorials"),
|
||||
"config_patterns": self._load_json(temp_output / "config_patterns" / "config_patterns.json"),
|
||||
"architecture": self._load_json(temp_output / "architecture" / "architectural_patterns.json"),
|
||||
"api_reference": self._load_api_reference(temp_output / "api_reference"), # C2.5
|
||||
"dependency_graph": self._load_json(temp_output / "dependencies" / "dependency_graph.json"), # C2.6
|
||||
}
|
||||
|
||||
# Log summary
|
||||
total_patterns = sum(len(f.get('patterns', [])) for f in c3_data.get('patterns', []))
|
||||
total_examples = c3_data.get('test_examples', {}).get('total_examples', 0)
|
||||
total_guides = len(c3_data.get('how_to_guides', {}).get('guides', []))
|
||||
total_configs = len(c3_data.get('config_patterns', {}).get('config_files', []))
|
||||
arch_patterns = len(c3_data.get('architecture', {}).get('patterns', []))
|
||||
total_patterns = sum(len(f.get("patterns", [])) for f in c3_data.get("patterns", []))
|
||||
total_examples = c3_data.get("test_examples", {}).get("total_examples", 0)
|
||||
total_guides = len(c3_data.get("how_to_guides", {}).get("guides", []))
|
||||
total_configs = len(c3_data.get("config_patterns", {}).get("config_files", []))
|
||||
arch_patterns = len(c3_data.get("architecture", {}).get("patterns", []))
|
||||
|
||||
logger.info(f" ✓ Design Patterns: {total_patterns}")
|
||||
logger.info(f" ✓ Test Examples: {total_examples}")
|
||||
@@ -645,6 +637,7 @@ class UnifiedScraper:
|
||||
except Exception as e:
|
||||
logger.error(f"C3.x analysis failed: {e}")
|
||||
import traceback
|
||||
|
||||
traceback.print_exc()
|
||||
return {}
|
||||
|
||||
@@ -656,7 +649,7 @@ class UnifiedScraper:
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to clean up temp directory: {e}")
|
||||
|
||||
def detect_conflicts(self) -> List:
|
||||
def detect_conflicts(self) -> list:
|
||||
"""
|
||||
Detect conflicts between documentation and code.
|
||||
|
||||
@@ -674,18 +667,18 @@ class UnifiedScraper:
|
||||
return []
|
||||
|
||||
# Get documentation and GitHub data
|
||||
docs_data = self.scraped_data.get('documentation', {})
|
||||
github_data = self.scraped_data.get('github', {})
|
||||
docs_data = self.scraped_data.get("documentation", {})
|
||||
github_data = self.scraped_data.get("github", {})
|
||||
|
||||
if not docs_data or not github_data:
|
||||
logger.warning("Missing documentation or GitHub data for conflict detection")
|
||||
return []
|
||||
|
||||
# Load data files
|
||||
with open(docs_data['data_file'], 'r', encoding='utf-8') as f:
|
||||
with open(docs_data["data_file"], encoding="utf-8") as f:
|
||||
docs_json = json.load(f)
|
||||
|
||||
with open(github_data['data_file'], 'r', encoding='utf-8') as f:
|
||||
with open(github_data["data_file"], encoding="utf-8") as f:
|
||||
github_json = json.load(f)
|
||||
|
||||
# Detect conflicts
|
||||
@@ -693,26 +686,26 @@ class UnifiedScraper:
|
||||
conflicts = detector.detect_all_conflicts()
|
||||
|
||||
# Save conflicts
|
||||
conflicts_file = os.path.join(self.data_dir, 'conflicts.json')
|
||||
conflicts_file = os.path.join(self.data_dir, "conflicts.json")
|
||||
detector.save_conflicts(conflicts, conflicts_file)
|
||||
|
||||
# Print summary
|
||||
summary = detector.generate_summary(conflicts)
|
||||
logger.info(f"\n📊 Conflict Summary:")
|
||||
logger.info("\n📊 Conflict Summary:")
|
||||
logger.info(f" Total: {summary['total']}")
|
||||
logger.info(f" By Type:")
|
||||
for ctype, count in summary['by_type'].items():
|
||||
logger.info(" By Type:")
|
||||
for ctype, count in summary["by_type"].items():
|
||||
if count > 0:
|
||||
logger.info(f" - {ctype}: {count}")
|
||||
logger.info(f" By Severity:")
|
||||
for severity, count in summary['by_severity'].items():
|
||||
logger.info(" By Severity:")
|
||||
for severity, count in summary["by_severity"].items():
|
||||
if count > 0:
|
||||
emoji = '🔴' if severity == 'high' else '🟡' if severity == 'medium' else '🟢'
|
||||
emoji = "🔴" if severity == "high" else "🟡" if severity == "medium" else "🟢"
|
||||
logger.info(f" {emoji} {severity}: {count}")
|
||||
|
||||
return conflicts
|
||||
|
||||
def merge_sources(self, conflicts: List):
|
||||
def merge_sources(self, conflicts: list):
|
||||
"""
|
||||
Merge data from multiple sources.
|
||||
|
||||
@@ -728,18 +721,18 @@ class UnifiedScraper:
|
||||
return None
|
||||
|
||||
# Get data files
|
||||
docs_data = self.scraped_data.get('documentation', {})
|
||||
github_data = self.scraped_data.get('github', {})
|
||||
docs_data = self.scraped_data.get("documentation", {})
|
||||
github_data = self.scraped_data.get("github", {})
|
||||
|
||||
# Load data
|
||||
with open(docs_data['data_file'], 'r', encoding='utf-8') as f:
|
||||
with open(docs_data["data_file"], encoding="utf-8") as f:
|
||||
docs_json = json.load(f)
|
||||
|
||||
with open(github_data['data_file'], 'r', encoding='utf-8') as f:
|
||||
with open(github_data["data_file"], encoding="utf-8") as f:
|
||||
github_json = json.load(f)
|
||||
|
||||
# Choose merger
|
||||
if self.merge_mode == 'claude-enhanced':
|
||||
if self.merge_mode == "claude-enhanced":
|
||||
merger = ClaudeEnhancedMerger(docs_json, github_json, conflicts)
|
||||
else:
|
||||
merger = RuleBasedMerger(docs_json, github_json, conflicts)
|
||||
@@ -748,15 +741,15 @@ class UnifiedScraper:
|
||||
merged_data = merger.merge_all()
|
||||
|
||||
# Save merged data
|
||||
merged_file = os.path.join(self.data_dir, 'merged_data.json')
|
||||
with open(merged_file, 'w', encoding='utf-8') as f:
|
||||
merged_file = os.path.join(self.data_dir, "merged_data.json")
|
||||
with open(merged_file, "w", encoding="utf-8") as f:
|
||||
json.dump(merged_data, f, indent=2, ensure_ascii=False)
|
||||
|
||||
logger.info(f"✅ Merged data saved: {merged_file}")
|
||||
|
||||
return merged_data
|
||||
|
||||
def build_skill(self, merged_data: Optional[Dict] = None):
|
||||
def build_skill(self, merged_data: dict | None = None):
|
||||
"""
|
||||
Build final unified skill.
|
||||
|
||||
@@ -769,20 +762,14 @@ class UnifiedScraper:
|
||||
|
||||
# Load conflicts if they exist
|
||||
conflicts = []
|
||||
conflicts_file = os.path.join(self.data_dir, 'conflicts.json')
|
||||
conflicts_file = os.path.join(self.data_dir, "conflicts.json")
|
||||
if os.path.exists(conflicts_file):
|
||||
with open(conflicts_file, 'r', encoding='utf-8') as f:
|
||||
with open(conflicts_file, encoding="utf-8") as f:
|
||||
conflicts_data = json.load(f)
|
||||
conflicts = conflicts_data.get('conflicts', [])
|
||||
conflicts = conflicts_data.get("conflicts", [])
|
||||
|
||||
# Build skill
|
||||
builder = UnifiedSkillBuilder(
|
||||
self.config,
|
||||
self.scraped_data,
|
||||
merged_data,
|
||||
conflicts,
|
||||
cache_dir=self.cache_dir
|
||||
)
|
||||
builder = UnifiedSkillBuilder(self.config, self.scraped_data, merged_data, conflicts, cache_dir=self.cache_dir)
|
||||
|
||||
builder.build()
|
||||
|
||||
@@ -824,6 +811,7 @@ class UnifiedScraper:
|
||||
except Exception as e:
|
||||
logger.error(f"\n\n❌ Error during scraping: {e}")
|
||||
import traceback
|
||||
|
||||
traceback.print_exc()
|
||||
sys.exit(1)
|
||||
|
||||
@@ -831,7 +819,7 @@ class UnifiedScraper:
|
||||
def main():
|
||||
"""Main entry point."""
|
||||
parser = argparse.ArgumentParser(
|
||||
description='Unified multi-source scraper',
|
||||
description="Unified multi-source scraper",
|
||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||
epilog="""
|
||||
Examples:
|
||||
@@ -843,17 +831,18 @@ Examples:
|
||||
|
||||
# Backward compatible with legacy configs
|
||||
skill-seekers unified --config configs/react.json
|
||||
"""
|
||||
""",
|
||||
)
|
||||
|
||||
parser.add_argument('--config', '-c', required=True,
|
||||
help='Path to unified config JSON file')
|
||||
parser.add_argument('--merge-mode', '-m',
|
||||
choices=['rule-based', 'claude-enhanced'],
|
||||
help='Override config merge mode')
|
||||
parser.add_argument('--skip-codebase-analysis',
|
||||
action='store_true',
|
||||
help='Skip C3.x codebase analysis for GitHub sources (default: enabled)')
|
||||
parser.add_argument("--config", "-c", required=True, help="Path to unified config JSON file")
|
||||
parser.add_argument(
|
||||
"--merge-mode", "-m", choices=["rule-based", "claude-enhanced"], help="Override config merge mode"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--skip-codebase-analysis",
|
||||
action="store_true",
|
||||
help="Skip C3.x codebase analysis for GitHub sources (default: enabled)",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
@@ -862,14 +851,14 @@ Examples:
|
||||
|
||||
# Disable codebase analysis if requested
|
||||
if args.skip_codebase_analysis:
|
||||
for source in scraper.config.get('sources', []):
|
||||
if source['type'] == 'github':
|
||||
source['enable_codebase_analysis'] = False
|
||||
for source in scraper.config.get("sources", []):
|
||||
if source["type"] == "github":
|
||||
source["enable_codebase_analysis"] = False
|
||||
logger.info(f"⏭️ Skipping codebase analysis for GitHub source: {source.get('repo', 'unknown')}")
|
||||
|
||||
# Run scraper
|
||||
scraper.run()
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -17,27 +17,20 @@ Usage:
|
||||
skill-seekers upload output/react-openai.zip --target openai
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
|
||||
# Import utilities
|
||||
try:
|
||||
from utils import (
|
||||
print_upload_instructions,
|
||||
validate_zip_file
|
||||
)
|
||||
from utils import print_upload_instructions, validate_zip_file
|
||||
except ImportError:
|
||||
sys.path.insert(0, str(Path(__file__).parent))
|
||||
from utils import (
|
||||
print_upload_instructions,
|
||||
validate_zip_file
|
||||
)
|
||||
from utils import print_upload_instructions
|
||||
|
||||
|
||||
def upload_skill_api(package_path, target='claude', api_key=None):
|
||||
def upload_skill_api(package_path, target="claude", api_key=None):
|
||||
"""
|
||||
Upload skill package to LLM platform
|
||||
|
||||
@@ -62,7 +55,7 @@ def upload_skill_api(package_path, target='claude', api_key=None):
|
||||
|
||||
# Get API key
|
||||
if not api_key:
|
||||
api_key = os.environ.get(adaptor.get_env_var_name(), '').strip()
|
||||
api_key = os.environ.get(adaptor.get_env_var_name(), "").strip()
|
||||
|
||||
if not api_key:
|
||||
return False, f"{adaptor.get_env_var_name()} not set. Export your API key first."
|
||||
@@ -91,19 +84,19 @@ def upload_skill_api(package_path, target='claude', api_key=None):
|
||||
try:
|
||||
result = adaptor.upload(package_path, api_key)
|
||||
|
||||
if result['success']:
|
||||
if result["success"]:
|
||||
print()
|
||||
print(f"✅ {result['message']}")
|
||||
print()
|
||||
if result['url']:
|
||||
if result["url"]:
|
||||
print("Your skill is now available at:")
|
||||
print(f" {result['url']}")
|
||||
if result['skill_id']:
|
||||
if result["skill_id"]:
|
||||
print(f" Skill ID: {result['skill_id']}")
|
||||
print()
|
||||
return True, "Upload successful"
|
||||
else:
|
||||
return False, result['message']
|
||||
return False, result["message"]
|
||||
|
||||
except Exception as e:
|
||||
return False, f"Unexpected error: {str(e)}"
|
||||
@@ -136,25 +129,19 @@ Examples:
|
||||
|
||||
# Upload with explicit API key
|
||||
skill-seekers upload output/react.zip --api-key sk-ant-...
|
||||
"""
|
||||
""",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'package_file',
|
||||
help='Path to skill package file (e.g., output/react.zip)'
|
||||
)
|
||||
parser.add_argument("package_file", help="Path to skill package file (e.g., output/react.zip)")
|
||||
|
||||
parser.add_argument(
|
||||
'--target',
|
||||
choices=['claude', 'gemini', 'openai'],
|
||||
default='claude',
|
||||
help='Target LLM platform (default: claude)'
|
||||
"--target",
|
||||
choices=["claude", "gemini", "openai"],
|
||||
default="claude",
|
||||
help="Target LLM platform (default: claude)",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'--api-key',
|
||||
help='Platform API key (or set environment variable)'
|
||||
)
|
||||
parser.add_argument("--api-key", help="Platform API key (or set environment variable)")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
|
||||
@@ -3,21 +3,21 @@
|
||||
Utility functions for Skill Seeker CLI tools
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import subprocess
|
||||
import platform
|
||||
import time
|
||||
import logging
|
||||
import os
|
||||
import platform
|
||||
import subprocess
|
||||
import time
|
||||
from collections.abc import Callable
|
||||
from pathlib import Path
|
||||
from typing import Optional, Tuple, Dict, Union, TypeVar, Callable
|
||||
from typing import TypeVar
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
T = TypeVar('T')
|
||||
T = TypeVar("T")
|
||||
|
||||
|
||||
def open_folder(folder_path: Union[str, Path]) -> bool:
|
||||
def open_folder(folder_path: str | Path) -> bool:
|
||||
"""
|
||||
Open a folder in the system file browser
|
||||
|
||||
@@ -50,10 +50,10 @@ def open_folder(folder_path: Union[str, Path]) -> bool:
|
||||
return True
|
||||
|
||||
except subprocess.CalledProcessError:
|
||||
print(f"⚠️ Could not open folder automatically")
|
||||
print("⚠️ Could not open folder automatically")
|
||||
return False
|
||||
except FileNotFoundError:
|
||||
print(f"⚠️ File browser not found on system")
|
||||
print("⚠️ File browser not found on system")
|
||||
return False
|
||||
|
||||
|
||||
@@ -64,18 +64,18 @@ def has_api_key() -> bool:
|
||||
Returns:
|
||||
bool: True if API key is set, False otherwise
|
||||
"""
|
||||
api_key = os.environ.get('ANTHROPIC_API_KEY', '').strip()
|
||||
api_key = os.environ.get("ANTHROPIC_API_KEY", "").strip()
|
||||
return len(api_key) > 0
|
||||
|
||||
|
||||
def get_api_key() -> Optional[str]:
|
||||
def get_api_key() -> str | None:
|
||||
"""
|
||||
Get ANTHROPIC_API_KEY from environment
|
||||
|
||||
Returns:
|
||||
str: API key or None if not set
|
||||
"""
|
||||
api_key = os.environ.get('ANTHROPIC_API_KEY', '').strip()
|
||||
api_key = os.environ.get("ANTHROPIC_API_KEY", "").strip()
|
||||
return api_key if api_key else None
|
||||
|
||||
|
||||
@@ -89,7 +89,7 @@ def get_upload_url() -> str:
|
||||
return "https://claude.ai/skills"
|
||||
|
||||
|
||||
def print_upload_instructions(zip_path: Union[str, Path]) -> None:
|
||||
def print_upload_instructions(zip_path: str | Path) -> None:
|
||||
"""
|
||||
Print clear upload instructions for manual upload
|
||||
|
||||
@@ -106,7 +106,7 @@ def print_upload_instructions(zip_path: Union[str, Path]) -> None:
|
||||
print(f"📤 Upload to Claude: {get_upload_url()}")
|
||||
print()
|
||||
print(f"1. Go to {get_upload_url()}")
|
||||
print("2. Click \"Upload Skill\"")
|
||||
print('2. Click "Upload Skill"')
|
||||
print(f"3. Select: {zip_path}")
|
||||
print("4. Done! ✅")
|
||||
print()
|
||||
@@ -130,7 +130,7 @@ def format_file_size(size_bytes: int) -> str:
|
||||
return f"{size_bytes / (1024 * 1024):.1f} MB"
|
||||
|
||||
|
||||
def validate_skill_directory(skill_dir: Union[str, Path]) -> Tuple[bool, Optional[str]]:
|
||||
def validate_skill_directory(skill_dir: str | Path) -> tuple[bool, str | None]:
|
||||
"""
|
||||
Validate that a directory is a valid skill directory
|
||||
|
||||
@@ -155,7 +155,7 @@ def validate_skill_directory(skill_dir: Union[str, Path]) -> Tuple[bool, Optiona
|
||||
return True, None
|
||||
|
||||
|
||||
def validate_zip_file(zip_path: Union[str, Path]) -> Tuple[bool, Optional[str]]:
|
||||
def validate_zip_file(zip_path: str | Path) -> tuple[bool, str | None]:
|
||||
"""
|
||||
Validate that a file is a valid skill .zip file
|
||||
|
||||
@@ -173,13 +173,13 @@ def validate_zip_file(zip_path: Union[str, Path]) -> Tuple[bool, Optional[str]]:
|
||||
if not zip_path.is_file():
|
||||
return False, f"Not a file: {zip_path}"
|
||||
|
||||
if not zip_path.suffix == '.zip':
|
||||
if not zip_path.suffix == ".zip":
|
||||
return False, f"Not a .zip file: {zip_path}"
|
||||
|
||||
return True, None
|
||||
|
||||
|
||||
def read_reference_files(skill_dir: Union[str, Path], max_chars: int = 100000, preview_limit: int = 40000) -> Dict[str, Dict]:
|
||||
def read_reference_files(skill_dir: str | Path, max_chars: int = 100000, preview_limit: int = 40000) -> dict[str, dict]:
|
||||
"""Read reference files from a skill directory with enriched metadata.
|
||||
|
||||
This function reads markdown files from the references/ subdirectory
|
||||
@@ -210,13 +210,13 @@ def read_reference_files(skill_dir: Union[str, Path], max_chars: int = 100000, p
|
||||
|
||||
skill_path = Path(skill_dir)
|
||||
references_dir = skill_path / "references"
|
||||
references: Dict[str, Dict] = {}
|
||||
references: dict[str, dict] = {}
|
||||
|
||||
if not references_dir.exists():
|
||||
print(f"⚠ No references directory found at {references_dir}")
|
||||
return references
|
||||
|
||||
def _determine_source_metadata(relative_path: Path) -> Tuple[str, str, Optional[str]]:
|
||||
def _determine_source_metadata(relative_path: Path) -> tuple[str, str, str | None]:
|
||||
"""Determine source type, confidence level, and repo_id from path.
|
||||
|
||||
For multi-source support, extracts repo_id from paths like:
|
||||
@@ -230,54 +230,54 @@ def read_reference_files(skill_dir: Union[str, Path], max_chars: int = 100000, p
|
||||
repo_id = None # Default: no repo identity
|
||||
|
||||
# Documentation sources (official docs)
|
||||
if path_str.startswith('documentation/'):
|
||||
return 'documentation', 'high', None
|
||||
if path_str.startswith("documentation/"):
|
||||
return "documentation", "high", None
|
||||
|
||||
# GitHub sources
|
||||
elif path_str.startswith('github/'):
|
||||
elif path_str.startswith("github/"):
|
||||
# README and releases are medium confidence
|
||||
if 'README' in path_str or 'releases' in path_str:
|
||||
return 'github', 'medium', None
|
||||
if "README" in path_str or "releases" in path_str:
|
||||
return "github", "medium", None
|
||||
# Issues are low confidence (user reports)
|
||||
elif 'issues' in path_str:
|
||||
return 'github', 'low', None
|
||||
elif "issues" in path_str:
|
||||
return "github", "low", None
|
||||
else:
|
||||
return 'github', 'medium', None
|
||||
return "github", "medium", None
|
||||
|
||||
# PDF sources (books, manuals)
|
||||
elif path_str.startswith('pdf/'):
|
||||
return 'pdf', 'high', None
|
||||
elif path_str.startswith("pdf/"):
|
||||
return "pdf", "high", None
|
||||
|
||||
# Merged API (synthesized from multiple sources)
|
||||
elif path_str.startswith('api/'):
|
||||
return 'api', 'high', None
|
||||
elif path_str.startswith("api/"):
|
||||
return "api", "high", None
|
||||
|
||||
# Codebase analysis (C3.x automated analysis)
|
||||
elif path_str.startswith('codebase_analysis/'):
|
||||
elif path_str.startswith("codebase_analysis/"):
|
||||
# Extract repo_id from path: codebase_analysis/{repo_id}/...
|
||||
parts = Path(path_str).parts
|
||||
if len(parts) >= 2:
|
||||
repo_id = parts[1] # e.g., 'encode_httpx', 'encode_httpcore'
|
||||
|
||||
# ARCHITECTURE.md is high confidence (comprehensive)
|
||||
if 'ARCHITECTURE' in path_str:
|
||||
return 'codebase_analysis', 'high', repo_id
|
||||
if "ARCHITECTURE" in path_str:
|
||||
return "codebase_analysis", "high", repo_id
|
||||
# Patterns and examples are medium (heuristic-based)
|
||||
elif 'patterns' in path_str or 'examples' in path_str:
|
||||
return 'codebase_analysis', 'medium', repo_id
|
||||
elif "patterns" in path_str or "examples" in path_str:
|
||||
return "codebase_analysis", "medium", repo_id
|
||||
# Configuration is high (direct extraction)
|
||||
elif 'configuration' in path_str:
|
||||
return 'codebase_analysis', 'high', repo_id
|
||||
elif "configuration" in path_str:
|
||||
return "codebase_analysis", "high", repo_id
|
||||
else:
|
||||
return 'codebase_analysis', 'medium', repo_id
|
||||
return "codebase_analysis", "medium", repo_id
|
||||
|
||||
# Conflicts report (discrepancy detection)
|
||||
elif 'conflicts' in path_str:
|
||||
return 'conflicts', 'medium', None
|
||||
elif "conflicts" in path_str:
|
||||
return "conflicts", "medium", None
|
||||
|
||||
# Fallback
|
||||
else:
|
||||
return 'unknown', 'medium', None
|
||||
return "unknown", "medium", None
|
||||
|
||||
total_chars = 0
|
||||
# Search recursively for all .md files (including subdirectories like github/README.md)
|
||||
@@ -285,7 +285,7 @@ def read_reference_files(skill_dir: Union[str, Path], max_chars: int = 100000, p
|
||||
# Note: We now include index.md files as they contain important content
|
||||
# (patterns, examples, configuration analysis)
|
||||
|
||||
content = ref_file.read_text(encoding='utf-8')
|
||||
content = ref_file.read_text(encoding="utf-8")
|
||||
|
||||
# Limit size per file
|
||||
truncated = False
|
||||
@@ -299,13 +299,13 @@ def read_reference_files(skill_dir: Union[str, Path], max_chars: int = 100000, p
|
||||
|
||||
# Build enriched metadata (with repo_id for multi-source support)
|
||||
references[str(relative_path)] = {
|
||||
'content': content,
|
||||
'source': source_type,
|
||||
'confidence': confidence,
|
||||
'path': str(relative_path),
|
||||
'truncated': truncated,
|
||||
'size': len(content),
|
||||
'repo_id': repo_id # None for single-source, repo identifier for multi-source
|
||||
"content": content,
|
||||
"source": source_type,
|
||||
"confidence": confidence,
|
||||
"path": str(relative_path),
|
||||
"truncated": truncated,
|
||||
"size": len(content),
|
||||
"repo_id": repo_id, # None for single-source, repo identifier for multi-source
|
||||
}
|
||||
|
||||
total_chars += len(content)
|
||||
@@ -319,10 +319,7 @@ def read_reference_files(skill_dir: Union[str, Path], max_chars: int = 100000, p
|
||||
|
||||
|
||||
def retry_with_backoff(
|
||||
operation: Callable[[], T],
|
||||
max_attempts: int = 3,
|
||||
base_delay: float = 1.0,
|
||||
operation_name: str = "operation"
|
||||
operation: Callable[[], T], max_attempts: int = 3, base_delay: float = 1.0, operation_name: str = "operation"
|
||||
) -> T:
|
||||
"""Retry an operation with exponential backoff.
|
||||
|
||||
@@ -348,7 +345,7 @@ def retry_with_backoff(
|
||||
... return response.text
|
||||
>>> content = retry_with_backoff(fetch_page, max_attempts=3, operation_name=f"fetch {url}")
|
||||
"""
|
||||
last_exception: Optional[Exception] = None
|
||||
last_exception: Exception | None = None
|
||||
|
||||
for attempt in range(1, max_attempts + 1):
|
||||
try:
|
||||
@@ -358,15 +355,11 @@ def retry_with_backoff(
|
||||
if attempt < max_attempts:
|
||||
delay = base_delay * (2 ** (attempt - 1))
|
||||
logger.warning(
|
||||
"%s failed (attempt %d/%d), retrying in %.1fs: %s",
|
||||
operation_name, attempt, max_attempts, delay, e
|
||||
"%s failed (attempt %d/%d), retrying in %.1fs: %s", operation_name, attempt, max_attempts, delay, e
|
||||
)
|
||||
time.sleep(delay)
|
||||
else:
|
||||
logger.error(
|
||||
"%s failed after %d attempts: %s",
|
||||
operation_name, max_attempts, e
|
||||
)
|
||||
logger.error("%s failed after %d attempts: %s", operation_name, max_attempts, e)
|
||||
|
||||
# This should always have a value, but mypy doesn't know that
|
||||
if last_exception is not None:
|
||||
@@ -375,10 +368,7 @@ def retry_with_backoff(
|
||||
|
||||
|
||||
async def retry_with_backoff_async(
|
||||
operation: Callable[[], T],
|
||||
max_attempts: int = 3,
|
||||
base_delay: float = 1.0,
|
||||
operation_name: str = "operation"
|
||||
operation: Callable[[], T], max_attempts: int = 3, base_delay: float = 1.0, operation_name: str = "operation"
|
||||
) -> T:
|
||||
"""Async version of retry_with_backoff for async operations.
|
||||
|
||||
@@ -403,7 +393,7 @@ async def retry_with_backoff_async(
|
||||
"""
|
||||
import asyncio
|
||||
|
||||
last_exception: Optional[Exception] = None
|
||||
last_exception: Exception | None = None
|
||||
|
||||
for attempt in range(1, max_attempts + 1):
|
||||
try:
|
||||
@@ -413,15 +403,11 @@ async def retry_with_backoff_async(
|
||||
if attempt < max_attempts:
|
||||
delay = base_delay * (2 ** (attempt - 1))
|
||||
logger.warning(
|
||||
"%s failed (attempt %d/%d), retrying in %.1fs: %s",
|
||||
operation_name, attempt, max_attempts, delay, e
|
||||
"%s failed (attempt %d/%d), retrying in %.1fs: %s", operation_name, attempt, max_attempts, delay, e
|
||||
)
|
||||
await asyncio.sleep(delay)
|
||||
else:
|
||||
logger.error(
|
||||
"%s failed after %d attempts: %s",
|
||||
operation_name, max_attempts, e
|
||||
)
|
||||
logger.error("%s failed after %d attempts: %s", operation_name, max_attempts, e)
|
||||
|
||||
if last_exception is not None:
|
||||
raise last_exception
|
||||
|
||||
Reference in New Issue
Block a user