diff --git a/api/config_analyzer.py b/api/config_analyzer.py index 644ade7..61c1931 100644 --- a/api/config_analyzer.py +++ b/api/config_analyzer.py @@ -4,11 +4,10 @@ Config Analyzer - Extract metadata from Skill Seekers config files """ import json -import os import subprocess -from pathlib import Path -from typing import List, Dict, Any, Optional from datetime import datetime +from pathlib import Path +from typing import Any class ConfigAnalyzer: @@ -16,27 +15,13 @@ class ConfigAnalyzer: # Category mapping based on config content CATEGORY_MAPPING = { - "web-frameworks": [ - "react", "vue", "django", "fastapi", "laravel", "astro", "hono" - ], - "game-engines": [ - "godot", "unity", "unreal" - ], - "devops": [ - "kubernetes", "ansible", "docker", "terraform" - ], - "css-frameworks": [ - "tailwind", "bootstrap", "bulma" - ], - "development-tools": [ - "claude-code", "vscode", "git" - ], - "gaming": [ - "steam" - ], - "testing": [ - "pytest", "jest", "test" - ] + "web-frameworks": ["react", "vue", "django", "fastapi", "laravel", "astro", "hono"], + "game-engines": ["godot", "unity", "unreal"], + "devops": ["kubernetes", "ansible", "docker", "terraform"], + "css-frameworks": ["tailwind", "bootstrap", "bulma"], + "development-tools": ["claude-code", "vscode", "git"], + "gaming": ["steam"], + "testing": ["pytest", "jest", "test"], } # Tag extraction keywords @@ -50,7 +35,7 @@ class ConfigAnalyzer: "game-development": ["godot", "unity", "unreal", "game"], "devops": ["kubernetes", "ansible", "docker", "k8s", "devops"], "documentation": ["docs", "documentation"], - "testing": ["test", "testing", "pytest", "jest"] + "testing": ["test", "testing", "pytest", "jest"], } def __init__(self, config_dir: Path, base_url: str = "https://api.skillseekersweb.com"): @@ -67,7 +52,7 @@ class ConfigAnalyzer: if not self.config_dir.exists(): raise ValueError(f"Config directory not found: {self.config_dir}") - def analyze_all_configs(self) -> List[Dict[str, Any]]: + def analyze_all_configs(self) -> list[dict[str, Any]]: """ Analyze all config files and extract metadata @@ -92,7 +77,7 @@ class ConfigAnalyzer: return configs - def analyze_config(self, config_path: Path) -> Optional[Dict[str, Any]]: + def analyze_config(self, config_path: Path) -> dict[str, Any] | None: """ Analyze a single config file and extract metadata @@ -104,7 +89,7 @@ class ConfigAnalyzer: """ try: # Read config file - with open(config_path, 'r') as f: + with open(config_path) as f: config_data = json.load(f) # Skip if no name field @@ -147,7 +132,7 @@ class ConfigAnalyzer: "file_size": file_size, "last_updated": last_updated, "download_url": download_url, - "config_file": config_path.name + "config_file": config_path.name, } except json.JSONDecodeError as e: @@ -157,7 +142,7 @@ class ConfigAnalyzer: print(f"Error analyzing {config_path.name}: {e}") return None - def get_config_by_name(self, name: str) -> Optional[Dict[str, Any]]: + def get_config_by_name(self, name: str) -> dict[str, Any] | None: """ Get config metadata by name @@ -173,7 +158,7 @@ class ConfigAnalyzer: return config return None - def _determine_type(self, config_data: Dict[str, Any]) -> str: + def _determine_type(self, config_data: dict[str, Any]) -> str: """ Determine if config is single-source or unified @@ -193,7 +178,7 @@ class ConfigAnalyzer: return "single-source" - def _get_primary_source(self, config_data: Dict[str, Any], config_type: str) -> str: + def _get_primary_source(self, config_data: dict[str, Any], config_type: str) -> str: """ Get primary source URL/repo @@ -227,7 +212,7 @@ class ConfigAnalyzer: return "Unknown" - def _categorize_config(self, name: str, description: str, config_data: Dict[str, Any]) -> str: + def _categorize_config(self, name: str, description: str, config_data: dict[str, Any]) -> str: """ Auto-categorize config based on name and content @@ -261,7 +246,7 @@ class ConfigAnalyzer: # Default to uncategorized return "uncategorized" - def _extract_tags(self, name: str, description: str, config_data: Dict[str, Any]) -> List[str]: + def _extract_tags(self, name: str, description: str, config_data: dict[str, Any]) -> list[str]: """ Extract relevant tags from config @@ -288,18 +273,26 @@ class ConfigAnalyzer: tags.add("multi-source") # Add source type tags - if "base_url" in config_data or (config_type == "unified" and any(s.get("type") == "documentation" for s in config_data.get("sources", []))): + if "base_url" in config_data or ( + config_type == "unified" and any(s.get("type") == "documentation" for s in config_data.get("sources", [])) + ): tags.add("documentation") - if "repo" in config_data or (config_type == "unified" and any(s.get("type") == "github" for s in config_data.get("sources", []))): + if "repo" in config_data or ( + config_type == "unified" and any(s.get("type") == "github" for s in config_data.get("sources", [])) + ): tags.add("github") - if "pdf" in config_data or "pdf_url" in config_data or (config_type == "unified" and any(s.get("type") == "pdf" for s in config_data.get("sources", []))): + if ( + "pdf" in config_data + or "pdf_url" in config_data + or (config_type == "unified" and any(s.get("type") == "pdf" for s in config_data.get("sources", []))) + ): tags.add("pdf") return sorted(list(tags)) - def _get_max_pages(self, config_data: Dict[str, Any]) -> Optional[int]: + def _get_max_pages(self, config_data: dict[str, Any]) -> int | None: """ Get max_pages value from config @@ -338,7 +331,7 @@ class ConfigAnalyzer: cwd=config_path.parent.parent, capture_output=True, text=True, - timeout=5 + timeout=5, ) if result.returncode == 0 and result.stdout.strip(): diff --git a/api/main.py b/api/main.py index 27b8383..3274cd4 100644 --- a/api/main.py +++ b/api/main.py @@ -4,21 +4,20 @@ Skill Seekers Config API FastAPI backend for listing available skill configs """ -from fastapi import FastAPI, HTTPException -from fastapi.middleware.cors import CORSMiddleware -from fastapi.responses import JSONResponse, FileResponse -from typing import List, Dict, Any, Optional -import os from pathlib import Path +from typing import Any from config_analyzer import ConfigAnalyzer +from fastapi import FastAPI, HTTPException +from fastapi.middleware.cors import CORSMiddleware +from fastapi.responses import FileResponse app = FastAPI( title="Skill Seekers Config API", description="API for discovering and downloading Skill Seekers configuration files", version="1.0.0", docs_url="/docs", - redoc_url="/redoc" + redoc_url="/redoc", ) # CORS middleware - allow all origins for public API @@ -54,16 +53,12 @@ async def root(): }, "repository": "https://github.com/yusufkaraaslan/Skill_Seekers", "configs_repository": "https://github.com/yusufkaraaslan/skill-seekers-configs", - "website": "https://api.skillseekersweb.com" + "website": "https://api.skillseekersweb.com", } @app.get("/api/configs") -async def list_configs( - category: Optional[str] = None, - tag: Optional[str] = None, - type: Optional[str] = None -) -> Dict[str, Any]: +async def list_configs(category: str | None = None, tag: str | None = None, type: str | None = None) -> dict[str, Any]: """ List all available configs with metadata @@ -102,7 +97,7 @@ async def list_configs( "version": "1.0.0", "total": len(configs), "filters": filters_applied if filters_applied else None, - "configs": configs + "configs": configs, } except Exception as e: @@ -110,7 +105,7 @@ async def list_configs( @app.get("/api/configs/{name}") -async def get_config(name: str) -> Dict[str, Any]: +async def get_config(name: str) -> dict[str, Any]: """ Get detailed information about a specific config @@ -124,10 +119,7 @@ async def get_config(name: str) -> Dict[str, Any]: config = analyzer.get_config_by_name(name) if not config: - raise HTTPException( - status_code=404, - detail=f"Config '{name}' not found" - ) + raise HTTPException(status_code=404, detail=f"Config '{name}' not found") return config @@ -138,7 +130,7 @@ async def get_config(name: str) -> Dict[str, Any]: @app.get("/api/categories") -async def list_categories() -> Dict[str, Any]: +async def list_categories() -> dict[str, Any]: """ List all available categories with config counts @@ -155,10 +147,7 @@ async def list_categories() -> Dict[str, Any]: cat = config.get("category", "uncategorized") category_counts[cat] = category_counts.get(cat, 0) + 1 - return { - "total_categories": len(category_counts), - "categories": category_counts - } + return {"total_categories": len(category_counts), "categories": category_counts} except Exception as e: raise HTTPException(status_code=500, detail=f"Error analyzing categories: {str(e)}") @@ -191,16 +180,9 @@ async def download_config(config_name: str): break if not config_path or not config_path.exists(): - raise HTTPException( - status_code=404, - detail=f"Config file '{config_name}' not found" - ) + raise HTTPException(status_code=404, detail=f"Config file '{config_name}' not found") - return FileResponse( - path=config_path, - media_type="application/json", - filename=config_name - ) + return FileResponse(path=config_path, media_type="application/json", filename=config_name) except HTTPException: raise @@ -216,4 +198,5 @@ async def health_check(): if __name__ == "__main__": import uvicorn + uvicorn.run(app, host="0.0.0.0", port=8000) diff --git a/demo_conflicts.py b/demo_conflicts.py index 776ad50..f83be4d 100644 --- a/demo_conflicts.py +++ b/demo_conflicts.py @@ -6,14 +6,13 @@ This demonstrates the unified scraper's ability to detect and report conflicts between documentation and code implementation. """ -import sys import json +import sys from pathlib import Path # Add CLI to path -sys.path.insert(0, str(Path(__file__).parent / 'cli')) +sys.path.insert(0, str(Path(__file__).parent / "cli")) -from conflict_detector import ConflictDetector print("=" * 70) print("UNIFIED SCRAPER - CONFLICT DETECTION DEMO") @@ -26,11 +25,11 @@ print(" - Documentation APIs from example docs") print(" - Code APIs from example repository") print() -with open('cli/conflicts.json', 'r') as f: +with open("cli/conflicts.json") as f: conflicts_data = json.load(f) -conflicts = conflicts_data['conflicts'] -summary = conflicts_data['summary'] +conflicts = conflicts_data["conflicts"] +summary = conflicts_data["summary"] print(f"āœ… Loaded {summary['total']} conflicts") print() @@ -45,14 +44,14 @@ print(f"šŸ“Š **Total Conflicts**: {summary['total']}") print() print("**By Type:**") -for conflict_type, count in summary['by_type'].items(): +for conflict_type, count in summary["by_type"].items(): if count > 0: emoji = "šŸ“–" if conflict_type == "missing_in_docs" else "šŸ’»" if conflict_type == "missing_in_code" else "āš ļø" print(f" {emoji} {conflict_type}: {count}") print() print("**By Severity:**") -for severity, count in summary['by_severity'].items(): +for severity, count in summary["by_severity"].items(): if count > 0: emoji = "šŸ”“" if severity == "high" else "🟔" if severity == "medium" else "🟢" print(f" {emoji} {severity.upper()}: {count}") @@ -65,9 +64,9 @@ print("=" * 70) print() # Group by severity -high = [c for c in conflicts if c['severity'] == 'high'] -medium = [c for c in conflicts if c['severity'] == 'medium'] -low = [c for c in conflicts if c['severity'] == 'low'] +high = [c for c in conflicts if c["severity"] == "high"] +medium = [c for c in conflicts if c["severity"] == "medium"] +low = [c for c in conflicts if c["severity"] == "low"] # Show high severity first if high: @@ -80,14 +79,14 @@ if high: print(f"**Issue**: {conflict['difference']}") print(f"**Suggestion**: {conflict['suggestion']}") - if conflict['docs_info']: - print(f"\n**Documented as**:") + if conflict["docs_info"]: + print("\n**Documented as**:") print(f" Signature: {conflict['docs_info'].get('raw_signature', 'N/A')}") - if conflict['code_info']: - print(f"\n**Implemented as**:") - params = conflict['code_info'].get('parameters', []) - param_str = ', '.join(f"{p['name']}: {p.get('type_hint', 'Any')}" for p in params if p['name'] != 'self') + if conflict["code_info"]: + print("\n**Implemented as**:") + params = conflict["code_info"].get("parameters", []) + param_str = ", ".join(f"{p['name']}: {p.get('type_hint', 'Any')}" for p in params if p["name"] != "self") print(f" Signature: {conflict['code_info']['name']}({param_str})") print(f" Return type: {conflict['code_info'].get('return_type', 'None')}") print(f" Location: {conflict['code_info'].get('source', 'N/A')}:{conflict['code_info'].get('line', '?')}") @@ -103,7 +102,7 @@ if medium: print(f"**Type**: {conflict['type']}") print(f"**Issue**: {conflict['difference']}") - if conflict['code_info']: + if conflict["code_info"]: print(f"**Location**: {conflict['code_info'].get('source', 'N/A')}") if len(medium) > 3: @@ -128,30 +127,30 @@ print() print(f"āš ļø **Conflict**: {example_conflict['difference']}") print() -if example_conflict.get('docs_info'): +if example_conflict.get("docs_info"): print("**Documentation says:**") print("```") - print(example_conflict['docs_info'].get('raw_signature', 'N/A')) + print(example_conflict["docs_info"].get("raw_signature", "N/A")) print("```") print() -if example_conflict.get('code_info'): +if example_conflict.get("code_info"): print("**Code implementation:**") print("```python") - params = example_conflict['code_info'].get('parameters', []) + params = example_conflict["code_info"].get("parameters", []) param_strs = [] for p in params: - if p['name'] == 'self': + if p["name"] == "self": continue - param_str = p['name'] - if p.get('type_hint'): + param_str = p["name"] + if p.get("type_hint"): param_str += f": {p['type_hint']}" - if p.get('default'): + if p.get("default"): param_str += f" = {p['default']}" param_strs.append(param_str) sig = f"def {example_conflict['code_info']['name']}({', '.join(param_strs)})" - if example_conflict['code_info'].get('return_type'): + if example_conflict["code_info"].get("return_type"): sig += f" -> {example_conflict['code_info']['return_type']}" print(sig) diff --git a/examples/test_http_server.py b/examples/test_http_server.py index 350f8a1..73799c3 100644 --- a/examples/test_http_server.py +++ b/examples/test_http_server.py @@ -10,8 +10,9 @@ Usage: import asyncio import subprocess -import time import sys +import time + import requests @@ -47,7 +48,7 @@ async def test_http_server(): print("3. Testing health check endpoint...") response = requests.get("http://127.0.0.1:8765/health", timeout=5) if response.status_code == 200: - print(f" āœ“ Health check passed") + print(" āœ“ Health check passed") print(f" Response: {response.json()}") else: print(f" āœ— Health check failed: {response.status_code}") @@ -57,13 +58,11 @@ async def test_http_server(): print("4. Testing SSE endpoint availability...") # Just check if the endpoint exists (full SSE testing requires MCP client) try: - response = requests.get( - "http://127.0.0.1:8765/sse", timeout=5, stream=True - ) + response = requests.get("http://127.0.0.1:8765/sse", timeout=5, stream=True) print(f" āœ“ SSE endpoint is available (status: {response.status_code})") except Exception as e: print(f" ℹ SSE endpoint response: {e}") - print(f" (This is expected - full SSE testing requires MCP client)") + print(" (This is expected - full SSE testing requires MCP client)") print() print("=" * 60) @@ -71,13 +70,13 @@ async def test_http_server(): print("=" * 60) print() print("Server Configuration for Claude Desktop:") - print('{') + print("{") print(' "mcpServers": {') print(' "skill-seeker": {') print(' "url": "http://127.0.0.1:8765/sse"') - print(' }') - print(' }') - print('}') + print(" }") + print(" }") + print("}") print() return True diff --git a/pyproject.toml b/pyproject.toml index fbc2a14..af4907f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -167,10 +167,52 @@ exclude_lines = [ "@abstractmethod", ] +[tool.ruff] +line-length = 120 +target-version = "py310" +src = ["src", "tests"] + +[tool.ruff.lint] +select = [ + "E", # pycodestyle errors + "W", # pycodestyle warnings + "F", # Pyflakes + "I", # isort + "B", # flake8-bugbear + "C4", # flake8-comprehensions + "UP", # pyupgrade + "ARG", # flake8-unused-arguments + "SIM", # flake8-simplify +] +ignore = [ + "E501", # line too long (handled by formatter) +] + +[tool.ruff.lint.isort] +known-first-party = ["skill_seekers"] + +[tool.mypy] +python_version = "3.10" +warn_return_any = true +warn_unused_configs = true +disallow_untyped_defs = false +disallow_incomplete_defs = false +check_untyped_defs = true +ignore_missing_imports = true +show_error_codes = true +pretty = true + +[[tool.mypy.overrides]] +module = "tests.*" +disallow_untyped_defs = false +check_untyped_defs = false + [dependency-groups] dev = [ "pytest>=8.4.2", "pytest-asyncio>=0.24.0", "pytest-cov>=7.0.0", "coverage>=7.11.0", + "ruff>=0.14.13", + "mypy>=1.19.1", ] diff --git a/src/skill_seekers/cli/adaptors/__init__.py b/src/skill_seekers/cli/adaptors/__init__.py index 92cae46..8207298 100644 --- a/src/skill_seekers/cli/adaptors/__init__.py +++ b/src/skill_seekers/cli/adaptors/__init__.py @@ -33,17 +33,17 @@ except ImportError: # Registry of available adaptors -ADAPTORS: Dict[str, Type[SkillAdaptor]] = {} +ADAPTORS: dict[str, type[SkillAdaptor]] = {} # Register adaptors that are implemented if ClaudeAdaptor: - ADAPTORS['claude'] = ClaudeAdaptor + ADAPTORS["claude"] = ClaudeAdaptor if GeminiAdaptor: - ADAPTORS['gemini'] = GeminiAdaptor + ADAPTORS["gemini"] = GeminiAdaptor if OpenAIAdaptor: - ADAPTORS['openai'] = OpenAIAdaptor + ADAPTORS["openai"] = OpenAIAdaptor if MarkdownAdaptor: - ADAPTORS['markdown'] = MarkdownAdaptor + ADAPTORS["markdown"] = MarkdownAdaptor def get_adaptor(platform: str, config: dict = None) -> SkillAdaptor: @@ -65,15 +65,11 @@ def get_adaptor(platform: str, config: dict = None) -> SkillAdaptor: >>> adaptor = get_adaptor('gemini', {'api_version': 'v1beta'}) """ if platform not in ADAPTORS: - available = ', '.join(ADAPTORS.keys()) + available = ", ".join(ADAPTORS.keys()) if not ADAPTORS: - raise ValueError( - f"No adaptors are currently implemented. " - f"Platform '{platform}' is not available." - ) + raise ValueError(f"No adaptors are currently implemented. Platform '{platform}' is not available.") raise ValueError( - f"Platform '{platform}' is not supported or not yet implemented. " - f"Available platforms: {available}" + f"Platform '{platform}' is not supported or not yet implemented. Available platforms: {available}" ) adaptor_class = ADAPTORS[platform] @@ -115,10 +111,10 @@ def is_platform_available(platform: str) -> bool: # Export public interface __all__ = [ - 'SkillAdaptor', - 'SkillMetadata', - 'get_adaptor', - 'list_platforms', - 'is_platform_available', - 'ADAPTORS', + "SkillAdaptor", + "SkillMetadata", + "get_adaptor", + "list_platforms", + "is_platform_available", + "ADAPTORS", ] diff --git a/src/skill_seekers/cli/adaptors/base.py b/src/skill_seekers/cli/adaptors/base.py index f390503..fdfabff 100644 --- a/src/skill_seekers/cli/adaptors/base.py +++ b/src/skill_seekers/cli/adaptors/base.py @@ -7,18 +7,19 @@ This enables Skill Seekers to generate skills for multiple LLM platforms (Claude """ from abc import ABC, abstractmethod -from pathlib import Path -from typing import Dict, Any, Optional from dataclasses import dataclass, field +from pathlib import Path +from typing import Any @dataclass class SkillMetadata: """Universal skill metadata used across all platforms""" + name: str description: str version: str = "1.0.0" - author: Optional[str] = None + author: str | None = None tags: list[str] = field(default_factory=list) @@ -34,11 +35,11 @@ class SkillAdaptor(ABC): """ # Platform identifiers (override in subclasses) - PLATFORM: str = "unknown" # e.g., "claude", "gemini", "openai" - PLATFORM_NAME: str = "Unknown" # e.g., "Claude AI (Anthropic)" - DEFAULT_API_ENDPOINT: Optional[str] = None + PLATFORM: str = "unknown" # e.g., "claude", "gemini", "openai" + PLATFORM_NAME: str = "Unknown" # e.g., "Claude AI (Anthropic)" + DEFAULT_API_ENDPOINT: str | None = None - def __init__(self, config: Optional[Dict[str, Any]] = None): + def __init__(self, config: dict[str, Any] | None = None): """ Initialize adaptor with optional configuration. @@ -86,7 +87,7 @@ class SkillAdaptor(ABC): pass @abstractmethod - def upload(self, package_path: Path, api_key: str, **kwargs) -> Dict[str, Any]: + def upload(self, package_path: Path, api_key: str, **kwargs) -> dict[str, Any]: """ Upload packaged skill to platform. @@ -168,11 +169,11 @@ class SkillAdaptor(ABC): if not skill_md_path.exists(): return "" - content = skill_md_path.read_text(encoding='utf-8') + content = skill_md_path.read_text(encoding="utf-8") # Strip YAML frontmatter if present - if content.startswith('---'): - parts = content.split('---', 2) + if content.startswith("---"): + parts = content.split("---", 2) if len(parts) >= 3: return parts[2].strip() @@ -193,7 +194,7 @@ class SkillAdaptor(ABC): return "See references/ directory for documentation." # Read index and extract relevant sections - content = index_path.read_text(encoding='utf-8') + content = index_path.read_text(encoding="utf-8") return content[:500] + "..." if len(content) > 500 else content def _generate_toc(self, skill_dir: Path) -> str: @@ -214,7 +215,7 @@ class SkillAdaptor(ABC): for ref_file in sorted(refs_dir.glob("*.md")): if ref_file.name == "index.md": continue - title = ref_file.stem.replace('_', ' ').title() + title = ref_file.stem.replace("_", " ").title() toc_lines.append(f"- [{title}](references/{ref_file.name})") return "\n".join(toc_lines) diff --git a/src/skill_seekers/cli/adaptors/claude.py b/src/skill_seekers/cli/adaptors/claude.py index 267a69f..4e97a40 100644 --- a/src/skill_seekers/cli/adaptors/claude.py +++ b/src/skill_seekers/cli/adaptors/claude.py @@ -6,10 +6,9 @@ Implements platform-specific handling for Claude AI (Anthropic) skills. Refactored from upload_skill.py and enhance_skill.py. """ -import os import zipfile from pathlib import Path -from typing import Dict, Any +from typing import Any from .base import SkillAdaptor, SkillMetadata @@ -101,16 +100,16 @@ version: {metadata.version} skill_dir = Path(skill_dir) # Determine output filename - if output_path.is_dir() or str(output_path).endswith('/'): + if output_path.is_dir() or str(output_path).endswith("/"): output_path = Path(output_path) / f"{skill_dir.name}.zip" - elif not str(output_path).endswith('.zip'): - output_path = Path(str(output_path) + '.zip') + elif not str(output_path).endswith(".zip"): + output_path = Path(str(output_path) + ".zip") output_path = Path(output_path) output_path.parent.mkdir(parents=True, exist_ok=True) # Create ZIP file - with zipfile.ZipFile(output_path, 'w', zipfile.ZIP_DEFLATED) as zf: + with zipfile.ZipFile(output_path, "w", zipfile.ZIP_DEFLATED) as zf: # Add SKILL.md (required) skill_md = skill_dir / "SKILL.md" if skill_md.exists(): @@ -120,7 +119,7 @@ version: {metadata.version} refs_dir = skill_dir / "references" if refs_dir.exists(): for ref_file in refs_dir.rglob("*"): - if ref_file.is_file() and not ref_file.name.startswith('.'): + if ref_file.is_file() and not ref_file.name.startswith("."): arcname = ref_file.relative_to(skill_dir) zf.write(ref_file, str(arcname)) @@ -128,7 +127,7 @@ version: {metadata.version} scripts_dir = skill_dir / "scripts" if scripts_dir.exists(): for script_file in scripts_dir.rglob("*"): - if script_file.is_file() and not script_file.name.startswith('.'): + if script_file.is_file() and not script_file.name.startswith("."): arcname = script_file.relative_to(skill_dir) zf.write(script_file, str(arcname)) @@ -136,13 +135,13 @@ version: {metadata.version} assets_dir = skill_dir / "assets" if assets_dir.exists(): for asset_file in assets_dir.rglob("*"): - if asset_file.is_file() and not asset_file.name.startswith('.'): + if asset_file.is_file() and not asset_file.name.startswith("."): arcname = asset_file.relative_to(skill_dir) zf.write(asset_file, str(arcname)) return output_path - def upload(self, package_path: Path, api_key: str, **kwargs) -> Dict[str, Any]: + def upload(self, package_path: Path, api_key: str, **kwargs) -> dict[str, Any]: """ Upload skill ZIP to Anthropic Skills API. @@ -159,130 +158,99 @@ version: {metadata.version} import requests except ImportError: return { - 'success': False, - 'skill_id': None, - 'url': None, - 'message': 'requests library not installed. Run: pip install requests' + "success": False, + "skill_id": None, + "url": None, + "message": "requests library not installed. Run: pip install requests", } # Validate ZIP file package_path = Path(package_path) if not package_path.exists(): - return { - 'success': False, - 'skill_id': None, - 'url': None, - 'message': f'File not found: {package_path}' - } + return {"success": False, "skill_id": None, "url": None, "message": f"File not found: {package_path}"} - if not package_path.suffix == '.zip': - return { - 'success': False, - 'skill_id': None, - 'url': None, - 'message': f'Not a ZIP file: {package_path}' - } + if not package_path.suffix == ".zip": + return {"success": False, "skill_id": None, "url": None, "message": f"Not a ZIP file: {package_path}"} # Prepare API request api_url = self.DEFAULT_API_ENDPOINT - headers = { - "x-api-key": api_key, - "anthropic-version": "2023-06-01", - "anthropic-beta": "skills-2025-10-02" - } + headers = {"x-api-key": api_key, "anthropic-version": "2023-06-01", "anthropic-beta": "skills-2025-10-02"} - timeout = kwargs.get('timeout', 60) + timeout = kwargs.get("timeout", 60) try: # Read ZIP file - with open(package_path, 'rb') as f: + with open(package_path, "rb") as f: zip_data = f.read() # Upload skill - files = { - 'files[]': (package_path.name, zip_data, 'application/zip') - } + files = {"files[]": (package_path.name, zip_data, "application/zip")} - response = requests.post( - api_url, - headers=headers, - files=files, - timeout=timeout - ) + response = requests.post(api_url, headers=headers, files=files, timeout=timeout) # Check response if response.status_code == 200: # Extract skill ID if available try: response_data = response.json() - skill_id = response_data.get('id') + skill_id = response_data.get("id") except: skill_id = None return { - 'success': True, - 'skill_id': skill_id, - 'url': 'https://claude.ai/skills', - 'message': 'Skill uploaded successfully to Claude AI' + "success": True, + "skill_id": skill_id, + "url": "https://claude.ai/skills", + "message": "Skill uploaded successfully to Claude AI", } elif response.status_code == 401: return { - 'success': False, - 'skill_id': None, - 'url': None, - 'message': 'Authentication failed. Check your ANTHROPIC_API_KEY' + "success": False, + "skill_id": None, + "url": None, + "message": "Authentication failed. Check your ANTHROPIC_API_KEY", } elif response.status_code == 400: try: - error_msg = response.json().get('error', {}).get('message', 'Unknown error') + error_msg = response.json().get("error", {}).get("message", "Unknown error") except: - error_msg = 'Invalid skill format' + error_msg = "Invalid skill format" return { - 'success': False, - 'skill_id': None, - 'url': None, - 'message': f'Invalid skill format: {error_msg}' + "success": False, + "skill_id": None, + "url": None, + "message": f"Invalid skill format: {error_msg}", } else: try: - error_msg = response.json().get('error', {}).get('message', 'Unknown error') + error_msg = response.json().get("error", {}).get("message", "Unknown error") except: - error_msg = f'HTTP {response.status_code}' + error_msg = f"HTTP {response.status_code}" - return { - 'success': False, - 'skill_id': None, - 'url': None, - 'message': f'Upload failed: {error_msg}' - } + return {"success": False, "skill_id": None, "url": None, "message": f"Upload failed: {error_msg}"} except requests.exceptions.Timeout: return { - 'success': False, - 'skill_id': None, - 'url': None, - 'message': 'Upload timed out. Try again or use manual upload' + "success": False, + "skill_id": None, + "url": None, + "message": "Upload timed out. Try again or use manual upload", } except requests.exceptions.ConnectionError: return { - 'success': False, - 'skill_id': None, - 'url': None, - 'message': 'Connection error. Check your internet connection' + "success": False, + "skill_id": None, + "url": None, + "message": "Connection error. Check your internet connection", } except Exception as e: - return { - 'success': False, - 'skill_id': None, - 'url': None, - 'message': f'Unexpected error: {str(e)}' - } + return {"success": False, "skill_id": None, "url": None, "message": f"Unexpected error: {str(e)}"} def validate_api_key(self, api_key: str) -> bool: """ @@ -294,7 +262,7 @@ version: {metadata.version} Returns: True if key starts with 'sk-ant-' """ - return api_key.strip().startswith('sk-ant-') + return api_key.strip().startswith("sk-ant-") def get_env_var_name(self) -> str: """ @@ -355,17 +323,13 @@ version: {metadata.version} # Read current SKILL.md current_skill_md = None if skill_md_path.exists(): - current_skill_md = skill_md_path.read_text(encoding='utf-8') + current_skill_md = skill_md_path.read_text(encoding="utf-8") print(f" ℹ Found existing SKILL.md ({len(current_skill_md)} chars)") else: - print(f" ℹ No existing SKILL.md, will create new one") + print(" ℹ No existing SKILL.md, will create new one") # Build enhancement prompt - prompt = self._build_enhancement_prompt( - skill_dir.name, - references, - current_skill_md - ) + prompt = self._build_enhancement_prompt(skill_dir.name, references, current_skill_md) print("\nšŸ¤– Asking Claude to enhance SKILL.md...") print(f" Input: {len(prompt):,} characters") @@ -377,10 +341,7 @@ version: {metadata.version} model="claude-sonnet-4-20250514", max_tokens=4096, temperature=0.3, - messages=[{ - "role": "user", - "content": prompt - }] + messages=[{"role": "user", "content": prompt}], ) enhanced_content = message.content[0].text @@ -388,13 +349,13 @@ version: {metadata.version} # Backup original if skill_md_path.exists(): - backup_path = skill_md_path.with_suffix('.md.backup') + backup_path = skill_md_path.with_suffix(".md.backup") skill_md_path.rename(backup_path) print(f" šŸ’¾ Backed up original to: {backup_path.name}") # Save enhanced version - skill_md_path.write_text(enhanced_content, encoding='utf-8') - print(f" āœ… Saved enhanced SKILL.md") + skill_md_path.write_text(enhanced_content, encoding="utf-8") + print(" āœ… Saved enhanced SKILL.md") return True @@ -402,7 +363,7 @@ version: {metadata.version} print(f"āŒ Error calling Claude API: {e}") return False - def _read_reference_files(self, references_dir: Path, max_chars: int = 200000) -> Dict[str, str]: + def _read_reference_files(self, references_dir: Path, max_chars: int = 200000) -> dict[str, str]: """ Read reference markdown files from skill directory. @@ -425,7 +386,7 @@ version: {metadata.version} break try: - content = ref_file.read_text(encoding='utf-8') + content = ref_file.read_text(encoding="utf-8") # Limit individual file size if len(content) > 30000: content = content[:30000] + "\n\n...(truncated)" @@ -439,10 +400,7 @@ version: {metadata.version} return references def _build_enhancement_prompt( - self, - skill_name: str, - references: Dict[str, str], - current_skill_md: str = None + self, skill_name: str, references: dict[str, str], current_skill_md: str = None ) -> str: """ Build Claude API prompt for enhancement. @@ -460,9 +418,9 @@ version: {metadata.version} I've scraped documentation and organized it into reference files. Your job is to create an EXCELLENT SKILL.md that will help Claude use this documentation effectively. CURRENT SKILL.MD: -{'```markdown' if current_skill_md else '(none - create from scratch)'} -{current_skill_md or 'No existing SKILL.md'} -{'```' if current_skill_md else ''} +{"```markdown" if current_skill_md else "(none - create from scratch)"} +{current_skill_md or "No existing SKILL.md"} +{"```" if current_skill_md else ""} REFERENCE DOCUMENTATION: """ diff --git a/src/skill_seekers/cli/adaptors/gemini.py b/src/skill_seekers/cli/adaptors/gemini.py index 5d361dd..be5a396 100644 --- a/src/skill_seekers/cli/adaptors/gemini.py +++ b/src/skill_seekers/cli/adaptors/gemini.py @@ -6,11 +6,11 @@ Implements platform-specific handling for Google Gemini skills. Uses Gemini Files API for grounding and Gemini 2.0 Flash for enhancement. """ +import json import os import tarfile -import json from pathlib import Path -from typing import Dict, Any +from typing import Any from .base import SkillAdaptor, SkillMetadata @@ -105,20 +105,20 @@ See the references directory for complete documentation with examples and best p skill_dir = Path(skill_dir) # Determine output filename - if output_path.is_dir() or str(output_path).endswith('/'): + if output_path.is_dir() or str(output_path).endswith("/"): output_path = Path(output_path) / f"{skill_dir.name}-gemini.tar.gz" - elif not str(output_path).endswith('.tar.gz'): + elif not str(output_path).endswith(".tar.gz"): # Replace .zip with .tar.gz if needed - output_str = str(output_path).replace('.zip', '.tar.gz') - if not output_str.endswith('.tar.gz'): - output_str += '.tar.gz' + output_str = str(output_path).replace(".zip", ".tar.gz") + if not output_str.endswith(".tar.gz"): + output_str += ".tar.gz" output_path = Path(output_str) output_path = Path(output_path) output_path.parent.mkdir(parents=True, exist_ok=True) # Create tar.gz file - with tarfile.open(output_path, 'w:gz') as tar: + with tarfile.open(output_path, "w:gz") as tar: # Add SKILL.md as system_instructions.md skill_md = skill_dir / "SKILL.md" if skill_md.exists(): @@ -128,21 +128,22 @@ See the references directory for complete documentation with examples and best p refs_dir = skill_dir / "references" if refs_dir.exists(): for ref_file in refs_dir.rglob("*"): - if ref_file.is_file() and not ref_file.name.startswith('.'): + if ref_file.is_file() and not ref_file.name.startswith("."): arcname = ref_file.relative_to(skill_dir) tar.add(ref_file, arcname=str(arcname)) # Create and add metadata file metadata = { - 'platform': 'gemini', - 'name': skill_dir.name, - 'version': '1.0.0', - 'created_with': 'skill-seekers' + "platform": "gemini", + "name": skill_dir.name, + "version": "1.0.0", + "created_with": "skill-seekers", } # Write metadata to temp file and add to archive import tempfile - with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as tmp: + + with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as tmp: json.dump(metadata, tmp, indent=2) tmp_path = tmp.name @@ -153,7 +154,7 @@ See the references directory for complete documentation with examples and best p return output_path - def upload(self, package_path: Path, api_key: str, **kwargs) -> Dict[str, Any]: + def upload(self, package_path: Path, api_key: str, **kwargs) -> dict[str, Any]: """ Upload skill tar.gz to Gemini Files API. @@ -168,30 +169,20 @@ See the references directory for complete documentation with examples and best p # Validate package file FIRST package_path = Path(package_path) if not package_path.exists(): - return { - 'success': False, - 'skill_id': None, - 'url': None, - 'message': f'File not found: {package_path}' - } + return {"success": False, "skill_id": None, "url": None, "message": f"File not found: {package_path}"} - if not package_path.suffix == '.gz': - return { - 'success': False, - 'skill_id': None, - 'url': None, - 'message': f'Not a tar.gz file: {package_path}' - } + if not package_path.suffix == ".gz": + return {"success": False, "skill_id": None, "url": None, "message": f"Not a tar.gz file: {package_path}"} # Check for google-generativeai library try: import google.generativeai as genai except ImportError: return { - 'success': False, - 'skill_id': None, - 'url': None, - 'message': 'google-generativeai library not installed. Run: pip install google-generativeai' + "success": False, + "skill_id": None, + "url": None, + "message": "google-generativeai library not installed. Run: pip install google-generativeai", } # Configure Gemini @@ -200,11 +191,10 @@ See the references directory for complete documentation with examples and best p # Extract tar.gz to temp directory import tempfile - import shutil with tempfile.TemporaryDirectory() as temp_dir: # Extract archive - with tarfile.open(package_path, 'r:gz') as tar: + with tarfile.open(package_path, "r:gz") as tar: tar.extractall(temp_dir) temp_path = Path(temp_dir) @@ -213,17 +203,14 @@ See the references directory for complete documentation with examples and best p main_file = temp_path / "system_instructions.md" if not main_file.exists(): return { - 'success': False, - 'skill_id': None, - 'url': None, - 'message': 'Invalid package: system_instructions.md not found' + "success": False, + "skill_id": None, + "url": None, + "message": "Invalid package: system_instructions.md not found", } # Upload to Files API - uploaded_file = genai.upload_file( - path=str(main_file), - display_name=f"{package_path.stem}_instructions" - ) + uploaded_file = genai.upload_file(path=str(main_file), display_name=f"{package_path.stem}_instructions") # Upload reference files (if any) refs_dir = temp_path / "references" @@ -231,25 +218,19 @@ See the references directory for complete documentation with examples and best p if refs_dir.exists(): for ref_file in refs_dir.glob("*.md"): ref_uploaded = genai.upload_file( - path=str(ref_file), - display_name=f"{package_path.stem}_{ref_file.stem}" + path=str(ref_file), display_name=f"{package_path.stem}_{ref_file.stem}" ) uploaded_refs.append(ref_uploaded.name) return { - 'success': True, - 'skill_id': uploaded_file.name, - 'url': f"https://aistudio.google.com/app/files/{uploaded_file.name}", - 'message': f'Skill uploaded to Google AI Studio ({len(uploaded_refs) + 1} files)' + "success": True, + "skill_id": uploaded_file.name, + "url": f"https://aistudio.google.com/app/files/{uploaded_file.name}", + "message": f"Skill uploaded to Google AI Studio ({len(uploaded_refs) + 1} files)", } except Exception as e: - return { - 'success': False, - 'skill_id': None, - 'url': None, - 'message': f'Upload failed: {str(e)}' - } + return {"success": False, "skill_id": None, "url": None, "message": f"Upload failed: {str(e)}"} def validate_api_key(self, api_key: str) -> bool: """ @@ -261,7 +242,7 @@ See the references directory for complete documentation with examples and best p Returns: True if key starts with 'AIza' """ - return api_key.strip().startswith('AIza') + return api_key.strip().startswith("AIza") def get_env_var_name(self) -> str: """ @@ -319,17 +300,13 @@ See the references directory for complete documentation with examples and best p # Read current SKILL.md current_skill_md = None if skill_md_path.exists(): - current_skill_md = skill_md_path.read_text(encoding='utf-8') + current_skill_md = skill_md_path.read_text(encoding="utf-8") print(f" ℹ Found existing SKILL.md ({len(current_skill_md)} chars)") else: - print(f" ℹ No existing SKILL.md, will create new one") + print(" ℹ No existing SKILL.md, will create new one") # Build enhancement prompt - prompt = self._build_enhancement_prompt( - skill_dir.name, - references, - current_skill_md - ) + prompt = self._build_enhancement_prompt(skill_dir.name, references, current_skill_md) print("\nšŸ¤– Asking Gemini to enhance SKILL.md...") print(f" Input: {len(prompt):,} characters") @@ -337,7 +314,7 @@ See the references directory for complete documentation with examples and best p try: genai.configure(api_key=api_key) - model = genai.GenerativeModel('gemini-2.0-flash-exp') + model = genai.GenerativeModel("gemini-2.0-flash-exp") response = model.generate_content(prompt) @@ -346,13 +323,13 @@ See the references directory for complete documentation with examples and best p # Backup original if skill_md_path.exists(): - backup_path = skill_md_path.with_suffix('.md.backup') + backup_path = skill_md_path.with_suffix(".md.backup") skill_md_path.rename(backup_path) print(f" šŸ’¾ Backed up original to: {backup_path.name}") # Save enhanced version - skill_md_path.write_text(enhanced_content, encoding='utf-8') - print(f" āœ… Saved enhanced SKILL.md") + skill_md_path.write_text(enhanced_content, encoding="utf-8") + print(" āœ… Saved enhanced SKILL.md") return True @@ -360,7 +337,7 @@ See the references directory for complete documentation with examples and best p print(f"āŒ Error calling Gemini API: {e}") return False - def _read_reference_files(self, references_dir: Path, max_chars: int = 200000) -> Dict[str, str]: + def _read_reference_files(self, references_dir: Path, max_chars: int = 200000) -> dict[str, str]: """ Read reference markdown files from skill directory. @@ -383,7 +360,7 @@ See the references directory for complete documentation with examples and best p break try: - content = ref_file.read_text(encoding='utf-8') + content = ref_file.read_text(encoding="utf-8") # Limit individual file size if len(content) > 30000: content = content[:30000] + "\n\n...(truncated)" @@ -397,10 +374,7 @@ See the references directory for complete documentation with examples and best p return references def _build_enhancement_prompt( - self, - skill_name: str, - references: Dict[str, str], - current_skill_md: str = None + self, skill_name: str, references: dict[str, str], current_skill_md: str = None ) -> str: """ Build Gemini API prompt for enhancement. @@ -418,9 +392,9 @@ See the references directory for complete documentation with examples and best p I've scraped documentation and organized it into reference files. Your job is to create an EXCELLENT markdown documentation file that will help Gemini use this documentation effectively. CURRENT DOCUMENTATION: -{'```markdown' if current_skill_md else '(none - create from scratch)'} -{current_skill_md or 'No existing documentation'} -{'```' if current_skill_md else ''} +{"```markdown" if current_skill_md else "(none - create from scratch)"} +{current_skill_md or "No existing documentation"} +{"```" if current_skill_md else ""} REFERENCE DOCUMENTATION: """ diff --git a/src/skill_seekers/cli/adaptors/markdown.py b/src/skill_seekers/cli/adaptors/markdown.py index 2d534ba..323d3e9 100644 --- a/src/skill_seekers/cli/adaptors/markdown.py +++ b/src/skill_seekers/cli/adaptors/markdown.py @@ -8,7 +8,7 @@ No platform-specific features, just clean markdown documentation. import zipfile from pathlib import Path -from typing import Dict, Any +from typing import Any from .base import SkillAdaptor, SkillMetadata @@ -100,33 +100,33 @@ Browse the reference files for detailed information on each topic. All files are skill_dir = Path(skill_dir) # Determine output filename - if output_path.is_dir() or str(output_path).endswith('/'): + if output_path.is_dir() or str(output_path).endswith("/"): output_path = Path(output_path) / f"{skill_dir.name}-markdown.zip" - elif not str(output_path).endswith('.zip'): + elif not str(output_path).endswith(".zip"): # Replace extension if needed - output_str = str(output_path).replace('.tar.gz', '.zip') - if not output_str.endswith('-markdown.zip'): - output_str = output_str.replace('.zip', '-markdown.zip') - if not output_str.endswith('.zip'): - output_str += '.zip' + output_str = str(output_path).replace(".tar.gz", ".zip") + if not output_str.endswith("-markdown.zip"): + output_str = output_str.replace(".zip", "-markdown.zip") + if not output_str.endswith(".zip"): + output_str += ".zip" output_path = Path(output_str) output_path = Path(output_path) output_path.parent.mkdir(parents=True, exist_ok=True) # Create ZIP file - with zipfile.ZipFile(output_path, 'w', zipfile.ZIP_DEFLATED) as zf: + with zipfile.ZipFile(output_path, "w", zipfile.ZIP_DEFLATED) as zf: # Add SKILL.md as README.md skill_md = skill_dir / "SKILL.md" if skill_md.exists(): - content = skill_md.read_text(encoding='utf-8') + content = skill_md.read_text(encoding="utf-8") zf.writestr("README.md", content) # Add individual reference files refs_dir = skill_dir / "references" if refs_dir.exists(): for ref_file in refs_dir.rglob("*.md"): - if ref_file.is_file() and not ref_file.name.startswith('.'): + if ref_file.is_file() and not ref_file.name.startswith("."): # Preserve directory structure under references/ arcname = ref_file.relative_to(skill_dir) zf.write(ref_file, str(arcname)) @@ -138,20 +138,21 @@ Browse the reference files for detailed information on each topic. All files are # Add metadata file import json + metadata = { - 'platform': 'markdown', - 'name': skill_dir.name, - 'version': '1.0.0', - 'created_with': 'skill-seekers', - 'format': 'universal_markdown', - 'usage': 'Use with any LLM or documentation system' + "platform": "markdown", + "name": skill_dir.name, + "version": "1.0.0", + "created_with": "skill-seekers", + "format": "universal_markdown", + "usage": "Use with any LLM or documentation system", } zf.writestr("metadata.json", json.dumps(metadata, indent=2)) return output_path - def upload(self, package_path: Path, api_key: str, **kwargs) -> Dict[str, Any]: + def upload(self, package_path: Path, api_key: str, **kwargs) -> dict[str, Any]: """ Generic markdown export does not support upload. @@ -166,13 +167,13 @@ Browse the reference files for detailed information on each topic. All files are Result indicating no upload capability """ return { - 'success': False, - 'skill_id': None, - 'url': str(package_path.absolute()), - 'message': ( - 'Generic markdown export does not support automatic upload. ' - f'Your documentation is packaged at: {package_path.absolute()}' - ) + "success": False, + "skill_id": None, + "url": str(package_path.absolute()), + "message": ( + "Generic markdown export does not support automatic upload. " + f"Your documentation is packaged at: {package_path.absolute()}" + ), } def validate_api_key(self, api_key: str) -> bool: @@ -237,10 +238,10 @@ Browse the reference files for detailed information on each topic. All files are # Add main content if skill_md.exists(): - content = skill_md.read_text(encoding='utf-8') + content = skill_md.read_text(encoding="utf-8") # Strip YAML frontmatter if present - if content.startswith('---'): - parts = content.split('---', 2) + if content.startswith("---"): + parts = content.split("---", 2) if len(parts) >= 3: content = parts[2].strip() combined_parts.append(content) @@ -258,7 +259,7 @@ Browse the reference files for detailed information on each topic. All files are continue # Skip index try: - ref_content = ref_file.read_text(encoding='utf-8') + ref_content = ref_file.read_text(encoding="utf-8") combined_parts.append(f"# {ref_file.stem.replace('_', ' ').title()}\n\n") combined_parts.append(ref_content) combined_parts.append("\n\n---\n\n") diff --git a/src/skill_seekers/cli/adaptors/openai.py b/src/skill_seekers/cli/adaptors/openai.py index 4fbbd1c..c272f51 100644 --- a/src/skill_seekers/cli/adaptors/openai.py +++ b/src/skill_seekers/cli/adaptors/openai.py @@ -6,11 +6,10 @@ Implements platform-specific handling for OpenAI ChatGPT Assistants. Uses Assistants API with Vector Store for file search. """ -import os -import zipfile import json +import zipfile from pathlib import Path -from typing import Dict, Any +from typing import Any from .base import SkillAdaptor, SkillMetadata @@ -123,51 +122,51 @@ Always prioritize accuracy by consulting the attached documentation files before skill_dir = Path(skill_dir) # Determine output filename - if output_path.is_dir() or str(output_path).endswith('/'): + if output_path.is_dir() or str(output_path).endswith("/"): output_path = Path(output_path) / f"{skill_dir.name}-openai.zip" - elif not str(output_path).endswith('.zip'): + elif not str(output_path).endswith(".zip"): # Keep .zip extension - if not str(output_path).endswith('-openai.zip'): - output_str = str(output_path).replace('.zip', '-openai.zip') - if not output_str.endswith('.zip'): - output_str += '.zip' + if not str(output_path).endswith("-openai.zip"): + output_str = str(output_path).replace(".zip", "-openai.zip") + if not output_str.endswith(".zip"): + output_str += ".zip" output_path = Path(output_str) output_path = Path(output_path) output_path.parent.mkdir(parents=True, exist_ok=True) # Create ZIP file - with zipfile.ZipFile(output_path, 'w', zipfile.ZIP_DEFLATED) as zf: + with zipfile.ZipFile(output_path, "w", zipfile.ZIP_DEFLATED) as zf: # Add SKILL.md as assistant_instructions.txt skill_md = skill_dir / "SKILL.md" if skill_md.exists(): - instructions = skill_md.read_text(encoding='utf-8') + instructions = skill_md.read_text(encoding="utf-8") zf.writestr("assistant_instructions.txt", instructions) # Add references directory as vector_store_files/ refs_dir = skill_dir / "references" if refs_dir.exists(): for ref_file in refs_dir.rglob("*.md"): - if ref_file.is_file() and not ref_file.name.startswith('.'): + if ref_file.is_file() and not ref_file.name.startswith("."): # Place all reference files in vector_store_files/ arcname = f"vector_store_files/{ref_file.name}" zf.write(ref_file, arcname) # Create and add metadata file metadata = { - 'platform': 'openai', - 'name': skill_dir.name, - 'version': '1.0.0', - 'created_with': 'skill-seekers', - 'model': 'gpt-4o', - 'tools': ['file_search'] + "platform": "openai", + "name": skill_dir.name, + "version": "1.0.0", + "created_with": "skill-seekers", + "model": "gpt-4o", + "tools": ["file_search"], } zf.writestr("openai_metadata.json", json.dumps(metadata, indent=2)) return output_path - def upload(self, package_path: Path, api_key: str, **kwargs) -> Dict[str, Any]: + def upload(self, package_path: Path, api_key: str, **kwargs) -> dict[str, Any]: """ Upload skill ZIP to OpenAI Assistants API. @@ -186,30 +185,20 @@ Always prioritize accuracy by consulting the attached documentation files before # Validate package file FIRST package_path = Path(package_path) if not package_path.exists(): - return { - 'success': False, - 'skill_id': None, - 'url': None, - 'message': f'File not found: {package_path}' - } + return {"success": False, "skill_id": None, "url": None, "message": f"File not found: {package_path}"} - if not package_path.suffix == '.zip': - return { - 'success': False, - 'skill_id': None, - 'url': None, - 'message': f'Not a ZIP file: {package_path}' - } + if not package_path.suffix == ".zip": + return {"success": False, "skill_id": None, "url": None, "message": f"Not a ZIP file: {package_path}"} # Check for openai library try: from openai import OpenAI except ImportError: return { - 'success': False, - 'skill_id': None, - 'url': None, - 'message': 'openai library not installed. Run: pip install openai' + "success": False, + "skill_id": None, + "url": None, + "message": "openai library not installed. Run: pip install openai", } # Configure OpenAI client @@ -218,11 +207,10 @@ Always prioritize accuracy by consulting the attached documentation files before # Extract package to temp directory import tempfile - import shutil with tempfile.TemporaryDirectory() as temp_dir: # Extract ZIP - with zipfile.ZipFile(package_path, 'r') as zf: + with zipfile.ZipFile(package_path, "r") as zf: zf.extractall(temp_dir) temp_path = Path(temp_dir) @@ -231,29 +219,27 @@ Always prioritize accuracy by consulting the attached documentation files before instructions_file = temp_path / "assistant_instructions.txt" if not instructions_file.exists(): return { - 'success': False, - 'skill_id': None, - 'url': None, - 'message': 'Invalid package: assistant_instructions.txt not found' + "success": False, + "skill_id": None, + "url": None, + "message": "Invalid package: assistant_instructions.txt not found", } - instructions = instructions_file.read_text(encoding='utf-8') + instructions = instructions_file.read_text(encoding="utf-8") # Read metadata metadata_file = temp_path / "openai_metadata.json" skill_name = package_path.stem - model = kwargs.get('model', 'gpt-4o') + model = kwargs.get("model", "gpt-4o") if metadata_file.exists(): - with open(metadata_file, 'r') as f: + with open(metadata_file) as f: metadata = json.load(f) - skill_name = metadata.get('name', skill_name) - model = metadata.get('model', model) + skill_name = metadata.get("name", skill_name) + model = metadata.get("model", model) # Create vector store - vector_store = client.beta.vector_stores.create( - name=f"{skill_name} Documentation" - ) + vector_store = client.beta.vector_stores.create(name=f"{skill_name} Documentation") # Upload reference files to vector store vector_files_dir = temp_path / "vector_store_files" @@ -262,19 +248,13 @@ Always prioritize accuracy by consulting the attached documentation files before if vector_files_dir.exists(): for ref_file in vector_files_dir.glob("*.md"): # Upload file - with open(ref_file, 'rb') as f: - uploaded_file = client.files.create( - file=f, - purpose='assistants' - ) + with open(ref_file, "rb") as f: + uploaded_file = client.files.create(file=f, purpose="assistants") file_ids.append(uploaded_file.id) # Attach files to vector store if file_ids: - client.beta.vector_stores.files.create_batch( - vector_store_id=vector_store.id, - file_ids=file_ids - ) + client.beta.vector_stores.files.create_batch(vector_store_id=vector_store.id, file_ids=file_ids) # Create assistant assistant = client.beta.assistants.create( @@ -282,27 +262,18 @@ Always prioritize accuracy by consulting the attached documentation files before instructions=instructions, model=model, tools=[{"type": "file_search"}], - tool_resources={ - "file_search": { - "vector_store_ids": [vector_store.id] - } - } + tool_resources={"file_search": {"vector_store_ids": [vector_store.id]}}, ) return { - 'success': True, - 'skill_id': assistant.id, - 'url': f"https://platform.openai.com/assistants/{assistant.id}", - 'message': f'Assistant created with {len(file_ids)} knowledge files' + "success": True, + "skill_id": assistant.id, + "url": f"https://platform.openai.com/assistants/{assistant.id}", + "message": f"Assistant created with {len(file_ids)} knowledge files", } except Exception as e: - return { - 'success': False, - 'skill_id': None, - 'url': None, - 'message': f'Upload failed: {str(e)}' - } + return {"success": False, "skill_id": None, "url": None, "message": f"Upload failed: {str(e)}"} def validate_api_key(self, api_key: str) -> bool: """ @@ -314,7 +285,7 @@ Always prioritize accuracy by consulting the attached documentation files before Returns: True if key starts with 'sk-' """ - return api_key.strip().startswith('sk-') + return api_key.strip().startswith("sk-") def get_env_var_name(self) -> str: """ @@ -372,17 +343,13 @@ Always prioritize accuracy by consulting the attached documentation files before # Read current SKILL.md current_skill_md = None if skill_md_path.exists(): - current_skill_md = skill_md_path.read_text(encoding='utf-8') + current_skill_md = skill_md_path.read_text(encoding="utf-8") print(f" ℹ Found existing SKILL.md ({len(current_skill_md)} chars)") else: - print(f" ℹ No existing SKILL.md, will create new one") + print(" ℹ No existing SKILL.md, will create new one") # Build enhancement prompt - prompt = self._build_enhancement_prompt( - skill_dir.name, - references, - current_skill_md - ) + prompt = self._build_enhancement_prompt(skill_dir.name, references, current_skill_md) print("\nšŸ¤– Asking GPT-4o to enhance SKILL.md...") print(f" Input: {len(prompt):,} characters") @@ -395,15 +362,12 @@ Always prioritize accuracy by consulting the attached documentation files before messages=[ { "role": "system", - "content": "You are an expert technical writer creating Assistant instructions for OpenAI ChatGPT." + "content": "You are an expert technical writer creating Assistant instructions for OpenAI ChatGPT.", }, - { - "role": "user", - "content": prompt - } + {"role": "user", "content": prompt}, ], temperature=0.3, - max_tokens=4096 + max_tokens=4096, ) enhanced_content = response.choices[0].message.content @@ -411,13 +375,13 @@ Always prioritize accuracy by consulting the attached documentation files before # Backup original if skill_md_path.exists(): - backup_path = skill_md_path.with_suffix('.md.backup') + backup_path = skill_md_path.with_suffix(".md.backup") skill_md_path.rename(backup_path) print(f" šŸ’¾ Backed up original to: {backup_path.name}") # Save enhanced version - skill_md_path.write_text(enhanced_content, encoding='utf-8') - print(f" āœ… Saved enhanced SKILL.md") + skill_md_path.write_text(enhanced_content, encoding="utf-8") + print(" āœ… Saved enhanced SKILL.md") return True @@ -425,7 +389,7 @@ Always prioritize accuracy by consulting the attached documentation files before print(f"āŒ Error calling OpenAI API: {e}") return False - def _read_reference_files(self, references_dir: Path, max_chars: int = 200000) -> Dict[str, str]: + def _read_reference_files(self, references_dir: Path, max_chars: int = 200000) -> dict[str, str]: """ Read reference markdown files from skill directory. @@ -448,7 +412,7 @@ Always prioritize accuracy by consulting the attached documentation files before break try: - content = ref_file.read_text(encoding='utf-8') + content = ref_file.read_text(encoding="utf-8") # Limit individual file size if len(content) > 30000: content = content[:30000] + "\n\n...(truncated)" @@ -462,10 +426,7 @@ Always prioritize accuracy by consulting the attached documentation files before return references def _build_enhancement_prompt( - self, - skill_name: str, - references: Dict[str, str], - current_skill_md: str = None + self, skill_name: str, references: dict[str, str], current_skill_md: str = None ) -> str: """ Build OpenAI API prompt for enhancement. @@ -483,9 +444,9 @@ Always prioritize accuracy by consulting the attached documentation files before I've scraped documentation and organized it into reference files. Your job is to create EXCELLENT Assistant instructions that will help the Assistant use this documentation effectively. CURRENT INSTRUCTIONS: -{'```' if current_skill_md else '(none - create from scratch)'} -{current_skill_md or 'No existing instructions'} -{'```' if current_skill_md else ''} +{"```" if current_skill_md else "(none - create from scratch)"} +{current_skill_md or "No existing instructions"} +{"```" if current_skill_md else ""} REFERENCE DOCUMENTATION: """ diff --git a/src/skill_seekers/cli/ai_enhancer.py b/src/skill_seekers/cli/ai_enhancer.py index 1c42cbe..dbfe971 100644 --- a/src/skill_seekers/cli/ai_enhancer.py +++ b/src/skill_seekers/cli/ai_enhancer.py @@ -17,9 +17,8 @@ Credits: - Graceful degradation if API unavailable """ -import os import logging -from typing import List, Dict, Optional, Any +import os from dataclasses import dataclass logger = logging.getLogger(__name__) @@ -28,18 +27,19 @@ logger = logging.getLogger(__name__) @dataclass class AIAnalysis: """AI analysis result for patterns or examples""" + explanation: str - issues: List[str] - recommendations: List[str] - related_items: List[str] # Related patterns or examples - best_practices: List[str] + issues: list[str] + recommendations: list[str] + related_items: list[str] # Related patterns or examples + best_practices: list[str] confidence_boost: float # -0.2 to +0.2 adjustment to confidence class AIEnhancer: """Base class for AI enhancement""" - def __init__(self, api_key: Optional[str] = None, enabled: bool = True, mode: str = "auto"): + def __init__(self, api_key: str | None = None, enabled: bool = True, mode: str = "auto"): """ Initialize AI enhancer. @@ -53,7 +53,7 @@ class AIEnhancer: """ self.enabled = enabled self.mode = mode - self.api_key = api_key or os.environ.get('ANTHROPIC_API_KEY') + self.api_key = api_key or os.environ.get("ANTHROPIC_API_KEY") self.client = None # Determine actual mode @@ -72,6 +72,7 @@ class AIEnhancer: if self.mode == "api" and self.enabled: try: import anthropic + self.client = anthropic.Anthropic(api_key=self.api_key) logger.info("āœ… AI enhancement enabled (using Claude API)") except ImportError: @@ -88,16 +89,14 @@ class AIEnhancer: logger.info(" Use API mode (set ANTHROPIC_API_KEY) or 'skill-seekers enhance' for SKILL.md") self.enabled = False - def _call_claude(self, prompt: str, max_tokens: int = 1000) -> Optional[str]: + def _call_claude(self, prompt: str, max_tokens: int = 1000) -> str | None: """Call Claude API with error handling""" if not self.client: return None try: response = self.client.messages.create( - model="claude-sonnet-4-20250514", - max_tokens=max_tokens, - messages=[{"role": "user", "content": prompt}] + model="claude-sonnet-4-20250514", max_tokens=max_tokens, messages=[{"role": "user", "content": prompt}] ) return response.content[0].text except Exception as e: @@ -108,7 +107,7 @@ class AIEnhancer: class PatternEnhancer(AIEnhancer): """Enhance design pattern detection with AI analysis""" - def enhance_patterns(self, patterns: List[Dict]) -> List[Dict]: + def enhance_patterns(self, patterns: list[dict]) -> list[dict]: """ Enhance detected patterns with AI analysis. @@ -128,19 +127,19 @@ class PatternEnhancer(AIEnhancer): enhanced = [] for i in range(0, len(patterns), batch_size): - batch = patterns[i:i+batch_size] + batch = patterns[i : i + batch_size] batch_results = self._enhance_pattern_batch(batch) enhanced.extend(batch_results) logger.info(f"āœ… Enhanced {len(enhanced)} patterns") return enhanced - def _enhance_pattern_batch(self, patterns: List[Dict]) -> List[Dict]: + def _enhance_pattern_batch(self, patterns: list[dict]) -> list[dict]: """Enhance a batch of patterns""" # Prepare prompt pattern_descriptions = [] for idx, p in enumerate(patterns): - desc = f"{idx+1}. {p['pattern_type']} in {p.get('class_name', 'unknown')}" + desc = f"{idx + 1}. {p['pattern_type']} in {p.get('class_name', 'unknown')}" desc += f"\n Evidence: {', '.join(p.get('evidence', []))}" pattern_descriptions.append(desc) @@ -166,24 +165,25 @@ Format as JSON array matching input order. Be concise and actionable. try: import json + analyses = json.loads(response) # Merge AI analysis into patterns for idx, pattern in enumerate(patterns): if idx < len(analyses): analysis = analyses[idx] - pattern['ai_analysis'] = { - 'explanation': analysis.get('explanation', ''), - 'issues': analysis.get('issues', []), - 'recommendations': analysis.get('recommendations', []), - 'related_patterns': analysis.get('related_patterns', []), - 'confidence_boost': analysis.get('confidence_boost', 0.0) + pattern["ai_analysis"] = { + "explanation": analysis.get("explanation", ""), + "issues": analysis.get("issues", []), + "recommendations": analysis.get("recommendations", []), + "related_patterns": analysis.get("related_patterns", []), + "confidence_boost": analysis.get("confidence_boost", 0.0), } # Adjust confidence - boost = analysis.get('confidence_boost', 0.0) + boost = analysis.get("confidence_boost", 0.0) if -0.2 <= boost <= 0.2: - pattern['confidence'] = min(1.0, max(0.0, pattern['confidence'] + boost)) + pattern["confidence"] = min(1.0, max(0.0, pattern["confidence"] + boost)) return patterns @@ -198,7 +198,7 @@ Format as JSON array matching input order. Be concise and actionable. class TestExampleEnhancer(AIEnhancer): """Enhance test examples with AI analysis""" - def enhance_examples(self, examples: List[Dict]) -> List[Dict]: + def enhance_examples(self, examples: list[dict]) -> list[dict]: """ Enhance test examples with AI context and explanations. @@ -218,21 +218,21 @@ class TestExampleEnhancer(AIEnhancer): enhanced = [] for i in range(0, len(examples), batch_size): - batch = examples[i:i+batch_size] + batch = examples[i : i + batch_size] batch_results = self._enhance_example_batch(batch) enhanced.extend(batch_results) logger.info(f"āœ… Enhanced {len(enhanced)} examples") return enhanced - def _enhance_example_batch(self, examples: List[Dict]) -> List[Dict]: + def _enhance_example_batch(self, examples: list[dict]) -> list[dict]: """Enhance a batch of examples""" # Prepare prompt example_descriptions = [] for idx, ex in enumerate(examples): - desc = f"{idx+1}. {ex.get('category', 'unknown')} - {ex.get('test_name', 'unknown')}" + desc = f"{idx + 1}. {ex.get('category', 'unknown')} - {ex.get('test_name', 'unknown')}" desc += f"\n Code: {ex.get('code', '')[:100]}..." - if ex.get('expected_behavior'): + if ex.get("expected_behavior"): desc += f"\n Expected: {ex['expected_behavior']}" example_descriptions.append(desc) @@ -257,18 +257,19 @@ Format as JSON array matching input order. Focus on educational value. try: import json + analyses = json.loads(response) # Merge AI analysis into examples for idx, example in enumerate(examples): if idx < len(analyses): analysis = analyses[idx] - example['ai_analysis'] = { - 'explanation': analysis.get('explanation', ''), - 'best_practices': analysis.get('best_practices', []), - 'common_mistakes': analysis.get('common_mistakes', []), - 'related_examples': analysis.get('related_examples', []), - 'tutorial_group': analysis.get('tutorial_group', '') + example["ai_analysis"] = { + "explanation": analysis.get("explanation", ""), + "best_practices": analysis.get("best_practices", []), + "common_mistakes": analysis.get("common_mistakes", []), + "related_examples": analysis.get("related_examples", []), + "tutorial_group": analysis.get("tutorial_group", ""), } return examples @@ -280,7 +281,7 @@ Format as JSON array matching input order. Focus on educational value. logger.warning(f"āš ļø Error processing AI analysis: {e}") return examples - def generate_tutorials(self, examples: List[Dict]) -> Dict[str, List[Dict]]: + def generate_tutorials(self, examples: list[dict]) -> dict[str, list[dict]]: """ Group enhanced examples into tutorial sections. @@ -293,8 +294,8 @@ Format as JSON array matching input order. Focus on educational value. tutorials = {} for example in examples: - ai_analysis = example.get('ai_analysis', {}) - group = ai_analysis.get('tutorial_group', 'Miscellaneous') + ai_analysis = example.get("ai_analysis", {}) + group = ai_analysis.get("tutorial_group", "Miscellaneous") if group not in tutorials: tutorials[group] = [] diff --git a/src/skill_seekers/cli/api_reference_builder.py b/src/skill_seekers/cli/api_reference_builder.py index 1264a31..670f602 100644 --- a/src/skill_seekers/cli/api_reference_builder.py +++ b/src/skill_seekers/cli/api_reference_builder.py @@ -17,10 +17,9 @@ Usage: builder.build_reference(output_dir) """ -import os import json from pathlib import Path -from typing import Dict, List, Any, Optional +from typing import Any class APIReferenceBuilder: @@ -31,7 +30,7 @@ class APIReferenceBuilder: documentation for each analyzed source file. """ - def __init__(self, code_analysis: Dict[str, Any]): + def __init__(self, code_analysis: dict[str, Any]): """ Initialize builder with code analysis results. @@ -40,9 +39,9 @@ class APIReferenceBuilder: Expected format: {'files': [{'file': 'path', 'classes': [...], 'functions': [...]}]} """ self.code_analysis = code_analysis - self.files_data = code_analysis.get('files', []) + self.files_data = code_analysis.get("files", []) - def build_reference(self, output_dir: Path) -> Dict[str, Path]: + def build_reference(self, output_dir: Path) -> dict[str, Path]: """ Generate markdown files for each analyzed source file. @@ -58,11 +57,11 @@ class APIReferenceBuilder: generated_files = {} for file_data in self.files_data: - source_file = file_data.get('file', 'unknown') - language = file_data.get('language', 'Unknown') + source_file = file_data.get("file", "unknown") + language = file_data.get("language", "Unknown") # Skip files with no analysis - if not file_data.get('classes') and not file_data.get('functions'): + if not file_data.get("classes") and not file_data.get("functions"): continue # Generate markdown content @@ -73,7 +72,7 @@ class APIReferenceBuilder: output_path = output_dir / output_filename # Write markdown file - output_path.write_text(markdown_content, encoding='utf-8') + output_path.write_text(markdown_content, encoding="utf-8") generated_files[source_file] = output_path return generated_files @@ -92,11 +91,10 @@ class APIReferenceBuilder: basename = Path(source_file).name # Replace extension with .md - name_without_ext = basename.rsplit('.', 1)[0] if '.' in basename else basename + name_without_ext = basename.rsplit(".", 1)[0] if "." in basename else basename return f"{name_without_ext}.md" - def _generate_file_reference(self, file_data: Dict[str, Any], - source_file: str, language: str) -> str: + def _generate_file_reference(self, file_data: dict[str, Any], source_file: str, language: str) -> str: """ Generate complete markdown reference for a single file. @@ -118,7 +116,7 @@ class APIReferenceBuilder: lines.append("---\n") # Classes section - classes = file_data.get('classes', []) + classes = file_data.get("classes", []) if classes: lines.append("## Classes\n") for cls in classes: @@ -126,16 +124,16 @@ class APIReferenceBuilder: lines.append("\n") # Functions section - functions = file_data.get('functions', []) + functions = file_data.get("functions", []) if functions: lines.append("## Functions\n") for func in functions: lines.append(self._format_function(func)) lines.append("\n") - return '\n'.join(lines) + return "\n".join(lines) - def _format_class(self, class_sig: Dict[str, Any]) -> str: + def _format_class(self, class_sig: dict[str, Any]) -> str: """ Format class signature as markdown. @@ -148,33 +146,33 @@ class APIReferenceBuilder: lines = [] # Class name - class_name = class_sig.get('name', 'Unknown') + class_name = class_sig.get("name", "Unknown") lines.append(f"### {class_name}\n") # Docstring - docstring = class_sig.get('docstring') + docstring = class_sig.get("docstring") if docstring: lines.append(f"{docstring}\n") # Inheritance - base_classes = class_sig.get('base_classes', []) + base_classes = class_sig.get("base_classes", []) if base_classes: - bases_str = ', '.join(base_classes) + bases_str = ", ".join(base_classes) lines.append(f"**Inherits from**: {bases_str}\n") else: lines.append("**Inherits from**: (none)\n") # Methods - methods = class_sig.get('methods', []) + methods = class_sig.get("methods", []) if methods: lines.append("#### Methods\n") for method in methods: lines.append(self._format_method(method)) lines.append("") - return '\n'.join(lines) + return "\n".join(lines) - def _format_method(self, method_sig: Dict[str, Any]) -> str: + def _format_method(self, method_sig: dict[str, Any]) -> str: """ Format method signature as markdown. @@ -191,30 +189,30 @@ class APIReferenceBuilder: lines.append(f"##### {signature}\n") # Docstring - docstring = method_sig.get('docstring') + docstring = method_sig.get("docstring") if docstring: lines.append(f"{docstring}\n") # Decorators - decorators = method_sig.get('decorators', []) + decorators = method_sig.get("decorators", []) if decorators: - dec_str = ', '.join(f"`@{d}`" for d in decorators) + dec_str = ", ".join(f"`@{d}`" for d in decorators) lines.append(f"**Decorators**: {dec_str}\n") # Parameters table - params = method_sig.get('parameters', []) + params = method_sig.get("parameters", []) if params: lines.append(self._format_parameters(params)) lines.append("") # Return type - return_type = method_sig.get('return_type') + return_type = method_sig.get("return_type") if return_type: lines.append(f"**Returns**: `{return_type}`\n") - return '\n'.join(lines) + return "\n".join(lines) - def _format_function(self, func_sig: Dict[str, Any]) -> str: + def _format_function(self, func_sig: dict[str, Any]) -> str: """ Format function signature as markdown. @@ -231,30 +229,30 @@ class APIReferenceBuilder: lines.append(f"### {signature}\n") # Async indicator - if func_sig.get('is_async'): + if func_sig.get("is_async"): lines.append("**Async function**\n") # Docstring - docstring = func_sig.get('docstring') + docstring = func_sig.get("docstring") if docstring: lines.append(f"{docstring}\n") # Parameters table - params = func_sig.get('parameters', []) + params = func_sig.get("parameters", []) if params: lines.append(self._format_parameters(params)) lines.append("") # Return type - return_type = func_sig.get('return_type') + return_type = func_sig.get("return_type") if return_type: lines.append(f"**Returns**: `{return_type}`\n") else: lines.append("**Returns**: (none)\n") - return '\n'.join(lines) + return "\n".join(lines) - def _build_signature(self, sig: Dict[str, Any]) -> str: + def _build_signature(self, sig: dict[str, Any]) -> str: """ Build function/method signature string. @@ -264,28 +262,28 @@ class APIReferenceBuilder: Returns: Formatted signature string """ - name = sig.get('name', 'unknown') - params = sig.get('parameters', []) - return_type = sig.get('return_type') + name = sig.get("name", "unknown") + params = sig.get("parameters", []) + return_type = sig.get("return_type") # Build parameter list param_strs = [] for param in params: - param_str = param.get('name', '') + param_str = param.get("name", "") # Add type hint if available - type_hint = param.get('type_hint') + type_hint = param.get("type_hint") if type_hint: param_str += f": {type_hint}" # Add default value if available - default = param.get('default') + default = param.get("default") if default: param_str += f" = {default}" param_strs.append(param_str) - params_str = ', '.join(param_strs) + params_str = ", ".join(param_strs) # Build full signature if return_type: @@ -293,7 +291,7 @@ class APIReferenceBuilder: else: return f"{name}({params_str})" - def _format_parameters(self, params: List[Dict]) -> str: + def _format_parameters(self, params: list[dict]) -> str: """ Format parameter list as markdown table. @@ -313,19 +311,19 @@ class APIReferenceBuilder: lines.append("|------|------|---------|-------------|") for param in params: - name = param.get('name', '-') - type_hint = param.get('type_hint', '-') - default = param.get('default') + name = param.get("name", "-") + type_hint = param.get("type_hint", "-") + default = param.get("default") # Show "-" for parameters without defaults - default_str = default if default is not None else '-' + default_str = default if default is not None else "-" # For description, use empty for now (would need JSDoc/docstring parsing) description = "-" lines.append(f"| {name} | {type_hint} | {default_str} | {description} |") - return '\n'.join(lines) + return "\n".join(lines) def main(): @@ -336,12 +334,10 @@ def main(): """ import argparse - parser = argparse.ArgumentParser( - description='Generate API reference from code analysis results' - ) + parser = argparse.ArgumentParser(description="Generate API reference from code analysis results") - parser.add_argument('input_file', help='Code analysis JSON file') - parser.add_argument('output_dir', help='Output directory for markdown files') + parser.add_argument("input_file", help="Code analysis JSON file") + parser.add_argument("output_dir", help="Output directory for markdown files") args = parser.parse_args() @@ -351,7 +347,7 @@ def main(): print(f"Error: Input file not found: {input_path}") return 1 - with open(input_path, 'r', encoding='utf-8') as f: + with open(input_path, encoding="utf-8") as f: code_analysis = json.load(f) # Build API reference @@ -367,6 +363,7 @@ def main(): return 0 -if __name__ == '__main__': +if __name__ == "__main__": import sys + sys.exit(main()) diff --git a/src/skill_seekers/cli/architectural_pattern_detector.py b/src/skill_seekers/cli/architectural_pattern_detector.py index bf1d38f..116bec5 100644 --- a/src/skill_seekers/cli/architectural_pattern_detector.py +++ b/src/skill_seekers/cli/architectural_pattern_detector.py @@ -21,11 +21,9 @@ Credits: """ import logging -import re -from dataclasses import dataclass, field -from pathlib import Path -from typing import List, Dict, Optional, Set from collections import defaultdict +from dataclasses import dataclass +from pathlib import Path logger = logging.getLogger(__name__) @@ -33,41 +31,43 @@ logger = logging.getLogger(__name__) @dataclass class ArchitecturalPattern: """Detected architectural pattern""" + pattern_name: str # e.g., "MVC", "MVVM", "Repository" confidence: float # 0.0-1.0 - evidence: List[str] # List of evidence supporting detection - components: Dict[str, List[str]] # Component type -> file paths - framework: Optional[str] = None # Detected framework (Django, Spring, etc.) + evidence: list[str] # List of evidence supporting detection + components: dict[str, list[str]] # Component type -> file paths + framework: str | None = None # Detected framework (Django, Spring, etc.) description: str = "" # Human-readable description @dataclass class ArchitecturalReport: """Complete architectural analysis report""" - patterns: List[ArchitecturalPattern] - directory_structure: Dict[str, int] # Directory name -> file count - total_files_analyzed: int - frameworks_detected: List[str] - ai_analysis: Optional[Dict] = None # AI enhancement (C3.6 integration) - def to_dict(self) -> Dict: + patterns: list[ArchitecturalPattern] + directory_structure: dict[str, int] # Directory name -> file count + total_files_analyzed: int + frameworks_detected: list[str] + ai_analysis: dict | None = None # AI enhancement (C3.6 integration) + + def to_dict(self) -> dict: """Export to dictionary""" return { - 'patterns': [ + "patterns": [ { - 'pattern_name': p.pattern_name, - 'confidence': p.confidence, - 'evidence': p.evidence, - 'components': p.components, - 'framework': p.framework, - 'description': p.description + "pattern_name": p.pattern_name, + "confidence": p.confidence, + "evidence": p.evidence, + "components": p.components, + "framework": p.framework, + "description": p.description, } for p in self.patterns ], - 'directory_structure': self.directory_structure, - 'total_files_analyzed': self.total_files_analyzed, - 'frameworks_detected': self.frameworks_detected, - 'ai_analysis': self.ai_analysis + "directory_structure": self.directory_structure, + "total_files_analyzed": self.total_files_analyzed, + "frameworks_detected": self.frameworks_detected, + "ai_analysis": self.ai_analysis, } @@ -79,25 +79,25 @@ class ArchitecturalPatternDetector: """ # Common directory patterns for architectures - MVC_DIRS = {'models', 'views', 'controllers', 'model', 'view', 'controller'} - MVVM_DIRS = {'models', 'views', 'viewmodels', 'viewmodel'} - LAYERED_DIRS = {'presentation', 'business', 'data', 'dal', 'bll', 'ui'} - CLEAN_ARCH_DIRS = {'domain', 'application', 'infrastructure', 'presentation'} - REPO_DIRS = {'repositories', 'repository'} - SERVICE_DIRS = {'services', 'service'} + MVC_DIRS = {"models", "views", "controllers", "model", "view", "controller"} + MVVM_DIRS = {"models", "views", "viewmodels", "viewmodel"} + LAYERED_DIRS = {"presentation", "business", "data", "dal", "bll", "ui"} + CLEAN_ARCH_DIRS = {"domain", "application", "infrastructure", "presentation"} + REPO_DIRS = {"repositories", "repository"} + SERVICE_DIRS = {"services", "service"} # Framework detection patterns FRAMEWORK_MARKERS = { - 'Django': ['django', 'manage.py', 'settings.py', 'urls.py'], - 'Flask': ['flask', 'app.py', 'wsgi.py'], - 'Spring': ['springframework', '@Controller', '@Service', '@Repository'], - 'ASP.NET': ['Controllers', 'Models', 'Views', '.cshtml', 'Startup.cs'], - 'Rails': ['app/models', 'app/views', 'app/controllers', 'config/routes.rb'], - 'Angular': ['app.module.ts', '@Component', '@Injectable', 'angular.json'], - 'React': ['package.json', 'react', 'components'], - 'Vue.js': ['vue', '.vue', 'components'], - 'Express': ['express', 'app.js', 'routes'], - 'Laravel': ['artisan', 'app/Http/Controllers', 'app/Models'] + "Django": ["django", "manage.py", "settings.py", "urls.py"], + "Flask": ["flask", "app.py", "wsgi.py"], + "Spring": ["springframework", "@Controller", "@Service", "@Repository"], + "ASP.NET": ["Controllers", "Models", "Views", ".cshtml", "Startup.cs"], + "Rails": ["app/models", "app/views", "app/controllers", "config/routes.rb"], + "Angular": ["app.module.ts", "@Component", "@Injectable", "angular.json"], + "React": ["package.json", "react", "components"], + "Vue.js": ["vue", ".vue", "components"], + "Express": ["express", "app.js", "routes"], + "Laravel": ["artisan", "app/Http/Controllers", "app/Models"], } def __init__(self, enhance_with_ai: bool = True): @@ -113,12 +113,13 @@ class ArchitecturalPatternDetector: if self.enhance_with_ai: try: from skill_seekers.cli.ai_enhancer import AIEnhancer + self.ai_enhancer = AIEnhancer() except Exception as e: logger.warning(f"āš ļø Failed to initialize AI enhancer: {e}") self.enhance_with_ai = False - def analyze(self, directory: Path, files_analysis: List[Dict]) -> ArchitecturalReport: + def analyze(self, directory: Path, files_analysis: list[dict]) -> ArchitecturalReport: """ Analyze codebase for architectural patterns. @@ -151,7 +152,7 @@ class ArchitecturalPatternDetector: patterns=patterns, directory_structure=dir_structure, total_files_analyzed=len(files_analysis), - frameworks_detected=frameworks + frameworks_detected=frameworks, ) # Enhance with AI if enabled (C3.6) @@ -161,11 +162,11 @@ class ArchitecturalPatternDetector: logger.info(f"āœ… Detected {len(patterns)} architectural patterns") return report - def _analyze_directory_structure(self, directory: Path) -> Dict[str, int]: + def _analyze_directory_structure(self, directory: Path) -> dict[str, int]: """Analyze directory structure and count files""" structure = defaultdict(int) - for path in directory.rglob('*'): + for path in directory.rglob("*"): if path.is_file(): # Get relative directory path rel_dir = path.parent.relative_to(directory) @@ -180,13 +181,13 @@ class ArchitecturalPatternDetector: return dict(structure) - def _detect_frameworks(self, directory: Path, files: List[Dict]) -> List[str]: + def _detect_frameworks(self, directory: Path, files: list[dict]) -> list[str]: """Detect frameworks being used""" detected = [] # Check file paths and content - all_paths = [str(f.get('file', '')) for f in files] - all_content = ' '.join(all_paths) + all_paths = [str(f.get("file", "")) for f in files] + all_content = " ".join(all_paths) for framework, markers in self.FRAMEWORK_MARKERS.items(): matches = sum(1 for marker in markers if marker.lower() in all_content.lower()) @@ -196,7 +197,7 @@ class ArchitecturalPatternDetector: return detected - def _detect_mvc(self, dirs: Dict[str, int], files: List[Dict], frameworks: List[str]) -> List[ArchitecturalPattern]: + def _detect_mvc(self, dirs: dict[str, int], files: list[dict], frameworks: list[str]) -> list[ArchitecturalPattern]: """Detect MVC pattern""" patterns = [] @@ -213,58 +214,62 @@ class ArchitecturalPatternDetector: # Find MVC files for file in files: - file_path = str(file.get('file', '')).lower() + file_path = str(file.get("file", "")).lower() - if 'model' in file_path and ('models/' in file_path or '/model/' in file_path): - components['Models'].append(file.get('file', '')) - if len(components['Models']) == 1: + if "model" in file_path and ("models/" in file_path or "/model/" in file_path): + components["Models"].append(file.get("file", "")) + if len(components["Models"]) == 1: evidence.append("Models directory with model classes") - if 'view' in file_path and ('views/' in file_path or '/view/' in file_path): - components['Views'].append(file.get('file', '')) - if len(components['Views']) == 1: + if "view" in file_path and ("views/" in file_path or "/view/" in file_path): + components["Views"].append(file.get("file", "")) + if len(components["Views"]) == 1: evidence.append("Views directory with view files") - if 'controller' in file_path and ('controllers/' in file_path or '/controller/' in file_path): - components['Controllers'].append(file.get('file', '')) - if len(components['Controllers']) == 1: + if "controller" in file_path and ("controllers/" in file_path or "/controller/" in file_path): + components["Controllers"].append(file.get("file", "")) + if len(components["Controllers"]) == 1: evidence.append("Controllers directory with controller classes") # Calculate confidence - has_models = len(components['Models']) > 0 - has_views = len(components['Views']) > 0 - has_controllers = len(components['Controllers']) > 0 + has_models = len(components["Models"]) > 0 + has_views = len(components["Views"]) > 0 + has_controllers = len(components["Controllers"]) > 0 if sum([has_models, has_views, has_controllers]) >= 2: confidence = 0.6 + (sum([has_models, has_views, has_controllers]) * 0.15) # Boost confidence if framework detected framework = None - for fw in ['Django', 'Flask', 'Spring', 'ASP.NET', 'Rails', 'Laravel']: + for fw in ["Django", "Flask", "Spring", "ASP.NET", "Rails", "Laravel"]: if fw in frameworks: confidence = min(0.95, confidence + 0.1) framework = fw evidence.append(f"{fw} framework detected (uses MVC)") break - patterns.append(ArchitecturalPattern( - pattern_name="MVC (Model-View-Controller)", - confidence=confidence, - evidence=evidence, - components=dict(components), - framework=framework, - description="Separates application into Models (data), Views (UI), and Controllers (logic)" - )) + patterns.append( + ArchitecturalPattern( + pattern_name="MVC (Model-View-Controller)", + confidence=confidence, + evidence=evidence, + components=dict(components), + framework=framework, + description="Separates application into Models (data), Views (UI), and Controllers (logic)", + ) + ) return patterns - def _detect_mvvm(self, dirs: Dict[str, int], files: List[Dict], frameworks: List[str]) -> List[ArchitecturalPattern]: + def _detect_mvvm( + self, dirs: dict[str, int], files: list[dict], frameworks: list[str] + ) -> list[ArchitecturalPattern]: """Detect MVVM pattern""" patterns = [] # Look for ViewModels directory or classes ending with ViewModel - has_viewmodel_dir = 'viewmodels' in dirs or 'viewmodel' in dirs - viewmodel_files = [f for f in files if 'viewmodel' in str(f.get('file', '')).lower()] + has_viewmodel_dir = "viewmodels" in dirs or "viewmodel" in dirs + viewmodel_files = [f for f in files if "viewmodel" in str(f.get("file", "")).lower()] if not (has_viewmodel_dir or len(viewmodel_files) >= 2): return patterns @@ -274,63 +279,68 @@ class ArchitecturalPatternDetector: # Find MVVM files for file in files: - file_path = str(file.get('file', '')).lower() - classes = file.get('classes', []) + file_path = str(file.get("file", "")).lower() + classes = file.get("classes", []) - if 'model' in file_path and 'viewmodel' not in file_path: - components['Models'].append(file.get('file', '')) + if "model" in file_path and "viewmodel" not in file_path: + components["Models"].append(file.get("file", "")) - if 'view' in file_path: - components['Views'].append(file.get('file', '')) + if "view" in file_path: + components["Views"].append(file.get("file", "")) - if 'viewmodel' in file_path or any('viewmodel' in c.get('name', '').lower() for c in classes): - components['ViewModels'].append(file.get('file', '')) + if "viewmodel" in file_path or any("viewmodel" in c.get("name", "").lower() for c in classes): + components["ViewModels"].append(file.get("file", "")) - if len(components['ViewModels']) >= 2: + if len(components["ViewModels"]) >= 2: evidence.append(f"ViewModels directory with {len(components['ViewModels'])} ViewModel classes") - if len(components['Views']) >= 2: + if len(components["Views"]) >= 2: evidence.append(f"Views directory with {len(components['Views'])} view files") - if len(components['Models']) >= 1: + if len(components["Models"]) >= 1: evidence.append(f"Models directory with {len(components['Models'])} model files") # Calculate confidence - has_models = len(components['Models']) > 0 - has_views = len(components['Views']) > 0 - has_viewmodels = len(components['ViewModels']) >= 2 + has_models = len(components["Models"]) > 0 + has_views = len(components["Views"]) > 0 + has_viewmodels = len(components["ViewModels"]) >= 2 if has_viewmodels and (has_models or has_views): confidence = 0.7 if (has_models and has_views and has_viewmodels) else 0.6 framework = None - for fw in ['ASP.NET', 'Angular', 'Vue.js']: + for fw in ["ASP.NET", "Angular", "Vue.js"]: if fw in frameworks: confidence = min(0.95, confidence + 0.1) framework = fw evidence.append(f"{fw} framework detected (supports MVVM)") break - patterns.append(ArchitecturalPattern( - pattern_name="MVVM (Model-View-ViewModel)", - confidence=confidence, - evidence=evidence, - components=dict(components), - framework=framework, - description="ViewModels provide data-binding between Views and Models" - )) + patterns.append( + ArchitecturalPattern( + pattern_name="MVVM (Model-View-ViewModel)", + confidence=confidence, + evidence=evidence, + components=dict(components), + framework=framework, + description="ViewModels provide data-binding between Views and Models", + ) + ) return patterns - def _detect_repository(self, dirs: Dict[str, int], files: List[Dict]) -> List[ArchitecturalPattern]: + def _detect_repository(self, dirs: dict[str, int], files: list[dict]) -> list[ArchitecturalPattern]: """Detect Repository pattern""" patterns = [] # Look for repositories directory or classes ending with Repository has_repo_dir = any(d in dirs for d in self.REPO_DIRS) - repo_files = [f for f in files - if 'repository' in str(f.get('file', '')).lower() or - any('repository' in c.get('name', '').lower() for c in f.get('classes', []))] + repo_files = [ + f + for f in files + if "repository" in str(f.get("file", "")).lower() + or any("repository" in c.get("name", "").lower() for c in f.get("classes", [])) + ] if not (has_repo_dir or len(repo_files) >= 2): return patterns @@ -339,30 +349,35 @@ class ArchitecturalPatternDetector: components = defaultdict(list) for file in repo_files: - components['Repositories'].append(file.get('file', '')) + components["Repositories"].append(file.get("file", "")) - if len(components['Repositories']) >= 2: + if len(components["Repositories"]) >= 2: evidence.append(f"Repository pattern: {len(components['Repositories'])} repository classes") evidence.append("Repositories abstract data access logic") - patterns.append(ArchitecturalPattern( - pattern_name="Repository Pattern", - confidence=0.75, - evidence=evidence, - components=dict(components), - description="Encapsulates data access logic in repository classes" - )) + patterns.append( + ArchitecturalPattern( + pattern_name="Repository Pattern", + confidence=0.75, + evidence=evidence, + components=dict(components), + description="Encapsulates data access logic in repository classes", + ) + ) return patterns - def _detect_service_layer(self, dirs: Dict[str, int], files: List[Dict]) -> List[ArchitecturalPattern]: + def _detect_service_layer(self, dirs: dict[str, int], files: list[dict]) -> list[ArchitecturalPattern]: """Detect Service Layer pattern""" patterns = [] has_service_dir = any(d in dirs for d in self.SERVICE_DIRS) - service_files = [f for f in files - if 'service' in str(f.get('file', '')).lower() or - any('service' in c.get('name', '').lower() for c in f.get('classes', []))] + service_files = [ + f + for f in files + if "service" in str(f.get("file", "")).lower() + or any("service" in c.get("name", "").lower() for c in f.get("classes", [])) + ] if not (has_service_dir or len(service_files) >= 3): return patterns @@ -371,23 +386,25 @@ class ArchitecturalPatternDetector: components = defaultdict(list) for file in service_files: - components['Services'].append(file.get('file', '')) + components["Services"].append(file.get("file", "")) - if len(components['Services']) >= 3: + if len(components["Services"]) >= 3: evidence.append(f"Service layer: {len(components['Services'])} service classes") evidence.append("Services encapsulate business logic") - patterns.append(ArchitecturalPattern( - pattern_name="Service Layer Pattern", - confidence=0.75, - evidence=evidence, - components=dict(components), - description="Encapsulates business logic in service classes" - )) + patterns.append( + ArchitecturalPattern( + pattern_name="Service Layer Pattern", + confidence=0.75, + evidence=evidence, + components=dict(components), + description="Encapsulates business logic in service classes", + ) + ) return patterns - def _detect_layered_architecture(self, dirs: Dict[str, int], files: List[Dict]) -> List[ArchitecturalPattern]: + def _detect_layered_architecture(self, dirs: dict[str, int], files: list[dict]) -> list[ArchitecturalPattern]: """Detect Layered Architecture (3-tier, N-tier)""" patterns = [] @@ -400,32 +417,34 @@ class ArchitecturalPatternDetector: components = defaultdict(list) layers_found = [] - if 'presentation' in dirs or 'ui' in dirs: + if "presentation" in dirs or "ui" in dirs: layers_found.append("Presentation Layer") evidence.append("Presentation/UI layer detected") - if 'business' in dirs or 'bll' in dirs: + if "business" in dirs or "bll" in dirs: layers_found.append("Business Logic Layer") evidence.append("Business logic layer detected") - if 'data' in dirs or 'dal' in dirs: + if "data" in dirs or "dal" in dirs: layers_found.append("Data Access Layer") evidence.append("Data access layer detected") if len(layers_found) >= 2: confidence = 0.65 + (len(layers_found) * 0.1) - patterns.append(ArchitecturalPattern( - pattern_name=f"Layered Architecture ({len(layers_found)}-tier)", - confidence=min(confidence, 0.9), - evidence=evidence, - components={'Layers': layers_found}, - description=f"Separates concerns into {len(layers_found)} distinct layers" - )) + patterns.append( + ArchitecturalPattern( + pattern_name=f"Layered Architecture ({len(layers_found)}-tier)", + confidence=min(confidence, 0.9), + evidence=evidence, + components={"Layers": layers_found}, + description=f"Separates concerns into {len(layers_found)} distinct layers", + ) + ) return patterns - def _detect_clean_architecture(self, dirs: Dict[str, int], files: List[Dict]) -> List[ArchitecturalPattern]: + def _detect_clean_architecture(self, dirs: dict[str, int], files: list[dict]) -> list[ArchitecturalPattern]: """Detect Clean Architecture""" patterns = [] @@ -437,50 +456,52 @@ class ArchitecturalPatternDetector: evidence = [] components = defaultdict(list) - if 'domain' in dirs: + if "domain" in dirs: evidence.append("Domain layer (core business logic)") - components['Domain'].append('domain/') + components["Domain"].append("domain/") - if 'application' in dirs: + if "application" in dirs: evidence.append("Application layer (use cases)") - components['Application'].append('application/') + components["Application"].append("application/") - if 'infrastructure' in dirs: + if "infrastructure" in dirs: evidence.append("Infrastructure layer (external dependencies)") - components['Infrastructure'].append('infrastructure/') + components["Infrastructure"].append("infrastructure/") - if 'presentation' in dirs: + if "presentation" in dirs: evidence.append("Presentation layer (UI/API)") - components['Presentation'].append('presentation/') + components["Presentation"].append("presentation/") if len(components) >= 3: - patterns.append(ArchitecturalPattern( - pattern_name="Clean Architecture", - confidence=0.85, - evidence=evidence, - components=dict(components), - description="Dependency inversion with domain at center, infrastructure at edges" - )) + patterns.append( + ArchitecturalPattern( + pattern_name="Clean Architecture", + confidence=0.85, + evidence=evidence, + components=dict(components), + description="Dependency inversion with domain at center, infrastructure at edges", + ) + ) return patterns - def _enhance_with_ai(self, report: ArchitecturalReport) -> Dict: + def _enhance_with_ai(self, report: ArchitecturalReport) -> dict: """Enhance architectural analysis with AI insights""" if not self.ai_enhancer: return {} # Prepare summary for AI summary = f"""Detected {len(report.patterns)} architectural patterns: -{chr(10).join(f'- {p.pattern_name} (confidence: {p.confidence:.2f})' for p in report.patterns)} +{chr(10).join(f"- {p.pattern_name} (confidence: {p.confidence:.2f})" for p in report.patterns)} -Frameworks: {', '.join(report.frameworks_detected) if report.frameworks_detected else 'None'} +Frameworks: {", ".join(report.frameworks_detected) if report.frameworks_detected else "None"} Total files: {report.total_files_analyzed} Provide brief architectural insights and recommendations.""" try: response = self.ai_enhancer._call_claude(summary, max_tokens=500) - return {'insights': response} if response else {} + return {"insights": response} if response else {} except Exception as e: logger.warning(f"āš ļø AI enhancement failed: {e}") return {} diff --git a/src/skill_seekers/cli/code_analyzer.py b/src/skill_seekers/cli/code_analyzer.py index 1d6ed3b..10c0ca1 100644 --- a/src/skill_seekers/cli/code_analyzer.py +++ b/src/skill_seekers/cli/code_analyzer.py @@ -23,10 +23,10 @@ consider using dedicated parsers (tree-sitter, language-specific AST libraries). """ import ast -import re import logging -from typing import Dict, List, Any, Optional -from dataclasses import dataclass, asdict +import re +from dataclasses import asdict, dataclass +from typing import Any logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) @@ -35,22 +35,24 @@ logger = logging.getLogger(__name__) @dataclass class Parameter: """Represents a function parameter.""" + name: str - type_hint: Optional[str] = None - default: Optional[str] = None + type_hint: str | None = None + default: str | None = None @dataclass class FunctionSignature: """Represents a function/method signature.""" + name: str - parameters: List[Parameter] - return_type: Optional[str] = None - docstring: Optional[str] = None - line_number: Optional[int] = None + parameters: list[Parameter] + return_type: str | None = None + docstring: str | None = None + line_number: int | None = None is_async: bool = False is_method: bool = False - decorators: List[str] = None + decorators: list[str] = None def __post_init__(self): if self.decorators is None: @@ -60,11 +62,12 @@ class FunctionSignature: @dataclass class ClassSignature: """Represents a class signature.""" + name: str - base_classes: List[str] - methods: List[FunctionSignature] - docstring: Optional[str] = None - line_number: Optional[int] = None + base_classes: list[str] + methods: list[FunctionSignature] + docstring: str | None = None + line_number: int | None = None class CodeAnalyzer: @@ -72,7 +75,7 @@ class CodeAnalyzer: Analyzes code at different depth levels. """ - def __init__(self, depth: str = 'surface'): + def __init__(self, depth: str = "surface"): """ Initialize code analyzer. @@ -81,7 +84,7 @@ class CodeAnalyzer: """ self.depth = depth - def analyze_file(self, file_path: str, content: str, language: str) -> Dict[str, Any]: + def analyze_file(self, file_path: str, content: str, language: str) -> dict[str, Any]: """ Analyze a single file based on depth level. @@ -93,29 +96,29 @@ class CodeAnalyzer: Returns: Dict containing extracted signatures """ - if self.depth == 'surface': + if self.depth == "surface": return {} # Surface level doesn't analyze individual files logger.debug(f"Analyzing {file_path} (language: {language}, depth: {self.depth})") try: - if language == 'Python': + if language == "Python": return self._analyze_python(content, file_path) - elif language in ['JavaScript', 'TypeScript']: + elif language in ["JavaScript", "TypeScript"]: return self._analyze_javascript(content, file_path) - elif language in ['C', 'C++']: + elif language in ["C", "C++"]: return self._analyze_cpp(content, file_path) - elif language == 'C#': + elif language == "C#": return self._analyze_csharp(content, file_path) - elif language == 'Go': + elif language == "Go": return self._analyze_go(content, file_path) - elif language == 'Rust': + elif language == "Rust": return self._analyze_rust(content, file_path) - elif language == 'Java': + elif language == "Java": return self._analyze_java(content, file_path) - elif language == 'Ruby': + elif language == "Ruby": return self._analyze_ruby(content, file_path) - elif language == 'PHP': + elif language == "PHP": return self._analyze_php(content, file_path) else: logger.debug(f"No analyzer for language: {language}") @@ -124,7 +127,7 @@ class CodeAnalyzer: logger.warning(f"Error analyzing {file_path}: {e}") return {} - def _analyze_python(self, content: str, file_path: str) -> Dict[str, Any]: + def _analyze_python(self, content: str, file_path: str) -> dict[str, Any]: """Analyze Python file using AST.""" try: tree = ast.parse(content) @@ -144,9 +147,11 @@ class CodeAnalyzer: # Fix AST parser to check isinstance(parent.body, list) before 'in' operator is_method = False try: - is_method = any(isinstance(parent, ast.ClassDef) - for parent in ast.walk(tree) - if hasattr(parent, 'body') and isinstance(parent.body, list) and node in parent.body) + is_method = any( + isinstance(parent, ast.ClassDef) + for parent in ast.walk(tree) + if hasattr(parent, "body") and isinstance(parent.body, list) and node in parent.body + ) except (TypeError, AttributeError): # If body is not iterable or check fails, assume it's a top-level function is_method = False @@ -158,11 +163,7 @@ class CodeAnalyzer: # Extract comments comments = self._extract_python_comments(content) - return { - 'classes': classes, - 'functions': functions, - 'comments': comments - } + return {"classes": classes, "functions": functions, "comments": comments} def _extract_python_class(self, node: ast.ClassDef) -> ClassSignature: """Extract class signature from AST node.""" @@ -172,7 +173,7 @@ class CodeAnalyzer: if isinstance(base, ast.Name): bases.append(base.id) elif isinstance(base, ast.Attribute): - bases.append(f"{base.value.id}.{base.attr}" if hasattr(base.value, 'id') else base.attr) + bases.append(f"{base.value.id}.{base.attr}" if hasattr(base.value, "id") else base.attr) # Extract methods methods = [] @@ -185,11 +186,7 @@ class CodeAnalyzer: docstring = ast.get_docstring(node) return ClassSignature( - name=node.name, - base_classes=bases, - methods=methods, - docstring=docstring, - line_number=node.lineno + name=node.name, base_classes=bases, methods=methods, docstring=docstring, line_number=node.lineno ) def _extract_python_function(self, node, is_method: bool = False) -> FunctionSignature: @@ -199,12 +196,9 @@ class CodeAnalyzer: for arg in node.args.args: param_type = None if arg.annotation: - param_type = ast.unparse(arg.annotation) if hasattr(ast, 'unparse') else None + param_type = ast.unparse(arg.annotation) if hasattr(ast, "unparse") else None - params.append(Parameter( - name=arg.arg, - type_hint=param_type - )) + params.append(Parameter(name=arg.arg, type_hint=param_type)) # Extract defaults defaults = node.args.defaults @@ -215,7 +209,7 @@ class CodeAnalyzer: param_idx = num_no_default + i if param_idx < len(params): try: - params[param_idx].default = ast.unparse(default) if hasattr(ast, 'unparse') else str(default) + params[param_idx].default = ast.unparse(default) if hasattr(ast, "unparse") else str(default) except: params[param_idx].default = "..." @@ -223,7 +217,7 @@ class CodeAnalyzer: return_type = None if node.returns: try: - return_type = ast.unparse(node.returns) if hasattr(ast, 'unparse') else None + return_type = ast.unparse(node.returns) if hasattr(ast, "unparse") else None except: pass @@ -231,7 +225,7 @@ class CodeAnalyzer: decorators = [] for decorator in node.decorator_list: try: - if hasattr(ast, 'unparse'): + if hasattr(ast, "unparse"): decorators.append(ast.unparse(decorator)) elif isinstance(decorator, ast.Name): decorators.append(decorator.id) @@ -249,10 +243,10 @@ class CodeAnalyzer: line_number=node.lineno, is_async=isinstance(node, ast.AsyncFunctionDef), is_method=is_method, - decorators=decorators + decorators=decorators, ) - def _analyze_javascript(self, content: str, file_path: str) -> Dict[str, Any]: + def _analyze_javascript(self, content: str, file_path: str) -> dict[str, Any]: """ Analyze JavaScript/TypeScript file using regex patterns. @@ -263,7 +257,7 @@ class CodeAnalyzer: functions = [] # Extract class definitions - class_pattern = r'class\s+(\w+)(?:\s+extends\s+(\w+))?\s*\{' + class_pattern = r"class\s+(\w+)(?:\s+extends\s+(\w+))?\s*\{" for match in re.finditer(class_pattern, content): class_name = match.group(1) base_class = match.group(2) if match.group(2) else None @@ -271,101 +265,105 @@ class CodeAnalyzer: # Try to extract methods (simplified) class_block_start = match.end() # This is a simplification - proper parsing would track braces - class_block_end = content.find('}', class_block_start) + class_block_end = content.find("}", class_block_start) if class_block_end != -1: class_body = content[class_block_start:class_block_end] methods = self._extract_js_methods(class_body) else: methods = [] - classes.append({ - 'name': class_name, - 'base_classes': [base_class] if base_class else [], - 'methods': methods, - 'docstring': None, - 'line_number': content[:match.start()].count('\n') + 1 - }) + classes.append( + { + "name": class_name, + "base_classes": [base_class] if base_class else [], + "methods": methods, + "docstring": None, + "line_number": content[: match.start()].count("\n") + 1, + } + ) # Extract top-level functions - func_pattern = r'(?:async\s+)?function\s+(\w+)\s*\(([^)]*)\)' + func_pattern = r"(?:async\s+)?function\s+(\w+)\s*\(([^)]*)\)" for match in re.finditer(func_pattern, content): func_name = match.group(1) params_str = match.group(2) - is_async = 'async' in match.group(0) + is_async = "async" in match.group(0) params = self._parse_js_parameters(params_str) - functions.append({ - 'name': func_name, - 'parameters': params, - 'return_type': None, # JS doesn't have type annotations (unless TS) - 'docstring': None, - 'line_number': content[:match.start()].count('\n') + 1, - 'is_async': is_async, - 'is_method': False, - 'decorators': [] - }) + functions.append( + { + "name": func_name, + "parameters": params, + "return_type": None, # JS doesn't have type annotations (unless TS) + "docstring": None, + "line_number": content[: match.start()].count("\n") + 1, + "is_async": is_async, + "is_method": False, + "decorators": [], + } + ) # Extract arrow functions assigned to const/let - arrow_pattern = r'(?:const|let|var)\s+(\w+)\s*=\s*(?:async\s+)?\(([^)]*)\)\s*=>' + arrow_pattern = r"(?:const|let|var)\s+(\w+)\s*=\s*(?:async\s+)?\(([^)]*)\)\s*=>" for match in re.finditer(arrow_pattern, content): func_name = match.group(1) params_str = match.group(2) - is_async = 'async' in match.group(0) + is_async = "async" in match.group(0) params = self._parse_js_parameters(params_str) - functions.append({ - 'name': func_name, - 'parameters': params, - 'return_type': None, - 'docstring': None, - 'line_number': content[:match.start()].count('\n') + 1, - 'is_async': is_async, - 'is_method': False, - 'decorators': [] - }) + functions.append( + { + "name": func_name, + "parameters": params, + "return_type": None, + "docstring": None, + "line_number": content[: match.start()].count("\n") + 1, + "is_async": is_async, + "is_method": False, + "decorators": [], + } + ) # Extract comments comments = self._extract_js_comments(content) - return { - 'classes': classes, - 'functions': functions, - 'comments': comments - } + return {"classes": classes, "functions": functions, "comments": comments} - def _extract_js_methods(self, class_body: str) -> List[Dict]: + def _extract_js_methods(self, class_body: str) -> list[dict]: """Extract method signatures from class body.""" methods = [] # Match method definitions - method_pattern = r'(?:async\s+)?(\w+)\s*\(([^)]*)\)' + method_pattern = r"(?:async\s+)?(\w+)\s*\(([^)]*)\)" for match in re.finditer(method_pattern, class_body): method_name = match.group(1) params_str = match.group(2) - is_async = 'async' in match.group(0) + is_async = "async" in match.group(0) # Skip constructor keyword detection - if method_name in ['if', 'for', 'while', 'switch']: + if method_name in ["if", "for", "while", "switch"]: continue params = self._parse_js_parameters(params_str) - methods.append({ - 'name': method_name, - 'parameters': params, - 'return_type': None, - 'docstring': None, - 'line_number': None, - 'is_async': is_async, - 'is_method': True, - 'decorators': [] - }) + methods.append( + { + "name": method_name, + "parameters": params, + "return_type": None, + "docstring": None, + "line_number": None, + "is_async": is_async, + "is_method": True, + "decorators": [], + } + ) return methods - def _parse_js_parameters(self, params_str: str) -> List[Dict]: + def _parse_js_parameters(self, params_str: str) -> list[dict]: """Parse JavaScript parameter string.""" params = [] @@ -373,15 +371,15 @@ class CodeAnalyzer: return params # Split by comma (simplified - doesn't handle complex default values) - param_list = [p.strip() for p in params_str.split(',')] + param_list = [p.strip() for p in params_str.split(",")] for param in param_list: if not param: continue # Check for default value - if '=' in param: - name, default = param.split('=', 1) + if "=" in param: + name, default = param.split("=", 1) name = name.strip() default = default.strip() else: @@ -390,20 +388,16 @@ class CodeAnalyzer: # Check for type annotation (TypeScript) type_hint = None - if ':' in name: - name, type_hint = name.split(':', 1) + if ":" in name: + name, type_hint = name.split(":", 1) name = name.strip() type_hint = type_hint.strip() - params.append({ - 'name': name, - 'type_hint': type_hint, - 'default': default - }) + params.append({"name": name, "type_hint": type_hint, "default": default}) return params - def _analyze_cpp(self, content: str, file_path: str) -> Dict[str, Any]: + def _analyze_cpp(self, content: str, file_path: str) -> dict[str, Any]: """ Analyze C/C++ header file using regex patterns. @@ -414,61 +408,61 @@ class CodeAnalyzer: functions = [] # Extract class definitions (simplified - doesn't handle nested classes) - class_pattern = r'class\s+(\w+)(?:\s*:\s*public\s+(\w+))?\s*\{' + class_pattern = r"class\s+(\w+)(?:\s*:\s*public\s+(\w+))?\s*\{" for match in re.finditer(class_pattern, content): class_name = match.group(1) base_class = match.group(2) if match.group(2) else None - classes.append({ - 'name': class_name, - 'base_classes': [base_class] if base_class else [], - 'methods': [], # Simplified - would need to parse class body - 'docstring': None, - 'line_number': content[:match.start()].count('\n') + 1 - }) + classes.append( + { + "name": class_name, + "base_classes": [base_class] if base_class else [], + "methods": [], # Simplified - would need to parse class body + "docstring": None, + "line_number": content[: match.start()].count("\n") + 1, + } + ) # Extract function declarations - func_pattern = r'(\w+(?:\s*\*|\s*&)?)\s+(\w+)\s*\(([^)]*)\)' + func_pattern = r"(\w+(?:\s*\*|\s*&)?)\s+(\w+)\s*\(([^)]*)\)" for match in re.finditer(func_pattern, content): return_type = match.group(1).strip() func_name = match.group(2) params_str = match.group(3) # Skip common keywords - if func_name in ['if', 'for', 'while', 'switch', 'return']: + if func_name in ["if", "for", "while", "switch", "return"]: continue params = self._parse_cpp_parameters(params_str) - functions.append({ - 'name': func_name, - 'parameters': params, - 'return_type': return_type, - 'docstring': None, - 'line_number': content[:match.start()].count('\n') + 1, - 'is_async': False, - 'is_method': False, - 'decorators': [] - }) + functions.append( + { + "name": func_name, + "parameters": params, + "return_type": return_type, + "docstring": None, + "line_number": content[: match.start()].count("\n") + 1, + "is_async": False, + "is_method": False, + "decorators": [], + } + ) # Extract comments comments = self._extract_cpp_comments(content) - return { - 'classes': classes, - 'functions': functions, - 'comments': comments - } + return {"classes": classes, "functions": functions, "comments": comments} - def _parse_cpp_parameters(self, params_str: str) -> List[Dict]: + def _parse_cpp_parameters(self, params_str: str) -> list[dict]: """Parse C++ parameter string.""" params = [] - if not params_str.strip() or params_str.strip() == 'void': + if not params_str.strip() or params_str.strip() == "void": return params # Split by comma (simplified) - param_list = [p.strip() for p in params_str.split(',')] + param_list = [p.strip() for p in params_str.split(",")] for param in param_list: if not param: @@ -476,8 +470,8 @@ class CodeAnalyzer: # Check for default value default = None - if '=' in param: - param, default = param.rsplit('=', 1) + if "=" in param: + param, default = param.rsplit("=", 1) param = param.strip() default = default.strip() @@ -485,21 +479,17 @@ class CodeAnalyzer: # Format: "type name" or "type* name" or "type& name" parts = param.split() if len(parts) >= 2: - param_type = ' '.join(parts[:-1]) + param_type = " ".join(parts[:-1]) param_name = parts[-1] else: param_type = param param_name = "unknown" - params.append({ - 'name': param_name, - 'type_hint': param_type, - 'default': default - }) + params.append({"name": param_name, "type_hint": param_type, "default": default}) return params - def _extract_python_comments(self, content: str) -> List[Dict]: + def _extract_python_comments(self, content: str) -> list[dict]: """ Extract Python comments (# style). @@ -511,21 +501,17 @@ class CodeAnalyzer: stripped = line.strip() # Skip shebang and encoding declarations - if stripped.startswith('#!') or stripped.startswith('#') and 'coding' in stripped: + if stripped.startswith("#!") or stripped.startswith("#") and "coding" in stripped: continue # Extract regular comments - if stripped.startswith('#'): + if stripped.startswith("#"): comment_text = stripped[1:].strip() - comments.append({ - 'line': i, - 'text': comment_text, - 'type': 'inline' - }) + comments.append({"line": i, "text": comment_text, "type": "inline"}) return comments - def _extract_js_comments(self, content: str) -> List[Dict]: + def _extract_js_comments(self, content: str) -> list[dict]: """ Extract JavaScript/TypeScript comments (// and /* */ styles). @@ -534,30 +520,22 @@ class CodeAnalyzer: comments = [] # Extract single-line comments (//) - for match in re.finditer(r'//(.+)$', content, re.MULTILINE): - line_num = content[:match.start()].count('\n') + 1 + for match in re.finditer(r"//(.+)$", content, re.MULTILINE): + line_num = content[: match.start()].count("\n") + 1 comment_text = match.group(1).strip() - comments.append({ - 'line': line_num, - 'text': comment_text, - 'type': 'inline' - }) + comments.append({"line": line_num, "text": comment_text, "type": "inline"}) # Extract multi-line comments (/* */) - for match in re.finditer(r'/\*(.+?)\*/', content, re.DOTALL): - start_line = content[:match.start()].count('\n') + 1 + for match in re.finditer(r"/\*(.+?)\*/", content, re.DOTALL): + start_line = content[: match.start()].count("\n") + 1 comment_text = match.group(1).strip() - comments.append({ - 'line': start_line, - 'text': comment_text, - 'type': 'block' - }) + comments.append({"line": start_line, "text": comment_text, "type": "block"}) return comments - def _extract_cpp_comments(self, content: str) -> List[Dict]: + def _extract_cpp_comments(self, content: str) -> list[dict]: """ Extract C++ comments (// and /* */ styles, same as JavaScript). @@ -566,7 +544,7 @@ class CodeAnalyzer: # C++ uses the same comment syntax as JavaScript return self._extract_js_comments(content) - def _analyze_csharp(self, content: str, file_path: str) -> Dict[str, Any]: + def _analyze_csharp(self, content: str, file_path: str) -> dict[str, Any]: """ Analyze C# file using regex patterns. @@ -581,15 +559,15 @@ class CodeAnalyzer: # Extract class definitions # Matches: [modifiers] class ClassName [: BaseClass] [, Interface] - class_pattern = r'(?:public|private|internal|protected)?\s*(?:static|abstract|sealed)?\s*class\s+(\w+)(?:\s*:\s*([\w\s,<>]+))?\s*\{' + class_pattern = r"(?:public|private|internal|protected)?\s*(?:static|abstract|sealed)?\s*class\s+(\w+)(?:\s*:\s*([\w\s,<>]+))?\s*\{" for match in re.finditer(class_pattern, content): class_name = match.group(1) - bases_str = match.group(2) if match.group(2) else '' + bases_str = match.group(2) if match.group(2) else "" # Parse base classes and interfaces base_classes = [] if bases_str: - base_classes = [b.strip() for b in bases_str.split(',')] + base_classes = [b.strip() for b in bases_str.split(",")] # Try to extract methods (simplified) class_block_start = match.end() @@ -597,9 +575,9 @@ class CodeAnalyzer: brace_count = 1 class_block_end = class_block_start for i, char in enumerate(content[class_block_start:], class_block_start): - if char == '{': + if char == "{": brace_count += 1 - elif char == '}': + elif char == "}": brace_count -= 1 if brace_count == 0: class_block_end = i @@ -611,81 +589,83 @@ class CodeAnalyzer: else: methods = [] - classes.append({ - 'name': class_name, - 'base_classes': base_classes, - 'methods': methods, - 'docstring': None, # Would need to extract XML doc comments - 'line_number': content[:match.start()].count('\n') + 1 - }) + classes.append( + { + "name": class_name, + "base_classes": base_classes, + "methods": methods, + "docstring": None, # Would need to extract XML doc comments + "line_number": content[: match.start()].count("\n") + 1, + } + ) # Extract top-level functions/methods # Matches: [modifiers] [async] ReturnType MethodName(params) - func_pattern = r'(?:public|private|internal|protected)?\s*(?:static|virtual|override|abstract)?\s*(?:async\s+)?(\w+(?:<[\w\s,]+>)?)\s+(\w+)\s*\(([^)]*)\)' + func_pattern = r"(?:public|private|internal|protected)?\s*(?:static|virtual|override|abstract)?\s*(?:async\s+)?(\w+(?:<[\w\s,]+>)?)\s+(\w+)\s*\(([^)]*)\)" for match in re.finditer(func_pattern, content): return_type = match.group(1).strip() func_name = match.group(2) params_str = match.group(3) - is_async = 'async' in match.group(0) + is_async = "async" in match.group(0) # Skip common keywords - if func_name in ['if', 'for', 'while', 'switch', 'return', 'using', 'namespace']: + if func_name in ["if", "for", "while", "switch", "return", "using", "namespace"]: continue params = self._parse_csharp_parameters(params_str) - functions.append({ - 'name': func_name, - 'parameters': params, - 'return_type': return_type, - 'docstring': None, - 'line_number': content[:match.start()].count('\n') + 1, - 'is_async': is_async, - 'is_method': False, - 'decorators': [] - }) + functions.append( + { + "name": func_name, + "parameters": params, + "return_type": return_type, + "docstring": None, + "line_number": content[: match.start()].count("\n") + 1, + "is_async": is_async, + "is_method": False, + "decorators": [], + } + ) # Extract comments comments = self._extract_csharp_comments(content) - return { - 'classes': classes, - 'functions': functions, - 'comments': comments - } + return {"classes": classes, "functions": functions, "comments": comments} - def _extract_csharp_methods(self, class_body: str) -> List[Dict]: + def _extract_csharp_methods(self, class_body: str) -> list[dict]: """Extract C# method signatures from class body.""" methods = [] # Match method definitions - method_pattern = r'(?:public|private|internal|protected)?\s*(?:static|virtual|override|abstract)?\s*(?:async\s+)?(\w+(?:<[\w\s,]+>)?)\s+(\w+)\s*\(([^)]*)\)' + method_pattern = r"(?:public|private|internal|protected)?\s*(?:static|virtual|override|abstract)?\s*(?:async\s+)?(\w+(?:<[\w\s,]+>)?)\s+(\w+)\s*\(([^)]*)\)" for match in re.finditer(method_pattern, class_body): return_type = match.group(1).strip() method_name = match.group(2) params_str = match.group(3) - is_async = 'async' in match.group(0) + is_async = "async" in match.group(0) # Skip keywords - if method_name in ['if', 'for', 'while', 'switch', 'get', 'set']: + if method_name in ["if", "for", "while", "switch", "get", "set"]: continue params = self._parse_csharp_parameters(params_str) - methods.append({ - 'name': method_name, - 'parameters': params, - 'return_type': return_type, - 'docstring': None, - 'line_number': None, - 'is_async': is_async, - 'is_method': True, - 'decorators': [] - }) + methods.append( + { + "name": method_name, + "parameters": params, + "return_type": return_type, + "docstring": None, + "line_number": None, + "is_async": is_async, + "is_method": True, + "decorators": [], + } + ) return methods - def _parse_csharp_parameters(self, params_str: str) -> List[Dict]: + def _parse_csharp_parameters(self, params_str: str) -> list[dict]: """Parse C# parameter string.""" params = [] @@ -693,7 +673,7 @@ class CodeAnalyzer: return params # Split by comma (simplified) - param_list = [p.strip() for p in params_str.split(',')] + param_list = [p.strip() for p in params_str.split(",")] for param in param_list: if not param: @@ -701,8 +681,8 @@ class CodeAnalyzer: # Check for default value default = None - if '=' in param: - param, default = param.split('=', 1) + if "=" in param: + param, default = param.split("=", 1) param = param.strip() default = default.strip() @@ -710,7 +690,7 @@ class CodeAnalyzer: parts = param.split() if len(parts) >= 2: # Remove ref/out modifiers - if parts[0] in ['ref', 'out', 'in', 'params']: + if parts[0] in ["ref", "out", "in", "params"]: parts = parts[1:] if len(parts) >= 2: @@ -723,46 +703,34 @@ class CodeAnalyzer: param_type = None param_name = param - params.append({ - 'name': param_name, - 'type_hint': param_type, - 'default': default - }) + params.append({"name": param_name, "type_hint": param_type, "default": default}) return params - def _extract_csharp_comments(self, content: str) -> List[Dict]: + def _extract_csharp_comments(self, content: str) -> list[dict]: """Extract C# comments (// and /* */ and /// XML docs).""" comments = [] # Single-line comments (//) - for match in re.finditer(r'//(.+)$', content, re.MULTILINE): - line_num = content[:match.start()].count('\n') + 1 + for match in re.finditer(r"//(.+)$", content, re.MULTILINE): + line_num = content[: match.start()].count("\n") + 1 comment_text = match.group(1).strip() # Distinguish XML doc comments (///) - comment_type = 'doc' if match.group(1).startswith('/') else 'inline' + comment_type = "doc" if match.group(1).startswith("/") else "inline" - comments.append({ - 'line': line_num, - 'text': comment_text.lstrip('/').strip(), - 'type': comment_type - }) + comments.append({"line": line_num, "text": comment_text.lstrip("/").strip(), "type": comment_type}) # Multi-line comments (/* */) - for match in re.finditer(r'/\*(.+?)\*/', content, re.DOTALL): - start_line = content[:match.start()].count('\n') + 1 + for match in re.finditer(r"/\*(.+?)\*/", content, re.DOTALL): + start_line = content[: match.start()].count("\n") + 1 comment_text = match.group(1).strip() - comments.append({ - 'line': start_line, - 'text': comment_text, - 'type': 'block' - }) + comments.append({"line": start_line, "text": comment_text, "type": "block"}) return comments - def _analyze_go(self, content: str, file_path: str) -> Dict[str, Any]: + def _analyze_go(self, content: str, file_path: str) -> dict[str, Any]: """ Analyze Go file using regex patterns. @@ -776,21 +744,23 @@ class CodeAnalyzer: functions = [] # Extract struct definitions (Go's equivalent of classes) - struct_pattern = r'type\s+(\w+)\s+struct\s*\{' + struct_pattern = r"type\s+(\w+)\s+struct\s*\{" for match in re.finditer(struct_pattern, content): struct_name = match.group(1) - classes.append({ - 'name': struct_name, - 'base_classes': [], # Go uses embedding, not inheritance - 'methods': [], # Methods extracted separately - 'docstring': None, - 'line_number': content[:match.start()].count('\n') + 1 - }) + classes.append( + { + "name": struct_name, + "base_classes": [], # Go uses embedding, not inheritance + "methods": [], # Methods extracted separately + "docstring": None, + "line_number": content[: match.start()].count("\n") + 1, + } + ) # Extract function definitions # Matches: func [receiver] name(params) [returns] - func_pattern = r'func\s+(?:\((\w+)\s+\*?(\w+)\)\s+)?(\w+)\s*\(([^)]*)\)(?:\s+\(([^)]+)\)|(?:\s+(\w+(?:\[.*?\])?(?:,\s*\w+)*)))?' + func_pattern = r"func\s+(?:\((\w+)\s+\*?(\w+)\)\s+)?(\w+)\s*\(([^)]*)\)(?:\s+\(([^)]+)\)|(?:\s+(\w+(?:\[.*?\])?(?:,\s*\w+)*)))?" for match in re.finditer(func_pattern, content): receiver_var = match.group(1) receiver_type = match.group(2) @@ -811,27 +781,25 @@ class CodeAnalyzer: params = self._parse_go_parameters(params_str) - functions.append({ - 'name': func_name, - 'parameters': params, - 'return_type': return_type, - 'docstring': None, - 'line_number': content[:match.start()].count('\n') + 1, - 'is_async': False, # Go uses goroutines differently - 'is_method': is_method, - 'decorators': [] - }) + functions.append( + { + "name": func_name, + "parameters": params, + "return_type": return_type, + "docstring": None, + "line_number": content[: match.start()].count("\n") + 1, + "is_async": False, # Go uses goroutines differently + "is_method": is_method, + "decorators": [], + } + ) # Extract comments comments = self._extract_go_comments(content) - return { - 'classes': classes, - 'functions': functions, - 'comments': comments - } + return {"classes": classes, "functions": functions, "comments": comments} - def _parse_go_parameters(self, params_str: str) -> List[Dict]: + def _parse_go_parameters(self, params_str: str) -> list[dict]: """Parse Go parameter string.""" params = [] @@ -839,7 +807,7 @@ class CodeAnalyzer: return params # Split by comma - param_list = [p.strip() for p in params_str.split(',')] + param_list = [p.strip() for p in params_str.split(",")] for param in param_list: if not param: @@ -851,25 +819,27 @@ class CodeAnalyzer: if len(parts) >= 2: # Last part is type param_type = parts[-1] - param_name = ' '.join(parts[:-1]) + param_name = " ".join(parts[:-1]) else: param_type = param param_name = "unknown" - params.append({ - 'name': param_name, - 'type_hint': param_type, - 'default': None # Go doesn't support default parameters - }) + params.append( + { + "name": param_name, + "type_hint": param_type, + "default": None, # Go doesn't support default parameters + } + ) return params - def _extract_go_comments(self, content: str) -> List[Dict]: + def _extract_go_comments(self, content: str) -> list[dict]: """Extract Go comments (// and /* */ styles).""" # Go uses C-style comments return self._extract_js_comments(content) - def _analyze_rust(self, content: str, file_path: str) -> Dict[str, Any]: + def _analyze_rust(self, content: str, file_path: str) -> dict[str, Any]: """ Analyze Rust file using regex patterns. @@ -883,50 +853,50 @@ class CodeAnalyzer: functions = [] # Extract struct definitions - struct_pattern = r'(?:pub\s+)?struct\s+(\w+)(?:<[^>]+>)?\s*\{' + struct_pattern = r"(?:pub\s+)?struct\s+(\w+)(?:<[^>]+>)?\s*\{" for match in re.finditer(struct_pattern, content): struct_name = match.group(1) - classes.append({ - 'name': struct_name, - 'base_classes': [], # Rust uses traits, not inheritance - 'methods': [], - 'docstring': None, - 'line_number': content[:match.start()].count('\n') + 1 - }) + classes.append( + { + "name": struct_name, + "base_classes": [], # Rust uses traits, not inheritance + "methods": [], + "docstring": None, + "line_number": content[: match.start()].count("\n") + 1, + } + ) # Extract function definitions # Matches: [pub] [async] [unsafe] [const] fn name(params) -> ReturnType - func_pattern = r'(?:pub\s+)?(?:async\s+)?(?:unsafe\s+)?(?:const\s+)?fn\s+(\w+)(?:<[^>]+>)?\s*\(([^)]*)\)(?:\s*->\s*([^{;]+))?' + func_pattern = r"(?:pub\s+)?(?:async\s+)?(?:unsafe\s+)?(?:const\s+)?fn\s+(\w+)(?:<[^>]+>)?\s*\(([^)]*)\)(?:\s*->\s*([^{;]+))?" for match in re.finditer(func_pattern, content): func_name = match.group(1) params_str = match.group(2) return_type = match.group(3).strip() if match.group(3) else None - is_async = 'async' in match.group(0) + is_async = "async" in match.group(0) params = self._parse_rust_parameters(params_str) - functions.append({ - 'name': func_name, - 'parameters': params, - 'return_type': return_type, - 'docstring': None, - 'line_number': content[:match.start()].count('\n') + 1, - 'is_async': is_async, - 'is_method': False, - 'decorators': [] - }) + functions.append( + { + "name": func_name, + "parameters": params, + "return_type": return_type, + "docstring": None, + "line_number": content[: match.start()].count("\n") + 1, + "is_async": is_async, + "is_method": False, + "decorators": [], + } + ) # Extract comments comments = self._extract_rust_comments(content) - return { - 'classes': classes, - 'functions': functions, - 'comments': comments - } + return {"classes": classes, "functions": functions, "comments": comments} - def _parse_rust_parameters(self, params_str: str) -> List[Dict]: + def _parse_rust_parameters(self, params_str: str) -> list[dict]: """Parse Rust parameter string.""" params = [] @@ -934,15 +904,15 @@ class CodeAnalyzer: return params # Split by comma - param_list = [p.strip() for p in params_str.split(',')] + param_list = [p.strip() for p in params_str.split(",")] for param in param_list: if not param: continue # Rust format: name: type or &self - if ':' in param: - name, param_type = param.split(':', 1) + if ":" in param: + name, param_type = param.split(":", 1) name = name.strip() param_type = param_type.strip() else: @@ -950,50 +920,44 @@ class CodeAnalyzer: name = param param_type = None - params.append({ - 'name': name, - 'type_hint': param_type, - 'default': None # Rust doesn't support default parameters - }) + params.append( + { + "name": name, + "type_hint": param_type, + "default": None, # Rust doesn't support default parameters + } + ) return params - def _extract_rust_comments(self, content: str) -> List[Dict]: + def _extract_rust_comments(self, content: str) -> list[dict]: """Extract Rust comments (// and /* */ and /// doc comments).""" comments = [] # Single-line comments (//) - for match in re.finditer(r'//(.+)$', content, re.MULTILINE): - line_num = content[:match.start()].count('\n') + 1 + for match in re.finditer(r"//(.+)$", content, re.MULTILINE): + line_num = content[: match.start()].count("\n") + 1 comment_text = match.group(1).strip() # Distinguish doc comments (/// or //!) - if comment_text.startswith('/') or comment_text.startswith('!'): - comment_type = 'doc' - comment_text = comment_text.lstrip('/!').strip() + if comment_text.startswith("/") or comment_text.startswith("!"): + comment_type = "doc" + comment_text = comment_text.lstrip("/!").strip() else: - comment_type = 'inline' + comment_type = "inline" - comments.append({ - 'line': line_num, - 'text': comment_text, - 'type': comment_type - }) + comments.append({"line": line_num, "text": comment_text, "type": comment_type}) # Multi-line comments (/* */) - for match in re.finditer(r'/\*(.+?)\*/', content, re.DOTALL): - start_line = content[:match.start()].count('\n') + 1 + for match in re.finditer(r"/\*(.+?)\*/", content, re.DOTALL): + start_line = content[: match.start()].count("\n") + 1 comment_text = match.group(1).strip() - comments.append({ - 'line': start_line, - 'text': comment_text, - 'type': 'block' - }) + comments.append({"line": start_line, "text": comment_text, "type": "block"}) return comments - def _analyze_java(self, content: str, file_path: str) -> Dict[str, Any]: + def _analyze_java(self, content: str, file_path: str) -> dict[str, Any]: """ Analyze Java file using regex patterns. @@ -1008,7 +972,7 @@ class CodeAnalyzer: # Extract class definitions # Matches: [modifiers] class ClassName [extends Base] [implements Interfaces] - class_pattern = r'(?:public|private|protected)?\s*(?:static|final|abstract)?\s*class\s+(\w+)(?:\s+extends\s+(\w+))?(?:\s+implements\s+([\w\s,]+))?\s*\{' + class_pattern = r"(?:public|private|protected)?\s*(?:static|final|abstract)?\s*class\s+(\w+)(?:\s+extends\s+(\w+))?(?:\s+implements\s+([\w\s,]+))?\s*\{" for match in re.finditer(class_pattern, content): class_name = match.group(1) base_class = match.group(2) @@ -1018,16 +982,16 @@ class CodeAnalyzer: if base_class: base_classes.append(base_class) if interfaces_str: - base_classes.extend([i.strip() for i in interfaces_str.split(',')]) + base_classes.extend([i.strip() for i in interfaces_str.split(",")]) # Extract methods (simplified) class_block_start = match.end() brace_count = 1 class_block_end = class_block_start for i, char in enumerate(content[class_block_start:], class_block_start): - if char == '{': + if char == "{": brace_count += 1 - elif char == '}': + elif char == "}": brace_count -= 1 if brace_count == 0: class_block_end = i @@ -1039,77 +1003,79 @@ class CodeAnalyzer: else: methods = [] - classes.append({ - 'name': class_name, - 'base_classes': base_classes, - 'methods': methods, - 'docstring': None, - 'line_number': content[:match.start()].count('\n') + 1 - }) + classes.append( + { + "name": class_name, + "base_classes": base_classes, + "methods": methods, + "docstring": None, + "line_number": content[: match.start()].count("\n") + 1, + } + ) # Extract top-level functions (rare in Java, but static methods) - func_pattern = r'(?:public|private|protected)?\s*(?:static|final|synchronized)?\s*(\w+(?:<[\w\s,]+>)?)\s+(\w+)\s*\(([^)]*)\)' + func_pattern = r"(?:public|private|protected)?\s*(?:static|final|synchronized)?\s*(\w+(?:<[\w\s,]+>)?)\s+(\w+)\s*\(([^)]*)\)" for match in re.finditer(func_pattern, content): return_type = match.group(1).strip() func_name = match.group(2) params_str = match.group(3) # Skip keywords - if func_name in ['if', 'for', 'while', 'switch', 'return', 'class', 'void']: + if func_name in ["if", "for", "while", "switch", "return", "class", "void"]: continue params = self._parse_java_parameters(params_str) - functions.append({ - 'name': func_name, - 'parameters': params, - 'return_type': return_type, - 'docstring': None, - 'line_number': content[:match.start()].count('\n') + 1, - 'is_async': False, - 'is_method': False, - 'decorators': [] - }) + functions.append( + { + "name": func_name, + "parameters": params, + "return_type": return_type, + "docstring": None, + "line_number": content[: match.start()].count("\n") + 1, + "is_async": False, + "is_method": False, + "decorators": [], + } + ) # Extract comments comments = self._extract_java_comments(content) - return { - 'classes': classes, - 'functions': functions, - 'comments': comments - } + return {"classes": classes, "functions": functions, "comments": comments} - def _extract_java_methods(self, class_body: str) -> List[Dict]: + def _extract_java_methods(self, class_body: str) -> list[dict]: """Extract Java method signatures from class body.""" methods = [] - method_pattern = r'(?:public|private|protected)?\s*(?:static|final|synchronized)?\s*(\w+(?:<[\w\s,]+>)?)\s+(\w+)\s*\(([^)]*)\)' + method_pattern = r"(?:public|private|protected)?\s*(?:static|final|synchronized)?\s*(\w+(?:<[\w\s,]+>)?)\s+(\w+)\s*\(([^)]*)\)" for match in re.finditer(method_pattern, class_body): return_type = match.group(1).strip() method_name = match.group(2) params_str = match.group(3) # Skip keywords - if method_name in ['if', 'for', 'while', 'switch']: + if method_name in ["if", "for", "while", "switch"]: continue params = self._parse_java_parameters(params_str) - methods.append({ - 'name': method_name, - 'parameters': params, - 'return_type': return_type, - 'docstring': None, - 'line_number': None, - 'is_async': False, - 'is_method': True, - 'decorators': [] - }) + methods.append( + { + "name": method_name, + "parameters": params, + "return_type": return_type, + "docstring": None, + "line_number": None, + "is_async": False, + "is_method": True, + "decorators": [], + } + ) return methods - def _parse_java_parameters(self, params_str: str) -> List[Dict]: + def _parse_java_parameters(self, params_str: str) -> list[dict]: """Parse Java parameter string.""" params = [] @@ -1117,7 +1083,7 @@ class CodeAnalyzer: return params # Split by comma - param_list = [p.strip() for p in params_str.split(',')] + param_list = [p.strip() for p in params_str.split(",")] for param in param_list: if not param: @@ -1127,7 +1093,7 @@ class CodeAnalyzer: parts = param.split() if len(parts) >= 2: # Remove 'final' if present - if parts[0] == 'final': + if parts[0] == "final": parts = parts[1:] if len(parts) >= 2: @@ -1140,46 +1106,40 @@ class CodeAnalyzer: param_type = param param_name = "unknown" - params.append({ - 'name': param_name, - 'type_hint': param_type, - 'default': None # Java doesn't support default parameters - }) + params.append( + { + "name": param_name, + "type_hint": param_type, + "default": None, # Java doesn't support default parameters + } + ) return params - def _extract_java_comments(self, content: str) -> List[Dict]: + def _extract_java_comments(self, content: str) -> list[dict]: """Extract Java comments (// and /* */ and /** JavaDoc */).""" comments = [] # Single-line comments (//) - for match in re.finditer(r'//(.+)$', content, re.MULTILINE): - line_num = content[:match.start()].count('\n') + 1 + for match in re.finditer(r"//(.+)$", content, re.MULTILINE): + line_num = content[: match.start()].count("\n") + 1 comment_text = match.group(1).strip() - comments.append({ - 'line': line_num, - 'text': comment_text, - 'type': 'inline' - }) + comments.append({"line": line_num, "text": comment_text, "type": "inline"}) # Multi-line and JavaDoc comments (/* */ and /** */) - for match in re.finditer(r'/\*\*?(.+?)\*/', content, re.DOTALL): - start_line = content[:match.start()].count('\n') + 1 + for match in re.finditer(r"/\*\*?(.+?)\*/", content, re.DOTALL): + start_line = content[: match.start()].count("\n") + 1 comment_text = match.group(1).strip() # Distinguish JavaDoc (starts with **) - comment_type = 'doc' if match.group(0).startswith('/**') else 'block' + comment_type = "doc" if match.group(0).startswith("/**") else "block" - comments.append({ - 'line': start_line, - 'text': comment_text, - 'type': comment_type - }) + comments.append({"line": start_line, "text": comment_text, "type": comment_type}) return comments - def _analyze_ruby(self, content: str, file_path: str) -> Dict[str, Any]: + def _analyze_ruby(self, content: str, file_path: str) -> dict[str, Any]: """ Analyze Ruby file using regex patterns. @@ -1193,51 +1153,51 @@ class CodeAnalyzer: functions = [] # Extract class definitions - class_pattern = r'class\s+(\w+)(?:\s*<\s*(\w+))?\s*$' + class_pattern = r"class\s+(\w+)(?:\s*<\s*(\w+))?\s*$" for match in re.finditer(class_pattern, content, re.MULTILINE): class_name = match.group(1) base_class = match.group(2) base_classes = [base_class] if base_class else [] - classes.append({ - 'name': class_name, - 'base_classes': base_classes, - 'methods': [], # Would need to parse class body - 'docstring': None, - 'line_number': content[:match.start()].count('\n') + 1 - }) + classes.append( + { + "name": class_name, + "base_classes": base_classes, + "methods": [], # Would need to parse class body + "docstring": None, + "line_number": content[: match.start()].count("\n") + 1, + } + ) # Extract method/function definitions # Matches: def method_name(params) - func_pattern = r'def\s+(?:self\.)?(\w+[?!]?)\s*(?:\(([^)]*)\))?' + func_pattern = r"def\s+(?:self\.)?(\w+[?!]?)\s*(?:\(([^)]*)\))?" for match in re.finditer(func_pattern, content): func_name = match.group(1) - params_str = match.group(2) if match.group(2) else '' + params_str = match.group(2) if match.group(2) else "" params = self._parse_ruby_parameters(params_str) - functions.append({ - 'name': func_name, - 'parameters': params, - 'return_type': None, # Ruby has no type annotations (usually) - 'docstring': None, - 'line_number': content[:match.start()].count('\n') + 1, - 'is_async': False, - 'is_method': False, - 'decorators': [] - }) + functions.append( + { + "name": func_name, + "parameters": params, + "return_type": None, # Ruby has no type annotations (usually) + "docstring": None, + "line_number": content[: match.start()].count("\n") + 1, + "is_async": False, + "is_method": False, + "decorators": [], + } + ) # Extract comments comments = self._extract_ruby_comments(content) - return { - 'classes': classes, - 'functions': functions, - 'comments': comments - } + return {"classes": classes, "functions": functions, "comments": comments} - def _parse_ruby_parameters(self, params_str: str) -> List[Dict]: + def _parse_ruby_parameters(self, params_str: str) -> list[dict]: """Parse Ruby parameter string.""" params = [] @@ -1245,7 +1205,7 @@ class CodeAnalyzer: return params # Split by comma - param_list = [p.strip() for p in params_str.split(',')] + param_list = [p.strip() for p in params_str.split(",")] for param in param_list: if not param: @@ -1253,23 +1213,19 @@ class CodeAnalyzer: # Check for default value default = None - if '=' in param: - name, default = param.split('=', 1) + if "=" in param: + name, default = param.split("=", 1) name = name.strip() default = default.strip() else: name = param # Ruby doesn't have type hints in method signatures - params.append({ - 'name': name, - 'type_hint': None, - 'default': default - }) + params.append({"name": name, "type_hint": None, "default": default}) return params - def _extract_ruby_comments(self, content: str) -> List[Dict]: + def _extract_ruby_comments(self, content: str) -> list[dict]: """Extract Ruby comments (# style).""" comments = [] @@ -1277,17 +1233,13 @@ class CodeAnalyzer: stripped = line.strip() # Ruby comments start with # - if stripped.startswith('#'): + if stripped.startswith("#"): comment_text = stripped[1:].strip() - comments.append({ - 'line': i, - 'text': comment_text, - 'type': 'inline' - }) + comments.append({"line": i, "text": comment_text, "type": "inline"}) return comments - def _analyze_php(self, content: str, file_path: str) -> Dict[str, Any]: + def _analyze_php(self, content: str, file_path: str) -> dict[str, Any]: """ Analyze PHP file using regex patterns. @@ -1301,7 +1253,7 @@ class CodeAnalyzer: functions = [] # Extract class definitions - class_pattern = r'(?:abstract\s+)?class\s+(\w+)(?:\s+extends\s+(\w+))?(?:\s+implements\s+([\w\s,]+))?\s*\{' + class_pattern = r"(?:abstract\s+)?class\s+(\w+)(?:\s+extends\s+(\w+))?(?:\s+implements\s+([\w\s,]+))?\s*\{" for match in re.finditer(class_pattern, content): class_name = match.group(1) base_class = match.group(2) @@ -1311,16 +1263,16 @@ class CodeAnalyzer: if base_class: base_classes.append(base_class) if interfaces_str: - base_classes.extend([i.strip() for i in interfaces_str.split(',')]) + base_classes.extend([i.strip() for i in interfaces_str.split(",")]) # Extract methods (simplified) class_block_start = match.end() brace_count = 1 class_block_end = class_block_start for i, char in enumerate(content[class_block_start:], class_block_start): - if char == '{': + if char == "{": brace_count += 1 - elif char == '}': + elif char == "}": brace_count -= 1 if brace_count == 0: class_block_end = i @@ -1332,16 +1284,18 @@ class CodeAnalyzer: else: methods = [] - classes.append({ - 'name': class_name, - 'base_classes': base_classes, - 'methods': methods, - 'docstring': None, - 'line_number': content[:match.start()].count('\n') + 1 - }) + classes.append( + { + "name": class_name, + "base_classes": base_classes, + "methods": methods, + "docstring": None, + "line_number": content[: match.start()].count("\n") + 1, + } + ) # Extract function definitions - func_pattern = r'function\s+(\w+)\s*\(([^)]*)\)(?:\s*:\s*(\??\w+))?' + func_pattern = r"function\s+(\w+)\s*\(([^)]*)\)(?:\s*:\s*(\??\w+))?" for match in re.finditer(func_pattern, content): func_name = match.group(1) params_str = match.group(2) @@ -1349,31 +1303,31 @@ class CodeAnalyzer: params = self._parse_php_parameters(params_str) - functions.append({ - 'name': func_name, - 'parameters': params, - 'return_type': return_type, - 'docstring': None, - 'line_number': content[:match.start()].count('\n') + 1, - 'is_async': False, - 'is_method': False, - 'decorators': [] - }) + functions.append( + { + "name": func_name, + "parameters": params, + "return_type": return_type, + "docstring": None, + "line_number": content[: match.start()].count("\n") + 1, + "is_async": False, + "is_method": False, + "decorators": [], + } + ) # Extract comments comments = self._extract_php_comments(content) - return { - 'classes': classes, - 'functions': functions, - 'comments': comments - } + return {"classes": classes, "functions": functions, "comments": comments} - def _extract_php_methods(self, class_body: str) -> List[Dict]: + def _extract_php_methods(self, class_body: str) -> list[dict]: """Extract PHP method signatures from class body.""" methods = [] - method_pattern = r'(?:public|private|protected)?\s*(?:static|final)?\s*function\s+(\w+)\s*\(([^)]*)\)(?:\s*:\s*(\??\w+))?' + method_pattern = ( + r"(?:public|private|protected)?\s*(?:static|final)?\s*function\s+(\w+)\s*\(([^)]*)\)(?:\s*:\s*(\??\w+))?" + ) for match in re.finditer(method_pattern, class_body): method_name = match.group(1) params_str = match.group(2) @@ -1381,20 +1335,22 @@ class CodeAnalyzer: params = self._parse_php_parameters(params_str) - methods.append({ - 'name': method_name, - 'parameters': params, - 'return_type': return_type, - 'docstring': None, - 'line_number': None, - 'is_async': False, - 'is_method': True, - 'decorators': [] - }) + methods.append( + { + "name": method_name, + "parameters": params, + "return_type": return_type, + "docstring": None, + "line_number": None, + "is_async": False, + "is_method": True, + "decorators": [], + } + ) return methods - def _parse_php_parameters(self, params_str: str) -> List[Dict]: + def _parse_php_parameters(self, params_str: str) -> list[dict]: """Parse PHP parameter string.""" params = [] @@ -1402,7 +1358,7 @@ class CodeAnalyzer: return params # Split by comma - param_list = [p.strip() for p in params_str.split(',')] + param_list = [p.strip() for p in params_str.split(",")] for param in param_list: if not param: @@ -1410,8 +1366,8 @@ class CodeAnalyzer: # Check for default value default = None - if '=' in param: - param, default = param.split('=', 1) + if "=" in param: + param, default = param.split("=", 1) param = param.strip() default = default.strip() @@ -1425,50 +1381,38 @@ class CodeAnalyzer: param_name = parts[0] if parts else "unknown" # Remove $ from variable name - if param_name.startswith('$'): + if param_name.startswith("$"): param_name = param_name[1:] - params.append({ - 'name': param_name, - 'type_hint': param_type, - 'default': default - }) + params.append({"name": param_name, "type_hint": param_type, "default": default}) return params - def _extract_php_comments(self, content: str) -> List[Dict]: + def _extract_php_comments(self, content: str) -> list[dict]: """Extract PHP comments (// and /* */ and # and /** PHPDoc */).""" comments = [] # Single-line comments (// and #) - for match in re.finditer(r'(?://|#)(.+)$', content, re.MULTILINE): - line_num = content[:match.start()].count('\n') + 1 + for match in re.finditer(r"(?://|#)(.+)$", content, re.MULTILINE): + line_num = content[: match.start()].count("\n") + 1 comment_text = match.group(1).strip() - comments.append({ - 'line': line_num, - 'text': comment_text, - 'type': 'inline' - }) + comments.append({"line": line_num, "text": comment_text, "type": "inline"}) # Multi-line and PHPDoc comments (/* */ and /** */) - for match in re.finditer(r'/\*\*?(.+?)\*/', content, re.DOTALL): - start_line = content[:match.start()].count('\n') + 1 + for match in re.finditer(r"/\*\*?(.+?)\*/", content, re.DOTALL): + start_line = content[: match.start()].count("\n") + 1 comment_text = match.group(1).strip() # Distinguish PHPDoc (starts with **) - comment_type = 'doc' if match.group(0).startswith('/**') else 'block' + comment_type = "doc" if match.group(0).startswith("/**") else "block" - comments.append({ - 'line': start_line, - 'text': comment_text, - 'type': comment_type - }) + comments.append({"line": start_line, "text": comment_text, "type": comment_type}) return comments -if __name__ == '__main__': +if __name__ == "__main__": # Test the analyzer python_code = ''' class Node2D: @@ -1487,18 +1431,22 @@ def create_sprite(texture: str) -> Node2D: return Node2D() ''' - analyzer = CodeAnalyzer(depth='deep') - result = analyzer.analyze_file('test.py', python_code, 'Python') + analyzer = CodeAnalyzer(depth="deep") + result = analyzer.analyze_file("test.py", python_code, "Python") print("Analysis Result:") print(f"Classes: {len(result.get('classes', []))}") print(f"Functions: {len(result.get('functions', []))}") - if result.get('classes'): - cls = result['classes'][0] + if result.get("classes"): + cls = result["classes"][0] print(f"\nClass: {cls['name']}") print(f" Methods: {len(cls['methods'])}") - for method in cls['methods']: - params = ', '.join([f"{p['name']}: {p['type_hint']}" + (f" = {p['default']}" if p.get('default') else "") - for p in method['parameters']]) + for method in cls["methods"]: + params = ", ".join( + [ + f"{p['name']}: {p['type_hint']}" + (f" = {p['default']}" if p.get("default") else "") + for p in method["parameters"] + ] + ) print(f" {method['name']}({params}) -> {method['return_type']}") diff --git a/src/skill_seekers/cli/codebase_scraper.py b/src/skill_seekers/cli/codebase_scraper.py index a4c12a9..03661b8 100644 --- a/src/skill_seekers/cli/codebase_scraper.py +++ b/src/skill_seekers/cli/codebase_scraper.py @@ -24,65 +24,80 @@ Credits: - pathspec for .gitignore support: https://pypi.org/project/pathspec/ """ +import argparse +import json +import logging import os import sys -import json -import argparse -import logging from pathlib import Path -from typing import Dict, List, Optional, Any +from typing import Any # Add parent directory to path for imports sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from skill_seekers.cli.code_analyzer import CodeAnalyzer from skill_seekers.cli.api_reference_builder import APIReferenceBuilder -from skill_seekers.cli.dependency_analyzer import DependencyAnalyzer +from skill_seekers.cli.code_analyzer import CodeAnalyzer from skill_seekers.cli.config_extractor import ConfigExtractor +from skill_seekers.cli.dependency_analyzer import DependencyAnalyzer # Try to import pathspec for .gitignore support try: import pathspec + PATHSPEC_AVAILABLE = True except ImportError: PATHSPEC_AVAILABLE = False # Configure logging -logging.basicConfig( - level=logging.INFO, - format='%(asctime)s - %(levelname)s - %(message)s' -) +logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s") logger = logging.getLogger(__name__) # Language extension mapping LANGUAGE_EXTENSIONS = { - '.py': 'Python', - '.js': 'JavaScript', - '.jsx': 'JavaScript', - '.ts': 'TypeScript', - '.tsx': 'TypeScript', - '.cpp': 'C++', - '.cc': 'C++', - '.cxx': 'C++', - '.h': 'C++', - '.hpp': 'C++', - '.hxx': 'C++', - '.c': 'C', - '.cs': 'C#', - '.go': 'Go', - '.rs': 'Rust', - '.java': 'Java', - '.rb': 'Ruby', - '.php': 'PHP', + ".py": "Python", + ".js": "JavaScript", + ".jsx": "JavaScript", + ".ts": "TypeScript", + ".tsx": "TypeScript", + ".cpp": "C++", + ".cc": "C++", + ".cxx": "C++", + ".h": "C++", + ".hpp": "C++", + ".hxx": "C++", + ".c": "C", + ".cs": "C#", + ".go": "Go", + ".rs": "Rust", + ".java": "Java", + ".rb": "Ruby", + ".php": "PHP", } # Default directories to exclude DEFAULT_EXCLUDED_DIRS = { - 'node_modules', 'venv', '__pycache__', '.git', '.svn', '.hg', - 'build', 'dist', 'target', '.pytest_cache', '.tox', '.mypy_cache', - 'htmlcov', 'coverage', '.coverage', '.eggs', '*.egg-info', - '.idea', '.vscode', '.vs', '__pypackages__' + "node_modules", + "venv", + "__pycache__", + ".git", + ".svn", + ".hg", + "build", + "dist", + "target", + ".pytest_cache", + ".tox", + ".mypy_cache", + "htmlcov", + "coverage", + ".coverage", + ".eggs", + "*.egg-info", + ".idea", + ".vscode", + ".vs", + "__pypackages__", } @@ -97,10 +112,10 @@ def detect_language(file_path: Path) -> str: Language name or 'Unknown' """ extension = file_path.suffix.lower() - return LANGUAGE_EXTENSIONS.get(extension, 'Unknown') + return LANGUAGE_EXTENSIONS.get(extension, "Unknown") -def load_gitignore(directory: Path) -> Optional[pathspec.PathSpec]: +def load_gitignore(directory: Path) -> pathspec.PathSpec | None: """ Load .gitignore file and create pathspec matcher. @@ -115,14 +130,14 @@ def load_gitignore(directory: Path) -> Optional[pathspec.PathSpec]: logger.warning("Install with: pip install pathspec") return None - gitignore_path = directory / '.gitignore' + gitignore_path = directory / ".gitignore" if not gitignore_path.exists(): logger.debug(f"No .gitignore found in {directory}") return None try: - with open(gitignore_path, 'r', encoding='utf-8') as f: - spec = pathspec.PathSpec.from_lines('gitwildmatch', f) + with open(gitignore_path, encoding="utf-8") as f: + spec = pathspec.PathSpec.from_lines("gitwildmatch", f) logger.info(f"Loaded .gitignore from {gitignore_path}") return spec except Exception as e: @@ -146,10 +161,10 @@ def should_exclude_dir(dir_name: str, excluded_dirs: set) -> bool: def walk_directory( root: Path, - patterns: Optional[List[str]] = None, - gitignore_spec: Optional[pathspec.PathSpec] = None, - excluded_dirs: Optional[set] = None -) -> List[Path]: + patterns: list[str] | None = None, + gitignore_spec: pathspec.PathSpec | None = None, + excluded_dirs: set | None = None, +) -> list[Path]: """ Walk directory tree and collect source files. @@ -205,9 +220,9 @@ def walk_directory( def analyze_codebase( directory: Path, output_dir: Path, - depth: str = 'deep', - languages: Optional[List[str]] = None, - file_patterns: Optional[List[str]] = None, + depth: str = "deep", + languages: list[str] | None = None, + file_patterns: list[str] | None = None, build_api_reference: bool = True, extract_comments: bool = True, build_dependency_graph: bool = True, @@ -216,8 +231,8 @@ def analyze_codebase( build_how_to_guides: bool = True, extract_config_patterns: bool = True, enhance_with_ai: bool = True, - ai_mode: str = "auto" -) -> Dict[str, Any]: + ai_mode: str = "auto", +) -> dict[str, Any]: """ Analyze local codebase and extract code knowledge. @@ -255,11 +270,7 @@ def analyze_codebase( # Walk directory tree logger.info("Scanning directory tree...") - files = walk_directory( - directory, - patterns=file_patterns, - gitignore_spec=gitignore_spec - ) + files = walk_directory(directory, patterns=file_patterns, gitignore_spec=gitignore_spec) logger.info(f"Found {len(files)} source files") @@ -273,27 +284,25 @@ def analyze_codebase( analyzer = CodeAnalyzer(depth=depth) # Analyze each file - results = {'files': []} + results = {"files": []} analyzed_count = 0 for file_path in files: try: - content = file_path.read_text(encoding='utf-8', errors='ignore') + content = file_path.read_text(encoding="utf-8", errors="ignore") language = detect_language(file_path) - if language == 'Unknown': + if language == "Unknown": continue # Analyze file analysis = analyzer.analyze_file(str(file_path), content, language) # Only include files with actual analysis results - if analysis and (analysis.get('classes') or analysis.get('functions')): - results['files'].append({ - 'file': str(file_path.relative_to(directory)), - 'language': language, - **analysis - }) + if analysis and (analysis.get("classes") or analysis.get("functions")): + results["files"].append( + {"file": str(file_path.relative_to(directory)), "language": language, **analysis} + ) analyzed_count += 1 if analyzed_count % 10 == 0: @@ -306,17 +315,17 @@ def analyze_codebase( logger.info(f"āœ… Successfully analyzed {analyzed_count} files") # Save results - output_json = output_dir / 'code_analysis.json' - with open(output_json, 'w', encoding='utf-8') as f: + output_json = output_dir / "code_analysis.json" + with open(output_json, "w", encoding="utf-8") as f: json.dump(results, f, indent=2) logger.info(f"šŸ“ Saved analysis to: {output_json}") # Build API reference if requested - if build_api_reference and results['files']: + if build_api_reference and results["files"]: logger.info("Building API reference documentation...") builder = APIReferenceBuilder(results) - api_output_dir = output_dir / 'api_reference' + api_output_dir = output_dir / "api_reference" generated_files = builder.build_reference(api_output_dir) logger.info(f"āœ… Generated {len(generated_files)} API reference files") logger.info(f"šŸ“ API reference: {api_output_dir}") @@ -329,10 +338,10 @@ def analyze_codebase( # Analyze dependencies for all files for file_path in files: try: - content = file_path.read_text(encoding='utf-8', errors='ignore') + content = file_path.read_text(encoding="utf-8", errors="ignore") language = detect_language(file_path) - if language != 'Unknown': + if language != "Unknown": # Use relative path from directory for better graph readability rel_path = str(file_path.relative_to(directory)) dep_analyzer.analyze_file(rel_path, content, language) @@ -348,7 +357,7 @@ def analyze_codebase( if cycles: logger.warning(f"āš ļø Found {len(cycles)} circular dependencies:") for i, cycle in enumerate(cycles[:5], 1): # Show first 5 - cycle_str = ' → '.join(cycle) + f" → {cycle[0]}" + cycle_str = " → ".join(cycle) + f" → {cycle[0]}" logger.warning(f" {i}. {cycle_str}") if len(cycles) > 5: logger.warning(f" ... and {len(cycles) - 5} more") @@ -356,32 +365,34 @@ def analyze_codebase( logger.info("āœ… No circular dependencies found") # Save dependency graph data - dep_output_dir = output_dir / 'dependencies' + dep_output_dir = output_dir / "dependencies" dep_output_dir.mkdir(parents=True, exist_ok=True) # Export as JSON - dep_json = dep_output_dir / 'dependency_graph.json' - with open(dep_json, 'w', encoding='utf-8') as f: + dep_json = dep_output_dir / "dependency_graph.json" + with open(dep_json, "w", encoding="utf-8") as f: json.dump(dep_analyzer.export_json(), f, indent=2) logger.info(f"šŸ“ Saved dependency graph: {dep_json}") # Export as Mermaid diagram - mermaid_file = dep_output_dir / 'dependency_graph.mmd' + mermaid_file = dep_output_dir / "dependency_graph.mmd" mermaid_file.write_text(dep_analyzer.export_mermaid()) logger.info(f"šŸ“ Saved Mermaid diagram: {mermaid_file}") # Save statistics stats = dep_analyzer.get_statistics() - stats_file = dep_output_dir / 'statistics.json' - with open(stats_file, 'w', encoding='utf-8') as f: + stats_file = dep_output_dir / "statistics.json" + with open(stats_file, "w", encoding="utf-8") as f: json.dump(stats, f, indent=2) - logger.info(f"šŸ“Š Statistics: {stats['total_files']} files, " - f"{stats['total_dependencies']} dependencies, " - f"{stats['circular_dependencies']} cycles") + logger.info( + f"šŸ“Š Statistics: {stats['total_files']} files, " + f"{stats['total_dependencies']} dependencies, " + f"{stats['circular_dependencies']} cycles" + ) # Try to export as DOT (requires pydot) try: - dot_file = dep_output_dir / 'dependency_graph.dot' + dot_file = dep_output_dir / "dependency_graph.dot" dep_analyzer.export_dot(str(dot_file)) except: pass # pydot not installed, skip DOT export @@ -396,13 +407,11 @@ def analyze_codebase( for file_path in files: try: - content = file_path.read_text(encoding='utf-8', errors='ignore') + content = file_path.read_text(encoding="utf-8", errors="ignore") language = detect_language(file_path) - if language != 'Unknown': - report = pattern_recognizer.analyze_file( - str(file_path), content, language - ) + if language != "Unknown": + report = pattern_recognizer.analyze_file(str(file_path), content, language) if report.patterns: pattern_results.append(report.to_dict()) @@ -412,14 +421,14 @@ def analyze_codebase( # Save pattern results if pattern_results: - pattern_output = output_dir / 'patterns' + pattern_output = output_dir / "patterns" pattern_output.mkdir(parents=True, exist_ok=True) - pattern_json = pattern_output / 'detected_patterns.json' - with open(pattern_json, 'w', encoding='utf-8') as f: + pattern_json = pattern_output / "detected_patterns.json" + with open(pattern_json, "w", encoding="utf-8") as f: json.dump(pattern_results, f, indent=2) - total_patterns = sum(len(r['patterns']) for r in pattern_results) + total_patterns = sum(len(r["patterns"]) for r in pattern_results) logger.info(f"āœ… Detected {total_patterns} patterns in {len(pattern_results)} files") logger.info(f"šŸ“ Saved to: {pattern_json}") else: @@ -432,35 +441,31 @@ def analyze_codebase( # Create extractor test_extractor = TestExampleExtractor( - min_confidence=0.5, - max_per_file=10, - languages=languages, - enhance_with_ai=enhance_with_ai + min_confidence=0.5, max_per_file=10, languages=languages, enhance_with_ai=enhance_with_ai ) # Extract examples from directory try: - example_report = test_extractor.extract_from_directory( - directory, - recursive=True - ) + example_report = test_extractor.extract_from_directory(directory, recursive=True) if example_report.total_examples > 0: # Save results - examples_output = output_dir / 'test_examples' + examples_output = output_dir / "test_examples" examples_output.mkdir(parents=True, exist_ok=True) # Save as JSON - examples_json = examples_output / 'test_examples.json' - with open(examples_json, 'w', encoding='utf-8') as f: + examples_json = examples_output / "test_examples.json" + with open(examples_json, "w", encoding="utf-8") as f: json.dump(example_report.to_dict(), f, indent=2) # Save as Markdown - examples_md = examples_output / 'test_examples.md' - examples_md.write_text(example_report.to_markdown(), encoding='utf-8') + examples_md = examples_output / "test_examples.md" + examples_md.write_text(example_report.to_markdown(), encoding="utf-8") - logger.info(f"āœ… Extracted {example_report.total_examples} test examples " - f"({example_report.high_value_count} high-value)") + logger.info( + f"āœ… Extracted {example_report.total_examples} test examples " + f"({example_report.high_value_count} high-value)" + ) logger.info(f"šŸ“ Saved to: {examples_output}") else: logger.info("No test examples extracted") @@ -479,25 +484,25 @@ def analyze_codebase( guide_builder = HowToGuideBuilder(enhance_with_ai=enhance_with_ai) # Build guides from workflow examples - tutorials_dir = output_dir / 'tutorials' + tutorials_dir = output_dir / "tutorials" # Get workflow examples from the example_report if available - if 'example_report' in locals() and example_report and example_report.total_examples > 0: + if "example_report" in locals() and example_report and example_report.total_examples > 0: # Convert example_report to list of dicts for processing - examples_list = example_report.to_dict().get('examples', []) + examples_list = example_report.to_dict().get("examples", []) guide_collection = guide_builder.build_guides_from_examples( examples_list, - grouping_strategy='ai-tutorial-group', + grouping_strategy="ai-tutorial-group", output_dir=tutorials_dir, enhance_with_ai=enhance_with_ai, - ai_mode=ai_mode + ai_mode=ai_mode, ) if guide_collection and guide_collection.total_guides > 0: # Save collection summary - collection_json = tutorials_dir / 'guide_collection.json' - with open(collection_json, 'w', encoding='utf-8') as f: + collection_json = tutorials_dir / "guide_collection.json" + with open(collection_json, "w", encoding="utf-8") as f: json.dump(guide_collection.to_dict(), f, indent=2) logger.info(f"āœ… Built {guide_collection.total_guides} how-to guides") @@ -524,9 +529,10 @@ def analyze_codebase( result_dict = config_extractor.to_dict(extraction_result) # AI Enhancement (if enabled) - if enhance_with_ai and ai_mode != 'none': + if enhance_with_ai and ai_mode != "none": try: from skill_seekers.cli.config_enhancer import ConfigEnhancer + logger.info(f"šŸ¤– Enhancing config analysis with AI (mode: {ai_mode})...") enhancer = ConfigEnhancer(mode=ai_mode) result_dict = enhancer.enhance_config_result(result_dict) @@ -535,28 +541,30 @@ def analyze_codebase( logger.warning(f"āš ļø Config AI enhancement failed: {e}") # Save results - config_output = output_dir / 'config_patterns' + config_output = output_dir / "config_patterns" config_output.mkdir(parents=True, exist_ok=True) # Save as JSON - config_json = config_output / 'config_patterns.json' - with open(config_json, 'w', encoding='utf-8') as f: + config_json = config_output / "config_patterns.json" + with open(config_json, "w", encoding="utf-8") as f: json.dump(result_dict, f, indent=2) # Save as Markdown (basic - AI enhancements in JSON only for now) - config_md = config_output / 'config_patterns.md' - config_md.write_text(extraction_result.to_markdown(), encoding='utf-8') + config_md = config_output / "config_patterns.md" + config_md.write_text(extraction_result.to_markdown(), encoding="utf-8") # Count total settings across all files total_settings = sum(len(cf.settings) for cf in extraction_result.config_files) total_patterns = sum(len(cf.patterns) for cf in extraction_result.config_files) - logger.info(f"āœ… Extracted {len(extraction_result.config_files)} config files " - f"with {total_settings} settings and {total_patterns} detected patterns") + logger.info( + f"āœ… Extracted {len(extraction_result.config_files)} config files " + f"with {total_settings} settings and {total_patterns} detected patterns" + ) - if 'ai_enhancements' in result_dict: - insights = result_dict['ai_enhancements'].get('overall_insights', {}) - if insights.get('security_issues_found'): + if "ai_enhancements" in result_dict: + insights = result_dict["ai_enhancements"].get("overall_insights", {}) + if insights.get("security_issues_found"): logger.info(f"šŸ” Security issues found: {insights['security_issues_found']}") logger.info(f"šŸ“ Saved to: {config_output}") @@ -572,15 +580,15 @@ def analyze_codebase( from skill_seekers.cli.architectural_pattern_detector import ArchitecturalPatternDetector arch_detector = ArchitecturalPatternDetector(enhance_with_ai=enhance_with_ai) - arch_report = arch_detector.analyze(directory, results['files']) + arch_report = arch_detector.analyze(directory, results["files"]) if arch_report.patterns: - arch_output = output_dir / 'architecture' + arch_output = output_dir / "architecture" arch_output.mkdir(parents=True, exist_ok=True) # Save as JSON - arch_json = arch_output / 'architectural_patterns.json' - with open(arch_json, 'w', encoding='utf-8') as f: + arch_json = arch_output / "architectural_patterns.json" + with open(arch_json, "w", encoding="utf-8") as f: json.dump(arch_report.to_dict(), f, indent=2) logger.info(f"šŸ—ļø Detected {len(arch_report.patterns)} architectural patterns") @@ -601,7 +609,7 @@ def analyze_codebase( build_dependency_graph=build_dependency_graph, detect_patterns=detect_patterns, extract_test_examples=extract_test_examples, - extract_config_patterns=extract_config_patterns + extract_config_patterns=extract_config_patterns, ) return results @@ -610,13 +618,13 @@ def analyze_codebase( def _generate_skill_md( output_dir: Path, directory: Path, - results: Dict[str, Any], + results: dict[str, Any], depth: str, build_api_reference: bool, build_dependency_graph: bool, detect_patterns: bool, extract_test_examples: bool, - extract_config_patterns: bool + extract_config_patterns: bool, ): """ Generate rich SKILL.md from codebase analysis results. @@ -635,14 +643,14 @@ def _generate_skill_md( repo_name = directory.name # Generate skill name (lowercase, hyphens only, max 64 chars) - skill_name = repo_name.lower().replace('_', '-').replace(' ', '-')[:64] + skill_name = repo_name.lower().replace("_", "-").replace(" ", "-")[:64] # Generate description description = f"Local codebase analysis for {repo_name}" # Count files by language - language_stats = _get_language_stats(results.get('files', [])) - total_files = len(results.get('files', [])) + language_stats = _get_language_stats(results.get("files", [])) + total_files = len(results.get("files", [])) # Start building content skill_content = f"""--- @@ -658,7 +666,7 @@ Local codebase analysis and documentation generated from code analysis. **Path:** `{directory}` **Files Analyzed:** {total_files} -**Languages:** {', '.join(language_stats.keys())} +**Languages:** {", ".join(language_stats.keys())} **Analysis Depth:** {depth} ## When to Use This Skill @@ -732,22 +740,22 @@ Use this skill when you need to: skill_content += "This skill includes detailed reference documentation:\n\n" refs_added = False - if build_api_reference and (output_dir / 'api_reference').exists(): + if build_api_reference and (output_dir / "api_reference").exists(): skill_content += "- **API Reference**: `references/api_reference/` - Complete API documentation\n" refs_added = True - if build_dependency_graph and (output_dir / 'dependencies').exists(): + if build_dependency_graph and (output_dir / "dependencies").exists(): skill_content += "- **Dependencies**: `references/dependencies/` - Dependency graph and analysis\n" refs_added = True - if detect_patterns and (output_dir / 'patterns').exists(): + if detect_patterns and (output_dir / "patterns").exists(): skill_content += "- **Patterns**: `references/patterns/` - Detected design patterns\n" refs_added = True - if extract_test_examples and (output_dir / 'test_examples').exists(): + if extract_test_examples and (output_dir / "test_examples").exists(): skill_content += "- **Examples**: `references/test_examples/` - Usage examples from tests\n" refs_added = True - if extract_config_patterns and (output_dir / 'config_patterns').exists(): + if extract_config_patterns and (output_dir / "config_patterns").exists(): skill_content += "- **Configuration**: `references/config_patterns/` - Configuration patterns\n" refs_added = True - if (output_dir / 'architecture').exists(): + if (output_dir / "architecture").exists(): skill_content += "- **Architecture**: `references/architecture/` - Architectural patterns\n" refs_added = True @@ -762,34 +770,34 @@ Use this skill when you need to: # Write SKILL.md skill_path = output_dir / "SKILL.md" - skill_path.write_text(skill_content, encoding='utf-8') + skill_path.write_text(skill_content, encoding="utf-8") - line_count = len(skill_content.split('\n')) + line_count = len(skill_content.split("\n")) logger.info(f"āœ… Generated SKILL.md: {skill_path} ({line_count} lines)") # Generate references/ directory structure _generate_references(output_dir) -def _get_language_stats(files: List[Dict]) -> Dict[str, int]: +def _get_language_stats(files: list[dict]) -> dict[str, int]: """Count files by language from analysis results.""" stats = {} for file_data in files: # files is a list of dicts with 'language' key - lang = file_data.get('language', 'Unknown') - if lang != 'Unknown': + lang = file_data.get("language", "Unknown") + if lang != "Unknown": stats[lang] = stats.get(lang, 0) + 1 return stats def _format_patterns_section(output_dir: Path) -> str: """Format design patterns section from patterns/detected_patterns.json.""" - patterns_file = output_dir / 'patterns' / 'detected_patterns.json' + patterns_file = output_dir / "patterns" / "detected_patterns.json" if not patterns_file.exists(): return "" try: - with open(patterns_file, 'r', encoding='utf-8') as f: + with open(patterns_file, encoding="utf-8") as f: patterns_data = json.load(f) except Exception: return "" @@ -802,10 +810,10 @@ def _format_patterns_section(output_dir: Path) -> str: by_class = {} for pattern_file in patterns_data: - for pattern in pattern_file.get('patterns', []): - ptype = pattern.get('pattern_type', 'Unknown') - cls = pattern.get('class_name', '') - confidence = pattern.get('confidence', 0) + for pattern in pattern_file.get("patterns", []): + ptype = pattern.get("pattern_type", "Unknown") + cls = pattern.get("class_name", "") + confidence = pattern.get("confidence", 0) # Skip low confidence if confidence < 0.7: @@ -813,7 +821,7 @@ def _format_patterns_section(output_dir: Path) -> str: # Deduplicate by class key = f"{cls}:{ptype}" - if key not in by_class or by_class[key]['confidence'] < confidence: + if key not in by_class or by_class[key]["confidence"] < confidence: by_class[key] = pattern # Count by type @@ -836,22 +844,22 @@ def _format_patterns_section(output_dir: Path) -> str: def _format_examples_section(output_dir: Path) -> str: """Format code examples section from test_examples/test_examples.json.""" - examples_file = output_dir / 'test_examples' / 'test_examples.json' + examples_file = output_dir / "test_examples" / "test_examples.json" if not examples_file.exists(): return "" try: - with open(examples_file, 'r', encoding='utf-8') as f: + with open(examples_file, encoding="utf-8") as f: examples_data = json.load(f) except Exception: return "" - examples = examples_data.get('examples', []) + examples = examples_data.get("examples", []) if not examples: return "" # Filter high-value examples (complexity > 0.7) - high_value = [ex for ex in examples if ex.get('complexity_score', 0) > 0.7] + high_value = [ex for ex in examples if ex.get("complexity_score", 0) > 0.7] if not high_value: # If no high complexity, take any examples @@ -864,11 +872,11 @@ def _format_examples_section(output_dir: Path) -> str: content += "*High-quality examples extracted from test files (C3.2)*\n\n" # Top 10 examples - for ex in sorted(high_value, key=lambda x: x.get('complexity_score', 0), reverse=True)[:10]: - desc = ex.get('description', 'Example') - lang = ex.get('language', 'python').lower() - code = ex.get('code', '') - complexity = ex.get('complexity_score', 0) + for ex in sorted(high_value, key=lambda x: x.get("complexity_score", 0), reverse=True)[:10]: + desc = ex.get("description", "Example") + lang = ex.get("language", "python").lower() + code = ex.get("code", "") + complexity = ex.get("complexity_score", 0) content += f"**{desc}** (complexity: {complexity:.2f})\n\n" content += f"```{lang}\n{code}\n```\n\n" @@ -879,16 +887,16 @@ def _format_examples_section(output_dir: Path) -> str: def _format_api_section(output_dir: Path) -> str: """Format API reference section.""" - api_dir = output_dir / 'api_reference' + api_dir = output_dir / "api_reference" if not api_dir.exists(): return "" - api_md = api_dir / 'api_reference.md' + api_md = api_dir / "api_reference.md" if not api_md.exists(): return "" try: - api_content = api_md.read_text(encoding='utf-8') + api_content = api_md.read_text(encoding="utf-8") except Exception: return "" @@ -906,17 +914,17 @@ def _format_api_section(output_dir: Path) -> str: def _format_architecture_section(output_dir: Path) -> str: """Format architecture section from architecture/architectural_patterns.json.""" - arch_file = output_dir / 'architecture' / 'architectural_patterns.json' + arch_file = output_dir / "architecture" / "architectural_patterns.json" if not arch_file.exists(): return "" try: - with open(arch_file, 'r', encoding='utf-8') as f: + with open(arch_file, encoding="utf-8") as f: arch_data = json.load(f) except Exception: return "" - patterns = arch_data.get('patterns', []) + patterns = arch_data.get("patterns", []) if not patterns: return "" @@ -925,9 +933,9 @@ def _format_architecture_section(output_dir: Path) -> str: content += "**Detected Architectural Patterns:**\n\n" for pattern in patterns[:5]: - name = pattern.get('pattern_name', 'Unknown') - confidence = pattern.get('confidence', 0) - indicators = pattern.get('indicators', []) + name = pattern.get("pattern_name", "Unknown") + confidence = pattern.get("confidence", 0) + indicators = pattern.get("indicators", []) content += f"- **{name}** (confidence: {confidence:.2f})\n" if indicators: @@ -940,22 +948,22 @@ def _format_architecture_section(output_dir: Path) -> str: def _format_config_section(output_dir: Path) -> str: """Format configuration patterns section.""" - config_file = output_dir / 'config_patterns' / 'config_patterns.json' + config_file = output_dir / "config_patterns" / "config_patterns.json" if not config_file.exists(): return "" try: - with open(config_file, 'r', encoding='utf-8') as f: + with open(config_file, encoding="utf-8") as f: config_data = json.load(f) except Exception: return "" - config_files = config_data.get('config_files', []) + config_files = config_data.get("config_files", []) if not config_files: return "" - total_settings = sum(len(cf.get('settings', [])) for cf in config_files) - total_patterns = sum(len(cf.get('patterns', [])) for cf in config_files) + total_settings = sum(len(cf.get("settings", [])) for cf in config_files) + total_patterns = sum(len(cf.get("patterns", [])) for cf in config_files) content = "## āš™ļø Configuration Patterns\n\n" content += "*From C3.4 configuration analysis*\n\n" @@ -966,7 +974,7 @@ def _format_config_section(output_dir: Path) -> str: # List config file types found file_types = {} for cf in config_files: - ctype = cf.get('config_type', 'unknown') + ctype = cf.get("config_type", "unknown") file_types[ctype] = file_types.get(ctype, 0) + 1 if file_types: @@ -985,18 +993,18 @@ def _generate_references(output_dir: Path): Creates a clean references/ directory that links to all analysis outputs. """ - references_dir = output_dir / 'references' + references_dir = output_dir / "references" references_dir.mkdir(exist_ok=True) # Map analysis directories to reference names mappings = { - 'api_reference': 'api_reference', - 'dependencies': 'dependencies', - 'patterns': 'patterns', - 'test_examples': 'test_examples', - 'tutorials': 'tutorials', - 'config_patterns': 'config_patterns', - 'architecture': 'architecture' + "api_reference": "api_reference", + "dependencies": "dependencies", + "patterns": "patterns", + "test_examples": "test_examples", + "tutorials": "tutorials", + "config_patterns": "config_patterns", + "architecture": "architecture", } for source, target in mappings.items(): @@ -1007,9 +1015,11 @@ def _generate_references(output_dir: Path): # Copy directory to references/ (not symlink, for portability) if target_dir.exists(): import shutil + shutil.rmtree(target_dir) import shutil + shutil.copytree(source_dir, target_dir) logger.debug(f"Copied {source} → references/{target}") @@ -1019,7 +1029,7 @@ def _generate_references(output_dir: Path): def main(): """Command-line interface for codebase analysis.""" parser = argparse.ArgumentParser( - description='Analyze local codebases and extract code knowledge', + description="Analyze local codebases and extract code knowledge", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=""" Examples: @@ -1043,101 +1053,78 @@ Examples: # Skip specific features codebase-scraper --directory . --skip-patterns --skip-test-examples -""" +""", ) + parser.add_argument("--directory", required=True, help="Directory to analyze") + parser.add_argument("--output", default="output/codebase/", help="Output directory (default: output/codebase/)") parser.add_argument( - '--directory', - required=True, - help='Directory to analyze' + "--depth", choices=["surface", "deep", "full"], default="deep", help="Analysis depth (default: deep)" ) + parser.add_argument("--languages", help="Comma-separated languages to analyze (e.g., Python,JavaScript,C++)") + parser.add_argument("--file-patterns", help="Comma-separated file patterns (e.g., *.py,src/**/*.js)") parser.add_argument( - '--output', - default='output/codebase/', - help='Output directory (default: output/codebase/)' - ) - parser.add_argument( - '--depth', - choices=['surface', 'deep', 'full'], - default='deep', - help='Analysis depth (default: deep)' - ) - parser.add_argument( - '--languages', - help='Comma-separated languages to analyze (e.g., Python,JavaScript,C++)' - ) - parser.add_argument( - '--file-patterns', - help='Comma-separated file patterns (e.g., *.py,src/**/*.js)' - ) - parser.add_argument( - '--skip-api-reference', - action='store_true', + "--skip-api-reference", + action="store_true", default=False, - help='Skip API reference markdown documentation generation (default: enabled)' + help="Skip API reference markdown documentation generation (default: enabled)", ) parser.add_argument( - '--skip-dependency-graph', - action='store_true', + "--skip-dependency-graph", + action="store_true", default=False, - help='Skip dependency graph and circular dependency detection (default: enabled)' + help="Skip dependency graph and circular dependency detection (default: enabled)", ) parser.add_argument( - '--skip-patterns', - action='store_true', + "--skip-patterns", + action="store_true", default=False, - help='Skip design pattern detection (Singleton, Factory, Observer, etc.) (default: enabled)' + help="Skip design pattern detection (Singleton, Factory, Observer, etc.) (default: enabled)", ) parser.add_argument( - '--skip-test-examples', - action='store_true', + "--skip-test-examples", + action="store_true", default=False, - help='Skip test example extraction (instantiation, method calls, configs, etc.) (default: enabled)' + help="Skip test example extraction (instantiation, method calls, configs, etc.) (default: enabled)", ) parser.add_argument( - '--skip-how-to-guides', - action='store_true', + "--skip-how-to-guides", + action="store_true", default=False, - help='Skip how-to guide generation from workflow examples (default: enabled)' + help="Skip how-to guide generation from workflow examples (default: enabled)", ) parser.add_argument( - '--skip-config-patterns', - action='store_true', + "--skip-config-patterns", + action="store_true", default=False, - help='Skip configuration pattern extraction from config files (JSON, YAML, TOML, ENV, etc.) (default: enabled)' + help="Skip configuration pattern extraction from config files (JSON, YAML, TOML, ENV, etc.) (default: enabled)", ) parser.add_argument( - '--ai-mode', - choices=['auto', 'api', 'local', 'none'], - default='auto', - help='AI enhancement mode for how-to guides: auto (detect best), api (Claude API), local (Claude Code CLI), none (disable) (default: auto)' - ) - parser.add_argument( - '--no-comments', - action='store_true', - help='Skip comment extraction' - ) - parser.add_argument( - '--verbose', - action='store_true', - help='Enable verbose logging' + "--ai-mode", + choices=["auto", "api", "local", "none"], + default="auto", + help="AI enhancement mode for how-to guides: auto (detect best), api (Claude API), local (Claude Code CLI), none (disable) (default: auto)", ) + parser.add_argument("--no-comments", action="store_true", help="Skip comment extraction") + parser.add_argument("--verbose", action="store_true", help="Enable verbose logging") # Check for deprecated flags deprecated_flags = { - '--build-api-reference': '--skip-api-reference', - '--build-dependency-graph': '--skip-dependency-graph', - '--detect-patterns': '--skip-patterns', - '--extract-test-examples': '--skip-test-examples', - '--build-how-to-guides': '--skip-how-to-guides', - '--extract-config-patterns': '--skip-config-patterns' + "--build-api-reference": "--skip-api-reference", + "--build-dependency-graph": "--skip-dependency-graph", + "--detect-patterns": "--skip-patterns", + "--extract-test-examples": "--skip-test-examples", + "--build-how-to-guides": "--skip-how-to-guides", + "--extract-config-patterns": "--skip-config-patterns", } for old_flag, new_flag in deprecated_flags.items(): if old_flag in sys.argv: - logger.warning(f"āš ļø DEPRECATED: {old_flag} is deprecated. " - f"All features are now enabled by default. " - f"Use {new_flag} to disable this feature.") + logger.warning( + f"āš ļø DEPRECATED: {old_flag} is deprecated. " + f"All features are now enabled by default. " + f"Use {new_flag} to disable this feature." + ) args = parser.parse_args() @@ -1158,12 +1145,12 @@ Examples: # Parse languages languages = None if args.languages: - languages = [lang.strip() for lang in args.languages.split(',')] + languages = [lang.strip() for lang in args.languages.split(",")] # Parse file patterns file_patterns = None if args.file_patterns: - file_patterns = [p.strip() for p in args.file_patterns.split(',')] + file_patterns = [p.strip() for p in args.file_patterns.split(",")] # Analyze codebase try: @@ -1181,18 +1168,18 @@ Examples: build_how_to_guides=not args.skip_how_to_guides, extract_config_patterns=not args.skip_config_patterns, enhance_with_ai=True, # Auto-disables if no API key present - ai_mode=args.ai_mode # NEW: AI enhancement mode for how-to guides + ai_mode=args.ai_mode, # NEW: AI enhancement mode for how-to guides ) # Print summary - print(f"\n{'='*60}") - print(f"CODEBASE ANALYSIS COMPLETE") - print(f"{'='*60}") + print(f"\n{'=' * 60}") + print("CODEBASE ANALYSIS COMPLETE") + print(f"{'=' * 60}") print(f"Files analyzed: {len(results['files'])}") print(f"Output directory: {args.output}") if args.build_api_reference: print(f"API reference: {Path(args.output) / 'api_reference'}") - print(f"{'='*60}\n") + print(f"{'=' * 60}\n") return 0 @@ -1202,9 +1189,10 @@ Examples: except Exception as e: logger.error(f"Analysis failed: {e}") import traceback + traceback.print_exc() return 1 -if __name__ == '__main__': +if __name__ == "__main__": sys.exit(main()) diff --git a/src/skill_seekers/cli/config_command.py b/src/skill_seekers/cli/config_command.py index eeaf5a7..9bd89e8 100644 --- a/src/skill_seekers/cli/config_command.py +++ b/src/skill_seekers/cli/config_command.py @@ -4,9 +4,8 @@ Interactive Configuration Wizard for Skill Seekers Provides user-friendly setup for GitHub tokens, API keys, and settings. """ -import sys import webbrowser -from typing import Optional + from .config_manager import get_config_manager @@ -46,7 +45,7 @@ Documentation: https://github.com/SkillSeekers/skill-seekers # Ask if user wants to run setup now response = input("Would you like to run the configuration wizard now? [y/N]: ").strip().lower() - if response in ['y', 'yes']: + if response in ["y", "yes"]: main_menu() else: print("\nYou can run the configuration wizard anytime with:") @@ -158,7 +157,7 @@ def add_github_profile(): if name in config.config["github"]["profiles"]: print(f"āŒ Profile '{name}' already exists.") overwrite = input("Overwrite? [y/N]: ").strip().lower() - if overwrite not in ['y', 'yes']: + if overwrite not in ["y", "yes"]: continue break @@ -175,7 +174,7 @@ def add_github_profile(): print(" 4. Copy the token (ghp_...)\n") open_now = input("Open GitHub token page in browser? [Y/n]: ").strip().lower() - if open_now not in ['n', 'no']: + if open_now not in ["n", "no"]: open_github_token_page() while True: @@ -186,7 +185,7 @@ def add_github_profile(): if not (token.startswith("ghp_") or token.startswith("github_pat_")): print("āš ļø Warning: Token doesn't match GitHub format") proceed = input("Continue anyway? [y/N]: ").strip().lower() - if proceed not in ['y', 'yes']: + if proceed not in ["y", "yes"]: continue break @@ -198,12 +197,7 @@ def add_github_profile(): print(" 4. fail - Fail immediately") strategy_choice = input("\nSelect strategy [1-4] (default: 1): ").strip() or "1" - strategy_map = { - "1": "prompt", - "2": "wait", - "3": "switch", - "4": "fail" - } + strategy_map = {"1": "prompt", "2": "wait", "3": "switch", "4": "fail"} strategy = strategy_map.get(strategy_choice, "prompt") # Timeout @@ -217,7 +211,7 @@ def add_github_profile(): # Set as default has_profiles = bool(config.config["github"]["profiles"]) if has_profiles: - set_default = input("\nSet as default profile? [y/N]: ").strip().lower() in ['y', 'yes'] + set_default = input("\nSet as default profile? [y/N]: ").strip().lower() in ["y", "yes"] else: set_default = True # First profile is always default @@ -228,7 +222,7 @@ def add_github_profile(): description=description, rate_limit_strategy=strategy, timeout_minutes=timeout, - set_as_default=set_default + set_as_default=set_default, ) print(f"\nāœ… GitHub profile '{name}' added successfully!") @@ -258,7 +252,7 @@ def remove_github_profile(): if 1 <= choice_idx <= len(profiles): profile_name = profiles[choice_idx - 1]["name"] confirm = input(f"Really remove profile '{profile_name}'? [y/N]: ").strip().lower() - if confirm in ['y', 'yes']: + if confirm in ["y", "yes"]: config.remove_github_profile(profile_name) else: print("āŒ Invalid choice.") @@ -325,11 +319,10 @@ def api_keys_menu(): source = "" if key: import os - env_var = { - "anthropic": "ANTHROPIC_API_KEY", - "google": "GOOGLE_API_KEY", - "openai": "OPENAI_API_KEY" - }[provider] + + env_var = {"anthropic": "ANTHROPIC_API_KEY", "google": "GOOGLE_API_KEY", "openai": "OPENAI_API_KEY"}[ + provider + ] if os.getenv(env_var): source = " (from environment)" else: @@ -347,7 +340,7 @@ def api_keys_menu(): provider_map = { "1": ("anthropic", "https://console.anthropic.com/settings/keys"), "2": ("google", "https://makersuite.google.com/app/apikey"), - "3": ("openai", "https://platform.openai.com/api-keys") + "3": ("openai", "https://platform.openai.com/api-keys"), } if choice in provider_map: @@ -365,7 +358,7 @@ def set_api_key(provider: str, url: str): print(f"Get your API key at: {url}\n") open_now = input("Open in browser? [Y/n]: ").strip().lower() - if open_now not in ['n', 'no']: + if open_now not in ["n", "no"]: try: webbrowser.open(url) print("āœ… Opened in browser\n") @@ -390,7 +383,7 @@ def rate_limit_settings(): current = config.config["rate_limit"] - print(f"Current settings:") + print("Current settings:") print(f" • Default timeout: {current['default_timeout_minutes']} minutes") print(f" • Auto-switch profiles: {current['auto_switch_profiles']}") print(f" • Show countdown: {current['show_countdown']}\n") @@ -404,14 +397,16 @@ def rate_limit_settings(): print("āš ļø Invalid input, keeping current value") # Auto-switch - auto_switch_input = input(f"Auto-switch to other profiles? [y/n] ({current['auto_switch_profiles']}): ").strip().lower() + auto_switch_input = ( + input(f"Auto-switch to other profiles? [y/n] ({current['auto_switch_profiles']}): ").strip().lower() + ) if auto_switch_input: - config.config["rate_limit"]["auto_switch_profiles"] = auto_switch_input in ['y', 'yes'] + config.config["rate_limit"]["auto_switch_profiles"] = auto_switch_input in ["y", "yes"] # Show countdown countdown_input = input(f"Show countdown timer? [y/n] ({current['show_countdown']}): ").strip().lower() if countdown_input: - config.config["rate_limit"]["show_countdown"] = countdown_input in ['y', 'yes'] + config.config["rate_limit"]["show_countdown"] = countdown_input in ["y", "yes"] config.save_config() print("\nāœ… Rate limit settings updated") @@ -427,7 +422,7 @@ def resume_settings(): current = config.config["resume"] - print(f"Current settings:") + print("Current settings:") print(f" • Auto-save interval: {current['auto_save_interval_seconds']} seconds") print(f" • Keep progress for: {current['keep_progress_days']} days\n") @@ -467,13 +462,12 @@ def test_connections(): print(" āš ļø No GitHub profiles configured") else: import requests + for p in profiles: token = config.config["github"]["profiles"][p["name"]]["token"] try: response = requests.get( - "https://api.github.com/rate_limit", - headers={"Authorization": f"token {token}"}, - timeout=5 + "https://api.github.com/rate_limit", headers={"Authorization": f"token {token}"}, timeout=5 ) if response.status_code == 200: data = response.json() @@ -518,34 +512,12 @@ def main(): """Main entry point for config command.""" import argparse - parser = argparse.ArgumentParser( - description="Configure Skill Seekers settings" - ) - parser.add_argument( - "--github", - action="store_true", - help="Go directly to GitHub token setup" - ) - parser.add_argument( - "--api-keys", - action="store_true", - help="Go directly to API keys setup" - ) - parser.add_argument( - "--show", - action="store_true", - help="Show current configuration and exit" - ) - parser.add_argument( - "--test", - action="store_true", - help="Test connections and exit" - ) - parser.add_argument( - "--welcome", - action="store_true", - help="Show welcome message" - ) + parser = argparse.ArgumentParser(description="Configure Skill Seekers settings") + parser.add_argument("--github", action="store_true", help="Go directly to GitHub token setup") + parser.add_argument("--api-keys", action="store_true", help="Go directly to API keys setup") + parser.add_argument("--show", action="store_true", help="Show current configuration and exit") + parser.add_argument("--test", action="store_true", help="Test connections and exit") + parser.add_argument("--welcome", action="store_true", help="Show welcome message") args = parser.parse_args() diff --git a/src/skill_seekers/cli/config_enhancer.py b/src/skill_seekers/cli/config_enhancer.py index 0ed5cf8..b89eb7a 100644 --- a/src/skill_seekers/cli/config_enhancer.py +++ b/src/skill_seekers/cli/config_enhancer.py @@ -12,24 +12,24 @@ Provides dual-mode AI enhancement (API + LOCAL) for configuration analysis: Similar to GuideEnhancer (C3.3) but for configuration files. """ -import os -import sys import json import logging +import os import subprocess +import sys import tempfile -from pathlib import Path -from typing import Dict, List, Optional, Any from dataclasses import dataclass, field +from pathlib import Path # Configure logging -logging.basicConfig(level=logging.INFO, format='%(levelname)s - %(message)s') +logging.basicConfig(level=logging.INFO, format="%(levelname)s - %(message)s") logger = logging.getLogger(__name__) # Optional anthropic import ANTHROPIC_AVAILABLE = False try: import anthropic + ANTHROPIC_AVAILABLE = True except ImportError: pass @@ -38,6 +38,7 @@ except ImportError: @dataclass class ConfigEnhancement: """AI-generated enhancement for a configuration""" + explanation: str = "" # What this setting does best_practice: str = "" # Suggested improvement security_concern: str = "" # Security issue (if any) @@ -48,11 +49,12 @@ class ConfigEnhancement: @dataclass class EnhancedConfigFile: """Configuration file with AI enhancements""" + file_path: str config_type: str purpose: str enhancement: ConfigEnhancement - setting_enhancements: Dict[str, ConfigEnhancement] = field(default_factory=dict) + setting_enhancements: dict[str, ConfigEnhancement] = field(default_factory=dict) class ConfigEnhancer: @@ -73,7 +75,7 @@ class ConfigEnhancer: mode: Enhancement mode - "api", "local", or "auto" (default) """ self.mode = self._detect_mode(mode) - self.api_key = os.environ.get('ANTHROPIC_API_KEY') + self.api_key = os.environ.get("ANTHROPIC_API_KEY") self.client = None if self.mode == "api" and ANTHROPIC_AVAILABLE and self.api_key: @@ -93,14 +95,14 @@ class ConfigEnhancer: return requested_mode # Auto-detect - if os.environ.get('ANTHROPIC_API_KEY') and ANTHROPIC_AVAILABLE: + if os.environ.get("ANTHROPIC_API_KEY") and ANTHROPIC_AVAILABLE: logger.info("šŸ¤– AI enhancement: API mode (Claude API detected)") return "api" else: logger.info("šŸ¤– AI enhancement: LOCAL mode (using Claude Code CLI)") return "local" - def enhance_config_result(self, result: Dict) -> Dict: + def enhance_config_result(self, result: dict) -> dict: """ Enhance entire configuration extraction result. @@ -121,7 +123,7 @@ class ConfigEnhancer: # API MODE - Direct Claude API calls # ========================================================================= - def _enhance_via_api(self, result: Dict) -> Dict: + def _enhance_via_api(self, result: dict) -> dict: """Enhance configs using Claude API""" if not self.client: logger.error("āŒ API mode requested but no API key available") @@ -134,12 +136,7 @@ class ConfigEnhancer: # Call Claude API logger.info("šŸ“” Calling Claude API for config analysis...") response = self.client.messages.create( - model="claude-sonnet-4-20250514", - max_tokens=8000, - messages=[{ - "role": "user", - "content": prompt - }] + model="claude-sonnet-4-20250514", max_tokens=8000, messages=[{"role": "user", "content": prompt}] ) # Parse response @@ -151,23 +148,23 @@ class ConfigEnhancer: logger.error(f"āŒ API enhancement failed: {e}") return result - def _create_enhancement_prompt(self, result: Dict) -> str: + def _create_enhancement_prompt(self, result: dict) -> str: """Create prompt for Claude API""" - config_files = result.get('config_files', []) + config_files = result.get("config_files", []) # Summarize configs for prompt config_summary = [] for cf in config_files[:10]: # Limit to first 10 files settings_summary = [] - for setting in cf.get('settings', [])[:5]: # First 5 settings per file + for setting in cf.get("settings", [])[:5]: # First 5 settings per file settings_summary.append(f" - {setting['key']}: {setting['value']} ({setting['value_type']})") config_summary.append(f""" -File: {cf['relative_path']} ({cf['config_type']}) -Purpose: {cf['purpose']} +File: {cf["relative_path"]} ({cf["config_type"]}) +Purpose: {cf["purpose"]} Settings: {chr(10).join(settings_summary)} -Patterns: {', '.join(cf.get('patterns', []))} +Patterns: {", ".join(cf.get("patterns", []))} """) prompt = f"""Analyze these configuration files and provide AI-enhanced insights. @@ -207,12 +204,13 @@ Focus on actionable insights that help developers understand and improve their c """ return prompt - def _parse_api_response(self, response_text: str, original_result: Dict) -> Dict: + def _parse_api_response(self, response_text: str, original_result: dict) -> dict: """Parse Claude API response and merge with original result""" try: # Extract JSON from response import re - json_match = re.search(r'\{.*\}', response_text, re.DOTALL) + + json_match = re.search(r"\{.*\}", response_text, re.DOTALL) if not json_match: logger.warning("āš ļø No JSON found in API response") return original_result @@ -220,14 +218,14 @@ Focus on actionable insights that help developers understand and improve their c enhancements = json.loads(json_match.group()) # Merge enhancements into original result - original_result['ai_enhancements'] = enhancements + original_result["ai_enhancements"] = enhancements # Add enhancement flags to config files - file_enhancements = {e['file_path']: e for e in enhancements.get('file_enhancements', [])} - for cf in original_result.get('config_files', []): - file_path = cf.get('relative_path', cf.get('file_path')) + file_enhancements = {e["file_path"]: e for e in enhancements.get("file_enhancements", [])} + for cf in original_result.get("config_files", []): + file_path = cf.get("relative_path", cf.get("file_path")) if file_path in file_enhancements: - cf['ai_enhancement'] = file_enhancements[file_path] + cf["ai_enhancement"] = file_enhancements[file_path] return original_result @@ -239,11 +237,11 @@ Focus on actionable insights that help developers understand and improve their c # LOCAL MODE - Claude Code CLI # ========================================================================= - def _enhance_via_local(self, result: Dict) -> Dict: + def _enhance_via_local(self, result: dict) -> dict: """Enhance configs using Claude Code CLI""" try: # Create temporary prompt file - with tempfile.NamedTemporaryFile(mode='w', suffix='.md', delete=False) as f: + with tempfile.NamedTemporaryFile(mode="w", suffix=".md", delete=False) as f: prompt_file = Path(f.name) f.write(self._create_local_prompt(result)) @@ -263,7 +261,7 @@ Focus on actionable insights that help developers understand and improve their c if result_data: # Merge LOCAL enhancements - original_result['ai_enhancements'] = result_data + original_result["ai_enhancements"] = result_data logger.info("āœ… LOCAL enhancement complete") return original_result else: @@ -274,18 +272,18 @@ Focus on actionable insights that help developers understand and improve their c logger.error(f"āŒ LOCAL enhancement failed: {e}") return result - def _create_local_prompt(self, result: Dict) -> str: + def _create_local_prompt(self, result: dict) -> str: """Create prompt file for Claude Code CLI""" - config_files = result.get('config_files', []) + config_files = result.get("config_files", []) # Format config data for Claude config_data = [] for cf in config_files[:10]: config_data.append(f""" -### {cf['relative_path']} ({cf['config_type']}) -- Purpose: {cf['purpose']} -- Patterns: {', '.join(cf.get('patterns', []))} -- Settings count: {len(cf.get('settings', []))} +### {cf["relative_path"]} ({cf["config_type"]}) +- Purpose: {cf["purpose"]} +- Patterns: {", ".join(cf.get("patterns", []))} +- Settings count: {len(cf.get("settings", []))} """) prompt = f"""# Configuration Analysis Task @@ -332,15 +330,15 @@ Focus on actionable insights: """ return prompt - def _run_claude_cli(self, prompt_file: Path, output_file: Path) -> Optional[Dict]: + def _run_claude_cli(self, prompt_file: Path, output_file: Path) -> dict | None: """Run Claude Code CLI and wait for completion""" try: # Run claude command result = subprocess.run( - ['claude', str(prompt_file)], + ["claude", str(prompt_file)], capture_output=True, text=True, - timeout=300 # 5 minute timeout + timeout=300, # 5 minute timeout ) if result.returncode != 0: @@ -350,6 +348,7 @@ Focus on actionable insights: # Try to find output file (Claude might save it with different name) # Look for JSON files created in the last minute import time + current_time = time.time() potential_files = [] @@ -360,9 +359,9 @@ Focus on actionable insights: # Try to load the most recent JSON file for json_file in sorted(potential_files, key=lambda f: f.stat().st_mtime, reverse=True): try: - with open(json_file, 'r') as f: + with open(json_file) as f: data = json.load(f) - if 'file_enhancements' in data or 'overall_insights' in data: + if "file_enhancements" in data or "overall_insights" in data: logger.info(f"āœ… Found enhancement data in {json_file.name}") return data except: @@ -383,29 +382,18 @@ def main(): """Command-line interface for config enhancement""" import argparse - parser = argparse.ArgumentParser( - description='AI-enhance configuration extraction results' - ) + parser = argparse.ArgumentParser(description="AI-enhance configuration extraction results") + parser.add_argument("result_file", help="Path to config extraction JSON result file") parser.add_argument( - 'result_file', - help='Path to config extraction JSON result file' - ) - parser.add_argument( - '--mode', - choices=['auto', 'api', 'local'], - default='auto', - help='Enhancement mode (default: auto)' - ) - parser.add_argument( - '--output', - help='Output file for enhanced results (default: _enhanced.json)' + "--mode", choices=["auto", "api", "local"], default="auto", help="Enhancement mode (default: auto)" ) + parser.add_argument("--output", help="Output file for enhanced results (default: _enhanced.json)") args = parser.parse_args() # Load result file try: - with open(args.result_file, 'r') as f: + with open(args.result_file) as f: result = json.load(f) except Exception as e: logger.error(f"āŒ Failed to load result file: {e}") @@ -416,9 +404,9 @@ def main(): enhanced_result = enhancer.enhance_config_result(result) # Save - output_file = args.output or args.result_file.replace('.json', '_enhanced.json') + output_file = args.output or args.result_file.replace(".json", "_enhanced.json") try: - with open(output_file, 'w') as f: + with open(output_file, "w") as f: json.dump(enhanced_result, f, indent=2) logger.info(f"āœ… Enhanced results saved to: {output_file}") except Exception as e: @@ -428,5 +416,5 @@ def main(): return 0 -if __name__ == '__main__': +if __name__ == "__main__": sys.exit(main()) diff --git a/src/skill_seekers/cli/config_extractor.py b/src/skill_seekers/cli/config_extractor.py index 8accbb4..2ca0dca 100644 --- a/src/skill_seekers/cli/config_extractor.py +++ b/src/skill_seekers/cli/config_extractor.py @@ -9,19 +9,20 @@ This is different from C3.2 which extracts config examples from test code. C3.4 focuses on documenting the actual project configuration. """ +import ast import json import logging import re from dataclasses import dataclass, field from pathlib import Path -from typing import Dict, List, Optional, Any, Set, Literal -import ast +from typing import Any, Literal logger = logging.getLogger(__name__) # Optional dependencies try: import yaml + YAML_AVAILABLE = True except ImportError: YAML_AVAILABLE = False @@ -29,10 +30,12 @@ except ImportError: try: import tomli + TOML_AVAILABLE = True except ImportError: try: import toml + TOML_AVAILABLE = True except ImportError: TOML_AVAILABLE = False @@ -42,68 +45,71 @@ except ImportError: @dataclass class ConfigSetting: """Individual configuration setting""" + key: str value: Any value_type: str # 'string', 'integer', 'boolean', 'array', 'object', 'null' - default_value: Optional[Any] = None + default_value: Any | None = None required: bool = False - env_var: Optional[str] = None + env_var: str | None = None description: str = "" - validation: Dict[str, Any] = field(default_factory=dict) - nested_path: List[str] = field(default_factory=list) # For nested configs + validation: dict[str, Any] = field(default_factory=dict) + nested_path: list[str] = field(default_factory=list) # For nested configs @dataclass class ConfigFile: """Represents a configuration file""" + file_path: str relative_path: str config_type: Literal["json", "yaml", "toml", "env", "ini", "python", "javascript", "dockerfile", "docker-compose"] purpose: str # Inferred purpose: database, api, logging, etc. - settings: List[ConfigSetting] = field(default_factory=list) - patterns: List[str] = field(default_factory=list) - raw_content: Optional[str] = None - parse_errors: List[str] = field(default_factory=list) + settings: list[ConfigSetting] = field(default_factory=list) + patterns: list[str] = field(default_factory=list) + raw_content: str | None = None + parse_errors: list[str] = field(default_factory=list) @dataclass class ConfigExtractionResult: """Result of config extraction""" - config_files: List[ConfigFile] = field(default_factory=list) + + config_files: list[ConfigFile] = field(default_factory=list) total_files: int = 0 total_settings: int = 0 - detected_patterns: Dict[str, List[str]] = field(default_factory=dict) # pattern -> files - errors: List[str] = field(default_factory=list) + detected_patterns: dict[str, list[str]] = field(default_factory=dict) # pattern -> files + errors: list[str] = field(default_factory=list) - def to_dict(self) -> Dict: + def to_dict(self) -> dict: """Convert result to dictionary for JSON output""" return { - 'total_files': self.total_files, - 'total_settings': self.total_settings, - 'detected_patterns': self.detected_patterns, - 'config_files': [ + "total_files": self.total_files, + "total_settings": self.total_settings, + "detected_patterns": self.detected_patterns, + "config_files": [ { - 'file_path': cf.file_path, - 'relative_path': cf.relative_path, - 'type': cf.config_type, - 'purpose': cf.purpose, - 'patterns': cf.patterns, - 'settings_count': len(cf.settings), - 'settings': [ + "file_path": cf.file_path, + "relative_path": cf.relative_path, + "type": cf.config_type, + "purpose": cf.purpose, + "patterns": cf.patterns, + "settings_count": len(cf.settings), + "settings": [ { - 'key': s.key, - 'value': s.value, - 'type': s.value_type, - 'env_var': s.env_var, - 'description': s.description, + "key": s.key, + "value": s.value, + "type": s.value_type, + "env_var": s.env_var, + "description": s.description, } for s in cf.settings ], - 'parse_errors': cf.parse_errors, + "parse_errors": cf.parse_errors, } for cf in self.config_files ], - 'errors': self.errors, + "errors": self.errors, } def to_markdown(self) -> str: @@ -115,11 +121,11 @@ class ConfigExtractionResult: # Handle both dict and list formats for detected_patterns if self.detected_patterns: if isinstance(self.detected_patterns, dict): - patterns_str = ', '.join(self.detected_patterns.keys()) + patterns_str = ", ".join(self.detected_patterns.keys()) else: - patterns_str = ', '.join(self.detected_patterns) + patterns_str = ", ".join(self.detected_patterns) else: - patterns_str = 'None' + patterns_str = "None" md += f"**Detected Patterns:** {patterns_str}\n\n" if self.config_files: @@ -148,52 +154,64 @@ class ConfigFileDetector: # Config file patterns by type CONFIG_PATTERNS = { - 'json': { - 'patterns': ['*.json', 'package.json', 'tsconfig.json', 'jsconfig.json'], - 'names': ['config.json', 'settings.json', 'app.json', '.eslintrc.json', '.prettierrc.json'], + "json": { + "patterns": ["*.json", "package.json", "tsconfig.json", "jsconfig.json"], + "names": ["config.json", "settings.json", "app.json", ".eslintrc.json", ".prettierrc.json"], }, - 'yaml': { - 'patterns': ['*.yaml', '*.yml'], - 'names': ['config.yml', 'settings.yml', '.travis.yml', '.gitlab-ci.yml', 'docker-compose.yml'], + "yaml": { + "patterns": ["*.yaml", "*.yml"], + "names": ["config.yml", "settings.yml", ".travis.yml", ".gitlab-ci.yml", "docker-compose.yml"], }, - 'toml': { - 'patterns': ['*.toml'], - 'names': ['pyproject.toml', 'Cargo.toml', 'config.toml'], + "toml": { + "patterns": ["*.toml"], + "names": ["pyproject.toml", "Cargo.toml", "config.toml"], }, - 'env': { - 'patterns': ['.env*', '*.env'], - 'names': ['.env', '.env.example', '.env.local', '.env.production'], + "env": { + "patterns": [".env*", "*.env"], + "names": [".env", ".env.example", ".env.local", ".env.production"], }, - 'ini': { - 'patterns': ['*.ini', '*.cfg'], - 'names': ['config.ini', 'setup.cfg', 'tox.ini'], + "ini": { + "patterns": ["*.ini", "*.cfg"], + "names": ["config.ini", "setup.cfg", "tox.ini"], }, - 'python': { - 'patterns': [], - 'names': ['settings.py', 'config.py', 'configuration.py', 'constants.py'], + "python": { + "patterns": [], + "names": ["settings.py", "config.py", "configuration.py", "constants.py"], }, - 'javascript': { - 'patterns': ['*.config.js', '*.config.ts'], - 'names': ['config.js', 'next.config.js', 'vue.config.js', 'webpack.config.js'], + "javascript": { + "patterns": ["*.config.js", "*.config.ts"], + "names": ["config.js", "next.config.js", "vue.config.js", "webpack.config.js"], }, - 'dockerfile': { - 'patterns': ['Dockerfile*'], - 'names': ['Dockerfile', 'Dockerfile.dev', 'Dockerfile.prod'], + "dockerfile": { + "patterns": ["Dockerfile*"], + "names": ["Dockerfile", "Dockerfile.dev", "Dockerfile.prod"], }, - 'docker-compose': { - 'patterns': ['docker-compose*.yml', 'docker-compose*.yaml'], - 'names': ['docker-compose.yml', 'docker-compose.yaml'], + "docker-compose": { + "patterns": ["docker-compose*.yml", "docker-compose*.yaml"], + "names": ["docker-compose.yml", "docker-compose.yaml"], }, } # Directories to skip SKIP_DIRS = { - 'node_modules', 'venv', 'env', '.venv', '__pycache__', '.git', - 'build', 'dist', '.tox', '.mypy_cache', '.pytest_cache', - 'htmlcov', 'coverage', '.eggs', '*.egg-info' + "node_modules", + "venv", + "env", + ".venv", + "__pycache__", + ".git", + "build", + "dist", + ".tox", + ".mypy_cache", + ".pytest_cache", + "htmlcov", + "coverage", + ".eggs", + "*.egg-info", } - def find_config_files(self, directory: Path, max_files: int = 100) -> List[ConfigFile]: + def find_config_files(self, directory: Path, max_files: int = 100) -> list[ConfigFile]: """ Find all configuration files in directory. @@ -219,7 +237,7 @@ class ConfigFileDetector: file_path=str(file_path), relative_path=relative_path, config_type=config_type, - purpose=self._infer_purpose(file_path, config_type) + purpose=self._infer_purpose(file_path, config_type), ) config_files.append(config_file) found_count += 1 @@ -230,7 +248,7 @@ class ConfigFileDetector: def _walk_directory(self, directory: Path): """Walk directory, skipping excluded directories""" - for item in directory.rglob('*'): + for item in directory.rglob("*"): # Skip directories if item.is_dir(): continue @@ -241,18 +259,18 @@ class ConfigFileDetector: yield item - def _detect_config_type(self, file_path: Path) -> Optional[str]: + def _detect_config_type(self, file_path: Path) -> str | None: """Detect configuration file type""" filename = file_path.name.lower() # Check each config type for config_type, patterns in self.CONFIG_PATTERNS.items(): # Check exact name matches - if filename in patterns['names']: + if filename in patterns["names"]: return config_type # Check pattern matches - for pattern in patterns['patterns']: + for pattern in patterns["patterns"]: if file_path.match(pattern): return config_type @@ -264,43 +282,43 @@ class ConfigFileDetector: filename = file_path.name.lower() # Database configs - if any(word in path_lower for word in ['database', 'db', 'postgres', 'mysql', 'mongo']): - return 'database_configuration' + if any(word in path_lower for word in ["database", "db", "postgres", "mysql", "mongo"]): + return "database_configuration" # API configs - if any(word in path_lower for word in ['api', 'rest', 'graphql', 'endpoint']): - return 'api_configuration' + if any(word in path_lower for word in ["api", "rest", "graphql", "endpoint"]): + return "api_configuration" # Logging configs - if any(word in path_lower for word in ['log', 'logger', 'logging']): - return 'logging_configuration' + if any(word in path_lower for word in ["log", "logger", "logging"]): + return "logging_configuration" # Docker configs - if 'docker' in filename: - return 'docker_configuration' + if "docker" in filename: + return "docker_configuration" # CI/CD configs - if any(word in path_lower for word in ['.travis', '.gitlab', '.github', 'ci', 'cd']): - return 'ci_cd_configuration' + if any(word in path_lower for word in [".travis", ".gitlab", ".github", "ci", "cd"]): + return "ci_cd_configuration" # Package configs - if filename in ['package.json', 'pyproject.toml', 'cargo.toml']: - return 'package_configuration' + if filename in ["package.json", "pyproject.toml", "cargo.toml"]: + return "package_configuration" # TypeScript/JavaScript configs - if filename in ['tsconfig.json', 'jsconfig.json']: - return 'typescript_configuration' + if filename in ["tsconfig.json", "jsconfig.json"]: + return "typescript_configuration" # Framework configs - if 'next.config' in filename or 'vue.config' in filename or 'webpack.config' in filename: - return 'framework_configuration' + if "next.config" in filename or "vue.config" in filename or "webpack.config" in filename: + return "framework_configuration" # Environment configs - if '.env' in filename: - return 'environment_configuration' + if ".env" in filename: + return "environment_configuration" # Default - return 'general_configuration' + return "general_configuration" class ConfigParser: @@ -318,27 +336,27 @@ class ConfigParser: """ try: # Read file content - with open(config_file.file_path, 'r', encoding='utf-8') as f: + with open(config_file.file_path, encoding="utf-8") as f: config_file.raw_content = f.read() # Parse based on type - if config_file.config_type == 'json': + if config_file.config_type == "json": self._parse_json(config_file) - elif config_file.config_type == 'yaml': + elif config_file.config_type == "yaml": self._parse_yaml(config_file) - elif config_file.config_type == 'toml': + elif config_file.config_type == "toml": self._parse_toml(config_file) - elif config_file.config_type == 'env': + elif config_file.config_type == "env": self._parse_env(config_file) - elif config_file.config_type == 'ini': + elif config_file.config_type == "ini": self._parse_ini(config_file) - elif config_file.config_type == 'python': + elif config_file.config_type == "python": self._parse_python_config(config_file) - elif config_file.config_type == 'javascript': + elif config_file.config_type == "javascript": self._parse_javascript_config(config_file) - elif config_file.config_type == 'dockerfile': + elif config_file.config_type == "dockerfile": self._parse_dockerfile(config_file) - elif config_file.config_type == 'docker-compose': + elif config_file.config_type == "docker-compose": self._parse_yaml(config_file) # Docker compose is YAML except Exception as e: @@ -376,10 +394,11 @@ class ConfigParser: return try: - if 'tomli' in globals(): + if "tomli" in globals(): data = tomli.loads(config_file.raw_content) else: import toml + data = toml.loads(config_file.raw_content) self._extract_settings_from_dict(data, config_file) @@ -388,17 +407,17 @@ class ConfigParser: def _parse_env(self, config_file: ConfigFile): """Parse .env file""" - lines = config_file.raw_content.split('\n') + lines = config_file.raw_content.split("\n") for line_num, line in enumerate(lines, 1): line = line.strip() # Skip comments and empty lines - if not line or line.startswith('#'): + if not line or line.startswith("#"): continue # Parse KEY=VALUE - match = re.match(r'([A-Z_][A-Z0-9_]*)\s*=\s*(.+)', line) + match = re.match(r"([A-Z_][A-Z0-9_]*)\s*=\s*(.+)", line) if match: key, value = match.groups() value = value.strip().strip('"').strip("'") @@ -408,7 +427,7 @@ class ConfigParser: value=value, value_type=self._infer_type(value), env_var=key, - description=self._extract_env_description(lines, line_num - 1) + description=self._extract_env_description(lines, line_num - 1), ) config_file.settings.append(setting) @@ -426,7 +445,7 @@ class ConfigParser: key=f"{section}.{key}", value=value, value_type=self._infer_type(value), - nested_path=[section, key] + nested_path=[section, key], ) config_file.settings.append(setting) except Exception as e: @@ -444,7 +463,7 @@ class ConfigParser: key = node.targets[0].id # Skip private variables - if key.startswith('_'): + if key.startswith("_"): continue # Extract value @@ -454,7 +473,7 @@ class ConfigParser: key=key, value=value, value_type=self._infer_type(value), - description=self._extract_python_docstring(node) + description=self._extract_python_docstring(node), ) config_file.settings.append(setting) except (ValueError, TypeError): @@ -469,8 +488,8 @@ class ConfigParser: # Simple regex-based extraction for common patterns patterns = [ r'(?:const|let|var)\s+(\w+)\s*[:=]\s*(["\'])(.*?)\2', # String values - r'(?:const|let|var)\s+(\w+)\s*[:=]\s*(\d+)', # Number values - r'(?:const|let|var)\s+(\w+)\s*[:=]\s*(true|false)', # Boolean values + r"(?:const|let|var)\s+(\w+)\s*[:=]\s*(\d+)", # Number values + r"(?:const|let|var)\s+(\w+)\s*[:=]\s*(true|false)", # Boolean values ] for pattern in patterns: @@ -479,47 +498,36 @@ class ConfigParser: key = match.group(1) value = match.group(3) if len(match.groups()) > 2 else match.group(2) - setting = ConfigSetting( - key=key, - value=value, - value_type=self._infer_type(value) - ) + setting = ConfigSetting(key=key, value=value, value_type=self._infer_type(value)) config_file.settings.append(setting) def _parse_dockerfile(self, config_file: ConfigFile): """Parse Dockerfile configuration""" - lines = config_file.raw_content.split('\n') + lines = config_file.raw_content.split("\n") for line in lines: line = line.strip() # Extract ENV variables - if line.startswith('ENV '): - parts = line[4:].split('=', 1) + if line.startswith("ENV "): + parts = line[4:].split("=", 1) if len(parts) == 2: key, value = parts setting = ConfigSetting( - key=key.strip(), - value=value.strip(), - value_type='string', - env_var=key.strip() + key=key.strip(), value=value.strip(), value_type="string", env_var=key.strip() ) config_file.settings.append(setting) # Extract ARG variables - elif line.startswith('ARG '): - parts = line[4:].split('=', 1) + elif line.startswith("ARG "): + parts = line[4:].split("=", 1) key = parts[0].strip() value = parts[1].strip() if len(parts) == 2 else None - setting = ConfigSetting( - key=key, - value=value, - value_type='string' - ) + setting = ConfigSetting(key=key, value=value, value_type="string") config_file.settings.append(setting) - def _extract_settings_from_dict(self, data: Dict, config_file: ConfigFile, parent_path: List[str] = None): + def _extract_settings_from_dict(self, data: dict, config_file: ConfigFile, parent_path: list[str] = None): """Recursively extract settings from dictionary""" if parent_path is None: parent_path = [] @@ -530,35 +538,35 @@ class ConfigParser: self._extract_settings_from_dict(value, config_file, parent_path + [key]) else: setting = ConfigSetting( - key='.'.join(parent_path + [key]) if parent_path else key, + key=".".join(parent_path + [key]) if parent_path else key, value=value, value_type=self._infer_type(value), - nested_path=parent_path + [key] + nested_path=parent_path + [key], ) config_file.settings.append(setting) def _infer_type(self, value: Any) -> str: """Infer value type""" if value is None: - return 'null' + return "null" elif isinstance(value, bool): - return 'boolean' + return "boolean" elif isinstance(value, int): - return 'integer' + return "integer" elif isinstance(value, float): - return 'number' + return "number" elif isinstance(value, (list, tuple)): - return 'array' + return "array" elif isinstance(value, dict): - return 'object' + return "object" else: - return 'string' + return "string" - def _extract_env_description(self, lines: List[str], line_index: int) -> str: + def _extract_env_description(self, lines: list[str], line_index: int) -> str: """Extract description from comment above env variable""" if line_index > 0: prev_line = lines[line_index - 1].strip() - if prev_line.startswith('#'): + if prev_line.startswith("#"): return prev_line[1:].strip() return "" @@ -573,37 +581,37 @@ class ConfigPatternDetector: # Known configuration patterns KNOWN_PATTERNS = { - 'database_config': { - 'keys': ['host', 'port', 'database', 'user', 'username', 'password', 'db_name'], - 'min_match': 3, + "database_config": { + "keys": ["host", "port", "database", "user", "username", "password", "db_name"], + "min_match": 3, }, - 'api_config': { - 'keys': ['base_url', 'api_key', 'api_secret', 'timeout', 'retry', 'endpoint'], - 'min_match': 2, + "api_config": { + "keys": ["base_url", "api_key", "api_secret", "timeout", "retry", "endpoint"], + "min_match": 2, }, - 'logging_config': { - 'keys': ['level', 'format', 'handler', 'file', 'console', 'log_level'], - 'min_match': 2, + "logging_config": { + "keys": ["level", "format", "handler", "file", "console", "log_level"], + "min_match": 2, }, - 'cache_config': { - 'keys': ['backend', 'ttl', 'timeout', 'max_size', 'redis', 'memcached'], - 'min_match': 2, + "cache_config": { + "keys": ["backend", "ttl", "timeout", "max_size", "redis", "memcached"], + "min_match": 2, }, - 'email_config': { - 'keys': ['smtp_host', 'smtp_port', 'email', 'from_email', 'mail_server'], - 'min_match': 2, + "email_config": { + "keys": ["smtp_host", "smtp_port", "email", "from_email", "mail_server"], + "min_match": 2, }, - 'auth_config': { - 'keys': ['secret_key', 'jwt_secret', 'token', 'oauth', 'authentication'], - 'min_match': 1, + "auth_config": { + "keys": ["secret_key", "jwt_secret", "token", "oauth", "authentication"], + "min_match": 1, }, - 'server_config': { - 'keys': ['host', 'port', 'bind', 'workers', 'threads'], - 'min_match': 2, + "server_config": { + "keys": ["host", "port", "bind", "workers", "threads"], + "min_match": 2, }, } - def detect_patterns(self, config_file: ConfigFile) -> List[str]: + def detect_patterns(self, config_file: ConfigFile) -> list[str]: """ Detect which patterns this config file matches. @@ -620,8 +628,8 @@ class ConfigPatternDetector: # Check against each known pattern for pattern_name, pattern_def in self.KNOWN_PATTERNS.items(): - pattern_keys = {k.lower() for k in pattern_def['keys']} - min_match = pattern_def['min_match'] + pattern_keys = {k.lower() for k in pattern_def["keys"]} + min_match = pattern_def["min_match"] # Count matches matches = len(setting_keys & pattern_keys) @@ -641,11 +649,7 @@ class ConfigExtractor: self.parser = ConfigParser() self.pattern_detector = ConfigPatternDetector() - def extract_from_directory( - self, - directory: Path, - max_files: int = 100 - ) -> ConfigExtractionResult: + def extract_from_directory(self, directory: Path, max_files: int = 100) -> ConfigExtractionResult: """ Extract configuration patterns from directory. @@ -696,35 +700,35 @@ class ConfigExtractor: return result - def to_dict(self, result: ConfigExtractionResult) -> Dict: + def to_dict(self, result: ConfigExtractionResult) -> dict: """Convert result to dictionary for JSON output""" return { - 'total_files': result.total_files, - 'total_settings': result.total_settings, - 'detected_patterns': result.detected_patterns, - 'config_files': [ + "total_files": result.total_files, + "total_settings": result.total_settings, + "detected_patterns": result.detected_patterns, + "config_files": [ { - 'file_path': cf.file_path, - 'relative_path': cf.relative_path, - 'type': cf.config_type, - 'purpose': cf.purpose, - 'patterns': cf.patterns, - 'settings_count': len(cf.settings), - 'settings': [ + "file_path": cf.file_path, + "relative_path": cf.relative_path, + "type": cf.config_type, + "purpose": cf.purpose, + "patterns": cf.patterns, + "settings_count": len(cf.settings), + "settings": [ { - 'key': s.key, - 'value': s.value, - 'type': s.value_type, - 'env_var': s.env_var, - 'description': s.description, + "key": s.key, + "value": s.value, + "type": s.value_type, + "env_var": s.env_var, + "description": s.description, } for s in cf.settings ], - 'parse_errors': cf.parse_errors, + "parse_errors": cf.parse_errors, } for cf in result.config_files ], - 'errors': result.errors, + "errors": result.errors, } @@ -732,19 +736,29 @@ def main(): """CLI entry point for config extraction""" import argparse - parser = argparse.ArgumentParser(description="Extract configuration patterns from codebase with optional AI enhancement") - parser.add_argument('directory', type=Path, help='Directory to analyze') - parser.add_argument('--output', '-o', type=Path, help='Output JSON file') - parser.add_argument('--max-files', type=int, default=100, help='Maximum config files to process') - parser.add_argument('--enhance', action='store_true', help='Enhance with AI analysis (API mode, requires ANTHROPIC_API_KEY)') - parser.add_argument('--enhance-local', action='store_true', help='Enhance with AI analysis (LOCAL mode, uses Claude Code CLI)') - parser.add_argument('--ai-mode', choices=['auto', 'api', 'local', 'none'], default='none', - help='AI enhancement mode: auto (detect), api (Claude API), local (Claude Code CLI), none (disable)') + parser = argparse.ArgumentParser( + description="Extract configuration patterns from codebase with optional AI enhancement" + ) + parser.add_argument("directory", type=Path, help="Directory to analyze") + parser.add_argument("--output", "-o", type=Path, help="Output JSON file") + parser.add_argument("--max-files", type=int, default=100, help="Maximum config files to process") + parser.add_argument( + "--enhance", action="store_true", help="Enhance with AI analysis (API mode, requires ANTHROPIC_API_KEY)" + ) + parser.add_argument( + "--enhance-local", action="store_true", help="Enhance with AI analysis (LOCAL mode, uses Claude Code CLI)" + ) + parser.add_argument( + "--ai-mode", + choices=["auto", "api", "local", "none"], + default="none", + help="AI enhancement mode: auto (detect), api (Claude API), local (Claude Code CLI), none (disable)", + ) args = parser.parse_args() # Setup logging - logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s') + logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s") # Extract extractor = ConfigExtractor() @@ -756,13 +770,14 @@ def main(): # AI Enhancement (if requested) enhance_mode = args.ai_mode if args.enhance: - enhance_mode = 'api' + enhance_mode = "api" elif args.enhance_local: - enhance_mode = 'local' + enhance_mode = "local" - if enhance_mode != 'none': + if enhance_mode != "none": try: from skill_seekers.cli.config_enhancer import ConfigEnhancer + logger.info(f"šŸ¤– Starting AI enhancement (mode: {enhance_mode})...") enhancer = ConfigEnhancer(mode=enhance_mode) output_dict = enhancer.enhance_config_result(output_dict) @@ -774,27 +789,27 @@ def main(): # Output if args.output: - with open(args.output, 'w') as f: + with open(args.output, "w") as f: json.dump(output_dict, f, indent=2) print(f"āœ… Saved config extraction results to: {args.output}") else: print(json.dumps(output_dict, indent=2)) # Summary - print(f"\nšŸ“Š Summary:") + print("\nšŸ“Š Summary:") print(f" Config files found: {result.total_files}") print(f" Total settings: {result.total_settings}") print(f" Detected patterns: {', '.join(result.detected_patterns.keys()) or 'None'}") - if 'ai_enhancements' in output_dict: + if "ai_enhancements" in output_dict: print(f" ✨ AI enhancements: Yes ({enhance_mode} mode)") - insights = output_dict['ai_enhancements'].get('overall_insights', {}) - if insights.get('security_issues_found'): + insights = output_dict["ai_enhancements"].get("overall_insights", {}) + if insights.get("security_issues_found"): print(f" šŸ” Security issues found: {insights['security_issues_found']}") if result.errors: print(f"\nāš ļø Errors: {len(result.errors)}") -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/src/skill_seekers/cli/config_manager.py b/src/skill_seekers/cli/config_manager.py index 73609a1..13fbdd4 100644 --- a/src/skill_seekers/cli/config_manager.py +++ b/src/skill_seekers/cli/config_manager.py @@ -8,10 +8,10 @@ Provides secure storage with file permissions and auto-detection capabilities. import json import os import stat +import sys from datetime import datetime, timedelta from pathlib import Path -from typing import Dict, List, Optional, Any -import sys +from typing import Any class ConfigManager: @@ -26,28 +26,11 @@ class ConfigManager: # Default configuration DEFAULT_CONFIG = { "version": "1.0", - "github": { - "default_profile": None, - "profiles": {} - }, - "rate_limit": { - "default_timeout_minutes": 30, - "auto_switch_profiles": True, - "show_countdown": True - }, - "resume": { - "auto_save_interval_seconds": 60, - "keep_progress_days": 7 - }, - "api_keys": { - "anthropic": None, - "google": None, - "openai": None - }, - "first_run": { - "completed": False, - "version": "2.7.0" - } + "github": {"default_profile": None, "profiles": {}}, + "rate_limit": {"default_timeout_minutes": 30, "auto_switch_profiles": True, "show_countdown": True}, + "resume": {"auto_save_interval_seconds": 60, "keep_progress_days": 7}, + "api_keys": {"anthropic": None, "google": None, "openai": None}, + "first_run": {"completed": False, "version": "2.7.0"}, } def __init__(self): @@ -65,25 +48,26 @@ class ConfigManager: # Set directory permissions to 700 (rwx------) directory.chmod(stat.S_IRWXU) - def _load_config(self) -> Dict[str, Any]: + def _load_config(self) -> dict[str, Any]: """Load configuration from file or create default.""" if not self.config_file.exists(): return self.DEFAULT_CONFIG.copy() try: - with open(self.config_file, 'r') as f: + with open(self.config_file) as f: config = json.load(f) # Merge with defaults for any missing keys config = self._merge_with_defaults(config) return config - except (json.JSONDecodeError, IOError) as e: + except (OSError, json.JSONDecodeError) as e: print(f"āš ļø Warning: Could not load config file: {e}") - print(f" Using default configuration.") + print(" Using default configuration.") return self.DEFAULT_CONFIG.copy() - def _merge_with_defaults(self, config: Dict[str, Any]) -> Dict[str, Any]: + def _merge_with_defaults(self, config: dict[str, Any]) -> dict[str, Any]: """Merge loaded config with defaults to ensure all keys exist.""" + def deep_merge(default: dict, custom: dict) -> dict: result = default.copy() for key, value in custom.items(): @@ -98,13 +82,13 @@ class ConfigManager: def save_config(self): """Save configuration to file with secure permissions.""" try: - with open(self.config_file, 'w') as f: + with open(self.config_file, "w") as f: json.dump(self.config, f, indent=2) # Set file permissions to 600 (rw-------) self.config_file.chmod(stat.S_IRUSR | stat.S_IWUSR) - except IOError as e: + except OSError as e: print(f"āŒ Error saving config: {e}") sys.exit(1) @@ -117,7 +101,7 @@ class ConfigManager: description: str = "", rate_limit_strategy: str = "prompt", timeout_minutes: int = 30, - set_as_default: bool = False + set_as_default: bool = False, ): """Add a new GitHub profile.""" if not name: @@ -131,7 +115,7 @@ class ConfigManager: "description": description, "rate_limit_strategy": rate_limit_strategy, "timeout_minutes": timeout_minutes, - "added_at": datetime.now().isoformat() + "added_at": datetime.now().isoformat(), } self.config["github"]["profiles"][name] = profile @@ -142,7 +126,7 @@ class ConfigManager: self.save_config() print(f"āœ… Added GitHub profile: {name}") if set_as_default: - print(f"āœ… Set as default profile") + print("āœ… Set as default profile") def remove_github_profile(self, name: str): """Remove a GitHub profile.""" @@ -159,7 +143,7 @@ class ConfigManager: self.save_config() print(f"āœ… Removed GitHub profile: {name}") - def list_github_profiles(self) -> List[Dict[str, Any]]: + def list_github_profiles(self) -> list[dict[str, Any]]: """List all GitHub profiles.""" profiles = [] default = self.config["github"]["default_profile"] @@ -171,17 +155,13 @@ class ConfigManager: "strategy": data.get("rate_limit_strategy", "prompt"), "timeout": data.get("timeout_minutes", 30), "is_default": name == default, - "added_at": data.get("added_at", "Unknown") + "added_at": data.get("added_at", "Unknown"), } profiles.append(profile_info) return profiles - def get_github_token( - self, - profile_name: Optional[str] = None, - repo_url: Optional[str] = None - ) -> Optional[str]: + def get_github_token(self, profile_name: str | None = None, repo_url: str | None = None) -> str | None: """ Get GitHub token with smart fallback chain. @@ -214,14 +194,14 @@ class ConfigManager: # 4. No token available return None - def get_profile_for_token(self, token: str) -> Optional[str]: + def get_profile_for_token(self, token: str) -> str | None: """Get profile name for a given token.""" for name, profile in self.config["github"]["profiles"].items(): if profile["token"] == token: return name return None - def get_next_profile(self, current_token: str) -> Optional[tuple]: + def get_next_profile(self, current_token: str) -> tuple | None: """ Get next available profile for rate limit switching. @@ -248,7 +228,7 @@ class ConfigManager: name, profile = profiles[next_idx] return (name, profile["token"]) - def get_rate_limit_strategy(self, token: Optional[str] = None) -> str: + def get_rate_limit_strategy(self, token: str | None = None) -> str: """Get rate limit strategy for a token (or default).""" if token: profile_name = self.get_profile_for_token(token) @@ -259,7 +239,7 @@ class ConfigManager: # Default strategy return "prompt" - def get_timeout_minutes(self, token: Optional[str] = None) -> int: + def get_timeout_minutes(self, token: str | None = None) -> int: """Get timeout minutes for a token (or default).""" if token: profile_name = self.get_profile_for_token(token) @@ -280,7 +260,7 @@ class ConfigManager: self.save_config() print(f"āœ… Set {provider.capitalize()} API key") - def get_api_key(self, provider: str) -> Optional[str]: + def get_api_key(self, provider: str) -> str | None: """ Get API key with environment variable fallback. @@ -289,11 +269,7 @@ class ConfigManager: 2. Config file """ # Check environment first - env_map = { - "anthropic": "ANTHROPIC_API_KEY", - "google": "GOOGLE_API_KEY", - "openai": "OPENAI_API_KEY" - } + env_map = {"anthropic": "ANTHROPIC_API_KEY", "google": "GOOGLE_API_KEY", "openai": "OPENAI_API_KEY"} env_var = env_map.get(provider) if env_var: @@ -306,19 +282,19 @@ class ConfigManager: # Progress Management - def save_progress(self, job_id: str, progress_data: Dict[str, Any]): + def save_progress(self, job_id: str, progress_data: dict[str, Any]): """Save progress for a job.""" progress_file = self.progress_dir / f"{job_id}.json" progress_data["last_updated"] = datetime.now().isoformat() - with open(progress_file, 'w') as f: + with open(progress_file, "w") as f: json.dump(progress_data, f, indent=2) # Set file permissions to 600 progress_file.chmod(stat.S_IRUSR | stat.S_IWUSR) - def load_progress(self, job_id: str) -> Optional[Dict[str, Any]]: + def load_progress(self, job_id: str) -> dict[str, Any] | None: """Load progress for a job.""" progress_file = self.progress_dir / f"{job_id}.json" @@ -326,29 +302,31 @@ class ConfigManager: return None try: - with open(progress_file, 'r') as f: + with open(progress_file) as f: return json.load(f) - except (json.JSONDecodeError, IOError): + except (OSError, json.JSONDecodeError): return None - def list_resumable_jobs(self) -> List[Dict[str, Any]]: + def list_resumable_jobs(self) -> list[dict[str, Any]]: """List all resumable jobs.""" jobs = [] for progress_file in self.progress_dir.glob("*.json"): try: - with open(progress_file, 'r') as f: + with open(progress_file) as f: data = json.load(f) if data.get("can_resume", False): - jobs.append({ - "job_id": data.get("job_id", progress_file.stem), - "started_at": data.get("started_at"), - "command": data.get("command"), - "progress": data.get("progress", {}), - "last_updated": data.get("last_updated") - }) - except (json.JSONDecodeError, IOError): + jobs.append( + { + "job_id": data.get("job_id", progress_file.stem), + "started_at": data.get("started_at"), + "command": data.get("command"), + "progress": data.get("progress", {}), + "last_updated": data.get("last_updated"), + } + ) + except (OSError, json.JSONDecodeError): continue # Sort by last updated (newest first) @@ -447,8 +425,8 @@ class ConfigManager: print(f"\nšŸ“¦ Resumable Jobs: {len(jobs)}") for job in jobs[:5]: # Show max 5 print(f" • {job['job_id']}") - if job.get('progress'): - phase = job['progress'].get('phase', 'unknown') + if job.get("progress"): + phase = job["progress"].get("phase", "unknown") print(f" Phase: {phase}, Last: {job['last_updated']}") diff --git a/src/skill_seekers/cli/config_validator.py b/src/skill_seekers/cli/config_validator.py index 65c5c65..154bdfb 100644 --- a/src/skill_seekers/cli/config_validator.py +++ b/src/skill_seekers/cli/config_validator.py @@ -12,8 +12,8 @@ Also provides backward compatibility detection for legacy configs. import json import logging -from typing import Dict, Any, List, Optional, Union from pathlib import Path +from typing import Any logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) @@ -25,18 +25,18 @@ class ConfigValidator: """ # Valid source types - VALID_SOURCE_TYPES = {'documentation', 'github', 'pdf'} + VALID_SOURCE_TYPES = {"documentation", "github", "pdf"} # Valid merge modes - VALID_MERGE_MODES = {'rule-based', 'claude-enhanced'} + VALID_MERGE_MODES = {"rule-based", "claude-enhanced"} # Valid code analysis depth levels - VALID_DEPTH_LEVELS = {'surface', 'deep', 'full'} + VALID_DEPTH_LEVELS = {"surface", "deep", "full"} # Valid AI modes for C3.x enhancement - VALID_AI_MODES = {'auto', 'api', 'local', 'none'} + VALID_AI_MODES = {"auto", "api", "local", "none"} - def __init__(self, config_or_path: Union[Dict[str, Any], str]): + def __init__(self, config_or_path: dict[str, Any] | str): """ Initialize validator with config dict or file path. @@ -51,10 +51,10 @@ class ConfigValidator: self.config = self._load_config() self.is_unified = self._detect_format() - def _load_config(self) -> Dict[str, Any]: + def _load_config(self) -> dict[str, Any]: """Load JSON config file.""" try: - with open(self.config_path, 'r', encoding='utf-8') as f: + with open(self.config_path, encoding="utf-8") as f: return json.load(f) except FileNotFoundError: raise ValueError(f"Config file not found: {self.config_path}") @@ -69,7 +69,7 @@ class ConfigValidator: True if unified format (has 'sources' array) False if legacy format """ - return 'sources' in self.config and isinstance(self.config['sources'], list) + return "sources" in self.config and isinstance(self.config["sources"], list) def validate(self) -> bool: """ @@ -91,17 +91,17 @@ class ConfigValidator: logger.info("Validating unified config format...") # Required top-level fields - if 'name' not in self.config: + if "name" not in self.config: raise ValueError("Missing required field: 'name'") - if 'description' not in self.config: + if "description" not in self.config: raise ValueError("Missing required field: 'description'") - if 'sources' not in self.config: + if "sources" not in self.config: raise ValueError("Missing required field: 'sources'") # Validate sources array - sources = self.config['sources'] + sources = self.config["sources"] if not isinstance(sources, list): raise ValueError("'sources' must be an array") @@ -110,7 +110,7 @@ class ConfigValidator: raise ValueError("'sources' array cannot be empty") # Validate merge_mode (optional) - merge_mode = self.config.get('merge_mode', 'rule-based') + merge_mode = self.config.get("merge_mode", "rule-based") if merge_mode not in self.VALID_MERGE_MODES: raise ValueError(f"Invalid merge_mode: '{merge_mode}'. Must be one of {self.VALID_MERGE_MODES}") @@ -121,56 +121,52 @@ class ConfigValidator: logger.info(f"āœ… Unified config valid: {len(sources)} sources") return True - def _validate_source(self, source: Dict[str, Any], index: int): + def _validate_source(self, source: dict[str, Any], index: int): """Validate individual source configuration.""" # Check source has 'type' field - if 'type' not in source: + if "type" not in source: raise ValueError(f"Source {index}: Missing required field 'type'") - source_type = source['type'] + source_type = source["type"] if source_type not in self.VALID_SOURCE_TYPES: - raise ValueError( - f"Source {index}: Invalid type '{source_type}'. " - f"Must be one of {self.VALID_SOURCE_TYPES}" - ) + raise ValueError(f"Source {index}: Invalid type '{source_type}'. Must be one of {self.VALID_SOURCE_TYPES}") # Type-specific validation - if source_type == 'documentation': + if source_type == "documentation": self._validate_documentation_source(source, index) - elif source_type == 'github': + elif source_type == "github": self._validate_github_source(source, index) - elif source_type == 'pdf': + elif source_type == "pdf": self._validate_pdf_source(source, index) - def _validate_documentation_source(self, source: Dict[str, Any], index: int): + def _validate_documentation_source(self, source: dict[str, Any], index: int): """Validate documentation source configuration.""" - if 'base_url' not in source: + if "base_url" not in source: raise ValueError(f"Source {index} (documentation): Missing required field 'base_url'") # Optional but recommended fields - if 'selectors' not in source: + if "selectors" not in source: logger.warning(f"Source {index} (documentation): No 'selectors' specified, using defaults") - if 'max_pages' in source and not isinstance(source['max_pages'], int): + if "max_pages" in source and not isinstance(source["max_pages"], int): raise ValueError(f"Source {index} (documentation): 'max_pages' must be an integer") - def _validate_github_source(self, source: Dict[str, Any], index: int): + def _validate_github_source(self, source: dict[str, Any], index: int): """Validate GitHub source configuration.""" - if 'repo' not in source: + if "repo" not in source: raise ValueError(f"Source {index} (github): Missing required field 'repo'") # Validate repo format (owner/repo) - repo = source['repo'] - if '/' not in repo: + repo = source["repo"] + if "/" not in repo: raise ValueError( - f"Source {index} (github): Invalid repo format '{repo}'. " - f"Must be 'owner/repo' (e.g., 'facebook/react')" + f"Source {index} (github): Invalid repo format '{repo}'. Must be 'owner/repo' (e.g., 'facebook/react')" ) # Validate code_analysis_depth if specified - if 'code_analysis_depth' in source: - depth = source['code_analysis_depth'] + if "code_analysis_depth" in source: + depth = source["code_analysis_depth"] if depth not in self.VALID_DEPTH_LEVELS: raise ValueError( f"Source {index} (github): Invalid code_analysis_depth '{depth}'. " @@ -178,29 +174,28 @@ class ConfigValidator: ) # Validate max_issues if specified - if 'max_issues' in source and not isinstance(source['max_issues'], int): + if "max_issues" in source and not isinstance(source["max_issues"], int): raise ValueError(f"Source {index} (github): 'max_issues' must be an integer") # Validate enable_codebase_analysis if specified (C3.5) - if 'enable_codebase_analysis' in source and not isinstance(source['enable_codebase_analysis'], bool): + if "enable_codebase_analysis" in source and not isinstance(source["enable_codebase_analysis"], bool): raise ValueError(f"Source {index} (github): 'enable_codebase_analysis' must be a boolean") # Validate ai_mode if specified (C3.5) - if 'ai_mode' in source: - ai_mode = source['ai_mode'] + if "ai_mode" in source: + ai_mode = source["ai_mode"] if ai_mode not in self.VALID_AI_MODES: raise ValueError( - f"Source {index} (github): Invalid ai_mode '{ai_mode}'. " - f"Must be one of {self.VALID_AI_MODES}" + f"Source {index} (github): Invalid ai_mode '{ai_mode}'. Must be one of {self.VALID_AI_MODES}" ) - def _validate_pdf_source(self, source: Dict[str, Any], index: int): + def _validate_pdf_source(self, source: dict[str, Any], index: int): """Validate PDF source configuration.""" - if 'path' not in source: + if "path" not in source: raise ValueError(f"Source {index} (pdf): Missing required field 'path'") # Check if file exists - pdf_path = source['path'] + pdf_path = source["path"] if not Path(pdf_path).exists(): logger.warning(f"Source {index} (pdf): File not found: {pdf_path}") @@ -213,18 +208,18 @@ class ConfigValidator: logger.info("Detected legacy config format (backward compatible)") # Detect which legacy type based on fields - if 'base_url' in self.config: + if "base_url" in self.config: logger.info("Legacy type: documentation") - elif 'repo' in self.config: + elif "repo" in self.config: logger.info("Legacy type: github") - elif 'pdf' in self.config or 'path' in self.config: + elif "pdf" in self.config or "path" in self.config: logger.info("Legacy type: pdf") else: raise ValueError("Cannot detect legacy config type (missing base_url, repo, or pdf)") return True - def convert_legacy_to_unified(self) -> Dict[str, Any]: + def convert_legacy_to_unified(self) -> dict[str, Any]: """ Convert legacy config to unified format. @@ -238,64 +233,50 @@ class ConfigValidator: logger.info("Converting legacy config to unified format...") # Detect legacy type and convert - if 'base_url' in self.config: + if "base_url" in self.config: return self._convert_legacy_documentation() - elif 'repo' in self.config: + elif "repo" in self.config: return self._convert_legacy_github() - elif 'pdf' in self.config or 'path' in self.config: + elif "pdf" in self.config or "path" in self.config: return self._convert_legacy_pdf() else: raise ValueError("Cannot convert: unknown legacy format") - def _convert_legacy_documentation(self) -> Dict[str, Any]: + def _convert_legacy_documentation(self) -> dict[str, Any]: """Convert legacy documentation config to unified.""" unified = { - 'name': self.config.get('name', 'unnamed'), - 'description': self.config.get('description', 'Documentation skill'), - 'merge_mode': 'rule-based', - 'sources': [ - { - 'type': 'documentation', - **{k: v for k, v in self.config.items() - if k not in ['name', 'description']} - } - ] + "name": self.config.get("name", "unnamed"), + "description": self.config.get("description", "Documentation skill"), + "merge_mode": "rule-based", + "sources": [ + {"type": "documentation", **{k: v for k, v in self.config.items() if k not in ["name", "description"]}} + ], } return unified - def _convert_legacy_github(self) -> Dict[str, Any]: + def _convert_legacy_github(self) -> dict[str, Any]: """Convert legacy GitHub config to unified.""" unified = { - 'name': self.config.get('name', 'unnamed'), - 'description': self.config.get('description', 'GitHub repository skill'), - 'merge_mode': 'rule-based', - 'sources': [ - { - 'type': 'github', - **{k: v for k, v in self.config.items() - if k not in ['name', 'description']} - } - ] + "name": self.config.get("name", "unnamed"), + "description": self.config.get("description", "GitHub repository skill"), + "merge_mode": "rule-based", + "sources": [ + {"type": "github", **{k: v for k, v in self.config.items() if k not in ["name", "description"]}} + ], } return unified - def _convert_legacy_pdf(self) -> Dict[str, Any]: + def _convert_legacy_pdf(self) -> dict[str, Any]: """Convert legacy PDF config to unified.""" unified = { - 'name': self.config.get('name', 'unnamed'), - 'description': self.config.get('description', 'PDF document skill'), - 'merge_mode': 'rule-based', - 'sources': [ - { - 'type': 'pdf', - **{k: v for k, v in self.config.items() - if k not in ['name', 'description']} - } - ] + "name": self.config.get("name", "unnamed"), + "description": self.config.get("description", "PDF document skill"), + "merge_mode": "rule-based", + "sources": [{"type": "pdf", **{k: v for k, v in self.config.items() if k not in ["name", "description"]}}], } return unified - def get_sources_by_type(self, source_type: str) -> List[Dict[str, Any]]: + def get_sources_by_type(self, source_type: str) -> list[dict[str, Any]]: """ Get all sources of a specific type. @@ -308,17 +289,17 @@ class ConfigValidator: if not self.is_unified: # For legacy, convert and get sources unified = self.convert_legacy_to_unified() - sources = unified['sources'] + sources = unified["sources"] else: - sources = self.config['sources'] + sources = self.config["sources"] - return [s for s in sources if s.get('type') == source_type] + return [s for s in sources if s.get("type") == source_type] def has_multiple_sources(self) -> bool: """Check if config has multiple sources (requires merging).""" if not self.is_unified: return False - return len(self.config['sources']) > 1 + return len(self.config["sources"]) > 1 def needs_api_merge(self) -> bool: """ @@ -331,13 +312,11 @@ class ConfigValidator: return False has_docs_api = any( - s.get('type') == 'documentation' and s.get('extract_api', True) - for s in self.config['sources'] + s.get("type") == "documentation" and s.get("extract_api", True) for s in self.config["sources"] ) has_github_code = any( - s.get('type') == 'github' and s.get('include_code', False) - for s in self.config['sources'] + s.get("type") == "github" and s.get("include_code", False) for s in self.config["sources"] ) return has_docs_api and has_github_code @@ -361,7 +340,7 @@ def validate_config(config_path: str) -> ConfigValidator: return validator -if __name__ == '__main__': +if __name__ == "__main__": import sys if len(sys.argv) < 2: @@ -373,18 +352,18 @@ if __name__ == '__main__': try: validator = validate_config(config_file) - print(f"\nāœ… Config valid!") + print("\nāœ… Config valid!") print(f" Format: {'Unified' if validator.is_unified else 'Legacy'}") print(f" Name: {validator.config.get('name')}") if validator.is_unified: - sources = validator.config['sources'] + sources = validator.config["sources"] print(f" Sources: {len(sources)}") for i, source in enumerate(sources): - print(f" {i+1}. {source['type']}") + print(f" {i + 1}. {source['type']}") if validator.needs_api_merge(): - merge_mode = validator.config.get('merge_mode', 'rule-based') + merge_mode = validator.config.get("merge_mode", "rule-based") print(f" āš ļø API merge required (mode: {merge_mode})") except ValueError as e: diff --git a/src/skill_seekers/cli/conflict_detector.py b/src/skill_seekers/cli/conflict_detector.py index 5f7d4c2..ef84ff7 100644 --- a/src/skill_seekers/cli/conflict_detector.py +++ b/src/skill_seekers/cli/conflict_detector.py @@ -13,9 +13,9 @@ Used by unified scraper to identify discrepancies before merging. import json import logging -from typing import Dict, List, Any, Optional, Tuple -from dataclasses import dataclass, asdict +from dataclasses import asdict, dataclass from difflib import SequenceMatcher +from typing import Any logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) @@ -24,13 +24,14 @@ logger = logging.getLogger(__name__) @dataclass class Conflict: """Represents a conflict between documentation and code.""" + type: str # 'missing_in_docs', 'missing_in_code', 'signature_mismatch', 'description_mismatch' severity: str # 'low', 'medium', 'high' api_name: str - docs_info: Optional[Dict[str, Any]] = None - code_info: Optional[Dict[str, Any]] = None - difference: Optional[str] = None - suggestion: Optional[str] = None + docs_info: dict[str, Any] | None = None + code_info: dict[str, Any] | None = None + difference: str | None = None + suggestion: str | None = None class ConflictDetector: @@ -38,7 +39,7 @@ class ConflictDetector: Detects conflicts between documentation and code sources. """ - def __init__(self, docs_data: Dict[str, Any], github_data: Dict[str, Any]): + def __init__(self, docs_data: dict[str, Any], github_data: dict[str, Any]): """ Initialize conflict detector. @@ -56,7 +57,7 @@ class ConflictDetector: logger.info(f"Loaded {len(self.docs_apis)} APIs from documentation") logger.info(f"Loaded {len(self.code_apis)} APIs from code") - def _extract_docs_apis(self) -> Dict[str, Dict[str, Any]]: + def _extract_docs_apis(self) -> dict[str, dict[str, Any]]: """ Extract API information from documentation data. @@ -66,42 +67,43 @@ class ConflictDetector: apis = {} # Documentation structure varies, but typically has 'pages' or 'references' - pages = self.docs_data.get('pages', {}) + pages = self.docs_data.get("pages", {}) # Handle both dict and list formats if isinstance(pages, dict): # Format: {url: page_data, ...} for url, page_data in pages.items(): - content = page_data.get('content', '') - title = page_data.get('title', '') + content = page_data.get("content", "") + title = page_data.get("title", "") # Simple heuristic: if title or URL contains "api", "reference", "class", "function" # it might be an API page - if any(keyword in title.lower() or keyword in url.lower() - for keyword in ['api', 'reference', 'class', 'function', 'method']): - + if any( + keyword in title.lower() or keyword in url.lower() + for keyword in ["api", "reference", "class", "function", "method"] + ): # Extract API signatures from content (simplified) extracted_apis = self._parse_doc_content_for_apis(content, url) apis.update(extracted_apis) elif isinstance(pages, list): # Format: [{url: '...', apis: [...]}, ...] for page in pages: - url = page.get('url', '') - page_apis = page.get('apis', []) + url = page.get("url", "") + page_apis = page.get("apis", []) # If APIs are already extracted in the page data for api in page_apis: - api_name = api.get('name', '') + api_name = api.get("name", "") if api_name: apis[api_name] = { - 'parameters': api.get('parameters', []), - 'return_type': api.get('return_type', 'Any'), - 'source_url': url + "parameters": api.get("parameters", []), + "return_type": api.get("return_type", "Any"), + "source_url": url, } return apis - def _parse_doc_content_for_apis(self, content: str, source_url: str) -> Dict[str, Dict]: + def _parse_doc_content_for_apis(self, content: str, source_url: str) -> dict[str, dict]: """ Parse documentation content to extract API signatures. @@ -121,13 +123,13 @@ class ConflictDetector: # Pattern for common API signatures patterns = [ # Python style: def name(params) -> return - r'def\s+(\w+)\s*\(([^)]*)\)(?:\s*->\s*(\w+))?', + r"def\s+(\w+)\s*\(([^)]*)\)(?:\s*->\s*(\w+))?", # JavaScript style: function name(params) - r'function\s+(\w+)\s*\(([^)]*)\)', + r"function\s+(\w+)\s*\(([^)]*)\)", # C++ style: return_type name(params) - r'(\w+)\s+(\w+)\s*\(([^)]*)\)', + r"(\w+)\s+(\w+)\s*\(([^)]*)\)", # Method style: ClassName.method_name(params) - r'(\w+)\.(\w+)\s*\(([^)]*)\)' + r"(\w+)\.(\w+)\s*\(([^)]*)\)", ] for pattern in patterns: @@ -135,17 +137,17 @@ class ConflictDetector: groups = match.groups() # Parse based on pattern matched - if 'def' in pattern: + if "def" in pattern: # Python function name = groups[0] params_str = groups[1] return_type = groups[2] if len(groups) > 2 else None - elif 'function' in pattern: + elif "function" in pattern: # JavaScript function name = groups[0] params_str = groups[1] return_type = None - elif '.' in pattern: + elif "." in pattern: # Class method class_name = groups[0] method_name = groups[1] @@ -162,54 +164,54 @@ class ConflictDetector: params = self._parse_param_string(params_str) apis[name] = { - 'name': name, - 'parameters': params, - 'return_type': return_type, - 'source': source_url, - 'raw_signature': match.group(0) + "name": name, + "parameters": params, + "return_type": return_type, + "source": source_url, + "raw_signature": match.group(0), } return apis - def _parse_param_string(self, params_str: str) -> List[Dict]: + def _parse_param_string(self, params_str: str) -> list[dict]: """Parse parameter string into list of parameter dicts.""" if not params_str.strip(): return [] params = [] - for param in params_str.split(','): + for param in params_str.split(","): param = param.strip() if not param: continue # Try to extract name and type - param_info = {'name': param, 'type': None, 'default': None} + param_info = {"name": param, "type": None, "default": None} # Check for type annotation (: type) - if ':' in param: - parts = param.split(':', 1) - param_info['name'] = parts[0].strip() + if ":" in param: + parts = param.split(":", 1) + param_info["name"] = parts[0].strip() type_part = parts[1].strip() # Check for default value (= value) - if '=' in type_part: - type_str, default_str = type_part.split('=', 1) - param_info['type'] = type_str.strip() - param_info['default'] = default_str.strip() + if "=" in type_part: + type_str, default_str = type_part.split("=", 1) + param_info["type"] = type_str.strip() + param_info["default"] = default_str.strip() else: - param_info['type'] = type_part + param_info["type"] = type_part # Check for default without type (= value) - elif '=' in param: - parts = param.split('=', 1) - param_info['name'] = parts[0].strip() - param_info['default'] = parts[1].strip() + elif "=" in param: + parts = param.split("=", 1) + param_info["name"] = parts[0].strip() + param_info["default"] = parts[1].strip() params.append(param_info) return params - def _extract_code_apis(self) -> Dict[str, Dict[str, Any]]: + def _extract_code_apis(self) -> dict[str, dict[str, Any]]: """ Extract API information from GitHub code analysis. @@ -218,61 +220,61 @@ class ConflictDetector: """ apis = {} - code_analysis = self.github_data.get('code_analysis', {}) + code_analysis = self.github_data.get("code_analysis", {}) if not code_analysis: return apis # Support both 'files' and 'analyzed_files' keys - files = code_analysis.get('files', code_analysis.get('analyzed_files', [])) + files = code_analysis.get("files", code_analysis.get("analyzed_files", [])) for file_info in files: - file_path = file_info.get('file', 'unknown') + file_path = file_info.get("file", "unknown") # Extract classes and their methods - for class_info in file_info.get('classes', []): - class_name = class_info['name'] + for class_info in file_info.get("classes", []): + class_name = class_info["name"] # Add class itself apis[class_name] = { - 'name': class_name, - 'type': 'class', - 'source': file_path, - 'line': class_info.get('line_number'), - 'base_classes': class_info.get('base_classes', []), - 'docstring': class_info.get('docstring') + "name": class_name, + "type": "class", + "source": file_path, + "line": class_info.get("line_number"), + "base_classes": class_info.get("base_classes", []), + "docstring": class_info.get("docstring"), } # Add methods - for method in class_info.get('methods', []): + for method in class_info.get("methods", []): method_name = f"{class_name}.{method['name']}" apis[method_name] = { - 'name': method_name, - 'type': 'method', - 'parameters': method.get('parameters', []), - 'return_type': method.get('return_type'), - 'source': file_path, - 'line': method.get('line_number'), - 'docstring': method.get('docstring'), - 'is_async': method.get('is_async', False) + "name": method_name, + "type": "method", + "parameters": method.get("parameters", []), + "return_type": method.get("return_type"), + "source": file_path, + "line": method.get("line_number"), + "docstring": method.get("docstring"), + "is_async": method.get("is_async", False), } # Extract standalone functions - for func_info in file_info.get('functions', []): - func_name = func_info['name'] + for func_info in file_info.get("functions", []): + func_name = func_info["name"] apis[func_name] = { - 'name': func_name, - 'type': 'function', - 'parameters': func_info.get('parameters', []), - 'return_type': func_info.get('return_type'), - 'source': file_path, - 'line': func_info.get('line_number'), - 'docstring': func_info.get('docstring'), - 'is_async': func_info.get('is_async', False) + "name": func_name, + "type": "function", + "parameters": func_info.get("parameters", []), + "return_type": func_info.get("return_type"), + "source": file_path, + "line": func_info.get("line_number"), + "docstring": func_info.get("docstring"), + "is_async": func_info.get("is_async", False), } return apis - def detect_all_conflicts(self) -> List[Conflict]: + def detect_all_conflicts(self) -> list[Conflict]: """ Detect all types of conflicts. @@ -296,7 +298,7 @@ class ConflictDetector: return conflicts - def _find_missing_in_docs(self) -> List[Conflict]: + def _find_missing_in_docs(self) -> list[Conflict]: """Find APIs that exist in code but not in documentation.""" conflicts = [] @@ -304,40 +306,46 @@ class ConflictDetector: # Simple name matching (can be enhanced with fuzzy matching) if api_name not in self.docs_apis: # Check if it's a private/internal API (often not documented) - is_private = api_name.startswith('_') or '__' in api_name - severity = 'low' if is_private else 'medium' + is_private = api_name.startswith("_") or "__" in api_name + severity = "low" if is_private else "medium" - conflicts.append(Conflict( - type='missing_in_docs', - severity=severity, - api_name=api_name, - code_info=code_info, - difference=f"API exists in code ({code_info['source']}) but not found in documentation", - suggestion="Add documentation for this API" if not is_private else "Consider if this internal API should be documented" - )) + conflicts.append( + Conflict( + type="missing_in_docs", + severity=severity, + api_name=api_name, + code_info=code_info, + difference=f"API exists in code ({code_info['source']}) but not found in documentation", + suggestion="Add documentation for this API" + if not is_private + else "Consider if this internal API should be documented", + ) + ) logger.info(f"Found {len(conflicts)} APIs missing in documentation") return conflicts - def _find_missing_in_code(self) -> List[Conflict]: + def _find_missing_in_code(self) -> list[Conflict]: """Find APIs that are documented but don't exist in code.""" conflicts = [] for api_name, docs_info in self.docs_apis.items(): if api_name not in self.code_apis: - conflicts.append(Conflict( - type='missing_in_code', - severity='high', # This is serious - documented but doesn't exist - api_name=api_name, - docs_info=docs_info, - difference=f"API documented ({docs_info.get('source', 'unknown')}) but not found in code", - suggestion="Update documentation to remove this API, or add it to codebase" - )) + conflicts.append( + Conflict( + type="missing_in_code", + severity="high", # This is serious - documented but doesn't exist + api_name=api_name, + docs_info=docs_info, + difference=f"API documented ({docs_info.get('source', 'unknown')}) but not found in code", + suggestion="Update documentation to remove this API, or add it to codebase", + ) + ) logger.info(f"Found {len(conflicts)} APIs missing in code") return conflicts - def _find_signature_mismatches(self) -> List[Conflict]: + def _find_signature_mismatches(self) -> list[Conflict]: """Find APIs where signature differs between docs and code.""" conflicts = [] @@ -352,41 +360,43 @@ class ConflictDetector: mismatch = self._compare_signatures(docs_info, code_info) if mismatch: - conflicts.append(Conflict( - type='signature_mismatch', - severity=mismatch['severity'], - api_name=api_name, - docs_info=docs_info, - code_info=code_info, - difference=mismatch['difference'], - suggestion=mismatch['suggestion'] - )) + conflicts.append( + Conflict( + type="signature_mismatch", + severity=mismatch["severity"], + api_name=api_name, + docs_info=docs_info, + code_info=code_info, + difference=mismatch["difference"], + suggestion=mismatch["suggestion"], + ) + ) logger.info(f"Found {len(conflicts)} signature mismatches") return conflicts - def _compare_signatures(self, docs_info: Dict, code_info: Dict) -> Optional[Dict]: + def _compare_signatures(self, docs_info: dict, code_info: dict) -> dict | None: """ Compare signatures between docs and code. Returns: Dict with mismatch details if conflict found, None otherwise """ - docs_params = docs_info.get('parameters', []) - code_params = code_info.get('parameters', []) + docs_params = docs_info.get("parameters", []) + code_params = code_info.get("parameters", []) # Compare parameter counts if len(docs_params) != len(code_params): return { - 'severity': 'medium', - 'difference': f"Parameter count mismatch: docs has {len(docs_params)}, code has {len(code_params)}", - 'suggestion': f"Documentation shows {len(docs_params)} parameters, but code has {len(code_params)}" + "severity": "medium", + "difference": f"Parameter count mismatch: docs has {len(docs_params)}, code has {len(code_params)}", + "suggestion": f"Documentation shows {len(docs_params)} parameters, but code has {len(code_params)}", } # Compare parameter names and types for i, (doc_param, code_param) in enumerate(zip(docs_params, code_params)): - doc_name = doc_param.get('name', '') - code_name = code_param.get('name', '') + doc_name = doc_param.get("name", "") + code_name = code_param.get("name", "") # Parameter name mismatch if doc_name != code_name: @@ -394,36 +404,36 @@ class ConflictDetector: similarity = SequenceMatcher(None, doc_name, code_name).ratio() if similarity < 0.8: # Not similar enough return { - 'severity': 'medium', - 'difference': f"Parameter {i+1} name mismatch: '{doc_name}' in docs vs '{code_name}' in code", - 'suggestion': f"Update documentation to use parameter name '{code_name}'" + "severity": "medium", + "difference": f"Parameter {i + 1} name mismatch: '{doc_name}' in docs vs '{code_name}' in code", + "suggestion": f"Update documentation to use parameter name '{code_name}'", } # Type mismatch - doc_type = doc_param.get('type') - code_type = code_param.get('type_hint') + doc_type = doc_param.get("type") + code_type = code_param.get("type_hint") if doc_type and code_type and doc_type != code_type: return { - 'severity': 'low', - 'difference': f"Parameter '{doc_name}' type mismatch: '{doc_type}' in docs vs '{code_type}' in code", - 'suggestion': f"Verify correct type for parameter '{doc_name}'" + "severity": "low", + "difference": f"Parameter '{doc_name}' type mismatch: '{doc_type}' in docs vs '{code_type}' in code", + "suggestion": f"Verify correct type for parameter '{doc_name}'", } # Compare return types if both have them - docs_return = docs_info.get('return_type') - code_return = code_info.get('return_type') + docs_return = docs_info.get("return_type") + code_return = code_info.get("return_type") if docs_return and code_return and docs_return != code_return: return { - 'severity': 'low', - 'difference': f"Return type mismatch: '{docs_return}' in docs vs '{code_return}' in code", - 'suggestion': "Verify correct return type" + "severity": "low", + "difference": f"Return type mismatch: '{docs_return}' in docs vs '{code_return}' in code", + "suggestion": "Verify correct return type", } return None - def generate_summary(self, conflicts: List[Conflict]) -> Dict[str, Any]: + def generate_summary(self, conflicts: list[Conflict]) -> dict[str, Any]: """ Generate summary statistics for conflicts. @@ -434,25 +444,25 @@ class ConflictDetector: Summary dict with statistics """ summary = { - 'total': len(conflicts), - 'by_type': {}, - 'by_severity': {}, - 'apis_affected': len(set(c.api_name for c in conflicts)) + "total": len(conflicts), + "by_type": {}, + "by_severity": {}, + "apis_affected": len(set(c.api_name for c in conflicts)), } # Count by type - for conflict_type in ['missing_in_docs', 'missing_in_code', 'signature_mismatch', 'description_mismatch']: + for conflict_type in ["missing_in_docs", "missing_in_code", "signature_mismatch", "description_mismatch"]: count = sum(1 for c in conflicts if c.type == conflict_type) - summary['by_type'][conflict_type] = count + summary["by_type"][conflict_type] = count # Count by severity - for severity in ['low', 'medium', 'high']: + for severity in ["low", "medium", "high"]: count = sum(1 for c in conflicts if c.severity == severity) - summary['by_severity'][severity] = count + summary["by_severity"][severity] = count return summary - def save_conflicts(self, conflicts: List[Conflict], output_path: str): + def save_conflicts(self, conflicts: list[Conflict], output_path: str): """ Save conflicts to JSON file. @@ -460,18 +470,15 @@ class ConflictDetector: conflicts: List of Conflict objects output_path: Path to output JSON file """ - data = { - 'conflicts': [asdict(c) for c in conflicts], - 'summary': self.generate_summary(conflicts) - } + data = {"conflicts": [asdict(c) for c in conflicts], "summary": self.generate_summary(conflicts)} - with open(output_path, 'w', encoding='utf-8') as f: + with open(output_path, "w", encoding="utf-8") as f: json.dump(data, f, indent=2, ensure_ascii=False) logger.info(f"Conflicts saved to: {output_path}") -if __name__ == '__main__': +if __name__ == "__main__": import sys if len(sys.argv) < 3: @@ -482,10 +489,10 @@ if __name__ == '__main__': github_file = sys.argv[2] # Load data - with open(docs_file, 'r') as f: + with open(docs_file) as f: docs_data = json.load(f) - with open(github_file, 'r') as f: + with open(github_file) as f: github_data = json.load(f) # Detect conflicts @@ -498,16 +505,16 @@ if __name__ == '__main__': print(f" Total conflicts: {summary['total']}") print(f" APIs affected: {summary['apis_affected']}") print("\n By Type:") - for conflict_type, count in summary['by_type'].items(): + for conflict_type, count in summary["by_type"].items(): if count > 0: print(f" {conflict_type}: {count}") print("\n By Severity:") - for severity, count in summary['by_severity'].items(): + for severity, count in summary["by_severity"].items(): if count > 0: - emoji = 'šŸ”“' if severity == 'high' else '🟔' if severity == 'medium' else '🟢' + emoji = "šŸ”“" if severity == "high" else "🟔" if severity == "medium" else "🟢" print(f" {emoji} {severity}: {count}") # Save to file - output_file = 'conflicts.json' + output_file = "conflicts.json" detector.save_conflicts(conflicts, output_file) print(f"\nāœ… Full report saved to: {output_file}") diff --git a/src/skill_seekers/cli/constants.py b/src/skill_seekers/cli/constants.py index 2685e93..87fcb9a 100644 --- a/src/skill_seekers/cli/constants.py +++ b/src/skill_seekers/cli/constants.py @@ -8,7 +8,7 @@ across the CLI tools to improve maintainability and clarity. # Default scraping limits DEFAULT_RATE_LIMIT = 0.5 # seconds between requests -DEFAULT_MAX_PAGES = 500 # maximum pages to scrape +DEFAULT_MAX_PAGES = 500 # maximum pages to scrape DEFAULT_CHECKPOINT_INTERVAL = 1000 # pages between checkpoints DEFAULT_ASYNC_MODE = False # use async mode for parallel scraping (opt-in) @@ -26,7 +26,7 @@ CONTENT_MATCH_POINTS = 1 # points for content keyword match # API-based enhancement limits (uses Anthropic API) API_CONTENT_LIMIT = 100000 # max characters for API enhancement -API_PREVIEW_LIMIT = 40000 # max characters for preview +API_PREVIEW_LIMIT = 40000 # max characters for preview # Local enhancement limits (uses Claude Code Max) LOCAL_CONTENT_LIMIT = 50000 # max characters for local enhancement @@ -36,7 +36,7 @@ LOCAL_PREVIEW_LIMIT = 20000 # max characters for preview # Estimation and discovery settings DEFAULT_MAX_DISCOVERY = 1000 # default max pages to discover -DISCOVERY_THRESHOLD = 10000 # threshold for warnings +DISCOVERY_THRESHOLD = 10000 # threshold for warnings # ===== FILE LIMITS ===== @@ -48,25 +48,25 @@ MAX_CODE_BLOCKS_PER_PAGE = 5 # maximum code blocks to extract per page __all__ = [ # Scraping - 'DEFAULT_RATE_LIMIT', - 'DEFAULT_MAX_PAGES', - 'DEFAULT_CHECKPOINT_INTERVAL', - 'DEFAULT_ASYNC_MODE', - 'CONTENT_PREVIEW_LENGTH', - 'MAX_PAGES_WARNING_THRESHOLD', - 'MIN_CATEGORIZATION_SCORE', - 'URL_MATCH_POINTS', - 'TITLE_MATCH_POINTS', - 'CONTENT_MATCH_POINTS', + "DEFAULT_RATE_LIMIT", + "DEFAULT_MAX_PAGES", + "DEFAULT_CHECKPOINT_INTERVAL", + "DEFAULT_ASYNC_MODE", + "CONTENT_PREVIEW_LENGTH", + "MAX_PAGES_WARNING_THRESHOLD", + "MIN_CATEGORIZATION_SCORE", + "URL_MATCH_POINTS", + "TITLE_MATCH_POINTS", + "CONTENT_MATCH_POINTS", # Enhancement - 'API_CONTENT_LIMIT', - 'API_PREVIEW_LIMIT', - 'LOCAL_CONTENT_LIMIT', - 'LOCAL_PREVIEW_LIMIT', + "API_CONTENT_LIMIT", + "API_PREVIEW_LIMIT", + "LOCAL_CONTENT_LIMIT", + "LOCAL_PREVIEW_LIMIT", # Estimation - 'DEFAULT_MAX_DISCOVERY', - 'DISCOVERY_THRESHOLD', + "DEFAULT_MAX_DISCOVERY", + "DISCOVERY_THRESHOLD", # Limits - 'MAX_REFERENCE_FILES', - 'MAX_CODE_BLOCKS_PER_PAGE', + "MAX_REFERENCE_FILES", + "MAX_CODE_BLOCKS_PER_PAGE", ] diff --git a/src/skill_seekers/cli/dependency_analyzer.py b/src/skill_seekers/cli/dependency_analyzer.py index 17cd422..3df9ac5 100644 --- a/src/skill_seekers/cli/dependency_analyzer.py +++ b/src/skill_seekers/cli/dependency_analyzer.py @@ -37,15 +37,16 @@ Credits: - NetworkX for graph algorithms: https://networkx.org/ """ -import re import ast import logging -from pathlib import Path -from typing import Dict, List, Set, Tuple, Optional, Any +import re from dataclasses import dataclass, field +from pathlib import Path +from typing import Any try: import networkx as nx + NETWORKX_AVAILABLE = True except ImportError: NETWORKX_AVAILABLE = False @@ -56,6 +57,7 @@ logger = logging.getLogger(__name__) @dataclass class DependencyInfo: """Information about a single dependency relationship.""" + source_file: str imported_module: str import_type: str # 'import', 'from', 'require', 'include' @@ -66,10 +68,11 @@ class DependencyInfo: @dataclass class FileNode: """Represents a file node in the dependency graph.""" + file_path: str language: str - dependencies: List[str] = field(default_factory=list) - imported_by: List[str] = field(default_factory=list) + dependencies: list[str] = field(default_factory=list) + imported_by: list[str] = field(default_factory=list) class DependencyAnalyzer: @@ -83,16 +86,13 @@ class DependencyAnalyzer: def __init__(self): """Initialize dependency analyzer.""" if not NETWORKX_AVAILABLE: - raise ImportError( - "NetworkX is required for dependency analysis. " - "Install with: pip install networkx" - ) + raise ImportError("NetworkX is required for dependency analysis. Install with: pip install networkx") self.graph = nx.DiGraph() # Directed graph for dependencies - self.file_dependencies: Dict[str, List[DependencyInfo]] = {} - self.file_nodes: Dict[str, FileNode] = {} + self.file_dependencies: dict[str, list[DependencyInfo]] = {} + self.file_nodes: dict[str, FileNode] = {} - def analyze_file(self, file_path: str, content: str, language: str) -> List[DependencyInfo]: + def analyze_file(self, file_path: str, content: str, language: str) -> list[DependencyInfo]: """ Extract dependencies from a source file. @@ -104,23 +104,23 @@ class DependencyAnalyzer: Returns: List of DependencyInfo objects """ - if language == 'Python': + if language == "Python": deps = self._extract_python_imports(content, file_path) - elif language in ('JavaScript', 'TypeScript'): + elif language in ("JavaScript", "TypeScript"): deps = self._extract_js_imports(content, file_path) - elif language in ('C++', 'C'): + elif language in ("C++", "C"): deps = self._extract_cpp_includes(content, file_path) - elif language == 'C#': + elif language == "C#": deps = self._extract_csharp_imports(content, file_path) - elif language == 'Go': + elif language == "Go": deps = self._extract_go_imports(content, file_path) - elif language == 'Rust': + elif language == "Rust": deps = self._extract_rust_imports(content, file_path) - elif language == 'Java': + elif language == "Java": deps = self._extract_java_imports(content, file_path) - elif language == 'Ruby': + elif language == "Ruby": deps = self._extract_ruby_imports(content, file_path) - elif language == 'PHP': + elif language == "PHP": deps = self._extract_php_imports(content, file_path) else: logger.warning(f"Unsupported language: {language}") @@ -130,15 +130,11 @@ class DependencyAnalyzer: # Create file node imported_modules = [dep.imported_module for dep in deps] - self.file_nodes[file_path] = FileNode( - file_path=file_path, - language=language, - dependencies=imported_modules - ) + self.file_nodes[file_path] = FileNode(file_path=file_path, language=language, dependencies=imported_modules) return deps - def _extract_python_imports(self, content: str, file_path: str) -> List[DependencyInfo]: + def _extract_python_imports(self, content: str, file_path: str) -> list[DependencyInfo]: """ Extract Python import statements using AST. @@ -159,33 +155,37 @@ class DependencyAnalyzer: for node in ast.walk(tree): if isinstance(node, ast.Import): for alias in node.names: - deps.append(DependencyInfo( - source_file=file_path, - imported_module=alias.name, - import_type='import', - is_relative=False, - line_number=node.lineno - )) + deps.append( + DependencyInfo( + source_file=file_path, + imported_module=alias.name, + import_type="import", + is_relative=False, + line_number=node.lineno, + ) + ) elif isinstance(node, ast.ImportFrom): - module = node.module or '' + module = node.module or "" is_relative = node.level > 0 # Handle relative imports if is_relative: - module = '.' * node.level + module + module = "." * node.level + module - deps.append(DependencyInfo( - source_file=file_path, - imported_module=module, - import_type='from', - is_relative=is_relative, - line_number=node.lineno - )) + deps.append( + DependencyInfo( + source_file=file_path, + imported_module=module, + import_type="from", + is_relative=is_relative, + line_number=node.lineno, + ) + ) return deps - def _extract_js_imports(self, content: str, file_path: str) -> List[DependencyInfo]: + def _extract_js_imports(self, content: str, file_path: str) -> list[DependencyInfo]: """ Extract JavaScript/TypeScript import statements. @@ -202,35 +202,39 @@ class DependencyAnalyzer: import_pattern = r"import\s+(?:[\w\s{},*]+\s+from\s+)?['\"]([^'\"]+)['\"]" for match in re.finditer(import_pattern, content): module = match.group(1) - line_num = content[:match.start()].count('\n') + 1 - is_relative = module.startswith('.') or module.startswith('/') + line_num = content[: match.start()].count("\n") + 1 + is_relative = module.startswith(".") or module.startswith("/") - deps.append(DependencyInfo( - source_file=file_path, - imported_module=module, - import_type='import', - is_relative=is_relative, - line_number=line_num - )) + deps.append( + DependencyInfo( + source_file=file_path, + imported_module=module, + import_type="import", + is_relative=is_relative, + line_number=line_num, + ) + ) # CommonJS requires: require('module') require_pattern = r"require\s*\(['\"]([^'\"]+)['\"]\)" for match in re.finditer(require_pattern, content): module = match.group(1) - line_num = content[:match.start()].count('\n') + 1 - is_relative = module.startswith('.') or module.startswith('/') + line_num = content[: match.start()].count("\n") + 1 + is_relative = module.startswith(".") or module.startswith("/") - deps.append(DependencyInfo( - source_file=file_path, - imported_module=module, - import_type='require', - is_relative=is_relative, - line_number=line_num - )) + deps.append( + DependencyInfo( + source_file=file_path, + imported_module=module, + import_type="require", + is_relative=is_relative, + line_number=line_num, + ) + ) return deps - def _extract_cpp_includes(self, content: str, file_path: str) -> List[DependencyInfo]: + def _extract_cpp_includes(self, content: str, file_path: str) -> list[DependencyInfo]: """ Extract C++ #include directives. @@ -244,22 +248,24 @@ class DependencyAnalyzer: include_pattern = r'#include\s+[<"]([^>"]+)[>"]' for match in re.finditer(include_pattern, content): header = match.group(1) - line_num = content[:match.start()].count('\n') + 1 + line_num = content[: match.start()].count("\n") + 1 # Headers with "" are usually local, <> are system headers is_relative = '"' in match.group(0) - deps.append(DependencyInfo( - source_file=file_path, - imported_module=header, - import_type='include', - is_relative=is_relative, - line_number=line_num - )) + deps.append( + DependencyInfo( + source_file=file_path, + imported_module=header, + import_type="include", + is_relative=is_relative, + line_number=line_num, + ) + ) return deps - def _extract_csharp_imports(self, content: str, file_path: str) -> List[DependencyInfo]: + def _extract_csharp_imports(self, content: str, file_path: str) -> list[DependencyInfo]: """ Extract C# using statements. @@ -275,27 +281,29 @@ class DependencyAnalyzer: deps = [] # Match using statements: using [static] Namespace[.Type]; - using_pattern = r'using\s+(?:static\s+)?(?:(\w+)\s*=\s*)?([A-Za-z_][\w.]*)\s*;' + using_pattern = r"using\s+(?:static\s+)?(?:(\w+)\s*=\s*)?([A-Za-z_][\w.]*)\s*;" for match in re.finditer(using_pattern, content): alias = match.group(1) # Optional alias namespace = match.group(2) - line_num = content[:match.start()].count('\n') + 1 + line_num = content[: match.start()].count("\n") + 1 # Skip 'using' statements for IDisposable (using var x = ...) - if '=' in match.group(0) and not alias: + if "=" in match.group(0) and not alias: continue - deps.append(DependencyInfo( - source_file=file_path, - imported_module=namespace, - import_type='using', - is_relative=False, # C# uses absolute namespaces - line_number=line_num - )) + deps.append( + DependencyInfo( + source_file=file_path, + imported_module=namespace, + import_type="using", + is_relative=False, # C# uses absolute namespaces + line_number=line_num, + ) + ) return deps - def _extract_go_imports(self, content: str, file_path: str) -> List[DependencyInfo]: + def _extract_go_imports(self, content: str, file_path: str) -> list[DependencyInfo]: """ Extract Go import statements. @@ -314,21 +322,23 @@ class DependencyAnalyzer: for match in re.finditer(single_import_pattern, content): alias = match.group(1) # Optional alias package = match.group(2) - line_num = content[:match.start()].count('\n') + 1 + line_num = content[: match.start()].count("\n") + 1 # Check if relative (starts with ./ or ../) - is_relative = package.startswith('./') + is_relative = package.startswith("./") - deps.append(DependencyInfo( - source_file=file_path, - imported_module=package, - import_type='import', - is_relative=is_relative, - line_number=line_num - )) + deps.append( + DependencyInfo( + source_file=file_path, + imported_module=package, + import_type="import", + is_relative=is_relative, + line_number=line_num, + ) + ) # Multi-import block: import ( ... ) - multi_import_pattern = r'import\s*\((.*?)\)' + multi_import_pattern = r"import\s*\((.*?)\)" for match in re.finditer(multi_import_pattern, content, re.DOTALL): block = match.group(1) block_start = match.start() @@ -338,21 +348,23 @@ class DependencyAnalyzer: for line_match in re.finditer(import_line_pattern, block): alias = line_match.group(1) package = line_match.group(2) - line_num = content[:block_start + line_match.start()].count('\n') + 1 + line_num = content[: block_start + line_match.start()].count("\n") + 1 - is_relative = package.startswith('./') + is_relative = package.startswith("./") - deps.append(DependencyInfo( - source_file=file_path, - imported_module=package, - import_type='import', - is_relative=is_relative, - line_number=line_num - )) + deps.append( + DependencyInfo( + source_file=file_path, + imported_module=package, + import_type="import", + is_relative=is_relative, + line_number=line_num, + ) + ) return deps - def _extract_rust_imports(self, content: str, file_path: str) -> List[DependencyInfo]: + def _extract_rust_imports(self, content: str, file_path: str) -> list[DependencyInfo]: """ Extract Rust use statements. @@ -369,43 +381,47 @@ class DependencyAnalyzer: # Match use statements: use path::to::item; (including curly braces with spaces) # This pattern matches: use word::word; or use word::{item, item}; - use_pattern = r'use\s+([\w:{}]+(?:\s*,\s*[\w:{}]+)*|[\w:]+::\{[^}]+\})\s*;' + use_pattern = r"use\s+([\w:{}]+(?:\s*,\s*[\w:{}]+)*|[\w:]+::\{[^}]+\})\s*;" for match in re.finditer(use_pattern, content): module_path = match.group(1) - line_num = content[:match.start()].count('\n') + 1 + line_num = content[: match.start()].count("\n") + 1 # Determine if relative - is_relative = module_path.startswith(('self::', 'super::')) + is_relative = module_path.startswith(("self::", "super::")) # Handle curly brace imports (use std::{io, fs}) - if '{' in module_path: + if "{" in module_path: # Extract base path - base_path = module_path.split('{')[0].rstrip(':') + base_path = module_path.split("{")[0].rstrip(":") # Extract items inside braces - items_match = re.search(r'\{([^}]+)\}', module_path) + items_match = re.search(r"\{([^}]+)\}", module_path) if items_match: - items = [item.strip() for item in items_match.group(1).split(',')] + items = [item.strip() for item in items_match.group(1).split(",")] for item in items: full_path = f"{base_path}::{item}" if base_path else item - deps.append(DependencyInfo( - source_file=file_path, - imported_module=full_path, - import_type='use', - is_relative=is_relative, - line_number=line_num - )) + deps.append( + DependencyInfo( + source_file=file_path, + imported_module=full_path, + import_type="use", + is_relative=is_relative, + line_number=line_num, + ) + ) else: - deps.append(DependencyInfo( - source_file=file_path, - imported_module=module_path, - import_type='use', - is_relative=is_relative, - line_number=line_num - )) + deps.append( + DependencyInfo( + source_file=file_path, + imported_module=module_path, + import_type="use", + is_relative=is_relative, + line_number=line_num, + ) + ) return deps - def _extract_java_imports(self, content: str, file_path: str) -> List[DependencyInfo]: + def _extract_java_imports(self, content: str, file_path: str) -> list[DependencyInfo]: """ Extract Java import statements. @@ -420,22 +436,24 @@ class DependencyAnalyzer: deps = [] # Match import statements: import [static] package.Class; - import_pattern = r'import\s+(?:static\s+)?([A-Za-z_][\w.]*(?:\.\*)?)\s*;' + import_pattern = r"import\s+(?:static\s+)?([A-Za-z_][\w.]*(?:\.\*)?)\s*;" for match in re.finditer(import_pattern, content): import_path = match.group(1) - line_num = content[:match.start()].count('\n') + 1 + line_num = content[: match.start()].count("\n") + 1 - deps.append(DependencyInfo( - source_file=file_path, - imported_module=import_path, - import_type='import', - is_relative=False, # Java uses absolute package names - line_number=line_num - )) + deps.append( + DependencyInfo( + source_file=file_path, + imported_module=import_path, + import_type="import", + is_relative=False, # Java uses absolute package names + line_number=line_num, + ) + ) return deps - def _extract_ruby_imports(self, content: str, file_path: str) -> List[DependencyInfo]: + def _extract_ruby_imports(self, content: str, file_path: str) -> list[DependencyInfo]: """ Extract Ruby require/require_relative/load statements. @@ -453,47 +471,53 @@ class DependencyAnalyzer: require_pattern = r"require\s+['\"]([^'\"]+)['\"]" for match in re.finditer(require_pattern, content): module = match.group(1) - line_num = content[:match.start()].count('\n') + 1 + line_num = content[: match.start()].count("\n") + 1 - deps.append(DependencyInfo( - source_file=file_path, - imported_module=module, - import_type='require', - is_relative=False, # require looks in load path - line_number=line_num - )) + deps.append( + DependencyInfo( + source_file=file_path, + imported_module=module, + import_type="require", + is_relative=False, # require looks in load path + line_number=line_num, + ) + ) # Match require_relative: require_relative 'file' require_relative_pattern = r"require_relative\s+['\"]([^'\"]+)['\"]" for match in re.finditer(require_relative_pattern, content): module = match.group(1) - line_num = content[:match.start()].count('\n') + 1 + line_num = content[: match.start()].count("\n") + 1 - deps.append(DependencyInfo( - source_file=file_path, - imported_module=module, - import_type='require_relative', - is_relative=True, - line_number=line_num - )) + deps.append( + DependencyInfo( + source_file=file_path, + imported_module=module, + import_type="require_relative", + is_relative=True, + line_number=line_num, + ) + ) # Match load: load 'script.rb' load_pattern = r"load\s+['\"]([^'\"]+)['\"]" for match in re.finditer(load_pattern, content): module = match.group(1) - line_num = content[:match.start()].count('\n') + 1 + line_num = content[: match.start()].count("\n") + 1 - deps.append(DependencyInfo( - source_file=file_path, - imported_module=module, - import_type='load', - is_relative=True, # load is usually relative - line_number=line_num - )) + deps.append( + DependencyInfo( + source_file=file_path, + imported_module=module, + import_type="load", + is_relative=True, # load is usually relative + line_number=line_num, + ) + ) return deps - def _extract_php_imports(self, content: str, file_path: str) -> List[DependencyInfo]: + def _extract_php_imports(self, content: str, file_path: str) -> list[DependencyInfo]: """ Extract PHP require/include/use statements. @@ -513,35 +537,39 @@ class DependencyAnalyzer: require_pattern = r"(?:require|include)(?:_once)?\s+['\"]([^'\"]+)['\"]" for match in re.finditer(require_pattern, content): module = match.group(1) - line_num = content[:match.start()].count('\n') + 1 + line_num = content[: match.start()].count("\n") + 1 # Determine import type - import_type = 'require' if 'require' in match.group(0) else 'include' + import_type = "require" if "require" in match.group(0) else "include" # PHP file paths are relative by default - is_relative = not module.startswith(('/', 'http://', 'https://')) + is_relative = not module.startswith(("/", "http://", "https://")) - deps.append(DependencyInfo( - source_file=file_path, - imported_module=module, - import_type=import_type, - is_relative=is_relative, - line_number=line_num - )) + deps.append( + DependencyInfo( + source_file=file_path, + imported_module=module, + import_type=import_type, + is_relative=is_relative, + line_number=line_num, + ) + ) # Match namespace use: use Namespace\Class; - use_pattern = r'use\s+([A-Za-z_][\w\\]*)\s*(?:as\s+\w+)?\s*;' + use_pattern = r"use\s+([A-Za-z_][\w\\]*)\s*(?:as\s+\w+)?\s*;" for match in re.finditer(use_pattern, content): namespace = match.group(1) - line_num = content[:match.start()].count('\n') + 1 + line_num = content[: match.start()].count("\n") + 1 - deps.append(DependencyInfo( - source_file=file_path, - imported_module=namespace, - import_type='use', - is_relative=False, # Namespaces are absolute - line_number=line_num - )) + deps.append( + DependencyInfo( + source_file=file_path, + imported_module=namespace, + import_type="use", + is_relative=False, # Namespaces are absolute + line_number=line_num, + ) + ) return deps @@ -566,12 +594,7 @@ class DependencyAnalyzer: if target and target in self.file_nodes: # Add edge from source to dependency - self.graph.add_edge( - file_path, - target, - import_type=dep.import_type, - line_number=dep.line_number - ) + self.graph.add_edge(file_path, target, import_type=dep.import_type, line_number=dep.line_number) # Update imported_by lists if target in self.file_nodes: @@ -579,7 +602,7 @@ class DependencyAnalyzer: return self.graph - def _resolve_import(self, source_file: str, imported_module: str, is_relative: bool) -> Optional[str]: + def _resolve_import(self, source_file: str, imported_module: str, is_relative: bool) -> str | None: """ Resolve import statement to actual file path. @@ -609,7 +632,7 @@ class DependencyAnalyzer: return None - def detect_cycles(self) -> List[List[str]]: + def detect_cycles(self) -> list[list[str]]: """ Detect circular dependencies in the graph. @@ -627,7 +650,7 @@ class DependencyAnalyzer: logger.error(f"Error detecting cycles: {e}") return [] - def get_strongly_connected_components(self) -> List[Set[str]]: + def get_strongly_connected_components(self) -> list[set[str]]: """ Get strongly connected components (groups of mutually dependent files). @@ -645,13 +668,14 @@ class DependencyAnalyzer: """ try: from networkx.drawing.nx_pydot import write_dot + write_dot(self.graph, output_path) logger.info(f"Exported graph to DOT format: {output_path}") except ImportError: logger.warning("pydot not installed - cannot export to DOT format") logger.warning("Install with: pip install pydot") - def export_json(self) -> Dict[str, Any]: + def export_json(self) -> dict[str, Any]: """ Export graph as JSON structure. @@ -659,22 +683,19 @@ class DependencyAnalyzer: Dictionary with nodes and edges """ return { - 'nodes': [ - { - 'file': node, - 'language': data.get('language', 'Unknown') - } + "nodes": [ + {"file": node, "language": data.get("language", "Unknown")} for node, data in self.graph.nodes(data=True) ], - 'edges': [ + "edges": [ { - 'source': source, - 'target': target, - 'import_type': data.get('import_type', 'unknown'), - 'line_number': data.get('line_number', 0) + "source": source, + "target": target, + "import_type": data.get("import_type", "unknown"), + "line_number": data.get("line_number", 0), } for source, target, data in self.graph.edges(data=True) - ] + ], } def export_mermaid(self) -> str: @@ -684,7 +705,7 @@ class DependencyAnalyzer: Returns: Mermaid diagram as string """ - lines = ['graph TD'] + lines = ["graph TD"] # Create node labels (shorten file paths for readability) node_ids = {} @@ -700,9 +721,9 @@ class DependencyAnalyzer: target_id = node_ids[target] lines.append(f" {source_id} --> {target_id}") - return '\n'.join(lines) + return "\n".join(lines) - def get_statistics(self) -> Dict[str, Any]: + def get_statistics(self) -> dict[str, Any]: """ Get graph statistics. @@ -710,20 +731,15 @@ class DependencyAnalyzer: Dictionary with various statistics """ return { - 'total_files': self.graph.number_of_nodes(), - 'total_dependencies': self.graph.number_of_edges(), - 'circular_dependencies': len(self.detect_cycles()), - 'strongly_connected_components': len(self.get_strongly_connected_components()), - 'avg_dependencies_per_file': ( - self.graph.number_of_edges() / self.graph.number_of_nodes() - if self.graph.number_of_nodes() > 0 else 0 + "total_files": self.graph.number_of_nodes(), + "total_dependencies": self.graph.number_of_edges(), + "circular_dependencies": len(self.detect_cycles()), + "strongly_connected_components": len(self.get_strongly_connected_components()), + "avg_dependencies_per_file": ( + self.graph.number_of_edges() / self.graph.number_of_nodes() if self.graph.number_of_nodes() > 0 else 0 ), - 'files_with_no_dependencies': len([ - node for node in self.graph.nodes() - if self.graph.out_degree(node) == 0 - ]), - 'files_not_imported': len([ - node for node in self.graph.nodes() - if self.graph.in_degree(node) == 0 - ]), + "files_with_no_dependencies": len( + [node for node in self.graph.nodes() if self.graph.out_degree(node) == 0] + ), + "files_not_imported": len([node for node in self.graph.nodes() if self.graph.in_degree(node) == 0]), } diff --git a/src/skill_seekers/cli/doc_scraper.py b/src/skill_seekers/cli/doc_scraper.py index 9f536e4..a9482c5 100755 --- a/src/skill_seekers/cli/doc_scraper.py +++ b/src/skill_seekers/cli/doc_scraper.py @@ -9,39 +9,40 @@ Usage: skill-seekers scrape --url https://react.dev/ --name react """ -import os -import sys -import json -import time -import re import argparse -import hashlib -import logging import asyncio -import requests -import httpx +import hashlib +import json +import logging +import os +import re +import sys +import time +from collections import defaultdict, deque from pathlib import Path +from typing import Any, Optional from urllib.parse import urljoin, urlparse + +import httpx +import requests from bs4 import BeautifulSoup -from collections import deque, defaultdict -from typing import Optional, Dict, List, Tuple, Set, Deque, Any # Add parent directory to path for imports when run as script sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from skill_seekers.cli.llms_txt_detector import LlmsTxtDetector -from skill_seekers.cli.llms_txt_parser import LlmsTxtParser -from skill_seekers.cli.llms_txt_downloader import LlmsTxtDownloader -from skill_seekers.cli.language_detector import LanguageDetector from skill_seekers.cli.constants import ( - DEFAULT_RATE_LIMIT, - DEFAULT_MAX_PAGES, - DEFAULT_CHECKPOINT_INTERVAL, - DEFAULT_ASYNC_MODE, CONTENT_PREVIEW_LENGTH, + DEFAULT_ASYNC_MODE, + DEFAULT_CHECKPOINT_INTERVAL, + DEFAULT_MAX_PAGES, + DEFAULT_RATE_LIMIT, MAX_PAGES_WARNING_THRESHOLD, - MIN_CATEGORIZATION_SCORE + MIN_CATEGORIZATION_SCORE, ) +from skill_seekers.cli.language_detector import LanguageDetector +from skill_seekers.cli.llms_txt_detector import LlmsTxtDetector +from skill_seekers.cli.llms_txt_downloader import LlmsTxtDownloader +from skill_seekers.cli.llms_txt_parser import LlmsTxtParser # Configure logging logger = logging.getLogger(__name__) @@ -61,14 +62,10 @@ def setup_logging(verbose: bool = False, quiet: bool = False) -> None: else: level = logging.INFO - logging.basicConfig( - level=level, - format='%(message)s', - force=True - ) + logging.basicConfig(level=level, format="%(message)s", force=True) -def infer_description_from_docs(base_url: str, first_page_content: Optional[str] = None, name: str = '') -> str: +def infer_description_from_docs(base_url: str, first_page_content: str | None = None, name: str = "") -> str: """ Infer skill description from documentation metadata or first page content. @@ -88,58 +85,64 @@ def infer_description_from_docs(base_url: str, first_page_content: Optional[str] # If we have first page content, try to extract description if first_page_content: try: - soup = BeautifulSoup(first_page_content, 'html.parser') + soup = BeautifulSoup(first_page_content, "html.parser") # Strategy 1: Try meta description tag - meta_desc = soup.find('meta', {'name': 'description'}) - if meta_desc and meta_desc.get('content'): - desc = meta_desc['content'].strip() + meta_desc = soup.find("meta", {"name": "description"}) + if meta_desc and meta_desc.get("content"): + desc = meta_desc["content"].strip() if len(desc) > 20: # Meaningful length # Clean and format if len(desc) > 150: - desc = desc[:147] + '...' - return f'Use when {desc.lower()}' + desc = desc[:147] + "..." + return f"Use when {desc.lower()}" # Strategy 2: Try OpenGraph description - og_desc = soup.find('meta', {'property': 'og:description'}) - if og_desc and og_desc.get('content'): - desc = og_desc['content'].strip() + og_desc = soup.find("meta", {"property": "og:description"}) + if og_desc and og_desc.get("content"): + desc = og_desc["content"].strip() if len(desc) > 20: if len(desc) > 150: - desc = desc[:147] + '...' - return f'Use when {desc.lower()}' + desc = desc[:147] + "..." + return f"Use when {desc.lower()}" # Strategy 3: Extract first meaningful paragraph from main content # Look for common documentation main content areas main_content = None - for selector in ['article', 'main', 'div[role="main"]', 'div.content', 'div.doc-content']: + for selector in ["article", "main", 'div[role="main"]', "div.content", "div.doc-content"]: main_content = soup.select_one(selector) if main_content: break if main_content: # Find first paragraph - for p in main_content.find_all('p', limit=5): + for p in main_content.find_all("p", limit=5): text = p.get_text().strip() # Skip empty, very short, or navigation-like paragraphs - if len(text) > 30 and not any(skip in text.lower() for skip in ['table of contents', 'on this page', 'navigation']): + if len(text) > 30 and not any( + skip in text.lower() for skip in ["table of contents", "on this page", "navigation"] + ): # Clean and format if len(text) > 150: - text = text[:147] + '...' - return f'Use when working with {text.lower()}' + text = text[:147] + "..." + return f"Use when working with {text.lower()}" except Exception as e: logger.debug(f"Could not infer description from page content: {e}") # Improved fallback template - return f'Use when working with {name}' if name else f'Use when working with documentation at {urlparse(base_url).netloc}' + return ( + f"Use when working with {name}" + if name + else f"Use when working with documentation at {urlparse(base_url).netloc}" + ) class DocToSkillConverter: - def __init__(self, config: Dict[str, Any], dry_run: bool = False, resume: bool = False) -> None: + def __init__(self, config: dict[str, Any], dry_run: bool = False, resume: bool = False) -> None: self.config = config - self.name = config['name'] - self.base_url = config['base_url'] + self.name = config["name"] + self.base_url = config["base_url"] self.dry_run = dry_run self.resume = resume @@ -149,34 +152,33 @@ class DocToSkillConverter: self.checkpoint_file = f"{self.data_dir}/checkpoint.json" # Checkpoint config - checkpoint_config = config.get('checkpoint', {}) - self.checkpoint_enabled = checkpoint_config.get('enabled', False) - self.checkpoint_interval = checkpoint_config.get('interval', DEFAULT_CHECKPOINT_INTERVAL) + checkpoint_config = config.get("checkpoint", {}) + self.checkpoint_enabled = checkpoint_config.get("enabled", False) + self.checkpoint_interval = checkpoint_config.get("interval", DEFAULT_CHECKPOINT_INTERVAL) # llms.txt detection state - skip_llms_txt_value = config.get('skip_llms_txt', False) + skip_llms_txt_value = config.get("skip_llms_txt", False) if not isinstance(skip_llms_txt_value, bool): logger.warning( - "Invalid value for 'skip_llms_txt': %r (expected bool). Defaulting to False.", - skip_llms_txt_value + "Invalid value for 'skip_llms_txt': %r (expected bool). Defaulting to False.", skip_llms_txt_value ) self.skip_llms_txt = False else: self.skip_llms_txt = skip_llms_txt_value self.llms_txt_detected = False self.llms_txt_variant = None - self.llms_txt_variants: List[str] = [] # Track all downloaded variants + self.llms_txt_variants: list[str] = [] # Track all downloaded variants # Parallel scraping config - self.workers = config.get('workers', 1) - self.async_mode = config.get('async_mode', DEFAULT_ASYNC_MODE) + self.workers = config.get("workers", 1) + self.async_mode = config.get("async_mode", DEFAULT_ASYNC_MODE) # State self.visited_urls: set[str] = set() # Support multiple starting URLs - start_urls = config.get('start_urls', [self.base_url]) + start_urls = config.get("start_urls", [self.base_url]) self.pending_urls = deque(start_urls) - self.pages: List[Dict[str, Any]] = [] + self.pages: list[dict[str, Any]] = [] self.pages_scraped = 0 # Language detection @@ -185,6 +187,7 @@ class DocToSkillConverter: # Thread-safe lock for parallel scraping if self.workers > 1: import threading + self.lock = threading.Lock() # Create directories (unless dry-run) @@ -197,7 +200,7 @@ class DocToSkillConverter: # Load checkpoint if resuming if resume and not dry_run: self.load_checkpoint() - + def is_valid_url(self, url: str) -> bool: """Check if URL should be scraped based on patterns. @@ -211,12 +214,12 @@ class DocToSkillConverter: return False # Include patterns - includes = self.config.get('url_patterns', {}).get('include', []) + includes = self.config.get("url_patterns", {}).get("include", []) if includes and not any(pattern in url for pattern in includes): return False # Exclude patterns - excludes = self.config.get('url_patterns', {}).get('exclude', []) + excludes = self.config.get("url_patterns", {}).get("exclude", []) if any(pattern in url for pattern in excludes): return False @@ -233,11 +236,11 @@ class DocToSkillConverter: "pending_urls": list(self.pending_urls), "pages_scraped": self.pages_scraped, "last_updated": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()), - "checkpoint_interval": self.checkpoint_interval + "checkpoint_interval": self.checkpoint_interval, } try: - with open(self.checkpoint_file, 'w', encoding='utf-8') as f: + with open(self.checkpoint_file, "w", encoding="utf-8") as f: json.dump(checkpoint_data, f, indent=2) logger.info(" šŸ’¾ Checkpoint saved (%d pages)", self.pages_scraped) except Exception as e: @@ -250,7 +253,7 @@ class DocToSkillConverter: return try: - with open(self.checkpoint_file, 'r', encoding='utf-8') as f: + with open(self.checkpoint_file, encoding="utf-8") as f: checkpoint_data = json.load(f) self.visited_urls = set(checkpoint_data["visited_urls"]) @@ -261,7 +264,7 @@ class DocToSkillConverter: logger.info(" Pages already scraped: %d", self.pages_scraped) logger.info(" URLs visited: %d", len(self.visited_urls)) logger.info(" URLs pending: %d", len(self.pending_urls)) - logger.info(" Last updated: %s", checkpoint_data['last_updated']) + logger.info(" Last updated: %s", checkpoint_data["last_updated"]) logger.info("") except Exception as e: @@ -277,79 +280,72 @@ class DocToSkillConverter: except Exception as e: logger.warning("āš ļø Failed to clear checkpoint: %s", e) - def extract_content(self, soup: Any, url: str) -> Dict[str, Any]: + def extract_content(self, soup: Any, url: str) -> dict[str, Any]: """Extract content with improved code and pattern detection""" page = { - 'url': url, - 'title': '', - 'content': '', - 'headings': [], - 'code_samples': [], - 'patterns': [], # NEW: Extract common patterns - 'links': [] + "url": url, + "title": "", + "content": "", + "headings": [], + "code_samples": [], + "patterns": [], # NEW: Extract common patterns + "links": [], } - - selectors = self.config.get('selectors', {}) - + + selectors = self.config.get("selectors", {}) + # Extract title - title_elem = soup.select_one(selectors.get('title', 'title')) + title_elem = soup.select_one(selectors.get("title", "title")) if title_elem: - page['title'] = self.clean_text(title_elem.get_text()) - + page["title"] = self.clean_text(title_elem.get_text()) + # Find main content - main_selector = selectors.get('main_content', 'div[role="main"]') + main_selector = selectors.get("main_content", 'div[role="main"]') main = soup.select_one(main_selector) - + if not main: logger.warning("⚠ No content: %s", url) return page - + # Extract headings with better structure - for h in main.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']): + for h in main.find_all(["h1", "h2", "h3", "h4", "h5", "h6"]): text = self.clean_text(h.get_text()) if text: - page['headings'].append({ - 'level': h.name, - 'text': text, - 'id': h.get('id', '') - }) - + page["headings"].append({"level": h.name, "text": text, "id": h.get("id", "")}) + # Extract code with language detection - code_selector = selectors.get('code_blocks', 'pre code') + code_selector = selectors.get("code_blocks", "pre code") for code_elem in main.select(code_selector): code = code_elem.get_text() if len(code.strip()) > 10: # Try to detect language lang = self.detect_language(code_elem, code) - page['code_samples'].append({ - 'code': code.strip(), - 'language': lang - }) - + page["code_samples"].append({"code": code.strip(), "language": lang}) + # Extract patterns (NEW: common code patterns) - page['patterns'] = self.extract_patterns(main, page['code_samples']) - + page["patterns"] = self.extract_patterns(main, page["code_samples"]) + # Extract paragraphs paragraphs = [] - for p in main.find_all('p'): + for p in main.find_all("p"): text = self.clean_text(p.get_text()) if text and len(text) > 20: # Skip very short paragraphs paragraphs.append(text) - - page['content'] = '\n\n'.join(paragraphs) + + page["content"] = "\n\n".join(paragraphs) # Extract links from entire page (not just main content) # This allows discovery of navigation links outside the main content area - for link in soup.find_all('a', href=True): - href = urljoin(url, link['href']) + for link in soup.find_all("a", href=True): + href = urljoin(url, link["href"]) # Strip anchor fragments to avoid treating #anchors as separate pages - href = href.split('#')[0] - if self.is_valid_url(href) and href not in page['links']: - page['links'].append(href) + href = href.split("#")[0] + if self.is_valid_url(href) and href not in page["links"]: + page["links"].append(href) return page - def _extract_markdown_content(self, content: str, url: str) -> Dict[str, Any]: + def _extract_markdown_content(self, content: str, url: str) -> dict[str, Any]: """Extract structured content from a Markdown file. Parses markdown files from llms.txt URLs to extract: @@ -382,76 +378,61 @@ class DocToSkillConverter: import re # Detect if content is actually HTML (some .md URLs return HTML) - if content.strip().startswith(' 10: - page['code_samples'].append({ - 'code': code.strip(), - 'language': lang or 'unknown' - }) + page["code_samples"].append({"code": code.strip(), "language": lang or "unknown"}) # Extract content (paragraphs) - content_no_code = re.sub(r'```.*?```', '', content, flags=re.DOTALL) + content_no_code = re.sub(r"```.*?```", "", content, flags=re.DOTALL) paragraphs = [] - for para in content_no_code.split('\n\n'): + for para in content_no_code.split("\n\n"): text = para.strip() # Skip headings and short text - if text and len(text) > 20 and not text.startswith('#'): + if text and len(text) > 20 and not text.startswith("#"): paragraphs.append(text) - page['content'] = '\n\n'.join(paragraphs) + page["content"] = "\n\n".join(paragraphs) # Extract links from markdown (only .md files to avoid client-side rendered HTML pages) - md_links = re.findall(r'\[([^\]]*)\]\(([^)]+)\)', content) + md_links = re.findall(r"\[([^\]]*)\]\(([^)]+)\)", content) for _, href in md_links: - if href.startswith('http'): + if href.startswith("http"): full_url = href - elif not href.startswith('#'): + elif not href.startswith("#"): full_url = urljoin(url, href) else: continue # Strip anchor fragments - full_url = full_url.split('#')[0] + full_url = full_url.split("#")[0] # Only include .md URLs to avoid client-side rendered HTML pages - if '.md' in full_url and self.is_valid_url(full_url) and full_url not in page['links']: - page['links'].append(full_url) + if ".md" in full_url and self.is_valid_url(full_url) and full_url not in page["links"]: + page["links"].append(full_url) return page - def _extract_html_as_markdown(self, html_content: str, url: str) -> Dict[str, Any]: + def _extract_html_as_markdown(self, html_content: str, url: str) -> dict[str, Any]: """Extract content from HTML and convert to markdown-like structure. Fallback method when .md URL returns HTML content instead of markdown. @@ -483,22 +464,14 @@ class DocToSkillConverter: Falls back to if no semantic content container found. Language detection uses detect_language() method. """ - page = { - 'url': url, - 'title': '', - 'content': '', - 'headings': [], - 'code_samples': [], - 'patterns': [], - 'links': [] - } + page = {"url": url, "title": "", "content": "", "headings": [], "code_samples": [], "patterns": [], "links": []} - soup = BeautifulSoup(html_content, 'html.parser') + soup = BeautifulSoup(html_content, "html.parser") # Try to extract title - title_elem = soup.select_one('title') + title_elem = soup.select_one("title") if title_elem: - page['title'] = self.clean_text(title_elem.get_text()) + page["title"] = self.clean_text(title_elem.get_text()) # Try to find main content area main = soup.select_one('main, article, [role="main"], .content') @@ -507,32 +480,25 @@ class DocToSkillConverter: if main: # Extract headings - for h in main.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']): + for h in main.find_all(["h1", "h2", "h3", "h4", "h5", "h6"]): text = self.clean_text(h.get_text()) if text: - page['headings'].append({ - 'level': h.name, - 'text': text, - 'id': h.get('id', '') - }) + page["headings"].append({"level": h.name, "text": text, "id": h.get("id", "")}) # Extract code blocks - for code_elem in main.select('pre code, pre'): + for code_elem in main.select("pre code, pre"): code = code_elem.get_text() if len(code.strip()) > 10: lang = self.detect_language(code_elem, code) - page['code_samples'].append({ - 'code': code.strip(), - 'language': lang - }) + page["code_samples"].append({"code": code.strip(), "language": lang}) # Extract paragraphs paragraphs = [] - for p in main.find_all('p'): + for p in main.find_all("p"): text = self.clean_text(p.get_text()) if text and len(text) > 20: paragraphs.append(text) - page['content'] = '\n\n'.join(paragraphs) + page["content"] = "\n\n".join(paragraphs) return page @@ -548,47 +514,46 @@ class DocToSkillConverter: logger.debug(f"Low confidence language detection: {lang} ({confidence:.2f})") return lang # Return string for backward compatibility - - def extract_patterns(self, main: Any, code_samples: List[Dict[str, Any]]) -> List[Dict[str, str]]: + + def extract_patterns(self, main: Any, code_samples: list[dict[str, Any]]) -> list[dict[str, str]]: """Extract common coding patterns (NEW FEATURE)""" patterns = [] - + # Look for "Example:" or "Pattern:" sections - for elem in main.find_all(['p', 'div']): + for elem in main.find_all(["p", "div"]): text = elem.get_text().lower() - if any(word in text for word in ['example:', 'pattern:', 'usage:', 'typical use']): + if any(word in text for word in ["example:", "pattern:", "usage:", "typical use"]): # Get the code that follows - next_code = elem.find_next(['pre', 'code']) + next_code = elem.find_next(["pre", "code"]) if next_code: - patterns.append({ - 'description': self.clean_text(elem.get_text()), - 'code': next_code.get_text().strip() - }) - + patterns.append( + {"description": self.clean_text(elem.get_text()), "code": next_code.get_text().strip()} + ) + return patterns[:5] # Limit to 5 most relevant patterns - + def clean_text(self, text: str) -> str: """Clean text content""" - text = re.sub(r'\s+', ' ', text) + text = re.sub(r"\s+", " ", text) return text.strip() - - def save_page(self, page: Dict[str, Any]) -> None: + + def save_page(self, page: dict[str, Any]) -> None: """Save page data (skip pages with empty content)""" # Skip pages with empty or very short content - if not page.get('content') or len(page.get('content', '')) < 50: - logger.debug("Skipping page with empty/short content: %s", page.get('url', 'unknown')) + if not page.get("content") or len(page.get("content", "")) < 50: + logger.debug("Skipping page with empty/short content: %s", page.get("url", "unknown")) return - url_hash = hashlib.md5(page['url'].encode()).hexdigest()[:10] - safe_title = re.sub(r'[^\w\s-]', '', page['title'])[:50] - safe_title = re.sub(r'[-\s]+', '_', safe_title) + url_hash = hashlib.md5(page["url"].encode()).hexdigest()[:10] + safe_title = re.sub(r"[^\w\s-]", "", page["title"])[:50] + safe_title = re.sub(r"[-\s]+", "_", safe_title) filename = f"{safe_title}_{url_hash}.json" filepath = os.path.join(self.data_dir, "pages", filename) - with open(filepath, 'w', encoding='utf-8') as f: + with open(filepath, "w", encoding="utf-8") as f: json.dump(page, f, indent=2, ensure_ascii=False) - + def scrape_page(self, url: str) -> None: """Scrape a single page with thread-safe operations. @@ -604,15 +569,15 @@ class DocToSkillConverter: """ try: # Scraping part (no lock needed - independent) - headers = {'User-Agent': 'Mozilla/5.0 (Documentation Scraper)'} + headers = {"User-Agent": "Mozilla/5.0 (Documentation Scraper)"} response = requests.get(url, headers=headers, timeout=30) response.raise_for_status() # Check if this is a Markdown file - if url.endswith('.md') or '.md' in url: + if url.endswith(".md") or ".md" in url: page = self._extract_markdown_content(response.text, url) else: - soup = BeautifulSoup(response.content, 'html.parser') + soup = BeautifulSoup(response.content, "html.parser") page = self.extract_content(soup, url) # Thread-safe operations (lock required) @@ -623,7 +588,7 @@ class DocToSkillConverter: self.pages.append(page) # Add new URLs - for link in page['links']: + for link in page["links"]: if link not in self.visited_urls and link not in self.pending_urls: self.pending_urls.append(link) else: @@ -633,12 +598,12 @@ class DocToSkillConverter: self.pages.append(page) # Add new URLs - for link in page['links']: + for link in page["links"]: if link not in self.visited_urls and link not in self.pending_urls: self.pending_urls.append(link) # Rate limiting - rate_limit = self.config.get('rate_limit', DEFAULT_RATE_LIMIT) + rate_limit = self.config.get("rate_limit", DEFAULT_RATE_LIMIT) if rate_limit > 0: time.sleep(rate_limit) @@ -665,16 +630,16 @@ class DocToSkillConverter: async with semaphore: # Limit concurrent requests try: # Async HTTP request - headers = {'User-Agent': 'Mozilla/5.0 (Documentation Scraper)'} + headers = {"User-Agent": "Mozilla/5.0 (Documentation Scraper)"} response = await client.get(url, headers=headers, timeout=30.0) response.raise_for_status() # Check if this is a Markdown file - if url.endswith('.md') or '.md' in url: + if url.endswith(".md") or ".md" in url: page = self._extract_markdown_content(response.text, url) else: # BeautifulSoup parsing (still synchronous, but fast) - soup = BeautifulSoup(response.content, 'html.parser') + soup = BeautifulSoup(response.content, "html.parser") page = self.extract_content(soup, url) # Async-safe operations (no lock needed - single event loop) @@ -683,19 +648,19 @@ class DocToSkillConverter: self.pages.append(page) # Add new URLs - for link in page['links']: + for link in page["links"]: if link not in self.visited_urls and link not in self.pending_urls: self.pending_urls.append(link) # Rate limiting - rate_limit = self.config.get('rate_limit', DEFAULT_RATE_LIMIT) + rate_limit = self.config.get("rate_limit", DEFAULT_RATE_LIMIT) if rate_limit > 0: await asyncio.sleep(rate_limit) except Exception as e: logger.error(" āœ— Error scraping %s: %s: %s", url, type(e).__name__, e) - def _convert_to_md_urls(self, urls: List[str]) -> List[str]: + def _convert_to_md_urls(self, urls: list[str]) -> list[str]: """ Convert URLs to .md format, trying /index.html.md suffix for non-.md URLs. äøé¢„å…ˆę£€ęŸ„ URL ę˜Æå¦å­˜åœØļ¼Œē›“ęŽ„åŠ å…„é˜Ÿåˆ—ļ¼ŒåœØēˆ¬å–ę—¶å†éŖŒčÆć€‚ @@ -709,11 +674,11 @@ class DocToSkillConverter: md_urls = [] for url in urls: - if '.md' in url: + if ".md" in url: md_urls.append(url) else: # ē›“ęŽ„č½¬ę¢äøŗ .md ę ¼å¼ļ¼Œäøå‘é€ HEAD čÆ·ę±‚ę£€ęŸ„ - url = url.rstrip('/') + url = url.rstrip("/") md_url = f"{url}/index.html.md" md_urls.append(md_url) @@ -756,7 +721,7 @@ class DocToSkillConverter: logger.info("\nšŸ” Checking for llms.txt at %s...", self.base_url) # Check for explicit config URL first - explicit_url = self.config.get('llms_txt_url') + explicit_url = self.config.get("llms_txt_url") if explicit_url: logger.info("\nšŸ“Œ Using explicit llms_txt_url from config: %s", explicit_url) @@ -770,7 +735,7 @@ class DocToSkillConverter: filepath = os.path.join(self.skill_dir, "references", filename) os.makedirs(os.path.dirname(filepath), exist_ok=True) - with open(filepath, 'w', encoding='utf-8') as f: + with open(filepath, "w", encoding="utf-8") as f: f.write(content) logger.info(" šŸ’¾ Saved %s (%d chars)", filename, len(content)) @@ -781,8 +746,8 @@ class DocToSkillConverter: if variants: logger.info("\nšŸ” Found %d total variant(s), downloading remaining...", len(variants)) for variant_info in variants: - url = variant_info['url'] - variant = variant_info['variant'] + url = variant_info["url"] + variant = variant_info["variant"] # Skip the explicit one we already downloaded if url == explicit_url: @@ -795,7 +760,7 @@ class DocToSkillConverter: if extra_content: extra_filename = extra_downloader.get_proper_filename() extra_filepath = os.path.join(self.skill_dir, "references", extra_filename) - with open(extra_filepath, 'w', encoding='utf-8') as f: + with open(extra_filepath, "w", encoding="utf-8") as f: f.write(extra_content) logger.info(" āœ“ %s (%d chars)", extra_filename, len(extra_content)) @@ -807,8 +772,11 @@ class DocToSkillConverter: if extracted_urls: # Convert non-.md URLs to .md format by trying /index.html.md suffix md_urls = self._convert_to_md_urls(extracted_urls) - logger.info("\nšŸ”— Found %d URLs in llms.txt (%d .md files), starting BFS crawl...", - len(extracted_urls), len(md_urls)) + logger.info( + "\nšŸ”— Found %d URLs in llms.txt (%d .md files), starting BFS crawl...", + len(extracted_urls), + len(md_urls), + ) # Filter URLs based on url_patterns config for url in md_urls: @@ -819,7 +787,7 @@ class DocToSkillConverter: # Return False to trigger HTML scraping with the populated pending_urls self.llms_txt_detected = True - self.llms_txt_variant = 'explicit' + self.llms_txt_variant = "explicit" return False # Continue with BFS crawling # Fallback: if no URLs found, use section-based parsing @@ -831,7 +799,7 @@ class DocToSkillConverter: self.pages.append(page) self.llms_txt_detected = True - self.llms_txt_variant = 'explicit' + self.llms_txt_variant = "explicit" return True # Auto-detection: Find ALL variants @@ -847,8 +815,8 @@ class DocToSkillConverter: # Download ALL variants downloaded = {} for variant_info in variants: - url = variant_info['url'] - variant = variant_info['variant'] + url = variant_info["url"] + variant = variant_info["variant"] logger.info(" šŸ“„ Downloading %s...", variant) downloader = LlmsTxtDownloader(url) @@ -856,11 +824,7 @@ class DocToSkillConverter: if content: filename = downloader.get_proper_filename() - downloaded[variant] = { - 'content': content, - 'filename': filename, - 'size': len(content) - } + downloaded[variant] = {"content": content, "filename": filename, "size": len(content)} logger.info(" āœ“ %s (%d chars)", filename, len(content)) if not downloaded: @@ -871,24 +835,27 @@ class DocToSkillConverter: os.makedirs(os.path.join(self.skill_dir, "references"), exist_ok=True) for variant, data in downloaded.items(): - filepath = os.path.join(self.skill_dir, "references", data['filename']) - with open(filepath, 'w', encoding='utf-8') as f: - f.write(data['content']) - logger.info(" šŸ’¾ Saved %s", data['filename']) + filepath = os.path.join(self.skill_dir, "references", data["filename"]) + with open(filepath, "w", encoding="utf-8") as f: + f.write(data["content"]) + logger.info(" šŸ’¾ Saved %s", data["filename"]) # Parse LARGEST variant for skill building - largest = max(downloaded.items(), key=lambda x: x[1]['size']) - logger.info("\nšŸ“„ Parsing %s for skill building...", largest[1]['filename']) + largest = max(downloaded.items(), key=lambda x: x[1]["size"]) + logger.info("\nšŸ“„ Parsing %s for skill building...", largest[1]["filename"]) - parser = LlmsTxtParser(largest[1]['content'], self.base_url) + parser = LlmsTxtParser(largest[1]["content"], self.base_url) # Extract URLs from llms.txt and add to pending_urls for BFS crawling extracted_urls = parser.extract_urls() if extracted_urls: # Convert non-.md URLs to .md format by trying /index.html.md suffix md_urls = self._convert_to_md_urls(extracted_urls) - logger.info("\nšŸ”— Found %d URLs in llms.txt (%d .md files), starting BFS crawl...", - len(extracted_urls), len(md_urls)) + logger.info( + "\nšŸ”— Found %d URLs in llms.txt (%d .md files), starting BFS crawl...", + len(extracted_urls), + len(md_urls), + ) # Filter URLs based on url_patterns config for url in md_urls: @@ -956,7 +923,7 @@ class DocToSkillConverter: logger.info("Workers: %d parallel threads", self.workers) logger.info("") - max_pages = self.config.get('max_pages', DEFAULT_MAX_PAGES) + max_pages = self.config.get("max_pages", DEFAULT_MAX_PAGES) # Handle unlimited mode if max_pages is None or max_pages == -1: @@ -982,16 +949,16 @@ class DocToSkillConverter: # Just show what would be scraped logger.info(" [Preview] %s", url) try: - headers = {'User-Agent': 'Mozilla/5.0 (Documentation Scraper - Dry Run)'} + headers = {"User-Agent": "Mozilla/5.0 (Documentation Scraper - Dry Run)"} response = requests.get(url, headers=headers, timeout=10) - soup = BeautifulSoup(response.content, 'html.parser') + soup = BeautifulSoup(response.content, "html.parser") - main_selector = self.config.get('selectors', {}).get('main_content', 'div[role="main"]') + main_selector = self.config.get("selectors", {}).get("main_content", 'div[role="main"]') main = soup.select_one(main_selector) if main: - for link in main.find_all('a', href=True): - href = urljoin(url, link['href']) + for link in main.find_all("a", href=True): + href = urljoin(url, link["href"]) if self.is_valid_url(href) and href not in self.visited_urls: self.pending_urls.append(href) except Exception as e: @@ -1115,13 +1082,13 @@ class DocToSkillConverter: logger.info("Workers: %d concurrent tasks (async)", self.workers) logger.info("") - max_pages = self.config.get('max_pages', DEFAULT_MAX_PAGES) + max_pages = self.config.get("max_pages", DEFAULT_MAX_PAGES) # Handle unlimited mode if max_pages is None or max_pages == -1: logger.warning("āš ļø UNLIMITED MODE: No page limit (will scrape all pages)\n") unlimited = True - preview_limit = float('inf') + preview_limit = float("inf") else: unlimited = False preview_limit = 20 if self.dry_run else max_pages @@ -1130,10 +1097,7 @@ class DocToSkillConverter: semaphore = asyncio.Semaphore(self.workers) # Create shared HTTP client with connection pooling - async with httpx.AsyncClient( - timeout=30.0, - limits=httpx.Limits(max_connections=self.workers * 2) - ) as client: + async with httpx.AsyncClient(timeout=30.0, limits=httpx.Limits(max_connections=self.workers * 2)) as client: tasks = [] while self.pending_urls and (unlimited or len(self.visited_urls) < preview_limit): @@ -1156,9 +1120,7 @@ class DocToSkillConverter: if self.dry_run: logger.info(" [Preview] %s", url) else: - task = asyncio.create_task( - self.scrape_page_async(url, semaphore, client) - ) + task = asyncio.create_task(self.scrape_page_async(url, semaphore, client)) tasks.append(task) # Wait for batch to complete before continuing @@ -1192,53 +1154,53 @@ class DocToSkillConverter: def save_summary(self) -> None: """Save scraping summary""" summary = { - 'name': self.name, - 'total_pages': len(self.pages), - 'base_url': self.base_url, - 'llms_txt_detected': self.llms_txt_detected, - 'llms_txt_variant': self.llms_txt_variant, - 'pages': [{'title': p['title'], 'url': p['url']} for p in self.pages] + "name": self.name, + "total_pages": len(self.pages), + "base_url": self.base_url, + "llms_txt_detected": self.llms_txt_detected, + "llms_txt_variant": self.llms_txt_variant, + "pages": [{"title": p["title"], "url": p["url"]} for p in self.pages], } - with open(f"{self.data_dir}/summary.json", 'w', encoding='utf-8') as f: + with open(f"{self.data_dir}/summary.json", "w", encoding="utf-8") as f: json.dump(summary, f, indent=2, ensure_ascii=False) - - def load_scraped_data(self) -> List[Dict[str, Any]]: + + def load_scraped_data(self) -> list[dict[str, Any]]: """Load previously scraped data""" pages = [] pages_dir = Path(self.data_dir) / "pages" - + if not pages_dir.exists(): return [] - + for json_file in pages_dir.glob("*.json"): try: - with open(json_file, 'r', encoding='utf-8') as f: + with open(json_file, encoding="utf-8") as f: pages.append(json.load(f)) except Exception as e: logger.error("āš ļø Error loading scraped data file %s: %s: %s", json_file, type(e).__name__, e) logger.error(" Suggestion: File may be corrupted, consider re-scraping with --fresh") - + return pages - - def smart_categorize(self, pages: List[Dict[str, Any]]) -> Dict[str, List[Dict[str, Any]]]: + + def smart_categorize(self, pages: list[dict[str, Any]]) -> dict[str, list[dict[str, Any]]]: """Improved categorization with better pattern matching""" - category_defs = self.config.get('categories', {}) - + category_defs = self.config.get("categories", {}) + # Default smart categories if none provided if not category_defs: category_defs = self.infer_categories(pages) - categories: Dict[str, List[Dict[str, Any]]] = {cat: [] for cat in category_defs.keys()} - categories['other'] = [] - + categories: dict[str, list[dict[str, Any]]] = {cat: [] for cat in category_defs.keys()} + categories["other"] = [] + for page in pages: - url = page['url'].lower() - title = page['title'].lower() - content = page.get('content', '').lower()[:CONTENT_PREVIEW_LENGTH] # Check first N chars for categorization - + url = page["url"].lower() + title = page["title"].lower() + content = page.get("content", "").lower()[:CONTENT_PREVIEW_LENGTH] # Check first N chars for categorization + categorized = False - + # Match against keywords for cat, keywords in category_defs.items(): score = 0 @@ -1250,138 +1212,140 @@ class DocToSkillConverter: score += 2 if keyword in content: score += 1 - + if score >= MIN_CATEGORIZATION_SCORE: # Threshold for categorization categories[cat].append(page) categorized = True break - + if not categorized: - categories['other'].append(page) - + categories["other"].append(page) + # Remove empty categories categories = {k: v for k, v in categories.items() if v} - + return categories - - def infer_categories(self, pages: List[Dict[str, Any]]) -> Dict[str, List[str]]: + + def infer_categories(self, pages: list[dict[str, Any]]) -> dict[str, list[str]]: """Infer categories from URL patterns (IMPROVED)""" url_segments: defaultdict[str, int] = defaultdict(int) - + for page in pages: - path = urlparse(page['url']).path - segments = [s for s in path.split('/') if s and s not in ['en', 'stable', 'latest', 'docs']] - + path = urlparse(page["url"]).path + segments = [s for s in path.split("/") if s and s not in ["en", "stable", "latest", "docs"]] + for seg in segments: url_segments[seg] += 1 - + # Top segments become categories top_segments = sorted(url_segments.items(), key=lambda x: x[1], reverse=True)[:8] - + categories = {} for seg, count in top_segments: if count >= 3: # At least 3 pages categories[seg] = [seg] - + # Add common defaults - if 'tutorial' not in categories and any('tutorial' in url for url in [p['url'] for p in pages]): - categories['tutorials'] = ['tutorial', 'guide', 'getting-started'] - - if 'api' not in categories and any('api' in url or 'reference' in url for url in [p['url'] for p in pages]): - categories['api'] = ['api', 'reference', 'class'] - + if "tutorial" not in categories and any("tutorial" in url for url in [p["url"] for p in pages]): + categories["tutorials"] = ["tutorial", "guide", "getting-started"] + + if "api" not in categories and any("api" in url or "reference" in url for url in [p["url"] for p in pages]): + categories["api"] = ["api", "reference", "class"] + return categories - - def generate_quick_reference(self, pages: List[Dict[str, Any]]) -> List[Dict[str, str]]: + + def generate_quick_reference(self, pages: list[dict[str, Any]]) -> list[dict[str, str]]: """Generate quick reference from common patterns (NEW FEATURE)""" quick_ref = [] - + # Collect all patterns all_patterns = [] for page in pages: - all_patterns.extend(page.get('patterns', [])) - + all_patterns.extend(page.get("patterns", [])) + # Get most common code patterns seen_codes = set() for pattern in all_patterns: - code = pattern['code'] + code = pattern["code"] if code not in seen_codes and len(code) < 300: quick_ref.append(pattern) seen_codes.add(code) if len(quick_ref) >= 15: break - + return quick_ref - - def create_reference_file(self, category: str, pages: List[Dict[str, Any]]) -> None: + + def create_reference_file(self, category: str, pages: list[dict[str, Any]]) -> None: """Create enhanced reference file""" if not pages: return - + lines = [] lines.append(f"# {self.name.title()} - {category.replace('_', ' ').title()}\n") lines.append(f"**Pages:** {len(pages)}\n") lines.append("---\n") - + for page in pages: lines.append(f"## {page['title']}\n") lines.append(f"**URL:** {page['url']}\n") - + # Table of contents from headings - if page.get('headings'): + if page.get("headings"): lines.append("**Contents:**") - for h in page['headings'][:10]: - level = int(h['level'][1]) if len(h['level']) > 1 else 1 + for h in page["headings"][:10]: + level = int(h["level"][1]) if len(h["level"]) > 1 else 1 indent = " " * max(0, level - 2) lines.append(f"{indent}- {h['text']}") lines.append("") - + # Content (NO TRUNCATION) - if page.get('content'): - lines.append(page['content']) + if page.get("content"): + lines.append(page["content"]) lines.append("") # Code examples with language (NO TRUNCATION) - if page.get('code_samples'): + if page.get("code_samples"): lines.append("**Examples:**\n") - for i, sample in enumerate(page['code_samples'][:4], 1): - lang = sample.get('language', 'unknown') - code = sample.get('code', sample if isinstance(sample, str) else '') + for i, sample in enumerate(page["code_samples"][:4], 1): + lang = sample.get("language", "unknown") + code = sample.get("code", sample if isinstance(sample, str) else "") lines.append(f"Example {i} ({lang}):") lines.append(f"```{lang}") lines.append(code) # Full code, no truncation lines.append("```\n") - + lines.append("---\n") - + filepath = os.path.join(self.skill_dir, "references", f"{category}.md") - with open(filepath, 'w', encoding='utf-8') as f: - f.write('\n'.join(lines)) + with open(filepath, "w", encoding="utf-8") as f: + f.write("\n".join(lines)) logger.info(" āœ“ %s.md (%d pages)", category, len(pages)) - - def create_enhanced_skill_md(self, categories: Dict[str, List[Dict[str, Any]]], quick_ref: List[Dict[str, str]]) -> None: + + def create_enhanced_skill_md( + self, categories: dict[str, list[dict[str, Any]]], quick_ref: list[dict[str, str]] + ) -> None: """Create SKILL.md with actual examples (IMPROVED)""" # Try to infer description if not in config - if 'description' not in self.config: + if "description" not in self.config: # Get first page HTML content to infer description first_page_html = None for pages in categories.values(): if pages: - first_page_html = pages[0].get('raw_html', '') + first_page_html = pages[0].get("raw_html", "") break description = infer_description_from_docs(self.base_url, first_page_html, self.name) else: - description = self.config['description'] - + description = self.config["description"] + # Extract actual code examples from docs example_codes = [] for pages in categories.values(): for page in pages[:3]: # First 3 pages per category - for sample in page.get('code_samples', [])[:2]: # First 2 samples per page - code = sample.get('code', sample if isinstance(sample, str) else '') - lang = sample.get('language', 'unknown') - if len(code) < 200 and lang != 'unknown': + for sample in page.get("code_samples", [])[:2]: # First 2 samples per page + code = sample.get("code", sample if isinstance(sample, str) else "") + lang = sample.get("language", "unknown") + if len(code) < 200 and lang != "unknown": example_codes.append((lang, code)) if len(example_codes) >= 10: break @@ -1389,7 +1353,7 @@ class DocToSkillConverter: break if len(example_codes) >= 10: break - + content = f"""--- name: {self.name} description: {description} @@ -1413,38 +1377,38 @@ This skill should be triggered when: ### Common Patterns """ - + # Add actual quick reference patterns if quick_ref: for i, pattern in enumerate(quick_ref[:8], 1): - desc = pattern.get('description', 'Example pattern') + desc = pattern.get("description", "Example pattern") # Format description: extract first sentence, truncate if too long - first_sentence = desc.split('.')[0] if '.' in desc else desc + first_sentence = desc.split(".")[0] if "." in desc else desc if len(first_sentence) > 150: - first_sentence = first_sentence[:147] + '...' + first_sentence = first_sentence[:147] + "..." content += f"**Pattern {i}:** {first_sentence}\n\n" content += "```\n" - content += pattern.get('code', '')[:300] + content += pattern.get("code", "")[:300] content += "\n```\n\n" else: content += "*Quick reference patterns will be added as you use the skill.*\n\n" - + # Add example codes from docs if example_codes: content += "### Example Code Patterns\n\n" for i, (lang, code) in enumerate(example_codes[:5], 1): content += f"**Example {i}** ({lang}):\n```{lang}\n{code}\n```\n\n" - - content += f"""## Reference Files + + content += """## Reference Files This skill includes comprehensive documentation in `references/`: """ - + for cat in sorted(categories.keys()): content += f"- **{cat}.md** - {cat.replace('_', ' ').title()} documentation\n" - + content += """ Use `view` to read specific reference files when detailed information is needed. @@ -1487,30 +1451,30 @@ To refresh this skill with updated documentation: 1. Re-run the scraper with the same configuration 2. The skill will be rebuilt with the latest information """ - + filepath = os.path.join(self.skill_dir, "SKILL.md") - with open(filepath, 'w', encoding='utf-8') as f: + with open(filepath, "w", encoding="utf-8") as f: f.write(content) logger.info(" āœ“ SKILL.md (enhanced with %d examples)", len(example_codes)) - - def create_index(self, categories: Dict[str, List[Dict[str, Any]]]) -> None: + + def create_index(self, categories: dict[str, list[dict[str, Any]]]) -> None: """Create navigation index""" lines = [] lines.append(f"# {self.name.title()} Documentation Index\n") lines.append("## Categories\n") - + for cat, pages in sorted(categories.items()): lines.append(f"### {cat.replace('_', ' ').title()}") lines.append(f"**File:** `{cat}.md`") lines.append(f"**Pages:** {len(pages)}\n") - + filepath = os.path.join(self.skill_dir, "references", "index.md") - with open(filepath, 'w', encoding='utf-8') as f: - f.write('\n'.join(lines)) + with open(filepath, "w", encoding="utf-8") as f: + f.write("\n".join(lines)) logger.info(" āœ“ index.md") - + def build_skill(self) -> bool: """Build the skill from scraped data. @@ -1561,7 +1525,7 @@ To refresh this skill with updated documentation: return True -def validate_config(config: Dict[str, Any]) -> Tuple[List[str], List[str]]: +def validate_config(config: dict[str, Any]) -> tuple[list[str], list[str]]: """Validate configuration structure and values. Args: @@ -1579,56 +1543,56 @@ def validate_config(config: Dict[str, Any]) -> Tuple[List[str], List[str]]: warnings = [] # Required fields - required_fields = ['name', 'base_url'] + required_fields = ["name", "base_url"] for field in required_fields: if field not in config: errors.append(f"Missing required field: '{field}'") # Validate name (alphanumeric, hyphens, underscores only) - if 'name' in config: - if not re.match(r'^[a-zA-Z0-9_-]+$', config['name']): + if "name" in config: + if not re.match(r"^[a-zA-Z0-9_-]+$", config["name"]): errors.append(f"Invalid name: '{config['name']}' (use only letters, numbers, hyphens, underscores)") # Validate base_url - if 'base_url' in config: - if not config['base_url'].startswith(('http://', 'https://')): + if "base_url" in config: + if not config["base_url"].startswith(("http://", "https://")): errors.append(f"Invalid base_url: '{config['base_url']}' (must start with http:// or https://)") # Validate selectors structure - if 'selectors' in config: - if not isinstance(config['selectors'], dict): + if "selectors" in config: + if not isinstance(config["selectors"], dict): errors.append("'selectors' must be a dictionary") else: - recommended_selectors = ['main_content', 'title', 'code_blocks'] + recommended_selectors = ["main_content", "title", "code_blocks"] for selector in recommended_selectors: - if selector not in config['selectors']: + if selector not in config["selectors"]: warnings.append(f"Missing recommended selector: '{selector}'") else: warnings.append("Missing 'selectors' section (recommended)") # Validate url_patterns - if 'url_patterns' in config: - if not isinstance(config['url_patterns'], dict): + if "url_patterns" in config: + if not isinstance(config["url_patterns"], dict): errors.append("'url_patterns' must be a dictionary") else: - for key in ['include', 'exclude']: - if key in config['url_patterns']: - if not isinstance(config['url_patterns'][key], list): + for key in ["include", "exclude"]: + if key in config["url_patterns"]: + if not isinstance(config["url_patterns"][key], list): errors.append(f"'url_patterns.{key}' must be a list") # Validate categories - if 'categories' in config: - if not isinstance(config['categories'], dict): + if "categories" in config: + if not isinstance(config["categories"], dict): errors.append("'categories' must be a dictionary") else: - for cat_name, keywords in config['categories'].items(): + for cat_name, keywords in config["categories"].items(): if not isinstance(keywords, list): errors.append(f"'categories.{cat_name}' must be a list of keywords") # Validate rate_limit - if 'rate_limit' in config: + if "rate_limit" in config: try: - rate = float(config['rate_limit']) + rate = float(config["rate_limit"]) if rate < 0: errors.append(f"'rate_limit' must be non-negative (got {rate})") elif rate > 10: @@ -1637,8 +1601,8 @@ def validate_config(config: Dict[str, Any]) -> Tuple[List[str], List[str]]: errors.append(f"'rate_limit' must be a number (got {config['rate_limit']})") # Validate max_pages - if 'max_pages' in config: - max_p_value = config['max_pages'] + if "max_pages" in config: + max_p_value = config["max_pages"] # Allow None for unlimited if max_p_value is None: @@ -1657,18 +1621,18 @@ def validate_config(config: Dict[str, Any]) -> Tuple[List[str], List[str]]: errors.append(f"'max_pages' must be an integer, -1, or null (got {config['max_pages']})") # Validate start_urls if present - if 'start_urls' in config: - if not isinstance(config['start_urls'], list): + if "start_urls" in config: + if not isinstance(config["start_urls"], list): errors.append("'start_urls' must be a list") else: - for url in config['start_urls']: - if not url.startswith(('http://', 'https://')): + for url in config["start_urls"]: + if not url.startswith(("http://", "https://")): errors.append(f"Invalid start_url: '{url}' (must start with http:// or https://)") return errors, warnings -def load_config(config_path: str) -> Dict[str, Any]: +def load_config(config_path: str) -> dict[str, Any]: """Load and validate configuration from JSON file. Args: @@ -1686,7 +1650,7 @@ def load_config(config_path: str) -> Dict[str, Any]: 'react' """ try: - with open(config_path, 'r', encoding='utf-8') as f: + with open(config_path, encoding="utf-8") as f: config = json.load(f) except json.JSONDecodeError as e: logger.error("āŒ Error: Invalid JSON in config file: %s", config_path) @@ -1720,7 +1684,7 @@ def load_config(config_path: str) -> Dict[str, Any]: return config -def interactive_config() -> Dict[str, Any]: +def interactive_config() -> dict[str, Any]: """Interactive configuration wizard for creating new configs. Prompts user for all required configuration fields step-by-step @@ -1735,48 +1699,48 @@ def interactive_config() -> Dict[str, Any]: >>> config['name'] 'react' """ - logger.info("\n" + "="*60) + logger.info("\n" + "=" * 60) logger.info("Documentation to Skill Converter") - logger.info("="*60 + "\n") + logger.info("=" * 60 + "\n") + + config: dict[str, Any] = {} - config: Dict[str, Any] = {} - # Basic info - config['name'] = input("Skill name (e.g., 'react', 'godot'): ").strip() - config['description'] = input("Skill description: ").strip() - config['base_url'] = input("Base URL (e.g., https://docs.example.com/): ").strip() - - if not config['base_url'].endswith('/'): - config['base_url'] += '/' - + config["name"] = input("Skill name (e.g., 'react', 'godot'): ").strip() + config["description"] = input("Skill description: ").strip() + config["base_url"] = input("Base URL (e.g., https://docs.example.com/): ").strip() + + if not config["base_url"].endswith("/"): + config["base_url"] += "/" + # Selectors logger.info("\nCSS Selectors (press Enter for defaults):") selectors = {} - selectors['main_content'] = input(" Main content [div[role='main']]: ").strip() or "div[role='main']" - selectors['title'] = input(" Title [title]: ").strip() or "title" - selectors['code_blocks'] = input(" Code blocks [pre code]: ").strip() or "pre code" - config['selectors'] = selectors - + selectors["main_content"] = input(" Main content [div[role='main']]: ").strip() or "div[role='main']" + selectors["title"] = input(" Title [title]: ").strip() or "title" + selectors["code_blocks"] = input(" Code blocks [pre code]: ").strip() or "pre code" + config["selectors"] = selectors + # URL patterns logger.info("\nURL Patterns (comma-separated, optional):") include = input(" Include: ").strip() exclude = input(" Exclude: ").strip() - config['url_patterns'] = { - 'include': [p.strip() for p in include.split(',') if p.strip()], - 'exclude': [p.strip() for p in exclude.split(',') if p.strip()] + config["url_patterns"] = { + "include": [p.strip() for p in include.split(",") if p.strip()], + "exclude": [p.strip() for p in exclude.split(",") if p.strip()], } - + # Settings rate = input(f"\nRate limit (seconds) [{DEFAULT_RATE_LIMIT}]: ").strip() - config['rate_limit'] = float(rate) if rate else DEFAULT_RATE_LIMIT + config["rate_limit"] = float(rate) if rate else DEFAULT_RATE_LIMIT max_p = input(f"Max pages [{DEFAULT_MAX_PAGES}]: ").strip() - config['max_pages'] = int(max_p) if max_p else DEFAULT_MAX_PAGES - + config["max_pages"] = int(max_p) if max_p else DEFAULT_MAX_PAGES + return config -def check_existing_data(name: str) -> Tuple[bool, int]: +def check_existing_data(name: str) -> tuple[bool, int]: """Check if scraped data already exists for a skill. Args: @@ -1792,9 +1756,9 @@ def check_existing_data(name: str) -> Tuple[bool, int]: """ data_dir = f"output/{name}_data" if os.path.exists(data_dir) and os.path.exists(f"{data_dir}/summary.json"): - with open(f"{data_dir}/summary.json", 'r', encoding='utf-8') as f: + with open(f"{data_dir}/summary.json", encoding="utf-8") as f: summary = json.load(f) - return True, summary.get('total_pages', 0) + return True, summary.get("total_pages", 0) return False, 0 @@ -1814,53 +1778,63 @@ def setup_argument_parser() -> argparse.ArgumentParser: configs/react.json """ parser = argparse.ArgumentParser( - description='Convert documentation websites to Claude skills', - formatter_class=argparse.RawDescriptionHelpFormatter + description="Convert documentation websites to Claude skills", + formatter_class=argparse.RawDescriptionHelpFormatter, ) - parser.add_argument('--interactive', '-i', action='store_true', - help='Interactive configuration mode') - parser.add_argument('--config', '-c', type=str, - help='Load configuration from file (e.g., configs/godot.json)') - parser.add_argument('--name', type=str, - help='Skill name') - parser.add_argument('--url', type=str, - help='Base documentation URL') - parser.add_argument('--description', '-d', type=str, - help='Skill description') - parser.add_argument('--skip-scrape', action='store_true', - help='Skip scraping, use existing data') - parser.add_argument('--dry-run', action='store_true', - help='Preview what will be scraped without actually scraping') - parser.add_argument('--enhance', action='store_true', - help='Enhance SKILL.md using Claude API after building (requires API key)') - parser.add_argument('--enhance-local', action='store_true', - help='Enhance SKILL.md using Claude Code (no API key needed, runs in background)') - parser.add_argument('--interactive-enhancement', action='store_true', - help='Open terminal window for enhancement (use with --enhance-local)') - parser.add_argument('--api-key', type=str, - help='Anthropic API key for --enhance (or set ANTHROPIC_API_KEY)') - parser.add_argument('--resume', action='store_true', - help='Resume from last checkpoint (for interrupted scrapes)') - parser.add_argument('--fresh', action='store_true', - help='Clear checkpoint and start fresh') - parser.add_argument('--rate-limit', '-r', type=float, metavar='SECONDS', - help=f'Override rate limit in seconds (default: from config or {DEFAULT_RATE_LIMIT}). Use 0 for no delay.') - parser.add_argument('--workers', '-w', type=int, metavar='N', - help='Number of parallel workers for faster scraping (default: 1, max: 10)') - parser.add_argument('--async', dest='async_mode', action='store_true', - help='Enable async mode for better parallel performance (2-3x faster than threads)') - parser.add_argument('--no-rate-limit', action='store_true', - help='Disable rate limiting completely (same as --rate-limit 0)') - parser.add_argument('--verbose', '-v', action='store_true', - help='Enable verbose output (DEBUG level logging)') - parser.add_argument('--quiet', '-q', action='store_true', - help='Minimize output (WARNING level logging only)') + parser.add_argument("--interactive", "-i", action="store_true", help="Interactive configuration mode") + parser.add_argument("--config", "-c", type=str, help="Load configuration from file (e.g., configs/godot.json)") + parser.add_argument("--name", type=str, help="Skill name") + parser.add_argument("--url", type=str, help="Base documentation URL") + parser.add_argument("--description", "-d", type=str, help="Skill description") + parser.add_argument("--skip-scrape", action="store_true", help="Skip scraping, use existing data") + parser.add_argument("--dry-run", action="store_true", help="Preview what will be scraped without actually scraping") + parser.add_argument( + "--enhance", action="store_true", help="Enhance SKILL.md using Claude API after building (requires API key)" + ) + parser.add_argument( + "--enhance-local", + action="store_true", + help="Enhance SKILL.md using Claude Code (no API key needed, runs in background)", + ) + parser.add_argument( + "--interactive-enhancement", + action="store_true", + help="Open terminal window for enhancement (use with --enhance-local)", + ) + parser.add_argument("--api-key", type=str, help="Anthropic API key for --enhance (or set ANTHROPIC_API_KEY)") + parser.add_argument("--resume", action="store_true", help="Resume from last checkpoint (for interrupted scrapes)") + parser.add_argument("--fresh", action="store_true", help="Clear checkpoint and start fresh") + parser.add_argument( + "--rate-limit", + "-r", + type=float, + metavar="SECONDS", + help=f"Override rate limit in seconds (default: from config or {DEFAULT_RATE_LIMIT}). Use 0 for no delay.", + ) + parser.add_argument( + "--workers", + "-w", + type=int, + metavar="N", + help="Number of parallel workers for faster scraping (default: 1, max: 10)", + ) + parser.add_argument( + "--async", + dest="async_mode", + action="store_true", + help="Enable async mode for better parallel performance (2-3x faster than threads)", + ) + parser.add_argument( + "--no-rate-limit", action="store_true", help="Disable rate limiting completely (same as --rate-limit 0)" + ) + parser.add_argument("--verbose", "-v", action="store_true", help="Enable verbose output (DEBUG level logging)") + parser.add_argument("--quiet", "-q", action="store_true", help="Minimize output (WARNING level logging only)") return parser -def get_configuration(args: argparse.Namespace) -> Dict[str, Any]: +def get_configuration(args: argparse.Namespace) -> dict[str, Any]: """Load or create configuration from command-line arguments. Handles three configuration modes: @@ -1889,25 +1863,21 @@ def get_configuration(args: argparse.Namespace) -> Dict[str, Any]: config = interactive_config() else: config = { - 'name': args.name, - 'description': args.description or f'Use when working with {args.name}', - 'base_url': args.url, - 'selectors': { - 'main_content': "div[role='main']", - 'title': 'title', - 'code_blocks': 'pre code' - }, - 'url_patterns': {'include': [], 'exclude': []}, - 'rate_limit': DEFAULT_RATE_LIMIT, - 'max_pages': DEFAULT_MAX_PAGES + "name": args.name, + "description": args.description or f"Use when working with {args.name}", + "base_url": args.url, + "selectors": {"main_content": "div[role='main']", "title": "title", "code_blocks": "pre code"}, + "url_patterns": {"include": [], "exclude": []}, + "rate_limit": DEFAULT_RATE_LIMIT, + "max_pages": DEFAULT_MAX_PAGES, } # Apply CLI overrides for rate limiting if args.no_rate_limit: - config['rate_limit'] = 0 + config["rate_limit"] = 0 logger.info("⚔ Rate limiting disabled") elif args.rate_limit is not None: - config['rate_limit'] = args.rate_limit + config["rate_limit"] = args.rate_limit if args.rate_limit == 0: logger.info("⚔ Rate limiting disabled") else: @@ -1923,14 +1893,14 @@ def get_configuration(args: argparse.Namespace) -> Dict[str, Any]: if args.workers > 10: logger.warning("āš ļø Warning: --workers capped at 10 (requested %d)", args.workers) args.workers = 10 - config['workers'] = args.workers + config["workers"] = args.workers if args.workers > 1: logger.info("šŸš€ Parallel scraping enabled: %d workers", args.workers) # Apply CLI override for async mode if args.async_mode: - config['async_mode'] = True - if config.get('workers', 1) > 1: + config["async_mode"] = True + if config.get("workers", 1) > 1: logger.info("⚔ Async mode enabled (2-3x faster than threads)") else: logger.warning("āš ļø Async mode enabled but workers=1. Consider using --workers 4 for better performance") @@ -1938,7 +1908,7 @@ def get_configuration(args: argparse.Namespace) -> Dict[str, Any]: return config -def execute_scraping_and_building(config: Dict[str, Any], args: argparse.Namespace) -> Optional['DocToSkillConverter']: +def execute_scraping_and_building(config: dict[str, Any], args: argparse.Namespace) -> Optional["DocToSkillConverter"]: """Execute the scraping and skill building process. Handles dry run mode, existing data checks, scraping with checkpoints, @@ -1970,23 +1940,24 @@ def execute_scraping_and_building(config: Dict[str, Any], args: argparse.Namespa converter.scrape_all() logger.info("\nšŸ“‹ Configuration Summary:") - logger.info(" Name: %s", config['name']) - logger.info(" Base URL: %s", config['base_url']) - logger.info(" Max pages: %d", config.get('max_pages', DEFAULT_MAX_PAGES)) - logger.info(" Rate limit: %ss", config.get('rate_limit', DEFAULT_RATE_LIMIT)) - logger.info(" Categories: %d", len(config.get('categories', {}))) + logger.info(" Name: %s", config["name"]) + logger.info(" Base URL: %s", config["base_url"]) + logger.info(" Max pages: %d", config.get("max_pages", DEFAULT_MAX_PAGES)) + logger.info(" Rate limit: %ss", config.get("rate_limit", DEFAULT_RATE_LIMIT)) + logger.info(" Categories: %d", len(config.get("categories", {}))) return None # Check for existing data - exists, page_count = check_existing_data(config['name']) + exists, page_count = check_existing_data(config["name"]) if exists and not args.skip_scrape and not args.fresh: # Check force_rescrape flag from config - if config.get('force_rescrape', False): + if config.get("force_rescrape", False): # Auto-delete cached data and rescrape logger.info("\nāœ“ Found existing data: %d pages", page_count) logger.info(" force_rescrape enabled - deleting cached data and rescaping") import shutil + data_dir = f"output/{config['name']}_data" if os.path.exists(data_dir): shutil.rmtree(data_dir) @@ -1995,7 +1966,7 @@ def execute_scraping_and_building(config: Dict[str, Any], args: argparse.Namespa # Only prompt if force_rescrape is False logger.info("\nāœ“ Found existing data: %d pages", page_count) response = input("Use existing data? (y/n): ").strip().lower() - if response == 'y': + if response == "y": args.skip_scrape = True elif exists and args.fresh: logger.info("\nāœ“ Found existing data: %d pages", page_count) @@ -2024,9 +1995,9 @@ def execute_scraping_and_building(config: Dict[str, Any], args: argparse.Namespa if converter.checkpoint_enabled: converter.save_checkpoint() logger.info("šŸ’¾ Progress saved to checkpoint") - logger.info(" Resume with: --config %s --resume", args.config if args.config else 'config.json') + logger.info(" Resume with: --config %s --resume", args.config if args.config else "config.json") response = input("Continue with skill building? (y/n): ").strip().lower() - if response != 'y': + if response != "y": return None else: logger.info("\nā­ļø Skipping scrape, using existing data") @@ -2040,7 +2011,7 @@ def execute_scraping_and_building(config: Dict[str, Any], args: argparse.Namespa return converter -def execute_enhancement(config: Dict[str, Any], args: argparse.Namespace) -> None: +def execute_enhancement(config: dict[str, Any], args: argparse.Namespace) -> None: """Execute optional SKILL.md enhancement with Claude. Supports two enhancement modes: @@ -2067,9 +2038,9 @@ def execute_enhancement(config: Dict[str, Any], args: argparse.Namespace) -> Non logger.info("=" * 60 + "\n") try: - enhance_cmd = ['python3', 'cli/enhance_skill.py', f'output/{config["name"]}/'] + enhance_cmd = ["python3", "cli/enhance_skill.py", f"output/{config['name']}/"] if args.api_key: - enhance_cmd.extend(['--api-key', args.api_key]) + enhance_cmd.extend(["--api-key", args.api_key]) result = subprocess.run(enhance_cmd, check=True) if result.returncode == 0: @@ -2078,7 +2049,7 @@ def execute_enhancement(config: Dict[str, Any], args: argparse.Namespace) -> Non logger.warning("\n⚠ Enhancement failed, but skill was still built") except FileNotFoundError: logger.warning("\n⚠ enhance_skill.py not found. Run manually:") - logger.info(" skill-seekers-enhance output/%s/", config['name']) + logger.info(" skill-seekers-enhance output/%s/", config["name"]) # Optional enhancement with Claude Code (local, no API key) if args.enhance_local: @@ -2090,9 +2061,9 @@ def execute_enhancement(config: Dict[str, Any], args: argparse.Namespace) -> Non logger.info("=" * 60 + "\n") try: - enhance_cmd = ['skill-seekers-enhance', f'output/{config["name"]}/'] + enhance_cmd = ["skill-seekers-enhance", f"output/{config['name']}/"] if args.interactive_enhancement: - enhance_cmd.append('--interactive-enhancement') + enhance_cmd.append("--interactive-enhancement") result = subprocess.run(enhance_cmd, check=True) @@ -2102,18 +2073,18 @@ def execute_enhancement(config: Dict[str, Any], args: argparse.Namespace) -> Non logger.warning("\n⚠ Enhancement failed, but skill was still built") except FileNotFoundError: logger.warning("\n⚠ skill-seekers-enhance command not found. Run manually:") - logger.info(" skill-seekers-enhance output/%s/", config['name']) + logger.info(" skill-seekers-enhance output/%s/", config["name"]) # Print packaging instructions logger.info("\nšŸ“¦ Package your skill:") - logger.info(" skill-seekers-package output/%s/", config['name']) + logger.info(" skill-seekers-package output/%s/", config["name"]) # Suggest enhancement if not done if not args.enhance and not args.enhance_local: logger.info("\nšŸ’” Optional: Enhance SKILL.md with Claude:") - logger.info(" Local (recommended): skill-seekers-enhance output/%s/", config['name']) + logger.info(" Local (recommended): skill-seekers-enhance output/%s/", config["name"]) logger.info(" or re-run with: --enhance-local") - logger.info(" API-based: skill-seekers-enhance-api output/%s/", config['name']) + logger.info(" API-based: skill-seekers-enhance-api output/%s/", config["name"]) logger.info(" or re-run with: --enhance") logger.info("\nšŸ’” Tip: Use --interactive-enhancement with --enhance-local to open terminal window") diff --git a/src/skill_seekers/cli/enhance_skill.py b/src/skill_seekers/cli/enhance_skill.py index e24048f..646d9ad 100644 --- a/src/skill_seekers/cli/enhance_skill.py +++ b/src/skill_seekers/cli/enhance_skill.py @@ -15,10 +15,9 @@ Usage: skill-seekers enhance output/react/ --target openai --api-key sk-proj-... """ +import argparse import os import sys -import json -import argparse from pathlib import Path # Add parent directory to path for imports when run as script @@ -42,9 +41,7 @@ class SkillEnhancer: self.skill_md_path = self.skill_dir / "SKILL.md" # Get API key - support both ANTHROPIC_API_KEY and ANTHROPIC_AUTH_TOKEN - self.api_key = (api_key or - os.environ.get('ANTHROPIC_API_KEY') or - os.environ.get('ANTHROPIC_AUTH_TOKEN')) + self.api_key = api_key or os.environ.get("ANTHROPIC_API_KEY") or os.environ.get("ANTHROPIC_AUTH_TOKEN") if not self.api_key: raise ValueError( "No API key provided. Set ANTHROPIC_API_KEY or ANTHROPIC_AUTH_TOKEN " @@ -52,10 +49,10 @@ class SkillEnhancer: ) # Support custom base URL for alternative API endpoints - base_url = os.environ.get('ANTHROPIC_BASE_URL') - client_kwargs = {'api_key': self.api_key} + base_url = os.environ.get("ANTHROPIC_BASE_URL") + client_kwargs = {"api_key": self.api_key} if base_url: - client_kwargs['base_url'] = base_url + client_kwargs["base_url"] = base_url print(f"ā„¹ļø Using custom API base URL: {base_url}") self.client = anthropic.Anthropic(**client_kwargs) @@ -64,7 +61,7 @@ class SkillEnhancer: """Read existing SKILL.md""" if not self.skill_md_path.exists(): return None - return self.skill_md_path.read_text(encoding='utf-8') + return self.skill_md_path.read_text(encoding="utf-8") def enhance_skill_md(self, references, current_skill_md): """Use Claude to enhance SKILL.md""" @@ -80,17 +77,14 @@ class SkillEnhancer: model="claude-sonnet-4-20250514", max_tokens=4096, temperature=0.3, - messages=[{ - "role": "user", - "content": prompt - }] + messages=[{"role": "user", "content": prompt}], ) # Handle response content - newer SDK versions may include ThinkingBlock # Find the TextBlock containing the actual response enhanced_content = None for block in message.content: - if hasattr(block, 'text'): + if hasattr(block, "text"): enhanced_content = block.text break @@ -113,10 +107,10 @@ class SkillEnhancer: # Analyze sources sources_found = set() for metadata in references.values(): - sources_found.add(metadata['source']) + sources_found.add(metadata["source"]) # Analyze conflicts if present - has_conflicts = any('conflicts' in meta['path'] for meta in references.values()) + has_conflicts = any("conflicts" in meta["path"] for meta in references.values()) prompt = f"""You are enhancing a Claude skill's SKILL.md file. This skill is about: {skill_name} @@ -124,14 +118,14 @@ I've scraped documentation from multiple sources and organized it into reference SKILL OVERVIEW: - Name: {skill_name} -- Source Types: {', '.join(sorted(sources_found))} -- Multi-Source: {'Yes' if len(sources_found) > 1 else 'No'} -- Conflicts Detected: {'Yes - see conflicts.md in references' if has_conflicts else 'No'} +- Source Types: {", ".join(sorted(sources_found))} +- Multi-Source: {"Yes" if len(sources_found) > 1 else "No"} +- Conflicts Detected: {"Yes - see conflicts.md in references" if has_conflicts else "No"} CURRENT SKILL.MD: -{'```markdown' if current_skill_md else '(none - create from scratch)'} -{current_skill_md or 'No existing SKILL.md'} -{'```' if current_skill_md else ''} +{"```markdown" if current_skill_md else "(none - create from scratch)"} +{current_skill_md or "No existing SKILL.md"} +{"```" if current_skill_md else ""} SOURCE ANALYSIS: This skill combines knowledge from {len(sources_found)} source type(s): @@ -141,8 +135,8 @@ This skill combines knowledge from {len(sources_found)} source type(s): # Group references by (source_type, repo_id) for multi-source support by_source = {} for filename, metadata in references.items(): - source = metadata['source'] - repo_id = metadata.get('repo_id') # None for single-source + source = metadata["source"] + repo_id = metadata.get("repo_id") # None for single-source key = (source, repo_id) if repo_id else (source, None) if key not in by_source: @@ -150,7 +144,7 @@ This skill combines knowledge from {len(sources_found)} source type(s): by_source[key].append((filename, metadata)) # Add source breakdown with repo identity - for (source, repo_id) in sorted(by_source.keys()): + for source, repo_id in sorted(by_source.keys()): files = by_source[(source, repo_id)] if repo_id: prompt += f"\n**{source.upper()} - {repo_id} ({len(files)} file(s))**\n" @@ -164,14 +158,14 @@ This skill combines knowledge from {len(sources_found)} source type(s): prompt += "\n\nREFERENCE DOCUMENTATION:\n" # Add references grouped by (source, repo_id) with metadata - for (source, repo_id) in sorted(by_source.keys()): + for source, repo_id in sorted(by_source.keys()): if repo_id: prompt += f"\n### {source.upper()} SOURCES - {repo_id}\n\n" else: prompt += f"\n### {source.upper()} SOURCES\n\n" for filename, metadata in by_source[(source, repo_id)]: - content = metadata['content'] + content = metadata["content"] # Limit per-file to 30K if len(content) > 30000: content = content[:30000] + "\n\n[Content truncated for size...]" @@ -197,12 +191,12 @@ MULTI-REPOSITORY HANDLING: # Detect multiple repos from same source type repo_ids = set() for metadata in references.values(): - if metadata.get('repo_id'): - repo_ids.add(metadata['repo_id']) + if metadata.get("repo_id"): + repo_ids.add(metadata["repo_id"]) if len(repo_ids) > 1: prompt += f""" -āš ļø MULTIPLE REPOSITORIES DETECTED: {', '.join(sorted(repo_ids))} +āš ļø MULTIPLE REPOSITORIES DETECTED: {", ".join(sorted(repo_ids))} This skill combines codebase analysis from {len(repo_ids)} different repositories. Each repo has its own ARCHITECTURE.md, patterns, examples, and configuration. @@ -285,27 +279,23 @@ Return ONLY the complete SKILL.md content, starting with the frontmatter (---). """Save the enhanced SKILL.md""" # Backup original if self.skill_md_path.exists(): - backup_path = self.skill_md_path.with_suffix('.md.backup') + backup_path = self.skill_md_path.with_suffix(".md.backup") self.skill_md_path.rename(backup_path) print(f" šŸ’¾ Backed up original to: {backup_path.name}") # Save enhanced version - self.skill_md_path.write_text(content, encoding='utf-8') - print(f" āœ… Saved enhanced SKILL.md") + self.skill_md_path.write_text(content, encoding="utf-8") + print(" āœ… Saved enhanced SKILL.md") def run(self): """Main enhancement workflow""" - print(f"\n{'='*60}") + print(f"\n{'=' * 60}") print(f"ENHANCING SKILL: {self.skill_dir.name}") - print(f"{'='*60}\n") + print(f"{'=' * 60}\n") # Read reference files print("šŸ“– Reading reference documentation...") - references = read_reference_files( - self.skill_dir, - max_chars=API_CONTENT_LIMIT, - preview_limit=API_PREVIEW_LIMIT - ) + references = read_reference_files(self.skill_dir, max_chars=API_CONTENT_LIMIT, preview_limit=API_PREVIEW_LIMIT) if not references: print("āŒ No reference files found to analyze") @@ -314,11 +304,11 @@ Return ONLY the complete SKILL.md content, starting with the frontmatter (---). # Analyze sources sources_found = set() for metadata in references.values(): - sources_found.add(metadata['source']) + sources_found.add(metadata["source"]) print(f" āœ“ Read {len(references)} reference files") print(f" āœ“ Sources: {', '.join(sorted(sources_found))}") - total_size = sum(meta['size'] for meta in references.values()) + total_size = sum(meta["size"] for meta in references.values()) print(f" āœ“ Total size: {total_size:,} characters\n") # Read current SKILL.md @@ -326,7 +316,7 @@ Return ONLY the complete SKILL.md content, starting with the frontmatter (---). if current_skill_md: print(f" ℹ Found existing SKILL.md ({len(current_skill_md)} chars)") else: - print(f" ℹ No existing SKILL.md, will create new one") + print(" ℹ No existing SKILL.md, will create new one") # Enhance with Claude enhanced = self.enhance_skill_md(references, current_skill_md) @@ -341,11 +331,11 @@ Return ONLY the complete SKILL.md content, starting with the frontmatter (---). print("šŸ’¾ Saving enhanced SKILL.md...") self.save_enhanced_skill_md(enhanced) - print(f"\nāœ… Enhancement complete!") - print(f"\nNext steps:") + print("\nāœ… Enhancement complete!") + print("\nNext steps:") print(f" 1. Review: {self.skill_md_path}") print(f" 2. If you don't like it, restore backup: {self.skill_md_path.with_suffix('.md.backup')}") - print(f" 3. Package your skill:") + print(" 3. Package your skill:") print(f" skill-seekers package {self.skill_dir}/") return True @@ -353,7 +343,7 @@ Return ONLY the complete SKILL.md content, starting with the frontmatter (---). def main(): parser = argparse.ArgumentParser( - description='Enhance SKILL.md using platform AI APIs', + description="Enhance SKILL.md using platform AI APIs", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=""" Examples: @@ -374,19 +364,18 @@ Examples: # Dry run skill-seekers enhance output/godot/ --dry-run -""" +""", ) - parser.add_argument('skill_dir', type=str, - help='Path to skill directory (e.g., output/steam-inventory/)') - parser.add_argument('--api-key', type=str, - help='Platform API key (or set environment variable)') - parser.add_argument('--target', - choices=['claude', 'gemini', 'openai'], - default='claude', - help='Target LLM platform (default: claude)') - parser.add_argument('--dry-run', action='store_true', - help='Show what would be done without calling API') + parser.add_argument("skill_dir", type=str, help="Path to skill directory (e.g., output/steam-inventory/)") + parser.add_argument("--api-key", type=str, help="Platform API key (or set environment variable)") + parser.add_argument( + "--target", + choices=["claude", "gemini", "openai"], + default="claude", + help="Target LLM platform (default: claude)", + ) + parser.add_argument("--dry-run", action="store_true", help="Show what would be done without calling API") args = parser.parse_args() @@ -402,7 +391,7 @@ Examples: # Dry run mode if args.dry_run: - print(f"šŸ” DRY RUN MODE") + print("šŸ” DRY RUN MODE") print(f" Would enhance: {skill_dir}") print(f" References: {skill_dir / 'references'}") print(f" SKILL.md: {skill_dir / 'SKILL.md'}") @@ -427,7 +416,7 @@ Examples: if not adaptor.supports_enhancement(): print(f"āŒ Error: {adaptor.PLATFORM_NAME} does not support AI enhancement") - print(f"\nSupported platforms for enhancement:") + print("\nSupported platforms for enhancement:") print(" - Claude AI (Anthropic)") print(" - Google Gemini") print(" - OpenAI ChatGPT") @@ -436,7 +425,7 @@ Examples: # Get API key api_key = args.api_key if not api_key: - api_key = os.environ.get(adaptor.get_env_var_name(), '').strip() + api_key = os.environ.get(adaptor.get_env_var_name(), "").strip() if not api_key: print(f"āŒ Error: {adaptor.get_env_var_name()} not set") @@ -447,19 +436,19 @@ Examples: sys.exit(1) # Run enhancement using adaptor - print(f"\n{'='*60}") + print(f"\n{'=' * 60}") print(f"ENHANCING SKILL: {skill_dir}") print(f"Platform: {adaptor.PLATFORM_NAME}") - print(f"{'='*60}\n") + print(f"{'=' * 60}\n") success = adaptor.enhance(Path(skill_dir), api_key) if success: - print(f"\nāœ… Enhancement complete!") - print(f"\nNext steps:") + print("\nāœ… Enhancement complete!") + print("\nNext steps:") print(f" 1. Review: {Path(skill_dir) / 'SKILL.md'}") print(f" 2. If you don't like it, restore backup: {Path(skill_dir) / 'SKILL.md.backup'}") - print(f" 3. Package your skill:") + print(" 3. Package your skill:") print(f" skill-seekers package {skill_dir}/ --target {args.target}") sys.exit(0 if success else 1) @@ -474,6 +463,7 @@ Examples: except Exception as e: print(f"āŒ Unexpected error: {e}") import traceback + traceback.print_exc() sys.exit(1) diff --git a/src/skill_seekers/cli/enhance_skill_local.py b/src/skill_seekers/cli/enhance_skill_local.py index 4209230..bb8463b 100644 --- a/src/skill_seekers/cli/enhance_skill_local.py +++ b/src/skill_seekers/cli/enhance_skill_local.py @@ -36,15 +36,15 @@ Terminal Selection: Supported terminals: Ghostty, iTerm, Terminal, WezTerm """ -import os -import sys -import time -import subprocess -import tempfile import json +import os +import subprocess +import sys +import tempfile import threading -from pathlib import Path +import time from datetime import datetime +from pathlib import Path # Add parent directory to path for imports when run as script sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) @@ -77,29 +77,29 @@ def detect_terminal_app(): """ # Map TERM_PROGRAM values to macOS app names TERMINAL_MAP = { - 'Apple_Terminal': 'Terminal', - 'iTerm.app': 'iTerm', - 'ghostty': 'Ghostty', - 'WezTerm': 'WezTerm', + "Apple_Terminal": "Terminal", + "iTerm.app": "iTerm", + "ghostty": "Ghostty", + "WezTerm": "WezTerm", } # Priority 1: Check SKILL_SEEKER_TERMINAL env var (explicit preference) - preferred_terminal = os.environ.get('SKILL_SEEKER_TERMINAL', '').strip() + preferred_terminal = os.environ.get("SKILL_SEEKER_TERMINAL", "").strip() if preferred_terminal: - return preferred_terminal, 'SKILL_SEEKER_TERMINAL' + return preferred_terminal, "SKILL_SEEKER_TERMINAL" # Priority 2: Check TERM_PROGRAM (inherit current terminal) - term_program = os.environ.get('TERM_PROGRAM', '').strip() + term_program = os.environ.get("TERM_PROGRAM", "").strip() if term_program and term_program in TERMINAL_MAP: - return TERMINAL_MAP[term_program], 'TERM_PROGRAM' + return TERMINAL_MAP[term_program], "TERM_PROGRAM" # Priority 3: Fallback to Terminal.app if term_program: # TERM_PROGRAM is set but unknown - return 'Terminal', f'unknown TERM_PROGRAM ({term_program})' + return "Terminal", f"unknown TERM_PROGRAM ({term_program})" else: # No TERM_PROGRAM set - return 'Terminal', 'default' + return "Terminal", "default" class LocalSkillEnhancer: @@ -132,7 +132,7 @@ class LocalSkillEnhancer: Returns: Summarized content """ - lines = content.split('\n') + lines = content.split("\n") target_lines = int(len(lines) * target_ratio) # Priority 1: Keep introduction (first 20%) @@ -146,7 +146,7 @@ class LocalSkillEnhancer: block_start_idx = 0 for i, line in enumerate(lines[intro_lines:], start=intro_lines): - if line.strip().startswith('```'): + if line.strip().startswith("```"): if in_code_block: # End of code block - add closing ``` and save current_block.append(line) @@ -174,9 +174,9 @@ class LocalSkillEnhancer: headings_added = 0 while i < len(lines) and headings_added < 10: line = lines[i] - if line.startswith('#'): + if line.startswith("#"): # Found heading - keep it and next 3 lines - chunk = lines[i:min(i+4, len(lines))] + chunk = lines[i : min(i + 4, len(lines))] result.extend(chunk) headings_added += 1 i += 4 @@ -185,7 +185,7 @@ class LocalSkillEnhancer: result.append("\n\n[Content intelligently summarized - full details in reference files]") - return '\n'.join(result) + return "\n".join(result) def create_enhancement_prompt(self, use_summarization=False, summarization_ratio=0.3): """Create the prompt file for Claude Code @@ -197,9 +197,7 @@ class LocalSkillEnhancer: # Read reference files (with enriched metadata) references = read_reference_files( - self.skill_dir, - max_chars=LOCAL_CONTENT_LIMIT, - preview_limit=LOCAL_PREVIEW_LIMIT + self.skill_dir, max_chars=LOCAL_CONTENT_LIMIT, preview_limit=LOCAL_PREVIEW_LIMIT ) if not references: @@ -209,52 +207,54 @@ class LocalSkillEnhancer: # Analyze sources sources_found = set() for metadata in references.values(): - sources_found.add(metadata['source']) + sources_found.add(metadata["source"]) # Calculate total size - total_ref_size = sum(meta['size'] for meta in references.values()) + total_ref_size = sum(meta["size"] for meta in references.values()) # Apply summarization if requested or if content is too large if use_summarization or total_ref_size > 30000: if not use_summarization: print(f" āš ļø Large skill detected ({total_ref_size:,} chars)") - print(f" šŸ“Š Applying smart summarization (target: {int(summarization_ratio*100)}% of original)") + print(f" šŸ“Š Applying smart summarization (target: {int(summarization_ratio * 100)}% of original)") print() # Summarize each reference for filename, metadata in references.items(): - summarized = self.summarize_reference(metadata['content'], summarization_ratio) - metadata['content'] = summarized - metadata['size'] = len(summarized) + summarized = self.summarize_reference(metadata["content"], summarization_ratio) + metadata["content"] = summarized + metadata["size"] = len(summarized) - new_size = sum(meta['size'] for meta in references.values()) - print(f" āœ“ Reduced from {total_ref_size:,} to {new_size:,} chars ({int(new_size/total_ref_size*100)}%)") + new_size = sum(meta["size"] for meta in references.values()) + print( + f" āœ“ Reduced from {total_ref_size:,} to {new_size:,} chars ({int(new_size / total_ref_size * 100)}%)" + ) print() # Read current SKILL.md current_skill_md = "" if self.skill_md_path.exists(): - current_skill_md = self.skill_md_path.read_text(encoding='utf-8') + current_skill_md = self.skill_md_path.read_text(encoding="utf-8") # Analyze conflicts if present - has_conflicts = any('conflicts' in meta['path'] for meta in references.values()) + has_conflicts = any("conflicts" in meta["path"] for meta in references.values()) # Build prompt with multi-source awareness prompt = f"""I need you to enhance the SKILL.md file for the {self.skill_dir.name} skill. SKILL OVERVIEW: - Name: {self.skill_dir.name} -- Source Types: {', '.join(sorted(sources_found))} -- Multi-Source: {'Yes' if len(sources_found) > 1 else 'No'} -- Conflicts Detected: {'Yes - see conflicts.md in references' if has_conflicts else 'No'} +- Source Types: {", ".join(sorted(sources_found))} +- Multi-Source: {"Yes" if len(sources_found) > 1 else "No"} +- Conflicts Detected: {"Yes - see conflicts.md in references" if has_conflicts else "No"} CURRENT SKILL.MD: -{'-'*60} -{current_skill_md if current_skill_md else '(No existing SKILL.md - create from scratch)'} -{'-'*60} +{"-" * 60} +{current_skill_md if current_skill_md else "(No existing SKILL.md - create from scratch)"} +{"-" * 60} SOURCE ANALYSIS: -{'-'*60} +{"-" * 60} This skill combines knowledge from {len(sources_found)} source type(s): """ @@ -262,8 +262,8 @@ This skill combines knowledge from {len(sources_found)} source type(s): # Group references by (source_type, repo_id) for multi-source support by_source = {} for filename, metadata in references.items(): - source = metadata['source'] - repo_id = metadata.get('repo_id') # None for single-source + source = metadata["source"] + repo_id = metadata.get("repo_id") # None for single-source key = (source, repo_id) if repo_id else (source, None) if key not in by_source: @@ -271,7 +271,7 @@ This skill combines knowledge from {len(sources_found)} source type(s): by_source[key].append((filename, metadata)) # Add source breakdown with repo identity - for (source, repo_id) in sorted(by_source.keys()): + for source, repo_id in sorted(by_source.keys()): files = by_source[(source, repo_id)] if repo_id: prompt += f"\n**{source.upper()} - {repo_id} ({len(files)} file(s))**\n" @@ -283,14 +283,14 @@ This skill combines knowledge from {len(sources_found)} source type(s): prompt += f"- ... and {len(files) - 5} more\n" prompt += f""" -{'-'*60} +{"-" * 60} REFERENCE DOCUMENTATION: -{'-'*60} +{"-" * 60} """ # Add references grouped by (source, repo_id) with metadata - for (source, repo_id) in sorted(by_source.keys()): + for source, repo_id in sorted(by_source.keys()): if repo_id: prompt += f"\n### {source.upper()} SOURCES - {repo_id}\n\n" else: @@ -298,7 +298,7 @@ REFERENCE DOCUMENTATION: for filename, metadata in by_source[(source, repo_id)]: # Further limit per-file to 12K to be safe - content = metadata['content'] + content = metadata["content"] max_per_file = 12000 if len(content) > max_per_file: content = content[:max_per_file] + "\n\n[Content truncated for size...]" @@ -311,7 +311,7 @@ REFERENCE DOCUMENTATION: prompt += f"{content}\n" prompt += f""" -{'-'*60} +{"-" * 60} REFERENCE PRIORITY (when sources differ): 1. **Code patterns (codebase_analysis)**: Ground truth - what the code actually does @@ -325,12 +325,12 @@ MULTI-REPOSITORY HANDLING: # Detect multiple repos from same source type repo_ids = set() for metadata in references.values(): - if metadata.get('repo_id'): - repo_ids.add(metadata['repo_id']) + if metadata.get("repo_id"): + repo_ids.add(metadata["repo_id"]) if len(repo_ids) > 1: prompt += f""" -āš ļø MULTIPLE REPOSITORIES DETECTED: {', '.join(sorted(repo_ids))} +āš ļø MULTIPLE REPOSITORIES DETECTED: {", ".join(sorted(repo_ids))} This skill combines codebase analysis from {len(repo_ids)} different repositories. Each repo has its own ARCHITECTURE.md, patterns, examples, and configuration. @@ -435,10 +435,10 @@ After writing, the file SKILL.md should: "progress": progress, "timestamp": datetime.now().isoformat(), "skill_dir": str(self.skill_dir), - "error": error + "error": error, } - self.status_file.write_text(json.dumps(status_data, indent=2), encoding='utf-8') + self.status_file.write_text(json.dumps(status_data, indent=2), encoding="utf-8") def read_status(self): """Read enhancement status from file. @@ -450,7 +450,7 @@ After writing, the file SKILL.md should: return None try: - return json.loads(self.status_file.read_text(encoding='utf-8')) + return json.loads(self.status_file.read_text(encoding="utf-8")) except: return None @@ -482,9 +482,9 @@ After writing, the file SKILL.md should: # Daemon mode: Run as persistent process with monitoring if daemon: return self._run_daemon(timeout) - print(f"\n{'='*60}") + print(f"\n{'=' * 60}") print(f"LOCAL ENHANCEMENT: {self.skill_dir.name}") - print(f"{'='*60}\n") + print(f"{'=' * 60}\n") # Validate if not self.skill_dir.exists(): @@ -494,9 +494,7 @@ After writing, the file SKILL.md should: # Read reference files print("šŸ“– Reading reference documentation...") references = read_reference_files( - self.skill_dir, - max_chars=LOCAL_CONTENT_LIMIT, - preview_limit=LOCAL_PREVIEW_LIMIT + self.skill_dir, max_chars=LOCAL_CONTENT_LIMIT, preview_limit=LOCAL_PREVIEW_LIMIT ) if not references: @@ -504,7 +502,7 @@ After writing, the file SKILL.md should: return False print(f" āœ“ Read {len(references)} reference files") - total_size = sum(ref['size'] for ref in references.values()) + total_size = sum(ref["size"] for ref in references.values()) print(f" āœ“ Total size: {total_size:,} characters\n") # Check if we need smart summarization @@ -513,7 +511,7 @@ After writing, the file SKILL.md should: if use_summarization: print("āš ļø LARGE SKILL DETECTED") print(f" šŸ“Š Reference content: {total_size:,} characters") - print(f" šŸ’” Claude CLI limit: ~30,000-40,000 characters") + print(" šŸ’” Claude CLI limit: ~30,000-40,000 characters") print() print(" šŸ”§ Applying smart summarization to ensure success...") print(" • Keeping introductions and overviews") @@ -530,13 +528,13 @@ After writing, the file SKILL.md should: return False # Save prompt to temp file - with tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False, encoding='utf-8') as f: + with tempfile.NamedTemporaryFile(mode="w", suffix=".txt", delete=False, encoding="utf-8") as f: prompt_file = f.name f.write(prompt) if use_summarization: print(f" āœ“ Prompt created and optimized ({len(prompt):,} characters)") - print(f" āœ“ Ready for Claude CLI (within safe limits)") + print(" āœ“ Ready for Claude CLI (within safe limits)") print() else: print(f" āœ“ Prompt saved ({len(prompt):,} characters)\n") @@ -555,49 +553,49 @@ After writing, the file SKILL.md should: print() # Create a shell script to run in the terminal - shell_script = f'''#!/bin/bash + shell_script = f"""#!/bin/bash claude {prompt_file} echo "" echo "āœ… Enhancement complete!" echo "Press any key to close..." read -n 1 rm {prompt_file} -''' +""" # Save shell script - with tempfile.NamedTemporaryFile(mode='w', suffix='.sh', delete=False) as f: + with tempfile.NamedTemporaryFile(mode="w", suffix=".sh", delete=False) as f: script_file = f.name f.write(shell_script) os.chmod(script_file, 0o755) # Launch in new terminal (macOS specific) - if sys.platform == 'darwin': + if sys.platform == "darwin": # Detect which terminal app to use terminal_app, detection_method = detect_terminal_app() # Show detection info - if detection_method == 'SKILL_SEEKER_TERMINAL': + if detection_method == "SKILL_SEEKER_TERMINAL": print(f" Using terminal: {terminal_app} (from SKILL_SEEKER_TERMINAL)") - elif detection_method == 'TERM_PROGRAM': + elif detection_method == "TERM_PROGRAM": print(f" Using terminal: {terminal_app} (inherited from current terminal)") - elif detection_method.startswith('unknown TERM_PROGRAM'): + elif detection_method.startswith("unknown TERM_PROGRAM"): print(f"āš ļø {detection_method}") - print(f" → Using Terminal.app as fallback") + print(" → Using Terminal.app as fallback") else: print(f" Using terminal: {terminal_app} (default)") try: - subprocess.Popen(['open', '-a', terminal_app, script_file]) + subprocess.Popen(["open", "-a", terminal_app, script_file]) except Exception as e: print(f"āš ļø Error launching {terminal_app}: {e}") print(f"\nManually run: {script_file}") return False else: print("āš ļø Auto-launch only works on macOS") - print(f"\nManually run this command in a new terminal:") + print("\nManually run this command in a new terminal:") print(f" claude '{prompt_file}'") - print(f"\nThen delete the prompt file:") + print("\nThen delete the prompt file:") print(f" rm '{prompt_file}'") return False @@ -614,7 +612,9 @@ rm {prompt_file} print() print("šŸ’” When done:") print(f" 1. Check the enhanced SKILL.md: {self.skill_md_path}") - print(f" 2. If you don't like it, restore: mv {self.skill_md_path.with_suffix('.md.backup')} {self.skill_md_path}") + print( + f" 2. If you don't like it, restore: mv {self.skill_md_path.with_suffix('.md.backup')} {self.skill_md_path}" + ) print(f" 3. Package: skill-seekers package {self.skill_dir}/") return True @@ -630,10 +630,9 @@ rm {prompt_file} bool: True if enhancement succeeded """ import time - from pathlib import Path print("✨ Running Claude Code enhancement (headless mode)...") - print(f" Timeout: {timeout} seconds ({timeout//60} minutes)") + print(f" Timeout: {timeout} seconds ({timeout // 60} minutes)") print() # Record initial state @@ -652,11 +651,11 @@ rm {prompt_file} print() result = subprocess.run( - ['claude', '--dangerously-skip-permissions', prompt_file], + ["claude", "--dangerously-skip-permissions", prompt_file], capture_output=True, text=True, timeout=timeout, - cwd=str(self.skill_dir) # Run from skill directory + cwd=str(self.skill_dir), # Run from skill directory ) elapsed = time.time() - start_time @@ -681,21 +680,21 @@ rm {prompt_file} return True else: - print(f"āš ļø Claude finished but SKILL.md was not updated") + print("āš ļø Claude finished but SKILL.md was not updated") print(f" Initial: mtime={initial_mtime}, size={initial_size}") print(f" Final: mtime={new_mtime}, size={new_size}") - print(f" This might indicate an error during enhancement") + print(" This might indicate an error during enhancement") print() # Show last 20 lines of stdout for debugging if result.stdout: print(" Last output from Claude:") - lines = result.stdout.strip().split('\n')[-20:] + lines = result.stdout.strip().split("\n")[-20:] for line in lines: print(f" | {line}") print() return False else: - print(f"āŒ SKILL.md not found after enhancement") + print("āŒ SKILL.md not found after enhancement") return False else: print(f"āŒ Claude Code returned error (exit code: {result.returncode})") @@ -750,9 +749,9 @@ rm {prompt_file} Returns: bool: True if background task started successfully """ - print(f"\n{'='*60}") + print(f"\n{'=' * 60}") print(f"BACKGROUND ENHANCEMENT: {self.skill_dir.name}") - print(f"{'='*60}\n") + print(f"{'=' * 60}\n") # Write initial status self.write_status("pending", "Starting background enhancement...") @@ -764,9 +763,7 @@ rm {prompt_file} # Read reference files references = read_reference_files( - self.skill_dir, - max_chars=LOCAL_CONTENT_LIMIT, - preview_limit=LOCAL_PREVIEW_LIMIT + self.skill_dir, max_chars=LOCAL_CONTENT_LIMIT, preview_limit=LOCAL_PREVIEW_LIMIT ) if not references: @@ -785,7 +782,7 @@ rm {prompt_file} return # Save prompt to temp file - with tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False, encoding='utf-8') as f: + with tempfile.NamedTemporaryFile(mode="w", suffix=".txt", delete=False, encoding="utf-8") as f: prompt_file = f.name f.write(prompt) @@ -794,12 +791,7 @@ rm {prompt_file} # Run enhancement if headless: # Run headless (subprocess.run - blocking in thread) - result = subprocess.run( - ['claude', prompt_file], - capture_output=True, - text=True, - timeout=timeout - ) + result = subprocess.run(["claude", prompt_file], capture_output=True, text=True, timeout=timeout) # Clean up try: @@ -848,9 +840,9 @@ rm {prompt_file} Returns: bool: True if daemon started successfully """ - print(f"\n{'='*60}") + print(f"\n{'=' * 60}") print(f"DAEMON MODE: {self.skill_dir.name}") - print(f"{'='*60}\n") + print(f"{'=' * 60}\n") # Write initial status self.write_status("pending", "Starting daemon process...") @@ -939,7 +931,7 @@ except Exception as e: # Save daemon script daemon_script_path = self.skill_dir / ".enhancement_daemon.py" - daemon_script_path.write_text(daemon_script, encoding='utf-8') + daemon_script_path.write_text(daemon_script, encoding="utf-8") daemon_script_path.chmod(0o755) # Start daemon process (fully detached) @@ -950,19 +942,16 @@ except Exception as e: if self.force: # Force mode: No output, fully silent subprocess.Popen( - ['nohup', 'python3', str(daemon_script_path)], + ["nohup", "python3", str(daemon_script_path)], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, - start_new_session=True + start_new_session=True, ) else: # Normal mode: Log to file - with open(log_file, 'w') as log: + with open(log_file, "w") as log: subprocess.Popen( - ['nohup', 'python3', str(daemon_script_path)], - stdout=log, - stderr=log, - start_new_session=True + ["nohup", "python3", str(daemon_script_path)], stdout=log, stderr=log, start_new_session=True ) # Give daemon time to start @@ -971,7 +960,7 @@ except Exception as e: # Read status to verify it started status = self.read_status() - if status and status.get('status') in ['pending', 'running']: + if status and status.get("status") in ["pending", "running"]: print("āœ… Daemon process started successfully!") print() print("šŸ“Š Monitoring:") @@ -1032,43 +1021,31 @@ Mode Comparison: Force Mode (Default ON): By default, all modes skip confirmations (auto-yes). Use --no-force to enable confirmation prompts. -""" +""", + ) + + parser.add_argument("skill_directory", help="Path to skill directory (e.g., output/react/)") + + parser.add_argument( + "--interactive-enhancement", + action="store_true", + help="Open terminal window for enhancement (default: headless mode)", ) parser.add_argument( - 'skill_directory', - help='Path to skill directory (e.g., output/react/)' + "--background", action="store_true", help="Run in background and return immediately (non-blocking)" + ) + + parser.add_argument("--daemon", action="store_true", help="Run as persistent daemon process (fully detached)") + + parser.add_argument( + "--no-force", + action="store_true", + help="Disable force mode: enable confirmation prompts (default: force mode ON)", ) parser.add_argument( - '--interactive-enhancement', - action='store_true', - help='Open terminal window for enhancement (default: headless mode)' - ) - - parser.add_argument( - '--background', - action='store_true', - help='Run in background and return immediately (non-blocking)' - ) - - parser.add_argument( - '--daemon', - action='store_true', - help='Run as persistent daemon process (fully detached)' - ) - - parser.add_argument( - '--no-force', - action='store_true', - help='Disable force mode: enable confirmation prompts (default: force mode ON)' - ) - - parser.add_argument( - '--timeout', - type=int, - default=600, - help='Timeout in seconds for headless mode (default: 600 = 10 minutes)' + "--timeout", type=int, default=600, help="Timeout in seconds for headless mode (default: 600 = 10 minutes)" ) args = parser.parse_args() @@ -1084,12 +1061,7 @@ Force Mode (Default ON): # Force mode is ON by default, use --no-force to disable enhancer = LocalSkillEnhancer(args.skill_directory, force=not args.no_force) headless = not args.interactive_enhancement # Invert: default is headless - success = enhancer.run( - headless=headless, - timeout=args.timeout, - background=args.background, - daemon=args.daemon - ) + success = enhancer.run(headless=headless, timeout=args.timeout, background=args.background, daemon=args.daemon) sys.exit(0 if success else 1) diff --git a/src/skill_seekers/cli/enhance_status.py b/src/skill_seekers/cli/enhance_status.py index 4a76e58..8590f64 100644 --- a/src/skill_seekers/cli/enhance_status.py +++ b/src/skill_seekers/cli/enhance_status.py @@ -10,9 +10,8 @@ Usage: skill-seekers enhance-status output/react/ --json """ -import os -import sys import json +import sys import time from pathlib import Path @@ -32,7 +31,7 @@ def read_status(skill_dir): return None try: - return json.loads(status_file.read_text(encoding='utf-8')) + return json.loads(status_file.read_text(encoding="utf-8")) except Exception as e: return {"error": f"Failed to read status: {e}"} @@ -53,26 +52,21 @@ def format_status(status): return f"āŒ {status['error']}" # Status emoji mapping - status_emojis = { - "pending": "ā³", - "running": "šŸ”„", - "completed": "āœ…", - "failed": "āŒ" - } + status_emojis = {"pending": "ā³", "running": "šŸ”„", "completed": "āœ…", "failed": "āŒ"} - emoji = status_emojis.get(status.get('status', ''), 'ā“') - status_text = status.get('status', 'unknown').upper() - message = status.get('message', '') - progress = status.get('progress', 0.0) - timestamp = status.get('timestamp', 'unknown') - error = status.get('error') - pid = status.get('pid') + emoji = status_emojis.get(status.get("status", ""), "ā“") + status_text = status.get("status", "unknown").upper() + message = status.get("message", "") + progress = status.get("progress", 0.0) + timestamp = status.get("timestamp", "unknown") + error = status.get("error") + pid = status.get("pid") # Build output lines = [] - lines.append(f"\n{'='*60}") + lines.append(f"\n{'=' * 60}") lines.append(f"ENHANCEMENT STATUS: {status_text}") - lines.append(f"{'='*60}\n") + lines.append(f"{'=' * 60}\n") lines.append(f"{emoji} Status: {status_text}") @@ -81,7 +75,7 @@ def format_status(status): if progress > 0: progress_pct = int(progress * 100) - progress_bar = 'ā–ˆ' * (progress_pct // 5) + 'ā–‘' * (20 - progress_pct // 5) + progress_bar = "ā–ˆ" * (progress_pct // 5) + "ā–‘" * (20 - progress_pct // 5) lines.append(f" Progress: [{progress_bar}] {progress_pct}%") if pid: @@ -94,7 +88,7 @@ def format_status(status): lines.append("") - return '\n'.join(lines) + return "\n".join(lines) def watch_status(skill_dir, interval=2): @@ -106,7 +100,7 @@ def watch_status(skill_dir, interval=2): """ print(f"šŸ‘€ Watching enhancement status for: {skill_dir}") print(f" Update interval: {interval} seconds") - print(f" Press Ctrl+C to stop\n") + print(" Press Ctrl+C to stop\n") try: last_status = None @@ -123,7 +117,7 @@ def watch_status(skill_dir, interval=2): last_status = status # Exit if completed or failed - if status and status.get('status') in ['completed', 'failed']: + if status and status.get("status") in ["completed", "failed"]: break time.sleep(interval) @@ -149,32 +143,18 @@ Examples: # Get JSON output (for scripts) skill-seekers enhance-status output/react/ --json -""" +""", ) - parser.add_argument( - 'skill_directory', - help='Path to skill directory (e.g., output/react/)' - ) + parser.add_argument("skill_directory", help="Path to skill directory (e.g., output/react/)") parser.add_argument( - '--watch', '-w', - action='store_true', - help='Watch status in real-time (updates every 2 seconds)' + "--watch", "-w", action="store_true", help="Watch status in real-time (updates every 2 seconds)" ) - parser.add_argument( - '--json', - action='store_true', - help='Output raw JSON (for scripting)' - ) + parser.add_argument("--json", action="store_true", help="Output raw JSON (for scripting)") - parser.add_argument( - '--interval', - type=int, - default=2, - help='Watch update interval in seconds (default: 2)' - ) + parser.add_argument("--interval", type=int, default=2, help="Watch update interval in seconds (default: 2)") args = parser.parse_args() @@ -197,9 +177,9 @@ Examples: # Exit code based on status if not status: sys.exit(2) # No status found - elif status.get('status') == 'completed': + elif status.get("status") == "completed": sys.exit(0) # Success - elif status.get('status') == 'failed': + elif status.get("status") == "failed": sys.exit(1) # Failed else: sys.exit(0) # In progress diff --git a/src/skill_seekers/cli/estimate_pages.py b/src/skill_seekers/cli/estimate_pages.py index 0527805..da68c3e 100755 --- a/src/skill_seekers/cli/estimate_pages.py +++ b/src/skill_seekers/cli/estimate_pages.py @@ -4,23 +4,20 @@ Page Count Estimator for Skill Seeker Quickly estimates how many pages a config will scrape without downloading content """ -import sys +import json import os +import sys +import time +from pathlib import Path +from urllib.parse import urljoin, urlparse + import requests from bs4 import BeautifulSoup -from urllib.parse import urljoin, urlparse -import time -import json -from pathlib import Path # Add parent directory to path for imports when run as script sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from skill_seekers.cli.constants import ( - DEFAULT_RATE_LIMIT, - DEFAULT_MAX_DISCOVERY, - DISCOVERY_THRESHOLD -) +from skill_seekers.cli.constants import DEFAULT_MAX_DISCOVERY, DEFAULT_RATE_LIMIT, DISCOVERY_THRESHOLD def estimate_pages(config, max_discovery=DEFAULT_MAX_DISCOVERY, timeout=30): @@ -35,20 +32,20 @@ def estimate_pages(config, max_discovery=DEFAULT_MAX_DISCOVERY, timeout=30): Returns: dict with estimation results """ - base_url = config['base_url'] - start_urls = config.get('start_urls', [base_url]) - url_patterns = config.get('url_patterns', {'include': [], 'exclude': []}) - rate_limit = config.get('rate_limit', DEFAULT_RATE_LIMIT) + base_url = config["base_url"] + start_urls = config.get("start_urls", [base_url]) + url_patterns = config.get("url_patterns", {"include": [], "exclude": []}) + rate_limit = config.get("rate_limit", DEFAULT_RATE_LIMIT) visited = set() pending = list(start_urls) discovered = 0 - include_patterns = url_patterns.get('include', []) - exclude_patterns = url_patterns.get('exclude', []) + include_patterns = url_patterns.get("include", []) + exclude_patterns = url_patterns.get("exclude", []) # Handle unlimited mode - unlimited = (max_discovery == -1 or max_discovery is None) + unlimited = max_discovery == -1 or max_discovery is None print(f"šŸ” Estimating pages for: {config['name']}") print(f"šŸ“ Base URL: {base_url}") @@ -56,8 +53,8 @@ def estimate_pages(config, max_discovery=DEFAULT_MAX_DISCOVERY, timeout=30): print(f"ā±ļø Rate limit: {rate_limit}s") if unlimited: - print(f"šŸ”¢ Max discovery: UNLIMITED (will discover all pages)") - print(f"āš ļø WARNING: This may take a long time!") + print("šŸ”¢ Max discovery: UNLIMITED (will discover all pages)") + print("āš ļø WARNING: This may take a long time!") else: print(f"šŸ”¢ Max discovery: {max_discovery}") @@ -80,26 +77,26 @@ def estimate_pages(config, max_discovery=DEFAULT_MAX_DISCOVERY, timeout=30): if discovered % 10 == 0: elapsed = time.time() - start_time rate = discovered / elapsed if elapsed > 0 else 0 - print(f"ā³ Discovered: {discovered} pages ({rate:.1f} pages/sec)", end='\r') + print(f"ā³ Discovered: {discovered} pages ({rate:.1f} pages/sec)", end="\r") try: # HEAD request first to check if page exists (faster) head_response = requests.head(url, timeout=timeout, allow_redirects=True) # Skip non-HTML content - content_type = head_response.headers.get('Content-Type', '') - if 'text/html' not in content_type: + content_type = head_response.headers.get("Content-Type", "") + if "text/html" not in content_type: continue # Now GET the page to find links response = requests.get(url, timeout=timeout) response.raise_for_status() - soup = BeautifulSoup(response.content, 'html.parser') + soup = BeautifulSoup(response.content, "html.parser") # Find all links - for link in soup.find_all('a', href=True): - href = link['href'] + for link in soup.find_all("a", href=True): + href = link["href"] full_url = urljoin(url, href) # Normalize URL @@ -117,10 +114,10 @@ def estimate_pages(config, max_discovery=DEFAULT_MAX_DISCOVERY, timeout=30): # Rate limiting time.sleep(rate_limit) - except requests.RequestException as e: + except requests.RequestException: # Silently skip errors during estimation pass - except Exception as e: + except Exception: # Silently skip other errors pass @@ -128,13 +125,13 @@ def estimate_pages(config, max_discovery=DEFAULT_MAX_DISCOVERY, timeout=30): # Results results = { - 'discovered': discovered, - 'pending': len(pending), - 'estimated_total': discovered + len(pending), - 'elapsed_seconds': round(elapsed, 2), - 'discovery_rate': round(discovered / elapsed if elapsed > 0 else 0, 2), - 'hit_limit': (not unlimited) and (discovered >= max_discovery), - 'unlimited': unlimited + "discovered": discovered, + "pending": len(pending), + "estimated_total": discovered + len(pending), + "elapsed_seconds": round(elapsed, 2), + "discovery_rate": round(discovered / elapsed if elapsed > 0 else 0, 2), + "hit_limit": (not unlimited) and (discovered >= max_discovery), + "unlimited": unlimited, } return results @@ -143,7 +140,7 @@ def estimate_pages(config, max_discovery=DEFAULT_MAX_DISCOVERY, timeout=30): def is_valid_url(url, base_url, include_patterns, exclude_patterns): """Check if URL should be crawled""" # Must be same domain - if not url.startswith(base_url.rstrip('/')): + if not url.startswith(base_url.rstrip("/")): return False # Check exclude patterns first @@ -180,11 +177,11 @@ def print_results(results, config): print(f"ā±ļø Time Elapsed: {results['elapsed_seconds']}s") print(f"⚔ Discovery Rate: {results['discovery_rate']} pages/sec") - if results.get('unlimited', False): + if results.get("unlimited", False): print() print("āœ… UNLIMITED MODE - Discovered all reachable pages") print(f" Total pages: {results['estimated_total']}") - elif results['hit_limit']: + elif results["hit_limit"]: print() print("āš ļø Hit discovery limit - actual total may be higher") print(" Increase max_discovery parameter for more accurate estimate") @@ -195,8 +192,8 @@ def print_results(results, config): print("=" * 70) print() - estimated = results['estimated_total'] - current_max = config.get('max_pages', 100) + estimated = results["estimated_total"] + current_max = config.get("max_pages", 100) if estimated <= current_max: print(f"āœ… Current max_pages ({current_max}) is sufficient") @@ -207,7 +204,7 @@ def print_results(results, config): print(f" (Estimated {estimated} + 50 buffer)") # Estimate time for full scrape - rate_limit = config.get('rate_limit', DEFAULT_RATE_LIMIT) + rate_limit = config.get("rate_limit", DEFAULT_RATE_LIMIT) estimated_time = (estimated * rate_limit) / 60 # in minutes print() @@ -220,7 +217,7 @@ def print_results(results, config): def load_config(config_path): """Load configuration from JSON file""" try: - with open(config_path, 'r') as f: + with open(config_path) as f: config = json.load(f) return config except FileNotFoundError: @@ -298,7 +295,7 @@ def list_all_configs(): # Try to load the config to get name and description try: - with open(config_file, 'r') as f: + with open(config_file) as f: config_data = json.load(f) name = config_data.get("name", config_file.stem) @@ -308,20 +305,19 @@ def list_all_configs(): if len(description) > 60: description = description[:57] + "..." - by_category[category].append({ - "file": config_file.name, - "path": str(rel_path), - "name": name, - "description": description - }) + by_category[category].append( + {"file": config_file.name, "path": str(rel_path), "name": name, "description": description} + ) except Exception as e: # If we can't parse the config, just use the filename - by_category[category].append({ - "file": config_file.name, - "path": str(rel_path), - "name": config_file.stem, - "description": f"āš ļø Error loading config: {e}" - }) + by_category[category].append( + { + "file": config_file.name, + "path": str(rel_path), + "name": config_file.stem, + "description": f"āš ļø Error loading config: {e}", + } + ) # Print configs by category total = 0 @@ -351,7 +347,7 @@ def main(): import argparse parser = argparse.ArgumentParser( - description='Estimate page count for Skill Seeker configs', + description="Estimate page count for Skill Seeker configs", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=""" Examples: @@ -366,18 +362,25 @@ Examples: # Quick estimate (stop at 100 pages) skill-seekers estimate configs/vue.json --max-discovery 100 - """ + """, ) - parser.add_argument('config', nargs='?', help='Path to config JSON file') - parser.add_argument('--all', action='store_true', - help='List all available configs from api/configs_repo/official/') - parser.add_argument('--max-discovery', '-m', type=int, default=DEFAULT_MAX_DISCOVERY, - help=f'Maximum pages to discover (default: {DEFAULT_MAX_DISCOVERY}, use -1 for unlimited)') - parser.add_argument('--unlimited', '-u', action='store_true', - help='Remove discovery limit - discover all pages (same as --max-discovery -1)') - parser.add_argument('--timeout', '-t', type=int, default=30, - help='HTTP request timeout in seconds (default: 30)') + parser.add_argument("config", nargs="?", help="Path to config JSON file") + parser.add_argument("--all", action="store_true", help="List all available configs from api/configs_repo/official/") + parser.add_argument( + "--max-discovery", + "-m", + type=int, + default=DEFAULT_MAX_DISCOVERY, + help=f"Maximum pages to discover (default: {DEFAULT_MAX_DISCOVERY}, use -1 for unlimited)", + ) + parser.add_argument( + "--unlimited", + "-u", + action="store_true", + help="Remove discovery limit - discover all pages (same as --max-discovery -1)", + ) + parser.add_argument("--timeout", "-t", type=int, default=30, help="HTTP request timeout in seconds (default: 30)") args = parser.parse_args() @@ -401,7 +404,7 @@ Examples: print_results(results, config) # Return exit code based on results - if results['hit_limit']: + if results["hit_limit"]: return 2 # Warning: hit limit return 0 # Success @@ -413,5 +416,5 @@ Examples: return 1 -if __name__ == '__main__': +if __name__ == "__main__": sys.exit(main()) diff --git a/src/skill_seekers/cli/generate_router.py b/src/skill_seekers/cli/generate_router.py index 72eef9d..7f501a4 100644 --- a/src/skill_seekers/cli/generate_router.py +++ b/src/skill_seekers/cli/generate_router.py @@ -12,17 +12,17 @@ Phase 4 enhancements: - GitHub issue links for context """ +import argparse import json import sys -import argparse from pathlib import Path -from typing import Dict, List, Any, Tuple, Optional +from typing import Any, Optional # Import three-stream data classes (Phase 1) try: - from .github_fetcher import ThreeStreamData, DocsStream, InsightsStream - from .merge_sources import categorize_issues_by_topic + from .github_fetcher import DocsStream, InsightsStream, ThreeStreamData from .markdown_cleaner import MarkdownCleaner + from .merge_sources import categorize_issues_by_topic except ImportError: # Fallback if github_fetcher not available ThreeStreamData = None @@ -34,10 +34,9 @@ except ImportError: class RouterGenerator: """Generates router skills that direct to specialized sub-skills with GitHub integration""" - def __init__(self, - config_paths: List[str], - router_name: str = None, - github_streams: Optional['ThreeStreamData'] = None): + def __init__( + self, config_paths: list[str], router_name: str = None, github_streams: Optional["ThreeStreamData"] = None + ): """ Initialize router generator with optional GitHub streams. @@ -60,21 +59,21 @@ class RouterGenerator: if github_streams and github_streams.insights_stream: self.github_metadata = github_streams.insights_stream.metadata self.github_issues = { - 'common_problems': github_streams.insights_stream.common_problems, - 'known_solutions': github_streams.insights_stream.known_solutions, - 'top_labels': github_streams.insights_stream.top_labels + "common_problems": github_streams.insights_stream.common_problems, + "known_solutions": github_streams.insights_stream.known_solutions, + "top_labels": github_streams.insights_stream.top_labels, } if github_streams and github_streams.docs_stream: self.github_docs = { - 'readme': github_streams.docs_stream.readme, - 'contributing': github_streams.docs_stream.contributing + "readme": github_streams.docs_stream.readme, + "contributing": github_streams.docs_stream.contributing, } - def load_config(self, path: Path) -> Dict[str, Any]: + def load_config(self, path: Path) -> dict[str, Any]: """Load a config file""" try: - with open(path, 'r') as f: + with open(path) as f: return json.load(f) except Exception as e: print(f"āŒ Error loading {path}: {e}") @@ -83,17 +82,17 @@ class RouterGenerator: def infer_router_name(self) -> str: """Infer router name from sub-skill names""" # Find common prefix - names = [cfg['name'] for cfg in self.configs] + names = [cfg["name"] for cfg in self.configs] if not names: return "router" # Get common prefix before first dash first_name = names[0] - if '-' in first_name: - return first_name.split('-')[0] + if "-" in first_name: + return first_name.split("-")[0] return first_name - def extract_routing_keywords(self) -> Dict[str, List[str]]: + def extract_routing_keywords(self) -> dict[str, list[str]]: """ Extract keywords for routing to each skill (Phase 4 enhanced). @@ -103,26 +102,26 @@ class RouterGenerator: routing = {} for config in self.configs: - name = config['name'] + name = config["name"] keywords = [] # Extract from categories (base weight: 1x) - if 'categories' in config: - keywords.extend(config['categories'].keys()) + if "categories" in config: + keywords.extend(config["categories"].keys()) # Extract from name (part after dash) - if '-' in name: - skill_topic = name.split('-', 1)[1] + if "-" in name: + skill_topic = name.split("-", 1)[1] keywords.append(skill_topic) # Phase 4: Add GitHub issue labels (weight 2x by including twice) if self.github_issues: # Get top labels related to this skill topic - top_labels = self.github_issues.get('top_labels', []) + top_labels = self.github_issues.get("top_labels", []) skill_keywords = set(keywords) for label_info in top_labels[:10]: # Top 10 labels - label = label_info['label'].lower() + label = label_info["label"].lower() # Check if label relates to any skill keyword if any(keyword.lower() in label or label in keyword.lower() for keyword in skill_keywords): @@ -141,7 +140,7 @@ class RouterGenerator: return routing - def _extract_skill_specific_labels(self, skill_name: str, skill_keywords: set) -> List[str]: + def _extract_skill_specific_labels(self, skill_name: str, skill_keywords: set) -> list[str]: """ Extract labels from GitHub issues that match this specific skill. @@ -159,14 +158,14 @@ class RouterGenerator: if not self.github_issues: return [] - common_problems = self.github_issues.get('common_problems', []) - known_solutions = self.github_issues.get('known_solutions', []) + common_problems = self.github_issues.get("common_problems", []) + known_solutions = self.github_issues.get("known_solutions", []) all_issues = common_problems + known_solutions matching_labels = set() for issue in all_issues: - issue_labels = issue.get('labels', []) + issue_labels = issue.get("labels", []) issue_labels_lower = [label.lower() for label in issue_labels] # Check if this issue relates to the skill @@ -180,13 +179,20 @@ class RouterGenerator: # Add ALL labels from this matching issue for label in issue_labels_lower: # Skip generic labels that don't add routing value - if label not in ['bug', 'enhancement', 'question', 'help wanted', - 'good first issue', 'documentation', 'duplicate']: + if label not in [ + "bug", + "enhancement", + "question", + "help wanted", + "good first issue", + "documentation", + "duplicate", + ]: matching_labels.add(label) return list(matching_labels) - def _generate_frontmatter(self, routing_keywords: Dict[str, List[str]]) -> str: + def _generate_frontmatter(self, routing_keywords: dict[str, list[str]]) -> str: """ Generate YAML frontmatter compliant with agentskills.io spec. @@ -201,16 +207,16 @@ class RouterGenerator: # Build comprehensive description from all sub-skills all_topics = [] for config in self.configs: - desc = config.get('description', '') + desc = config.get("description", "") # Extract key topics from description (simple extraction) - topics = [word.strip() for word in desc.split(',') if word.strip()] + topics = [word.strip() for word in desc.split(",") if word.strip()] all_topics.extend(topics[:2]) # Max 2 topics per skill # Create keyword-rich description unique_topics = list(dict.fromkeys(all_topics))[:7] # Top 7 unique topics if unique_topics: - topics_str = ', '.join(unique_topics) + topics_str = ", ".join(unique_topics) description = f"{self.router_name.title()} framework. Use when working with: {topics_str}" else: description = f"Use when working with {self.router_name.title()} development and programming" @@ -225,21 +231,21 @@ class RouterGenerator: # Try to get language-specific compatibility if GitHub metadata available if self.github_metadata: - language = self.github_metadata.get('language', '') + language = self.github_metadata.get("language", "") compatibility_map = { - 'Python': f'Python 3.10+, requires {self.router_name} package', - 'JavaScript': f'Node.js 18+, requires {self.router_name} package', - 'TypeScript': f'Node.js 18+, TypeScript 5+, requires {self.router_name} package', - 'Go': f'Go 1.20+, requires {self.router_name} package', - 'Rust': f'Rust 1.70+, requires {self.router_name} package', - 'Java': f'Java 17+, requires {self.router_name} package', + "Python": f"Python 3.10+, requires {self.router_name} package", + "JavaScript": f"Node.js 18+, requires {self.router_name} package", + "TypeScript": f"Node.js 18+, TypeScript 5+, requires {self.router_name} package", + "Go": f"Go 1.20+, requires {self.router_name} package", + "Rust": f"Rust 1.70+, requires {self.router_name} package", + "Java": f"Java 17+, requires {self.router_name} package", } if language in compatibility_map: compatibility = compatibility_map[language] # Try to extract license - if isinstance(self.github_metadata.get('license'), dict): - license_info = self.github_metadata['license'].get('name', 'MIT') + if isinstance(self.github_metadata.get("license"), dict): + license_info = self.github_metadata["license"].get("name", "MIT") frontmatter = f"""--- name: {self.router_name} @@ -289,27 +295,27 @@ compatibility: {compatibility} """ # Remove router name prefix if skill_name.startswith(f"{self.router_name}-"): - topic = skill_name[len(self.router_name)+1:] + topic = skill_name[len(self.router_name) + 1 :] else: topic = skill_name # Capitalize and add context - topic = topic.replace('-', ' ').title() + topic = topic.replace("-", " ").title() # Add common suffixes for context topic_map = { - 'oauth': 'OAuth authentication', - 'auth': 'authentication', - 'async': 'async patterns', - 'api': 'API integration', - 'orm': 'ORM queries', - 'hooks': 'hooks', - 'routing': 'routing', - 'testing': 'testing', - '2d': '2D development', - '3d': '3D development', - 'scripting': 'scripting', - 'physics': 'physics', + "oauth": "OAuth authentication", + "auth": "authentication", + "async": "async patterns", + "api": "API integration", + "orm": "ORM queries", + "hooks": "hooks", + "routing": "routing", + "testing": "testing", + "2d": "2D development", + "3d": "3D development", + "scripting": "scripting", + "physics": "physics", } topic_lower = topic.lower() @@ -319,7 +325,7 @@ compatibility: {compatibility} return topic - def _generate_dynamic_examples(self, routing_keywords: Dict[str, List[str]]) -> str: + def _generate_dynamic_examples(self, routing_keywords: dict[str, list[str]]) -> str: """ Generate examples dynamically from actual sub-skill names and keywords. @@ -351,10 +357,7 @@ compatibility: {compatibility} topic = self._extract_topic_from_skill(first_skill) keyword = first_keywords[0] if first_keywords else topic - examples.append( - f'**Q:** "How do I implement {keyword}?"\n' - f'**A:** Activates {first_skill} skill' - ) + examples.append(f'**Q:** "How do I implement {keyword}?"\n**A:** Activates {first_skill} skill') # Example 2: Different skill (second sub-skill if available) if len(skill_names) >= 2: @@ -365,8 +368,7 @@ compatibility: {compatibility} keyword = second_keywords[0] if second_keywords else topic examples.append( - f'**Q:** "Working with {keyword} in {self.router_name.title()}"\n' - f'**A:** Activates {second_skill} skill' + f'**Q:** "Working with {keyword} in {self.router_name.title()}"\n**A:** Activates {second_skill} skill' ) # Example 3: Multi-skill activation (if 2+ skills) @@ -378,13 +380,12 @@ compatibility: {compatibility} topic_2 = self._extract_topic_from_skill(skill_2) examples.append( - f'**Q:** "Combining {topic_1} with {topic_2}"\n' - f'**A:** Activates {skill_1} + {skill_2} skills' + f'**Q:** "Combining {topic_1} with {topic_2}"\n**A:** Activates {skill_1} + {skill_2} skills' ) - return '\n\n'.join(examples) + return "\n\n".join(examples) - def _generate_examples_from_github(self, routing_keywords: Dict[str, List[str]]) -> str: + def _generate_examples_from_github(self, routing_keywords: dict[str, list[str]]) -> str: """ Generate examples from real GitHub issue titles. @@ -402,7 +403,7 @@ compatibility: {compatibility} return self._generate_dynamic_examples(routing_keywords) examples = [] - common_problems = self.github_issues.get('common_problems', []) + common_problems = self.github_issues.get("common_problems", []) if not common_problems: return self._generate_dynamic_examples(routing_keywords) @@ -414,29 +415,26 @@ compatibility: {compatibility} # Find first issue matching this skill's keywords for issue in common_problems: - issue_labels = [label.lower() for label in issue.get('labels', [])] + issue_labels = [label.lower() for label in issue.get("labels", [])] if any(label in skill_keywords_lower for label in issue_labels): matched_issue = issue common_problems.remove(issue) # Don't reuse same issue break if matched_issue: - title = matched_issue.get('title', '') + title = matched_issue.get("title", "") question = self._convert_issue_to_question(title) - examples.append( - f'**Q:** "{question}"\n' - f'**A:** Activates {skill_name} skill' - ) + examples.append(f'**Q:** "{question}"\n**A:** Activates {skill_name} skill') else: # Fallback to keyword-based example for this skill topic = self._extract_topic_from_skill(skill_name) keyword = keywords[0] if keywords else topic examples.append( f'**Q:** "Working with {keyword} in {self.router_name.title()}"\n' - f'**A:** Activates {skill_name} skill' + f"**A:** Activates {skill_name} skill" ) - return '\n\n'.join(examples) if examples else self._generate_dynamic_examples(routing_keywords) + return "\n\n".join(examples) if examples else self._generate_dynamic_examples(routing_keywords) def _convert_issue_to_question(self, issue_title: str) -> str: """ @@ -456,24 +454,24 @@ compatibility: {compatibility} title_lower = issue_title.lower() # Pattern 1: Error/Failure issues - if 'fail' in title_lower or 'error' in title_lower or 'issue' in title_lower: - cleaned = issue_title.replace(' fails', '').replace(' errors', '').replace(' issue', '') + if "fail" in title_lower or "error" in title_lower or "issue" in title_lower: + cleaned = issue_title.replace(" fails", "").replace(" errors", "").replace(" issue", "") return f"How do I fix {cleaned.lower()}?" # Pattern 2: Documentation requests - if 'documentation' in title_lower or 'docs' in title_lower: - cleaned = issue_title.replace(' documentation', '').replace(' docs', '') + if "documentation" in title_lower or "docs" in title_lower: + cleaned = issue_title.replace(" documentation", "").replace(" docs", "") return f"How do I use {cleaned.lower()}?" # Pattern 3: Feature requests - if title_lower.startswith('add ') or title_lower.startswith('added '): - feature = issue_title.replace('Add ', '').replace('Added ', '') + if title_lower.startswith("add ") or title_lower.startswith("added "): + feature = issue_title.replace("Add ", "").replace("Added ", "") return f"How do I implement {feature.lower()}?" # Default: Generic question return f"How do I handle {issue_title.lower()}?" - def _extract_common_patterns(self) -> List[Dict[str, str]]: + def _extract_common_patterns(self) -> list[dict[str, str]]: """ Extract problem-solution patterns from closed GitHub issues. @@ -487,25 +485,21 @@ compatibility: {compatibility} if not self.github_issues: return [] - known_solutions = self.github_issues.get('known_solutions', []) + known_solutions = self.github_issues.get("known_solutions", []) if not known_solutions: return [] patterns = [] # Top 5 closed issues with most engagement (comments indicate usefulness) - top_solutions = sorted(known_solutions, key=lambda x: x.get('comments', 0), reverse=True)[:5] + top_solutions = sorted(known_solutions, key=lambda x: x.get("comments", 0), reverse=True)[:5] for issue in top_solutions: - title = issue.get('title', '') - number = issue.get('number', 0) + title = issue.get("title", "") + number = issue.get("number", 0) problem, solution = self._parse_issue_pattern(title) - patterns.append({ - 'problem': problem, - 'solution': solution, - 'issue_number': number - }) + patterns.append({"problem": problem, "solution": solution, "issue_number": number}) return patterns @@ -530,24 +524,24 @@ compatibility: {compatibility} title_lower = issue_title.lower() # Pattern 1: "Fixed X" → "X not working" / "See fix" - if title_lower.startswith('fixed ') or title_lower.startswith('fix '): - problem_text = issue_title.replace('Fixed ', '').replace('Fix ', '') + if title_lower.startswith("fixed ") or title_lower.startswith("fix "): + problem_text = issue_title.replace("Fixed ", "").replace("Fix ", "") return (f"{problem_text} not working", "See fix implementation details") # Pattern 2: "Resolved X" → "X issue" / "See resolution" - if title_lower.startswith('resolved ') or title_lower.startswith('resolve '): - problem_text = issue_title.replace('Resolved ', '').replace('Resolve ', '') + if title_lower.startswith("resolved ") or title_lower.startswith("resolve "): + problem_text = issue_title.replace("Resolved ", "").replace("Resolve ", "") return (f"{problem_text} issue", "See resolution approach") # Pattern 3: "Added X" → "Missing X" / "Use X" - if title_lower.startswith('added ') or title_lower.startswith('add '): - feature_text = issue_title.replace('Added ', '').replace('Add ', '') + if title_lower.startswith("added ") or title_lower.startswith("add "): + feature_text = issue_title.replace("Added ", "").replace("Add ", "") return (f"Missing {feature_text}", f"Use {feature_text} feature") # Default: Use title as-is return (issue_title, "See issue for solution details") - def _detect_framework(self) -> Optional[str]: + def _detect_framework(self) -> str | None: """ Detect framework from router name and GitHub metadata. @@ -561,14 +555,14 @@ compatibility: {compatibility} router_lower = self.router_name.lower() framework_keywords = { - 'fastapi': 'fastapi', - 'django': 'django', - 'flask': 'flask', - 'react': 'react', - 'vue': 'vue', - 'express': 'express', - 'fastmcp': 'fastmcp', - 'mcp': 'fastmcp', + "fastapi": "fastapi", + "django": "django", + "flask": "flask", + "react": "react", + "vue": "vue", + "express": "express", + "fastmcp": "fastmcp", + "mcp": "fastmcp", } # Check router name first @@ -578,7 +572,7 @@ compatibility: {compatibility} # Check GitHub description if available if self.github_metadata: - description = self.github_metadata.get('description', '').lower() + description = self.github_metadata.get("description", "").lower() for keyword, framework in framework_keywords.items(): if keyword in description: return framework @@ -599,7 +593,7 @@ compatibility: {compatibility} Formatted Quick Start section with install + hello world code """ templates = { - 'fastapi': """## Quick Start + "fastapi": """## Quick Start ```bash pip install fastapi uvicorn @@ -617,7 +611,7 @@ def read_root(): # Run: uvicorn main:app --reload ``` """, - 'fastmcp': """## Quick Start + "fastmcp": """## Quick Start ```bash pip install fastmcp @@ -633,7 +627,7 @@ def greet(name: str) -> str: return f"Hello, {name}!" ``` """, - 'django': """## Quick Start + "django": """## Quick Start ```bash pip install django @@ -644,7 +638,7 @@ python manage.py runserver Visit http://127.0.0.1:8000/ to see your Django app. """, - 'react': """## Quick Start + "react": """## Quick Start ```bash npx create-react-app my-app @@ -677,16 +671,16 @@ export default App; all_topics = [] for config in self.configs: - desc = config.get('description', '') + desc = config.get("description", "") # Extract key topics from description (simple comma-separated extraction) - topics = [topic.strip() for topic in desc.split(',') if topic.strip()] + topics = [topic.strip() for topic in desc.split(",") if topic.strip()] all_topics.extend(topics[:2]) # Max 2 topics per skill # Deduplicate and take top 5-7 topics unique_topics = list(dict.fromkeys(all_topics))[:7] if not unique_topics: - return f'Use when working with {self.router_name} development and programming' + return f"Use when working with {self.router_name} development and programming" # Format as user-friendly bulleted list description = f"""Use this skill when working with: @@ -695,8 +689,8 @@ export default App; for topic in unique_topics: # Clean up topic text (remove "when working with" prefixes if present) - topic = topic.replace('when working with', '').strip() - topic = topic.replace('Use when', '').strip() + topic = topic.replace("when working with", "").strip() + topic = topic.replace("Use when", "").strip() if topic: description += f"- {topic}\n" @@ -721,7 +715,10 @@ export default App; # NEW: Generate comprehensive description from all sub-skills when_to_use = self._generate_comprehensive_description() - skill_md = frontmatter + "\n\n" + f"""# {self.router_name.replace('-', ' ').title()} Documentation + skill_md = ( + frontmatter + + "\n\n" + + f"""# {self.router_name.replace("-", " ").title()} Documentation ## When to Use This Skill @@ -730,26 +727,27 @@ export default App; This is a router skill that directs your questions to specialized sub-skills for efficient, focused assistance. """ + ) # Phase 4: Add GitHub repository metadata if self.github_metadata: # NEW: Use html_url from GitHub metadata instead of base_url from config - repo_url = self.github_metadata.get('html_url', '') - stars = self.github_metadata.get('stars', 0) - language = self.github_metadata.get('language', 'Unknown') - description = self.github_metadata.get('description', '') + repo_url = self.github_metadata.get("html_url", "") + stars = self.github_metadata.get("stars", 0) + language = self.github_metadata.get("language", "Unknown") + description = self.github_metadata.get("description", "") skill_md += f"""## Repository Info **Repository:** {repo_url} **Stars:** ⭐ {stars:,} | **Language:** {language} -{f'**Description:** {description}' if description else ''} +{f"**Description:** {description}" if description else ""} """ # Phase 4: Add Quick Start from README - if self.github_docs and self.github_docs.get('readme'): - readme = self.github_docs['readme'] + if self.github_docs and self.github_docs.get("readme"): + readme = self.github_docs["readme"] # NEW: Clean HTML and extract meaningful content quick_start = self._extract_clean_readme_section(readme) @@ -768,14 +766,20 @@ This is a router skill that directs your questions to specialized sub-skills for if framework: hello_world = self._get_framework_hello_world(framework) if hello_world: - skill_md += hello_world + "\n*Note: Generic template. See references/getting_started.md for project-specific setup.*\n\n" + skill_md += ( + hello_world + + "\n*Note: Generic template. See references/getting_started.md for project-specific setup.*\n\n" + ) else: # No README available - try framework fallback framework = self._detect_framework() if framework: hello_world = self._get_framework_hello_world(framework) if hello_world: - skill_md += hello_world + "\n*Note: Generic template. Check repository for specific installation instructions.*\n\n" + skill_md += ( + hello_world + + "\n*Note: Generic template. Check repository for specific installation instructions.*\n\n" + ) skill_md += """## How It Works @@ -785,11 +789,11 @@ This skill analyzes your question and activates the appropriate specialized skil # List sub-skills for config in self.configs: - name = config['name'] - desc = config.get('description', '') + name = config["name"] + desc = config.get("description", "") # Remove router name prefix from description if present if desc.startswith(f"{self.router_name.title()} -"): - desc = desc.split(' - ', 1)[1] + desc = desc.split(" - ", 1)[1] skill_md += f"### {name}\n{desc}\n\n" @@ -808,7 +812,7 @@ The router analyzes your question for topic keywords and activates relevant skil skill_md += f"- {keyword_str} → **{skill_name}**\n" # Quick reference - skill_md += f""" + skill_md += """ ## Quick Reference @@ -839,7 +843,7 @@ For quick answers, this router provides basic overview information. For detailed # Phase 4: Add Common Issues from GitHub (Summary with Reference) if self.github_issues: - common_problems = self.github_issues.get('common_problems', [])[:5] # Top 5 + common_problems = self.github_issues.get("common_problems", [])[:5] # Top 5 if common_problems: skill_md += """ @@ -850,9 +854,9 @@ Top 5 GitHub issues from the community: """ for i, issue in enumerate(common_problems, 1): - title = issue.get('title', '') - number = issue.get('number', 0) - comments = issue.get('comments', 0) + title = issue.get("title", "") + number = issue.get("number", 0) + comments = issue.get("comments", 0) skill_md += f"{i}. **{title}** (Issue #{number}, {comments} comments)\n" @@ -871,9 +875,9 @@ Problem-solution patterns from resolved GitHub issues: """ for i, pattern in enumerate(patterns, 1): - problem = pattern['problem'] - solution = pattern['solution'] - issue_num = pattern['issue_number'] + problem = pattern["problem"] + solution = pattern["solution"] + issue_num = pattern["issue_number"] skill_md += f"**Pattern {i}**: {problem}\n" skill_md += f"→ **Solution**: {solution} ([Issue #{issue_num}](references/github_issues.md))\n\n" @@ -888,10 +892,10 @@ Detailed documentation available in: """ if self.github_issues: skill_md += "- `references/github_issues.md` - Community problems and solutions\n" - if self.github_docs and self.github_docs.get('readme'): + if self.github_docs and self.github_docs.get("readme"): skill_md += "- `references/getting_started.md` - Detailed setup guide\n" - skill_md += f""" + skill_md += """ ## Need Help? @@ -904,7 +908,7 @@ Simply ask your question and mention the topic. The router will find the right s return skill_md - def generate_subskill_issues_section(self, skill_name: str, topics: List[str]) -> str: + def generate_subskill_issues_section(self, skill_name: str, topics: list[str]) -> str: """ Generate "Common Issues" section for a sub-skill (Phase 4). @@ -918,8 +922,8 @@ Simply ask your question and mention the topic. The router will find the right s if not self.github_issues or not categorize_issues_by_topic: return "" - common_problems = self.github_issues.get('common_problems', []) - known_solutions = self.github_issues.get('known_solutions', []) + common_problems = self.github_issues.get("common_problems", []) + known_solutions = self.github_issues.get("known_solutions", []) # Categorize issues by topic categorized = categorize_issues_by_topic(common_problems, known_solutions, topics) @@ -944,11 +948,11 @@ GitHub issues related to this topic: issues_md += f"\n### {topic.title()}\n\n" for issue in issues[:3]: # Top 3 per topic - title = issue.get('title', '') - number = issue.get('number', 0) - state = issue.get('state', 'unknown') - comments = issue.get('comments', 0) - labels = issue.get('labels', []) + title = issue.get("title", "") + number = issue.get("number", 0) + state = issue.get("state", "unknown") + comments = issue.get("comments", 0) + labels = issue.get("labels", []) # Format issue state_icon = "šŸ”“" if state == "open" else "āœ…" @@ -964,21 +968,24 @@ GitHub issues related to this topic: return issues_md - def create_router_config(self) -> Dict[str, Any]: + def create_router_config(self) -> dict[str, Any]: """Create router configuration""" routing_keywords = self.extract_routing_keywords() router_config = { "name": self.router_name, - "description": self.base_config.get('description', f'Use when working with {self.router_name} documentation (router for multiple sub-skills)'), - "base_url": self.base_config['base_url'], - "selectors": self.base_config.get('selectors', {}), - "url_patterns": self.base_config.get('url_patterns', {}), - "rate_limit": self.base_config.get('rate_limit', 0.5), + "description": self.base_config.get( + "description", + f"Use when working with {self.router_name} documentation (router for multiple sub-skills)", + ), + "base_url": self.base_config["base_url"], + "selectors": self.base_config.get("selectors", {}), + "url_patterns": self.base_config.get("url_patterns", {}), + "rate_limit": self.base_config.get("rate_limit", 0.5), "max_pages": 500, # Router only scrapes overview pages "_router": True, - "_sub_skills": [cfg['name'] for cfg in self.configs], - "_routing_keywords": routing_keywords + "_sub_skills": [cfg["name"] for cfg in self.configs], + "_routing_keywords": routing_keywords, } return router_config @@ -993,34 +1000,38 @@ GitHub issues related to this topic: md = "# Common GitHub Issues\n\n" md += "Top issues reported by the community:\n\n" - common_problems = self.github_issues.get('common_problems', [])[:10] if self.github_issues else [] - known_solutions = self.github_issues.get('known_solutions', [])[:10] if self.github_issues else [] + common_problems = self.github_issues.get("common_problems", [])[:10] if self.github_issues else [] + known_solutions = self.github_issues.get("known_solutions", [])[:10] if self.github_issues else [] if common_problems: md += "## Open Issues (Common Problems)\n\n" for i, issue in enumerate(common_problems, 1): - title = issue.get('title', '') - number = issue.get('number', 0) - comments = issue.get('comments', 0) - labels = issue.get('labels', []) + title = issue.get("title", "") + number = issue.get("number", 0) + comments = issue.get("comments", 0) + labels = issue.get("labels", []) if isinstance(labels, list): - labels_str = ', '.join(str(label) for label in labels) + labels_str = ", ".join(str(label) for label in labels) else: - labels_str = str(labels) if labels else '' + labels_str = str(labels) if labels else "" md += f"### {i}. {title}\n\n" md += f"**Issue**: #{number}\n" md += f"**Comments**: {comments}\n" if labels_str: md += f"**Labels**: {labels_str}\n" - md += f"**Link**: https://github.com/{self.github_metadata.get('html_url', '').replace('https://github.com/', '')}/issues/{number}\n\n" if self.github_metadata else "\n\n" + md += ( + f"**Link**: https://github.com/{self.github_metadata.get('html_url', '').replace('https://github.com/', '')}/issues/{number}\n\n" + if self.github_metadata + else "\n\n" + ) if known_solutions: md += "\n## Closed Issues (Known Solutions)\n\n" for i, issue in enumerate(known_solutions, 1): - title = issue.get('title', '') - number = issue.get('number', 0) - comments = issue.get('comments', 0) + title = issue.get("title", "") + number = issue.get("number", 0) + comments = issue.get("comments", 0) md += f"### {i}. {title}\n\n" md += f"**Issue**: #{number} (Closed)\n" @@ -1042,8 +1053,8 @@ GitHub issues related to this topic: md = "# Getting Started\n\n" md += "*Extracted from project README*\n\n" - if self.github_docs and self.github_docs.get('readme'): - readme = self.github_docs['readme'] + if self.github_docs and self.github_docs.get("readme"): + readme = self.github_docs["readme"] # Clean and extract full quick start section (up to 2000 chars) cleaner = MarkdownCleaner() @@ -1069,16 +1080,16 @@ GitHub issues related to this topic: # 1. GitHub Issues Reference if self.github_issues: issues_md = self._generate_github_issues_reference() - with open(references_dir / 'github_issues.md', 'w') as f: + with open(references_dir / "github_issues.md", "w") as f: f.write(issues_md) # 2. Getting Started Reference - if self.github_docs and self.github_docs.get('readme'): + if self.github_docs and self.github_docs.get("readme"): getting_started_md = self._generate_getting_started_reference() - with open(references_dir / 'getting_started.md', 'w') as f: + with open(references_dir / "getting_started.md", "w") as f: f.write(getting_started_md) - def generate(self, output_dir: Path = None) -> Tuple[Path, Path]: + def generate(self, output_dir: Path = None) -> tuple[Path, Path]: """Generate router skill and config with progressive disclosure""" if output_dir is None: output_dir = self.config_paths[0].parent @@ -1090,11 +1101,11 @@ GitHub issues related to this topic: skill_path = output_dir.parent / f"output/{self.router_name}/SKILL.md" skill_path.parent.mkdir(parents=True, exist_ok=True) - with open(skill_path, 'w') as f: + with open(skill_path, "w") as f: f.write(skill_md) # NEW: Create references/ directory and generate reference files - references_dir = skill_path.parent / 'references' + references_dir = skill_path.parent / "references" references_dir.mkdir(parents=True, exist_ok=True) self._generate_reference_files(references_dir) @@ -1102,7 +1113,7 @@ GitHub issues related to this topic: router_config = self.create_router_config() config_path = output_dir / f"{self.router_name}.json" - with open(config_path, 'w') as f: + with open(config_path, "w") as f: json.dump(router_config, f, indent=2) return config_path, skill_path @@ -1125,24 +1136,14 @@ Examples: # Custom output directory python3 generate_router.py configs/godot-*.json --output-dir configs/routers/ - """ + """, ) - parser.add_argument( - 'configs', - nargs='+', - help='Sub-skill config files' - ) + parser.add_argument("configs", nargs="+", help="Sub-skill config files") - parser.add_argument( - '--name', - help='Router skill name (default: inferred from sub-skills)' - ) + parser.add_argument("--name", help="Router skill name (default: inferred from sub-skills)") - parser.add_argument( - '--output-dir', - help='Output directory (default: same as input configs)' - ) + parser.add_argument("--output-dir", help="Output directory (default: same as input configs)") args = parser.parse_args() @@ -1150,16 +1151,16 @@ Examples: config_files = [] for path_str in args.configs: path = Path(path_str) - if path.exists() and not path.stem.endswith('-router'): + if path.exists() and not path.stem.endswith("-router"): config_files.append(path_str) if not config_files: print("āŒ Error: No valid config files provided") sys.exit(1) - print(f"\n{'='*60}") + print(f"\n{'=' * 60}") print("ROUTER SKILL GENERATOR") - print(f"{'='*60}") + print(f"{'=' * 60}") print(f"Sub-skills: {len(config_files)}") for cfg in config_files: print(f" - {Path(cfg).stem}") @@ -1172,11 +1173,11 @@ Examples: print(f"āœ… Router config created: {config_path}") print(f"āœ… Router SKILL.md created: {skill_path}") print("") - print(f"{'='*60}") + print(f"{'=' * 60}") print("NEXT STEPS") - print(f"{'='*60}") + print(f"{'=' * 60}") print(f"1. Review router SKILL.md: {skill_path}") - print(f"2. Optionally scrape router (for overview pages):") + print("2. Optionally scrape router (for overview pages):") print(f" skill-seekers scrape --config {config_path}") print("3. Package router skill:") print(f" skill-seekers package output/{generator.router_name}/") diff --git a/src/skill_seekers/cli/github_fetcher.py b/src/skill_seekers/cli/github_fetcher.py index 72146e5..5799a3d 100644 --- a/src/skill_seekers/cli/github_fetcher.py +++ b/src/skill_seekers/cli/github_fetcher.py @@ -12,43 +12,47 @@ This is the foundation of the unified codebase analyzer architecture. import os import subprocess import tempfile +from collections import Counter from dataclasses import dataclass from pathlib import Path -from typing import List, Dict, Optional, Tuple -from collections import Counter + import requests -from .rate_limit_handler import RateLimitHandler, RateLimitError, create_github_headers from .config_manager import get_config_manager +from .rate_limit_handler import RateLimitError, RateLimitHandler, create_github_headers @dataclass class CodeStream: """Code files for C3.x analysis.""" + directory: Path - files: List[Path] + files: list[Path] @dataclass class DocsStream: """Documentation files from repository.""" - readme: Optional[str] - contributing: Optional[str] - docs_files: List[Dict] # [{"path": "docs/oauth.md", "content": "..."}] + + readme: str | None + contributing: str | None + docs_files: list[dict] # [{"path": "docs/oauth.md", "content": "..."}] @dataclass class InsightsStream: """GitHub metadata and issues.""" - metadata: Dict # stars, forks, language, etc. - common_problems: List[Dict] - known_solutions: List[Dict] - top_labels: List[Dict] + + metadata: dict # stars, forks, language, etc. + common_problems: list[dict] + known_solutions: list[dict] + top_labels: list[dict] @dataclass class ThreeStreamData: """Complete output from GitHub fetcher.""" + code_stream: CodeStream docs_stream: DocsStream insights_stream: InsightsStream @@ -73,11 +77,7 @@ class GitHubThreeStreamFetcher: """ def __init__( - self, - repo_url: str, - github_token: Optional[str] = None, - interactive: bool = True, - profile_name: Optional[str] = None + self, repo_url: str, github_token: str | None = None, interactive: bool = True, profile_name: str | None = None ): """ Initialize fetcher. @@ -89,7 +89,7 @@ class GitHubThreeStreamFetcher: profile_name: Name of the GitHub profile being used """ self.repo_url = repo_url - self.github_token = github_token or os.getenv('GITHUB_TOKEN') + self.github_token = github_token or os.getenv("GITHUB_TOKEN") self.owner, self.repo = self.parse_repo_url(repo_url) self.interactive = interactive @@ -99,12 +99,10 @@ class GitHubThreeStreamFetcher: profile_name = config.get_profile_for_token(self.github_token) self.rate_limiter = RateLimitHandler( - token=self.github_token, - interactive=interactive, - profile_name=profile_name + token=self.github_token, interactive=interactive, profile_name=profile_name ) - def parse_repo_url(self, url: str) -> Tuple[str, str]: + def parse_repo_url(self, url: str) -> tuple[str, str]: """ Parse GitHub URL to extract owner and repo. @@ -115,18 +113,18 @@ class GitHubThreeStreamFetcher: Tuple of (owner, repo) """ # Remove .git suffix if present - if url.endswith('.git'): + if url.endswith(".git"): url = url[:-4] # Remove last 4 characters (.git) # Handle git@ URLs (SSH format) - if url.startswith('git@github.com:'): - parts = url.replace('git@github.com:', '').split('/') + if url.startswith("git@github.com:"): + parts = url.replace("git@github.com:", "").split("/") if len(parts) >= 2: return parts[0], parts[1] # Handle HTTPS URLs - if 'github.com/' in url: - parts = url.split('github.com/')[-1].split('/') + if "github.com/" in url: + parts = url.split("github.com/")[-1].split("/") if len(parts) >= 2: return parts[0], parts[1] @@ -150,18 +148,18 @@ class GitHubThreeStreamFetcher: raise RateLimitError("Rate limit check failed during startup") if output_dir is None: - output_dir = Path(tempfile.mkdtemp(prefix='github_fetch_')) + output_dir = Path(tempfile.mkdtemp(prefix="github_fetch_")) print(f"šŸ“¦ Cloning {self.repo_url}...") local_path = self.clone_repo(output_dir) - print(f"šŸ” Fetching GitHub metadata...") + print("šŸ” Fetching GitHub metadata...") metadata = self.fetch_github_metadata() - print(f"šŸ› Fetching issues...") + print("šŸ› Fetching issues...") issues = self.fetch_issues(max_issues=100) - print(f"šŸ“‚ Classifying files...") + print("šŸ“‚ Classifying files...") code_files, doc_files = self.classify_files(local_path) print(f" - Code: {len(code_files)} files") print(f" - Docs: {len(doc_files)} files") @@ -171,25 +169,22 @@ class GitHubThreeStreamFetcher: # Build three streams return ThreeStreamData( - code_stream=CodeStream( - directory=local_path, - files=code_files - ), + code_stream=CodeStream(directory=local_path, files=code_files), docs_stream=DocsStream( - readme=self.read_file(local_path / 'README.md'), - contributing=self.read_file(local_path / 'CONTRIBUTING.md'), + readme=self.read_file(local_path / "README.md"), + contributing=self.read_file(local_path / "CONTRIBUTING.md"), docs_files=[ - {'path': str(f.relative_to(local_path)), 'content': self.read_file(f)} + {"path": str(f.relative_to(local_path)), "content": self.read_file(f)} for f in doc_files - if f.name not in ['README.md', 'CONTRIBUTING.md'] - ] + if f.name not in ["README.md", "CONTRIBUTING.md"] + ], ), insights_stream=InsightsStream( metadata=metadata, - common_problems=issue_insights['common_problems'], - known_solutions=issue_insights['known_solutions'], - top_labels=issue_insights['top_labels'] - ) + common_problems=issue_insights["common_problems"], + known_solutions=issue_insights["known_solutions"], + top_labels=issue_insights["top_labels"], + ), ) def clone_repo(self, output_dir: Path) -> Path: @@ -206,7 +201,7 @@ class GitHubThreeStreamFetcher: repo_dir.mkdir(parents=True, exist_ok=True) # Clone with depth 1 for speed - cmd = ['git', 'clone', '--depth', '1', self.repo_url, str(repo_dir)] + cmd = ["git", "clone", "--depth", "1", self.repo_url, str(repo_dir)] result = subprocess.run(cmd, capture_output=True, text=True) if result.returncode != 0: @@ -214,7 +209,7 @@ class GitHubThreeStreamFetcher: return repo_dir - def fetch_github_metadata(self) -> Dict: + def fetch_github_metadata(self) -> dict: """ Fetch repo metadata via GitHub API. @@ -238,35 +233,35 @@ class GitHubThreeStreamFetcher: data = response.json() return { - 'stars': data.get('stargazers_count', 0), - 'forks': data.get('forks_count', 0), - 'open_issues': data.get('open_issues_count', 0), - 'language': data.get('language', 'Unknown'), - 'description': data.get('description', ''), - 'homepage': data.get('homepage', ''), - 'created_at': data.get('created_at', ''), - 'updated_at': data.get('updated_at', ''), - 'html_url': data.get('html_url', ''), # NEW: Repository URL - 'license': data.get('license', {}) # NEW: License info + "stars": data.get("stargazers_count", 0), + "forks": data.get("forks_count", 0), + "open_issues": data.get("open_issues_count", 0), + "language": data.get("language", "Unknown"), + "description": data.get("description", ""), + "homepage": data.get("homepage", ""), + "created_at": data.get("created_at", ""), + "updated_at": data.get("updated_at", ""), + "html_url": data.get("html_url", ""), # NEW: Repository URL + "license": data.get("license", {}), # NEW: License info } except RateLimitError: raise except Exception as e: print(f"āš ļø Failed to fetch metadata: {e}") return { - 'stars': 0, - 'forks': 0, - 'open_issues': 0, - 'language': 'Unknown', - 'description': '', - 'homepage': '', - 'created_at': '', - 'updated_at': '', - 'html_url': '', # NEW: Repository URL - 'license': {} # NEW: License info + "stars": 0, + "forks": 0, + "open_issues": 0, + "language": "Unknown", + "description": "", + "homepage": "", + "created_at": "", + "updated_at": "", + "html_url": "", # NEW: Repository URL + "license": {}, # NEW: License info } - def fetch_issues(self, max_issues: int = 100) -> List[Dict]: + def fetch_issues(self, max_issues: int = 100) -> list[dict]: """ Fetch GitHub issues (open + closed). @@ -279,14 +274,14 @@ class GitHubThreeStreamFetcher: all_issues = [] # Fetch open issues - all_issues.extend(self._fetch_issues_page(state='open', max_count=max_issues // 2)) + all_issues.extend(self._fetch_issues_page(state="open", max_count=max_issues // 2)) # Fetch closed issues - all_issues.extend(self._fetch_issues_page(state='closed', max_count=max_issues // 2)) + all_issues.extend(self._fetch_issues_page(state="closed", max_count=max_issues // 2)) return all_issues - def _fetch_issues_page(self, state: str, max_count: int) -> List[Dict]: + def _fetch_issues_page(self, state: str, max_count: int) -> list[dict]: """ Fetch one page of issues. @@ -304,10 +299,10 @@ class GitHubThreeStreamFetcher: headers = create_github_headers(self.github_token) params = { - 'state': state, - 'per_page': min(max_count, 100), # GitHub API limit - 'sort': 'comments', - 'direction': 'desc' + "state": state, + "per_page": min(max_count, 100), # GitHub API limit + "sort": "comments", + "direction": "desc", } try: @@ -321,7 +316,7 @@ class GitHubThreeStreamFetcher: issues = response.json() # Filter out pull requests (they appear in issues endpoint) - issues = [issue for issue in issues if 'pull_request' not in issue] + issues = [issue for issue in issues if "pull_request" not in issue] return issues except RateLimitError: @@ -330,7 +325,7 @@ class GitHubThreeStreamFetcher: print(f"āš ļø Failed to fetch {state} issues: {e}") return [] - def classify_files(self, repo_path: Path) -> Tuple[List[Path], List[Path]]: + def classify_files(self, repo_path: Path) -> tuple[list[Path], list[Path]]: """ Split files into code vs documentation. @@ -354,36 +349,61 @@ class GitHubThreeStreamFetcher: # Documentation patterns doc_patterns = [ - '**/README.md', - '**/CONTRIBUTING.md', - '**/CHANGELOG.md', - '**/LICENSE.md', - 'docs/*.md', # Files directly in docs/ - 'docs/**/*.md', # Files in subdirectories of docs/ - 'doc/*.md', # Files directly in doc/ - 'doc/**/*.md', # Files in subdirectories of doc/ - 'documentation/*.md', # Files directly in documentation/ - 'documentation/**/*.md', # Files in subdirectories of documentation/ - '**/*.rst', + "**/README.md", + "**/CONTRIBUTING.md", + "**/CHANGELOG.md", + "**/LICENSE.md", + "docs/*.md", # Files directly in docs/ + "docs/**/*.md", # Files in subdirectories of docs/ + "doc/*.md", # Files directly in doc/ + "doc/**/*.md", # Files in subdirectories of doc/ + "documentation/*.md", # Files directly in documentation/ + "documentation/**/*.md", # Files in subdirectories of documentation/ + "**/*.rst", ] # Code extensions code_extensions = [ - '.py', '.js', '.ts', '.jsx', '.tsx', - '.go', '.rs', '.java', '.kt', - '.c', '.cpp', '.h', '.hpp', - '.rb', '.php', '.swift', '.cs', - '.scala', '.clj', '.cljs' + ".py", + ".js", + ".ts", + ".jsx", + ".tsx", + ".go", + ".rs", + ".java", + ".kt", + ".c", + ".cpp", + ".h", + ".hpp", + ".rb", + ".php", + ".swift", + ".cs", + ".scala", + ".clj", + ".cljs", ] # Directories to exclude exclude_dirs = [ - 'node_modules', '__pycache__', 'venv', '.venv', - '.git', 'build', 'dist', '.tox', '.pytest_cache', - 'htmlcov', '.mypy_cache', '.eggs', '*.egg-info' + "node_modules", + "__pycache__", + "venv", + ".venv", + ".git", + "build", + "dist", + ".tox", + ".pytest_cache", + "htmlcov", + ".mypy_cache", + ".eggs", + "*.egg-info", ] - for file_path in repo_path.rglob('*'): + for file_path in repo_path.rglob("*"): if not file_path.is_file(): continue @@ -392,8 +412,8 @@ class GitHubThreeStreamFetcher: continue # Skip hidden files (but allow docs in docs/ directories) - is_in_docs_dir = any(pattern in str(file_path) for pattern in ['docs/', 'doc/', 'documentation/']) - if any(part.startswith('.') for part in file_path.parts): + is_in_docs_dir = any(pattern in str(file_path) for pattern in ["docs/", "doc/", "documentation/"]) + if any(part.startswith(".") for part in file_path.parts): if not is_in_docs_dir: continue @@ -407,7 +427,7 @@ class GitHubThreeStreamFetcher: return code_files, doc_files - def analyze_issues(self, issues: List[Dict]) -> Dict: + def analyze_issues(self, issues: list[dict]) -> dict: """ Analyze GitHub issues to extract insights. @@ -446,44 +466,41 @@ class GitHubThreeStreamFetcher: for issue in issues: # Handle both string labels and dict labels (GitHub API format) - raw_labels = issue.get('labels', []) + raw_labels = issue.get("labels", []) labels = [] for label in raw_labels: if isinstance(label, dict): - labels.append(label.get('name', '')) + labels.append(label.get("name", "")) else: labels.append(str(label)) all_labels.extend(labels) issue_data = { - 'title': issue.get('title', ''), - 'number': issue.get('number', 0), - 'labels': labels, - 'comments': issue.get('comments', 0), - 'state': issue.get('state', 'unknown') + "title": issue.get("title", ""), + "number": issue.get("number", 0), + "labels": labels, + "comments": issue.get("comments", 0), + "state": issue.get("state", "unknown"), } # Open issues with many comments = common problems - if issue['state'] == 'open' and issue.get('comments', 0) >= 5: + if issue["state"] == "open" and issue.get("comments", 0) >= 5: common_problems.append(issue_data) # Closed issues with comments = known solutions - elif issue['state'] == 'closed' and issue.get('comments', 0) > 0: + elif issue["state"] == "closed" and issue.get("comments", 0) > 0: known_solutions.append(issue_data) # Count label frequency label_counts = Counter(all_labels) return { - 'common_problems': sorted(common_problems, key=lambda x: x['comments'], reverse=True)[:10], - 'known_solutions': sorted(known_solutions, key=lambda x: x['comments'], reverse=True)[:10], - 'top_labels': [ - {'label': label, 'count': count} - for label, count in label_counts.most_common(10) - ] + "common_problems": sorted(common_problems, key=lambda x: x["comments"], reverse=True)[:10], + "known_solutions": sorted(known_solutions, key=lambda x: x["comments"], reverse=True)[:10], + "top_labels": [{"label": label, "count": count} for label, count in label_counts.most_common(10)], } - def read_file(self, file_path: Path) -> Optional[str]: + def read_file(self, file_path: Path) -> str | None: """ Read file content safely. @@ -497,10 +514,10 @@ class GitHubThreeStreamFetcher: return None try: - return file_path.read_text(encoding='utf-8') + return file_path.read_text(encoding="utf-8") except Exception: # Try with different encoding try: - return file_path.read_text(encoding='latin-1') + return file_path.read_text(encoding="latin-1") except Exception: return None diff --git a/src/skill_seekers/cli/github_scraper.py b/src/skill_seekers/cli/github_scraper.py index c6900a4..e39c246 100644 --- a/src/skill_seekers/cli/github_scraper.py +++ b/src/skill_seekers/cli/github_scraper.py @@ -14,15 +14,14 @@ Usage: skill-seekers github --repo owner/repo --token $GITHUB_TOKEN """ -import os -import sys -import json -import re import argparse +import json import logging +import os +import re +import sys from pathlib import Path -from typing import Dict, List, Optional, Any -from datetime import datetime +from typing import Any, Optional try: from github import Github, GithubException, Repository @@ -34,20 +33,19 @@ except ImportError: # Try to import pathspec for .gitignore support try: import pathspec + PATHSPEC_AVAILABLE = True except ImportError: PATHSPEC_AVAILABLE = False # Configure logging FIRST (before using logger) -logging.basicConfig( - level=logging.INFO, - format='%(asctime)s - %(levelname)s - %(message)s' -) +logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s") logger = logging.getLogger(__name__) # Import code analyzer for deep code analysis try: from .code_analyzer import CodeAnalyzer + CODE_ANALYZER_AVAILABLE = True except ImportError: CODE_ANALYZER_AVAILABLE = False @@ -55,13 +53,25 @@ except ImportError: # Directories to exclude from local repository analysis EXCLUDED_DIRS = { - 'venv', 'env', '.venv', '.env', # Virtual environments - 'node_modules', '__pycache__', '.pytest_cache', # Dependencies and caches - '.git', '.svn', '.hg', # Version control - 'build', 'dist', '*.egg-info', # Build artifacts - 'htmlcov', '.coverage', # Coverage reports - '.tox', '.nox', # Testing environments - '.mypy_cache', '.ruff_cache', # Linter caches + "venv", + "env", + ".venv", + ".env", # Virtual environments + "node_modules", + "__pycache__", + ".pytest_cache", # Dependencies and caches + ".git", + ".svn", + ".hg", # Version control + "build", + "dist", + "*.egg-info", # Build artifacts + "htmlcov", + ".coverage", # Coverage reports + ".tox", + ".nox", # Testing environments + ".mypy_cache", + ".ruff_cache", # Linter caches } @@ -80,10 +90,10 @@ def extract_description_from_readme(readme_content: str, repo_name: str) -> str: Description string, or improved fallback if extraction fails """ if not readme_content: - return f'Use when working with {repo_name.split("/")[-1]}' + return f"Use when working with {repo_name.split('/')[-1]}" try: - lines = readme_content.split('\n') + lines = readme_content.split("\n") # Skip badges, images, title - find first meaningful text paragraph meaningful_paragraph = None @@ -93,7 +103,7 @@ def extract_description_from_readme(readme_content: str, repo_name: str) -> str: stripped = line.strip() # Track code blocks - if stripped.startswith('```'): + if stripped.startswith("```"): in_code_block = not in_code_block continue @@ -102,11 +112,11 @@ def extract_description_from_readme(readme_content: str, repo_name: str) -> str: continue # Skip empty lines, badges, images, HTML - if not stripped or stripped.startswith(('#', '!', '<', '[![', '[![')): + if not stripped or stripped.startswith(("#", "!", "<", "[![", "[![")): continue # Skip lines that are just links or badges - if stripped.startswith('[') and '](' in stripped and len(stripped) < 100: + if stripped.startswith("[") and "](" in stripped and len(stripped) < 100: continue # Found a meaningful paragraph - take up to 200 chars @@ -117,33 +127,33 @@ def extract_description_from_readme(readme_content: str, repo_name: str) -> str: if meaningful_paragraph: # Clean up and extract purpose # Remove markdown formatting - clean = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', meaningful_paragraph) # Links - clean = re.sub(r'[*_`]', '', clean) # Bold, italic, code - clean = re.sub(r'<[^>]+>', '', clean) # HTML tags + clean = re.sub(r"\[([^\]]+)\]\([^\)]+\)", r"\1", meaningful_paragraph) # Links + clean = re.sub(r"[*_`]", "", clean) # Bold, italic, code + clean = re.sub(r"<[^>]+>", "", clean) # HTML tags # Truncate if too long (keep first sentence or ~150 chars) - if '. ' in clean: - first_sentence = clean.split('. ')[0] + '.' + if ". " in clean: + first_sentence = clean.split(". ")[0] + "." if len(first_sentence) < 200: clean = first_sentence if len(clean) > 150: - clean = clean[:147] + '...' + clean = clean[:147] + "..." # Format as "Use when..." description # If it already starts with action words, use as-is - action_words = ['build', 'create', 'develop', 'work', 'use', 'implement', 'manage'] + action_words = ["build", "create", "develop", "work", "use", "implement", "manage"] if any(clean.lower().startswith(word) for word in action_words): - return f'Use when {clean.lower()}' + return f"Use when {clean.lower()}" else: - return f'Use when working with {clean.lower()}' + return f"Use when working with {clean.lower()}" except Exception as e: logger.debug(f"Could not extract description from README: {e}") # Improved fallback - project_name = repo_name.split('/')[-1] - return f'Use when working with {project_name}' + project_name = repo_name.split("/")[-1] + return f"Use when working with {project_name}" class GitHubScraper: @@ -162,16 +172,16 @@ class GitHubScraper: - Releases """ - def __init__(self, config: Dict[str, Any], local_repo_path: Optional[str] = None): + def __init__(self, config: dict[str, Any], local_repo_path: str | None = None): """Initialize GitHub scraper with configuration.""" self.config = config - self.repo_name = config['repo'] - self.name = config.get('name', self.repo_name.split('/')[-1]) + self.repo_name = config["repo"] + self.name = config.get("name", self.repo_name.split("/")[-1]) # Set initial description (will be improved after README extraction if not in config) - self.description = config.get('description', f'Use when working with {self.repo_name.split("/")[-1]}') + self.description = config.get("description", f"Use when working with {self.repo_name.split('/')[-1]}") # Local repository path (optional - enables unlimited analysis) - self.local_repo_path = local_repo_path or config.get('local_repo_path') + self.local_repo_path = local_repo_path or config.get("local_repo_path") if self.local_repo_path: self.local_repo_path = os.path.expanduser(self.local_repo_path) logger.info(f"Local repository mode enabled: {self.local_repo_path}") @@ -180,22 +190,16 @@ class GitHubScraper: self.excluded_dirs = set(EXCLUDED_DIRS) # Start with smart defaults # Option 1: Replace mode - Use only specified exclusions - if 'exclude_dirs' in config: - self.excluded_dirs = set(config['exclude_dirs']) - logger.warning( - f"Using custom directory exclusions ({len(self.excluded_dirs)} dirs) - " - "defaults overridden" - ) + if "exclude_dirs" in config: + self.excluded_dirs = set(config["exclude_dirs"]) + logger.warning(f"Using custom directory exclusions ({len(self.excluded_dirs)} dirs) - defaults overridden") logger.debug(f"Custom exclusions: {sorted(self.excluded_dirs)}") # Option 2: Extend mode - Add to default exclusions - elif 'exclude_dirs_additional' in config: - additional = set(config['exclude_dirs_additional']) + elif "exclude_dirs_additional" in config: + additional = set(config["exclude_dirs_additional"]) self.excluded_dirs = self.excluded_dirs.union(additional) - logger.info( - f"Added {len(additional)} custom directory exclusions " - f"(total: {len(self.excluded_dirs)})" - ) + logger.info(f"Added {len(additional)} custom directory exclusions (total: {len(self.excluded_dirs)})") logger.debug(f"Additional exclusions: {sorted(additional)}") # Load .gitignore for additional exclusions (C2.1) @@ -206,20 +210,20 @@ class GitHubScraper: # GitHub client setup (C1.1) token = self._get_token() self.github = Github(token) if token else Github() - self.repo: Optional[Repository.Repository] = None + self.repo: Repository.Repository | None = None # Options - self.include_issues = config.get('include_issues', True) - self.max_issues = config.get('max_issues', 100) - self.include_changelog = config.get('include_changelog', True) - self.include_releases = config.get('include_releases', True) - self.include_code = config.get('include_code', False) - self.code_analysis_depth = config.get('code_analysis_depth', 'surface') # 'surface', 'deep', 'full' - self.file_patterns = config.get('file_patterns', []) + self.include_issues = config.get("include_issues", True) + self.max_issues = config.get("max_issues", 100) + self.include_changelog = config.get("include_changelog", True) + self.include_releases = config.get("include_releases", True) + self.include_code = config.get("include_code", False) + self.code_analysis_depth = config.get("code_analysis_depth", "surface") # 'surface', 'deep', 'full' + self.file_patterns = config.get("file_patterns", []) # Initialize code analyzer if deep analysis requested self.code_analyzer = None - if self.code_analysis_depth != 'surface' and CODE_ANALYZER_AVAILABLE: + if self.code_analysis_depth != "surface" and CODE_ANALYZER_AVAILABLE: self.code_analyzer = CodeAnalyzer(depth=self.code_analysis_depth) logger.info(f"Code analysis depth: {self.code_analysis_depth}") @@ -229,30 +233,30 @@ class GitHubScraper: # Extracted data storage self.extracted_data = { - 'repo_info': {}, - 'readme': '', - 'file_tree': [], - 'languages': {}, - 'signatures': [], - 'test_examples': [], - 'issues': [], - 'changelog': '', - 'releases': [] + "repo_info": {}, + "readme": "", + "file_tree": [], + "languages": {}, + "signatures": [], + "test_examples": [], + "issues": [], + "changelog": "", + "releases": [], } - def _get_token(self) -> Optional[str]: + def _get_token(self) -> str | None: """ Get GitHub token from env var or config (both options supported). Priority: GITHUB_TOKEN env var > config file > None """ # Try environment variable first (recommended) - token = os.getenv('GITHUB_TOKEN') + token = os.getenv("GITHUB_TOKEN") if token: logger.info("Using GitHub token from GITHUB_TOKEN environment variable") return token # Fall back to config file - token = self.config.get('github_token') + token = self.config.get("github_token") if token: logger.warning("Using GitHub token from config file (less secure)") return token @@ -260,7 +264,7 @@ class GitHubScraper: logger.warning("No GitHub token provided - using unauthenticated access (lower rate limits)") return None - def scrape(self) -> Dict[str, Any]: + def scrape(self) -> dict[str, Any]: """ Main scraping entry point. Executes all C1 tasks in sequence. @@ -313,21 +317,21 @@ class GitHubScraper: self.repo = self.github.get_repo(self.repo_name) # Extract basic repo info - self.extracted_data['repo_info'] = { - 'name': self.repo.name, - 'full_name': self.repo.full_name, - 'description': self.repo.description, - 'url': self.repo.html_url, - 'homepage': self.repo.homepage, - 'stars': self.repo.stargazers_count, - 'forks': self.repo.forks_count, - 'open_issues': self.repo.open_issues_count, - 'default_branch': self.repo.default_branch, - 'created_at': self.repo.created_at.isoformat() if self.repo.created_at else None, - 'updated_at': self.repo.updated_at.isoformat() if self.repo.updated_at else None, - 'language': self.repo.language, - 'license': self.repo.license.name if self.repo.license else None, - 'topics': self.repo.get_topics() + self.extracted_data["repo_info"] = { + "name": self.repo.name, + "full_name": self.repo.full_name, + "description": self.repo.description, + "url": self.repo.html_url, + "homepage": self.repo.homepage, + "stars": self.repo.stargazers_count, + "forks": self.repo.forks_count, + "open_issues": self.repo.open_issues_count, + "default_branch": self.repo.default_branch, + "created_at": self.repo.created_at.isoformat() if self.repo.created_at else None, + "updated_at": self.repo.updated_at.isoformat() if self.repo.updated_at else None, + "language": self.repo.language, + "license": self.repo.license.name if self.repo.license else None, + "topics": self.repo.get_topics(), } logger.info(f"Repository fetched: {self.repo.full_name} ({self.repo.stargazers_count} stars)") @@ -337,7 +341,7 @@ class GitHubScraper: raise ValueError(f"Repository not found: {self.repo_name}") raise - def _get_file_content(self, file_path: str) -> Optional[str]: + def _get_file_content(self, file_path: str) -> str | None: """ Safely get file content, handling symlinks and encoding issues. @@ -353,8 +357,8 @@ class GitHubScraper: return None # Handle symlinks - follow the target to get actual file - if hasattr(content, 'type') and content.type == 'symlink': - target = getattr(content, 'target', None) + if hasattr(content, "type") and content.type == "symlink": + target = getattr(content, "target", None) if target: target = target.strip() logger.debug(f"File {file_path} is a symlink to {target}, following...") @@ -369,14 +373,15 @@ class GitHubScraper: # Handle large files (encoding="none") - download via URL # GitHub API doesn't base64-encode files >1MB - if hasattr(content, 'encoding') and content.encoding in [None, "none"]: - download_url = getattr(content, 'download_url', None) - file_size = getattr(content, 'size', 0) + if hasattr(content, "encoding") and content.encoding in [None, "none"]: + download_url = getattr(content, "download_url", None) + file_size = getattr(content, "size", 0) if download_url: logger.info(f"File {file_path} is large ({file_size:,} bytes), downloading via URL...") try: import requests + response = requests.get(download_url, timeout=30) response.raise_for_status() return response.text @@ -390,7 +395,7 @@ class GitHubScraper: # Handle regular files - decode content try: if isinstance(content.decoded_content, bytes): - return content.decoded_content.decode('utf-8') + return content.decoded_content.decode("utf-8") else: return str(content.decoded_content) except (UnicodeDecodeError, AttributeError, LookupError, AssertionError) as e: @@ -398,7 +403,7 @@ class GitHubScraper: # Try alternative encoding try: if isinstance(content.decoded_content, bytes): - return content.decoded_content.decode('latin-1') + return content.decoded_content.decode("latin-1") except Exception: return None return None @@ -414,21 +419,17 @@ class GitHubScraper: logger.info("Extracting README...") # Try common README locations - readme_files = ['README.md', 'README.rst', 'README.txt', 'README', - 'docs/README.md', '.github/README.md'] + readme_files = ["README.md", "README.rst", "README.txt", "README", "docs/README.md", ".github/README.md"] for readme_path in readme_files: readme_content = self._get_file_content(readme_path) if readme_content: - self.extracted_data['readme'] = readme_content + self.extracted_data["readme"] = readme_content logger.info(f"README found: {readme_path}") # Update description if not explicitly set in config - if 'description' not in self.config: - smart_description = extract_description_from_readme( - self.extracted_data['readme'], - self.repo_name - ) + if "description" not in self.config: + smart_description = extract_description_from_readme(self.extracted_data["readme"], self.repo_name) self.description = smart_description logger.debug(f"Generated description: {self.description}") @@ -461,10 +462,10 @@ class GitHubScraper: languages = self.repo.get_languages() total_bytes = sum(languages.values()) - self.extracted_data['languages'] = { + self.extracted_data["languages"] = { lang: { - 'bytes': bytes_count, - 'percentage': round((bytes_count / total_bytes) * 100, 2) if total_bytes > 0 else 0 + "bytes": bytes_count, + "percentage": round((bytes_count / total_bytes) * 100, 2) if total_bytes > 0 else 0, } for lang, bytes_count in languages.items() } @@ -486,7 +487,7 @@ class GitHubScraper: True if directory should be excluded """ # Check directory name - if dir_name in self.excluded_dirs or dir_name.startswith('.'): + if dir_name in self.excluded_dirs or dir_name.startswith("."): return True # Check full path if provided (for nested exclusions like "TextMesh Pro/Examples & Extras") @@ -500,14 +501,14 @@ class GitHubScraper: if self.gitignore_spec and dir_path: # For directories, we need to check both with and without trailing slash # as .gitignore patterns can match either way - dir_path_with_slash = dir_path if dir_path.endswith('/') else dir_path + '/' + dir_path_with_slash = dir_path if dir_path.endswith("/") else dir_path + "/" if self.gitignore_spec.match_file(dir_path) or self.gitignore_spec.match_file(dir_path_with_slash): logger.debug(f"Directory excluded by .gitignore: {dir_path}") return True return False - def _load_gitignore(self) -> Optional['pathspec.PathSpec']: + def _load_gitignore(self) -> Optional["pathspec.PathSpec"]: """ Load .gitignore file and create pathspec matcher (C2.1). @@ -522,14 +523,14 @@ class GitHubScraper: if not self.local_repo_path: return None - gitignore_path = Path(self.local_repo_path) / '.gitignore' + gitignore_path = Path(self.local_repo_path) / ".gitignore" if not gitignore_path.exists(): logger.debug(f"No .gitignore found in {self.local_repo_path}") return None try: - with open(gitignore_path, 'r', encoding='utf-8') as f: - spec = pathspec.PathSpec.from_lines('gitwildmatch', f) + with open(gitignore_path, encoding="utf-8") as f: + spec = pathspec.PathSpec.from_lines("gitwildmatch", f) logger.info(f"Loaded .gitignore from {gitignore_path}") return spec except Exception as e: @@ -561,8 +562,8 @@ class GitHubScraper: for root, dirs, files in os.walk(self.local_repo_path): # Calculate relative path from repo root first (needed for exclusion checks) rel_root = os.path.relpath(root, self.local_repo_path) - if rel_root == '.': - rel_root = '' + if rel_root == ".": + rel_root = "" # Exclude directories in-place to prevent os.walk from descending into them # Pass both dir name and full path for path-based exclusions @@ -579,11 +580,7 @@ class GitHubScraper: # Add directories for dir_name in dirs: dir_path = os.path.join(rel_root, dir_name) if rel_root else dir_name - file_tree.append({ - 'path': dir_path, - 'type': 'dir', - 'size': None - }) + file_tree.append({"path": dir_path, "type": "dir", "size": None}) # Add files for file_name in files: @@ -594,13 +591,9 @@ class GitHubScraper: except OSError: file_size = None - file_tree.append({ - 'path': file_path, - 'type': 'file', - 'size': file_size - }) + file_tree.append({"path": file_path, "type": "file", "size": file_size}) - self.extracted_data['file_tree'] = file_tree + self.extracted_data["file_tree"] = file_tree logger.info(f"File tree built (local mode): {len(file_tree)} items ({excluded_count} directories excluded)") def _extract_file_tree_github(self): @@ -613,16 +606,16 @@ class GitHubScraper: file_content = contents.pop(0) file_info = { - 'path': file_content.path, - 'type': file_content.type, - 'size': file_content.size if file_content.type == 'file' else None + "path": file_content.path, + "type": file_content.type, + "size": file_content.size if file_content.type == "file" else None, } file_tree.append(file_info) if file_content.type == "dir": contents.extend(self.repo.get_contents(file_content.path)) - self.extracted_data['file_tree'] = file_tree + self.extracted_data["file_tree"] = file_tree logger.info(f"File tree built (GitHub API mode): {len(file_tree)} items") except GithubException as e: @@ -637,7 +630,7 @@ class GitHubScraper: - deep: Parse files for signatures, parameters, types - full: Complete AST analysis (future enhancement) """ - if self.code_analysis_depth == 'surface': + if self.code_analysis_depth == "surface": logger.info("Code extraction: Surface level (file tree only)") return @@ -648,22 +641,22 @@ class GitHubScraper: logger.info(f"Extracting code signatures ({self.code_analysis_depth} analysis)...") # Get primary language for the repository - languages = self.extracted_data.get('languages', {}) + languages = self.extracted_data.get("languages", {}) if not languages: logger.warning("No languages detected - skipping code analysis") return # Determine primary language - primary_language = max(languages.items(), key=lambda x: x[1]['bytes'])[0] + primary_language = max(languages.items(), key=lambda x: x[1]["bytes"])[0] logger.info(f"Primary language: {primary_language}") # Determine file extensions to analyze extension_map = { - 'Python': ['.py'], - 'JavaScript': ['.js', '.jsx'], - 'TypeScript': ['.ts', '.tsx'], - 'C': ['.c', '.h'], - 'C++': ['.cpp', '.hpp', '.cc', '.hh', '.cxx'] + "Python": [".py"], + "JavaScript": [".js", ".jsx"], + "TypeScript": [".ts", ".tsx"], + "C": [".c", ".h"], + "C++": [".cpp", ".hpp", ".cc", ".hh", ".cxx"], } extensions = extension_map.get(primary_language, []) @@ -673,10 +666,10 @@ class GitHubScraper: # Analyze files matching patterns and extensions analyzed_files = [] - file_tree = self.extracted_data.get('file_tree', []) + file_tree = self.extracted_data.get("file_tree", []) for file_info in file_tree: - file_path = file_info['path'] + file_path = file_info["path"] # Check if file matches extension if not any(file_path.endswith(ext) for ext in extensions): @@ -685,6 +678,7 @@ class GitHubScraper: # Check if file matches patterns (if specified) if self.file_patterns: import fnmatch + if not any(fnmatch.fnmatch(file_path, pattern) for pattern in self.file_patterns): continue @@ -694,29 +688,23 @@ class GitHubScraper: if self.local_repo_path: # Local mode - read from filesystem full_path = os.path.join(self.local_repo_path, file_path) - with open(full_path, 'r', encoding='utf-8') as f: + with open(full_path, encoding="utf-8") as f: content = f.read() else: # GitHub API mode - fetch from API file_content = self.repo.get_contents(file_path) - content = file_content.decoded_content.decode('utf-8') + content = file_content.decoded_content.decode("utf-8") - analysis_result = self.code_analyzer.analyze_file( - file_path, - content, - primary_language - ) + analysis_result = self.code_analyzer.analyze_file(file_path, content, primary_language) - if analysis_result and (analysis_result.get('classes') or analysis_result.get('functions')): - analyzed_files.append({ - 'file': file_path, - 'language': primary_language, - **analysis_result - }) + if analysis_result and (analysis_result.get("classes") or analysis_result.get("functions")): + analyzed_files.append({"file": file_path, "language": primary_language, **analysis_result}) - logger.debug(f"Analyzed {file_path}: " - f"{len(analysis_result.get('classes', []))} classes, " - f"{len(analysis_result.get('functions', []))} functions") + logger.debug( + f"Analyzed {file_path}: " + f"{len(analysis_result.get('classes', []))} classes, " + f"{len(analysis_result.get('functions', []))} functions" + ) except Exception as e: logger.debug(f"Could not analyze {file_path}: {e}") @@ -724,22 +712,23 @@ class GitHubScraper: # Limit number of files analyzed to avoid rate limits (GitHub API mode only) if not self.local_repo_path and len(analyzed_files) >= 50: - logger.info(f"Reached analysis limit (50 files, GitHub API mode)") + logger.info("Reached analysis limit (50 files, GitHub API mode)") break - self.extracted_data['code_analysis'] = { - 'depth': self.code_analysis_depth, - 'language': primary_language, - 'files_analyzed': len(analyzed_files), - 'files': analyzed_files + self.extracted_data["code_analysis"] = { + "depth": self.code_analysis_depth, + "language": primary_language, + "files_analyzed": len(analyzed_files), + "files": analyzed_files, } # Calculate totals - total_classes = sum(len(f.get('classes', [])) for f in analyzed_files) - total_functions = sum(len(f.get('functions', [])) for f in analyzed_files) + total_classes = sum(len(f.get("classes", [])) for f in analyzed_files) + total_functions = sum(len(f.get("functions", [])) for f in analyzed_files) - logger.info(f"Code analysis complete: {len(analyzed_files)} files, " - f"{total_classes} classes, {total_functions} functions") + logger.info( + f"Code analysis complete: {len(analyzed_files)} files, {total_classes} classes, {total_functions} functions" + ) def _extract_issues(self): """C1.7: Extract GitHub Issues (open/closed, labels, milestones).""" @@ -747,29 +736,29 @@ class GitHubScraper: try: # Fetch recent issues (open + closed) - issues = self.repo.get_issues(state='all', sort='updated', direction='desc') + issues = self.repo.get_issues(state="all", sort="updated", direction="desc") issue_list = [] - for issue in issues[:self.max_issues]: + for issue in issues[: self.max_issues]: # Skip pull requests (they appear in issues) if issue.pull_request: continue issue_data = { - 'number': issue.number, - 'title': issue.title, - 'state': issue.state, - 'labels': [label.name for label in issue.labels], - 'milestone': issue.milestone.title if issue.milestone else None, - 'created_at': issue.created_at.isoformat() if issue.created_at else None, - 'updated_at': issue.updated_at.isoformat() if issue.updated_at else None, - 'closed_at': issue.closed_at.isoformat() if issue.closed_at else None, - 'url': issue.html_url, - 'body': issue.body[:500] if issue.body else None # First 500 chars + "number": issue.number, + "title": issue.title, + "state": issue.state, + "labels": [label.name for label in issue.labels], + "milestone": issue.milestone.title if issue.milestone else None, + "created_at": issue.created_at.isoformat() if issue.created_at else None, + "updated_at": issue.updated_at.isoformat() if issue.updated_at else None, + "closed_at": issue.closed_at.isoformat() if issue.closed_at else None, + "url": issue.html_url, + "body": issue.body[:500] if issue.body else None, # First 500 chars } issue_list.append(issue_data) - self.extracted_data['issues'] = issue_list + self.extracted_data["issues"] = issue_list logger.info(f"Extracted {len(issue_list)} issues") except GithubException as e: @@ -780,14 +769,21 @@ class GitHubScraper: logger.info("Extracting CHANGELOG...") # Try common changelog locations - changelog_files = ['CHANGELOG.md', 'CHANGES.md', 'HISTORY.md', - 'CHANGELOG.rst', 'CHANGELOG.txt', 'CHANGELOG', - 'docs/CHANGELOG.md', '.github/CHANGELOG.md'] + changelog_files = [ + "CHANGELOG.md", + "CHANGES.md", + "HISTORY.md", + "CHANGELOG.rst", + "CHANGELOG.txt", + "CHANGELOG", + "docs/CHANGELOG.md", + ".github/CHANGELOG.md", + ] for changelog_path in changelog_files: changelog_content = self._get_file_content(changelog_path) if changelog_content: - self.extracted_data['changelog'] = changelog_content + self.extracted_data["changelog"] = changelog_content logger.info(f"CHANGELOG found: {changelog_path}") return @@ -803,20 +799,20 @@ class GitHubScraper: release_list = [] for release in releases: release_data = { - 'tag_name': release.tag_name, - 'name': release.title, - 'body': release.body, - 'draft': release.draft, - 'prerelease': release.prerelease, - 'created_at': release.created_at.isoformat() if release.created_at else None, - 'published_at': release.published_at.isoformat() if release.published_at else None, - 'url': release.html_url, - 'tarball_url': release.tarball_url, - 'zipball_url': release.zipball_url + "tag_name": release.tag_name, + "name": release.title, + "body": release.body, + "draft": release.draft, + "prerelease": release.prerelease, + "created_at": release.created_at.isoformat() if release.created_at else None, + "published_at": release.published_at.isoformat() if release.published_at else None, + "url": release.html_url, + "tarball_url": release.tarball_url, + "zipball_url": release.zipball_url, } release_list.append(release_data) - self.extracted_data['releases'] = release_list + self.extracted_data["releases"] = release_list logger.info(f"Extracted {len(release_list)} releases") except GithubException as e: @@ -824,9 +820,9 @@ class GitHubScraper: def _save_data(self): """Save extracted data to JSON file.""" - os.makedirs('output', exist_ok=True) + os.makedirs("output", exist_ok=True) - with open(self.data_file, 'w', encoding='utf-8') as f: + with open(self.data_file, "w", encoding="utf-8") as f: json.dump(self.extracted_data, f, indent=2, ensure_ascii=False) logger.info(f"Data saved to: {self.data_file}") @@ -837,10 +833,10 @@ class GitHubToSkillConverter: Convert extracted GitHub data to Claude skill format (C1.10). """ - def __init__(self, config: Dict[str, Any]): + def __init__(self, config: dict[str, Any]): """Initialize converter with configuration.""" self.config = config - self.name = config.get('name', config['repo'].split('/')[-1]) + self.name = config.get("name", config["repo"].split("/")[-1]) # Paths self.data_file = f"output/{self.name}_github_data.json" @@ -850,23 +846,23 @@ class GitHubToSkillConverter: self.data = self._load_data() # Set description (smart extraction from README if available) - if 'description' in config: - self.description = config['description'] + if "description" in config: + self.description = config["description"] else: # Try to extract from README in loaded data - readme_content = self.data.get('readme', '') - repo_name = config['repo'] + readme_content = self.data.get("readme", "") + repo_name = config["repo"] if readme_content: self.description = extract_description_from_readme(readme_content, repo_name) else: - self.description = f'Use when working with {repo_name.split("/")[-1]}' + self.description = f"Use when working with {repo_name.split('/')[-1]}" - def _load_data(self) -> Dict[str, Any]: + def _load_data(self) -> dict[str, Any]: """Load extracted GitHub data from JSON.""" if not os.path.exists(self.data_file): raise FileNotFoundError(f"Data file not found: {self.data_file}") - with open(self.data_file, 'r', encoding='utf-8') as f: + with open(self.data_file, encoding="utf-8") as f: return json.load(f) def build_skill(self): @@ -889,12 +885,12 @@ class GitHubToSkillConverter: def _generate_skill_md(self): """Generate main SKILL.md file (rich version with C3.x data if available).""" - repo_info = self.data.get('repo_info', {}) - c3_data = self.data.get('c3_analysis', {}) + repo_info = self.data.get("repo_info", {}) + c3_data = self.data.get("c3_analysis", {}) has_c3_data = bool(c3_data) # Generate skill name (lowercase, hyphens only, max 64 chars) - skill_name = self.name.lower().replace('_', '-').replace(' ', '-')[:64] + skill_name = self.name.lower().replace("_", "-").replace(" ", "-")[:64] # Truncate description to 1024 chars if needed desc = self.description[:1024] if len(self.description) > 1024 else self.description @@ -905,23 +901,23 @@ name: {skill_name} description: {desc} --- -# {repo_info.get('name', self.name)} +# {repo_info.get("name", self.name)} {self.description} ## Description -{repo_info.get('description', 'GitHub repository skill')} +{repo_info.get("description", "GitHub repository skill")} -**Repository:** [{repo_info.get('full_name', 'N/A')}]({repo_info.get('url', '#')}) -**Language:** {repo_info.get('language', 'N/A')} -**Stars:** {repo_info.get('stars', 0):,} -**License:** {repo_info.get('license', 'N/A')} +**Repository:** [{repo_info.get("full_name", "N/A")}]({repo_info.get("url", "#")}) +**Language:** {repo_info.get("language", "N/A")} +**Stars:** {repo_info.get("stars", 0):,} +**License:** {repo_info.get("license", "N/A")} ## When to Use This Skill Use this skill when you need to: -- Understand how to use {repo_info.get('name', self.name)} +- Understand how to use {repo_info.get("name", self.name)} - Look up API documentation and implementation details - Find real-world usage examples from the codebase - Review design patterns and architecture @@ -944,19 +940,19 @@ Use this skill when you need to: skill_content += self._format_languages() + "\n\n" # Add C3.x pattern summary if available - if has_c3_data and c3_data.get('patterns'): + if has_c3_data and c3_data.get("patterns"): skill_content += self._format_pattern_summary(c3_data) # Add code examples if available (C3.2 test examples) - if has_c3_data and c3_data.get('test_examples'): + if has_c3_data and c3_data.get("test_examples"): skill_content += self._format_code_examples(c3_data) # Add API Reference if available (C2.5) - if has_c3_data and c3_data.get('api_reference'): + if has_c3_data and c3_data.get("api_reference"): skill_content += self._format_api_reference(c3_data) # Add Architecture Overview if available (C3.7) - if has_c3_data and c3_data.get('architecture'): + if has_c3_data and c3_data.get("architecture"): skill_content += self._format_architecture(c3_data) # Add Known Issues section @@ -976,13 +972,13 @@ Use this skill when you need to: if has_c3_data: skill_content += "\n### Codebase Analysis References\n\n" - if c3_data.get('patterns'): + if c3_data.get("patterns"): skill_content += "- `references/codebase_analysis/patterns/` - Design patterns detected\n" - if c3_data.get('test_examples'): + if c3_data.get("test_examples"): skill_content += "- `references/codebase_analysis/examples/` - Test examples extracted\n" - if c3_data.get('config_patterns'): + if c3_data.get("config_patterns"): skill_content += "- `references/codebase_analysis/configuration/` - Configuration analysis\n" - if c3_data.get('architecture'): + if c3_data.get("architecture"): skill_content += "- `references/codebase_analysis/ARCHITECTURE.md` - Architecture overview\n" # Usage @@ -998,27 +994,27 @@ Use this skill when you need to: # Write to file skill_path = f"{self.skill_dir}/SKILL.md" - with open(skill_path, 'w', encoding='utf-8') as f: + with open(skill_path, "w", encoding="utf-8") as f: f.write(skill_content) - line_count = len(skill_content.split('\n')) + line_count = len(skill_content.split("\n")) logger.info(f"Generated: {skill_path} ({line_count} lines)") def _format_languages(self) -> str: """Format language breakdown.""" - languages = self.data.get('languages', {}) + languages = self.data.get("languages", {}) if not languages: return "No language data available" lines = [] - for lang, info in sorted(languages.items(), key=lambda x: x[1]['bytes'], reverse=True): + for lang, info in sorted(languages.items(), key=lambda x: x[1]["bytes"], reverse=True): lines.append(f"- **{lang}:** {info['percentage']:.1f}%") - return '\n'.join(lines) + return "\n".join(lines) def _format_recent_releases(self) -> str: """Format recent releases (top 3).""" - releases = self.data.get('releases', []) + releases = self.data.get("releases", []) if not releases: return "No releases available" @@ -1026,11 +1022,11 @@ Use this skill when you need to: for release in releases[:3]: lines.append(f"- **{release['tag_name']}** ({release['published_at'][:10]}): {release['name']}") - return '\n'.join(lines) + return "\n".join(lines) - def _format_pattern_summary(self, c3_data: Dict[str, Any]) -> str: + def _format_pattern_summary(self, c3_data: dict[str, Any]) -> str: """Format design patterns summary (C3.1).""" - patterns_data = c3_data.get('patterns', []) + patterns_data = c3_data.get("patterns", []) if not patterns_data: return "" @@ -1039,10 +1035,10 @@ Use this skill when you need to: by_class = {} for pattern_file in patterns_data: - for pattern in pattern_file.get('patterns', []): - ptype = pattern.get('pattern_type', 'Unknown') - cls = pattern.get('class_name', '') - confidence = pattern.get('confidence', 0) + for pattern in pattern_file.get("patterns", []): + ptype = pattern.get("pattern_type", "Unknown") + cls = pattern.get("class_name", "") + confidence = pattern.get("confidence", 0) # Skip low confidence if confidence < 0.7: @@ -1050,7 +1046,7 @@ Use this skill when you need to: # Deduplicate by class key = f"{cls}:{ptype}" - if key not in by_class or by_class[key]['confidence'] < confidence: + if key not in by_class or by_class[key]["confidence"] < confidence: by_class[key] = pattern # Count by type @@ -1069,16 +1065,16 @@ Use this skill when you need to: content += f"\n*Total: {len(by_class)} high-confidence patterns*\n\n" return content - def _format_code_examples(self, c3_data: Dict[str, Any]) -> str: + def _format_code_examples(self, c3_data: dict[str, Any]) -> str: """Format code examples (C3.2).""" - examples_data = c3_data.get('test_examples', {}) - examples = examples_data.get('examples', []) + examples_data = c3_data.get("test_examples", {}) + examples = examples_data.get("examples", []) if not examples: return "" # Filter high-value examples (complexity > 0.7) - high_value = [ex for ex in examples if ex.get('complexity_score', 0) > 0.7] + high_value = [ex for ex in examples if ex.get("complexity_score", 0) > 0.7] if not high_value: return "" @@ -1087,20 +1083,20 @@ Use this skill when you need to: content += "*High-quality examples from codebase (C3.2)*\n\n" # Top 10 examples - for ex in sorted(high_value, key=lambda x: x.get('complexity_score', 0), reverse=True)[:10]: - desc = ex.get('description', 'Example') - lang = ex.get('language', 'python') - code = ex.get('code', '') - complexity = ex.get('complexity_score', 0) + for ex in sorted(high_value, key=lambda x: x.get("complexity_score", 0), reverse=True)[:10]: + desc = ex.get("description", "Example") + lang = ex.get("language", "python") + code = ex.get("code", "") + complexity = ex.get("complexity_score", 0) content += f"**{desc}** (complexity: {complexity:.2f})\n\n" content += f"```{lang}\n{code}\n```\n\n" return content - def _format_api_reference(self, c3_data: Dict[str, Any]) -> str: + def _format_api_reference(self, c3_data: dict[str, Any]) -> str: """Format API reference (C2.5).""" - api_ref = c3_data.get('api_reference', {}) + api_ref = c3_data.get("api_reference", {}) if not api_ref: return "" @@ -1121,9 +1117,9 @@ Use this skill when you need to: content += "*See `references/codebase_analysis/api_reference/` for complete API docs*\n\n" return content - def _format_architecture(self, c3_data: Dict[str, Any]) -> str: + def _format_architecture(self, c3_data: dict[str, Any]) -> str: """Format architecture overview (C3.7).""" - arch_data = c3_data.get('architecture', {}) + arch_data = c3_data.get("architecture", {}) if not arch_data: return "" @@ -1132,7 +1128,7 @@ Use this skill when you need to: content += "*From C3.7 codebase analysis*\n\n" # Architecture patterns - patterns = arch_data.get('patterns', []) + patterns = arch_data.get("patterns", []) if patterns: content += "**Architectural Patterns:**\n" for pattern in patterns[:5]: @@ -1140,10 +1136,10 @@ Use this skill when you need to: content += "\n" # Dependencies (C2.6) - dep_data = c3_data.get('dependency_graph', {}) + dep_data = c3_data.get("dependency_graph", {}) if dep_data: - total_deps = dep_data.get('total_dependencies', 0) - circular = len(dep_data.get('circular_dependencies', [])) + total_deps = dep_data.get("total_dependencies", 0) + circular = len(dep_data.get("circular_dependencies", [])) if total_deps > 0: content += f"**Dependencies:** {total_deps} total" if circular > 0: @@ -1155,7 +1151,7 @@ Use this skill when you need to: def _format_known_issues(self) -> str: """Format known issues from GitHub.""" - issues = self.data.get('issues', []) + issues = self.data.get("issues", []) if not issues: return "" @@ -1165,111 +1161,111 @@ Use this skill when you need to: # Top 5 issues for issue in issues[:5]: - title = issue.get('title', 'Untitled') - number = issue.get('number', 0) - labels = ', '.join(issue.get('labels', [])) + title = issue.get("title", "Untitled") + number = issue.get("number", 0) + labels = ", ".join(issue.get("labels", [])) content += f"- **#{number}**: {title}" if labels: content += f" [`{labels}`]" content += "\n" - content += f"\n*See `references/issues.md` for complete list*\n\n" + content += "\n*See `references/issues.md` for complete list*\n\n" return content def _generate_references(self): """Generate all reference files.""" # README - if self.data.get('readme'): + if self.data.get("readme"): readme_path = f"{self.skill_dir}/references/README.md" - with open(readme_path, 'w', encoding='utf-8') as f: - f.write(self.data['readme']) + with open(readme_path, "w", encoding="utf-8") as f: + f.write(self.data["readme"]) logger.info(f"Generated: {readme_path}") # CHANGELOG - if self.data.get('changelog'): + if self.data.get("changelog"): changelog_path = f"{self.skill_dir}/references/CHANGELOG.md" - with open(changelog_path, 'w', encoding='utf-8') as f: - f.write(self.data['changelog']) + with open(changelog_path, "w", encoding="utf-8") as f: + f.write(self.data["changelog"]) logger.info(f"Generated: {changelog_path}") # Issues - if self.data.get('issues'): + if self.data.get("issues"): self._generate_issues_reference() # Releases - if self.data.get('releases'): + if self.data.get("releases"): self._generate_releases_reference() # File structure - if self.data.get('file_tree'): + if self.data.get("file_tree"): self._generate_file_structure_reference() def _generate_issues_reference(self): """Generate issues.md reference file.""" - issues = self.data['issues'] + issues = self.data["issues"] content = f"# GitHub Issues\n\nRecent issues from the repository ({len(issues)} total).\n\n" # Group by state - open_issues = [i for i in issues if i['state'] == 'open'] - closed_issues = [i for i in issues if i['state'] == 'closed'] + open_issues = [i for i in issues if i["state"] == "open"] + closed_issues = [i for i in issues if i["state"] == "closed"] content += f"## Open Issues ({len(open_issues)})\n\n" for issue in open_issues[:20]: - labels = ', '.join(issue['labels']) if issue['labels'] else 'No labels' + labels = ", ".join(issue["labels"]) if issue["labels"] else "No labels" content += f"### #{issue['number']}: {issue['title']}\n" content += f"**Labels:** {labels} | **Created:** {issue['created_at'][:10]}\n" content += f"[View on GitHub]({issue['url']})\n\n" content += f"\n## Recently Closed Issues ({len(closed_issues)})\n\n" for issue in closed_issues[:10]: - labels = ', '.join(issue['labels']) if issue['labels'] else 'No labels' + labels = ", ".join(issue["labels"]) if issue["labels"] else "No labels" content += f"### #{issue['number']}: {issue['title']}\n" content += f"**Labels:** {labels} | **Closed:** {issue['closed_at'][:10]}\n" content += f"[View on GitHub]({issue['url']})\n\n" issues_path = f"{self.skill_dir}/references/issues.md" - with open(issues_path, 'w', encoding='utf-8') as f: + with open(issues_path, "w", encoding="utf-8") as f: f.write(content) logger.info(f"Generated: {issues_path}") def _generate_releases_reference(self): """Generate releases.md reference file.""" - releases = self.data['releases'] + releases = self.data["releases"] content = f"# Releases\n\nVersion history for this repository ({len(releases)} releases).\n\n" for release in releases: content += f"## {release['tag_name']}: {release['name']}\n" content += f"**Published:** {release['published_at'][:10]}\n" - if release['prerelease']: - content += f"**Pre-release**\n" + if release["prerelease"]: + content += "**Pre-release**\n" content += f"\n{release['body']}\n\n" content += f"[View on GitHub]({release['url']})\n\n---\n\n" releases_path = f"{self.skill_dir}/references/releases.md" - with open(releases_path, 'w', encoding='utf-8') as f: + with open(releases_path, "w", encoding="utf-8") as f: f.write(content) logger.info(f"Generated: {releases_path}") def _generate_file_structure_reference(self): """Generate file_structure.md reference file.""" - file_tree = self.data['file_tree'] + file_tree = self.data["file_tree"] - content = f"# Repository File Structure\n\n" + content = "# Repository File Structure\n\n" content += f"Total items: {len(file_tree)}\n\n" content += "```\n" # Build tree structure for item in file_tree: - indent = " " * item['path'].count('/') - icon = "šŸ“" if item['type'] == 'dir' else "šŸ“„" + indent = " " * item["path"].count("/") + icon = "šŸ“" if item["type"] == "dir" else "šŸ“„" content += f"{indent}{icon} {os.path.basename(item['path'])}\n" content += "```\n" structure_path = f"{self.skill_dir}/references/file_structure.md" - with open(structure_path, 'w', encoding='utf-8') as f: + with open(structure_path, "w", encoding="utf-8") as f: f.write(content) logger.info(f"Generated: {structure_path}") @@ -1277,63 +1273,64 @@ Use this skill when you need to: def main(): """C1.10: CLI tool entry point.""" parser = argparse.ArgumentParser( - description='GitHub Repository to Claude Skill Converter', + description="GitHub Repository to Claude Skill Converter", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=""" Examples: skill-seekers github --repo facebook/react skill-seekers github --config configs/react_github.json skill-seekers github --repo owner/repo --token $GITHUB_TOKEN - """ + """, ) - parser.add_argument('--repo', help='GitHub repository (owner/repo)') - parser.add_argument('--config', help='Path to config JSON file') - parser.add_argument('--token', help='GitHub personal access token') - parser.add_argument('--name', help='Skill name (default: repo name)') - parser.add_argument('--description', help='Skill description') - parser.add_argument('--no-issues', action='store_true', help='Skip GitHub issues') - parser.add_argument('--no-changelog', action='store_true', help='Skip CHANGELOG') - parser.add_argument('--no-releases', action='store_true', help='Skip releases') - parser.add_argument('--max-issues', type=int, default=100, help='Max issues to fetch') - parser.add_argument('--scrape-only', action='store_true', help='Only scrape, don\'t build skill') - parser.add_argument('--enhance', action='store_true', - help='Enhance SKILL.md using Claude API after building (requires API key)') - parser.add_argument('--enhance-local', action='store_true', - help='Enhance SKILL.md using Claude Code (no API key needed)') - parser.add_argument('--api-key', type=str, - help='Anthropic API key for --enhance (or set ANTHROPIC_API_KEY)') - parser.add_argument('--non-interactive', action='store_true', - help='Non-interactive mode for CI/CD (fail fast on rate limits)') - parser.add_argument('--profile', type=str, - help='GitHub profile name to use from config') + parser.add_argument("--repo", help="GitHub repository (owner/repo)") + parser.add_argument("--config", help="Path to config JSON file") + parser.add_argument("--token", help="GitHub personal access token") + parser.add_argument("--name", help="Skill name (default: repo name)") + parser.add_argument("--description", help="Skill description") + parser.add_argument("--no-issues", action="store_true", help="Skip GitHub issues") + parser.add_argument("--no-changelog", action="store_true", help="Skip CHANGELOG") + parser.add_argument("--no-releases", action="store_true", help="Skip releases") + parser.add_argument("--max-issues", type=int, default=100, help="Max issues to fetch") + parser.add_argument("--scrape-only", action="store_true", help="Only scrape, don't build skill") + parser.add_argument( + "--enhance", action="store_true", help="Enhance SKILL.md using Claude API after building (requires API key)" + ) + parser.add_argument( + "--enhance-local", action="store_true", help="Enhance SKILL.md using Claude Code (no API key needed)" + ) + parser.add_argument("--api-key", type=str, help="Anthropic API key for --enhance (or set ANTHROPIC_API_KEY)") + parser.add_argument( + "--non-interactive", action="store_true", help="Non-interactive mode for CI/CD (fail fast on rate limits)" + ) + parser.add_argument("--profile", type=str, help="GitHub profile name to use from config") args = parser.parse_args() # Build config from args or file if args.config: - with open(args.config, 'r', encoding='utf-8') as f: + with open(args.config, encoding="utf-8") as f: config = json.load(f) # Override with CLI args if provided if args.non_interactive: - config['interactive'] = False + config["interactive"] = False if args.profile: - config['github_profile'] = args.profile + config["github_profile"] = args.profile elif args.repo: config = { - 'repo': args.repo, - 'name': args.name or args.repo.split('/')[-1], - 'description': args.description or f'Use when working with {args.repo.split("/")[-1]}', - 'github_token': args.token, - 'include_issues': not args.no_issues, - 'include_changelog': not args.no_changelog, - 'include_releases': not args.no_releases, - 'max_issues': args.max_issues, - 'interactive': not args.non_interactive, - 'github_profile': args.profile + "repo": args.repo, + "name": args.name or args.repo.split("/")[-1], + "description": args.description or f"Use when working with {args.repo.split('/')[-1]}", + "github_token": args.token, + "include_issues": not args.no_issues, + "include_changelog": not args.no_changelog, + "include_releases": not args.no_releases, + "max_issues": args.max_issues, + "interactive": not args.non_interactive, + "github_profile": args.profile, } else: - parser.error('Either --repo or --config is required') + parser.error("Either --repo or --config is required") try: # Phase 1: Scrape GitHub repository @@ -1348,7 +1345,7 @@ Examples: converter = GitHubToSkillConverter(config) converter.build_skill() - skill_name = config.get('name', config['repo'].split('/')[-1]) + skill_name = config.get("name", config["repo"].split("/")[-1]) skill_dir = f"output/{skill_name}" # Phase 3: Optional enhancement @@ -1357,9 +1354,10 @@ Examples: if args.enhance_local: # Local enhancement using Claude Code - from skill_seekers.cli.enhance_skill_local import LocalSkillEnhancer from pathlib import Path + from skill_seekers.cli.enhance_skill_local import LocalSkillEnhancer + enhancer = LocalSkillEnhancer(Path(skill_dir)) enhancer.run(headless=True) logger.info("āœ… Local enhancement complete!") @@ -1367,7 +1365,8 @@ Examples: elif args.enhance: # API-based enhancement import os - api_key = args.api_key or os.environ.get('ANTHROPIC_API_KEY') + + api_key = args.api_key or os.environ.get("ANTHROPIC_API_KEY") if not api_key: logger.error("āŒ ANTHROPIC_API_KEY not set. Use --api-key or set environment variable.") logger.info("šŸ’” Tip: Use --enhance-local instead (no API key needed)") @@ -1375,6 +1374,7 @@ Examples: # Import and run API enhancement try: from skill_seekers.cli.enhance_skill import enhance_skill_md + enhance_skill_md(skill_dir, api_key) logger.info("āœ… API enhancement complete!") except ImportError: @@ -1386,7 +1386,7 @@ Examples: if not (args.enhance or args.enhance_local): logger.info("\nšŸ’” Optional: Enhance SKILL.md with Claude:") logger.info(f" Local (recommended): skill-seekers enhance {skill_dir}/") - logger.info(f" or re-run with: --enhance-local") + logger.info(" or re-run with: --enhance-local") logger.info(f"\nNext step: skill-seekers package {skill_dir}/") @@ -1395,5 +1395,5 @@ Examples: sys.exit(1) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/src/skill_seekers/cli/guide_enhancer.py b/src/skill_seekers/cli/guide_enhancer.py index 686b987..25a2e8d 100644 --- a/src/skill_seekers/cli/guide_enhancer.py +++ b/src/skill_seekers/cli/guide_enhancer.py @@ -20,7 +20,7 @@ import subprocess import tempfile from dataclasses import dataclass, field from pathlib import Path -from typing import Dict, List, Optional, TYPE_CHECKING +from typing import TYPE_CHECKING # Avoid circular imports by using TYPE_CHECKING if TYPE_CHECKING: @@ -40,15 +40,17 @@ else: @dataclass class TroubleshootingItem: problem: str - symptoms: List[str] = field(default_factory=list) + symptoms: list[str] = field(default_factory=list) solution: str = "" - diagnostic_steps: List[str] = field(default_factory=list) + diagnostic_steps: list[str] = field(default_factory=list) + logger = logging.getLogger(__name__) # Conditional import for Anthropic API try: import anthropic + ANTHROPIC_AVAILABLE = True except ImportError: ANTHROPIC_AVAILABLE = False @@ -58,9 +60,10 @@ except ImportError: @dataclass class StepEnhancement: """Enhanced step information (internal use only)""" + step_index: int explanation: str # Natural language explanation - variations: List[str] = field(default_factory=list) # Alternative approaches + variations: list[str] = field(default_factory=list) # Alternative approaches class GuideEnhancer: @@ -81,7 +84,7 @@ class GuideEnhancer: mode: Enhancement mode - "api", "local", or "auto" """ self.mode = self._detect_mode(mode) - self.api_key = os.environ.get('ANTHROPIC_API_KEY') + self.api_key = os.environ.get("ANTHROPIC_API_KEY") self.client = None if self.mode == "api": @@ -119,7 +122,7 @@ class GuideEnhancer: """ if requested_mode == "auto": # Prefer API if key available, else LOCAL - if os.environ.get('ANTHROPIC_API_KEY') and ANTHROPIC_AVAILABLE: + if os.environ.get("ANTHROPIC_API_KEY") and ANTHROPIC_AVAILABLE: return "api" elif self._check_claude_cli(): return "local" @@ -130,17 +133,12 @@ class GuideEnhancer: def _check_claude_cli(self) -> bool: """Check if Claude Code CLI is available.""" try: - result = subprocess.run( - ['claude', '--version'], - capture_output=True, - text=True, - timeout=5 - ) + result = subprocess.run(["claude", "--version"], capture_output=True, text=True, timeout=5) return result.returncode == 0 except (FileNotFoundError, subprocess.TimeoutExpired): return False - def enhance_guide(self, guide_data: Dict) -> Dict: + def enhance_guide(self, guide_data: dict) -> dict: """ Apply all 5 enhancements to a guide. @@ -164,7 +162,7 @@ class GuideEnhancer: logger.info("šŸ“ Returning original guide without enhancement") return guide_data - def enhance_step_descriptions(self, steps: List[Dict]) -> List[StepEnhancement]: + def enhance_step_descriptions(self, steps: list[dict]) -> list[StepEnhancement]: """ Enhancement 1: Add natural language explanations to steps. @@ -187,17 +185,17 @@ class GuideEnhancer: data = json.loads(response) return [ StepEnhancement( - step_index=item.get('step_index', i), - explanation=item.get('explanation', ''), - variations=item.get('variations', []) + step_index=item.get("step_index", i), + explanation=item.get("explanation", ""), + variations=item.get("variations", []), ) - for i, item in enumerate(data.get('step_descriptions', [])) + for i, item in enumerate(data.get("step_descriptions", [])) ] except (json.JSONDecodeError, KeyError) as e: logger.warning(f"āš ļø Failed to parse step descriptions: {e}") return [] - def enhance_troubleshooting(self, guide_data: Dict) -> List[TroubleshootingItem]: + def enhance_troubleshooting(self, guide_data: dict) -> list[TroubleshootingItem]: """ Enhancement 2: Generate diagnostic flows + solutions. @@ -220,18 +218,18 @@ class GuideEnhancer: data = json.loads(response) return [ TroubleshootingItem( - problem=item.get('problem', ''), - symptoms=item.get('symptoms', []), - diagnostic_steps=item.get('diagnostic_steps', []), - solution=item.get('solution', '') + problem=item.get("problem", ""), + symptoms=item.get("symptoms", []), + diagnostic_steps=item.get("diagnostic_steps", []), + solution=item.get("solution", ""), ) - for item in data.get('troubleshooting', []) + for item in data.get("troubleshooting", []) ] except (json.JSONDecodeError, KeyError) as e: logger.warning(f"āš ļø Failed to parse troubleshooting items: {e}") return [] - def enhance_prerequisites(self, prereqs: List[str]) -> List[PrerequisiteItem]: + def enhance_prerequisites(self, prereqs: list[str]) -> list[PrerequisiteItem]: """ Enhancement 3: Explain why prerequisites are needed. @@ -253,18 +251,14 @@ class GuideEnhancer: try: data = json.loads(response) return [ - PrerequisiteItem( - name=item.get('name', ''), - why=item.get('why', ''), - setup=item.get('setup', '') - ) - for item in data.get('prerequisites_detailed', []) + PrerequisiteItem(name=item.get("name", ""), why=item.get("why", ""), setup=item.get("setup", "")) + for item in data.get("prerequisites_detailed", []) ] except (json.JSONDecodeError, KeyError) as e: logger.warning(f"āš ļø Failed to parse prerequisites: {e}") return [] - def enhance_next_steps(self, guide_data: Dict) -> List[str]: + def enhance_next_steps(self, guide_data: dict) -> list[str]: """ Enhancement 4: Suggest related guides and variations. @@ -285,12 +279,12 @@ class GuideEnhancer: try: data = json.loads(response) - return data.get('next_steps', []) + return data.get("next_steps", []) except (json.JSONDecodeError, KeyError) as e: logger.warning(f"āš ļø Failed to parse next steps: {e}") return [] - def enhance_use_cases(self, guide_data: Dict) -> List[str]: + def enhance_use_cases(self, guide_data: dict) -> list[str]: """ Enhancement 5: Generate real-world scenario examples. @@ -311,14 +305,14 @@ class GuideEnhancer: try: data = json.loads(response) - return data.get('use_cases', []) + return data.get("use_cases", []) except (json.JSONDecodeError, KeyError) as e: logger.warning(f"āš ļø Failed to parse use cases: {e}") return [] # === AI Call Methods === - def _call_ai(self, prompt: str, max_tokens: int = 4000) -> Optional[str]: + def _call_ai(self, prompt: str, max_tokens: int = 4000) -> str | None: """ Call AI with the given prompt. @@ -335,7 +329,7 @@ class GuideEnhancer: return self._call_claude_local(prompt) return None - def _call_claude_api(self, prompt: str, max_tokens: int = 4000) -> Optional[str]: + def _call_claude_api(self, prompt: str, max_tokens: int = 4000) -> str | None: """ Call Claude API. @@ -351,16 +345,14 @@ class GuideEnhancer: try: response = self.client.messages.create( - model="claude-sonnet-4-20250514", - max_tokens=max_tokens, - messages=[{"role": "user", "content": prompt}] + model="claude-sonnet-4-20250514", max_tokens=max_tokens, messages=[{"role": "user", "content": prompt}] ) return response.content[0].text except Exception as e: logger.warning(f"āš ļø Claude API call failed: {e}") return None - def _call_claude_local(self, prompt: str) -> Optional[str]: + def _call_claude_local(self, prompt: str) -> str | None: """ Call Claude Code CLI. @@ -372,16 +364,16 @@ class GuideEnhancer: """ try: # Create temporary prompt file - with tempfile.NamedTemporaryFile(mode='w', suffix='.md', delete=False) as f: + with tempfile.NamedTemporaryFile(mode="w", suffix=".md", delete=False) as f: f.write(prompt) prompt_file = f.name # Run claude CLI result = subprocess.run( - ['claude', prompt_file], + ["claude", prompt_file], capture_output=True, text=True, - timeout=300 # 5 min timeout + timeout=300, # 5 min timeout ) # Clean up prompt file @@ -399,7 +391,7 @@ class GuideEnhancer: # === Prompt Creation Methods === - def _enhance_via_api(self, guide_data: Dict) -> Dict: + def _enhance_via_api(self, guide_data: dict) -> dict: """ Enhance guide via API mode. @@ -417,7 +409,7 @@ class GuideEnhancer: return self._parse_enhancement_response(response, guide_data) - def _enhance_via_local(self, guide_data: Dict) -> Dict: + def _enhance_via_local(self, guide_data: dict) -> dict: """ Enhance guide via LOCAL mode. @@ -435,7 +427,7 @@ class GuideEnhancer: return self._parse_enhancement_response(response, guide_data) - def _create_enhancement_prompt(self, guide_data: Dict) -> str: + def _create_enhancement_prompt(self, guide_data: dict) -> str: """ Create comprehensive enhancement prompt for all 5 enhancements. @@ -445,13 +437,13 @@ class GuideEnhancer: Returns: Complete prompt text """ - title = guide_data.get('title', 'Unknown Guide') - steps = guide_data.get('steps', []) - language = guide_data.get('language', 'python') - prerequisites = guide_data.get('prerequisites', []) + title = guide_data.get("title", "Unknown Guide") + steps = guide_data.get("steps", []) + language = guide_data.get("language", "python") + prerequisites = guide_data.get("prerequisites", []) steps_text = self._format_steps_for_prompt(steps) - prereqs_text = ', '.join(prerequisites) if prerequisites else 'None specified' + prereqs_text = ", ".join(prerequisites) if prerequisites else "None specified" prompt = f"""I need you to enhance this how-to guide with 5 improvements: @@ -528,7 +520,7 @@ IMPORTANT: Return ONLY valid JSON, no markdown code blocks or extra text. """ return prompt - def _create_step_description_prompt(self, steps: List[Dict]) -> str: + def _create_step_description_prompt(self, steps: list[dict]) -> str: """Create prompt for step descriptions only.""" steps_text = self._format_steps_for_prompt(steps) return f"""Generate natural language explanations for these code steps: @@ -546,11 +538,11 @@ Return JSON: IMPORTANT: Return ONLY valid JSON. """ - def _create_troubleshooting_prompt(self, guide_data: Dict) -> str: + def _create_troubleshooting_prompt(self, guide_data: dict) -> str: """Create prompt for troubleshooting items.""" - title = guide_data.get('title', 'Unknown') - language = guide_data.get('language', 'python') - steps = guide_data.get('steps', []) + title = guide_data.get("title", "Unknown") + language = guide_data.get("language", "python") + steps = guide_data.get("steps", []) steps_text = self._format_steps_for_prompt(steps) return f"""Generate troubleshooting guidance for this {language} workflow: @@ -575,9 +567,9 @@ Return JSON with 3-5 common errors: IMPORTANT: Return ONLY valid JSON. """ - def _create_prerequisites_prompt(self, prereqs: List[str]) -> str: + def _create_prerequisites_prompt(self, prereqs: list[str]) -> str: """Create prompt for prerequisites enhancement.""" - prereqs_text = ', '.join(prereqs) + prereqs_text = ", ".join(prereqs) return f"""Explain why these prerequisites are needed and how to install them: Prerequisites: {prereqs_text} @@ -593,9 +585,9 @@ Return JSON: IMPORTANT: Return ONLY valid JSON. """ - def _create_next_steps_prompt(self, guide_data: Dict) -> str: + def _create_next_steps_prompt(self, guide_data: dict) -> str: """Create prompt for next steps suggestions.""" - title = guide_data.get('title', 'Unknown') + title = guide_data.get("title", "Unknown") return f"""Suggest 3-5 related guides and learning paths after completing: {title} Return JSON: @@ -610,10 +602,10 @@ Return JSON: IMPORTANT: Return ONLY valid JSON. """ - def _create_use_cases_prompt(self, guide_data: Dict) -> str: + def _create_use_cases_prompt(self, guide_data: dict) -> str: """Create prompt for use case examples.""" - title = guide_data.get('title', 'Unknown') - description = guide_data.get('description', '') + title = guide_data.get("title", "Unknown") + description = guide_data.get("description", "") return f"""Generate 2-3 real-world use cases for this guide: @@ -632,23 +624,23 @@ Return JSON: IMPORTANT: Return ONLY valid JSON. """ - def _format_steps_for_prompt(self, steps: List[Dict]) -> str: + def _format_steps_for_prompt(self, steps: list[dict]) -> str: """Format steps for inclusion in prompts.""" if not steps: return "No steps provided" formatted = [] for i, step in enumerate(steps): - desc = step.get('description', '') - code = step.get('code', '') + desc = step.get("description", "") + code = step.get("code", "") if code: - formatted.append(f"Step {i+1}: {desc}\n```\n{code}\n```") + formatted.append(f"Step {i + 1}: {desc}\n```\n{code}\n```") else: - formatted.append(f"Step {i+1}: {desc}") + formatted.append(f"Step {i + 1}: {desc}") return "\n\n".join(formatted) - def _parse_enhancement_response(self, response: str, guide_data: Dict) -> Dict: + def _parse_enhancement_response(self, response: str, guide_data: dict) -> dict: """ Parse AI enhancement response. @@ -661,8 +653,8 @@ IMPORTANT: Return ONLY valid JSON. """ try: # Try to extract JSON from response (in case there's extra text) - json_start = response.find('{') - json_end = response.rfind('}') + 1 + json_start = response.find("{") + json_end = response.rfind("}") + 1 if json_start >= 0 and json_end > json_start: json_text = response[json_start:json_end] data = json.loads(json_text) @@ -673,46 +665,42 @@ IMPORTANT: Return ONLY valid JSON. enhanced = guide_data.copy() # Step descriptions - if 'step_descriptions' in data: - enhanced['step_enhancements'] = [ + if "step_descriptions" in data: + enhanced["step_enhancements"] = [ StepEnhancement( - step_index=item.get('step_index', i), - explanation=item.get('explanation', ''), - variations=item.get('variations', []) + step_index=item.get("step_index", i), + explanation=item.get("explanation", ""), + variations=item.get("variations", []), ) - for i, item in enumerate(data['step_descriptions']) + for i, item in enumerate(data["step_descriptions"]) ] # Troubleshooting - if 'troubleshooting' in data: - enhanced['troubleshooting_detailed'] = [ + if "troubleshooting" in data: + enhanced["troubleshooting_detailed"] = [ TroubleshootingItem( - problem=item.get('problem', ''), - symptoms=item.get('symptoms', []), - diagnostic_steps=item.get('diagnostic_steps', []), - solution=item.get('solution', '') + problem=item.get("problem", ""), + symptoms=item.get("symptoms", []), + diagnostic_steps=item.get("diagnostic_steps", []), + solution=item.get("solution", ""), ) - for item in data['troubleshooting'] + for item in data["troubleshooting"] ] # Prerequisites - if 'prerequisites_detailed' in data: - enhanced['prerequisites_detailed'] = [ - PrerequisiteItem( - name=item.get('name', ''), - why=item.get('why', ''), - setup=item.get('setup', '') - ) - for item in data['prerequisites_detailed'] + if "prerequisites_detailed" in data: + enhanced["prerequisites_detailed"] = [ + PrerequisiteItem(name=item.get("name", ""), why=item.get("why", ""), setup=item.get("setup", "")) + for item in data["prerequisites_detailed"] ] # Next steps - if 'next_steps' in data: - enhanced['next_steps_detailed'] = data['next_steps'] + if "next_steps" in data: + enhanced["next_steps_detailed"] = data["next_steps"] # Use cases - if 'use_cases' in data: - enhanced['use_cases'] = data['use_cases'] + if "use_cases" in data: + enhanced["use_cases"] = data["use_cases"] logger.info("āœ… Successfully enhanced guide with all 5 improvements") return enhanced diff --git a/src/skill_seekers/cli/how_to_guide_builder.py b/src/skill_seekers/cli/how_to_guide_builder.py index 7b952bf..2c8b3f5 100644 --- a/src/skill_seekers/cli/how_to_guide_builder.py +++ b/src/skill_seekers/cli/how_to_guide_builder.py @@ -30,15 +30,15 @@ Example workflow → guide transformation: """ import ast -import re +import hashlib import json import logging -import hashlib -from dataclasses import dataclass, field, asdict -from typing import List, Dict, Optional, Literal, Tuple, Set -from pathlib import Path +import re from collections import defaultdict +from dataclasses import asdict, dataclass, field from datetime import datetime +from pathlib import Path +from typing import Literal logger = logging.getLogger(__name__) @@ -47,9 +47,11 @@ logger = logging.getLogger(__name__) # DATA MODELS # ============================================================================ + @dataclass class PrerequisiteItem: """Enhanced prerequisite with explanation (AI enhancement)""" + name: str why: str # Why this is needed setup: str # How to install/configure @@ -58,87 +60,88 @@ class PrerequisiteItem: @dataclass class TroubleshootingItem: """Enhanced troubleshooting with solutions (AI enhancement)""" + problem: str - symptoms: List[str] = field(default_factory=list) # How to recognize this issue + symptoms: list[str] = field(default_factory=list) # How to recognize this issue solution: str = "" # Step-by-step fix - diagnostic_steps: List[str] = field(default_factory=list) # How to diagnose + diagnostic_steps: list[str] = field(default_factory=list) # How to diagnose @dataclass class WorkflowStep: """Single step in a workflow guide""" + step_number: int code: str description: str - expected_result: Optional[str] = None - verification: Optional[str] = None # Assertion or checkpoint - setup_required: Optional[str] = None - explanation: Optional[str] = None # Why this step matters - common_pitfall: Optional[str] = None # Warning for this step - common_variations: List[str] = field(default_factory=list) # AI: Alternative approaches + expected_result: str | None = None + verification: str | None = None # Assertion or checkpoint + setup_required: str | None = None + explanation: str | None = None # Why this step matters + common_pitfall: str | None = None # Warning for this step + common_variations: list[str] = field(default_factory=list) # AI: Alternative approaches @dataclass class HowToGuide: """Complete how-to guide generated from workflow(s)""" + guide_id: str title: str overview: str complexity_level: Literal["beginner", "intermediate", "advanced"] # Prerequisites - prerequisites: List[str] = field(default_factory=list) - required_imports: List[str] = field(default_factory=list) - required_fixtures: List[str] = field(default_factory=list) + prerequisites: list[str] = field(default_factory=list) + required_imports: list[str] = field(default_factory=list) + required_fixtures: list[str] = field(default_factory=list) # Content - workflows: List[Dict] = field(default_factory=list) # Source workflow examples - steps: List[WorkflowStep] = field(default_factory=list) + workflows: list[dict] = field(default_factory=list) # Source workflow examples + steps: list[WorkflowStep] = field(default_factory=list) # Metadata use_case: str = "" - tags: List[str] = field(default_factory=list) + tags: list[str] = field(default_factory=list) estimated_time: str = "10 minutes" - source_files: List[str] = field(default_factory=list) + source_files: list[str] = field(default_factory=list) # Optional AI enhancement (basic) - common_pitfalls: List[str] = field(default_factory=list) - troubleshooting: Dict[str, str] = field(default_factory=dict) - variations: List[str] = field(default_factory=list) - related_guides: List[str] = field(default_factory=list) + common_pitfalls: list[str] = field(default_factory=list) + troubleshooting: dict[str, str] = field(default_factory=dict) + variations: list[str] = field(default_factory=list) + related_guides: list[str] = field(default_factory=list) # AI enhancement (comprehensive - NEW) - prerequisites_detailed: List[PrerequisiteItem] = field(default_factory=list) - troubleshooting_detailed: List[TroubleshootingItem] = field(default_factory=list) - next_steps_detailed: List[str] = field(default_factory=list) - use_cases: List[str] = field(default_factory=list) + prerequisites_detailed: list[PrerequisiteItem] = field(default_factory=list) + troubleshooting_detailed: list[TroubleshootingItem] = field(default_factory=list) + next_steps_detailed: list[str] = field(default_factory=list) + use_cases: list[str] = field(default_factory=list) - def to_dict(self) -> Dict: + def to_dict(self) -> dict: """Convert to dictionary""" result = asdict(self) # Convert WorkflowStep objects to dicts - result['steps'] = [asdict(step) for step in self.steps] + result["steps"] = [asdict(step) for step in self.steps] return result @dataclass class GuideCollection: """Collection of guides organized by category""" - total_guides: int - guides_by_complexity: Dict[str, int] - guides_by_use_case: Dict[str, List[HowToGuide]] - guides: List[HowToGuide] - def to_dict(self) -> Dict: + total_guides: int + guides_by_complexity: dict[str, int] + guides_by_use_case: dict[str, list[HowToGuide]] + guides: list[HowToGuide] + + def to_dict(self) -> dict: """Convert to dictionary""" return { - 'total_guides': self.total_guides, - 'guides_by_complexity': self.guides_by_complexity, - 'guides_by_use_case': { - k: [g.to_dict() for g in v] - for k, v in self.guides_by_use_case.items() - }, - 'guides': [g.to_dict() for g in self.guides] + "total_guides": self.total_guides, + "guides_by_complexity": self.guides_by_complexity, + "guides_by_use_case": {k: [g.to_dict() for g in v] for k, v in self.guides_by_use_case.items()}, + "guides": [g.to_dict() for g in self.guides], } @@ -146,10 +149,11 @@ class GuideCollection: # WORKFLOW ANALYZER # ============================================================================ + class WorkflowAnalyzer: """Analyze workflow examples to extract steps and metadata""" - def analyze_workflow(self, workflow: Dict) -> Tuple[List[WorkflowStep], Dict]: + def analyze_workflow(self, workflow: dict) -> tuple[list[WorkflowStep], dict]: """ Deep analysis of workflow structure. @@ -159,11 +163,11 @@ class WorkflowAnalyzer: Returns: (steps, metadata) where metadata includes prerequisites, complexity, etc. """ - code = workflow.get('code', '') - language = workflow.get('language', 'python').lower() + code = workflow.get("code", "") + language = workflow.get("language", "python").lower() # Extract steps based on language - if language == 'python': + if language == "python": steps = self._extract_steps_python(code, workflow) else: steps = self._extract_steps_heuristic(code, workflow) @@ -180,12 +184,12 @@ class WorkflowAnalyzer: step.verification = verifications[i] # Calculate complexity - metadata['complexity_level'] = self._calculate_complexity(steps, workflow) - metadata['estimated_time'] = self._estimate_time(steps) + metadata["complexity_level"] = self._calculate_complexity(steps, workflow) + metadata["estimated_time"] = self._estimate_time(steps) return steps, metadata - def _extract_steps_python(self, code: str, workflow: Dict) -> List[WorkflowStep]: + def _extract_steps_python(self, code: str, workflow: dict) -> list[WorkflowStep]: """Extract steps from Python code using AST""" steps = [] @@ -218,12 +222,11 @@ class WorkflowAnalyzer: if idx + 1 < len(statements) and isinstance(statements[idx + 1], ast.Assert): verification = ast.get_source_segment(code, statements[idx + 1]) - steps.append(WorkflowStep( - step_number=step_num, - code=step_code, - description=description, - verification=verification - )) + steps.append( + WorkflowStep( + step_number=step_num, code=step_code, description=description, verification=verification + ) + ) step_num += 1 except SyntaxError: @@ -232,10 +235,10 @@ class WorkflowAnalyzer: return steps - def _extract_steps_heuristic(self, code: str, workflow: Dict) -> List[WorkflowStep]: + def _extract_steps_heuristic(self, code: str, workflow: dict) -> list[WorkflowStep]: """Extract steps using heuristics (for non-Python or invalid syntax)""" steps = [] - lines = code.split('\n') + lines = code.split("\n") current_step = [] step_num = 1 @@ -244,17 +247,13 @@ class WorkflowAnalyzer: line_stripped = line.strip() # Skip empty lines and comments - if not line_stripped or line_stripped.startswith('#'): + if not line_stripped or line_stripped.startswith("#"): if current_step: # End of current step - step_code = '\n'.join(current_step) + step_code = "\n".join(current_step) description = self._infer_description_from_code(step_code) - steps.append(WorkflowStep( - step_number=step_num, - code=step_code, - description=description - )) + steps.append(WorkflowStep(step_number=step_num, code=step_code, description=description)) step_num += 1 current_step = [] continue @@ -263,13 +262,9 @@ class WorkflowAnalyzer: # Add final step if current_step: - step_code = '\n'.join(current_step) + step_code = "\n".join(current_step) description = self._infer_description_from_code(step_code) - steps.append(WorkflowStep( - step_number=step_num, - code=step_code, - description=description - )) + steps.append(WorkflowStep(step_number=step_num, code=step_code, description=description)) return steps @@ -285,7 +280,7 @@ class WorkflowAnalyzer: func_name = self._get_name(node.value.func) return f"Call {func_name}()" - return code.split('\n')[0] # First line as fallback + return code.split("\n")[0] # First line as fallback def _describe_value(self, node: ast.AST) -> str: """Describe AST value node""" @@ -313,71 +308,67 @@ class WorkflowAnalyzer: code = code.strip() # Method call patterns - if '(' in code and ')' in code: - match = re.search(r'(\w+)\s*\(', code) + if "(" in code and ")" in code: + match = re.search(r"(\w+)\s*\(", code) if match: return f"Call {match.group(1)}()" # Assignment patterns - if '=' in code and not code.startswith('assert'): - parts = code.split('=', 1) + if "=" in code and not code.startswith("assert"): + parts = code.split("=", 1) var_name = parts[0].strip() return f"Create {var_name}" # Assertion patterns - if code.startswith('assert'): + if code.startswith("assert"): return "Verify result" - return code.split('\n')[0] # First line + return code.split("\n")[0] # First line - def _detect_prerequisites(self, workflow: Dict) -> Dict: + def _detect_prerequisites(self, workflow: dict) -> dict: """Detect prerequisites from workflow""" - metadata = { - 'prerequisites': [], - 'required_imports': [], - 'required_fixtures': [] - } + metadata = {"prerequisites": [], "required_imports": [], "required_fixtures": []} # Get dependencies from workflow - dependencies = workflow.get('dependencies', []) - metadata['required_imports'] = dependencies + dependencies = workflow.get("dependencies", []) + metadata["required_imports"] = dependencies # Get setup code - setup_code = workflow.get('setup_code') + setup_code = workflow.get("setup_code") if setup_code: - metadata['prerequisites'].append("Setup code must be executed first") + metadata["prerequisites"].append("Setup code must be executed first") # Check for common fixtures in test name or setup - test_name = workflow.get('test_name', '').lower() - if 'database' in test_name or (setup_code and 'database' in setup_code.lower()): - metadata['required_fixtures'].append('database') - if 'api' in test_name or (setup_code and 'api' in setup_code.lower()): - metadata['required_fixtures'].append('api_client') + test_name = workflow.get("test_name", "").lower() + if "database" in test_name or (setup_code and "database" in setup_code.lower()): + metadata["required_fixtures"].append("database") + if "api" in test_name or (setup_code and "api" in setup_code.lower()): + metadata["required_fixtures"].append("api_client") return metadata - def _find_verification_points(self, code: str) -> List[str]: + def _find_verification_points(self, code: str) -> list[str]: """Find assertion statements in code""" verifications = [] - for line in code.split('\n'): + for line in code.split("\n"): line_stripped = line.strip() - if line_stripped.startswith('assert'): + if line_stripped.startswith("assert"): verifications.append(line_stripped) return verifications - def _calculate_complexity(self, steps: List[WorkflowStep], workflow: Dict) -> str: + def _calculate_complexity(self, steps: list[WorkflowStep], workflow: dict) -> str: """Calculate complexity level""" num_steps = len(steps) # Check for advanced patterns - code = workflow.get('code', '') - has_async = 'async' in code or 'await' in code - has_mock = 'mock' in code.lower() or 'patch' in code.lower() - has_error_handling = 'try' in code or 'except' in code + code = workflow.get("code", "") + has_async = "async" in code or "await" in code + has_mock = "mock" in code.lower() or "patch" in code.lower() + has_error_handling = "try" in code or "except" in code - complexity_score = workflow.get('complexity_score', 0.5) + complexity_score = workflow.get("complexity_score", 0.5) # Determine level if num_steps <= 3 and not has_async and not has_mock: @@ -387,7 +378,7 @@ class WorkflowAnalyzer: else: return "intermediate" - def _estimate_time(self, steps: List[WorkflowStep]) -> str: + def _estimate_time(self, steps: list[WorkflowStep]) -> str: """Estimate time to complete guide""" num_steps = len(steps) @@ -405,14 +396,11 @@ class WorkflowAnalyzer: # WORKFLOW GROUPER # ============================================================================ + class WorkflowGrouper: """Group related workflows into coherent guides""" - def group_workflows( - self, - workflows: List[Dict], - strategy: str = "ai-tutorial-group" - ) -> Dict[str, List[Dict]]: + def group_workflows(self, workflows: list[dict], strategy: str = "ai-tutorial-group") -> dict[str, list[dict]]: """ Group workflows using specified strategy. @@ -439,14 +427,14 @@ class WorkflowGrouper: groups = self._group_by_file_path(workflows) return groups - def _group_by_ai_tutorial_group(self, workflows: List[Dict]) -> Dict[str, List[Dict]]: + def _group_by_ai_tutorial_group(self, workflows: list[dict]) -> dict[str, list[dict]]: """Group by AI-generated tutorial_group (from C3.6 enhancement)""" groups = defaultdict(list) ungrouped = [] for workflow in workflows: - ai_analysis = workflow.get('ai_analysis', {}) - tutorial_group = ai_analysis.get('tutorial_group') + ai_analysis = workflow.get("ai_analysis", {}) + tutorial_group = ai_analysis.get("tutorial_group") if tutorial_group: groups[tutorial_group].append(workflow) @@ -455,56 +443,52 @@ class WorkflowGrouper: # Put ungrouped workflows in individual guides for workflow in ungrouped: - test_name = workflow.get('test_name', 'Unknown') + test_name = workflow.get("test_name", "Unknown") # Clean test name for title title = self._clean_test_name(test_name) groups[title] = [workflow] return dict(groups) - def _group_by_file_path(self, workflows: List[Dict]) -> Dict[str, List[Dict]]: + def _group_by_file_path(self, workflows: list[dict]) -> dict[str, list[dict]]: """Group workflows from same test file""" groups = defaultdict(list) for workflow in workflows: - file_path = workflow.get('file_path', '') + file_path = workflow.get("file_path", "") # Extract meaningful name from file path - file_name = Path(file_path).stem if file_path else 'Unknown' + file_name = Path(file_path).stem if file_path else "Unknown" # Remove test_ prefix - group_name = file_name.replace('test_', '').replace('_', ' ').title() + group_name = file_name.replace("test_", "").replace("_", " ").title() groups[group_name].append(workflow) return dict(groups) - def _group_by_test_name(self, workflows: List[Dict]) -> Dict[str, List[Dict]]: + def _group_by_test_name(self, workflows: list[dict]) -> dict[str, list[dict]]: """Group by common test name prefixes""" groups = defaultdict(list) for workflow in workflows: - test_name = workflow.get('test_name', '') + test_name = workflow.get("test_name", "") # Extract prefix (e.g., test_auth_login → auth) prefix = self._extract_prefix(test_name) groups[prefix].append(workflow) return dict(groups) - def _group_by_complexity(self, workflows: List[Dict]) -> Dict[str, List[Dict]]: + def _group_by_complexity(self, workflows: list[dict]) -> dict[str, list[dict]]: """Group by complexity level""" - groups = { - 'Beginner': [], - 'Intermediate': [], - 'Advanced': [] - } + groups = {"Beginner": [], "Intermediate": [], "Advanced": []} for workflow in workflows: - complexity_score = workflow.get('complexity_score', 0.5) + complexity_score = workflow.get("complexity_score", 0.5) if complexity_score < 0.4: - groups['Beginner'].append(workflow) + groups["Beginner"].append(workflow) elif complexity_score < 0.7: - groups['Intermediate'].append(workflow) + groups["Intermediate"].append(workflow) else: - groups['Advanced'].append(workflow) + groups["Advanced"].append(workflow) # Remove empty groups return {k: v for k, v in groups.items() if v} @@ -512,18 +496,18 @@ class WorkflowGrouper: def _clean_test_name(self, test_name: str) -> str: """Clean test name to readable title""" # Remove test_ prefix - name = test_name.replace('test_', '') + name = test_name.replace("test_", "") # Replace underscores with spaces - name = name.replace('_', ' ') + name = name.replace("_", " ") # Title case return name.title() def _extract_prefix(self, test_name: str) -> str: """Extract prefix from test name""" # Remove test_ prefix - name = test_name.replace('test_', '') + name = test_name.replace("test_", "") # Get first part before underscore - parts = name.split('_') + parts = name.split("_") if len(parts) > 1: return parts[0].title() return self._clean_test_name(test_name) @@ -533,6 +517,7 @@ class WorkflowGrouper: # GUIDE GENERATOR # ============================================================================ + class GuideGenerator: """Generate markdown guides from workflow data""" @@ -574,7 +559,7 @@ class GuideGenerator: # Footer sections.append(self._create_footer(guide)) - return '\n\n'.join(sections) + return "\n\n".join(sections) def _create_header(self, guide: HowToGuide) -> str: """Create guide header with metadata""" @@ -586,7 +571,7 @@ class GuideGenerator: if guide.tags: lines.append(f"**Tags**: {', '.join(guide.tags)}") - return '\n'.join(lines) + return "\n".join(lines) def _create_overview(self, guide: HowToGuide) -> str: """Create overview section""" @@ -618,16 +603,16 @@ class GuideGenerator: lines.append("") # Setup code if available - if guide.workflows and guide.workflows[0].get('setup_code'): - setup_code = guide.workflows[0]['setup_code'] + if guide.workflows and guide.workflows[0].get("setup_code"): + setup_code = guide.workflows[0]["setup_code"] lines.append("**Setup Required:**") lines.append("```python") lines.append(setup_code) lines.append("```") - return '\n'.join(lines) + return "\n".join(lines) - def _create_steps_section(self, steps: List[WorkflowStep]) -> str: + def _create_steps_section(self, steps: list[WorkflowStep]) -> str: """Create step-by-step guide section""" lines = ["## Step-by-Step Guide"] lines.append("") @@ -654,7 +639,7 @@ class GuideGenerator: # Verification checkpoint if step.verification: - lines.append(f"**Verification:**") + lines.append("**Verification:**") lines.append("```python") lines.append(step.verification) lines.append("```") @@ -665,7 +650,7 @@ class GuideGenerator: lines.append(f"āš ļø **Common Pitfall:** {step.common_pitfall}") lines.append("") - return '\n'.join(lines) + return "\n".join(lines) def _create_complete_example(self, guide: HowToGuide) -> str: """Create complete working example""" @@ -678,14 +663,14 @@ class GuideGenerator: workflow = guide.workflows[0] # Add setup code if present - if workflow.get('setup_code'): + if workflow.get("setup_code"): lines.append("# Setup") - lines.append(workflow['setup_code']) + lines.append(workflow["setup_code"]) lines.append("") # Add main workflow code lines.append("# Workflow") - lines.append(workflow.get('code', '')) + lines.append(workflow.get("code", "")) else: # Combine all steps for step in guide.steps: @@ -696,7 +681,7 @@ class GuideGenerator: lines.append("") lines.append("```") - return '\n'.join(lines) + return "\n".join(lines) def _create_troubleshooting(self, guide: HowToGuide) -> str: """Create troubleshooting section""" @@ -719,7 +704,7 @@ class GuideGenerator: lines.append(f"**Solution:** {solution}") lines.append("") - return '\n'.join(lines) + return "\n".join(lines) def _create_next_steps(self, guide: HowToGuide) -> str: """Create next steps and related guides""" @@ -741,7 +726,7 @@ class GuideGenerator: lines.append(f"- [{related}]") lines.append("") - return '\n'.join(lines) + return "\n".join(lines) def _create_footer(self, guide: HowToGuide) -> str: """Create guide footer with metadata""" @@ -753,7 +738,7 @@ class GuideGenerator: return f"---\n\n*{' | '.join(source_info)}*" - def generate_index(self, guides: List[HowToGuide]) -> str: + def generate_index(self, guides: list[HowToGuide]) -> str: """ Generate index/TOC markdown. @@ -783,8 +768,10 @@ class GuideGenerator: lines.append(f"### {use_case} ({len(case_guides)} guides)") for guide in sorted(case_guides, key=lambda g: g.complexity_level): # Create filename from guide title - filename = guide.title.lower().replace(' ', '-').replace(':', '') - lines.append(f"- [How To: {guide.title}]({use_case.lower()}/{filename}.md) - {guide.complexity_level.title()}") + filename = guide.title.lower().replace(" ", "-").replace(":", "") + lines.append( + f"- [How To: {guide.title}]({use_case.lower()}/{filename}.md) - {guide.complexity_level.title()}" + ) lines.append("") # Group by difficulty @@ -795,7 +782,7 @@ class GuideGenerator: lines.append("## By Difficulty Level") lines.append("") - for level in ['beginner', 'intermediate', 'advanced']: + for level in ["beginner", "intermediate", "advanced"]: if level in by_complexity: level_guides = by_complexity[level] lines.append(f"### {level.title()} ({len(level_guides)} guides)") @@ -803,13 +790,14 @@ class GuideGenerator: lines.append(f"- {guide.title}") lines.append("") - return '\n'.join(lines) + return "\n".join(lines) # ============================================================================ # HOW-TO GUIDE BUILDER (Main Orchestrator) # ============================================================================ + class HowToGuideBuilder: """Main orchestrator for building how-to guides from workflow examples""" @@ -827,11 +815,11 @@ class HowToGuideBuilder: def build_guides_from_examples( self, - examples: List[Dict], + examples: list[dict], grouping_strategy: str = "ai-tutorial-group", - output_dir: Optional[Path] = None, + output_dir: Path | None = None, enhance_with_ai: bool = True, - ai_mode: str = "auto" + ai_mode: str = "auto", ) -> GuideCollection: """ Main entry point - build guides from workflow examples. @@ -853,6 +841,7 @@ class HowToGuideBuilder: if enhance_with_ai and ai_mode != "none": try: from .guide_enhancer import GuideEnhancer + enhancer = GuideEnhancer(mode=ai_mode) logger.info(f"✨ AI enhancement enabled (mode: {enhancer.mode})") except Exception as e: @@ -865,12 +854,7 @@ class HowToGuideBuilder: if not workflows: logger.warning("No workflow examples found!") - return GuideCollection( - total_guides=0, - guides_by_complexity={}, - guides_by_use_case={}, - guides=[] - ) + return GuideCollection(total_guides=0, guides_by_complexity={}, guides_by_use_case={}, guides=[]) # Group workflows grouped_workflows = self.grouper.group_workflows(workflows, grouping_strategy) @@ -892,11 +876,11 @@ class HowToGuideBuilder: logger.info(f"āœ… Generated {len(guides)} how-to guides") return collection - def _extract_workflow_examples(self, examples: List[Dict]) -> List[Dict]: + def _extract_workflow_examples(self, examples: list[dict]) -> list[dict]: """Filter to workflow category only""" - return [ex for ex in examples if ex.get('category') == 'workflow'] + return [ex for ex in examples if ex.get("category") == "workflow"] - def _create_guide(self, title: str, workflows: List[Dict], enhancer=None) -> HowToGuide: + def _create_guide(self, title: str, workflows: list[dict], enhancer=None) -> HowToGuide: """ Generate single guide from workflow(s). @@ -919,17 +903,17 @@ class HowToGuideBuilder: # Extract use case from AI analysis or title use_case = title - if primary_workflow.get('ai_analysis'): - use_case = primary_workflow['ai_analysis'].get('tutorial_group', title) + if primary_workflow.get("ai_analysis"): + use_case = primary_workflow["ai_analysis"].get("tutorial_group", title) # Determine overview overview = self._generate_overview(primary_workflow, workflows) # Extract tags - tags = primary_workflow.get('tags', []) + tags = primary_workflow.get("tags", []) # Extract source files - source_files = [w.get('file_path', '') for w in workflows] + source_files = [w.get("file_path", "") for w in workflows] source_files = [f"{Path(f).name}:{w.get('line_start', 0)}" for f, w in zip(source_files, workflows)] # Create guide @@ -937,44 +921,44 @@ class HowToGuideBuilder: guide_id=guide_id, title=title, overview=overview, - complexity_level=metadata.get('complexity_level', 'intermediate'), - prerequisites=metadata.get('prerequisites', []), - required_imports=metadata.get('required_imports', []), - required_fixtures=metadata.get('required_fixtures', []), + complexity_level=metadata.get("complexity_level", "intermediate"), + prerequisites=metadata.get("prerequisites", []), + required_imports=metadata.get("required_imports", []), + required_fixtures=metadata.get("required_fixtures", []), workflows=workflows, steps=steps, use_case=use_case, tags=tags, - estimated_time=metadata.get('estimated_time', '10 minutes'), - source_files=source_files + estimated_time=metadata.get("estimated_time", "10 minutes"), + source_files=source_files, ) # Add AI enhancements if enhancer is available if enhancer: - self._enhance_guide_with_ai(guide, primary_workflow.get('ai_analysis', {}), enhancer) - elif self.enhance_with_ai and primary_workflow.get('ai_analysis'): + self._enhance_guide_with_ai(guide, primary_workflow.get("ai_analysis", {}), enhancer) + elif self.enhance_with_ai and primary_workflow.get("ai_analysis"): # Fallback to old enhancement method (basic) - self._enhance_guide_with_ai_basic(guide, primary_workflow['ai_analysis']) + self._enhance_guide_with_ai_basic(guide, primary_workflow["ai_analysis"]) return guide - def _generate_overview(self, primary_workflow: Dict, all_workflows: List[Dict]) -> str: + def _generate_overview(self, primary_workflow: dict, all_workflows: list[dict]) -> str: """Generate guide overview""" # Try to get explanation from AI analysis - if primary_workflow.get('ai_analysis'): - explanation = primary_workflow['ai_analysis'].get('explanation') + if primary_workflow.get("ai_analysis"): + explanation = primary_workflow["ai_analysis"].get("explanation") if explanation: return explanation # Fallback to description - description = primary_workflow.get('description', '') + description = primary_workflow.get("description", "") if description: return description # Final fallback return f"Learn how to use {primary_workflow.get('test_name', 'this feature')} in your code." - def _enhance_guide_with_ai(self, guide: HowToGuide, ai_analysis: Dict, enhancer): + def _enhance_guide_with_ai(self, guide: HowToGuide, ai_analysis: dict, enhancer): """ Comprehensively enhance guide with AI using GuideEnhancer. @@ -991,49 +975,43 @@ class HowToGuideBuilder: """ # Prepare guide data for enhancer guide_data = { - 'title': guide.title, - 'steps': [ - { - 'description': step.description, - 'code': step.code - } - for step in guide.steps - ], - 'language': 'python', # TODO: Detect from code - 'prerequisites': guide.prerequisites, - 'description': guide.overview + "title": guide.title, + "steps": [{"description": step.description, "code": step.code} for step in guide.steps], + "language": "python", # TODO: Detect from code + "prerequisites": guide.prerequisites, + "description": guide.overview, } # Call enhancer to get all 5 enhancements enhanced_data = enhancer.enhance_guide(guide_data) # Apply step enhancements - if 'step_enhancements' in enhanced_data: - for enhancement in enhanced_data['step_enhancements']: + if "step_enhancements" in enhanced_data: + for enhancement in enhanced_data["step_enhancements"]: idx = enhancement.step_index if 0 <= idx < len(guide.steps): guide.steps[idx].explanation = enhancement.explanation guide.steps[idx].common_variations = enhancement.variations # Apply detailed prerequisites - if 'prerequisites_detailed' in enhanced_data: - guide.prerequisites_detailed = enhanced_data['prerequisites_detailed'] + if "prerequisites_detailed" in enhanced_data: + guide.prerequisites_detailed = enhanced_data["prerequisites_detailed"] # Apply troubleshooting - if 'troubleshooting_detailed' in enhanced_data: - guide.troubleshooting_detailed = enhanced_data['troubleshooting_detailed'] + if "troubleshooting_detailed" in enhanced_data: + guide.troubleshooting_detailed = enhanced_data["troubleshooting_detailed"] # Apply next steps - if 'next_steps_detailed' in enhanced_data: - guide.next_steps_detailed = enhanced_data['next_steps_detailed'] + if "next_steps_detailed" in enhanced_data: + guide.next_steps_detailed = enhanced_data["next_steps_detailed"] # Apply use cases - if 'use_cases' in enhanced_data: - guide.use_cases = enhanced_data['use_cases'] + if "use_cases" in enhanced_data: + guide.use_cases = enhanced_data["use_cases"] logger.info(f"✨ Enhanced guide '{guide.title}' with comprehensive AI improvements") - def _enhance_guide_with_ai_basic(self, guide: HowToGuide, ai_analysis: Dict): + def _enhance_guide_with_ai_basic(self, guide: HowToGuide, ai_analysis: dict): """ Basic enhancement using pre-computed AI analysis from C3.6. @@ -1044,15 +1022,15 @@ class HowToGuideBuilder: ai_analysis: AI analysis data from C3.6 """ # Add best practices as variations - best_practices = ai_analysis.get('best_practices', []) + best_practices = ai_analysis.get("best_practices", []) guide.variations = best_practices # Add common mistakes as pitfalls - common_mistakes = ai_analysis.get('common_mistakes', []) + common_mistakes = ai_analysis.get("common_mistakes", []) guide.common_pitfalls = common_mistakes # Add related examples as related guides - related_examples = ai_analysis.get('related_examples', []) + related_examples = ai_analysis.get("related_examples", []) guide.related_guides = [f"How To: {ex}" for ex in related_examples] # Enhance step explanations @@ -1061,7 +1039,7 @@ class HowToGuideBuilder: if best_practices and step.step_number <= len(best_practices): step.explanation = best_practices[step.step_number - 1] - def _create_collection(self, guides: List[HowToGuide]) -> GuideCollection: + def _create_collection(self, guides: list[HowToGuide]) -> GuideCollection: """Create GuideCollection from guides""" # Count by complexity by_complexity = defaultdict(int) @@ -1078,7 +1056,7 @@ class HowToGuideBuilder: total_guides=len(guides), guides_by_complexity=dict(by_complexity), guides_by_use_case=dict(by_use_case), - guides=guides + guides=guides, ) def _save_guides_to_files(self, collection: GuideCollection, output_dir: Path): @@ -1091,21 +1069,21 @@ class HowToGuideBuilder: # Save individual guides for use_case, guides in collection.guides_by_use_case.items(): # Create use case directory - use_case_dir = output_dir / use_case.lower().replace(' ', '-') + use_case_dir = output_dir / use_case.lower().replace(" ", "-") use_case_dir.mkdir(parents=True, exist_ok=True) for guide in guides: # Generate filename from title - filename = guide.title.lower().replace(' ', '-').replace(':', '') + '.md' + filename = guide.title.lower().replace(" ", "-").replace(":", "") + ".md" file_path = use_case_dir / filename # Generate and save markdown markdown = self.generator.generate_guide_markdown(guide) - file_path.write_text(markdown, encoding='utf-8') + file_path.write_text(markdown, encoding="utf-8") # Save index index_markdown = self.generator.generate_index(collection.guides) - (output_dir / 'index.md').write_text(index_markdown, encoding='utf-8') + (output_dir / "index.md").write_text(index_markdown, encoding="utf-8") logger.info(f"āœ… Saved {collection.total_guides} guides + index to {output_dir}") @@ -1114,6 +1092,7 @@ class HowToGuideBuilder: # CLI INTERFACE # ============================================================================ + def main(): """CLI entry point for how-to guide builder""" import argparse @@ -1144,45 +1123,29 @@ Grouping Strategies: - file-path: Group by source test file - test-name: Group by test name patterns - complexity: Group by difficulty level -""" +""", + ) + + parser.add_argument("input", nargs="?", help="Input: directory with test files OR test_examples.json file") + + parser.add_argument("--input", dest="input_file", help="Input JSON file with test examples (from C3.2)") + + parser.add_argument( + "--output", + default="output/codebase/tutorials", + help="Output directory for generated guides (default: output/codebase/tutorials)", ) parser.add_argument( - 'input', - nargs='?', - help='Input: directory with test files OR test_examples.json file' + "--group-by", + choices=["ai-tutorial-group", "file-path", "test-name", "complexity"], + default="ai-tutorial-group", + help="Grouping strategy (default: ai-tutorial-group)", ) - parser.add_argument( - '--input', - dest='input_file', - help='Input JSON file with test examples (from C3.2)' - ) + parser.add_argument("--no-ai", action="store_true", help="Disable AI enhancement") - parser.add_argument( - '--output', - default='output/codebase/tutorials', - help='Output directory for generated guides (default: output/codebase/tutorials)' - ) - - parser.add_argument( - '--group-by', - choices=['ai-tutorial-group', 'file-path', 'test-name', 'complexity'], - default='ai-tutorial-group', - help='Grouping strategy (default: ai-tutorial-group)' - ) - - parser.add_argument( - '--no-ai', - action='store_true', - help='Disable AI enhancement' - ) - - parser.add_argument( - '--json-output', - action='store_true', - help='Output JSON summary instead of markdown files' - ) + parser.add_argument("--json-output", action="store_true", help="Output JSON summary instead of markdown files") args = parser.parse_args() @@ -1200,13 +1163,13 @@ Grouping Strategies: # Load examples examples = [] - if input_path.is_file() and input_path.suffix == '.json': + if input_path.is_file() and input_path.suffix == ".json": # Load from JSON file logger.info(f"Loading examples from {input_path}...") - with open(input_path, 'r') as f: + with open(input_path) as f: data = json.load(f) - if isinstance(data, dict) and 'examples' in data: - examples = data['examples'] + if isinstance(data, dict) and "examples" in data: + examples = data["examples"] elif isinstance(data, list): examples = data else: @@ -1228,11 +1191,7 @@ Grouping Strategies: builder = HowToGuideBuilder(enhance_with_ai=not args.no_ai) output_dir = Path(args.output) if not args.json_output else None - collection = builder.build_guides_from_examples( - examples, - grouping_strategy=args.group_by, - output_dir=output_dir - ) + collection = builder.build_guides_from_examples(examples, grouping_strategy=args.group_by, output_dir=output_dir) # Output results if args.json_output: @@ -1241,9 +1200,9 @@ Grouping Strategies: else: # Summary print() - print("="*60) + print("=" * 60) print("HOW-TO GUIDES GENERATED") - print("="*60) + print("=" * 60) print() print(f"Total Guides: {collection.total_guides}") print() diff --git a/src/skill_seekers/cli/install_agent.py b/src/skill_seekers/cli/install_agent.py index 1a93a54..9fda18d 100644 --- a/src/skill_seekers/cli/install_agent.py +++ b/src/skill_seekers/cli/install_agent.py @@ -26,30 +26,28 @@ Examples: import argparse import shutil import sys -from pathlib import Path -from typing import Dict, Optional, Tuple, Union from difflib import get_close_matches - +from pathlib import Path # Agent installation paths # Global paths (install to home directory): Use ~/.{agent}/skills/ # Project paths (install to current directory): Use .{agent}/skills/ AGENT_PATHS = { - 'claude': '~/.claude/skills/', # Global (home) - 'cursor': '.cursor/skills/', # Project-relative - 'vscode': '.github/skills/', # Project-relative - 'copilot': '.github/skills/', # Same as VSCode - 'amp': '~/.amp/skills/', # Global - 'goose': '~/.config/goose/skills/', # Global - 'opencode': '~/.opencode/skills/', # Global - 'letta': '~/.letta/skills/', # Global - 'aide': '~/.aide/skills/', # Global - 'windsurf': '~/.windsurf/skills/', # Global - 'neovate': '~/.neovate/skills/', # Global + "claude": "~/.claude/skills/", # Global (home) + "cursor": ".cursor/skills/", # Project-relative + "vscode": ".github/skills/", # Project-relative + "copilot": ".github/skills/", # Same as VSCode + "amp": "~/.amp/skills/", # Global + "goose": "~/.config/goose/skills/", # Global + "opencode": "~/.opencode/skills/", # Global + "letta": "~/.letta/skills/", # Global + "aide": "~/.aide/skills/", # Global + "windsurf": "~/.windsurf/skills/", # Global + "neovate": "~/.neovate/skills/", # Global } -def get_agent_path(agent_name: str, project_root: Optional[Path] = None) -> Path: +def get_agent_path(agent_name: str, project_root: Path | None = None) -> Path: """ Resolve the installation path for a given agent. @@ -75,7 +73,7 @@ def get_agent_path(agent_name: str, project_root: Optional[Path] = None) -> Path path_template = AGENT_PATHS[agent_name] # Handle home directory expansion (~) - if path_template.startswith('~'): + if path_template.startswith("~"): return Path(path_template).expanduser() # Handle project-relative paths @@ -95,7 +93,7 @@ def get_available_agents() -> list: return sorted(AGENT_PATHS.keys()) -def validate_agent_name(agent_name: str) -> Tuple[bool, Optional[str]]: +def validate_agent_name(agent_name: str) -> tuple[bool, str | None]: """ Validate an agent name and provide suggestions if invalid. @@ -111,7 +109,7 @@ def validate_agent_name(agent_name: str) -> Tuple[bool, Optional[str]]: - error_message: None if valid, error message with suggestions if invalid """ # Special case: 'all' is valid for installing to all agents - if agent_name.lower() == 'all': + if agent_name.lower() == "all": return True, None # Case-insensitive check @@ -130,13 +128,13 @@ def validate_agent_name(agent_name: str) -> Tuple[bool, Optional[str]]: error_msg += f"Did you mean: {suggestions[0]}?\n\n" error_msg += "Available agents:\n " - error_msg += ", ".join(available + ['all']) + error_msg += ", ".join(available + ["all"]) error_msg += f"\n\nUsage:\n skill-seekers install-agent --agent {suggestions[0] if suggestions else 'claude'}" return False, error_msg -def validate_skill_directory(skill_dir: Path) -> Tuple[bool, Optional[str]]: +def validate_skill_directory(skill_dir: Path) -> tuple[bool, str | None]: """ Validate that a directory is a valid skill directory. @@ -165,11 +163,8 @@ def validate_skill_directory(skill_dir: Path) -> Tuple[bool, Optional[str]]: def install_to_agent( - skill_dir: Union[str, Path], - agent_name: str, - force: bool = False, - dry_run: bool = False -) -> Tuple[bool, str]: + skill_dir: str | Path, agent_name: str, force: bool = False, dry_run: bool = False +) -> tuple[bool, str]: """ Install a skill to a specific agent's directory. @@ -212,7 +207,7 @@ def install_to_agent( # Check if already exists if target_path.exists() and not force: - error_msg = f"āŒ Skill already installed\n\n" + error_msg = "āŒ Skill already installed\n\n" error_msg += f"Location: {target_path}\n\n" error_msg += "Options:\n" error_msg += f" 1. Overwrite: skill-seekers install-agent {skill_dir} --agent {agent_name} --force\n" @@ -222,34 +217,34 @@ def install_to_agent( # Dry run mode - just preview if dry_run: - msg = f"šŸ” DRY RUN - No changes will be made\n\n" + msg = "šŸ” DRY RUN - No changes will be made\n\n" msg += f"Would install skill: {skill_name}\n" msg += f" Source: {skill_dir}\n" msg += f" Target: {target_path}\n\n" # Calculate total size - total_size = sum(f.stat().st_size for f in skill_dir.rglob('*') if f.is_file()) + total_size = sum(f.stat().st_size for f in skill_dir.rglob("*") if f.is_file()) - msg += f"Files to copy:\n" + msg += "Files to copy:\n" msg += f" SKILL.md ({(skill_dir / 'SKILL.md').stat().st_size / 1024:.1f} KB)\n" - references_dir = skill_dir / 'references' + references_dir = skill_dir / "references" if references_dir.exists(): - ref_files = list(references_dir.rglob('*.md')) + ref_files = list(references_dir.rglob("*.md")) ref_size = sum(f.stat().st_size for f in ref_files) msg += f" references/ ({len(ref_files)} files, {ref_size / 1024:.1f} KB)\n" - for subdir in ['scripts', 'assets']: + for subdir in ["scripts", "assets"]: subdir_path = skill_dir / subdir if subdir_path.exists(): - files = list(subdir_path.rglob('*')) + files = list(subdir_path.rglob("*")) if files: msg += f" {subdir}/ ({len(files)} files)\n" else: msg += f" {subdir}/ (empty)\n" msg += f"\nTotal size: {total_size / 1024:.1f} KB\n\n" - msg += f"To actually install, run:\n" + msg += "To actually install, run:\n" msg += f" skill-seekers install-agent {skill_dir} --agent {agent_name}" return True, msg @@ -258,7 +253,10 @@ def install_to_agent( try: agent_base_path.mkdir(parents=True, exist_ok=True) except PermissionError: - return False, f"āŒ Permission denied: {agent_base_path}\n\nTry: sudo mkdir -p {agent_base_path} && sudo chown -R $USER {agent_base_path}" + return ( + False, + f"āŒ Permission denied: {agent_base_path}\n\nTry: sudo mkdir -p {agent_base_path} && sudo chown -R $USER {agent_base_path}", + ) # Copy skill directory def ignore_files(directory, files): @@ -266,16 +264,13 @@ def install_to_agent( ignored = [] for f in files: # Exclude backup files - if f.endswith('.backup'): - ignored.append(f) - # Exclude Python cache - elif f == '__pycache__': - ignored.append(f) - # Exclude macOS metadata - elif f == '.DS_Store': - ignored.append(f) - # Exclude hidden files (except .github for vscode) - elif f.startswith('.') and f not in ['.github', '.cursor']: + if ( + f.endswith(".backup") + or f == "__pycache__" + or f == ".DS_Store" + or f.startswith(".") + and f not in [".github", ".cursor"] + ): ignored.append(f) return ignored @@ -288,16 +283,16 @@ def install_to_agent( shutil.copytree(skill_dir, target_path, ignore=ignore_files) # Success message - msg = f"āœ… Installation complete!\n\n" + msg = "āœ… Installation complete!\n\n" msg += f"Skill '{skill_name}' installed to {agent_name}\n" msg += f"Location: {target_path}\n\n" # Agent-specific restart instructions - if agent_name.lower() == 'claude': + if agent_name.lower() == "claude": msg += "Restart Claude Code to load the new skill." - elif agent_name.lower() == 'cursor': + elif agent_name.lower() == "cursor": msg += "Restart Cursor to load the new skill." - elif agent_name.lower() in ['vscode', 'copilot']: + elif agent_name.lower() in ["vscode", "copilot"]: msg += "Restart VS Code to load the new skill." else: msg += f"Restart {agent_name.capitalize()} to load the new skill." @@ -305,16 +300,17 @@ def install_to_agent( return True, msg except PermissionError as e: - return False, f"āŒ Permission denied: {e}\n\nTry: sudo mkdir -p {agent_base_path} && sudo chown -R $USER {agent_base_path}" + return ( + False, + f"āŒ Permission denied: {e}\n\nTry: sudo mkdir -p {agent_base_path} && sudo chown -R $USER {agent_base_path}", + ) except Exception as e: return False, f"āŒ Installation failed: {e}" def install_to_all_agents( - skill_dir: Union[str, Path], - force: bool = False, - dry_run: bool = False -) -> Dict[str, Tuple[bool, str]]: + skill_dir: str | Path, force: bool = False, dry_run: bool = False +) -> dict[str, tuple[bool, str]]: """ Install a skill to all available agents. @@ -365,31 +361,16 @@ Examples: Supported agents: claude, cursor, vscode, copilot, amp, goose, opencode, letta, aide, windsurf, neovate, all - """ + """, ) - parser.add_argument( - "skill_directory", - help="Path to skill directory (e.g., output/react/)" - ) + parser.add_argument("skill_directory", help="Path to skill directory (e.g., output/react/)") - parser.add_argument( - "--agent", - required=True, - help="Agent name (use 'all' to install to all agents)" - ) + parser.add_argument("--agent", required=True, help="Agent name (use 'all' to install to all agents)") - parser.add_argument( - "--force", - action="store_true", - help="Overwrite existing installation without asking" - ) + parser.add_argument("--force", action="store_true", help="Overwrite existing installation without asking") - parser.add_argument( - "--dry-run", - action="store_true", - help="Preview installation without making changes" - ) + parser.add_argument("--dry-run", action="store_true", help="Preview installation without making changes") args = parser.parse_args() @@ -398,7 +379,7 @@ Supported agents: skill_name = skill_dir.name # Handle 'all' agent - if args.agent.lower() == 'all': + if args.agent.lower() == "all": print(f"\nšŸ“‹ Installing skill to all agents: {skill_name}\n") if args.dry_run: @@ -433,7 +414,7 @@ Supported agents: skipped_count += 1 # Summary - print(f"\nšŸ“Š Summary:") + print("\nšŸ“Š Summary:") if args.dry_run: print(f" Would install: {installed_count} agents") else: diff --git a/src/skill_seekers/cli/install_skill.py b/src/skill_seekers/cli/install_skill.py index 0a49a48..f02e2cc 100644 --- a/src/skill_seekers/cli/install_skill.py +++ b/src/skill_seekers/cli/install_skill.py @@ -26,8 +26,8 @@ Examples: skill-seekers install --config react --dry-run """ -import asyncio import argparse +import asyncio import sys from pathlib import Path @@ -78,51 +78,35 @@ Phases: 3. AI Enhancement (MANDATORY - no skip option) 4. Package for target platform (ZIP or tar.gz) 5. Upload to target platform (optional) -""" +""", ) parser.add_argument( - "--config", - required=True, - help="Config name (e.g., 'react') or path (e.g., 'configs/custom.json')" + "--config", required=True, help="Config name (e.g., 'react') or path (e.g., 'configs/custom.json')" ) - parser.add_argument( - "--destination", - default="output", - help="Output directory for skill files (default: output/)" - ) + parser.add_argument("--destination", default="output", help="Output directory for skill files (default: output/)") + + parser.add_argument("--no-upload", action="store_true", help="Skip automatic upload to Claude") parser.add_argument( - "--no-upload", - action="store_true", - help="Skip automatic upload to Claude" + "--unlimited", action="store_true", help="Remove page limits during scraping (WARNING: Can take hours)" ) - parser.add_argument( - "--unlimited", - action="store_true", - help="Remove page limits during scraping (WARNING: Can take hours)" - ) - - parser.add_argument( - "--dry-run", - action="store_true", - help="Preview workflow without executing" - ) + parser.add_argument("--dry-run", action="store_true", help="Preview workflow without executing") parser.add_argument( "--target", - choices=['claude', 'gemini', 'openai', 'markdown'], - default='claude', - help="Target LLM platform (default: claude)" + choices=["claude", "gemini", "openai", "markdown"], + default="claude", + help="Target LLM platform (default: claude)", ) args = parser.parse_args() # Determine if config is a name or path config_arg = args.config - if config_arg.endswith('.json') or '/' in config_arg or '\\' in config_arg: + if config_arg.endswith(".json") or "/" in config_arg or "\\" in config_arg: # It's a path config_path = config_arg config_name = None @@ -139,7 +123,7 @@ Phases: "auto_upload": not args.no_upload, "unlimited": args.unlimited, "dry_run": args.dry_run, - "target": args.target + "target": args.target, } # Run async tool diff --git a/src/skill_seekers/cli/language_detector.py b/src/skill_seekers/cli/language_detector.py index 2992c55..0d526e8 100644 --- a/src/skill_seekers/cli/language_detector.py +++ b/src/skill_seekers/cli/language_detector.py @@ -8,9 +8,8 @@ Supports 20+ programming languages with weighted pattern matching. Author: Skill Seekers Project """ -import re import logging -from typing import Optional, Tuple, Dict, List +import re logger = logging.getLogger(__name__) @@ -18,19 +17,11 @@ logger = logging.getLogger(__name__) try: from skill_seekers.cli.swift_patterns import SWIFT_PATTERNS except ImportError as e: - logger.warning( - "Swift language detection patterns unavailable. " - "Swift code detection will be disabled. Error: %s", - e - ) - SWIFT_PATTERNS: Dict[str, List[Tuple[str, int]]] = {} + logger.warning("Swift language detection patterns unavailable. Swift code detection will be disabled. Error: %s", e) + SWIFT_PATTERNS: dict[str, list[tuple[str, int]]] = {} except Exception as e: - logger.error( - "Failed to load Swift patterns due to unexpected error: %s. " - "Swift detection disabled.", - e - ) - SWIFT_PATTERNS: Dict[str, List[Tuple[str, int]]] = {} + logger.error("Failed to load Swift patterns due to unexpected error: %s. Swift detection disabled.", e) + SWIFT_PATTERNS: dict[str, list[tuple[str, int]]] = {} # Verify Swift patterns were loaded correctly if not SWIFT_PATTERNS: @@ -38,15 +29,13 @@ if not SWIFT_PATTERNS: "Swift pattern dictionary is empty. Swift detection is disabled. " "This may indicate swift_patterns.py has no patterns defined." ) -elif 'swift' not in SWIFT_PATTERNS: +elif "swift" not in SWIFT_PATTERNS: logger.error( - "Swift patterns loaded but 'swift' key is missing. " - "Swift detection is broken. Please file a bug report." + "Swift patterns loaded but 'swift' key is missing. Swift detection is broken. Please file a bug report." ) else: logger.info( - "Swift patterns loaded successfully: %d patterns for language detection", - len(SWIFT_PATTERNS.get('swift', [])) + "Swift patterns loaded successfully: %d patterns for language detection", len(SWIFT_PATTERNS.get("swift", [])) ) # Comprehensive language patterns with weighted confidence scoring @@ -56,355 +45,325 @@ else: # Weight 2: Moderate indicators # Weight 1: Weak indicators -LANGUAGE_PATTERNS: Dict[str, List[Tuple[str, int]]] = { +LANGUAGE_PATTERNS: dict[str, list[tuple[str, int]]] = { # ===== PRIORITY 1: Unity C# (Critical - User's Primary Issue) ===== - 'csharp': [ + "csharp": [ # Unity-specific patterns (weight 4-5, CRITICAL) - (r'\busing\s+UnityEngine', 5), - (r'\bMonoBehaviour\b', 5), - (r'\bGameObject\b', 4), - (r'\bTransform\b', 4), - (r'\bVector[23]\b', 3), - (r'\bQuaternion\b', 3), - (r'\bvoid\s+Start\s*\(\)', 4), - (r'\bvoid\s+Update\s*\(\)', 4), - (r'\bvoid\s+Awake\s*\(\)', 4), - (r'\bvoid\s+OnEnable\s*\(\)', 3), - (r'\bvoid\s+OnDisable\s*\(\)', 3), - (r'\bvoid\s+FixedUpdate\s*\(\)', 4), - (r'\bvoid\s+LateUpdate\s*\(\)', 4), - (r'\bvoid\s+OnCollisionEnter', 4), - (r'\bvoid\s+OnTriggerEnter', 4), - (r'\bIEnumerator\b', 4), - (r'\bStartCoroutine\s*\(', 4), - (r'\byield\s+return\s+new\s+WaitForSeconds', 4), - (r'\byield\s+return\s+null', 3), - (r'\byield\s+return', 4), - (r'\[SerializeField\]', 4), - (r'\[RequireComponent', 4), - (r'\[Header\(', 3), - (r'\[Range\(', 3), - (r'\bTime\.deltaTime\b', 4), - (r'\bInput\.Get', 4), - (r'\bRigidbody\b', 3), - (r'\bCollider\b', 3), - (r'\bRenderer\b', 3), - (r'\bGetComponent<', 3), - + (r"\busing\s+UnityEngine", 5), + (r"\bMonoBehaviour\b", 5), + (r"\bGameObject\b", 4), + (r"\bTransform\b", 4), + (r"\bVector[23]\b", 3), + (r"\bQuaternion\b", 3), + (r"\bvoid\s+Start\s*\(\)", 4), + (r"\bvoid\s+Update\s*\(\)", 4), + (r"\bvoid\s+Awake\s*\(\)", 4), + (r"\bvoid\s+OnEnable\s*\(\)", 3), + (r"\bvoid\s+OnDisable\s*\(\)", 3), + (r"\bvoid\s+FixedUpdate\s*\(\)", 4), + (r"\bvoid\s+LateUpdate\s*\(\)", 4), + (r"\bvoid\s+OnCollisionEnter", 4), + (r"\bvoid\s+OnTriggerEnter", 4), + (r"\bIEnumerator\b", 4), + (r"\bStartCoroutine\s*\(", 4), + (r"\byield\s+return\s+new\s+WaitForSeconds", 4), + (r"\byield\s+return\s+null", 3), + (r"\byield\s+return", 4), + (r"\[SerializeField\]", 4), + (r"\[RequireComponent", 4), + (r"\[Header\(", 3), + (r"\[Range\(", 3), + (r"\bTime\.deltaTime\b", 4), + (r"\bInput\.Get", 4), + (r"\bRigidbody\b", 3), + (r"\bCollider\b", 3), + (r"\bRenderer\b", 3), + (r"\bGetComponent<", 3), # Basic C# patterns (weight 2-4) - (r'\bnamespace\s+\w+', 3), - (r'\busing\s+System', 3), - (r'\bConsole\.WriteLine', 4), # C#-specific output - (r'\bConsole\.Write', 3), - (r'\bpublic\s+class\s+\w+', 4), # Increased to match Java weight - (r'\bprivate\s+class\s+\w+', 3), - (r'\binternal\s+class\s+\w+', 4), # C#-specific modifier - (r'\bstring\s+\w+\s*[;=]', 2), # C#-specific lowercase string - (r'\bprivate\s+\w+\s+\w+\s*;', 2), # Private fields (common in both C# and Java) - (r'\{\s*get;\s*set;\s*\}', 3), # Auto properties - (r'\{\s*get;\s*private\s+set;\s*\}', 3), - (r'\{\s*get\s*=>\s*', 2), # Expression properties - (r'\bpublic\s+static\s+void\s+', 2), - + (r"\bnamespace\s+\w+", 3), + (r"\busing\s+System", 3), + (r"\bConsole\.WriteLine", 4), # C#-specific output + (r"\bConsole\.Write", 3), + (r"\bpublic\s+class\s+\w+", 4), # Increased to match Java weight + (r"\bprivate\s+class\s+\w+", 3), + (r"\binternal\s+class\s+\w+", 4), # C#-specific modifier + (r"\bstring\s+\w+\s*[;=]", 2), # C#-specific lowercase string + (r"\bprivate\s+\w+\s+\w+\s*;", 2), # Private fields (common in both C# and Java) + (r"\{\s*get;\s*set;\s*\}", 3), # Auto properties + (r"\{\s*get;\s*private\s+set;\s*\}", 3), + (r"\{\s*get\s*=>\s*", 2), # Expression properties + (r"\bpublic\s+static\s+void\s+", 2), # Modern C# patterns (weight 2) - (r'\bfrom\s+\w+\s+in\s+', 2), # LINQ - (r'\.Where\s*\(', 2), - (r'\.Select\s*\(', 2), - (r'\basync\s+Task', 2), - (r'\bawait\s+', 2), - (r'\bvar\s+\w+\s*=', 1), + (r"\bfrom\s+\w+\s+in\s+", 2), # LINQ + (r"\.Where\s*\(", 2), + (r"\.Select\s*\(", 2), + (r"\basync\s+Task", 2), + (r"\bawait\s+", 2), + (r"\bvar\s+\w+\s*=", 1), ], - # ===== PRIORITY 2: Frontend Languages ===== - 'typescript': [ + "typescript": [ # TypeScript-specific (weight 4-5) - (r'\binterface\s+\w+\s*\{', 5), - (r'\btype\s+\w+\s*=', 4), - (r':\s*\w+\s*=', 3), # Type annotation - (r':\s*\w+\[\]', 3), # Array type - (r'<[\w,\s]+>', 2), # Generic type - (r'\bas\s+\w+', 2), # Type assertion - (r'\benum\s+\w+\s*\{', 4), - (r'\bimplements\s+\w+', 3), - (r'\bexport\s+interface', 4), - (r'\bexport\s+type', 4), - + (r"\binterface\s+\w+\s*\{", 5), + (r"\btype\s+\w+\s*=", 4), + (r":\s*\w+\s*=", 3), # Type annotation + (r":\s*\w+\[\]", 3), # Array type + (r"<[\w,\s]+>", 2), # Generic type + (r"\bas\s+\w+", 2), # Type assertion + (r"\benum\s+\w+\s*\{", 4), + (r"\bimplements\s+\w+", 3), + (r"\bexport\s+interface", 4), + (r"\bexport\s+type", 4), # Also has JS patterns (weight 1) - (r'\bconst\s+\w+\s*=', 1), - (r'\blet\s+\w+\s*=', 1), - (r'=>', 1), + (r"\bconst\s+\w+\s*=", 1), + (r"\blet\s+\w+\s*=", 1), + (r"=>", 1), ], - - 'javascript': [ - (r'\bfunction\s+\w+\s*\(', 3), - (r'\bconst\s+\w+\s*=', 2), - (r'\blet\s+\w+\s*=', 2), - (r'=>', 2), # Arrow function - (r'\bconsole\.log', 2), - (r'\bvar\s+\w+\s*=', 1), - (r'\.then\s*\(', 2), # Promise - (r'\.catch\s*\(', 2), # Promise - (r'\basync\s+function', 3), - (r'\bawait\s+', 2), - (r'require\s*\(', 2), # CommonJS - (r'\bexport\s+default', 2), # ES6 - (r'\bexport\s+const', 2), + "javascript": [ + (r"\bfunction\s+\w+\s*\(", 3), + (r"\bconst\s+\w+\s*=", 2), + (r"\blet\s+\w+\s*=", 2), + (r"=>", 2), # Arrow function + (r"\bconsole\.log", 2), + (r"\bvar\s+\w+\s*=", 1), + (r"\.then\s*\(", 2), # Promise + (r"\.catch\s*\(", 2), # Promise + (r"\basync\s+function", 3), + (r"\bawait\s+", 2), + (r"require\s*\(", 2), # CommonJS + (r"\bexport\s+default", 2), # ES6 + (r"\bexport\s+const", 2), ], - - 'jsx': [ + "jsx": [ # JSX patterns (weight 4-5) - (r'<\w+\s+[^>]*>', 4), # JSX tag with attributes - (r'<\w+\s*/>', 4), # Self-closing tag - (r'className=', 3), # React className - (r'onClick=', 3), # React event - (r'\brender\s*\(\s*\)\s*\{', 4), # React render - (r'\buseState\s*\(', 4), # React hook - (r'\buseEffect\s*\(', 4), # React hook - (r'\buseRef\s*\(', 3), - (r'\buseCallback\s*\(', 3), - (r'\buseMemo\s*\(', 3), - + (r"<\w+\s+[^>]*>", 4), # JSX tag with attributes + (r"<\w+\s*/>", 4), # Self-closing tag + (r"className=", 3), # React className + (r"onClick=", 3), # React event + (r"\brender\s*\(\s*\)\s*\{", 4), # React render + (r"\buseState\s*\(", 4), # React hook + (r"\buseEffect\s*\(", 4), # React hook + (r"\buseRef\s*\(", 3), + (r"\buseCallback\s*\(", 3), + (r"\buseMemo\s*\(", 3), # Also has JS patterns - (r'\bconst\s+\w+\s*=', 1), - (r'=>', 1), + (r"\bconst\s+\w+\s*=", 1), + (r"=>", 1), ], - - 'tsx': [ + "tsx": [ # TSX = TypeScript + JSX (weight 5) - (r'<\w+\s+[^>]*>', 3), # JSX tag - (r':\s*React\.\w+', 5), # React types - (r'interface\s+\w+Props', 5), # Props interface - (r'\bFunctionComponent<', 4), - (r'\bReact\.FC<', 4), - (r'\buseState<', 4), # Typed hook - (r'\buseRef<', 3), - + (r"<\w+\s+[^>]*>", 3), # JSX tag + (r":\s*React\.\w+", 5), # React types + (r"interface\s+\w+Props", 5), # Props interface + (r"\bFunctionComponent<", 4), + (r"\bReact\.FC<", 4), + (r"\buseState<", 4), # Typed hook + (r"\buseRef<", 3), # Also has TS patterns - (r'\binterface\s+\w+', 2), - (r'\btype\s+\w+\s*=', 2), + (r"\binterface\s+\w+", 2), + (r"\btype\s+\w+\s*=", 2), ], - - 'vue': [ + "vue": [ # Vue SFC patterns (weight 4-5) - (r'