1552 lines
58 KiB
Python
1552 lines
58 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
GitHub Repository to Claude Skill Converter (Tasks C1.1-C1.12)
|
|
|
|
Converts GitHub repositories into Claude AI skills by extracting:
|
|
- README and documentation
|
|
- Code structure and signatures
|
|
- GitHub Issues, Changelog, and Releases
|
|
- Usage examples from tests
|
|
|
|
Usage:
|
|
skill-seekers github --repo facebook/react
|
|
skill-seekers github --config configs/react_github.json
|
|
skill-seekers github --repo owner/repo --token $GITHUB_TOKEN
|
|
"""
|
|
|
|
import argparse
|
|
import fnmatch
|
|
import itertools
|
|
import json
|
|
import logging
|
|
import os
|
|
import re
|
|
import sys
|
|
from pathlib import Path
|
|
from typing import Any, Optional
|
|
|
|
try:
|
|
from github import Github, GithubException, Repository
|
|
from github.GithubException import RateLimitExceededException
|
|
except ImportError:
|
|
print("Error: PyGithub not installed. Run: pip install PyGithub")
|
|
sys.exit(1)
|
|
|
|
from skill_seekers.cli.arguments.github import add_github_arguments
|
|
from skill_seekers.cli.utils import setup_logging
|
|
|
|
# Try to import pathspec for .gitignore support
|
|
try:
|
|
import pathspec
|
|
|
|
PATHSPEC_AVAILABLE = True
|
|
except ImportError:
|
|
PATHSPEC_AVAILABLE = False
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# Import code analyzer for deep code analysis
|
|
try:
|
|
from .code_analyzer import CodeAnalyzer
|
|
|
|
CODE_ANALYZER_AVAILABLE = True
|
|
except ImportError:
|
|
CODE_ANALYZER_AVAILABLE = False
|
|
logger.warning("Code analyzer not available - deep analysis disabled")
|
|
|
|
# Directories to exclude from local repository analysis
|
|
# Directories to exclude from local repository analysis.
# Covers virtualenvs, dependency/tool caches, VCS metadata, build output,
# coverage reports, and game-engine cache folders (Unity / Unreal / Godot).
EXCLUDED_DIRS = {
    # Virtual environments
    "venv", "env", ".venv", ".env",
    # Dependencies and caches
    "node_modules", "__pycache__", ".pytest_cache",
    # Version control
    ".git", ".svn", ".hg",
    # Build artifacts
    "build", "dist", "*.egg-info",
    # Coverage reports
    "htmlcov", ".coverage",
    # Testing environments
    ".tox", ".nox",
    # Linter caches
    ".mypy_cache", ".ruff_cache",
    # Unity (critical - contains massive build cache)
    "Library", "Temp", "Logs", "UserSettings", "MemoryCaptures", "Recordings",
    # Unreal Engine
    "Intermediate", "Saved", "DerivedDataCache",
    # Godot
    ".godot", ".import",
    # Misc
    "tmp", ".tmp",
}
|
|
|
|
|
|
def extract_description_from_readme(readme_content: str, repo_name: str) -> str:
    """
    Extract a meaningful description from README content for skill description.

    Parses README to find the first meaningful paragraph that describes
    what the project does, suitable for "Use when..." format.

    Args:
        readme_content: README.md content
        repo_name: Repository name (e.g., 'facebook/react')

    Returns:
        Description string, or improved fallback if extraction fails
    """
    if not readme_content:
        return f"Use when working with {repo_name.split('/')[-1]}"

    try:
        lines = readme_content.split("\n")

        # Skip badges, images, title - find first meaningful text paragraph
        meaningful_paragraph = None
        in_code_block = False

        for line in lines:
            stripped = line.strip()

            # Track fenced code blocks so their contents are ignored
            if stripped.startswith("```"):
                in_code_block = not in_code_block
                continue
            if in_code_block:
                continue

            # Skip empty lines, headings, images, HTML, and badge/link lines.
            # BUGFIX: this condition was garbled in the source (unbalanced
            # tuple / stray "< 100"); reconstructed as a prefix filter on
            # markdown/HTML markers.
            if not stripped or stripped.startswith(("#", "!", "<", "[")):
                continue

            # Found a meaningful paragraph (ignore very short fragments)
            if len(stripped) > 20:
                meaningful_paragraph = stripped
                break

        if meaningful_paragraph:
            # Strip markdown formatting before using the text
            clean = re.sub(r"\[([^\]]+)\]\([^\)]+\)", r"\1", meaningful_paragraph)  # Links
            clean = re.sub(r"[*_`]", "", clean)  # Bold, italic, code
            clean = re.sub(r"<[^>]+>", "", clean)  # HTML tags

            # Truncate if too long (keep first sentence or ~150 chars)
            if ". " in clean:
                first_sentence = clean.split(". ")[0] + "."
                if len(first_sentence) < 200:
                    clean = first_sentence

            if len(clean) > 150:
                clean = clean[:147] + "..."

            # Format as "Use when..." description; if the text already
            # starts with an action verb, prepend only "Use when"
            action_words = ["build", "create", "develop", "work", "use", "implement", "manage"]
            if any(clean.lower().startswith(word) for word in action_words):
                return f"Use when {clean.lower()}"
            else:
                return f"Use when working with {clean.lower()}"

    except Exception as e:
        logger.debug(f"Could not extract description from README: {e}")

    # Improved fallback
    project_name = repo_name.split("/")[-1]
    return f"Use when working with {project_name}"
|
|
|
|
|
|
class GitHubScraper:
    """
    GitHub Repository Scraper (C1.1-C1.9)

    Extracts repository information for skill generation:
    - Repository structure
    - README files
    - Code comments and docstrings
    - Programming language detection
    - Function/class signatures
    - Test examples
    - GitHub Issues
    - CHANGELOG
    - Releases
    """

    def __init__(self, config: dict[str, Any], local_repo_path: str | None = None):
        """Initialize GitHub scraper with configuration."""
        self.config = config
        self.repo_name = config["repo"]
        self.name = config.get("name", self.repo_name.split("/")[-1])
        # Initial description; refined after README extraction unless the
        # config supplies one explicitly.
        self.description = config.get(
            "description", f"Use when working with {self.repo_name.split('/')[-1]}"
        )

        # Optional local checkout path - enables unlimited (non-API) analysis
        self.local_repo_path = local_repo_path or config.get("local_repo_path")
        if self.local_repo_path:
            self.local_repo_path = os.path.expanduser(self.local_repo_path)
            if os.path.isdir(self.local_repo_path):
                logger.info(f"Local repository mode enabled: {self.local_repo_path}")
            else:
                logger.warning(
                    f"local_repo_path does not exist or is not a directory: {self.local_repo_path}"
                )
                logger.warning("Falling back to GitHub API mode (local_repo_path ignored)")
                self.local_repo_path = None

        # Directory exclusions: smart defaults, optionally customized
        self.excluded_dirs = set(EXCLUDED_DIRS)

        if "exclude_dirs" in config:
            # Replace mode: caller's list fully overrides the defaults
            self.excluded_dirs = set(config["exclude_dirs"])
            logger.warning(
                f"Using custom directory exclusions ({len(self.excluded_dirs)} dirs) - defaults overridden"
            )
            logger.debug(f"Custom exclusions: {sorted(self.excluded_dirs)}")
        elif "exclude_dirs_additional" in config:
            # Extend mode: caller's list is merged with the defaults
            additional = set(config["exclude_dirs_additional"])
            self.excluded_dirs = self.excluded_dirs.union(additional)
            logger.info(
                f"Added {len(additional)} custom directory exclusions (total: {len(self.excluded_dirs)})"
            )
            logger.debug(f"Additional exclusions: {sorted(additional)}")

        # .gitignore support (C2.1) - only meaningful for local checkouts
        self.gitignore_spec = self._load_gitignore() if self.local_repo_path else None

        # GitHub client setup (C1.1); anonymous client when no token is found
        token = self._get_token()
        self.github = Github(token) if token else Github()
        self.repo: Repository.Repository | None = None

        # Feature toggles from the config, with sensible defaults
        self.include_issues = config.get("include_issues", True)
        self.max_issues = config.get("max_issues", 100)
        self.include_changelog = config.get("include_changelog", True)
        self.include_releases = config.get("include_releases", True)
        self.include_code = config.get("include_code", False)
        # One of: 'surface', 'deep', 'full'
        self.code_analysis_depth = config.get("code_analysis_depth", "surface")
        self.file_patterns = config.get("file_patterns", [])

        # Deep/full analysis needs the optional CodeAnalyzer module
        self.code_analyzer = None
        if CODE_ANALYZER_AVAILABLE and self.code_analysis_depth != "surface":
            self.code_analyzer = CodeAnalyzer(depth=self.code_analysis_depth)
            logger.info(f"Code analysis depth: {self.code_analysis_depth}")

        # Output locations
        self.skill_dir = f"output/{self.name}"
        self.data_file = f"output/{self.name}_github_data.json"

        # Storage for everything the scrape collects
        self.extracted_data = {
            "repo_info": {},
            "readme": "",
            "file_tree": [],
            "languages": {},
            "signatures": [],
            "test_examples": [],
            "issues": [],
            "changelog": "",
            "releases": [],
        }
|
|
|
|
def _get_token(self) -> str | None:
    """
    Get GitHub token from env var or config (both options supported).
    Priority: GITHUB_TOKEN env var > config file > None
    """
    # Environment variable wins (recommended; keeps secrets out of configs)
    env_token = os.getenv("GITHUB_TOKEN")
    if env_token:
        logger.info("Using GitHub token from GITHUB_TOKEN environment variable")
        return env_token

    # Config file is the fallback source
    cfg_token = self.config.get("github_token")
    if cfg_token:
        logger.warning("Using GitHub token from config file (less secure)")
        return cfg_token

    # No token at all - unauthenticated requests have tight rate limits
    logger.warning(
        "No GitHub token provided - using unauthenticated access (lower rate limits)"
    )
    return None
|
|
|
|
def scrape(self) -> dict[str, Any]:
    """
    Main scraping entry point.
    Executes all C1 tasks in sequence.
    """
    try:
        logger.info(f"Starting GitHub scrape for: {self.repo_name}")

        # Pipeline of (enabled, step) pairs, run in order:
        # C1.1 repo fetch, C1.2 README, C1.3-C1.6 code structure,
        # C1.7 issues, C1.8 changelog, C1.9 releases.
        steps = [
            (True, self._fetch_repository),
            (True, self._extract_readme),
            (True, self._extract_code_structure),
            (self.include_issues, self._extract_issues),
            (self.include_changelog, self._extract_changelog),
            (self.include_releases, self._extract_releases),
        ]
        for enabled, step in steps:
            if enabled:
                step()

        # Persist everything we collected
        self._save_data()

        logger.info(f"✅ Scraping complete! Data saved to: {self.data_file}")
        return self.extracted_data

    except RateLimitExceededException:
        logger.error("GitHub API rate limit exceeded. Please wait or use authentication token.")
        raise
    except GithubException as e:
        logger.error(f"GitHub API error: {e}")
        raise
    except Exception as e:
        logger.error(f"Unexpected error during scraping: {e}")
        raise
|
|
|
|
def _fetch_repository(self):
    """C1.1: Fetch repository structure using GitHub API."""
    logger.info(f"Fetching repository: {self.repo_name}")

    try:
        repo = self.github.get_repo(self.repo_name)
        self.repo = repo

        # Basic repository metadata; datetimes serialized to ISO strings
        self.extracted_data["repo_info"] = {
            "name": repo.name,
            "full_name": repo.full_name,
            "description": repo.description,
            "url": repo.html_url,
            "homepage": repo.homepage,
            "stars": repo.stargazers_count,
            "forks": repo.forks_count,
            "open_issues": repo.open_issues_count,
            "default_branch": repo.default_branch,
            "created_at": repo.created_at.isoformat() if repo.created_at else None,
            "updated_at": repo.updated_at.isoformat() if repo.updated_at else None,
            "language": repo.language,
            "license": repo.license.name if repo.license else None,
            "topics": repo.get_topics(),
        }

        logger.info(
            f"Repository fetched: {self.repo.full_name} ({self.repo.stargazers_count} stars)"
        )

    except GithubException as e:
        # A 404 means the repo slug itself is wrong - surface a clearer error
        if e.status == 404:
            raise ValueError(f"Repository not found: {self.repo_name}") from e
        raise
|
|
|
|
def _get_file_content(self, file_path: str) -> str | None:
    """
    Safely get file content, handling symlinks and encoding issues.

    Args:
        file_path: Path to file in repository

    Returns:
        File content as string, or None if file not found/error
    """
    try:
        content = self.repo.get_contents(file_path)
        if not content:
            return None

        # Symlinks: resolve to the target entry before reading
        if hasattr(content, "type") and content.type == "symlink":
            target = getattr(content, "target", None)
            if not target:
                logger.warning(f"Symlink {file_path} has no target")
                return None
            target = target.strip()
            logger.debug(f"File {file_path} is a symlink to {target}, following...")
            try:
                content = self.repo.get_contents(target)
            except GithubException as e:
                logger.warning(f"Failed to follow symlink {file_path} -> {target}: {e}")
                return None

        # Large files (GitHub doesn't base64-encode >1MB): encoding comes
        # back as None/"none", so fetch through the raw download URL
        if hasattr(content, "encoding") and content.encoding in [None, "none"]:
            download_url = getattr(content, "download_url", None)
            file_size = getattr(content, "size", 0)

            if not download_url:
                logger.warning(
                    f"File {file_path} has no download URL (encoding={content.encoding})"
                )
                return None

            logger.info(
                f"File {file_path} is large ({file_size:,} bytes), downloading via URL..."
            )
            try:
                import requests

                response = requests.get(download_url, timeout=30)
                response.raise_for_status()
                return response.text
            except Exception as e:
                logger.warning(f"Failed to download {file_path} from {download_url}: {e}")
                return None

        # Regular files: decode the inline payload
        try:
            raw = content.decoded_content
            return raw.decode("utf-8") if isinstance(raw, bytes) else str(raw)
        except (UnicodeDecodeError, AttributeError, LookupError, AssertionError) as e:
            logger.warning(f"Encoding issue with {file_path}: {e}")
            # Last resort: latin-1 never fails on bytes
            try:
                raw = content.decoded_content
                if isinstance(raw, bytes):
                    return raw.decode("latin-1")
            except Exception:
                return None
            return None

    except GithubException:
        return None
    except Exception as e:
        logger.warning(f"Error reading {file_path}: {e}")
        return None
|
|
|
|
def _extract_readme(self):
    """C1.2: Extract README.md files."""
    logger.info("Extracting README...")

    # Candidate locations, checked in priority order
    candidates = (
        "README.md",
        "README.rst",
        "README.txt",
        "README",
        "docs/README.md",
        ".github/README.md",
    )

    for readme_path in candidates:
        readme_content = self._get_file_content(readme_path)
        if not readme_content:
            continue

        self.extracted_data["readme"] = readme_content
        logger.info(f"README found: {readme_path}")

        # Derive a smarter description from the README unless the config
        # pinned one explicitly
        if "description" not in self.config:
            self.description = extract_description_from_readme(
                self.extracted_data["readme"], self.repo_name
            )
            logger.debug(f"Generated description: {self.description}")
        return

    logger.warning("No README found in repository")
|
|
|
|
def _extract_code_structure(self):
    """
    C1.3-C1.6: Extract code structure, languages, signatures, and test examples.
    Surface layer only - no full implementation code.
    """
    logger.info("Extracting code structure...")

    # C1.4: language breakdown first, then the file tree
    self._extract_languages()
    self._extract_file_tree()

    # Signatures/test examples are opt-in via the include_code flag
    if self.include_code:
        self._extract_signatures_and_tests()
|
|
|
|
def _extract_languages(self):
    """C1.4: Detect programming languages in repository."""
    logger.info("Detecting programming languages...")

    try:
        languages = self.repo.get_languages()

        # Some API configurations leak non-integer metadata (e.g. a "url"
        # key) into the mapping - drop anything that isn't a byte count
        non_lang_keys = {k for k, v in languages.items() if not isinstance(v, int)}
        if non_lang_keys:
            logger.debug(f"Filtered non-language keys from API response: {non_lang_keys}")
            languages = {k: v for k, v in languages.items() if isinstance(v, int)}

        total_bytes = sum(languages.values())

        # Per-language byte counts plus percentage of the whole repo
        breakdown = {}
        for lang, bytes_count in languages.items():
            pct = round((bytes_count / total_bytes) * 100, 2) if total_bytes > 0 else 0
            breakdown[lang] = {"bytes": bytes_count, "percentage": pct}
        self.extracted_data["languages"] = breakdown

        logger.info(f"Languages detected: {', '.join(languages.keys())}")

    except GithubException as e:
        logger.warning(f"Could not fetch languages: {e}")
|
|
|
|
def should_exclude_dir(self, dir_name: str, dir_path: str = None) -> bool:
|
|
"""
|
|
Check if directory should be excluded from analysis.
|
|
|
|
Args:
|
|
dir_name: Directory name (e.g., "Examples & Extras")
|
|
dir_path: Full relative path (e.g., "TextMesh Pro/Examples & Extras")
|
|
|
|
Returns:
|
|
True if directory should be excluded
|
|
"""
|
|
# Check directory name
|
|
if dir_name in self.excluded_dirs or dir_name.startswith("."):
|
|
return True
|
|
|
|
# Check full path if provided (for nested exclusions like "TextMesh Pro/Examples & Extras")
|
|
if dir_path:
|
|
for excluded in self.excluded_dirs:
|
|
# Match if path contains the exclusion pattern
|
|
if excluded in dir_path or dir_path.startswith(excluded):
|
|
return True
|
|
|
|
# Check .gitignore rules if available (C2.1)
|
|
if self.gitignore_spec and dir_path:
|
|
# For directories, we need to check both with and without trailing slash
|
|
# as .gitignore patterns can match either way
|
|
dir_path_with_slash = dir_path if dir_path.endswith("/") else dir_path + "/"
|
|
if self.gitignore_spec.match_file(dir_path) or self.gitignore_spec.match_file(
|
|
dir_path_with_slash
|
|
):
|
|
logger.debug(f"Directory excluded by .gitignore: {dir_path}")
|
|
return True
|
|
|
|
return False
|
|
|
|
def _load_gitignore(self) -> Optional["pathspec.PathSpec"]:
    """
    Load .gitignore file and create pathspec matcher (C2.1).

    Returns:
        PathSpec object if .gitignore found, None otherwise
    """
    # pathspec is an optional dependency - degrade gracefully without it
    if not PATHSPEC_AVAILABLE:
        logger.warning("pathspec not installed - .gitignore support disabled")
        logger.warning("Install with: pip install pathspec")
        return None

    # Only meaningful when analyzing a local checkout
    if not self.local_repo_path:
        return None

    gitignore_path = Path(self.local_repo_path) / ".gitignore"
    if not gitignore_path.exists():
        logger.debug(f"No .gitignore found in {self.local_repo_path}")
        return None

    try:
        with open(gitignore_path, encoding="utf-8") as f:
            spec = pathspec.PathSpec.from_lines("gitwildmatch", f)
        logger.info(f"Loaded .gitignore from {gitignore_path}")
        return spec
    except Exception as e:
        logger.warning(f"Failed to load .gitignore: {e}")
        return None
|
|
|
|
def _extract_file_tree(self):
    """Extract repository file tree structure (dual-mode: GitHub API or local filesystem)."""
    logger.info("Building file tree...")

    # Local checkouts can be walked without any API rate limits
    if self.local_repo_path:
        self._extract_file_tree_local()
    else:
        self._extract_file_tree_github()
|
|
|
|
def _extract_file_tree_local(self):
    """Extract file tree from local filesystem (unlimited files)."""
    if not os.path.exists(self.local_repo_path):
        logger.error(f"Local repository path not found: {self.local_repo_path}")
        return

    # Log exclusions for debugging
    logger.info(
        f"Directory exclusions ({len(self.excluded_dirs)} total): {sorted(list(self.excluded_dirs)[:10])}"
    )

    file_tree = []
    excluded_count = 0
    for root, dirs, files in os.walk(self.local_repo_path):
        # Path relative to the repo root ("" for the root itself) is
        # needed before the exclusion checks below
        rel_root = os.path.relpath(root, self.local_repo_path)
        if rel_root == ".":
            rel_root = ""

        # Prune excluded directories in place so os.walk never descends
        # into them; both name and full relative path are checked
        kept = []
        for d in dirs:
            dir_path = os.path.join(rel_root, d) if rel_root else d
            if self.should_exclude_dir(d, dir_path):
                excluded_count += 1
                logger.debug(f"Excluding directory: {dir_path}")
            else:
                kept.append(d)
        dirs[:] = kept

        # Record the surviving directories
        for dir_name in dirs:
            dir_path = os.path.join(rel_root, dir_name) if rel_root else dir_name
            file_tree.append({"path": dir_path, "type": "dir", "size": None})

        # Record files with their sizes (None when stat fails)
        for file_name in files:
            file_path = os.path.join(rel_root, file_name) if rel_root else file_name
            try:
                file_size = os.path.getsize(os.path.join(root, file_name))
            except OSError:
                file_size = None
            file_tree.append({"path": file_path, "type": "file", "size": file_size})

    self.extracted_data["file_tree"] = file_tree
    logger.info(
        f"File tree built (local mode): {len(file_tree)} items ({excluded_count} directories excluded)"
    )
|
|
|
|
def _extract_file_tree_github(self):
    """Extract file tree from GitHub API (rate-limited)."""
    try:
        from collections import deque

        # Breadth-first walk of the repository contents
        pending = deque(self.repo.get_contents(""))
        file_tree = []

        while pending:
            entry = pending.popleft()

            file_tree.append(
                {
                    "path": entry.path,
                    "type": entry.type,
                    "size": entry.size if entry.type == "file" else None,
                }
            )

            # Queue subdirectory contents for later traversal
            if entry.type == "dir":
                pending.extend(self.repo.get_contents(entry.path))

        self.extracted_data["file_tree"] = file_tree
        logger.info(f"File tree built (GitHub API mode): {len(file_tree)} items")

    except GithubException as e:
        logger.warning(f"Could not build file tree: {e}")
|
|
|
|
def _extract_signatures_and_tests(self):
    """
    C1.3, C1.5, C1.6: Extract signatures, docstrings, and test examples.

    Extraction depth depends on code_analysis_depth setting:
    - surface: File tree only (minimal)
    - deep: Parse files for signatures, parameters, types
    - full: Complete AST analysis (future enhancement)
    """
    if self.code_analysis_depth == "surface":
        logger.info("Code extraction: Surface level (file tree only)")
        return

    if not self.code_analyzer:
        logger.warning("Code analyzer not available - skipping deep analysis")
        return

    logger.info(f"Extracting code signatures ({self.code_analysis_depth} analysis)...")

    languages = self.extracted_data.get("languages", {})
    if not languages:
        logger.warning("No languages detected - skipping code analysis")
        return

    # The dominant language (by byte count) drives which files we parse
    primary_language = max(languages.items(), key=lambda x: x[1]["bytes"])[0]
    logger.info(f"Primary language: {primary_language}")

    # Map the detected language onto source-file extensions
    extension_map = {
        "Python": [".py"],
        "JavaScript": [".js", ".jsx"],
        "TypeScript": [".ts", ".tsx"],
        "C": [".c", ".h"],
        "C++": [".cpp", ".hpp", ".cc", ".hh", ".cxx"],
    }
    extensions = extension_map.get(primary_language, [])
    if not extensions:
        logger.warning(f"No file extensions mapped for {primary_language}")
        return

    analyzed_files = []
    for file_info in self.extracted_data.get("file_tree", []):
        file_path = file_info["path"]

        # Extension filter first, then the optional user-supplied globs
        if not any(file_path.endswith(ext) for ext in extensions):
            continue
        if self.file_patterns and not any(
            fnmatch.fnmatch(file_path, pattern) for pattern in self.file_patterns
        ):
            continue

        try:
            # Read the file: local filesystem when available, API otherwise
            if self.local_repo_path:
                full_path = os.path.join(self.local_repo_path, file_path)
                with open(full_path, encoding="utf-8") as f:
                    content = f.read()
            else:
                content = self.repo.get_contents(file_path).decoded_content.decode("utf-8")

            analysis_result = self.code_analyzer.analyze_file(
                file_path, content, primary_language
            )

            # Keep only files that actually contributed classes/functions
            if analysis_result and (
                analysis_result.get("classes") or analysis_result.get("functions")
            ):
                analyzed_files.append(
                    {"file": file_path, "language": primary_language, **analysis_result}
                )
                logger.debug(
                    f"Analyzed {file_path}: "
                    f"{len(analysis_result.get('classes', []))} classes, "
                    f"{len(analysis_result.get('functions', []))} functions"
                )

        except Exception as e:
            logger.debug(f"Could not analyze {file_path}: {e}")
            continue

        # API mode caps the number of analyzed files to stay under rate limits
        if not self.local_repo_path and len(analyzed_files) >= 50:
            logger.info("Reached analysis limit (50 files, GitHub API mode)")
            break

    self.extracted_data["code_analysis"] = {
        "depth": self.code_analysis_depth,
        "language": primary_language,
        "files_analyzed": len(analyzed_files),
        "files": analyzed_files,
    }

    # Summary totals across all analyzed files
    total_classes = sum(len(f.get("classes", [])) for f in analyzed_files)
    total_functions = sum(len(f.get("functions", [])) for f in analyzed_files)
    logger.info(
        f"Code analysis complete: {len(analyzed_files)} files, {total_classes} classes, {total_functions} functions"
    )
|
|
|
|
def _extract_issues(self):
    """C1.7: Extract GitHub Issues (open/closed, labels, milestones)."""
    logger.info(f"Extracting GitHub Issues (max {self.max_issues})...")

    try:
        # Most recently updated first, both open and closed
        issues = self.repo.get_issues(state="all", sort="updated", direction="desc")

        issue_list = []
        for issue in itertools.islice(issues, self.max_issues):
            # Pull requests show up in the issues feed too - skip them
            if issue.pull_request:
                continue

            issue_list.append(
                {
                    "number": issue.number,
                    "title": issue.title,
                    "state": issue.state,
                    "labels": [label.name for label in issue.labels],
                    "milestone": issue.milestone.title if issue.milestone else None,
                    "created_at": issue.created_at.isoformat() if issue.created_at else None,
                    "updated_at": issue.updated_at.isoformat() if issue.updated_at else None,
                    "closed_at": issue.closed_at.isoformat() if issue.closed_at else None,
                    "url": issue.html_url,
                    # Issue bodies can be huge - keep only the first 500 chars
                    "body": issue.body[:500] if issue.body else None,
                }
            )

        self.extracted_data["issues"] = issue_list
        logger.info(f"Extracted {len(issue_list)} issues")

    except GithubException as e:
        logger.warning(f"Could not fetch issues: {e}")
|
|
|
|
def _extract_changelog(self):
    """C1.8: Extract CHANGELOG.md and release notes."""
    logger.info("Extracting CHANGELOG...")

    # Common changelog locations, checked in priority order
    candidates = (
        "CHANGELOG.md",
        "CHANGES.md",
        "HISTORY.md",
        "CHANGELOG.rst",
        "CHANGELOG.txt",
        "CHANGELOG",
        "docs/CHANGELOG.md",
        ".github/CHANGELOG.md",
    )

    for changelog_path in candidates:
        changelog_content = self._get_file_content(changelog_path)
        if changelog_content:
            self.extracted_data["changelog"] = changelog_content
            logger.info(f"CHANGELOG found: {changelog_path}")
            return

    logger.warning("No CHANGELOG found in repository")
|
|
|
|
def _extract_releases(self):
    """C1.9: Extract GitHub Releases with version history."""
    logger.info("Extracting GitHub Releases...")

    try:
        release_list = []
        for release in self.repo.get_releases():
            created = release.created_at
            published = release.published_at
            release_list.append(
                {
                    "tag_name": release.tag_name,
                    "name": release.title,
                    "body": release.body,
                    "draft": release.draft,
                    "prerelease": release.prerelease,
                    "created_at": created.isoformat() if created else None,
                    "published_at": published.isoformat() if published else None,
                    "url": release.html_url,
                    "tarball_url": release.tarball_url,
                    "zipball_url": release.zipball_url,
                }
            )

        self.extracted_data["releases"] = release_list
        logger.info(f"Extracted {len(release_list)} releases")

    except GithubException as e:
        logger.warning(f"Could not fetch releases: {e}")
|
|
|
|
def _save_data(self):
    """Save extracted data to JSON file."""
    # Make sure the output directory exists before writing
    os.makedirs("output", exist_ok=True)

    # Serialize first, then write in one go
    payload = json.dumps(self.extracted_data, indent=2, ensure_ascii=False)
    with open(self.data_file, "w", encoding="utf-8") as f:
        f.write(payload)

    logger.info(f"Data saved to: {self.data_file}")
|
|
|
|
|
|
class GitHubToSkillConverter:
    """
    Convert extracted GitHub data to Claude skill format (C1.10).
    """

    def __init__(self, config: dict[str, Any]):
        """Initialize converter with configuration."""
        self.config = config
        self.name = config.get("name", config["repo"].split("/")[-1])

        # Paths derived from the skill name
        self.data_file = f"output/{self.name}_github_data.json"
        self.skill_dir = f"output/{self.name}"

        # Load the data the scraper produced
        self.data = self._load_data()

        # Description priority: explicit config > README extraction > fallback
        if "description" in config:
            self.description = config["description"]
        else:
            readme_content = self.data.get("readme", "")
            repo_name = config["repo"]
            if readme_content:
                self.description = extract_description_from_readme(readme_content, repo_name)
            else:
                self.description = f"Use when working with {repo_name.split('/')[-1]}"
|
|
|
|
def _load_data(self) -> dict[str, Any]:
|
|
"""Load extracted GitHub data from JSON."""
|
|
if not os.path.exists(self.data_file):
|
|
raise FileNotFoundError(f"Data file not found: {self.data_file}")
|
|
|
|
with open(self.data_file, encoding="utf-8") as f:
|
|
return json.load(f)
|
|
|
|
def build_skill(self):
    """Build complete skill structure."""
    logger.info(f"Building skill for: {self.name}")

    # Standard skill layout: root plus references/scripts/assets
    for suffix in ("", "/references", "/scripts", "/assets"):
        os.makedirs(f"{self.skill_dir}{suffix}", exist_ok=True)

    # Main SKILL.md first, then the supporting reference documents
    self._generate_skill_md()
    self._generate_references()

    logger.info(f"✅ Skill built successfully: {self.skill_dir}/")
|
|
|
|
def _generate_skill_md(self):
|
|
"""Generate main SKILL.md file (rich version with C3.x data if available)."""
|
|
repo_info = self.data.get("repo_info", {})
|
|
c3_data = self.data.get("c3_analysis", {})
|
|
has_c3_data = bool(c3_data)
|
|
|
|
# Generate skill name (lowercase, hyphens only, max 64 chars)
|
|
skill_name = self.name.lower().replace("_", "-").replace(" ", "-")[:64]
|
|
|
|
# Truncate description to 1024 chars if needed
|
|
desc = self.description[:1024] if len(self.description) > 1024 else self.description
|
|
|
|
doc_version = self.config.get("doc_version", "")
|
|
|
|
# Build skill content
|
|
skill_content = f"""---
|
|
name: {skill_name}
|
|
description: {desc}
|
|
doc_version: {doc_version}
|
|
---
|
|
|
|
# {repo_info.get("name", self.name)}
|
|
|
|
{self.description}
|
|
|
|
## Description
|
|
|
|
{repo_info.get("description", "GitHub repository skill")}
|
|
|
|
**Repository:** [{repo_info.get("full_name", "N/A")}]({repo_info.get("url", "#")})
|
|
**Language:** {repo_info.get("language", "N/A")}
|
|
**Stars:** {repo_info.get("stars", 0):,}
|
|
**License:** {repo_info.get("license", "N/A")}
|
|
|
|
## When to Use This Skill
|
|
|
|
Use this skill when you need to:
|
|
- Understand how to use {repo_info.get("name", self.name)}
|
|
- Look up API documentation and implementation details
|
|
- Find real-world usage examples from the codebase
|
|
- Review design patterns and architecture
|
|
- Check for known issues or recent changes
|
|
- Explore release history and changelogs
|
|
"""
|
|
|
|
# Add Quick Reference section (enhanced with C3.x if available)
|
|
skill_content += "\n## ⚡ Quick Reference\n\n"
|
|
|
|
# Repository info
|
|
skill_content += "### Repository Info\n"
|
|
skill_content += f"- **Homepage:** {repo_info.get('homepage') or 'N/A'}\n"
|
|
skill_content += f"- **Topics:** {', '.join(repo_info.get('topics', []))}\n"
|
|
skill_content += f"- **Open Issues:** {repo_info.get('open_issues', 0)}\n"
|
|
updated_at = repo_info.get("updated_at") or "N/A"
|
|
skill_content += f"- **Last Updated:** {updated_at[:10]}\n\n"
|
|
|
|
# Languages
|
|
skill_content += "### Languages\n"
|
|
skill_content += self._format_languages() + "\n\n"
|
|
|
|
# Add C3.x pattern summary if available
|
|
if has_c3_data and c3_data.get("patterns"):
|
|
skill_content += self._format_pattern_summary(c3_data)
|
|
|
|
# Add code examples if available (C3.2 test examples)
|
|
if has_c3_data and c3_data.get("test_examples"):
|
|
skill_content += self._format_code_examples(c3_data)
|
|
|
|
# Add API Reference if available (C2.5)
|
|
if has_c3_data and c3_data.get("api_reference"):
|
|
skill_content += self._format_api_reference(c3_data)
|
|
|
|
# Add Architecture Overview if available (C3.7)
|
|
if has_c3_data and c3_data.get("architecture"):
|
|
skill_content += self._format_architecture(c3_data)
|
|
|
|
# Add Known Issues section
|
|
skill_content += self._format_known_issues()
|
|
|
|
# Add Recent Releases
|
|
skill_content += "### Recent Releases\n"
|
|
skill_content += self._format_recent_releases() + "\n\n"
|
|
|
|
# Available References
|
|
skill_content += "## 📖 Available References\n\n"
|
|
skill_content += "- `references/README.md` - Complete README documentation\n"
|
|
skill_content += "- `references/CHANGELOG.md` - Version history and changes\n"
|
|
skill_content += "- `references/issues.md` - Recent GitHub issues\n"
|
|
skill_content += "- `references/releases.md` - Release notes\n"
|
|
skill_content += "- `references/file_structure.md` - Repository structure\n"
|
|
|
|
if has_c3_data:
|
|
skill_content += "\n### Codebase Analysis References\n\n"
|
|
if c3_data.get("patterns"):
|
|
skill_content += (
|
|
"- `references/codebase_analysis/patterns/` - Design patterns detected\n"
|
|
)
|
|
if c3_data.get("test_examples"):
|
|
skill_content += (
|
|
"- `references/codebase_analysis/examples/` - Test examples extracted\n"
|
|
)
|
|
if c3_data.get("config_patterns"):
|
|
skill_content += (
|
|
"- `references/codebase_analysis/configuration/` - Configuration analysis\n"
|
|
)
|
|
if c3_data.get("architecture"):
|
|
skill_content += (
|
|
"- `references/codebase_analysis/ARCHITECTURE.md` - Architecture overview\n"
|
|
)
|
|
|
|
# Usage
|
|
skill_content += "\n## 💻 Usage\n\n"
|
|
skill_content += "See README.md for complete usage instructions and examples.\n\n"
|
|
|
|
# Footer
|
|
skill_content += "---\n\n"
|
|
if has_c3_data:
|
|
skill_content += "**Generated by Skill Seeker** | GitHub Repository Scraper with C3.x Codebase Analysis\n"
|
|
else:
|
|
skill_content += "**Generated by Skill Seeker** | GitHub Repository Scraper\n"
|
|
|
|
# Write to file
|
|
skill_path = f"{self.skill_dir}/SKILL.md"
|
|
with open(skill_path, "w", encoding="utf-8") as f:
|
|
f.write(skill_content)
|
|
|
|
line_count = len(skill_content.split("\n"))
|
|
logger.info(f"Generated: {skill_path} ({line_count} lines)")
|
|
|
|
def _format_languages(self) -> str:
|
|
"""Format language breakdown."""
|
|
languages = self.data.get("languages", {})
|
|
if not languages:
|
|
return "No language data available"
|
|
|
|
lines = []
|
|
for lang, info in sorted(languages.items(), key=lambda x: x[1]["bytes"], reverse=True):
|
|
lines.append(f"- **{lang}:** {info['percentage']:.1f}%")
|
|
|
|
return "\n".join(lines)
|
|
|
|
def _format_recent_releases(self) -> str:
|
|
"""Format recent releases (top 3)."""
|
|
releases = self.data.get("releases", [])
|
|
if not releases:
|
|
return "No releases available"
|
|
|
|
lines = []
|
|
for release in releases[:3]:
|
|
published_at = release.get("published_at") or "N/A"
|
|
release_name = release.get("name") or release["tag_name"]
|
|
lines.append(f"- **{release['tag_name']}** ({published_at[:10]}): {release_name}")
|
|
|
|
return "\n".join(lines)
|
|
|
|
def _format_pattern_summary(self, c3_data: dict[str, Any]) -> str:
|
|
"""Format design patterns summary (C3.1)."""
|
|
patterns_data = c3_data.get("patterns", [])
|
|
if not patterns_data:
|
|
return ""
|
|
|
|
# Count patterns by type (deduplicate by class, keep highest confidence)
|
|
pattern_counts = {}
|
|
by_class = {}
|
|
|
|
for pattern_file in patterns_data:
|
|
for pattern in pattern_file.get("patterns", []):
|
|
ptype = pattern.get("pattern_type", "Unknown")
|
|
cls = pattern.get("class_name", "")
|
|
confidence = pattern.get("confidence", 0)
|
|
|
|
# Skip low confidence
|
|
if confidence < 0.7:
|
|
continue
|
|
|
|
# Deduplicate by class
|
|
key = f"{cls}:{ptype}"
|
|
if key not in by_class or by_class[key]["confidence"] < confidence:
|
|
by_class[key] = pattern
|
|
|
|
# Count by type
|
|
pattern_counts[ptype] = pattern_counts.get(ptype, 0) + 1
|
|
|
|
if not pattern_counts:
|
|
return ""
|
|
|
|
content = "### Design Patterns Detected\n\n"
|
|
content += "*From C3.1 codebase analysis (confidence > 0.7)*\n\n"
|
|
|
|
# Top 5 pattern types
|
|
for ptype, count in sorted(pattern_counts.items(), key=lambda x: x[1], reverse=True)[:5]:
|
|
content += f"- **{ptype}**: {count} instances\n"
|
|
|
|
content += f"\n*Total: {len(by_class)} high-confidence patterns*\n\n"
|
|
return content
|
|
|
|
def _format_code_examples(self, c3_data: dict[str, Any]) -> str:
|
|
"""Format code examples (C3.2)."""
|
|
examples_data = c3_data.get("test_examples", {})
|
|
examples = examples_data.get("examples", [])
|
|
|
|
if not examples:
|
|
return ""
|
|
|
|
# Filter high-value examples (complexity > 0.7)
|
|
high_value = [ex for ex in examples if ex.get("complexity_score", 0) > 0.7]
|
|
|
|
if not high_value:
|
|
return ""
|
|
|
|
content = "## 📝 Code Examples\n\n"
|
|
content += "*High-quality examples from codebase (C3.2)*\n\n"
|
|
|
|
# Top 10 examples
|
|
for ex in sorted(high_value, key=lambda x: x.get("complexity_score", 0), reverse=True)[:10]:
|
|
desc = ex.get("description", "Example")
|
|
lang = ex.get("language", "python")
|
|
code = ex.get("code", "")
|
|
complexity = ex.get("complexity_score", 0)
|
|
|
|
content += f"**{desc}** (complexity: {complexity:.2f})\n\n"
|
|
content += f"```{lang}\n{code}\n```\n\n"
|
|
|
|
return content
|
|
|
|
def _format_api_reference(self, c3_data: dict[str, Any]) -> str:
|
|
"""Format API reference (C2.5)."""
|
|
api_ref = c3_data.get("api_reference", {})
|
|
|
|
if not api_ref:
|
|
return ""
|
|
|
|
content = "## 🔧 API Reference\n\n"
|
|
content += "*Extracted from codebase analysis (C2.5)*\n\n"
|
|
|
|
# Top 5 modules
|
|
for module_name, module_md in list(api_ref.items())[:5]:
|
|
content += f"### {module_name}\n\n"
|
|
# First 500 chars of module documentation
|
|
content += module_md[:500]
|
|
if len(module_md) > 500:
|
|
content += "...\n\n"
|
|
else:
|
|
content += "\n\n"
|
|
|
|
content += "*See `references/codebase_analysis/api_reference/` for complete API docs*\n\n"
|
|
return content
|
|
|
|
def _format_architecture(self, c3_data: dict[str, Any]) -> str:
|
|
"""Format architecture overview (C3.7)."""
|
|
arch_data = c3_data.get("architecture", {})
|
|
|
|
if not arch_data:
|
|
return ""
|
|
|
|
content = "## 🏗️ Architecture Overview\n\n"
|
|
content += "*From C3.7 codebase analysis*\n\n"
|
|
|
|
# Architecture patterns
|
|
patterns = arch_data.get("patterns", [])
|
|
if patterns:
|
|
content += "**Architectural Patterns:**\n"
|
|
for pattern in patterns[:5]:
|
|
content += (
|
|
f"- {pattern.get('name', 'Unknown')}: {pattern.get('description', 'N/A')}\n"
|
|
)
|
|
content += "\n"
|
|
|
|
# Dependencies (C2.6)
|
|
dep_data = c3_data.get("dependency_graph", {})
|
|
if dep_data:
|
|
total_deps = dep_data.get("total_dependencies", 0)
|
|
circular = len(dep_data.get("circular_dependencies", []))
|
|
if total_deps > 0:
|
|
content += f"**Dependencies:** {total_deps} total"
|
|
if circular > 0:
|
|
content += f" (⚠️ {circular} circular dependencies detected)"
|
|
content += "\n\n"
|
|
|
|
content += "*See `references/codebase_analysis/ARCHITECTURE.md` for complete overview*\n\n"
|
|
return content
|
|
|
|
def _format_known_issues(self) -> str:
|
|
"""Format known issues from GitHub."""
|
|
issues = self.data.get("issues", [])
|
|
|
|
if not issues:
|
|
return ""
|
|
|
|
content = "## ⚠️ Known Issues\n\n"
|
|
content += "*Recent issues from GitHub*\n\n"
|
|
|
|
# Top 5 issues
|
|
for issue in issues[:5]:
|
|
title = issue.get("title", "Untitled")
|
|
number = issue.get("number", 0)
|
|
labels = ", ".join(issue.get("labels", []))
|
|
content += f"- **#{number}**: {title}"
|
|
if labels:
|
|
content += f" [`{labels}`]"
|
|
content += "\n"
|
|
|
|
content += "\n*See `references/issues.md` for complete list*\n\n"
|
|
return content
|
|
|
|
def _generate_references(self):
|
|
"""Generate all reference files."""
|
|
# README
|
|
if self.data.get("readme"):
|
|
readme_path = f"{self.skill_dir}/references/README.md"
|
|
with open(readme_path, "w", encoding="utf-8") as f:
|
|
f.write(self.data["readme"])
|
|
logger.info(f"Generated: {readme_path}")
|
|
|
|
# CHANGELOG
|
|
if self.data.get("changelog"):
|
|
changelog_path = f"{self.skill_dir}/references/CHANGELOG.md"
|
|
with open(changelog_path, "w", encoding="utf-8") as f:
|
|
f.write(self.data["changelog"])
|
|
logger.info(f"Generated: {changelog_path}")
|
|
|
|
# Issues
|
|
if self.data.get("issues"):
|
|
self._generate_issues_reference()
|
|
|
|
# Releases
|
|
if self.data.get("releases"):
|
|
self._generate_releases_reference()
|
|
|
|
# File structure
|
|
if self.data.get("file_tree"):
|
|
self._generate_file_structure_reference()
|
|
|
|
def _generate_issues_reference(self):
|
|
"""Generate issues.md reference file."""
|
|
issues = self.data["issues"]
|
|
|
|
content = f"# GitHub Issues\n\nRecent issues from the repository ({len(issues)} total).\n\n"
|
|
|
|
# Group by state
|
|
open_issues = [i for i in issues if i["state"] == "open"]
|
|
closed_issues = [i for i in issues if i["state"] == "closed"]
|
|
|
|
content += f"## Open Issues ({len(open_issues)})\n\n"
|
|
for issue in open_issues:
|
|
labels = ", ".join(issue["labels"]) if issue["labels"] else "No labels"
|
|
created_at = issue.get("created_at") or "N/A"
|
|
content += f"### #{issue['number']}: {issue['title']}\n"
|
|
content += f"**Labels:** {labels} | **Created:** {created_at[:10]}\n"
|
|
content += f"[View on GitHub]({issue['url']})\n\n"
|
|
|
|
content += f"\n## Recently Closed Issues ({len(closed_issues)})\n\n"
|
|
for issue in closed_issues:
|
|
labels = ", ".join(issue["labels"]) if issue["labels"] else "No labels"
|
|
closed_at = issue.get("closed_at") or "N/A"
|
|
content += f"### #{issue['number']}: {issue['title']}\n"
|
|
content += f"**Labels:** {labels} | **Closed:** {closed_at[:10]}\n"
|
|
content += f"[View on GitHub]({issue['url']})\n\n"
|
|
|
|
issues_path = f"{self.skill_dir}/references/issues.md"
|
|
with open(issues_path, "w", encoding="utf-8") as f:
|
|
f.write(content)
|
|
logger.info(f"Generated: {issues_path}")
|
|
|
|
def _generate_releases_reference(self):
|
|
"""Generate releases.md reference file."""
|
|
releases = self.data["releases"]
|
|
|
|
content = (
|
|
f"# Releases\n\nVersion history for this repository ({len(releases)} releases).\n\n"
|
|
)
|
|
|
|
for release in releases:
|
|
published_at = release.get("published_at") or "N/A"
|
|
release_name = release.get("name") or release["tag_name"]
|
|
release_body = release.get("body") or ""
|
|
content += f"## {release['tag_name']}: {release_name}\n"
|
|
content += f"**Published:** {published_at[:10]}\n"
|
|
if release["prerelease"]:
|
|
content += "**Pre-release**\n"
|
|
content += f"\n{release_body}\n\n"
|
|
content += f"[View on GitHub]({release['url']})\n\n---\n\n"
|
|
|
|
releases_path = f"{self.skill_dir}/references/releases.md"
|
|
with open(releases_path, "w", encoding="utf-8") as f:
|
|
f.write(content)
|
|
logger.info(f"Generated: {releases_path}")
|
|
|
|
def _generate_file_structure_reference(self):
|
|
"""Generate file_structure.md reference file."""
|
|
file_tree = self.data["file_tree"]
|
|
|
|
content = "# Repository File Structure\n\n"
|
|
content += f"Total items: {len(file_tree)}\n\n"
|
|
content += "```\n"
|
|
|
|
# Build tree structure
|
|
for item in file_tree:
|
|
indent = " " * item["path"].count("/")
|
|
icon = "📁" if item["type"] == "dir" else "📄"
|
|
content += f"{indent}{icon} {os.path.basename(item['path'])}\n"
|
|
|
|
content += "```\n"
|
|
|
|
structure_path = f"{self.skill_dir}/references/file_structure.md"
|
|
with open(structure_path, "w", encoding="utf-8") as f:
|
|
f.write(content)
|
|
logger.info(f"Generated: {structure_path}")
|
|
|
|
|
|
def setup_argument_parser() -> argparse.ArgumentParser:
    """Build the command-line argument parser for the github scraper.

    Every option is registered by skill_seekers.cli.arguments.github so
    that the standalone scraper and the unified CLI expose an identical
    argument surface.

    Returns:
        argparse.ArgumentParser: Fully configured parser.
    """
    usage_examples = """
Examples:
  skill-seekers github --repo facebook/react
  skill-seekers github --config configs/react_github.json
  skill-seekers github --repo owner/repo --token $GITHUB_TOKEN
"""

    parser = argparse.ArgumentParser(
        description="GitHub Repository to Claude Skill Converter",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=usage_examples,
    )

    # Shared argument definitions keep this entry point and the unified
    # CLI in sync — do not add options directly here.
    add_github_arguments(parser)

    return parser
|
|
|
|
|
|
def main():
    """C1.10: CLI tool entry point.

    Parses arguments, scrapes the GitHub repository (phase 1), builds the
    skill directory (phase 2), then optionally runs enhancement workflows
    and traditional AI enhancement (phase 3).

    Returns:
        int: 0 on success; exits with status 1 via sys.exit on failure.
    """
    parser = setup_argument_parser()
    args = parser.parse_args()

    setup_logging(verbose=getattr(args, "verbose", False), quiet=getattr(args, "quiet", False))

    # Handle --dry-run: print resolved settings and exit without touching
    # the network or filesystem.
    if getattr(args, "dry_run", False):
        repo = args.repo or (args.config and "(from config)")
        print(f"\n{'=' * 60}")
        # FIX: dropped redundant f-prefixes on placeholder-free strings.
        print("DRY RUN: GitHub Repository Analysis")
        print(f"{'=' * 60}")
        print(f"Repository: {repo}")
        print(f"Name: {getattr(args, 'name', None) or '(auto-detect)'}")
        print(f"Include issues: {not getattr(args, 'no_issues', False)}")
        print(f"Include releases: {not getattr(args, 'no_releases', False)}")
        print(f"Include changelog: {not getattr(args, 'no_changelog', False)}")
        print(f"Max issues: {getattr(args, 'max_issues', 100)}")
        print(f"Enhance level: {getattr(args, 'enhance_level', 0)}")
        print(f"Profile: {getattr(args, 'profile', None) or '(default)'}")
        print("\n✅ Dry run complete")
        return 0

    # Build config from a JSON file (--config) or directly from CLI args.
    if args.config:
        with open(args.config, encoding="utf-8") as f:
            config = json.load(f)
        # Override with CLI args if provided
        if args.non_interactive:
            config["interactive"] = False
        if args.profile:
            config["github_profile"] = args.profile
    elif args.repo:
        config = {
            "repo": args.repo,
            "name": args.name or args.repo.split("/")[-1],
            "description": args.description or f"Use when working with {args.repo.split('/')[-1]}",
            "github_token": args.token,
            "include_issues": not args.no_issues,
            "include_changelog": not args.no_changelog,
            "include_releases": not args.no_releases,
            "max_issues": args.max_issues,
            "interactive": not args.non_interactive,
            "github_profile": args.profile,
            "local_repo_path": getattr(args, "local_repo_path", None),
        }
    else:
        parser.error("Either --repo or --config is required")

    try:
        # Phase 1: Scrape GitHub repository
        scraper = GitHubScraper(config)
        scraper.scrape()

        if args.scrape_only:
            logger.info("Scrape complete (--scrape-only mode)")
            # FIX: return 0 explicitly, consistent with the dry-run path.
            return 0

        # Phase 2: Build skill
        converter = GitHubToSkillConverter(config)
        converter.build_skill()

        skill_name = config.get("name", config["repo"].split("/")[-1])
        skill_dir = f"output/{skill_name}"

        def _run_local_enhancement():
            """Run heuristic LOCAL enhancement (no API key required)."""
            from skill_seekers.cli.enhance_skill_local import LocalSkillEnhancer

            enhancer = LocalSkillEnhancer(Path(skill_dir))
            enhancer.run(headless=True)
            logger.info("✅ Local enhancement complete!")

        # ============================================================
        # WORKFLOW SYSTEM INTEGRATION (Phase 2 - github_scraper)
        # ============================================================
        from skill_seekers.cli.workflow_runner import run_workflows

        # Pass GitHub-specific context to workflows
        github_context = {
            "repo": config.get("repo", ""),
            "name": skill_name,
            "description": config.get("description", ""),
        }

        workflow_executed, workflow_names = run_workflows(args, context=github_context)
        workflow_name = ", ".join(workflow_names) if workflow_names else None

        # Phase 3: Optional enhancement with auto-detected mode
        # Note: Runs independently of workflow system (they complement each other)
        if getattr(args, "enhance_level", 0) > 0:
            # Auto-detect mode based on API key availability
            # (os and Path are imported at module level; the previous
            # redundant local imports were removed).
            api_key = args.api_key or os.environ.get("ANTHROPIC_API_KEY")
            mode = "API" if api_key else "LOCAL"

            logger.info("\n" + "=" * 80)
            logger.info(f"🤖 Traditional AI Enhancement ({mode} mode, level {args.enhance_level})")
            logger.info("=" * 80)
            if workflow_executed:
                logger.info(f" Running after workflow: {workflow_name}")
                logger.info(
                    " (Workflow provides specialized analysis, enhancement provides general improvements)"
                )
            logger.info("")

            if api_key:
                # API-based enhancement, falling back to LOCAL if the
                # anthropic package is not installed.
                try:
                    from skill_seekers.cli.enhance_skill import enhance_skill_md

                    enhance_skill_md(skill_dir, api_key)
                    logger.info("✅ API enhancement complete!")
                except ImportError:
                    logger.error("❌ API enhancement not available. Install: pip install anthropic")
                    logger.info("💡 Falling back to LOCAL mode...")
                    _run_local_enhancement()
            else:
                # LOCAL enhancement (no API key)
                _run_local_enhancement()

        logger.info(f"\n✅ Success! Skill created at: {skill_dir}/")

        # Only suggest enhancement if neither workflow nor traditional enhancement was done
        if not workflow_executed and getattr(args, "enhance_level", 0) == 0:
            logger.info("\n💡 Optional: Enhance SKILL.md with Claude:")
            logger.info(f" skill-seekers enhance {skill_dir}/ --enhance-level 2")
            logger.info(" (auto-detects API vs LOCAL mode based on ANTHROPIC_API_KEY)")
            logger.info("\n💡 Or use a workflow:")
            logger.info(
                f" skill-seekers github --repo {config['repo']} --enhance-workflow architecture-comprehensive"
            )

        logger.info(f"\nNext step: skill-seekers package {skill_dir}/")
        return 0

    except Exception as e:
        # Top-level boundary: report and exit non-zero for shell callers.
        logger.error(f"Error: {e}")
        sys.exit(1)
|
|
|
|
|
|
# Script entry point: run the CLI when executed directly (not on import).
if __name__ == "__main__":
    main()
|