#!/usr/bin/env python3 """ GitHub Repository to Claude Skill Converter (Tasks C1.1-C1.12) Converts GitHub repositories into Claude AI skills by extracting: - README and documentation - Code structure and signatures - GitHub Issues, Changelog, and Releases - Usage examples from tests Usage: skill-seekers github --repo facebook/react skill-seekers github --config configs/react_github.json skill-seekers github --repo owner/repo --token $GITHUB_TOKEN """ import argparse import json import logging import os import re import sys from pathlib import Path from typing import Any, Optional try: from github import Github, GithubException, Repository from github.GithubException import RateLimitExceededException except ImportError: print("Error: PyGithub not installed. Run: pip install PyGithub") sys.exit(1) from skill_seekers.cli.arguments.github import add_github_arguments # Try to import pathspec for .gitignore support try: import pathspec PATHSPEC_AVAILABLE = True except ImportError: PATHSPEC_AVAILABLE = False # Configure logging FIRST (before using logger) logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s") logger = logging.getLogger(__name__) # Import code analyzer for deep code analysis try: from .code_analyzer import CodeAnalyzer CODE_ANALYZER_AVAILABLE = True except ImportError: CODE_ANALYZER_AVAILABLE = False logger.warning("Code analyzer not available - deep analysis disabled") # Directories to exclude from local repository analysis EXCLUDED_DIRS = { # Virtual environments "venv", "env", ".venv", ".env", # Dependencies and caches "node_modules", "__pycache__", ".pytest_cache", # Version control ".git", ".svn", ".hg", # Build artifacts "build", "dist", "*.egg-info", # Coverage reports "htmlcov", ".coverage", # Testing environments ".tox", ".nox", # Linter caches ".mypy_cache", ".ruff_cache", # Unity (critical - contains massive build cache) "Library", "Temp", "Logs", "UserSettings", "MemoryCaptures", "Recordings", # Unreal Engine "Intermediate", "Saved", "DerivedDataCache", # Godot ".godot", ".import", # Misc "tmp", ".tmp", } def extract_description_from_readme(readme_content: str, repo_name: str) -> str: """ Extract a meaningful description from README content for skill description. Parses README to find the first meaningful paragraph that describes what the project does, suitable for "Use when..." format. Args: readme_content: README.md content repo_name: Repository name (e.g., 'facebook/react') Returns: Description string, or improved fallback if extraction fails """ if not readme_content: return f"Use when working with {repo_name.split('/')[-1]}" try: lines = readme_content.split("\n") # Skip badges, images, title - find first meaningful text paragraph meaningful_paragraph = None in_code_block = False for _i, line in enumerate(lines): stripped = line.strip() # Track code blocks if stripped.startswith("```"): in_code_block = not in_code_block continue # Skip if in code block if in_code_block: continue # Skip empty lines, badges, images, HTML if not stripped or stripped.startswith(("#", "!", "<", "[![", "[![")): continue # Skip lines that are just links or badges if stripped.startswith("[") and "](" in stripped and len(stripped) < 100: continue # Found a meaningful paragraph - take up to 200 chars if len(stripped) > 20: # Meaningful length meaningful_paragraph = stripped break if meaningful_paragraph: # Clean up and extract purpose # Remove markdown formatting clean = re.sub(r"\[([^\]]+)\]\([^\)]+\)", r"\1", meaningful_paragraph) # Links clean = re.sub(r"[*_`]", "", clean) # Bold, italic, code clean = re.sub(r"<[^>]+>", "", clean) # HTML tags # Truncate if too long (keep first sentence or ~150 chars) if ". " in clean: first_sentence = clean.split(". ")[0] + "." if len(first_sentence) < 200: clean = first_sentence if len(clean) > 150: clean = clean[:147] + "..." # Format as "Use when..." description # If it already starts with action words, use as-is action_words = ["build", "create", "develop", "work", "use", "implement", "manage"] if any(clean.lower().startswith(word) for word in action_words): return f"Use when {clean.lower()}" else: return f"Use when working with {clean.lower()}" except Exception as e: logger.debug(f"Could not extract description from README: {e}") # Improved fallback project_name = repo_name.split("/")[-1] return f"Use when working with {project_name}" class GitHubScraper: """ GitHub Repository Scraper (C1.1-C1.9) Extracts repository information for skill generation: - Repository structure - README files - Code comments and docstrings - Programming language detection - Function/class signatures - Test examples - GitHub Issues - CHANGELOG - Releases """ def __init__(self, config: dict[str, Any], local_repo_path: str | None = None): """Initialize GitHub scraper with configuration.""" self.config = config self.repo_name = config["repo"] self.name = config.get("name", self.repo_name.split("/")[-1]) # Set initial description (will be improved after README extraction if not in config) self.description = config.get( "description", f"Use when working with {self.repo_name.split('/')[-1]}" ) # Local repository path (optional - enables unlimited analysis) self.local_repo_path = local_repo_path or config.get("local_repo_path") if self.local_repo_path: self.local_repo_path = os.path.expanduser(self.local_repo_path) if not os.path.isdir(self.local_repo_path): logger.warning( f"local_repo_path does not exist or is not a directory: {self.local_repo_path}" ) logger.warning("Falling back to GitHub API mode (local_repo_path ignored)") self.local_repo_path = None else: logger.info(f"Local repository mode enabled: {self.local_repo_path}") # Configure directory exclusions (smart defaults + optional customization) self.excluded_dirs = set(EXCLUDED_DIRS) # Start with smart defaults # Option 1: Replace mode - Use only specified exclusions if "exclude_dirs" in config: self.excluded_dirs = set(config["exclude_dirs"]) logger.warning( f"Using custom directory exclusions ({len(self.excluded_dirs)} dirs) - defaults overridden" ) logger.debug(f"Custom exclusions: {sorted(self.excluded_dirs)}") # Option 2: Extend mode - Add to default exclusions elif "exclude_dirs_additional" in config: additional = set(config["exclude_dirs_additional"]) self.excluded_dirs = self.excluded_dirs.union(additional) logger.info( f"Added {len(additional)} custom directory exclusions (total: {len(self.excluded_dirs)})" ) logger.debug(f"Additional exclusions: {sorted(additional)}") # Load .gitignore for additional exclusions (C2.1) self.gitignore_spec = None if self.local_repo_path: self.gitignore_spec = self._load_gitignore() # GitHub client setup (C1.1) token = self._get_token() self.github = Github(token) if token else Github() self.repo: Repository.Repository | None = None # Options self.include_issues = config.get("include_issues", True) self.max_issues = config.get("max_issues", 100) self.include_changelog = config.get("include_changelog", True) self.include_releases = config.get("include_releases", True) self.include_code = config.get("include_code", False) self.code_analysis_depth = config.get( "code_analysis_depth", "surface" ) # 'surface', 'deep', 'full' self.file_patterns = config.get("file_patterns", []) # Initialize code analyzer if deep analysis requested self.code_analyzer = None if self.code_analysis_depth != "surface" and CODE_ANALYZER_AVAILABLE: self.code_analyzer = CodeAnalyzer(depth=self.code_analysis_depth) logger.info(f"Code analysis depth: {self.code_analysis_depth}") # Output paths self.skill_dir = f"output/{self.name}" self.data_file = f"output/{self.name}_github_data.json" # Extracted data storage self.extracted_data = { "repo_info": {}, "readme": "", "file_tree": [], "languages": {}, "signatures": [], "test_examples": [], "issues": [], "changelog": "", "releases": [], } def _get_token(self) -> str | None: """ Get GitHub token from env var or config (both options supported). Priority: GITHUB_TOKEN env var > config file > None """ # Try environment variable first (recommended) token = os.getenv("GITHUB_TOKEN") if token: logger.info("Using GitHub token from GITHUB_TOKEN environment variable") return token # Fall back to config file token = self.config.get("github_token") if token: logger.warning("Using GitHub token from config file (less secure)") return token logger.warning( "No GitHub token provided - using unauthenticated access (lower rate limits)" ) return None def scrape(self) -> dict[str, Any]: """ Main scraping entry point. Executes all C1 tasks in sequence. """ try: logger.info(f"Starting GitHub scrape for: {self.repo_name}") # C1.1: Fetch repository self._fetch_repository() # C1.2: Extract README self._extract_readme() # C1.3-C1.6: Extract code structure self._extract_code_structure() # C1.7: Extract Issues if self.include_issues: self._extract_issues() # C1.8: Extract CHANGELOG if self.include_changelog: self._extract_changelog() # C1.9: Extract Releases if self.include_releases: self._extract_releases() # Save extracted data self._save_data() logger.info(f"āœ… Scraping complete! Data saved to: {self.data_file}") return self.extracted_data except RateLimitExceededException: logger.error("GitHub API rate limit exceeded. Please wait or use authentication token.") raise except GithubException as e: logger.error(f"GitHub API error: {e}") raise except Exception as e: logger.error(f"Unexpected error during scraping: {e}") raise def _fetch_repository(self): """C1.1: Fetch repository structure using GitHub API.""" logger.info(f"Fetching repository: {self.repo_name}") try: self.repo = self.github.get_repo(self.repo_name) # Extract basic repo info self.extracted_data["repo_info"] = { "name": self.repo.name, "full_name": self.repo.full_name, "description": self.repo.description, "url": self.repo.html_url, "homepage": self.repo.homepage, "stars": self.repo.stargazers_count, "forks": self.repo.forks_count, "open_issues": self.repo.open_issues_count, "default_branch": self.repo.default_branch, "created_at": self.repo.created_at.isoformat() if self.repo.created_at else None, "updated_at": self.repo.updated_at.isoformat() if self.repo.updated_at else None, "language": self.repo.language, "license": self.repo.license.name if self.repo.license else None, "topics": self.repo.get_topics(), } logger.info( f"Repository fetched: {self.repo.full_name} ({self.repo.stargazers_count} stars)" ) except GithubException as e: if e.status == 404: raise ValueError(f"Repository not found: {self.repo_name}") from e raise def _get_file_content(self, file_path: str) -> str | None: """ Safely get file content, handling symlinks and encoding issues. Args: file_path: Path to file in repository Returns: File content as string, or None if file not found/error """ try: content = self.repo.get_contents(file_path) if not content: return None # Handle symlinks - follow the target to get actual file if hasattr(content, "type") and content.type == "symlink": target = getattr(content, "target", None) if target: target = target.strip() logger.debug(f"File {file_path} is a symlink to {target}, following...") try: content = self.repo.get_contents(target) except GithubException as e: logger.warning(f"Failed to follow symlink {file_path} -> {target}: {e}") return None else: logger.warning(f"Symlink {file_path} has no target") return None # Handle large files (encoding="none") - download via URL # GitHub API doesn't base64-encode files >1MB if hasattr(content, "encoding") and content.encoding in [None, "none"]: download_url = getattr(content, "download_url", None) file_size = getattr(content, "size", 0) if download_url: logger.info( f"File {file_path} is large ({file_size:,} bytes), downloading via URL..." ) try: import requests response = requests.get(download_url, timeout=30) response.raise_for_status() return response.text except Exception as e: logger.warning(f"Failed to download {file_path} from {download_url}: {e}") return None else: logger.warning( f"File {file_path} has no download URL (encoding={content.encoding})" ) return None # Handle regular files - decode content try: if isinstance(content.decoded_content, bytes): return content.decoded_content.decode("utf-8") else: return str(content.decoded_content) except (UnicodeDecodeError, AttributeError, LookupError, AssertionError) as e: logger.warning(f"Encoding issue with {file_path}: {e}") # Try alternative encoding try: if isinstance(content.decoded_content, bytes): return content.decoded_content.decode("latin-1") except Exception: return None return None except GithubException: return None except Exception as e: logger.warning(f"Error reading {file_path}: {e}") return None def _extract_readme(self): """C1.2: Extract README.md files.""" logger.info("Extracting README...") # Try common README locations readme_files = [ "README.md", "README.rst", "README.txt", "README", "docs/README.md", ".github/README.md", ] for readme_path in readme_files: readme_content = self._get_file_content(readme_path) if readme_content: self.extracted_data["readme"] = readme_content logger.info(f"README found: {readme_path}") # Update description if not explicitly set in config if "description" not in self.config: smart_description = extract_description_from_readme( self.extracted_data["readme"], self.repo_name ) self.description = smart_description logger.debug(f"Generated description: {self.description}") return logger.warning("No README found in repository") def _extract_code_structure(self): """ C1.3-C1.6: Extract code structure, languages, signatures, and test examples. Surface layer only - no full implementation code. """ logger.info("Extracting code structure...") # C1.4: Get language breakdown self._extract_languages() # Get file tree self._extract_file_tree() # Extract signatures and test examples if self.include_code: self._extract_signatures_and_tests() def _extract_languages(self): """C1.4: Detect programming languages in repository.""" logger.info("Detecting programming languages...") try: languages = self.repo.get_languages() total_bytes = sum(languages.values()) self.extracted_data["languages"] = { lang: { "bytes": bytes_count, "percentage": round((bytes_count / total_bytes) * 100, 2) if total_bytes > 0 else 0, } for lang, bytes_count in languages.items() } logger.info(f"Languages detected: {', '.join(languages.keys())}") except GithubException as e: logger.warning(f"Could not fetch languages: {e}") def should_exclude_dir(self, dir_name: str, dir_path: str = None) -> bool: """ Check if directory should be excluded from analysis. Args: dir_name: Directory name (e.g., "Examples & Extras") dir_path: Full relative path (e.g., "TextMesh Pro/Examples & Extras") Returns: True if directory should be excluded """ # Check directory name if dir_name in self.excluded_dirs or dir_name.startswith("."): return True # Check full path if provided (for nested exclusions like "TextMesh Pro/Examples & Extras") if dir_path: for excluded in self.excluded_dirs: # Match if path contains the exclusion pattern if excluded in dir_path or dir_path.startswith(excluded): return True # Check .gitignore rules if available (C2.1) if self.gitignore_spec and dir_path: # For directories, we need to check both with and without trailing slash # as .gitignore patterns can match either way dir_path_with_slash = dir_path if dir_path.endswith("/") else dir_path + "/" if self.gitignore_spec.match_file(dir_path) or self.gitignore_spec.match_file( dir_path_with_slash ): logger.debug(f"Directory excluded by .gitignore: {dir_path}") return True return False def _load_gitignore(self) -> Optional["pathspec.PathSpec"]: """ Load .gitignore file and create pathspec matcher (C2.1). Returns: PathSpec object if .gitignore found, None otherwise """ if not PATHSPEC_AVAILABLE: logger.warning("pathspec not installed - .gitignore support disabled") logger.warning("Install with: pip install pathspec") return None if not self.local_repo_path: return None gitignore_path = Path(self.local_repo_path) / ".gitignore" if not gitignore_path.exists(): logger.debug(f"No .gitignore found in {self.local_repo_path}") return None try: with open(gitignore_path, encoding="utf-8") as f: spec = pathspec.PathSpec.from_lines("gitwildmatch", f) logger.info(f"Loaded .gitignore from {gitignore_path}") return spec except Exception as e: logger.warning(f"Failed to load .gitignore: {e}") return None def _extract_file_tree(self): """Extract repository file tree structure (dual-mode: GitHub API or local filesystem).""" logger.info("Building file tree...") if self.local_repo_path: # Local filesystem mode - unlimited files self._extract_file_tree_local() else: # GitHub API mode - limited by API rate limits self._extract_file_tree_github() def _extract_file_tree_local(self): """Extract file tree from local filesystem (unlimited files).""" if not os.path.exists(self.local_repo_path): logger.error(f"Local repository path not found: {self.local_repo_path}") return # Log exclusions for debugging logger.info( f"Directory exclusions ({len(self.excluded_dirs)} total): {sorted(list(self.excluded_dirs)[:10])}" ) file_tree = [] excluded_count = 0 for root, dirs, files in os.walk(self.local_repo_path): # Calculate relative path from repo root first (needed for exclusion checks) rel_root = os.path.relpath(root, self.local_repo_path) if rel_root == ".": rel_root = "" # Exclude directories in-place to prevent os.walk from descending into them # Pass both dir name and full path for path-based exclusions filtered_dirs = [] for d in dirs: dir_path = os.path.join(rel_root, d) if rel_root else d if self.should_exclude_dir(d, dir_path): excluded_count += 1 logger.debug(f"Excluding directory: {dir_path}") else: filtered_dirs.append(d) dirs[:] = filtered_dirs # Add directories for dir_name in dirs: dir_path = os.path.join(rel_root, dir_name) if rel_root else dir_name file_tree.append({"path": dir_path, "type": "dir", "size": None}) # Add files for file_name in files: file_path = os.path.join(rel_root, file_name) if rel_root else file_name full_path = os.path.join(root, file_name) try: file_size = os.path.getsize(full_path) except OSError: file_size = None file_tree.append({"path": file_path, "type": "file", "size": file_size}) self.extracted_data["file_tree"] = file_tree logger.info( f"File tree built (local mode): {len(file_tree)} items ({excluded_count} directories excluded)" ) def _extract_file_tree_github(self): """Extract file tree from GitHub API (rate-limited).""" try: contents = self.repo.get_contents("") file_tree = [] while contents: file_content = contents.pop(0) file_info = { "path": file_content.path, "type": file_content.type, "size": file_content.size if file_content.type == "file" else None, } file_tree.append(file_info) if file_content.type == "dir": contents.extend(self.repo.get_contents(file_content.path)) self.extracted_data["file_tree"] = file_tree logger.info(f"File tree built (GitHub API mode): {len(file_tree)} items") except GithubException as e: logger.warning(f"Could not build file tree: {e}") def _extract_signatures_and_tests(self): """ C1.3, C1.5, C1.6: Extract signatures, docstrings, and test examples. Extraction depth depends on code_analysis_depth setting: - surface: File tree only (minimal) - deep: Parse files for signatures, parameters, types - full: Complete AST analysis (future enhancement) """ if self.code_analysis_depth == "surface": logger.info("Code extraction: Surface level (file tree only)") return if not self.code_analyzer: logger.warning("Code analyzer not available - skipping deep analysis") return logger.info(f"Extracting code signatures ({self.code_analysis_depth} analysis)...") # Get primary language for the repository languages = self.extracted_data.get("languages", {}) if not languages: logger.warning("No languages detected - skipping code analysis") return # Determine primary language primary_language = max(languages.items(), key=lambda x: x[1]["bytes"])[0] logger.info(f"Primary language: {primary_language}") # Determine file extensions to analyze extension_map = { "Python": [".py"], "JavaScript": [".js", ".jsx"], "TypeScript": [".ts", ".tsx"], "C": [".c", ".h"], "C++": [".cpp", ".hpp", ".cc", ".hh", ".cxx"], } extensions = extension_map.get(primary_language, []) if not extensions: logger.warning(f"No file extensions mapped for {primary_language}") return # Analyze files matching patterns and extensions analyzed_files = [] file_tree = self.extracted_data.get("file_tree", []) for file_info in file_tree: file_path = file_info["path"] # Check if file matches extension if not any(file_path.endswith(ext) for ext in extensions): continue # Check if file matches patterns (if specified) if self.file_patterns: import fnmatch if not any(fnmatch.fnmatch(file_path, pattern) for pattern in self.file_patterns): continue # Analyze this file try: # Read file content based on mode if self.local_repo_path: # Local mode - read from filesystem full_path = os.path.join(self.local_repo_path, file_path) with open(full_path, encoding="utf-8") as f: content = f.read() else: # GitHub API mode - fetch from API file_content = self.repo.get_contents(file_path) content = file_content.decoded_content.decode("utf-8") analysis_result = self.code_analyzer.analyze_file( file_path, content, primary_language ) if analysis_result and ( analysis_result.get("classes") or analysis_result.get("functions") ): analyzed_files.append( {"file": file_path, "language": primary_language, **analysis_result} ) logger.debug( f"Analyzed {file_path}: " f"{len(analysis_result.get('classes', []))} classes, " f"{len(analysis_result.get('functions', []))} functions" ) except Exception as e: logger.debug(f"Could not analyze {file_path}: {e}") continue # Limit number of files analyzed to avoid rate limits (GitHub API mode only) if not self.local_repo_path and len(analyzed_files) >= 50: logger.info("Reached analysis limit (50 files, GitHub API mode)") break self.extracted_data["code_analysis"] = { "depth": self.code_analysis_depth, "language": primary_language, "files_analyzed": len(analyzed_files), "files": analyzed_files, } # Calculate totals total_classes = sum(len(f.get("classes", [])) for f in analyzed_files) total_functions = sum(len(f.get("functions", [])) for f in analyzed_files) logger.info( f"Code analysis complete: {len(analyzed_files)} files, {total_classes} classes, {total_functions} functions" ) def _extract_issues(self): """C1.7: Extract GitHub Issues (open/closed, labels, milestones).""" logger.info(f"Extracting GitHub Issues (max {self.max_issues})...") try: # Fetch recent issues (open + closed) issues = self.repo.get_issues(state="all", sort="updated", direction="desc") issue_list = [] for issue in issues[: self.max_issues]: # Skip pull requests (they appear in issues) if issue.pull_request: continue issue_data = { "number": issue.number, "title": issue.title, "state": issue.state, "labels": [label.name for label in issue.labels], "milestone": issue.milestone.title if issue.milestone else None, "created_at": issue.created_at.isoformat() if issue.created_at else None, "updated_at": issue.updated_at.isoformat() if issue.updated_at else None, "closed_at": issue.closed_at.isoformat() if issue.closed_at else None, "url": issue.html_url, "body": issue.body[:500] if issue.body else None, # First 500 chars } issue_list.append(issue_data) self.extracted_data["issues"] = issue_list logger.info(f"Extracted {len(issue_list)} issues") except GithubException as e: logger.warning(f"Could not fetch issues: {e}") def _extract_changelog(self): """C1.8: Extract CHANGELOG.md and release notes.""" logger.info("Extracting CHANGELOG...") # Try common changelog locations changelog_files = [ "CHANGELOG.md", "CHANGES.md", "HISTORY.md", "CHANGELOG.rst", "CHANGELOG.txt", "CHANGELOG", "docs/CHANGELOG.md", ".github/CHANGELOG.md", ] for changelog_path in changelog_files: changelog_content = self._get_file_content(changelog_path) if changelog_content: self.extracted_data["changelog"] = changelog_content logger.info(f"CHANGELOG found: {changelog_path}") return logger.warning("No CHANGELOG found in repository") def _extract_releases(self): """C1.9: Extract GitHub Releases with version history.""" logger.info("Extracting GitHub Releases...") try: releases = self.repo.get_releases() release_list = [] for release in releases: release_data = { "tag_name": release.tag_name, "name": release.title, "body": release.body, "draft": release.draft, "prerelease": release.prerelease, "created_at": release.created_at.isoformat() if release.created_at else None, "published_at": release.published_at.isoformat() if release.published_at else None, "url": release.html_url, "tarball_url": release.tarball_url, "zipball_url": release.zipball_url, } release_list.append(release_data) self.extracted_data["releases"] = release_list logger.info(f"Extracted {len(release_list)} releases") except GithubException as e: logger.warning(f"Could not fetch releases: {e}") def _save_data(self): """Save extracted data to JSON file.""" os.makedirs("output", exist_ok=True) with open(self.data_file, "w", encoding="utf-8") as f: json.dump(self.extracted_data, f, indent=2, ensure_ascii=False) logger.info(f"Data saved to: {self.data_file}") class GitHubToSkillConverter: """ Convert extracted GitHub data to Claude skill format (C1.10). """ def __init__(self, config: dict[str, Any]): """Initialize converter with configuration.""" self.config = config self.name = config.get("name", config["repo"].split("/")[-1]) # Paths self.data_file = f"output/{self.name}_github_data.json" self.skill_dir = f"output/{self.name}" # Load extracted data self.data = self._load_data() # Set description (smart extraction from README if available) if "description" in config: self.description = config["description"] else: # Try to extract from README in loaded data readme_content = self.data.get("readme", "") repo_name = config["repo"] if readme_content: self.description = extract_description_from_readme(readme_content, repo_name) else: self.description = f"Use when working with {repo_name.split('/')[-1]}" def _load_data(self) -> dict[str, Any]: """Load extracted GitHub data from JSON.""" if not os.path.exists(self.data_file): raise FileNotFoundError(f"Data file not found: {self.data_file}") with open(self.data_file, encoding="utf-8") as f: return json.load(f) def build_skill(self): """Build complete skill structure.""" logger.info(f"Building skill for: {self.name}") # Create directories os.makedirs(self.skill_dir, exist_ok=True) os.makedirs(f"{self.skill_dir}/references", exist_ok=True) os.makedirs(f"{self.skill_dir}/scripts", exist_ok=True) os.makedirs(f"{self.skill_dir}/assets", exist_ok=True) # Generate SKILL.md self._generate_skill_md() # Generate reference files self._generate_references() logger.info(f"āœ… Skill built successfully: {self.skill_dir}/") def _generate_skill_md(self): """Generate main SKILL.md file (rich version with C3.x data if available).""" repo_info = self.data.get("repo_info", {}) c3_data = self.data.get("c3_analysis", {}) has_c3_data = bool(c3_data) # Generate skill name (lowercase, hyphens only, max 64 chars) skill_name = self.name.lower().replace("_", "-").replace(" ", "-")[:64] # Truncate description to 1024 chars if needed desc = self.description[:1024] if len(self.description) > 1024 else self.description # Build skill content skill_content = f"""--- name: {skill_name} description: {desc} --- # {repo_info.get("name", self.name)} {self.description} ## Description {repo_info.get("description", "GitHub repository skill")} **Repository:** [{repo_info.get("full_name", "N/A")}]({repo_info.get("url", "#")}) **Language:** {repo_info.get("language", "N/A")} **Stars:** {repo_info.get("stars", 0):,} **License:** {repo_info.get("license", "N/A")} ## When to Use This Skill Use this skill when you need to: - Understand how to use {repo_info.get("name", self.name)} - Look up API documentation and implementation details - Find real-world usage examples from the codebase - Review design patterns and architecture - Check for known issues or recent changes - Explore release history and changelogs """ # Add Quick Reference section (enhanced with C3.x if available) skill_content += "\n## ⚔ Quick Reference\n\n" # Repository info skill_content += "### Repository Info\n" skill_content += f"- **Homepage:** {repo_info.get('homepage', 'N/A')}\n" skill_content += f"- **Topics:** {', '.join(repo_info.get('topics', []))}\n" skill_content += f"- **Open Issues:** {repo_info.get('open_issues', 0)}\n" skill_content += f"- **Last Updated:** {repo_info.get('updated_at', 'N/A')[:10]}\n\n" # Languages skill_content += "### Languages\n" skill_content += self._format_languages() + "\n\n" # Add C3.x pattern summary if available if has_c3_data and c3_data.get("patterns"): skill_content += self._format_pattern_summary(c3_data) # Add code examples if available (C3.2 test examples) if has_c3_data and c3_data.get("test_examples"): skill_content += self._format_code_examples(c3_data) # Add API Reference if available (C2.5) if has_c3_data and c3_data.get("api_reference"): skill_content += self._format_api_reference(c3_data) # Add Architecture Overview if available (C3.7) if has_c3_data and c3_data.get("architecture"): skill_content += self._format_architecture(c3_data) # Add Known Issues section skill_content += self._format_known_issues() # Add Recent Releases skill_content += "### Recent Releases\n" skill_content += self._format_recent_releases() + "\n\n" # Available References skill_content += "## šŸ“– Available References\n\n" skill_content += "- `references/README.md` - Complete README documentation\n" skill_content += "- `references/CHANGELOG.md` - Version history and changes\n" skill_content += "- `references/issues.md` - Recent GitHub issues\n" skill_content += "- `references/releases.md` - Release notes\n" skill_content += "- `references/file_structure.md` - Repository structure\n" if has_c3_data: skill_content += "\n### Codebase Analysis References\n\n" if c3_data.get("patterns"): skill_content += ( "- `references/codebase_analysis/patterns/` - Design patterns detected\n" ) if c3_data.get("test_examples"): skill_content += ( "- `references/codebase_analysis/examples/` - Test examples extracted\n" ) if c3_data.get("config_patterns"): skill_content += ( "- `references/codebase_analysis/configuration/` - Configuration analysis\n" ) if c3_data.get("architecture"): skill_content += ( "- `references/codebase_analysis/ARCHITECTURE.md` - Architecture overview\n" ) # Usage skill_content += "\n## šŸ’» Usage\n\n" skill_content += "See README.md for complete usage instructions and examples.\n\n" # Footer skill_content += "---\n\n" if has_c3_data: skill_content += "**Generated by Skill Seeker** | GitHub Repository Scraper with C3.x Codebase Analysis\n" else: skill_content += "**Generated by Skill Seeker** | GitHub Repository Scraper\n" # Write to file skill_path = f"{self.skill_dir}/SKILL.md" with open(skill_path, "w", encoding="utf-8") as f: f.write(skill_content) line_count = len(skill_content.split("\n")) logger.info(f"Generated: {skill_path} ({line_count} lines)") def _format_languages(self) -> str: """Format language breakdown.""" languages = self.data.get("languages", {}) if not languages: return "No language data available" lines = [] for lang, info in sorted(languages.items(), key=lambda x: x[1]["bytes"], reverse=True): lines.append(f"- **{lang}:** {info['percentage']:.1f}%") return "\n".join(lines) def _format_recent_releases(self) -> str: """Format recent releases (top 3).""" releases = self.data.get("releases", []) if not releases: return "No releases available" lines = [] for release in releases[:3]: lines.append( f"- **{release['tag_name']}** ({release['published_at'][:10]}): {release['name']}" ) return "\n".join(lines) def _format_pattern_summary(self, c3_data: dict[str, Any]) -> str: """Format design patterns summary (C3.1).""" patterns_data = c3_data.get("patterns", []) if not patterns_data: return "" # Count patterns by type (deduplicate by class, keep highest confidence) pattern_counts = {} by_class = {} for pattern_file in patterns_data: for pattern in pattern_file.get("patterns", []): ptype = pattern.get("pattern_type", "Unknown") cls = pattern.get("class_name", "") confidence = pattern.get("confidence", 0) # Skip low confidence if confidence < 0.7: continue # Deduplicate by class key = f"{cls}:{ptype}" if key not in by_class or by_class[key]["confidence"] < confidence: by_class[key] = pattern # Count by type pattern_counts[ptype] = pattern_counts.get(ptype, 0) + 1 if not pattern_counts: return "" content = "### Design Patterns Detected\n\n" content += "*From C3.1 codebase analysis (confidence > 0.7)*\n\n" # Top 5 pattern types for ptype, count in sorted(pattern_counts.items(), key=lambda x: x[1], reverse=True)[:5]: content += f"- **{ptype}**: {count} instances\n" content += f"\n*Total: {len(by_class)} high-confidence patterns*\n\n" return content def _format_code_examples(self, c3_data: dict[str, Any]) -> str: """Format code examples (C3.2).""" examples_data = c3_data.get("test_examples", {}) examples = examples_data.get("examples", []) if not examples: return "" # Filter high-value examples (complexity > 0.7) high_value = [ex for ex in examples if ex.get("complexity_score", 0) > 0.7] if not high_value: return "" content = "## šŸ“ Code Examples\n\n" content += "*High-quality examples from codebase (C3.2)*\n\n" # Top 10 examples for ex in sorted(high_value, key=lambda x: x.get("complexity_score", 0), reverse=True)[:10]: desc = ex.get("description", "Example") lang = ex.get("language", "python") code = ex.get("code", "") complexity = ex.get("complexity_score", 0) content += f"**{desc}** (complexity: {complexity:.2f})\n\n" content += f"```{lang}\n{code}\n```\n\n" return content def _format_api_reference(self, c3_data: dict[str, Any]) -> str: """Format API reference (C2.5).""" api_ref = c3_data.get("api_reference", {}) if not api_ref: return "" content = "## šŸ”§ API Reference\n\n" content += "*Extracted from codebase analysis (C2.5)*\n\n" # Top 5 modules for module_name, module_md in list(api_ref.items())[:5]: content += f"### {module_name}\n\n" # First 500 chars of module documentation content += module_md[:500] if len(module_md) > 500: content += "...\n\n" else: content += "\n\n" content += "*See `references/codebase_analysis/api_reference/` for complete API docs*\n\n" return content def _format_architecture(self, c3_data: dict[str, Any]) -> str: """Format architecture overview (C3.7).""" arch_data = c3_data.get("architecture", {}) if not arch_data: return "" content = "## šŸ—ļø Architecture Overview\n\n" content += "*From C3.7 codebase analysis*\n\n" # Architecture patterns patterns = arch_data.get("patterns", []) if patterns: content += "**Architectural Patterns:**\n" for pattern in patterns[:5]: content += ( f"- {pattern.get('name', 'Unknown')}: {pattern.get('description', 'N/A')}\n" ) content += "\n" # Dependencies (C2.6) dep_data = c3_data.get("dependency_graph", {}) if dep_data: total_deps = dep_data.get("total_dependencies", 0) circular = len(dep_data.get("circular_dependencies", [])) if total_deps > 0: content += f"**Dependencies:** {total_deps} total" if circular > 0: content += f" (āš ļø {circular} circular dependencies detected)" content += "\n\n" content += "*See `references/codebase_analysis/ARCHITECTURE.md` for complete overview*\n\n" return content def _format_known_issues(self) -> str: """Format known issues from GitHub.""" issues = self.data.get("issues", []) if not issues: return "" content = "## āš ļø Known Issues\n\n" content += "*Recent issues from GitHub*\n\n" # Top 5 issues for issue in issues[:5]: title = issue.get("title", "Untitled") number = issue.get("number", 0) labels = ", ".join(issue.get("labels", [])) content += f"- **#{number}**: {title}" if labels: content += f" [`{labels}`]" content += "\n" content += "\n*See `references/issues.md` for complete list*\n\n" return content def _generate_references(self): """Generate all reference files.""" # README if self.data.get("readme"): readme_path = f"{self.skill_dir}/references/README.md" with open(readme_path, "w", encoding="utf-8") as f: f.write(self.data["readme"]) logger.info(f"Generated: {readme_path}") # CHANGELOG if self.data.get("changelog"): changelog_path = f"{self.skill_dir}/references/CHANGELOG.md" with open(changelog_path, "w", encoding="utf-8") as f: f.write(self.data["changelog"]) logger.info(f"Generated: {changelog_path}") # Issues if self.data.get("issues"): self._generate_issues_reference() # Releases if self.data.get("releases"): self._generate_releases_reference() # File structure if self.data.get("file_tree"): self._generate_file_structure_reference() def _generate_issues_reference(self): """Generate issues.md reference file.""" issues = self.data["issues"] content = f"# GitHub Issues\n\nRecent issues from the repository ({len(issues)} total).\n\n" # Group by state open_issues = [i for i in issues if i["state"] == "open"] closed_issues = [i for i in issues if i["state"] == "closed"] content += f"## Open Issues ({len(open_issues)})\n\n" for issue in open_issues[:20]: labels = ", ".join(issue["labels"]) if issue["labels"] else "No labels" content += f"### #{issue['number']}: {issue['title']}\n" content += f"**Labels:** {labels} | **Created:** {issue['created_at'][:10]}\n" content += f"[View on GitHub]({issue['url']})\n\n" content += f"\n## Recently Closed Issues ({len(closed_issues)})\n\n" for issue in closed_issues[:10]: labels = ", ".join(issue["labels"]) if issue["labels"] else "No labels" content += f"### #{issue['number']}: {issue['title']}\n" content += f"**Labels:** {labels} | **Closed:** {issue['closed_at'][:10]}\n" content += f"[View on GitHub]({issue['url']})\n\n" issues_path = f"{self.skill_dir}/references/issues.md" with open(issues_path, "w", encoding="utf-8") as f: f.write(content) logger.info(f"Generated: {issues_path}") def _generate_releases_reference(self): """Generate releases.md reference file.""" releases = self.data["releases"] content = ( f"# Releases\n\nVersion history for this repository ({len(releases)} releases).\n\n" ) for release in releases: content += f"## {release['tag_name']}: {release['name']}\n" content += f"**Published:** {release['published_at'][:10]}\n" if release["prerelease"]: content += "**Pre-release**\n" content += f"\n{release['body']}\n\n" content += f"[View on GitHub]({release['url']})\n\n---\n\n" releases_path = f"{self.skill_dir}/references/releases.md" with open(releases_path, "w", encoding="utf-8") as f: f.write(content) logger.info(f"Generated: {releases_path}") def _generate_file_structure_reference(self): """Generate file_structure.md reference file.""" file_tree = self.data["file_tree"] content = "# Repository File Structure\n\n" content += f"Total items: {len(file_tree)}\n\n" content += "```\n" # Build tree structure for item in file_tree: indent = " " * item["path"].count("/") icon = "šŸ“" if item["type"] == "dir" else "šŸ“„" content += f"{indent}{icon} {os.path.basename(item['path'])}\n" content += "```\n" structure_path = f"{self.skill_dir}/references/file_structure.md" with open(structure_path, "w", encoding="utf-8") as f: f.write(content) logger.info(f"Generated: {structure_path}") def setup_argument_parser() -> argparse.ArgumentParser: """Setup and configure command-line argument parser. Creates an ArgumentParser with all CLI options for the github scraper. All arguments are defined in skill_seekers.cli.arguments.github to ensure consistency between the standalone scraper and unified CLI. Returns: argparse.ArgumentParser: Configured argument parser """ parser = argparse.ArgumentParser( description="GitHub Repository to Claude Skill Converter", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=""" Examples: skill-seekers github --repo facebook/react skill-seekers github --config configs/react_github.json skill-seekers github --repo owner/repo --token $GITHUB_TOKEN """, ) # Add all github arguments from shared definitions # This ensures the standalone scraper and unified CLI stay in sync add_github_arguments(parser) return parser def main(): """C1.10: CLI tool entry point.""" parser = setup_argument_parser() args = parser.parse_args() # Build config from args or file if args.config: with open(args.config, encoding="utf-8") as f: config = json.load(f) # Override with CLI args if provided if args.non_interactive: config["interactive"] = False if args.profile: config["github_profile"] = args.profile elif args.repo: config = { "repo": args.repo, "name": args.name or args.repo.split("/")[-1], "description": args.description or f"Use when working with {args.repo.split('/')[-1]}", "github_token": args.token, "include_issues": not args.no_issues, "include_changelog": not args.no_changelog, "include_releases": not args.no_releases, "max_issues": args.max_issues, "interactive": not args.non_interactive, "github_profile": args.profile, "local_repo_path": getattr(args, "local_repo_path", None), } else: parser.error("Either --repo or --config is required") try: # Phase 1: Scrape GitHub repository scraper = GitHubScraper(config) scraper.scrape() if args.scrape_only: logger.info("Scrape complete (--scrape-only mode)") return # Phase 2: Build skill converter = GitHubToSkillConverter(config) converter.build_skill() skill_name = config.get("name", config["repo"].split("/")[-1]) skill_dir = f"output/{skill_name}" # ============================================================ # WORKFLOW SYSTEM INTEGRATION (Phase 2 - github_scraper) # ============================================================ from skill_seekers.cli.workflow_runner import run_workflows # Pass GitHub-specific context to workflows github_context = { "repo": config.get("repo", ""), "name": skill_name, "description": config.get("description", ""), } workflow_executed, workflow_names = run_workflows(args, context=github_context) workflow_name = ", ".join(workflow_names) if workflow_names else None # Phase 3: Optional enhancement with auto-detected mode # Note: Runs independently of workflow system (they complement each other) if getattr(args, "enhance_level", 0) > 0: import os # Auto-detect mode based on API key availability api_key = args.api_key or os.environ.get("ANTHROPIC_API_KEY") mode = "API" if api_key else "LOCAL" logger.info("\n" + "=" * 80) logger.info(f"šŸ¤– Traditional AI Enhancement ({mode} mode, level {args.enhance_level})") logger.info("=" * 80) if workflow_executed: logger.info(f" Running after workflow: {workflow_name}") logger.info( " (Workflow provides specialized analysis, enhancement provides general improvements)" ) logger.info("") if api_key: # API-based enhancement try: from skill_seekers.cli.enhance_skill import enhance_skill_md enhance_skill_md(skill_dir, api_key) logger.info("āœ… API enhancement complete!") except ImportError: logger.error("āŒ API enhancement not available. Install: pip install anthropic") logger.info("šŸ’” Falling back to LOCAL mode...") # Fall back to LOCAL mode from pathlib import Path from skill_seekers.cli.enhance_skill_local import LocalSkillEnhancer enhancer = LocalSkillEnhancer(Path(skill_dir)) enhancer.run(headless=True) logger.info("āœ… Local enhancement complete!") else: # LOCAL enhancement (no API key) from pathlib import Path from skill_seekers.cli.enhance_skill_local import LocalSkillEnhancer enhancer = LocalSkillEnhancer(Path(skill_dir)) enhancer.run(headless=True) logger.info("āœ… Local enhancement complete!") logger.info(f"\nāœ… Success! Skill created at: {skill_dir}/") # Only suggest enhancement if neither workflow nor traditional enhancement was done if not workflow_executed and getattr(args, "enhance_level", 0) == 0: logger.info("\nšŸ’” Optional: Enhance SKILL.md with Claude:") logger.info(f" skill-seekers enhance {skill_dir}/ --enhance-level 2") logger.info(" (auto-detects API vs LOCAL mode based on ANTHROPIC_API_KEY)") logger.info("\nšŸ’” Or use a workflow:") logger.info( f" skill-seekers github --repo {config['repo']} --enhance-workflow architecture-comprehensive" ) logger.info(f"\nNext step: skill-seekers package {skill_dir}/") except Exception as e: logger.error(f"Error: {e}") sys.exit(1) if __name__ == "__main__": main()