#!/usr/bin/env python3
"""
Codebase Scraper CLI Tool

Standalone tool for analyzing local codebases without the GitHub API.
Extracts code signatures, comments, and optionally generates API documentation.

Usage:
    codebase-scraper --directory /path/to/repo --output output/codebase/
    codebase-scraper --directory . --depth deep --languages Python,JavaScript
    codebase-scraper --directory /path/to/repo --enhance-level 2

Features:
- File tree walking with .gitignore support
- Multi-language code analysis (Python, JavaScript/TypeScript, C/C++, C#, Go,
  Rust, Java, Ruby, PHP)
- API reference generation
- Comment extraction
- Dependency graph analysis
- Configurable depth levels

Credits:
- Language parsing patterns inspired by official language specifications
- NetworkX for dependency graph analysis: https://networkx.org/
- pathspec for .gitignore support: https://pypi.org/project/pathspec/
"""

from __future__ import annotations  # keep pathspec annotations lazy when pathspec is absent

import argparse
import fnmatch
import json
import logging
import os
import re
import shutil
import sys
from pathlib import Path
from typing import Any

# Add parent directory to path for imports
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from skill_seekers.cli.api_reference_builder import APIReferenceBuilder
from skill_seekers.cli.code_analyzer import CodeAnalyzer
from skill_seekers.cli.config_extractor import ConfigExtractor
from skill_seekers.cli.dependency_analyzer import DependencyAnalyzer

# Try to import pathspec for .gitignore support
try:
    import pathspec

    PATHSPEC_AVAILABLE = True
except ImportError:
    PATHSPEC_AVAILABLE = False

# Configure logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)

# Language extension mapping
LANGUAGE_EXTENSIONS = {
    ".py": "Python",
    ".js": "JavaScript",
    ".jsx": "JavaScript",
    ".ts": "TypeScript",
    ".tsx": "TypeScript",
    ".cpp": "C++",
    ".cc": "C++",
    ".cxx": "C++",
    ".h": "C++",
    ".hpp": "C++",
    ".hxx": "C++",
    ".c": "C",
    ".cs": "C#",
    ".go": "Go",
    ".rs": "Rust",
    ".java": "Java",
    ".rb": "Ruby",
    ".php": "PHP",
}

# Markdown extension mapping
MARKDOWN_EXTENSIONS = {".md", ".markdown", ".mdown", ".mkd"}

# Common documentation folders to scan
DOC_FOLDERS = {"docs", "doc", "documentation", "wiki", ".github"}

# Root-level doc files → category mapping
ROOT_DOC_CATEGORIES = {
    "readme": "overview",
    "contributing": "contributing",
    "changelog": "changelog",
    "history": "changelog",
    "license": "license",
    "authors": "authors",
    "code_of_conduct": "community",
    "security": "security",
    "architecture": "architecture",
    "design": "architecture",
}

# Folder name → category mapping
FOLDER_CATEGORIES = {
    "architecture": "architecture",
    "arch": "architecture",
    "design": "architecture",
    "guides": "guides",
    "guide": "guides",
    "tutorials": "guides",
    "tutorial": "guides",
    "howto": "guides",
    "how-to": "guides",
    "workflows": "workflows",
    "workflow": "workflows",
    "templates": "templates",
    "template": "templates",
    "api": "api",
    "reference": "api",
    "examples": "examples",
    "example": "examples",
    "specs": "specifications",
    "spec": "specifications",
    "rfcs": "specifications",
    "rfc": "specifications",
    "features": "features",
    "feature": "features",
}

# Default directories to exclude (entries may be glob patterns, e.g. "*.egg-info")
DEFAULT_EXCLUDED_DIRS = {
    "node_modules",
    "venv",
    "__pycache__",
    ".git",
    ".svn",
    ".hg",
    "build",
    "dist",
    "target",
    ".pytest_cache",
    ".tox",
    ".mypy_cache",
    "htmlcov",
    "coverage",
    ".coverage",
    ".eggs",
    "*.egg-info",
    ".idea",
    ".vscode",
    ".vs",
    "__pypackages__",
}


def detect_language(file_path: Path) -> str:
    """
    Detect programming language from file extension.

    Args:
        file_path: Path to source file

    Returns:
        Language name or 'Unknown'
    """
    extension = file_path.suffix.lower()
    return LANGUAGE_EXTENSIONS.get(extension, "Unknown")
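

# Example (illustrative): detection is purely extension-based, so C/C++ headers
# map to "C++" and unrecognized extensions fall back to "Unknown":
#
#     >>> detect_language(Path("src/parser.hpp"))
#     'C++'
#     >>> detect_language(Path("notes.txt"))
#     'Unknown'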


def load_gitignore(directory: Path) -> pathspec.PathSpec | None:
    """
    Load .gitignore file and create pathspec matcher.

    Args:
        directory: Root directory to search for .gitignore

    Returns:
        PathSpec object if .gitignore found, None otherwise
    """
    if not PATHSPEC_AVAILABLE:
        logger.warning("pathspec not installed - .gitignore support disabled")
        logger.warning("Install with: pip install pathspec")
        return None

    gitignore_path = directory / ".gitignore"
    if not gitignore_path.exists():
        logger.debug(f"No .gitignore found in {directory}")
        return None

    try:
        with open(gitignore_path, encoding="utf-8") as f:
            spec = pathspec.PathSpec.from_lines("gitwildmatch", f)
        logger.info(f"Loaded .gitignore from {gitignore_path}")
        return spec
    except Exception as e:
        logger.warning(f"Failed to load .gitignore: {e}")
        return None


def should_exclude_dir(dir_name: str, excluded_dirs: set) -> bool:
    """
    Check if directory should be excluded from analysis.

    Args:
        dir_name: Directory name
        excluded_dirs: Set of directory names (or glob patterns) to exclude

    Returns:
        True if directory should be excluded
    """
    # fnmatch is used so glob entries like "*.egg-info" actually match real
    # directory names; a plain membership test would never match them.
    return any(fnmatch.fnmatch(dir_name, pattern) for pattern in excluded_dirs)


def walk_directory(
    root: Path,
    patterns: list[str] | None = None,
    gitignore_spec: pathspec.PathSpec | None = None,
    excluded_dirs: set | None = None,
) -> list[Path]:
    """
    Walk directory tree and collect source files.

    Args:
        root: Root directory to walk
        patterns: Optional file patterns to include (e.g., ['*.py', '*.js'])
        gitignore_spec: Optional PathSpec object for .gitignore rules
        excluded_dirs: Set of directory names to exclude

    Returns:
        List of source file paths
    """
    if excluded_dirs is None:
        excluded_dirs = DEFAULT_EXCLUDED_DIRS

    files = []
    root = Path(root).resolve()

    for dirpath, dirnames, filenames in os.walk(root):
        current_dir = Path(dirpath)

        # Filter out excluded directories (in-place modification)
        dirnames[:] = [d for d in dirnames if not should_exclude_dir(d, excluded_dirs)]

        for filename in filenames:
            file_path = current_dir / filename

            # Check .gitignore rules
            if gitignore_spec:
                try:
                    rel_path = file_path.relative_to(root)
                    if gitignore_spec.match_file(str(rel_path)):
                        logger.debug(f"Skipping (gitignore): {rel_path}")
                        continue
                except ValueError:
                    # File is outside root, skip it
                    continue

            # Check file extension
            if file_path.suffix.lower() not in LANGUAGE_EXTENSIONS:
                continue

            # Check file patterns if provided
            if patterns and not any(file_path.match(pattern) for pattern in patterns):
                continue

            files.append(file_path)

    return sorted(files)
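

# Minimal usage sketch (paths are illustrative): collect Python sources while
# honoring .gitignore and the default exclusions.
#
#     repo = Path("myrepo").resolve()
#     spec = load_gitignore(repo)
#     sources = walk_directory(repo, patterns=["*.py"], gitignore_spec=spec)
#     for src in sources[:5]:
#         print(src.relative_to(repo))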


def walk_markdown_files(
    root: Path,
    gitignore_spec: pathspec.PathSpec | None = None,
    excluded_dirs: set | None = None,
) -> list[Path]:
    """
    Walk directory tree and collect markdown documentation files.

    Args:
        root: Root directory to walk
        gitignore_spec: Optional PathSpec object for .gitignore rules
        excluded_dirs: Set of directory names to exclude

    Returns:
        List of markdown file paths
    """
    if excluded_dirs is None:
        excluded_dirs = DEFAULT_EXCLUDED_DIRS

    files = []
    root = Path(root).resolve()

    for dirpath, dirnames, filenames in os.walk(root):
        current_dir = Path(dirpath)

        # Filter out excluded directories (in-place modification)
        dirnames[:] = [d for d in dirnames if not should_exclude_dir(d, excluded_dirs)]

        for filename in filenames:
            file_path = current_dir / filename

            # Check .gitignore rules
            if gitignore_spec:
                try:
                    rel_path = file_path.relative_to(root)
                    if gitignore_spec.match_file(str(rel_path)):
                        logger.debug(f"Skipping (gitignore): {rel_path}")
                        continue
                except ValueError:
                    continue

            # Check if markdown file
            if file_path.suffix.lower() not in MARKDOWN_EXTENSIONS:
                continue

            files.append(file_path)

    return sorted(files)


def categorize_markdown_file(file_path: Path, root: Path) -> str:
    """
    Categorize a markdown file based on its location and filename.

    Args:
        file_path: Path to the markdown file
        root: Root directory of the project

    Returns:
        Category name (e.g., 'overview', 'guides', 'architecture')
    """
    try:
        rel_path = file_path.relative_to(root)
    except ValueError:
        return "other"

    # Check root-level files by filename
    if len(rel_path.parts) == 1:
        filename_lower = file_path.stem.lower().replace("-", "_").replace(" ", "_")
        for key, category in ROOT_DOC_CATEGORIES.items():
            if key in filename_lower:
                return category
        return "overview"  # Default for root .md files

    # Check folder-based categorization
    for part in rel_path.parts[:-1]:  # Exclude filename
        part_lower = part.lower().replace("-", "_").replace(" ", "_")
        for key, category in FOLDER_CATEGORIES.items():
            if key in part_lower:
                return category

    # Default category
    return "other"


def extract_markdown_structure(content: str) -> dict[str, Any]:
    """
    Extract structure from markdown content (headers, code blocks, links).

    Args:
        content: Markdown file content

    Returns:
        Dictionary with extracted structure
    """
    structure = {
        "title": None,
        "headers": [],
        "code_blocks": [],
        "links": [],
        "word_count": len(content.split()),
        "line_count": len(content.split("\n")),
    }

    lines = content.split("\n")

    # Extract headers
    for i, line in enumerate(lines):
        header_match = re.match(r"^(#{1,6})\s+(.+)$", line)
        if header_match:
            level = len(header_match.group(1))
            text = header_match.group(2).strip()
            structure["headers"].append(
                {
                    "level": level,
                    "text": text,
                    "line": i + 1,
                }
            )
            # First h1 is the title
            if level == 1 and structure["title"] is None:
                structure["title"] = text

    # Extract code blocks (fenced)
    code_block_pattern = re.compile(r"```(\w*)\n(.*?)```", re.DOTALL)
    for match in code_block_pattern.finditer(content):
        language = match.group(1) or "text"
        code = match.group(2).strip()
        if len(code) > 0:
            structure["code_blocks"].append(
                {
                    "language": language,
                    "code": code[:500],  # Truncate long code blocks
                    "full_length": len(code),
                }
            )

    # Extract links
    link_pattern = re.compile(r"\[([^\]]+)\]\(([^)]+)\)")
    for match in link_pattern.finditer(content):
        structure["links"].append(
            {
                "text": match.group(1),
                "url": match.group(2),
            }
        )

    return structure
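

# Illustrative result shape for extract_markdown_structure (values depend on
# the input document):
#
#     extract_markdown_structure("# Title\n\nIntro\n\n## Usage\n")
#     # -> {"title": "Title",
#     #     "headers": [{"level": 1, "text": "Title", "line": 1},
#     #                 {"level": 2, "text": "Usage", "line": 5}],
#     #     "code_blocks": [], "links": [],
#     #     "word_count": 5, "line_count": 6}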


def generate_markdown_summary(
    content: str, structure: dict[str, Any], max_length: int = 500
) -> str:
    """
    Generate a summary of markdown content.

    Args:
        content: Full markdown content
        structure: Extracted structure from extract_markdown_structure()
        max_length: Maximum summary length

    Returns:
        Summary string
    """
    # Start with title if available
    summary_parts = []

    if structure.get("title"):
        summary_parts.append(f"**{structure['title']}**")

    # Add header outline (first 5 h2/h3 headers)
    h2_h3 = [h for h in structure.get("headers", []) if h["level"] in (2, 3)][:5]
    if h2_h3:
        sections = [h["text"] for h in h2_h3]
        summary_parts.append(f"Sections: {', '.join(sections)}")

    # Extract first paragraph (skip headers and empty lines)
    lines = content.split("\n")
    first_para = []
    in_para = False
    for line in lines:
        stripped = line.strip()
        if stripped.startswith("#") or stripped.startswith("```"):
            if in_para:
                break
            continue
        if stripped:
            in_para = True
            first_para.append(stripped)
        elif in_para:
            break

    if first_para:
        para_text = " ".join(first_para)
        if len(para_text) > 200:
            para_text = para_text[:200] + "..."
        summary_parts.append(para_text)

    # Add stats
    stats = f"({structure.get('word_count', 0)} words, {len(structure.get('code_blocks', []))} code blocks)"
    summary_parts.append(stats)

    summary = "\n".join(summary_parts)
    if len(summary) > max_length:
        summary = summary[:max_length] + "..."

    return summary
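

# Minimal sketch of the two-step pipeline (output is illustrative and depends
# on the document):
#
#     md = Path("README.md").read_text(encoding="utf-8")
#     structure = extract_markdown_structure(md)
#     print(generate_markdown_summary(md, structure))
#     # **My Project**
#     # Sections: Install, Usage
#     # A short intro paragraph...
#     # (245 words, 3 code blocks)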


def process_markdown_docs(
    directory: Path,
    output_dir: Path,
    depth: str = "deep",
    gitignore_spec: pathspec.PathSpec | None = None,
    enhance_with_ai: bool = False,
    ai_mode: str = "none",
) -> dict[str, Any]:
    """
    Process all markdown documentation files in a directory.

    Args:
        directory: Root directory to scan
        output_dir: Output directory for processed docs
        depth: Processing depth ('surface', 'deep', 'full')
        gitignore_spec: Optional .gitignore spec
        enhance_with_ai: Whether to use AI enhancement
        ai_mode: AI mode ('none', 'auto', 'api', 'local')

    Returns:
        Dictionary with processed documentation data
    """
    logger.info("Scanning for markdown documentation...")

    # Find all markdown files
    md_files = walk_markdown_files(directory, gitignore_spec)
    logger.info(f"Found {len(md_files)} markdown files")

    if not md_files:
        return {"files": [], "categories": {}, "total_files": 0}

    # Process each file
    processed_docs = []
    categories = {}

    for md_path in md_files:
        try:
            content = md_path.read_text(encoding="utf-8", errors="ignore")
            rel_path = str(md_path.relative_to(directory))
            category = categorize_markdown_file(md_path, directory)

            doc_data = {
                "path": rel_path,
                "filename": md_path.name,
                "category": category,
                "size_bytes": len(content.encode("utf-8")),
            }

            # Surface depth: just path and category
            if depth == "surface":
                processed_docs.append(doc_data)
            else:
                # Deep/Full: extract structure and summary
                structure = extract_markdown_structure(content)
                summary = generate_markdown_summary(content, structure)

                doc_data.update(
                    {
                        "title": structure.get("title") or md_path.stem,
                        "structure": structure,
                        "summary": summary,
                        "content": content if depth == "full" else None,
                    }
                )
                processed_docs.append(doc_data)

            # Track categories
            if category not in categories:
                categories[category] = []
            categories[category].append(rel_path)

        except Exception as e:
            logger.warning(f"Failed to process {md_path}: {e}")
            continue

    # AI Enhancement (if enabled and enhance_level >= 2)
    if enhance_with_ai and ai_mode != "none" and processed_docs:
        logger.info("🤖 Enhancing documentation analysis with AI...")
        try:
            processed_docs = _enhance_docs_with_ai(processed_docs, ai_mode)
            logger.info("✅ AI documentation enhancement complete")
        except Exception as e:
            logger.warning(f"⚠️ AI enhancement failed: {e}")

    # Save processed docs to output
    docs_output_dir = output_dir / "documentation"
    docs_output_dir.mkdir(parents=True, exist_ok=True)

    # Copy files organized by category
    for doc in processed_docs:
        try:
            src_path = directory / doc["path"]
            category = doc["category"]
            category_dir = docs_output_dir / category
            category_dir.mkdir(parents=True, exist_ok=True)

            # Copy file to category folder
            dest_path = category_dir / doc["filename"]
            shutil.copy2(src_path, dest_path)
        except Exception as e:
            logger.debug(f"Failed to copy {doc['path']}: {e}")

    # Save documentation index
    index_data = {
        "total_files": len(processed_docs),
        "categories": categories,
        "files": processed_docs,
    }

    index_json = docs_output_dir / "documentation_index.json"
    with open(index_json, "w", encoding="utf-8") as f:
        json.dump(index_data, f, indent=2, default=str)

    logger.info(
        f"✅ Processed {len(processed_docs)} documentation files in {len(categories)} categories"
    )
    logger.info(f"📁 Saved to: {docs_output_dir}")

    return index_data


def _enhance_docs_with_ai(docs: list[dict], ai_mode: str) -> list[dict]:
    """
    Enhance documentation analysis with AI.

    Args:
        docs: List of processed document dictionaries
        ai_mode: AI mode ('auto', 'api', or 'local')

    Returns:
        Enhanced document list
    """
    # Try API mode first
    if ai_mode in ("api", "auto"):
        api_key = os.environ.get("ANTHROPIC_API_KEY")
        if api_key:
            return _enhance_docs_api(docs, api_key)

    # Fall back to LOCAL mode
    if ai_mode in ("local", "auto"):
        return _enhance_docs_local(docs)

    return docs
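

# Mode resolution at a glance (follows the checks above):
#   ai_mode="auto" + ANTHROPIC_API_KEY set -> _enhance_docs_api (Claude API)
#   ai_mode="auto" + no key                -> _enhance_docs_local (claude CLI)
#   ai_mode="api"  + no key                -> falls through, docs unchanged
#   ai_mode="none"                         -> docs unchanged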


def _enhance_docs_api(docs: list[dict], api_key: str) -> list[dict]:
    """Enhance docs using Claude API."""
    try:
        import anthropic

        client = anthropic.Anthropic(api_key=api_key)

        # Batch documents for efficiency
        batch_size = 10
        for i in range(0, len(docs), batch_size):
            batch = docs[i : i + batch_size]

            # Create prompt for batch
            docs_text = "\n\n".join(
                [
                    f"## {d.get('title', d['filename'])}\nCategory: {d['category']}\nSummary: {d.get('summary', 'N/A')}"
                    for d in batch
                    if d.get("summary")
                ]
            )

            if not docs_text:
                continue

            prompt = f"""Analyze these documentation files and provide:
1. A brief description of what each document covers
2. Key topics/concepts mentioned
3. How they relate to each other

Documents:
{docs_text}

Return JSON with format:
{{"enhancements": [{{"filename": "...", "description": "...", "key_topics": [...], "related_to": [...]}}]}}"""

            response = client.messages.create(
                model="claude-sonnet-4-20250514",
                max_tokens=2000,
                messages=[{"role": "user", "content": prompt}],
            )

            # Parse response and merge enhancements
            try:
                json_match = re.search(r"\{.*\}", response.content[0].text, re.DOTALL)
                if json_match:
                    enhancements = json.loads(json_match.group())
                    for enh in enhancements.get("enhancements", []):
                        for doc in batch:
                            if doc["filename"] == enh.get("filename"):
                                doc["ai_description"] = enh.get("description")
                                doc["ai_topics"] = enh.get("key_topics", [])
                                doc["ai_related"] = enh.get("related_to", [])
            except Exception:
                pass

    except Exception as e:
        logger.warning(f"API enhancement failed: {e}")

    return docs


def _enhance_docs_local(docs: list[dict]) -> list[dict]:
    """Enhance docs using Claude Code CLI (LOCAL mode)."""
    import subprocess

    # Prepare batch of docs for enhancement
    docs_with_summary = [d for d in docs if d.get("summary")]
    if not docs_with_summary:
        return docs

    docs_text = "\n\n".join(
        [
            f"## {d.get('title', d['filename'])}\nCategory: {d['category']}\nPath: {d['path']}\nSummary: {d.get('summary', 'N/A')}"
            for d in docs_with_summary[:20]  # Limit to 20 docs
        ]
    )

    prompt = f"""Analyze these documentation files from a codebase and provide insights.

For each document, provide:
1. A brief description of what it covers
2. Key topics/concepts
3. Related documents

Documents:
{docs_text}

Output JSON only:
{{"enhancements": [{{"filename": "...", "description": "...", "key_topics": ["..."], "related_to": ["..."]}}]}}"""

    try:
        # The prompt is passed directly on the command line via -p;
        # no temp file is required.
        result = subprocess.run(
            ["claude", "--dangerously-skip-permissions", "-p", prompt],
            capture_output=True,
            text=True,
            timeout=120,
        )

        if result.returncode == 0 and result.stdout:
            json_match = re.search(r"\{.*\}", result.stdout, re.DOTALL)
            if json_match:
                enhancements = json.loads(json_match.group())
                for enh in enhancements.get("enhancements", []):
                    for doc in docs:
                        if doc["filename"] == enh.get("filename"):
                            doc["ai_description"] = enh.get("description")
                            doc["ai_topics"] = enh.get("key_topics", [])
                            doc["ai_related"] = enh.get("related_to", [])

    except Exception as e:
        logger.warning(f"LOCAL enhancement failed: {e}")

    return docs
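

# Example of the payload both enhancement paths try to parse out of the model
# output (illustrative; real responses vary):
#
# {"enhancements": [
#     {"filename": "README.md",
#      "description": "Project overview and install steps",
#      "key_topics": ["installation", "quick start"],
#      "related_to": ["CONTRIBUTING.md"]}
# ]}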


def analyze_codebase(
    directory: Path,
    output_dir: Path,
    depth: str = "deep",
    languages: list[str] | None = None,
    file_patterns: list[str] | None = None,
    build_api_reference: bool = True,
    extract_comments: bool = True,
    build_dependency_graph: bool = True,
    detect_patterns: bool = True,
    extract_test_examples: bool = True,
    build_how_to_guides: bool = True,
    extract_config_patterns: bool = True,
    extract_docs: bool = True,
    enhance_level: int = 0,
) -> dict[str, Any]:
    """
    Analyze local codebase and extract code knowledge.

    Args:
        directory: Directory to analyze
        output_dir: Output directory for results
        depth: Analysis depth (surface, deep, full)
        languages: Optional list of languages to analyze
        file_patterns: Optional file patterns to include
        build_api_reference: Generate API reference markdown
        extract_comments: Extract inline comments (accepted but not currently used here)
        build_dependency_graph: Generate dependency graph and detect circular dependencies
        detect_patterns: Detect design patterns (Singleton, Factory, Observer, etc.)
        extract_test_examples: Extract usage examples from test files
        build_how_to_guides: Build how-to guides from workflow examples (C3.3)
        extract_config_patterns: Extract configuration patterns from config files (C3.4)
        extract_docs: Extract and process markdown documentation files (default: True)
        enhance_level: AI enhancement level (0=off, 1=SKILL.md only, 2=+config+arch+docs, 3=full)

    Returns:
        Analysis results dictionary
    """
    # Determine AI enhancement settings based on level
    # Level 0: No AI enhancement
    # Level 1: SKILL.md only (handled in main.py)
    # Level 2: Architecture + Config AI enhancement
    # Level 3: Full AI enhancement (patterns, tests, config, architecture)
    enhance_patterns = enhance_level >= 3
    enhance_tests = enhance_level >= 3
    enhance_config = enhance_level >= 2
    enhance_architecture = enhance_level >= 2
    ai_mode = "auto" if enhance_level > 0 else "none"

    if enhance_level > 0:
        level_names = {1: "SKILL.md only", 2: "SKILL.md+Architecture+Config", 3: "full"}
        logger.info(
            f"🤖 AI Enhancement Level: {enhance_level} ({level_names.get(enhance_level, 'unknown')})"
        )

    # Resolve directory to absolute path to avoid relative_to() errors
    directory = Path(directory).resolve()
    logger.info(f"Analyzing codebase: {directory}")
    logger.info(f"Depth: {depth}")

    # Create output directory
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Load .gitignore
    gitignore_spec = load_gitignore(directory)

    # Walk directory tree
    logger.info("Scanning directory tree...")
    files = walk_directory(directory, patterns=file_patterns, gitignore_spec=gitignore_spec)
    logger.info(f"Found {len(files)} source files")

    # Filter by language if specified
    if languages:
        language_set = set(languages)
        files = [f for f in files if detect_language(f) in language_set]
        logger.info(f"Filtered to {len(files)} files for languages: {', '.join(languages)}")

    # Initialize code analyzer
    analyzer = CodeAnalyzer(depth=depth)

    # Analyze each file
    results = {"files": []}
    analyzed_count = 0

    for file_path in files:
        try:
            content = file_path.read_text(encoding="utf-8", errors="ignore")
            language = detect_language(file_path)

            if language == "Unknown":
                continue

            # Analyze file
            analysis = analyzer.analyze_file(str(file_path), content, language)

            # Only include files with actual analysis results
            if analysis and (analysis.get("classes") or analysis.get("functions")):
                results["files"].append(
                    {
                        "file": str(file_path.relative_to(directory)),
                        "language": language,
                        **analysis,
                    }
                )
                analyzed_count += 1

                if analyzed_count % 10 == 0:
                    logger.info(f"Analyzed {analyzed_count}/{len(files)} files...")

        except Exception as e:
            logger.warning(f"Error analyzing {file_path}: {e}")
            continue

    logger.info(f"✅ Successfully analyzed {analyzed_count} files")

    # Save results
    output_json = output_dir / "code_analysis.json"
    with open(output_json, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2)

    logger.info(f"📁 Saved analysis to: {output_json}")

    # Build API reference if requested
    if build_api_reference and results["files"]:
        logger.info("Building API reference documentation...")
        builder = APIReferenceBuilder(results)
        api_output_dir = output_dir / "api_reference"
        generated_files = builder.build_reference(api_output_dir)
        logger.info(f"✅ Generated {len(generated_files)} API reference files")
        logger.info(f"📁 API reference: {api_output_dir}")
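
    # Illustrative shape of code_analysis.json (per-file keys beyond "file" and
    # "language" come from CodeAnalyzer, e.g. "classes"/"functions"):
    # {"files": [{"file": "src/app.py", "language": "Python",
    #             "classes": [...], "functions": [...]}]}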

    # Build dependency graph if requested (C2.6)
    if build_dependency_graph:
        logger.info("Building dependency graph...")
        dep_analyzer = DependencyAnalyzer()

        # Analyze dependencies for all files
        for file_path in files:
            try:
                content = file_path.read_text(encoding="utf-8", errors="ignore")
                language = detect_language(file_path)

                if language != "Unknown":
                    # Use relative path from directory for better graph readability
                    rel_path = str(file_path.relative_to(directory))
                    dep_analyzer.analyze_file(rel_path, content, language)

            except Exception as e:
                logger.warning(f"Error analyzing dependencies for {file_path}: {e}")
                continue

        # Build the graph
        graph = dep_analyzer.build_graph()

        # Detect circular dependencies
        cycles = dep_analyzer.detect_cycles()

        if cycles:
            logger.warning(f"⚠️ Found {len(cycles)} circular dependencies:")
            for i, cycle in enumerate(cycles[:5], 1):  # Show first 5
                cycle_str = " → ".join(cycle) + f" → {cycle[0]}"
                logger.warning(f"  {i}. {cycle_str}")
            if len(cycles) > 5:
                logger.warning(f"  ... and {len(cycles) - 5} more")
        else:
            logger.info("✅ No circular dependencies found")

        # Save dependency graph data
        dep_output_dir = output_dir / "dependencies"
        dep_output_dir.mkdir(parents=True, exist_ok=True)

        # Export as JSON
        dep_json = dep_output_dir / "dependency_graph.json"
        with open(dep_json, "w", encoding="utf-8") as f:
            json.dump(dep_analyzer.export_json(), f, indent=2)
        logger.info(f"📁 Saved dependency graph: {dep_json}")

        # Export as Mermaid diagram
        mermaid_file = dep_output_dir / "dependency_graph.mmd"
        mermaid_file.write_text(dep_analyzer.export_mermaid())
        logger.info(f"📁 Saved Mermaid diagram: {mermaid_file}")

        # Save statistics
        stats = dep_analyzer.get_statistics()
        stats_file = dep_output_dir / "statistics.json"
        with open(stats_file, "w", encoding="utf-8") as f:
            json.dump(stats, f, indent=2)

        logger.info(
            f"📊 Statistics: {stats['total_files']} files, "
            f"{stats['total_dependencies']} dependencies, "
            f"{stats['circular_dependencies']} cycles"
        )

        # Try to export as DOT (requires pydot)
        try:
            dot_file = dep_output_dir / "dependency_graph.dot"
            dep_analyzer.export_dot(str(dot_file))
        except Exception:
            pass  # pydot not installed, skip DOT export
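
        # The dependency_graph.mmd written above can be rendered to an image
        # with any Mermaid tool, e.g. (assuming @mermaid-js/mermaid-cli is
        # installed):
        #   mmdc -i dependency_graph.mmd -o dependency_graph.svg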

    # Detect design patterns if requested (C3.1)
    if detect_patterns:
        logger.info("Detecting design patterns...")
        from skill_seekers.cli.pattern_recognizer import PatternRecognizer

        pattern_recognizer = PatternRecognizer(depth=depth, enhance_with_ai=enhance_patterns)
        pattern_results = []

        for file_path in files:
            try:
                content = file_path.read_text(encoding="utf-8", errors="ignore")
                language = detect_language(file_path)

                if language != "Unknown":
                    report = pattern_recognizer.analyze_file(str(file_path), content, language)
                    if report.patterns:
                        pattern_results.append(report.to_dict())

            except Exception as e:
                logger.warning(f"Pattern detection failed for {file_path}: {e}")
                continue

        # Save pattern results
        if pattern_results:
            pattern_output = output_dir / "patterns"
            pattern_output.mkdir(parents=True, exist_ok=True)

            pattern_json = pattern_output / "detected_patterns.json"
            with open(pattern_json, "w", encoding="utf-8") as f:
                json.dump(pattern_results, f, indent=2)

            total_patterns = sum(len(r["patterns"]) for r in pattern_results)
            logger.info(f"✅ Detected {total_patterns} patterns in {len(pattern_results)} files")
            logger.info(f"📁 Saved to: {pattern_json}")
        else:
            logger.info("No design patterns detected")

    # Extract test examples if requested (C3.2)
    if extract_test_examples:
        logger.info("Extracting usage examples from test files...")
        from skill_seekers.cli.test_example_extractor import TestExampleExtractor

        # Create extractor
        test_extractor = TestExampleExtractor(
            min_confidence=0.5,
            max_per_file=10,
            languages=languages,
            enhance_with_ai=enhance_tests,
        )

        # Extract examples from directory
        try:
            example_report = test_extractor.extract_from_directory(directory, recursive=True)

            if example_report.total_examples > 0:
                # Save results
                examples_output = output_dir / "test_examples"
                examples_output.mkdir(parents=True, exist_ok=True)

                # Save as JSON
                examples_json = examples_output / "test_examples.json"
                with open(examples_json, "w", encoding="utf-8") as f:
                    json.dump(example_report.to_dict(), f, indent=2)

                # Save as Markdown
                examples_md = examples_output / "test_examples.md"
                examples_md.write_text(example_report.to_markdown(), encoding="utf-8")

                logger.info(
                    f"✅ Extracted {example_report.total_examples} test examples "
                    f"({example_report.high_value_count} high-value)"
                )
                logger.info(f"📁 Saved to: {examples_output}")
            else:
                logger.info("No test examples extracted")

        except Exception as e:
            logger.warning(f"Test example extraction failed: {e}")
            example_report = None

    # Build how-to guides from workflow examples (C3.3)
    if build_how_to_guides and extract_test_examples:
        logger.info("Building how-to guides from workflow examples...")
        try:
            from skill_seekers.cli.how_to_guide_builder import HowToGuideBuilder

            # Create guide builder (uses same enhance level as test examples)
            guide_builder = HowToGuideBuilder(enhance_with_ai=enhance_tests)

            # Build guides from workflow examples
            tutorials_dir = output_dir / "tutorials"

            # Get workflow examples from the example_report if available
            if (
                "example_report" in locals()
                and example_report
                and example_report.total_examples > 0
            ):
                # Convert example_report to list of dicts for processing
                examples_list = example_report.to_dict().get("examples", [])

                guide_collection = guide_builder.build_guides_from_examples(
                    examples_list,
                    grouping_strategy="ai-tutorial-group",
                    output_dir=tutorials_dir,
                    enhance_with_ai=enhance_tests,
                    ai_mode=ai_mode,
                )

                if guide_collection and guide_collection.total_guides > 0:
                    # Save collection summary
                    collection_json = tutorials_dir / "guide_collection.json"
                    with open(collection_json, "w", encoding="utf-8") as f:
                        json.dump(guide_collection.to_dict(), f, indent=2)

                    logger.info(f"✅ Built {guide_collection.total_guides} how-to guides")
                    logger.info(f"📁 Saved to: {tutorials_dir}")
                else:
                    logger.info("No how-to guides generated (insufficient workflow examples)")
            else:
                logger.info("No workflow examples available for guide generation")

        except Exception as e:
            logger.warning(f"How-to guide building failed: {e}")

    # Extract configuration patterns (C3.4)
    if extract_config_patterns:
        logger.info("Extracting configuration patterns...")
        try:
            config_extractor = ConfigExtractor()

            # Extract config patterns from directory
            extraction_result = config_extractor.extract_from_directory(directory)

            if extraction_result.config_files:
                # Convert to dict for enhancement
                result_dict = config_extractor.to_dict(extraction_result)

                # AI Enhancement (if enabled - level 2+)
                if enhance_config and ai_mode != "none":
                    try:
                        from skill_seekers.cli.config_enhancer import ConfigEnhancer

                        logger.info(f"🤖 Enhancing config analysis with AI (mode: {ai_mode})...")
                        enhancer = ConfigEnhancer(mode=ai_mode)
                        result_dict = enhancer.enhance_config_result(result_dict)
                        logger.info("✅ AI enhancement complete")
                    except Exception as e:
                        logger.warning(f"⚠️ Config AI enhancement failed: {e}")

                # Save results
                config_output = output_dir / "config_patterns"
                config_output.mkdir(parents=True, exist_ok=True)

                # Save as JSON
                config_json = config_output / "config_patterns.json"
                with open(config_json, "w", encoding="utf-8") as f:
                    json.dump(result_dict, f, indent=2)

                # Save as Markdown (basic - AI enhancements in JSON only for now)
                config_md = config_output / "config_patterns.md"
                config_md.write_text(extraction_result.to_markdown(), encoding="utf-8")

                # Count total settings across all files
                total_settings = sum(len(cf.settings) for cf in extraction_result.config_files)
                total_patterns = sum(len(cf.patterns) for cf in extraction_result.config_files)

                logger.info(
                    f"✅ Extracted {len(extraction_result.config_files)} config files "
                    f"with {total_settings} settings and {total_patterns} detected patterns"
                )

                if "ai_enhancements" in result_dict:
                    insights = result_dict["ai_enhancements"].get("overall_insights", {})
                    if insights.get("security_issues_found"):
                        logger.info(
                            f"🔐 Security issues found: {insights['security_issues_found']}"
                        )

                logger.info(f"📁 Saved to: {config_output}")
            else:
                logger.info("No configuration files found")

        except Exception as e:
            logger.warning(f"Config pattern extraction failed: {e}")

    # Detect architectural patterns (C3.7)
    # Always run this - it provides high-level overview
    logger.info("Analyzing architectural patterns...")
    from skill_seekers.cli.architectural_pattern_detector import ArchitecturalPatternDetector

    arch_detector = ArchitecturalPatternDetector(enhance_with_ai=enhance_architecture)
    arch_report = arch_detector.analyze(directory, results["files"])

    if arch_report.patterns:
        arch_output = output_dir / "architecture"
        arch_output.mkdir(parents=True, exist_ok=True)

        # Save as JSON
        arch_json = arch_output / "architectural_patterns.json"
        with open(arch_json, "w", encoding="utf-8") as f:
            json.dump(arch_report.to_dict(), f, indent=2)

        logger.info(f"🏗️ Detected {len(arch_report.patterns)} architectural patterns")
        for pattern in arch_report.patterns:
            logger.info(f"  - {pattern.pattern_name} (confidence: {pattern.confidence:.2f})")
        logger.info(f"📁 Saved to: {arch_json}")
    else:
        logger.info("No clear architectural patterns detected")

    # Extract markdown documentation (C3.9)
    docs_data = None
    if extract_docs:
        logger.info("Extracting project documentation...")
        try:
            # Determine AI enhancement for docs (level 2+)
            enhance_docs_ai = enhance_level >= 2

            docs_data = process_markdown_docs(
                directory=directory,
                output_dir=output_dir,
                depth=depth,
                gitignore_spec=gitignore_spec,
                enhance_with_ai=enhance_docs_ai,
                ai_mode=ai_mode,
            )

            if docs_data and docs_data.get("total_files", 0) > 0:
                logger.info(
                    f"✅ Extracted {docs_data['total_files']} documentation files "
                    f"in {len(docs_data.get('categories', {}))} categories"
                )
            else:
                logger.info("No markdown documentation files found")

        except Exception as e:
            logger.warning(f"Documentation extraction failed: {e}")
            docs_data = None

    # Generate SKILL.md and references/ directory
    logger.info("Generating SKILL.md and references...")
    _generate_skill_md(
        output_dir=output_dir,
        directory=directory,
        results=results,
        depth=depth,
        build_api_reference=build_api_reference,
        build_dependency_graph=build_dependency_graph,
        detect_patterns=detect_patterns,
        extract_test_examples=extract_test_examples,
        extract_config_patterns=extract_config_patterns,
        extract_docs=extract_docs,
        docs_data=docs_data,
    )

    return results
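

# Library-style usage sketch (assumes the package is importable; paths are
# illustrative):
#
#     results = analyze_codebase(
#         directory=Path("myrepo"),
#         output_dir=Path("output/codebase"),
#         depth="deep",
#         languages=["Python"],
#         enhance_level=0,
#     )
#     print(len(results["files"]), "files analyzed")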


def _generate_skill_md(
    output_dir: Path,
    directory: Path,
    results: dict[str, Any],
    depth: str,
    build_api_reference: bool,
    build_dependency_graph: bool,
    detect_patterns: bool,
    extract_test_examples: bool,
    extract_config_patterns: bool,
    extract_docs: bool = True,
    docs_data: dict[str, Any] | None = None,
):
    """
    Generate rich SKILL.md from codebase analysis results.

    Creates a 300+ line skill file with:
    - Front matter (name, description)
    - Repository info (path, languages, file count)
    - When to Use section
    - Quick Reference (patterns, languages, stats)
    - Code Examples (from test files)
    - API Reference (from code analysis)
    - Architecture Overview
    - Configuration Patterns
    - Available References
    """
    repo_name = directory.name

    # Generate skill name (lowercase, hyphens only, max 64 chars)
    skill_name = repo_name.lower().replace("_", "-").replace(" ", "-")[:64]

    # Generate description
    description = f"Local codebase analysis for {repo_name}"

    # Count files by language
    language_stats = _get_language_stats(results.get("files", []))
    total_files = len(results.get("files", []))

    # Start building content
    skill_content = f"""---
name: {skill_name}
description: {description}
---

# {repo_name} Codebase

## Description

Local codebase analysis and documentation generated from code analysis.

**Path:** `{directory}`
**Files Analyzed:** {total_files}
**Languages:** {", ".join(language_stats.keys())}
**Analysis Depth:** {depth}

## When to Use This Skill

Use this skill when you need to:
- Understand the codebase architecture and design patterns
- Find implementation examples and usage patterns
- Review API documentation extracted from code
- Check configuration patterns and best practices
- Explore test examples and real-world usage
- Navigate the codebase structure efficiently

## ⚡ Quick Reference

### Codebase Statistics

"""

    # Language breakdown
    skill_content += "**Languages:**\n"
    for lang, count in sorted(language_stats.items(), key=lambda x: x[1], reverse=True):
        percentage = (count / total_files * 100) if total_files > 0 else 0
        skill_content += f"- **{lang}**: {count} files ({percentage:.1f}%)\n"
    skill_content += "\n"

    # Analysis features performed
    skill_content += "**Analysis Performed:**\n"
    if build_api_reference:
        skill_content += "- ✅ API Reference (C2.5)\n"
    if build_dependency_graph:
        skill_content += "- ✅ Dependency Graph (C2.6)\n"
    if detect_patterns:
        skill_content += "- ✅ Design Patterns (C3.1)\n"
    if extract_test_examples:
        skill_content += "- ✅ Test Examples (C3.2)\n"
    if extract_config_patterns:
        skill_content += "- ✅ Configuration Patterns (C3.4)\n"
    skill_content += "- ✅ Architectural Analysis (C3.7)\n"
    if extract_docs:
        skill_content += "- ✅ Project Documentation (C3.9)\n"
    skill_content += "\n"

    # Add design patterns if available
    if detect_patterns:
        patterns_content = _format_patterns_section(output_dir)
        if patterns_content:
            skill_content += patterns_content

    # Add code examples if available
    if extract_test_examples:
        examples_content = _format_examples_section(output_dir)
        if examples_content:
            skill_content += examples_content

    # Add API reference if available
    if build_api_reference:
        api_content = _format_api_section(output_dir)
        if api_content:
            skill_content += api_content

    # Add architecture if available
    arch_content = _format_architecture_section(output_dir)
    if arch_content:
        skill_content += arch_content

    # Add configuration patterns if available
    if extract_config_patterns:
        config_content = _format_config_section(output_dir)
        if config_content:
            skill_content += config_content

    # Add project documentation if available
    if extract_docs and docs_data:
        docs_content = _format_documentation_section(output_dir, docs_data)
        if docs_content:
            skill_content += docs_content
"api_reference").exists(): skill_content += ( "- **API Reference**: `references/api_reference/` - Complete API documentation\n" ) refs_added = True if build_dependency_graph and (output_dir / "dependencies").exists(): skill_content += ( "- **Dependencies**: `references/dependencies/` - Dependency graph and analysis\n" ) refs_added = True if detect_patterns and (output_dir / "patterns").exists(): skill_content += "- **Patterns**: `references/patterns/` - Detected design patterns\n" refs_added = True if extract_test_examples and (output_dir / "test_examples").exists(): skill_content += "- **Examples**: `references/test_examples/` - Usage examples from tests\n" refs_added = True if extract_config_patterns and (output_dir / "config_patterns").exists(): skill_content += ( "- **Configuration**: `references/config_patterns/` - Configuration patterns\n" ) refs_added = True if (output_dir / "architecture").exists(): skill_content += "- **Architecture**: `references/architecture/` - Architectural patterns\n" refs_added = True if extract_docs and (output_dir / "documentation").exists(): skill_content += ( "- **Documentation**: `references/documentation/` - Project documentation\n" ) refs_added = True if not refs_added: skill_content += "No additional references generated (analysis features disabled).\n" skill_content += "\n" # Footer skill_content += "---\n\n" skill_content += "**Generated by Skill Seeker** | Codebase Analyzer with C3.x Analysis\n" # Write SKILL.md skill_path = output_dir / "SKILL.md" skill_path.write_text(skill_content, encoding="utf-8") line_count = len(skill_content.split("\n")) logger.info(f"✅ Generated SKILL.md: {skill_path} ({line_count} lines)") # Generate references/ directory structure _generate_references(output_dir) def _get_language_stats(files: list[dict]) -> dict[str, int]: """Count files by language from analysis results.""" stats = {} for file_data in files: # files is a list of dicts with 'language' key lang = file_data.get("language", "Unknown") if lang != "Unknown": stats[lang] = stats.get(lang, 0) + 1 return stats def _format_patterns_section(output_dir: Path) -> str: """Format design patterns section from patterns/detected_patterns.json.""" patterns_file = output_dir / "patterns" / "detected_patterns.json" if not patterns_file.exists(): return "" try: with open(patterns_file, encoding="utf-8") as f: patterns_data = json.load(f) except Exception: return "" if not patterns_data: return "" # Count patterns by type (deduplicate by class, keep highest confidence) pattern_counts = {} by_class = {} for pattern_file in patterns_data: for pattern in pattern_file.get("patterns", []): ptype = pattern.get("pattern_type", "Unknown") cls = pattern.get("class_name", "") confidence = pattern.get("confidence", 0) # Skip low confidence if confidence < 0.7: continue # Deduplicate by class key = f"{cls}:{ptype}" if key not in by_class or by_class[key]["confidence"] < confidence: by_class[key] = pattern # Count by type pattern_counts[ptype] = pattern_counts.get(ptype, 0) + 1 if not pattern_counts: return "" content = "### 🎨 Design Patterns Detected\n\n" content += "*From C3.1 codebase analysis (confidence > 0.7)*\n\n" # Top 5 pattern types for ptype, count in sorted(pattern_counts.items(), key=lambda x: x[1], reverse=True)[:5]: content += f"- **{ptype}**: {count} instances\n" content += f"\n*Total: {len(by_class)} high-confidence patterns*\n\n" content += "*See `references/patterns/` for complete pattern analysis*\n\n" return content def _format_examples_section(output_dir: Path) -> 


def _format_examples_section(output_dir: Path) -> str:
    """Format code examples section from test_examples/test_examples.json."""
    examples_file = output_dir / "test_examples" / "test_examples.json"
    if not examples_file.exists():
        return ""

    try:
        with open(examples_file, encoding="utf-8") as f:
            examples_data = json.load(f)
    except Exception:
        return ""

    examples = examples_data.get("examples", [])
    if not examples:
        return ""

    # Filter high-value examples (complexity > 0.7)
    high_value = [ex for ex in examples if ex.get("complexity_score", 0) > 0.7]
    if not high_value:
        # If no high complexity, take any examples
        high_value = examples[:10]

    if not high_value:
        return ""

    content = "## 📝 Code Examples\n\n"
    content += "*High-quality examples extracted from test files (C3.2)*\n\n"

    # Top 10 examples
    for ex in sorted(high_value, key=lambda x: x.get("complexity_score", 0), reverse=True)[:10]:
        desc = ex.get("description", "Example")
        lang = ex.get("language", "python").lower()
        code = ex.get("code", "")
        complexity = ex.get("complexity_score", 0)

        content += f"**{desc}** (complexity: {complexity:.2f})\n\n"
        content += f"```{lang}\n{code}\n```\n\n"

    content += "*See `references/test_examples/` for all extracted examples*\n\n"
    return content


def _format_api_section(output_dir: Path) -> str:
    """Format API reference section."""
    api_dir = output_dir / "api_reference"
    if not api_dir.exists():
        return ""

    api_md = api_dir / "api_reference.md"
    if not api_md.exists():
        return ""

    try:
        api_content = api_md.read_text(encoding="utf-8")
    except Exception:
        return ""

    # Extract first section (up to 500 chars)
    preview = api_content[:500]
    if len(api_content) > 500:
        preview += "..."

    content = "## 🔧 API Reference\n\n"
    content += "*Extracted from codebase analysis (C2.5)*\n\n"
    content += preview + "\n\n"
    content += "*See `references/api_reference/` for complete API documentation*\n\n"
    return content


def _format_architecture_section(output_dir: Path) -> str:
    """Format architecture section from architecture/architectural_patterns.json."""
    arch_file = output_dir / "architecture" / "architectural_patterns.json"
    if not arch_file.exists():
        return ""

    try:
        with open(arch_file, encoding="utf-8") as f:
            arch_data = json.load(f)
    except Exception:
        return ""

    patterns = arch_data.get("patterns", [])
    if not patterns:
        return ""

    content = "## 🏗️ Architecture Overview\n\n"
    content += "*From C3.7 architectural analysis*\n\n"
    content += "**Detected Architectural Patterns:**\n\n"

    for pattern in patterns[:5]:
        name = pattern.get("pattern_name", "Unknown")
        confidence = pattern.get("confidence", 0)
        indicators = pattern.get("indicators", [])

        content += f"- **{name}** (confidence: {confidence:.2f})\n"
        if indicators:
            content += f"  - Indicators: {', '.join(indicators[:3])}\n"

    content += f"\n*Total: {len(patterns)} architectural patterns detected*\n\n"
    content += "*See `references/architecture/` for complete architectural analysis*\n\n"
    return content


def _format_config_section(output_dir: Path) -> str:
    """Format configuration patterns section."""
    config_file = output_dir / "config_patterns" / "config_patterns.json"
    if not config_file.exists():
        return ""

    try:
        with open(config_file, encoding="utf-8") as f:
            config_data = json.load(f)
    except Exception:
        return ""

    config_files = config_data.get("config_files", [])
    if not config_files:
        return ""

    total_settings = sum(len(cf.get("settings", [])) for cf in config_files)
    total_patterns = sum(len(cf.get("patterns", [])) for cf in config_files)

    content = "## ⚙️ Configuration Patterns\n\n"
    content += "*From C3.4 configuration analysis*\n\n"
    content += f"**Configuration Files Analyzed:** {len(config_files)}\n"
    content += f"**Total Settings:** {total_settings}\n"
    content += f"**Patterns Detected:** {total_patterns}\n\n"

    # List config file types found
    file_types = {}
    for cf in config_files:
        ctype = cf.get("config_type", "unknown")
        file_types[ctype] = file_types.get(ctype, 0) + 1

    if file_types:
        content += "**Configuration Types:**\n"
        for ctype, count in sorted(file_types.items(), key=lambda x: x[1], reverse=True):
            content += f"- {ctype}: {count} files\n"
        content += "\n"

    content += "*See `references/config_patterns/` for detailed configuration analysis*\n\n"
    return content


def _format_documentation_section(_output_dir: Path, docs_data: dict[str, Any]) -> str:
    """Format project documentation section from extracted markdown files.

    Note: _output_dir is unused but kept for consistency with the other
    _format_* functions. Documentation data is provided via docs_data.
    """
    if not docs_data or docs_data.get("total_files", 0) == 0:
        return ""

    categories = docs_data.get("categories", {})
    files = docs_data.get("files", [])

    content = "## 📖 Project Documentation\n\n"
    content += "*Extracted from markdown files in the project (C3.9)*\n\n"
    content += f"**Total Documentation Files:** {docs_data['total_files']}\n"
    content += f"**Categories:** {len(categories)}\n\n"

    # List documents by category (most important first)
    priority_order = [
        "overview",
        "architecture",
        "guides",
        "workflows",
        "features",
        "api",
        "examples",
    ]

    # Sort categories by priority
    sorted_categories = []
    for cat in priority_order:
        if cat in categories:
            sorted_categories.append(cat)
    for cat in sorted(categories.keys()):
        if cat not in sorted_categories:
            sorted_categories.append(cat)

    for category in sorted_categories[:6]:  # Limit to 6 categories in SKILL.md
        cat_files = categories[category]
        content += f"### {category.title()}\n\n"

        # Get file details for this category
        cat_docs = [f for f in files if f.get("category") == category]
        for doc in cat_docs[:5]:  # Limit to 5 docs per category
            title = doc.get("title") or doc.get("filename", "Unknown")
            path = doc.get("path", "")

            # Add summary if available (deep/full depth)
            if doc.get("ai_description"):
                content += f"- **{title}**: {doc['ai_description']}\n"
            elif doc.get("summary"):
                # Extract first sentence from summary
                summary = doc["summary"].split("\n")[0]
                if len(summary) > 100:
                    summary = summary[:100] + "..."
                content += f"- **{title}**: {summary}\n"
            else:
                content += f"- **{title}** (`{path}`)\n"

        if len(cat_files) > 5:
            content += f"- *...and {len(cat_files) - 5} more*\n"
        content += "\n"

    # AI-enhanced topics if available
    all_topics = []
    for doc in files:
        all_topics.extend(doc.get("ai_topics", []))

    if all_topics:
        # Deduplicate and count
        from collections import Counter

        topic_counts = Counter(all_topics)
        top_topics = [t for t, _ in topic_counts.most_common(10)]
        content += f"**Key Topics:** {', '.join(top_topics)}\n\n"

    content += "*See `references/documentation/` for all project documentation*\n\n"
    return content
""" references_dir = output_dir / "references" references_dir.mkdir(exist_ok=True) # Map analysis directories to reference names mappings = { "api_reference": "api_reference", "dependencies": "dependencies", "patterns": "patterns", "test_examples": "test_examples", "tutorials": "tutorials", "config_patterns": "config_patterns", "architecture": "architecture", "documentation": "documentation", } for source, target in mappings.items(): source_dir = output_dir / source target_dir = references_dir / target if source_dir.exists() and source_dir.is_dir(): # Copy directory to references/ (not symlink, for portability) if target_dir.exists(): import shutil shutil.rmtree(target_dir) import shutil shutil.copytree(source_dir, target_dir) logger.debug(f"Copied {source} → references/{target}") logger.info(f"✅ Generated references directory: {references_dir}") def main(): """Command-line interface for codebase analysis.""" parser = argparse.ArgumentParser( description="Analyze local codebases and extract code knowledge", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=""" Examples: # Analyze current directory codebase-scraper --directory . --output output/codebase/ # Deep analysis with API reference and dependency graph codebase-scraper --directory /path/to/repo --depth deep --build-api-reference --build-dependency-graph # Analyze only Python and JavaScript codebase-scraper --directory . --languages Python,JavaScript # Use file patterns codebase-scraper --directory . --file-patterns "*.py,src/**/*.js" # Full analysis with all features (default) codebase-scraper --directory . --depth deep # Surface analysis (fast, skip all analysis features) codebase-scraper --directory . --depth surface --skip-api-reference --skip-dependency-graph --skip-patterns --skip-test-examples # Skip specific features codebase-scraper --directory . --skip-patterns --skip-test-examples """, ) parser.add_argument("--directory", required=True, help="Directory to analyze") parser.add_argument( "--output", default="output/codebase/", help="Output directory (default: output/codebase/)" ) parser.add_argument( "--depth", choices=["surface", "deep", "full"], default="deep", help=( "Analysis depth: " "surface (basic code structure, ~1-2 min), " "deep (code + patterns + tests, ~5-10 min, DEFAULT), " "full (everything + AI enhancement, ~20-60 min). " "💡 TIP: Use --quick or --comprehensive presets instead for better UX!" ), ) parser.add_argument( "--languages", help="Comma-separated languages to analyze (e.g., Python,JavaScript,C++)" ) parser.add_argument( "--file-patterns", help="Comma-separated file patterns (e.g., *.py,src/**/*.js)" ) parser.add_argument( "--skip-api-reference", action="store_true", default=False, help="Skip API reference markdown documentation generation (default: enabled)", ) parser.add_argument( "--skip-dependency-graph", action="store_true", default=False, help="Skip dependency graph and circular dependency detection (default: enabled)", ) parser.add_argument( "--skip-patterns", action="store_true", default=False, help="Skip design pattern detection (Singleton, Factory, Observer, etc.) (default: enabled)", ) parser.add_argument( "--skip-test-examples", action="store_true", default=False, help="Skip test example extraction (instantiation, method calls, configs, etc.) 
(default: enabled)", ) parser.add_argument( "--skip-how-to-guides", action="store_true", default=False, help="Skip how-to guide generation from workflow examples (default: enabled)", ) parser.add_argument( "--skip-config-patterns", action="store_true", default=False, help="Skip configuration pattern extraction from config files (JSON, YAML, TOML, ENV, etc.) (default: enabled)", ) parser.add_argument( "--skip-docs", action="store_true", default=False, help="Skip project documentation extraction from markdown files (README, docs/, etc.) (default: enabled)", ) parser.add_argument( "--ai-mode", choices=["auto", "api", "local", "none"], default="auto", help=( "AI enhancement mode for how-to guides: " "auto (auto-detect: API if ANTHROPIC_API_KEY set, else LOCAL), " "api (Claude API, requires ANTHROPIC_API_KEY), " "local (Claude Code Max, FREE, no API key), " "none (disable AI enhancement). " "💡 TIP: Use --enhance flag instead for simpler UX!" ), ) parser.add_argument("--no-comments", action="store_true", help="Skip comment extraction") parser.add_argument("--verbose", action="store_true", help="Enable verbose logging") parser.add_argument( "--enhance-level", type=int, choices=[0, 1, 2, 3], default=0, help=( "AI enhancement level: " "0=off (default), " "1=SKILL.md only, " "2=SKILL.md+Architecture+Config, " "3=full (patterns, tests, config, architecture, SKILL.md)" ), ) # Check for deprecated flags deprecated_flags = { "--build-api-reference": "--skip-api-reference", "--build-dependency-graph": "--skip-dependency-graph", "--detect-patterns": "--skip-patterns", "--extract-test-examples": "--skip-test-examples", "--build-how-to-guides": "--skip-how-to-guides", "--extract-config-patterns": "--skip-config-patterns", } for old_flag, new_flag in deprecated_flags.items(): if old_flag in sys.argv: logger.warning( f"⚠️ DEPRECATED: {old_flag} is deprecated. " f"All features are now enabled by default. " f"Use {new_flag} to disable this feature." ) args = parser.parse_args() # Handle presets (Phase 1 feature - NEW) if ( hasattr(args, "quick") and args.quick and hasattr(args, "comprehensive") and args.comprehensive ): logger.error("❌ Cannot use --quick and --comprehensive together. 
Choose one.") return 1 if hasattr(args, "quick") and args.quick: # Override depth and disable advanced features args.depth = "surface" args.skip_patterns = True args.skip_test_examples = True args.skip_how_to_guides = True args.skip_config_patterns = True args.ai_mode = "none" logger.info("⚡ Quick analysis mode: surface depth, basic features only (~1-2 min)") if hasattr(args, "comprehensive") and args.comprehensive: # Override depth and enable all features args.depth = "full" args.skip_patterns = False args.skip_test_examples = False args.skip_how_to_guides = False args.skip_config_patterns = False args.ai_mode = "auto" logger.info("🚀 Comprehensive analysis mode: all features + AI enhancement (~20-60 min)") # Set logging level if args.verbose: logging.getLogger().setLevel(logging.DEBUG) # Validate directory directory = Path(args.directory) if not directory.exists(): logger.error(f"Directory not found: {directory}") return 1 if not directory.is_dir(): logger.error(f"Not a directory: {directory}") return 1 # Parse languages languages = None if args.languages: languages = [lang.strip() for lang in args.languages.split(",")] # Parse file patterns file_patterns = None if args.file_patterns: file_patterns = [p.strip() for p in args.file_patterns.split(",")] # Analyze codebase try: results = analyze_codebase( directory=directory, output_dir=Path(args.output), depth=args.depth, languages=languages, file_patterns=file_patterns, build_api_reference=not args.skip_api_reference, extract_comments=not args.no_comments, build_dependency_graph=not args.skip_dependency_graph, detect_patterns=not args.skip_patterns, extract_test_examples=not args.skip_test_examples, build_how_to_guides=not args.skip_how_to_guides, extract_config_patterns=not args.skip_config_patterns, extract_docs=not args.skip_docs, enhance_level=args.enhance_level, # AI enhancement level (0-3) ) # Print summary print(f"\n{'=' * 60}") print("CODEBASE ANALYSIS COMPLETE") print(f"{'=' * 60}") print(f"Files analyzed: {len(results['files'])}") print(f"Output directory: {args.output}") if not args.skip_api_reference: print(f"API reference: {Path(args.output) / 'api_reference'}") print(f"{'=' * 60}\n") return 0 except KeyboardInterrupt: logger.error("\nAnalysis interrupted by user") return 130 except Exception as e: logger.error(f"Analysis failed: {e}") import traceback traceback.print_exc() return 1 if __name__ == "__main__": sys.exit(main())