diff --git a/CLAUDE.md b/CLAUDE.md index f23d246..2c8b023 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -297,9 +297,17 @@ skill-seekers analyze --directory . --skip-patterns --skip-how-to-guides ``` - Generates 300+ line standalone SKILL.md files from codebases -- All C3.x features integrated (patterns, tests, guides, config, architecture) +- All C3.x features integrated (patterns, tests, guides, config, architecture, docs) - Complete codebase analysis without documentation scraping +**C3.9 Project Documentation Extraction** (`codebase_scraper.py`): +- Extracts and categorizes all markdown files from the project +- Auto-detects categories: overview, architecture, guides, workflows, features, etc. +- Integrates documentation into SKILL.md with summaries +- AI enhancement (level 2+) adds topic extraction and cross-references +- Controlled by depth: surface=raw copy, deep=parse+summarize, full=AI-enhanced +- Default ON, use `--skip-docs` to disable + **Key Architecture Decision (BREAKING in v2.5.2):** - Changed from opt-in (`--build-*`) to opt-out (`--skip-*`) flags - All analysis features now ON by default for maximum value diff --git a/src/skill_seekers/cli/codebase_scraper.py b/src/skill_seekers/cli/codebase_scraper.py index 7955d9c..b64beb0 100644 --- a/src/skill_seekers/cli/codebase_scraper.py +++ b/src/skill_seekers/cli/codebase_scraper.py @@ -75,6 +75,53 @@ LANGUAGE_EXTENSIONS = { ".php": "PHP", } +# Markdown extension mapping +MARKDOWN_EXTENSIONS = {".md", ".markdown", ".mdown", ".mkd"} + +# Common documentation folders to scan +DOC_FOLDERS = {"docs", "doc", "documentation", "wiki", ".github"} + +# Root-level doc files → category mapping +ROOT_DOC_CATEGORIES = { + "readme": "overview", + "contributing": "contributing", + "changelog": "changelog", + "history": "changelog", + "license": "license", + "authors": "authors", + "code_of_conduct": "community", + "security": "security", + "architecture": "architecture", + "design": "architecture", +} + +# Folder name → 
def walk_markdown_files(
    root: Path,
    gitignore_spec: pathspec.PathSpec | None = None,
    excluded_dirs: set | None = None,
) -> list[Path]:
    """
    Walk a directory tree and collect markdown documentation files.

    Args:
        root: Root directory to walk. It is resolved to an absolute path
            first, so every returned path is absolute as well.
        gitignore_spec: Optional PathSpec object for .gitignore rules; a file
            whose root-relative path matches the spec is skipped.
        excluded_dirs: Set of directory names to exclude
            (defaults to DEFAULT_EXCLUDED_DIRS).

    Returns:
        Sorted list of markdown file paths
    """
    if excluded_dirs is None:
        excluded_dirs = DEFAULT_EXCLUDED_DIRS

    root = Path(root).resolve()
    files: list[Path] = []

    for dirpath, dirnames, filenames in os.walk(root):
        current_dir = Path(dirpath)

        # Prune excluded directories in place so os.walk never descends into them.
        dirnames[:] = [d for d in dirnames if not should_exclude_dir(d, excluded_dirs)]

        for filename in filenames:
            file_path = current_dir / filename

            # Cheap extension test first: most files are not markdown, and
            # there is no point running the gitignore matcher (or logging a
            # "skipped" line) for files that would be dropped anyway.
            if file_path.suffix.lower() not in MARKDOWN_EXTENSIONS:
                continue

            # Check .gitignore rules
            if gitignore_spec:
                try:
                    rel_path = file_path.relative_to(root)
                except ValueError:
                    # Not under root; cannot be evaluated against the spec.
                    continue
                if gitignore_spec.match_file(str(rel_path)):
                    logger.debug(f"Skipping (gitignore): {rel_path}")
                    continue

            files.append(file_path)

    return sorted(files)
def extract_markdown_structure(content: str) -> dict[str, Any]:
    """
    Extract structure from markdown content (headers, code blocks, links).

    The content is scanned line by line while tracking fenced code blocks, so
    that ``#`` lines inside a fence (shell comments, Python comments, ...) are
    not mistaken for headers and can never become the document title.

    Args:
        content: Markdown file content

    Returns:
        Dictionary with the keys:
            title: Text of the first level-1 header outside code fences, or None
            headers: List of {"level", "text", "line"} (line is 1-based)
            code_blocks: List of {"language", "code", "full_length"}; "code"
                is truncated to 500 characters, "full_length" is the
                untruncated length, "language" is "text" when the fence has
                no info string
            links: List of {"text", "url"} for inline ``[text](url)`` links
            word_count: Number of whitespace-separated tokens
            line_count: Number of newline-separated lines
    """
    import re

    lines = content.split("\n")

    structure: dict[str, Any] = {
        "title": None,
        "headers": [],
        "code_blocks": [],
        "links": [],
        "word_count": len(content.split()),
        "line_count": len(lines),
    }

    header_re = re.compile(r"^(#{1,6})\s+(.+)$")
    # A fence is a run of 3+ backticks or tildes, optionally followed by an
    # info string (the language).
    fence_re = re.compile(r"^\s*(`{3,}|~{3,})\s*(.*)$")

    open_fence = None  # (fence char, fence length) while inside a fenced block
    fence_language = "text"
    fence_lines: list[str] = []

    for line_no, line in enumerate(lines, start=1):
        fence_match = fence_re.match(line)
        marker = fence_match.group(1) if fence_match else ""
        info = fence_match.group(2).strip() if fence_match else ""

        if open_fence is not None:
            # Only a bare fence of the same character, at least as long as
            # the opening one, closes the block.
            closes = (
                bool(marker)
                and marker[0] == open_fence[0]
                and len(marker) >= open_fence[1]
                and not info
            )
            if not closes:
                fence_lines.append(line)
                continue

            code = "\n".join(fence_lines).strip()
            if code:
                structure["code_blocks"].append({
                    "language": fence_language,
                    "code": code[:500],  # Truncate long code blocks
                    "full_length": len(code),
                })
            open_fence = None
            continue

        # A backtick fence's info string may not itself contain backticks;
        # that form is inline code such as ```x```, not a block opener.
        if marker and not (marker[0] == "`" and "`" in info):
            open_fence = (marker[0], len(marker))
            fence_language = info.split()[0] if info else "text"
            fence_lines = []
            continue

        header_match = header_re.match(line)
        if header_match:
            level = len(header_match.group(1))
            text = header_match.group(2).strip()
            structure["headers"].append({
                "level": level,
                "text": text,
                "line": line_no,
            })
            # First h1 is the title
            if level == 1 and structure["title"] is None:
                structure["title"] = text

    # Extract inline links
    link_pattern = re.compile(r"\[([^\]]+)\]\(([^)]+)\)")
    for match in link_pattern.finditer(content):
        structure["links"].append({
            "text": match.group(1),
            "url": match.group(2),
        })

    return structure
def process_markdown_docs(
    directory: Path,
    output_dir: Path,
    depth: str = "deep",
    gitignore_spec: pathspec.PathSpec | None = None,
    enhance_with_ai: bool = False,
    ai_mode: str = "none",
) -> dict[str, Any]:
    """
    Process all markdown documentation files in a directory.

    Every markdown file found is categorized, optionally parsed and
    summarized, copied to ``<output_dir>/documentation/<category>/`` and
    listed in ``<output_dir>/documentation/documentation_index.json``.

    Args:
        directory: Root directory to scan
        output_dir: Output directory for processed docs
        depth: Processing depth. 'surface' records only path, filename,
            category and size; any other value also extracts structure and a
            summary; 'full' additionally embeds the file content in the index.
        gitignore_spec: Optional .gitignore spec
        enhance_with_ai: Whether to use AI enhancement (the caller decides
            this from its enhancement level)
        ai_mode: AI mode ('none', 'auto', 'api', 'local')

    Returns:
        Dictionary with keys "total_files", "categories" (category ->
        list of relative paths) and "files" (list of per-document dicts).
        When no markdown files are found nothing is written to disk.
    """
    import shutil

    logger.info("Scanning for markdown documentation...")

    # walk_markdown_files() resolves its root and returns resolved paths, so
    # relative_to() below only works against the resolved directory too.
    directory = Path(directory).resolve()
    output_dir = Path(output_dir)
    output_root = output_dir.resolve()

    # Find all markdown files
    md_files = walk_markdown_files(directory, gitignore_spec)

    # If the output lives inside the scanned tree, markdown generated by a
    # previous run must not be ingested as project documentation.
    if directory in output_root.parents:
        md_files = [p for p in md_files if output_root not in p.parents]

    logger.info(f"Found {len(md_files)} markdown files")

    if not md_files:
        return {"files": [], "categories": {}, "total_files": 0}

    # Process each file
    processed_docs: list[dict[str, Any]] = []
    categories: dict[str, list[str]] = {}

    for md_path in md_files:
        try:
            content = md_path.read_text(encoding="utf-8", errors="ignore")
            rel_path = str(md_path.relative_to(directory))
            category = categorize_markdown_file(md_path, directory)

            doc_data = {
                "path": rel_path,
                "filename": md_path.name,
                "category": category,
                "size_bytes": len(content.encode("utf-8")),
            }

            # Surface depth keeps just the basics; deeper levels add
            # structure and a summary.
            if depth != "surface":
                structure = extract_markdown_structure(content)
                summary = generate_markdown_summary(content, structure)

                doc_data.update({
                    "title": structure.get("title") or md_path.stem,
                    "structure": structure,
                    "summary": summary,
                    "content": content if depth == "full" else None,
                })

            processed_docs.append(doc_data)

            # Track categories
            categories.setdefault(category, []).append(rel_path)

        except Exception as e:
            # Best effort: one unreadable or odd file must not abort the scan.
            logger.warning(f"Failed to process {md_path}: {e}")
            continue

    # AI Enhancement (if enabled by the caller and an AI mode is selected)
    if enhance_with_ai and ai_mode != "none" and processed_docs:
        logger.info("🤖 Enhancing documentation analysis with AI...")
        try:
            processed_docs = _enhance_docs_with_ai(processed_docs, ai_mode)
            logger.info("✅ AI documentation enhancement complete")
        except Exception as e:
            logger.warning(f"⚠️ AI enhancement failed: {e}")

    # Save processed docs to output
    docs_output_dir = output_dir / "documentation"
    docs_output_dir.mkdir(parents=True, exist_ok=True)

    # Copy files organized by category. Destinations used in this run are
    # tracked so that same-named files from different folders (for example
    # several README.md files) do not silently overwrite each other.
    used_destinations: set[Path] = set()
    for doc in processed_docs:
        try:
            src_path = directory / doc["path"]
            category_dir = docs_output_dir / doc["category"]
            category_dir.mkdir(parents=True, exist_ok=True)

            dest_path = category_dir / doc["filename"]
            if dest_path in used_destinations:
                # Name clash: fall back to the flattened relative path,
                # e.g. docs/api/README.md -> docs__api__README.md
                dest_path = category_dir / "__".join(Path(doc["path"]).parts)
            used_destinations.add(dest_path)

            shutil.copy2(src_path, dest_path)
        except Exception as e:
            logger.warning(f"Failed to copy {doc['path']}: {e}")

    # Save documentation index
    index_data = {
        "total_files": len(processed_docs),
        "categories": categories,
        "files": processed_docs,
    }

    index_json = docs_output_dir / "documentation_index.json"
    with open(index_json, "w", encoding="utf-8") as f:
        json.dump(index_data, f, indent=2, default=str)

    logger.info(f"✅ Processed {len(processed_docs)} documentation files in {len(categories)} categories")
    logger.info(f"📁 Saved to: {docs_output_dir}")

    return index_data
def _enhance_docs_api(docs: list[dict], api_key: str) -> list[dict]:
    """
    Enhance docs using Claude API.

    Documents that have a summary are sent in batches of 10. For every
    enhancement the model returns, the matching document dict is updated in
    place with "ai_description", "ai_topics" and "ai_related".

    Args:
        docs: List of processed document dictionaries (mutated in place)
        api_key: Anthropic API key

    Returns:
        The same list that was passed in. Failures are logged and leave the
        affected documents without AI fields; nothing is raised.
    """
    import re

    try:
        import anthropic

        client = anthropic.Anthropic(api_key=api_key)

        # Batch documents for efficiency
        batch_size = 10
        for start in range(0, len(docs), batch_size):
            batch = [d for d in docs[start:start + batch_size] if d.get("summary")]
            if not batch:
                continue

            # The "File:" line gives the model the identifier the response is
            # matched on; without it the model can only guess the filename.
            docs_text = "\n\n".join(
                f"## {d.get('title', d['filename'])}\n"
                f"File: {d.get('path', d['filename'])}\n"
                f"Category: {d['category']}\n"
                f"Summary: {d.get('summary', 'N/A')}"
                for d in batch
            )

            prompt = f"""Analyze these documentation files and provide:
1. A brief description of what each document covers
2. Key topics/concepts mentioned
3. How they relate to each other

Documents:
{docs_text}

Return JSON with format:
{{"enhancements": [{{"filename": "<the document's File value, copied exactly>", "description": "...", "key_topics": [...], "related_to": [...]}}]}}"""

            response = client.messages.create(
                model="claude-sonnet-4-20250514",
                max_tokens=2000,
                messages=[{"role": "user", "content": prompt}]
            )

            # Parse response and merge enhancements
            try:
                json_match = re.search(r"\{.*\}", response.content[0].text, re.DOTALL)
                if not json_match:
                    logger.warning("AI enhancement response contained no JSON object; skipping batch")
                    continue

                enhancements = json.loads(json_match.group())
                for enh in enhancements.get("enhancements", []):
                    if not isinstance(enh, dict):
                        continue
                    key = enh.get("filename")
                    # Prefer an exact relative-path match; fall back to the
                    # bare filename in case the model echoed only that.
                    targets = [d for d in batch if d.get("path") == key] or [
                        d for d in batch if d["filename"] == key
                    ]
                    for doc in targets:
                        doc["ai_description"] = enh.get("description")
                        # `or []` also covers an explicit JSON null.
                        doc["ai_topics"] = enh.get("key_topics") or []
                        doc["ai_related"] = enh.get("related_to") or []
            except (AttributeError, IndexError, KeyError, TypeError, ValueError) as e:
                # Malformed model output affects this batch only.
                logger.warning(f"Could not parse AI enhancement response: {e}")

    except Exception as e:
        logger.warning(f"API enhancement failed: {e}")

    return docs
Related documents + +Documents: +{docs_text} + +Output JSON only: +{{"enhancements": [{{"filename": "...", "description": "...", "key_topics": ["..."], "related_to": ["..."]}}]}}""" + + try: + with tempfile.NamedTemporaryFile(mode="w", suffix=".txt", delete=False) as f: + f.write(prompt) + prompt_file = f.name + + result = subprocess.run( + ["claude", "--dangerously-skip-permissions", "-p", prompt], + capture_output=True, + text=True, + timeout=120, + ) + + os.unlink(prompt_file) + + if result.returncode == 0 and result.stdout: + import re + json_match = re.search(r"\{.*\}", result.stdout, re.DOTALL) + if json_match: + enhancements = json.loads(json_match.group()) + for enh in enhancements.get("enhancements", []): + for doc in docs: + if doc["filename"] == enh.get("filename"): + doc["ai_description"] = enh.get("description") + doc["ai_topics"] = enh.get("key_topics", []) + doc["ai_related"] = enh.get("related_to", []) + + except Exception as e: + logger.warning(f"LOCAL enhancement failed: {e}") + + return docs + + def analyze_codebase( directory: Path, output_dir: Path, @@ -229,6 +739,7 @@ def analyze_codebase( extract_test_examples: bool = True, build_how_to_guides: bool = True, extract_config_patterns: bool = True, + extract_docs: bool = True, enhance_level: int = 0, ) -> dict[str, Any]: """ @@ -247,7 +758,8 @@ def analyze_codebase( extract_test_examples: Extract usage examples from test files build_how_to_guides: Build how-to guides from workflow examples (C3.3) extract_config_patterns: Extract configuration patterns from config files (C3.4) - enhance_level: AI enhancement level (0=off, 1=SKILL.md only, 2=+config+arch, 3=full) + extract_docs: Extract and process markdown documentation files (default: True) + enhance_level: AI enhancement level (0=off, 1=SKILL.md only, 2=+config+arch+docs, 3=full) Returns: Analysis results dictionary @@ -622,6 +1134,33 @@ def analyze_codebase( else: logger.info("No clear architectural patterns detected") + # Extract markdown 
documentation (C3.9) + docs_data = None + if extract_docs: + logger.info("Extracting project documentation...") + try: + # Determine AI enhancement for docs (level 2+) + enhance_docs_ai = enhance_level >= 2 + docs_data = process_markdown_docs( + directory=directory, + output_dir=output_dir, + depth=depth, + gitignore_spec=gitignore_spec, + enhance_with_ai=enhance_docs_ai, + ai_mode=ai_mode, + ) + + if docs_data and docs_data.get("total_files", 0) > 0: + logger.info( + f"✅ Extracted {docs_data['total_files']} documentation files " + f"in {len(docs_data.get('categories', {}))} categories" + ) + else: + logger.info("No markdown documentation files found") + except Exception as e: + logger.warning(f"Documentation extraction failed: {e}") + docs_data = None + # Generate SKILL.md and references/ directory logger.info("Generating SKILL.md and references...") _generate_skill_md( @@ -634,6 +1173,8 @@ def analyze_codebase( detect_patterns=detect_patterns, extract_test_examples=extract_test_examples, extract_config_patterns=extract_config_patterns, + extract_docs=extract_docs, + docs_data=docs_data, ) return results @@ -649,6 +1190,8 @@ def _generate_skill_md( detect_patterns: bool, extract_test_examples: bool, extract_config_patterns: bool, + extract_docs: bool = True, + docs_data: dict[str, Any] | None = None, ): """ Generate rich SKILL.md from codebase analysis results. 
@@ -728,7 +1271,10 @@ Use this skill when you need to: skill_content += "- ✅ Test Examples (C3.2)\n" if extract_config_patterns: skill_content += "- ✅ Configuration Patterns (C3.4)\n" - skill_content += "- ✅ Architectural Analysis (C3.7)\n\n" + skill_content += "- ✅ Architectural Analysis (C3.7)\n" + if extract_docs: + skill_content += "- ✅ Project Documentation (C3.9)\n" + skill_content += "\n" # Add design patterns if available if detect_patterns: @@ -759,6 +1305,12 @@ Use this skill when you need to: if config_content: skill_content += config_content + # Add project documentation if available + if extract_docs and docs_data: + docs_content = _format_documentation_section(output_dir, docs_data) + if docs_content: + skill_content += docs_content + # Available references skill_content += "## 📚 Available References\n\n" skill_content += "This skill includes detailed reference documentation:\n\n" @@ -788,6 +1340,9 @@ Use this skill when you need to: if (output_dir / "architecture").exists(): skill_content += "- **Architecture**: `references/architecture/` - Architectural patterns\n" refs_added = True + if extract_docs and (output_dir / "documentation").exists(): + skill_content += "- **Documentation**: `references/documentation/` - Project documentation\n" + refs_added = True if not refs_added: skill_content += "No additional references generated (analysis features disabled).\n" @@ -1017,6 +1572,75 @@ def _format_config_section(output_dir: Path) -> str: return content +def _format_documentation_section(output_dir: Path, docs_data: dict[str, Any]) -> str: + """Format project documentation section from extracted markdown files.""" + if not docs_data or docs_data.get("total_files", 0) == 0: + return "" + + categories = docs_data.get("categories", {}) + files = docs_data.get("files", []) + + content = "## 📖 Project Documentation\n\n" + content += "*Extracted from markdown files in the project (C3.9)*\n\n" + content += f"**Total Documentation Files:** 
{docs_data['total_files']}\n" + content += f"**Categories:** {len(categories)}\n\n" + + # List documents by category (most important first) + priority_order = ["overview", "architecture", "guides", "workflows", "features", "api", "examples"] + + # Sort categories by priority + sorted_categories = [] + for cat in priority_order: + if cat in categories: + sorted_categories.append(cat) + for cat in sorted(categories.keys()): + if cat not in sorted_categories: + sorted_categories.append(cat) + + for category in sorted_categories[:6]: # Limit to 6 categories in SKILL.md + cat_files = categories[category] + content += f"### {category.title()}\n\n" + + # Get file details for this category + cat_docs = [f for f in files if f.get("category") == category] + + for doc in cat_docs[:5]: # Limit to 5 docs per category + title = doc.get("title") or doc.get("filename", "Unknown") + path = doc.get("path", "") + + # Add summary if available (deep/full depth) + if doc.get("ai_description"): + content += f"- **{title}**: {doc['ai_description']}\n" + elif doc.get("summary"): + # Extract first sentence from summary + summary = doc["summary"].split("\n")[0] + if len(summary) > 100: + summary = summary[:100] + "..." + content += f"- **{title}**: {summary}\n" + else: + content += f"- **{title}** (`{path}`)\n" + + if len(cat_files) > 5: + content += f"- *...and {len(cat_files) - 5} more*\n" + + content += "\n" + + # AI-enhanced topics if available + all_topics = [] + for doc in files: + all_topics.extend(doc.get("ai_topics", [])) + + if all_topics: + # Deduplicate and count + from collections import Counter + topic_counts = Counter(all_topics) + top_topics = [t for t, _ in topic_counts.most_common(10)] + content += f"**Key Topics:** {', '.join(top_topics)}\n\n" + + content += "*See `references/documentation/` for all project documentation*\n\n" + return content + + def _generate_references(output_dir: Path): """ Generate references/ directory structure by symlinking analysis output. 
@@ -1035,6 +1659,7 @@ def _generate_references(output_dir: Path): "tutorials": "tutorials", "config_patterns": "config_patterns", "architecture": "architecture", + "documentation": "documentation", } for source, target in mappings.items(): @@ -1144,6 +1769,12 @@ Examples: default=False, help="Skip configuration pattern extraction from config files (JSON, YAML, TOML, ENV, etc.) (default: enabled)", ) + parser.add_argument( + "--skip-docs", + action="store_true", + default=False, + help="Skip project documentation extraction from markdown files (README, docs/, etc.) (default: enabled)", + ) parser.add_argument( "--ai-mode", choices=["auto", "api", "local", "none"], @@ -1257,6 +1888,7 @@ Examples: extract_test_examples=not args.skip_test_examples, build_how_to_guides=not args.skip_how_to_guides, extract_config_patterns=not args.skip_config_patterns, + extract_docs=not args.skip_docs, enhance_level=args.enhance_level, # AI enhancement level (0-3) ) diff --git a/src/skill_seekers/cli/main.py b/src/skill_seekers/cli/main.py index 72469a3..17073c8 100644 --- a/src/skill_seekers/cli/main.py +++ b/src/skill_seekers/cli/main.py @@ -315,6 +315,7 @@ For more information: https://github.com/yusufkaraaslan/Skill_Seekers analyze_parser.add_argument("--skip-test-examples", action="store_true", help="Skip test examples") analyze_parser.add_argument("--skip-how-to-guides", action="store_true", help="Skip guides") analyze_parser.add_argument("--skip-config-patterns", action="store_true", help="Skip config") + analyze_parser.add_argument("--skip-docs", action="store_true", help="Skip project docs (README, docs/)") analyze_parser.add_argument("--no-comments", action="store_true", help="Skip comments") analyze_parser.add_argument("--verbose", action="store_true", help="Verbose logging") @@ -609,6 +610,8 @@ def main(argv: list[str] | None = None) -> int: sys.argv.append("--skip-how-to-guides") if args.skip_config_patterns: sys.argv.append("--skip-config-patterns") + if args.skip_docs: + 
sys.argv.append("--skip-docs") if args.no_comments: sys.argv.append("--no-comments") if args.verbose: diff --git a/src/skill_seekers/mcp/tools/scraping_tools.py b/src/skill_seekers/mcp/tools/scraping_tools.py index 0554f53..f4b986a 100644 --- a/src/skill_seekers/mcp/tools/scraping_tools.py +++ b/src/skill_seekers/mcp/tools/scraping_tools.py @@ -464,6 +464,7 @@ async def scrape_codebase_tool(args: dict) -> list[TextContent]: - skip_test_examples (bool, optional): Skip test example extraction (default: False) - skip_how_to_guides (bool, optional): Skip how-to guide generation (default: False) - skip_config_patterns (bool, optional): Skip config pattern extraction (default: False) + - skip_docs (bool, optional): Skip project documentation extraction (default: False) Returns: List[TextContent]: Tool execution results @@ -497,6 +498,7 @@ async def scrape_codebase_tool(args: dict) -> list[TextContent]: skip_test_examples = args.get("skip_test_examples", False) skip_how_to_guides = args.get("skip_how_to_guides", False) skip_config_patterns = args.get("skip_config_patterns", False) + skip_docs = args.get("skip_docs", False) # Build command cmd = [sys.executable, "-m", "skill_seekers.cli.codebase_scraper"] @@ -526,6 +528,8 @@ async def scrape_codebase_tool(args: dict) -> list[TextContent]: cmd.append("--skip-how-to-guides") if skip_config_patterns: cmd.append("--skip-config-patterns") + if skip_docs: + cmd.append("--skip-docs") # Adjust timeout based on enhance_level timeout = 600 # 10 minutes base diff --git a/tests/test_analyze_command.py b/tests/test_analyze_command.py index 2ceeac9..7e1e648 100644 --- a/tests/test_analyze_command.py +++ b/tests/test_analyze_command.py @@ -74,7 +74,8 @@ class TestAnalyzeSubcommand(unittest.TestCase): "--skip-patterns", "--skip-test-examples", "--skip-how-to-guides", - "--skip-config-patterns" + "--skip-config-patterns", + "--skip-docs" ]) self.assertTrue(args.skip_api_reference) self.assertTrue(args.skip_dependency_graph) @@ -82,6 +83,7 
class TestMarkdownDocumentation(unittest.TestCase):
    """Markdown discovery and categorisation checks (C3.9)."""

    def setUp(self):
        """Give every test its own scratch project root."""
        self.temp_dir = tempfile.mkdtemp()
        self.root = Path(self.temp_dir)

    def tearDown(self):
        """Remove the scratch project root."""
        shutil.rmtree(self.temp_dir, ignore_errors=True)

    def _make_file(self, relative, text):
        """Create ``relative`` (slash-separated) under the root and return its path."""
        target = self.root.joinpath(*relative.split("/"))
        target.parent.mkdir(parents=True, exist_ok=True)
        target.write_text(text)
        return target

    def test_markdown_extensions(self):
        """Test that markdown extensions are properly defined."""
        for extension in (".md", ".markdown"):
            self.assertIn(extension, MARKDOWN_EXTENSIONS)

    def test_root_doc_categories(self):
        """Test root document category mapping."""
        expected = (
            ("readme", "overview"),
            ("changelog", "changelog"),
            ("architecture", "architecture"),
        )
        for key, category in expected:
            self.assertEqual(ROOT_DOC_CATEGORIES.get(key), category)

    def test_folder_categories(self):
        """Test folder category mapping."""
        expected = (
            ("guides", "guides"),
            ("tutorials", "guides"),
            ("workflows", "workflows"),
            ("architecture", "architecture"),
        )
        for key, category in expected:
            self.assertEqual(FOLDER_CATEGORIES.get(key), category)

    def test_walk_markdown_files(self):
        """Test walking directory for markdown files."""
        self._make_file("README.md", "# Test README")
        self._make_file("test.py", "print('test')")
        self._make_file("docs/guide.md", "# Guide")

        found = walk_markdown_files(self.root)

        # Only the two markdown files may be reported.
        self.assertEqual(len(found), 2)
        names = [path.name for path in found]
        self.assertIn("README.md", names)
        self.assertIn("guide.md", names)

    def test_categorize_root_readme(self):
        """Test categorizing root README file."""
        readme = self._make_file("README.md", "# Test")
        self.assertEqual(categorize_markdown_file(readme, self.root), "overview")

    def test_categorize_changelog(self):
        """Test categorizing CHANGELOG file."""
        changelog = self._make_file("CHANGELOG.md", "# Changelog")
        self.assertEqual(categorize_markdown_file(changelog, self.root), "changelog")

    def test_categorize_docs_guide(self):
        """Test categorizing file in docs/guides folder."""
        guide = self._make_file("docs/guides/getting-started.md", "# Getting Started")
        self.assertEqual(categorize_markdown_file(guide, self.root), "guides")

    def test_categorize_architecture(self):
        """Test categorizing architecture documentation."""
        overview = self._make_file("docs/architecture/overview.md", "# Architecture")
        self.assertEqual(categorize_markdown_file(overview, self.root), "architecture")
+""" + structure = extract_markdown_structure(content) + + self.assertEqual(len(structure["links"]), 2) + self.assertEqual(structure["links"][0]["text"], "Example") + self.assertEqual(structure["links"][0]["url"], "https://example.com") + + def test_word_and_line_count(self): + """Test word and line count.""" + content = "First line\nSecond line\nThird line" + structure = extract_markdown_structure(content) + + self.assertEqual(structure["line_count"], 3) + self.assertEqual(structure["word_count"], 6) # First, line, Second, line, Third, line + + +class TestMarkdownSummaryGeneration(unittest.TestCase): + """Tests for markdown summary generation""" + + def test_generate_summary_with_title(self): + """Test summary includes title.""" + content = "# My Title\n\nSome content here." + structure = extract_markdown_structure(content) + summary = generate_markdown_summary(content, structure) + + self.assertIn("**My Title**", summary) + + def test_generate_summary_with_sections(self): + """Test summary includes section names.""" + content = """# Main + +## Getting Started +Content + +## Installation +Content + +## Usage +Content +""" + structure = extract_markdown_structure(content) + summary = generate_markdown_summary(content, structure) + + self.assertIn("Sections:", summary) + + def test_generate_summary_truncation(self): + """Test summary is truncated to max length.""" + content = "# Title\n\n" + "Long content. " * 100 + structure = extract_markdown_structure(content) + summary = generate_markdown_summary(content, structure, max_length=200) + + self.assertLessEqual(len(summary), 210) # Allow some buffer for truncation marker + + if __name__ == "__main__": # Run tests with verbose output unittest.main(verbosity=2)