feat(C3.9): Add project documentation extraction from markdown files

- Scan ALL .md files in project (README, docs/, etc.)
- Smart categorization by folder/filename (overview, architecture, guides, etc.)
- Processing depth: surface=raw copy, deep=parse+summarize, full=AI-enhanced
- AI enhancement at level 2+ adds topic extraction and cross-references
- New "Project Documentation" section in SKILL.md with summaries
- Output to references/documentation/ organized by category
- Default ON, use --skip-docs to disable
- Add skip_docs parameter to MCP scrape_codebase_tool
- Add 15 new tests for markdown documentation features

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-31 13:54:56 +03:00
parent 4cfb94e14f
commit 170dd0fd75
6 changed files with 845 additions and 4 deletions
--- a/tests/test_analyze_command.py
+++ b/tests/test_analyze_command.py
@@ -74,7 +74,8 @@ class TestAnalyzeSubcommand(unittest.TestCase):
            "--skip-patterns",
            "--skip-test-examples",
            "--skip-how-to-guides",
-            "--skip-config-patterns"
+            "--skip-config-patterns",
+            "--skip-docs"
        ])
        self.assertTrue(args.skip_api_reference)
        self.assertTrue(args.skip_dependency_graph)
@@ -82,6 +83,7 @@ class TestAnalyzeSubcommand(unittest.TestCase):
        self.assertTrue(args.skip_test_examples)
        self.assertTrue(args.skip_how_to_guides)
        self.assertTrue(args.skip_config_patterns)
+        self.assertTrue(args.skip_docs)

    def test_backward_compatible_depth_flag(self):
        """Test that deprecated --depth flag still works."""
--- a/tests/test_codebase_scraper.py
+++ b/tests/test_codebase_scraper.py
@@ -21,10 +21,17 @@ sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "src"))

 from skill_seekers.cli.codebase_scraper import (
    DEFAULT_EXCLUDED_DIRS,
+    FOLDER_CATEGORIES,
+    MARKDOWN_EXTENSIONS,
+    ROOT_DOC_CATEGORIES,
+    categorize_markdown_file,
    detect_language,
+    extract_markdown_structure,
+    generate_markdown_summary,
    load_gitignore,
    should_exclude_dir,
    walk_directory,
+    walk_markdown_files,
 )


@@ -201,6 +208,191 @@ class TestGitignoreLoading(unittest.TestCase):
            self.assertIsNotNone(spec)


+class TestMarkdownDocumentation(unittest.TestCase):
+    """Tests for markdown documentation extraction (C3.9)"""
+
+    def setUp(self):
+        """Set up test environment"""
+        self.temp_dir = tempfile.mkdtemp()
+        self.root = Path(self.temp_dir)
+
+    def tearDown(self):
+        """Clean up test environment"""
+        shutil.rmtree(self.temp_dir, ignore_errors=True)
+
+    def test_markdown_extensions(self):
+        """Test that markdown extensions are properly defined."""
+        self.assertIn(".md", MARKDOWN_EXTENSIONS)
+        self.assertIn(".markdown", MARKDOWN_EXTENSIONS)
+
+    def test_root_doc_categories(self):
+        """Test root document category mapping."""
+        self.assertEqual(ROOT_DOC_CATEGORIES.get("readme"), "overview")
+        self.assertEqual(ROOT_DOC_CATEGORIES.get("changelog"), "changelog")
+        self.assertEqual(ROOT_DOC_CATEGORIES.get("architecture"), "architecture")
+
+    def test_folder_categories(self):
+        """Test folder category mapping."""
+        self.assertEqual(FOLDER_CATEGORIES.get("guides"), "guides")
+        self.assertEqual(FOLDER_CATEGORIES.get("tutorials"), "guides")
+        self.assertEqual(FOLDER_CATEGORIES.get("workflows"), "workflows")
+        self.assertEqual(FOLDER_CATEGORIES.get("architecture"), "architecture")
+
+    def test_walk_markdown_files(self):
+        """Test walking directory for markdown files."""
+        # Create test markdown files
+        (self.root / "README.md").write_text("# Test README")
+        (self.root / "test.py").write_text("print('test')")
+
+        docs_dir = self.root / "docs"
+        docs_dir.mkdir()
+        (docs_dir / "guide.md").write_text("# Guide")
+
+        files = walk_markdown_files(self.root)
+
+        # Should find markdown files only
+        self.assertEqual(len(files), 2)
+        filenames = [f.name for f in files]
+        self.assertIn("README.md", filenames)
+        self.assertIn("guide.md", filenames)
+
+    def test_categorize_root_readme(self):
+        """Test categorizing root README file."""
+        readme_path = self.root / "README.md"
+        readme_path.write_text("# Test")
+
+        category = categorize_markdown_file(readme_path, self.root)
+        self.assertEqual(category, "overview")
+
+    def test_categorize_changelog(self):
+        """Test categorizing CHANGELOG file."""
+        changelog_path = self.root / "CHANGELOG.md"
+        changelog_path.write_text("# Changelog")
+
+        category = categorize_markdown_file(changelog_path, self.root)
+        self.assertEqual(category, "changelog")
+
+    def test_categorize_docs_guide(self):
+        """Test categorizing file in docs/guides folder."""
+        guides_dir = self.root / "docs" / "guides"
+        guides_dir.mkdir(parents=True)
+        guide_path = guides_dir / "getting-started.md"
+        guide_path.write_text("# Getting Started")
+
+        category = categorize_markdown_file(guide_path, self.root)
+        self.assertEqual(category, "guides")
+
+    def test_categorize_architecture(self):
+        """Test categorizing architecture documentation."""
+        arch_dir = self.root / "docs" / "architecture"
+        arch_dir.mkdir(parents=True)
+        arch_path = arch_dir / "overview.md"
+        arch_path.write_text("# Architecture")
+
+        category = categorize_markdown_file(arch_path, self.root)
+        self.assertEqual(category, "architecture")
+
+
+class TestMarkdownStructureExtraction(unittest.TestCase):
+    """Tests for markdown structure extraction"""
+
+    def test_extract_headers(self):
+        """Test extracting headers from markdown."""
+        content = """# Main Title
+
+## Section 1
+Some content
+
+### Subsection
+More content
+
+## Section 2
+"""
+        structure = extract_markdown_structure(content)
+
+        self.assertEqual(structure["title"], "Main Title")
+        self.assertEqual(len(structure["headers"]), 4)
+        self.assertEqual(structure["headers"][0]["level"], 1)
+        self.assertEqual(structure["headers"][1]["level"], 2)
+
+    def test_extract_code_blocks(self):
+        """Test extracting code blocks from markdown."""
+        content = """# Example
+
+```python
+def hello():
+    print("Hello")
+```
+
+```javascript
+console.log("test");
+```
+"""
+        structure = extract_markdown_structure(content)
+
+        self.assertEqual(len(structure["code_blocks"]), 2)
+        self.assertEqual(structure["code_blocks"][0]["language"], "python")
+        self.assertEqual(structure["code_blocks"][1]["language"], "javascript")
+
+    def test_extract_links(self):
+        """Test extracting links from markdown."""
+        content = """# Links
+
+Check out [Example](https://example.com) and [Another](./local.md).
+"""
+        structure = extract_markdown_structure(content)
+
+        self.assertEqual(len(structure["links"]), 2)
+        self.assertEqual(structure["links"][0]["text"], "Example")
+        self.assertEqual(structure["links"][0]["url"], "https://example.com")
+
+    def test_word_and_line_count(self):
+        """Test word and line count."""
+        content = "First line\nSecond line\nThird line"
+        structure = extract_markdown_structure(content)
+
+        self.assertEqual(structure["line_count"], 3)
+        self.assertEqual(structure["word_count"], 6)  # First, line, Second, line, Third, line
+
+
+class TestMarkdownSummaryGeneration(unittest.TestCase):
+    """Tests for markdown summary generation"""
+
+    def test_generate_summary_with_title(self):
+        """Test summary includes title."""
+        content = "# My Title\n\nSome content here."
+        structure = extract_markdown_structure(content)
+        summary = generate_markdown_summary(content, structure)
+
+        self.assertIn("**My Title**", summary)
+
+    def test_generate_summary_with_sections(self):
+        """Test summary includes section names."""
+        content = """# Main
+
+## Getting Started
+Content
+
+## Installation
+Content
+
+## Usage
+Content
+"""
+        structure = extract_markdown_structure(content)
+        summary = generate_markdown_summary(content, structure)
+
+        self.assertIn("Sections:", summary)
+
+    def test_generate_summary_truncation(self):
+        """Test summary is truncated to max length."""
+        content = "# Title\n\n" + "Long content. " * 100
+        structure = extract_markdown_structure(content)
+        summary = generate_markdown_summary(content, structure, max_length=200)
+
+        self.assertLessEqual(len(summary), 210)  # Allow some buffer for truncation marker
+
+
 if __name__ == "__main__":
    # Run tests with verbose output
    unittest.main(verbosity=2)