Problem:
The analyze command created duplicate documentation directories:
- output/skill-seekers/documentation/ (1.5MB) - Not referenced
- output/skill-seekers/references/documentation/ (1.5MB) - Referenced
This wasted 1.5MB per skill (50% duplication).
Root Cause:
_generate_references() copied directories to references/ but never
cleaned up the source directories.
Solution:
After copying each directory to references/, immediately remove the
source directory using shutil.rmtree(). SKILL.md only references
references/{target}, making the source directories redundant.
Changes:
- Add cleanup in _generate_references() after each copytree operation
- Add 2 comprehensive tests to verify no duplicate directories
- Test coverage: 38/38 tests passing in test_codebase_scraper.py
Impact:
- Saves about 1.5MB per skill in the reported case (actual savings scale with documentation size)
- Prevents 50% duplication of all analysis output directories
- Clean, efficient disk usage
Tests Added:
- test_no_duplicate_directories_created: Verifies source cleanup
- test_no_disk_space_wasted: Verifies single copy in references/
Reported by: @yangshare via Issue #279
Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
479 lines
16 KiB
Python
479 lines
16 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Tests for codebase_scraper.py - Standalone codebase analysis CLI.
|
|
|
|
Test Coverage:
|
|
- Language detection
|
|
- Directory exclusion
|
|
- File walking
|
|
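- .gitignore loading
- Markdown documentation discovery, categorization, structure extraction, and summaries
- Reference generation cleanup (Issue #279)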
|
|
"""
|
|
|
|
import os
|
|
import shutil
|
|
import sys
|
|
import tempfile
|
|
import unittest
|
|
from pathlib import Path
|
|
|
|
# Add src to path
|
|
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "src"))
|
|
|
|
from skill_seekers.cli.codebase_scraper import (
|
|
DEFAULT_EXCLUDED_DIRS,
|
|
FOLDER_CATEGORIES,
|
|
MARKDOWN_EXTENSIONS,
|
|
ROOT_DOC_CATEGORIES,
|
|
_generate_references,
|
|
categorize_markdown_file,
|
|
detect_language,
|
|
extract_markdown_structure,
|
|
generate_markdown_summary,
|
|
load_gitignore,
|
|
should_exclude_dir,
|
|
walk_directory,
|
|
walk_markdown_files,
|
|
)
|
|
|
|
|
|
class TestLanguageDetection(unittest.TestCase):
    """Check that detect_language maps file extensions to language names."""

    def _assert_detected(self, expected, *filenames):
        """Assert that each name in *filenames* is detected as *expected*."""
        for filename in filenames:
            self.assertEqual(detect_language(Path(filename)), expected)

    def test_python_detection(self):
        """A .py file is reported as Python."""
        self._assert_detected("Python", "test.py")

    def test_javascript_detection(self):
        """Both .js and .jsx are reported as JavaScript."""
        self._assert_detected("JavaScript", "test.js", "test.jsx")

    def test_typescript_detection(self):
        """Both .ts and .tsx are reported as TypeScript."""
        self._assert_detected("TypeScript", "test.ts", "test.tsx")

    def test_cpp_detection(self):
        """The .cpp, .h and .hpp extensions are all reported as C++."""
        self._assert_detected("C++", "test.cpp", "test.h", "test.hpp")

    def test_csharp_detection(self):
        """A .cs file is reported as C#."""
        self._assert_detected("C#", "test.cs")

    def test_go_detection(self):
        """A .go file is reported as Go."""
        self._assert_detected("Go", "test.go")

    def test_rust_detection(self):
        """A .rs file is reported as Rust."""
        self._assert_detected("Rust", "test.rs")

    def test_java_detection(self):
        """A .java file is reported as Java."""
        self._assert_detected("Java", "test.java")

    def test_ruby_detection(self):
        """A .rb file is reported as Ruby."""
        self._assert_detected("Ruby", "test.rb")

    def test_php_detection(self):
        """A .php file is reported as PHP."""
        self._assert_detected("PHP", "test.php")

    def test_unknown_language(self):
        """The .swift and .txt extensions are reported as Unknown."""
        self._assert_detected("Unknown", "test.swift", "test.txt")
|
|
|
|
|
|
class TestDirectoryExclusion(unittest.TestCase):
    """Check should_exclude_dir against the DEFAULT_EXCLUDED_DIRS set."""

    def test_node_modules_excluded(self):
        """node_modules is rejected under the default exclusions."""
        is_excluded = should_exclude_dir("node_modules", DEFAULT_EXCLUDED_DIRS)
        self.assertTrue(is_excluded)

    def test_venv_excluded(self):
        """venv is rejected under the default exclusions."""
        is_excluded = should_exclude_dir("venv", DEFAULT_EXCLUDED_DIRS)
        self.assertTrue(is_excluded)

    def test_git_excluded(self):
        """.git is rejected under the default exclusions."""
        is_excluded = should_exclude_dir(".git", DEFAULT_EXCLUDED_DIRS)
        self.assertTrue(is_excluded)

    def test_normal_dir_not_excluded(self):
        """Ordinary project directories (src, tests) are kept."""
        for dir_name in ("src", "tests"):
            self.assertFalse(should_exclude_dir(dir_name, DEFAULT_EXCLUDED_DIRS))
|
|
|
|
|
|
class TestDirectoryWalking(unittest.TestCase):
    """Exercise walk_directory against small temporary directory trees."""

    def setUp(self):
        """Create a scratch directory and expose it as ``self.root``."""
        self.temp_dir = tempfile.mkdtemp()
        self.root = Path(self.temp_dir)

    def tearDown(self):
        """Remove the scratch directory, ignoring removal errors."""
        shutil.rmtree(self.temp_dir, ignore_errors=True)

    def test_walk_empty_directory(self):
        """An empty root yields no files."""
        found = walk_directory(self.root)
        self.assertEqual(len(found), 0)

    def test_walk_with_python_files(self):
        """Two .py files are returned; the .txt file next to them is not."""
        fixtures = {
            "test1.py": 'print("test")',
            "test2.py": 'print("test2")',
            "readme.txt": "readme",
        }
        for filename, body in fixtures.items():
            (self.root / filename).write_text(body)

        found = walk_directory(self.root)

        # Only the two Python sources are expected in the result.
        self.assertEqual(len(found), 2)
        self.assertTrue(all(path.suffix == ".py" for path in found))

    def test_walk_excludes_node_modules(self):
        """Files under node_modules are not returned."""
        (self.root / "test.py").write_text("test")

        vendored = self.root / "node_modules"
        vendored.mkdir()
        (vendored / "package.js").write_text("test")

        found = walk_directory(self.root)

        # The root-level test.py is the only expected hit; package.js is not.
        self.assertEqual(len(found), 1)
        self.assertEqual(found[0].name, "test.py")

    def test_walk_with_subdirectories(self):
        """Files in nested src/ and tests/ directories are both returned."""
        layout = (("src", "module.py"), ("tests", "test_module.py"))
        for folder, filename in layout:
            subdir = self.root / folder
            subdir.mkdir()
            (subdir / filename).write_text("test")

        found = walk_directory(self.root)

        self.assertEqual(len(found), 2)
        found_names = [path.name for path in found]
        self.assertIn("module.py", found_names)
        self.assertIn("test_module.py", found_names)
|
|
|
|
|
|
class TestGitignoreLoading(unittest.TestCase):
    """Tests for .gitignore loading via load_gitignore."""

    def setUp(self):
        """Create a scratch directory and expose it as ``self.root``."""
        self.temp_dir = tempfile.mkdtemp()
        self.root = Path(self.temp_dir)

    def tearDown(self):
        """Remove the scratch directory, ignoring removal errors."""
        shutil.rmtree(self.temp_dir, ignore_errors=True)

    def test_no_gitignore(self):
        """load_gitignore returns None when the directory has no .gitignore."""
        spec = load_gitignore(self.root)
        self.assertIsNone(spec)

    def test_load_gitignore(self):
        """A valid .gitignore yields a spec that matches its patterns.

        The test is skipped, rather than passing silently, when
        load_gitignore returns None for an existing .gitignore
        (presumably because the optional pathspec package is missing).
        """
        gitignore_path = self.root / ".gitignore"
        gitignore_path.write_text("*.log\ntemp/\n")

        spec = load_gitignore(self.root)

        if spec is None:
            # Report the missing coverage instead of a vacuous pass.
            self.skipTest("load_gitignore returned None (pathspec presumably not installed)")

        # NOTE(review): assumes the returned object is a pathspec.PathSpec
        # exposing match_file() - confirm against load_gitignore.
        self.assertTrue(spec.match_file("debug.log"))
        self.assertTrue(spec.match_file("temp/cache.txt"))
        self.assertFalse(spec.match_file("main.py"))
|
|
|
|
|
|
class TestMarkdownDocumentation(unittest.TestCase):
    """Markdown documentation discovery and categorization tests (C3.9)."""

    def setUp(self):
        """Create a scratch directory and expose it as ``self.root``."""
        self.temp_dir = tempfile.mkdtemp()
        self.root = Path(self.temp_dir)

    def tearDown(self):
        """Remove the scratch directory, ignoring removal errors."""
        shutil.rmtree(self.temp_dir, ignore_errors=True)

    def test_markdown_extensions(self):
        """MARKDOWN_EXTENSIONS contains both .md and .markdown."""
        for extension in (".md", ".markdown"):
            self.assertIn(extension, MARKDOWN_EXTENSIONS)

    def test_root_doc_categories(self):
        """ROOT_DOC_CATEGORIES maps the well-known root document names."""
        expected_mapping = (
            ("readme", "overview"),
            ("changelog", "changelog"),
            ("architecture", "architecture"),
        )
        for doc_name, category in expected_mapping:
            self.assertEqual(ROOT_DOC_CATEGORIES.get(doc_name), category)

    def test_folder_categories(self):
        """FOLDER_CATEGORIES maps the well-known documentation folders."""
        expected_mapping = (
            ("guides", "guides"),
            ("tutorials", "guides"),
            ("workflows", "workflows"),
            ("architecture", "architecture"),
        )
        for folder_name, category in expected_mapping:
            self.assertEqual(FOLDER_CATEGORIES.get(folder_name), category)

    def test_walk_markdown_files(self):
        """Only the markdown files (root and nested) are returned."""
        (self.root / "README.md").write_text("# Test README")
        (self.root / "test.py").write_text("print('test')")

        nested_docs = self.root / "docs"
        nested_docs.mkdir()
        (nested_docs / "guide.md").write_text("# Guide")

        found = walk_markdown_files(self.root)

        # The .py file must not be counted; both .md files must be.
        self.assertEqual(len(found), 2)
        found_names = [path.name for path in found]
        self.assertIn("README.md", found_names)
        self.assertIn("guide.md", found_names)

    def test_categorize_root_readme(self):
        """A README.md at the root is categorized as overview."""
        target = self.root / "README.md"
        target.write_text("# Test")

        self.assertEqual(categorize_markdown_file(target, self.root), "overview")

    def test_categorize_changelog(self):
        """A CHANGELOG.md at the root is categorized as changelog."""
        target = self.root / "CHANGELOG.md"
        target.write_text("# Changelog")

        self.assertEqual(categorize_markdown_file(target, self.root), "changelog")

    def test_categorize_docs_guide(self):
        """A file under docs/guides is categorized as guides."""
        parent = self.root / "docs" / "guides"
        parent.mkdir(parents=True)
        target = parent / "getting-started.md"
        target.write_text("# Getting Started")

        self.assertEqual(categorize_markdown_file(target, self.root), "guides")

    def test_categorize_architecture(self):
        """A file under docs/architecture is categorized as architecture."""
        parent = self.root / "docs" / "architecture"
        parent.mkdir(parents=True)
        target = parent / "overview.md"
        target.write_text("# Architecture")

        self.assertEqual(categorize_markdown_file(target, self.root), "architecture")
|
|
|
|
|
|
class TestMarkdownStructureExtraction(unittest.TestCase):
    """Check the fields produced by extract_markdown_structure."""

    def test_extract_headers(self):
        """The title and all four headers, with their levels, are captured."""
        markdown = """# Main Title

## Section 1
Some content

### Subsection
More content

## Section 2
"""
        parsed = extract_markdown_structure(markdown)
        headers = parsed["headers"]

        self.assertEqual(parsed["title"], "Main Title")
        self.assertEqual(len(headers), 4)
        self.assertEqual(headers[0]["level"], 1)
        self.assertEqual(headers[1]["level"], 2)

    def test_extract_code_blocks(self):
        """Both fenced code blocks are captured with their language tags."""
        markdown = """# Example

```python
def hello():
    print("Hello")
```

```javascript
console.log("test");
```
"""
        parsed = extract_markdown_structure(markdown)
        code_blocks = parsed["code_blocks"]

        self.assertEqual(len(code_blocks), 2)
        self.assertEqual(code_blocks[0]["language"], "python")
        self.assertEqual(code_blocks[1]["language"], "javascript")

    def test_extract_links(self):
        """Both inline links are captured; the first one's text and URL are checked."""
        markdown = """# Links

Check out [Example](https://example.com) and [Another](./local.md).
"""
        parsed = extract_markdown_structure(markdown)
        links = parsed["links"]

        self.assertEqual(len(links), 2)
        self.assertEqual(links[0]["text"], "Example")
        self.assertEqual(links[0]["url"], "https://example.com")

    def test_word_and_line_count(self):
        """Three two-word lines yield line_count 3 and word_count 6."""
        markdown = "First line\nSecond line\nThird line"
        parsed = extract_markdown_structure(markdown)

        self.assertEqual(parsed["line_count"], 3)
        # Words counted: First, line, Second, line, Third, line
        self.assertEqual(parsed["word_count"], 6)
|
|
|
|
|
|
class TestMarkdownSummaryGeneration(unittest.TestCase):
    """Check the text produced by generate_markdown_summary."""

    @staticmethod
    def _summarize(markdown, **options):
        """Extract the structure of *markdown*, then return its summary."""
        parsed = extract_markdown_structure(markdown)
        return generate_markdown_summary(markdown, parsed, **options)

    def test_generate_summary_with_title(self):
        """The document title appears in bold in the summary."""
        result = self._summarize("# My Title\n\nSome content here.")

        self.assertIn("**My Title**", result)

    def test_generate_summary_with_sections(self):
        """A document with several sections produces a 'Sections:' entry."""
        markdown = """# Main

## Getting Started
Content

## Installation
Content

## Usage
Content
"""
        result = self._summarize(markdown)

        self.assertIn("Sections:", result)

    def test_generate_summary_truncation(self):
        """A long document is cut down to roughly max_length characters."""
        markdown = "# Title\n\n" + "Long content. " * 100
        result = self._summarize(markdown, max_length=200)

        # A 10-character allowance is left for the truncation marker.
        self.assertLessEqual(len(result), 210)
|
|
|
|
|
|
class TestReferenceGeneration(unittest.TestCase):
    """Tests for _generate_references function (Issue #279)."""

    def setUp(self):
        """Create a temporary directory containing an empty output/ folder."""
        self.temp_dir = tempfile.mkdtemp()
        self.output_dir = Path(self.temp_dir) / "output"
        self.output_dir.mkdir()

    def tearDown(self):
        """Remove the temporary directory.

        Uses ignore_errors=True, like the other test classes in this
        module, so a cleanup failure cannot turn a test into an error.
        """
        shutil.rmtree(self.temp_dir, ignore_errors=True)

    def test_no_duplicate_directories_created(self):
        """Test that source directories are cleaned up after copying to references/ (Issue #279)."""
        # Create the directories that _generate_references is expected to
        # relocate, each holding one identifiable file.
        test_dirs = ["documentation", "api_reference", "patterns"]
        for dir_name in test_dirs:
            dir_path = self.output_dir / dir_name
            dir_path.mkdir()
            (dir_path / "test.txt").write_text(f"Test content for {dir_name}")

        # Generate references (should copy and then cleanup)
        _generate_references(self.output_dir)

        references_dir = self.output_dir / "references"
        self.assertTrue(references_dir.exists(), "references/ should exist")

        # Verify content was copied to references/ intact
        for dir_name in test_dirs:
            ref_path = references_dir / dir_name
            copied_file = ref_path / "test.txt"
            self.assertTrue(ref_path.exists(), f"references/{dir_name} should exist")
            self.assertTrue(
                copied_file.exists(),
                f"references/{dir_name}/test.txt should exist",
            )
            self.assertEqual(
                copied_file.read_text(),
                f"Test content for {dir_name}",
                f"references/{dir_name}/test.txt content should be preserved",
            )

        # Verify source directories were cleaned up (Issue #279 fix)
        for dir_name in test_dirs:
            source_path = self.output_dir / dir_name
            self.assertFalse(
                source_path.exists(),
                f"Source directory {dir_name}/ should be cleaned up to avoid duplication",
            )

    def test_no_disk_space_wasted(self):
        """Test that disk space is not wasted by duplicate directories."""
        # Create a documentation directory with some content
        doc_dir = self.output_dir / "documentation"
        doc_dir.mkdir()
        test_content = "x" * 1000  # 1000 characters of content
        (doc_dir / "large_file.txt").write_text(test_content)

        # Generate references
        _generate_references(self.output_dir)

        # Verify only one copy exists (in references/)
        ref_doc_dir = self.output_dir / "references" / "documentation"
        source_doc_dir = self.output_dir / "documentation"

        self.assertTrue(ref_doc_dir.exists(), "references/documentation/ should exist")
        self.assertFalse(
            source_doc_dir.exists(), "Source documentation/ should not exist (cleaned up)"
        )

        # Verify content is accessible in references/
        self.assertTrue(
            (ref_doc_dir / "large_file.txt").exists(), "File should exist in references/"
        )
        self.assertEqual(
            (ref_doc_dir / "large_file.txt").read_text(),
            test_content,
            "File content should be preserved",
        )
|
|
|
|
|
|
if __name__ == "__main__":
    # Executed directly: hand control to unittest's runner at verbosity 2.
    unittest.main(verbosity=2)
|