fix: Remove duplicate documentation directories to save disk space (fixes #279)

Problem:
The analyze command created duplicate documentation directories:
- output/skill-seekers/documentation/ (1.5MB) - Not referenced
- output/skill-seekers/references/documentation/ (1.5MB) - Referenced
This wasted 1.5MB per skill (50% duplication).

Root Cause:
_generate_references() copied directories to references/ but never
cleaned up the source directories.

Solution:
After copying each directory to references/, immediately remove the
source directory using shutil.rmtree(). SKILL.md only references
references/{target}, making the source directories redundant.

Changes:
- Add cleanup in _generate_references() after each copytree operation
- Add 2 comprehensive tests to verify no duplicate directories
- Test coverage: 38/38 tests passing in test_codebase_scraper.py

Impact:
- Saves ~1.5MB per skill in the reported case (actual savings scale with documentation size)
- Prevents 50% duplication of every analysis output directory that is copied into references/
- Cleaner, more efficient disk usage

Tests Added:
- test_no_duplicate_directories_created: Verifies source cleanup
- test_no_disk_space_wasted: Verifies single copy in references/

Reported by: @yangshare via Issue #279

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
yusyus
2026-02-05 21:27:41 +03:00
parent 31d83245da
commit 5492fe3dc0
2 changed files with 85 additions and 0 deletions

View File

@@ -1855,6 +1855,11 @@ def _generate_references(output_dir: Path):
shutil.copytree(source_dir, target_dir)
logger.debug(f"Copied {source} → references/{target}")
# Clean up source directory to avoid duplication (Issue #279)
# SKILL.md only references references/{target}, so source dir is redundant
shutil.rmtree(source_dir)
logger.debug(f"Cleaned up duplicate {source}/ directory")
logger.info(f"✅ Generated references directory: {references_dir}")

View File

@@ -24,6 +24,7 @@ from skill_seekers.cli.codebase_scraper import (
FOLDER_CATEGORIES,
MARKDOWN_EXTENSIONS,
ROOT_DOC_CATEGORIES,
_generate_references,
categorize_markdown_file,
detect_language,
extract_markdown_structure,
@@ -393,6 +394,85 @@ Content
self.assertLessEqual(len(summary), 210) # Allow some buffer for truncation marker
class TestReferenceGeneration(unittest.TestCase):
    """Regression tests for ``_generate_references`` (Issue #279).

    Each test seeds a scratch output directory, calls
    ``_generate_references`` on it, and checks that the seeded content ends
    up under ``references/`` while the original top-level directories are gone.
    """

    def setUp(self):
        """Build a fresh temp dir containing an empty ``output`` directory."""
        self.temp_dir = tempfile.mkdtemp()
        self.output_dir = Path(self.temp_dir) / "output"
        self.output_dir.mkdir()

    def tearDown(self):
        """Delete the temp dir created in ``setUp`` if it is still present."""
        if not os.path.exists(self.temp_dir):
            return
        shutil.rmtree(self.temp_dir)

    def test_no_duplicate_directories_created(self):
        """Source dirs must be gone once their copy exists in references/ (Issue #279)."""
        # Seed one marker file in each directory the function is expected to move.
        seeded_names = ["documentation", "api_reference", "patterns"]
        for dir_name in seeded_names:
            seeded_dir = self.output_dir / dir_name
            seeded_dir.mkdir()
            marker = seeded_dir / "test.txt"
            marker.write_text(f"Test content for {dir_name}")

        # Exercise the function under test.
        _generate_references(self.output_dir)

        # The references/ root has to be present.
        refs_root = self.output_dir / "references"
        self.assertTrue(refs_root.exists(), "references/ should exist")

        # Every seeded directory and its marker file must be found under references/.
        for dir_name in seeded_names:
            copied_dir = refs_root / dir_name
            self.assertTrue(copied_dir.exists(), f"references/{dir_name} should exist")
            copied_marker = copied_dir / "test.txt"
            self.assertTrue(
                copied_marker.exists(),
                f"references/{dir_name}/test.txt should exist",
            )

        # The original top-level directories must no longer exist (Issue #279 fix).
        for dir_name in seeded_names:
            leftover = self.output_dir / dir_name
            self.assertFalse(
                leftover.exists(),
                f"Source directory {dir_name}/ should be cleaned up to avoid duplication",
            )

    def test_no_disk_space_wasted(self):
        """Only one copy of documentation/ may remain, and it lives in references/."""
        # Seed documentation/ with a single 1000-character file.
        payload = "x" * 1000
        original_dir = self.output_dir / "documentation"
        original_dir.mkdir()
        (original_dir / "large_file.txt").write_text(payload)

        # Exercise the function under test.
        _generate_references(self.output_dir)

        # Exactly one location should hold the directory afterwards.
        copied_dir = self.output_dir / "references" / "documentation"
        self.assertTrue(copied_dir.exists(), "references/documentation/ should exist")
        self.assertFalse(
            original_dir.exists(), "Source documentation/ should not exist (cleaned up)"
        )

        # The file must be reachable in references/ with its content unchanged.
        copied_file = copied_dir / "large_file.txt"
        self.assertTrue(copied_file.exists(), "File should exist in references/")
        self.assertEqual(
            copied_file.read_text(),
            payload,
            "File content should be preserved",
        )
if __name__ == "__main__":
    # Executed directly (not imported): run the suite with per-test verbose output.
    unittest.main(verbosity=2)