skill-seekers-reference/tests/test_codebase_scraper.py

#!/usr/bin/env python3
"""
Tests for codebase_scraper.py - Standalone codebase analysis CLI.

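Test Coverage:
- Language detection
- Directory exclusion
- File walking
- .gitignore loading
- Markdown documentation discovery and categorization
- Markdown structure extraction and summary generation
- Reference generation cleanup (Issue #279)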
"""

import os
import shutil
import sys
import tempfile
import unittest
from pathlib import Path

# Prepend the sibling ``src`` directory (resolved relative to this test file)
# to the import path so the ``skill_seekers`` import below can be found there.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "src"))

from skill_seekers.cli.codebase_scraper import (
    DEFAULT_EXCLUDED_DIRS,
    FOLDER_CATEGORIES,
    MARKDOWN_EXTENSIONS,
    ROOT_DOC_CATEGORIES,
    _generate_references,
    categorize_markdown_file,
    detect_language,
    extract_markdown_structure,
    generate_markdown_summary,
    load_gitignore,
    should_exclude_dir,
    walk_directory,
    walk_markdown_files,
)


class TestLanguageDetection(unittest.TestCase):
    """Verify that detect_language maps file extensions to language names."""

    def _assert_language(self, expected, *filenames):
        """Assert that each filename, in order, is detected as ``expected``."""
        for filename in filenames:
            self.assertEqual(detect_language(Path(filename)), expected)

    def test_python_detection(self):
        """``.py`` is reported as Python."""
        self._assert_language("Python", "test.py")

    def test_javascript_detection(self):
        """``.js`` and ``.jsx`` are reported as JavaScript."""
        self._assert_language("JavaScript", "test.js", "test.jsx")

    def test_typescript_detection(self):
        """``.ts`` and ``.tsx`` are reported as TypeScript."""
        self._assert_language("TypeScript", "test.ts", "test.tsx")

    def test_cpp_detection(self):
        """``.cpp``, ``.h`` and ``.hpp`` are reported as C++."""
        self._assert_language("C++", "test.cpp", "test.h", "test.hpp")

    def test_csharp_detection(self):
        """``.cs`` is reported as C#."""
        self._assert_language("C#", "test.cs")

    def test_go_detection(self):
        """``.go`` is reported as Go."""
        self._assert_language("Go", "test.go")

    def test_rust_detection(self):
        """``.rs`` is reported as Rust."""
        self._assert_language("Rust", "test.rs")

    def test_java_detection(self):
        """``.java`` is reported as Java."""
        self._assert_language("Java", "test.java")

    def test_ruby_detection(self):
        """``.rb`` is reported as Ruby."""
        self._assert_language("Ruby", "test.rb")

    def test_php_detection(self):
        """``.php`` is reported as PHP."""
        self._assert_language("PHP", "test.php")

    def test_unknown_language(self):
        """Extensions these tests expect to be unrecognised yield "Unknown"."""
        self._assert_language("Unknown", "test.swift", "test.txt")


class TestDirectoryExclusion(unittest.TestCase):
    """Check should_exclude_dir against the default exclusion set."""

    def test_node_modules_excluded(self):
        """``node_modules`` is rejected by the default exclusions."""
        is_excluded = should_exclude_dir("node_modules", DEFAULT_EXCLUDED_DIRS)
        self.assertTrue(is_excluded)

    def test_venv_excluded(self):
        """``venv`` is rejected by the default exclusions."""
        is_excluded = should_exclude_dir("venv", DEFAULT_EXCLUDED_DIRS)
        self.assertTrue(is_excluded)

    def test_git_excluded(self):
        """``.git`` is rejected by the default exclusions."""
        is_excluded = should_exclude_dir(".git", DEFAULT_EXCLUDED_DIRS)
        self.assertTrue(is_excluded)

    def test_normal_dir_not_excluded(self):
        """Ordinary project directories are kept."""
        for dir_name in ("src", "tests"):
            self.assertFalse(should_exclude_dir(dir_name, DEFAULT_EXCLUDED_DIRS))


class TestDirectoryWalking(unittest.TestCase):
    """Exercise walk_directory against small throwaway directory trees."""

    def setUp(self):
        """Create a fresh temporary directory to act as the project root."""
        self.temp_dir = tempfile.mkdtemp()
        self.root = Path(self.temp_dir)

    def tearDown(self):
        """Remove the temporary tree, ignoring any removal errors."""
        shutil.rmtree(self.temp_dir, ignore_errors=True)

    def test_walk_empty_directory(self):
        """An empty root yields no files."""
        self.assertEqual(len(walk_directory(self.root)), 0)

    def test_walk_with_python_files(self):
        """Only the two ``.py`` files are returned; the ``.txt`` file is not."""
        fixtures = (
            ("test1.py", 'print("test")'),
            ("test2.py", 'print("test2")'),
            ("readme.txt", "readme"),
        )
        for file_name, body in fixtures:
            (self.root / file_name).write_text(body)

        found = walk_directory(self.root)

        self.assertEqual(len(found), 2)
        self.assertTrue(all(entry.suffix == ".py" for entry in found))

    def test_walk_excludes_node_modules(self):
        """Files inside ``node_modules`` are not reported."""
        (self.root / "test.py").write_text("test")

        # A source file that lives inside the excluded directory.
        excluded_dir = self.root / "node_modules"
        excluded_dir.mkdir()
        (excluded_dir / "package.js").write_text("test")

        found = walk_directory(self.root)

        # Only the root-level test.py should remain.
        self.assertEqual(len(found), 1)
        self.assertEqual(found[0].name, "test.py")

    def test_walk_with_subdirectories(self):
        """Files in nested directories are discovered."""
        for folder, file_name in (("src", "module.py"), ("tests", "test_module.py")):
            sub_dir = self.root / folder
            sub_dir.mkdir()
            (sub_dir / file_name).write_text("test")

        found = walk_directory(self.root)

        self.assertEqual(len(found), 2)
        found_names = [entry.name for entry in found]
        self.assertIn("module.py", found_names)
        self.assertIn("test_module.py", found_names)


class TestGitignoreLoading(unittest.TestCase):
    """Tests for .gitignore loading"""

    def setUp(self):
        """Set up test environment"""
        self.temp_dir = tempfile.mkdtemp()
        self.root = Path(self.temp_dir)

    def tearDown(self):
        """Clean up test environment"""
        shutil.rmtree(self.temp_dir, ignore_errors=True)

    def test_no_gitignore(self):
        """Test behavior when no .gitignore exists."""
        spec = load_gitignore(self.root)
        # Should return None when no .gitignore found
        self.assertIsNone(spec)

    def test_load_gitignore(self):
        """Test loading valid .gitignore file.

        Skipped when the optional ``pathspec`` package is not installed,
        because load_gitignore is expected to return None in that case.
        Previously the only assertion was guarded by ``if spec is not None``
        and could never fail; the result is now checked unconditionally.
        """
        # Local import: only this test needs it, and it keeps the module's
        # top-level import block unchanged.
        import importlib.util

        if importlib.util.find_spec("pathspec") is None:
            self.skipTest("pathspec is not installed; .gitignore specs cannot be built")

        # Create .gitignore
        gitignore_path = self.root / ".gitignore"
        gitignore_path.write_text("*.log\ntemp/\n")

        spec = load_gitignore(self.root)

        # With pathspec available, a readable .gitignore must produce a spec.
        self.assertIsNotNone(spec)
        # The loaded spec should honour the patterns that were written.
        self.assertTrue(spec.match_file("debug.log"))
        self.assertFalse(spec.match_file("main.py"))


class TestMarkdownDocumentation(unittest.TestCase):
    """Cover markdown discovery and categorization helpers (C3.9)."""

    def setUp(self):
        """Create a fresh temporary directory to act as the project root."""
        self.temp_dir = tempfile.mkdtemp()
        self.root = Path(self.temp_dir)

    def tearDown(self):
        """Remove the temporary tree, ignoring any removal errors."""
        shutil.rmtree(self.temp_dir, ignore_errors=True)

    def test_markdown_extensions(self):
        """Both common markdown suffixes are registered."""
        for suffix in (".md", ".markdown"):
            self.assertIn(suffix, MARKDOWN_EXTENSIONS)

    def test_root_doc_categories(self):
        """Well-known root document names map to their categories."""
        expected_pairs = (
            ("readme", "overview"),
            ("changelog", "changelog"),
            ("architecture", "architecture"),
        )
        for doc_name, expected in expected_pairs:
            self.assertEqual(ROOT_DOC_CATEGORIES.get(doc_name), expected)

    def test_folder_categories(self):
        """Documentation folder names map to their categories."""
        expected_pairs = (
            ("guides", "guides"),
            ("tutorials", "guides"),
            ("workflows", "workflows"),
            ("architecture", "architecture"),
        )
        for folder_name, expected in expected_pairs:
            self.assertEqual(FOLDER_CATEGORIES.get(folder_name), expected)

    def test_walk_markdown_files(self):
        """Markdown files are found at the root and in subfolders; code is not."""
        (self.root / "README.md").write_text("# Test README")
        (self.root / "test.py").write_text("print('test')")

        docs = self.root / "docs"
        docs.mkdir()
        (docs / "guide.md").write_text("# Guide")

        found = walk_markdown_files(self.root)

        self.assertEqual(len(found), 2)
        found_names = [entry.name for entry in found]
        self.assertIn("README.md", found_names)
        self.assertIn("guide.md", found_names)

    def test_categorize_root_readme(self):
        """A root-level README is categorized as "overview"."""
        target = self.root / "README.md"
        target.write_text("# Test")

        self.assertEqual(categorize_markdown_file(target, self.root), "overview")

    def test_categorize_changelog(self):
        """A root-level CHANGELOG is categorized as "changelog"."""
        target = self.root / "CHANGELOG.md"
        target.write_text("# Changelog")

        self.assertEqual(categorize_markdown_file(target, self.root), "changelog")

    def test_categorize_docs_guide(self):
        """A file under docs/guides is categorized as "guides"."""
        parent = self.root / "docs" / "guides"
        parent.mkdir(parents=True)
        target = parent / "getting-started.md"
        target.write_text("# Getting Started")

        self.assertEqual(categorize_markdown_file(target, self.root), "guides")

    def test_categorize_architecture(self):
        """A file under docs/architecture is categorized as "architecture"."""
        parent = self.root / "docs" / "architecture"
        parent.mkdir(parents=True)
        target = parent / "overview.md"
        target.write_text("# Architecture")

        self.assertEqual(categorize_markdown_file(target, self.root), "architecture")


class TestMarkdownStructureExtraction(unittest.TestCase):
    """Check the structure dict produced by extract_markdown_structure."""

    def test_extract_headers(self):
        """The title and all four headers are captured with their levels."""
        markdown = (
            "# Main Title\n"
            "\n"
            "## Section 1\n"
            "Some content\n"
            "\n"
            "### Subsection\n"
            "More content\n"
            "\n"
            "## Section 2\n"
        )
        parsed = extract_markdown_structure(markdown)

        self.assertEqual(parsed["title"], "Main Title")
        headers = parsed["headers"]
        self.assertEqual(len(headers), 4)
        self.assertEqual(headers[0]["level"], 1)
        self.assertEqual(headers[1]["level"], 2)

    def test_extract_code_blocks(self):
        """Fenced code blocks are captured along with their language tags."""
        markdown = (
            "# Example\n"
            "\n"
            "```python\n"
            "def hello():\n"
            '    print("Hello")\n'
            "```\n"
            "\n"
            "```javascript\n"
            'console.log("test");\n'
            "```\n"
        )
        parsed = extract_markdown_structure(markdown)

        blocks = parsed["code_blocks"]
        self.assertEqual(len(blocks), 2)
        self.assertEqual(blocks[0]["language"], "python")
        self.assertEqual(blocks[1]["language"], "javascript")

    def test_extract_links(self):
        """Inline links are captured with their text and URL."""
        markdown = (
            "# Links\n"
            "\n"
            "Check out [Example](https://example.com) and [Another](./local.md).\n"
        )
        parsed = extract_markdown_structure(markdown)

        links = parsed["links"]
        self.assertEqual(len(links), 2)
        self.assertEqual(links[0]["text"], "Example")
        self.assertEqual(links[0]["url"], "https://example.com")

    def test_word_and_line_count(self):
        """Line and word totals are reported for plain text."""
        parsed = extract_markdown_structure("First line\nSecond line\nThird line")

        self.assertEqual(parsed["line_count"], 3)
        # Three lines of two words each.
        self.assertEqual(parsed["word_count"], 6)


class TestMarkdownSummaryGeneration(unittest.TestCase):
    """Check the text produced by generate_markdown_summary."""

    def test_generate_summary_with_title(self):
        """The document title appears in bold in the summary."""
        markdown = "# My Title\n\nSome content here."
        parsed = extract_markdown_structure(markdown)

        result = generate_markdown_summary(markdown, parsed)

        self.assertIn("**My Title**", result)

    def test_generate_summary_with_sections(self):
        """A document with several sections yields a "Sections:" listing."""
        markdown = (
            "# Main\n"
            "\n"
            "## Getting Started\n"
            "Content\n"
            "\n"
            "## Installation\n"
            "Content\n"
            "\n"
            "## Usage\n"
            "Content\n"
        )
        parsed = extract_markdown_structure(markdown)

        result = generate_markdown_summary(markdown, parsed)

        self.assertIn("Sections:", result)

    def test_generate_summary_truncation(self):
        """A long document is cut down to roughly ``max_length`` characters."""
        markdown = "# Title\n\n" + "Long content. " * 100
        parsed = extract_markdown_structure(markdown)

        result = generate_markdown_summary(markdown, parsed, max_length=200)

        # A small allowance on top of max_length covers the truncation marker.
        self.assertLessEqual(len(result), 210)


class TestReferenceGeneration(unittest.TestCase):
    """Check that _generate_references moves content into references/ (Issue #279)."""

    def setUp(self):
        """Create a temporary directory containing an empty ``output`` folder."""
        self.temp_dir = tempfile.mkdtemp()
        self.output_dir = Path(self.temp_dir) / "output"
        self.output_dir.mkdir()

    def tearDown(self):
        """Delete the temporary directory if it is still present."""
        if not os.path.exists(self.temp_dir):
            return
        shutil.rmtree(self.temp_dir)

    def test_no_duplicate_directories_created(self):
        """Source directories are removed once copied into references/ (Issue #279)."""
        test_dirs = ["documentation", "api_reference", "patterns"]

        # Populate each source directory with a single marker file.
        for dir_name in test_dirs:
            source_dir = self.output_dir / dir_name
            source_dir.mkdir()
            (source_dir / "test.txt").write_text(f"Test content for {dir_name}")

        _generate_references(self.output_dir)

        references_dir = self.output_dir / "references"
        self.assertTrue(references_dir.exists(), "references/ should exist")

        # Every source directory and its marker file must be present under references/.
        for dir_name in test_dirs:
            copied_dir = references_dir / dir_name
            self.assertTrue(copied_dir.exists(), f"references/{dir_name} should exist")
            marker = copied_dir / "test.txt"
            self.assertTrue(
                marker.exists(),
                f"references/{dir_name}/test.txt should exist",
            )

        # The originals must be gone so the content is not stored twice.
        for dir_name in test_dirs:
            leftover = self.output_dir / dir_name
            self.assertFalse(
                leftover.exists(),
                f"Source directory {dir_name}/ should be cleaned up to avoid duplication",
            )

    def test_no_disk_space_wasted(self):
        """Only the references/ copy of a documentation file remains afterwards."""
        test_content = "x" * 1000  # 1000 characters of filler content
        source_doc_dir = self.output_dir / "documentation"
        source_doc_dir.mkdir()
        (source_doc_dir / "large_file.txt").write_text(test_content)

        _generate_references(self.output_dir)

        ref_doc_dir = self.output_dir / "references" / "documentation"
        copied_file = ref_doc_dir / "large_file.txt"

        # Exactly one copy should exist, and it should be the one in references/.
        self.assertTrue(ref_doc_dir.exists(), "references/documentation/ should exist")
        self.assertFalse(
            source_doc_dir.exists(), "Source documentation/ should not exist (cleaned up)"
        )

        # The surviving copy must still be readable and unchanged.
        self.assertTrue(copied_file.exists(), "File should exist in references/")
        self.assertEqual(
            copied_file.read_text(),
            test_content,
            "File content should be preserved",
        )


if __name__ == "__main__":
    # When executed directly as a script, run this module's tests through
    # unittest's runner with verbosity level 2.
    unittest.main(verbosity=2)