feat(C2.7): Add standalone codebase-scraper CLI tool
- Created src/skill_seekers/cli/codebase_scraper.py (420 lines)
- Standalone tool for analyzing local codebases without GitHub API
- Full .gitignore support using pathspec library
Features:
- Directory tree walking with .gitignore respect
- Multi-language code analysis (Python, JavaScript, TypeScript, C++)
- Language filtering (--languages Python,JavaScript)
- File pattern matching (--file-patterns "*.py,src/**/*.js")
- API reference generation (--build-api-reference)
- Comment extraction (enabled by default)
- Configurable analysis depth (surface/deep/full)
- Smart directory exclusion (node_modules, venv, .git, etc.)
CLI Usage:
skill-seekers-codebase --directory /path/to/repo --output output/codebase/
skill-seekers-codebase --directory . --depth deep --build-api-reference
skill-seekers-codebase --directory . --languages Python,JavaScript
Output:
- code_analysis.json - Complete analysis results
- api_reference/*.md - Generated API documentation (optional)
Tests:
- Created tests/test_codebase_scraper.py with 15 tests
- All tests passing ✅
- Test coverage: Language detection (5 tests), directory exclusion (4 tests),
directory walking (4 tests), .gitignore loading (2 tests)
Dependencies Added:
- pathspec>=0.12.1 - For .gitignore parsing
Entry Point:
- Added skill-seekers-codebase to pyproject.toml
Related Issues:
- Closes #69 (C2.7 Create codebase_scraper.py CLI tool)
- Part of C2 Local Codebase Scraping roadmap (TIER 3)
Files Modified:
- src/skill_seekers/cli/codebase_scraper.py (CREATE - 420 lines)
- tests/test_codebase_scraper.py (CREATE - 182 lines)
- pyproject.toml (+2 lines - pathspec dependency + entry point)
This commit is contained in:
@@ -55,6 +55,7 @@ dependencies = [
|
|||||||
"jsonschema>=4.25.1",
|
"jsonschema>=4.25.1",
|
||||||
"click>=8.3.0",
|
"click>=8.3.0",
|
||||||
"Pygments>=2.19.2",
|
"Pygments>=2.19.2",
|
||||||
|
"pathspec>=0.12.1",
|
||||||
]
|
]
|
||||||
|
|
||||||
[project.optional-dependencies]
|
[project.optional-dependencies]
|
||||||
@@ -130,6 +131,7 @@ skill-seekers-upload = "skill_seekers.cli.upload_skill:main"
|
|||||||
skill-seekers-estimate = "skill_seekers.cli.estimate_pages:main"
|
skill-seekers-estimate = "skill_seekers.cli.estimate_pages:main"
|
||||||
skill-seekers-install = "skill_seekers.cli.install_skill:main"
|
skill-seekers-install = "skill_seekers.cli.install_skill:main"
|
||||||
skill-seekers-install-agent = "skill_seekers.cli.install_agent:main"
|
skill-seekers-install-agent = "skill_seekers.cli.install_agent:main"
|
||||||
|
skill-seekers-codebase = "skill_seekers.cli.codebase_scraper:main"
|
||||||
|
|
||||||
[tool.setuptools]
|
[tool.setuptools]
|
||||||
package-dir = {"" = "src"}
|
package-dir = {"" = "src"}
|
||||||
|
|||||||
420
src/skill_seekers/cli/codebase_scraper.py
Normal file
420
src/skill_seekers/cli/codebase_scraper.py
Normal file
@@ -0,0 +1,420 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Codebase Scraper CLI Tool
|
||||||
|
|
||||||
|
Standalone tool for analyzing local codebases without GitHub API.
|
||||||
|
Extracts code signatures, comments, and optionally generates API documentation.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
codebase-scraper --directory /path/to/repo --output output/codebase/
|
||||||
|
codebase-scraper --directory . --depth deep --languages Python,JavaScript
|
||||||
|
codebase-scraper --directory /path/to/repo --build-api-reference
|
||||||
|
|
||||||
|
Features:
|
||||||
|
- File tree walking with .gitignore support
|
||||||
|
- Multi-language code analysis (Python, JavaScript, C++)
|
||||||
|
- API reference generation
|
||||||
|
- Comment extraction
|
||||||
|
- Configurable depth levels
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
import fnmatch
import json
import logging
import os
import sys
from pathlib import Path
from typing import Any, Dict, List, Optional
|
||||||
|
|
||||||
|
# Add parent directory to path for imports
|
||||||
|
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||||
|
|
||||||
|
from skill_seekers.cli.code_analyzer import CodeAnalyzer
|
||||||
|
from skill_seekers.cli.api_reference_builder import APIReferenceBuilder
|
||||||
|
|
||||||
|
# Try to import pathspec for .gitignore support
|
||||||
|
try:
|
||||||
|
import pathspec
|
||||||
|
PATHSPEC_AVAILABLE = True
|
||||||
|
except ImportError:
|
||||||
|
PATHSPEC_AVAILABLE = False
|
||||||
|
|
||||||
|
# Configure logging: timestamped, level-tagged messages for all helpers below.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
# Module-level logger shared by every function in this file.
logger = logging.getLogger(__name__)


# Maps lowercase file extensions to the language name reported in results.
# Extensions not listed here are treated as 'Unknown' and skipped.
LANGUAGE_EXTENSIONS = {
    '.py': 'Python',
    '.js': 'JavaScript',
    '.jsx': 'JavaScript',
    '.ts': 'TypeScript',
    '.tsx': 'TypeScript',
    '.cpp': 'C++',
    '.cc': 'C++',
    '.cxx': 'C++',
    '.h': 'C++',  # NOTE(review): '.h' is mapped to C++; plain C headers also match — confirm intent
    '.hpp': 'C++',
    '.hxx': 'C++',
}

# Directory names skipped during traversal: dependency trees, VCS metadata,
# build output, and tooling caches.
# NOTE: '*.egg-info' is a glob-style entry; it only takes effect if the
# exclusion check performs pattern matching rather than an exact name lookup.
DEFAULT_EXCLUDED_DIRS = {
    'node_modules', 'venv', '__pycache__', '.git', '.svn', '.hg',
    'build', 'dist', 'target', '.pytest_cache', '.tox', '.mypy_cache',
    'htmlcov', 'coverage', '.coverage', '.eggs', '*.egg-info',
    '.idea', '.vscode', '.vs', '__pypackages__'
}
|
||||||
|
|
||||||
|
|
||||||
|
def detect_language(file_path: Path) -> str:
    """Map a source file's extension to its programming language.

    Args:
        file_path: Path to the source file (need not exist on disk).

    Returns:
        The language name (e.g. 'Python'), or 'Unknown' when the
        extension is not in LANGUAGE_EXTENSIONS.
    """
    # Lowercase the suffix so '.PY' and '.py' resolve identically.
    return LANGUAGE_EXTENSIONS.get(file_path.suffix.lower(), 'Unknown')
|
||||||
|
|
||||||
|
|
||||||
|
def load_gitignore(directory: Path) -> Optional["pathspec.PathSpec"]:
    """Load a .gitignore file and build a pathspec matcher from it.

    The return annotation is quoted on purpose: ``pathspec`` is undefined
    when the optional dependency failed to import, and an unquoted
    annotation is evaluated at definition time, so it would raise
    NameError during module import and defeat the graceful-degradation
    handling below.

    Args:
        directory: Root directory in which to look for a ``.gitignore``.

    Returns:
        A ``pathspec.PathSpec`` compiled with 'gitwildmatch' semantics,
        or None when pathspec is unavailable, no ``.gitignore`` exists,
        or the file cannot be read/parsed.
    """
    if not PATHSPEC_AVAILABLE:
        logger.warning("pathspec not installed - .gitignore support disabled")
        logger.warning("Install with: pip install pathspec")
        return None

    gitignore_path = directory / '.gitignore'
    if not gitignore_path.exists():
        logger.debug(f"No .gitignore found in {directory}")
        return None

    try:
        with open(gitignore_path, 'r', encoding='utf-8') as f:
            spec = pathspec.PathSpec.from_lines('gitwildmatch', f)
        logger.info(f"Loaded .gitignore from {gitignore_path}")
        return spec
    except Exception as e:
        # Best-effort: a malformed .gitignore degrades to "no filtering"
        # rather than aborting the whole analysis.
        logger.warning(f"Failed to load .gitignore: {e}")
        return None
|
||||||
|
|
||||||
|
|
||||||
|
def should_exclude_dir(dir_name: str, excluded_dirs: set) -> bool:
    """Decide whether a directory should be skipped during traversal.

    Exact names match directly. Entries containing glob wildcards are
    matched with fnmatch — previously such entries (e.g. '*.egg-info'
    in DEFAULT_EXCLUDED_DIRS) could never match because only an exact
    membership test was performed.

    Args:
        dir_name: Bare directory name (not a full path).
        excluded_dirs: Set of literal names and/or glob patterns.

    Returns:
        True if the directory should be excluded.
    """
    if dir_name in excluded_dirs:
        return True
    # Fall back to glob matching for wildcard entries only, so plain
    # names keep the fast exact-lookup path above.
    return any(
        fnmatch.fnmatch(dir_name, pattern)
        for pattern in excluded_dirs
        if any(ch in pattern for ch in '*?[')
    )
|
||||||
|
|
||||||
|
|
||||||
|
def walk_directory(
    root: Path,
    patterns: Optional[List[str]] = None,
    gitignore_spec: Optional["pathspec.PathSpec"] = None,
    excluded_dirs: Optional[set] = None
) -> List[Path]:
    """Walk a directory tree and collect analyzable source files.

    The ``gitignore_spec`` annotation is quoted on purpose: ``pathspec``
    is undefined when the optional dependency is missing, and an unquoted
    annotation is evaluated at definition time, so it would raise
    NameError while this module is being imported.

    Args:
        root: Root directory to walk.
        patterns: Optional glob patterns a file must match
            (e.g. ['*.py', 'src/**/*.js']).
        gitignore_spec: Optional PathSpec holding .gitignore rules.
        excluded_dirs: Directory names to skip; defaults to
            DEFAULT_EXCLUDED_DIRS.

    Returns:
        Sorted list of absolute paths whose extension is in
        LANGUAGE_EXTENSIONS and which pass the gitignore/pattern checks.
    """
    if excluded_dirs is None:
        excluded_dirs = DEFAULT_EXCLUDED_DIRS

    files = []
    root = Path(root).resolve()

    for dirpath, dirnames, filenames in os.walk(root):
        current_dir = Path(dirpath)

        # Prune excluded directories in place so os.walk never descends
        # into them (slice assignment mutates the list os.walk iterates).
        dirnames[:] = [d for d in dirnames if not should_exclude_dir(d, excluded_dirs)]

        for filename in filenames:
            file_path = current_dir / filename

            # Honor .gitignore rules, matched against the path relative
            # to the walk root (gitignore patterns are root-relative).
            if gitignore_spec:
                try:
                    rel_path = file_path.relative_to(root)
                    if gitignore_spec.match_file(str(rel_path)):
                        logger.debug(f"Skipping (gitignore): {rel_path}")
                        continue
                except ValueError:
                    # Path is not under root (e.g. odd symlink); skip it.
                    continue

            # Only keep extensions we know how to analyze.
            if file_path.suffix.lower() not in LANGUAGE_EXTENSIONS:
                continue

            # Apply user-supplied include patterns, if any.
            if patterns:
                if not any(file_path.match(pattern) for pattern in patterns):
                    continue

            files.append(file_path)

    # Sort for deterministic output across runs and platforms.
    return sorted(files)
|
||||||
|
|
||||||
|
|
||||||
|
def analyze_codebase(
    directory: Path,
    output_dir: Path,
    depth: str = 'deep',
    languages: Optional[List[str]] = None,
    file_patterns: Optional[List[str]] = None,
    build_api_reference: bool = False,
    extract_comments: bool = True
) -> Dict[str, Any]:
    """
    Analyze a local codebase and extract code knowledge.

    Walks ``directory`` (honoring .gitignore when pathspec is available),
    runs CodeAnalyzer on each recognized source file, writes the combined
    results to ``<output_dir>/code_analysis.json``, and optionally renders
    markdown API docs into ``<output_dir>/api_reference/``.

    Args:
        directory: Directory to analyze (used as the base for the
            relative 'file' keys in the results).
        output_dir: Output directory for results; created if missing.
        depth: Analysis depth passed to CodeAnalyzer (surface, deep, full).
        languages: Optional list of language names to keep (as produced
            by detect_language, e.g. 'Python').
        file_patterns: Optional glob patterns files must match.
        build_api_reference: Generate API reference markdown.
        extract_comments: Extract inline comments.
            NOTE(review): this flag is accepted but never referenced in
            this function body — presumably comment extraction happens
            inside CodeAnalyzer; confirm and wire it through or drop it.

    Returns:
        Analysis results dictionary of the form
        {'files': [{'file': ..., 'language': ..., **analysis}, ...]}.
    """
    logger.info(f"Analyzing codebase: {directory}")
    logger.info(f"Depth: {depth}")

    # Create output directory up front so the final json.dump cannot fail
    # on a missing parent.
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Load .gitignore (None when absent or pathspec is not installed).
    gitignore_spec = load_gitignore(directory)

    # Walk directory tree collecting candidate source files.
    logger.info("Scanning directory tree...")
    files = walk_directory(
        directory,
        patterns=file_patterns,
        gitignore_spec=gitignore_spec
    )

    logger.info(f"Found {len(files)} source files")

    # Filter by language if specified (post-walk, since the walk only
    # filters on extension).
    if languages:
        language_set = set(languages)
        files = [f for f in files if detect_language(f) in language_set]
        logger.info(f"Filtered to {len(files)} files for languages: {', '.join(languages)}")

    # Initialize code analyzer (project class; API not visible here).
    analyzer = CodeAnalyzer(depth=depth)

    # Analyze each file, tolerating per-file failures.
    results = {'files': []}
    analyzed_count = 0

    for file_path in files:
        try:
            # errors='ignore' so undecodable bytes never abort a file.
            content = file_path.read_text(encoding='utf-8', errors='ignore')
            language = detect_language(file_path)

            if language == 'Unknown':
                continue

            # Analyze file via the project CodeAnalyzer.
            analysis = analyzer.analyze_file(str(file_path), content, language)

            # Only include files with actual analysis results (at least
            # one class or function found).
            if analysis and (analysis.get('classes') or analysis.get('functions')):
                results['files'].append({
                    'file': str(file_path.relative_to(directory)),
                    'language': language,
                    **analysis
                })
                analyzed_count += 1

                # Progress heartbeat every 10 included files.
                if analyzed_count % 10 == 0:
                    logger.info(f"Analyzed {analyzed_count}/{len(files)} files...")

        except Exception as e:
            # Best-effort: log and move on; one bad file must not kill
            # the whole run.
            logger.warning(f"Error analyzing {file_path}: {e}")
            continue

    logger.info(f"✅ Successfully analyzed {analyzed_count} files")

    # Save results as pretty-printed JSON.
    output_json = output_dir / 'code_analysis.json'
    with open(output_json, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=2)

    logger.info(f"📁 Saved analysis to: {output_json}")

    # Build API reference if requested and there is anything to document.
    if build_api_reference and results['files']:
        logger.info("Building API reference documentation...")
        builder = APIReferenceBuilder(results)
        api_output_dir = output_dir / 'api_reference'
        generated_files = builder.build_reference(api_output_dir)
        logger.info(f"✅ Generated {len(generated_files)} API reference files")
        logger.info(f"📁 API reference: {api_output_dir}")

    return results
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Command-line entry point for codebase analysis.

    Parses arguments, validates the target directory, and delegates to
    analyze_codebase.

    Returns:
        Process exit code: 0 on success, 1 on error, 130 on Ctrl-C.
    """
    parser = argparse.ArgumentParser(
        description='Analyze local codebases and extract code knowledge',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Analyze current directory
  codebase-scraper --directory . --output output/codebase/

  # Deep analysis with API reference
  codebase-scraper --directory /path/to/repo --depth deep --build-api-reference

  # Analyze only Python and JavaScript
  codebase-scraper --directory . --languages Python,JavaScript

  # Use file patterns
  codebase-scraper --directory . --file-patterns "*.py,src/**/*.js"

  # Surface analysis (fast, no details)
  codebase-scraper --directory . --depth surface
"""
    )

    parser.add_argument(
        '--directory',
        required=True,
        help='Directory to analyze'
    )
    parser.add_argument(
        '--output',
        default='output/codebase/',
        help='Output directory (default: output/codebase/)'
    )
    parser.add_argument(
        '--depth',
        choices=['surface', 'deep', 'full'],
        default='deep',
        help='Analysis depth (default: deep)'
    )
    parser.add_argument(
        '--languages',
        help='Comma-separated languages to analyze (e.g., Python,JavaScript,C++)'
    )
    parser.add_argument(
        '--file-patterns',
        help='Comma-separated file patterns (e.g., *.py,src/**/*.js)'
    )
    parser.add_argument(
        '--build-api-reference',
        action='store_true',
        help='Generate API reference markdown documentation'
    )
    parser.add_argument(
        '--no-comments',
        action='store_true',
        help='Skip comment extraction'
    )
    parser.add_argument(
        '--verbose',
        action='store_true',
        help='Enable verbose logging'
    )

    args = parser.parse_args()

    # Set logging level (root logger so helper modules get DEBUG too).
    if args.verbose:
        logging.getLogger().setLevel(logging.DEBUG)

    # Validate directory before doing any work.
    directory = Path(args.directory)
    if not directory.exists():
        logger.error(f"Directory not found: {directory}")
        return 1

    if not directory.is_dir():
        logger.error(f"Not a directory: {directory}")
        return 1

    # Parse comma-separated lists, discarding empty entries so inputs
    # like "Python," do not produce a spurious '' item.
    languages = None
    if args.languages:
        languages = [lang.strip() for lang in args.languages.split(',') if lang.strip()]

    file_patterns = None
    if args.file_patterns:
        file_patterns = [p.strip() for p in args.file_patterns.split(',') if p.strip()]

    # Analyze codebase; treat any unexpected failure as exit code 1.
    try:
        results = analyze_codebase(
            directory=directory,
            output_dir=Path(args.output),
            depth=args.depth,
            languages=languages,
            file_patterns=file_patterns,
            build_api_reference=args.build_api_reference,
            extract_comments=not args.no_comments
        )

        # Print summary banner for interactive use.
        print(f"\n{'=' * 60}")
        print("CODEBASE ANALYSIS COMPLETE")
        print(f"{'=' * 60}")
        print(f"Files analyzed: {len(results['files'])}")
        print(f"Output directory: {args.output}")
        if args.build_api_reference:
            print(f"API reference: {Path(args.output) / 'api_reference'}")
        print(f"{'=' * 60}\n")

        return 0

    except KeyboardInterrupt:
        # 130 = conventional exit code for SIGINT (128 + 2).
        logger.error("\nAnalysis interrupted by user")
        return 130
    except Exception as e:
        logger.error(f"Analysis failed: {e}")
        import traceback
        traceback.print_exc()
        return 1
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Propagate main()'s return value as the process exit code so shells
    # and CI can detect failure.
    sys.exit(main())
|
||||||
182
tests/test_codebase_scraper.py
Normal file
182
tests/test_codebase_scraper.py
Normal file
@@ -0,0 +1,182 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Tests for codebase_scraper.py - Standalone codebase analysis CLI.
|
||||||
|
|
||||||
|
Test Coverage:
|
||||||
|
- Language detection
|
||||||
|
- Directory exclusion
|
||||||
|
- File walking
|
||||||
|
- .gitignore loading
|
||||||
|
"""
|
||||||
|
|
||||||
|
import unittest
|
||||||
|
import tempfile
|
||||||
|
import shutil
|
||||||
|
from pathlib import Path
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
|
||||||
|
# Add src to path
|
||||||
|
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'src'))
|
||||||
|
|
||||||
|
from skill_seekers.cli.codebase_scraper import (
|
||||||
|
detect_language,
|
||||||
|
should_exclude_dir,
|
||||||
|
walk_directory,
|
||||||
|
load_gitignore,
|
||||||
|
DEFAULT_EXCLUDED_DIRS
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class TestLanguageDetection(unittest.TestCase):
    """Language detection from file extensions."""

    def test_python_detection(self):
        """'.py' is recognized as Python."""
        self.assertEqual(detect_language(Path('test.py')), 'Python')

    def test_javascript_detection(self):
        """Plain and JSX JavaScript files are recognized."""
        for name in ('test.js', 'test.jsx'):
            self.assertEqual(detect_language(Path(name)), 'JavaScript')

    def test_typescript_detection(self):
        """TS and TSX files are recognized."""
        for name in ('test.ts', 'test.tsx'):
            self.assertEqual(detect_language(Path(name)), 'TypeScript')

    def test_cpp_detection(self):
        """C++ sources and headers are recognized."""
        for name in ('test.cpp', 'test.h', 'test.hpp'):
            self.assertEqual(detect_language(Path(name)), 'C++')

    def test_unknown_language(self):
        """Unmapped extensions fall back to 'Unknown'."""
        for name in ('test.go', 'test.txt'):
            self.assertEqual(detect_language(Path(name)), 'Unknown')
|
||||||
|
|
||||||
|
|
||||||
|
class TestDirectoryExclusion(unittest.TestCase):
    """Directory exclusion logic against the default exclusion set."""

    def test_node_modules_excluded(self):
        """Dependency directories such as node_modules are skipped."""
        result = should_exclude_dir('node_modules', DEFAULT_EXCLUDED_DIRS)
        self.assertTrue(result)

    def test_venv_excluded(self):
        """Virtualenv directories are skipped."""
        result = should_exclude_dir('venv', DEFAULT_EXCLUDED_DIRS)
        self.assertTrue(result)

    def test_git_excluded(self):
        """VCS metadata directories are skipped."""
        result = should_exclude_dir('.git', DEFAULT_EXCLUDED_DIRS)
        self.assertTrue(result)

    def test_normal_dir_not_excluded(self):
        """Ordinary project directories pass through."""
        for name in ('src', 'tests'):
            self.assertFalse(should_exclude_dir(name, DEFAULT_EXCLUDED_DIRS))
|
||||||
|
|
||||||
|
|
||||||
|
class TestDirectoryWalking(unittest.TestCase):
    """Directory walking functionality over a scratch tree."""

    def setUp(self):
        """Create a fresh scratch directory for each test."""
        self.temp_dir = tempfile.mkdtemp()
        self.root = Path(self.temp_dir)

    def tearDown(self):
        """Remove the scratch directory."""
        shutil.rmtree(self.temp_dir, ignore_errors=True)

    def _touch(self, relative, text='test'):
        """Create a file (and any parent directories) under the root."""
        path = self.root / relative
        path.parent.mkdir(parents=True, exist_ok=True)
        path.write_text(text)
        return path

    def test_walk_empty_directory(self):
        """An empty tree yields no files."""
        self.assertEqual(walk_directory(self.root), [])

    def test_walk_with_python_files(self):
        """Only recognized source files are collected."""
        self._touch('test1.py', 'print("test")')
        self._touch('test2.py', 'print("test2")')
        self._touch('readme.txt', 'readme')

        found = walk_directory(self.root)

        self.assertEqual(len(found), 2)
        self.assertTrue(all(f.suffix == '.py' for f in found))

    def test_walk_excludes_node_modules(self):
        """Files under node_modules are never returned."""
        self._touch('test.py')
        self._touch('node_modules/package.js')

        found = walk_directory(self.root)

        self.assertEqual([f.name for f in found], ['test.py'])

    def test_walk_with_subdirectories(self):
        """Nested directories are traversed."""
        self._touch('src/module.py')
        self._touch('tests/test_module.py')

        names = sorted(f.name for f in walk_directory(self.root))

        self.assertEqual(names, ['module.py', 'test_module.py'])
|
||||||
|
|
||||||
|
|
||||||
|
class TestGitignoreLoading(unittest.TestCase):
    """Tests for .gitignore loading."""

    def setUp(self):
        """Create a fresh scratch directory for each test."""
        self.temp_dir = tempfile.mkdtemp()
        self.root = Path(self.temp_dir)

    def tearDown(self):
        """Remove the scratch directory."""
        shutil.rmtree(self.temp_dir, ignore_errors=True)

    def test_no_gitignore(self):
        """load_gitignore returns None when no .gitignore exists."""
        self.assertIsNone(load_gitignore(self.root))

    def test_load_gitignore(self):
        """A valid .gitignore yields a spec that enforces its rules.

        The previous version only ran ``assertIsNotNone(spec)`` inside an
        ``if spec is not None:`` guard — a tautology that could never
        fail. This now exercises the loaded rules directly and skips
        (rather than silently passing) when pathspec is unavailable.
        """
        (self.root / '.gitignore').write_text('*.log\ntemp/\n')

        spec = load_gitignore(self.root)
        if spec is None:
            self.skipTest('pathspec not installed - .gitignore support disabled')

        # Rules from the file must actually match / not match.
        self.assertTrue(spec.match_file('debug.log'))
        self.assertTrue(spec.match_file('temp/scratch.txt'))
        self.assertFalse(spec.match_file('main.py'))
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Run the test suite with verbose output when executed directly.
    unittest.main(verbosity=2)
|
||||||
Reference in New Issue
Block a user