206 lines
5.4 KiB
Python
Executable File
206 lines
5.4 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""Generate a compact onboarding summary for a codebase (stdlib only)."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import json
|
|
import os
|
|
from collections import Counter
|
|
from pathlib import Path
|
|
from typing import Dict, Iterable, List
|
|
|
|
# Directory names that are never descended into while scanning: version
# control metadata, dependency trees, build/coverage output, virtualenvs
# and bytecode caches.
IGNORED_DIRS = {
    ".git", "node_modules", ".next",
    "dist", "build", "coverage",
    "venv", ".venv", "__pycache__",
}
|
|
|
|
# Maps a lower-cased file suffix (including the leading dot) to the language
# name reported in the summary. Several suffixes may share one language.
EXT_TO_LANG = {
    ".py": "Python",
    ".ts": "TypeScript", ".tsx": "TypeScript",
    ".js": "JavaScript", ".jsx": "JavaScript",
    ".go": "Go",
    ".rs": "Rust",
    ".java": "Java",
    ".kt": "Kotlin",
    ".rb": "Ruby",
    ".php": "PHP",
    ".cs": "C#",
    ".c": "C", ".cpp": "C++", ".h": "C/C++",
    ".swift": "Swift",
    ".sql": "SQL",
    ".sh": "Shell",
}
|
|
|
|
# Checklist of paths (relative to the project root) whose presence is
# reported under "Key config files". Order here is the order of the report.
KEY_CONFIG_FILES = [
    # JavaScript / TypeScript tooling
    "package.json", "pnpm-workspace.yaml", "turbo.json", "nx.json",
    "lerna.json", "tsconfig.json", "next.config.js", "next.config.mjs",
    # Python
    "pyproject.toml", "requirements.txt",
    # Go / Rust
    "go.mod", "Cargo.toml",
    # Containers and CI
    "docker-compose.yml", "Dockerfile", ".github/workflows",
]
|
|
|
|
|
|
def iter_files(root: Path) -> Iterable[Path]:
    """Lazily yield each file path found beneath *root*.

    Sub-directories whose name appears in ``IGNORED_DIRS`` are dropped from
    the walk, so nothing inside them is visited. Only entries for which
    ``Path.is_file()`` is true are yielded.
    """
    for current_dir, subdirs, file_names in os.walk(root):
        # Slice assignment mutates the list os.walk itself iterates, which
        # is what prevents descent into the ignored directories.
        subdirs[:] = [entry for entry in subdirs if entry not in IGNORED_DIRS]
        base = Path(current_dir)
        candidates = (base / file_name for file_name in file_names)
        yield from (candidate for candidate in candidates if candidate.is_file())
|
|
|
|
|
|
def detect_languages(paths: Iterable[Path]) -> Dict[str, int]:
    """Count how many of *paths* belong to each known language.

    The language is looked up in ``EXT_TO_LANG`` by the path's lower-cased
    suffix; paths with an unmapped suffix are not counted. The returned dict
    is ordered by descending count, with ties broken by language name.
    """
    looked_up = (EXT_TO_LANG.get(entry.suffix.lower()) for entry in paths)
    tally = Counter(language for language in looked_up if language)
    ranked = sorted(tally.items(), key=lambda pair: (-pair[1], pair[0]))
    return dict(ranked)
|
|
|
|
|
|
def find_key_configs(root: Path) -> List[str]:
    """Return the ``KEY_CONFIG_FILES`` entries that exist under *root*.

    Entries are returned as written in the checklist (relative paths) and in
    checklist order.
    """
    return [
        candidate
        for candidate in KEY_CONFIG_FILES
        if (root / candidate).exists()
    ]
|
|
|
|
|
|
def top_level_structure(root: Path, max_depth: int) -> List[str]:
    """Build an indented outline of *root* down to *max_depth* levels.

    Each directory below the root contributes a ``name/`` line, followed by
    up to ten of its non-hidden files (names not starting with ``.``) in
    sorted order. Directories named in ``IGNORED_DIRS`` are skipped, and the
    remaining sub-directories are visited in sorted order.

    Args:
        root: Directory to outline; it is depth 0 and gets no ``name/`` line.
        max_depth: Deepest directory level (relative to *root*) to include.

    Returns:
        The outline as a list of lines, one entry per directory or file.
    """
    lines: List[str] = []
    for dirpath, dirnames, filenames in os.walk(root):
        rel = Path(dirpath).relative_to(root)
        # The root's relative path is "." which has no parts, i.e. depth 0.
        depth = len(rel.parts)

        if depth > max_depth:
            # Only the root can get here (negative max_depth): deeper levels
            # are pruned at the bottom of the loop before os.walk descends.
            dirnames[:] = []
            continue

        indent = " " * depth
        if depth:
            lines.append(f"{indent}{rel.name}/")

        visible_files = sorted(f for f in filenames if not f.startswith("."))
        lines.extend(f"{indent} {filename}" for filename in visible_files[:10])

        if depth >= max_depth:
            # Children would exceed max_depth and never be printed, so stop
            # os.walk from scanning them at all.
            dirnames[:] = []
        else:
            # Ignored names are removed here, before descent, so they can
            # never show up in a later iteration's relative path.
            dirnames[:] = sorted(d for d in dirnames if d not in IGNORED_DIRS)
    return lines
|
|
|
|
|
|
def build_report(root: Path, max_depth: int) -> Dict[str, object]:
    """Collect all summary facts for the project at *root*.

    Args:
        root: Project directory to scan.
        max_depth: Depth limit passed to ``top_level_structure``.

    Returns:
        A dict with the keys ``root``, ``file_count``, ``languages``,
        ``key_config_files``, ``top_extensions`` (the 12 most common
        suffixes), ``largest_files`` (up to 20 ``(relative_path, size)``
        pairs, largest first) and ``directory_structure``.
    """
    files = list(iter_files(root))
    languages = detect_languages(files)
    total_files = len(files)
    file_count_by_ext: Counter[str] = Counter(
        p.suffix.lower() or "<no-ext>" for p in files
    )

    sized = []
    for path in files:
        try:
            size = path.stat().st_size
        except OSError:
            # The file disappeared or became unreadable after the walk.
            # Leave it out of the size ranking rather than abort the report.
            continue
        sized.append((str(path.relative_to(root)), size))
    # Stable sort: files of equal size keep their walk order.
    sized.sort(key=lambda item: item[1], reverse=True)
    largest = sized[:20]

    return {
        "root": str(root),
        "file_count": total_files,
        "languages": languages,
        "key_config_files": find_key_configs(root),
        "top_extensions": dict(file_count_by_ext.most_common(12)),
        "largest_files": largest,
        "directory_structure": top_level_structure(root, max_depth),
    }
|
|
|
|
|
|
def format_size(num_bytes: int) -> str:
    """Format a byte count as a short string with one decimal, e.g. ``1.5KB``.

    The value is divided by 1024 per unit step through B, KB, MB and GB.
    GB is the largest unit, so bigger values are still expressed in GB.

    Args:
        num_bytes: Size in bytes.

    Returns:
        The size rendered with one decimal place followed by its unit.
    """
    units = ("B", "KB", "MB", "GB")
    value = float(num_bytes)
    for unit in units[:-1]:
        # Test the value as it will be displayed: 1048575 bytes is 1023.999 KB,
        # which would otherwise print as "1024.0KB" instead of "1.0MB".
        if round(value, 1) < 1024:
            return f"{value:.1f}{unit}"
        value /= 1024
    return f"{value:.1f}{units[-1]}"
|
|
|
|
|
|
def print_text(report: Dict[str, object]) -> None:
    """Write *report* to stdout as a plain-text summary.

    Sections appear in a fixed order: header, languages, key config files,
    the first ten largest files, and the first 200 directory-structure
    lines. Sections are separated by a blank line.
    """
    emit = print

    emit("Codebase Onboarding Summary")
    emit(f"Root: {report['root']}")
    emit(f"Total files: {report['file_count']}")
    emit("")

    emit("Languages detected")
    language_counts = report["languages"]
    if not language_counts:
        emit("- No recognized source file extensions")
    else:
        for language, total in language_counts.items():
            emit(f"- {language}: {total}")
    emit("")

    emit("Key config files")
    present_configs = report["key_config_files"]
    if not present_configs:
        emit("- None found from default checklist")
    else:
        for config_name in present_configs:
            emit(f"- {config_name}")
    emit("")

    emit("Largest files")
    top_ten = report["largest_files"][:10]
    for rel_path, byte_count in top_ten:
        emit(f"- {rel_path}: {format_size(byte_count)}")
    emit("")

    emit("Directory structure")
    outline = report["directory_structure"][:200]
    for outline_line in outline:
        emit(outline_line)
|
|
|
|
|
|
def parse_args() -> argparse.Namespace:
    """Parse the command line.

    Accepts a positional project ``path``, an integer ``--max-depth``
    (default 2) and a ``--json`` flag.
    """
    cli = argparse.ArgumentParser(
        description="Scan a repository and generate onboarding summary facts.",
    )
    cli.add_argument("path", help="Path to project directory")
    cli.add_argument(
        "--max-depth",
        type=int,
        default=2,
        help="Max depth for structure output (default: 2)",
    )
    cli.add_argument("--json", action="store_true", help="Print JSON output")
    return cli.parse_args()
|
|
|
|
|
|
def main() -> int:
    """Run the CLI: scan the given directory and print the summary.

    Exits via ``SystemExit`` with a message when the path is not a
    directory. Depth values below 1 are raised to 1. Returns 0 on success.
    """
    options = parse_args()
    project_root = Path(options.path).expanduser().resolve()
    if not (project_root.exists() and project_root.is_dir()):
        raise SystemExit(f"Path is not a directory: {project_root}")

    depth_limit = max(1, options.max_depth)
    summary = build_report(project_root, max_depth=depth_limit)

    if not options.json:
        print_text(summary)
        return 0

    print(json.dumps(summary, indent=2))
    return 0
|
|
|
|
|
|
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    exit_status = main()
    raise SystemExit(exit_status)
|