fix(engineering): improve codebase-onboarding - add scripts + extract references
This commit is contained in:
205
engineering/codebase-onboarding/scripts/codebase_analyzer.py
Executable file
205
engineering/codebase-onboarding/scripts/codebase_analyzer.py
Executable file
@@ -0,0 +1,205 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Generate a compact onboarding summary for a codebase (stdlib only)."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
from collections import Counter
|
||||
from pathlib import Path
|
||||
from typing import Dict, Iterable, List
|
||||
|
||||
# Directory names the filesystem walkers in this module skip: a directory
# whose name appears here is never descended into or listed.
IGNORED_DIRS = {
    ".git", "node_modules",
    ".next", "dist", "build", "coverage",
    "venv", ".venv", "__pycache__",
}
|
||||
|
||||
# Maps a lower-cased file suffix (including the leading dot) to the language
# name reported in the summary. Suffixes not listed here are not counted as
# a language.
EXT_TO_LANG = {
    ".py": "Python",
    ".ts": "TypeScript", ".tsx": "TypeScript",
    ".js": "JavaScript", ".jsx": "JavaScript",
    ".go": "Go",
    ".rs": "Rust",
    ".java": "Java",
    ".kt": "Kotlin",
    ".rb": "Ruby",
    ".php": "PHP",
    ".cs": "C#",
    ".c": "C", ".cpp": "C++", ".h": "C/C++",
    ".swift": "Swift",
    ".sql": "SQL",
    ".sh": "Shell",
}
|
||||
|
||||
# Paths, relative to the scanned root, whose existence is reported under
# "key config files". Order is preserved in the report. Entries may name a
# directory (e.g. ".github/workflows"); only existence is checked.
KEY_CONFIG_FILES = [
    "package.json", "pnpm-workspace.yaml", "turbo.json", "nx.json", "lerna.json",
    "tsconfig.json", "next.config.js", "next.config.mjs",
    "pyproject.toml", "requirements.txt",
    "go.mod",
    "Cargo.toml",
    "docker-compose.yml", "Dockerfile",
    ".github/workflows",
]
|
||||
|
||||
|
||||
def iter_files(root: Path) -> Iterable[Path]:
    """Yield every regular file under *root*, skipping ignored directories.

    Directories whose name is in ``IGNORED_DIRS`` are pruned in place from the
    ``os.walk`` directory list, so nothing beneath them is visited. Entries
    for which ``Path.is_file()`` is false are not yielded.
    """
    for current, subdirs, names in os.walk(root):
        # Slice assignment mutates the list os.walk holds, which is what
        # stops it from descending into the removed directories.
        subdirs[:] = [sub for sub in subdirs if sub not in IGNORED_DIRS]
        base = Path(current)
        candidates = (base / name for name in names)
        yield from (candidate for candidate in candidates if candidate.is_file())
|
||||
|
||||
|
||||
def detect_languages(paths: Iterable[Path]) -> Dict[str, int]:
    """Count files per language, based on each path's lower-cased suffix.

    Suffixes missing from ``EXT_TO_LANG`` are ignored. The returned dict is
    ordered by descending count, with ties broken by language name.
    """
    tally: Counter[str] = Counter(
        language
        for language in (EXT_TO_LANG.get(p.suffix.lower()) for p in paths)
        if language
    )
    ranked = sorted(tally.items(), key=lambda pair: (-pair[1], pair[0]))
    return dict(ranked)
|
||||
|
||||
|
||||
def find_key_configs(root: Path) -> List[str]:
    """Return the entries of ``KEY_CONFIG_FILES`` that exist under *root*.

    Entries are returned as the relative strings from the checklist, in
    checklist order. Existence is tested with ``Path.exists()``, so an entry
    may be a file or a directory.
    """
    return [
        candidate
        for candidate in KEY_CONFIG_FILES
        if (root / candidate).exists()
    ]
|
||||
|
||||
|
||||
def top_level_structure(root: Path, max_depth: int) -> List[str]:
    """Render an indented outline of *root*, at most *max_depth* levels deep.

    The root itself is depth 0 and gets no heading line; every directory
    below it contributes a ``name/`` line followed by up to ten of its
    non-hidden files (names not starting with "."), sorted by name.
    Directories named in ``IGNORED_DIRS`` are neither listed nor entered.

    Args:
        root: Directory to outline.
        max_depth: Deepest directory level to list. A negative value yields
            an empty outline.

    Returns:
        The outline as a list of lines, one space of indent per depth level.
    """
    lines: List[str] = []
    for dirpath, dirnames, filenames in os.walk(root):
        rel = Path(dirpath).relative_to(root)
        # relative_to() yields Path(".") for the root, whose parts are empty.
        depth = len(rel.parts)

        if depth > max_depth:
            # Only the root can get here (when max_depth is negative): deeper
            # levels are pruned below before os.walk ever scans them.
            dirnames[:] = []
            continue

        indent = " " * depth
        if depth:
            lines.append(f"{indent}{rel.name}/")

        visible_files = sorted(f for f in filenames if not f.startswith("."))
        lines.extend(f"{indent} {filename}" for filename in visible_files[:10])

        if depth >= max_depth:
            # Children would exceed max_depth and never be listed, so stop
            # os.walk from scanning them at all.
            dirnames[:] = []
        else:
            # In-place pruning keeps ignored directories out of the walk;
            # sorting makes the traversal (and output) order deterministic.
            dirnames[:] = sorted(d for d in dirnames if d not in IGNORED_DIRS)
    return lines
|
||||
|
||||
|
||||
def build_report(root: Path, max_depth: int) -> Dict[str, object]:
    """Collect the onboarding facts for *root* into a plain dict.

    The report holds the root path, total file count, per-language counts,
    the key config files found, the 12 most common extensions, the 20 largest
    files as ``(relative_path, size_in_bytes)`` pairs, and the directory
    outline limited to *max_depth*.
    """
    files = list(iter_files(root))
    languages = detect_languages(files)

    ext_counts: Counter[str] = Counter()
    sized = []
    for path in files:
        # Files without a suffix are grouped under a placeholder key.
        ext_counts[path.suffix.lower() or "<no-ext>"] += 1
        sized.append((str(path.relative_to(root)), path.stat().st_size))
    # Stable sort: files of equal size keep their discovery order.
    sized.sort(key=lambda pair: pair[1], reverse=True)

    key_configs = find_key_configs(root)
    top_extensions = dict(ext_counts.most_common(12))
    structure = top_level_structure(root, max_depth)

    return {
        "root": str(root),
        "file_count": len(files),
        "languages": languages,
        "key_config_files": key_configs,
        "top_extensions": top_extensions,
        "largest_files": sized[:20],
        "directory_structure": structure,
    }
|
||||
|
||||
|
||||
def format_size(num_bytes: int) -> str:
    """Format a byte count with one decimal and a B/KB/MB/GB suffix.

    The value is divided by 1024 until it drops below 1024 or the largest
    unit (GB) is reached, so very large sizes are still expressed in GB.
    """
    suffixes = ("B", "KB", "MB", "GB")
    amount = float(num_bytes)
    index = 0
    while amount >= 1024 and index < len(suffixes) - 1:
        amount /= 1024
        index += 1
    return f"{amount:.1f}{suffixes[index]}"
|
||||
|
||||
|
||||
def print_text(report: Dict[str, object]) -> None:
    """Print *report* to stdout as a plain-text summary.

    Sections are printed in order: header, languages, key config files,
    largest files (first 10 entries) and directory structure (first 200
    lines). Empty language/config sections print a placeholder line.
    """
    print("Codebase Onboarding Summary")
    print(f"Root: {report['root']}")
    print(f"Total files: {report['file_count']}")
    print()

    print("Languages detected")
    languages = report["languages"]
    if not languages:
        print("- No recognized source file extensions")
    else:
        for name, total in languages.items():
            print(f"- {name}: {total}")
    print()

    print("Key config files")
    configs = report["key_config_files"]
    if not configs:
        print("- None found from default checklist")
    else:
        for config_name in configs:
            print(f"- {config_name}")
    print()

    print("Largest files")
    for entry in report["largest_files"][:10]:
        rel_path, byte_count = entry
        print(f"- {rel_path}: {format_size(byte_count)}")
    print()

    print("Directory structure")
    for outline_line in report["directory_structure"][:200]:
        print(outline_line)
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
    """Parse the command line (``sys.argv``) for the analyzer.

    Accepts one positional ``path`` plus the optional ``--max-depth`` integer
    (default 2) and the ``--json`` flag.
    """
    cli = argparse.ArgumentParser(
        description="Scan a repository and generate onboarding summary facts.",
    )
    cli.add_argument("path", help="Path to project directory")
    cli.add_argument(
        "--max-depth",
        type=int,
        default=2,
        help="Max depth for structure output (default: 2)",
    )
    cli.add_argument("--json", action="store_true", help="Print JSON output")
    return cli.parse_args()
|
||||
|
||||
|
||||
def main() -> int:
    """Run the analyzer from the command line and return the exit status.

    Resolves the requested path, aborts via ``SystemExit`` with a message if
    it is not an existing directory, then prints the report as JSON or text.
    ``--max-depth`` values below 1 are raised to 1.
    """
    options = parse_args()
    root = Path(options.path).expanduser().resolve()
    if not (root.exists() and root.is_dir()):
        raise SystemExit(f"Path is not a directory: {root}")

    depth_limit = max(1, options.max_depth)
    report = build_report(root, max_depth=depth_limit)

    if not options.json:
        print_text(report)
        return 0
    print(json.dumps(report, indent=2))
    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    exit_status = main()
    raise SystemExit(exit_status)
|
||||
Reference in New Issue
Block a user