# File: skill-seekers-reference/api/config_analyzer.py
# Last commit: 5ed767ff9a "run ruff" by Pablo Estevez, 2026-01-17 17:29:21 +00:00
# 346 lines, 11 KiB, Python
#!/usr/bin/env python3
"""
Config Analyzer - Extract metadata from Skill Seekers config files
"""
import json
import subprocess
from datetime import datetime
from pathlib import Path
from typing import Any
class ConfigAnalyzer:
"""Analyzes Skill Seekers config files and extracts metadata"""
# Category mapping based on config content
CATEGORY_MAPPING = {
"web-frameworks": ["react", "vue", "django", "fastapi", "laravel", "astro", "hono"],
"game-engines": ["godot", "unity", "unreal"],
"devops": ["kubernetes", "ansible", "docker", "terraform"],
"css-frameworks": ["tailwind", "bootstrap", "bulma"],
"development-tools": ["claude-code", "vscode", "git"],
"gaming": ["steam"],
"testing": ["pytest", "jest", "test"],
}
# Tag extraction keywords
TAG_KEYWORDS = {
"javascript": ["react", "vue", "astro", "hono", "javascript", "js", "node"],
"python": ["django", "fastapi", "ansible", "python", "flask"],
"php": ["laravel", "php"],
"frontend": ["react", "vue", "astro", "tailwind", "frontend", "ui"],
"backend": ["django", "fastapi", "laravel", "backend", "server", "api"],
"css": ["tailwind", "css", "styling"],
"game-development": ["godot", "unity", "unreal", "game"],
"devops": ["kubernetes", "ansible", "docker", "k8s", "devops"],
"documentation": ["docs", "documentation"],
"testing": ["test", "testing", "pytest", "jest"],
}
def __init__(self, config_dir: Path, base_url: str = "https://api.skillseekersweb.com"):
"""
Initialize config analyzer
Args:
config_dir: Path to configs directory
base_url: Base URL for download links
"""
self.config_dir = Path(config_dir)
self.base_url = base_url
if not self.config_dir.exists():
raise ValueError(f"Config directory not found: {self.config_dir}")
def analyze_all_configs(self) -> list[dict[str, Any]]:
"""
Analyze all config files and extract metadata
Returns:
List of config metadata dicts
"""
configs = []
# Find all JSON files recursively in configs directory and subdirectories
for config_file in sorted(self.config_dir.rglob("*.json")):
# Skip test/example configs in test-examples directory
if "test-examples" in config_file.parts:
continue
try:
metadata = self.analyze_config(config_file)
if metadata: # Skip invalid configs
configs.append(metadata)
except Exception as e:
print(f"Warning: Failed to analyze {config_file.name}: {e}")
continue
return configs
def analyze_config(self, config_path: Path) -> dict[str, Any] | None:
"""
Analyze a single config file and extract metadata
Args:
config_path: Path to config JSON file
Returns:
Config metadata dict or None if invalid
"""
try:
# Read config file
with open(config_path) as f:
config_data = json.load(f)
# Skip if no name field
if "name" not in config_data:
return None
name = config_data["name"]
description = config_data.get("description", "")
# Determine config type
config_type = self._determine_type(config_data)
# Get primary source (base_url or repo)
primary_source = self._get_primary_source(config_data, config_type)
# Auto-categorize
category = self._categorize_config(name, description, config_data)
# Extract tags
tags = self._extract_tags(name, description, config_data)
# Get file metadata
file_size = config_path.stat().st_size
last_updated = self._get_last_updated(config_path)
# Generate download URL
download_url = f"{self.base_url}/api/download/{config_path.name}"
# Get max_pages (for estimation)
max_pages = self._get_max_pages(config_data)
return {
"name": name,
"description": description,
"type": config_type,
"category": category,
"tags": tags,
"primary_source": primary_source,
"max_pages": max_pages,
"file_size": file_size,
"last_updated": last_updated,
"download_url": download_url,
"config_file": config_path.name,
}
except json.JSONDecodeError as e:
print(f"Invalid JSON in {config_path.name}: {e}")
return None
except Exception as e:
print(f"Error analyzing {config_path.name}: {e}")
return None
def get_config_by_name(self, name: str) -> dict[str, Any] | None:
"""
Get config metadata by name
Args:
name: Config name (e.g., "react", "django")
Returns:
Config metadata or None if not found
"""
configs = self.analyze_all_configs()
for config in configs:
if config["name"] == name:
return config
return None
def _determine_type(self, config_data: dict[str, Any]) -> str:
"""
Determine if config is single-source or unified
Args:
config_data: Config JSON data
Returns:
"single-source" or "unified"
"""
# Unified configs have "sources" array
if "sources" in config_data:
return "unified"
# Check for merge_mode (another indicator of unified configs)
if "merge_mode" in config_data:
return "unified"
return "single-source"
def _get_primary_source(self, config_data: dict[str, Any], config_type: str) -> str:
"""
Get primary source URL/repo
Args:
config_data: Config JSON data
config_type: "single-source" or "unified"
Returns:
Primary source URL or repo name
"""
if config_type == "unified":
# Get first source
sources = config_data.get("sources", [])
if sources:
first_source = sources[0]
if first_source.get("type") == "documentation":
return first_source.get("base_url", "")
elif first_source.get("type") == "github":
return f"github.com/{first_source.get('repo', '')}"
elif first_source.get("type") == "pdf":
return first_source.get("pdf_url", "PDF file")
return "Multiple sources"
# Single-source configs
if "base_url" in config_data:
return config_data["base_url"]
elif "repo" in config_data:
return f"github.com/{config_data['repo']}"
elif "pdf_url" in config_data or "pdf" in config_data:
return "PDF file"
return "Unknown"
def _categorize_config(self, name: str, description: str, config_data: dict[str, Any]) -> str:
"""
Auto-categorize config based on name and content
Args:
name: Config name
description: Config description
config_data: Full config data
Returns:
Category name
"""
name_lower = name.lower()
# Check against category mapping
for category, keywords in self.CATEGORY_MAPPING.items():
if any(keyword in name_lower for keyword in keywords):
return category
# Check description for hints
desc_lower = description.lower()
if "framework" in desc_lower or "library" in desc_lower:
if any(word in desc_lower for word in ["web", "frontend", "backend", "api"]):
return "web-frameworks"
if "game" in desc_lower or "engine" in desc_lower:
return "game-engines"
if "devops" in desc_lower or "deployment" in desc_lower or "infrastructure" in desc_lower:
return "devops"
# Default to uncategorized
return "uncategorized"
def _extract_tags(self, name: str, description: str, config_data: dict[str, Any]) -> list[str]:
"""
Extract relevant tags from config
Args:
name: Config name
description: Config description
config_data: Full config data
Returns:
List of tags
"""
tags = set()
name_lower = name.lower()
desc_lower = description.lower()
# Check against tag keywords
for tag, keywords in self.TAG_KEYWORDS.items():
if any(keyword in name_lower or keyword in desc_lower for keyword in keywords):
tags.add(tag)
# Add config type as tag
config_type = self._determine_type(config_data)
if config_type == "unified":
tags.add("multi-source")
# Add source type tags
if "base_url" in config_data or (
config_type == "unified" and any(s.get("type") == "documentation" for s in config_data.get("sources", []))
):
tags.add("documentation")
if "repo" in config_data or (
config_type == "unified" and any(s.get("type") == "github" for s in config_data.get("sources", []))
):
tags.add("github")
if (
"pdf" in config_data
or "pdf_url" in config_data
or (config_type == "unified" and any(s.get("type") == "pdf" for s in config_data.get("sources", [])))
):
tags.add("pdf")
return sorted(list(tags))
def _get_max_pages(self, config_data: dict[str, Any]) -> int | None:
"""
Get max_pages value from config
Args:
config_data: Config JSON data
Returns:
max_pages value or None
"""
# Single-source configs
if "max_pages" in config_data:
return config_data["max_pages"]
# Unified configs - get from first documentation source
if "sources" in config_data:
for source in config_data["sources"]:
if source.get("type") == "documentation" and "max_pages" in source:
return source["max_pages"]
return None
def _get_last_updated(self, config_path: Path) -> str:
"""
Get last updated date from git history
Args:
config_path: Path to config file
Returns:
ISO format date string
"""
try:
# Try to get last commit date for this file
result = subprocess.run(
["git", "log", "-1", "--format=%cI", str(config_path)],
cwd=config_path.parent.parent,
capture_output=True,
text=True,
timeout=5,
)
if result.returncode == 0 and result.stdout.strip():
return result.stdout.strip()
except Exception:
pass
# Fallback to file modification time
mtime = config_path.stat().st_mtime
return datetime.fromtimestamp(mtime).isoformat()