#!/usr/bin/env python3 """ Config Analyzer - Extract metadata from Skill Seekers config files """ import json import subprocess from datetime import datetime from pathlib import Path from typing import Any class ConfigAnalyzer: """Analyzes Skill Seekers config files and extracts metadata""" # Category mapping based on config content CATEGORY_MAPPING = { "web-frameworks": ["react", "vue", "django", "fastapi", "laravel", "astro", "hono"], "game-engines": ["godot", "unity", "unreal"], "devops": ["kubernetes", "ansible", "docker", "terraform"], "css-frameworks": ["tailwind", "bootstrap", "bulma"], "development-tools": ["claude-code", "vscode", "git"], "gaming": ["steam"], "testing": ["pytest", "jest", "test"], } # Tag extraction keywords TAG_KEYWORDS = { "javascript": ["react", "vue", "astro", "hono", "javascript", "js", "node"], "python": ["django", "fastapi", "ansible", "python", "flask"], "php": ["laravel", "php"], "frontend": ["react", "vue", "astro", "tailwind", "frontend", "ui"], "backend": ["django", "fastapi", "laravel", "backend", "server", "api"], "css": ["tailwind", "css", "styling"], "game-development": ["godot", "unity", "unreal", "game"], "devops": ["kubernetes", "ansible", "docker", "k8s", "devops"], "documentation": ["docs", "documentation"], "testing": ["test", "testing", "pytest", "jest"], } def __init__(self, config_dir: Path, base_url: str = "https://api.skillseekersweb.com"): """ Initialize config analyzer Args: config_dir: Path to configs directory base_url: Base URL for download links """ self.config_dir = Path(config_dir) self.base_url = base_url if not self.config_dir.exists(): raise ValueError(f"Config directory not found: {self.config_dir}") def analyze_all_configs(self) -> list[dict[str, Any]]: """ Analyze all config files and extract metadata Returns: List of config metadata dicts """ configs = [] # Find all JSON files recursively in configs directory and subdirectories for config_file in sorted(self.config_dir.rglob("*.json")): # Skip test/example configs in test-examples directory if "test-examples" in config_file.parts: continue try: metadata = self.analyze_config(config_file) if metadata: # Skip invalid configs configs.append(metadata) except Exception as e: print(f"Warning: Failed to analyze {config_file.name}: {e}") continue return configs def analyze_config(self, config_path: Path) -> dict[str, Any] | None: """ Analyze a single config file and extract metadata Args: config_path: Path to config JSON file Returns: Config metadata dict or None if invalid """ try: # Read config file with open(config_path) as f: config_data = json.load(f) # Skip if no name field if "name" not in config_data: return None name = config_data["name"] description = config_data.get("description", "") # Determine config type config_type = self._determine_type(config_data) # Get primary source (base_url or repo) primary_source = self._get_primary_source(config_data, config_type) # Use directory name as category (official/{category}/{name}.json) # Fall back to keyword-based categorization if not in a named subdirectory category = self._categorize_config(name, description, config_data, config_path) # Extract tags tags = self._extract_tags(name, description, config_data) # Get file metadata file_size = config_path.stat().st_size last_updated = self._get_last_updated(config_path) # Generate download URL download_url = f"{self.base_url}/api/download/{config_path.name}" # Get max_pages (for estimation) max_pages = self._get_max_pages(config_data) return { "name": name, "description": description, "type": config_type, "category": category, "tags": tags, "primary_source": primary_source, "max_pages": max_pages, "file_size": file_size, "last_updated": last_updated, "download_url": download_url, "config_file": config_path.name, } except json.JSONDecodeError as e: print(f"Invalid JSON in {config_path.name}: {e}") return None except Exception as e: print(f"Error analyzing {config_path.name}: {e}") return None def get_config_by_name(self, name: str) -> dict[str, Any] | None: """ Get config metadata by name Args: name: Config name (e.g., "react", "django") Returns: Config metadata or None if not found """ configs = self.analyze_all_configs() for config in configs: if config["name"] == name: return config return None def _determine_type(self, config_data: dict[str, Any]) -> str: """ Determine if config is single-source or unified Args: config_data: Config JSON data Returns: "single-source" or "unified" """ # Unified configs have "sources" array if "sources" in config_data: return "unified" # Check for merge_mode (another indicator of unified configs) if "merge_mode" in config_data: return "unified" return "single-source" def _get_primary_source(self, config_data: dict[str, Any], config_type: str) -> str: """ Get primary source URL/repo Args: config_data: Config JSON data config_type: "single-source" or "unified" Returns: Primary source URL or repo name """ if config_type == "unified": # Get first source sources = config_data.get("sources", []) if sources: first_source = sources[0] if first_source.get("type") == "documentation": return first_source.get("base_url", "") elif first_source.get("type") == "github": return f"github.com/{first_source.get('repo', '')}" elif first_source.get("type") == "pdf": return first_source.get("pdf_url", "PDF file") return "Multiple sources" # Single-source configs if "base_url" in config_data: return config_data["base_url"] elif "repo" in config_data: return f"github.com/{config_data['repo']}" elif "pdf_url" in config_data or "pdf" in config_data: return "PDF file" return "Unknown" def _categorize_config( self, name: str, description: str, config_data: dict[str, Any], config_path: Path | None = None, ) -> str: """ Categorize config using directory structure first, then keyword fallback. The configs_repo organizes files as official/{category}/{name}.json so the parent directory name is the authoritative category. Args: name: Config name description: Config description config_data: Full config data config_path: Path to config file (used to read directory-based category) Returns: Category name """ # Primary: use directory structure (official/{category}/{name}.json) if config_path is not None: parent = config_path.parent.name # Exclude generic/root directories from being used as categories if parent not in ("official", "community", "configs", "configs_repo", "."): return parent # Fallback: keyword matching against config name name_lower = name.lower() for category, keywords in self.CATEGORY_MAPPING.items(): if any(keyword in name_lower for keyword in keywords): return category # Fallback: description hints desc_lower = description.lower() if "framework" in desc_lower or "library" in desc_lower: if any(word in desc_lower for word in ["web", "frontend", "backend", "api"]): return "web-frameworks" if "game" in desc_lower or "engine" in desc_lower: return "game-engines" if "devops" in desc_lower or "deployment" in desc_lower or "infrastructure" in desc_lower: return "devops" return "uncategorized" def _extract_tags(self, name: str, description: str, config_data: dict[str, Any]) -> list[str]: """ Extract relevant tags from config Args: name: Config name description: Config description config_data: Full config data Returns: List of tags """ tags = set() name_lower = name.lower() desc_lower = description.lower() # Check against tag keywords for tag, keywords in self.TAG_KEYWORDS.items(): if any(keyword in name_lower or keyword in desc_lower for keyword in keywords): tags.add(tag) # Add config type as tag config_type = self._determine_type(config_data) if config_type == "unified": tags.add("multi-source") # Add source type tags if "base_url" in config_data or ( config_type == "unified" and any(s.get("type") == "documentation" for s in config_data.get("sources", [])) ): tags.add("documentation") if "repo" in config_data or ( config_type == "unified" and any(s.get("type") == "github" for s in config_data.get("sources", [])) ): tags.add("github") if ( "pdf" in config_data or "pdf_url" in config_data or ( config_type == "unified" and any(s.get("type") == "pdf" for s in config_data.get("sources", [])) ) ): tags.add("pdf") return sorted(list(tags)) def _get_max_pages(self, config_data: dict[str, Any]) -> int | None: """ Get max_pages value from config Args: config_data: Config JSON data Returns: max_pages value or None """ # Single-source configs if "max_pages" in config_data: return config_data["max_pages"] # Unified configs - get from first documentation source if "sources" in config_data: for source in config_data["sources"]: if source.get("type") == "documentation" and "max_pages" in source: return source["max_pages"] return None def _get_last_updated(self, config_path: Path) -> str: """ Get last updated date from git history Args: config_path: Path to config file Returns: ISO format date string """ try: # Try to get last commit date for this file result = subprocess.run( ["git", "log", "-1", "--format=%cI", str(config_path)], cwd=config_path.parent.parent, capture_output=True, text=True, timeout=5, ) if result.returncode == 0 and result.stdout.strip(): return result.stdout.strip() except Exception: pass # Fallback to file modification time mtime = config_path.stat().st_mtime return datetime.fromtimestamp(mtime).isoformat()