Files
skill-seekers-reference/api/config_analyzer.py
yusyus c6602da203 feat(api): Update base URL to api.skillseekersweb.com
- Update default base_url in ConfigAnalyzer to api.skillseekersweb.com
- Update website URL in API root endpoint
- Update test_api.py to use custom domain
- Prepare for custom domain deployment
2025-11-30 18:26:57 +03:00

349 lines
11 KiB
Python

#!/usr/bin/env python3
"""
Config Analyzer - Extract metadata from Skill Seekers config files
"""
import json
import os
import subprocess
from pathlib import Path
from typing import List, Dict, Any, Optional
from datetime import datetime
class ConfigAnalyzer:
    """Analyzes Skill Seekers config files and extracts metadata.

    Every ``*.json`` file in the config directory is summarised as a plain
    dict holding its name, description, type, category, tags, primary
    source, ``max_pages``, file size, last-updated timestamp and download URL.
    """

    # Category mapping based on config content.
    # NOTE: keywords are matched as *substrings* of the lower-cased config
    # name, and the first category (in declaration order) with a hit wins.
    CATEGORY_MAPPING = {
        "web-frameworks": [
            "react", "vue", "django", "fastapi", "laravel", "astro", "hono"
        ],
        "game-engines": [
            "godot", "unity", "unreal"
        ],
        "devops": [
            "kubernetes", "ansible", "docker", "terraform"
        ],
        "css-frameworks": [
            "tailwind", "bootstrap", "bulma"
        ],
        "development-tools": [
            "claude-code", "vscode", "git"
        ],
        "gaming": [
            "steam"
        ],
        "testing": [
            "pytest", "jest", "test"
        ]
    }

    # Tag extraction keywords.
    # NOTE: keywords are matched as substrings of the lower-cased name and
    # description, so short keywords ("js", "ui", "api") can also hit inside
    # longer words.
    TAG_KEYWORDS = {
        "javascript": ["react", "vue", "astro", "hono", "javascript", "js", "node"],
        "python": ["django", "fastapi", "ansible", "python", "flask"],
        "php": ["laravel", "php"],
        "frontend": ["react", "vue", "astro", "tailwind", "frontend", "ui"],
        "backend": ["django", "fastapi", "laravel", "backend", "server", "api"],
        "css": ["tailwind", "css", "styling"],
        "game-development": ["godot", "unity", "unreal", "game"],
        "devops": ["kubernetes", "ansible", "docker", "k8s", "devops"],
        "documentation": ["docs", "documentation"],
        "testing": ["test", "testing", "pytest", "jest"]
    }

    def __init__(self, config_dir: Path, base_url: str = "https://api.skillseekersweb.com"):
        """
        Initialize config analyzer

        Args:
            config_dir: Path to configs directory
            base_url: Base URL for download links

        Raises:
            ValueError: If config_dir does not exist
        """
        self.config_dir = Path(config_dir)
        self.base_url = base_url
        if not self.config_dir.exists():
            raise ValueError(f"Config directory not found: {self.config_dir}")

    def analyze_all_configs(self) -> List[Dict[str, Any]]:
        """
        Analyze all config files and extract metadata

        Files are visited in sorted filename order; files that cannot be
        analyzed are skipped.

        Returns:
            List of config metadata dicts
        """
        configs = []
        # Find all JSON files in configs directory
        for config_file in sorted(self.config_dir.glob("*.json")):
            try:
                metadata = self.analyze_config(config_file)
            except Exception as e:
                # Top-level boundary: one bad file must not abort the listing.
                print(f"Warning: Failed to analyze {config_file.name}: {e}")
                continue
            if metadata:  # Skip invalid configs
                configs.append(metadata)
        return configs

    def analyze_config(self, config_path: Path) -> Optional[Dict[str, Any]]:
        """
        Analyze a single config file and extract metadata

        Args:
            config_path: Path to config JSON file

        Returns:
            Config metadata dict, or None if the file is unreadable, is not
            valid JSON, is not a JSON object, or has no string "name" field
        """
        config_path = Path(config_path)
        try:
            # JSON is UTF-8 by specification; "utf-8-sig" additionally
            # tolerates a leading BOM and does not depend on the locale.
            with open(config_path, "r", encoding="utf-8-sig") as f:
                config_data = json.load(f)

            # Only JSON objects with a string name are valid configs.
            if not isinstance(config_data, dict) or "name" not in config_data:
                return None
            name = config_data["name"]
            if not isinstance(name, str):
                return None

            # A missing or null description is treated as empty.
            description = config_data.get("description") or ""
            if not isinstance(description, str):
                description = str(description)

            config_type = self._determine_type(config_data)

            return {
                "name": name,
                "description": description,
                "type": config_type,
                "category": self._categorize_config(name, description, config_data),
                "tags": self._extract_tags(name, description, config_data),
                "primary_source": self._get_primary_source(config_data, config_type),
                "max_pages": self._get_max_pages(config_data),
                "file_size": config_path.stat().st_size,
                "last_updated": self._get_last_updated(config_path),
                # Strip a trailing slash so the URL never contains "//api".
                "download_url": f"{self.base_url.rstrip('/')}/api/download/{config_path.name}",
                "config_file": config_path.name
            }
        except json.JSONDecodeError as e:
            print(f"Invalid JSON in {config_path.name}: {e}")
            return None
        except Exception as e:
            # Boundary of the "None if invalid" contract: report and skip.
            print(f"Error analyzing {config_path.name}: {e}")
            return None

    def get_config_by_name(self, name: str) -> Optional[Dict[str, Any]]:
        """
        Get config metadata by name

        Args:
            name: Config name (e.g., "react", "django")

        Returns:
            Metadata of the first config whose "name" equals name, or None
            if not found
        """
        return next(
            (config for config in self.analyze_all_configs() if config["name"] == name),
            None,
        )

    @staticmethod
    def _source_entries(config_data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        Return the dict entries of the config's "sources" list

        Args:
            config_data: Config JSON data

        Returns:
            List of source dicts; empty if "sources" is missing, null or not
            a list. Non-dict entries are dropped.
        """
        sources = config_data.get("sources")
        if not isinstance(sources, list):
            return []
        return [source for source in sources if isinstance(source, dict)]

    def _determine_type(self, config_data: Dict[str, Any]) -> str:
        """
        Determine if config is single-source or unified

        Args:
            config_data: Config JSON data

        Returns:
            "single-source" or "unified"
        """
        # Unified configs have a "sources" array; "merge_mode" is another
        # indicator of unified configs.
        if "sources" in config_data or "merge_mode" in config_data:
            return "unified"
        return "single-source"

    def _get_primary_source(self, config_data: Dict[str, Any], config_type: str) -> str:
        """
        Get primary source URL/repo

        Args:
            config_data: Config JSON data
            config_type: "single-source" or "unified"

        Returns:
            Primary source URL or repo name; "Multiple sources" for a unified
            config whose first source has no recognised type, "Unknown" for a
            single-source config without base_url, repo or pdf keys
        """
        if config_type == "unified":
            sources = self._source_entries(config_data)
            if sources:
                first_source = sources[0]
                source_type = first_source.get("type")
                if source_type == "documentation":
                    return first_source.get("base_url", "")
                if source_type == "github":
                    return f"github.com/{first_source.get('repo', '')}"
                if source_type == "pdf":
                    return first_source.get("pdf_url", "PDF file")
            return "Multiple sources"

        # Single-source configs
        if "base_url" in config_data:
            return config_data["base_url"]
        if "repo" in config_data:
            return f"github.com/{config_data['repo']}"
        if "pdf_url" in config_data or "pdf" in config_data:
            return "PDF file"
        return "Unknown"

    def _categorize_config(self, name: str, description: str, config_data: Dict[str, Any]) -> str:
        """
        Auto-categorize config based on name and content

        Args:
            name: Config name
            description: Config description
            config_data: Full config data (currently unused)

        Returns:
            Category name, or "uncategorized" when nothing matches
        """
        name_lower = name.lower()
        # The name decides first: first category with a keyword hit wins.
        for category, keywords in self.CATEGORY_MAPPING.items():
            if any(keyword in name_lower for keyword in keywords):
                return category

        # Otherwise look for hints in the description.
        desc_lower = description.lower()
        if "framework" in desc_lower or "library" in desc_lower:
            if any(word in desc_lower for word in ["web", "frontend", "backend", "api"]):
                return "web-frameworks"
        if "game" in desc_lower or "engine" in desc_lower:
            return "game-engines"
        if "devops" in desc_lower or "deployment" in desc_lower or "infrastructure" in desc_lower:
            return "devops"

        # Default to uncategorized
        return "uncategorized"

    def _extract_tags(self, name: str, description: str, config_data: Dict[str, Any]) -> List[str]:
        """
        Extract relevant tags from config

        Args:
            name: Config name
            description: Config description
            config_data: Full config data

        Returns:
            Alphabetically sorted list of unique tags
        """
        tags = set()
        name_lower = name.lower()
        desc_lower = description.lower()

        # Keyword tags from name and description
        for tag, keywords in self.TAG_KEYWORDS.items():
            if any(keyword in name_lower or keyword in desc_lower for keyword in keywords):
                tags.add(tag)

        # Config type tag
        if self._determine_type(config_data) == "unified":
            tags.add("multi-source")

        # Source type tags: top-level keys for single-source configs,
        # per-source "type" values for unified ones.
        source_types = [source.get("type") for source in self._source_entries(config_data)]
        if "base_url" in config_data or "documentation" in source_types:
            tags.add("documentation")
        if "repo" in config_data or "github" in source_types:
            tags.add("github")
        if "pdf" in config_data or "pdf_url" in config_data or "pdf" in source_types:
            tags.add("pdf")

        return sorted(tags)

    def _get_max_pages(self, config_data: Dict[str, Any]) -> Optional[int]:
        """
        Get max_pages value from config

        Args:
            config_data: Config JSON data

        Returns:
            Top-level max_pages if present, else the max_pages of the first
            documentation source that defines one, else None
        """
        # Single-source configs
        if "max_pages" in config_data:
            return config_data["max_pages"]

        # Unified configs - get from first documentation source
        for source in self._source_entries(config_data):
            if source.get("type") == "documentation" and "max_pages" in source:
                return source["max_pages"]
        return None

    def _get_last_updated(self, config_path: Path) -> str:
        """
        Get last updated date from git history

        Falls back to the file modification time when git is unavailable,
        fails, times out, or reports no commit for the file.

        Args:
            config_path: Path to config file

        Returns:
            ISO format date string with a UTC offset
        """
        # Use an absolute path so the pathspec stays valid after changing
        # cwd; git locates the repository by walking up from the file's
        # own directory.
        abs_path = Path(config_path).absolute()
        try:
            result = subprocess.run(
                ["git", "log", "-1", "--format=%cI", "--", str(abs_path)],
                cwd=abs_path.parent,
                capture_output=True,
                text=True,
                timeout=5
            )
        except (OSError, subprocess.SubprocessError):
            # Best effort: git missing, not executable, or timed out.
            result = None

        if result is not None and result.returncode == 0:
            commit_date = result.stdout.strip()
            if commit_date:
                return commit_date

        # Fallback to file modification time; astimezone() attaches the
        # local UTC offset so both branches return offset-aware timestamps.
        mtime = abs_path.stat().st_mtime
        return datetime.fromtimestamp(mtime).astimezone().isoformat()