# File: skill-seekers-reference/api/config_analyzer.py
# Last commit: 5ed767ff9a "run ruff" by Pablo Estevez, 2026-01-17 17:29:21 +00:00
# 346 lines, 11 KiB, Python
#!/usr/bin/env python3
"""
Config Analyzer - Extract metadata from Skill Seekers config files
"""
import json
import subprocess
from datetime import datetime
from pathlib import Path
from typing import Any
class ConfigAnalyzer:
"""Analyzes Skill Seekers config files and extracts metadata"""
# Category mapping based on config content
CATEGORY_MAPPING = {
"web-frameworks": ["react", "vue", "django", "fastapi", "laravel", "astro", "hono"],
"game-engines": ["godot", "unity", "unreal"],
"devops": ["kubernetes", "ansible", "docker", "terraform"],
"css-frameworks": ["tailwind", "bootstrap", "bulma"],
"development-tools": ["claude-code", "vscode", "git"],
"gaming": ["steam"],
"testing": ["pytest", "jest", "test"],
}
# Tag extraction keywords
TAG_KEYWORDS = {
"javascript": ["react", "vue", "astro", "hono", "javascript", "js", "node"],
"python": ["django", "fastapi", "ansible", "python", "flask"],
"php": ["laravel", "php"],
"frontend": ["react", "vue", "astro", "tailwind", "frontend", "ui"],
"backend": ["django", "fastapi", "laravel", "backend", "server", "api"],
"css": ["tailwind", "css", "styling"],
"game-development": ["godot", "unity", "unreal", "game"],
"devops": ["kubernetes", "ansible", "docker", "k8s", "devops"],
"documentation": ["docs", "documentation"],
"testing": ["test", "testing", "pytest", "jest"],
}
def __init__(self, config_dir: Path, base_url: str = "https://api.skillseekersweb.com"):
"""
Initialize config analyzer
Args:
config_dir: Path to configs directory
base_url: Base URL for download links
"""
self.config_dir = Path(config_dir)
self.base_url = base_url
if not self.config_dir.exists():
raise ValueError(f"Config directory not found: {self.config_dir}")
def analyze_all_configs(self) -> list[dict[str, Any]]:
"""
Analyze all config files and extract metadata
Returns:
List of config metadata dicts
"""
configs = []
# Find all JSON files recursively in configs directory and subdirectories
for config_file in sorted(self.config_dir.rglob("*.json")):
# Skip test/example configs in test-examples directory
if "test-examples" in config_file.parts:
continue
try:
metadata = self.analyze_config(config_file)
if metadata: # Skip invalid configs
configs.append(metadata)
except Exception as e:
print(f"Warning: Failed to analyze {config_file.name}: {e}")
continue
return configs
def analyze_config(self, config_path: Path) -> dict[str, Any] | None:
"""
Analyze a single config file and extract metadata
Args:
config_path: Path to config JSON file
Returns:
Config metadata dict or None if invalid
"""
try:
# Read config file
with open(config_path) as f:
config_data = json.load(f)
# Skip if no name field
if "name" not in config_data:
return None
name = config_data["name"]
description = config_data.get("description", "")
# Determine config type
config_type = self._determine_type(config_data)
# Get primary source (base_url or repo)
primary_source = self._get_primary_source(config_data, config_type)
# Auto-categorize
category = self._categorize_config(name, description, config_data)
# Extract tags
tags = self._extract_tags(name, description, config_data)
# Get file metadata
file_size = config_path.stat().st_size
last_updated = self._get_last_updated(config_path)
# Generate download URL
download_url = f"{self.base_url}/api/download/{config_path.name}"
# Get max_pages (for estimation)
max_pages = self._get_max_pages(config_data)
return {
"name": name,
"description": description,
"type": config_type,
"category": category,
"tags": tags,
"primary_source": primary_source,
"max_pages": max_pages,
"file_size": file_size,
"last_updated": last_updated,
"download_url": download_url,
"config_file": config_path.name,
}
except json.JSONDecodeError as e:
print(f"Invalid JSON in {config_path.name}: {e}")
return None
except Exception as e:
print(f"Error analyzing {config_path.name}: {e}")
return None
def get_config_by_name(self, name: str) -> dict[str, Any] | None:
"""
Get config metadata by name
Args:
name: Config name (e.g., "react", "django")
Returns:
Config metadata or None if not found
"""
configs = self.analyze_all_configs()
for config in configs:
if config["name"] == name:
return config
return None
def _determine_type(self, config_data: dict[str, Any]) -> str:
"""
Determine if config is single-source or unified
Args:
config_data: Config JSON data
Returns:
"single-source" or "unified"
"""
# Unified configs have "sources" array
if "sources" in config_data:
return "unified"
# Check for merge_mode (another indicator of unified configs)
if "merge_mode" in config_data:
return "unified"
return "single-source"
def _get_primary_source(self, config_data: dict[str, Any], config_type: str) -> str:
"""
Get primary source URL/repo
Args:
config_data: Config JSON data
config_type: "single-source" or "unified"
Returns:
Primary source URL or repo name
"""
if config_type == "unified":
# Get first source
sources = config_data.get("sources", [])
if sources:
first_source = sources[0]
if first_source.get("type") == "documentation":
return first_source.get("base_url", "")
elif first_source.get("type") == "github":
return f"github.com/{first_source.get('repo', '')}"
elif first_source.get("type") == "pdf":
return first_source.get("pdf_url", "PDF file")
return "Multiple sources"
# Single-source configs
if "base_url" in config_data:
return config_data["base_url"]
elif "repo" in config_data:
return f"github.com/{config_data['repo']}"
elif "pdf_url" in config_data or "pdf" in config_data:
return "PDF file"
return "Unknown"
def _categorize_config(self, name: str, description: str, config_data: dict[str, Any]) -> str:
"""
Auto-categorize config based on name and content
Args:
name: Config name
description: Config description
config_data: Full config data
Returns:
Category name
"""
name_lower = name.lower()
# Check against category mapping
for category, keywords in self.CATEGORY_MAPPING.items():
if any(keyword in name_lower for keyword in keywords):
return category
# Check description for hints
desc_lower = description.lower()
if "framework" in desc_lower or "library" in desc_lower:
if any(word in desc_lower for word in ["web", "frontend", "backend", "api"]):
return "web-frameworks"
if "game" in desc_lower or "engine" in desc_lower:
return "game-engines"
if "devops" in desc_lower or "deployment" in desc_lower or "infrastructure" in desc_lower:
return "devops"
# Default to uncategorized
return "uncategorized"
def _extract_tags(self, name: str, description: str, config_data: dict[str, Any]) -> list[str]:
"""
Extract relevant tags from config
Args:
name: Config name
description: Config description
config_data: Full config data
Returns:
List of tags
"""
tags = set()
name_lower = name.lower()
desc_lower = description.lower()
# Check against tag keywords
for tag, keywords in self.TAG_KEYWORDS.items():
if any(keyword in name_lower or keyword in desc_lower for keyword in keywords):
tags.add(tag)
# Add config type as tag
config_type = self._determine_type(config_data)
if config_type == "unified":
tags.add("multi-source")
# Add source type tags
if "base_url" in config_data or (
config_type == "unified" and any(s.get("type") == "documentation" for s in config_data.get("sources", []))
):
tags.add("documentation")
if "repo" in config_data or (
config_type == "unified" and any(s.get("type") == "github" for s in config_data.get("sources", []))
):
tags.add("github")
if (
"pdf" in config_data
or "pdf_url" in config_data
or (config_type == "unified" and any(s.get("type") == "pdf" for s in config_data.get("sources", [])))
):
tags.add("pdf")
return sorted(list(tags))
def _get_max_pages(self, config_data: dict[str, Any]) -> int | None:
"""
Get max_pages value from config
Args:
config_data: Config JSON data
Returns:
max_pages value or None
"""
# Single-source configs
if "max_pages" in config_data:
return config_data["max_pages"]
# Unified configs - get from first documentation source
if "sources" in config_data:
for source in config_data["sources"]:
if source.get("type") == "documentation" and "max_pages" in source:
return source["max_pages"]
return None
def _get_last_updated(self, config_path: Path) -> str:
"""
Get last updated date from git history
Args:
config_path: Path to config file
Returns:
ISO format date string
"""
try:
# Try to get last commit date for this file
result = subprocess.run(
["git", "log", "-1", "--format=%cI", str(config_path)],
cwd=config_path.parent.parent,
capture_output=True,
text=True,
timeout=5,
)
if result.returncode == 0 and result.stdout.strip():
return result.stdout.strip()
except Exception:
pass
# Fallback to file modification time
mtime = config_path.stat().st_mtime
return datetime.fromtimestamp(mtime).isoformat()