skill-seekers-reference/src/skill_seekers/cli/config_validator.py

#!/usr/bin/env python3
"""
Unified Config Validator

Validates unified config format that supports multiple sources:
- documentation (website scraping)
- github (repository scraping)
- pdf (PDF document scraping)
- local (local codebase analysis)
- word, video (accepted source types; only the 'type' field is checked here)

Legacy config format support removed in v2.11.0.
All configs must use unified format with 'sources' array.
"""

import json
import logging
from pathlib import Path
from typing import Any

# Module-wide logger used by ConfigValidator for progress and warning messages.
# NOTE(review): basicConfig() runs at import time and configures the root
# logger at INFO level (it is a no-op if the root logger already has handlers);
# consider moving it into the CLI entry point if this module is imported as a
# library — confirm with callers before changing.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class ConfigValidator:
    """
    Validates unified config format (legacy support removed in v2.11.0).
    """

    # Valid source types
    VALID_SOURCE_TYPES = {"documentation", "github", "pdf", "local", "word", "video"}

    # Valid merge modes
    VALID_MERGE_MODES = {"rule-based", "claude-enhanced"}

    # Valid code analysis depth levels
    VALID_DEPTH_LEVELS = {"surface", "deep", "full"}

    # Valid AI modes for C3.x enhancement
    VALID_AI_MODES = {"auto", "api", "local", "none"}

    def __init__(self, config_or_path: dict[str, Any] | str):
        """
        Initialize validator with config dict or file path.

        Args:
            config_or_path: Either a config dict or path to config JSON file
        """
        if isinstance(config_or_path, dict):
            self.config_path = None
            self.config = config_or_path
        else:
            self.config_path = config_or_path
            self.config = self._load_config()
        self.is_unified = True  # Always unified format now

    def _load_config(self) -> dict[str, Any]:
        """Load JSON config file."""
        try:
            with open(self.config_path, encoding="utf-8") as f:
                return json.load(f)
        except FileNotFoundError as e:
            raise ValueError(f"Config file not found: {self.config_path}") from e
        except json.JSONDecodeError as e:
            raise ValueError(f"Invalid JSON in config file: {e}") from e

    def validate(self) -> bool:
        """
        Validate unified config format.

        Returns:
            True if valid

        Raises:
            ValueError if invalid with detailed error message
        """
        # Check if legacy format (no sources array)
        if "sources" not in self.config:
            raise ValueError(
                "\n❌ LEGACY CONFIG FORMAT DETECTED\n\n"
                "   Legacy config format was removed in v2.11.0.\n"
                "   All configs must now use unified format with 'sources' array.\n\n"
                "   OLD FORMAT (removed):\n"
                "   {\n"
                '     "name": "example",\n'
                '     "base_url": "https://..."\n'
                "   }\n\n"
                "   NEW FORMAT (required):\n"
                "   {\n"
                '     "name": "example",\n'
                '     "description": "...",\n'
                '     "sources": [\n'
                "       {\n"
                '         "type": "documentation",\n'
                '         "base_url": "https://..."\n'
                "       }\n"
                "     ]\n"
                "   }\n\n"
                "   📖 See: https://skillseekersweb.com/docs/config-format\n"
            )

        return self._validate_unified()

    def _validate_unified(self) -> bool:
        """Validate unified config format."""
        logger.info("Validating unified config format...")

        # Required top-level fields
        if "name" not in self.config:
            raise ValueError("Missing required field: 'name'")

        if "description" not in self.config:
            raise ValueError("Missing required field: 'description'")

        if "sources" not in self.config:
            raise ValueError("Missing required field: 'sources'")

        # Validate sources array
        sources = self.config["sources"]

        if not isinstance(sources, list):
            raise ValueError("'sources' must be an array")

        if len(sources) == 0:
            raise ValueError("'sources' array cannot be empty")

        # Validate merge_mode (optional)
        merge_mode = self.config.get("merge_mode", "rule-based")
        if merge_mode not in self.VALID_MERGE_MODES:
            raise ValueError(
                f"Invalid merge_mode: '{merge_mode}'. Must be one of {self.VALID_MERGE_MODES}"
            )

        # Validate each source
        for i, source in enumerate(sources):
            self._validate_source(source, i)

        logger.info(f"✅ Unified config valid: {len(sources)} sources")
        return True

    def _validate_source(self, source: dict[str, Any], index: int):
        """Validate individual source configuration."""
        # Check source has 'type' field
        if "type" not in source:
            raise ValueError(f"Source {index}: Missing required field 'type'")

        source_type = source["type"]

        if source_type not in self.VALID_SOURCE_TYPES:
            raise ValueError(
                f"Source {index}: Invalid type '{source_type}'. Must be one of {self.VALID_SOURCE_TYPES}"
            )

        # Type-specific validation
        if source_type == "documentation":
            self._validate_documentation_source(source, index)
        elif source_type == "github":
            self._validate_github_source(source, index)
        elif source_type == "pdf":
            self._validate_pdf_source(source, index)
        elif source_type == "local":
            self._validate_local_source(source, index)

    def _validate_documentation_source(self, source: dict[str, Any], index: int):
        """Validate documentation source configuration."""
        if "base_url" not in source:
            raise ValueError(f"Source {index} (documentation): Missing required field 'base_url'")

        # Optional but recommended fields
        if "selectors" not in source:
            logger.warning(
                f"Source {index} (documentation): No 'selectors' specified, using defaults"
            )

        if "max_pages" in source and not isinstance(source["max_pages"], int):
            raise ValueError(f"Source {index} (documentation): 'max_pages' must be an integer")

    def _validate_github_source(self, source: dict[str, Any], index: int):
        """Validate GitHub source configuration."""
        if "repo" not in source:
            raise ValueError(f"Source {index} (github): Missing required field 'repo'")

        # Validate repo format (owner/repo)
        repo = source["repo"]
        if "/" not in repo:
            raise ValueError(
                f"Source {index} (github): Invalid repo format '{repo}'. Must be 'owner/repo' (e.g., 'facebook/react')"
            )

        # Validate code_analysis_depth if specified
        if "code_analysis_depth" in source:
            depth = source["code_analysis_depth"]
            if depth not in self.VALID_DEPTH_LEVELS:
                raise ValueError(
                    f"Source {index} (github): Invalid code_analysis_depth '{depth}'. "
                    f"Must be one of {self.VALID_DEPTH_LEVELS}"
                )

        # Validate max_issues if specified
        if "max_issues" in source and not isinstance(source["max_issues"], int):
            raise ValueError(f"Source {index} (github): 'max_issues' must be an integer")

        # Validate enable_codebase_analysis if specified (C3.5)
        if "enable_codebase_analysis" in source and not isinstance(
            source["enable_codebase_analysis"], bool
        ):
            raise ValueError(
                f"Source {index} (github): 'enable_codebase_analysis' must be a boolean"
            )

        # Validate ai_mode if specified (C3.5)
        if "ai_mode" in source:
            ai_mode = source["ai_mode"]
            if ai_mode not in self.VALID_AI_MODES:
                raise ValueError(
                    f"Source {index} (github): Invalid ai_mode '{ai_mode}'. Must be one of {self.VALID_AI_MODES}"
                )

    def _validate_pdf_source(self, source: dict[str, Any], index: int):
        """Validate PDF source configuration."""
        if "path" not in source:
            raise ValueError(f"Source {index} (pdf): Missing required field 'path'")

        # Check if file exists
        pdf_path = source["path"]
        if not Path(pdf_path).exists():
            logger.warning(f"Source {index} (pdf): File not found: {pdf_path}")

    def _validate_local_source(self, source: dict[str, Any], index: int):
        """Validate local codebase source configuration."""
        if "path" not in source:
            raise ValueError(f"Source {index} (local): Missing required field 'path'")

        # Check if directory exists
        local_path = source["path"]
        if not Path(local_path).exists():
            logger.warning(f"Source {index} (local): Directory not found: {local_path}")
        elif not Path(local_path).is_dir():
            raise ValueError(f"Source {index} (local): Path is not a directory: {local_path}")

        # Validate analysis_depth if provided
        if "analysis_depth" in source:
            depth = source["analysis_depth"]
            if depth not in self.VALID_DEPTH_LEVELS:
                raise ValueError(
                    f"Source {index} (local): Invalid analysis_depth '{depth}'. Must be one of {self.VALID_DEPTH_LEVELS}"
                )

        # Validate ai_mode if provided
        if "ai_mode" in source:
            ai_mode = source["ai_mode"]
            if ai_mode not in self.VALID_AI_MODES:
                raise ValueError(
                    f"Source {index} (local): Invalid ai_mode '{ai_mode}'. Must be one of {self.VALID_AI_MODES}"
                )

    def get_sources_by_type(self, source_type: str) -> list[dict[str, Any]]:
        """
        Get all sources of a specific type.

        Args:
            source_type: 'documentation', 'github', 'pdf', or 'local'

        Returns:
            List of sources matching the type
        """
        sources = self.config["sources"]
        return [s for s in sources if s.get("type") == source_type]

    def has_multiple_sources(self) -> bool:
        """Check if config has multiple sources (requires merging)."""
        return len(self.config["sources"]) > 1

    def needs_api_merge(self) -> bool:
        """
        Check if config needs API merging.

        Returns True if both documentation and github sources exist
        with API extraction enabled.
        """
        if not self.has_multiple_sources():
            return False

        has_docs_api = any(
            s.get("type") == "documentation" and s.get("extract_api", True)
            for s in self.config["sources"]
        )

        has_github_code = any(
            s.get("type") == "github" and s.get("include_code", False)
            for s in self.config["sources"]
        )

        return has_docs_api and has_github_code


def validate_config(config_path: str) -> ConfigValidator:
    """
    Build a ConfigValidator for a config file and run its validation.

    Args:
        config_path: Path to config JSON file

    Returns:
        The ConfigValidator instance, after validate() has succeeded

    Raises:
        ValueError if the file is missing, is not valid JSON, or the config
        is invalid
    """
    checked = ConfigValidator(config_path)
    checked.validate()
    return checked


# CLI entry point: validate the config file named on the command line and
# print a short summary; exit status 1 on usage error or invalid config.
if __name__ == "__main__":
    import sys

    cli_args = sys.argv[1:]
    if not cli_args:
        print("Usage: python config_validator.py <config.json>")
        sys.exit(1)

    config_file = cli_args[0]

    # The prints stay inside the try block on purpose: the handler below
    # catches every ValueError raised while reporting, exactly as before.
    try:
        checked = validate_config(config_file)
        loaded = checked.config

        print("\n✅ Config valid!")
        print(f"   Name: {loaded.get('name')}")

        source_list = loaded["sources"]
        print(f"   Sources: {len(source_list)}")
        for position, entry in enumerate(source_list, start=1):
            print(f"     {position}. {entry['type']}")

        if checked.needs_api_merge():
            mode = loaded.get("merge_mode", "rule-based")
            print(f"   ⚠️  API merge required (mode: {mode})")

    except ValueError as err:
        print(f"\n❌ Config invalid: {err}")
        sys.exit(1)