#!/usr/bin/env python3 """ Unified Config Validator Validates unified config format that supports multiple sources: - documentation (website scraping) - github (repository scraping) - pdf (PDF document scraping) - local (local codebase analysis) Legacy config format support removed in v2.11.0. All configs must use unified format with 'sources' array. """ import json import logging from pathlib import Path from typing import Any logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) class ConfigValidator: """ Validates unified config format (legacy support removed in v2.11.0). """ # Valid source types VALID_SOURCE_TYPES = {"documentation", "github", "pdf", "local"} # Valid merge modes VALID_MERGE_MODES = {"rule-based", "claude-enhanced"} # Valid code analysis depth levels VALID_DEPTH_LEVELS = {"surface", "deep", "full"} # Valid AI modes for C3.x enhancement VALID_AI_MODES = {"auto", "api", "local", "none"} def __init__(self, config_or_path: dict[str, Any] | str): """ Initialize validator with config dict or file path. Args: config_or_path: Either a config dict or path to config JSON file """ if isinstance(config_or_path, dict): self.config_path = None self.config = config_or_path else: self.config_path = config_or_path self.config = self._load_config() self.is_unified = True # Always unified format now def _load_config(self) -> dict[str, Any]: """Load JSON config file.""" try: with open(self.config_path, encoding="utf-8") as f: return json.load(f) except FileNotFoundError as e: raise ValueError(f"Config file not found: {self.config_path}") from e except json.JSONDecodeError as e: raise ValueError(f"Invalid JSON in config file: {e}") from e def validate(self) -> bool: """ Validate unified config format. Returns: True if valid Raises: ValueError if invalid with detailed error message """ # Check if legacy format (no sources array) if "sources" not in self.config: raise ValueError( "\nāŒ LEGACY CONFIG FORMAT DETECTED\n\n" " Legacy config format was removed in v2.11.0.\n" " All configs must now use unified format with 'sources' array.\n\n" " OLD FORMAT (removed):\n" " {\n" ' "name": "example",\n' ' "base_url": "https://..."\n' " }\n\n" " NEW FORMAT (required):\n" " {\n" ' "name": "example",\n' ' "description": "...",\n' ' "sources": [\n' " {\n" ' "type": "documentation",\n' ' "base_url": "https://..."\n' " }\n" " ]\n" " }\n\n" " šŸ“– See: https://skillseekersweb.com/docs/config-format\n" ) return self._validate_unified() def _validate_unified(self) -> bool: """Validate unified config format.""" logger.info("Validating unified config format...") # Required top-level fields if "name" not in self.config: raise ValueError("Missing required field: 'name'") if "description" not in self.config: raise ValueError("Missing required field: 'description'") if "sources" not in self.config: raise ValueError("Missing required field: 'sources'") # Validate sources array sources = self.config["sources"] if not isinstance(sources, list): raise ValueError("'sources' must be an array") if len(sources) == 0: raise ValueError("'sources' array cannot be empty") # Validate merge_mode (optional) merge_mode = self.config.get("merge_mode", "rule-based") if merge_mode not in self.VALID_MERGE_MODES: raise ValueError( f"Invalid merge_mode: '{merge_mode}'. Must be one of {self.VALID_MERGE_MODES}" ) # Validate each source for i, source in enumerate(sources): self._validate_source(source, i) logger.info(f"āœ… Unified config valid: {len(sources)} sources") return True def _validate_source(self, source: dict[str, Any], index: int): """Validate individual source configuration.""" # Check source has 'type' field if "type" not in source: raise ValueError(f"Source {index}: Missing required field 'type'") source_type = source["type"] if source_type not in self.VALID_SOURCE_TYPES: raise ValueError( f"Source {index}: Invalid type '{source_type}'. Must be one of {self.VALID_SOURCE_TYPES}" ) # Type-specific validation if source_type == "documentation": self._validate_documentation_source(source, index) elif source_type == "github": self._validate_github_source(source, index) elif source_type == "pdf": self._validate_pdf_source(source, index) elif source_type == "local": self._validate_local_source(source, index) def _validate_documentation_source(self, source: dict[str, Any], index: int): """Validate documentation source configuration.""" if "base_url" not in source: raise ValueError(f"Source {index} (documentation): Missing required field 'base_url'") # Optional but recommended fields if "selectors" not in source: logger.warning( f"Source {index} (documentation): No 'selectors' specified, using defaults" ) if "max_pages" in source and not isinstance(source["max_pages"], int): raise ValueError(f"Source {index} (documentation): 'max_pages' must be an integer") def _validate_github_source(self, source: dict[str, Any], index: int): """Validate GitHub source configuration.""" if "repo" not in source: raise ValueError(f"Source {index} (github): Missing required field 'repo'") # Validate repo format (owner/repo) repo = source["repo"] if "/" not in repo: raise ValueError( f"Source {index} (github): Invalid repo format '{repo}'. Must be 'owner/repo' (e.g., 'facebook/react')" ) # Validate code_analysis_depth if specified if "code_analysis_depth" in source: depth = source["code_analysis_depth"] if depth not in self.VALID_DEPTH_LEVELS: raise ValueError( f"Source {index} (github): Invalid code_analysis_depth '{depth}'. " f"Must be one of {self.VALID_DEPTH_LEVELS}" ) # Validate max_issues if specified if "max_issues" in source and not isinstance(source["max_issues"], int): raise ValueError(f"Source {index} (github): 'max_issues' must be an integer") # Validate enable_codebase_analysis if specified (C3.5) if "enable_codebase_analysis" in source and not isinstance( source["enable_codebase_analysis"], bool ): raise ValueError( f"Source {index} (github): 'enable_codebase_analysis' must be a boolean" ) # Validate ai_mode if specified (C3.5) if "ai_mode" in source: ai_mode = source["ai_mode"] if ai_mode not in self.VALID_AI_MODES: raise ValueError( f"Source {index} (github): Invalid ai_mode '{ai_mode}'. Must be one of {self.VALID_AI_MODES}" ) def _validate_pdf_source(self, source: dict[str, Any], index: int): """Validate PDF source configuration.""" if "path" not in source: raise ValueError(f"Source {index} (pdf): Missing required field 'path'") # Check if file exists pdf_path = source["path"] if not Path(pdf_path).exists(): logger.warning(f"Source {index} (pdf): File not found: {pdf_path}") def _validate_local_source(self, source: dict[str, Any], index: int): """Validate local codebase source configuration.""" if "path" not in source: raise ValueError(f"Source {index} (local): Missing required field 'path'") # Check if directory exists local_path = source["path"] if not Path(local_path).exists(): logger.warning(f"Source {index} (local): Directory not found: {local_path}") elif not Path(local_path).is_dir(): raise ValueError(f"Source {index} (local): Path is not a directory: {local_path}") # Validate analysis_depth if provided if "analysis_depth" in source: depth = source["analysis_depth"] if depth not in self.VALID_DEPTH_LEVELS: raise ValueError( f"Source {index} (local): Invalid analysis_depth '{depth}'. Must be one of {self.VALID_DEPTH_LEVELS}" ) # Validate ai_mode if provided if "ai_mode" in source: ai_mode = source["ai_mode"] if ai_mode not in self.VALID_AI_MODES: raise ValueError( f"Source {index} (local): Invalid ai_mode '{ai_mode}'. Must be one of {self.VALID_AI_MODES}" ) def get_sources_by_type(self, source_type: str) -> list[dict[str, Any]]: """ Get all sources of a specific type. Args: source_type: 'documentation', 'github', 'pdf', or 'local' Returns: List of sources matching the type """ sources = self.config["sources"] return [s for s in sources if s.get("type") == source_type] def has_multiple_sources(self) -> bool: """Check if config has multiple sources (requires merging).""" return len(self.config["sources"]) > 1 def needs_api_merge(self) -> bool: """ Check if config needs API merging. Returns True if both documentation and github sources exist with API extraction enabled. """ if not self.has_multiple_sources(): return False has_docs_api = any( s.get("type") == "documentation" and s.get("extract_api", True) for s in self.config["sources"] ) has_github_code = any( s.get("type") == "github" and s.get("include_code", False) for s in self.config["sources"] ) return has_docs_api and has_github_code def validate_config(config_path: str) -> ConfigValidator: """ Validate config file and return validator instance. Args: config_path: Path to config JSON file Returns: ConfigValidator instance Raises: ValueError if config is invalid """ validator = ConfigValidator(config_path) validator.validate() return validator if __name__ == "__main__": import sys if len(sys.argv) < 2: print("Usage: python config_validator.py ") sys.exit(1) config_file = sys.argv[1] try: validator = validate_config(config_file) print("\nāœ… Config valid!") print(f" Name: {validator.config.get('name')}") sources = validator.config["sources"] print(f" Sources: {len(sources)}") for i, source in enumerate(sources): print(f" {i + 1}. {source['type']}") if validator.needs_api_merge(): merge_mode = validator.config.get("merge_mode", "rule-based") print(f" āš ļø API merge required (mode: {merge_mode})") except ValueError as e: print(f"\nāŒ Config invalid: {e}") sys.exit(1)