Files
skill-seekers-reference/src/skill_seekers/cli/config_validator.py
yusyus 9666938eb0 fix: Resolve 21 ruff linting errors (SIM102, SIM117, B904, SIM113, B007)
Fixed all 21 linting errors identified in GitHub Actions:

SIM102 (7 errors - nested if statements):
- config_extractor.py:468 - Combined nested conditions
- config_validator.py (was B904, already fixed)
- pattern_recognizer.py:430,538,916 - Combined nested conditions
- test_example_extractor.py:365,412,460 - Combined nested conditions
- unified_skill_builder.py:1070 - Combined nested conditions

SIM117 (9 errors - multiple with statements):
- test_install_agent.py:418 - Combined with statements
- test_issue_219_e2e.py:278 - Combined with statements
- test_llms_txt_downloader.py:33,88 - Combined with statements
- test_skip_llms_txt.py:75,98,121,148,172,304 - Combined with statements

B904 (1 error - exception handling):
- config_validator.py:62 - Added 'from e' to exception chain

SIM113 (1 error - enumerate usage):
- doc_scraper.py:1068 - Removed unused 'completed' counter variable

B007 (1 error - unused loop variable):
- pdf_scraper.py:167 - Changed 'keywords' to '_' for unused variable

All changes improve code quality without altering functionality.
Tests: 1214 passed, 167 skipped (4 pre-existing failures unrelated)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-01-17 23:54:22 +03:00

395 lines
13 KiB
Python

#!/usr/bin/env python3
"""
Unified Config Validator
Validates unified config format that supports multiple sources:
- documentation (website scraping)
- github (repository scraping)
- pdf (PDF document scraping)
Also provides backward compatibility detection for legacy configs.
"""
import json
import logging
from pathlib import Path
from typing import Any
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class ConfigValidator:
"""
Validates unified config format and provides backward compatibility.
"""
# Valid source types
VALID_SOURCE_TYPES = {"documentation", "github", "pdf"}
# Valid merge modes
VALID_MERGE_MODES = {"rule-based", "claude-enhanced"}
# Valid code analysis depth levels
VALID_DEPTH_LEVELS = {"surface", "deep", "full"}
# Valid AI modes for C3.x enhancement
VALID_AI_MODES = {"auto", "api", "local", "none"}
def __init__(self, config_or_path: dict[str, Any] | str):
"""
Initialize validator with config dict or file path.
Args:
config_or_path: Either a config dict or path to config JSON file
"""
if isinstance(config_or_path, dict):
self.config_path = None
self.config = config_or_path
else:
self.config_path = config_or_path
self.config = self._load_config()
self.is_unified = self._detect_format()
def _load_config(self) -> dict[str, Any]:
"""Load JSON config file."""
try:
with open(self.config_path, encoding="utf-8") as f:
return json.load(f)
except FileNotFoundError as e:
raise ValueError(f"Config file not found: {self.config_path}") from e
except json.JSONDecodeError as e:
raise ValueError(f"Invalid JSON in config file: {e}") from e
def _detect_format(self) -> bool:
"""
Detect if config is unified format or legacy.
Returns:
True if unified format (has 'sources' array)
False if legacy format
"""
return "sources" in self.config and isinstance(self.config["sources"], list)
def validate(self) -> bool:
"""
Validate config based on detected format.
Returns:
True if valid
Raises:
ValueError if invalid with detailed error message
"""
if self.is_unified:
return self._validate_unified()
else:
return self._validate_legacy()
def _validate_unified(self) -> bool:
"""Validate unified config format."""
logger.info("Validating unified config format...")
# Required top-level fields
if "name" not in self.config:
raise ValueError("Missing required field: 'name'")
if "description" not in self.config:
raise ValueError("Missing required field: 'description'")
if "sources" not in self.config:
raise ValueError("Missing required field: 'sources'")
# Validate sources array
sources = self.config["sources"]
if not isinstance(sources, list):
raise ValueError("'sources' must be an array")
if len(sources) == 0:
raise ValueError("'sources' array cannot be empty")
# Validate merge_mode (optional)
merge_mode = self.config.get("merge_mode", "rule-based")
if merge_mode not in self.VALID_MERGE_MODES:
raise ValueError(
f"Invalid merge_mode: '{merge_mode}'. Must be one of {self.VALID_MERGE_MODES}"
)
# Validate each source
for i, source in enumerate(sources):
self._validate_source(source, i)
logger.info(f"✅ Unified config valid: {len(sources)} sources")
return True
def _validate_source(self, source: dict[str, Any], index: int):
"""Validate individual source configuration."""
# Check source has 'type' field
if "type" not in source:
raise ValueError(f"Source {index}: Missing required field 'type'")
source_type = source["type"]
if source_type not in self.VALID_SOURCE_TYPES:
raise ValueError(
f"Source {index}: Invalid type '{source_type}'. Must be one of {self.VALID_SOURCE_TYPES}"
)
# Type-specific validation
if source_type == "documentation":
self._validate_documentation_source(source, index)
elif source_type == "github":
self._validate_github_source(source, index)
elif source_type == "pdf":
self._validate_pdf_source(source, index)
def _validate_documentation_source(self, source: dict[str, Any], index: int):
"""Validate documentation source configuration."""
if "base_url" not in source:
raise ValueError(f"Source {index} (documentation): Missing required field 'base_url'")
# Optional but recommended fields
if "selectors" not in source:
logger.warning(
f"Source {index} (documentation): No 'selectors' specified, using defaults"
)
if "max_pages" in source and not isinstance(source["max_pages"], int):
raise ValueError(f"Source {index} (documentation): 'max_pages' must be an integer")
def _validate_github_source(self, source: dict[str, Any], index: int):
"""Validate GitHub source configuration."""
if "repo" not in source:
raise ValueError(f"Source {index} (github): Missing required field 'repo'")
# Validate repo format (owner/repo)
repo = source["repo"]
if "/" not in repo:
raise ValueError(
f"Source {index} (github): Invalid repo format '{repo}'. Must be 'owner/repo' (e.g., 'facebook/react')"
)
# Validate code_analysis_depth if specified
if "code_analysis_depth" in source:
depth = source["code_analysis_depth"]
if depth not in self.VALID_DEPTH_LEVELS:
raise ValueError(
f"Source {index} (github): Invalid code_analysis_depth '{depth}'. "
f"Must be one of {self.VALID_DEPTH_LEVELS}"
)
# Validate max_issues if specified
if "max_issues" in source and not isinstance(source["max_issues"], int):
raise ValueError(f"Source {index} (github): 'max_issues' must be an integer")
# Validate enable_codebase_analysis if specified (C3.5)
if "enable_codebase_analysis" in source and not isinstance(
source["enable_codebase_analysis"], bool
):
raise ValueError(
f"Source {index} (github): 'enable_codebase_analysis' must be a boolean"
)
# Validate ai_mode if specified (C3.5)
if "ai_mode" in source:
ai_mode = source["ai_mode"]
if ai_mode not in self.VALID_AI_MODES:
raise ValueError(
f"Source {index} (github): Invalid ai_mode '{ai_mode}'. Must be one of {self.VALID_AI_MODES}"
)
def _validate_pdf_source(self, source: dict[str, Any], index: int):
"""Validate PDF source configuration."""
if "path" not in source:
raise ValueError(f"Source {index} (pdf): Missing required field 'path'")
# Check if file exists
pdf_path = source["path"]
if not Path(pdf_path).exists():
logger.warning(f"Source {index} (pdf): File not found: {pdf_path}")
def _validate_legacy(self) -> bool:
"""
Validate legacy config format (backward compatibility).
Legacy configs are the old format used by doc_scraper, github_scraper, pdf_scraper.
"""
logger.info("Detected legacy config format (backward compatible)")
# Detect which legacy type based on fields
if "base_url" in self.config:
logger.info("Legacy type: documentation")
elif "repo" in self.config:
logger.info("Legacy type: github")
elif "pdf" in self.config or "path" in self.config:
logger.info("Legacy type: pdf")
else:
raise ValueError("Cannot detect legacy config type (missing base_url, repo, or pdf)")
return True
def convert_legacy_to_unified(self) -> dict[str, Any]:
"""
Convert legacy config to unified format.
Returns:
Unified config dict
"""
if self.is_unified:
logger.info("Config already in unified format")
return self.config
logger.info("Converting legacy config to unified format...")
# Detect legacy type and convert
if "base_url" in self.config:
return self._convert_legacy_documentation()
elif "repo" in self.config:
return self._convert_legacy_github()
elif "pdf" in self.config or "path" in self.config:
return self._convert_legacy_pdf()
else:
raise ValueError("Cannot convert: unknown legacy format")
def _convert_legacy_documentation(self) -> dict[str, Any]:
"""Convert legacy documentation config to unified."""
unified = {
"name": self.config.get("name", "unnamed"),
"description": self.config.get("description", "Documentation skill"),
"merge_mode": "rule-based",
"sources": [
{
"type": "documentation",
**{k: v for k, v in self.config.items() if k not in ["name", "description"]},
}
],
}
return unified
def _convert_legacy_github(self) -> dict[str, Any]:
"""Convert legacy GitHub config to unified."""
unified = {
"name": self.config.get("name", "unnamed"),
"description": self.config.get("description", "GitHub repository skill"),
"merge_mode": "rule-based",
"sources": [
{
"type": "github",
**{k: v for k, v in self.config.items() if k not in ["name", "description"]},
}
],
}
return unified
def _convert_legacy_pdf(self) -> dict[str, Any]:
"""Convert legacy PDF config to unified."""
unified = {
"name": self.config.get("name", "unnamed"),
"description": self.config.get("description", "PDF document skill"),
"merge_mode": "rule-based",
"sources": [
{
"type": "pdf",
**{k: v for k, v in self.config.items() if k not in ["name", "description"]},
}
],
}
return unified
def get_sources_by_type(self, source_type: str) -> list[dict[str, Any]]:
"""
Get all sources of a specific type.
Args:
source_type: 'documentation', 'github', or 'pdf'
Returns:
List of sources matching the type
"""
if not self.is_unified:
# For legacy, convert and get sources
unified = self.convert_legacy_to_unified()
sources = unified["sources"]
else:
sources = self.config["sources"]
return [s for s in sources if s.get("type") == source_type]
def has_multiple_sources(self) -> bool:
"""Check if config has multiple sources (requires merging)."""
if not self.is_unified:
return False
return len(self.config["sources"]) > 1
def needs_api_merge(self) -> bool:
"""
Check if config needs API merging.
Returns True if both documentation and github sources exist
with API extraction enabled.
"""
if not self.has_multiple_sources():
return False
has_docs_api = any(
s.get("type") == "documentation" and s.get("extract_api", True)
for s in self.config["sources"]
)
has_github_code = any(
s.get("type") == "github" and s.get("include_code", False)
for s in self.config["sources"]
)
return has_docs_api and has_github_code
def validate_config(config_path: str) -> ConfigValidator:
"""
Validate config file and return validator instance.
Args:
config_path: Path to config JSON file
Returns:
ConfigValidator instance
Raises:
ValueError if config is invalid
"""
validator = ConfigValidator(config_path)
validator.validate()
return validator
if __name__ == "__main__":
import sys
if len(sys.argv) < 2:
print("Usage: python config_validator.py <config.json>")
sys.exit(1)
config_file = sys.argv[1]
try:
validator = validate_config(config_file)
print("\n✅ Config valid!")
print(f" Format: {'Unified' if validator.is_unified else 'Legacy'}")
print(f" Name: {validator.config.get('name')}")
if validator.is_unified:
sources = validator.config["sources"]
print(f" Sources: {len(sources)}")
for i, source in enumerate(sources):
print(f" {i + 1}. {source['type']}")
if validator.needs_api_merge():
merge_mode = validator.config.get("merge_mode", "rule-based")
print(f" ⚠️ API merge required (mode: {merge_mode})")
except ValueError as e:
print(f"\n❌ Config invalid: {e}")
sys.exit(1)