From af8757273573e66d6a350c450cc03c758f85f85b Mon Sep 17 00:00:00 2001
From: yusyus
Date: Sun, 19 Oct 2025 14:55:56 +0300
Subject: [PATCH] Remove unnecessary validation limits from config validator
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Remove max_pages upper limit (was 10,000, now unlimited)
- Remove rate_limit upper limit (was 10s, now unlimited)
- Convert missing selector checks from errors to warnings
- Add warnings system (non-blocking) vs errors (blocking)
- Allow users to scrape large documentation sites (45k+ pages)
- Allow flexible rate limiting for different site requirements

All reasonable validations remain (required fields, valid URLs, correct data types, no negative values).

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude
---
 doc_scraper.py | 26 ++++++++++++++++++--------
 1 file changed, 18 insertions(+), 8 deletions(-)

diff --git a/doc_scraper.py b/doc_scraper.py
index 3d0fd45..f0928b5 100644
--- a/doc_scraper.py
+++ b/doc_scraper.py
@@ -643,6 +643,7 @@ To refresh this skill with updated documentation:
 def validate_config(config):
     """Validate configuration structure"""
     errors = []
+    warnings = []
 
     # Required fields
     required_fields = ['name', 'base_url']
@@ -668,9 +669,9 @@ def validate_config(config):
             recommended_selectors = ['main_content', 'title', 'code_blocks']
             for selector in recommended_selectors:
                 if selector not in config['selectors']:
-                    errors.append(f"Missing recommended selector: '{selector}'")
+                    warnings.append(f"Missing recommended selector: '{selector}'")
     else:
-        errors.append("Missing 'selectors' section (recommended)")
+        warnings.append("Missing 'selectors' section (recommended)")
 
     # Validate url_patterns
     if 'url_patterns' in config:
@@ -695,8 +696,8 @@ def validate_config(config):
     if 'rate_limit' in config:
         try:
             rate = float(config['rate_limit'])
-            if rate < 0 or rate > 10:
-                errors.append(f"'rate_limit' should be between 0 and 10 (got {rate})")
+            if rate < 0:
+                errors.append(f"'rate_limit' must be non-negative (got {rate})")
         except (ValueError, TypeError):
             errors.append(f"'rate_limit' must be a number (got {config['rate_limit']})")
 
@@ -704,8 +705,8 @@ def validate_config(config):
     if 'max_pages' in config:
         try:
             max_p = int(config['max_pages'])
-            if max_p < 1 or max_p > 10000:
-                errors.append(f"'max_pages' should be between 1 and 10000 (got {max_p})")
+            if max_p < 1:
+                errors.append(f"'max_pages' must be at least 1 (got {max_p})")
         except (ValueError, TypeError):
             errors.append(f"'max_pages' must be an integer (got {config['max_pages']})")
 
@@ -718,7 +719,7 @@ def validate_config(config):
                 if not url.startswith(('http://', 'https://')):
                     errors.append(f"Invalid start_url: '{url}' (must start with http:// or https://)")
 
-    return errors
+    return errors, warnings
 
 
 def load_config(config_path):
@@ -734,7 +735,16 @@ def load_config(config_path):
         sys.exit(1)
 
     # Validate config
-    errors = validate_config(config)
+    errors, warnings = validate_config(config)
+
+    # Show warnings (non-blocking)
+    if warnings:
+        print(f"⚠️ Configuration warnings in {config_path}:")
+        for warning in warnings:
+            print(f" - {warning}")
+        print()
+
+    # Show errors (blocking)
     if errors:
         print(f"❌ Configuration validation errors in {config_path}:")
         for error in errors: