Pablo Estevez
2026-01-17 17:29:21 +00:00
parent c89f059712
commit 5ed767ff9a
144 changed files with 14142 additions and 16488 deletions


@@ -4,23 +4,20 @@ Page Count Estimator for Skill Seeker
 Quickly estimates how many pages a config will scrape without downloading content
 """
-import sys
+import json
+import os
+import sys
+import time
+from pathlib import Path
+from urllib.parse import urljoin, urlparse
 import requests
 from bs4 import BeautifulSoup
-from urllib.parse import urljoin, urlparse
-import time
-import json
-from pathlib import Path
+# Add parent directory to path for imports when run as script
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-from skill_seekers.cli.constants import (
-    DEFAULT_RATE_LIMIT,
-    DEFAULT_MAX_DISCOVERY,
-    DISCOVERY_THRESHOLD
-)
+from skill_seekers.cli.constants import DEFAULT_MAX_DISCOVERY, DEFAULT_RATE_LIMIT, DISCOVERY_THRESHOLD
 def estimate_pages(config, max_discovery=DEFAULT_MAX_DISCOVERY, timeout=30):
@@ -35,20 +32,20 @@ def estimate_pages(config, max_discovery=DEFAULT_MAX_DISCOVERY, timeout=30):
     Returns:
         dict with estimation results
     """
-    base_url = config['base_url']
-    start_urls = config.get('start_urls', [base_url])
-    url_patterns = config.get('url_patterns', {'include': [], 'exclude': []})
-    rate_limit = config.get('rate_limit', DEFAULT_RATE_LIMIT)
+    base_url = config["base_url"]
+    start_urls = config.get("start_urls", [base_url])
+    url_patterns = config.get("url_patterns", {"include": [], "exclude": []})
+    rate_limit = config.get("rate_limit", DEFAULT_RATE_LIMIT)
     visited = set()
     pending = list(start_urls)
     discovered = 0
-    include_patterns = url_patterns.get('include', [])
-    exclude_patterns = url_patterns.get('exclude', [])
+    include_patterns = url_patterns.get("include", [])
+    exclude_patterns = url_patterns.get("exclude", [])
     # Handle unlimited mode
-    unlimited = (max_discovery == -1 or max_discovery is None)
+    unlimited = max_discovery == -1 or max_discovery is None
     print(f"🔍 Estimating pages for: {config['name']}")
     print(f"📍 Base URL: {base_url}")
@@ -56,8 +53,8 @@ def estimate_pages(config, max_discovery=DEFAULT_MAX_DISCOVERY, timeout=30):
     print(f"⏱️ Rate limit: {rate_limit}s")
     if unlimited:
-        print(f"🔢 Max discovery: UNLIMITED (will discover all pages)")
-        print(f"⚠️ WARNING: This may take a long time!")
+        print("🔢 Max discovery: UNLIMITED (will discover all pages)")
+        print("⚠️ WARNING: This may take a long time!")
     else:
         print(f"🔢 Max discovery: {max_discovery}")
@@ -80,26 +77,26 @@ def estimate_pages(config, max_discovery=DEFAULT_MAX_DISCOVERY, timeout=30):
         if discovered % 10 == 0:
             elapsed = time.time() - start_time
             rate = discovered / elapsed if elapsed > 0 else 0
-            print(f"⏳ Discovered: {discovered} pages ({rate:.1f} pages/sec)", end='\r')
+            print(f"⏳ Discovered: {discovered} pages ({rate:.1f} pages/sec)", end="\r")
         try:
             # HEAD request first to check if page exists (faster)
             head_response = requests.head(url, timeout=timeout, allow_redirects=True)
             # Skip non-HTML content
-            content_type = head_response.headers.get('Content-Type', '')
-            if 'text/html' not in content_type:
+            content_type = head_response.headers.get("Content-Type", "")
+            if "text/html" not in content_type:
                 continue
             # Now GET the page to find links
             response = requests.get(url, timeout=timeout)
             response.raise_for_status()
-            soup = BeautifulSoup(response.content, 'html.parser')
+            soup = BeautifulSoup(response.content, "html.parser")
             # Find all links
-            for link in soup.find_all('a', href=True):
-                href = link['href']
+            for link in soup.find_all("a", href=True):
+                href = link["href"]
                 full_url = urljoin(url, href)
                 # Normalize URL
@@ -117,10 +114,10 @@ def estimate_pages(config, max_discovery=DEFAULT_MAX_DISCOVERY, timeout=30):
             # Rate limiting
             time.sleep(rate_limit)
-        except requests.RequestException as e:
+        except requests.RequestException:
             # Silently skip errors during estimation
             pass
-        except Exception as e:
+        except Exception:
             # Silently skip other errors
             pass
@@ -128,13 +125,13 @@ def estimate_pages(config, max_discovery=DEFAULT_MAX_DISCOVERY, timeout=30):
     # Results
     results = {
-        'discovered': discovered,
-        'pending': len(pending),
-        'estimated_total': discovered + len(pending),
-        'elapsed_seconds': round(elapsed, 2),
-        'discovery_rate': round(discovered / elapsed if elapsed > 0 else 0, 2),
-        'hit_limit': (not unlimited) and (discovered >= max_discovery),
-        'unlimited': unlimited
+        "discovered": discovered,
+        "pending": len(pending),
+        "estimated_total": discovered + len(pending),
+        "elapsed_seconds": round(elapsed, 2),
+        "discovery_rate": round(discovered / elapsed if elapsed > 0 else 0, 2),
+        "hit_limit": (not unlimited) and (discovered >= max_discovery),
+        "unlimited": unlimited,
     }
     return results
@@ -143,7 +140,7 @@ def estimate_pages(config, max_discovery=DEFAULT_MAX_DISCOVERY, timeout=30):
 def is_valid_url(url, base_url, include_patterns, exclude_patterns):
     """Check if URL should be crawled"""
     # Must be same domain
-    if not url.startswith(base_url.rstrip('/')):
+    if not url.startswith(base_url.rstrip("/")):
         return False
     # Check exclude patterns first
@@ -180,11 +177,11 @@ def print_results(results, config):
     print(f"⏱️ Time Elapsed: {results['elapsed_seconds']}s")
     print(f"⚡ Discovery Rate: {results['discovery_rate']} pages/sec")
-    if results.get('unlimited', False):
+    if results.get("unlimited", False):
         print()
         print("✅ UNLIMITED MODE - Discovered all reachable pages")
         print(f" Total pages: {results['estimated_total']}")
-    elif results['hit_limit']:
+    elif results["hit_limit"]:
         print()
         print("⚠️ Hit discovery limit - actual total may be higher")
         print(" Increase max_discovery parameter for more accurate estimate")
@@ -195,8 +192,8 @@ def print_results(results, config):
print("=" * 70)
print()
estimated = results['estimated_total']
current_max = config.get('max_pages', 100)
estimated = results["estimated_total"]
current_max = config.get("max_pages", 100)
if estimated <= current_max:
print(f"✅ Current max_pages ({current_max}) is sufficient")
@@ -207,7 +204,7 @@ def print_results(results, config):
         print(f" (Estimated {estimated} + 50 buffer)")
     # Estimate time for full scrape
-    rate_limit = config.get('rate_limit', DEFAULT_RATE_LIMIT)
+    rate_limit = config.get("rate_limit", DEFAULT_RATE_LIMIT)
     estimated_time = (estimated * rate_limit) / 60 # in minutes
     print()
@@ -220,7 +217,7 @@ def print_results(results, config):
 def load_config(config_path):
     """Load configuration from JSON file"""
     try:
-        with open(config_path, 'r') as f:
+        with open(config_path) as f:
             config = json.load(f)
         return config
     except FileNotFoundError:
@@ -298,7 +295,7 @@ def list_all_configs():
         # Try to load the config to get name and description
         try:
-            with open(config_file, 'r') as f:
+            with open(config_file) as f:
                 config_data = json.load(f)
             name = config_data.get("name", config_file.stem)
@@ -308,20 +305,19 @@ def list_all_configs():
             if len(description) > 60:
                 description = description[:57] + "..."
-            by_category[category].append({
-                "file": config_file.name,
-                "path": str(rel_path),
-                "name": name,
-                "description": description
-            })
+            by_category[category].append(
+                {"file": config_file.name, "path": str(rel_path), "name": name, "description": description}
+            )
         except Exception as e:
             # If we can't parse the config, just use the filename
-            by_category[category].append({
-                "file": config_file.name,
-                "path": str(rel_path),
-                "name": config_file.stem,
-                "description": f"⚠️ Error loading config: {e}"
-            })
+            by_category[category].append(
+                {
+                    "file": config_file.name,
+                    "path": str(rel_path),
+                    "name": config_file.stem,
+                    "description": f"⚠️ Error loading config: {e}",
+                }
+            )
     # Print configs by category
     total = 0
@@ -351,7 +347,7 @@ def main():
     import argparse
     parser = argparse.ArgumentParser(
-        description='Estimate page count for Skill Seeker configs',
+        description="Estimate page count for Skill Seeker configs",
         formatter_class=argparse.RawDescriptionHelpFormatter,
         epilog="""
 Examples:
@@ -366,18 +362,25 @@ Examples:
   # Quick estimate (stop at 100 pages)
   skill-seekers estimate configs/vue.json --max-discovery 100
-"""
+""",
     )
-    parser.add_argument('config', nargs='?', help='Path to config JSON file')
-    parser.add_argument('--all', action='store_true',
-                        help='List all available configs from api/configs_repo/official/')
-    parser.add_argument('--max-discovery', '-m', type=int, default=DEFAULT_MAX_DISCOVERY,
-                        help=f'Maximum pages to discover (default: {DEFAULT_MAX_DISCOVERY}, use -1 for unlimited)')
-    parser.add_argument('--unlimited', '-u', action='store_true',
-                        help='Remove discovery limit - discover all pages (same as --max-discovery -1)')
-    parser.add_argument('--timeout', '-t', type=int, default=30,
-                        help='HTTP request timeout in seconds (default: 30)')
+    parser.add_argument("config", nargs="?", help="Path to config JSON file")
+    parser.add_argument("--all", action="store_true", help="List all available configs from api/configs_repo/official/")
+    parser.add_argument(
+        "--max-discovery",
+        "-m",
+        type=int,
+        default=DEFAULT_MAX_DISCOVERY,
+        help=f"Maximum pages to discover (default: {DEFAULT_MAX_DISCOVERY}, use -1 for unlimited)",
+    )
+    parser.add_argument(
+        "--unlimited",
+        "-u",
+        action="store_true",
+        help="Remove discovery limit - discover all pages (same as --max-discovery -1)",
+    )
+    parser.add_argument("--timeout", "-t", type=int, default=30, help="HTTP request timeout in seconds (default: 30)")
     args = parser.parse_args()
@@ -401,7 +404,7 @@ Examples:
     print_results(results, config)
     # Return exit code based on results
-    if results['hit_limit']:
+    if results["hit_limit"]:
         return 2 # Warning: hit limit
     return 0 # Success
@@ -413,5 +416,5 @@ Examples:
     return 1
-if __name__ == '__main__':
+if __name__ == "__main__":
     sys.exit(main())
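
For reference, a minimal sketch of driving the reformatted estimator programmatically rather than through the CLI. The import path is hypothetical (the file name is not shown in this diff), and the config keys mirror the ones estimate_pages reads above; the URL and values are placeholders.

# Minimal usage sketch -- module name "page_count_estimator" is assumed, not shown in this diff.
from page_count_estimator import estimate_pages, print_results

config = {
    "name": "Example Docs",                          # shown in the progress banner
    "base_url": "https://example.com/docs/",
    "start_urls": ["https://example.com/docs/"],
    "url_patterns": {"include": [], "exclude": []},
    "rate_limit": 0.5,                               # seconds slept between requests
    "max_pages": 100,                                # compared against the estimate in print_results
}

results = estimate_pages(config, max_discovery=500, timeout=30)
print_results(results, config)
# results["hit_limit"] is the same flag main() turns into exit code 2.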