run ruff
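The changes below are ruff's mechanical autofixes and formatting: imports sorted into standard-library and third-party groups, string quotes normalized to double quotes, redundant bits dropped (the 'r' mode in open(), unused "as e" bindings, an f-prefix on strings without placeholders, superfluous parentheses), and long calls reflowed with trailing commas. A minimal sketch of commands that produce this kind of diff — the exact invocation is not recorded in the commit, and running from the repository root is assumed:

    # apply lint autofixes; import sorting requires the isort ("I") rules to be enabled
    ruff check --fix .
    # apply the formatter: double quotes, trailing commas, line wrapping
    ruff format .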
@@ -4,23 +4,20 @@ Page Count Estimator for Skill Seeker
 Quickly estimates how many pages a config will scrape without downloading content
 """
 
-import sys
+import json
 import os
+import sys
+import time
+from pathlib import Path
+from urllib.parse import urljoin, urlparse
+
 import requests
 from bs4 import BeautifulSoup
-from urllib.parse import urljoin, urlparse
-import time
-import json
-from pathlib import Path
 
 # Add parent directory to path for imports when run as script
 sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 
-from skill_seekers.cli.constants import (
-    DEFAULT_RATE_LIMIT,
-    DEFAULT_MAX_DISCOVERY,
-    DISCOVERY_THRESHOLD
-)
+from skill_seekers.cli.constants import DEFAULT_MAX_DISCOVERY, DEFAULT_RATE_LIMIT, DISCOVERY_THRESHOLD
 
 
 def estimate_pages(config, max_discovery=DEFAULT_MAX_DISCOVERY, timeout=30):
@@ -35,20 +32,20 @@ def estimate_pages(config, max_discovery=DEFAULT_MAX_DISCOVERY, timeout=30):
     Returns:
         dict with estimation results
     """
-    base_url = config['base_url']
-    start_urls = config.get('start_urls', [base_url])
-    url_patterns = config.get('url_patterns', {'include': [], 'exclude': []})
-    rate_limit = config.get('rate_limit', DEFAULT_RATE_LIMIT)
+    base_url = config["base_url"]
+    start_urls = config.get("start_urls", [base_url])
+    url_patterns = config.get("url_patterns", {"include": [], "exclude": []})
+    rate_limit = config.get("rate_limit", DEFAULT_RATE_LIMIT)
 
     visited = set()
     pending = list(start_urls)
     discovered = 0
 
-    include_patterns = url_patterns.get('include', [])
-    exclude_patterns = url_patterns.get('exclude', [])
+    include_patterns = url_patterns.get("include", [])
+    exclude_patterns = url_patterns.get("exclude", [])
 
     # Handle unlimited mode
-    unlimited = (max_discovery == -1 or max_discovery is None)
+    unlimited = max_discovery == -1 or max_discovery is None
 
     print(f"🔍 Estimating pages for: {config['name']}")
     print(f"📍 Base URL: {base_url}")
@@ -56,8 +53,8 @@ def estimate_pages(config, max_discovery=DEFAULT_MAX_DISCOVERY, timeout=30):
     print(f"⏱️ Rate limit: {rate_limit}s")
 
     if unlimited:
-        print(f"🔢 Max discovery: UNLIMITED (will discover all pages)")
-        print(f"⚠️ WARNING: This may take a long time!")
+        print("🔢 Max discovery: UNLIMITED (will discover all pages)")
+        print("⚠️ WARNING: This may take a long time!")
     else:
         print(f"🔢 Max discovery: {max_discovery}")
 
@@ -80,26 +77,26 @@ def estimate_pages(config, max_discovery=DEFAULT_MAX_DISCOVERY, timeout=30):
         if discovered % 10 == 0:
             elapsed = time.time() - start_time
             rate = discovered / elapsed if elapsed > 0 else 0
-            print(f"⏳ Discovered: {discovered} pages ({rate:.1f} pages/sec)", end='\r')
+            print(f"⏳ Discovered: {discovered} pages ({rate:.1f} pages/sec)", end="\r")
 
         try:
            # HEAD request first to check if page exists (faster)
             head_response = requests.head(url, timeout=timeout, allow_redirects=True)
 
             # Skip non-HTML content
-            content_type = head_response.headers.get('Content-Type', '')
-            if 'text/html' not in content_type:
+            content_type = head_response.headers.get("Content-Type", "")
+            if "text/html" not in content_type:
                 continue
 
             # Now GET the page to find links
             response = requests.get(url, timeout=timeout)
             response.raise_for_status()
 
-            soup = BeautifulSoup(response.content, 'html.parser')
+            soup = BeautifulSoup(response.content, "html.parser")
 
             # Find all links
-            for link in soup.find_all('a', href=True):
-                href = link['href']
+            for link in soup.find_all("a", href=True):
+                href = link["href"]
                 full_url = urljoin(url, href)
 
                 # Normalize URL
@@ -117,10 +114,10 @@ def estimate_pages(config, max_discovery=DEFAULT_MAX_DISCOVERY, timeout=30):
             # Rate limiting
             time.sleep(rate_limit)
 
-        except requests.RequestException as e:
+        except requests.RequestException:
             # Silently skip errors during estimation
             pass
-        except Exception as e:
+        except Exception:
             # Silently skip other errors
             pass
 
@@ -128,13 +125,13 @@ def estimate_pages(config, max_discovery=DEFAULT_MAX_DISCOVERY, timeout=30):
 
     # Results
     results = {
-        'discovered': discovered,
-        'pending': len(pending),
-        'estimated_total': discovered + len(pending),
-        'elapsed_seconds': round(elapsed, 2),
-        'discovery_rate': round(discovered / elapsed if elapsed > 0 else 0, 2),
-        'hit_limit': (not unlimited) and (discovered >= max_discovery),
-        'unlimited': unlimited
+        "discovered": discovered,
+        "pending": len(pending),
+        "estimated_total": discovered + len(pending),
+        "elapsed_seconds": round(elapsed, 2),
+        "discovery_rate": round(discovered / elapsed if elapsed > 0 else 0, 2),
+        "hit_limit": (not unlimited) and (discovered >= max_discovery),
+        "unlimited": unlimited,
     }
 
     return results
@@ -143,7 +140,7 @@ def estimate_pages(config, max_discovery=DEFAULT_MAX_DISCOVERY, timeout=30):
 def is_valid_url(url, base_url, include_patterns, exclude_patterns):
     """Check if URL should be crawled"""
     # Must be same domain
-    if not url.startswith(base_url.rstrip('/')):
+    if not url.startswith(base_url.rstrip("/")):
         return False
 
     # Check exclude patterns first
@@ -180,11 +177,11 @@ def print_results(results, config):
     print(f"⏱️ Time Elapsed: {results['elapsed_seconds']}s")
     print(f"⚡ Discovery Rate: {results['discovery_rate']} pages/sec")
 
-    if results.get('unlimited', False):
+    if results.get("unlimited", False):
         print()
         print("✅ UNLIMITED MODE - Discovered all reachable pages")
         print(f"   Total pages: {results['estimated_total']}")
-    elif results['hit_limit']:
+    elif results["hit_limit"]:
         print()
         print("⚠️ Hit discovery limit - actual total may be higher")
         print("   Increase max_discovery parameter for more accurate estimate")
@@ -195,8 +192,8 @@ def print_results(results, config):
     print("=" * 70)
     print()
 
-    estimated = results['estimated_total']
-    current_max = config.get('max_pages', 100)
+    estimated = results["estimated_total"]
+    current_max = config.get("max_pages", 100)
 
     if estimated <= current_max:
         print(f"✅ Current max_pages ({current_max}) is sufficient")
@@ -207,7 +204,7 @@ def print_results(results, config):
         print(f"   (Estimated {estimated} + 50 buffer)")
 
     # Estimate time for full scrape
-    rate_limit = config.get('rate_limit', DEFAULT_RATE_LIMIT)
+    rate_limit = config.get("rate_limit", DEFAULT_RATE_LIMIT)
     estimated_time = (estimated * rate_limit) / 60  # in minutes
 
     print()
@@ -220,7 +217,7 @@ def print_results(results, config):
 def load_config(config_path):
     """Load configuration from JSON file"""
     try:
-        with open(config_path, 'r') as f:
+        with open(config_path) as f:
             config = json.load(f)
         return config
     except FileNotFoundError:
@@ -298,7 +295,7 @@ def list_all_configs():
 
         # Try to load the config to get name and description
         try:
-            with open(config_file, 'r') as f:
+            with open(config_file) as f:
                 config_data = json.load(f)
 
             name = config_data.get("name", config_file.stem)
@@ -308,20 +305,19 @@ def list_all_configs():
             if len(description) > 60:
                 description = description[:57] + "..."
 
-            by_category[category].append({
-                "file": config_file.name,
-                "path": str(rel_path),
-                "name": name,
-                "description": description
-            })
+            by_category[category].append(
+                {"file": config_file.name, "path": str(rel_path), "name": name, "description": description}
+            )
         except Exception as e:
             # If we can't parse the config, just use the filename
-            by_category[category].append({
-                "file": config_file.name,
-                "path": str(rel_path),
-                "name": config_file.stem,
-                "description": f"⚠️ Error loading config: {e}"
-            })
+            by_category[category].append(
+                {
+                    "file": config_file.name,
+                    "path": str(rel_path),
+                    "name": config_file.stem,
+                    "description": f"⚠️ Error loading config: {e}",
+                }
+            )
 
     # Print configs by category
     total = 0
@@ -351,7 +347,7 @@ def main():
     import argparse
 
     parser = argparse.ArgumentParser(
-        description='Estimate page count for Skill Seeker configs',
+        description="Estimate page count for Skill Seeker configs",
         formatter_class=argparse.RawDescriptionHelpFormatter,
         epilog="""
 Examples:
@@ -366,18 +362,25 @@ Examples:
 
   # Quick estimate (stop at 100 pages)
   skill-seekers estimate configs/vue.json --max-discovery 100
-"""
+""",
     )
 
-    parser.add_argument('config', nargs='?', help='Path to config JSON file')
-    parser.add_argument('--all', action='store_true',
-                        help='List all available configs from api/configs_repo/official/')
-    parser.add_argument('--max-discovery', '-m', type=int, default=DEFAULT_MAX_DISCOVERY,
-                        help=f'Maximum pages to discover (default: {DEFAULT_MAX_DISCOVERY}, use -1 for unlimited)')
-    parser.add_argument('--unlimited', '-u', action='store_true',
-                        help='Remove discovery limit - discover all pages (same as --max-discovery -1)')
-    parser.add_argument('--timeout', '-t', type=int, default=30,
-                        help='HTTP request timeout in seconds (default: 30)')
+    parser.add_argument("config", nargs="?", help="Path to config JSON file")
+    parser.add_argument("--all", action="store_true", help="List all available configs from api/configs_repo/official/")
+    parser.add_argument(
+        "--max-discovery",
+        "-m",
+        type=int,
+        default=DEFAULT_MAX_DISCOVERY,
+        help=f"Maximum pages to discover (default: {DEFAULT_MAX_DISCOVERY}, use -1 for unlimited)",
+    )
+    parser.add_argument(
+        "--unlimited",
+        "-u",
+        action="store_true",
+        help="Remove discovery limit - discover all pages (same as --max-discovery -1)",
+    )
+    parser.add_argument("--timeout", "-t", type=int, default=30, help="HTTP request timeout in seconds (default: 30)")
 
     args = parser.parse_args()
 
@@ -401,7 +404,7 @@ Examples:
         print_results(results, config)
 
         # Return exit code based on results
-        if results['hit_limit']:
+        if results["hit_limit"]:
             return 2  # Warning: hit limit
         return 0  # Success
 
@@ -413,5 +416,5 @@ Examples:
         return 1
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     sys.exit(main())
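For reference, a pyproject.toml fragment consistent with these changes might look like the sketch below. This is an assumption reconstructed from the diff (double quotes, sorted imports, and reformatted lines well past 88 characters suggest a line length around 120); the repository's actual ruff configuration may differ:

    [tool.ruff]
    line-length = 120

    [tool.ruff.lint]
    # pyflakes/pycodestyle defaults plus "I" (isort-style import sorting) and
    # "UP" (pyupgrade fixes such as removing the redundant "r" mode in open())
    select = ["E", "F", "I", "UP"]

    [tool.ruff.format]
    quote-style = "double"  # ruff format's default, matching the quote changes here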