Pablo Estevez
2026-01-17 17:29:21 +00:00
parent c89f059712
commit 5ed767ff9a
144 changed files with 14142 additions and 16488 deletions


@@ -4,23 +4,20 @@ Page Count Estimator for Skill Seeker
 Quickly estimates how many pages a config will scrape without downloading content
 """
-import sys
+import json
+import os
+import sys
+import time
+from pathlib import Path
+from urllib.parse import urljoin, urlparse
 import requests
 from bs4 import BeautifulSoup
-from urllib.parse import urljoin, urlparse
-import time
-import json
-from pathlib import Path
+# Add parent directory to path for imports when run as script
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-from skill_seekers.cli.constants import (
-    DEFAULT_RATE_LIMIT,
-    DEFAULT_MAX_DISCOVERY,
-    DISCOVERY_THRESHOLD
-)
+from skill_seekers.cli.constants import DEFAULT_MAX_DISCOVERY, DEFAULT_RATE_LIMIT, DISCOVERY_THRESHOLD
 def estimate_pages(config, max_discovery=DEFAULT_MAX_DISCOVERY, timeout=30):
@@ -35,20 +32,20 @@ def estimate_pages(config, max_discovery=DEFAULT_MAX_DISCOVERY, timeout=30):
     Returns:
         dict with estimation results
     """
-    base_url = config['base_url']
-    start_urls = config.get('start_urls', [base_url])
-    url_patterns = config.get('url_patterns', {'include': [], 'exclude': []})
-    rate_limit = config.get('rate_limit', DEFAULT_RATE_LIMIT)
+    base_url = config["base_url"]
+    start_urls = config.get("start_urls", [base_url])
+    url_patterns = config.get("url_patterns", {"include": [], "exclude": []})
+    rate_limit = config.get("rate_limit", DEFAULT_RATE_LIMIT)
     visited = set()
     pending = list(start_urls)
     discovered = 0
-    include_patterns = url_patterns.get('include', [])
-    exclude_patterns = url_patterns.get('exclude', [])
+    include_patterns = url_patterns.get("include", [])
+    exclude_patterns = url_patterns.get("exclude", [])
     # Handle unlimited mode
-    unlimited = (max_discovery == -1 or max_discovery is None)
+    unlimited = max_discovery == -1 or max_discovery is None
     print(f"🔍 Estimating pages for: {config['name']}")
     print(f"📍 Base URL: {base_url}")
@@ -56,8 +53,8 @@ def estimate_pages(config, max_discovery=DEFAULT_MAX_DISCOVERY, timeout=30):
     print(f"⏱️ Rate limit: {rate_limit}s")
     if unlimited:
-        print(f"🔢 Max discovery: UNLIMITED (will discover all pages)")
-        print(f"⚠️ WARNING: This may take a long time!")
+        print("🔢 Max discovery: UNLIMITED (will discover all pages)")
+        print("⚠️ WARNING: This may take a long time!")
     else:
         print(f"🔢 Max discovery: {max_discovery}")
@@ -80,26 +77,26 @@ def estimate_pages(config, max_discovery=DEFAULT_MAX_DISCOVERY, timeout=30):
         if discovered % 10 == 0:
             elapsed = time.time() - start_time
             rate = discovered / elapsed if elapsed > 0 else 0
-            print(f"⏳ Discovered: {discovered} pages ({rate:.1f} pages/sec)", end='\r')
+            print(f"⏳ Discovered: {discovered} pages ({rate:.1f} pages/sec)", end="\r")
         try:
             # HEAD request first to check if page exists (faster)
             head_response = requests.head(url, timeout=timeout, allow_redirects=True)
             # Skip non-HTML content
-            content_type = head_response.headers.get('Content-Type', '')
-            if 'text/html' not in content_type:
+            content_type = head_response.headers.get("Content-Type", "")
+            if "text/html" not in content_type:
                 continue
             # Now GET the page to find links
             response = requests.get(url, timeout=timeout)
             response.raise_for_status()
-            soup = BeautifulSoup(response.content, 'html.parser')
+            soup = BeautifulSoup(response.content, "html.parser")
             # Find all links
-            for link in soup.find_all('a', href=True):
-                href = link['href']
+            for link in soup.find_all("a", href=True):
+                href = link["href"]
                 full_url = urljoin(url, href)
                 # Normalize URL
@@ -117,10 +114,10 @@ def estimate_pages(config, max_discovery=DEFAULT_MAX_DISCOVERY, timeout=30):
             # Rate limiting
             time.sleep(rate_limit)
-        except requests.RequestException as e:
+        except requests.RequestException:
             # Silently skip errors during estimation
             pass
-        except Exception as e:
+        except Exception:
             # Silently skip other errors
             pass
@@ -128,13 +125,13 @@ def estimate_pages(config, max_discovery=DEFAULT_MAX_DISCOVERY, timeout=30):
     # Results
     results = {
-        'discovered': discovered,
-        'pending': len(pending),
-        'estimated_total': discovered + len(pending),
-        'elapsed_seconds': round(elapsed, 2),
-        'discovery_rate': round(discovered / elapsed if elapsed > 0 else 0, 2),
-        'hit_limit': (not unlimited) and (discovered >= max_discovery),
-        'unlimited': unlimited
+        "discovered": discovered,
+        "pending": len(pending),
+        "estimated_total": discovered + len(pending),
+        "elapsed_seconds": round(elapsed, 2),
+        "discovery_rate": round(discovered / elapsed if elapsed > 0 else 0, 2),
+        "hit_limit": (not unlimited) and (discovered >= max_discovery),
+        "unlimited": unlimited,
     }
     return results
@@ -143,7 +140,7 @@ def estimate_pages(config, max_discovery=DEFAULT_MAX_DISCOVERY, timeout=30):
 def is_valid_url(url, base_url, include_patterns, exclude_patterns):
     """Check if URL should be crawled"""
     # Must be same domain
-    if not url.startswith(base_url.rstrip('/')):
+    if not url.startswith(base_url.rstrip("/")):
         return False
     # Check exclude patterns first
@@ -180,11 +177,11 @@ def print_results(results, config):
     print(f"⏱️ Time Elapsed: {results['elapsed_seconds']}s")
     print(f"⚡ Discovery Rate: {results['discovery_rate']} pages/sec")
-    if results.get('unlimited', False):
+    if results.get("unlimited", False):
         print()
         print("✅ UNLIMITED MODE - Discovered all reachable pages")
         print(f" Total pages: {results['estimated_total']}")
-    elif results['hit_limit']:
+    elif results["hit_limit"]:
         print()
         print("⚠️ Hit discovery limit - actual total may be higher")
         print(" Increase max_discovery parameter for more accurate estimate")
@@ -195,8 +192,8 @@ def print_results(results, config):
print("=" * 70)
print()
estimated = results['estimated_total']
current_max = config.get('max_pages', 100)
estimated = results["estimated_total"]
current_max = config.get("max_pages", 100)
if estimated <= current_max:
print(f"✅ Current max_pages ({current_max}) is sufficient")
@@ -207,7 +204,7 @@ def print_results(results, config):
         print(f" (Estimated {estimated} + 50 buffer)")
     # Estimate time for full scrape
-    rate_limit = config.get('rate_limit', DEFAULT_RATE_LIMIT)
+    rate_limit = config.get("rate_limit", DEFAULT_RATE_LIMIT)
     estimated_time = (estimated * rate_limit) / 60 # in minutes
     print()
@@ -220,7 +217,7 @@ def print_results(results, config):
 def load_config(config_path):
     """Load configuration from JSON file"""
     try:
-        with open(config_path, 'r') as f:
+        with open(config_path) as f:
             config = json.load(f)
         return config
     except FileNotFoundError:
@@ -298,7 +295,7 @@ def list_all_configs():
         # Try to load the config to get name and description
         try:
-            with open(config_file, 'r') as f:
+            with open(config_file) as f:
                 config_data = json.load(f)
             name = config_data.get("name", config_file.stem)
@@ -308,20 +305,19 @@ def list_all_configs():
             if len(description) > 60:
                 description = description[:57] + "..."
-            by_category[category].append({
-                "file": config_file.name,
-                "path": str(rel_path),
-                "name": name,
-                "description": description
-            })
+            by_category[category].append(
+                {"file": config_file.name, "path": str(rel_path), "name": name, "description": description}
+            )
         except Exception as e:
             # If we can't parse the config, just use the filename
-            by_category[category].append({
-                "file": config_file.name,
-                "path": str(rel_path),
-                "name": config_file.stem,
-                "description": f"⚠️ Error loading config: {e}"
-            })
+            by_category[category].append(
+                {
+                    "file": config_file.name,
+                    "path": str(rel_path),
+                    "name": config_file.stem,
+                    "description": f"⚠️ Error loading config: {e}",
+                }
+            )
     # Print configs by category
     total = 0
@@ -351,7 +347,7 @@ def main():
     import argparse
     parser = argparse.ArgumentParser(
-        description='Estimate page count for Skill Seeker configs',
+        description="Estimate page count for Skill Seeker configs",
         formatter_class=argparse.RawDescriptionHelpFormatter,
         epilog="""
 Examples:
@@ -366,18 +362,25 @@ Examples:
   # Quick estimate (stop at 100 pages)
   skill-seekers estimate configs/vue.json --max-discovery 100
-"""
+""",
     )
-    parser.add_argument('config', nargs='?', help='Path to config JSON file')
-    parser.add_argument('--all', action='store_true',
-                        help='List all available configs from api/configs_repo/official/')
-    parser.add_argument('--max-discovery', '-m', type=int, default=DEFAULT_MAX_DISCOVERY,
-                        help=f'Maximum pages to discover (default: {DEFAULT_MAX_DISCOVERY}, use -1 for unlimited)')
-    parser.add_argument('--unlimited', '-u', action='store_true',
-                        help='Remove discovery limit - discover all pages (same as --max-discovery -1)')
-    parser.add_argument('--timeout', '-t', type=int, default=30,
-                        help='HTTP request timeout in seconds (default: 30)')
+    parser.add_argument("config", nargs="?", help="Path to config JSON file")
+    parser.add_argument("--all", action="store_true", help="List all available configs from api/configs_repo/official/")
+    parser.add_argument(
+        "--max-discovery",
+        "-m",
+        type=int,
+        default=DEFAULT_MAX_DISCOVERY,
+        help=f"Maximum pages to discover (default: {DEFAULT_MAX_DISCOVERY}, use -1 for unlimited)",
+    )
+    parser.add_argument(
+        "--unlimited",
+        "-u",
+        action="store_true",
+        help="Remove discovery limit - discover all pages (same as --max-discovery -1)",
+    )
+    parser.add_argument("--timeout", "-t", type=int, default=30, help="HTTP request timeout in seconds (default: 30)")
     args = parser.parse_args()
@@ -401,7 +404,7 @@ Examples:
     print_results(results, config)
     # Return exit code based on results
-    if results['hit_limit']:
+    if results["hit_limit"]:
         return 2 # Warning: hit limit
     return 0 # Success
@@ -413,5 +416,5 @@ Examples:
     return 1
-if __name__ == '__main__':
+if __name__ == "__main__":
     sys.exit(main())
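
For reference, a minimal sketch of driving the reformatted estimator programmatically rather than through the CLI. The import path is hypothetical (the file name is not shown in this diff), and the config keys mirror the ones estimate_pages reads above; the URL and values are placeholders.

# Minimal usage sketch -- module name "page_count_estimator" is assumed, not shown in this diff.
from page_count_estimator import estimate_pages, print_results

config = {
    "name": "Example Docs",                          # shown in the progress banner
    "base_url": "https://example.com/docs/",
    "start_urls": ["https://example.com/docs/"],
    "url_patterns": {"include": [], "exclude": []},
    "rate_limit": 0.5,                               # seconds slept between requests
    "max_pages": 100,                                # compared against the estimate in print_results
}

results = estimate_pages(config, max_discovery=500, timeout=30)
print_results(results, config)
# results["hit_limit"] is the same flag main() turns into exit code 2.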