From 105218f85eb17eb7e7f634791aa8eee502fe3509 Mon Sep 17 00:00:00 2001
From: yusyus
Date: Sun, 19 Oct 2025 20:50:24 +0300
Subject: [PATCH] Add checkpoint/resume feature for long scrapes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Implement automatic progress saving and resumption for interrupted or
very long documentation scrapes (40K+ pages).

**Features:**
- Automatic checkpoint saving every N pages (configurable, default: 1000)
- Resume from last checkpoint with --resume flag
- Fresh start with --fresh flag (clears checkpoint)
- Progress state saved: visited URLs, pending URLs, pages scraped
- Checkpoint saved on interruption (Ctrl+C)
- Checkpoint cleared after successful completion

**Configuration:**
```json
{
  "checkpoint": {
    "enabled": true,
    "interval": 1000
  }
}
```

**Usage:**
```bash
# Start scraping (with checkpoints enabled in config)
python3 cli/doc_scraper.py --config configs/large-docs.json

# If interrupted (Ctrl+C), resume later:
python3 cli/doc_scraper.py --config configs/large-docs.json --resume

# Start fresh (clear checkpoint):
python3 cli/doc_scraper.py --config configs/large-docs.json --fresh
```
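The same flow can also be driven from Python, mirroring what `main()` does with these flags. A minimal sketch, assuming `cli/doc_scraper.py` is importable from the repo root (the import path is an assumption and may need an `__init__.py` or a `sys.path` tweak):

```python
import json

from cli.doc_scraper import DocToSkillConverter  # import path is an assumption

# Load a config with checkpointing enabled (see Configuration above)
with open("configs/large-docs.json") as f:
    config = json.load(f)

# resume=True makes the constructor load an existing checkpoint.json, if any
converter = DocToSkillConverter(config, resume=True)

try:
    converter.scrape_all()
    converter.clear_checkpoint()  # successful run: checkpoint no longer needed
except KeyboardInterrupt:
    if converter.checkpoint_enabled:
        converter.save_checkpoint()  # same safety net the CLI applies on Ctrl+C
```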
**Checkpoint Data:**
- config: Full configuration
- visited_urls: All URLs already scraped
- pending_urls: Queue of URLs still to scrape
- pages_scraped: Count of pages completed
- last_updated: UTC timestamp of the last save
- checkpoint_interval: Interval setting in effect
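For illustration, a checkpoint written with the settings above might look like the following. All values are hypothetical and the embedded `config` is abbreviated; the field names and timestamp format match `save_checkpoint()`:

```json
{
  "config": {"name": "large-docs", "base_url": "https://docs.example.com/"},
  "visited_urls": [
    "https://docs.example.com/",
    "https://docs.example.com/getting-started"
  ],
  "pending_urls": [
    "https://docs.example.com/api/reference"
  ],
  "pages_scraped": 2000,
  "last_updated": "2025-10-19T17:50:24Z",
  "checkpoint_interval": 1000
}
```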
checkpoint") + print(f" Resume with: --config {args.config if args.config else 'config.json'} --resume") response = input("Continue with skill building? (y/n): ").strip().lower() if response != 'y': return