diff --git a/cli/doc_scraper.py b/cli/doc_scraper.py index 4702bec..fa4c059 100755 --- a/cli/doc_scraper.py +++ b/cli/doc_scraper.py @@ -257,15 +257,16 @@ class DocToSkillConverter: paragraphs.append(text) page['content'] = '\n\n'.join(paragraphs) - - # Extract links - for link in main.find_all('a', href=True): + + # Extract links from entire page (not just main content) + # This allows discovery of navigation links outside the main content area + for link in soup.find_all('a', href=True): href = urljoin(url, link['href']) # Strip anchor fragments to avoid treating #anchors as separate pages href = href.split('#')[0] if self.is_valid_url(href) and href not in page['links']: page['links'].append(href) - + return page def _extract_language_from_classes(self, classes): @@ -1641,11 +1642,14 @@ def execute_scraping_and_building(config: Dict[str, Any], args: argparse.Namespa # Check for existing data exists, page_count = check_existing_data(config['name']) - if exists and not args.skip_scrape: + if exists and not args.skip_scrape and not args.fresh: logger.info("\n✓ Found existing data: %d pages", page_count) response = input("Use existing data? (y/n): ").strip().lower() if response == 'y': args.skip_scrape = True + elif exists and args.fresh: + logger.info("\n✓ Found existing data: %d pages", page_count) + logger.info(" --fresh flag set, will re-scrape from scratch") # Create converter converter = DocToSkillConverter(config, resume=args.resume) diff --git a/skill_seeker_mcp/server.py b/skill_seeker_mcp/server.py index a6f5c77..4307a2f 100644 --- a/skill_seeker_mcp/server.py +++ b/skill_seeker_mcp/server.py @@ -603,6 +603,10 @@ async def scrape_docs_tool(args: dict) -> list[TextContent]: if is_unified and merge_mode: cmd.extend(["--merge-mode", merge_mode]) + # Add --fresh to avoid user input prompts when existing data found + if not skip_scrape: + cmd.append("--fresh") + if enhance_local: cmd.append("--enhance-local") if skip_scrape: