From 12424e390c6f88b41837e6048015f4389dd293d2 Mon Sep 17 00:00:00 2001 From: "Edgar I." Date: Fri, 24 Oct 2025 13:34:22 +0400 Subject: [PATCH] feat: integrate llms.txt detection into scraping workflow --- cli/doc_scraper.py | 73 +++++++++++++++++++++++++++++++++++++-- tests/test_integration.py | 42 ++++++++++++++++++++++ 2 files changed, 113 insertions(+), 2 deletions(-) mode change 100644 => 100755 cli/doc_scraper.py diff --git a/cli/doc_scraper.py b/cli/doc_scraper.py old mode 100644 new mode 100755 index 54f8bfa..330ff73 --- a/cli/doc_scraper.py +++ b/cli/doc_scraper.py @@ -22,6 +22,13 @@ from urllib.parse import urljoin, urlparse from bs4 import BeautifulSoup from collections import deque, defaultdict +# Add parent directory to path for imports when run as script +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from cli.llms_txt_detector import LlmsTxtDetector +from cli.llms_txt_parser import LlmsTxtParser +from cli.llms_txt_downloader import LlmsTxtDownloader + class DocToSkillConverter: def __init__(self, config, dry_run=False, resume=False): @@ -41,6 +48,10 @@ class DocToSkillConverter: self.checkpoint_enabled = checkpoint_config.get('enabled', False) self.checkpoint_interval = checkpoint_config.get('interval', 1000) + # llms.txt detection state + self.llms_txt_detected = False + self.llms_txt_variant = None + # Parallel scraping config self.workers = config.get('workers', 1) @@ -322,9 +333,67 @@ class DocToSkillConverter: print(f" ✗ Error on {url}: {e}") else: print(f" ✗ Error: {e}") - + + def _try_llms_txt(self) -> bool: + """ + Try to use llms.txt instead of HTML scraping. 
+ + Returns: + True if llms.txt was found and parsed successfully + """ + print(f"\n🔍 Checking for llms.txt at {self.base_url}...") + + # Detect llms.txt + detector = LlmsTxtDetector(self.base_url) + result = detector.detect() + + if not result: + print("ℹ️ No llms.txt found, using HTML scraping") + return False + + print(f"✅ Found {result['variant']} llms.txt: {result['url']}") + + # Download content + downloader = LlmsTxtDownloader(result['url']) + content = downloader.download() + + if not content: + print("⚠️ Failed to download llms.txt, falling back to HTML scraping") + return False + + print(f"📄 Downloaded {len(content)} characters") + + # Parse into pages + parser = LlmsTxtParser(content) + pages = parser.parse() + + if not pages: + print("⚠️ Failed to parse llms.txt, falling back to HTML scraping") + return False + + print(f"📄 Parsed {len(pages)} sections") + + # Save pages + for page in pages: + self.save_page(page) + self.pages.append(page) + + self.llms_txt_detected = True + self.llms_txt_variant = result['variant'] + + return True + def scrape_all(self): - """Scrape all pages (supports parallel scraping)""" + """Scrape all pages (supports llms.txt and HTML scraping)""" + + # Try llms.txt first (unless dry-run) + if not self.dry_run: + llms_result = self._try_llms_txt() + if llms_result: + print(f"\n✅ Used llms.txt ({self.llms_txt_variant}) - skipping HTML scraping") + return + + # HTML scraping (original logic) print(f"\n{'='*60}") if self.dry_run: print(f"DRY RUN: {self.name}") diff --git a/tests/test_integration.py b/tests/test_integration.py index d278e67..88f7268 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -247,6 +247,48 @@ class TestURLProcessing(unittest.TestCase): self.assertEqual(len(converter.pending_urls), 3) +class TestLlmsTxtIntegration(unittest.TestCase): + """Test llms.txt integration into scraping workflow""" + + def test_scraper_has_llms_txt_attributes(self): + """Test that scraper 
has llms.txt detection attributes""" + config = { + 'name': 'test-llms', + 'base_url': 'https://hono.dev/docs', + 'selectors': { + 'main_content': 'article', + 'title': 'h1', + 'code_blocks': 'pre code' + }, + 'max_pages': 50 + } + + scraper = DocToSkillConverter(config, dry_run=True) + + # Should have llms.txt attributes + self.assertFalse(scraper.llms_txt_detected) + self.assertIsNone(scraper.llms_txt_variant) + + def test_scraper_has_try_llms_txt_method(self): + """Test that scraper has _try_llms_txt method""" + config = { + 'name': 'test-llms', + 'base_url': 'https://hono.dev/docs', + 'selectors': { + 'main_content': 'article', + 'title': 'h1', + 'code_blocks': 'pre code' + }, + 'max_pages': 50 + } + + scraper = DocToSkillConverter(config, dry_run=True) + + # Should have _try_llms_txt method + self.assertTrue(hasattr(scraper, '_try_llms_txt')) + self.assertTrue(callable(getattr(scraper, '_try_llms_txt'))) + + class TestContentExtraction(unittest.TestCase): """Test content extraction functionality"""