From 12424e390c6f88b41837e6048015f4389dd293d2 Mon Sep 17 00:00:00 2001
From: "Edgar I." <eibrahimov@gmail.com>
Date: Fri, 24 Oct 2025 13:34:22 +0400
Subject: [PATCH] feat: integrate llms.txt detection into scraping workflow

---
 cli/doc_scraper.py        | 73 +++++++++++++++++++++++++++++++++++++--
 tests/test_integration.py | 42 ++++++++++++++++++++++
 2 files changed, 113 insertions(+), 2 deletions(-)
 mode change 100644 => 100755 cli/doc_scraper.py

diff --git a/cli/doc_scraper.py b/cli/doc_scraper.py
old mode 100644
new mode 100755
index 54f8bfa..330ff73
--- a/cli/doc_scraper.py
+++ b/cli/doc_scraper.py
@@ -22,6 +22,13 @@ from urllib.parse import urljoin, urlparse
 from bs4 import BeautifulSoup
 from collections import deque, defaultdict
 
+# Add parent directory to path for imports when run as script
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from cli.llms_txt_detector import LlmsTxtDetector
+from cli.llms_txt_parser import LlmsTxtParser
+from cli.llms_txt_downloader import LlmsTxtDownloader
+
 
 class DocToSkillConverter:
     def __init__(self, config, dry_run=False, resume=False):
@@ -41,6 +48,10 @@ class DocToSkillConverter:
         self.checkpoint_enabled = checkpoint_config.get('enabled', False)
         self.checkpoint_interval = checkpoint_config.get('interval', 1000)
 
+        # llms.txt detection state
+        self.llms_txt_detected = False
+        self.llms_txt_variant = None
+
         # Parallel scraping config
         self.workers = config.get('workers', 1)
 
@@ -322,9 +333,67 @@ class DocToSkillConverter:
                     print(f"  ✗ Error on {url}: {e}")
             else:
                 print(f"  ✗ Error: {e}")
-    
+
+    def _try_llms_txt(self) -> bool:
+        """
+        Load pages from llms.txt when one can be detected for the base URL.
+
+        Runs detect -> download -> parse. Every parsed page is passed to
+        ``save_page`` and appended to ``self.pages``; the detection state
+        (``llms_txt_detected`` / ``llms_txt_variant``) is set only on success.
+
+        Returns:
+            True if pages were loaded from llms.txt, False if any stage
+            returned nothing (nothing is saved in that case).
+        """
+        print(f"\n🔍 Checking for llms.txt at {self.base_url}...")
+
+        # Stage 1: look for an llms.txt variant
+        found = LlmsTxtDetector(self.base_url).detect()
+        if not found:
+            print("ℹ️  No llms.txt found, using HTML scraping")
+            return False
+
+        variant, source_url = found['variant'], found['url']
+        print(f"✅ Found {variant} llms.txt: {source_url}")
+
+        # Stage 2: fetch the file body
+        text = LlmsTxtDownloader(source_url).download()
+        if not text:
+            print("⚠️  Failed to download llms.txt, falling back to HTML scraping")
+            return False
+
+        print(f"📥 Downloaded {len(text)} characters")
+
+        # Stage 3: split the body into page records
+        sections = LlmsTxtParser(text).parse()
+        if not sections:
+            print("⚠️  Failed to parse llms.txt, falling back to HTML scraping")
+            return False
+
+        print(f"📄 Parsed {len(sections)} sections")
+
+        # Hand each section to save_page, then track it in self.pages
+        for section in sections:
+            self.save_page(section)
+            self.pages.append(section)
+
+        # Flag success only after every section has been stored
+        self.llms_txt_detected = True
+        self.llms_txt_variant = variant
+
+        return True
+
     def scrape_all(self):
-        """Scrape all pages (supports parallel scraping)"""
+        """Scrape all pages (supports llms.txt and HTML scraping)"""
+
+        # Try llms.txt first (unless dry-run)
+        if not self.dry_run:
+            llms_result = self._try_llms_txt()
+            if llms_result:
+                print(f"\n✅ Used llms.txt ({self.llms_txt_variant}) - skipping HTML scraping")
+                return
+
+        # HTML scraping (original logic)
         print(f"\n{'='*60}")
         if self.dry_run:
             print(f"DRY RUN: {self.name}")
diff --git a/tests/test_integration.py b/tests/test_integration.py
index d278e67..88f7268 100644
--- a/tests/test_integration.py
+++ b/tests/test_integration.py
@@ -247,6 +247,48 @@ class TestURLProcessing(unittest.TestCase):
         self.assertEqual(len(converter.pending_urls), 3)
 
 
+class TestLlmsTxtIntegration(unittest.TestCase):
+    """Test llms.txt integration into the scraping workflow.
+
+    Both tests only inspect a freshly constructed dry-run scraper.
+    """
+
+    @staticmethod
+    def _make_scraper():
+        """Build a dry-run converter from a minimal config."""
+        selectors = {
+            'main_content': 'article',
+            'title': 'h1',
+            'code_blocks': 'pre code'
+        }
+        settings = {
+            'name': 'test-llms',
+            'base_url': 'https://hono.dev/docs',
+            'selectors': selectors,
+            'max_pages': 50
+        }
+        return DocToSkillConverter(settings, dry_run=True)
+
+    def test_scraper_has_llms_txt_attributes(self):
+        """A new scraper starts with llms.txt marked as not detected."""
+        scraper = self._make_scraper()
+
+        # Initial state: nothing detected, no variant recorded
+        self.assertFalse(scraper.llms_txt_detected)
+        self.assertIsNone(scraper.llms_txt_variant)
+
+    def test_scraper_has_try_llms_txt_method(self):
+        """The scraper provides a callable ``_try_llms_txt``."""
+        scraper = self._make_scraper()
+        method_name = '_try_llms_txt'
+
+        # The attribute must exist on the instance ...
+        self.assertTrue(hasattr(scraper, method_name))
+
+        # ... and it must be callable
+        self.assertTrue(callable(getattr(scraper, method_name)))
+
+
 class TestContentExtraction(unittest.TestCase):
     """Test content extraction functionality"""