From a0b1c2f42f2deba86903a017218d2653982e58dc Mon Sep 17 00:00:00 2001
From: sogoiii <sogoiii@gmail.com>
Date: Thu, 20 Nov 2025 13:55:46 -0800
Subject: [PATCH 1/3] =?UTF-8?q?=E2=9C=A8=20feat:=20add=20skip=5Fllms=5Ftxt?=
 =?UTF-8?q?=20config=20option=20to=20bypass=20llms.txt=20detection?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Add skip_llms_txt config option (default: False)
- Validate that the value is a boolean; otherwise log a warning and
  default to False
- Support the option in both sync and async scraping modes
- Add 17 tests for config, behavior, and edge cases
---
 src/skill_seekers/cli/doc_scraper.py |  17 +-
 tests/test_config_validation.py      |  24 ++
 tests/test_skip_llms_txt.py          | 321 +++++++++++++++++++++++++++
 3 files changed, 358 insertions(+), 4 deletions(-)
 create mode 100644 tests/test_skip_llms_txt.py

diff --git a/src/skill_seekers/cli/doc_scraper.py b/src/skill_seekers/cli/doc_scraper.py
index 7a2f4b1..fc69b05 100755
--- a/src/skill_seekers/cli/doc_scraper.py
+++ b/src/skill_seekers/cli/doc_scraper.py
@@ -86,6 +86,15 @@ class DocToSkillConverter:
         self.checkpoint_interval = checkpoint_config.get('interval', DEFAULT_CHECKPOINT_INTERVAL)
 
         # llms.txt detection state
+        skip_llms_txt_value = config.get('skip_llms_txt', False)
+        if not isinstance(skip_llms_txt_value, bool):
+            logger.warning(
+                "Invalid value for 'skip_llms_txt': %r (expected bool). Defaulting to False.",
+                skip_llms_txt_value
+            )
+            self.skip_llms_txt = False
+        else:
+            self.skip_llms_txt = skip_llms_txt_value
         self.llms_txt_detected = False
         self.llms_txt_variant = None
         self.llms_txt_variants: List[str] = []  # Track all downloaded variants
@@ -618,8 +627,8 @@ class DocToSkillConverter:
             asyncio.run(self.scrape_all_async())
             return
 
-        # Try llms.txt first (unless dry-run)
-        if not self.dry_run:
+        # Try llms.txt first (unless dry-run or explicitly disabled)
+        if not self.dry_run and not self.skip_llms_txt:
             llms_result = self._try_llms_txt()
             if llms_result:
                 logger.info("\n✅ Used llms.txt (%s) - skipping HTML scraping", self.llms_txt_variant)
@@ -778,8 +787,8 @@ class DocToSkillConverter:
 
         Performance: ~2-3x faster than sync mode with same worker count.
         """
-        # Try llms.txt first (unless dry-run)
-        if not self.dry_run:
+        # Try llms.txt first (unless dry-run or explicitly disabled)
+        if not self.dry_run and not self.skip_llms_txt:
             llms_result = self._try_llms_txt()
             if llms_result:
                 logger.info("\n✅ Used llms.txt (%s) - skipping HTML scraping", self.llms_txt_variant)
diff --git a/tests/test_config_validation.py b/tests/test_config_validation.py
index 4bef957..eaae6ec 100644
--- a/tests/test_config_validation.py
+++ b/tests/test_config_validation.py
@@ -307,6 +307,30 @@ class TestConfigValidation(unittest.TestCase):
         # Should be valid
         self.assertEqual(config.get('llms_txt_url'), 'https://example.com/llms-full.txt')
 
+    def test_config_with_skip_llms_txt(self):
+        """Test config validation accepts skip_llms_txt"""
+        config = {
+            'name': 'test',
+            'base_url': 'https://example.com/docs',
+            'skip_llms_txt': True
+        }
+
+        errors, warnings = validate_config(config)
+        self.assertEqual(errors, [])
+        self.assertTrue(config.get('skip_llms_txt'))
+
+    def test_config_with_skip_llms_txt_false(self):
+        """Test config validation accepts skip_llms_txt as False"""
+        config = {
+            'name': 'test',
+            'base_url': 'https://example.com/docs',
+            'skip_llms_txt': False
+        }
+
+        errors, warnings = validate_config(config)
+        self.assertEqual(errors, [])
+        self.assertFalse(config.get('skip_llms_txt'))
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/test_skip_llms_txt.py b/tests/test_skip_llms_txt.py
new file mode 100644
index 0000000..16d7745
--- /dev/null
+++ b/tests/test_skip_llms_txt.py
@@ -0,0 +1,321 @@
+"""Tests for skip_llms_txt configuration option.
+
+This config option allows users to explicitly skip llms.txt detection and fetching,
+which is useful when:
+- A site's llms.txt is incomplete or incorrect
+- You need specific pages not in llms.txt
+- You want to force HTML scraping
+"""
+
+import os
+import tempfile
+import unittest
+import logging
+from unittest.mock import patch, Mock, MagicMock
+
+import sys
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from cli.doc_scraper import DocToSkillConverter
+
+
+class TestSkipLlmsTxtConfig(unittest.TestCase):
+    """Test skip_llms_txt configuration option."""
+
+    def test_default_skip_llms_txt_is_false(self):
+        """Test that skip_llms_txt defaults to False when not specified."""
+        config = {
+            'name': 'test',
+            'base_url': 'https://example.com/',
+            'selectors': {'main_content': 'article'}
+        }
+
+        converter = DocToSkillConverter(config, dry_run=True)
+        self.assertFalse(converter.skip_llms_txt)
+
+    def test_skip_llms_txt_can_be_set_true(self):
+        """Test that skip_llms_txt can be explicitly set to True."""
+        config = {
+            'name': 'test',
+            'base_url': 'https://example.com/',
+            'selectors': {'main_content': 'article'},
+            'skip_llms_txt': True
+        }
+
+        converter = DocToSkillConverter(config, dry_run=True)
+        self.assertTrue(converter.skip_llms_txt)
+
+    def test_skip_llms_txt_can_be_set_false(self):
+        """Test that skip_llms_txt can be explicitly set to False."""
+        config = {
+            'name': 'test',
+            'base_url': 'https://example.com/',
+            'selectors': {'main_content': 'article'},
+            'skip_llms_txt': False
+        }
+
+        converter = DocToSkillConverter(config, dry_run=True)
+        self.assertFalse(converter.skip_llms_txt)
+
+
+class TestSkipLlmsTxtSyncBehavior(unittest.TestCase):
+    """Test skip_llms_txt behavior in sync scraping mode."""
+
+    def test_llms_txt_tried_when_not_skipped(self):
+        """Test that _try_llms_txt is called when skip_llms_txt is False."""
+        config = {
+            'name': 'test',
+            'base_url': 'https://example.com/',
+            'selectors': {'main_content': 'article'},
+            'skip_llms_txt': False
+        }
+
+        original_cwd = os.getcwd()
+        with tempfile.TemporaryDirectory() as tmpdir:
+            try:
+                os.chdir(tmpdir)
+                converter = DocToSkillConverter(config, dry_run=False)
+
+                with patch.object(converter, '_try_llms_txt', return_value=False) as mock_try:
+                    with patch.object(converter, 'scrape_page'):
+                        with patch.object(converter, 'save_summary'):
+                            converter.scrape_all()
+                            mock_try.assert_called_once()
+            finally:
+                os.chdir(original_cwd)
+
+    def test_llms_txt_skipped_when_skip_true(self):
+        """Test that _try_llms_txt is NOT called when skip_llms_txt is True."""
+        config = {
+            'name': 'test',
+            'base_url': 'https://example.com/',
+            'selectors': {'main_content': 'article'},
+            'skip_llms_txt': True
+        }
+
+        original_cwd = os.getcwd()
+        with tempfile.TemporaryDirectory() as tmpdir:
+            try:
+                os.chdir(tmpdir)
+                converter = DocToSkillConverter(config, dry_run=False)
+
+                with patch.object(converter, '_try_llms_txt') as mock_try:
+                    with patch.object(converter, 'scrape_page'):
+                        with patch.object(converter, 'save_summary'):
+                            converter.scrape_all()
+                            mock_try.assert_not_called()
+            finally:
+                os.chdir(original_cwd)
+
+    def test_llms_txt_skipped_in_dry_run_mode(self):
+        """Test that _try_llms_txt is NOT called in dry-run mode regardless of skip setting."""
+        config = {
+            'name': 'test',
+            'base_url': 'https://example.com/',
+            'selectors': {'main_content': 'article'},
+            'skip_llms_txt': False  # Even when False
+        }
+
+        original_cwd = os.getcwd()
+        with tempfile.TemporaryDirectory() as tmpdir:
+            try:
+                os.chdir(tmpdir)
+                converter = DocToSkillConverter(config, dry_run=True)
+
+                with patch.object(converter, '_try_llms_txt') as mock_try:
+                    with patch.object(converter, 'save_summary'):
+                        converter.scrape_all()
+                        mock_try.assert_not_called()
+            finally:
+                os.chdir(original_cwd)
+
+
+class TestSkipLlmsTxtAsyncBehavior(unittest.TestCase):
+    """Test skip_llms_txt behavior in async scraping mode."""
+
+    def test_async_llms_txt_tried_when_not_skipped(self):
+        """Test that _try_llms_txt is called in async mode when skip_llms_txt is False."""
+        config = {
+            'name': 'test',
+            'base_url': 'https://example.com/',
+            'selectors': {'main_content': 'article'},
+            'async_mode': True,
+            'skip_llms_txt': False
+        }
+
+        original_cwd = os.getcwd()
+        with tempfile.TemporaryDirectory() as tmpdir:
+            try:
+                os.chdir(tmpdir)
+                converter = DocToSkillConverter(config, dry_run=False)
+
+                with patch.object(converter, '_try_llms_txt', return_value=False) as mock_try:
+                    with patch.object(converter, 'scrape_page_async', return_value=None):
+                        with patch.object(converter, 'save_summary'):
+                            converter.scrape_all()
+                            mock_try.assert_called_once()
+            finally:
+                os.chdir(original_cwd)
+
+    def test_async_llms_txt_skipped_when_skip_true(self):
+        """Test that _try_llms_txt is NOT called in async mode when skip_llms_txt is True."""
+        config = {
+            'name': 'test',
+            'base_url': 'https://example.com/',
+            'selectors': {'main_content': 'article'},
+            'async_mode': True,
+            'skip_llms_txt': True
+        }
+
+        original_cwd = os.getcwd()
+        with tempfile.TemporaryDirectory() as tmpdir:
+            try:
+                os.chdir(tmpdir)
+                converter = DocToSkillConverter(config, dry_run=False)
+
+                with patch.object(converter, '_try_llms_txt') as mock_try:
+                    with patch.object(converter, 'scrape_page_async', return_value=None):
+                        with patch.object(converter, 'save_summary'):
+                            converter.scrape_all()
+                            mock_try.assert_not_called()
+            finally:
+                os.chdir(original_cwd)
+
+
+class TestSkipLlmsTxtWithRealConfig(unittest.TestCase):
+    """Test skip_llms_txt with real-world config patterns."""
+
+    def test_telegram_bots_config_pattern(self):
+        """Test the telegram-bots config pattern which uses skip_llms_txt."""
+        config = {
+            'name': 'telegram-bots',
+            'description': 'Telegram bot documentation',
+            'base_url': 'https://core.telegram.org/bots',
+            'skip_llms_txt': True,  # Telegram doesn't have useful llms.txt
+            'start_urls': [
+                'https://core.telegram.org/bots',
+                'https://core.telegram.org/bots/api'
+            ],
+            'selectors': {
+                'main_content': '#dev_page_content, main, article',
+                'title': 'h1, title',
+                'code_blocks': 'pre code, pre'
+            }
+        }
+
+        converter = DocToSkillConverter(config, dry_run=True)
+        self.assertTrue(converter.skip_llms_txt)
+        self.assertEqual(converter.name, 'telegram-bots')
+
+    def test_skip_llms_txt_with_multiple_start_urls(self):
+        """Test skip_llms_txt works correctly with multiple start URLs."""
+        config = {
+            'name': 'test-multi',
+            'base_url': 'https://example.com/',
+            'selectors': {'main_content': 'article'},
+            'skip_llms_txt': True,
+            'start_urls': [
+                'https://example.com/docs/',
+                'https://example.com/api/',
+                'https://example.com/guide/'
+            ]
+        }
+
+        converter = DocToSkillConverter(config, dry_run=True)
+        self.assertTrue(converter.skip_llms_txt)
+        # start_urls are stored in pending_urls deque
+        self.assertEqual(len(converter.pending_urls), 3)
+
+
+class TestSkipLlmsTxtEdgeCases(unittest.TestCase):
+    """Test edge cases for skip_llms_txt."""
+
+    def test_skip_llms_txt_with_int_zero_logs_warning(self):
+        """Test that integer 0 logs warning and defaults to False."""
+        config = {
+            'name': 'test',
+            'base_url': 'https://example.com/',
+            'selectors': {'main_content': 'article'},
+            'skip_llms_txt': 0  # Invalid type
+        }
+
+        with self.assertLogs('cli.doc_scraper', level='WARNING') as cm:
+            converter = DocToSkillConverter(config, dry_run=True)
+            self.assertFalse(converter.skip_llms_txt)
+            self.assertTrue(any('Invalid value' in log and '0' in log for log in cm.output))
+
+    def test_skip_llms_txt_with_int_one_logs_warning(self):
+        """Test that integer 1 logs warning and defaults to False."""
+        config = {
+            'name': 'test',
+            'base_url': 'https://example.com/',
+            'selectors': {'main_content': 'article'},
+            'skip_llms_txt': 1  # Invalid type
+        }
+
+        with self.assertLogs('cli.doc_scraper', level='WARNING') as cm:
+            converter = DocToSkillConverter(config, dry_run=True)
+            self.assertFalse(converter.skip_llms_txt)
+            self.assertTrue(any('Invalid value' in log and '1' in log for log in cm.output))
+
+    def test_skip_llms_txt_with_string_logs_warning(self):
+        """Test that string values log warning and default to False."""
+        config = {
+            'name': 'test',
+            'base_url': 'https://example.com/',
+            'selectors': {'main_content': 'article'},
+            'skip_llms_txt': "true"  # Invalid type
+        }
+
+        with self.assertLogs('cli.doc_scraper', level='WARNING') as cm:
+            converter = DocToSkillConverter(config, dry_run=True)
+            self.assertFalse(converter.skip_llms_txt)
+            self.assertTrue(any('Invalid value' in log and 'true' in log for log in cm.output))
+
+    def test_skip_llms_txt_with_none_logs_warning(self):
+        """Test that None logs warning and defaults to False."""
+        config = {
+            'name': 'test',
+            'base_url': 'https://example.com/',
+            'selectors': {'main_content': 'article'},
+            'skip_llms_txt': None  # Invalid type
+        }
+
+        with self.assertLogs('cli.doc_scraper', level='WARNING') as cm:
+            converter = DocToSkillConverter(config, dry_run=True)
+            self.assertFalse(converter.skip_llms_txt)
+            self.assertTrue(any('Invalid value' in log and 'None' in log for log in cm.output))
+
+    def test_scraping_proceeds_when_llms_txt_skipped(self):
+        """Test that HTML scraping proceeds normally when llms.txt is skipped."""
+        config = {
+            'name': 'test',
+            'base_url': 'https://example.com/',
+            'selectors': {'main_content': 'article'},
+            'skip_llms_txt': True
+        }
+
+        original_cwd = os.getcwd()
+        with tempfile.TemporaryDirectory() as tmpdir:
+            try:
+                os.chdir(tmpdir)
+                converter = DocToSkillConverter(config, dry_run=False)
+
+                # Track if scrape_page was called
+                scrape_called = []
+
+                def mock_scrape(url):
+                    scrape_called.append(url)
+                    return None
+
+                with patch.object(converter, 'scrape_page', side_effect=mock_scrape):
+                    with patch.object(converter, 'save_summary'):
+                        converter.scrape_all()
+                        # Should have attempted to scrape the base URL
+                        self.assertTrue(len(scrape_called) > 0)
+            finally:
+                os.chdir(original_cwd)
+
+
+if __name__ == '__main__':
+    unittest.main()

From 91692db87c32be951e90b697eb93412b85b5a5ff Mon Sep 17 00:00:00 2001
From: sogoiii <sogoiii@gmail.com>
Date: Thu, 20 Nov 2025 14:00:55 -0800
Subject: [PATCH 2/3] =?UTF-8?q?=F0=9F=93=9D=20docs:=20add=20skip=5Fllms=5F?=
 =?UTF-8?q?txt=20to=20config=20parameters=20documentation?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 CLAUDE.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CLAUDE.md b/CLAUDE.md
index a15cfcc..2177795 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -434,6 +434,7 @@ Config files (`configs/*.json`) define scraping behavior:
 - `categories`: Keyword mapping for categorization
 - `rate_limit`: Delay between requests (seconds)
 - `max_pages`: Maximum pages to scrape
+- `skip_llms_txt`: Skip llms.txt detection, force HTML scraping (default: false)
 
 ## Key Features & Implementation
 

From 8031ce69ce0616c0da8bff86e3c99763d1e83bc4 Mon Sep 17 00:00:00 2001
From: yusyus <yusufkaraaslan.yk@pm.me>
Date: Sat, 29 Nov 2025 22:56:37 +0300
Subject: [PATCH 3/3] fix: Update test imports to use proper package names

Fixed import paths in test_skip_llms_txt.py to use the skill_seekers
package name instead of the old-style cli imports.

Changes:
- Updated import from 'cli.doc_scraper' to 'skill_seekers.cli.doc_scraper'
- Updated logger names from 'cli.doc_scraper' to 'skill_seekers.cli.doc_scraper'
- Removed sys.path manipulation (no longer needed with proper imports)

All 17 tests now pass (15 in test_skip_llms_txt.py + 2 in test_config_validation.py)
---
 tests/test_skip_llms_txt.py | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/tests/test_skip_llms_txt.py b/tests/test_skip_llms_txt.py
index 16d7745..f863b43 100644
--- a/tests/test_skip_llms_txt.py
+++ b/tests/test_skip_llms_txt.py
@@ -13,10 +13,7 @@ import unittest
 import logging
 from unittest.mock import patch, Mock, MagicMock
 
-import sys
-sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-
-from cli.doc_scraper import DocToSkillConverter
+from skill_seekers.cli.doc_scraper import DocToSkillConverter
 
 
 class TestSkipLlmsTxtConfig(unittest.TestCase):
@@ -239,7 +236,7 @@ class TestSkipLlmsTxtEdgeCases(unittest.TestCase):
             'skip_llms_txt': 0  # Invalid type
         }
 
-        with self.assertLogs('cli.doc_scraper', level='WARNING') as cm:
+        with self.assertLogs('skill_seekers.cli.doc_scraper', level='WARNING') as cm:
             converter = DocToSkillConverter(config, dry_run=True)
             self.assertFalse(converter.skip_llms_txt)
             self.assertTrue(any('Invalid value' in log and '0' in log for log in cm.output))
@@ -253,7 +250,7 @@ class TestSkipLlmsTxtEdgeCases(unittest.TestCase):
             'skip_llms_txt': 1  # Invalid type
         }
 
-        with self.assertLogs('cli.doc_scraper', level='WARNING') as cm:
+        with self.assertLogs('skill_seekers.cli.doc_scraper', level='WARNING') as cm:
             converter = DocToSkillConverter(config, dry_run=True)
             self.assertFalse(converter.skip_llms_txt)
             self.assertTrue(any('Invalid value' in log and '1' in log for log in cm.output))
@@ -267,7 +264,7 @@ class TestSkipLlmsTxtEdgeCases(unittest.TestCase):
             'skip_llms_txt': "true"  # Invalid type
         }
 
-        with self.assertLogs('cli.doc_scraper', level='WARNING') as cm:
+        with self.assertLogs('skill_seekers.cli.doc_scraper', level='WARNING') as cm:
             converter = DocToSkillConverter(config, dry_run=True)
             self.assertFalse(converter.skip_llms_txt)
             self.assertTrue(any('Invalid value' in log and 'true' in log for log in cm.output))
@@ -281,7 +278,7 @@ class TestSkipLlmsTxtEdgeCases(unittest.TestCase):
             'skip_llms_txt': None  # Invalid type
         }
 
-        with self.assertLogs('cli.doc_scraper', level='WARNING') as cm:
+        with self.assertLogs('skill_seekers.cli.doc_scraper', level='WARNING') as cm:
             converter = DocToSkillConverter(config, dry_run=True)
             self.assertFalse(converter.skip_llms_txt)
             self.assertTrue(any('Invalid value' in log and 'None' in log for log in cm.output))