From a0b1c2f42f2deba86903a017218d2653982e58dc Mon Sep 17 00:00:00 2001 From: sogoiii Date: Thu, 20 Nov 2025 13:55:46 -0800 Subject: [PATCH 1/3] =?UTF-8?q?=E2=9C=A8=20feat:=20add=20skip=5Fllms=5Ftxt?= =?UTF-8?q?=20config=20option=20to=20bypass=20llms.txt=20detection?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add skip_llms_txt config option (default: False) - Validate value is boolean, warn and default to False if not - Support in both sync and async scraping modes - Add 17 tests for config, behavior, and edge cases --- src/skill_seekers/cli/doc_scraper.py | 17 +- tests/test_config_validation.py | 24 ++ tests/test_skip_llms_txt.py | 321 +++++++++++++++++++++++++++ 3 files changed, 358 insertions(+), 4 deletions(-) create mode 100644 tests/test_skip_llms_txt.py diff --git a/src/skill_seekers/cli/doc_scraper.py b/src/skill_seekers/cli/doc_scraper.py index 7a2f4b1..fc69b05 100755 --- a/src/skill_seekers/cli/doc_scraper.py +++ b/src/skill_seekers/cli/doc_scraper.py @@ -86,6 +86,15 @@ class DocToSkillConverter: self.checkpoint_interval = checkpoint_config.get('interval', DEFAULT_CHECKPOINT_INTERVAL) # llms.txt detection state + skip_llms_txt_value = config.get('skip_llms_txt', False) + if not isinstance(skip_llms_txt_value, bool): + logger.warning( + "Invalid value for 'skip_llms_txt': %r (expected bool). Defaulting to False.", + skip_llms_txt_value + ) + self.skip_llms_txt = False + else: + self.skip_llms_txt = skip_llms_txt_value self.llms_txt_detected = False self.llms_txt_variant = None self.llms_txt_variants: List[str] = [] # Track all downloaded variants @@ -618,8 +627,8 @@ class DocToSkillConverter: asyncio.run(self.scrape_all_async()) return - # Try llms.txt first (unless dry-run) - if not self.dry_run: + # Try llms.txt first (unless dry-run or explicitly disabled) + if not self.dry_run and not self.skip_llms_txt: llms_result = self._try_llms_txt() if llms_result: logger.info("\n✅ Used llms.txt (%s) - skipping HTML scraping", self.llms_txt_variant) @@ -778,8 +787,8 @@ class DocToSkillConverter: Performance: ~2-3x faster than sync mode with same worker count. """ - # Try llms.txt first (unless dry-run) - if not self.dry_run: + # Try llms.txt first (unless dry-run or explicitly disabled) + if not self.dry_run and not self.skip_llms_txt: llms_result = self._try_llms_txt() if llms_result: logger.info("\n✅ Used llms.txt (%s) - skipping HTML scraping", self.llms_txt_variant) diff --git a/tests/test_config_validation.py b/tests/test_config_validation.py index 4bef957..eaae6ec 100644 --- a/tests/test_config_validation.py +++ b/tests/test_config_validation.py @@ -307,6 +307,30 @@ class TestConfigValidation(unittest.TestCase): # Should be valid self.assertEqual(config.get('llms_txt_url'), 'https://example.com/llms-full.txt') + def test_config_with_skip_llms_txt(self): + """Test config validation accepts skip_llms_txt""" + config = { + 'name': 'test', + 'base_url': 'https://example.com/docs', + 'skip_llms_txt': True + } + + errors, warnings = validate_config(config) + self.assertEqual(errors, []) + self.assertTrue(config.get('skip_llms_txt')) + + def test_config_with_skip_llms_txt_false(self): + """Test config validation accepts skip_llms_txt as False""" + config = { + 'name': 'test', + 'base_url': 'https://example.com/docs', + 'skip_llms_txt': False + } + + errors, warnings = validate_config(config) + self.assertEqual(errors, []) + self.assertFalse(config.get('skip_llms_txt')) + if __name__ == '__main__': unittest.main() diff --git a/tests/test_skip_llms_txt.py b/tests/test_skip_llms_txt.py new file mode 100644 index 0000000..16d7745 --- /dev/null +++ b/tests/test_skip_llms_txt.py @@ -0,0 +1,321 @@ +"""Tests for skip_llms_txt configuration option. + +This config option allows users to explicitly skip llms.txt detection and fetching, +which is useful when: +- A site's llms.txt is incomplete or incorrect +- You need specific pages not in llms.txt +- You want to force HTML scraping +""" + +import os +import tempfile +import unittest +import logging +from unittest.mock import patch, Mock, MagicMock + +import sys +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from cli.doc_scraper import DocToSkillConverter + + +class TestSkipLlmsTxtConfig(unittest.TestCase): + """Test skip_llms_txt configuration option.""" + + def test_default_skip_llms_txt_is_false(self): + """Test that skip_llms_txt defaults to False when not specified.""" + config = { + 'name': 'test', + 'base_url': 'https://example.com/', + 'selectors': {'main_content': 'article'} + } + + converter = DocToSkillConverter(config, dry_run=True) + self.assertFalse(converter.skip_llms_txt) + + def test_skip_llms_txt_can_be_set_true(self): + """Test that skip_llms_txt can be explicitly set to True.""" + config = { + 'name': 'test', + 'base_url': 'https://example.com/', + 'selectors': {'main_content': 'article'}, + 'skip_llms_txt': True + } + + converter = DocToSkillConverter(config, dry_run=True) + self.assertTrue(converter.skip_llms_txt) + + def test_skip_llms_txt_can_be_set_false(self): + """Test that skip_llms_txt can be explicitly set to False.""" + config = { + 'name': 'test', + 'base_url': 'https://example.com/', + 'selectors': {'main_content': 'article'}, + 'skip_llms_txt': False + } + + converter = DocToSkillConverter(config, dry_run=True) + self.assertFalse(converter.skip_llms_txt) + + +class TestSkipLlmsTxtSyncBehavior(unittest.TestCase): + """Test skip_llms_txt behavior in sync scraping mode.""" + + def test_llms_txt_tried_when_not_skipped(self): + """Test that _try_llms_txt is called when skip_llms_txt is False.""" + config = { + 'name': 'test', + 'base_url': 'https://example.com/', + 'selectors': {'main_content': 'article'}, + 'skip_llms_txt': False + } + + original_cwd = os.getcwd() + with tempfile.TemporaryDirectory() as tmpdir: + try: + os.chdir(tmpdir) + converter = DocToSkillConverter(config, dry_run=False) + + with patch.object(converter, '_try_llms_txt', return_value=False) as mock_try: + with patch.object(converter, 'scrape_page'): + with patch.object(converter, 'save_summary'): + converter.scrape_all() + mock_try.assert_called_once() + finally: + os.chdir(original_cwd) + + def test_llms_txt_skipped_when_skip_true(self): + """Test that _try_llms_txt is NOT called when skip_llms_txt is True.""" + config = { + 'name': 'test', + 'base_url': 'https://example.com/', + 'selectors': {'main_content': 'article'}, + 'skip_llms_txt': True + } + + original_cwd = os.getcwd() + with tempfile.TemporaryDirectory() as tmpdir: + try: + os.chdir(tmpdir) + converter = DocToSkillConverter(config, dry_run=False) + + with patch.object(converter, '_try_llms_txt') as mock_try: + with patch.object(converter, 'scrape_page'): + with patch.object(converter, 'save_summary'): + converter.scrape_all() + mock_try.assert_not_called() + finally: + os.chdir(original_cwd) + + def test_llms_txt_skipped_in_dry_run_mode(self): + """Test that _try_llms_txt is NOT called in dry-run mode regardless of skip setting.""" + config = { + 'name': 'test', + 'base_url': 'https://example.com/', + 'selectors': {'main_content': 'article'}, + 'skip_llms_txt': False # Even when False + } + + original_cwd = os.getcwd() + with tempfile.TemporaryDirectory() as tmpdir: + try: + os.chdir(tmpdir) + converter = DocToSkillConverter(config, dry_run=True) + + with patch.object(converter, '_try_llms_txt') as mock_try: + with patch.object(converter, 'save_summary'): + converter.scrape_all() + mock_try.assert_not_called() + finally: + os.chdir(original_cwd) + + +class TestSkipLlmsTxtAsyncBehavior(unittest.TestCase): + """Test skip_llms_txt behavior in async scraping mode.""" + + def test_async_llms_txt_tried_when_not_skipped(self): + """Test that _try_llms_txt is called in async mode when skip_llms_txt is False.""" + config = { + 'name': 'test', + 'base_url': 'https://example.com/', + 'selectors': {'main_content': 'article'}, + 'async_mode': True, + 'skip_llms_txt': False + } + + original_cwd = os.getcwd() + with tempfile.TemporaryDirectory() as tmpdir: + try: + os.chdir(tmpdir) + converter = DocToSkillConverter(config, dry_run=False) + + with patch.object(converter, '_try_llms_txt', return_value=False) as mock_try: + with patch.object(converter, 'scrape_page_async', return_value=None): + with patch.object(converter, 'save_summary'): + converter.scrape_all() + mock_try.assert_called_once() + finally: + os.chdir(original_cwd) + + def test_async_llms_txt_skipped_when_skip_true(self): + """Test that _try_llms_txt is NOT called in async mode when skip_llms_txt is True.""" + config = { + 'name': 'test', + 'base_url': 'https://example.com/', + 'selectors': {'main_content': 'article'}, + 'async_mode': True, + 'skip_llms_txt': True + } + + original_cwd = os.getcwd() + with tempfile.TemporaryDirectory() as tmpdir: + try: + os.chdir(tmpdir) + converter = DocToSkillConverter(config, dry_run=False) + + with patch.object(converter, '_try_llms_txt') as mock_try: + with patch.object(converter, 'scrape_page_async', return_value=None): + with patch.object(converter, 'save_summary'): + converter.scrape_all() + mock_try.assert_not_called() + finally: + os.chdir(original_cwd) + + +class TestSkipLlmsTxtWithRealConfig(unittest.TestCase): + """Test skip_llms_txt with real-world config patterns.""" + + def test_telegram_bots_config_pattern(self): + """Test the telegram-bots config pattern which uses skip_llms_txt.""" + config = { + 'name': 'telegram-bots', + 'description': 'Telegram bot documentation', + 'base_url': 'https://core.telegram.org/bots', + 'skip_llms_txt': True, # Telegram doesn't have useful llms.txt + 'start_urls': [ + 'https://core.telegram.org/bots', + 'https://core.telegram.org/bots/api' + ], + 'selectors': { + 'main_content': '#dev_page_content, main, article', + 'title': 'h1, title', + 'code_blocks': 'pre code, pre' + } + } + + converter = DocToSkillConverter(config, dry_run=True) + self.assertTrue(converter.skip_llms_txt) + self.assertEqual(converter.name, 'telegram-bots') + + def test_skip_llms_txt_with_multiple_start_urls(self): + """Test skip_llms_txt works correctly with multiple start URLs.""" + config = { + 'name': 'test-multi', + 'base_url': 'https://example.com/', + 'selectors': {'main_content': 'article'}, + 'skip_llms_txt': True, + 'start_urls': [ + 'https://example.com/docs/', + 'https://example.com/api/', + 'https://example.com/guide/' + ] + } + + converter = DocToSkillConverter(config, dry_run=True) + self.assertTrue(converter.skip_llms_txt) + # start_urls are stored in pending_urls deque + self.assertEqual(len(converter.pending_urls), 3) + + +class TestSkipLlmsTxtEdgeCases(unittest.TestCase): + """Test edge cases for skip_llms_txt.""" + + def test_skip_llms_txt_with_int_zero_logs_warning(self): + """Test that integer 0 logs warning and defaults to False.""" + config = { + 'name': 'test', + 'base_url': 'https://example.com/', + 'selectors': {'main_content': 'article'}, + 'skip_llms_txt': 0 # Invalid type + } + + with self.assertLogs('cli.doc_scraper', level='WARNING') as cm: + converter = DocToSkillConverter(config, dry_run=True) + self.assertFalse(converter.skip_llms_txt) + self.assertTrue(any('Invalid value' in log and '0' in log for log in cm.output)) + + def test_skip_llms_txt_with_int_one_logs_warning(self): + """Test that integer 1 logs warning and defaults to False.""" + config = { + 'name': 'test', + 'base_url': 'https://example.com/', + 'selectors': {'main_content': 'article'}, + 'skip_llms_txt': 1 # Invalid type + } + + with self.assertLogs('cli.doc_scraper', level='WARNING') as cm: + converter = DocToSkillConverter(config, dry_run=True) + self.assertFalse(converter.skip_llms_txt) + self.assertTrue(any('Invalid value' in log and '1' in log for log in cm.output)) + + def test_skip_llms_txt_with_string_logs_warning(self): + """Test that string values log warning and default to False.""" + config = { + 'name': 'test', + 'base_url': 'https://example.com/', + 'selectors': {'main_content': 'article'}, + 'skip_llms_txt': "true" # Invalid type + } + + with self.assertLogs('cli.doc_scraper', level='WARNING') as cm: + converter = DocToSkillConverter(config, dry_run=True) + self.assertFalse(converter.skip_llms_txt) + self.assertTrue(any('Invalid value' in log and 'true' in log for log in cm.output)) + + def test_skip_llms_txt_with_none_logs_warning(self): + """Test that None logs warning and defaults to False.""" + config = { + 'name': 'test', + 'base_url': 'https://example.com/', + 'selectors': {'main_content': 'article'}, + 'skip_llms_txt': None # Invalid type + } + + with self.assertLogs('cli.doc_scraper', level='WARNING') as cm: + converter = DocToSkillConverter(config, dry_run=True) + self.assertFalse(converter.skip_llms_txt) + self.assertTrue(any('Invalid value' in log and 'None' in log for log in cm.output)) + + def test_scraping_proceeds_when_llms_txt_skipped(self): + """Test that HTML scraping proceeds normally when llms.txt is skipped.""" + config = { + 'name': 'test', + 'base_url': 'https://example.com/', + 'selectors': {'main_content': 'article'}, + 'skip_llms_txt': True + } + + original_cwd = os.getcwd() + with tempfile.TemporaryDirectory() as tmpdir: + try: + os.chdir(tmpdir) + converter = DocToSkillConverter(config, dry_run=False) + + # Track if scrape_page was called + scrape_called = [] + + def mock_scrape(url): + scrape_called.append(url) + return None + + with patch.object(converter, 'scrape_page', side_effect=mock_scrape): + with patch.object(converter, 'save_summary'): + converter.scrape_all() + # Should have attempted to scrape the base URL + self.assertTrue(len(scrape_called) > 0) + finally: + os.chdir(original_cwd) + + +if __name__ == '__main__': + unittest.main() From 91692db87c32be951e90b697eb93412b85b5a5ff Mon Sep 17 00:00:00 2001 From: sogoiii Date: Thu, 20 Nov 2025 14:00:55 -0800 Subject: [PATCH 2/3] =?UTF-8?q?=F0=9F=93=9D=20docs:=20add=20skip=5Fllms=5F?= =?UTF-8?q?txt=20to=20config=20parameters=20documentation?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CLAUDE.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CLAUDE.md b/CLAUDE.md index a15cfcc..2177795 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -434,6 +434,7 @@ Config files (`configs/*.json`) define scraping behavior: - `categories`: Keyword mapping for categorization - `rate_limit`: Delay between requests (seconds) - `max_pages`: Maximum pages to scrape +- `skip_llms_txt`: Skip llms.txt detection, force HTML scraping (default: false) ## Key Features & Implementation From 8031ce69ce0616c0da8bff86e3c99763d1e83bc4 Mon Sep 17 00:00:00 2001 From: yusyus Date: Sat, 29 Nov 2025 22:56:37 +0300 Subject: [PATCH 3/3] fix: Update test imports to use proper package names Fixed import paths in test_skip_llms_txt.py to use skill_seekers package name instead of old-style cli imports. Changes: - Updated import from 'cli.doc_scraper' to 'skill_seekers.cli.doc_scraper' - Updated logger names from 'cli.doc_scraper' to 'skill_seekers.cli.doc_scraper' - Removed sys.path manipulation (no longer needed with proper imports) All 17 tests now pass successfully (15 in test_skip_llms_txt.py + 2 in test_config_validation.py) --- tests/test_skip_llms_txt.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/tests/test_skip_llms_txt.py b/tests/test_skip_llms_txt.py index 16d7745..f863b43 100644 --- a/tests/test_skip_llms_txt.py +++ b/tests/test_skip_llms_txt.py @@ -13,10 +13,7 @@ import unittest import logging from unittest.mock import patch, Mock, MagicMock -import sys -sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) - -from cli.doc_scraper import DocToSkillConverter +from skill_seekers.cli.doc_scraper import DocToSkillConverter class TestSkipLlmsTxtConfig(unittest.TestCase): @@ -239,7 +236,7 @@ class TestSkipLlmsTxtEdgeCases(unittest.TestCase): 'skip_llms_txt': 0 # Invalid type } - with self.assertLogs('cli.doc_scraper', level='WARNING') as cm: + with self.assertLogs('skill_seekers.cli.doc_scraper', level='WARNING') as cm: converter = DocToSkillConverter(config, dry_run=True) self.assertFalse(converter.skip_llms_txt) self.assertTrue(any('Invalid value' in log and '0' in log for log in cm.output)) @@ -253,7 +250,7 @@ class TestSkipLlmsTxtEdgeCases(unittest.TestCase): 'skip_llms_txt': 1 # Invalid type } - with self.assertLogs('cli.doc_scraper', level='WARNING') as cm: + with self.assertLogs('skill_seekers.cli.doc_scraper', level='WARNING') as cm: converter = DocToSkillConverter(config, dry_run=True) self.assertFalse(converter.skip_llms_txt) self.assertTrue(any('Invalid value' in log and '1' in log for log in cm.output)) @@ -267,7 +264,7 @@ class TestSkipLlmsTxtEdgeCases(unittest.TestCase): 'skip_llms_txt': "true" # Invalid type } - with self.assertLogs('cli.doc_scraper', level='WARNING') as cm: + with self.assertLogs('skill_seekers.cli.doc_scraper', level='WARNING') as cm: converter = DocToSkillConverter(config, dry_run=True) self.assertFalse(converter.skip_llms_txt) self.assertTrue(any('Invalid value' in log and 'true' in log for log in cm.output)) @@ -281,7 +278,7 @@ class TestSkipLlmsTxtEdgeCases(unittest.TestCase): 'skip_llms_txt': None # Invalid type } - with self.assertLogs('cli.doc_scraper', level='WARNING') as cm: + with self.assertLogs('skill_seekers.cli.doc_scraper', level='WARNING') as cm: converter = DocToSkillConverter(config, dry_run=True) self.assertFalse(converter.skip_llms_txt) self.assertTrue(any('Invalid value' in log and 'None' in log for log in cm.output))