feat: add skip_llms_txt config option to bypass llms.txt detection

- Add skip_llms_txt config option (default: False)
- Validate value is boolean, warn and default to False if not
- Support in both sync and async scraping modes
- Add 17 tests for config, behavior, and edge cases
This commit is contained in:
sogoiii
2025-11-20 13:55:46 -08:00
parent 4cbd0a0a3c
commit a0b1c2f42f
3 changed files with 358 additions and 4 deletions

View File

@@ -86,6 +86,15 @@ class DocToSkillConverter:
self.checkpoint_interval = checkpoint_config.get('interval', DEFAULT_CHECKPOINT_INTERVAL)
# llms.txt detection state
skip_llms_txt_value = config.get('skip_llms_txt', False)
if not isinstance(skip_llms_txt_value, bool):
logger.warning(
"Invalid value for 'skip_llms_txt': %r (expected bool). Defaulting to False.",
skip_llms_txt_value
)
self.skip_llms_txt = False
else:
self.skip_llms_txt = skip_llms_txt_value
self.llms_txt_detected = False
self.llms_txt_variant = None
self.llms_txt_variants: List[str] = [] # Track all downloaded variants
@@ -618,8 +627,8 @@ class DocToSkillConverter:
asyncio.run(self.scrape_all_async())
return
# Try llms.txt first (unless dry-run)
if not self.dry_run:
# Try llms.txt first (unless dry-run or explicitly disabled)
if not self.dry_run and not self.skip_llms_txt:
llms_result = self._try_llms_txt()
if llms_result:
logger.info("\n✅ Used llms.txt (%s) - skipping HTML scraping", self.llms_txt_variant)
@@ -778,8 +787,8 @@ class DocToSkillConverter:
Performance: ~2-3x faster than sync mode with same worker count.
"""
# Try llms.txt first (unless dry-run)
if not self.dry_run:
# Try llms.txt first (unless dry-run or explicitly disabled)
if not self.dry_run and not self.skip_llms_txt:
llms_result = self._try_llms_txt()
if llms_result:
logger.info("\n✅ Used llms.txt (%s) - skipping HTML scraping", self.llms_txt_variant)

View File

@@ -307,6 +307,30 @@ class TestConfigValidation(unittest.TestCase):
# Should be valid
self.assertEqual(config.get('llms_txt_url'), 'https://example.com/llms-full.txt')
def test_config_with_skip_llms_txt(self):
    """validate_config reports no errors for a config with skip_llms_txt=True."""
    skip_enabled = {
        'name': 'test',
        'base_url': 'https://example.com/docs',
        'skip_llms_txt': True,
    }
    # Only the error list is inspected; the warnings half of the result is ignored.
    found_errors, _ignored_warnings = validate_config(skip_enabled)
    self.assertEqual(found_errors, [])
    self.assertTrue(skip_enabled.get('skip_llms_txt'))
def test_config_with_skip_llms_txt_false(self):
    """validate_config reports no errors for a config with skip_llms_txt=False."""
    skip_disabled = {
        'name': 'test',
        'base_url': 'https://example.com/docs',
        'skip_llms_txt': False,
    }
    # Only the error list is inspected; the warnings half of the result is ignored.
    found_errors, _ignored_warnings = validate_config(skip_disabled)
    self.assertEqual(found_errors, [])
    self.assertFalse(skip_disabled.get('skip_llms_txt'))
# Run the unittest runner when this module is executed directly as a script.
if __name__ == "__main__":
    unittest.main()

321
tests/test_skip_llms_txt.py Normal file
View File

@@ -0,0 +1,321 @@
"""Tests for skip_llms_txt configuration option.
This config option allows users to explicitly skip llms.txt detection and fetching,
which is useful when:
- A site's llms.txt is incomplete or incorrect
- You need specific pages not in llms.txt
- You want to force HTML scraping
"""
import os
import tempfile
import unittest
import logging
from unittest.mock import patch, Mock, MagicMock
import sys
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from cli.doc_scraper import DocToSkillConverter
class TestSkipLlmsTxtConfig(unittest.TestCase):
    """Check how the 'skip_llms_txt' config key is reflected on the converter."""

    @staticmethod
    def _build_converter(**extra):
        """Return a dry-run converter built from the minimal config plus ``extra`` keys."""
        settings = {
            'name': 'test',
            'base_url': 'https://example.com/',
            'selectors': {'main_content': 'article'},
        }
        settings.update(extra)
        return DocToSkillConverter(settings, dry_run=True)

    def test_default_skip_llms_txt_is_false(self):
        """With the key absent from the config, the attribute is False."""
        built = self._build_converter()
        self.assertFalse(built.skip_llms_txt)

    def test_skip_llms_txt_can_be_set_true(self):
        """An explicit True in the config is carried onto the converter."""
        built = self._build_converter(skip_llms_txt=True)
        self.assertTrue(built.skip_llms_txt)

    def test_skip_llms_txt_can_be_set_false(self):
        """An explicit False in the config is carried onto the converter."""
        built = self._build_converter(skip_llms_txt=False)
        self.assertFalse(built.skip_llms_txt)
class TestSkipLlmsTxtSyncBehavior(unittest.TestCase):
    """Check when scrape_all() calls _try_llms_txt in the default (sync) mode."""

    def setUp(self):
        """Move into a throwaway working directory for the duration of each test."""
        scratch = tempfile.TemporaryDirectory()
        # Cleanups run last-in-first-out, so the original cwd is restored
        # before the temporary directory is removed.
        self.addCleanup(scratch.cleanup)
        self.addCleanup(os.chdir, os.getcwd())
        os.chdir(scratch.name)

    @staticmethod
    def _settings(skip):
        """Return the minimal config with 'skip_llms_txt' set to ``skip``."""
        return {
            'name': 'test',
            'base_url': 'https://example.com/',
            'selectors': {'main_content': 'article'},
            'skip_llms_txt': skip,
        }

    def test_llms_txt_tried_when_not_skipped(self):
        """skip_llms_txt=False and dry_run=False: _try_llms_txt is called exactly once."""
        converter = DocToSkillConverter(self._settings(False), dry_run=False)
        llms_patch = patch.object(converter, '_try_llms_txt', return_value=False)
        page_patch = patch.object(converter, 'scrape_page')
        summary_patch = patch.object(converter, 'save_summary')
        with llms_patch as llms_probe, page_patch, summary_patch:
            converter.scrape_all()
            llms_probe.assert_called_once()

    def test_llms_txt_skipped_when_skip_true(self):
        """skip_llms_txt=True: _try_llms_txt is never called."""
        converter = DocToSkillConverter(self._settings(True), dry_run=False)
        llms_patch = patch.object(converter, '_try_llms_txt')
        page_patch = patch.object(converter, 'scrape_page')
        summary_patch = patch.object(converter, 'save_summary')
        with llms_patch as llms_probe, page_patch, summary_patch:
            converter.scrape_all()
            llms_probe.assert_not_called()

    def test_llms_txt_skipped_in_dry_run_mode(self):
        """dry_run=True: _try_llms_txt is never called, even with skip_llms_txt=False."""
        converter = DocToSkillConverter(self._settings(False), dry_run=True)
        llms_patch = patch.object(converter, '_try_llms_txt')
        summary_patch = patch.object(converter, 'save_summary')
        # scrape_page is deliberately left unpatched in this scenario.
        with llms_patch as llms_probe, summary_patch:
            converter.scrape_all()
            llms_probe.assert_not_called()
class TestSkipLlmsTxtAsyncBehavior(unittest.TestCase):
    """Check when scrape_all() calls _try_llms_txt with 'async_mode' enabled."""

    def setUp(self):
        """Move into a throwaway working directory for the duration of each test."""
        scratch = tempfile.TemporaryDirectory()
        # Cleanups run last-in-first-out, so the original cwd is restored
        # before the temporary directory is removed.
        self.addCleanup(scratch.cleanup)
        self.addCleanup(os.chdir, os.getcwd())
        os.chdir(scratch.name)

    @staticmethod
    def _settings(skip):
        """Return the minimal async-mode config with 'skip_llms_txt' set to ``skip``."""
        return {
            'name': 'test',
            'base_url': 'https://example.com/',
            'selectors': {'main_content': 'article'},
            'async_mode': True,
            'skip_llms_txt': skip,
        }

    def test_async_llms_txt_tried_when_not_skipped(self):
        """skip_llms_txt=False in async mode: _try_llms_txt is called exactly once."""
        converter = DocToSkillConverter(self._settings(False), dry_run=False)
        llms_patch = patch.object(converter, '_try_llms_txt', return_value=False)
        page_patch = patch.object(converter, 'scrape_page_async', return_value=None)
        summary_patch = patch.object(converter, 'save_summary')
        with llms_patch as llms_probe, page_patch, summary_patch:
            converter.scrape_all()
            llms_probe.assert_called_once()

    def test_async_llms_txt_skipped_when_skip_true(self):
        """skip_llms_txt=True in async mode: _try_llms_txt is never called."""
        converter = DocToSkillConverter(self._settings(True), dry_run=False)
        llms_patch = patch.object(converter, '_try_llms_txt')
        page_patch = patch.object(converter, 'scrape_page_async', return_value=None)
        summary_patch = patch.object(converter, 'save_summary')
        with llms_patch as llms_probe, page_patch, summary_patch:
            converter.scrape_all()
            llms_probe.assert_not_called()
class TestSkipLlmsTxtWithRealConfig(unittest.TestCase):
    """Exercise skip_llms_txt inside fuller, real-world style configs."""

    def test_telegram_bots_config_pattern(self):
        """A telegram-bots style config keeps skip_llms_txt=True and its name."""
        entry_points = [
            'https://core.telegram.org/bots',
            'https://core.telegram.org/bots/api',
        ]
        css_selectors = {
            'main_content': '#dev_page_content, main, article',
            'title': 'h1, title',
            'code_blocks': 'pre code, pre',
        }
        telegram = {
            'name': 'telegram-bots',
            'description': 'Telegram bot documentation',
            'base_url': 'https://core.telegram.org/bots',
            'skip_llms_txt': True,  # Telegram doesn't have useful llms.txt
            'start_urls': entry_points,
            'selectors': css_selectors,
        }
        converter = DocToSkillConverter(telegram, dry_run=True)
        self.assertTrue(converter.skip_llms_txt)
        self.assertEqual(converter.name, 'telegram-bots')

    def test_skip_llms_txt_with_multiple_start_urls(self):
        """skip_llms_txt=True alongside three start URLs yields three pending URLs."""
        entry_points = [
            'https://example.com/docs/',
            'https://example.com/api/',
            'https://example.com/guide/',
        ]
        multi = {
            'name': 'test-multi',
            'base_url': 'https://example.com/',
            'selectors': {'main_content': 'article'},
            'skip_llms_txt': True,
            'start_urls': entry_points,
        }
        converter = DocToSkillConverter(multi, dry_run=True)
        self.assertTrue(converter.skip_llms_txt)
        # Each configured start URL is expected to be queued in pending_urls.
        self.assertEqual(len(converter.pending_urls), 3)
class TestSkipLlmsTxtEdgeCases(unittest.TestCase):
    """Cover non-boolean skip_llms_txt values and the skip-then-scrape path."""

    def _check_rejected(self, bad_value, echoed):
        """Assert that a non-bool ``bad_value`` is replaced by False with a warning.

        ``echoed`` is the text that must appear, together with 'Invalid value',
        in at least one record captured from the 'cli.doc_scraper' logger.
        """
        settings = {
            'name': 'test',
            'base_url': 'https://example.com/',
            'selectors': {'main_content': 'article'},
            'skip_llms_txt': bad_value,  # Invalid type
        }
        with self.assertLogs('cli.doc_scraper', level='WARNING') as captured:
            converter = DocToSkillConverter(settings, dry_run=True)
        self.assertFalse(converter.skip_llms_txt)
        self.assertTrue(
            any('Invalid value' in line and echoed in line for line in captured.output)
        )

    def test_skip_llms_txt_with_int_zero_logs_warning(self):
        """Integer 0 is rejected: a warning is logged and the attribute is False."""
        self._check_rejected(0, '0')

    def test_skip_llms_txt_with_int_one_logs_warning(self):
        """Integer 1 is rejected: a warning is logged and the attribute is False."""
        self._check_rejected(1, '1')

    def test_skip_llms_txt_with_string_logs_warning(self):
        """The string "true" is rejected: a warning is logged and the attribute is False."""
        self._check_rejected("true", 'true')

    def test_skip_llms_txt_with_none_logs_warning(self):
        """None is rejected: a warning is logged and the attribute is False."""
        self._check_rejected(None, 'None')

    def test_scraping_proceeds_when_llms_txt_skipped(self):
        """With skip_llms_txt=True, scrape_all() still calls scrape_page at least once."""
        settings = {
            'name': 'test',
            'base_url': 'https://example.com/',
            'selectors': {'main_content': 'article'},
            'skip_llms_txt': True,
        }
        start_dir = os.getcwd()
        with tempfile.TemporaryDirectory() as scratch:
            try:
                os.chdir(scratch)
                converter = DocToSkillConverter(settings, dry_run=False)

                # Record every URL handed to scrape_page; the stand-in returns None.
                visited = []
                page_patch = patch.object(
                    converter,
                    'scrape_page',
                    side_effect=lambda url: visited.append(url),
                )
                summary_patch = patch.object(converter, 'save_summary')
                with page_patch, summary_patch:
                    converter.scrape_all()

                # At least one scrape attempt must have been made.
                self.assertTrue(len(visited) > 0)
            finally:
                # Leave the scratch directory before it is removed.
                os.chdir(start_dir)
# Run the unittest runner when this module is executed directly as a script.
if __name__ == "__main__":
    unittest.main()