Merge PR #198: Skip llms.txt Config Option
Merges feat/add-skip-llm-to-config by @sogoiii.

This PR adds a valuable configuration option to explicitly skip llms.txt detection, which is useful when a site's llms.txt is incomplete or incorrect, or when specific HTML scraping is needed.

Key features:
- New 'skip_llms_txt' config option (default: false, backward compatible)
- Boolean type validation with a warning for invalid values
- Support in both sync and async scraping modes
- 17 comprehensive tests (15 feature tests + 2 config validation tests)

All tests pass after fixing import paths to use proper package names.

Test results: ✅ 17/17 tests passing
Full test suite: ✅ 391 tests passing

Co-authored-by: sogoiii <sogoiii@users.noreply.github.com>
This commit is contained in:
@@ -307,6 +307,30 @@ class TestConfigValidation(unittest.TestCase):
|
||||
# Should be valid
|
||||
self.assertEqual(config.get('llms_txt_url'), 'https://example.com/llms-full.txt')
|
||||
|
||||
def test_config_with_skip_llms_txt(self):
    """A config that sets skip_llms_txt to True validates without errors."""
    cfg = {
        'name': 'test',
        'base_url': 'https://example.com/docs',
        'skip_llms_txt': True
    }

    # Only the error list is asserted on; warnings are intentionally ignored.
    errors, _warnings = validate_config(cfg)

    self.assertEqual(errors, [])
    self.assertTrue(cfg.get('skip_llms_txt'))
|
||||
|
||||
def test_config_with_skip_llms_txt_false(self):
    """A config that sets skip_llms_txt to False validates without errors."""
    cfg = {
        'name': 'test',
        'base_url': 'https://example.com/docs',
        'skip_llms_txt': False
    }

    # Only the error list is asserted on; warnings are intentionally ignored.
    errors, _warnings = validate_config(cfg)

    self.assertEqual(errors, [])
    self.assertFalse(cfg.get('skip_llms_txt'))
|
||||
|
||||
|
||||
# Allow running this test module directly as a script.
if __name__ == '__main__':
    unittest.main()
|
||||
|
||||
318
tests/test_skip_llms_txt.py
Normal file
318
tests/test_skip_llms_txt.py
Normal file
@@ -0,0 +1,318 @@
|
||||
"""Tests for skip_llms_txt configuration option.
|
||||
|
||||
This config option allows users to explicitly skip llms.txt detection and fetching,
|
||||
which is useful when:
|
||||
- A site's llms.txt is incomplete or incorrect
|
||||
- You need specific pages not in llms.txt
|
||||
- You want to force HTML scraping
|
||||
"""
|
||||
|
||||
import os
|
||||
import tempfile
|
||||
import unittest
|
||||
import logging
|
||||
from unittest.mock import patch, Mock, MagicMock
|
||||
|
||||
from skill_seekers.cli.doc_scraper import DocToSkillConverter
|
||||
|
||||
|
||||
class TestSkipLlmsTxtConfig(unittest.TestCase):
    """Check the skip_llms_txt attribute DocToSkillConverter derives from its config."""

    @staticmethod
    def _build_config(**extra):
        """Return a fresh minimal config dict with ``extra`` keys appended."""
        settings = {
            'name': 'test',
            'base_url': 'https://example.com/',
            'selectors': {'main_content': 'article'},
        }
        settings.update(extra)
        return settings

    def test_default_skip_llms_txt_is_false(self):
        """Omitting skip_llms_txt from the config yields a falsy attribute."""
        scraper = DocToSkillConverter(self._build_config(), dry_run=True)
        self.assertFalse(scraper.skip_llms_txt)

    def test_skip_llms_txt_can_be_set_true(self):
        """An explicit True in the config is reflected on the converter."""
        settings = self._build_config(skip_llms_txt=True)
        scraper = DocToSkillConverter(settings, dry_run=True)
        self.assertTrue(scraper.skip_llms_txt)

    def test_skip_llms_txt_can_be_set_false(self):
        """An explicit False in the config is reflected on the converter."""
        settings = self._build_config(skip_llms_txt=False)
        scraper = DocToSkillConverter(settings, dry_run=True)
        self.assertFalse(scraper.skip_llms_txt)
|
||||
|
||||
|
||||
class TestSkipLlmsTxtSyncBehavior(unittest.TestCase):
    """Test skip_llms_txt behavior in sync scraping mode."""

    def _enter_temp_cwd(self):
        """Run the remainder of the current test inside a fresh temp directory.

        Cleanups registered with ``addCleanup`` run in reverse order, so the
        original working directory is restored before the temporary directory
        is removed, even when the test fails.
        """
        workdir = tempfile.TemporaryDirectory()
        self.addCleanup(workdir.cleanup)
        previous_cwd = os.getcwd()
        os.chdir(workdir.name)
        self.addCleanup(os.chdir, previous_cwd)

    @staticmethod
    def _config(skip_llms_txt):
        """Return a fresh minimal config with the given skip_llms_txt value."""
        return {
            'name': 'test',
            'base_url': 'https://example.com/',
            'selectors': {'main_content': 'article'},
            'skip_llms_txt': skip_llms_txt,
        }

    def test_llms_txt_tried_when_not_skipped(self):
        """Test that _try_llms_txt is called when skip_llms_txt is False."""
        self._enter_temp_cwd()
        converter = DocToSkillConverter(self._config(False), dry_run=False)

        # Page scraping and summary writing are stubbed out so that only the
        # llms.txt decision made by scrape_all() is observed.
        with patch.object(converter, '_try_llms_txt', return_value=False) as mock_try, \
                patch.object(converter, 'scrape_page'), \
                patch.object(converter, 'save_summary'):
            converter.scrape_all()
            mock_try.assert_called_once()

    def test_llms_txt_skipped_when_skip_true(self):
        """Test that _try_llms_txt is NOT called when skip_llms_txt is True."""
        self._enter_temp_cwd()
        converter = DocToSkillConverter(self._config(True), dry_run=False)

        with patch.object(converter, '_try_llms_txt') as mock_try, \
                patch.object(converter, 'scrape_page'), \
                patch.object(converter, 'save_summary'):
            converter.scrape_all()
            mock_try.assert_not_called()

    def test_llms_txt_skipped_in_dry_run_mode(self):
        """Test that _try_llms_txt is NOT called in dry-run mode regardless of skip setting."""
        self._enter_temp_cwd()
        # skip_llms_txt is False here on purpose: dry-run alone must prevent the call.
        converter = DocToSkillConverter(self._config(False), dry_run=True)

        with patch.object(converter, '_try_llms_txt') as mock_try, \
                patch.object(converter, 'save_summary'):
            converter.scrape_all()
            mock_try.assert_not_called()
|
||||
|
||||
|
||||
class TestSkipLlmsTxtAsyncBehavior(unittest.TestCase):
    """Test skip_llms_txt behavior in async scraping mode."""

    def _enter_temp_cwd(self):
        """Run the remainder of the current test inside a fresh temp directory.

        Cleanups registered with ``addCleanup`` run in reverse order, so the
        original working directory is restored before the temporary directory
        is removed, even when the test fails.
        """
        workdir = tempfile.TemporaryDirectory()
        self.addCleanup(workdir.cleanup)
        previous_cwd = os.getcwd()
        os.chdir(workdir.name)
        self.addCleanup(os.chdir, previous_cwd)

    @staticmethod
    def _config(skip_llms_txt):
        """Return a fresh async-mode config with the given skip_llms_txt value."""
        return {
            'name': 'test',
            'base_url': 'https://example.com/',
            'selectors': {'main_content': 'article'},
            'async_mode': True,
            'skip_llms_txt': skip_llms_txt,
        }

    def test_async_llms_txt_tried_when_not_skipped(self):
        """Test that _try_llms_txt is called in async mode when skip_llms_txt is False."""
        self._enter_temp_cwd()
        converter = DocToSkillConverter(self._config(False), dry_run=False)

        # Async page scraping and summary writing are stubbed out so that only
        # the llms.txt decision made by scrape_all() is observed.
        with patch.object(converter, '_try_llms_txt', return_value=False) as mock_try, \
                patch.object(converter, 'scrape_page_async', return_value=None), \
                patch.object(converter, 'save_summary'):
            converter.scrape_all()
            mock_try.assert_called_once()

    def test_async_llms_txt_skipped_when_skip_true(self):
        """Test that _try_llms_txt is NOT called in async mode when skip_llms_txt is True."""
        self._enter_temp_cwd()
        converter = DocToSkillConverter(self._config(True), dry_run=False)

        with patch.object(converter, '_try_llms_txt') as mock_try, \
                patch.object(converter, 'scrape_page_async', return_value=None), \
                patch.object(converter, 'save_summary'):
            converter.scrape_all()
            mock_try.assert_not_called()
|
||||
|
||||
|
||||
class TestSkipLlmsTxtWithRealConfig(unittest.TestCase):
    """Exercise skip_llms_txt with configs shaped like real-world ones."""

    def test_telegram_bots_config_pattern(self):
        """A telegram-bots style config with skip_llms_txt enabled is honoured."""
        entry_points = [
            'https://core.telegram.org/bots',
            'https://core.telegram.org/bots/api'
        ]
        css_selectors = {
            'main_content': '#dev_page_content, main, article',
            'title': 'h1, title',
            'code_blocks': 'pre code, pre'
        }
        settings = {
            'name': 'telegram-bots',
            'description': 'Telegram bot documentation',
            'base_url': 'https://core.telegram.org/bots',
            # Telegram doesn't have useful llms.txt
            'skip_llms_txt': True,
            'start_urls': entry_points,
            'selectors': css_selectors
        }

        scraper = DocToSkillConverter(settings, dry_run=True)

        self.assertTrue(scraper.skip_llms_txt)
        self.assertEqual(scraper.name, 'telegram-bots')

    def test_skip_llms_txt_with_multiple_start_urls(self):
        """skip_llms_txt stays enabled when several start URLs are configured."""
        entry_points = [
            'https://example.com/docs/',
            'https://example.com/api/',
            'https://example.com/guide/'
        ]
        settings = {
            'name': 'test-multi',
            'base_url': 'https://example.com/',
            'selectors': {'main_content': 'article'},
            'skip_llms_txt': True,
            'start_urls': entry_points
        }

        scraper = DocToSkillConverter(settings, dry_run=True)

        self.assertTrue(scraper.skip_llms_txt)
        # Each of the three configured start URLs is expected in pending_urls.
        self.assertEqual(len(scraper.pending_urls), 3)
|
||||
|
||||
|
||||
class TestSkipLlmsTxtEdgeCases(unittest.TestCase):
    """Test edge cases for skip_llms_txt."""

    @staticmethod
    def _config(skip_llms_txt):
        """Return a fresh minimal config with the given skip_llms_txt value."""
        return {
            'name': 'test',
            'base_url': 'https://example.com/',
            'selectors': {'main_content': 'article'},
            'skip_llms_txt': skip_llms_txt,
        }

    def _assert_invalid_value_falls_back(self, invalid_value, logged_fragment):
        """Assert a non-boolean value logs a warning and leaves the flag falsy.

        Args:
            invalid_value: Non-boolean value placed under ``skip_llms_txt``.
            logged_fragment: Text expected in the same warning record as
                ``'Invalid value'``.
        """
        with self.assertLogs('skill_seekers.cli.doc_scraper', level='WARNING') as captured:
            converter = DocToSkillConverter(self._config(invalid_value), dry_run=True)
            self.assertFalse(converter.skip_llms_txt)

        self.assertTrue(
            any('Invalid value' in record and logged_fragment in record
                for record in captured.output),
            f"no 'Invalid value' warning mentioning {logged_fragment!r} in {captured.output!r}",
        )

    def test_skip_llms_txt_with_int_zero_logs_warning(self):
        """Test that integer 0 logs warning and defaults to False."""
        self._assert_invalid_value_falls_back(0, '0')

    def test_skip_llms_txt_with_int_one_logs_warning(self):
        """Test that integer 1 logs warning and defaults to False."""
        self._assert_invalid_value_falls_back(1, '1')

    def test_skip_llms_txt_with_string_logs_warning(self):
        """Test that string values log warning and default to False."""
        self._assert_invalid_value_falls_back("true", 'true')

    def test_skip_llms_txt_with_none_logs_warning(self):
        """Test that None logs warning and defaults to False."""
        self._assert_invalid_value_falls_back(None, 'None')

    def test_scraping_proceeds_when_llms_txt_skipped(self):
        """Test that HTML scraping proceeds normally when llms.txt is skipped."""
        # Work inside a throwaway directory. Cleanups run in reverse order, so
        # the original cwd is restored before the directory is deleted.
        workdir = tempfile.TemporaryDirectory()
        self.addCleanup(workdir.cleanup)
        previous_cwd = os.getcwd()
        os.chdir(workdir.name)
        self.addCleanup(os.chdir, previous_cwd)

        converter = DocToSkillConverter(self._config(True), dry_run=False)

        # Record every URL handed to scrape_page; list.append returns None,
        # matching the stub's original return value.
        scraped_urls = []

        with patch.object(converter, 'scrape_page', side_effect=scraped_urls.append), \
                patch.object(converter, 'save_summary'):
            converter.scrape_all()

        # Should have attempted to scrape the base URL
        self.assertGreater(len(scraped_urls), 0)
|
||||
|
||||
|
||||
# Allow running this test module directly as a script.
if __name__ == '__main__':
    unittest.main()
|
||||
Reference in New Issue
Block a user