feat: add skip_llms_txt config option to bypass llms.txt detection

- Add skip_llms_txt config option (default: False)
- Validate value is boolean, warn and default to False if not
- Support in both sync and async scraping modes
- Add 17 tests for config, behavior, and edge cases
This commit is contained in:
sogoiii
2025-11-20 13:55:46 -08:00
parent 4cbd0a0a3c
commit a0b1c2f42f
3 changed files with 358 additions and 4 deletions

View File

@@ -307,6 +307,30 @@ class TestConfigValidation(unittest.TestCase):
# Should be valid
self.assertEqual(config.get('llms_txt_url'), 'https://example.com/llms-full.txt')
def test_config_with_skip_llms_txt(self):
    """A config that sets skip_llms_txt to True validates without errors.

    After validation the option must still read back as truthy from the
    same config dict.
    """
    settings = {
        'name': 'test',
        'base_url': 'https://example.com/docs',
        'skip_llms_txt': True,
    }

    # validate_config returns an (errors, warnings) pair; only the errors
    # are checked by this test.
    found_errors, _found_warnings = validate_config(settings)

    self.assertEqual(found_errors, [])
    skip_flag = settings.get('skip_llms_txt')
    self.assertTrue(skip_flag)
def test_config_with_skip_llms_txt_false(self):
    """A config that sets skip_llms_txt to False validates without errors.

    After validation the option must still read back as falsy from the
    same config dict.
    """
    settings = {
        'name': 'test',
        'base_url': 'https://example.com/docs',
        'skip_llms_txt': False,
    }

    # validate_config returns an (errors, warnings) pair; only the errors
    # are checked by this test.
    found_errors, _found_warnings = validate_config(settings)

    self.assertEqual(found_errors, [])
    skip_flag = settings.get('skip_llms_txt')
    self.assertFalse(skip_flag)
# Run this module's tests via unittest when the file is executed directly.
if __name__ == '__main__':
unittest.main()