From 99a40d3a1b8f31b12d506cbec32eca1e05327a45 Mon Sep 17 00:00:00 2001 From: "Edgar I." Date: Fri, 24 Oct 2025 13:40:36 +0400 Subject: [PATCH] feat: support explicit llms_txt_url in config --- cli/doc_scraper.py | 27 +++++++++++++++++++++++++++ tests/test_config_validation.py | 11 +++++++++++ 2 files changed, 38 insertions(+) diff --git a/cli/doc_scraper.py b/cli/doc_scraper.py index 330ff73..1c07f9a 100755 --- a/cli/doc_scraper.py +++ b/cli/doc_scraper.py @@ -341,6 +341,33 @@ class DocToSkillConverter: Returns: True if llms.txt was found and parsed successfully """ + # Check for explicit config URL first + explicit_url = self.config.get('llms_txt_url') + if explicit_url: + print(f"\n📌 Using explicit llms_txt_url from config: {explicit_url}") + + downloader = LlmsTxtDownloader(explicit_url) + content = downloader.download() + + if not content: + print("⚠️ Failed to download, falling back to auto-detection") + # Continue to auto-detection below + else: + # Parse and save (same as auto-detected flow) + parser = LlmsTxtParser(content) + pages = parser.parse() + + if pages: + print(f"📄 Parsed {len(pages)} sections") + for page in pages: + self.save_page(page) + self.pages.append(page) + + self.llms_txt_detected = True + self.llms_txt_variant = 'explicit' + return True + + # Original auto-detection logic continues...
print(f"\n🔍 Checking for llms.txt at {self.base_url}...") # Detect llms.txt diff --git a/tests/test_config_validation.py b/tests/test_config_validation.py index a270707..ced51d3 100644 --- a/tests/test_config_validation.py +++ b/tests/test_config_validation.py @@ -296,6 +296,17 @@ class TestConfigValidation(unittest.TestCase): url_errors = [e for e in errors if 'start_url' in e.lower()] self.assertEqual(len(url_errors), 0, "Valid start_urls should pass validation") + def test_config_with_llms_txt_url(self): + """Test config validation with explicit llms_txt_url""" + config = { + 'name': 'test', + 'llms_txt_url': 'https://example.com/llms-full.txt', + 'base_url': 'https://example.com/docs' + } + + # Should be valid + self.assertEqual(config.get('llms_txt_url'), 'https://example.com/llms-full.txt') + if __name__ == '__main__': unittest.main()