diff --git a/cli/doc_scraper.py b/cli/doc_scraper.py index 35b43e5..ebf22f1 100755 --- a/cli/doc_scraper.py +++ b/cli/doc_scraper.py @@ -51,6 +51,7 @@ class DocToSkillConverter: # llms.txt detection state self.llms_txt_detected = False self.llms_txt_variant = None + self.llms_txt_variants = [] # Track all downloaded variants # Parallel scraping config self.workers = config.get('workers', 1) @@ -337,10 +338,13 @@ class DocToSkillConverter: def _try_llms_txt(self) -> bool: """ Try to use llms.txt instead of HTML scraping. + Downloads ALL available variants and stores with .md extension. Returns: - True if llms.txt was found and parsed successfully + True if llms.txt was found and processed successfully """ + print(f"\nšŸ” Checking for llms.txt at {self.base_url}...") + # Check for explicit config URL first explicit_url = self.config.get('llms_txt_url') if explicit_url: @@ -349,16 +353,21 @@ class DocToSkillConverter: downloader = LlmsTxtDownloader(explicit_url) content = downloader.download() - if not content: - print("āš ļø Failed to download, falling back to auto-detection") - # Continue to auto-detection below - else: - # Parse and save (same as auto-detected flow) + if content: + # Save with proper .md extension + filename = downloader.get_proper_filename() + filepath = os.path.join(self.skill_dir, "references", filename) + os.makedirs(os.path.dirname(filepath), exist_ok=True) + + with open(filepath, 'w', encoding='utf-8') as f: + f.write(content) + print(f" šŸ’¾ Saved {filename} ({len(content)} chars)") + + # Parse and save pages parser = LlmsTxtParser(content) pages = parser.parse() if pages: - print(f"šŸ“„ Parsed {len(pages)} sections") for page in pages: self.save_page(page) self.pages.append(page) @@ -367,46 +376,68 @@ class DocToSkillConverter: self.llms_txt_variant = 'explicit' return True - # Original auto-detection logic continues... 
- print(f"\nšŸ” Checking for llms.txt at {self.base_url}...") - - # Detect llms.txt + # Auto-detection: Find ALL variants detector = LlmsTxtDetector(self.base_url) - result = detector.detect() + variants = detector.detect_all() - if not result: + if not variants: print("ā„¹ļø No llms.txt found, using HTML scraping") return False - print(f"āœ… Found {result['variant']} llms.txt: {result['url']}") + print(f"āœ… Found {len(variants)} llms.txt variant(s)") - # Download content - downloader = LlmsTxtDownloader(result['url']) - content = downloader.download() + # Download ALL variants + downloaded = {} + for variant_info in variants: + url = variant_info['url'] + variant = variant_info['variant'] - if not content: - print("āš ļø Failed to download llms.txt, falling back to HTML scraping") + print(f" šŸ“„ Downloading {variant}...") + downloader = LlmsTxtDownloader(url) + content = downloader.download() + + if content: + filename = downloader.get_proper_filename() + downloaded[variant] = { + 'content': content, + 'filename': filename, + 'size': len(content) + } + print(f" āœ“ {filename} ({len(content)} chars)") + + if not downloaded: + print("āš ļø Failed to download any variants, falling back to HTML scraping") return False - print(f"šŸ“„ Downloaded {len(content)} characters") + # Save ALL variants to references/ + os.makedirs(os.path.join(self.skill_dir, "references"), exist_ok=True) - # Parse into pages - parser = LlmsTxtParser(content) + for variant, data in downloaded.items(): + filepath = os.path.join(self.skill_dir, "references", data['filename']) + with open(filepath, 'w', encoding='utf-8') as f: + f.write(data['content']) + print(f" šŸ’¾ Saved {data['filename']}") + + # Parse LARGEST variant for skill building + largest = max(downloaded.items(), key=lambda x: x[1]['size']) + print(f"\nšŸ“„ Parsing {largest[1]['filename']} for skill building...") + + parser = LlmsTxtParser(largest[1]['content']) pages = parser.parse() if not pages: print("āš ļø Failed to parse 
llms.txt, falling back to HTML scraping") return False - print(f"šŸ“„ Parsed {len(pages)} sections") + print(f" āœ“ Parsed {len(pages)} sections") - # Save pages + # Save pages for skill building for page in pages: self.save_page(page) self.pages.append(page) self.llms_txt_detected = True - self.llms_txt_variant = result['variant'] + self.llms_txt_variants = list(downloaded.keys()) return True diff --git a/tests/test_integration.py b/tests/test_integration.py index 2135b63..bc0f9f6 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -527,6 +527,70 @@ app.use('*', cors()) self.assertIn('llms_txt_detected', summary) self.assertTrue(summary['llms_txt_detected']) + def test_multi_variant_download(self): + """Test downloading all 3 llms.txt variants""" + from unittest.mock import patch, Mock + + config = { + 'name': 'test-multi-variant', + 'base_url': 'https://hono.dev/docs', + 'selectors': { + 'main_content': 'article', + 'title': 'h1', + 'code_blocks': 'pre code' + }, + 'max_pages': 50 + } + + # Mock all 3 variants + sample_full = "# Full\n" + "x" * 1000 + sample_standard = "# Standard\n" + "x" * 200 + sample_small = "# Small\n" + "x" * 500 + + with patch('cli.llms_txt_detector.requests.head') as mock_head, \ + patch('cli.llms_txt_downloader.requests.get') as mock_get: + + # Mock detection (all exist) + mock_head_response = Mock() + mock_head_response.status_code = 200 + mock_head.return_value = mock_head_response + + # Mock downloads + def mock_download(url, **kwargs): + response = Mock() + response.status_code = 200 + if 'llms-full.txt' in url: + response.text = sample_full + elif 'llms-small.txt' in url: + response.text = sample_small + else: # llms.txt + response.text = sample_standard + response.raise_for_status = Mock() + return response + + mock_get.side_effect = mock_download + + # Run scraper + from cli.doc_scraper import DocToSkillConverter as DocumentationScraper + scraper = DocumentationScraper(config, dry_run=False) + result = 
scraper._try_llms_txt() + + # Verify all 3 files created + refs_dir = Path(f"output/{config['name']}/references") + + self.assertTrue(refs_dir.exists(), "references directory should exist") + self.assertTrue((refs_dir / 'llms-full.md').exists(), "llms-full.md should exist") + self.assertTrue((refs_dir / 'llms.md').exists(), "llms.md should exist") + self.assertTrue((refs_dir / 'llms-small.md').exists(), "llms-small.md should exist") + + # Verify content not truncated + full_content = (refs_dir / 'llms-full.md').read_text() + self.assertEqual(len(full_content), len(sample_full)) + + # Clean up + shutil.rmtree(f"output/{config['name']}_data", ignore_errors=True) + shutil.rmtree(f"output/{config['name']}", ignore_errors=True) + if __name__ == '__main__': unittest.main()