diff --git a/cli/doc_scraper.py b/cli/doc_scraper.py index 1c07f9a..35b43e5 100755 --- a/cli/doc_scraper.py +++ b/cli/doc_scraper.py @@ -418,6 +418,7 @@ class DocToSkillConverter: llms_result = self._try_llms_txt() if llms_result: print(f"\n✅ Used llms.txt ({self.llms_txt_variant}) - skipping HTML scraping") + self.save_summary() return # HTML scraping (original logic) @@ -568,9 +569,11 @@ class DocToSkillConverter: 'name': self.name, 'total_pages': len(self.pages), 'base_url': self.base_url, + 'llms_txt_detected': self.llms_txt_detected, + 'llms_txt_variant': self.llms_txt_variant, 'pages': [{'title': p['title'], 'url': p['url']} for p in self.pages] } - + with open(f"{self.data_dir}/summary.json", 'w', encoding='utf-8') as f: json.dump(summary, f, indent=2, ensure_ascii=False) diff --git a/tests/test_integration.py b/tests/test_integration.py index 88f7268..2135b63 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -347,5 +347,186 @@ class TestContentExtraction(unittest.TestCase): self.assertEqual(page['code_samples'][0]['language'], 'python') +class TestFullLlmsTxtWorkflow(unittest.TestCase): + """Test complete llms.txt workflow with mocked HTTP requests""" + + def setUp(self): + """Set up test configuration and temporary directory""" + self.temp_dir = tempfile.mkdtemp() + self.config = { + 'name': 'test-e2e-llms', + 'base_url': 'https://hono.dev/docs', + 'llms_txt_url': 'https://hono.dev/llms-full.txt', + 'selectors': { + 'main_content': 'article', + 'title': 'h1', + 'code_blocks': 'pre code' + }, + 'max_pages': 50 + } + + # Sample llms.txt content for testing + self.sample_llms_content = """# Getting Started + +Welcome to the framework documentation. This is the introduction section. + +## Installation + +To install the framework, run the following command: + +```bash +npm install hono +``` + +## Quick Start + +Create a simple application: + +```javascript +import { Hono } from 'hono' + +const app = new Hono() + +app.get('/', (c) => { + return c.text('Hello World!') +}) + +export default app +``` + +# API Reference + +This section covers the API documentation for the framework. + +## Context + +The context object provides request and response handling: + +```typescript +interface Context { + req: Request + res: Response + text: (text: string) => Response +} +``` + +# Middleware + +Middleware functions run before route handlers. + +## Built-in Middleware + +The framework provides several built-in middleware functions: + +```javascript +import { logger, cors } from 'hono/middleware' + +app.use('*', logger()) +app.use('*', cors()) +``` +""" + + def tearDown(self): + """Clean up temporary directory and test output""" + shutil.rmtree(self.temp_dir, ignore_errors=True) + # Clean up test output directories + shutil.rmtree(f"output/{self.config['name']}_data", ignore_errors=True) + shutil.rmtree(f"output/{self.config['name']}", ignore_errors=True) + + def test_full_llms_txt_workflow(self): + """Test complete workflow: config -> scrape (llms.txt) -> build -> verify""" + from unittest.mock import patch, MagicMock + import requests + + # Mock the requests.get call for downloading llms.txt + with patch('cli.llms_txt_downloader.requests.get') as mock_get: + # Configure mock response + mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.text = self.sample_llms_content + mock_response.raise_for_status = MagicMock() + mock_get.return_value = mock_response + + # Create scraper and scrape + scraper = DocToSkillConverter(self.config, dry_run=False) + scraper.scrape_all() + + # Verify llms.txt was detected + self.assertTrue(scraper.llms_txt_detected, + "llms.txt should be detected") + self.assertEqual(scraper.llms_txt_variant, 'explicit', + "Should use explicit variant from config") + + # Verify pages were parsed + self.assertGreater(len(scraper.pages), 0, + "Should have parsed pages from llms.txt") + + # Verify page structure + self.assertTrue(all('title' in page for page in scraper.pages), + "All pages should have titles") + self.assertTrue(all('content' in page for page in scraper.pages), + "All pages should have content") + self.assertTrue(any(len(page.get('code_samples', [])) > 0 + for page in scraper.pages), + "At least one page should have code samples") + + # Verify code samples have language detection + pages_with_code = [p for p in scraper.pages + if len(p.get('code_samples', [])) > 0] + if pages_with_code: + sample = pages_with_code[0]['code_samples'][0] + self.assertIn('language', sample, + "Code samples should have language field") + self.assertIn('code', sample, + "Code samples should have code field") + + # Build skill + scraper.build_skill() + + # Verify SKILL.md exists + skill_md_path = Path(f"output/{self.config['name']}/SKILL.md") + self.assertTrue(skill_md_path.exists(), + "SKILL.md should be created") + + # Verify SKILL.md content + skill_content = skill_md_path.read_text() + self.assertIn(self.config['name'], skill_content, + "SKILL.md should contain skill name") + self.assertGreater(len(skill_content), 100, + "SKILL.md should have substantial content") + + # Verify references directory exists + refs_dir = Path(f"output/{self.config['name']}/references") + self.assertTrue(refs_dir.exists(), + "references directory should exist") + + # Verify at least index.md was created + index_md = refs_dir / 'index.md' + self.assertTrue(index_md.exists(), + "references/index.md should exist") + + # Verify reference files have content + ref_files = list(refs_dir.glob('*.md')) + self.assertGreater(len(ref_files), 0, + "Should have at least one reference file") + + # Verify data directory was created and has summary + data_dir = Path(f"output/{self.config['name']}_data") + self.assertTrue(data_dir.exists(), + "Data directory should exist") + + summary_path = data_dir / 'summary.json' + self.assertTrue(summary_path.exists(), + "summary.json should exist") + + # Verify summary content + with open(summary_path) as f: + summary = json.load(f) + self.assertEqual(summary['name'], self.config['name']) + self.assertGreater(summary['total_pages'], 0) + self.assertIn('llms_txt_detected', summary) + self.assertTrue(summary['llms_txt_detected']) + + if __name__ == '__main__': unittest.main()