test: add e2e test for llms.txt workflow

2025-10-24 13:44:55 +04:00
parent 104818f983
commit 41d1846278
2 changed files with 185 additions and 1 deletions
--- a/cli/doc_scraper.py
+++ b/cli/doc_scraper.py
@@ -418,6 +418,7 @@ class DocToSkillConverter:
            llms_result = self._try_llms_txt()
            if llms_result:
                print(f"\n✅ Used llms.txt ({self.llms_txt_variant}) - skipping HTML scraping")
+                self.save_summary()
                return

        # HTML scraping (original logic)
@@ -568,9 +569,11 @@ class DocToSkillConverter:
            'name': self.name,
            'total_pages': len(self.pages),
            'base_url': self.base_url,
+            'llms_txt_detected': self.llms_txt_detected,
+            'llms_txt_variant': self.llms_txt_variant,
            'pages': [{'title': p['title'], 'url': p['url']} for p in self.pages]
        }
-        
+
        with open(f"{self.data_dir}/summary.json", 'w', encoding='utf-8') as f:
            json.dump(summary, f, indent=2, ensure_ascii=False)
    
--- a/tests/test_integration.py
+++ b/tests/test_integration.py
@@ -347,5 +347,186 @@ class TestContentExtraction(unittest.TestCase):
        self.assertEqual(page['code_samples'][0]['language'], 'python')


+class TestFullLlmsTxtWorkflow(unittest.TestCase):
+    """Test complete llms.txt workflow with mocked HTTP requests"""
+
+    def setUp(self):
+        """Set up test configuration and temporary directory"""
+        self.temp_dir = tempfile.mkdtemp()
+        self.config = {
+            'name': 'test-e2e-llms',
+            'base_url': 'https://hono.dev/docs',
+            'llms_txt_url': 'https://hono.dev/llms-full.txt',
+            'selectors': {
+                'main_content': 'article',
+                'title': 'h1',
+                'code_blocks': 'pre code'
+            },
+            'max_pages': 50
+        }
+
+        # Sample llms.txt content for testing
+        self.sample_llms_content = """# Getting Started
+
+Welcome to the framework documentation. This is the introduction section.
+
+## Installation
+
+To install the framework, run the following command:
+
+```bash
+npm install hono
+```
+
+## Quick Start
+
+Create a simple application:
+
+```javascript
+import { Hono } from 'hono'
+
+const app = new Hono()
+
+app.get('/', (c) => {
+  return c.text('Hello World!')
+})
+
+export default app
+```
+
+# API Reference
+
+This section covers the API documentation for the framework.
+
+## Context
+
+The context object provides request and response handling:
+
+```typescript
+interface Context {
+  req: Request
+  res: Response
+  text: (text: string) => Response
+}
+```
+
+# Middleware
+
+Middleware functions run before route handlers.
+
+## Built-in Middleware
+
+The framework provides several built-in middleware functions:
+
+```javascript
+import { logger, cors } from 'hono/middleware'
+
+app.use('*', logger())
+app.use('*', cors())
+```
+"""
+
+    def tearDown(self):
+        """Clean up temporary directory and test output"""
+        shutil.rmtree(self.temp_dir, ignore_errors=True)
+        # Clean up test output directories
+        shutil.rmtree(f"output/{self.config['name']}_data", ignore_errors=True)
+        shutil.rmtree(f"output/{self.config['name']}", ignore_errors=True)
+
+    def test_full_llms_txt_workflow(self):
+        """Test complete workflow: config -> scrape (llms.txt) -> build -> verify"""
+        from unittest.mock import patch, MagicMock
+        import requests
+
+        # Mock the requests.get call for downloading llms.txt
+        with patch('cli.llms_txt_downloader.requests.get') as mock_get:
+            # Configure mock response
+            mock_response = MagicMock()
+            mock_response.status_code = 200
+            mock_response.text = self.sample_llms_content
+            mock_response.raise_for_status = MagicMock()
+            mock_get.return_value = mock_response
+
+            # Create scraper and scrape
+            scraper = DocToSkillConverter(self.config, dry_run=False)
+            scraper.scrape_all()
+
+            # Verify llms.txt was detected
+            self.assertTrue(scraper.llms_txt_detected,
+                          "llms.txt should be detected")
+            self.assertEqual(scraper.llms_txt_variant, 'explicit',
+                           "Should use explicit variant from config")
+
+            # Verify pages were parsed
+            self.assertGreater(len(scraper.pages), 0,
+                             "Should have parsed pages from llms.txt")
+
+            # Verify page structure
+            self.assertTrue(all('title' in page for page in scraper.pages),
+                          "All pages should have titles")
+            self.assertTrue(all('content' in page for page in scraper.pages),
+                          "All pages should have content")
+            self.assertTrue(any(len(page.get('code_samples', [])) > 0
+                              for page in scraper.pages),
+                          "At least one page should have code samples")
+
+            # Verify code samples have language detection
+            pages_with_code = [p for p in scraper.pages
+                             if len(p.get('code_samples', [])) > 0]
+            if pages_with_code:
+                sample = pages_with_code[0]['code_samples'][0]
+                self.assertIn('language', sample,
+                            "Code samples should have language field")
+                self.assertIn('code', sample,
+                            "Code samples should have code field")
+
+            # Build skill
+            scraper.build_skill()
+
+            # Verify SKILL.md exists
+            skill_md_path = Path(f"output/{self.config['name']}/SKILL.md")
+            self.assertTrue(skill_md_path.exists(),
+                          "SKILL.md should be created")
+
+            # Verify SKILL.md content
+            skill_content = skill_md_path.read_text()
+            self.assertIn(self.config['name'], skill_content,
+                        "SKILL.md should contain skill name")
+            self.assertGreater(len(skill_content), 100,
+                             "SKILL.md should have substantial content")
+
+            # Verify references directory exists
+            refs_dir = Path(f"output/{self.config['name']}/references")
+            self.assertTrue(refs_dir.exists(),
+                          "references directory should exist")
+
+            # Verify at least index.md was created
+            index_md = refs_dir / 'index.md'
+            self.assertTrue(index_md.exists(),
+                          "references/index.md should exist")
+
+            # Verify reference files have content
+            ref_files = list(refs_dir.glob('*.md'))
+            self.assertGreater(len(ref_files), 0,
+                             "Should have at least one reference file")
+
+            # Verify data directory was created and has summary
+            data_dir = Path(f"output/{self.config['name']}_data")
+            self.assertTrue(data_dir.exists(),
+                          "Data directory should exist")
+
+            summary_path = data_dir / 'summary.json'
+            self.assertTrue(summary_path.exists(),
+                          "summary.json should exist")
+
+            # Verify summary content
+            with open(summary_path) as f:
+                summary = json.load(f)
+                self.assertEqual(summary['name'], self.config['name'])
+                self.assertGreater(summary['total_pages'], 0)
+                self.assertIn('llms_txt_detected', summary)
+                self.assertTrue(summary['llms_txt_detected'])
+
+
 if __name__ == '__main__':
    unittest.main()