Page Title
This is some content.
This is more content with sufficient length to be included.
print("hello")
#!/usr/bin/env python3
"""
Integration tests for doc_scraper

Tests complete workflows and dry-run mode
"""

import json
import os
import shutil
import sys
import tempfile
import unittest
from pathlib import Path

# Add parent directory to path so the package resolves when tests run from the repo root.
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from skill_seekers.cli.config_validator import ConfigValidator
from skill_seekers.cli.doc_scraper import DocToSkillConverter, load_config, validate_config


class TestDryRunMode(unittest.TestCase):
    """Test dry-run mode functionality"""

    def setUp(self):
        """Set up test configuration"""
        self.config = {
            "name": "test-dry-run",
            "base_url": "https://example.com/",
            "selectors": {"main_content": "article", "title": "h1", "code_blocks": "pre code"},
            "url_patterns": {"include": [], "exclude": []},
            "rate_limit": 0.1,
            "max_pages": 10,
        }

    def test_dry_run_no_directories_created(self):
        """Test that dry-run mode doesn't create directories"""
        _converter = DocToSkillConverter(self.config, dry_run=True)

        # Check directories were NOT created
        data_dir = Path(f"output/{self.config['name']}_data")
        skill_dir = Path(f"output/{self.config['name']}")
        self.assertFalse(data_dir.exists(), "Dry-run should not create data directory")
        self.assertFalse(skill_dir.exists(), "Dry-run should not create skill directory")

    def test_dry_run_flag_set(self):
        """Test that dry_run flag is properly set"""
        converter = DocToSkillConverter(self.config, dry_run=True)
        self.assertTrue(converter.dry_run)

        converter_normal = DocToSkillConverter(self.config, dry_run=False)
        self.assertFalse(converter_normal.dry_run)

        # Clean up: the dry_run=False constructor above may have created output dirs.
        shutil.rmtree(f"output/{self.config['name']}_data", ignore_errors=True)
        shutil.rmtree(f"output/{self.config['name']}", ignore_errors=True)

    def test_normal_mode_creates_directories(self):
        """Test that normal mode creates directories"""
        _converter = DocToSkillConverter(self.config, dry_run=False)

        # Check directories WERE created
        data_dir = Path(f"output/{self.config['name']}_data")
        skill_dir = Path(f"output/{self.config['name']}")
        self.assertTrue(data_dir.exists(), "Normal mode should create data directory")
        self.assertTrue(skill_dir.exists(), "Normal mode should create skill directory")

        # Clean up
        shutil.rmtree(data_dir, ignore_errors=True)
        shutil.rmtree(skill_dir, ignore_errors=True)


class TestConfigLoading(unittest.TestCase):
    """Test configuration loading and validation"""

    def setUp(self):
        """Set up temporary directory for test configs"""
        self.temp_dir = tempfile.mkdtemp()

    def tearDown(self):
        """Clean up temporary directory"""
        shutil.rmtree(self.temp_dir, ignore_errors=True)

    def test_load_valid_config(self):
        """Test loading a valid configuration file (unified format)"""
        config_data = {
            "name": "test-config",
            "description": "Test configuration",
            "sources": [
                {
                    "type": "documentation",
                    "base_url": "https://example.com/",
                    "selectors": {
                        "main_content": "article",
                        "title": "h1",
                        "code_blocks": "pre code",
                    },
                    "rate_limit": 0.5,
                    "max_pages": 100,
                }
            ],
        }

        config_path = Path(self.temp_dir) / "test.json"
        with open(config_path, "w") as f:
            json.dump(config_data, f)

        loaded_config = load_config(str(config_path))
        self.assertEqual(loaded_config["name"], "test-config")
        self.assertEqual(len(loaded_config["sources"]), 1)
        self.assertEqual(loaded_config["sources"][0]["base_url"], "https://example.com/")

    def test_load_invalid_json(self):
        """Test loading an invalid JSON file"""
        config_path = Path(self.temp_dir) / "invalid.json"
        with open(config_path, "w") as f:
            f.write("{ invalid json }")

        with self.assertRaises(SystemExit):
            load_config(str(config_path))

    def test_load_nonexistent_file(self):
        """Test loading a nonexistent file"""
        config_path = Path(self.temp_dir) / "nonexistent.json"

        with self.assertRaises(SystemExit):
            load_config(str(config_path))

    def test_load_config_with_validation_errors(self):
        """Test loading a config with validation errors - must be missing required fields"""
        # Legacy validator is lenient, only checks for presence of fields, not format.
        # To trigger validation error, we need a config that's missing required fields entirely.
        config_data = {
            "description": "Test config",
            # Missing both 'base_url' and 'repo' - cannot detect type
        }

        config_path = Path(self.temp_dir) / "invalid_config.json"
        with open(config_path, "w") as f:
            json.dump(config_data, f)

        with self.assertRaises(SystemExit):
            load_config(str(config_path))


class TestRealConfigFiles(unittest.TestCase):
    """Test that real config files in the repository are valid"""

    def test_godot_config(self):
        """Test Godot config is valid - uses unified format"""
        config_path = "configs/godot.json"
        if os.path.exists(config_path):
            # Godot config uses unified format (sources array), use ConfigValidator
            validator = ConfigValidator(config_path)
            try:
                validator.validate()
                # If we get here, validation passed
                self.assertTrue(True)
            except ValueError as e:
                self.fail(f"Godot config validation failed: {e}")

    def test_react_config(self):
        """Test React config is valid"""
        config_path = "configs/react.json"
        if os.path.exists(config_path):
            config = load_config(config_path)
            errors, _ = validate_config(config)
            self.assertEqual(len(errors), 0, f"React config should be valid, got errors: {errors}")

    def test_vue_config(self):
        """Test Vue config is valid"""
        config_path = "configs/vue.json"
        if os.path.exists(config_path):
            config = load_config(config_path)
            errors, _ = validate_config(config)
            self.assertEqual(len(errors), 0, f"Vue config should be valid, got errors: {errors}")

    def test_django_config(self):
        """Test Django config is valid"""
        config_path = "configs/django.json"
        if os.path.exists(config_path):
            config = load_config(config_path)
            errors, _ = validate_config(config)
            self.assertEqual(len(errors), 0, f"Django config should be valid, got errors: {errors}")

    def test_fastapi_config(self):
        """Test FastAPI config is valid"""
        config_path = "configs/fastapi.json"
        if os.path.exists(config_path):
            config = load_config(config_path)
            errors, _ = validate_config(config)
            self.assertEqual(
                len(errors), 0, f"FastAPI config should be valid, got errors: {errors}"
            )

    def test_steam_economy_config(self):
        """Test Steam Economy config is valid"""
        config_path = "configs/steam-economy-complete.json"
        if os.path.exists(config_path):
            config = load_config(config_path)
            errors, _ = validate_config(config)
            self.assertEqual(
                len(errors), 0, f"Steam Economy config should be valid, got errors: {errors}"
            )


class TestURLProcessing(unittest.TestCase):
    """Test URL processing and validation"""

    def test_url_normalization(self):
        """Test URL normalization in converter"""
        config = {
            "name": "test",
            "base_url": "https://example.com/",
            "selectors": {"main_content": "article", "title": "h1", "code_blocks": "pre"},
            "url_patterns": {"include": [], "exclude": []},
            "rate_limit": 0.1,
            "max_pages": 10,
        }

        converter = DocToSkillConverter(config, dry_run=True)

        # Base URL should be stored correctly
        self.assertEqual(converter.base_url, "https://example.com/")

    def test_start_urls_fallback(self):
        """Test that start_urls defaults to base_url"""
        config = {
            "name": "test",
            "base_url": "https://example.com/",
            "selectors": {"main_content": "article", "title": "h1", "code_blocks": "pre"},
            "rate_limit": 0.1,
            "max_pages": 10,
        }

        converter = DocToSkillConverter(config, dry_run=True)

        # Should have base_url in pending_urls
        self.assertEqual(len(converter.pending_urls), 1)
        self.assertEqual(converter.pending_urls[0], "https://example.com/")

    def test_multiple_start_urls(self):
        """Test multiple start URLs"""
        config = {
            "name": "test",
            "base_url": "https://example.com/",
            "start_urls": [
                "https://example.com/guide/",
                "https://example.com/api/",
                "https://example.com/tutorial/",
            ],
            "selectors": {"main_content": "article", "title": "h1", "code_blocks": "pre"},
            "rate_limit": 0.1,
            "max_pages": 10,
        }

        converter = DocToSkillConverter(config, dry_run=True)

        # Should have all start URLs in pending_urls
        self.assertEqual(len(converter.pending_urls), 3)


class TestLlmsTxtIntegration(unittest.TestCase):
    """Test llms.txt integration into scraping workflow"""

    def test_scraper_has_llms_txt_attributes(self):
        """Test that scraper has llms.txt detection attributes"""
        config = {
            "name": "test-llms",
            "base_url": "https://hono.dev/docs",
            "selectors": {"main_content": "article", "title": "h1", "code_blocks": "pre code"},
            "max_pages": 50,
        }

        scraper = DocToSkillConverter(config, dry_run=True)

        # Should have llms.txt attributes
        self.assertFalse(scraper.llms_txt_detected)
        self.assertIsNone(scraper.llms_txt_variant)

    def test_scraper_has_try_llms_txt_method(self):
        """Test that scraper has _try_llms_txt method"""
        config = {
            "name": "test-llms",
            "base_url": "https://hono.dev/docs",
            "selectors": {"main_content": "article", "title": "h1", "code_blocks": "pre code"},
            "max_pages": 50,
        }

        scraper = DocToSkillConverter(config, dry_run=True)

        # Should have _try_llms_txt method
        self.assertTrue(hasattr(scraper, "_try_llms_txt"))
        self.assertTrue(callable(scraper._try_llms_txt))


class TestContentExtraction(unittest.TestCase):
    """Test content extraction functionality"""

    def setUp(self):
        """Set up test converter"""
        config = {
            "name": "test",
            "base_url": "https://example.com/",
            "selectors": {"main_content": "article", "title": "h1", "code_blocks": "pre code"},
            "url_patterns": {"include": [], "exclude": []},
            "rate_limit": 0.1,
            "max_pages": 10,
        }
        self.converter = DocToSkillConverter(config, dry_run=True)

    def test_extract_empty_content(self):
        """Test extracting from empty HTML"""
        from bs4 import BeautifulSoup

        # NOTE(review): the original fixture markup was stripped during file
        # recovery; an empty document with no <article>/<h1>/<pre> exercises
        # the same "nothing extracted" path the assertions below verify.
        html = "<html><body></body></html>"
        soup = BeautifulSoup(html, "html.parser")

        page = self.converter.extract_content(soup, "https://example.com/test")

        self.assertEqual(page["url"], "https://example.com/test")
        self.assertEqual(page["title"], "")
        self.assertEqual(page["content"], "")
        self.assertEqual(len(page["code_samples"]), 0)

    # NOTE(review): test_extract_basic_content was truncated in the recovered
    # source (its HTML fixture referenced "Page Title", two paragraphs, and a
    # print("hello") code block, but the assertions were cut off). Restore it
    # from version control rather than reconstructing it by guesswork.