#!/usr/bin/env python3
"""
Test suite for doc_scraper core features
Tests URL validation, language detection, pattern extraction, and categorization
"""

import os
import sys
import unittest

from bs4 import BeautifulSoup

# Add parent directory to path
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from skill_seekers.cli.doc_scraper import DocToSkillConverter

# NOTE(review): the HTML fixtures below must contain real markup (<code>,
# <pre>, <article>, <a href=...>) because every test locates its element with
# BeautifulSoup's find(); the exact attribute values were chosen to satisfy
# each test's assertions -- confirm against the project's canonical fixtures.


class TestURLValidation(unittest.TestCase):
    """Test URL validation logic"""

    def setUp(self):
        """Set up test converter"""
        self.config = {
            "name": "test",
            "base_url": "https://docs.example.com/",
            "url_patterns": {"include": ["/guide/", "/api/"], "exclude": ["/blog/", "/about/"]},
            "selectors": {"main_content": "article", "title": "h1", "code_blocks": "pre code"},
            "rate_limit": 0.1,
            "max_pages": 10,
        }
        # dry_run=True is passed for every converter in this module.
        self.converter = DocToSkillConverter(self.config, dry_run=True)

    def test_valid_url_with_include_pattern(self):
        """Test URL matching include pattern"""
        url = "https://docs.example.com/guide/getting-started"
        self.assertTrue(self.converter.is_valid_url(url))

    def test_valid_url_with_api_pattern(self):
        """Test URL matching API pattern"""
        url = "https://docs.example.com/api/reference"
        self.assertTrue(self.converter.is_valid_url(url))

    def test_invalid_url_with_exclude_pattern(self):
        """Test URL matching exclude pattern"""
        url = "https://docs.example.com/blog/announcement"
        self.assertFalse(self.converter.is_valid_url(url))

    def test_invalid_url_different_domain(self):
        """Test URL from different domain"""
        url = "https://other-site.com/guide/tutorial"
        self.assertFalse(self.converter.is_valid_url(url))

    def test_invalid_url_no_include_match(self):
        """Test URL not matching any include pattern"""
        url = "https://docs.example.com/download/installer"
        self.assertFalse(self.converter.is_valid_url(url))

    def test_url_validation_no_patterns(self):
        """Test URL validation with no include/exclude patterns"""
        config = {
            "name": "test",
            "base_url": "https://docs.example.com/",
            "url_patterns": {"include": [], "exclude": []},
            "selectors": {"main_content": "article", "title": "h1", "code_blocks": "pre"},
            "rate_limit": 0.1,
            "max_pages": 10,
        }
        converter = DocToSkillConverter(config, dry_run=True)

        # Should accept any URL under base_url
        self.assertTrue(converter.is_valid_url("https://docs.example.com/anything"))
        self.assertFalse(converter.is_valid_url("https://other.com/anything"))


class TestLanguageDetection(unittest.TestCase):
    """Test language detection from code blocks"""

    def setUp(self):
        """Set up test converter"""
        config = {
            "name": "test",
            "base_url": "https://example.com/",
            "selectors": {"main_content": "article", "title": "h1", "code_blocks": "pre"},
            "rate_limit": 0.1,
            "max_pages": 10,
        }
        self.converter = DocToSkillConverter(config, dry_run=True)

    def test_detect_language_from_class(self):
        """Test language detection from CSS class"""
        html = '<code class="language-python">print("hello")</code>'
        elem = BeautifulSoup(html, "html.parser").find("code")
        lang = self.converter.detect_language(elem, 'print("hello")')
        self.assertEqual(lang, "python")

    def test_detect_language_from_lang_class(self):
        """Test language detection from lang- prefix"""
        html = '<code class="lang-javascript">console.log("hello")</code>'
        elem = BeautifulSoup(html, "html.parser").find("code")
        lang = self.converter.detect_language(elem, 'console.log("hello")')
        self.assertEqual(lang, "javascript")

    def test_detect_language_from_parent(self):
        """Test language detection from parent pre element"""
        # The class lives on the <pre>, not on the <code> handed to the detector.
        html = '<pre class="language-cpp"><code>int main() {}</code></pre>'
        elem = BeautifulSoup(html, "html.parser").find("code")
        lang = self.converter.detect_language(elem, "int main() {}")
        self.assertEqual(lang, "cpp")

    def test_detect_python_from_heuristics(self):
        """Test Python detection from code content"""
        html = "<code>import os\nfrom pathlib import Path</code>"
        elem = BeautifulSoup(html, "html.parser").find("code")
        code = elem.get_text()
        lang = self.converter.detect_language(elem, code)
        self.assertEqual(lang, "python")

    def test_detect_python_from_def(self):
        """Test Python detection from def keyword"""
        html = "<code>def my_function():\n pass</code>"
        elem = BeautifulSoup(html, "html.parser").find("code")
        code = elem.get_text()
        lang = self.converter.detect_language(elem, code)
        self.assertEqual(lang, "python")

    def test_detect_javascript_from_const(self):
        """Test JavaScript detection from const keyword"""
        html = "<code>const myVar = 10;</code>"
        elem = BeautifulSoup(html, "html.parser").find("code")
        code = elem.get_text()
        lang = self.converter.detect_language(elem, code)
        self.assertEqual(lang, "javascript")

    def test_detect_javascript_from_arrow(self):
        """Test JavaScript detection from arrow function"""
        # '>' is escaped so the fixture is well-formed HTML; get_text() yields '=>'.
        html = "<code>const add = (a, b) =&gt; a + b;</code>"
        elem = BeautifulSoup(html, "html.parser").find("code")
        code = elem.get_text()
        lang = self.converter.detect_language(elem, code)
        self.assertEqual(lang, "javascript")

    def test_detect_gdscript(self):
        """Test GDScript detection"""
        html = "<code>func _ready():\n var x = 5</code>"
        elem = BeautifulSoup(html, "html.parser").find("code")
        code = elem.get_text()
        lang = self.converter.detect_language(elem, code)
        self.assertEqual(lang, "gdscript")

    def test_detect_cpp(self):
        """Test C++ detection"""
        # The header name is entity-escaped; a raw <iostream> would be parsed as
        # an HTML tag and vanish from get_text().
        html = "<code>#include &lt;iostream&gt;\nint main() { return 0; }</code>"
        elem = BeautifulSoup(html, "html.parser").find("code")
        code = elem.get_text()
        lang = self.converter.detect_language(elem, code)
        self.assertEqual(lang, "cpp")

    def test_detect_unknown(self):
        """Test unknown language detection"""
        html = "<code>some random text without clear indicators</code>"
        elem = BeautifulSoup(html, "html.parser").find("code")
        code = elem.get_text()
        lang = self.converter.detect_language(elem, code)
        self.assertEqual(lang, "unknown")

    def test_detect_brush_pattern_in_pre(self):
        """Test brush: pattern in pre element"""
        html = '<pre class="brush: python"><code>x</code></pre>'
        elem = BeautifulSoup(html, "html.parser").find("code")
        lang = self.converter.detect_language(elem, "x")
        self.assertEqual(lang, "python", "Should detect python from brush: python pattern")

    def test_detect_bare_class_in_pre(self):
        """Test bare class name in pre element"""
        html = '<pre class="python"><code>x</code></pre>'
        elem = BeautifulSoup(html, "html.parser").find("code")
        lang = self.converter.detect_language(elem, "x")
        self.assertEqual(lang, "python", "Should detect python from bare class name")

    def test_detect_bare_class_in_code(self):
        """Test bare class name in code element"""
        html = '<code class="python">x</code>'
        elem = BeautifulSoup(html, "html.parser").find("code")
        lang = self.converter.detect_language(elem, "x")
        self.assertEqual(lang, "python", "Should detect python from bare class name")

    def test_detect_csharp_from_using_system(self):
        """Test C# detection from 'using System' keyword"""
        html = "<code>using System;\nnamespace MyApp { }</code>"
        elem = BeautifulSoup(html, "html.parser").find("code")
        code = elem.get_text()
        lang = self.converter.detect_language(elem, code)
        self.assertEqual(lang, "csharp", "Should detect C# from using System")

    def test_detect_csharp_from_namespace(self):
        """Test C# detection from 'namespace' keyword"""
        html = "<code>namespace MyNamespace\n{\n public class Test { }\n}</code>"
        elem = BeautifulSoup(html, "html.parser").find("code")
        code = elem.get_text()
        lang = self.converter.detect_language(elem, code)
        self.assertEqual(lang, "csharp", "Should detect C# from namespace")

    def test_detect_csharp_from_property_syntax(self):
        """Test C# detection from property syntax"""
        html = "<code>public string Name { get; set; }</code>"
        elem = BeautifulSoup(html, "html.parser").find("code")
        code = elem.get_text()
        lang = self.converter.detect_language(elem, code)
        self.assertEqual(lang, "csharp", "Should detect C# from { get; set; } syntax")

    def test_detect_csharp_from_public_class(self):
        """Test C# detection from 'public class' keyword"""
        html = "<code>public class MyClass\n{\n private int value;\n}</code>"
        elem = BeautifulSoup(html, "html.parser").find("code")
        code = elem.get_text()
        lang = self.converter.detect_language(elem, code)
        self.assertEqual(lang, "csharp", "Should detect C# from public class")

    def test_detect_csharp_from_private_class(self):
        """Test C# detection from 'private class' keyword"""
        html = "<code>private class Helper { }</code>"
        elem = BeautifulSoup(html, "html.parser").find("code")
        code = elem.get_text()
        lang = self.converter.detect_language(elem, code)
        self.assertEqual(lang, "csharp", "Should detect C# from private class")

    def test_detect_csharp_from_public_static_void(self):
        """Test C# detection from 'public static void' keyword"""
        html = (
            "<code>public static void Main(string[] args)\n"
            '{\n Console.WriteLine("Test");\n}</code>'
        )
        elem = BeautifulSoup(html, "html.parser").find("code")
        code = elem.get_text()
        lang = self.converter.detect_language(elem, code)
        self.assertEqual(lang, "csharp", "Should detect C# from public static void")

    def test_detect_csharp_from_class_attribute(self):
        """Test C# detection from CSS class attribute"""
        # The snippet itself is language-neutral; only the class names C#.
        html = '<code class="language-csharp">var x = 5;</code>'
        elem = BeautifulSoup(html, "html.parser").find("code")
        code = elem.get_text()
        lang = self.converter.detect_language(elem, code)
        self.assertEqual(lang, "csharp", "Should detect C# from language-csharp class")


class TestPatternExtraction(unittest.TestCase):
    """Test pattern extraction from documentation"""

    def setUp(self):
        """Set up test converter"""
        config = {
            "name": "test",
            "base_url": "https://example.com/",
            "selectors": {"main_content": "article", "title": "h1", "code_blocks": "pre"},
            "rate_limit": 0.1,
            "max_pages": 10,
        }
        self.converter = DocToSkillConverter(config, dry_run=True)

    def test_extract_pattern_with_example_marker(self):
        """Test pattern extraction with 'Example:' marker"""
        html = """
        <article>
            <p>Example: Here's how to use it</p>
            <pre><code>print("hello")</code></pre>
        </article>
        """
        soup = BeautifulSoup(html, "html.parser")
        main = soup.find("article")
        patterns = self.converter.extract_patterns(main, [])
        self.assertGreater(len(patterns), 0)
        self.assertIn("example", patterns[0]["description"].lower())

    def test_extract_pattern_with_usage_marker(self):
        """Test pattern extraction with 'Usage:' marker"""
        html = """
        <article>
            <p>Usage: Call this function like so</p>
            <pre><code>my_function(arg)</code></pre>
        </article>
        """
        soup = BeautifulSoup(html, "html.parser")
        main = soup.find("article")
        patterns = self.converter.extract_patterns(main, [])
        self.assertGreater(len(patterns), 0)
        self.assertIn("usage", patterns[0]["description"].lower())

    def test_extract_pattern_limit(self):
        """Test pattern extraction limits to 5 patterns"""
        # Ten marker/code pairs: twice the cap asserted below.
        snippets = "".join(
            f"<p>Example {i}: Test</p><pre><code>code_{i}</code></pre>" for i in range(10)
        )
        html = f"<article>{snippets}</article>"
        soup = BeautifulSoup(html, "html.parser")
        main = soup.find("article")
        patterns = self.converter.extract_patterns(main, [])
        self.assertLessEqual(len(patterns), 5, "Should limit to 5 patterns max")


class TestCategorization(unittest.TestCase):
    """Test smart categorization logic"""

    def setUp(self):
        """Set up test converter"""
        config = {
            "name": "test",
            "base_url": "https://example.com/",
            "categories": {
                "getting_started": ["intro", "tutorial", "getting-started"],
                "api": ["api", "reference", "class"],
                "guides": ["guide", "how-to"],
            },
            "selectors": {"main_content": "article", "title": "h1", "code_blocks": "pre"},
            "rate_limit": 0.1,
            "max_pages": 10,
        }
        self.converter = DocToSkillConverter(config, dry_run=True)

    def test_categorize_by_url(self):
        """Test categorization based on URL"""
        pages = [
            {
                "url": "https://example.com/api/reference",
                "title": "Some Title",
                "content": "Some content",
            }
        ]
        categories = self.converter.smart_categorize(pages)

        # Should categorize to 'api' based on URL containing 'api'
        self.assertIn("api", categories)
        self.assertEqual(len(categories["api"]), 1)

    def test_categorize_by_title(self):
        """Test categorization based on title"""
        pages = [
            {
                "url": "https://example.com/docs/page",
                "title": "API Reference Documentation",
                "content": "Some content",
            }
        ]
        categories = self.converter.smart_categorize(pages)

        self.assertIn("api", categories)
        self.assertEqual(len(categories["api"]), 1)

    def test_categorize_by_content(self):
        """Test categorization based on content (lower priority)"""
        pages = [
            {
                "url": "https://example.com/docs/page",
                "title": "Some Page",
                "content": "This is a tutorial for beginners. An intro to the system.",
            }
        ]
        categories = self.converter.smart_categorize(pages)

        # Should categorize based on 'tutorial' and 'intro' in content
        self.assertIn("getting_started", categories)

    def test_categorize_to_other(self):
        """Test pages that don't match any category go to 'other'"""
        pages = [
            {
                "url": "https://example.com/random/page",
                "title": "Random Page",
                "content": "Random content with no keywords",
            }
        ]
        categories = self.converter.smart_categorize(pages)

        self.assertIn("other", categories)
        self.assertEqual(len(categories["other"]), 1)

    def test_empty_categories_removed(self):
        """Test empty categories are removed"""
        pages = [
            {
                "url": "https://example.com/api/reference",
                "title": "API Reference",
                "content": "API documentation",
            }
        ]
        categories = self.converter.smart_categorize(pages)

        # Only 'api' should exist, not empty 'guides' or 'getting_started'
        # (categories with no pages are removed)
        self.assertIn("api", categories)
        self.assertNotIn("guides", categories)


class TestLinkExtraction(unittest.TestCase):
    """Test link extraction and anchor fragment handling"""

    def setUp(self):
        """Set up test converter"""
        config = {
            "name": "test",
            "base_url": "https://example.com/",
            "selectors": {"main_content": "article", "title": "h1", "code_blocks": "pre code"},
            "url_patterns": {"include": [], "exclude": []},
            "rate_limit": 0.1,
            "max_pages": 10,
        }
        self.converter = DocToSkillConverter(config, dry_run=True)

    def test_extract_links_strips_anchor_fragments(self):
        """Test that anchor fragments (#anchor) are stripped from extracted links"""
        html = """
        <article>
            <h1>Test Page</h1>
            <p>Content with links</p>
            <a href="https://example.com/docs/page.html#section1">Link 1</a>
            <a href="https://example.com/docs/page.html#section2">Link 2</a>
            <a href="https://example.com/docs/other.html">Link 3</a>
        </article>
        """
        soup = BeautifulSoup(html, "html.parser")
        page = self.converter.extract_content(soup, "https://example.com/")

        # Should have 2 unique URLs (page.html and other.html), not 3
        # The two links with different anchors should be deduplicated
        self.assertEqual(len(page["links"]), 2)
        self.assertIn("https://example.com/docs/page.html", page["links"])
        self.assertIn("https://example.com/docs/other.html", page["links"])

    def test_extract_links_no_anchor_duplicates(self):
        """Test that multiple anchor links to same page don't create duplicates"""
        html = """
        <article>
            <h1>Test Page</h1>
            <a href="https://example.com/docs/api.html#method1">Anchor 1</a>
            <a href="https://example.com/docs/api.html#method2">Anchor 2</a>
            <a href="https://example.com/docs/api.html#method3">Anchor 3</a>
            <a href="https://example.com/docs/api.html#method4">Anchor 4</a>
            <a href="https://example.com/docs/api.html#method5">Anchor 5</a>
        </article>
        """
        soup = BeautifulSoup(html, "html.parser")
        page = self.converter.extract_content(soup, "https://example.com/")

        # All 5 links point to the same page, should result in only 1 URL
        self.assertEqual(len(page["links"]), 1)
        self.assertEqual(page["links"][0], "https://example.com/docs/api.html")

    def test_extract_links_preserves_query_params(self):
        """Test that query parameters are preserved when stripping anchors"""
        html = """
        <article>
            <h1>Test Page</h1>
            <a href="https://example.com/search?q=test#result1">Search Result</a>
        </article>
        """
        soup = BeautifulSoup(html, "html.parser")
        page = self.converter.extract_content(soup, "https://example.com/")

        # Query params should be preserved, only anchor stripped
        self.assertEqual(len(page["links"]), 1)
        self.assertEqual(page["links"][0], "https://example.com/search?q=test")

    def test_extract_links_relative_urls_with_anchors(self):
        """Test that relative URLs with anchors are handled correctly"""
        # Root-relative hrefs; the assertions expect them resolved against
        # the page URL passed to extract_content().
        html = """
        <article>
            <h1>Test Page</h1>
            <a href="/docs/guide.html#intro">Relative Link 1</a>
            <a href="/docs/guide.html#advanced">Relative Link 2</a>
            <a href="/docs/tutorial.html#start">Relative Link 3</a>
        </article>
        """
        soup = BeautifulSoup(html, "html.parser")
        page = self.converter.extract_content(soup, "https://example.com/")

        # Should have 2 unique URLs (guide.html and tutorial.html)
        self.assertEqual(len(page["links"]), 2)
        self.assertIn("https://example.com/docs/guide.html", page["links"])
        self.assertIn("https://example.com/docs/tutorial.html", page["links"])


class TestTextCleaning(unittest.TestCase):
    """Test text cleaning utility"""

    def setUp(self):
        """Set up test converter"""
        config = {
            "name": "test",
            "base_url": "https://example.com/",
            "selectors": {"main_content": "article", "title": "h1", "code_blocks": "pre"},
            "rate_limit": 0.1,
            "max_pages": 10,
        }
        self.converter = DocToSkillConverter(config, dry_run=True)

    def test_clean_multiple_spaces(self):
        """Test cleaning multiple spaces"""
        # The input must actually contain runs of spaces for this test to
        # exercise anything.
        text = "Hello    world     test"
        cleaned = self.converter.clean_text(text)
        self.assertEqual(cleaned, "Hello world test")

    def test_clean_newlines(self):
        """Test cleaning newlines"""
        text = "Hello\n\nworld\ntest"
        cleaned = self.converter.clean_text(text)
        self.assertEqual(cleaned, "Hello world test")

    def test_clean_tabs(self):
        """Test cleaning tabs"""
        text = "Hello\t\tworld\ttest"
        cleaned = self.converter.clean_text(text)
        self.assertEqual(cleaned, "Hello world test")

    def test_clean_strip_whitespace(self):
        """Test stripping leading/trailing whitespace"""
        text = " Hello world "
        cleaned = self.converter.clean_text(text)
        self.assertEqual(cleaned, "Hello world")


if __name__ == "__main__":
    unittest.main()