#!/usr/bin/env python3
"""
Test suite for doc_scraper core features

Tests URL validation, language detection, pattern extraction, and categorization.

NOTE(review): the HTML fixture strings in this file had their markup stripped
by a text-extraction pass (every `<...>` sequence was removed, and runs of
whitespace were collapsed). The markup below is reconstructed to the minimal
HTML each assertion requires — confirm against the scraper's selectors
(`main_content: article`, `title: h1`, `code_blocks: pre code`).
"""

import os
import sys
import unittest

from bs4 import BeautifulSoup

# Add parent directory to path so the package resolves when run from tests/.
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from skill_seekers.cli.doc_scraper import DocToSkillConverter


class TestURLValidation(unittest.TestCase):
    """Test URL validation logic"""

    def setUp(self):
        """Set up test converter"""
        self.config = {
            "name": "test",
            "base_url": "https://docs.example.com/",
            "url_patterns": {
                "include": ["/guide/", "/api/"],
                "exclude": ["/blog/", "/about/"],
            },
            "selectors": {"main_content": "article", "title": "h1", "code_blocks": "pre code"},
            "rate_limit": 0.1,
            "max_pages": 10,
        }
        self.converter = DocToSkillConverter(self.config, dry_run=True)

    def test_valid_url_with_include_pattern(self):
        """Test URL matching include pattern"""
        url = "https://docs.example.com/guide/getting-started"
        self.assertTrue(self.converter.is_valid_url(url))

    def test_valid_url_with_api_pattern(self):
        """Test URL matching API pattern"""
        url = "https://docs.example.com/api/reference"
        self.assertTrue(self.converter.is_valid_url(url))

    def test_invalid_url_with_exclude_pattern(self):
        """Test URL matching exclude pattern"""
        url = "https://docs.example.com/blog/announcement"
        self.assertFalse(self.converter.is_valid_url(url))

    def test_invalid_url_different_domain(self):
        """Test URL from different domain"""
        url = "https://other-site.com/guide/tutorial"
        self.assertFalse(self.converter.is_valid_url(url))

    def test_invalid_url_no_include_match(self):
        """Test URL not matching any include pattern"""
        url = "https://docs.example.com/download/installer"
        self.assertFalse(self.converter.is_valid_url(url))

    def test_url_validation_no_patterns(self):
        """Test URL validation with no include/exclude patterns"""
        config = {
            "name": "test",
            "base_url": "https://docs.example.com/",
            "url_patterns": {"include": [], "exclude": []},
            "selectors": {"main_content": "article", "title": "h1", "code_blocks": "pre"},
            "rate_limit": 0.1,
            "max_pages": 10,
        }
        converter = DocToSkillConverter(config, dry_run=True)
        # Should accept any URL under base_url
        self.assertTrue(converter.is_valid_url("https://docs.example.com/anything"))
        self.assertFalse(converter.is_valid_url("https://other.com/anything"))


class TestLanguageDetection(unittest.TestCase):
    """Test language detection from code blocks"""

    def setUp(self):
        """Set up test converter"""
        config = {
            "name": "test",
            "base_url": "https://example.com/",
            "selectors": {"main_content": "article", "title": "h1", "code_blocks": "pre"},
            "rate_limit": 0.1,
            "max_pages": 10,
        }
        self.converter = DocToSkillConverter(config, dry_run=True)

    def test_detect_language_from_class(self):
        """Test language detection from CSS class"""
        html = '<code class="language-python">print("hello")</code>'
        elem = BeautifulSoup(html, "html.parser").find("code")
        lang = self.converter.detect_language(elem, 'print("hello")')
        self.assertEqual(lang, "python")

    def test_detect_language_from_lang_class(self):
        """Test language detection from lang- prefix"""
        html = '<code class="lang-javascript">console.log("hello")</code>'
        elem = BeautifulSoup(html, "html.parser").find("code")
        lang = self.converter.detect_language(elem, 'console.log("hello")')
        self.assertEqual(lang, "javascript")

    def test_detect_language_from_parent(self):
        """Test language detection from parent pre element"""
        html = '<pre class="language-cpp"><code>int main() {}</code></pre>'
        elem = BeautifulSoup(html, "html.parser").find("code")
        lang = self.converter.detect_language(elem, "int main() {}")
        self.assertEqual(lang, "cpp")

    def test_detect_python_from_heuristics(self):
        """Test Python detection from code content"""
        html = "<code>import os\nfrom pathlib import Path</code>"
        elem = BeautifulSoup(html, "html.parser").find("code")
        code = elem.get_text()
        lang = self.converter.detect_language(elem, code)
        self.assertEqual(lang, "python")

    def test_detect_python_from_def(self):
        """Test Python detection from def keyword"""
        html = "<code>def my_function():\n    pass</code>"
        elem = BeautifulSoup(html, "html.parser").find("code")
        code = elem.get_text()
        lang = self.converter.detect_language(elem, code)
        self.assertEqual(lang, "python")

    def test_detect_javascript_from_const(self):
        """Test JavaScript detection from const keyword"""
        html = "<code>const myVar = 10;</code>"
        elem = BeautifulSoup(html, "html.parser").find("code")
        code = elem.get_text()
        lang = self.converter.detect_language(elem, code)
        self.assertEqual(lang, "javascript")

    def test_detect_javascript_from_arrow(self):
        """Test JavaScript detection from arrow function"""
        html = "<code>const add = (a, b) => a + b;</code>"
        elem = BeautifulSoup(html, "html.parser").find("code")
        code = elem.get_text()
        lang = self.converter.detect_language(elem, code)
        self.assertEqual(lang, "javascript")

    def test_detect_gdscript(self):
        """Test GDScript detection"""
        html = "<code>func _ready():\n    var x = 5</code>"
        elem = BeautifulSoup(html, "html.parser").find("code")
        code = elem.get_text()
        lang = self.converter.detect_language(elem, code)
        self.assertEqual(lang, "gdscript")

    def test_detect_cpp(self):
        """Test C++ detection"""
        # &lt;/&gt; entities keep the header name out of the HTML tag stream;
        # get_text() yields "#include <iostream>\nint main() { return 0; }".
        html = "<code>#include &lt;iostream&gt;\nint main() { return 0; }</code>"
        elem = BeautifulSoup(html, "html.parser").find("code")
        code = elem.get_text()
        lang = self.converter.detect_language(elem, code)
        self.assertEqual(lang, "cpp")

    def test_detect_unknown(self):
        """Test unknown language detection"""
        html = "<code>some random text without clear indicators</code>"
        elem = BeautifulSoup(html, "html.parser").find("code")
        code = elem.get_text()
        lang = self.converter.detect_language(elem, code)
        self.assertEqual(lang, "unknown")

    def test_detect_brush_pattern_in_pre(self):
        """Test brush: pattern in pre element"""
        html = '<pre class="brush: python"><code>x</code></pre>'
        elem = BeautifulSoup(html, "html.parser").find("code")
        lang = self.converter.detect_language(elem, "x")
        self.assertEqual(lang, "python", "Should detect python from brush: python pattern")

    def test_detect_bare_class_in_pre(self):
        """Test bare class name in pre element"""
        html = '<pre class="python"><code>x</code></pre>'
        elem = BeautifulSoup(html, "html.parser").find("code")
        lang = self.converter.detect_language(elem, "x")
        self.assertEqual(lang, "python", "Should detect python from bare class name")

    def test_detect_bare_class_in_code(self):
        """Test bare class name in code element"""
        html = '<code class="python">x</code>'
        elem = BeautifulSoup(html, "html.parser").find("code")
        lang = self.converter.detect_language(elem, "x")
        self.assertEqual(lang, "python", "Should detect python from bare class name")

    def test_detect_csharp_from_using_system(self):
        """Test C# detection from 'using System' keyword"""
        html = "<code>using System;\nnamespace MyApp { }</code>"
        elem = BeautifulSoup(html, "html.parser").find("code")
        code = elem.get_text()
        lang = self.converter.detect_language(elem, code)
        self.assertEqual(lang, "csharp", "Should detect C# from using System")

    def test_detect_csharp_from_namespace(self):
        """Test C# detection from 'namespace' keyword"""
        html = "<code>namespace MyNamespace\n{\n    public class Test { }\n}</code>"
        elem = BeautifulSoup(html, "html.parser").find("code")
        code = elem.get_text()
        lang = self.converter.detect_language(elem, code)
        self.assertEqual(lang, "csharp", "Should detect C# from namespace")

    def test_detect_csharp_from_property_syntax(self):
        """Test C# detection from property syntax"""
        html = "<code>public string Name { get; set; }</code>"
        elem = BeautifulSoup(html, "html.parser").find("code")
        code = elem.get_text()
        lang = self.converter.detect_language(elem, code)
        self.assertEqual(lang, "csharp", "Should detect C# from { get; set; } syntax")

    def test_detect_csharp_from_public_class(self):
        """Test C# detection from 'public class' keyword"""
        html = "<code>public class MyClass\n{\n    private int value;\n}</code>"
        elem = BeautifulSoup(html, "html.parser").find("code")
        code = elem.get_text()
        lang = self.converter.detect_language(elem, code)
        self.assertEqual(lang, "csharp", "Should detect C# from public class")

    def test_detect_csharp_from_private_class(self):
        """Test C# detection from 'private class' keyword"""
        html = "<code>private class Helper { }</code>"
        elem = BeautifulSoup(html, "html.parser").find("code")
        code = elem.get_text()
        lang = self.converter.detect_language(elem, code)
        self.assertEqual(lang, "csharp", "Should detect C# from private class")

    def test_detect_csharp_from_public_static_void(self):
        """Test C# detection from 'public static void' keyword"""
        html = (
            '<code>public static void Main(string[] args)\n'
            '{\n    Console.WriteLine("Test");\n}</code>'
        )
        elem = BeautifulSoup(html, "html.parser").find("code")
        code = elem.get_text()
        lang = self.converter.detect_language(elem, code)
        self.assertEqual(lang, "csharp", "Should detect C# from public static void")

    def test_detect_csharp_from_class_attribute(self):
        """Test C# detection from CSS class attribute"""
        html = '<code class="language-csharp">var x = 5;</code>'
        elem = BeautifulSoup(html, "html.parser").find("code")
        code = elem.get_text()
        lang = self.converter.detect_language(elem, code)
        self.assertEqual(lang, "csharp", "Should detect C# from language-csharp class")


class TestPatternExtraction(unittest.TestCase):
    """Test pattern extraction from documentation"""

    def setUp(self):
        """Set up test converter"""
        config = {
            "name": "test",
            "base_url": "https://example.com/",
            "selectors": {"main_content": "article", "title": "h1", "code_blocks": "pre"},
            "rate_limit": 0.1,
            "max_pages": 10,
        }
        self.converter = DocToSkillConverter(config, dry_run=True)

    def test_extract_pattern_with_example_marker(self):
        """Test pattern extraction with 'Example:' marker"""
        html = """<article>
<p>Example: Here's how to use it</p>
<pre><code>print("hello")</code></pre>
</article>"""
        soup = BeautifulSoup(html, "html.parser")
        main = soup.find("article")
        patterns = self.converter.extract_patterns(main, [])
        self.assertGreater(len(patterns), 0)
        self.assertIn("example", patterns[0]["description"].lower())

    def test_extract_pattern_with_usage_marker(self):
        """Test pattern extraction with 'Usage:' marker"""
        html = """<article>
<p>Usage: Call this function like so</p>
<pre><code>my_function(arg)</code></pre>
</article>"""
        soup = BeautifulSoup(html, "html.parser")
        main = soup.find("article")
        patterns = self.converter.extract_patterns(main, [])
        self.assertGreater(len(patterns), 0)
        self.assertIn("usage", patterns[0]["description"].lower())

    def test_extract_pattern_limit(self):
        """Test pattern extraction limits to 5 patterns"""
        html = "<article>"
        for i in range(10):
            html += f"<p>Example {i}: Test</p><pre><code>code_{i}</code></pre>"
        html += "</article>"
        soup = BeautifulSoup(html, "html.parser")
        main = soup.find("article")
        patterns = self.converter.extract_patterns(main, [])
        self.assertLessEqual(len(patterns), 5, "Should limit to 5 patterns max")


class TestCategorization(unittest.TestCase):
    """Test smart categorization logic"""

    def setUp(self):
        """Set up test converter"""
        config = {
            "name": "test",
            "base_url": "https://example.com/",
            "categories": {
                "getting_started": ["intro", "tutorial", "getting-started"],
                "api": ["api", "reference", "class"],
                "guides": ["guide", "how-to"],
            },
            "selectors": {"main_content": "article", "title": "h1", "code_blocks": "pre"},
            "rate_limit": 0.1,
            "max_pages": 10,
        }
        self.converter = DocToSkillConverter(config, dry_run=True)

    def test_categorize_by_url(self):
        """Test categorization based on URL"""
        pages = [
            {
                "url": "https://example.com/api/reference",
                "title": "Some Title",
                "content": "Some content",
            }
        ]
        categories = self.converter.smart_categorize(pages)
        # Should categorize to 'api' based on URL containing 'api'
        self.assertIn("api", categories)
        self.assertEqual(len(categories["api"]), 1)

    def test_categorize_by_title(self):
        """Test categorization based on title"""
        pages = [
            {
                "url": "https://example.com/docs/page",
                "title": "API Reference Documentation",
                "content": "Some content",
            }
        ]
        categories = self.converter.smart_categorize(pages)
        self.assertIn("api", categories)
        self.assertEqual(len(categories["api"]), 1)

    def test_categorize_by_content(self):
        """Test categorization based on content (lower priority)"""
        pages = [
            {
                "url": "https://example.com/docs/page",
                "title": "Some Page",
                "content": "This is a tutorial for beginners. An intro to the system.",
            }
        ]
        categories = self.converter.smart_categorize(pages)
        # Should categorize based on 'tutorial' and 'intro' in content
        self.assertIn("getting_started", categories)

    def test_categorize_to_other(self):
        """Test pages that don't match any category go to 'other'"""
        pages = [
            {
                "url": "https://example.com/random/page",
                "title": "Random Page",
                "content": "Random content with no keywords",
            }
        ]
        categories = self.converter.smart_categorize(pages)
        self.assertIn("other", categories)
        self.assertEqual(len(categories["other"]), 1)

    def test_empty_categories_removed(self):
        """Test empty categories are removed"""
        pages = [
            {
                "url": "https://example.com/api/reference",
                "title": "API Reference",
                "content": "API documentation",
            }
        ]
        categories = self.converter.smart_categorize(pages)
        # Only 'api' should exist, not empty 'guides' or 'getting_started'
        # (categories with no pages are removed)
        self.assertIn("api", categories)
        self.assertNotIn("guides", categories)


class TestLinkExtraction(unittest.TestCase):
    """Test link extraction and anchor fragment handling"""

    def setUp(self):
        """Set up test converter"""
        config = {
            "name": "test",
            "base_url": "https://example.com/",
            "selectors": {"main_content": "article", "title": "h1", "code_blocks": "pre code"},
            "url_patterns": {"include": [], "exclude": []},
            "rate_limit": 0.1,
            "max_pages": 10,
        }
        self.converter = DocToSkillConverter(config, dry_run=True)

    def test_extract_links_strips_anchor_fragments(self):
        """Test that anchor fragments (#anchor) are stripped from extracted links"""
        html = """<article>
<h1>Test Page</h1>
<p>Content with links</p>
<a href="/docs/page.html#section1">Link 1</a>
<a href="/docs/page.html#section2">Link 2</a>
<a href="/docs/other.html">Link 3</a>
</article>"""
        soup = BeautifulSoup(html, "html.parser")
        page = self.converter.extract_content(soup, "https://example.com/")
        # Should have 2 unique URLs (page.html and other.html), not 3
        # The two links with different anchors should be deduplicated
        self.assertEqual(len(page["links"]), 2)
        self.assertIn("https://example.com/docs/page.html", page["links"])
        self.assertIn("https://example.com/docs/other.html", page["links"])

    def test_extract_links_no_anchor_duplicates(self):
        """Test that multiple anchor links to same page don't create duplicates"""
        html = """<article>
<h1>Test Page</h1>
<a href="/docs/api.html#s1">Anchor 1</a>
<a href="/docs/api.html#s2">Anchor 2</a>
<a href="/docs/api.html#s3">Anchor 3</a>
<a href="/docs/api.html#s4">Anchor 4</a>
<a href="/docs/api.html#s5">Anchor 5</a>
</article>"""
        soup = BeautifulSoup(html, "html.parser")
        page = self.converter.extract_content(soup, "https://example.com/")
        # All 5 links point to the same page, should result in only 1 URL
        self.assertEqual(len(page["links"]), 1)
        self.assertEqual(page["links"][0], "https://example.com/docs/api.html")

    def test_extract_links_preserves_query_params(self):
        """Test that query parameters are preserved when stripping anchors"""
        html = """<article>
<h1>Test Page</h1>
<a href="/search?q=test#results">Search Result</a>
</article>"""
        soup = BeautifulSoup(html, "html.parser")
        page = self.converter.extract_content(soup, "https://example.com/")
        # Query params should be preserved, only anchor stripped
        self.assertEqual(len(page["links"]), 1)
        self.assertEqual(page["links"][0], "https://example.com/search?q=test")

    def test_extract_links_relative_urls_with_anchors(self):
        """Test that relative URLs with anchors are handled correctly"""
        html = """<article>
<h1>Test Page</h1>
<a href="/docs/guide.html#intro">Relative Link 1</a>
<a href="/docs/guide.html#setup">Relative Link 2</a>
<a href="/docs/tutorial.html#start">Relative Link 3</a>
</article>"""
        soup = BeautifulSoup(html, "html.parser")
        page = self.converter.extract_content(soup, "https://example.com/")
        # Should have 2 unique URLs (guide.html and tutorial.html)
        self.assertEqual(len(page["links"]), 2)
        self.assertIn("https://example.com/docs/guide.html", page["links"])
        self.assertIn("https://example.com/docs/tutorial.html", page["links"])


class TestTextCleaning(unittest.TestCase):
    """Test text cleaning utility"""

    def setUp(self):
        """Set up test converter"""
        config = {
            "name": "test",
            "base_url": "https://example.com/",
            "selectors": {"main_content": "article", "title": "h1", "code_blocks": "pre"},
            "rate_limit": 0.1,
            "max_pages": 10,
        }
        self.converter = DocToSkillConverter(config, dry_run=True)

    def test_clean_multiple_spaces(self):
        """Test cleaning multiple spaces"""
        text = "Hello    world  test"
        cleaned = self.converter.clean_text(text)
        self.assertEqual(cleaned, "Hello world test")

    def test_clean_newlines(self):
        """Test cleaning newlines"""
        text = "Hello\n\nworld\ntest"
        cleaned = self.converter.clean_text(text)
        self.assertEqual(cleaned, "Hello world test")

    def test_clean_tabs(self):
        """Test cleaning tabs"""
        text = "Hello\t\tworld\ttest"
        cleaned = self.converter.clean_text(text)
        self.assertEqual(cleaned, "Hello world test")

    def test_clean_strip_whitespace(self):
        """Test stripping leading/trailing whitespace"""
        text = "   Hello world   "
        cleaned = self.converter.clean_text(text)
        self.assertEqual(cleaned, "Hello world")


class TestSanitizeUrl(unittest.TestCase):
    """Test the shared sanitize_url utility (see issue #284)."""

    def test_no_brackets_unchanged(self):
        """URLs without brackets should pass through unchanged."""
        from skill_seekers.cli.utils import sanitize_url

        url = "https://docs.example.com/api/v1/users"
        self.assertEqual(sanitize_url(url), url)

    def test_brackets_in_path_encoded(self):
        """Square brackets in path should be percent-encoded."""
        from skill_seekers.cli.utils import sanitize_url

        result = sanitize_url("https://example.com/api/[v1]/users")
        self.assertEqual(result, "https://example.com/api/%5Bv1%5D/users")

    def test_brackets_in_query_encoded(self):
        """Square brackets in query should be percent-encoded."""
        from skill_seekers.cli.utils import sanitize_url

        result = sanitize_url("https://example.com/search?filter=[active]&sort=[name]")
        self.assertEqual(result, "https://example.com/search?filter=%5Bactive%5D&sort=%5Bname%5D")

    def test_host_not_affected(self):
        """Host portion should never be modified (IPv6 literals are valid there)."""
        from skill_seekers.cli.utils import sanitize_url

        # URL with brackets only in path, host stays intact
        result = sanitize_url("https://example.com/[v1]/ref")
        self.assertTrue(result.startswith("https://example.com/"))

    def test_already_encoded_brackets(self):
        """Already-encoded brackets should not be double-encoded."""
        from skill_seekers.cli.utils import sanitize_url

        url = "https://example.com/api/%5Bv1%5D/users"
        # No raw brackets present, should pass through unchanged
        self.assertEqual(sanitize_url(url), url)

    def test_empty_and_simple_urls(self):
        """Edge cases: empty string, simple URLs."""
        from skill_seekers.cli.utils import sanitize_url

        self.assertEqual(sanitize_url(""), "")
        self.assertEqual(sanitize_url("https://example.com"), "https://example.com")
        self.assertEqual(sanitize_url("https://example.com/"), "https://example.com/")

    def test_malformed_ipv6_url_no_crash(self):
        """URLs with brackets that look like broken IPv6 must not crash (issue #284).

        Python 3.14 raises ValueError from urlparse() on unencoded brackets
        that look like IPv6 but are malformed (e.g. from documentation examples).
        """
        from skill_seekers.cli.utils import sanitize_url

        # Incomplete IPv6 placeholder from docs.openclaw.ai llms-full.txt
        result = sanitize_url("http://[fdaa:x:x:x:x::x")
        self.assertNotIn("[", result)
        self.assertIn("%5B", result)

    def test_unmatched_bracket_no_crash(self):
        """Unmatched brackets should be encoded, not crash."""
        from skill_seekers.cli.utils import sanitize_url

        result = sanitize_url("https://example.com/api/[v1/users")
        self.assertNotIn("[", result)
        self.assertIn("%5B", result)


class TestEnqueueUrlSanitization(unittest.TestCase):
    """Test that _enqueue_url sanitises bracket URLs before enqueueing (#284)."""

    def setUp(self):
        """Set up test converter."""
        self.config = {
            "name": "test",
            "base_url": "https://docs.example.com/",
            "url_patterns": {"include": [], "exclude": []},
            "selectors": {"main_content": "article", "title": "h1", "code_blocks": "pre code"},
            "rate_limit": 0,
            "max_pages": 100,
        }
        self.converter = DocToSkillConverter(self.config, dry_run=True)

    def test_enqueue_sanitises_brackets(self):
        """_enqueue_url should percent-encode brackets before adding to queue."""
        self.converter._enqueue_url("https://docs.example.com/api/[v1]/users")
        # The URL in the queue should have encoded brackets
        queued_url = list(self.converter.pending_urls)[-1]
        self.assertNotIn("[", queued_url)
        self.assertNotIn("]", queued_url)
        self.assertIn("%5B", queued_url)
        self.assertIn("%5D", queued_url)

    def test_enqueue_dedup_with_encoded_brackets(self):
        """Encoded and raw bracket URLs should be treated as the same URL."""
        self.converter._enqueue_url("https://docs.example.com/api/[v1]/ref")
        initial_len = len(self.converter.pending_urls)
        # Enqueueing the same URL again (raw brackets) should be a no-op
        self.converter._enqueue_url("https://docs.example.com/api/[v1]/ref")
        self.assertEqual(len(self.converter.pending_urls), initial_len)

    def test_enqueue_normal_url_unchanged(self):
        """Normal URLs without brackets should pass through unchanged."""
        self.converter._enqueue_url("https://docs.example.com/guide/intro")
        queued_url = list(self.converter.pending_urls)[-1]
        self.assertEqual(queued_url, "https://docs.example.com/guide/intro")


class TestMarkdownLinkBracketSanitization(unittest.TestCase):
    """Integration test: markdown content with bracket URLs should not crash (#284)."""

    def setUp(self):
        """Set up test converter."""
        self.config = {
            "name": "test",
            "base_url": "https://docs.example.com/",
            "url_patterns": {"include": [], "exclude": []},
            "selectors": {"main_content": "article", "title": "h1", "code_blocks": "pre code"},
            "rate_limit": 0,
            "max_pages": 100,
        }
        self.converter = DocToSkillConverter(self.config, dry_run=True)

    def test_extract_markdown_links_with_brackets(self):
        """Links with brackets in .md content should be sanitised when enqueued."""
        # Simulate markdown content containing a link with brackets
        md_content = """# API Reference

See the [Users Endpoint](https://docs.example.com/api/[v1]/users.md) for details.
Also check [Guide](https://docs.example.com/guide/intro.md).
"""
        page = self.converter._extract_markdown_content(md_content, "https://docs.example.com/")
        # Enqueue all extracted links (this is what scrape_page does)
        for link in page["links"]:
            self.converter._enqueue_url(link)
        # All enqueued URLs should have brackets encoded
        for url in self.converter.pending_urls:
            self.assertNotIn("[", url, f"Raw bracket found in enqueued URL: {url}")
            self.assertNotIn("]", url, f"Raw bracket found in enqueued URL: {url}")


if __name__ == "__main__":
    unittest.main()