Example: Here's how to use it
print("hello")
#!/usr/bin/env python3
"""
Test suite for doc_scraper core features
Tests URL validation, language detection, pattern extraction, and categorization
"""
import os
import sys
import unittest
from bs4 import BeautifulSoup
# Add parent directory to path
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from skill_seekers.cli.doc_scraper import DocToSkillConverter
class TestURLValidation(unittest.TestCase):
    """Checks for DocToSkillConverter.is_valid_url with include/exclude patterns."""

    def setUp(self):
        """Create a dry-run converter limited to /guide/ and /api/ paths."""
        patterns = {"include": ["/guide/", "/api/"], "exclude": ["/blog/", "/about/"]}
        selectors = {"main_content": "article", "title": "h1", "code_blocks": "pre code"}
        self.config = {
            "name": "test",
            "base_url": "https://docs.example.com/",
            "url_patterns": patterns,
            "selectors": selectors,
            "rate_limit": 0.1,
            "max_pages": 10,
        }
        self.converter = DocToSkillConverter(self.config, dry_run=True)

    def test_valid_url_with_include_pattern(self):
        """A URL under the /guide/ include pattern is accepted."""
        accepted = self.converter.is_valid_url("https://docs.example.com/guide/getting-started")
        self.assertTrue(accepted)

    def test_valid_url_with_api_pattern(self):
        """A URL under the /api/ include pattern is accepted."""
        accepted = self.converter.is_valid_url("https://docs.example.com/api/reference")
        self.assertTrue(accepted)

    def test_invalid_url_with_exclude_pattern(self):
        """A URL under the /blog/ exclude pattern is rejected."""
        accepted = self.converter.is_valid_url("https://docs.example.com/blog/announcement")
        self.assertFalse(accepted)

    def test_invalid_url_different_domain(self):
        """A URL on another domain is rejected even if its path matches an include pattern."""
        accepted = self.converter.is_valid_url("https://other-site.com/guide/tutorial")
        self.assertFalse(accepted)

    def test_invalid_url_no_include_match(self):
        """A URL on the right domain that matches no include pattern is rejected."""
        accepted = self.converter.is_valid_url("https://docs.example.com/download/installer")
        self.assertFalse(accepted)

    def test_url_validation_no_patterns(self):
        """With empty include/exclude lists, only the base URL decides validity."""
        unrestricted = DocToSkillConverter(
            {
                "name": "test",
                "base_url": "https://docs.example.com/",
                "url_patterns": {"include": [], "exclude": []},
                "selectors": {"main_content": "article", "title": "h1", "code_blocks": "pre"},
                "rate_limit": 0.1,
                "max_pages": 10,
            },
            dry_run=True,
        )

        # Anything under base_url passes; other hosts are still refused.
        inside = unrestricted.is_valid_url("https://docs.example.com/anything")
        self.assertTrue(inside)
        outside = unrestricted.is_valid_url("https://other.com/anything")
        self.assertFalse(outside)
class TestLanguageDetection(unittest.TestCase):
    """Test language detection from code blocks.

    Each test parses a small HTML fixture, takes its ``<code>`` element and
    checks the label returned by ``DocToSkillConverter.detect_language``.
    The fixtures must contain real ``<code>`` (and, where relevant, ``<pre>``)
    markup: without it ``find("code")`` returns ``None`` and the tests error
    out before reaching the assertion.
    """

    def setUp(self):
        """Set up a dry-run converter with a minimal configuration."""
        config = {
            "name": "test",
            "base_url": "https://example.com/",
            "selectors": {"main_content": "article", "title": "h1", "code_blocks": "pre"},
            "rate_limit": 0.1,
            "max_pages": 10,
        }
        self.converter = DocToSkillConverter(config, dry_run=True)

    def _code_element(self, html):
        """Parse *html* and return its first ``<code>`` element.

        Fails the test with an explicit message when the fixture has no
        ``<code>`` tag, instead of surfacing later as an ``AttributeError``
        on ``None``.
        """
        elem = BeautifulSoup(html, "html.parser").find("code")
        self.assertIsNotNone(elem, f"fixture contains no <code> element: {html!r}")
        return elem

    def test_detect_language_from_class(self):
        """Test language detection from CSS class"""
        html = '<code class="language-python">print("hello")</code>'
        elem = self._code_element(html)
        lang = self.converter.detect_language(elem, 'print("hello")')
        self.assertEqual(lang, "python")

    def test_detect_language_from_lang_class(self):
        """Test language detection from lang- prefix"""
        html = '<code class="lang-javascript">console.log("hello")</code>'
        elem = self._code_element(html)
        lang = self.converter.detect_language(elem, 'console.log("hello")')
        self.assertEqual(lang, "javascript")

    def test_detect_language_from_parent(self):
        """Test language detection from parent pre element"""
        # The class lives on the enclosing <pre>, not on the <code> itself.
        html = '<pre class="language-cpp"><code>int main() {}</code></pre>'
        elem = self._code_element(html)
        lang = self.converter.detect_language(elem, "int main() {}")
        self.assertEqual(lang, "cpp")

    def test_detect_python_from_heuristics(self):
        """Test Python detection from code content"""
        html = "<code>import os\nfrom pathlib import Path</code>"
        elem = self._code_element(html)
        code = elem.get_text()
        lang = self.converter.detect_language(elem, code)
        self.assertEqual(lang, "python")

    def test_detect_python_from_def(self):
        """Test Python detection from def keyword"""
        html = "<code>def my_function():\n pass</code>"
        elem = self._code_element(html)
        code = elem.get_text()
        lang = self.converter.detect_language(elem, code)
        self.assertEqual(lang, "python")

    def test_detect_javascript_from_const(self):
        """Test JavaScript detection from const keyword"""
        html = "<code>const myVar = 10;</code>"
        elem = self._code_element(html)
        code = elem.get_text()
        lang = self.converter.detect_language(elem, code)
        self.assertEqual(lang, "javascript")

    def test_detect_javascript_from_arrow(self):
        """Test JavaScript detection from arrow function"""
        html = "<code>const add = (a, b) => a + b;</code>"
        elem = self._code_element(html)
        code = elem.get_text()
        lang = self.converter.detect_language(elem, code)
        self.assertEqual(lang, "javascript")

    def test_detect_gdscript(self):
        """Test GDScript detection"""
        html = "<code>func _ready():\n var x = 5</code>"
        elem = self._code_element(html)
        code = elem.get_text()
        lang = self.converter.detect_language(elem, code)
        self.assertEqual(lang, "gdscript")

    def test_detect_cpp(self):
        """Test C++ detection"""
        # The header name is entity-escaped so the parser keeps it as text
        # rather than treating <iostream> as an HTML tag.
        html = "<code>#include &lt;iostream&gt;\nint main() { return 0; }</code>"
        elem = self._code_element(html)
        code = elem.get_text()
        lang = self.converter.detect_language(elem, code)
        self.assertEqual(lang, "cpp")

    def test_detect_unknown(self):
        """Test unknown language detection"""
        html = "<code>some random text without clear indicators</code>"
        elem = self._code_element(html)
        code = elem.get_text()
        lang = self.converter.detect_language(elem, code)
        self.assertEqual(lang, "unknown")

    def test_detect_brush_pattern_in_pre(self):
        """Test brush: pattern in pre element"""
        html = '<pre class="brush: python"><code>x</code></pre>'
        elem = self._code_element(html)
        lang = self.converter.detect_language(elem, "x")
        self.assertEqual(lang, "python", "Should detect python from brush: python pattern")

    def test_detect_bare_class_in_pre(self):
        """Test bare class name in pre element"""
        html = '<pre class="python"><code>x</code></pre>'
        elem = self._code_element(html)
        lang = self.converter.detect_language(elem, "x")
        self.assertEqual(lang, "python", "Should detect python from bare class name")

    def test_detect_bare_class_in_code(self):
        """Test bare class name in code element"""
        html = '<code class="python">x</code>'
        elem = self._code_element(html)
        lang = self.converter.detect_language(elem, "x")
        self.assertEqual(lang, "python", "Should detect python from bare class name")

    def test_detect_csharp_from_using_system(self):
        """Test C# detection from 'using System' keyword"""
        html = "<code>using System;\nnamespace MyApp { }</code>"
        elem = self._code_element(html)
        code = elem.get_text()
        lang = self.converter.detect_language(elem, code)
        self.assertEqual(lang, "csharp", "Should detect C# from using System")

    def test_detect_csharp_from_namespace(self):
        """Test C# detection from 'namespace' keyword"""
        html = "<code>namespace MyNamespace\n{\n public class Test { }\n}</code>"
        elem = self._code_element(html)
        code = elem.get_text()
        lang = self.converter.detect_language(elem, code)
        self.assertEqual(lang, "csharp", "Should detect C# from namespace")

    def test_detect_csharp_from_property_syntax(self):
        """Test C# detection from property syntax"""
        html = "<code>public string Name { get; set; }</code>"
        elem = self._code_element(html)
        code = elem.get_text()
        lang = self.converter.detect_language(elem, code)
        self.assertEqual(lang, "csharp", "Should detect C# from { get; set; } syntax")

    def test_detect_csharp_from_public_class(self):
        """Test C# detection from 'public class' keyword"""
        html = "<code>public class MyClass\n{\n private int value;\n}</code>"
        elem = self._code_element(html)
        code = elem.get_text()
        lang = self.converter.detect_language(elem, code)
        self.assertEqual(lang, "csharp", "Should detect C# from public class")

    def test_detect_csharp_from_private_class(self):
        """Test C# detection from 'private class' keyword"""
        html = "<code>private class Helper { }</code>"
        elem = self._code_element(html)
        code = elem.get_text()
        lang = self.converter.detect_language(elem, code)
        self.assertEqual(lang, "csharp", "Should detect C# from private class")

    def test_detect_csharp_from_public_static_void(self):
        """Test C# detection from 'public static void' keyword"""
        html = '<code>public static void Main(string[] args)\n{\n Console.WriteLine("Test");\n}</code>'
        elem = self._code_element(html)
        code = elem.get_text()
        lang = self.converter.detect_language(elem, code)
        self.assertEqual(lang, "csharp", "Should detect C# from public static void")

    def test_detect_csharp_from_class_attribute(self):
        """Test C# detection from CSS class attribute"""
        html = '<code class="language-csharp">var x = 5;</code>'
        elem = self._code_element(html)
        code = elem.get_text()
        lang = self.converter.detect_language(elem, code)
        self.assertEqual(lang, "csharp", "Should detect C# from language-csharp class")
class TestPatternExtraction(unittest.TestCase):
"""Test pattern extraction from documentation"""
def setUp(self):
"""Set up a dry-run DocToSkillConverter from a minimal config.

The config supplies no "url_patterns" entry; only the name, base URL,
selectors, rate limit and page cap are given.
"""
config = {
"name": "test",
"base_url": "https://example.com/",
# Code blocks are selected with "pre" here.
"selectors": {"main_content": "article", "title": "h1", "code_blocks": "pre"},
"rate_limit": 0.1,
"max_pages": 10,
}
# dry_run=True is passed through to the converter constructor.
self.converter = DocToSkillConverter(config, dry_run=True)
def test_extract_pattern_with_example_marker(self):
"""Test pattern extraction with 'Example:' marker"""
html = """
Example: Here's how to use it
print("hello")
Usage: Call this function like so
my_function(arg)
Example {i}: Test
code_{i}"
html += "Content with links
Link 1 Link 2 Link 3