"""
Tests for Markdown parsing and BFS URL crawling features.

Tests the following functionality:
1. Markdown file content extraction (_extract_markdown_content)
2. HTML fallback when .md URL returns HTML (_extract_html_as_markdown)
3. URL extraction from llms.txt (extract_urls, _clean_url)
4. Empty/short content filtering in save_page
"""

import os
import shutil
import unittest


class TestMarkdownContentExtraction(unittest.TestCase):
    """Test Markdown file parsing in doc_scraper."""

    def setUp(self):
        """Build a converter from a minimal config before each test."""
        # Imported lazily so that merely importing this module does not
        # require the package under test.
        from skill_seekers.cli.doc_scraper import DocToSkillConverter

        self.config = {
            "name": "test_md_parsing",
            "base_url": "https://example.com",
            "selectors": {},
            "url_patterns": {"include": [], "exclude": []},
            "categories": {},
        }
        self.converter = DocToSkillConverter(self.config)

    def tearDown(self):
        """Remove the output directory named after this test's config."""
        output_dir = f"output/{self.config['name']}_data"
        if os.path.exists(output_dir):
            shutil.rmtree(output_dir)

    def test_extract_title_from_h1(self):
        """Test extracting title from first h1."""
        content = "# My Documentation Title\n\nSome content here."
        result = self.converter._extract_markdown_content(content, "https://example.com/test.md")
        self.assertEqual(result["title"], "My Documentation Title")

    def test_extract_headings_h2_to_h6(self):
        """Test extracting h2-h6 headings (not h1)."""
        content = """# Title

## Section One
### Subsection A
#### Deep Section
##### Deeper
###### Deepest

Content here.
"""
        result = self.converter._extract_markdown_content(content, "https://example.com/test.md")
        # Should have 5 headings (h2-h6), not h1
        self.assertEqual(len(result["headings"]), 5)
        self.assertEqual(result["headings"][0]["level"], "h2")
        self.assertEqual(result["headings"][0]["text"], "Section One")

    def test_extract_code_blocks_with_language(self):
        """Test extracting code blocks with language tags."""
        content = """# API Guide

```python
def hello():
    return "Hello, World!"
```

Some explanation.

```javascript
const greet = () => console.log("Hi");
```

```
plain code without language
```
"""
        result = self.converter._extract_markdown_content(content, "https://example.com/test.md")
        self.assertEqual(len(result["code_samples"]), 3)
        self.assertEqual(result["code_samples"][0]["language"], "python")
        self.assertEqual(result["code_samples"][1]["language"], "javascript")
        # A fence without a language tag may be reported under either label.
        self.assertIn(result["code_samples"][2]["language"], ("unknown", "text"))

    def test_extract_markdown_links_only_md_files(self):
        """Test that only .md links are extracted."""
        content = """# Links

- [Markdown Doc](./guide.md)
- [Another MD](https://example.com/api.md)
- [HTML Page](./page.html)
- [External](https://google.com)
"""
        result = self.converter._extract_markdown_content(
            content, "https://example.com/docs/test.md"
        )
        # Should only include .md links
        md_links = [link for link in result["links"] if ".md" in link]
        self.assertEqual(len(md_links), len(result["links"]))

    def test_extract_content_paragraphs(self):
        """Test extracting paragraph content."""
        content = """# Title

This is a paragraph with enough content to pass the minimum length filter.

Short.

Another paragraph that should be included in the final content output.
"""
        result = self.converter._extract_markdown_content(content, "https://example.com/test.md")
        self.assertIn("paragraph with enough content", result["content"])
        self.assertNotIn("Short.", result["content"])

    def test_detect_html_in_md_url(self):
        """Test that HTML content is detected when .md URL returns HTML."""
        # NOTE(review): markup reconstructed from the surviving text ("Page",
        # "Hello") and the assertion below - confirm against the intended fixture.
        html_content = (
            "<!DOCTYPE html><html><head><title>Page</title></head>"
            "<body><h1>Hello</h1></body></html>"
        )
        result = self.converter._extract_markdown_content(
            html_content, "https://example.com/test.md"
        )
        self.assertEqual(result["title"], "Page")


class TestHtmlAsMarkdownExtraction(unittest.TestCase):
    """Test HTML to markdown-like extraction."""

    def setUp(self):
        """Build a converter from a minimal config before each test."""
        # Imported lazily so that merely importing this module does not
        # require the package under test.
        from skill_seekers.cli.doc_scraper import DocToSkillConverter

        self.config = {
            "name": "test_html_fallback",
            "base_url": "https://example.com",
            "selectors": {},
            "url_patterns": {"include": [], "exclude": []},
            "categories": {},
        }
        self.converter = DocToSkillConverter(self.config)

    def tearDown(self):
        """Remove the output directory named after this test's config."""
        output_dir = f"output/{self.config['name']}_data"
        if os.path.exists(output_dir):
            shutil.rmtree(output_dir)

    def test_extract_title_from_html(self):
        """Test extracting title from HTML title tag."""
        # NOTE(review): markup reconstructed around the surviving title text -
        # confirm against the intended fixture.
        html = "<html><head><title>My Page Title</title></head><body></body></html>"
        result = self.converter._extract_html_as_markdown(html, "https://example.com/test.md")
        self.assertEqual(result["title"], "My Page Title")

    def test_find_main_content_area(self):
        """Test finding main content from various selectors."""
        # NOTE(review): markup reconstructed from the surviving text and the
        # test's intent (a <main> element) - confirm against the intended fixture.
        html = """<html><body>
<main>
<h1>Main Content</h1>
<p>This is the main content area with enough text to pass filters.</p>
</main>
</body></html>
"""
        result = self.converter._extract_html_as_markdown(html, "https://example.com/test.md")
        self.assertIn("main content area", result["content"].lower())

    def test_extract_code_blocks_from_html(self):
        """Test extracting code blocks from HTML pre/code tags."""
        # NOTE(review): markup reconstructed from the surviving snippet and the
        # docstring (pre/code tags) - confirm against the intended fixture.
        html = """<html><body><main>
<pre><code class="language-python">print("hello")</code></pre>
</main></body></html>
"""
        result = self.converter._extract_html_as_markdown(html, "https://example.com/test.md")
        self.assertGreater(len(result["code_samples"]), 0)

    def test_fallback_to_body_when_no_main(self):
        """Test fallback to body when no main/article element."""
        # NOTE(review): markup reconstructed from the surviving text; it
        # deliberately contains no <main>/<article> - confirm against the
        # intended fixture.
        html = """<html><body>
<div>
<h2>Section</h2>
<p>Content in body without main element, long enough to pass filter.</p>
</div>
</body></html>
"""
        result = self.converter._extract_html_as_markdown(html, "https://example.com/test.md")
        # Either extracted headings or extracted text proves the body was used.
        self.assertTrue(len(result["headings"]) > 0 or len(result["content"]) > 0)


class TestLlmsTxtUrlExtraction(unittest.TestCase):
    """Test URL extraction from llms.txt content."""

    def test_extract_markdown_style_links(self):
        """Test extracting [text](url) style links."""
        from skill_seekers.cli.llms_txt_parser import LlmsTxtParser

        content = """
# Documentation Index

- [Getting Started](https://docs.example.com/start.md)
- [API Reference](https://docs.example.com/api/index.md)
- [Advanced Guide](https://docs.example.com/advanced.md)
"""
        parser = LlmsTxtParser(content, base_url="https://docs.example.com")
        urls = parser.extract_urls()
        self.assertIn("https://docs.example.com/start.md", urls)
        self.assertIn("https://docs.example.com/api/index.md", urls)
        self.assertIn("https://docs.example.com/advanced.md", urls)

    def test_extract_bare_urls(self):
        """Test extracting bare URLs without markdown syntax."""
        from skill_seekers.cli.llms_txt_parser import LlmsTxtParser

        content = """
Documentation: https://example.com/docs/guide.md
API: https://example.com/api/reference.md
"""
        parser = LlmsTxtParser(content)
        urls = parser.extract_urls()
        self.assertIn("https://example.com/docs/guide.md", urls)
        self.assertIn("https://example.com/api/reference.md", urls)

    def test_resolve_relative_urls(self):
        """Test resolving relative URLs with base_url."""
        from skill_seekers.cli.llms_txt_parser import LlmsTxtParser

        content = """
- [Local Doc](./docs/guide.md)
- [Parent](../api/ref.md)
"""
        parser = LlmsTxtParser(content, base_url="https://example.com/learn/")
        urls = parser.extract_urls()
        # Should resolve relative paths
        self.assertTrue(any("docs/guide.md" in url for url in urls))

    def test_clean_url_invalid_anchor_pattern(self):
        """Test cleaning URLs with invalid anchor patterns."""
        from skill_seekers.cli.llms_txt_parser import LlmsTxtParser

        parser = LlmsTxtParser("", base_url="https://example.com")
        # Invalid: path after anchor
        result = parser._clean_url("https://example.com/page#section/index.html.md")
        self.assertEqual(result, "https://example.com/page")

    def test_clean_url_valid_anchor(self):
        """Test that valid anchors are preserved."""
        from skill_seekers.cli.llms_txt_parser import LlmsTxtParser

        parser = LlmsTxtParser("", base_url="https://example.com")
        # Valid anchor should be unchanged
        result = parser._clean_url("https://example.com/page.md#section")
        self.assertEqual(result, "https://example.com/page.md#section")

    def test_clean_url_no_anchor(self):
        """Test that URLs without anchors are unchanged."""
        from skill_seekers.cli.llms_txt_parser import LlmsTxtParser

        parser = LlmsTxtParser("", base_url="https://example.com")
        result = parser._clean_url("https://example.com/docs/guide.md")
        self.assertEqual(result, "https://example.com/docs/guide.md")

    def test_clean_url_bracket_encoding(self):
        """Test that square brackets are percent-encoded in URL path (#284)."""
        from skill_seekers.cli.llms_txt_parser import LlmsTxtParser

        parser = LlmsTxtParser("", base_url="https://example.com")
        result = parser._clean_url("https://example.com/api/[v1]/users")
        self.assertEqual(result, "https://example.com/api/%5Bv1%5D/users")

    def test_clean_url_bracket_encoding_preserves_host(self):
        """Test that bracket encoding does not affect host (IPv6 literals)."""
        from skill_seekers.cli.llms_txt_parser import LlmsTxtParser

        parser = LlmsTxtParser("", base_url="https://example.com")
        # Brackets should only be encoded in path, not in host
        result = parser._clean_url("https://example.com/path/[param]/end")
        self.assertIn("%5B", result)
        self.assertIn("%5D", result)
        self.assertIn("example.com", result)

    def test_clean_url_bracket_in_query(self):
        """Test that brackets in query params are also encoded."""
        from skill_seekers.cli.llms_txt_parser import LlmsTxtParser

        parser = LlmsTxtParser("", base_url="https://example.com")
        result = parser._clean_url("https://example.com/search?filter=[active]")
        self.assertEqual(result, "https://example.com/search?filter=%5Bactive%5D")

    def test_clean_url_malformed_anchor_with_brackets(self):
        """Test combined malformed anchor stripping + bracket encoding."""
        from skill_seekers.cli.llms_txt_parser import LlmsTxtParser

        parser = LlmsTxtParser("", base_url="https://example.com")
        # Malformed anchor should be stripped, then brackets encoded
        result = parser._clean_url("https://example.com/api/[v1]/page#section/deep")
        self.assertEqual(result, "https://example.com/api/%5Bv1%5D/page")

    def test_deduplicate_urls(self):
        """Test that duplicate URLs are removed."""
        from skill_seekers.cli.llms_txt_parser import LlmsTxtParser

        content = """
- [Doc 1](https://example.com/doc.md)
- [Doc 2](https://example.com/doc.md)
https://example.com/doc.md
"""
        parser = LlmsTxtParser(content)
        urls = parser.extract_urls()
        # Should only have one instance
        count = sum(1 for u in urls if u == "https://example.com/doc.md")
        self.assertEqual(count, 1)


class TestSavePageContentFiltering(unittest.TestCase):
    """Test content filtering in save_page."""

    def setUp(self):
        """Build a converter from a minimal config before each test."""
        # Imported lazily so that merely importing this module does not
        # require the package under test.
        from skill_seekers.cli.doc_scraper import DocToSkillConverter

        self.config = {
            "name": "test_save_filter",
            "base_url": "https://example.com",
            "selectors": {},
            "url_patterns": {"include": [], "exclude": []},
            "categories": {},
        }
        self.converter = DocToSkillConverter(self.config)

    def tearDown(self):
        """Remove the output directory named after this test's config."""
        output_dir = f"output/{self.config['name']}_data"
        if os.path.exists(output_dir):
            shutil.rmtree(output_dir)

    def test_skip_empty_content(self):
        """Test that pages with empty content are skipped."""
        page = {
            "url": "https://example.com/empty",
            "title": "Empty Page",
            "content": "",
            "headings": [],
            "code_samples": [],
        }
        self.converter.save_page(page)
        pages_dir = os.path.join(self.converter.data_dir, "pages")
        # The pages directory is allowed to be absent; if present it must be empty.
        if os.path.exists(pages_dir):
            self.assertEqual(len(os.listdir(pages_dir)), 0)

    def test_skip_short_content_under_50_chars(self):
        """Test that pages with content < 50 chars are skipped."""
        page = {
            "url": "https://example.com/short",
            "title": "Short",
            "content": "This is too short.",  # 18 chars
            "headings": [],
            "code_samples": [],
        }
        self.converter.save_page(page)
        pages_dir = os.path.join(self.converter.data_dir, "pages")
        # The pages directory is allowed to be absent; if present it must be empty.
        if os.path.exists(pages_dir):
            self.assertEqual(len(os.listdir(pages_dir)), 0)

    def test_save_content_over_50_chars(self):
        """Test that pages with content >= 50 chars are saved."""
        page = {
            "url": "https://example.com/valid",
            "title": "Valid Page",
            "content": "A" * 60,  # 60 chars, should pass
            "headings": [],
            "code_samples": [],
        }
        self.converter.save_page(page)
        pages_dir = os.path.join(self.converter.data_dir, "pages")
        self.assertTrue(os.path.exists(pages_dir))
        self.assertEqual(len(os.listdir(pages_dir)), 1)


if __name__ == "__main__":
    unittest.main()