"""
Tests for Markdown parsing and BFS URL crawling features.

Tests the following functionality:
1. Markdown file content extraction (_extract_markdown_content)
2. HTML fallback when .md URL returns HTML (_extract_html_as_markdown)
3. URL extraction from llms.txt (extract_urls, _clean_url)
4. Empty/short content filtering in save_page
"""

import unittest
import tempfile
import os
import shutil

# NOTE(review): the HTML fixtures below (test_detect_html_in_md_url and every
# test in TestHtmlAsMarkdownExtraction) had lost their markup -- only the text
# between the tags was left. The tags were rebuilt from that text and from each
# test's docstring; confirm they match the intended fixtures.


class TestMarkdownContentExtraction(unittest.TestCase):
    """Test Markdown file parsing in doc_scraper.

    Each test feeds a small Markdown document to
    ``DocToSkillConverter._extract_markdown_content`` and checks one key of
    the returned dict ('title', 'headings', 'code_samples', 'links',
    'content').
    """

    def setUp(self):
        """Set up test fixtures."""
        # Imported here rather than at module level so the module can be
        # imported even when the package under test is unavailable.
        from skill_seekers.cli.doc_scraper import DocToSkillConverter

        self.config = {
            'name': 'test_md_parsing',
            'base_url': 'https://example.com',
            'selectors': {},
            'url_patterns': {'include': [], 'exclude': []},
            'categories': {}
        }
        self.converter = DocToSkillConverter(self.config)

    def tearDown(self):
        """Clean up output directory."""
        output_dir = f"output/{self.config['name']}_data"
        if os.path.exists(output_dir):
            shutil.rmtree(output_dir)

    def test_extract_title_from_h1(self):
        """Test extracting title from first h1."""
        content = "# My Documentation Title\n\nSome content here."
        result = self.converter._extract_markdown_content(content, "https://example.com/test.md")
        self.assertEqual(result['title'], "My Documentation Title")

    def test_extract_headings_h2_to_h6(self):
        """Test extracting h2-h6 headings (not h1)."""
        content = """# Title

## Section One
### Subsection A
#### Deep Section
##### Deeper
###### Deepest

Content here.
"""
        result = self.converter._extract_markdown_content(content, "https://example.com/test.md")
        # Should have 5 headings (h2-h6), not h1
        self.assertEqual(len(result['headings']), 5)
        self.assertEqual(result['headings'][0]['level'], 'h2')
        self.assertEqual(result['headings'][0]['text'], 'Section One')

    def test_extract_code_blocks_with_language(self):
        """Test extracting code blocks with language tags."""
        content = """# API Guide

```python
def hello():
    return "Hello, World!"
```

Some explanation.

```javascript
const greet = () => console.log("Hi");
```

```
plain code without language
```
"""
        result = self.converter._extract_markdown_content(content, "https://example.com/test.md")
        self.assertEqual(len(result['code_samples']), 3)
        self.assertEqual(result['code_samples'][0]['language'], 'python')
        self.assertEqual(result['code_samples'][1]['language'], 'javascript')
        # A fence without a language tag is expected to be reported as 'unknown'.
        self.assertEqual(result['code_samples'][2]['language'], 'unknown')

    def test_extract_markdown_links_only_md_files(self):
        """Test that only .md links are extracted."""
        content = """# Links

- [Markdown Doc](./guide.md)
- [Another MD](https://example.com/api.md)
- [HTML Page](./page.html)
- [External](https://google.com)
"""
        result = self.converter._extract_markdown_content(content, "https://example.com/docs/test.md")
        # Should only include .md links
        md_links = [link for link in result['links'] if '.md' in link]
        self.assertEqual(len(md_links), len(result['links']))

    def test_extract_content_paragraphs(self):
        """Test extracting paragraph content."""
        content = """# Title

This is a paragraph with enough content to pass the minimum length filter.

Short.

Another paragraph that should be included in the final content output.
"""
        result = self.converter._extract_markdown_content(content, "https://example.com/test.md")
        self.assertIn("paragraph with enough content", result['content'])
        self.assertNotIn("Short.", result['content'])

    def test_detect_html_in_md_url(self):
        """Test that HTML content is detected when .md URL returns HTML."""
        html_content = (
            "<!DOCTYPE html><html><head><title>Page</title></head>"
            "<body><h1>Hello</h1></body></html>"
        )
        result = self.converter._extract_markdown_content(html_content, "https://example.com/test.md")
        self.assertEqual(result['title'], "Page")


class TestHtmlAsMarkdownExtraction(unittest.TestCase):
    """Test HTML to markdown-like extraction.

    Each test passes an HTML document to
    ``DocToSkillConverter._extract_html_as_markdown`` and inspects the
    returned dict.
    """

    def setUp(self):
        """Set up test fixtures."""
        from skill_seekers.cli.doc_scraper import DocToSkillConverter

        self.config = {
            'name': 'test_html_fallback',
            'base_url': 'https://example.com',
            'selectors': {},
            'url_patterns': {'include': [], 'exclude': []},
            'categories': {}
        }
        self.converter = DocToSkillConverter(self.config)

    def tearDown(self):
        """Clean up output directory."""
        output_dir = f"output/{self.config['name']}_data"
        if os.path.exists(output_dir):
            shutil.rmtree(output_dir)

    def test_extract_title_from_html(self):
        """Test extracting title from HTML title tag."""
        html = "<html><head><title>My Page Title</title></head><body></body></html>"
        result = self.converter._extract_html_as_markdown(html, "https://example.com/test.md")
        self.assertEqual(result['title'], "My Page Title")

    def test_find_main_content_area(self):
        """Test finding main content from various selectors."""
        html = """<html><body>
<main>
<h1>Main Content</h1>
<p>This is the main content area with enough text to pass filters.</p>
</main>
</body></html>"""
        result = self.converter._extract_html_as_markdown(html, "https://example.com/test.md")
        self.assertIn("main content area", result['content'].lower())

    def test_extract_code_blocks_from_html(self):
        """Test extracting code blocks from HTML pre/code tags."""
        html = """<html><body><main>
<pre><code class="language-python">print("hello")</code></pre>
</main></body></html>"""
        result = self.converter._extract_html_as_markdown(html, "https://example.com/test.md")
        self.assertGreater(len(result['code_samples']), 0)

    def test_fallback_to_body_when_no_main(self):
        """Test fallback to body when no main/article element."""
        html = """<html><body>
<h2>Section</h2>
<p>Content in body without main element, long enough to pass filter.</p>
</body></html>"""
        result = self.converter._extract_html_as_markdown(html, "https://example.com/test.md")
        # Either signal is enough to show the body was used as the content root.
        self.assertTrue(len(result['headings']) > 0 or len(result['content']) > 0)


class TestLlmsTxtUrlExtraction(unittest.TestCase):
    """Test URL extraction from llms.txt content."""

    def test_extract_markdown_style_links(self):
        """Test extracting [text](url) style links."""
        from skill_seekers.cli.llms_txt_parser import LlmsTxtParser

        content = """
# Documentation Index

- [Getting Started](https://docs.example.com/start.md)
- [API Reference](https://docs.example.com/api/index.md)
- [Advanced Guide](https://docs.example.com/advanced.md)
"""
        parser = LlmsTxtParser(content, base_url="https://docs.example.com")
        urls = parser.extract_urls()
        self.assertIn("https://docs.example.com/start.md", urls)
        self.assertIn("https://docs.example.com/api/index.md", urls)
        self.assertIn("https://docs.example.com/advanced.md", urls)

    def test_extract_bare_urls(self):
        """Test extracting bare URLs without markdown syntax."""
        from skill_seekers.cli.llms_txt_parser import LlmsTxtParser

        content = """
Documentation: https://example.com/docs/guide.md
API: https://example.com/api/reference.md
"""
        parser = LlmsTxtParser(content)
        urls = parser.extract_urls()
        self.assertIn("https://example.com/docs/guide.md", urls)
        self.assertIn("https://example.com/api/reference.md", urls)

    def test_resolve_relative_urls(self):
        """Test resolving relative URLs with base_url."""
        from skill_seekers.cli.llms_txt_parser import LlmsTxtParser

        content = """
- [Local Doc](./docs/guide.md)
- [Parent](../api/ref.md)
"""
        parser = LlmsTxtParser(content, base_url="https://example.com/learn/")
        urls = parser.extract_urls()
        # Should resolve relative paths
        self.assertTrue(any("docs/guide.md" in url for url in urls))

    def test_clean_url_invalid_anchor_pattern(self):
        """Test cleaning URLs with invalid anchor patterns."""
        from skill_seekers.cli.llms_txt_parser import LlmsTxtParser

        parser = LlmsTxtParser("", base_url="https://example.com")
        # Invalid: path after anchor
        result = parser._clean_url("https://example.com/page#section/index.html.md")
        self.assertEqual(result, "https://example.com/page")

    def test_clean_url_valid_anchor(self):
        """Test that valid anchors are preserved."""
        from skill_seekers.cli.llms_txt_parser import LlmsTxtParser

        parser = LlmsTxtParser("", base_url="https://example.com")
        # Valid anchor should be unchanged
        result = parser._clean_url("https://example.com/page.md#section")
        self.assertEqual(result, "https://example.com/page.md#section")

    def test_clean_url_no_anchor(self):
        """Test that URLs without anchors are unchanged."""
        from skill_seekers.cli.llms_txt_parser import LlmsTxtParser

        parser = LlmsTxtParser("", base_url="https://example.com")
        result = parser._clean_url("https://example.com/docs/guide.md")
        self.assertEqual(result, "https://example.com/docs/guide.md")

    def test_deduplicate_urls(self):
        """Test that duplicate URLs are removed."""
        from skill_seekers.cli.llms_txt_parser import LlmsTxtParser

        content = """
- [Doc 1](https://example.com/doc.md)
- [Doc 2](https://example.com/doc.md)
https://example.com/doc.md
"""
        parser = LlmsTxtParser(content)
        urls = parser.extract_urls()
        # Should only have one instance
        count = sum(1 for u in urls if u == "https://example.com/doc.md")
        self.assertEqual(count, 1)


class TestSavePageContentFiltering(unittest.TestCase):
    """Test content filtering in save_page.

    The tests call ``save_page`` with page dicts of varying content length
    and then count the files under ``<converter.data_dir>/pages``.
    """

    def setUp(self):
        """Set up test fixtures."""
        from skill_seekers.cli.doc_scraper import DocToSkillConverter

        self.config = {
            'name': 'test_save_filter',
            'base_url': 'https://example.com',
            'selectors': {},
            'url_patterns': {'include': [], 'exclude': []},
            'categories': {}
        }
        self.converter = DocToSkillConverter(self.config)

    def tearDown(self):
        """Clean up output directory."""
        output_dir = f"output/{self.config['name']}_data"
        if os.path.exists(output_dir):
            shutil.rmtree(output_dir)

    def test_skip_empty_content(self):
        """Test that pages with empty content are skipped."""
        page = {
            'url': 'https://example.com/empty',
            'title': 'Empty Page',
            'content': '',
            'headings': [],
            'code_samples': []
        }
        self.converter.save_page(page)

        pages_dir = os.path.join(self.converter.data_dir, 'pages')
        # A missing pages directory also means nothing was written.
        if os.path.exists(pages_dir):
            self.assertEqual(len(os.listdir(pages_dir)), 0)

    def test_skip_short_content_under_50_chars(self):
        """Test that pages with content < 50 chars are skipped."""
        page = {
            'url': 'https://example.com/short',
            'title': 'Short',
            'content': 'This is too short.',  # 18 chars
            'headings': [],
            'code_samples': []
        }
        self.converter.save_page(page)

        pages_dir = os.path.join(self.converter.data_dir, 'pages')
        # A missing pages directory also means nothing was written.
        if os.path.exists(pages_dir):
            self.assertEqual(len(os.listdir(pages_dir)), 0)

    def test_save_content_over_50_chars(self):
        """Test that pages with content >= 50 chars are saved."""
        page = {
            'url': 'https://example.com/valid',
            'title': 'Valid Page',
            'content': 'A' * 60,  # 60 chars, should pass
            'headings': [],
            'code_samples': []
        }
        self.converter.save_page(page)

        pages_dir = os.path.join(self.converter.data_dir, 'pages')
        self.assertTrue(os.path.exists(pages_dir))
        self.assertEqual(len(os.listdir(pages_dir)), 1)


if __name__ == '__main__':
    unittest.main()