From a8ab462930f6b1455caf2d760569060ae51fd1bd Mon Sep 17 00:00:00 2001 From: yusyus Date: Wed, 4 Feb 2026 21:20:23 +0300 Subject: [PATCH] test: Add real-world integration tests for issue #277 (MikroORM case) Added comprehensive integration tests using the exact MikroORM URLs that caused 404 errors in the original bug report. Test Coverage (6 integration tests): 1. test_mikro_orm_urls_from_issue_277 - Tests exact URLs from the bug report - Verifies no malformed anchor fragments in results - Validates deduplication and correct URL transformation 2. test_no_404_causing_urls_generated - Verifies no URLs matching the 404 error pattern are generated - Tests all problematic patterns from the issue 3. test_deduplication_prevents_multiple_requests - Validates that multiple anchors on same page deduplicate correctly - Ensures bandwidth savings 4. test_md_files_with_anchors_preserved - Tests .md files with anchors are handled correctly - Verifies anchor stripping on .md URLs 5. test_real_scraping_scenario_no_404s - Integration test simulating full llms.txt parsing flow - Validates URL structure with regex patterns 6. test_issue_277_error_message_urls - Tests the exact malformed URLs from error output - Verifies correct URLs are generated instead Results: - 18/18 tests passing (12 unit + 6 integration) - All MikroORM URLs from issue #277 handled correctly - No 404-causing patterns generated Related: #277 --- tests/test_issue_277_real_world.py | 270 +++++++++++++++++++++++++++++ 1 file changed, 270 insertions(+) create mode 100644 tests/test_issue_277_real_world.py diff --git a/tests/test_issue_277_real_world.py b/tests/test_issue_277_real_world.py new file mode 100644 index 0000000..38051a1 --- /dev/null +++ b/tests/test_issue_277_real_world.py @@ -0,0 +1,270 @@ +""" +Real-world integration test for Issue #277: URL conversion bug with anchor fragments. +Tests the exact MikroORM case that was reported in the issue. +""" + +import unittest +from unittest.mock import MagicMock, patch + +from skill_seekers.cli.doc_scraper import DocToSkillConverter + + +class TestIssue277RealWorld(unittest.TestCase): + """Integration test for Issue #277 using real MikroORM URLs""" + + def setUp(self): + """Set up test converter with MikroORM-like configuration""" + self.config = { + "name": "MikroORM", + "description": "ORM", + "base_url": "https://mikro-orm.io/docs/", + "selectors": {"main_content": "article"}, + "url_patterns": { + "include": ["/docs"], + "exclude": [], + }, + } + self.converter = DocToSkillConverter(self.config, dry_run=True) + + def test_mikro_orm_urls_from_issue_277(self): + """Test the exact URLs that caused 404 errors in issue #277""" + # These are the actual problematic URLs from the bug report + urls_from_llms_txt = [ + "https://mikro-orm.io/docs/", + "https://mikro-orm.io/docs/reference.md", + "https://mikro-orm.io/docs/quick-start#synchronous-initialization", + "https://mikro-orm.io/docs/repositories.md#custom-repository", + "https://mikro-orm.io/docs/propagation", + "https://mikro-orm.io/docs/defining-entities.md#check-constraints", + "https://mikro-orm.io/docs/defining-entities#formulas", + "https://mikro-orm.io/docs/defining-entities#postgresql-native-enums", + ] + + result = self.converter._convert_to_md_urls(urls_from_llms_txt) + + # Verify no malformed URLs with anchor fragments + for url in result: + self.assertNotIn( + "#synchronous-initialization/index.html.md", + url, + "Should not append /index.html.md after anchor fragments", + ) + self.assertNotIn( + "#formulas/index.html.md", + url, + "Should not append /index.html.md after anchor fragments", + ) + self.assertNotIn( + "#postgresql-native-enums/index.html.md", + url, + "Should not append /index.html.md after anchor fragments", + ) + + # Verify correct transformed URLs + expected_urls = { + "https://mikro-orm.io/docs/index.html.md", # Root URL + "https://mikro-orm.io/docs/reference.md", # Already .md + "https://mikro-orm.io/docs/quick-start/index.html.md", # Deduplicated from anchor + "https://mikro-orm.io/docs/repositories.md", # Already .md, anchor stripped + "https://mikro-orm.io/docs/propagation/index.html.md", + "https://mikro-orm.io/docs/defining-entities.md", # Already .md, deduplicated + "https://mikro-orm.io/docs/defining-entities/index.html.md", # Non-.md version + } + + # Check that we got the expected number of unique URLs + # Note: defining-entities has both .md and non-.md versions, so we have 2 entries for it + self.assertEqual( + len(result), + 7, + f"Should have 7 unique base URLs after deduplication, got {len(result)}", + ) + + # Verify specific URLs that were causing 404s are now correct + self.assertIn( + "https://mikro-orm.io/docs/quick-start/index.html.md", + result, + "quick-start URL should be correctly transformed", + ) + self.assertIn( + "https://mikro-orm.io/docs/propagation/index.html.md", + result, + "propagation URL should be correctly transformed", + ) + self.assertIn( + "https://mikro-orm.io/docs/defining-entities.md", + result, + "defining-entities.md should preserve .md extension", + ) + + def test_no_404_causing_urls_generated(self): + """Verify that no URLs matching the 404 error pattern are generated""" + # The exact 404-causing URL pattern from the issue + problematic_patterns = [ + "/index.html.md#", # /index.html.md should never come after # + "#synchronous-initialization/index.html.md", + "#formulas/index.html.md", + "#postgresql-native-enums/index.html.md", + "#custom-repository/index.html.md", + "#check-constraints/index.html.md", + ] + + urls = [ + "https://mikro-orm.io/docs/quick-start#synchronous-initialization", + "https://mikro-orm.io/docs/defining-entities#formulas", + "https://mikro-orm.io/docs/defining-entities#postgresql-native-enums", + "https://mikro-orm.io/docs/repositories.md#custom-repository", + "https://mikro-orm.io/docs/defining-entities.md#check-constraints", + ] + + result = self.converter._convert_to_md_urls(urls) + + # Verify NONE of the problematic patterns exist + for url in result: + for pattern in problematic_patterns: + self.assertNotIn( + pattern, + url, + f"URL '{url}' contains problematic pattern '{pattern}' that causes 404", + ) + + def test_deduplication_prevents_multiple_requests(self): + """Verify that multiple anchors on same page don't create duplicate requests""" + # From the issue: These should all map to the same base URL + urls_with_multiple_anchors = [ + "https://mikro-orm.io/docs/defining-entities#formulas", + "https://mikro-orm.io/docs/defining-entities#postgresql-native-enums", + "https://mikro-orm.io/docs/defining-entities#indexes", + "https://mikro-orm.io/docs/defining-entities#check-constraints", + ] + + result = self.converter._convert_to_md_urls(urls_with_multiple_anchors) + + # Should deduplicate to single URL + self.assertEqual( + len(result), + 1, + "Multiple anchors on same page should deduplicate to single request", + ) + self.assertEqual( + result[0], + "https://mikro-orm.io/docs/defining-entities/index.html.md", + ) + + def test_md_files_with_anchors_preserved(self): + """Test that .md files with anchors are handled correctly""" + urls = [ + "https://mikro-orm.io/docs/repositories.md#custom-repository", + "https://mikro-orm.io/docs/defining-entities.md#check-constraints", + "https://mikro-orm.io/docs/inheritance-mapping.md#single-table-inheritance", + ] + + result = self.converter._convert_to_md_urls(urls) + + # Should preserve .md extension, strip anchors, deduplicate + self.assertEqual(len(result), 3) + self.assertIn("https://mikro-orm.io/docs/repositories.md", result) + self.assertIn("https://mikro-orm.io/docs/defining-entities.md", result) + self.assertIn("https://mikro-orm.io/docs/inheritance-mapping.md", result) + + # Verify no anchors in results + for url in result: + self.assertNotIn("#", url, "Result should not contain anchor fragments") + + @patch("skill_seekers.cli.doc_scraper.requests.get") + def test_real_scraping_scenario_no_404s(self, mock_get): + """ + Integration test: Simulate real scraping scenario with llms.txt URLs. + Verify that the converted URLs would not cause 404 errors. + """ + # Mock response for llms.txt content + mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.text = """ +# MikroORM Documentation +https://mikro-orm.io/docs/quick-start +https://mikro-orm.io/docs/quick-start#synchronous-initialization +https://mikro-orm.io/docs/propagation +https://mikro-orm.io/docs/defining-entities#formulas +""" + mock_get.return_value = mock_response + + # Simulate the llms.txt parsing flow + urls_from_llms = [ + "https://mikro-orm.io/docs/quick-start", + "https://mikro-orm.io/docs/quick-start#synchronous-initialization", + "https://mikro-orm.io/docs/propagation", + "https://mikro-orm.io/docs/defining-entities#formulas", + ] + + # Convert URLs (this is what happens in _try_llms_txt_v2) + converted_urls = self.converter._convert_to_md_urls(urls_from_llms) + + # Verify converted URLs are valid + # In real scenario, these would be added to pending_urls and scraped + self.assertTrue( + len(converted_urls) > 0, "Should generate at least one URL to scrape" + ) + + # Verify no URLs would cause 404 (no anchors in middle of path) + for url in converted_urls: + # Check URL structure is valid + self.assertRegex( + url, + r"^https://[^#]+$", # Should not contain # anywhere + f"URL should not contain anchor fragments: {url}", + ) + + # Verify the problematic pattern from the issue doesn't exist + self.assertNotRegex( + url, + r"#[^/]+/index\.html\.md", + f"URL should not have /index.html.md after anchor: {url}", + ) + + def test_issue_277_error_message_urls(self): + """ + Test the exact URLs that appeared in error messages from the issue report. + These were the actual 404-causing URLs that need to be fixed. + """ + # These are the MALFORMED URLs that caused 404 errors (with anchors in the middle) + error_urls_with_anchors = [ + "https://mikro-orm.io/docs/quick-start#synchronous-initialization/index.html.md", + "https://mikro-orm.io/docs/defining-entities#formulas/index.html.md", + "https://mikro-orm.io/docs/defining-entities#postgresql-native-enums/index.html.md", + ] + + # Extract the input URLs that would have generated these errors + input_urls = [ + "https://mikro-orm.io/docs/quick-start#synchronous-initialization", + "https://mikro-orm.io/docs/propagation", + "https://mikro-orm.io/docs/defining-entities#formulas", + "https://mikro-orm.io/docs/defining-entities#postgresql-native-enums", + ] + + result = self.converter._convert_to_md_urls(input_urls) + + # Verify NONE of the malformed error URLs (with anchors) are generated + for error_url in error_urls_with_anchors: + self.assertNotIn( + error_url, + result, + f"Should not generate the 404-causing URL: {error_url}", + ) + + # Verify correct URLs are generated instead + correct_urls = [ + "https://mikro-orm.io/docs/quick-start/index.html.md", + "https://mikro-orm.io/docs/propagation/index.html.md", + "https://mikro-orm.io/docs/defining-entities/index.html.md", + ] + + for correct_url in correct_urls: + self.assertIn( + correct_url, + result, + f"Should generate the correct URL: {correct_url}", + ) + + +if __name__ == "__main__": + unittest.main()