diff --git a/src/skill_seekers/cli/doc_scraper.py b/src/skill_seekers/cli/doc_scraper.py index 3490cc6..502ab30 100755 --- a/src/skill_seekers/cli/doc_scraper.py +++ b/src/skill_seekers/cli/doc_scraper.py @@ -471,7 +471,11 @@ class DocToSkillConverter: else: continue full_url = full_url.split("#")[0] - if self._has_md_extension(full_url) and self.is_valid_url(full_url) and full_url not in links: + if ( + self._has_md_extension(full_url) + and self.is_valid_url(full_url) + and full_url not in links + ): links.append(full_url) return { @@ -560,7 +564,11 @@ class DocToSkillConverter: # Strip anchor fragments full_url = full_url.split("#")[0] # Only include .md URLs to avoid client-side rendered HTML pages - if self._has_md_extension(full_url) and self.is_valid_url(full_url) and full_url not in page["links"]: + if ( + self._has_md_extension(full_url) + and self.is_valid_url(full_url) + and full_url not in page["links"] + ): page["links"].append(full_url) return page diff --git a/tests/test_issue_277_discord_e2e.py b/tests/test_issue_277_discord_e2e.py index f7b01a3..1d48646 100644 --- a/tests/test_issue_277_discord_e2e.py +++ b/tests/test_issue_277_discord_e2e.py @@ -88,8 +88,7 @@ class TestIssue277DiscordDocsE2E(unittest.TestCase): len(bad_urls), 0, f"Found {len(bad_urls)} URLs with /index.html.md appended " - f"(would cause 404s):\n" - + "\n".join(bad_urls[:10]), + f"(would cause 404s):\n" + "\n".join(bad_urls[:10]), ) # Step 6: Verify no anchor fragments leaked through @@ -97,8 +96,7 @@ class TestIssue277DiscordDocsE2E(unittest.TestCase): self.assertEqual( len(anchor_urls), 0, - f"Found {len(anchor_urls)} URLs with anchor fragments:\n" - + "\n".join(anchor_urls[:10]), + f"Found {len(anchor_urls)} URLs with anchor fragments:\n" + "\n".join(anchor_urls[:10]), ) # Step 7: Verify we got a reasonable number of URLs diff --git a/tests/test_issue_277_real_world.py b/tests/test_issue_277_real_world.py index da65d7f..9cbc280 100644 --- a/tests/test_issue_277_real_world.py +++ b/tests/test_issue_277_real_world.py @@ -49,9 +49,7 @@ class TestIssue277RealWorld(unittest.TestCase): self.assertNotIn("#", url, f"URL should not contain anchor: {url}") # No /index.html.md should be appended to non-.md URLs if not url.endswith(".md"): - self.assertNotIn( - "index.html.md", url, f"Should not append /index.html.md: {url}" - ) + self.assertNotIn("index.html.md", url, f"Should not append /index.html.md: {url}") # .md URLs preserved, non-.md URLs preserved as-is, anchors deduplicated self.assertIn("https://mikro-orm.io/docs/reference.md", result) diff --git a/tests/test_unified_scraper_orchestration.py b/tests/test_unified_scraper_orchestration.py index 02309fc..ec57855 100644 --- a/tests/test_unified_scraper_orchestration.py +++ b/tests/test_unified_scraper_orchestration.py @@ -224,11 +224,7 @@ class TestScrapeDocumentation: mock_run.return_value = MagicMock(returncode=1, stdout="", stderr="") scraper._scrape_documentation(source) - assert any( - "llms_txt_url" in s - for c in written_configs - for s in c.get("sources", [c]) - ) + assert any("llms_txt_url" in s for c in written_configs for s in c.get("sources", [c])) def test_start_urls_forwarded_to_doc_config(self, tmp_path): """start_urls from source is forwarded to the temporary doc config.""" @@ -251,11 +247,7 @@ class TestScrapeDocumentation: mock_run.return_value = MagicMock(returncode=1, stdout="", stderr="") scraper._scrape_documentation(source) - assert any( - "start_urls" in s - for c in written_configs - for s in c.get("sources", [c]) - ) + assert any("start_urls" in s for c in written_configs for s in c.get("sources", [c])) # =========================================================================== diff --git a/tests/test_url_conversion.py b/tests/test_url_conversion.py index 49a3c38..e9be026 100644 --- a/tests/test_url_conversion.py +++ b/tests/test_url_conversion.py @@ -280,10 +280,14 @@ class TestHasMdExtension(unittest.TestCase): def test_md_in_middle_of_path(self): """.md in middle of path should not match""" - self.assertFalse(DocToSkillConverter._has_md_extension("https://example.com/page.md/subpage")) + self.assertFalse( + DocToSkillConverter._has_md_extension("https://example.com/page.md/subpage") + ) def test_index_html_md(self): - self.assertTrue(DocToSkillConverter._has_md_extension("https://example.com/page/index.html.md")) + self.assertTrue( + DocToSkillConverter._has_md_extension("https://example.com/page/index.html.md") + ) if __name__ == "__main__":