From e802dfee6d21e87aa8f6538290ec1d01995d7ebb Mon Sep 17 00:00:00 2001
From: Joshua Shanks
Date: Sun, 19 Oct 2025 16:50:17 -0700
Subject: [PATCH] Strip anchors from urls so that the pages aren't duplicated

Signed-off-by: Joshua Shanks
---
 cli/doc_scraper.py             |  4 +-
 tests/test_scraper_features.py | 91 ++++++++++++++++++++++++++++++++++
 2 files changed, 94 insertions(+), 1 deletion(-)

diff --git a/cli/doc_scraper.py b/cli/doc_scraper.py
index f741d81..4b1ead1 100644
--- a/cli/doc_scraper.py
+++ b/cli/doc_scraper.py
@@ -196,7 +196,9 @@ class DocToSkillConverter:
         # Extract links
         for link in main.find_all('a', href=True):
             href = urljoin(url, link['href'])
-            if self.is_valid_url(href):
+            # Strip anchor fragments to avoid treating #anchors as separate pages
+            href = href.split('#')[0]
+            if self.is_valid_url(href) and href not in page['links']:
                 page['links'].append(href)
 
         return page
diff --git a/tests/test_scraper_features.py b/tests/test_scraper_features.py
index 4069e6f..59c4bc4 100644
--- a/tests/test_scraper_features.py
+++ b/tests/test_scraper_features.py
@@ -316,6 +316,97 @@ class TestCategorization(unittest.TestCase):
         self.assertNotIn('guides', categories)
 
 
+class TestLinkExtraction(unittest.TestCase):
+    """Test link extraction and anchor fragment handling"""
+
+    def setUp(self):
+        """Set up test converter"""
+        config = {
+            'name': 'test',
+            'base_url': 'https://example.com/',
+            'selectors': {'main_content': 'article', 'title': 'h1', 'code_blocks': 'pre code'},
+            'url_patterns': {
+                'include': [],
+                'exclude': []
+            },
+            'rate_limit': 0.1,
+            'max_pages': 10
+        }
+        self.converter = DocToSkillConverter(config, dry_run=True)
+
+    def test_extract_links_strips_anchor_fragments(self):
+        """Test that anchor fragments (#anchor) are stripped from extracted links"""
+        html = '''
+        <article>
+            <h1>Test Page</h1>
+            <p>Intro text</p>
+            <a href="https://example.com/docs/page.html#section1">Section 1</a>
+            <a href="https://example.com/docs/page.html#section2">Section 2</a>
+            <a href="https://example.com/docs/other.html">Other</a>
+        </article>
+        '''
+        soup = BeautifulSoup(html, 'html.parser')
+        page = self.converter.extract_content(soup, 'https://example.com/')
+
+        # Should have 2 unique URLs (page.html and other.html), not 3
+        # The two page.html links with different anchors should be deduplicated
+        self.assertEqual(len(page['links']), 2)
+        self.assertIn('https://example.com/docs/page.html', page['links'])
+        self.assertIn('https://example.com/docs/other.html', page['links'])
+
+    def test_extract_links_no_anchor_duplicates(self):
+        """Test that multiple anchor links to same page don't create duplicates"""
+        html = '''
+        <article>
+            <h1>API Reference</h1>
+            <a href="https://example.com/docs/api.html#intro">Intro</a>
+            <a href="https://example.com/docs/api.html#usage">Usage</a>
+            <a href="https://example.com/docs/api.html#examples">Examples</a>
+            <a href="https://example.com/docs/api.html#faq">FAQ</a>
+            <a href="https://example.com/docs/api.html">Overview</a>
+        </article>
+        '''
+        soup = BeautifulSoup(html, 'html.parser')
+        page = self.converter.extract_content(soup, 'https://example.com/')
+
+        # All 5 links point to the same page, should result in only 1 URL
+        self.assertEqual(len(page['links']), 1)
+        self.assertEqual(page['links'][0], 'https://example.com/docs/api.html')
+
+    def test_extract_links_preserves_query_params(self):
+        """Test that query parameters are preserved when stripping anchors"""
+        html = '''
+        <article>
+            <h1>Search</h1>
+            <a href="https://example.com/search?q=test#results">Search results</a>
+        </article>
+        '''
+        soup = BeautifulSoup(html, 'html.parser')
+        page = self.converter.extract_content(soup, 'https://example.com/')
+
+        # Query params should be preserved, only anchor stripped
+        self.assertEqual(len(page['links']), 1)
+        self.assertEqual(page['links'][0], 'https://example.com/search?q=test')
+
+    def test_extract_links_relative_urls_with_anchors(self):
+        """Test that relative URLs with anchors are handled correctly"""
+        html = '''
+        <article>
+            <h1>Guides</h1>
+            <a href="/docs/guide.html#intro">Guide intro</a>
+            <a href="/docs/guide.html#setup">Guide setup</a>
+            <a href="/docs/tutorial.html#start">Tutorial</a>
+        </article>
+        '''
+        soup = BeautifulSoup(html, 'html.parser')
+        page = self.converter.extract_content(soup, 'https://example.com/')
+
+        # Should have 2 unique URLs (guide.html and tutorial.html)
+        self.assertEqual(len(page['links']), 2)
+        self.assertIn('https://example.com/docs/guide.html', page['links'])
+        self.assertIn('https://example.com/docs/tutorial.html', page['links'])
+
+
 class TestTextCleaning(unittest.TestCase):
     """Test text cleaning utility"""
 