diff --git a/cli/doc_scraper.py b/cli/doc_scraper.py
index f741d81..4b1ead1 100644
--- a/cli/doc_scraper.py
+++ b/cli/doc_scraper.py
@@ -196,7 +196,9 @@ class DocToSkillConverter:
# Extract links
for link in main.find_all('a', href=True):
href = urljoin(url, link['href'])
- if self.is_valid_url(href):
+ # Strip anchor fragments to avoid treating #anchors as separate pages
+ href = href.split('#')[0]
+ if self.is_valid_url(href) and href not in page['links']:
page['links'].append(href)
return page
diff --git a/tests/test_scraper_features.py b/tests/test_scraper_features.py
index 4069e6f..59c4bc4 100644
--- a/tests/test_scraper_features.py
+++ b/tests/test_scraper_features.py
@@ -316,6 +316,97 @@ class TestCategorization(unittest.TestCase):
self.assertNotIn('guides', categories)
+class TestLinkExtraction(unittest.TestCase):
+ """Test link extraction and anchor fragment handling"""
+
+ def setUp(self):
+ """Set up test converter"""
+ config = {
+ 'name': 'test',
+ 'base_url': 'https://example.com/',
+ 'selectors': {'main_content': 'article', 'title': 'h1', 'code_blocks': 'pre code'},
+ 'url_patterns': {
+ 'include': [],
+ 'exclude': []
+ },
+ 'rate_limit': 0.1,
+ 'max_pages': 10
+ }
+ self.converter = DocToSkillConverter(config, dry_run=True)
+
+ def test_extract_links_strips_anchor_fragments(self):
+ """Test that anchor fragments (#anchor) are stripped from extracted links"""
+ html = '''
+        <html><body><article>
+        <h1>Test Page</h1>
+        <p>Content with links</p>
+        <a href="/docs/page.html#section1">Link 1</a>
+        <a href="/docs/page.html#section2">Link 2</a>
+        <a href="/docs/other.html">Link 3</a>
+        </article></body></html>
+ '''
+ soup = BeautifulSoup(html, 'html.parser')
+ page = self.converter.extract_content(soup, 'https://example.com/')
+
+ # Should have 2 unique URLs (page.html and other.html), not 3
+ # The two links with different anchors should be deduplicated
+ self.assertEqual(len(page['links']), 2)
+ self.assertIn('https://example.com/docs/page.html', page['links'])
+ self.assertIn('https://example.com/docs/other.html', page['links'])
+
+ def test_extract_links_no_anchor_duplicates(self):
+ """Test that multiple anchor links to same page don't create duplicates"""
+ html = '''
+        <html><body><article>
+        <h1>Test Page</h1>
+        <a href="/docs/api.html#section1">Anchor 1</a>
+        <a href="/docs/api.html#section2">Anchor 2</a>
+        <a href="/docs/api.html#section3">Anchor 3</a>
+        <a href="/docs/api.html#section4">Anchor 4</a>
+        <a href="/docs/api.html#section5">Anchor 5</a>
+        </article></body></html>
+ '''
+ soup = BeautifulSoup(html, 'html.parser')
+ page = self.converter.extract_content(soup, 'https://example.com/')
+
+ # All 5 links point to the same page, should result in only 1 URL
+ self.assertEqual(len(page['links']), 1)
+ self.assertEqual(page['links'][0], 'https://example.com/docs/api.html')
+
+ def test_extract_links_preserves_query_params(self):
+ """Test that query parameters are preserved when stripping anchors"""
+ html = '''
+        <html><body><article>
+        <h1>Test Page</h1>
+        <a href="/search?q=test#results">Search Result</a>
+        </article></body></html>
+ '''
+ soup = BeautifulSoup(html, 'html.parser')
+ page = self.converter.extract_content(soup, 'https://example.com/')
+
+ # Query params should be preserved, only anchor stripped
+ self.assertEqual(len(page['links']), 1)
+ self.assertEqual(page['links'][0], 'https://example.com/search?q=test')
+
+ def test_extract_links_relative_urls_with_anchors(self):
+ """Test that relative URLs with anchors are handled correctly"""
+ html = '''
+        <html><body><article>
+        <h1>Test Page</h1>
+        <a href="docs/guide.html#part1">Relative Link 1</a>
+        <a href="docs/guide.html#part2">Relative Link 2</a>
+        <a href="docs/tutorial.html#intro">Relative Link 3</a>
+        </article></body></html>
+ '''
+ soup = BeautifulSoup(html, 'html.parser')
+ page = self.converter.extract_content(soup, 'https://example.com/')
+
+ # Should have 2 unique URLs (guide.html and tutorial.html)
+ self.assertEqual(len(page['links']), 2)
+ self.assertIn('https://example.com/docs/guide.html', page['links'])
+ self.assertIn('https://example.com/docs/tutorial.html', page['links'])
+
+
class TestTextCleaning(unittest.TestCase):
"""Test text cleaning utility"""