Strip anchor fragments from URLs so that pages aren't duplicated

Signed-off-by: Joshua Shanks <jjshanks@gmail.com>
This commit is contained in:
Joshua Shanks
2025-10-19 16:50:17 -07:00
parent b83f276621
commit e802dfee6d
2 changed files with 94 additions and 1 deletions

View File

@@ -196,7 +196,9 @@ class DocToSkillConverter:
# Extract links
for link in main.find_all('a', href=True):
href = urljoin(url, link['href'])
if self.is_valid_url(href):
# Strip anchor fragments to avoid treating #anchors as separate pages
href = href.split('#')[0]
if self.is_valid_url(href) and href not in page['links']:
page['links'].append(href)
return page