Strip anchors from URLs so that the pages aren't duplicated
Signed-off-by: Joshua Shanks <jjshanks@gmail.com>
This commit is contained in:
@@ -196,7 +196,9 @@ class DocToSkillConverter:
|
||||
# Extract links
|
||||
for link in main.find_all('a', href=True):
|
||||
href = urljoin(url, link['href'])
|
||||
if self.is_valid_url(href):
|
||||
# Strip anchor fragments to avoid treating #anchors as separate pages
|
||||
href = href.split('#')[0]
|
||||
if self.is_valid_url(href) and href not in page['links']:
|
||||
page['links'].append(href)
|
||||
|
||||
return page
|
||||
|
||||
Reference in New Issue
Block a user