feat: support multiple sources of same type in unified scraper
- Add Markdown file parsing in doc_scraper (_extract_markdown_content, _extract_html_as_markdown)
- Add URL extraction and cleaning in llms_txt_parser (extract_urls, _clean_url); a usage sketch follows below
- Support multiple documentation/github/pdf sources in unified_scraper
- Generate separate reference directories per source in unified_skill_builder
- Skip pages with empty/short content (<50 chars)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
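A minimal usage sketch of the new extract_urls path. The import path and sample content are assumptions for illustration, not taken from this repo:

```python
# Hypothetical usage; assumes LlmsTxtParser is importable like this.
from llms_txt_parser import LlmsTxtParser

sample = """
# Example Docs
- [Guide](https://docs.example.com/guide)
- [API](/reference/api)
See also https://docs.example.com/faq.
"""

# base_url lets relative markdown links like /reference/api resolve via urljoin.
parser = LlmsTxtParser(sample, base_url="https://docs.example.com")
for url in sorted(parser.extract_urls()):
    print(url)
# https://docs.example.com/faq            (trailing '.' stripped by rstrip)
# https://docs.example.com/guide
# https://docs.example.com/reference/api
```

Duplicates between the markdown-link pass and the bare-URL pass collapse because a set is used internally; the sorted() here is only for deterministic output.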
@@ -3,12 +3,67 @@
 import re
 from typing import List, Dict
+from urllib.parse import urljoin


 class LlmsTxtParser:
     """Parse llms.txt markdown content into page structures"""

-    def __init__(self, content: str):
+    def __init__(self, content: str, base_url: str = None):
         self.content = content
+        self.base_url = base_url
+
+    def extract_urls(self) -> List[str]:
+        """
+        Extract all URLs from the llms.txt content.
+
+        Returns:
+            List of unique URLs found in the content
+        """
+        urls = set()
+
+        # Match markdown links: [text](url)
+        md_links = re.findall(r'\[([^\]]*)\]\(([^)]+)\)', self.content)
+        for _, url in md_links:
+            if url.startswith('http'):
+                clean_url = self._clean_url(url)
+                if clean_url:
+                    urls.add(clean_url)
+            elif self.base_url and not url.startswith('#'):
+                clean_url = self._clean_url(urljoin(self.base_url, url))
+                if clean_url:
+                    urls.add(clean_url)
+
+        # Match bare URLs
+        bare_urls = re.findall(r'https?://[^\s\)\]<>"\']+', self.content)
+        for url in bare_urls:
+            # Clean trailing punctuation
+            url = url.rstrip('.,;:')
+            clean_url = self._clean_url(url)
+            if clean_url:
+                urls.add(clean_url)
+
+        return list(urls)
+
+    def _clean_url(self, url: str) -> str:
+        """
+        Clean and validate URL, removing invalid anchor patterns.
+
+        Args:
+            url: URL to clean
+
+        Returns:
+            Cleaned URL or empty string if invalid
+        """
+        # Skip URLs with path after anchor (e.g., #section/index.html.md)
+        # These are malformed and return duplicate HTML content
+        if '#' in url:
+            anchor_pos = url.index('#')
+            after_anchor = url[anchor_pos + 1:]
+            # If there's a path separator after anchor, it's invalid
+            if '/' in after_anchor:
+                # Extract the base URL without the malformed anchor
+                return url[:anchor_pos]
+        return url

     def parse(self) -> List[Dict]:
         """
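The _clean_url rule in the diff above truncates at '#' only when a path separator follows the anchor. A small illustrative check, with inputs invented for the example:

```python
from llms_txt_parser import LlmsTxtParser  # assumed import path, as above

# Illustrative only; exercises the anchor handling shown in the diff.
parser = LlmsTxtParser("", base_url="https://docs.example.com")

# '#section/index.html.md' has a '/' after the anchor: treated as malformed,
# so everything from '#' onward is dropped.
assert parser._clean_url(
    "https://docs.example.com/api#section/index.html.md"
) == "https://docs.example.com/api"

# A plain anchor has no '/' after '#' and passes through unchanged.
assert parser._clean_url(
    "https://docs.example.com/api#section"
) == "https://docs.example.com/api#section"
```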