feat: support multiple sources of same type in unified scraper

- Add Markdown file parsing in doc_scraper (_extract_markdown_content, _extract_html_as_markdown)
- Add URL extraction and cleaning in llms_txt_parser (extract_urls, _clean_url)
- Support multiple documentation/github/pdf sources in unified_scraper
- Generate separate reference directories per source in unified_skill_builder
- Skip pages with empty/short content (<50 chars)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
tsyhahaha
2026-01-05 21:45:36 +08:00
parent 26474c29eb
commit 8cf43582a4
4 changed files with 529 additions and 112 deletions

View File

@@ -3,12 +3,67 @@
import re
from typing import Dict, List, Optional
from urllib.parse import urljoin
class LlmsTxtParser:
    """Parse llms.txt markdown content into page structures."""

    def __init__(self, content: str, base_url: Optional[str] = None):
        """
        Args:
            content: Raw llms.txt markdown text to parse.
            base_url: Optional base URL used to resolve relative links
                found in the content. When None, relative links are ignored.
        """
        self.content = content
        self.base_url = base_url
def extract_urls(self) -> List[str]:
"""
Extract all URLs from the llms.txt content.
Returns:
List of unique URLs found in the content
"""
urls = set()
# Match markdown links: [text](url)
md_links = re.findall(r'\[([^\]]*)\]\(([^)]+)\)', self.content)
for _, url in md_links:
if url.startswith('http'):
clean_url = self._clean_url(url)
if clean_url:
urls.add(clean_url)
elif self.base_url and not url.startswith('#'):
clean_url = self._clean_url(urljoin(self.base_url, url))
if clean_url:
urls.add(clean_url)
# Match bare URLs
bare_urls = re.findall(r'https?://[^\s\)\]<>"\']+', self.content)
for url in bare_urls:
# Clean trailing punctuation
url = url.rstrip('.,;:')
clean_url = self._clean_url(url)
if clean_url:
urls.add(clean_url)
return list(urls)
def _clean_url(self, url: str) -> str:
"""
Clean and validate URL, removing invalid anchor patterns.
Args:
url: URL to clean
Returns:
Cleaned URL or empty string if invalid
"""
# Skip URLs with path after anchor (e.g., #section/index.html.md)
# These are malformed and return duplicate HTML content
if '#' in url:
anchor_pos = url.index('#')
after_anchor = url[anchor_pos + 1:]
# If there's a path separator after anchor, it's invalid
if '/' in after_anchor:
# Extract the base URL without the malformed anchor
return url[:anchor_pos]
return url
def parse(self) -> List[Dict]:
"""