feat: support multiple sources of same type in unified scraper

- Add Markdown file parsing in doc_scraper (_extract_markdown_content, _extract_html_as_markdown)
- Add URL extraction and cleaning in llms_txt_parser (extract_urls, _clean_url)
- Support multiple documentation/github/pdf sources in unified_scraper
- Generate separate reference directories per source in unified_skill_builder
- Skip pages with empty/short content (<50 chars)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
tsyhahaha
2026-01-05 21:45:36 +08:00
parent 26474c29eb
commit 8cf43582a4
4 changed files with 529 additions and 112 deletions

View File

@@ -3,12 +3,67 @@
import re
from typing import Dict, List, Optional
from urllib.parse import urljoin
class LlmsTxtParser:
    """Parse llms.txt markdown content into page structures."""

    def __init__(self, content: str, base_url: Optional[str] = None):
        """
        Args:
            content: Raw llms.txt markdown text to parse.
            base_url: Optional base URL used to resolve relative links
                found in the content. When None, relative links are ignored.
        """
        self.content = content
        self.base_url = base_url
def extract_urls(self) -> List[str]:
"""
Extract all URLs from the llms.txt content.
Returns:
List of unique URLs found in the content
"""
urls = set()
# Match markdown links: [text](url)
md_links = re.findall(r'\[([^\]]*)\]\(([^)]+)\)', self.content)
for _, url in md_links:
if url.startswith('http'):
clean_url = self._clean_url(url)
if clean_url:
urls.add(clean_url)
elif self.base_url and not url.startswith('#'):
clean_url = self._clean_url(urljoin(self.base_url, url))
if clean_url:
urls.add(clean_url)
# Match bare URLs
bare_urls = re.findall(r'https?://[^\s\)\]<>"\']+', self.content)
for url in bare_urls:
# Clean trailing punctuation
url = url.rstrip('.,;:')
clean_url = self._clean_url(url)
if clean_url:
urls.add(clean_url)
return list(urls)
def _clean_url(self, url: str) -> str:
"""
Clean and validate URL, removing invalid anchor patterns.
Args:
url: URL to clean
Returns:
Cleaned URL or empty string if invalid
"""
# Skip URLs with path after anchor (e.g., #section/index.html.md)
# These are malformed and return duplicate HTML content
if '#' in url:
anchor_pos = url.index('#')
after_anchor = url[anchor_pos + 1:]
# If there's a path separator after anchor, it's invalid
if '/' in after_anchor:
# Extract the base URL without the malformed anchor
return url[:anchor_pos]
return url
def parse(self) -> List[Dict]:
"""