From 8cf43582a4a62974e71eaa1361b18c20dc63bb97 Mon Sep 17 00:00:00 2001
From: tsyhahaha <tsy1433701769@163.com>
Date: Mon, 5 Jan 2026 21:45:36 +0800
Subject: [PATCH 1/7] feat: support multiple sources of same type in unified
 scraper
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Add Markdown file parsing in doc_scraper (_extract_markdown_content, _extract_html_as_markdown)
- Add URL extraction and cleaning in llms_txt_parser (extract_urls, _clean_url)
- Support multiple documentation/github/pdf sources in unified_scraper
- Generate separate reference directories per source in unified_skill_builder
- Skip pages with empty/short content (<50 chars)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
---
 src/skill_seekers/cli/doc_scraper.py          | 274 +++++++++++++++++-
 src/skill_seekers/cli/llms_txt_parser.py      |  57 +++-
 src/skill_seekers/cli/unified_scraper.py      |  95 ++++--
 .../cli/unified_skill_builder.py              | 215 +++++++++-----
 4 files changed, 529 insertions(+), 112 deletions(-)
diff --git a/src/skill_seekers/cli/doc_scraper.py b/src/skill_seekers/cli/doc_scraper.py
index 74b1ee0..1e52181 100755
--- a/src/skill_seekers/cli/doc_scraper.py
+++ b/src/skill_seekers/cli/doc_scraper.py
@@ -349,6 +349,151 @@ class DocToSkillConverter:
 
         return page
 
+    def _extract_markdown_content(self, content: str, url: str) -> Dict[str, Any]:
+        """Extract content from a Markdown file.
+
+        Args:
+            content: Raw markdown content (or HTML if server returned HTML)
+            url: Source URL
+
+        Returns:
+            Page dict with title, content, code_samples, headings, links
+        """
+        import re
+
+        # Some .md URLs return HTML; sniff the prefix case-insensitively ("<!doctype html>" is common)
+        if content.lstrip()[:9].lower().startswith(('<!doctype', '<html')):
+            return self._extract_html_as_markdown(content, url)
+
+        page = {
+            'url': url,
+            'title': '',
+            'content': '',
+            'headings': [],
+            'code_samples': [],
+            'patterns': [],
+            'links': []
+        }
+
+        lines = content.split('\n')
+
+        # Extract title from first h1
+        for line in lines:
+            if line.startswith('# '):
+                page['title'] = line[2:].strip()
+                break
+
+        # Extract headings (h2-h6)
+        for line in lines:
+            match = re.match(r'^(#{2,6})\s+(.+)$', line)
+            if match:
+                level = len(match.group(1))
+                text = match.group(2).strip()
+                page['headings'].append({
+                    'level': f'h{level}',
+                    'text': text,
+                    'id': text.lower().replace(' ', '-')
+                })
+
+        # Extract code blocks with language
+        code_blocks = re.findall(r'```(\w+)?\n(.*?)```', content, re.DOTALL)
+        for lang, code in code_blocks:
+            if len(code.strip()) > 10:
+                page['code_samples'].append({
+                    'code': code.strip(),
+                    'language': lang or 'unknown'
+                })
+
+        # Extract content (paragraphs)
+        content_no_code = re.sub(r'```.*?```', '', content, flags=re.DOTALL)
+        paragraphs = []
+        for para in content_no_code.split('\n\n'):
+            text = para.strip()
+            # Skip headings and short text
+            if text and len(text) > 20 and not text.startswith('#'):
+                paragraphs.append(text)
+        page['content'] = '\n\n'.join(paragraphs)
+
+        # Extract links from markdown (only .md files to avoid client-side rendered HTML pages)
+        md_links = re.findall(r'\[([^\]]*)\]\(([^)]+)\)', content)
+        for _, href in md_links:
+            if href.startswith('http'):
+                full_url = href
+            elif not href.startswith('#'):
+                full_url = urljoin(url, href)
+            else:
+                continue
+            # Strip anchor fragments
+            full_url = full_url.split('#')[0]
+            # Only include .md URLs to avoid client-side rendered HTML pages
+            if '.md' in full_url and self.is_valid_url(full_url) and full_url not in page['links']:
+                page['links'].append(full_url)
+
+        return page
+
+    def _extract_html_as_markdown(self, html_content: str, url: str) -> Dict[str, Any]:
+        """Extract content from HTML and convert to markdown-like structure.
+
+        Args:
+            html_content: Raw HTML content
+            url: Source URL
+
+        Returns:
+            Page dict with title, content, code_samples, headings ('links' is left empty)
+        """
+        page = {
+            'url': url,
+            'title': '',
+            'content': '',
+            'headings': [],
+            'code_samples': [],
+            'patterns': [],
+            'links': []
+        }
+
+        soup = BeautifulSoup(html_content, 'html.parser')
+
+        # Try to extract title
+        title_elem = soup.select_one('title')
+        if title_elem:
+            page['title'] = self.clean_text(title_elem.get_text())
+
+        # Try to find main content area
+        main = soup.select_one('main, article, [role="main"], .content')
+        if not main:
+            main = soup.body if soup.body else soup
+
+        if main:
+            # Extract headings
+            for h in main.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
+                text = self.clean_text(h.get_text())
+                if text:
+                    page['headings'].append({
+                        'level': h.name,
+                        'text': text,
+                        'id': h.get('id', '')
+                    })
+
+            # One sample per <pre>; selecting 'pre code, pre' would store each <pre><code> twice
+            for pre_elem in main.find_all('pre'):
+                code_elem = pre_elem.find('code') or pre_elem
+                code = code_elem.get_text()
+                if len(code.strip()) > 10:
+                    page['code_samples'].append({
+                        'code': code.strip(),
+                        'language': self.detect_language(code_elem, code)
+                    })
+
+            # Extract paragraphs
+            paragraphs = []
+            for p in main.find_all('p'):
+                text = self.clean_text(p.get_text())
+                if text and len(text) > 20:
+                    paragraphs.append(text)
+            page['content'] = '\n\n'.join(paragraphs)
+
+        return page
+
     def detect_language(self, elem, code):
         """Detect programming language from code block
 
@@ -386,14 +531,19 @@ class DocToSkillConverter:
         return text.strip()
     
     def save_page(self, page: Dict[str, Any]) -> None:
-        """Save page data"""
+        """Save page data (skip pages with empty content)"""
+        # Skip pages with empty or very short content
+        if not page.get('content') or len(page.get('content', '')) < 50:
+            logger.debug("Skipping page with empty/short content: %s", page.get('url', 'unknown'))
+            return
+
         url_hash = hashlib.md5(page['url'].encode()).hexdigest()[:10]
         safe_title = re.sub(r'[^\w\s-]', '', page['title'])[:50]
         safe_title = re.sub(r'[-\s]+', '_', safe_title)
-        
+
         filename = f"{safe_title}_{url_hash}.json"
         filepath = os.path.join(self.data_dir, "pages", filename)
-        
+
         with open(filepath, 'w', encoding='utf-8') as f:
             json.dump(page, f, indent=2, ensure_ascii=False)
     
@@ -408,6 +558,7 @@ class DocToSkillConverter:
 
         Note:
             Uses threading locks when workers > 1 for thread safety
+            Supports both HTML pages and Markdown (.md) files
         """
         try:
             # Scraping part (no lock needed - independent)
@@ -415,8 +566,12 @@ class DocToSkillConverter:
             response = requests.get(url, headers=headers, timeout=30)
             response.raise_for_status()
 
-            soup = BeautifulSoup(response.content, 'html.parser')
-            page = self.extract_content(soup, url)
+            # Check if this is a Markdown file
+            if url.endswith('.md') or '.md' in url:
+                page = self._extract_markdown_content(response.text, url)
+            else:
+                soup = BeautifulSoup(response.content, 'html.parser')
+                page = self.extract_content(soup, url)
 
             # Thread-safe operations (lock required)
             if self.workers > 1:
@@ -463,6 +618,7 @@ class DocToSkillConverter:
 
         Note:
             Uses asyncio.Lock for async-safe operations instead of threading.Lock
+            Supports both HTML pages and Markdown (.md) files
         """
         async with semaphore:  # Limit concurrent requests
             try:
@@ -471,9 +627,13 @@ class DocToSkillConverter:
                 response = await client.get(url, headers=headers, timeout=30.0)
                 response.raise_for_status()
 
-                # BeautifulSoup parsing (still synchronous, but fast)
-                soup = BeautifulSoup(response.content, 'html.parser')
-                page = self.extract_content(soup, url)
+                # Check if this is a Markdown file
+                if url.endswith('.md') or '.md' in url:
+                    page = self._extract_markdown_content(response.text, url)
+                else:
+                    # BeautifulSoup parsing (still synchronous, but fast)
+                    soup = BeautifulSoup(response.content, 'html.parser')
+                    page = self.extract_content(soup, url)
 
                 # Async-safe operations (no lock needed - single event loop)
                 logger.info("  %s", url)
@@ -493,6 +653,56 @@ class DocToSkillConverter:
             except Exception as e:
                 logger.error("  ✗ Error scraping %s: %s: %s", url, type(e).__name__, e)
 
+    def _convert_to_md_urls(self, urls: List[str]) -> List[str]:
+        """
+        Convert URLs to .md format by appending /index.html.md to non-.md URLs.
+        URLs are queued without an existence check; they are validated when crawled.
+
+        Args:
+            urls: List of URLs to process
+
+        Returns:
+            List of unique .md URLs in input order (unverified)
+        """
+        md_urls = []
+        seen = set()
+
+        for url in urls:
+            if '.md' not in url:
+                # Drop any #fragment so the suffix lands on the path (no HEAD check is sent)
+                url = f"{url.split('#')[0].rstrip('/')}/index.html.md"
+            if url not in seen:
+                seen.add(url)
+                md_urls.append(url)
+
+        logger.info("  ✓ Converted %d URLs to .md format (will validate during crawl)", len(md_urls))
+        return md_urls
+
+    # ORIGINAL _convert_to_md_urls (with HEAD request validation):
+    # def _convert_to_md_urls(self, urls: List[str]) -> List[str]:
+    #     md_urls = []
+    #     non_md_urls = []
+    #     for url in urls:
+    #         if '.md' in url:
+    #             md_urls.append(url)
+    #         else:
+    #             non_md_urls.append(url)
+    #     if non_md_urls:
+    #         logger.info("  🔄 Trying to convert %d non-.md URLs to .md format...", len(non_md_urls))
+    #         converted = 0
+    #         for url in non_md_urls:
+    #             url = url.rstrip('/')
+    #             md_url = f"{url}/index.html.md"
+    #             try:
+    #                 resp = requests.head(md_url, timeout=5, allow_redirects=True)
+    #                 if resp.status_code == 200:
+    #                     md_urls.append(md_url)
+    #                     converted += 1
+    #             except Exception:
+    #                 pass
+    #         logger.info("  ✓ Converted %d URLs to .md format", converted)
+    #     return md_urls
+
     def _try_llms_txt(self) -> bool:
         """
         Try to use llms.txt instead of HTML scraping.
@@ -548,7 +758,29 @@ class DocToSkillConverter:
                             logger.info("     ✓ %s (%d chars)", extra_filename, len(extra_content))
 
                 # Parse explicit file for skill building
-                parser = LlmsTxtParser(content)
+                parser = LlmsTxtParser(content, self.base_url)
+
+                # Extract URLs from llms.txt and add to pending_urls for BFS crawling
+                extracted_urls = parser.extract_urls()
+                if extracted_urls:
+                    # Convert non-.md URLs to .md format by trying /index.html.md suffix
+                    md_urls = self._convert_to_md_urls(extracted_urls)
+                    logger.info("\n🔗 Found %d URLs in llms.txt (%d .md files), starting BFS crawl...",
+                               len(extracted_urls), len(md_urls))
+
+                    # Filter URLs based on url_patterns config
+                    for url in md_urls:
+                        if self.is_valid_url(url) and url not in self.visited_urls:
+                            self.pending_urls.append(url)
+
+                    logger.info("  📋 %d URLs added to crawl queue after filtering", len(self.pending_urls))
+
+                    # Return False to trigger HTML scraping with the populated pending_urls
+                    self.llms_txt_detected = True
+                    self.llms_txt_variant = 'explicit'
+                    return False  # Continue with BFS crawling
+
+                # Fallback: if no URLs found, use section-based parsing
                 pages = parser.parse()
 
                 if pages:
@@ -606,7 +838,29 @@ class DocToSkillConverter:
         largest = max(downloaded.items(), key=lambda x: x[1]['size'])
         logger.info("\n📄 Parsing %s for skill building...", largest[1]['filename'])
 
-        parser = LlmsTxtParser(largest[1]['content'])
+        parser = LlmsTxtParser(largest[1]['content'], self.base_url)
+
+        # Extract URLs from llms.txt and add to pending_urls for BFS crawling
+        extracted_urls = parser.extract_urls()
+        if extracted_urls:
+            # Convert non-.md URLs to .md format by trying /index.html.md suffix
+            md_urls = self._convert_to_md_urls(extracted_urls)
+            logger.info("\n🔗 Found %d URLs in llms.txt (%d .md files), starting BFS crawl...",
+                       len(extracted_urls), len(md_urls))
+
+            # Filter URLs based on url_patterns config
+            for url in md_urls:
+                if self.is_valid_url(url) and url not in self.visited_urls:
+                    self.pending_urls.append(url)
+
+            logger.info("  📋 %d URLs added to crawl queue after filtering", len(self.pending_urls))
+
+            # Return False to trigger HTML scraping with the populated pending_urls
+            self.llms_txt_detected = True
+            self.llms_txt_variants = list(downloaded.keys())
+            return False  # Continue with BFS crawling
+
+        # Fallback: if no URLs found, use section-based parsing
         pages = parser.parse()
 
         if not pages:
diff --git a/src/skill_seekers/cli/llms_txt_parser.py b/src/skill_seekers/cli/llms_txt_parser.py
index e288c92..2e143bf 100644
--- a/src/skill_seekers/cli/llms_txt_parser.py
+++ b/src/skill_seekers/cli/llms_txt_parser.py
@@ -3,12 +3,67 @@
 
 import re
 from typing import List, Dict
+from urllib.parse import urljoin
 
 class LlmsTxtParser:
     """Parse llms.txt markdown content into page structures"""
 
-    def __init__(self, content: str):
+    def __init__(self, content: str, base_url: str = None):
         self.content = content
+        self.base_url = base_url
+
+    def extract_urls(self) -> List[str]:
+        """
+        Extract all URLs from the llms.txt content.
+
+        Returns:
+            List of unique URLs in first-seen order (markdown links, then bare URLs)
+        """
+        urls = {}  # dict as an insertion-ordered set, so the crawl order is reproducible
+
+        # Match markdown links: [text](url)
+        md_links = re.findall(r'\[([^\]]*)\]\(([^)]+)\)', self.content)
+        for _, url in md_links:
+            if url.startswith('http'):
+                clean_url = self._clean_url(url)
+                if clean_url:
+                    urls[clean_url] = None
+            elif self.base_url and not url.startswith('#'):
+                clean_url = self._clean_url(urljoin(self.base_url, url))
+                if clean_url:
+                    urls[clean_url] = None
+
+        # Match bare URLs
+        bare_urls = re.findall(r'https?://[^\s\)\]<>"\']+', self.content)
+        for url in bare_urls:
+            # Clean trailing punctuation
+            url = url.rstrip('.,;:')
+            clean_url = self._clean_url(url)
+            if clean_url:
+                urls[clean_url] = None
+
+        return list(urls)
+
+    def _clean_url(self, url: str) -> str:
+        """
+        Strip a malformed anchor (one that contains a path) from a URL.
+
+        Args:
+            url: URL to clean
+
+        Returns:
+            URL truncated before '#' when the fragment contains '/'; otherwise unchanged
+        """
+        # Fragments with a path after the anchor (e.g., #section/index.html.md)
+        # are malformed and return duplicate HTML content
+        if '#' in url:
+            anchor_pos = url.index('#')
+            after_anchor = url[anchor_pos + 1:]
+            # A path separator after the anchor marks the fragment as malformed
+            if '/' in after_anchor:
+                # Keep only the base URL without the malformed anchor
+                return url[:anchor_pos]
+        return url
 
     def parse(self) -> List[Dict]:
         """
diff --git a/src/skill_seekers/cli/unified_scraper.py b/src/skill_seekers/cli/unified_scraper.py
index f8b5dcf..f294f89 100644
--- a/src/skill_seekers/cli/unified_scraper.py
+++ b/src/skill_seekers/cli/unified_scraper.py
@@ -70,8 +70,12 @@ class UnifiedScraper:
         self.merge_mode = merge_mode or self.config.get('merge_mode', 'rule-based')
         logger.info(f"Merge mode: {self.merge_mode}")
 
-        # Storage for scraped data
-        self.scraped_data = {}
+        # Storage for scraped data - use lists to support multiple sources of same type
+        self.scraped_data = {
+            'documentation': [],  # List of doc sources
+            'github': [],         # List of github sources
+            'pdf': []             # List of pdf sources
+        }
 
         # Output paths
         self.name = self.config['name']
@@ -81,6 +85,9 @@ class UnifiedScraper:
         os.makedirs(self.output_dir, exist_ok=True)
         os.makedirs(self.data_dir, exist_ok=True)
 
+        # Track source index for unique naming
+        self._source_counters = {'documentation': 0, 'github': 0, 'pdf': 0}
+
     def scrape_all_sources(self):
         """
         Scrape all configured sources.
@@ -114,13 +121,22 @@ class UnifiedScraper:
                 logger.error(f"Error scraping {source_type}: {e}")
                 logger.info("Continuing with other sources...")
 
-        logger.info(f"\n✅ Scraped {len(self.scraped_data)} sources successfully")
+        logger.info(f"\n✅ Scraped {sum(len(v) for v in self.scraped_data.values())} sources successfully")
 
     def _scrape_documentation(self, source: Dict[str, Any]):
         """Scrape documentation website."""
-        # Create temporary config for doc scraper
+        # Get unique index for this documentation source
+        idx = self._source_counters['documentation']
+        self._source_counters['documentation'] += 1
+
+        # Extract source identifier from URL for unique naming
+        from urllib.parse import urlparse
+        parsed = urlparse(source['base_url'])
+        source_id = parsed.netloc.replace('.', '_').replace(':', '_')
+
+        # Create temporary config for doc scraper with unique name
         doc_config = {
-            'name': f"{self.name}_docs",
+            'name': f"{self.name}_docs_{idx}_{source_id}",
             'base_url': source['base_url'],
             'selectors': source.get('selectors', {}),
             'url_patterns': source.get('url_patterns', {}),
@@ -164,10 +180,15 @@ class UnifiedScraper:
             with open(docs_data_file, 'r', encoding='utf-8') as f:
                 summary = json.load(f)
 
-            self.scraped_data['documentation'] = {
+            # Append to list instead of overwriting
+            self.scraped_data['documentation'].append({
+                'source_id': source_id,
+                'base_url': source['base_url'],
                 'pages': summary.get('pages', []),
-                'data_file': docs_data_file
-            }
+                'total_pages': summary.get('total_pages', 0),
+                'data_file': docs_data_file,
+                'refs_dir': f"output/{doc_config['name']}/references"
+            })
 
             logger.info(f"✅ Documentation: {summary.get('total_pages', 0)} pages scraped")
         else:
@@ -185,10 +206,18 @@ class UnifiedScraper:
             logger.error("github_scraper.py not found")
             return
 
+        # Get unique index for this GitHub source
+        idx = self._source_counters['github']
+        self._source_counters['github'] += 1
+
+        # Extract repo identifier for unique naming
+        repo = source['repo']
+        repo_id = repo.replace('/', '_')
+
         # Create config for GitHub scraper
         github_config = {
-            'repo': source['repo'],
-            'name': f"{self.name}_github",
+            'repo': repo,
+            'name': f"{self.name}_github_{idx}_{repo_id}",
             'github_token': source.get('github_token'),
             'include_issues': source.get('include_issues', True),
             'max_issues': source.get('max_issues', 100),
@@ -197,7 +226,7 @@ class UnifiedScraper:
             'include_code': source.get('include_code', True),
             'code_analysis_depth': source.get('code_analysis_depth', 'surface'),
             'file_patterns': source.get('file_patterns', []),
-            'local_repo_path': source.get('local_repo_path')  # Pass local_repo_path from config
+            'local_repo_path': source.get('local_repo_path')
         }
 
         # Pass directory exclusions if specified (optional)
@@ -207,19 +236,22 @@ class UnifiedScraper:
             github_config['exclude_dirs_additional'] = source['exclude_dirs_additional']
 
         # Scrape
-        logger.info(f"Scraping GitHub repository: {source['repo']}")
+        logger.info(f"Scraping GitHub repository: {repo}")
         scraper = GitHubScraper(github_config)
         github_data = scraper.scrape()
 
-        # Save data
-        github_data_file = os.path.join(self.data_dir, 'github_data.json')
+        # Save data with unique filename
+        github_data_file = os.path.join(self.data_dir, f'github_data_{idx}_{repo_id}.json')
         with open(github_data_file, 'w', encoding='utf-8') as f:
             json.dump(github_data, f, indent=2, ensure_ascii=False)
 
-        self.scraped_data['github'] = {
+        # Append to list instead of overwriting
+        self.scraped_data['github'].append({
+            'repo': repo,
+            'repo_id': repo_id,
             'data': github_data,
             'data_file': github_data_file
-        }
+        })
 
         logger.info(f"✅ GitHub: Repository scraped successfully")
 
@@ -274,14 +306,23 @@ class UnifiedScraper:
             logger.info("No API merge needed (only one API source)")
             return []
 
-        # Get documentation and GitHub data
-        docs_data = self.scraped_data.get('documentation', {})
-        github_data = self.scraped_data.get('github', {})
+        # Get documentation and GitHub data (now lists)
+        docs_list = self.scraped_data.get('documentation', [])
+        github_list = self.scraped_data.get('github', [])
 
-        if not docs_data or not github_data:
+        if not docs_list or not github_list:
             logger.warning("Missing documentation or GitHub data for conflict detection")
             return []
 
+        # For conflict detection, combine all docs and all github data
+        # Use the first of each for now (conflict detection is optional)
+        docs_data = docs_list[0] if docs_list else {}
+        github_data = github_list[0] if github_list else {}
+
+        if not docs_data.get('data_file') or not github_data.get('data_file'):
+            logger.warning("Missing data files for conflict detection")
+            return []
+
         # Load data files
         with open(docs_data['data_file'], 'r', encoding='utf-8') as f:
             docs_json = json.load(f)
@@ -328,9 +369,17 @@ class UnifiedScraper:
             logger.info("No conflicts to merge")
             return None
 
-        # Get data files
-        docs_data = self.scraped_data.get('documentation', {})
-        github_data = self.scraped_data.get('github', {})
+        # Get data files (now lists)
+        docs_list = self.scraped_data.get('documentation', [])
+        github_list = self.scraped_data.get('github', [])
+
+        if not docs_list or not github_list:
+            logger.warning("Missing data for merge")
+            return None
+
+        # Use first source of each type for merge
+        docs_data = docs_list[0]
+        github_data = github_list[0]
 
         # Load data
         with open(docs_data['data_file'], 'r', encoding='utf-8') as f:
diff --git a/src/skill_seekers/cli/unified_skill_builder.py b/src/skill_seekers/cli/unified_skill_builder.py
index b8f9700..a80f86d 100644
--- a/src/skill_seekers/cli/unified_skill_builder.py
+++ b/src/skill_seekers/cli/unified_skill_builder.py
@@ -268,118 +268,177 @@ This skill combines knowledge from multiple sources:
         """Generate reference files organized by source."""
         logger.info("Generating reference files...")
 
-        # Generate references for each source type
-        if 'documentation' in self.scraped_data:
-            self._generate_docs_references()
+        # Generate references for each source type (now lists)
+        docs_list = self.scraped_data.get('documentation', [])
+        if docs_list:
+            self._generate_docs_references(docs_list)
 
-        if 'github' in self.scraped_data:
-            self._generate_github_references()
+        github_list = self.scraped_data.get('github', [])
+        if github_list:
+            self._generate_github_references(github_list)
 
-        if 'pdf' in self.scraped_data:
-            self._generate_pdf_references()
+        pdf_list = self.scraped_data.get('pdf', [])
+        if pdf_list:
+            self._generate_pdf_references(pdf_list)
 
         # Generate merged API reference if available
         if self.merged_data:
             self._generate_merged_api_reference()
 
-    def _generate_docs_references(self):
-        """Generate references from documentation source."""
+    def _generate_docs_references(self, docs_list: List[Dict]):
+        """Generate references from multiple documentation sources."""
         docs_dir = os.path.join(self.skill_dir, 'references', 'documentation')
         os.makedirs(docs_dir, exist_ok=True)
 
-        # Best-effort: copy docs-only reference files into unified docs references.
-        # UnifiedScraper runs doc_scraper using name "{name}_docs", which creates
-        # output/{name}_docs/references/*.md. Those are the most useful documentation
-        # references for the unified skill.
-        source_refs_dir = os.path.join('output', f"{self.name}_docs", 'references')
-        copied_files: List[str] = []
+        all_copied_files: List[str] = []
 
-        if os.path.isdir(source_refs_dir):
-            for entry in sorted(os.listdir(source_refs_dir)):
-                src_path = os.path.join(source_refs_dir, entry)
-                dst_path = os.path.join(docs_dir, entry)
-                if not os.path.isfile(src_path):
-                    continue
-                shutil.copy2(src_path, dst_path)
-                copied_files.append(entry)
+        # Process each documentation source
+        for i, doc_source in enumerate(docs_list):
+            source_id = doc_source.get('source_id', f'source_{i}')
+            base_url = doc_source.get('base_url', 'Unknown')
+            refs_dir = doc_source.get('refs_dir', '')
 
-        # Create index
+            # Create subdirectory for this source
+            source_dir = os.path.join(docs_dir, source_id)
+            os.makedirs(source_dir, exist_ok=True)
+
+            copied_files: List[str] = []
+
+            if refs_dir and os.path.isdir(refs_dir):
+                for entry in sorted(os.listdir(refs_dir)):
+                    src_path = os.path.join(refs_dir, entry)
+                    dst_path = os.path.join(source_dir, entry)
+                    if not os.path.isfile(src_path):
+                        continue
+                    shutil.copy2(src_path, dst_path)
+                    copied_files.append(entry)
+
+            # Create index for this source
+            source_index_path = os.path.join(source_dir, 'index.md')
+            with open(source_index_path, 'w', encoding='utf-8') as f:
+                f.write(f"# Documentation: {source_id}\n\n")
+                f.write(f"**Source**: {base_url}\n\n")
+                f.write(f"**Pages**: {doc_source.get('total_pages', 'N/A')}\n\n")
+
+                if copied_files:
+                    files_no_index = [p for p in copied_files if p.lower() != 'index.md']
+                    f.write("## Files\n\n")
+                    for filename in files_no_index:
+                        f.write(f"- [{filename}]({filename})\n")
+                else:
+                    f.write("No reference files available.\n")
+
+            all_copied_files.extend(copied_files)
+
+        # Create main index
         index_path = os.path.join(docs_dir, 'index.md')
         with open(index_path, 'w', encoding='utf-8') as f:
-            f.write("# Documentation\n\n")
-            f.write("Reference from official documentation.\n\n")
+            f.write("# Documentation References\n\n")
+            f.write(f"Combined from {len(docs_list)} documentation sources.\n\n")
 
-            if copied_files:
-                files_no_index = [p for p in copied_files if p.lower() != 'index.md']
-                files_index = [p for p in copied_files if p.lower() == 'index.md']
+            f.write("## Sources\n\n")
+            for doc_source in docs_list:
+                source_id = doc_source.get('source_id', 'unknown')
+                base_url = doc_source.get('base_url', 'Unknown')
+                total_pages = doc_source.get('total_pages', 'N/A')
+                f.write(f"- [{source_id}]({source_id}/index.md) - {base_url} ({total_pages} pages)\n")
 
-                f.write("## Files\n\n")
-                for filename in files_no_index + files_index:
-                    f.write(f"- [{filename}]({filename})\n")
-            else:
-                f.write("## Notes\n\n")
-                f.write(
-                    "No documentation reference files were copied into this unified skill. "
-                    "This usually means the docs-only build did not produce reference files.\n"
-                )
+        logger.info(f"Created documentation references ({len(docs_list)} sources)")
 
-        logger.info("Created documentation references")
-
-    def _generate_github_references(self):
-        """Generate references from GitHub source."""
+    def _generate_github_references(self, github_list: List[Dict]):
+        """Generate references from multiple GitHub sources."""
         github_dir = os.path.join(self.skill_dir, 'references', 'github')
         os.makedirs(github_dir, exist_ok=True)
 
-        github_data = self.scraped_data['github']['data']
+        # Process each GitHub source
+        for i, github_source in enumerate(github_list):
+            repo = github_source.get('repo', f'repo_{i}')
+            repo_id = github_source.get('repo_id', repo.replace('/', '_'))
+            github_data = github_source.get('data', {})
 
-        # Create README reference
-        if github_data.get('readme'):
-            readme_path = os.path.join(github_dir, 'README.md')
-            with open(readme_path, 'w') as f:
-                f.write("# Repository README\n\n")
-                f.write(github_data['readme'])
+            # Create subdirectory for this repo
+            repo_dir = os.path.join(github_dir, repo_id)
+            os.makedirs(repo_dir, exist_ok=True)
 
-        # Create issues reference
-        if github_data.get('issues'):
-            issues_path = os.path.join(github_dir, 'issues.md')
-            with open(issues_path, 'w') as f:
-                f.write("# GitHub Issues\n\n")
-                f.write(f"{len(github_data['issues'])} recent issues.\n\n")
+            # Create README reference
+            if github_data.get('readme'):
+                readme_path = os.path.join(repo_dir, 'README.md')
+                with open(readme_path, 'w', encoding='utf-8') as f:
+                    f.write(f"# Repository README: {repo}\n\n")
+                    f.write(github_data['readme'])
 
-                for issue in github_data['issues'][:20]:
-                    f.write(f"## #{issue['number']}: {issue['title']}\n\n")
-                    f.write(f"**State**: {issue['state']}\n")
-                    if issue.get('labels'):
-                        f.write(f"**Labels**: {', '.join(issue['labels'])}\n")
-                    f.write(f"**URL**: {issue.get('url', 'N/A')}\n\n")
+            # Create issues reference
+            if github_data.get('issues'):
+                issues_path = os.path.join(repo_dir, 'issues.md')
+                with open(issues_path, 'w', encoding='utf-8') as f:
+                    f.write(f"# GitHub Issues: {repo}\n\n")
+                    f.write(f"{len(github_data['issues'])} recent issues.\n\n")
 
-        # Create releases reference
-        if github_data.get('releases'):
-            releases_path = os.path.join(github_dir, 'releases.md')
-            with open(releases_path, 'w') as f:
-                f.write("# Releases\n\n")
+                    for issue in github_data['issues'][:20]:
+                        f.write(f"## #{issue['number']}: {issue['title']}\n\n")
+                        f.write(f"**State**: {issue['state']}\n")
+                        if issue.get('labels'):
+                            f.write(f"**Labels**: {', '.join(issue['labels'])}\n")
+                        f.write(f"**URL**: {issue.get('url', 'N/A')}\n\n")
 
-                for release in github_data['releases'][:10]:
-                    f.write(f"## {release['tag_name']}: {release.get('name', 'N/A')}\n\n")
-                    f.write(f"**Published**: {release.get('published_at', 'N/A')[:10]}\n\n")
-                    if release.get('body'):
-                        f.write(release['body'][:500])
-                        f.write("\n\n")
+            # Create releases reference
+            if github_data.get('releases'):
+                releases_path = os.path.join(repo_dir, 'releases.md')
+                with open(releases_path, 'w', encoding='utf-8') as f:
+                    f.write(f"# Releases: {repo}\n\n")
 
-        logger.info("Created GitHub references")
+                    for release in github_data['releases'][:10]:
+                        f.write(f"## {release['tag_name']}: {release.get('name', 'N/A')}\n\n")
+                        f.write(f"**Published**: {release.get('published_at', 'N/A')[:10]}\n\n")
+                        if release.get('body'):
+                            f.write(release['body'][:500])
+                            f.write("\n\n")
 
-    def _generate_pdf_references(self):
-        """Generate references from PDF source."""
+            # Create index for this repo
+            repo_index_path = os.path.join(repo_dir, 'index.md')
+            repo_info = github_data.get('repo_info', {})
+            with open(repo_index_path, 'w', encoding='utf-8') as f:
+                f.write(f"# GitHub: {repo}\n\n")
+                f.write(f"**Stars**: {repo_info.get('stars', 'N/A')}\n")
+                f.write(f"**Language**: {repo_info.get('language', 'N/A')}\n")
+                f.write(f"**Issues**: {len(github_data.get('issues', []))}\n")
+                f.write(f"**Releases**: {len(github_data.get('releases', []))}\n\n")
+                f.write("## Files\n\n")
+                f.write("- [README.md](README.md)\n")
+                if github_data.get('issues'):
+                    f.write("- [issues.md](issues.md)\n")
+                if github_data.get('releases'):
+                    f.write("- [releases.md](releases.md)\n")
+
+        # Create main index
+        index_path = os.path.join(github_dir, 'index.md')
+        with open(index_path, 'w', encoding='utf-8') as f:
+            f.write("# GitHub References\n\n")
+            f.write(f"Combined from {len(github_list)} GitHub repositories.\n\n")
+
+            f.write("## Repositories\n\n")
+            for github_source in github_list:
+                repo = github_source.get('repo', 'unknown')
+                repo_id = github_source.get('repo_id', repo.replace('/', '_'))
+                github_data = github_source.get('data', {})
+                repo_info = github_data.get('repo_info', {})
+                stars = repo_info.get('stars', 'N/A')
+                f.write(f"- [{repo}]({repo_id}/index.md) - {stars} stars\n")
+
+        logger.info(f"Created GitHub references ({len(github_list)} repos)")
+
+    def _generate_pdf_references(self, pdf_list: List[Dict]):
+        """Generate references from PDF sources."""
         pdf_dir = os.path.join(self.skill_dir, 'references', 'pdf')
         os.makedirs(pdf_dir, exist_ok=True)
 
         # Create index
         index_path = os.path.join(pdf_dir, 'index.md')
-        with open(index_path, 'w') as f:
+        with open(index_path, 'w', encoding='utf-8') as f:
             f.write("# PDF Documentation\n\n")
-            f.write("Reference from PDF document.\n\n")
+            f.write(f"Reference from {len(pdf_list)} PDF document(s).\n\n")
 
-        logger.info("Created PDF references")
+        logger.info(f"Created PDF references ({len(pdf_list)} sources)")
 
     def _generate_merged_api_reference(self):
         """Generate merged API reference file."""

From 4b764ed1c50317511163a0126fc74be1c20007e4 Mon Sep 17 00:00:00 2001
From: tsyhahaha <tsy1433701769@163.com>
Date: Mon, 5 Jan 2026 22:13:19 +0800
Subject: [PATCH 2/7] test: add unit tests for markdown parsing and
 multi-source features
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Add test_markdown_parsing.py with 20 tests covering:
  - Markdown content extraction (titles, headings, code blocks, links)
  - HTML fallback when .md URL returns HTML
  - llms.txt URL extraction and cleaning
  - Empty/short content filtering

- Add test_multi_source.py with 12 tests covering:
  - List-based scraped_data structure
  - Per-source subdirectory generation for docs/github/pdf
  - Index file generation for each source type

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
---
 tests/test_markdown_parsing.py | 359 +++++++++++++++++++++++++++
 tests/test_multi_source.py     | 433 +++++++++++++++++++++++++++++++++
 2 files changed, 792 insertions(+)
 create mode 100644 tests/test_markdown_parsing.py
 create mode 100644 tests/test_multi_source.py

diff --git a/tests/test_markdown_parsing.py b/tests/test_markdown_parsing.py
new file mode 100644
index 0000000..9917225
--- /dev/null
+++ b/tests/test_markdown_parsing.py
@@ -0,0 +1,359 @@
+"""
+Tests for Markdown parsing and llms.txt URL extraction features.
+
+Tests the following functionality:
+1. Markdown file content extraction (_extract_markdown_content)
+2. HTML fallback when .md URL returns HTML (_extract_html_as_markdown)
+3. URL extraction from llms.txt (extract_urls, _clean_url)
+4. Empty/short content filtering in save_page
+"""
+
+import unittest
+import tempfile
+import os
+import shutil
+
+
+class TestMarkdownContentExtraction(unittest.TestCase):
+    """Test Markdown file parsing in doc_scraper."""
+
+    def setUp(self):
+        """Set up test fixtures."""
+        from skill_seekers.cli.doc_scraper import DocToSkillConverter
+
+        self.config = {
+            'name': 'test_md_parsing',
+            'base_url': 'https://example.com',
+            'selectors': {},
+            'url_patterns': {'include': [], 'exclude': []},
+            'categories': {}
+        }
+        self.converter = DocToSkillConverter(self.config)
+
+    def tearDown(self):
+        """Clean up output directory."""
+        output_dir = f"output/{self.config['name']}_data"
+        if os.path.exists(output_dir):
+            shutil.rmtree(output_dir)
+
+    def test_extract_title_from_h1(self):
+        """Test extracting title from first h1."""
+        content = "# My Documentation Title\n\nSome content here."
+        result = self.converter._extract_markdown_content(content, "https://example.com/test.md")
+        self.assertEqual(result['title'], "My Documentation Title")
+
+    def test_extract_headings_h2_to_h6(self):
+        """Test extracting h2-h6 headings (not h1)."""
+        content = """# Title
+
+## Section One
+### Subsection A
+#### Deep Section
+##### Deeper
+###### Deepest
+
+Content here.
+"""
+        result = self.converter._extract_markdown_content(content, "https://example.com/test.md")
+        # Should have 5 headings (h2-h6), not h1
+        self.assertEqual(len(result['headings']), 5)
+        self.assertEqual(result['headings'][0]['level'], 'h2')
+        self.assertEqual(result['headings'][0]['text'], 'Section One')
+
+    def test_extract_code_blocks_with_language(self):
+        """Test extracting code blocks with language tags."""
+        content = """# API Guide
+
+```python
+def hello():
+    return "Hello, World!"
+```
+
+Some explanation.
+
+```javascript
+const greet = () => console.log("Hi");
+```
+
+```
+plain code without language
+```
+"""
+        result = self.converter._extract_markdown_content(content, "https://example.com/test.md")
+        self.assertEqual(len(result['code_samples']), 3)
+        self.assertEqual(result['code_samples'][0]['language'], 'python')
+        self.assertEqual(result['code_samples'][1]['language'], 'javascript')
+        self.assertEqual(result['code_samples'][2]['language'], 'unknown')
+
+    def test_extract_markdown_links_only_md_files(self):
+        """Test that only .md links are extracted."""
+        content = """# Links
+
+- [Markdown Doc](./guide.md)
+- [Another MD](https://example.com/api.md)
+- [HTML Page](./page.html)
+- [External](https://google.com)
+"""
+        result = self.converter._extract_markdown_content(content, "https://example.com/docs/test.md")
+        # Should only include .md links
+        md_links = [l for l in result['links'] if '.md' in l]
+        self.assertEqual(len(md_links), len(result['links']))
+
+    def test_extract_content_paragraphs(self):
+        """Test extracting paragraph content."""
+        content = """# Title
+
+This is a paragraph with enough content to pass the minimum length filter.
+
+Short.
+
+Another paragraph that should be included in the final content output.
+"""
+        result = self.converter._extract_markdown_content(content, "https://example.com/test.md")
+        self.assertIn("paragraph with enough content", result['content'])
+        self.assertNotIn("Short.", result['content'])
+
+    def test_detect_html_in_md_url(self):
+        """Test that HTML content is detected when .md URL returns HTML."""
+        html_content = "<!DOCTYPE html><html><head><title>Page</title></head><body><h1>Hello</h1></body></html>"
+        result = self.converter._extract_markdown_content(html_content, "https://example.com/test.md")
+        self.assertEqual(result['title'], "Page")
+
+
+class TestHtmlAsMarkdownExtraction(unittest.TestCase):
+    """Test HTML to markdown-like extraction."""
+
+    def setUp(self):
+        """Set up test fixtures."""
+        from skill_seekers.cli.doc_scraper import DocToSkillConverter
+
+        self.config = {
+            'name': 'test_html_fallback',
+            'base_url': 'https://example.com',
+            'selectors': {},
+            'url_patterns': {'include': [], 'exclude': []},
+            'categories': {}
+        }
+        self.converter = DocToSkillConverter(self.config)
+
+    def tearDown(self):
+        """Clean up output directory."""
+        output_dir = f"output/{self.config['name']}_data"
+        if os.path.exists(output_dir):
+            shutil.rmtree(output_dir)
+
+    def test_extract_title_from_html(self):
+        """Test extracting title from HTML title tag."""
+        html = "<html><head><title>My Page Title</title></head><body></body></html>"
+        result = self.converter._extract_html_as_markdown(html, "https://example.com/test.md")
+        self.assertEqual(result['title'], "My Page Title")
+
+    def test_find_main_content_area(self):
+        """Test finding main content from various selectors."""
+        html = """
+        <html><body>
+            <nav>Navigation</nav>
+            <main>
+                <h1>Main Content</h1>
+                <p>This is the main content area with enough text to pass filters.</p>
+            </main>
+            <footer>Footer</footer>
+        </body></html>
+        """
+        result = self.converter._extract_html_as_markdown(html, "https://example.com/test.md")
+        self.assertIn("main content area", result['content'].lower())
+
+    def test_extract_code_blocks_from_html(self):
+        """Test extracting code blocks from HTML pre/code tags."""
+        html = """
+        <html><body>
+            <main>
+                <pre><code class="language-python">print("hello")</code></pre>
+            </main>
+        </body></html>
+        """
+        result = self.converter._extract_html_as_markdown(html, "https://example.com/test.md")
+        self.assertTrue(len(result['code_samples']) > 0)
+
+    def test_fallback_to_body_when_no_main(self):
+        """Test fallback to body when no main/article element."""
+        html = """
+        <html><body>
+            <div>
+                <h2>Section</h2>
+                <p>Content in body without main element, long enough to pass filter.</p>
+            </div>
+        </body></html>
+        """
+        result = self.converter._extract_html_as_markdown(html, "https://example.com/test.md")
+        self.assertTrue(len(result['headings']) > 0 or len(result['content']) > 0)
+
+
+class TestLlmsTxtUrlExtraction(unittest.TestCase):
+    """Test URL extraction from llms.txt content."""
+
+    def test_extract_markdown_style_links(self):
+        """Test extracting [text](url) style links."""
+        from skill_seekers.cli.llms_txt_parser import LlmsTxtParser
+
+        content = """
+# Documentation Index
+
+- [Getting Started](https://docs.example.com/start.md)
+- [API Reference](https://docs.example.com/api/index.md)
+- [Advanced Guide](https://docs.example.com/advanced.md)
+"""
+        parser = LlmsTxtParser(content, base_url="https://docs.example.com")
+        urls = parser.extract_urls()
+
+        self.assertIn("https://docs.example.com/start.md", urls)
+        self.assertIn("https://docs.example.com/api/index.md", urls)
+        self.assertIn("https://docs.example.com/advanced.md", urls)
+
+    def test_extract_bare_urls(self):
+        """Test extracting bare URLs without markdown syntax."""
+        from skill_seekers.cli.llms_txt_parser import LlmsTxtParser
+
+        content = """
+Documentation: https://example.com/docs/guide.md
+API: https://example.com/api/reference.md
+"""
+        parser = LlmsTxtParser(content)
+        urls = parser.extract_urls()
+
+        self.assertIn("https://example.com/docs/guide.md", urls)
+        self.assertIn("https://example.com/api/reference.md", urls)
+
+    def test_resolve_relative_urls(self):
+        """Test resolving relative URLs with base_url."""
+        from skill_seekers.cli.llms_txt_parser import LlmsTxtParser
+
+        content = """
+- [Local Doc](./docs/guide.md)
+- [Parent](../api/ref.md)
+"""
+        parser = LlmsTxtParser(content, base_url="https://example.com/learn/")
+        urls = parser.extract_urls()
+
+        # Should resolve relative paths
+        self.assertTrue(any("docs/guide.md" in url for url in urls))
+
+    def test_clean_url_invalid_anchor_pattern(self):
+        """Test cleaning URLs with invalid anchor patterns."""
+        from skill_seekers.cli.llms_txt_parser import LlmsTxtParser
+
+        parser = LlmsTxtParser("", base_url="https://example.com")
+
+        # Invalid: path after anchor
+        result = parser._clean_url("https://example.com/page#section/index.html.md")
+        self.assertEqual(result, "https://example.com/page")
+
+    def test_clean_url_valid_anchor(self):
+        """Test that valid anchors are preserved."""
+        from skill_seekers.cli.llms_txt_parser import LlmsTxtParser
+
+        parser = LlmsTxtParser("", base_url="https://example.com")
+
+        # Valid anchor should be unchanged
+        result = parser._clean_url("https://example.com/page.md#section")
+        self.assertEqual(result, "https://example.com/page.md#section")
+
+    def test_clean_url_no_anchor(self):
+        """Test that URLs without anchors are unchanged."""
+        from skill_seekers.cli.llms_txt_parser import LlmsTxtParser
+
+        parser = LlmsTxtParser("", base_url="https://example.com")
+
+        result = parser._clean_url("https://example.com/docs/guide.md")
+        self.assertEqual(result, "https://example.com/docs/guide.md")
+
+    def test_deduplicate_urls(self):
+        """Test that duplicate URLs are removed."""
+        from skill_seekers.cli.llms_txt_parser import LlmsTxtParser
+
+        content = """
+- [Doc 1](https://example.com/doc.md)
+- [Doc 2](https://example.com/doc.md)
+https://example.com/doc.md
+"""
+        parser = LlmsTxtParser(content)
+        urls = parser.extract_urls()
+
+        # Should only have one instance
+        count = sum(1 for u in urls if u == "https://example.com/doc.md")
+        self.assertEqual(count, 1)
+
+
+class TestSavePageContentFiltering(unittest.TestCase):
+    """Test content filtering in save_page."""
+
+    def setUp(self):
+        """Set up test fixtures."""
+        from skill_seekers.cli.doc_scraper import DocToSkillConverter
+
+        self.config = {
+            'name': 'test_save_filter',
+            'base_url': 'https://example.com',
+            'selectors': {},
+            'url_patterns': {'include': [], 'exclude': []},
+            'categories': {}
+        }
+        self.converter = DocToSkillConverter(self.config)
+
+    def tearDown(self):
+        """Clean up output directory."""
+        output_dir = f"output/{self.config['name']}_data"
+        if os.path.exists(output_dir):
+            shutil.rmtree(output_dir)
+
+    def test_skip_empty_content(self):
+        """Test that pages with empty content are skipped."""
+        page = {
+            'url': 'https://example.com/empty',
+            'title': 'Empty Page',
+            'content': '',
+            'headings': [],
+            'code_samples': []
+        }
+
+        self.converter.save_page(page)
+
+        pages_dir = os.path.join(self.converter.data_dir, 'pages')
+        if os.path.exists(pages_dir):
+            self.assertEqual(len(os.listdir(pages_dir)), 0)
+
+    def test_skip_short_content_under_50_chars(self):
+        """Test that pages with content < 50 chars are skipped."""
+        page = {
+            'url': 'https://example.com/short',
+            'title': 'Short',
+            'content': 'This is too short.',  # 18 chars
+            'headings': [],
+            'code_samples': []
+        }
+
+        self.converter.save_page(page)
+
+        pages_dir = os.path.join(self.converter.data_dir, 'pages')
+        if os.path.exists(pages_dir):
+            self.assertEqual(len(os.listdir(pages_dir)), 0)
+
+    def test_save_content_over_50_chars(self):
+        """Test that pages with content >= 50 chars are saved."""
+        page = {
+            'url': 'https://example.com/valid',
+            'title': 'Valid Page',
+            'content': 'A' * 60,  # 60 chars, should pass
+            'headings': [],
+            'code_samples': []
+        }
+
+        self.converter.save_page(page)
+
+        pages_dir = os.path.join(self.converter.data_dir, 'pages')
+        self.assertTrue(os.path.exists(pages_dir))
+        self.assertEqual(len(os.listdir(pages_dir)), 1)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/test_multi_source.py b/tests/test_multi_source.py
new file mode 100644
index 0000000..80644d2
--- /dev/null
+++ b/tests/test_multi_source.py
@@ -0,0 +1,433 @@
+"""
+Tests for multi-source support in unified scraper and skill builder.
+
+Tests the following functionality:
+1. Multiple sources of same type in unified_scraper (list structure)
+2. Source counters and unique naming
+3. Per-source reference directory generation in unified_skill_builder
+4. Multiple documentation sources handling
+5. Multiple GitHub repositories handling
+"""
+
+import unittest
+import tempfile
+import os
+import shutil
+
+
+class TestUnifiedScraperDataStructure(unittest.TestCase):
+    """Test scraped_data list structure in unified_scraper."""
+
+    def test_scraped_data_uses_list_structure(self):
+        """Test that scraped_data uses list for each source type."""
+        from skill_seekers.cli.unified_scraper import UnifiedScraper
+
+        config = {
+            'name': 'test_multi',
+            'description': 'Test skill',
+            'sources': [
+                {'type': 'documentation', 'base_url': 'https://example.com'}
+            ]
+        }
+
+        with tempfile.TemporaryDirectory() as temp_dir:
+            original_dir = os.getcwd()
+            try:
+                os.chdir(temp_dir)
+                scraper = UnifiedScraper(config)
+
+                self.assertIsInstance(scraper.scraped_data['documentation'], list)
+                self.assertIsInstance(scraper.scraped_data['github'], list)
+                self.assertIsInstance(scraper.scraped_data['pdf'], list)
+            finally:
+                os.chdir(original_dir)
+
+    def test_source_counters_initialized_to_zero(self):
+        """Test that source counters start at zero."""
+        from skill_seekers.cli.unified_scraper import UnifiedScraper
+
+        config = {
+            'name': 'test_counters',
+            'description': 'Test skill',
+            'sources': [
+                {'type': 'documentation', 'base_url': 'https://example.com'}
+            ]
+        }
+
+        with tempfile.TemporaryDirectory() as temp_dir:
+            original_dir = os.getcwd()
+            try:
+                os.chdir(temp_dir)
+                scraper = UnifiedScraper(config)
+
+                self.assertEqual(scraper._source_counters['documentation'], 0)
+                self.assertEqual(scraper._source_counters['github'], 0)
+                self.assertEqual(scraper._source_counters['pdf'], 0)
+            finally:
+                os.chdir(original_dir)
+
+    def test_empty_lists_initially(self):
+        """Test that source lists are empty initially."""
+        from skill_seekers.cli.unified_scraper import UnifiedScraper
+
+        config = {
+            'name': 'test_empty',
+            'description': 'Test skill',
+            'sources': [
+                {'type': 'documentation', 'base_url': 'https://example.com'}
+            ]
+        }
+
+        with tempfile.TemporaryDirectory() as temp_dir:
+            original_dir = os.getcwd()
+            try:
+                os.chdir(temp_dir)
+                scraper = UnifiedScraper(config)
+
+                self.assertEqual(len(scraper.scraped_data['documentation']), 0)
+                self.assertEqual(len(scraper.scraped_data['github']), 0)
+                self.assertEqual(len(scraper.scraped_data['pdf']), 0)
+            finally:
+                os.chdir(original_dir)
+
+
+class TestUnifiedSkillBuilderDocsReferences(unittest.TestCase):
+    """Test documentation reference generation for multiple sources."""
+
+    def setUp(self):
+        """Set up test fixtures."""
+        self.temp_dir = tempfile.mkdtemp()
+        self.original_dir = os.getcwd()
+        os.chdir(self.temp_dir)
+
+    def tearDown(self):
+        """Clean up test fixtures."""
+        os.chdir(self.original_dir)
+        if os.path.exists(self.temp_dir):
+            shutil.rmtree(self.temp_dir)
+
+    def test_creates_subdirectory_per_source(self):
+        """Test that each doc source gets its own subdirectory."""
+        from skill_seekers.cli.unified_skill_builder import UnifiedSkillBuilder
+
+        # Create mock refs directories
+        refs_dir1 = os.path.join(self.temp_dir, 'refs1')
+        refs_dir2 = os.path.join(self.temp_dir, 'refs2')
+        os.makedirs(refs_dir1)
+        os.makedirs(refs_dir2)
+
+        config = {
+            'name': 'test_docs_refs',
+            'description': 'Test',
+            'sources': []
+        }
+
+        scraped_data = {
+            'documentation': [
+                {'source_id': 'source_a', 'base_url': 'https://a.com', 'total_pages': 5, 'refs_dir': refs_dir1},
+                {'source_id': 'source_b', 'base_url': 'https://b.com', 'total_pages': 3, 'refs_dir': refs_dir2}
+            ],
+            'github': [],
+            'pdf': []
+        }
+
+        builder = UnifiedSkillBuilder(config, scraped_data)
+        builder._generate_docs_references(scraped_data['documentation'])
+
+        docs_dir = os.path.join(builder.skill_dir, 'references', 'documentation')
+        self.assertTrue(os.path.exists(os.path.join(docs_dir, 'source_a')))
+        self.assertTrue(os.path.exists(os.path.join(docs_dir, 'source_b')))
+
+    def test_creates_index_per_source(self):
+        """Test that each source subdirectory has its own index.md."""
+        from skill_seekers.cli.unified_skill_builder import UnifiedSkillBuilder
+
+        refs_dir = os.path.join(self.temp_dir, 'refs')
+        os.makedirs(refs_dir)
+
+        config = {
+            'name': 'test_source_index',
+            'description': 'Test',
+            'sources': []
+        }
+
+        scraped_data = {
+            'documentation': [
+                {'source_id': 'my_source', 'base_url': 'https://example.com', 'total_pages': 10, 'refs_dir': refs_dir}
+            ],
+            'github': [],
+            'pdf': []
+        }
+
+        builder = UnifiedSkillBuilder(config, scraped_data)
+        builder._generate_docs_references(scraped_data['documentation'])
+
+        source_index = os.path.join(builder.skill_dir, 'references', 'documentation', 'my_source', 'index.md')
+        self.assertTrue(os.path.exists(source_index))
+
+        with open(source_index, 'r') as f:
+            content = f.read()
+            self.assertIn('my_source', content)
+            self.assertIn('https://example.com', content)
+
+    def test_creates_main_index_listing_all_sources(self):
+        """Test that main index.md lists all documentation sources."""
+        from skill_seekers.cli.unified_skill_builder import UnifiedSkillBuilder
+
+        refs_dir1 = os.path.join(self.temp_dir, 'refs1')
+        refs_dir2 = os.path.join(self.temp_dir, 'refs2')
+        os.makedirs(refs_dir1)
+        os.makedirs(refs_dir2)
+
+        config = {
+            'name': 'test_main_index',
+            'description': 'Test',
+            'sources': []
+        }
+
+        scraped_data = {
+            'documentation': [
+                {'source_id': 'docs_one', 'base_url': 'https://one.com', 'total_pages': 10, 'refs_dir': refs_dir1},
+                {'source_id': 'docs_two', 'base_url': 'https://two.com', 'total_pages': 20, 'refs_dir': refs_dir2}
+            ],
+            'github': [],
+            'pdf': []
+        }
+
+        builder = UnifiedSkillBuilder(config, scraped_data)
+        builder._generate_docs_references(scraped_data['documentation'])
+
+        main_index = os.path.join(builder.skill_dir, 'references', 'documentation', 'index.md')
+        self.assertTrue(os.path.exists(main_index))
+
+        with open(main_index, 'r') as f:
+            content = f.read()
+            self.assertIn('docs_one', content)
+            self.assertIn('docs_two', content)
+            self.assertIn('2 documentation sources', content)
+
+    def test_copies_reference_files_to_source_dir(self):
+        """Test that reference files are copied to source subdirectory."""
+        from skill_seekers.cli.unified_skill_builder import UnifiedSkillBuilder
+
+        refs_dir = os.path.join(self.temp_dir, 'refs')
+        os.makedirs(refs_dir)
+
+        # Create mock reference files
+        with open(os.path.join(refs_dir, 'api.md'), 'w') as f:
+            f.write('# API Reference')
+        with open(os.path.join(refs_dir, 'guide.md'), 'w') as f:
+            f.write('# User Guide')
+
+        config = {
+            'name': 'test_copy_refs',
+            'description': 'Test',
+            'sources': []
+        }
+
+        scraped_data = {
+            'documentation': [
+                {'source_id': 'test_source', 'base_url': 'https://test.com', 'total_pages': 5, 'refs_dir': refs_dir}
+            ],
+            'github': [],
+            'pdf': []
+        }
+
+        builder = UnifiedSkillBuilder(config, scraped_data)
+        builder._generate_docs_references(scraped_data['documentation'])
+
+        source_dir = os.path.join(builder.skill_dir, 'references', 'documentation', 'test_source')
+        self.assertTrue(os.path.exists(os.path.join(source_dir, 'api.md')))
+        self.assertTrue(os.path.exists(os.path.join(source_dir, 'guide.md')))
+
+
+class TestUnifiedSkillBuilderGitHubReferences(unittest.TestCase):
+    """Test GitHub reference generation for multiple repositories."""
+
+    def setUp(self):
+        """Set up test fixtures."""
+        self.temp_dir = tempfile.mkdtemp()
+        self.original_dir = os.getcwd()
+        os.chdir(self.temp_dir)
+
+    def tearDown(self):
+        """Clean up test fixtures."""
+        os.chdir(self.original_dir)
+        if os.path.exists(self.temp_dir):
+            shutil.rmtree(self.temp_dir)
+
+    def test_creates_subdirectory_per_repo(self):
+        """Test that each GitHub repo gets its own subdirectory."""
+        from skill_seekers.cli.unified_skill_builder import UnifiedSkillBuilder
+
+        config = {
+            'name': 'test_github_refs',
+            'description': 'Test',
+            'sources': []
+        }
+
+        scraped_data = {
+            'documentation': [],
+            'github': [
+                {'repo': 'org/repo1', 'repo_id': 'org_repo1', 'data': {'readme': '# Repo 1', 'issues': [], 'releases': [], 'repo_info': {}}},
+                {'repo': 'org/repo2', 'repo_id': 'org_repo2', 'data': {'readme': '# Repo 2', 'issues': [], 'releases': [], 'repo_info': {}}}
+            ],
+            'pdf': []
+        }
+
+        builder = UnifiedSkillBuilder(config, scraped_data)
+        builder._generate_github_references(scraped_data['github'])
+
+        github_dir = os.path.join(builder.skill_dir, 'references', 'github')
+        self.assertTrue(os.path.exists(os.path.join(github_dir, 'org_repo1')))
+        self.assertTrue(os.path.exists(os.path.join(github_dir, 'org_repo2')))
+
+    def test_creates_readme_per_repo(self):
+        """Test that README.md is created for each repo."""
+        from skill_seekers.cli.unified_skill_builder import UnifiedSkillBuilder
+
+        config = {
+            'name': 'test_readme',
+            'description': 'Test',
+            'sources': []
+        }
+
+        scraped_data = {
+            'documentation': [],
+            'github': [
+                {'repo': 'test/myrepo', 'repo_id': 'test_myrepo', 'data': {'readme': '# My Repository\n\nDescription here.', 'issues': [], 'releases': [], 'repo_info': {}}}
+            ],
+            'pdf': []
+        }
+
+        builder = UnifiedSkillBuilder(config, scraped_data)
+        builder._generate_github_references(scraped_data['github'])
+
+        readme_path = os.path.join(builder.skill_dir, 'references', 'github', 'test_myrepo', 'README.md')
+        self.assertTrue(os.path.exists(readme_path))
+
+        with open(readme_path, 'r') as f:
+            content = f.read()
+            self.assertIn('test/myrepo', content)
+
+    def test_creates_issues_file_when_issues_exist(self):
+        """Test that issues.md is created when repo has issues."""
+        from skill_seekers.cli.unified_skill_builder import UnifiedSkillBuilder
+
+        config = {
+            'name': 'test_issues',
+            'description': 'Test',
+            'sources': []
+        }
+
+        scraped_data = {
+            'documentation': [],
+            'github': [
+                {
+                    'repo': 'test/repo',
+                    'repo_id': 'test_repo',
+                    'data': {
+                        'readme': '# Repo',
+                        'issues': [
+                            {'number': 1, 'title': 'Bug report', 'state': 'open', 'labels': ['bug'], 'url': 'https://github.com/test/repo/issues/1'},
+                            {'number': 2, 'title': 'Feature request', 'state': 'closed', 'labels': ['enhancement'], 'url': 'https://github.com/test/repo/issues/2'}
+                        ],
+                        'releases': [],
+                        'repo_info': {}
+                    }
+                }
+            ],
+            'pdf': []
+        }
+
+        builder = UnifiedSkillBuilder(config, scraped_data)
+        builder._generate_github_references(scraped_data['github'])
+
+        issues_path = os.path.join(builder.skill_dir, 'references', 'github', 'test_repo', 'issues.md')
+        self.assertTrue(os.path.exists(issues_path))
+
+        with open(issues_path, 'r') as f:
+            content = f.read()
+            self.assertIn('Bug report', content)
+            self.assertIn('Feature request', content)
+
+    def test_creates_main_index_listing_all_repos(self):
+        """Test that main index.md lists all GitHub repositories."""
+        from skill_seekers.cli.unified_skill_builder import UnifiedSkillBuilder
+
+        config = {
+            'name': 'test_github_index',
+            'description': 'Test',
+            'sources': []
+        }
+
+        scraped_data = {
+            'documentation': [],
+            'github': [
+                {'repo': 'org/first', 'repo_id': 'org_first', 'data': {'readme': '#', 'issues': [], 'releases': [], 'repo_info': {'stars': 100}}},
+                {'repo': 'org/second', 'repo_id': 'org_second', 'data': {'readme': '#', 'issues': [], 'releases': [], 'repo_info': {'stars': 50}}}
+            ],
+            'pdf': []
+        }
+
+        builder = UnifiedSkillBuilder(config, scraped_data)
+        builder._generate_github_references(scraped_data['github'])
+
+        main_index = os.path.join(builder.skill_dir, 'references', 'github', 'index.md')
+        self.assertTrue(os.path.exists(main_index))
+
+        with open(main_index, 'r') as f:
+            content = f.read()
+            self.assertIn('org/first', content)
+            self.assertIn('org/second', content)
+            self.assertIn('2 GitHub repositories', content)
+
+
+class TestUnifiedSkillBuilderPdfReferences(unittest.TestCase):
+    """Test PDF reference generation for multiple sources."""
+
+    def setUp(self):
+        """Set up test fixtures."""
+        self.temp_dir = tempfile.mkdtemp()
+        self.original_dir = os.getcwd()
+        os.chdir(self.temp_dir)
+
+    def tearDown(self):
+        """Clean up test fixtures."""
+        os.chdir(self.original_dir)
+        if os.path.exists(self.temp_dir):
+            shutil.rmtree(self.temp_dir)
+
+    def test_creates_pdf_index_with_count(self):
+        """Test that PDF index shows correct document count."""
+        from skill_seekers.cli.unified_skill_builder import UnifiedSkillBuilder
+
+        config = {
+            'name': 'test_pdf',
+            'description': 'Test',
+            'sources': []
+        }
+
+        scraped_data = {
+            'documentation': [],
+            'github': [],
+            'pdf': [
+                {'path': '/path/to/doc1.pdf'},
+                {'path': '/path/to/doc2.pdf'},
+                {'path': '/path/to/doc3.pdf'}
+            ]
+        }
+
+        builder = UnifiedSkillBuilder(config, scraped_data)
+        builder._generate_pdf_references(scraped_data['pdf'])
+
+        pdf_index = os.path.join(builder.skill_dir, 'references', 'pdf', 'index.md')
+        self.assertTrue(os.path.exists(pdf_index))
+
+        with open(pdf_index, 'r') as f:
+            content = f.read()
+            self.assertIn('3 PDF document', content)
+
+
+if __name__ == '__main__':
+    unittest.main()

From a7f13ec75ff4cebf8bb8ec565bc875f7bb3c0313 Mon Sep 17 00:00:00 2001
From: tsyhahaha <tsy1433701769@163.com>
Date: Mon, 5 Jan 2026 22:32:31 +0800
Subject: [PATCH 3/7] chore: add medusa-mercurjs unified config
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Multi-source config combining the Medusa docs and the MercurJS marketplace docs

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
---
 configs/medusa-mercurjs.json | 71 ++++++++++++++++++++++++++++++++++++
 1 file changed, 71 insertions(+)
 create mode 100644 configs/medusa-mercurjs.json

diff --git a/configs/medusa-mercurjs.json b/configs/medusa-mercurjs.json
new file mode 100644
index 0000000..e5c31ad
--- /dev/null
+++ b/configs/medusa-mercurjs.json
@@ -0,0 +1,71 @@
+{
+  "name": "medusa-mercurjs",
+  "description": "Complete Medusa v2 + MercurJS multi-vendor e-commerce framework knowledge. Use when building headless commerce applications, implementing multi-vendor marketplaces, or understanding Medusa modules/workflows.",
+  "merge_mode": "rule-based",
+  "sources": [
+    {
+      "type": "documentation",
+      "base_url": "https://docs.medusajs.com",
+      "llms_txt_url": "https://docs.medusajs.com/llms-full.txt",
+      "extract_api": true,
+      "selectors": {
+        "main_content": "main, article, .content",
+        "title": "h1",
+        "code_blocks": "pre"
+      },
+      "url_patterns": {
+        "include": [
+          "/learn",
+          "/resources"
+        ],
+        "exclude": []
+      },
+      "categories": {
+        "installation": ["installation", "install", "docker", "update"],
+        "fundamentals": ["fundamentals", "api-routes", "data-models", "modules", "module-links", "workflows", "events-and-subscribers", "scheduled-jobs", "custom-cli-scripts", "admin", "environment-variables"],
+        "customization": ["customization", "custom-features", "extend-features", "integrate-systems", "customize-admin"],
+        "debugging_testing": ["debugging-and-testing", "logging", "testing", "test-tools", "instrumentation", "feature-flags", "debug-workflows"],
+        "deployment": ["deployment", "production", "deploy", "general"],
+        "commerce_modules": ["commerce-modules", "product", "cart", "order", "payment", "pricing", "tax", "inventory", "fulfillment", "customer", "promotion", "auth", "region", "currency", "sales-channel", "stock-location", "api-key", "user"],
+        "infrastructure_modules": ["infrastructure-modules", "caching", "event", "file", "locking", "notification", "workflow-engine", "analytics"],
+        "storefront": ["storefront-development", "publishable-api-keys", "checkout", "products", "customers", "regions"],
+        "integrations": ["integrations", "sanity", "contentful", "stripe", "paypal", "shipstation", "sentry"],
+        "cli_tools": ["medusa-cli", "commands", "build", "develop", "plugin", "db"],
+        "references": ["references", "medusa-workflows", "helper-steps", "service-factory-reference", "data-model-repository-reference", "test-tools-reference", "fulfillment", "auth", "notification-provider", "file-provider", "locking-service", "caching-service"],
+        "recipes": ["recipes", "erp", "marketplace", "b2b", "subscriptions", "digital-products", "bundled-products"],
+        "admin_components": ["admin-components", "widgets", "ui-routes"],
+        "examples": ["examples", "guides", "how-to-tutorials", "tutorials"]
+      },
+      "rate_limit": 0.3,
+      "max_pages": 500
+    },
+    {
+      "type": "documentation",
+      "base_url": "https://docs.mercurjs.com/",
+      "llms_txt_url": "https://docs.mercurjs.com/llms-full.txt",
+      "extract_api": true,
+      "selectors": {
+        "main_content": "main, article",
+        "title": "h1",
+        "code_blocks": "pre"
+      },
+      "url_patterns": {
+        "include": ["/"],
+        "exclude": []
+      },
+      "categories": {
+        "quick_start": ["introduction", "get-started"],
+        "components": ["components", "backend", "admin-panel", "vendor-panel", "storefront"],
+        "core_concepts": ["core-concepts", "seller", "commission", "payouts", "order-splitting", "reviews", "requests", "notifications", "marketplace-settings"],
+        "product": ["product", "core-commerce-modules", "core-infrastructure-modules", "framework"],
+        "integrations": ["integrations", "algolia", "resend", "stripe"],
+        "api_admin": ["api-reference/admin", "admin-algolia", "admin-api-keys", "admin-attributes", "admin-auth", "admin-campaigns", "admin-claims", "admin-collections", "admin-commission", "admin-currencies", "admin-customers", "admin-draft-orders", "admin-exchanges", "admin-fulfillment", "admin-inventory", "admin-invites", "admin-notifications", "admin-orders", "admin-payments", "admin-price-lists", "admin-products", "admin-promotions", "admin-regions", "admin-reservations", "admin-returns", "admin-sales-channels", "admin-sellers", "admin-shipping", "admin-stock-locations", "admin-stores", "admin-tax", "admin-uploads", "admin-users"],
+        "api_store": ["api-reference/store", "store-auth", "store-carts", "store-collections", "store-currencies", "store-customers", "store-fulfillment", "store-orders", "store-payment", "store-products", "store-regions", "store-returns"],
+        "api_vendor": ["api-reference/vendor", "vendor-auth", "vendor-fulfillment", "vendor-inventory", "vendor-orders", "vendor-payouts", "vendor-products", "vendor-returns", "vendor-sellers", "vendor-shipping", "vendor-stock-locations", "vendor-uploads"],
+        "help": ["help", "llm", "mcp", "support"]
+      },
+      "rate_limit": 0.3,
+      "max_pages": 300
+    }
+  ]
+}

From 2e096c0284adf87294337b407777e18cef384528 Mon Sep 17 00:00:00 2001
From: Nick Miethe <miethe.dev@gmail.com>
Date: Thu, 8 Jan 2026 15:33:12 -0500
Subject: [PATCH 4/7] Enable full support for the Claude Code documentation
 site, covering all relevant pages and Anthropic's unconventional
 llms.txt

---
 configs/claude-code.json                     | 90 +++++++++++++++-----
 src/skill_seekers/cli/llms_txt_downloader.py | 17 +++-
 2 files changed, 83 insertions(+), 24 deletions(-)

diff --git a/configs/claude-code.json b/configs/claude-code.json
index c84e709..ee96f68 100644
--- a/configs/claude-code.json
+++ b/configs/claude-code.json
@@ -1,37 +1,83 @@
 {
   "name": "claude-code",
-  "description": "Claude Code CLI and development environment. Use for Claude Code features, tools, workflows, MCP integration, configuration, and AI-assisted development.",
-  "base_url": "https://docs.claude.com/en/docs/claude-code/",
+  "description": "Claude Code CLI and development environment. Use for Claude Code features, tools, workflows, MCP integration, plugins, hooks, configuration, deployment, and AI-assisted development.",
+  "base_url": "https://code.claude.com/docs/en/",
   "start_urls": [
-    "https://docs.claude.com/en/docs/claude-code/overview",
-    "https://docs.claude.com/en/docs/claude-code/quickstart",
-    "https://docs.claude.com/en/docs/claude-code/common-workflows",
-    "https://docs.claude.com/en/docs/claude-code/mcp",
-    "https://docs.claude.com/en/docs/claude-code/settings",
-    "https://docs.claude.com/en/docs/claude-code/troubleshooting",
-    "https://docs.claude.com/en/docs/claude-code/iam"
+    "https://code.claude.com/docs/en/overview",
+    "https://code.claude.com/docs/en/quickstart",
+    "https://code.claude.com/docs/en/common-workflows",
+    "https://code.claude.com/docs/en/claude-code-on-the-web",
+    "https://code.claude.com/docs/en/desktop",
+    "https://code.claude.com/docs/en/chrome",
+    "https://code.claude.com/docs/en/vs-code",
+    "https://code.claude.com/docs/en/jetbrains",
+    "https://code.claude.com/docs/en/github-actions",
+    "https://code.claude.com/docs/en/gitlab-ci-cd",
+    "https://code.claude.com/docs/en/slack",
+    "https://code.claude.com/docs/en/sub-agents",
+    "https://code.claude.com/docs/en/plugins",
+    "https://code.claude.com/docs/en/discover-plugins",
+    "https://code.claude.com/docs/en/skills",
+    "https://code.claude.com/docs/en/output-styles",
+    "https://code.claude.com/docs/en/hooks-guide",
+    "https://code.claude.com/docs/en/headless",
+    "https://code.claude.com/docs/en/mcp",
+    "https://code.claude.com/docs/en/third-party-integrations",
+    "https://code.claude.com/docs/en/amazon-bedrock",
+    "https://code.claude.com/docs/en/google-vertex-ai",
+    "https://code.claude.com/docs/en/microsoft-foundry",
+    "https://code.claude.com/docs/en/network-config",
+    "https://code.claude.com/docs/en/llm-gateway",
+    "https://code.claude.com/docs/en/devcontainer",
+    "https://code.claude.com/docs/en/sandboxing",
+    "https://code.claude.com/docs/en/setup",
+    "https://code.claude.com/docs/en/iam",
+    "https://code.claude.com/docs/en/security",
+    "https://code.claude.com/docs/en/data-usage",
+    "https://code.claude.com/docs/en/monitoring-usage",
+    "https://code.claude.com/docs/en/costs",
+    "https://code.claude.com/docs/en/analytics",
+    "https://code.claude.com/docs/en/plugin-marketplaces",
+    "https://code.claude.com/docs/en/settings",
+    "https://code.claude.com/docs/en/terminal-config",
+    "https://code.claude.com/docs/en/model-config",
+    "https://code.claude.com/docs/en/memory",
+    "https://code.claude.com/docs/en/statusline",
+    "https://code.claude.com/docs/en/cli-reference",
+    "https://code.claude.com/docs/en/interactive-mode",
+    "https://code.claude.com/docs/en/slash-commands",
+    "https://code.claude.com/docs/en/checkpointing",
+    "https://code.claude.com/docs/en/hooks",
+    "https://code.claude.com/docs/en/plugins-reference",
+    "https://code.claude.com/docs/en/troubleshooting",
+    "https://code.claude.com/docs/en/legal-and-compliance"
   ],
   "selectors": {
-    "main_content": "#content-container",
+    "main_content": "#content-area, #content-container, article, main",
     "title": "h1",
     "code_blocks": "pre code"
   },
   "url_patterns": {
-    "include": ["/claude-code/"],
-    "exclude": ["/api-reference/", "/claude-ai/", "/claude.ai/", "/prompt-engineering/", "/changelog/"]
+    "include": ["/docs/en/"],
+    "exclude": [
+      "/docs/fr/", "/docs/de/", "/docs/it/", "/docs/ja/", "/docs/es/",
+      "/docs/ko/", "/docs/zh-CN/", "/docs/zh-TW/", "/docs/ru/",
+      "/docs/id/", "/docs/pt/", "/changelog", "github.com"
+    ]
   },
   "categories": {
-    "getting_started": ["overview", "quickstart", "installation", "setup", "terminal-config"],
-    "workflows": ["workflow", "common-workflows", "git", "testing", "debugging", "interactive"],
+    "getting_started": ["overview", "quickstart", "common-workflows"],
+    "ide_integrations": ["vs-code", "jetbrains", "desktop", "chrome", "claude-code-on-the-web", "slack"],
+    "ci_cd": ["github-actions", "gitlab-ci-cd"],
+    "building": ["sub-agents", "subagent", "plugins", "discover-plugins", "skills", "output-styles", "hooks-guide", "headless", "programmatic"],
     "mcp": ["mcp", "model-context-protocol"],
-    "configuration": ["config", "settings", "preferences", "customize", "hooks", "statusline", "model-config", "memory", "output-styles"],
-    "agents": ["agent", "task", "subagent", "sub-agent", "specialized"],
-    "skills": ["skill", "agent-skill"],
-    "integrations": ["ide-integrations", "vs-code", "jetbrains", "plugin", "marketplace"],
-    "deployment": ["bedrock", "vertex", "deployment", "network", "gateway", "devcontainer", "sandboxing", "third-party"],
-    "reference": ["reference", "api", "command", "cli-reference", "slash", "checkpointing", "headless", "sdk"],
-    "enterprise": ["iam", "security", "monitoring", "analytics", "costs", "legal", "data-usage"]
+    "deployment": ["third-party-integrations", "amazon-bedrock", "google-vertex-ai", "microsoft-foundry", "network-config", "llm-gateway", "devcontainer", "sandboxing"],
+    "administration": ["setup", "iam", "security", "data-usage", "monitoring-usage", "costs", "analytics", "plugin-marketplaces"],
+    "configuration": ["settings", "terminal-config", "model-config", "memory", "statusline"],
+    "reference": ["cli-reference", "interactive-mode", "slash-commands", "checkpointing", "hooks", "plugins-reference"],
+    "troubleshooting": ["troubleshooting"],
+    "legal": ["legal-and-compliance"]
   },
   "rate_limit": 0.5,
-  "max_pages": 200
+  "max_pages": 250
 }
diff --git a/src/skill_seekers/cli/llms_txt_downloader.py b/src/skill_seekers/cli/llms_txt_downloader.py
index 1049f86..76ec740 100644
--- a/src/skill_seekers/cli/llms_txt_downloader.py
+++ b/src/skill_seekers/cli/llms_txt_downloader.py
@@ -38,11 +38,24 @@ class LlmsTxtDownloader:
 
     def _is_markdown(self, content: str) -> bool:
         """
-        Check if content looks like markdown.
+        Check if content looks like markdown (not HTML).
 
         Returns:
-            True if content contains markdown patterns
+            True if content contains markdown patterns and is NOT HTML
         """
+        # First, reject HTML content (common redirect trap)
+        content_start = content.strip()[:500].lower()
+        html_indicators = [
+            '<!doctype html',
+            '<html',
+            '<!doctype',
+            '<head>',
+            '<meta charset',
+        ]
+        if any(indicator in content_start for indicator in html_indicators):
+            return False
+
+        # Then check for markdown patterns
         markdown_patterns = ['# ', '## ', '```', '- ', '* ', '`']
         return any(pattern in content for pattern in markdown_patterns)
 

From 04de96f2f58e26244ebc2aedaa929075af40a3b0 Mon Sep 17 00:00:00 2001
From: yusyus <yusufkaraaslan.yk@pm.me>
Date: Sun, 11 Jan 2026 14:01:23 +0300
Subject: [PATCH 5/7] fix: Add empty list checks and enhance docstrings (PR
 #243 review fixes)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two critical improvements from PR #243 code review:

## Fix 1: Empty List Edge Case Handling

Added early return checks to prevent creating empty index files:

**Files Modified:**
- src/skill_seekers/cli/unified_skill_builder.py

**Changes:**
- _generate_docs_references: Skip if docs_list empty
- _generate_github_references: Skip if github_list empty
- _generate_pdf_references: Skip if pdf_list empty

**Impact:**
Prevents odd-looking index files that read "Combined from 0 sources".

## Fix 2: Enhanced Method Docstrings

Added comprehensive parameter types and return value documentation:

**Files Modified:**
- src/skill_seekers/cli/llms_txt_parser.py
  - extract_urls: Added detailed examples and behavior notes
  - _clean_url: Added malformed URL pattern examples

- src/skill_seekers/cli/doc_scraper.py
  - _extract_markdown_content: Full return dict structure documented
  - _extract_html_as_markdown: Extraction strategy and fallback behavior

**Impact:**
Improved developer experience with detailed API documentation.

## Testing

All relevant tests passing (the only failures are expected, see below):
- ✅ 32/32 PR #243 tests (markdown parsing + multi-source)
- ✅ 975/975 core tests
- 159 skipped (optional dependencies)
- 4 failed (missing anthropic - expected)

Co-authored-by: Code Review <claude-sonnet-4.5@anthropic.com>
---
 src/skill_seekers/cli/doc_scraper.py          | 56 ++++++++++++++++---
 src/skill_seekers/cli/llms_txt_parser.py      | 29 +++++++++-
 .../cli/unified_skill_builder.py              | 12 ++++
 3 files changed, 87 insertions(+), 10 deletions(-)

diff --git a/src/skill_seekers/cli/doc_scraper.py b/src/skill_seekers/cli/doc_scraper.py
index 1e52181..021738b 100755
--- a/src/skill_seekers/cli/doc_scraper.py
+++ b/src/skill_seekers/cli/doc_scraper.py
@@ -350,14 +350,34 @@ class DocToSkillConverter:
         return page
 
     def _extract_markdown_content(self, content: str, url: str) -> Dict[str, Any]:
-        """Extract content from a Markdown file.
+        """Extract structured content from a Markdown file.
+
+        Parses markdown files from llms.txt URLs to extract:
+        - Title from first h1 heading
+        - Headings (h2-h6, excluding h1)
+        - Code blocks with language detection
+        - Internal .md links for BFS crawling
+        - Content paragraphs (>20 chars)
+
+        Auto-detects HTML content and falls back to _extract_html_as_markdown.
 
         Args:
-            content: Raw markdown content (or HTML if server returned HTML)
-            url: Source URL
+            content: Raw markdown content string (or HTML if server returned HTML)
+            url: Source URL for resolving relative links
 
         Returns:
-            Page dict with title, content, code_samples, headings, links
+            Dict with keys:
+                - url: str - Source URL
+                - title: str - Extracted from first # heading
+                - content: str - Paragraphs joined with double newlines
+                - headings: List[Dict] - {'level': 'h2', 'text': str, 'id': str}
+                - code_samples: List[Dict] - {'code': str, 'language': str}
+                - links: List[str] - Absolute URLs to other .md files
+                - patterns: List - Empty (reserved for future use)
+
+        Note:
+            Only .md links are extracted to avoid client-side rendered HTML pages.
+            Anchor fragments (#section) are stripped from links.
         """
         import re
 
@@ -434,12 +454,34 @@ class DocToSkillConverter:
     def _extract_html_as_markdown(self, html_content: str, url: str) -> Dict[str, Any]:
         """Extract content from HTML and convert to markdown-like structure.
 
+        Fallback method when .md URL returns HTML content instead of markdown.
+        Uses BeautifulSoup to extract structured data from HTML elements.
+
+        Extraction strategy:
+        1. Title from <title> tag
+        2. Main content from <main>, <article>, [role="main"], or <body>
+        3. Headings (h1-h6) with text and id attributes
+        4. Code blocks from <pre><code> or <pre> tags
+        5. Text content from paragraphs
+
         Args:
-            html_content: Raw HTML content
-            url: Source URL
+            html_content: Raw HTML content string
+            url: Source URL (for reference in result dict)
 
         Returns:
-            Page dict with title, content, code_samples, headings, links
+            Dict with keys:
+                - url: str - Source URL
+                - title: str - From <title> tag, cleaned
+                - content: str - Text content from main area
+                - headings: List[Dict] - {'level': 'h2', 'text': str, 'id': str}
+                - code_samples: List[Dict] - {'code': str, 'language': str}
+                - links: List - Empty (HTML links not extracted to avoid client-side routes)
+                - patterns: List - Empty (reserved for future use)
+
+        Note:
+            Prefers <main> or <article> tags for content area.
+            Falls back to <body> if no semantic content container found.
+            Language detection uses detect_language() method.
         """
         page = {
             'url': url,
diff --git a/src/skill_seekers/cli/llms_txt_parser.py b/src/skill_seekers/cli/llms_txt_parser.py
index 2e143bf..ae11410 100644
--- a/src/skill_seekers/cli/llms_txt_parser.py
+++ b/src/skill_seekers/cli/llms_txt_parser.py
@@ -16,8 +16,19 @@ class LlmsTxtParser:
         """
         Extract all URLs from the llms.txt content.
 
+        Supports both markdown-style links [text](url) and bare URLs.
+        Resolves relative URLs using base_url if provided.
+        Filters out malformed URLs with invalid anchor patterns.
+
         Returns:
-            List of unique URLs found in the content
+            List of unique, cleaned URLs found in the content.
+            Returns empty list if no valid URLs found.
+
+        Note:
+            - Markdown links: [Getting Started](./docs/guide.md)
+            - Bare URLs: https://example.com/api.md
+            - Relative paths resolved with base_url
+            - Invalid anchors (#section/path.md) are stripped
         """
         urls = set()
 
@@ -48,11 +59,23 @@ class LlmsTxtParser:
         """
         Clean and validate URL, removing invalid anchor patterns.
 
+        Detects and strips malformed anchors that contain path separators.
+        Valid: https://example.com/page.md#section
+        Invalid: https://example.com/page#section/index.html.md
+
         Args:
-            url: URL to clean
+            url: URL to clean (absolute or relative)
 
         Returns:
-            Cleaned URL or empty string if invalid
+            Cleaned URL with malformed anchors stripped.
+            Returns base URL if anchor contains '/' (malformed).
+            Returns original URL if anchor is valid or no anchor present.
+
+        Example:
+            >>> parser._clean_url("https://ex.com/page#sec/path.md")
+            'https://ex.com/page'
+            >>> parser._clean_url("https://ex.com/page.md#section")
+            'https://ex.com/page.md#section'
         """
         # Skip URLs with path after anchor (e.g., #section/index.html.md)
         # These are malformed and return duplicate HTML content
diff --git a/src/skill_seekers/cli/unified_skill_builder.py b/src/skill_seekers/cli/unified_skill_builder.py
index a80f86d..ef6437c 100644
--- a/src/skill_seekers/cli/unified_skill_builder.py
+++ b/src/skill_seekers/cli/unified_skill_builder.py
@@ -287,6 +287,10 @@ This skill combines knowledge from multiple sources:
 
     def _generate_docs_references(self, docs_list: List[Dict]):
         """Generate references from multiple documentation sources."""
+        # Skip if no documentation sources
+        if not docs_list:
+            return
+
         docs_dir = os.path.join(self.skill_dir, 'references', 'documentation')
         os.makedirs(docs_dir, exist_ok=True)
 
@@ -347,6 +351,10 @@ This skill combines knowledge from multiple sources:
 
     def _generate_github_references(self, github_list: List[Dict]):
         """Generate references from multiple GitHub sources."""
+        # Skip if no GitHub sources
+        if not github_list:
+            return
+
         github_dir = os.path.join(self.skill_dir, 'references', 'github')
         os.makedirs(github_dir, exist_ok=True)
 
@@ -429,6 +437,10 @@ This skill combines knowledge from multiple sources:
 
     def _generate_pdf_references(self, pdf_list: List[Dict]):
         """Generate references from PDF sources."""
+        # Skip if no PDF sources
+        if not pdf_list:
+            return
+
         pdf_dir = os.path.join(self.skill_dir, 'references', 'pdf')
         os.makedirs(pdf_dir, exist_ok=True)
 

From 9042e1680c8080543be7dc509592094a335a1a91 Mon Sep 17 00:00:00 2001
From: Nick Miethe <miethe.dev@gmail.com>
Date: Thu, 8 Jan 2026 15:33:12 -0500
Subject: [PATCH 6/7] Enabling full support of the Claude Code documentation
 site, with support for all relevant pages and Anthropic's unconventional
 llms.txt

---
 configs/claude-code.json                     | 90 +++++++++++++++-----
 src/skill_seekers/cli/llms_txt_downloader.py | 17 +++-
 2 files changed, 83 insertions(+), 24 deletions(-)

diff --git a/configs/claude-code.json b/configs/claude-code.json
index c84e709..ee96f68 100644
--- a/configs/claude-code.json
+++ b/configs/claude-code.json
@@ -1,37 +1,83 @@
 {
   "name": "claude-code",
-  "description": "Claude Code CLI and development environment. Use for Claude Code features, tools, workflows, MCP integration, configuration, and AI-assisted development.",
-  "base_url": "https://docs.claude.com/en/docs/claude-code/",
+  "description": "Claude Code CLI and development environment. Use for Claude Code features, tools, workflows, MCP integration, plugins, hooks, configuration, deployment, and AI-assisted development.",
+  "base_url": "https://code.claude.com/docs/en/",
   "start_urls": [
-    "https://docs.claude.com/en/docs/claude-code/overview",
-    "https://docs.claude.com/en/docs/claude-code/quickstart",
-    "https://docs.claude.com/en/docs/claude-code/common-workflows",
-    "https://docs.claude.com/en/docs/claude-code/mcp",
-    "https://docs.claude.com/en/docs/claude-code/settings",
-    "https://docs.claude.com/en/docs/claude-code/troubleshooting",
-    "https://docs.claude.com/en/docs/claude-code/iam"
+    "https://code.claude.com/docs/en/overview",
+    "https://code.claude.com/docs/en/quickstart",
+    "https://code.claude.com/docs/en/common-workflows",
+    "https://code.claude.com/docs/en/claude-code-on-the-web",
+    "https://code.claude.com/docs/en/desktop",
+    "https://code.claude.com/docs/en/chrome",
+    "https://code.claude.com/docs/en/vs-code",
+    "https://code.claude.com/docs/en/jetbrains",
+    "https://code.claude.com/docs/en/github-actions",
+    "https://code.claude.com/docs/en/gitlab-ci-cd",
+    "https://code.claude.com/docs/en/slack",
+    "https://code.claude.com/docs/en/sub-agents",
+    "https://code.claude.com/docs/en/plugins",
+    "https://code.claude.com/docs/en/discover-plugins",
+    "https://code.claude.com/docs/en/skills",
+    "https://code.claude.com/docs/en/output-styles",
+    "https://code.claude.com/docs/en/hooks-guide",
+    "https://code.claude.com/docs/en/headless",
+    "https://code.claude.com/docs/en/mcp",
+    "https://code.claude.com/docs/en/third-party-integrations",
+    "https://code.claude.com/docs/en/amazon-bedrock",
+    "https://code.claude.com/docs/en/google-vertex-ai",
+    "https://code.claude.com/docs/en/microsoft-foundry",
+    "https://code.claude.com/docs/en/network-config",
+    "https://code.claude.com/docs/en/llm-gateway",
+    "https://code.claude.com/docs/en/devcontainer",
+    "https://code.claude.com/docs/en/sandboxing",
+    "https://code.claude.com/docs/en/setup",
+    "https://code.claude.com/docs/en/iam",
+    "https://code.claude.com/docs/en/security",
+    "https://code.claude.com/docs/en/data-usage",
+    "https://code.claude.com/docs/en/monitoring-usage",
+    "https://code.claude.com/docs/en/costs",
+    "https://code.claude.com/docs/en/analytics",
+    "https://code.claude.com/docs/en/plugin-marketplaces",
+    "https://code.claude.com/docs/en/settings",
+    "https://code.claude.com/docs/en/terminal-config",
+    "https://code.claude.com/docs/en/model-config",
+    "https://code.claude.com/docs/en/memory",
+    "https://code.claude.com/docs/en/statusline",
+    "https://code.claude.com/docs/en/cli-reference",
+    "https://code.claude.com/docs/en/interactive-mode",
+    "https://code.claude.com/docs/en/slash-commands",
+    "https://code.claude.com/docs/en/checkpointing",
+    "https://code.claude.com/docs/en/hooks",
+    "https://code.claude.com/docs/en/plugins-reference",
+    "https://code.claude.com/docs/en/troubleshooting",
+    "https://code.claude.com/docs/en/legal-and-compliance"
   ],
   "selectors": {
-    "main_content": "#content-container",
+    "main_content": "#content-area, #content-container, article, main",
     "title": "h1",
     "code_blocks": "pre code"
   },
   "url_patterns": {
-    "include": ["/claude-code/"],
-    "exclude": ["/api-reference/", "/claude-ai/", "/claude.ai/", "/prompt-engineering/", "/changelog/"]
+    "include": ["/docs/en/"],
+    "exclude": [
+      "/docs/fr/", "/docs/de/", "/docs/it/", "/docs/ja/", "/docs/es/",
+      "/docs/ko/", "/docs/zh-CN/", "/docs/zh-TW/", "/docs/ru/",
+      "/docs/id/", "/docs/pt/", "/changelog", "github.com"
+    ]
   },
   "categories": {
-    "getting_started": ["overview", "quickstart", "installation", "setup", "terminal-config"],
-    "workflows": ["workflow", "common-workflows", "git", "testing", "debugging", "interactive"],
+    "getting_started": ["overview", "quickstart", "common-workflows"],
+    "ide_integrations": ["vs-code", "jetbrains", "desktop", "chrome", "claude-code-on-the-web", "slack"],
+    "ci_cd": ["github-actions", "gitlab-ci-cd"],
+    "building": ["sub-agents", "subagent", "plugins", "discover-plugins", "skills", "output-styles", "hooks-guide", "headless", "programmatic"],
     "mcp": ["mcp", "model-context-protocol"],
-    "configuration": ["config", "settings", "preferences", "customize", "hooks", "statusline", "model-config", "memory", "output-styles"],
-    "agents": ["agent", "task", "subagent", "sub-agent", "specialized"],
-    "skills": ["skill", "agent-skill"],
-    "integrations": ["ide-integrations", "vs-code", "jetbrains", "plugin", "marketplace"],
-    "deployment": ["bedrock", "vertex", "deployment", "network", "gateway", "devcontainer", "sandboxing", "third-party"],
-    "reference": ["reference", "api", "command", "cli-reference", "slash", "checkpointing", "headless", "sdk"],
-    "enterprise": ["iam", "security", "monitoring", "analytics", "costs", "legal", "data-usage"]
+    "deployment": ["third-party-integrations", "amazon-bedrock", "google-vertex-ai", "microsoft-foundry", "network-config", "llm-gateway", "devcontainer", "sandboxing"],
+    "administration": ["setup", "iam", "security", "data-usage", "monitoring-usage", "costs", "analytics", "plugin-marketplaces"],
+    "configuration": ["settings", "terminal-config", "model-config", "memory", "statusline"],
+    "reference": ["cli-reference", "interactive-mode", "slash-commands", "checkpointing", "hooks", "plugins-reference"],
+    "troubleshooting": ["troubleshooting"],
+    "legal": ["legal-and-compliance"]
   },
   "rate_limit": 0.5,
-  "max_pages": 200
+  "max_pages": 250
 }
diff --git a/src/skill_seekers/cli/llms_txt_downloader.py b/src/skill_seekers/cli/llms_txt_downloader.py
index 1049f86..76ec740 100644
--- a/src/skill_seekers/cli/llms_txt_downloader.py
+++ b/src/skill_seekers/cli/llms_txt_downloader.py
@@ -38,11 +38,24 @@ class LlmsTxtDownloader:
 
     def _is_markdown(self, content: str) -> bool:
         """
-        Check if content looks like markdown.
+        Check if content looks like markdown (not HTML).
 
         Returns:
-            True if content contains markdown patterns
+            True if content contains markdown patterns and is NOT HTML
         """
+        # First, reject HTML content (common redirect trap)
+        content_start = content.strip()[:500].lower()
+        html_indicators = [
+            '<!doctype html',
+            '<html',
+            '<!doctype',
+            '<head>',
+            '<meta charset',
+        ]
+        if any(indicator in content_start for indicator in html_indicators):
+            return False
+
+        # Then check for markdown patterns
         markdown_patterns = ['# ', '## ', '```', '- ', '* ', '`']
         return any(pattern in content for pattern in markdown_patterns)
 

From 6008f13127e9b467632e1ba1001c474c9c22acae Mon Sep 17 00:00:00 2001
From: yusyus <yusufkaraaslan.yk@pm.me>
Date: Sun, 11 Jan 2026 14:16:44 +0300
Subject: [PATCH 7/7] test: Add comprehensive HTML detection tests for llms.txt
 downloader (PR #244 review fix)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Added 7 test cases to verify HTML redirect trap prevention:
- test_is_markdown_rejects_html_doctype() - DOCTYPE rejection (case-insensitive)
- test_is_markdown_rejects_html_tag() - <html> tag rejection
- test_is_markdown_rejects_html_meta() - <meta> and <head> tag rejection
- test_is_markdown_accepts_markdown_with_html_words() - Edge case: markdown mentioning "html"
- test_html_detection_only_scans_first_500_chars() - Performance optimization verification
- test_html_redirect_trap_scenario() - Real-world Claude Code redirect scenario
- test_download_rejects_html_redirect() - End-to-end download rejection

Addresses minor observation from PR #244 review:
- Ensures HTML detection logic is fully covered
- Prevents regression of redirect trap fixes
- Validates 500-char scanning optimization

Test Results: 20/20 llms_txt_downloader tests passing
Overall: 982 tests passing, plus 4 expected failures (missing anthropic package)

Related: PR #244 (Claude Code documentation config update)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
---
 tests/test_llms_txt_downloader.py | 92 +++++++++++++++++++++++++++++++
 1 file changed, 92 insertions(+)

diff --git a/tests/test_llms_txt_downloader.py b/tests/test_llms_txt_downloader.py
index 3b945fc..bcdc4dc 100644
--- a/tests/test_llms_txt_downloader.py
+++ b/tests/test_llms_txt_downloader.py
@@ -168,3 +168,95 @@ def test_get_proper_filename_small():
     filename = downloader.get_proper_filename()
 
     assert filename == "llms-small.md"
+
+def test_is_markdown_rejects_html_doctype():
+    """Test that HTML with DOCTYPE is rejected (prevents redirect trap)"""
+    downloader = LlmsTxtDownloader("https://example.com/llms.txt")
+
+    html = '<!DOCTYPE html><html><head><title>Product Page</title></head><body>Content</body></html>'
+    assert not downloader._is_markdown(html)
+
+    # Test case-insensitive
+    html_uppercase = '<!DOCTYPE HTML><HTML><BODY>Content</BODY></HTML>'
+    assert not downloader._is_markdown(html_uppercase)
+
+def test_is_markdown_rejects_html_tag():
+    """Test that HTML with <html> tag is rejected (prevents redirect trap)"""
+    downloader = LlmsTxtDownloader("https://example.com/llms.txt")
+
+    html = '<html><head><meta charset="utf-8"></head><body>Content</body></html>'
+    assert not downloader._is_markdown(html)
+
+    # Test with just opening tag
+    html_partial = '<html><head>Some content'
+    assert not downloader._is_markdown(html_partial)
+
+def test_is_markdown_rejects_html_meta():
+    """Test that HTML with <meta> or <head> tags is rejected"""
+    downloader = LlmsTxtDownloader("https://example.com/llms.txt")
+
+    html_with_head = '<head><title>Page</title></head><body>Content</body>'
+    assert not downloader._is_markdown(html_with_head)
+
+    html_with_meta = '<meta charset="utf-8"><meta name="viewport" content="width=device-width">'
+    assert not downloader._is_markdown(html_with_meta)
+
+def test_is_markdown_accepts_markdown_with_html_words():
+    """Test that markdown mentioning 'html' word is still accepted"""
+    downloader = LlmsTxtDownloader("https://example.com/llms.txt")
+
+    markdown = '# Guide\n\nLearn about html tags in markdown. You can write HTML inside markdown.'
+    assert downloader._is_markdown(markdown)
+
+    # Test with actual markdown patterns
+    markdown_with_code = '# HTML Tutorial\n\n```html\n<div>example</div>\n```\n\n## More content'
+    assert downloader._is_markdown(markdown_with_code)
+
+def test_html_detection_only_scans_first_500_chars():
+    """Test that HTML detection only scans the first 500 characters."""
+    downloader = LlmsTxtDownloader("https://example.com/llms.txt")
+
+    # Pad with >500 chars of markdown so the HTML marker lands outside the window
+    safe_markdown = '# Header\n\n' + ('Valid markdown content. ' * 50) + '\n\n<!DOCTYPE html>'
+    # Guard the fixture itself: the marker must start past the scanned prefix
+    assert safe_markdown.lower().find('<!doctype') >= 500
+    # HTML beyond 500 chars should not trigger rejection
+    assert downloader._is_markdown(safe_markdown)
+
+    # The same marker inside the scanned prefix must still be rejected
+    assert not downloader._is_markdown('<!DOCTYPE html>\n\n' + safe_markdown)
+
+def test_html_redirect_trap_scenario():
+    """Test real-world scenario: llms.txt redirects to HTML product page"""
+    downloader = LlmsTxtDownloader("https://example.com/llms.txt")
+
+    # Simulate Claude Code redirect scenario (302 to HTML page)
+    html_product_page = '''<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <title>Claude Code - Product Page</title>
+</head>
+<body>
+    <h1>Claude Code</h1>
+    <p>Product information...</p>
+</body>
+</html>'''
+
+    # Should reject this HTML even though the "- " in its <title> matches a markdown pattern
+    assert not downloader._is_markdown(html_product_page)
+
+def test_download_rejects_html_redirect():
+    """Test that download() properly rejects HTML redirects"""
+    downloader = LlmsTxtDownloader("https://example.com/llms.txt")
+
+    mock_response = Mock()
+    # Simulate server returning HTML instead of markdown
+    mock_response.text = '<!DOCTYPE html><html><body><h1>Product Page</h1></body></html>'
+    mock_response.raise_for_status = Mock()
+
+    with patch('requests.get', return_value=mock_response):
+        content = downloader.download()
+
+    # Should return None (rejected as non-markdown)
+    assert content is None