feat: support multiple sources of same type in unified scraper

- Add Markdown file parsing in doc_scraper (_extract_markdown_content, _extract_html_as_markdown)
- Add URL extraction and cleaning in llms_txt_parser (extract_urls, _clean_url)
- Support multiple documentation/github/pdf sources in unified_scraper
- Generate separate reference directories per source in unified_skill_builder
- Skip pages with empty/short content (<50 chars)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2026-01-05 21:45:36 +08:00
parent 26474c29eb
commit 8cf43582a4
4 changed files with 529 additions and 112 deletions
--- a/src/skill_seekers/cli/doc_scraper.py
+++ b/src/skill_seekers/cli/doc_scraper.py
@@ -349,6 +349,151 @@ class DocToSkillConverter:
        return page
    def _extract_markdown_content(self, content: str, url: str) -> Dict[str, Any]:
        """Extract content from a Markdown file.
        Args:
            content: Raw markdown content (or HTML if server returned HTML)
            url: Source URL
        Returns:
            Page dict with title, content, code_samples, headings, links
        """
        import re
        # Detect if content is actually HTML (some .md URLs return HTML)
        if content.strip().startswith('<!DOCTYPE') or content.strip().startswith('<html'):
            return self._extract_html_as_markdown(content, url)
        page = {
            'url': url,
            'title': '',
            'content': '',
            'headings': [],
            'code_samples': [],
            'patterns': [],
            'links': []
        }
        lines = content.split('\n')
        # Extract title from first h1
        for line in lines:
            if line.startswith('# '):
                page['title'] = line[2:].strip()
                break
        # Extract headings (h2-h6)
        for line in lines:
            match = re.match(r'^(#{2,6})\s+(.+)$', line)
            if match:
                level = len(match.group(1))
                text = match.group(2).strip()
                page['headings'].append({
                    'level': f'h{level}',
                    'text': text,
                    'id': text.lower().replace(' ', '-')
                })
        # Extract code blocks with language
        code_blocks = re.findall(r'```(\w+)?\n(.*?)```', content, re.DOTALL)
        for lang, code in code_blocks:
            if len(code.strip()) > 10:
                page['code_samples'].append({
                    'code': code.strip(),
                    'language': lang or 'unknown'
                })
        # Extract content (paragraphs)
        content_no_code = re.sub(r'```.*?```', '', content, flags=re.DOTALL)
        paragraphs = []
        for para in content_no_code.split('\n\n'):
            text = para.strip()
            # Skip headings and short text
            if text and len(text) > 20 and not text.startswith('#'):
                paragraphs.append(text)
        page['content'] = '\n\n'.join(paragraphs)
        # Extract links from markdown (only .md files to avoid client-side rendered HTML pages)
        md_links = re.findall(r'\[([^\]]*)\]\(([^)]+)\)', content)
        for _, href in md_links:
            if href.startswith('http'):
                full_url = href
            elif not href.startswith('#'):
                full_url = urljoin(url, href)
            else:
                continue
            # Strip anchor fragments
            full_url = full_url.split('#')[0]
            # Only include .md URLs to avoid client-side rendered HTML pages
            if '.md' in full_url and self.is_valid_url(full_url) and full_url not in page['links']:
                page['links'].append(full_url)
        return page
    def _extract_html_as_markdown(self, html_content: str, url: str) -> Dict[str, Any]:
        """Extract content from HTML into the same page structure used for Markdown.

        Args:
            html_content: Raw HTML content
            url: Source URL

        Returns:
            Page dict with url, title, content, headings, code_samples,
            patterns and links (patterns and links are left empty here)
        """
        page = {
            'url': url,
            'title': '',
            'content': '',
            'headings': [],
            'code_samples': [],
            'patterns': [],
            'links': []
        }
        soup = BeautifulSoup(html_content, 'html.parser')

        # Try to extract title
        title_elem = soup.select_one('title')
        if title_elem is not None:
            page['title'] = self.clean_text(title_elem.get_text())

        # Try to find main content area; fall back to <body>, then the whole document
        main = soup.select_one('main, article, [role="main"], .content')
        if main is None:
            main = soup.body if soup.body else soup

        # Extract headings
        for h in main.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
            text = self.clean_text(h.get_text())
            if text:
                page['headings'].append({
                    'level': h.name,
                    'text': text,
                    'id': h.get('id', '')
                })

        # Extract code blocks
        for code_elem in main.select('pre code, pre'):
            # The selector matches both a <pre> and any <code> nested in it.
            # Skip the wrapper so each snippet is recorded once, via its <code>.
            if code_elem.name == 'pre' and code_elem.find('code') is not None:
                continue
            code = code_elem.get_text()
            if len(code.strip()) > 10:
                lang = self.detect_language(code_elem, code)
                page['code_samples'].append({
                    'code': code.strip(),
                    'language': lang
                })

        # Extract paragraphs, dropping very short ones
        paragraphs = []
        for p in main.find_all('p'):
            text = self.clean_text(p.get_text())
            if text and len(text) > 20:
                paragraphs.append(text)
        page['content'] = '\n\n'.join(paragraphs)

        return page
    def detect_language(self, elem, code):
        """Detect programming language from code block
@@ -386,7 +531,12 @@ class DocToSkillConverter:
        return text.strip()
    def save_page(self, page: Dict[str, Any]) -> None:
-        """Save page data"""
+        """Save page data (skip pages with empty content)"""
        # Skip pages with empty or very short content
        if not page.get('content') or len(page.get('content', '')) < 50:
            logger.debug("Skipping page with empty/short content: %s", page.get('url', 'unknown'))
            return
        url_hash = hashlib.md5(page['url'].encode()).hexdigest()[:10]
        safe_title = re.sub(r'[^\w\s-]', '', page['title'])[:50]
        safe_title = re.sub(r'[-\s]+', '_', safe_title)
@@ -408,6 +558,7 @@ class DocToSkillConverter:
        Note:
            Uses threading locks when workers > 1 for thread safety
            Supports both HTML pages and Markdown (.md) files
        """
        try:
            # Scraping part (no lock needed - independent)
@@ -415,8 +566,12 @@ class DocToSkillConverter:
            response = requests.get(url, headers=headers, timeout=30)
            response.raise_for_status()
-            soup = BeautifulSoup(response.content, 'html.parser')
+            # Check if this is a Markdown file
-            page = self.extract_content(soup, url)
+            if url.endswith('.md') or '.md' in url:
                page = self._extract_markdown_content(response.text, url)
            else:
                soup = BeautifulSoup(response.content, 'html.parser')
                page = self.extract_content(soup, url)
            # Thread-safe operations (lock required)
            if self.workers > 1:
@@ -463,6 +618,7 @@ class DocToSkillConverter:
        Note:
            Uses asyncio.Lock for async-safe operations instead of threading.Lock
            Supports both HTML pages and Markdown (.md) files
        """
        async with semaphore:  # Limit concurrent requests
            try:
@@ -471,9 +627,13 @@ class DocToSkillConverter:
                response = await client.get(url, headers=headers, timeout=30.0)
                response.raise_for_status()
-                # BeautifulSoup parsing (still synchronous, but fast)
+                # Check if this is a Markdown file
-                soup = BeautifulSoup(response.content, 'html.parser')
+                if url.endswith('.md') or '.md' in url:
-                page = self.extract_content(soup, url)
+                    page = self._extract_markdown_content(response.text, url)
                else:
                    # BeautifulSoup parsing (still synchronous, but fast)
                    soup = BeautifulSoup(response.content, 'html.parser')
                    page = self.extract_content(soup, url)
                # Async-safe operations (no lock needed - single event loop)
                logger.info("  %s", url)
@@ -493,6 +653,56 @@ class DocToSkillConverter:
            except Exception as e:
                logger.error("  ✗ Error scraping %s: %s: %s", url, type(e).__name__, e)
    def _convert_to_md_urls(self, urls: List[str]) -> List[str]:
        """
        Convert URLs to .md format, trying /index.html.md suffix for non-.md URLs.
        不预先检查 URL 是否存在，直接加入队列，在爬取时再验证。
        Args:
            urls: List of URLs to process
        Returns:
            List of .md URLs (未验证)
        """
        md_urls = []
        for url in urls:
            if '.md' in url:
                md_urls.append(url)
            else:
                # 直接转换为 .md 格式，不发送 HEAD 请求检查
                url = url.rstrip('/')
                md_url = f"{url}/index.html.md"
                md_urls.append(md_url)
        logger.info("  ✓ Converted %d URLs to .md format (will validate during crawl)", len(md_urls))
        return md_urls
    # ORIGINAL _convert_to_md_urls (with HEAD request validation):
    # def _convert_to_md_urls(self, urls: List[str]) -> List[str]:
    #     md_urls = []
    #     non_md_urls = []
    #     for url in urls:
    #         if '.md' in url:
    #             md_urls.append(url)
    #         else:
    #             non_md_urls.append(url)
    #     if non_md_urls:
    #         logger.info("  🔄 Trying to convert %d non-.md URLs to .md format...", len(non_md_urls))
    #         converted = 0
    #         for url in non_md_urls:
    #             url = url.rstrip('/')
    #             md_url = f"{url}/index.html.md"
    #             try:
    #                 resp = requests.head(md_url, timeout=5, allow_redirects=True)
    #                 if resp.status_code == 200:
    #                     md_urls.append(md_url)
    #                     converted += 1
    #             except Exception:
    #                 pass
    #         logger.info("  ✓ Converted %d URLs to .md format", converted)
    #     return md_urls
    def _try_llms_txt(self) -> bool:
        """
        Try to use llms.txt instead of HTML scraping.
@@ -548,7 +758,29 @@ class DocToSkillConverter:
                            logger.info("     ✓ %s (%d chars)", extra_filename, len(extra_content))
                # Parse explicit file for skill building
-                parser = LlmsTxtParser(content)
+                parser = LlmsTxtParser(content, self.base_url)
                # Extract URLs from llms.txt and add to pending_urls for BFS crawling
                extracted_urls = parser.extract_urls()
                if extracted_urls:
                    # Convert non-.md URLs to .md format by trying /index.html.md suffix
                    md_urls = self._convert_to_md_urls(extracted_urls)
                    logger.info("\n🔗 Found %d URLs in llms.txt (%d .md files), starting BFS crawl...",
                               len(extracted_urls), len(md_urls))
                    # Filter URLs based on url_patterns config
                    for url in md_urls:
                        if self.is_valid_url(url) and url not in self.visited_urls:
                            self.pending_urls.append(url)
                    logger.info("  📋 %d URLs added to crawl queue after filtering", len(self.pending_urls))
                    # Return False to trigger HTML scraping with the populated pending_urls
                    self.llms_txt_detected = True
                    self.llms_txt_variant = 'explicit'
                    return False  # Continue with BFS crawling
                # Fallback: if no URLs found, use section-based parsing
                pages = parser.parse()
                if pages:
@@ -606,7 +838,29 @@ class DocToSkillConverter:
        largest = max(downloaded.items(), key=lambda x: x[1]['size'])
        logger.info("\n📄 Parsing %s for skill building...", largest[1]['filename'])
-        parser = LlmsTxtParser(largest[1]['content'])
+        parser = LlmsTxtParser(largest[1]['content'], self.base_url)
        # Extract URLs from llms.txt and add to pending_urls for BFS crawling
        extracted_urls = parser.extract_urls()
        if extracted_urls:
            # Convert non-.md URLs to .md format by trying /index.html.md suffix
            md_urls = self._convert_to_md_urls(extracted_urls)
            logger.info("\n🔗 Found %d URLs in llms.txt (%d .md files), starting BFS crawl...",
                       len(extracted_urls), len(md_urls))
            # Filter URLs based on url_patterns config
            for url in md_urls:
                if self.is_valid_url(url) and url not in self.visited_urls:
                    self.pending_urls.append(url)
            logger.info("  📋 %d URLs added to crawl queue after filtering", len(self.pending_urls))
            # Return False to trigger HTML scraping with the populated pending_urls
            self.llms_txt_detected = True
            self.llms_txt_variants = list(downloaded.keys())
            return False  # Continue with BFS crawling
        # Fallback: if no URLs found, use section-based parsing
        pages = parser.parse()
        if not pages:
--- a/src/skill_seekers/cli/llms_txt_parser.py
+++ b/src/skill_seekers/cli/llms_txt_parser.py
@@ -3,12 +3,67 @@
 import re
 from typing import List, Dict
 from urllib.parse import urljoin
 class LlmsTxtParser:
    """Parse llms.txt markdown content into page structures"""
-    def __init__(self, content: str):
+    def __init__(self, content: str, base_url: str = None):
        self.content = content
        self.base_url = base_url
    def extract_urls(self) -> List[str]:
        """
        Extract all URLs from the llms.txt content.

        Markdown links ``[text](url)`` are collected first, then bare
        http(s) URLs. Relative link targets are resolved against
        ``self.base_url`` when one was given; in-page anchors and non-http
        targets (e.g. ``mailto:``) are ignored. Every candidate goes through
        ``_clean_url`` before being recorded.

        Returns:
            List of unique URLs found in the content, in first-seen order
        """
        found: List[str] = []
        seen = set()

        def remember(candidate: str) -> None:
            # Record a cleaned URL once, preserving discovery order so the
            # resulting crawl queue is the same from run to run.
            cleaned = self._clean_url(candidate)
            if cleaned and cleaned not in seen:
                seen.add(cleaned)
                found.append(cleaned)

        # Match markdown links: [text](url)
        for _, target in re.findall(r'\[([^\]]*)\]\(([^)]+)\)', self.content):
            # The target may carry a title: (url "title"); keep only the URL part.
            tokens = target.split()
            if not tokens:
                continue
            href = tokens[0].strip('<>')
            if href.startswith('#'):
                continue
            if not href.startswith(('http://', 'https://')):
                if not self.base_url:
                    continue
                href = urljoin(self.base_url, href)
                # urljoin passes other schemes through untouched (mailto:, javascript:, ...)
                if not href.startswith(('http://', 'https://')):
                    continue
            remember(href)

        # Match bare URLs
        for url in re.findall(r'https?://[^\s\)\]<>"\']+', self.content):
            # Clean trailing punctuation
            remember(url.rstrip('.,;:'))

        return found
    def _clean_url(self, url: str) -> str:
        """
        Clean and validate URL, removing invalid anchor patterns.
        Args:
            url: URL to clean
        Returns:
            Cleaned URL or empty string if invalid
        """
        # Skip URLs with path after anchor (e.g., #section/index.html.md)
        # These are malformed and return duplicate HTML content
        if '#' in url:
            anchor_pos = url.index('#')
            after_anchor = url[anchor_pos + 1:]
            # If there's a path separator after anchor, it's invalid
            if '/' in after_anchor:
                # Extract the base URL without the malformed anchor
                return url[:anchor_pos]
        return url
    def parse(self) -> List[Dict]:
        """
--- a/src/skill_seekers/cli/unified_scraper.py
+++ b/src/skill_seekers/cli/unified_scraper.py
@@ -70,8 +70,12 @@ class UnifiedScraper:
        self.merge_mode = merge_mode or self.config.get('merge_mode', 'rule-based')
        logger.info(f"Merge mode: {self.merge_mode}")
-        # Storage for scraped data
+        # Storage for scraped data - use lists to support multiple sources of same type
-        self.scraped_data = {}
+        self.scraped_data = {
            'documentation': [],  # List of doc sources
            'github': [],         # List of github sources
            'pdf': []             # List of pdf sources
        }
        # Output paths
        self.name = self.config['name']
@@ -81,6 +85,9 @@ class UnifiedScraper:
        os.makedirs(self.output_dir, exist_ok=True)
        os.makedirs(self.data_dir, exist_ok=True)
        # Track source index for unique naming
        self._source_counters = {'documentation': 0, 'github': 0, 'pdf': 0}
    def scrape_all_sources(self):
        """
        Scrape all configured sources.
@@ -114,13 +121,22 @@ class UnifiedScraper:
                logger.error(f"Error scraping {source_type}: {e}")
                logger.info("Continuing with other sources...")
-        logger.info(f"\n✅ Scraped {len(self.scraped_data)} sources successfully")
+        logger.info(f"\n✅ Scraped {sum(len(v) for v in self.scraped_data.values())} sources successfully")
    def _scrape_documentation(self, source: Dict[str, Any]):
        """Scrape documentation website."""
-        # Create temporary config for doc scraper
+        # Get unique index for this documentation source
        idx = self._source_counters['documentation']
        self._source_counters['documentation'] += 1
        # Extract source identifier from URL for unique naming
        from urllib.parse import urlparse
        parsed = urlparse(source['base_url'])
        source_id = parsed.netloc.replace('.', '_').replace(':', '_')
        # Create temporary config for doc scraper with unique name
        doc_config = {
-            'name': f"{self.name}_docs",
+            'name': f"{self.name}_docs_{idx}_{source_id}",
            'base_url': source['base_url'],
            'selectors': source.get('selectors', {}),
            'url_patterns': source.get('url_patterns', {}),
@@ -164,10 +180,15 @@ class UnifiedScraper:
            with open(docs_data_file, 'r', encoding='utf-8') as f:
                summary = json.load(f)
-            self.scraped_data['documentation'] = {
+            # Append to list instead of overwriting
            self.scraped_data['documentation'].append({
                'source_id': source_id,
                'base_url': source['base_url'],
                'pages': summary.get('pages', []),
-                'data_file': docs_data_file
+                'total_pages': summary.get('total_pages', 0),
-            }
+                'data_file': docs_data_file,
                'refs_dir': f"output/{doc_config['name']}/references"
            })
            logger.info(f"✅ Documentation: {summary.get('total_pages', 0)} pages scraped")
        else:
@@ -185,10 +206,18 @@ class UnifiedScraper:
            logger.error("github_scraper.py not found")
            return
        # Get unique index for this GitHub source
        idx = self._source_counters['github']
        self._source_counters['github'] += 1
        # Extract repo identifier for unique naming
        repo = source['repo']
        repo_id = repo.replace('/', '_')
        # Create config for GitHub scraper
        github_config = {
-            'repo': source['repo'],
+            'repo': repo,
-            'name': f"{self.name}_github",
+            'name': f"{self.name}_github_{idx}_{repo_id}",
            'github_token': source.get('github_token'),
            'include_issues': source.get('include_issues', True),
            'max_issues': source.get('max_issues', 100),
@@ -197,7 +226,7 @@ class UnifiedScraper:
            'include_code': source.get('include_code', True),
            'code_analysis_depth': source.get('code_analysis_depth', 'surface'),
            'file_patterns': source.get('file_patterns', []),
-            'local_repo_path': source.get('local_repo_path')  # Pass local_repo_path from config
+            'local_repo_path': source.get('local_repo_path')
        }
        # Pass directory exclusions if specified (optional)
@@ -207,19 +236,22 @@ class UnifiedScraper:
            github_config['exclude_dirs_additional'] = source['exclude_dirs_additional']
        # Scrape
-        logger.info(f"Scraping GitHub repository: {source['repo']}")
+        logger.info(f"Scraping GitHub repository: {repo}")
        scraper = GitHubScraper(github_config)
        github_data = scraper.scrape()
-        # Save data
+        # Save data with unique filename
-        github_data_file = os.path.join(self.data_dir, 'github_data.json')
+        github_data_file = os.path.join(self.data_dir, f'github_data_{idx}_{repo_id}.json')
        with open(github_data_file, 'w', encoding='utf-8') as f:
            json.dump(github_data, f, indent=2, ensure_ascii=False)
-        self.scraped_data['github'] = {
+        # Append to list instead of overwriting
        self.scraped_data['github'].append({
            'repo': repo,
            'repo_id': repo_id,
            'data': github_data,
            'data_file': github_data_file
-        }
+        })
        logger.info(f"✅ GitHub: Repository scraped successfully")
@@ -274,14 +306,23 @@ class UnifiedScraper:
            logger.info("No API merge needed (only one API source)")
            return []
-        # Get documentation and GitHub data
+        # Get documentation and GitHub data (now lists)
-        docs_data = self.scraped_data.get('documentation', {})
+        docs_list = self.scraped_data.get('documentation', [])
-        github_data = self.scraped_data.get('github', {})
+        github_list = self.scraped_data.get('github', [])
-        if not docs_data or not github_data:
+        if not docs_list or not github_list:
            logger.warning("Missing documentation or GitHub data for conflict detection")
            return []
        # For conflict detection, combine all docs and all github data
        # Use the first of each for now (conflict detection is optional)
        docs_data = docs_list[0] if docs_list else {}
        github_data = github_list[0] if github_list else {}
        if not docs_data.get('data_file') or not github_data.get('data_file'):
            logger.warning("Missing data files for conflict detection")
            return []
        # Load data files
        with open(docs_data['data_file'], 'r', encoding='utf-8') as f:
            docs_json = json.load(f)
@@ -328,9 +369,17 @@ class UnifiedScraper:
            logger.info("No conflicts to merge")
            return None
-        # Get data files
+        # Get data files (now lists)
-        docs_data = self.scraped_data.get('documentation', {})
+        docs_list = self.scraped_data.get('documentation', [])
-        github_data = self.scraped_data.get('github', {})
+        github_list = self.scraped_data.get('github', [])
        if not docs_list or not github_list:
            logger.warning("Missing data for merge")
            return None
        # Use first source of each type for merge
        docs_data = docs_list[0]
        github_data = github_list[0]
        # Load data
        with open(docs_data['data_file'], 'r', encoding='utf-8') as f:
--- a/src/skill_seekers/cli/unified_skill_builder.py
+++ b/src/skill_seekers/cli/unified_skill_builder.py
@@ -268,118 +268,177 @@ This skill combines knowledge from multiple sources:
        """Generate reference files organized by source."""
        logger.info("Generating reference files...")
-        # Generate references for each source type
+        # Generate references for each source type (now lists)
-        if 'documentation' in self.scraped_data:
+        docs_list = self.scraped_data.get('documentation', [])
-            self._generate_docs_references()
+        if docs_list:
            self._generate_docs_references(docs_list)
-        if 'github' in self.scraped_data:
+        github_list = self.scraped_data.get('github', [])
-            self._generate_github_references()
+        if github_list:
            self._generate_github_references(github_list)
-        if 'pdf' in self.scraped_data:
+        pdf_list = self.scraped_data.get('pdf', [])
-            self._generate_pdf_references()
+        if pdf_list:
            self._generate_pdf_references(pdf_list)
        # Generate merged API reference if available
        if self.merged_data:
            self._generate_merged_api_reference()
-    def _generate_docs_references(self):
+    def _generate_docs_references(self, docs_list: List[Dict]):
-        """Generate references from documentation source."""
+        """Generate references from multiple documentation sources."""
        docs_dir = os.path.join(self.skill_dir, 'references', 'documentation')
        os.makedirs(docs_dir, exist_ok=True)
-        # Best-effort: copy docs-only reference files into unified docs references.
+        all_copied_files: List[str] = []
        # UnifiedScraper runs doc_scraper using name "{name}_docs", which creates
        # output/{name}_docs/references/*.md. Those are the most useful documentation
        # references for the unified skill.
        source_refs_dir = os.path.join('output', f"{self.name}_docs", 'references')
        copied_files: List[str] = []
-        if os.path.isdir(source_refs_dir):
+        # Process each documentation source
-            for entry in sorted(os.listdir(source_refs_dir)):
+        for i, doc_source in enumerate(docs_list):
-                src_path = os.path.join(source_refs_dir, entry)
+            source_id = doc_source.get('source_id', f'source_{i}')
-                dst_path = os.path.join(docs_dir, entry)
+            base_url = doc_source.get('base_url', 'Unknown')
-                if not os.path.isfile(src_path):
+            refs_dir = doc_source.get('refs_dir', '')
                    continue
                shutil.copy2(src_path, dst_path)
                copied_files.append(entry)
-        # Create index
+            # Create subdirectory for this source
            source_dir = os.path.join(docs_dir, source_id)
            os.makedirs(source_dir, exist_ok=True)
            copied_files: List[str] = []
            if refs_dir and os.path.isdir(refs_dir):
                for entry in sorted(os.listdir(refs_dir)):
                    src_path = os.path.join(refs_dir, entry)
                    dst_path = os.path.join(source_dir, entry)
                    if not os.path.isfile(src_path):
                        continue
                    shutil.copy2(src_path, dst_path)
                    copied_files.append(entry)
            # Create index for this source
            source_index_path = os.path.join(source_dir, 'index.md')
            with open(source_index_path, 'w', encoding='utf-8') as f:
                f.write(f"# Documentation: {source_id}\n\n")
                f.write(f"**Source**: {base_url}\n\n")
                f.write(f"**Pages**: {doc_source.get('total_pages', 'N/A')}\n\n")
                if copied_files:
                    files_no_index = [p for p in copied_files if p.lower() != 'index.md']
                    f.write("## Files\n\n")
                    for filename in files_no_index:
                        f.write(f"- [{filename}]({filename})\n")
                else:
                    f.write("No reference files available.\n")
            all_copied_files.extend(copied_files)
        # Create main index
        index_path = os.path.join(docs_dir, 'index.md')
        with open(index_path, 'w', encoding='utf-8') as f:
-            f.write("# Documentation\n\n")
+            f.write("# Documentation References\n\n")
-            f.write("Reference from official documentation.\n\n")
+            f.write(f"Combined from {len(docs_list)} documentation sources.\n\n")
-            if copied_files:
+            f.write("## Sources\n\n")
-                files_no_index = [p for p in copied_files if p.lower() != 'index.md']
+            for doc_source in docs_list:
-                files_index = [p for p in copied_files if p.lower() == 'index.md']
+                source_id = doc_source.get('source_id', 'unknown')
                base_url = doc_source.get('base_url', 'Unknown')
                total_pages = doc_source.get('total_pages', 'N/A')
                f.write(f"- [{source_id}]({source_id}/index.md) - {base_url} ({total_pages} pages)\n")
-                f.write("## Files\n\n")
+        logger.info(f"Created documentation references ({len(docs_list)} sources)")
                for filename in files_no_index + files_index:
                    f.write(f"- [{filename}]({filename})\n")
            else:
                f.write("## Notes\n\n")
                f.write(
                    "No documentation reference files were copied into this unified skill. "
                    "This usually means the docs-only build did not produce reference files.\n"
                )
-        logger.info("Created documentation references")
+    def _generate_github_references(self, github_list: List[Dict]):
-
+        """Generate references from multiple GitHub sources."""
    def _generate_github_references(self):
        """Generate references from GitHub source."""
        github_dir = os.path.join(self.skill_dir, 'references', 'github')
        os.makedirs(github_dir, exist_ok=True)
-        github_data = self.scraped_data['github']['data']
+        # Process each GitHub source
        for i, github_source in enumerate(github_list):
            repo = github_source.get('repo', f'repo_{i}')
            repo_id = github_source.get('repo_id', repo.replace('/', '_'))
            github_data = github_source.get('data', {})
-        # Create README reference
+            # Create subdirectory for this repo
-        if github_data.get('readme'):
+            repo_dir = os.path.join(github_dir, repo_id)
-            readme_path = os.path.join(github_dir, 'README.md')
+            os.makedirs(repo_dir, exist_ok=True)
            with open(readme_path, 'w') as f:
                f.write("# Repository README\n\n")
                f.write(github_data['readme'])
-        # Create issues reference
+            # Create README reference
-        if github_data.get('issues'):
+            if github_data.get('readme'):
-            issues_path = os.path.join(github_dir, 'issues.md')
+                readme_path = os.path.join(repo_dir, 'README.md')
-            with open(issues_path, 'w') as f:
+                with open(readme_path, 'w', encoding='utf-8') as f:
-                f.write("# GitHub Issues\n\n")
+                    f.write(f"# Repository README: {repo}\n\n")
-                f.write(f"{len(github_data['issues'])} recent issues.\n\n")
+                    f.write(github_data['readme'])
-                for issue in github_data['issues'][:20]:
+            # Create issues reference
-                    f.write(f"## #{issue['number']}: {issue['title']}\n\n")
+            if github_data.get('issues'):
-                    f.write(f"**State**: {issue['state']}\n")
+                issues_path = os.path.join(repo_dir, 'issues.md')
-                    if issue.get('labels'):
+                with open(issues_path, 'w', encoding='utf-8') as f:
-                        f.write(f"**Labels**: {', '.join(issue['labels'])}\n")
+                    f.write(f"# GitHub Issues: {repo}\n\n")
-                    f.write(f"**URL**: {issue.get('url', 'N/A')}\n\n")
+                    f.write(f"{len(github_data['issues'])} recent issues.\n\n")
-        # Create releases reference
+                    for issue in github_data['issues'][:20]:
-        if github_data.get('releases'):
+                        f.write(f"## #{issue['number']}: {issue['title']}\n\n")
-            releases_path = os.path.join(github_dir, 'releases.md')
+                        f.write(f"**State**: {issue['state']}\n")
-            with open(releases_path, 'w') as f:
+                        if issue.get('labels'):
-                f.write("# Releases\n\n")
+                            f.write(f"**Labels**: {', '.join(issue['labels'])}\n")
                        f.write(f"**URL**: {issue.get('url', 'N/A')}\n\n")
-                for release in github_data['releases'][:10]:
+            # Create releases reference
-                    f.write(f"## {release['tag_name']}: {release.get('name', 'N/A')}\n\n")
+            if github_data.get('releases'):
-                    f.write(f"**Published**: {release.get('published_at', 'N/A')[:10]}\n\n")
+                releases_path = os.path.join(repo_dir, 'releases.md')
-                    if release.get('body'):
+                with open(releases_path, 'w', encoding='utf-8') as f:
-                        f.write(release['body'][:500])
+                    f.write(f"# Releases: {repo}\n\n")
                        f.write("\n\n")
-        logger.info("Created GitHub references")
+                    for release in github_data['releases'][:10]:
                        f.write(f"## {release['tag_name']}: {release.get('name', 'N/A')}\n\n")
                        f.write(f"**Published**: {release.get('published_at', 'N/A')[:10]}\n\n")
                        if release.get('body'):
                            f.write(release['body'][:500])
                            f.write("\n\n")
-    def _generate_pdf_references(self):
+            # Create index for this repo
-        """Generate references from PDF source."""
+            repo_index_path = os.path.join(repo_dir, 'index.md')
            repo_info = github_data.get('repo_info', {})
            with open(repo_index_path, 'w', encoding='utf-8') as f:
                f.write(f"# GitHub: {repo}\n\n")
                f.write(f"**Stars**: {repo_info.get('stars', 'N/A')}\n")
                f.write(f"**Language**: {repo_info.get('language', 'N/A')}\n")
                f.write(f"**Issues**: {len(github_data.get('issues', []))}\n")
                f.write(f"**Releases**: {len(github_data.get('releases', []))}\n\n")
                f.write("## Files\n\n")
                f.write("- [README.md](README.md)\n")
                if github_data.get('issues'):
                    f.write("- [issues.md](issues.md)\n")
                if github_data.get('releases'):
                    f.write("- [releases.md](releases.md)\n")
        # Create main index
        index_path = os.path.join(github_dir, 'index.md')
        with open(index_path, 'w', encoding='utf-8') as f:
            f.write("# GitHub References\n\n")
            f.write(f"Combined from {len(github_list)} GitHub repositories.\n\n")
            f.write("## Repositories\n\n")
            for github_source in github_list:
                repo = github_source.get('repo', 'unknown')
                repo_id = github_source.get('repo_id', repo.replace('/', '_'))
                github_data = github_source.get('data', {})
                repo_info = github_data.get('repo_info', {})
                stars = repo_info.get('stars', 'N/A')
                f.write(f"- [{repo}]({repo_id}/index.md) - {stars} stars\n")
        logger.info(f"Created GitHub references ({len(github_list)} repos)")
    def _generate_pdf_references(self, pdf_list: List[Dict]) -> None:
        """Generate the reference index for the PDF sources.

        Creates ``<skill_dir>/references/pdf`` if it does not exist and
        writes an ``index.md`` there stating how many PDF documents the
        references were built from.

        Args:
            pdf_list: PDF source entries. Only the number of entries is
                used by this method.
        """
        pdf_dir = os.path.join(self.skill_dir, 'references', 'pdf')
        os.makedirs(pdf_dir, exist_ok=True)

        # Create index
        index_path = os.path.join(pdf_dir, 'index.md')
        # Explicit UTF-8 so the written file does not depend on the
        # platform's default encoding.
        with open(index_path, 'w', encoding='utf-8') as f:
            f.write("# PDF Documentation\n\n")
            f.write(f"Reference from {len(pdf_list)} PDF document(s).\n\n")

        logger.info(f"Created PDF references ({len(pdf_list)} sources)")
    def _generate_merged_api_reference(self):
        """Generate merged API reference file."""