def _extract_html_as_markdown(self, html_content: str, url: str) -> Dict[str, Any]:
    """Extract content from HTML and convert it to a markdown-like page dict.

    Args:
        html_content: Raw HTML content.
        url: Source URL; stored unchanged in the returned dict.

    Returns:
        Page dict with ``url``, ``title``, ``content``, ``headings``,
        ``code_samples``, ``patterns`` and ``links``. This method never
        fills ``patterns`` or ``links``; they are returned as empty lists.
    """
    page = {
        'url': url,
        'title': '',
        'content': '',
        'headings': [],
        'code_samples': [],
        'patterns': [],
        'links': []
    }

    soup = BeautifulSoup(html_content, 'html.parser')

    # Title comes from the document <title>, when there is one.
    title_elem = soup.select_one('title')
    if title_elem:
        page['title'] = self.clean_text(title_elem.get_text())

    # Prefer a dedicated content container; otherwise fall back to <body>,
    # and finally to the whole document for fragments without a <body>.
    main = soup.select_one('main, article, [role="main"], .content')
    if not main:
        main = soup.body if soup.body else soup

    # Headings: keep the tag name (h1..h6) as the level marker.
    for h in main.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
        text = self.clean_text(h.get_text())
        if text:
            page['headings'].append({
                'level': h.name,
                'text': text,
                'id': h.get('id', '')
            })

    # Code blocks: walk <pre> elements only. Selecting 'pre code, pre'
    # matches both a <pre> and the <code> nested inside it, which stored
    # every <pre><code>...</code></pre> sample twice.
    for pre in main.find_all('pre'):
        code = pre.get_text()
        if len(code.strip()) > 10:
            # Hand the inner <code> element to the language detector when
            # present; otherwise the <pre> itself.
            inner = pre.find('code')
            lang = self.detect_language(inner if inner is not None else pre, code)
            page['code_samples'].append({
                'code': code.strip(),
                'language': lang
            })

    # Paragraphs: drop fragments of 20 characters or fewer.
    paragraphs = []
    for p in main.find_all('p'):
        text = self.clean_text(p.get_text())
        if text and len(text) > 20:
            paragraphs.append(text)
    page['content'] = '\n\n'.join(paragraphs)

    return page
re.sub(r'[-\s]+', '_', safe_title) - + filename = f"{safe_title}_{url_hash}.json" filepath = os.path.join(self.data_dir, "pages", filename) - + with open(filepath, 'w', encoding='utf-8') as f: json.dump(page, f, indent=2, ensure_ascii=False) @@ -408,6 +558,7 @@ class DocToSkillConverter: Note: Uses threading locks when workers > 1 for thread safety + Supports both HTML pages and Markdown (.md) files """ try: # Scraping part (no lock needed - independent) @@ -415,8 +566,12 @@ class DocToSkillConverter: response = requests.get(url, headers=headers, timeout=30) response.raise_for_status() - soup = BeautifulSoup(response.content, 'html.parser') - page = self.extract_content(soup, url) + # Check if this is a Markdown file + if url.endswith('.md') or '.md' in url: + page = self._extract_markdown_content(response.text, url) + else: + soup = BeautifulSoup(response.content, 'html.parser') + page = self.extract_content(soup, url) # Thread-safe operations (lock required) if self.workers > 1: @@ -463,6 +618,7 @@ class DocToSkillConverter: Note: Uses asyncio.Lock for async-safe operations instead of threading.Lock + Supports both HTML pages and Markdown (.md) files """ async with semaphore: # Limit concurrent requests try: @@ -471,9 +627,13 @@ class DocToSkillConverter: response = await client.get(url, headers=headers, timeout=30.0) response.raise_for_status() - # BeautifulSoup parsing (still synchronous, but fast) - soup = BeautifulSoup(response.content, 'html.parser') - page = self.extract_content(soup, url) + # Check if this is a Markdown file + if url.endswith('.md') or '.md' in url: + page = self._extract_markdown_content(response.text, url) + else: + # BeautifulSoup parsing (still synchronous, but fast) + soup = BeautifulSoup(response.content, 'html.parser') + page = self.extract_content(soup, url) # Async-safe operations (no lock needed - single event loop) logger.info(" %s", url) @@ -493,6 +653,56 @@ class DocToSkillConverter: except Exception as e: logger.error(" ✗ 
def _convert_to_md_urls(self, urls: List[str]) -> List[str]:
    """Map URLs to their Markdown variant without checking that they exist.

    A URL whose path already ends in ``.md`` is kept unchanged. Any other
    URL gets ``/index.html.md`` appended to its path. No HEAD request is
    sent here; the resulting URLs are validated when they are crawled.

    Args:
        urls: URLs to process.

    Returns:
        Unverified ``.md`` URLs, de-duplicated, in first-seen order.
    """
    from urllib.parse import urlsplit, urlunsplit

    md_urls: List[str] = []
    seen = set()

    for url in urls:
        try:
            parts = urlsplit(url)
        except ValueError:
            # e.g. an unbalanced "[" in the host; such a URL cannot be fetched.
            logger.debug("Skipping unparseable URL: %s", url)
            continue

        # Decide on the path only: a plain substring test also fires on
        # hosts or directories that merely contain ".md".
        if parts.path.endswith('.md'):
            md_url = url
        else:
            # Append to the path, not to the raw string, so a query string
            # stays a query string. The fragment is dropped: it is never
            # sent to the server and would otherwise end up as
            # "#frag/index.html.md".
            md_path = parts.path.rstrip('/') + '/index.html.md'
            md_url = urlunsplit((parts.scheme, parts.netloc, md_path, parts.query, ''))

        # "/docs" and "/docs/" map to the same target; queue it once.
        if md_url not in seen:
            seen.add(md_url)
            md_urls.append(md_url)

    logger.info(" ✓ Converted %d URLs to .md format (will validate during crawl)", len(md_urls))
    return md_urls
@@ -548,7 +758,29 @@ class DocToSkillConverter: logger.info(" ✓ %s (%d chars)", extra_filename, len(extra_content)) # Parse explicit file for skill building - parser = LlmsTxtParser(content) + parser = LlmsTxtParser(content, self.base_url) + + # Extract URLs from llms.txt and add to pending_urls for BFS crawling + extracted_urls = parser.extract_urls() + if extracted_urls: + # Convert non-.md URLs to .md format by trying /index.html.md suffix + md_urls = self._convert_to_md_urls(extracted_urls) + logger.info("\n🔗 Found %d URLs in llms.txt (%d .md files), starting BFS crawl...", + len(extracted_urls), len(md_urls)) + + # Filter URLs based on url_patterns config + for url in md_urls: + if self.is_valid_url(url) and url not in self.visited_urls: + self.pending_urls.append(url) + + logger.info(" 📋 %d URLs added to crawl queue after filtering", len(self.pending_urls)) + + # Return False to trigger HTML scraping with the populated pending_urls + self.llms_txt_detected = True + self.llms_txt_variant = 'explicit' + return False # Continue with BFS crawling + + # Fallback: if no URLs found, use section-based parsing pages = parser.parse() if pages: @@ -606,7 +838,29 @@ class DocToSkillConverter: largest = max(downloaded.items(), key=lambda x: x[1]['size']) logger.info("\n📄 Parsing %s for skill building...", largest[1]['filename']) - parser = LlmsTxtParser(largest[1]['content']) + parser = LlmsTxtParser(largest[1]['content'], self.base_url) + + # Extract URLs from llms.txt and add to pending_urls for BFS crawling + extracted_urls = parser.extract_urls() + if extracted_urls: + # Convert non-.md URLs to .md format by trying /index.html.md suffix + md_urls = self._convert_to_md_urls(extracted_urls) + logger.info("\n🔗 Found %d URLs in llms.txt (%d .md files), starting BFS crawl...", + len(extracted_urls), len(md_urls)) + + # Filter URLs based on url_patterns config + for url in md_urls: + if self.is_valid_url(url) and url not in self.visited_urls: + self.pending_urls.append(url) 
class LlmsTxtParser:
    """Parse llms.txt markdown content into page structures"""

    # Markdown inline link: [text](target)
    _MD_LINK_RE = re.compile(r'\[([^\]]*)\]\(([^)]+)\)')
    # Bare http(s) URL, stopping at whitespace, brackets and quotes
    _BARE_URL_RE = re.compile(r'https?://[^\s\)\]<>"\']+')

    def __init__(self, content: str, base_url: str = None):
        """Store the raw llms.txt text and an optional base URL.

        Args:
            content: Raw llms.txt content.
            base_url: Base used to resolve relative links; when it is not
                given, relative links are ignored by ``extract_urls``.
        """
        self.content = content
        self.base_url = base_url

    def extract_urls(self) -> List[str]:
        """Extract all http(s) URLs from the llms.txt content.

        Markdown link targets are collected first, then bare URLs. Relative
        targets are resolved against ``base_url``; pure ``#anchor`` targets
        and targets that do not resolve to http(s) are skipped.

        Returns:
            Unique URLs in first-seen order, so repeated runs over the same
            content produce the same list.
        """
        # dict keys keep insertion order: de-duplicated and deterministic.
        found = {}

        def remember(candidate: str) -> None:
            cleaned = self._clean_url(candidate)
            if cleaned.startswith(('http://', 'https://')):
                found.setdefault(cleaned, None)

        for _, target in self._MD_LINK_RE.findall(self.content):
            pieces = target.split()
            if not pieces:
                continue
            # The target may carry a title: [text](url "title"). Keep the URL only.
            href = pieces[0]
            if href.startswith('#'):
                continue
            if href.startswith(('http://', 'https://')):
                remember(href)
            elif self.base_url:
                # urljoin returns targets such as "mailto:..." unchanged;
                # remember() drops anything that is not http(s).
                remember(urljoin(self.base_url, href))

        for bare in self._BARE_URL_RE.findall(self.content):
            # Sentence punctuation right after a URL is not part of it.
            remember(bare.rstrip('.,;:'))

        return list(found)

    def _clean_url(self, url: str) -> str:
        """Strip a malformed anchor from a URL.

        An anchor that contains a path separator (for example
        ``#section/index.html.md``) is treated as malformed and removed
        together with the ``#``. Any other URL is returned unchanged.

        Args:
            url: URL to clean.

        Returns:
            The URL up to the first ``#`` when the text after it contains
            ``/``; otherwise the URL exactly as given.
        """
        base, hash_sign, fragment = url.partition('#')
        if hash_sign and '/' in fragment:
            return base
        return url
@@ -114,13 +121,22 @@ class UnifiedScraper: logger.error(f"Error scraping {source_type}: {e}") logger.info("Continuing with other sources...") - logger.info(f"\n✅ Scraped {len(self.scraped_data)} sources successfully") + logger.info(f"\n✅ Scraped {sum(len(v) for v in self.scraped_data.values())} sources successfully") def _scrape_documentation(self, source: Dict[str, Any]): """Scrape documentation website.""" - # Create temporary config for doc scraper + # Get unique index for this documentation source + idx = self._source_counters['documentation'] + self._source_counters['documentation'] += 1 + + # Extract source identifier from URL for unique naming + from urllib.parse import urlparse + parsed = urlparse(source['base_url']) + source_id = parsed.netloc.replace('.', '_').replace(':', '_') + + # Create temporary config for doc scraper with unique name doc_config = { - 'name': f"{self.name}_docs", + 'name': f"{self.name}_docs_{idx}_{source_id}", 'base_url': source['base_url'], 'selectors': source.get('selectors', {}), 'url_patterns': source.get('url_patterns', {}), @@ -164,10 +180,15 @@ class UnifiedScraper: with open(docs_data_file, 'r', encoding='utf-8') as f: summary = json.load(f) - self.scraped_data['documentation'] = { + # Append to list instead of overwriting + self.scraped_data['documentation'].append({ + 'source_id': source_id, + 'base_url': source['base_url'], 'pages': summary.get('pages', []), - 'data_file': docs_data_file - } + 'total_pages': summary.get('total_pages', 0), + 'data_file': docs_data_file, + 'refs_dir': f"output/{doc_config['name']}/references" + }) logger.info(f"✅ Documentation: {summary.get('total_pages', 0)} pages scraped") else: @@ -185,10 +206,18 @@ class UnifiedScraper: logger.error("github_scraper.py not found") return + # Get unique index for this GitHub source + idx = self._source_counters['github'] + self._source_counters['github'] += 1 + + # Extract repo identifier for unique naming + repo = source['repo'] + repo_id = 
repo.replace('/', '_') + # Create config for GitHub scraper github_config = { - 'repo': source['repo'], - 'name': f"{self.name}_github", + 'repo': repo, + 'name': f"{self.name}_github_{idx}_{repo_id}", 'github_token': source.get('github_token'), 'include_issues': source.get('include_issues', True), 'max_issues': source.get('max_issues', 100), @@ -197,7 +226,7 @@ class UnifiedScraper: 'include_code': source.get('include_code', True), 'code_analysis_depth': source.get('code_analysis_depth', 'surface'), 'file_patterns': source.get('file_patterns', []), - 'local_repo_path': source.get('local_repo_path') # Pass local_repo_path from config + 'local_repo_path': source.get('local_repo_path') } # Pass directory exclusions if specified (optional) @@ -207,19 +236,22 @@ class UnifiedScraper: github_config['exclude_dirs_additional'] = source['exclude_dirs_additional'] # Scrape - logger.info(f"Scraping GitHub repository: {source['repo']}") + logger.info(f"Scraping GitHub repository: {repo}") scraper = GitHubScraper(github_config) github_data = scraper.scrape() - # Save data - github_data_file = os.path.join(self.data_dir, 'github_data.json') + # Save data with unique filename + github_data_file = os.path.join(self.data_dir, f'github_data_{idx}_{repo_id}.json') with open(github_data_file, 'w', encoding='utf-8') as f: json.dump(github_data, f, indent=2, ensure_ascii=False) - self.scraped_data['github'] = { + # Append to list instead of overwriting + self.scraped_data['github'].append({ + 'repo': repo, + 'repo_id': repo_id, 'data': github_data, 'data_file': github_data_file - } + }) logger.info(f"✅ GitHub: Repository scraped successfully") @@ -274,14 +306,23 @@ class UnifiedScraper: logger.info("No API merge needed (only one API source)") return [] - # Get documentation and GitHub data - docs_data = self.scraped_data.get('documentation', {}) - github_data = self.scraped_data.get('github', {}) + # Get documentation and GitHub data (now lists) + docs_list = 
self.scraped_data.get('documentation', []) + github_list = self.scraped_data.get('github', []) - if not docs_data or not github_data: + if not docs_list or not github_list: logger.warning("Missing documentation or GitHub data for conflict detection") return [] + # For conflict detection, combine all docs and all github data + # Use the first of each for now (conflict detection is optional) + docs_data = docs_list[0] if docs_list else {} + github_data = github_list[0] if github_list else {} + + if not docs_data.get('data_file') or not github_data.get('data_file'): + logger.warning("Missing data files for conflict detection") + return [] + # Load data files with open(docs_data['data_file'], 'r', encoding='utf-8') as f: docs_json = json.load(f) @@ -328,9 +369,17 @@ class UnifiedScraper: logger.info("No conflicts to merge") return None - # Get data files - docs_data = self.scraped_data.get('documentation', {}) - github_data = self.scraped_data.get('github', {}) + # Get data files (now lists) + docs_list = self.scraped_data.get('documentation', []) + github_list = self.scraped_data.get('github', []) + + if not docs_list or not github_list: + logger.warning("Missing data for merge") + return None + + # Use first source of each type for merge + docs_data = docs_list[0] + github_data = github_list[0] # Load data with open(docs_data['data_file'], 'r', encoding='utf-8') as f: diff --git a/src/skill_seekers/cli/unified_skill_builder.py b/src/skill_seekers/cli/unified_skill_builder.py index b8f9700..a80f86d 100644 --- a/src/skill_seekers/cli/unified_skill_builder.py +++ b/src/skill_seekers/cli/unified_skill_builder.py @@ -268,118 +268,177 @@ This skill combines knowledge from multiple sources: """Generate reference files organized by source.""" logger.info("Generating reference files...") - # Generate references for each source type - if 'documentation' in self.scraped_data: - self._generate_docs_references() + # Generate references for each source type (now lists) + docs_list 
def _generate_docs_references(self, docs_list: List[Dict]):
    """Generate references from multiple documentation sources.

    For every entry in ``docs_list`` a subdirectory is created under
    ``<skill_dir>/references/documentation``, the regular files found in the
    entry's ``refs_dir`` are copied into it, and an ``index.md`` listing
    them is written. A top-level ``index.md`` then links to each
    per-source index.

    Args:
        docs_list: Documentation source dicts. Keys read here are
            ``source_id``, ``base_url``, ``refs_dir`` and ``total_pages``;
            each one is optional.
    """
    docs_dir = os.path.join(self.skill_dir, 'references', 'documentation')
    os.makedirs(docs_dir, exist_ok=True)

    used_dirs = set()
    # (source_id, dir_name, base_url, total_pages) per source, so the main
    # index links to exactly the directories created below.
    index_rows = []

    for i, doc_source in enumerate(docs_list):
        source_id = doc_source.get('source_id') or f'source_{i}'
        base_url = doc_source.get('base_url', 'Unknown')
        refs_dir = doc_source.get('refs_dir', '')
        total_pages = doc_source.get('total_pages', 'N/A')

        # Sources with the same source_id would share a directory and
        # overwrite each other; suffix the position to keep them apart.
        dir_name = source_id if source_id not in used_dirs else f'{source_id}_{i}'
        used_dirs.add(dir_name)

        source_dir = os.path.join(docs_dir, dir_name)
        os.makedirs(source_dir, exist_ok=True)

        copied_files: List[str] = []
        if refs_dir and os.path.isdir(refs_dir):
            for entry in sorted(os.listdir(refs_dir)):
                src_path = os.path.join(refs_dir, entry)
                if not os.path.isfile(src_path):
                    continue
                shutil.copy2(src_path, os.path.join(source_dir, entry))
                copied_files.append(entry)

        # Written after the copy, so this generated index replaces any
        # index.md that was copied from refs_dir.
        source_index_path = os.path.join(source_dir, 'index.md')
        with open(source_index_path, 'w', encoding='utf-8') as f:
            f.write(f"# Documentation: {source_id}\n\n")
            f.write(f"**Source**: {base_url}\n\n")
            f.write(f"**Pages**: {total_pages}\n\n")

            if copied_files:
                f.write("## Files\n\n")
                for filename in copied_files:
                    # The index does not list itself.
                    if filename.lower() != 'index.md':
                        f.write(f"- [{filename}]({filename})\n")
            else:
                f.write("No reference files available.\n")

        index_rows.append((source_id, dir_name, base_url, total_pages))

    # Main index linking to every per-source index.
    index_path = os.path.join(docs_dir, 'index.md')
    with open(index_path, 'w', encoding='utf-8') as f:
        f.write("# Documentation References\n\n")
        f.write(f"Combined from {len(docs_list)} documentation sources.\n\n")

        f.write("## Sources\n\n")
        for source_id, dir_name, base_url, total_pages in index_rows:
            f.write(f"- [{source_id}]({dir_name}/index.md) - {base_url} ({total_pages} pages)\n")

    logger.info(f"Created documentation references ({len(docs_list)} sources)")


def _generate_github_references(self, github_list: List[Dict]):
    """Generate references from multiple GitHub sources.

    For every entry in ``github_list`` a subdirectory is created under
    ``<skill_dir>/references/github`` holding ``README.md``, ``issues.md``
    (first 20 issues) and ``releases.md`` (first 10 releases) when the
    corresponding data is present, plus an ``index.md``. A top-level
    ``index.md`` then links to each repository index.

    Args:
        github_list: GitHub source dicts. Keys read here are ``repo``,
            ``repo_id`` and ``data``; within ``data`` the keys ``readme``,
            ``issues``, ``releases`` and ``repo_info`` are read.
    """
    github_dir = os.path.join(self.skill_dir, 'references', 'github')
    os.makedirs(github_dir, exist_ok=True)

    # (repo, repo_id, stars) per repository, reused for the main index so
    # its links use the same fallbacks as the directories created below.
    index_rows = []

    for i, github_source in enumerate(github_list):
        repo = github_source.get('repo') or f'repo_{i}'
        repo_id = github_source.get('repo_id') or repo.replace('/', '_')
        # "or" also covers keys that are present but set to None.
        github_data = github_source.get('data') or {}
        repo_info = github_data.get('repo_info') or {}
        readme = github_data.get('readme')
        issues = github_data.get('issues') or []
        releases = github_data.get('releases') or []

        repo_dir = os.path.join(github_dir, repo_id)
        os.makedirs(repo_dir, exist_ok=True)

        # README reference
        if readme:
            readme_path = os.path.join(repo_dir, 'README.md')
            with open(readme_path, 'w', encoding='utf-8') as f:
                f.write(f"# Repository README: {repo}\n\n")
                f.write(readme)

        # Issues reference
        if issues:
            issues_path = os.path.join(repo_dir, 'issues.md')
            with open(issues_path, 'w', encoding='utf-8') as f:
                f.write(f"# GitHub Issues: {repo}\n\n")
                f.write(f"{len(issues)} recent issues.\n\n")

                for issue in issues[:20]:
                    f.write(f"## #{issue['number']}: {issue['title']}\n\n")
                    f.write(f"**State**: {issue['state']}\n")
                    if issue.get('labels'):
                        f.write(f"**Labels**: {', '.join(issue['labels'])}\n")
                    f.write(f"**URL**: {issue.get('url', 'N/A')}\n\n")

        # Releases reference
        if releases:
            releases_path = os.path.join(repo_dir, 'releases.md')
            with open(releases_path, 'w', encoding='utf-8') as f:
                f.write(f"# Releases: {repo}\n\n")

                for release in releases[:10]:
                    # The key can be present with a None value; slicing
                    # None would raise TypeError.
                    published = release.get('published_at') or 'N/A'
                    f.write(f"## {release['tag_name']}: {release.get('name', 'N/A')}\n\n")
                    f.write(f"**Published**: {published[:10]}\n\n")
                    if release.get('body'):
                        f.write(release['body'][:500])
                        f.write("\n\n")

        # Per-repository index; link only to files that were written above.
        repo_index_path = os.path.join(repo_dir, 'index.md')
        with open(repo_index_path, 'w', encoding='utf-8') as f:
            f.write(f"# GitHub: {repo}\n\n")
            f.write(f"**Stars**: {repo_info.get('stars', 'N/A')}\n")
            f.write(f"**Language**: {repo_info.get('language', 'N/A')}\n")
            f.write(f"**Issues**: {len(issues)}\n")
            f.write(f"**Releases**: {len(releases)}\n\n")
            f.write("## Files\n\n")
            if readme:
                f.write("- [README.md](README.md)\n")
            if issues:
                f.write("- [issues.md](issues.md)\n")
            if releases:
                f.write("- [releases.md](releases.md)\n")

        index_rows.append((repo, repo_id, repo_info.get('stars', 'N/A')))

    # Main index linking to every repository index.
    index_path = os.path.join(github_dir, 'index.md')
    with open(index_path, 'w', encoding='utf-8') as f:
        f.write("# GitHub References\n\n")
        f.write(f"Combined from {len(github_list)} GitHub repositories.\n\n")

        f.write("## Repositories\n\n")
        for repo, repo_id, stars in index_rows:
            f.write(f"- [{repo}]({repo_id}/index.md) - {stars} stars\n")

    logger.info(f"Created GitHub references ({len(github_list)} repos)")


def _generate_pdf_references(self, pdf_list: List[Dict]):
    """Generate references from PDF sources.

    Only an ``index.md`` stating the number of PDF sources is written under
    ``<skill_dir>/references/pdf``; the entries themselves are not read.

    Args:
        pdf_list: PDF source dicts; only the length of the list is used.
    """
    pdf_dir = os.path.join(self.skill_dir, 'references', 'pdf')
    os.makedirs(pdf_dir, exist_ok=True)

    index_path = os.path.join(pdf_dir, 'index.md')
    with open(index_path, 'w', encoding='utf-8') as f:
        f.write("# PDF Documentation\n\n")
        f.write(f"Reference from {len(pdf_list)} PDF document(s).\n\n")

    logger.info(f"Created PDF references ({len(pdf_list)} sources)")