From 8cf43582a4a62974e71eaa1361b18c20dc63bb97 Mon Sep 17 00:00:00 2001 From: tsyhahaha Date: Mon, 5 Jan 2026 21:45:36 +0800 Subject: [PATCH 1/7] feat: support multiple sources of same type in unified scraper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add Markdown file parsing in doc_scraper (_extract_markdown_content, _extract_html_as_markdown) - Add URL extraction and cleaning in llms_txt_parser (extract_urls, _clean_url) - Support multiple documentation/github/pdf sources in unified_scraper - Generate separate reference directories per source in unified_skill_builder - Skip pages with empty/short content (<50 chars) πŸ€– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- src/skill_seekers/cli/doc_scraper.py | 274 +++++++++++++++++- src/skill_seekers/cli/llms_txt_parser.py | 57 +++- src/skill_seekers/cli/unified_scraper.py | 95 ++++-- .../cli/unified_skill_builder.py | 215 +++++++++----- 4 files changed, 529 insertions(+), 112 deletions(-) diff --git a/src/skill_seekers/cli/doc_scraper.py b/src/skill_seekers/cli/doc_scraper.py index 74b1ee0..1e52181 100755 --- a/src/skill_seekers/cli/doc_scraper.py +++ b/src/skill_seekers/cli/doc_scraper.py @@ -349,6 +349,151 @@ class DocToSkillConverter: return page + def _extract_markdown_content(self, content: str, url: str) -> Dict[str, Any]: + """Extract content from a Markdown file. + + Args: + content: Raw markdown content (or HTML if server returned HTML) + url: Source URL + + Returns: + Page dict with title, content, code_samples, headings, links + """ + import re + + # Detect if content is actually HTML (some .md URLs return HTML) + if content.strip().startswith(' 10: + page['code_samples'].append({ + 'code': code.strip(), + 'language': lang or 'unknown' + }) + + # Extract content (paragraphs) + content_no_code = re.sub(r'```.*?```', '', content, flags=re.DOTALL) + paragraphs = [] + for para in content_no_code.split('\n\n'): + text = para.strip() + # Skip headings and short text + if text and len(text) > 20 and not text.startswith('#'): + paragraphs.append(text) + page['content'] = '\n\n'.join(paragraphs) + + # Extract links from markdown (only .md files to avoid client-side rendered HTML pages) + md_links = re.findall(r'\[([^\]]*)\]\(([^)]+)\)', content) + for _, href in md_links: + if href.startswith('http'): + full_url = href + elif not href.startswith('#'): + full_url = urljoin(url, href) + else: + continue + # Strip anchor fragments + full_url = full_url.split('#')[0] + # Only include .md URLs to avoid client-side rendered HTML pages + if '.md' in full_url and self.is_valid_url(full_url) and full_url not in page['links']: + page['links'].append(full_url) + + return page + + def _extract_html_as_markdown(self, html_content: str, url: str) -> Dict[str, Any]: + """Extract content from HTML and convert to markdown-like structure. + + Args: + html_content: Raw HTML content + url: Source URL + + Returns: + Page dict with title, content, code_samples, headings, links + """ + page = { + 'url': url, + 'title': '', + 'content': '', + 'headings': [], + 'code_samples': [], + 'patterns': [], + 'links': [] + } + + soup = BeautifulSoup(html_content, 'html.parser') + + # Try to extract title + title_elem = soup.select_one('title') + if title_elem: + page['title'] = self.clean_text(title_elem.get_text()) + + # Try to find main content area + main = soup.select_one('main, article, [role="main"], .content') + if not main: + main = soup.body if soup.body else soup + + if main: + # Extract headings + for h in main.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']): + text = self.clean_text(h.get_text()) + if text: + page['headings'].append({ + 'level': h.name, + 'text': text, + 'id': h.get('id', '') + }) + + # Extract code blocks + for code_elem in main.select('pre code, pre'): + code = code_elem.get_text() + if len(code.strip()) > 10: + lang = self.detect_language(code_elem, code) + page['code_samples'].append({ + 'code': code.strip(), + 'language': lang + }) + + # Extract paragraphs + paragraphs = [] + for p in main.find_all('p'): + text = self.clean_text(p.get_text()) + if text and len(text) > 20: + paragraphs.append(text) + page['content'] = '\n\n'.join(paragraphs) + + return page + def detect_language(self, elem, code): """Detect programming language from code block @@ -386,14 +531,19 @@ class DocToSkillConverter: return text.strip() def save_page(self, page: Dict[str, Any]) -> None: - """Save page data""" + """Save page data (skip pages with empty content)""" + # Skip pages with empty or very short content + if not page.get('content') or len(page.get('content', '')) < 50: + logger.debug("Skipping page with empty/short content: %s", page.get('url', 'unknown')) + return + url_hash = hashlib.md5(page['url'].encode()).hexdigest()[:10] safe_title = re.sub(r'[^\w\s-]', '', page['title'])[:50] safe_title = re.sub(r'[-\s]+', '_', safe_title) - + filename = f"{safe_title}_{url_hash}.json" filepath = os.path.join(self.data_dir, "pages", filename) - + with open(filepath, 'w', encoding='utf-8') as f: json.dump(page, f, indent=2, ensure_ascii=False) @@ -408,6 +558,7 @@ class DocToSkillConverter: Note: Uses threading locks when workers > 1 for thread safety + Supports both HTML pages and Markdown (.md) files """ try: # Scraping part (no lock needed - independent) @@ -415,8 +566,12 @@ class DocToSkillConverter: response = requests.get(url, headers=headers, timeout=30) response.raise_for_status() - soup = BeautifulSoup(response.content, 'html.parser') - page = self.extract_content(soup, url) + # Check if this is a Markdown file + if url.endswith('.md') or '.md' in url: + page = self._extract_markdown_content(response.text, url) + else: + soup = BeautifulSoup(response.content, 'html.parser') + page = self.extract_content(soup, url) # Thread-safe operations (lock required) if self.workers > 1: @@ -463,6 +618,7 @@ class DocToSkillConverter: Note: Uses asyncio.Lock for async-safe operations instead of threading.Lock + Supports both HTML pages and Markdown (.md) files """ async with semaphore: # Limit concurrent requests try: @@ -471,9 +627,13 @@ class DocToSkillConverter: response = await client.get(url, headers=headers, timeout=30.0) response.raise_for_status() - # BeautifulSoup parsing (still synchronous, but fast) - soup = BeautifulSoup(response.content, 'html.parser') - page = self.extract_content(soup, url) + # Check if this is a Markdown file + if url.endswith('.md') or '.md' in url: + page = self._extract_markdown_content(response.text, url) + else: + # BeautifulSoup parsing (still synchronous, but fast) + soup = BeautifulSoup(response.content, 'html.parser') + page = self.extract_content(soup, url) # Async-safe operations (no lock needed - single event loop) logger.info(" %s", url) @@ -493,6 +653,56 @@ class DocToSkillConverter: except Exception as e: logger.error(" βœ— Error scraping %s: %s: %s", url, type(e).__name__, e) + def _convert_to_md_urls(self, urls: List[str]) -> List[str]: + """ + Convert URLs to .md format, trying /index.html.md suffix for non-.md URLs. + δΈι’„ε…ˆζ£€ζŸ₯ URL ζ˜―ε¦ε­˜εœ¨οΌŒη›΄ζŽ₯加ε…₯ι˜Ÿεˆ—οΌŒεœ¨ηˆ¬ε–ζ—Άε†ιͺŒθ―γ€‚ + + Args: + urls: List of URLs to process + + Returns: + List of .md URLs (ζœͺιͺŒθ―) + """ + md_urls = [] + + for url in urls: + if '.md' in url: + md_urls.append(url) + else: + # η›΄ζŽ₯转捒为 .md ζ ΌεΌοΌŒδΈε‘ι€ HEAD 请求检ζŸ₯ + url = url.rstrip('/') + md_url = f"{url}/index.html.md" + md_urls.append(md_url) + + logger.info(" βœ“ Converted %d URLs to .md format (will validate during crawl)", len(md_urls)) + return md_urls + + # ORIGINAL _convert_to_md_urls (with HEAD request validation): + # def _convert_to_md_urls(self, urls: List[str]) -> List[str]: + # md_urls = [] + # non_md_urls = [] + # for url in urls: + # if '.md' in url: + # md_urls.append(url) + # else: + # non_md_urls.append(url) + # if non_md_urls: + # logger.info(" πŸ”„ Trying to convert %d non-.md URLs to .md format...", len(non_md_urls)) + # converted = 0 + # for url in non_md_urls: + # url = url.rstrip('/') + # md_url = f"{url}/index.html.md" + # try: + # resp = requests.head(md_url, timeout=5, allow_redirects=True) + # if resp.status_code == 200: + # md_urls.append(md_url) + # converted += 1 + # except Exception: + # pass + # logger.info(" βœ“ Converted %d URLs to .md format", converted) + # return md_urls + def _try_llms_txt(self) -> bool: """ Try to use llms.txt instead of HTML scraping. @@ -548,7 +758,29 @@ class DocToSkillConverter: logger.info(" βœ“ %s (%d chars)", extra_filename, len(extra_content)) # Parse explicit file for skill building - parser = LlmsTxtParser(content) + parser = LlmsTxtParser(content, self.base_url) + + # Extract URLs from llms.txt and add to pending_urls for BFS crawling + extracted_urls = parser.extract_urls() + if extracted_urls: + # Convert non-.md URLs to .md format by trying /index.html.md suffix + md_urls = self._convert_to_md_urls(extracted_urls) + logger.info("\nπŸ”— Found %d URLs in llms.txt (%d .md files), starting BFS crawl...", + len(extracted_urls), len(md_urls)) + + # Filter URLs based on url_patterns config + for url in md_urls: + if self.is_valid_url(url) and url not in self.visited_urls: + self.pending_urls.append(url) + + logger.info(" πŸ“‹ %d URLs added to crawl queue after filtering", len(self.pending_urls)) + + # Return False to trigger HTML scraping with the populated pending_urls + self.llms_txt_detected = True + self.llms_txt_variant = 'explicit' + return False # Continue with BFS crawling + + # Fallback: if no URLs found, use section-based parsing pages = parser.parse() if pages: @@ -606,7 +838,29 @@ class DocToSkillConverter: largest = max(downloaded.items(), key=lambda x: x[1]['size']) logger.info("\nπŸ“„ Parsing %s for skill building...", largest[1]['filename']) - parser = LlmsTxtParser(largest[1]['content']) + parser = LlmsTxtParser(largest[1]['content'], self.base_url) + + # Extract URLs from llms.txt and add to pending_urls for BFS crawling + extracted_urls = parser.extract_urls() + if extracted_urls: + # Convert non-.md URLs to .md format by trying /index.html.md suffix + md_urls = self._convert_to_md_urls(extracted_urls) + logger.info("\nπŸ”— Found %d URLs in llms.txt (%d .md files), starting BFS crawl...", + len(extracted_urls), len(md_urls)) + + # Filter URLs based on url_patterns config + for url in md_urls: + if self.is_valid_url(url) and url not in self.visited_urls: + self.pending_urls.append(url) + + logger.info(" πŸ“‹ %d URLs added to crawl queue after filtering", len(self.pending_urls)) + + # Return False to trigger HTML scraping with the populated pending_urls + self.llms_txt_detected = True + self.llms_txt_variants = list(downloaded.keys()) + return False # Continue with BFS crawling + + # Fallback: if no URLs found, use section-based parsing pages = parser.parse() if not pages: diff --git a/src/skill_seekers/cli/llms_txt_parser.py b/src/skill_seekers/cli/llms_txt_parser.py index e288c92..2e143bf 100644 --- a/src/skill_seekers/cli/llms_txt_parser.py +++ b/src/skill_seekers/cli/llms_txt_parser.py @@ -3,12 +3,67 @@ import re from typing import List, Dict +from urllib.parse import urljoin class LlmsTxtParser: """Parse llms.txt markdown content into page structures""" - def __init__(self, content: str): + def __init__(self, content: str, base_url: str = None): self.content = content + self.base_url = base_url + + def extract_urls(self) -> List[str]: + """ + Extract all URLs from the llms.txt content. + + Returns: + List of unique URLs found in the content + """ + urls = set() + + # Match markdown links: [text](url) + md_links = re.findall(r'\[([^\]]*)\]\(([^)]+)\)', self.content) + for _, url in md_links: + if url.startswith('http'): + clean_url = self._clean_url(url) + if clean_url: + urls.add(clean_url) + elif self.base_url and not url.startswith('#'): + clean_url = self._clean_url(urljoin(self.base_url, url)) + if clean_url: + urls.add(clean_url) + + # Match bare URLs + bare_urls = re.findall(r'https?://[^\s\)\]<>"\']+', self.content) + for url in bare_urls: + # Clean trailing punctuation + url = url.rstrip('.,;:') + clean_url = self._clean_url(url) + if clean_url: + urls.add(clean_url) + + return list(urls) + + def _clean_url(self, url: str) -> str: + """ + Clean and validate URL, removing invalid anchor patterns. + + Args: + url: URL to clean + + Returns: + Cleaned URL or empty string if invalid + """ + # Skip URLs with path after anchor (e.g., #section/index.html.md) + # These are malformed and return duplicate HTML content + if '#' in url: + anchor_pos = url.index('#') + after_anchor = url[anchor_pos + 1:] + # If there's a path separator after anchor, it's invalid + if '/' in after_anchor: + # Extract the base URL without the malformed anchor + return url[:anchor_pos] + return url def parse(self) -> List[Dict]: """ diff --git a/src/skill_seekers/cli/unified_scraper.py b/src/skill_seekers/cli/unified_scraper.py index f8b5dcf..f294f89 100644 --- a/src/skill_seekers/cli/unified_scraper.py +++ b/src/skill_seekers/cli/unified_scraper.py @@ -70,8 +70,12 @@ class UnifiedScraper: self.merge_mode = merge_mode or self.config.get('merge_mode', 'rule-based') logger.info(f"Merge mode: {self.merge_mode}") - # Storage for scraped data - self.scraped_data = {} + # Storage for scraped data - use lists to support multiple sources of same type + self.scraped_data = { + 'documentation': [], # List of doc sources + 'github': [], # List of github sources + 'pdf': [] # List of pdf sources + } # Output paths self.name = self.config['name'] @@ -81,6 +85,9 @@ class UnifiedScraper: os.makedirs(self.output_dir, exist_ok=True) os.makedirs(self.data_dir, exist_ok=True) + # Track source index for unique naming + self._source_counters = {'documentation': 0, 'github': 0, 'pdf': 0} + def scrape_all_sources(self): """ Scrape all configured sources. @@ -114,13 +121,22 @@ class UnifiedScraper: logger.error(f"Error scraping {source_type}: {e}") logger.info("Continuing with other sources...") - logger.info(f"\nβœ… Scraped {len(self.scraped_data)} sources successfully") + logger.info(f"\nβœ… Scraped {sum(len(v) for v in self.scraped_data.values())} sources successfully") def _scrape_documentation(self, source: Dict[str, Any]): """Scrape documentation website.""" - # Create temporary config for doc scraper + # Get unique index for this documentation source + idx = self._source_counters['documentation'] + self._source_counters['documentation'] += 1 + + # Extract source identifier from URL for unique naming + from urllib.parse import urlparse + parsed = urlparse(source['base_url']) + source_id = parsed.netloc.replace('.', '_').replace(':', '_') + + # Create temporary config for doc scraper with unique name doc_config = { - 'name': f"{self.name}_docs", + 'name': f"{self.name}_docs_{idx}_{source_id}", 'base_url': source['base_url'], 'selectors': source.get('selectors', {}), 'url_patterns': source.get('url_patterns', {}), @@ -164,10 +180,15 @@ class UnifiedScraper: with open(docs_data_file, 'r', encoding='utf-8') as f: summary = json.load(f) - self.scraped_data['documentation'] = { + # Append to list instead of overwriting + self.scraped_data['documentation'].append({ + 'source_id': source_id, + 'base_url': source['base_url'], 'pages': summary.get('pages', []), - 'data_file': docs_data_file - } + 'total_pages': summary.get('total_pages', 0), + 'data_file': docs_data_file, + 'refs_dir': f"output/{doc_config['name']}/references" + }) logger.info(f"βœ… Documentation: {summary.get('total_pages', 0)} pages scraped") else: @@ -185,10 +206,18 @@ class UnifiedScraper: logger.error("github_scraper.py not found") return + # Get unique index for this GitHub source + idx = self._source_counters['github'] + self._source_counters['github'] += 1 + + # Extract repo identifier for unique naming + repo = source['repo'] + repo_id = repo.replace('/', '_') + # Create config for GitHub scraper github_config = { - 'repo': source['repo'], - 'name': f"{self.name}_github", + 'repo': repo, + 'name': f"{self.name}_github_{idx}_{repo_id}", 'github_token': source.get('github_token'), 'include_issues': source.get('include_issues', True), 'max_issues': source.get('max_issues', 100), @@ -197,7 +226,7 @@ class UnifiedScraper: 'include_code': source.get('include_code', True), 'code_analysis_depth': source.get('code_analysis_depth', 'surface'), 'file_patterns': source.get('file_patterns', []), - 'local_repo_path': source.get('local_repo_path') # Pass local_repo_path from config + 'local_repo_path': source.get('local_repo_path') } # Pass directory exclusions if specified (optional) @@ -207,19 +236,22 @@ class UnifiedScraper: github_config['exclude_dirs_additional'] = source['exclude_dirs_additional'] # Scrape - logger.info(f"Scraping GitHub repository: {source['repo']}") + logger.info(f"Scraping GitHub repository: {repo}") scraper = GitHubScraper(github_config) github_data = scraper.scrape() - # Save data - github_data_file = os.path.join(self.data_dir, 'github_data.json') + # Save data with unique filename + github_data_file = os.path.join(self.data_dir, f'github_data_{idx}_{repo_id}.json') with open(github_data_file, 'w', encoding='utf-8') as f: json.dump(github_data, f, indent=2, ensure_ascii=False) - self.scraped_data['github'] = { + # Append to list instead of overwriting + self.scraped_data['github'].append({ + 'repo': repo, + 'repo_id': repo_id, 'data': github_data, 'data_file': github_data_file - } + }) logger.info(f"βœ… GitHub: Repository scraped successfully") @@ -274,14 +306,23 @@ class UnifiedScraper: logger.info("No API merge needed (only one API source)") return [] - # Get documentation and GitHub data - docs_data = self.scraped_data.get('documentation', {}) - github_data = self.scraped_data.get('github', {}) + # Get documentation and GitHub data (now lists) + docs_list = self.scraped_data.get('documentation', []) + github_list = self.scraped_data.get('github', []) - if not docs_data or not github_data: + if not docs_list or not github_list: logger.warning("Missing documentation or GitHub data for conflict detection") return [] + # For conflict detection, combine all docs and all github data + # Use the first of each for now (conflict detection is optional) + docs_data = docs_list[0] if docs_list else {} + github_data = github_list[0] if github_list else {} + + if not docs_data.get('data_file') or not github_data.get('data_file'): + logger.warning("Missing data files for conflict detection") + return [] + # Load data files with open(docs_data['data_file'], 'r', encoding='utf-8') as f: docs_json = json.load(f) @@ -328,9 +369,17 @@ class UnifiedScraper: logger.info("No conflicts to merge") return None - # Get data files - docs_data = self.scraped_data.get('documentation', {}) - github_data = self.scraped_data.get('github', {}) + # Get data files (now lists) + docs_list = self.scraped_data.get('documentation', []) + github_list = self.scraped_data.get('github', []) + + if not docs_list or not github_list: + logger.warning("Missing data for merge") + return None + + # Use first source of each type for merge + docs_data = docs_list[0] + github_data = github_list[0] # Load data with open(docs_data['data_file'], 'r', encoding='utf-8') as f: diff --git a/src/skill_seekers/cli/unified_skill_builder.py b/src/skill_seekers/cli/unified_skill_builder.py index b8f9700..a80f86d 100644 --- a/src/skill_seekers/cli/unified_skill_builder.py +++ b/src/skill_seekers/cli/unified_skill_builder.py @@ -268,118 +268,177 @@ This skill combines knowledge from multiple sources: """Generate reference files organized by source.""" logger.info("Generating reference files...") - # Generate references for each source type - if 'documentation' in self.scraped_data: - self._generate_docs_references() + # Generate references for each source type (now lists) + docs_list = self.scraped_data.get('documentation', []) + if docs_list: + self._generate_docs_references(docs_list) - if 'github' in self.scraped_data: - self._generate_github_references() + github_list = self.scraped_data.get('github', []) + if github_list: + self._generate_github_references(github_list) - if 'pdf' in self.scraped_data: - self._generate_pdf_references() + pdf_list = self.scraped_data.get('pdf', []) + if pdf_list: + self._generate_pdf_references(pdf_list) # Generate merged API reference if available if self.merged_data: self._generate_merged_api_reference() - def _generate_docs_references(self): - """Generate references from documentation source.""" + def _generate_docs_references(self, docs_list: List[Dict]): + """Generate references from multiple documentation sources.""" docs_dir = os.path.join(self.skill_dir, 'references', 'documentation') os.makedirs(docs_dir, exist_ok=True) - # Best-effort: copy docs-only reference files into unified docs references. - # UnifiedScraper runs doc_scraper using name "{name}_docs", which creates - # output/{name}_docs/references/*.md. Those are the most useful documentation - # references for the unified skill. - source_refs_dir = os.path.join('output', f"{self.name}_docs", 'references') - copied_files: List[str] = [] + all_copied_files: List[str] = [] - if os.path.isdir(source_refs_dir): - for entry in sorted(os.listdir(source_refs_dir)): - src_path = os.path.join(source_refs_dir, entry) - dst_path = os.path.join(docs_dir, entry) - if not os.path.isfile(src_path): - continue - shutil.copy2(src_path, dst_path) - copied_files.append(entry) + # Process each documentation source + for i, doc_source in enumerate(docs_list): + source_id = doc_source.get('source_id', f'source_{i}') + base_url = doc_source.get('base_url', 'Unknown') + refs_dir = doc_source.get('refs_dir', '') - # Create index + # Create subdirectory for this source + source_dir = os.path.join(docs_dir, source_id) + os.makedirs(source_dir, exist_ok=True) + + copied_files: List[str] = [] + + if refs_dir and os.path.isdir(refs_dir): + for entry in sorted(os.listdir(refs_dir)): + src_path = os.path.join(refs_dir, entry) + dst_path = os.path.join(source_dir, entry) + if not os.path.isfile(src_path): + continue + shutil.copy2(src_path, dst_path) + copied_files.append(entry) + + # Create index for this source + source_index_path = os.path.join(source_dir, 'index.md') + with open(source_index_path, 'w', encoding='utf-8') as f: + f.write(f"# Documentation: {source_id}\n\n") + f.write(f"**Source**: {base_url}\n\n") + f.write(f"**Pages**: {doc_source.get('total_pages', 'N/A')}\n\n") + + if copied_files: + files_no_index = [p for p in copied_files if p.lower() != 'index.md'] + f.write("## Files\n\n") + for filename in files_no_index: + f.write(f"- [{filename}]({filename})\n") + else: + f.write("No reference files available.\n") + + all_copied_files.extend(copied_files) + + # Create main index index_path = os.path.join(docs_dir, 'index.md') with open(index_path, 'w', encoding='utf-8') as f: - f.write("# Documentation\n\n") - f.write("Reference from official documentation.\n\n") + f.write("# Documentation References\n\n") + f.write(f"Combined from {len(docs_list)} documentation sources.\n\n") - if copied_files: - files_no_index = [p for p in copied_files if p.lower() != 'index.md'] - files_index = [p for p in copied_files if p.lower() == 'index.md'] + f.write("## Sources\n\n") + for doc_source in docs_list: + source_id = doc_source.get('source_id', 'unknown') + base_url = doc_source.get('base_url', 'Unknown') + total_pages = doc_source.get('total_pages', 'N/A') + f.write(f"- [{source_id}]({source_id}/index.md) - {base_url} ({total_pages} pages)\n") - f.write("## Files\n\n") - for filename in files_no_index + files_index: - f.write(f"- [{filename}]({filename})\n") - else: - f.write("## Notes\n\n") - f.write( - "No documentation reference files were copied into this unified skill. " - "This usually means the docs-only build did not produce reference files.\n" - ) + logger.info(f"Created documentation references ({len(docs_list)} sources)") - logger.info("Created documentation references") - - def _generate_github_references(self): - """Generate references from GitHub source.""" + def _generate_github_references(self, github_list: List[Dict]): + """Generate references from multiple GitHub sources.""" github_dir = os.path.join(self.skill_dir, 'references', 'github') os.makedirs(github_dir, exist_ok=True) - github_data = self.scraped_data['github']['data'] + # Process each GitHub source + for i, github_source in enumerate(github_list): + repo = github_source.get('repo', f'repo_{i}') + repo_id = github_source.get('repo_id', repo.replace('/', '_')) + github_data = github_source.get('data', {}) - # Create README reference - if github_data.get('readme'): - readme_path = os.path.join(github_dir, 'README.md') - with open(readme_path, 'w') as f: - f.write("# Repository README\n\n") - f.write(github_data['readme']) + # Create subdirectory for this repo + repo_dir = os.path.join(github_dir, repo_id) + os.makedirs(repo_dir, exist_ok=True) - # Create issues reference - if github_data.get('issues'): - issues_path = os.path.join(github_dir, 'issues.md') - with open(issues_path, 'w') as f: - f.write("# GitHub Issues\n\n") - f.write(f"{len(github_data['issues'])} recent issues.\n\n") + # Create README reference + if github_data.get('readme'): + readme_path = os.path.join(repo_dir, 'README.md') + with open(readme_path, 'w', encoding='utf-8') as f: + f.write(f"# Repository README: {repo}\n\n") + f.write(github_data['readme']) - for issue in github_data['issues'][:20]: - f.write(f"## #{issue['number']}: {issue['title']}\n\n") - f.write(f"**State**: {issue['state']}\n") - if issue.get('labels'): - f.write(f"**Labels**: {', '.join(issue['labels'])}\n") - f.write(f"**URL**: {issue.get('url', 'N/A')}\n\n") + # Create issues reference + if github_data.get('issues'): + issues_path = os.path.join(repo_dir, 'issues.md') + with open(issues_path, 'w', encoding='utf-8') as f: + f.write(f"# GitHub Issues: {repo}\n\n") + f.write(f"{len(github_data['issues'])} recent issues.\n\n") - # Create releases reference - if github_data.get('releases'): - releases_path = os.path.join(github_dir, 'releases.md') - with open(releases_path, 'w') as f: - f.write("# Releases\n\n") + for issue in github_data['issues'][:20]: + f.write(f"## #{issue['number']}: {issue['title']}\n\n") + f.write(f"**State**: {issue['state']}\n") + if issue.get('labels'): + f.write(f"**Labels**: {', '.join(issue['labels'])}\n") + f.write(f"**URL**: {issue.get('url', 'N/A')}\n\n") - for release in github_data['releases'][:10]: - f.write(f"## {release['tag_name']}: {release.get('name', 'N/A')}\n\n") - f.write(f"**Published**: {release.get('published_at', 'N/A')[:10]}\n\n") - if release.get('body'): - f.write(release['body'][:500]) - f.write("\n\n") + # Create releases reference + if github_data.get('releases'): + releases_path = os.path.join(repo_dir, 'releases.md') + with open(releases_path, 'w', encoding='utf-8') as f: + f.write(f"# Releases: {repo}\n\n") - logger.info("Created GitHub references") + for release in github_data['releases'][:10]: + f.write(f"## {release['tag_name']}: {release.get('name', 'N/A')}\n\n") + f.write(f"**Published**: {release.get('published_at', 'N/A')[:10]}\n\n") + if release.get('body'): + f.write(release['body'][:500]) + f.write("\n\n") - def _generate_pdf_references(self): - """Generate references from PDF source.""" + # Create index for this repo + repo_index_path = os.path.join(repo_dir, 'index.md') + repo_info = github_data.get('repo_info', {}) + with open(repo_index_path, 'w', encoding='utf-8') as f: + f.write(f"# GitHub: {repo}\n\n") + f.write(f"**Stars**: {repo_info.get('stars', 'N/A')}\n") + f.write(f"**Language**: {repo_info.get('language', 'N/A')}\n") + f.write(f"**Issues**: {len(github_data.get('issues', []))}\n") + f.write(f"**Releases**: {len(github_data.get('releases', []))}\n\n") + f.write("## Files\n\n") + f.write("- [README.md](README.md)\n") + if github_data.get('issues'): + f.write("- [issues.md](issues.md)\n") + if github_data.get('releases'): + f.write("- [releases.md](releases.md)\n") + + # Create main index + index_path = os.path.join(github_dir, 'index.md') + with open(index_path, 'w', encoding='utf-8') as f: + f.write("# GitHub References\n\n") + f.write(f"Combined from {len(github_list)} GitHub repositories.\n\n") + + f.write("## Repositories\n\n") + for github_source in github_list: + repo = github_source.get('repo', 'unknown') + repo_id = github_source.get('repo_id', repo.replace('/', '_')) + github_data = github_source.get('data', {}) + repo_info = github_data.get('repo_info', {}) + stars = repo_info.get('stars', 'N/A') + f.write(f"- [{repo}]({repo_id}/index.md) - {stars} stars\n") + + logger.info(f"Created GitHub references ({len(github_list)} repos)") + + def _generate_pdf_references(self, pdf_list: List[Dict]): + """Generate references from PDF sources.""" pdf_dir = os.path.join(self.skill_dir, 'references', 'pdf') os.makedirs(pdf_dir, exist_ok=True) # Create index index_path = os.path.join(pdf_dir, 'index.md') - with open(index_path, 'w') as f: + with open(index_path, 'w', encoding='utf-8') as f: f.write("# PDF Documentation\n\n") - f.write("Reference from PDF document.\n\n") + f.write(f"Reference from {len(pdf_list)} PDF document(s).\n\n") - logger.info("Created PDF references") + logger.info(f"Created PDF references ({len(pdf_list)} sources)") def _generate_merged_api_reference(self): """Generate merged API reference file.""" From 4b764ed1c50317511163a0126fc74be1c20007e4 Mon Sep 17 00:00:00 2001 From: tsyhahaha Date: Mon, 5 Jan 2026 22:13:19 +0800 Subject: [PATCH 2/7] test: add unit tests for markdown parsing and multi-source features MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add test_markdown_parsing.py with 20 tests covering: - Markdown content extraction (titles, headings, code blocks, links) - HTML fallback when .md URL returns HTML - llms.txt URL extraction and cleaning - Empty/short content filtering - Add test_multi_source.py with 12 tests covering: - List-based scraped_data structure - Per-source subdirectory generation for docs/github/pdf - Index file generation for each source type πŸ€– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- tests/test_markdown_parsing.py | 359 +++++++++++++++++++++++++++ tests/test_multi_source.py | 433 +++++++++++++++++++++++++++++++++ 2 files changed, 792 insertions(+) create mode 100644 tests/test_markdown_parsing.py create mode 100644 tests/test_multi_source.py diff --git a/tests/test_markdown_parsing.py b/tests/test_markdown_parsing.py new file mode 100644 index 0000000..9917225 --- /dev/null +++ b/tests/test_markdown_parsing.py @@ -0,0 +1,359 @@ +""" +Tests for Markdown parsing and BFS URL crawling features. + +Tests the following functionality: +1. Markdown file content extraction (_extract_markdown_content) +2. HTML fallback when .md URL returns HTML (_extract_html_as_markdown) +3. URL extraction from llms.txt (extract_urls, _clean_url) +4. Empty/short content filtering in save_page +""" + +import unittest +import tempfile +import os +import shutil + + +class TestMarkdownContentExtraction(unittest.TestCase): + """Test Markdown file parsing in doc_scraper.""" + + def setUp(self): + """Set up test fixtures.""" + from skill_seekers.cli.doc_scraper import DocToSkillConverter + + self.config = { + 'name': 'test_md_parsing', + 'base_url': 'https://example.com', + 'selectors': {}, + 'url_patterns': {'include': [], 'exclude': []}, + 'categories': {} + } + self.converter = DocToSkillConverter(self.config) + + def tearDown(self): + """Clean up output directory.""" + output_dir = f"output/{self.config['name']}_data" + if os.path.exists(output_dir): + shutil.rmtree(output_dir) + + def test_extract_title_from_h1(self): + """Test extracting title from first h1.""" + content = "# My Documentation Title\n\nSome content here." + result = self.converter._extract_markdown_content(content, "https://example.com/test.md") + self.assertEqual(result['title'], "My Documentation Title") + + def test_extract_headings_h2_to_h6(self): + """Test extracting h2-h6 headings (not h1).""" + content = """# Title + +## Section One +### Subsection A +#### Deep Section +##### Deeper +###### Deepest + +Content here. +""" + result = self.converter._extract_markdown_content(content, "https://example.com/test.md") + # Should have 5 headings (h2-h6), not h1 + self.assertEqual(len(result['headings']), 5) + self.assertEqual(result['headings'][0]['level'], 'h2') + self.assertEqual(result['headings'][0]['text'], 'Section One') + + def test_extract_code_blocks_with_language(self): + """Test extracting code blocks with language tags.""" + content = """# API Guide + +```python +def hello(): + return "Hello, World!" +``` + +Some explanation. + +```javascript +const greet = () => console.log("Hi"); +``` + +``` +plain code without language +``` +""" + result = self.converter._extract_markdown_content(content, "https://example.com/test.md") + self.assertEqual(len(result['code_samples']), 3) + self.assertEqual(result['code_samples'][0]['language'], 'python') + self.assertEqual(result['code_samples'][1]['language'], 'javascript') + self.assertEqual(result['code_samples'][2]['language'], 'unknown') + + def test_extract_markdown_links_only_md_files(self): + """Test that only .md links are extracted.""" + content = """# Links + +- [Markdown Doc](./guide.md) +- [Another MD](https://example.com/api.md) +- [HTML Page](./page.html) +- [External](https://google.com) +""" + result = self.converter._extract_markdown_content(content, "https://example.com/docs/test.md") + # Should only include .md links + md_links = [l for l in result['links'] if '.md' in l] + self.assertEqual(len(md_links), len(result['links'])) + + def test_extract_content_paragraphs(self): + """Test extracting paragraph content.""" + content = """# Title + +This is a paragraph with enough content to pass the minimum length filter. + +Short. + +Another paragraph that should be included in the final content output. +""" + result = self.converter._extract_markdown_content(content, "https://example.com/test.md") + self.assertIn("paragraph with enough content", result['content']) + self.assertNotIn("Short.", result['content']) + + def test_detect_html_in_md_url(self): + """Test that HTML content is detected when .md URL returns HTML.""" + html_content = "Page

Hello

" + result = self.converter._extract_markdown_content(html_content, "https://example.com/test.md") + self.assertEqual(result['title'], "Page") + + +class TestHtmlAsMarkdownExtraction(unittest.TestCase): + """Test HTML to markdown-like extraction.""" + + def setUp(self): + """Set up test fixtures.""" + from skill_seekers.cli.doc_scraper import DocToSkillConverter + + self.config = { + 'name': 'test_html_fallback', + 'base_url': 'https://example.com', + 'selectors': {}, + 'url_patterns': {'include': [], 'exclude': []}, + 'categories': {} + } + self.converter = DocToSkillConverter(self.config) + + def tearDown(self): + """Clean up output directory.""" + output_dir = f"output/{self.config['name']}_data" + if os.path.exists(output_dir): + shutil.rmtree(output_dir) + + def test_extract_title_from_html(self): + """Test extracting title from HTML title tag.""" + html = "My Page Title" + result = self.converter._extract_html_as_markdown(html, "https://example.com/test.md") + self.assertEqual(result['title'], "My Page Title") + + def test_find_main_content_area(self): + """Test finding main content from various selectors.""" + html = """ + + +
+

Main Content

+

This is the main content area with enough text to pass filters.

+
+
Footer
+ + """ + result = self.converter._extract_html_as_markdown(html, "https://example.com/test.md") + self.assertIn("main content area", result['content'].lower()) + + def test_extract_code_blocks_from_html(self): + """Test extracting code blocks from HTML pre/code tags.""" + html = """ + +
+
print("hello")
+
+ + """ + result = self.converter._extract_html_as_markdown(html, "https://example.com/test.md") + self.assertTrue(len(result['code_samples']) > 0) + + def test_fallback_to_body_when_no_main(self): + """Test fallback to body when no main/article element.""" + html = """ + +
+

Section

+

Content in body without main element, long enough to pass filter.

+
+ + """ + result = self.converter._extract_html_as_markdown(html, "https://example.com/test.md") + self.assertTrue(len(result['headings']) > 0 or len(result['content']) > 0) + + +class TestLlmsTxtUrlExtraction(unittest.TestCase): + """Test URL extraction from llms.txt content.""" + + def test_extract_markdown_style_links(self): + """Test extracting [text](url) style links.""" + from skill_seekers.cli.llms_txt_parser import LlmsTxtParser + + content = """ +# Documentation Index + +- [Getting Started](https://docs.example.com/start.md) +- [API Reference](https://docs.example.com/api/index.md) +- [Advanced Guide](https://docs.example.com/advanced.md) +""" + parser = LlmsTxtParser(content, base_url="https://docs.example.com") + urls = parser.extract_urls() + + self.assertIn("https://docs.example.com/start.md", urls) + self.assertIn("https://docs.example.com/api/index.md", urls) + self.assertIn("https://docs.example.com/advanced.md", urls) + + def test_extract_bare_urls(self): + """Test extracting bare URLs without markdown syntax.""" + from skill_seekers.cli.llms_txt_parser import LlmsTxtParser + + content = """ +Documentation: https://example.com/docs/guide.md +API: https://example.com/api/reference.md +""" + parser = LlmsTxtParser(content) + urls = parser.extract_urls() + + self.assertIn("https://example.com/docs/guide.md", urls) + self.assertIn("https://example.com/api/reference.md", urls) + + def test_resolve_relative_urls(self): + """Test resolving relative URLs with base_url.""" + from skill_seekers.cli.llms_txt_parser import LlmsTxtParser + + content = """ +- [Local Doc](./docs/guide.md) +- [Parent](../api/ref.md) +""" + parser = LlmsTxtParser(content, base_url="https://example.com/learn/") + urls = parser.extract_urls() + + # Should resolve relative paths + self.assertTrue(any("docs/guide.md" in url for url in urls)) + + def test_clean_url_invalid_anchor_pattern(self): + """Test cleaning URLs with invalid anchor patterns.""" + from skill_seekers.cli.llms_txt_parser import LlmsTxtParser + + parser = LlmsTxtParser("", base_url="https://example.com") + + # Invalid: path after anchor + result = parser._clean_url("https://example.com/page#section/index.html.md") + self.assertEqual(result, "https://example.com/page") + + def test_clean_url_valid_anchor(self): + """Test that valid anchors are preserved.""" + from skill_seekers.cli.llms_txt_parser import LlmsTxtParser + + parser = LlmsTxtParser("", base_url="https://example.com") + + # Valid anchor should be unchanged + result = parser._clean_url("https://example.com/page.md#section") + self.assertEqual(result, "https://example.com/page.md#section") + + def test_clean_url_no_anchor(self): + """Test that URLs without anchors are unchanged.""" + from skill_seekers.cli.llms_txt_parser import LlmsTxtParser + + parser = LlmsTxtParser("", base_url="https://example.com") + + result = parser._clean_url("https://example.com/docs/guide.md") + self.assertEqual(result, "https://example.com/docs/guide.md") + + def test_deduplicate_urls(self): + """Test that duplicate URLs are removed.""" + from skill_seekers.cli.llms_txt_parser import LlmsTxtParser + + content = """ +- [Doc 1](https://example.com/doc.md) +- [Doc 2](https://example.com/doc.md) +https://example.com/doc.md +""" + parser = LlmsTxtParser(content) + urls = parser.extract_urls() + + # Should only have one instance + count = sum(1 for u in urls if u == "https://example.com/doc.md") + self.assertEqual(count, 1) + + +class TestSavePageContentFiltering(unittest.TestCase): + """Test content filtering in save_page.""" + + def setUp(self): + """Set up test fixtures.""" + from skill_seekers.cli.doc_scraper import DocToSkillConverter + + self.config = { + 'name': 'test_save_filter', + 'base_url': 'https://example.com', + 'selectors': {}, + 'url_patterns': {'include': [], 'exclude': []}, + 'categories': {} + } + self.converter = DocToSkillConverter(self.config) + + def tearDown(self): + """Clean up output directory.""" + output_dir = f"output/{self.config['name']}_data" + if os.path.exists(output_dir): + shutil.rmtree(output_dir) + + def test_skip_empty_content(self): + """Test that pages with empty content are skipped.""" + page = { + 'url': 'https://example.com/empty', + 'title': 'Empty Page', + 'content': '', + 'headings': [], + 'code_samples': [] + } + + self.converter.save_page(page) + + pages_dir = os.path.join(self.converter.data_dir, 'pages') + if os.path.exists(pages_dir): + self.assertEqual(len(os.listdir(pages_dir)), 0) + + def test_skip_short_content_under_50_chars(self): + """Test that pages with content < 50 chars are skipped.""" + page = { + 'url': 'https://example.com/short', + 'title': 'Short', + 'content': 'This is too short.', # 18 chars + 'headings': [], + 'code_samples': [] + } + + self.converter.save_page(page) + + pages_dir = os.path.join(self.converter.data_dir, 'pages') + if os.path.exists(pages_dir): + self.assertEqual(len(os.listdir(pages_dir)), 0) + + def test_save_content_over_50_chars(self): + """Test that pages with content >= 50 chars are saved.""" + page = { + 'url': 'https://example.com/valid', + 'title': 'Valid Page', + 'content': 'A' * 60, # 60 chars, should pass + 'headings': [], + 'code_samples': [] + } + + self.converter.save_page(page) + + pages_dir = os.path.join(self.converter.data_dir, 'pages') + self.assertTrue(os.path.exists(pages_dir)) + self.assertEqual(len(os.listdir(pages_dir)), 1) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/test_multi_source.py b/tests/test_multi_source.py new file mode 100644 index 0000000..80644d2 --- /dev/null +++ b/tests/test_multi_source.py @@ -0,0 +1,433 @@ +""" +Tests for multi-source support in unified scraper and skill builder. + +Tests the following functionality: +1. Multiple sources of same type in unified_scraper (list structure) +2. Source counters and unique naming +3. Per-source reference directory generation in unified_skill_builder +4. Multiple documentation sources handling +5. Multiple GitHub repositories handling +""" + +import unittest +import tempfile +import os +import shutil + + +class TestUnifiedScraperDataStructure(unittest.TestCase): + """Test scraped_data list structure in unified_scraper.""" + + def test_scraped_data_uses_list_structure(self): + """Test that scraped_data uses list for each source type.""" + from skill_seekers.cli.unified_scraper import UnifiedScraper + + config = { + 'name': 'test_multi', + 'description': 'Test skill', + 'sources': [ + {'type': 'documentation', 'base_url': 'https://example.com'} + ] + } + + with tempfile.TemporaryDirectory() as temp_dir: + original_dir = os.getcwd() + try: + os.chdir(temp_dir) + scraper = UnifiedScraper(config) + + self.assertIsInstance(scraper.scraped_data['documentation'], list) + self.assertIsInstance(scraper.scraped_data['github'], list) + self.assertIsInstance(scraper.scraped_data['pdf'], list) + finally: + os.chdir(original_dir) + + def test_source_counters_initialized_to_zero(self): + """Test that source counters start at zero.""" + from skill_seekers.cli.unified_scraper import UnifiedScraper + + config = { + 'name': 'test_counters', + 'description': 'Test skill', + 'sources': [ + {'type': 'documentation', 'base_url': 'https://example.com'} + ] + } + + with tempfile.TemporaryDirectory() as temp_dir: + original_dir = os.getcwd() + try: + os.chdir(temp_dir) + scraper = UnifiedScraper(config) + + self.assertEqual(scraper._source_counters['documentation'], 0) + self.assertEqual(scraper._source_counters['github'], 0) + self.assertEqual(scraper._source_counters['pdf'], 0) + finally: + os.chdir(original_dir) + + def test_empty_lists_initially(self): + """Test that source lists are empty initially.""" + from skill_seekers.cli.unified_scraper import UnifiedScraper + + config = { + 'name': 'test_empty', + 'description': 'Test skill', + 'sources': [ + {'type': 'documentation', 'base_url': 'https://example.com'} + ] + } + + with tempfile.TemporaryDirectory() as temp_dir: + original_dir = os.getcwd() + try: + os.chdir(temp_dir) + scraper = UnifiedScraper(config) + + self.assertEqual(len(scraper.scraped_data['documentation']), 0) + self.assertEqual(len(scraper.scraped_data['github']), 0) + self.assertEqual(len(scraper.scraped_data['pdf']), 0) + finally: + os.chdir(original_dir) + + +class TestUnifiedSkillBuilderDocsReferences(unittest.TestCase): + """Test documentation reference generation for multiple sources.""" + + def setUp(self): + """Set up test fixtures.""" + self.temp_dir = tempfile.mkdtemp() + self.original_dir = os.getcwd() + os.chdir(self.temp_dir) + + def tearDown(self): + """Clean up test fixtures.""" + os.chdir(self.original_dir) + if os.path.exists(self.temp_dir): + shutil.rmtree(self.temp_dir) + + def test_creates_subdirectory_per_source(self): + """Test that each doc source gets its own subdirectory.""" + from skill_seekers.cli.unified_skill_builder import UnifiedSkillBuilder + + # Create mock refs directories + refs_dir1 = os.path.join(self.temp_dir, 'refs1') + refs_dir2 = os.path.join(self.temp_dir, 'refs2') + os.makedirs(refs_dir1) + os.makedirs(refs_dir2) + + config = { + 'name': 'test_docs_refs', + 'description': 'Test', + 'sources': [] + } + + scraped_data = { + 'documentation': [ + {'source_id': 'source_a', 'base_url': 'https://a.com', 'total_pages': 5, 'refs_dir': refs_dir1}, + {'source_id': 'source_b', 'base_url': 'https://b.com', 'total_pages': 3, 'refs_dir': refs_dir2} + ], + 'github': [], + 'pdf': [] + } + + builder = UnifiedSkillBuilder(config, scraped_data) + builder._generate_docs_references(scraped_data['documentation']) + + docs_dir = os.path.join(builder.skill_dir, 'references', 'documentation') + self.assertTrue(os.path.exists(os.path.join(docs_dir, 'source_a'))) + self.assertTrue(os.path.exists(os.path.join(docs_dir, 'source_b'))) + + def test_creates_index_per_source(self): + """Test that each source subdirectory has its own index.md.""" + from skill_seekers.cli.unified_skill_builder import UnifiedSkillBuilder + + refs_dir = os.path.join(self.temp_dir, 'refs') + os.makedirs(refs_dir) + + config = { + 'name': 'test_source_index', + 'description': 'Test', + 'sources': [] + } + + scraped_data = { + 'documentation': [ + {'source_id': 'my_source', 'base_url': 'https://example.com', 'total_pages': 10, 'refs_dir': refs_dir} + ], + 'github': [], + 'pdf': [] + } + + builder = UnifiedSkillBuilder(config, scraped_data) + builder._generate_docs_references(scraped_data['documentation']) + + source_index = os.path.join(builder.skill_dir, 'references', 'documentation', 'my_source', 'index.md') + self.assertTrue(os.path.exists(source_index)) + + with open(source_index, 'r') as f: + content = f.read() + self.assertIn('my_source', content) + self.assertIn('https://example.com', content) + + def test_creates_main_index_listing_all_sources(self): + """Test that main index.md lists all documentation sources.""" + from skill_seekers.cli.unified_skill_builder import UnifiedSkillBuilder + + refs_dir1 = os.path.join(self.temp_dir, 'refs1') + refs_dir2 = os.path.join(self.temp_dir, 'refs2') + os.makedirs(refs_dir1) + os.makedirs(refs_dir2) + + config = { + 'name': 'test_main_index', + 'description': 'Test', + 'sources': [] + } + + scraped_data = { + 'documentation': [ + {'source_id': 'docs_one', 'base_url': 'https://one.com', 'total_pages': 10, 'refs_dir': refs_dir1}, + {'source_id': 'docs_two', 'base_url': 'https://two.com', 'total_pages': 20, 'refs_dir': refs_dir2} + ], + 'github': [], + 'pdf': [] + } + + builder = UnifiedSkillBuilder(config, scraped_data) + builder._generate_docs_references(scraped_data['documentation']) + + main_index = os.path.join(builder.skill_dir, 'references', 'documentation', 'index.md') + self.assertTrue(os.path.exists(main_index)) + + with open(main_index, 'r') as f: + content = f.read() + self.assertIn('docs_one', content) + self.assertIn('docs_two', content) + self.assertIn('2 documentation sources', content) + + def test_copies_reference_files_to_source_dir(self): + """Test that reference files are copied to source subdirectory.""" + from skill_seekers.cli.unified_skill_builder import UnifiedSkillBuilder + + refs_dir = os.path.join(self.temp_dir, 'refs') + os.makedirs(refs_dir) + + # Create mock reference files + with open(os.path.join(refs_dir, 'api.md'), 'w') as f: + f.write('# API Reference') + with open(os.path.join(refs_dir, 'guide.md'), 'w') as f: + f.write('# User Guide') + + config = { + 'name': 'test_copy_refs', + 'description': 'Test', + 'sources': [] + } + + scraped_data = { + 'documentation': [ + {'source_id': 'test_source', 'base_url': 'https://test.com', 'total_pages': 5, 'refs_dir': refs_dir} + ], + 'github': [], + 'pdf': [] + } + + builder = UnifiedSkillBuilder(config, scraped_data) + builder._generate_docs_references(scraped_data['documentation']) + + source_dir = os.path.join(builder.skill_dir, 'references', 'documentation', 'test_source') + self.assertTrue(os.path.exists(os.path.join(source_dir, 'api.md'))) + self.assertTrue(os.path.exists(os.path.join(source_dir, 'guide.md'))) + + +class TestUnifiedSkillBuilderGitHubReferences(unittest.TestCase): + """Test GitHub reference generation for multiple repositories.""" + + def setUp(self): + """Set up test fixtures.""" + self.temp_dir = tempfile.mkdtemp() + self.original_dir = os.getcwd() + os.chdir(self.temp_dir) + + def tearDown(self): + """Clean up test fixtures.""" + os.chdir(self.original_dir) + if os.path.exists(self.temp_dir): + shutil.rmtree(self.temp_dir) + + def test_creates_subdirectory_per_repo(self): + """Test that each GitHub repo gets its own subdirectory.""" + from skill_seekers.cli.unified_skill_builder import UnifiedSkillBuilder + + config = { + 'name': 'test_github_refs', + 'description': 'Test', + 'sources': [] + } + + scraped_data = { + 'documentation': [], + 'github': [ + {'repo': 'org/repo1', 'repo_id': 'org_repo1', 'data': {'readme': '# Repo 1', 'issues': [], 'releases': [], 'repo_info': {}}}, + {'repo': 'org/repo2', 'repo_id': 'org_repo2', 'data': {'readme': '# Repo 2', 'issues': [], 'releases': [], 'repo_info': {}}} + ], + 'pdf': [] + } + + builder = UnifiedSkillBuilder(config, scraped_data) + builder._generate_github_references(scraped_data['github']) + + github_dir = os.path.join(builder.skill_dir, 'references', 'github') + self.assertTrue(os.path.exists(os.path.join(github_dir, 'org_repo1'))) + self.assertTrue(os.path.exists(os.path.join(github_dir, 'org_repo2'))) + + def test_creates_readme_per_repo(self): + """Test that README.md is created for each repo.""" + from skill_seekers.cli.unified_skill_builder import UnifiedSkillBuilder + + config = { + 'name': 'test_readme', + 'description': 'Test', + 'sources': [] + } + + scraped_data = { + 'documentation': [], + 'github': [ + {'repo': 'test/myrepo', 'repo_id': 'test_myrepo', 'data': {'readme': '# My Repository\n\nDescription here.', 'issues': [], 'releases': [], 'repo_info': {}}} + ], + 'pdf': [] + } + + builder = UnifiedSkillBuilder(config, scraped_data) + builder._generate_github_references(scraped_data['github']) + + readme_path = os.path.join(builder.skill_dir, 'references', 'github', 'test_myrepo', 'README.md') + self.assertTrue(os.path.exists(readme_path)) + + with open(readme_path, 'r') as f: + content = f.read() + self.assertIn('test/myrepo', content) + + def test_creates_issues_file_when_issues_exist(self): + """Test that issues.md is created when repo has issues.""" + from skill_seekers.cli.unified_skill_builder import UnifiedSkillBuilder + + config = { + 'name': 'test_issues', + 'description': 'Test', + 'sources': [] + } + + scraped_data = { + 'documentation': [], + 'github': [ + { + 'repo': 'test/repo', + 'repo_id': 'test_repo', + 'data': { + 'readme': '# Repo', + 'issues': [ + {'number': 1, 'title': 'Bug report', 'state': 'open', 'labels': ['bug'], 'url': 'https://github.com/test/repo/issues/1'}, + {'number': 2, 'title': 'Feature request', 'state': 'closed', 'labels': ['enhancement'], 'url': 'https://github.com/test/repo/issues/2'} + ], + 'releases': [], + 'repo_info': {} + } + } + ], + 'pdf': [] + } + + builder = UnifiedSkillBuilder(config, scraped_data) + builder._generate_github_references(scraped_data['github']) + + issues_path = os.path.join(builder.skill_dir, 'references', 'github', 'test_repo', 'issues.md') + self.assertTrue(os.path.exists(issues_path)) + + with open(issues_path, 'r') as f: + content = f.read() + self.assertIn('Bug report', content) + self.assertIn('Feature request', content) + + def test_creates_main_index_listing_all_repos(self): + """Test that main index.md lists all GitHub repositories.""" + from skill_seekers.cli.unified_skill_builder import UnifiedSkillBuilder + + config = { + 'name': 'test_github_index', + 'description': 'Test', + 'sources': [] + } + + scraped_data = { + 'documentation': [], + 'github': [ + {'repo': 'org/first', 'repo_id': 'org_first', 'data': {'readme': '#', 'issues': [], 'releases': [], 'repo_info': {'stars': 100}}}, + {'repo': 'org/second', 'repo_id': 'org_second', 'data': {'readme': '#', 'issues': [], 'releases': [], 'repo_info': {'stars': 50}}} + ], + 'pdf': [] + } + + builder = UnifiedSkillBuilder(config, scraped_data) + builder._generate_github_references(scraped_data['github']) + + main_index = os.path.join(builder.skill_dir, 'references', 'github', 'index.md') + self.assertTrue(os.path.exists(main_index)) + + with open(main_index, 'r') as f: + content = f.read() + self.assertIn('org/first', content) + self.assertIn('org/second', content) + self.assertIn('2 GitHub repositories', content) + + +class TestUnifiedSkillBuilderPdfReferences(unittest.TestCase): + """Test PDF reference generation for multiple sources.""" + + def setUp(self): + """Set up test fixtures.""" + self.temp_dir = tempfile.mkdtemp() + self.original_dir = os.getcwd() + os.chdir(self.temp_dir) + + def tearDown(self): + """Clean up test fixtures.""" + os.chdir(self.original_dir) + if os.path.exists(self.temp_dir): + shutil.rmtree(self.temp_dir) + + def test_creates_pdf_index_with_count(self): + """Test that PDF index shows correct document count.""" + from skill_seekers.cli.unified_skill_builder import UnifiedSkillBuilder + + config = { + 'name': 'test_pdf', + 'description': 'Test', + 'sources': [] + } + + scraped_data = { + 'documentation': [], + 'github': [], + 'pdf': [ + {'path': '/path/to/doc1.pdf'}, + {'path': '/path/to/doc2.pdf'}, + {'path': '/path/to/doc3.pdf'} + ] + } + + builder = UnifiedSkillBuilder(config, scraped_data) + builder._generate_pdf_references(scraped_data['pdf']) + + pdf_index = os.path.join(builder.skill_dir, 'references', 'pdf', 'index.md') + self.assertTrue(os.path.exists(pdf_index)) + + with open(pdf_index, 'r') as f: + content = f.read() + self.assertIn('3 PDF document', content) + + +if __name__ == '__main__': + unittest.main() From a7f13ec75ff4cebf8bb8ec565bc875f7bb3c0313 Mon Sep 17 00:00:00 2001 From: tsyhahaha Date: Mon, 5 Jan 2026 22:32:31 +0800 Subject: [PATCH 3/7] chore: add medusa-mercurjs unified config MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Multi-source config combining Medusa docs and Mercur.js marketplace πŸ€– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- configs/medusa-mercurjs.json | 71 ++++++++++++++++++++++++++++++++++++ 1 file changed, 71 insertions(+) create mode 100644 configs/medusa-mercurjs.json diff --git a/configs/medusa-mercurjs.json b/configs/medusa-mercurjs.json new file mode 100644 index 0000000..e5c31ad --- /dev/null +++ b/configs/medusa-mercurjs.json @@ -0,0 +1,71 @@ +{ + "name": "medusa-mercurjs", + "description": "Complete Medusa v2 + MercurJS multi-vendor e-commerce framework knowledge. Use when building headless commerce applications, implementing multi-vendor marketplaces, or understanding Medusa modules/workflows.", + "merge_mode": "rule-based", + "sources": [ + { + "type": "documentation", + "base_url": "https://docs.medusajs.com", + "llms_txt_url": "https://docs.medusajs.com/llms-full.txt", + "extract_api": true, + "selectors": { + "main_content": "main, article, .content", + "title": "h1", + "code_blocks": "pre" + }, + "url_patterns": { + "include": [ + "/learn", + "/resources" + ], + "exclude": [] + }, + "categories": { + "installation": ["installation", "install", "docker", "update"], + "fundamentals": ["fundamentals", "api-routes", "data-models", "modules", "module-links", "workflows", "events-and-subscribers", "scheduled-jobs", "custom-cli-scripts", "admin", "environment-variables"], + "customization": ["customization", "custom-features", "extend-features", "integrate-systems", "customize-admin"], + "debugging_testing": ["debugging-and-testing", "logging", "testing", "test-tools", "instrumentation", "feature-flags", "debug-workflows"], + "deployment": ["deployment", "production", "deploy", "general"], + "commerce_modules": ["commerce-modules", "product", "cart", "order", "payment", "pricing", "tax", "inventory", "fulfillment", "customer", "promotion", "auth", "region", "currency", "sales-channel", "stock-location", "api-key", "user"], + "infrastructure_modules": ["infrastructure-modules", "caching", "event", "file", "locking", "notification", "workflow-engine", "analytics"], + "storefront": ["storefront-development", "publishable-api-keys", "checkout", "products", "customers", "regions"], + "integrations": ["integrations", "sanity", "contentful", "stripe", "paypal", "shipstation", "sentry"], + "cli_tools": ["medusa-cli", "commands", "build", "develop", "plugin", "db"], + "references": ["references", "medusa-workflows", "helper-steps", "service-factory-reference", "data-model-repository-reference", "test-tools-reference", "fulfillment", "auth", "notification-provider", "file-provider", "locking-service", "caching-service"], + "recipes": ["recipes", "erp", "marketplace", "b2b", "subscriptions", "digital-products", "bundled-products"], + "admin_components": ["admin-components", "widgets", "ui-routes"], + "examples": ["examples", "guides", "how-to-tutorials", "tutorials"] + }, + "rate_limit": 0.3, + "max_pages": 500 + }, + { + "type": "documentation", + "base_url": "https://docs.mercurjs.com/", + "llms_txt_url": "https://docs.mercurjs.com/llms-full.txt", + "extract_api": true, + "selectors": { + "main_content": "main, article", + "title": "h1", + "code_blocks": "pre" + }, + "url_patterns": { + "include": ["/"], + "exclude": [] + }, + "categories": { + "quick_start": ["introduction", "get-started"], + "components": ["components", "backend", "admin-panel", "vendor-panel", "storefront"], + "core_concepts": ["core-concepts", "seller", "commission", "payouts", "order-splitting", "reviews", "requests", "notifications", "marketplace-settings"], + "product": ["product", "core-commerce-modules", "core-infrastructure-modules", "framework"], + "integrations": ["integrations", "algolia", "resend", "stripe"], + "api_admin": ["api-reference/admin", "admin-algolia", "admin-api-keys", "admin-attributes", "admin-auth", "admin-campaigns", "admin-claims", "admin-collections", "admin-commission", "admin-currencies", "admin-customers", "admin-draft-orders", "admin-exchanges", "admin-fulfillment", "admin-inventory", "admin-invites", "admin-notifications", "admin-orders", "admin-payments", "admin-price-lists", "admin-products", "admin-promotions", "admin-regions", "admin-reservations", "admin-returns", "admin-sales-channels", "admin-sellers", "admin-shipping", "admin-stock-locations", "admin-stores", "admin-tax", "admin-uploads", "admin-users"], + "api_store": ["api-reference/store", "store-auth", "store-carts", "store-collections", "store-currencies", "store-customers", "store-fulfillment", "store-orders", "store-payment", "store-products", "store-regions", "store-returns"], + "api_vendor": ["api-reference/vendor", "vendor-auth", "vendor-fulfillment", "vendor-inventory", "vendor-orders", "vendor-payouts", "vendor-products", "vendor-returns", "vendor-sellers", "vendor-shipping", "vendor-stock-locations", "vendor-uploads"], + "help": ["help", "llm", "mcp", "support"] + }, + "rate_limit": 0.3, + "max_pages": 300 + } + ] +} From 2e096c0284adf87294337b407777e18cef384528 Mon Sep 17 00:00:00 2001 From: Nick Miethe Date: Thu, 8 Jan 2026 15:33:12 -0500 Subject: [PATCH 4/7] Enabling full support of the Claude Code documentation site, with support for all relevant pages and Anthropic's unconventional llms.txt --- configs/claude-code.json | 90 +++++++++++++++----- src/skill_seekers/cli/llms_txt_downloader.py | 17 +++- 2 files changed, 83 insertions(+), 24 deletions(-) diff --git a/configs/claude-code.json b/configs/claude-code.json index c84e709..ee96f68 100644 --- a/configs/claude-code.json +++ b/configs/claude-code.json @@ -1,37 +1,83 @@ { "name": "claude-code", - "description": "Claude Code CLI and development environment. Use for Claude Code features, tools, workflows, MCP integration, configuration, and AI-assisted development.", - "base_url": "https://docs.claude.com/en/docs/claude-code/", + "description": "Claude Code CLI and development environment. Use for Claude Code features, tools, workflows, MCP integration, plugins, hooks, configuration, deployment, and AI-assisted development.", + "base_url": "https://code.claude.com/docs/en/", "start_urls": [ - "https://docs.claude.com/en/docs/claude-code/overview", - "https://docs.claude.com/en/docs/claude-code/quickstart", - "https://docs.claude.com/en/docs/claude-code/common-workflows", - "https://docs.claude.com/en/docs/claude-code/mcp", - "https://docs.claude.com/en/docs/claude-code/settings", - "https://docs.claude.com/en/docs/claude-code/troubleshooting", - "https://docs.claude.com/en/docs/claude-code/iam" + "https://code.claude.com/docs/en/overview", + "https://code.claude.com/docs/en/quickstart", + "https://code.claude.com/docs/en/common-workflows", + "https://code.claude.com/docs/en/claude-code-on-the-web", + "https://code.claude.com/docs/en/desktop", + "https://code.claude.com/docs/en/chrome", + "https://code.claude.com/docs/en/vs-code", + "https://code.claude.com/docs/en/jetbrains", + "https://code.claude.com/docs/en/github-actions", + "https://code.claude.com/docs/en/gitlab-ci-cd", + "https://code.claude.com/docs/en/slack", + "https://code.claude.com/docs/en/sub-agents", + "https://code.claude.com/docs/en/plugins", + "https://code.claude.com/docs/en/discover-plugins", + "https://code.claude.com/docs/en/skills", + "https://code.claude.com/docs/en/output-styles", + "https://code.claude.com/docs/en/hooks-guide", + "https://code.claude.com/docs/en/headless", + "https://code.claude.com/docs/en/mcp", + "https://code.claude.com/docs/en/third-party-integrations", + "https://code.claude.com/docs/en/amazon-bedrock", + "https://code.claude.com/docs/en/google-vertex-ai", + "https://code.claude.com/docs/en/microsoft-foundry", + "https://code.claude.com/docs/en/network-config", + "https://code.claude.com/docs/en/llm-gateway", + "https://code.claude.com/docs/en/devcontainer", + "https://code.claude.com/docs/en/sandboxing", + "https://code.claude.com/docs/en/setup", + "https://code.claude.com/docs/en/iam", + "https://code.claude.com/docs/en/security", + "https://code.claude.com/docs/en/data-usage", + "https://code.claude.com/docs/en/monitoring-usage", + "https://code.claude.com/docs/en/costs", + "https://code.claude.com/docs/en/analytics", + "https://code.claude.com/docs/en/plugin-marketplaces", + "https://code.claude.com/docs/en/settings", + "https://code.claude.com/docs/en/terminal-config", + "https://code.claude.com/docs/en/model-config", + "https://code.claude.com/docs/en/memory", + "https://code.claude.com/docs/en/statusline", + "https://code.claude.com/docs/en/cli-reference", + "https://code.claude.com/docs/en/interactive-mode", + "https://code.claude.com/docs/en/slash-commands", + "https://code.claude.com/docs/en/checkpointing", + "https://code.claude.com/docs/en/hooks", + "https://code.claude.com/docs/en/plugins-reference", + "https://code.claude.com/docs/en/troubleshooting", + "https://code.claude.com/docs/en/legal-and-compliance" ], "selectors": { - "main_content": "#content-container", + "main_content": "#content-area, #content-container, article, main", "title": "h1", "code_blocks": "pre code" }, "url_patterns": { - "include": ["/claude-code/"], - "exclude": ["/api-reference/", "/claude-ai/", "/claude.ai/", "/prompt-engineering/", "/changelog/"] + "include": ["/docs/en/"], + "exclude": [ + "/docs/fr/", "/docs/de/", "/docs/it/", "/docs/ja/", "/docs/es/", + "/docs/ko/", "/docs/zh-CN/", "/docs/zh-TW/", "/docs/ru/", + "/docs/id/", "/docs/pt/", "/changelog", "github.com" + ] }, "categories": { - "getting_started": ["overview", "quickstart", "installation", "setup", "terminal-config"], - "workflows": ["workflow", "common-workflows", "git", "testing", "debugging", "interactive"], + "getting_started": ["overview", "quickstart", "common-workflows"], + "ide_integrations": ["vs-code", "jetbrains", "desktop", "chrome", "claude-code-on-the-web", "slack"], + "ci_cd": ["github-actions", "gitlab-ci-cd"], + "building": ["sub-agents", "subagent", "plugins", "discover-plugins", "skills", "output-styles", "hooks-guide", "headless", "programmatic"], "mcp": ["mcp", "model-context-protocol"], - "configuration": ["config", "settings", "preferences", "customize", "hooks", "statusline", "model-config", "memory", "output-styles"], - "agents": ["agent", "task", "subagent", "sub-agent", "specialized"], - "skills": ["skill", "agent-skill"], - "integrations": ["ide-integrations", "vs-code", "jetbrains", "plugin", "marketplace"], - "deployment": ["bedrock", "vertex", "deployment", "network", "gateway", "devcontainer", "sandboxing", "third-party"], - "reference": ["reference", "api", "command", "cli-reference", "slash", "checkpointing", "headless", "sdk"], - "enterprise": ["iam", "security", "monitoring", "analytics", "costs", "legal", "data-usage"] + "deployment": ["third-party-integrations", "amazon-bedrock", "google-vertex-ai", "microsoft-foundry", "network-config", "llm-gateway", "devcontainer", "sandboxing"], + "administration": ["setup", "iam", "security", "data-usage", "monitoring-usage", "costs", "analytics", "plugin-marketplaces"], + "configuration": ["settings", "terminal-config", "model-config", "memory", "statusline"], + "reference": ["cli-reference", "interactive-mode", "slash-commands", "checkpointing", "hooks", "plugins-reference"], + "troubleshooting": ["troubleshooting"], + "legal": ["legal-and-compliance"] }, "rate_limit": 0.5, - "max_pages": 200 + "max_pages": 250 } diff --git a/src/skill_seekers/cli/llms_txt_downloader.py b/src/skill_seekers/cli/llms_txt_downloader.py index 1049f86..76ec740 100644 --- a/src/skill_seekers/cli/llms_txt_downloader.py +++ b/src/skill_seekers/cli/llms_txt_downloader.py @@ -38,11 +38,24 @@ class LlmsTxtDownloader: def _is_markdown(self, content: str) -> bool: """ - Check if content looks like markdown. + Check if content looks like markdown (not HTML). Returns: - True if content contains markdown patterns + True if content contains markdown patterns and is NOT HTML """ + # First, reject HTML content (common redirect trap) + content_start = content.strip()[:500].lower() + html_indicators = [ + '', + ' Date: Sun, 11 Jan 2026 14:01:23 +0300 Subject: [PATCH 5/7] fix: Add empty list checks and enhance docstrings (PR #243 review fixes) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two critical improvements from PR #243 code review: ## Fix 1: Empty List Edge Case Handling Added early return checks to prevent creating empty index files: **Files Modified:** - src/skill_seekers/cli/unified_skill_builder.py **Changes:** - _generate_docs_references: Skip if docs_list empty - _generate_github_references: Skip if github_list empty - _generate_pdf_references: Skip if pdf_list empty **Impact:** Prevents "Combined from 0 sources" index files which look odd. ## Fix 2: Enhanced Method Docstrings Added comprehensive parameter types and return value documentation: **Files Modified:** - src/skill_seekers/cli/llms_txt_parser.py - extract_urls: Added detailed examples and behavior notes - _clean_url: Added malformed URL pattern examples - src/skill_seekers/cli/doc_scraper.py - _extract_markdown_content: Full return dict structure documented - _extract_html_as_markdown: Extraction strategy and fallback behavior **Impact:** Improved developer experience with detailed API documentation. ## Testing All tests passing: - βœ… 32/32 PR #243 tests (markdown parsing + multi-source) - βœ… 975/975 core tests - 159 skipped (optional dependencies) - 4 failed (missing anthropic - expected) Co-authored-by: Code Review --- src/skill_seekers/cli/doc_scraper.py | 56 ++++++++++++++++--- src/skill_seekers/cli/llms_txt_parser.py | 29 +++++++++- .../cli/unified_skill_builder.py | 12 ++++ 3 files changed, 87 insertions(+), 10 deletions(-) diff --git a/src/skill_seekers/cli/doc_scraper.py b/src/skill_seekers/cli/doc_scraper.py index 1e52181..021738b 100755 --- a/src/skill_seekers/cli/doc_scraper.py +++ b/src/skill_seekers/cli/doc_scraper.py @@ -350,14 +350,34 @@ class DocToSkillConverter: return page def _extract_markdown_content(self, content: str, url: str) -> Dict[str, Any]: - """Extract content from a Markdown file. + """Extract structured content from a Markdown file. + + Parses markdown files from llms.txt URLs to extract: + - Title from first h1 heading + - Headings (h2-h6, excluding h1) + - Code blocks with language detection + - Internal .md links for BFS crawling + - Content paragraphs (>20 chars) + + Auto-detects HTML content and falls back to _extract_html_as_markdown. Args: - content: Raw markdown content (or HTML if server returned HTML) - url: Source URL + content: Raw markdown content string (or HTML if server returned HTML) + url: Source URL for resolving relative links Returns: - Page dict with title, content, code_samples, headings, links + Dict with keys: + - url: str - Source URL + - title: str - Extracted from first # heading + - content: str - Paragraphs joined with double newlines + - headings: List[Dict] - {'level': 'h2', 'text': str, 'id': str} + - code_samples: List[Dict] - {'code': str, 'language': str} + - links: List[str] - Absolute URLs to other .md files + - patterns: List - Empty (reserved for future use) + + Note: + Only .md links are extracted to avoid client-side rendered HTML pages. + Anchor fragments (#section) are stripped from links. """ import re @@ -434,12 +454,34 @@ class DocToSkillConverter: def _extract_html_as_markdown(self, html_content: str, url: str) -> Dict[str, Any]: """Extract content from HTML and convert to markdown-like structure. + Fallback method when .md URL returns HTML content instead of markdown. + Uses BeautifulSoup to extract structured data from HTML elements. + + Extraction strategy: + 1. Title from tag + 2. Main content from <main>, <article>, [role="main"], or <body> + 3. Headings (h1-h6) with text and id attributes + 4. Code blocks from <pre><code> or <pre> tags + 5. Text content from paragraphs + Args: - html_content: Raw HTML content - url: Source URL + html_content: Raw HTML content string + url: Source URL (for reference in result dict) Returns: - Page dict with title, content, code_samples, headings, links + Dict with keys: + - url: str - Source URL + - title: str - From <title> tag, cleaned + - content: str - Text content from main area + - headings: List[Dict] - {'level': 'h2', 'text': str, 'id': str} + - code_samples: List[Dict] - {'code': str, 'language': str} + - links: List - Empty (HTML links not extracted to avoid client-side routes) + - patterns: List - Empty (reserved for future use) + + Note: + Prefers <main> or <article> tags for content area. + Falls back to <body> if no semantic content container found. + Language detection uses detect_language() method. """ page = { 'url': url, diff --git a/src/skill_seekers/cli/llms_txt_parser.py b/src/skill_seekers/cli/llms_txt_parser.py index 2e143bf..ae11410 100644 --- a/src/skill_seekers/cli/llms_txt_parser.py +++ b/src/skill_seekers/cli/llms_txt_parser.py @@ -16,8 +16,19 @@ class LlmsTxtParser: """ Extract all URLs from the llms.txt content. + Supports both markdown-style links [text](url) and bare URLs. + Resolves relative URLs using base_url if provided. + Filters out malformed URLs with invalid anchor patterns. + Returns: - List of unique URLs found in the content + List of unique, cleaned URLs found in the content. + Returns empty list if no valid URLs found. + + Note: + - Markdown links: [Getting Started](./docs/guide.md) + - Bare URLs: https://example.com/api.md + - Relative paths resolved with base_url + - Invalid anchors (#section/path.md) are stripped """ urls = set() @@ -48,11 +59,23 @@ class LlmsTxtParser: """ Clean and validate URL, removing invalid anchor patterns. + Detects and strips malformed anchors that contain path separators. + Valid: https://example.com/page.md#section + Invalid: https://example.com/page#section/index.html.md + Args: - url: URL to clean + url: URL to clean (absolute or relative) Returns: - Cleaned URL or empty string if invalid + Cleaned URL with malformed anchors stripped. + Returns base URL if anchor contains '/' (malformed). + Returns original URL if anchor is valid or no anchor present. + + Example: + >>> parser._clean_url("https://ex.com/page#sec/path.md") + "https://ex.com/page" + >>> parser._clean_url("https://ex.com/page.md#section") + "https://ex.com/page.md#section" """ # Skip URLs with path after anchor (e.g., #section/index.html.md) # These are malformed and return duplicate HTML content diff --git a/src/skill_seekers/cli/unified_skill_builder.py b/src/skill_seekers/cli/unified_skill_builder.py index a80f86d..ef6437c 100644 --- a/src/skill_seekers/cli/unified_skill_builder.py +++ b/src/skill_seekers/cli/unified_skill_builder.py @@ -287,6 +287,10 @@ This skill combines knowledge from multiple sources: def _generate_docs_references(self, docs_list: List[Dict]): """Generate references from multiple documentation sources.""" + # Skip if no documentation sources + if not docs_list: + return + docs_dir = os.path.join(self.skill_dir, 'references', 'documentation') os.makedirs(docs_dir, exist_ok=True) @@ -347,6 +351,10 @@ This skill combines knowledge from multiple sources: def _generate_github_references(self, github_list: List[Dict]): """Generate references from multiple GitHub sources.""" + # Skip if no GitHub sources + if not github_list: + return + github_dir = os.path.join(self.skill_dir, 'references', 'github') os.makedirs(github_dir, exist_ok=True) @@ -429,6 +437,10 @@ This skill combines knowledge from multiple sources: def _generate_pdf_references(self, pdf_list: List[Dict]): """Generate references from PDF sources.""" + # Skip if no PDF sources + if not pdf_list: + return + pdf_dir = os.path.join(self.skill_dir, 'references', 'pdf') os.makedirs(pdf_dir, exist_ok=True) From 9042e1680c8080543be7dc509592094a335a1a91 Mon Sep 17 00:00:00 2001 From: Nick Miethe <miethe.dev@gmail.com> Date: Thu, 8 Jan 2026 15:33:12 -0500 Subject: [PATCH 6/7] Enabling full support of the Claude Code documentation site, with support for all relevant pages and Anthropic's unconventional llms.txt --- configs/claude-code.json | 90 +++++++++++++++----- src/skill_seekers/cli/llms_txt_downloader.py | 17 +++- 2 files changed, 83 insertions(+), 24 deletions(-) diff --git a/configs/claude-code.json b/configs/claude-code.json index c84e709..ee96f68 100644 --- a/configs/claude-code.json +++ b/configs/claude-code.json @@ -1,37 +1,83 @@ { "name": "claude-code", - "description": "Claude Code CLI and development environment. Use for Claude Code features, tools, workflows, MCP integration, configuration, and AI-assisted development.", - "base_url": "https://docs.claude.com/en/docs/claude-code/", + "description": "Claude Code CLI and development environment. Use for Claude Code features, tools, workflows, MCP integration, plugins, hooks, configuration, deployment, and AI-assisted development.", + "base_url": "https://code.claude.com/docs/en/", "start_urls": [ - "https://docs.claude.com/en/docs/claude-code/overview", - "https://docs.claude.com/en/docs/claude-code/quickstart", - "https://docs.claude.com/en/docs/claude-code/common-workflows", - "https://docs.claude.com/en/docs/claude-code/mcp", - "https://docs.claude.com/en/docs/claude-code/settings", - "https://docs.claude.com/en/docs/claude-code/troubleshooting", - "https://docs.claude.com/en/docs/claude-code/iam" + "https://code.claude.com/docs/en/overview", + "https://code.claude.com/docs/en/quickstart", + "https://code.claude.com/docs/en/common-workflows", + "https://code.claude.com/docs/en/claude-code-on-the-web", + "https://code.claude.com/docs/en/desktop", + "https://code.claude.com/docs/en/chrome", + "https://code.claude.com/docs/en/vs-code", + "https://code.claude.com/docs/en/jetbrains", + "https://code.claude.com/docs/en/github-actions", + "https://code.claude.com/docs/en/gitlab-ci-cd", + "https://code.claude.com/docs/en/slack", + "https://code.claude.com/docs/en/sub-agents", + "https://code.claude.com/docs/en/plugins", + "https://code.claude.com/docs/en/discover-plugins", + "https://code.claude.com/docs/en/skills", + "https://code.claude.com/docs/en/output-styles", + "https://code.claude.com/docs/en/hooks-guide", + "https://code.claude.com/docs/en/headless", + "https://code.claude.com/docs/en/mcp", + "https://code.claude.com/docs/en/third-party-integrations", + "https://code.claude.com/docs/en/amazon-bedrock", + "https://code.claude.com/docs/en/google-vertex-ai", + "https://code.claude.com/docs/en/microsoft-foundry", + "https://code.claude.com/docs/en/network-config", + "https://code.claude.com/docs/en/llm-gateway", + "https://code.claude.com/docs/en/devcontainer", + "https://code.claude.com/docs/en/sandboxing", + "https://code.claude.com/docs/en/setup", + "https://code.claude.com/docs/en/iam", + "https://code.claude.com/docs/en/security", + "https://code.claude.com/docs/en/data-usage", + "https://code.claude.com/docs/en/monitoring-usage", + "https://code.claude.com/docs/en/costs", + "https://code.claude.com/docs/en/analytics", + "https://code.claude.com/docs/en/plugin-marketplaces", + "https://code.claude.com/docs/en/settings", + "https://code.claude.com/docs/en/terminal-config", + "https://code.claude.com/docs/en/model-config", + "https://code.claude.com/docs/en/memory", + "https://code.claude.com/docs/en/statusline", + "https://code.claude.com/docs/en/cli-reference", + "https://code.claude.com/docs/en/interactive-mode", + "https://code.claude.com/docs/en/slash-commands", + "https://code.claude.com/docs/en/checkpointing", + "https://code.claude.com/docs/en/hooks", + "https://code.claude.com/docs/en/plugins-reference", + "https://code.claude.com/docs/en/troubleshooting", + "https://code.claude.com/docs/en/legal-and-compliance" ], "selectors": { - "main_content": "#content-container", + "main_content": "#content-area, #content-container, article, main", "title": "h1", "code_blocks": "pre code" }, "url_patterns": { - "include": ["/claude-code/"], - "exclude": ["/api-reference/", "/claude-ai/", "/claude.ai/", "/prompt-engineering/", "/changelog/"] + "include": ["/docs/en/"], + "exclude": [ + "/docs/fr/", "/docs/de/", "/docs/it/", "/docs/ja/", "/docs/es/", + "/docs/ko/", "/docs/zh-CN/", "/docs/zh-TW/", "/docs/ru/", + "/docs/id/", "/docs/pt/", "/changelog", "github.com" + ] }, "categories": { - "getting_started": ["overview", "quickstart", "installation", "setup", "terminal-config"], - "workflows": ["workflow", "common-workflows", "git", "testing", "debugging", "interactive"], + "getting_started": ["overview", "quickstart", "common-workflows"], + "ide_integrations": ["vs-code", "jetbrains", "desktop", "chrome", "claude-code-on-the-web", "slack"], + "ci_cd": ["github-actions", "gitlab-ci-cd"], + "building": ["sub-agents", "subagent", "plugins", "discover-plugins", "skills", "output-styles", "hooks-guide", "headless", "programmatic"], "mcp": ["mcp", "model-context-protocol"], - "configuration": ["config", "settings", "preferences", "customize", "hooks", "statusline", "model-config", "memory", "output-styles"], - "agents": ["agent", "task", "subagent", "sub-agent", "specialized"], - "skills": ["skill", "agent-skill"], - "integrations": ["ide-integrations", "vs-code", "jetbrains", "plugin", "marketplace"], - "deployment": ["bedrock", "vertex", "deployment", "network", "gateway", "devcontainer", "sandboxing", "third-party"], - "reference": ["reference", "api", "command", "cli-reference", "slash", "checkpointing", "headless", "sdk"], - "enterprise": ["iam", "security", "monitoring", "analytics", "costs", "legal", "data-usage"] + "deployment": ["third-party-integrations", "amazon-bedrock", "google-vertex-ai", "microsoft-foundry", "network-config", "llm-gateway", "devcontainer", "sandboxing"], + "administration": ["setup", "iam", "security", "data-usage", "monitoring-usage", "costs", "analytics", "plugin-marketplaces"], + "configuration": ["settings", "terminal-config", "model-config", "memory", "statusline"], + "reference": ["cli-reference", "interactive-mode", "slash-commands", "checkpointing", "hooks", "plugins-reference"], + "troubleshooting": ["troubleshooting"], + "legal": ["legal-and-compliance"] }, "rate_limit": 0.5, - "max_pages": 200 + "max_pages": 250 } diff --git a/src/skill_seekers/cli/llms_txt_downloader.py b/src/skill_seekers/cli/llms_txt_downloader.py index 1049f86..76ec740 100644 --- a/src/skill_seekers/cli/llms_txt_downloader.py +++ b/src/skill_seekers/cli/llms_txt_downloader.py @@ -38,11 +38,24 @@ class LlmsTxtDownloader: def _is_markdown(self, content: str) -> bool: """ - Check if content looks like markdown. + Check if content looks like markdown (not HTML). Returns: - True if content contains markdown patterns + True if content contains markdown patterns and is NOT HTML """ + # First, reject HTML content (common redirect trap) + content_start = content.strip()[:500].lower() + html_indicators = [ + '<!doctype html', + '<html', + '<!doctype', + '<head>', + '<meta charset', + ] + if any(indicator in content_start for indicator in html_indicators): + return False + + # Then check for markdown patterns markdown_patterns = ['# ', '## ', '```', '- ', '* ', '`'] return any(pattern in content for pattern in markdown_patterns) From 6008f13127e9b467632e1ba1001c474c9c22acae Mon Sep 17 00:00:00 2001 From: yusyus <yusufkaraaslan.yk@pm.me> Date: Sun, 11 Jan 2026 14:16:44 +0300 Subject: [PATCH 7/7] test: Add comprehensive HTML detection tests for llms.txt downloader (PR #244 review fix) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Added 7 test cases to verify HTML redirect trap prevention: - test_is_markdown_rejects_html_doctype() - DOCTYPE rejection (case-insensitive) - test_is_markdown_rejects_html_tag() - <html> tag rejection - test_is_markdown_rejects_html_meta() - <meta> and <head> tag rejection - test_is_markdown_accepts_markdown_with_html_words() - Edge case: markdown mentioning "html" - test_html_detection_only_scans_first_500_chars() - Performance optimization verification - test_html_redirect_trap_scenario() - Real-world Claude Code redirect scenario - test_download_rejects_html_redirect() - End-to-end download rejection Addresses minor observation from PR #244 review: - Ensures HTML detection logic is fully covered - Prevents regression of redirect trap fixes - Validates 500-char scanning optimization Test Results: 20/20 llms_txt_downloader tests passing Overall: 982/982 tests passing (4 expected failures - missing anthropic package) Related: PR #244 (Claude Code documentation config update) πŸ€– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com> --- tests/test_llms_txt_downloader.py | 92 +++++++++++++++++++++++++++++++ 1 file changed, 92 insertions(+) diff --git a/tests/test_llms_txt_downloader.py b/tests/test_llms_txt_downloader.py index 3b945fc..bcdc4dc 100644 --- a/tests/test_llms_txt_downloader.py +++ b/tests/test_llms_txt_downloader.py @@ -168,3 +168,95 @@ def test_get_proper_filename_small(): filename = downloader.get_proper_filename() assert filename == "llms-small.md" + +def test_is_markdown_rejects_html_doctype(): + """Test that HTML with DOCTYPE is rejected (prevents redirect trap)""" + downloader = LlmsTxtDownloader("https://example.com/llms.txt") + + html = '<!DOCTYPE html><html><head><title>Product PageContent' + assert not downloader._is_markdown(html) + + # Test case-insensitive + html_uppercase = 'Content' + assert not downloader._is_markdown(html_uppercase) + +def test_is_markdown_rejects_html_tag(): + """Test that HTML with tag is rejected (prevents redirect trap)""" + downloader = LlmsTxtDownloader("https://example.com/llms.txt") + + html = 'Content' + assert not downloader._is_markdown(html) + + # Test with just opening tag + html_partial = 'Some content' + assert not downloader._is_markdown(html_partial) + +def test_is_markdown_rejects_html_meta(): + """Test that HTML with or tags is rejected""" + downloader = LlmsTxtDownloader("https://example.com/llms.txt") + + html_with_head = 'PageContent' + assert not downloader._is_markdown(html_with_head) + + html_with_meta = '' + assert not downloader._is_markdown(html_with_meta) + +def test_is_markdown_accepts_markdown_with_html_words(): + """Test that markdown mentioning 'html' word is still accepted""" + downloader = LlmsTxtDownloader("https://example.com/llms.txt") + + markdown = '# Guide\n\nLearn about html tags in markdown. You can write HTML inside markdown.' + assert downloader._is_markdown(markdown) + + # Test with actual markdown patterns + markdown_with_code = '# HTML Tutorial\n\n```html\n
example
\n```\n\n## More content' + assert downloader._is_markdown(markdown_with_code) + +def test_html_detection_only_scans_first_500_chars(): + """Test that HTML detection only scans first 500 characters for performance""" + downloader = LlmsTxtDownloader("https://example.com/llms.txt") + + # HTML tag after 500 chars should not be detected + safe_markdown = '# Header\n\n' + ('Valid markdown content. ' * 50) + '\n\n' + # This should pass because is beyond first 500 chars + if len(safe_markdown[:500]) < len(''): + # If the HTML is within 500 chars, adjust test + assert not downloader._is_markdown(safe_markdown) + else: + # HTML beyond 500 chars should not trigger rejection + assert downloader._is_markdown(safe_markdown) + +def test_html_redirect_trap_scenario(): + """Test real-world scenario: llms.txt redirects to HTML product page""" + downloader = LlmsTxtDownloader("https://example.com/llms.txt") + + # Simulate Claude Code redirect scenario (302 to HTML page) + html_product_page = ''' + + + + Claude Code - Product Page + + +

Claude Code

+

Product information...

+ +''' + + # Should reject this HTML even though it has

tag (looks like markdown "# ") + assert not downloader._is_markdown(html_product_page) + +def test_download_rejects_html_redirect(): + """Test that download() properly rejects HTML redirects""" + downloader = LlmsTxtDownloader("https://example.com/llms.txt") + + mock_response = Mock() + # Simulate server returning HTML instead of markdown + mock_response.text = '

Product Page

' + mock_response.raise_for_status = Mock() + + with patch('requests.get', return_value=mock_response): + content = downloader.download() + + # Should return None (rejected as non-markdown) + assert content is None