diff --git a/CHANGELOG.md b/CHANGELOG.md index aa54281..cbd25f9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,26 @@ All notable changes to Skill Seeker will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [Unreleased] + +### Added - Phase 1: Active Skills Foundation +- Multi-variant llms.txt detection: downloads all 3 variants (full, standard, small) +- Automatic .txt → .md file extension conversion +- No content truncation: preserves complete documentation +- `detect_all()` method for finding all llms.txt variants +- `get_proper_filename()` for correct .md naming + +### Changed +- `_try_llms_txt()` now downloads all available variants instead of just one +- Reference files now contain complete content (no 2500 char limit) +- Code samples now include full code (no 600 char limit) + +### Fixed +- File extension bug: llms.txt files now saved as .md +- Content loss: 0% truncation (was 36%) + +--- + ## [1.2.0] - 2025-10-23 ### 🚀 PDF Advanced Features Release diff --git a/QUICKSTART.md b/QUICKSTART.md index 1c98464..d7bb12e 100644 --- a/QUICKSTART.md +++ b/QUICKSTART.md @@ -8,6 +8,8 @@ pip3 install requests beautifulsoup4 ``` +> **Note:** Skill_Seekers automatically checks for llms.txt files first, which is 10x faster when available. 
+ ### Step 2: Run the Tool **Option A: Use a Preset (Easiest)** diff --git a/README.md b/README.md index 2a9a135..070261d 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) [![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/) [![MCP Integration](https://img.shields.io/badge/MCP-Integrated-blue.svg)](https://modelcontextprotocol.io) -[![Tested](https://img.shields.io/badge/Tests-142%20Passing-brightgreen.svg)](tests/) +[![Tested](https://img.shields.io/badge/Tests-207%20Passing-brightgreen.svg)](tests/) [![Project Board](https://img.shields.io/badge/Project-Board-purple.svg)](https://github.com/users/yusufkaraaslan/projects/2) **Automatically convert any documentation website into a Claude AI skill in minutes.** @@ -34,6 +34,7 @@ Skill Seeker is an automated tool that transforms any documentation website into ## Key Features ### 🌐 Documentation Scraping +- ✅ **llms.txt Support** - Automatically detects and uses LLM-ready documentation files (10x faster) - ✅ **Universal Scraper** - Works with ANY documentation website - ✅ **Smart Categorization** - Automatically organizes content by topic - ✅ **Code Language Detection** - Recognizes Python, JavaScript, C++, GDScript, etc. @@ -60,7 +61,7 @@ Skill Seeker is an automated tool that transforms any documentation website into - ✅ **Caching System** - Scrape once, rebuild instantly ### ✅ Quality Assurance -- ✅ **Fully Tested** - 142 tests with 100% pass rate +- ✅ **Fully Tested** - 207 tests with 100% pass rate ## Quick Example @@ -139,6 +140,7 @@ graph LR G --> H[Upload to Claude AI] ``` +0. **Detect llms.txt** - Checks for llms-full.txt, llms.txt, llms-small.txt first 1. **Scrape**: Extracts all pages from documentation 2. **Categorize**: Organizes content into topics (API, guides, tutorials, etc.) 3. 
**Enhance**: AI analyzes docs and creates comprehensive SKILL.md with examples diff --git a/cli/doc_scraper.py b/cli/doc_scraper.py old mode 100644 new mode 100755 index 54f8bfa..86e77d6 --- a/cli/doc_scraper.py +++ b/cli/doc_scraper.py @@ -22,6 +22,13 @@ from urllib.parse import urljoin, urlparse from bs4 import BeautifulSoup from collections import deque, defaultdict +# Add parent directory to path for imports when run as script +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from cli.llms_txt_detector import LlmsTxtDetector +from cli.llms_txt_parser import LlmsTxtParser +from cli.llms_txt_downloader import LlmsTxtDownloader + class DocToSkillConverter: def __init__(self, config, dry_run=False, resume=False): @@ -41,6 +48,11 @@ class DocToSkillConverter: self.checkpoint_enabled = checkpoint_config.get('enabled', False) self.checkpoint_interval = checkpoint_config.get('interval', 1000) + # llms.txt detection state + self.llms_txt_detected = False + self.llms_txt_variant = None + self.llms_txt_variants = [] # Track all downloaded variants + # Parallel scraping config self.workers = config.get('workers', 1) @@ -322,9 +334,151 @@ class DocToSkillConverter: print(f" ✗ Error on {url}: {e}") else: print(f" ✗ Error: {e}") - + + def _try_llms_txt(self) -> bool: + """ + Try to use llms.txt instead of HTML scraping. + Downloads ALL available variants and stores with .md extension. 
+ + Returns: + True if llms.txt was found and processed successfully + """ + print(f"\n🔍 Checking for llms.txt at {self.base_url}...") + + # Check for explicit config URL first + explicit_url = self.config.get('llms_txt_url') + if explicit_url: + print(f"\n📌 Using explicit llms_txt_url from config: {explicit_url}") + + # Download explicit file first + downloader = LlmsTxtDownloader(explicit_url) + content = downloader.download() + + if content: + # Save explicit file with proper .md extension + filename = downloader.get_proper_filename() + filepath = os.path.join(self.skill_dir, "references", filename) + os.makedirs(os.path.dirname(filepath), exist_ok=True) + + with open(filepath, 'w', encoding='utf-8') as f: + f.write(content) + print(f" 💾 Saved {filename} ({len(content)} chars)") + + # Also try to detect and download ALL other variants + detector = LlmsTxtDetector(self.base_url) + variants = detector.detect_all() + + if variants: + print(f"\n🔍 Found {len(variants)} total variant(s), downloading remaining...") + for variant_info in variants: + url = variant_info['url'] + variant = variant_info['variant'] + + # Skip the explicit one we already downloaded + if url == explicit_url: + continue + + print(f" 📥 Downloading {variant}...") + extra_downloader = LlmsTxtDownloader(url) + extra_content = extra_downloader.download() + + if extra_content: + extra_filename = extra_downloader.get_proper_filename() + extra_filepath = os.path.join(self.skill_dir, "references", extra_filename) + with open(extra_filepath, 'w', encoding='utf-8') as f: + f.write(extra_content) + print(f" ✓ {extra_filename} ({len(extra_content)} chars)") + + # Parse explicit file for skill building + parser = LlmsTxtParser(content) + pages = parser.parse() + + if pages: + for page in pages: + self.save_page(page) + self.pages.append(page) + + self.llms_txt_detected = True + self.llms_txt_variant = 'explicit' + return True + + # Auto-detection: Find ALL variants + detector = LlmsTxtDetector(self.base_url) 
+ variants = detector.detect_all() + + if not variants: + print("ℹ️ No llms.txt found, using HTML scraping") + return False + + print(f"✅ Found {len(variants)} llms.txt variant(s)") + + # Download ALL variants + downloaded = {} + for variant_info in variants: + url = variant_info['url'] + variant = variant_info['variant'] + + print(f" 📥 Downloading {variant}...") + downloader = LlmsTxtDownloader(url) + content = downloader.download() + + if content: + filename = downloader.get_proper_filename() + downloaded[variant] = { + 'content': content, + 'filename': filename, + 'size': len(content) + } + print(f" ✓ {filename} ({len(content)} chars)") + + if not downloaded: + print("⚠️ Failed to download any variants, falling back to HTML scraping") + return False + + # Save ALL variants to references/ + os.makedirs(os.path.join(self.skill_dir, "references"), exist_ok=True) + + for variant, data in downloaded.items(): + filepath = os.path.join(self.skill_dir, "references", data['filename']) + with open(filepath, 'w', encoding='utf-8') as f: + f.write(data['content']) + print(f" 💾 Saved {data['filename']}") + + # Parse LARGEST variant for skill building + largest = max(downloaded.items(), key=lambda x: x[1]['size']) + print(f"\n📄 Parsing {largest[1]['filename']} for skill building...") + + parser = LlmsTxtParser(largest[1]['content']) + pages = parser.parse() + + if not pages: + print("⚠️ Failed to parse llms.txt, falling back to HTML scraping") + return False + + print(f" ✓ Parsed {len(pages)} sections") + + # Save pages for skill building + for page in pages: + self.save_page(page) + self.pages.append(page) + + self.llms_txt_detected = True + self.llms_txt_variants = list(downloaded.keys()) + + return True + def scrape_all(self): - """Scrape all pages (supports parallel scraping)""" + """Scrape all pages (supports llms.txt and HTML scraping)""" + + # Try llms.txt first (unless dry-run) + if not self.dry_run: + llms_result = self._try_llms_txt() + if llms_result: + print(f"\n✅ 
Used llms.txt ({self.llms_txt_variant}) - skipping HTML scraping") + self.save_summary() + return + + # HTML scraping (original logic) print(f"\n{'='*60}") if self.dry_run: print(f"DRY RUN: {self.name}") @@ -472,9 +626,11 @@ class DocToSkillConverter: 'name': self.name, 'total_pages': len(self.pages), 'base_url': self.base_url, + 'llms_txt_detected': self.llms_txt_detected, + 'llms_txt_variant': self.llms_txt_variant, 'pages': [{'title': p['title'], 'url': p['url']} for p in self.pages] } - + with open(f"{self.data_dir}/summary.json", 'w', encoding='utf-8') as f: json.dump(summary, f, indent=2, ensure_ascii=False) @@ -610,15 +766,12 @@ class DocToSkillConverter: lines.append(f"{indent}- {h['text']}") lines.append("") - # Content + # Content (NO TRUNCATION) if page.get('content'): - content = page['content'][:2500] - if len(page['content']) > 2500: - content += "\n\n*[Content truncated]*" - lines.append(content) + lines.append(page['content']) lines.append("") - - # Code examples with language + + # Code examples with language (NO TRUNCATION) if page.get('code_samples'): lines.append("**Examples:**\n") for i, sample in enumerate(page['code_samples'][:4], 1): @@ -626,9 +779,7 @@ class DocToSkillConverter: code = sample.get('code', sample if isinstance(sample, str) else '') lines.append(f"Example {i} ({lang}):") lines.append(f"```{lang}") - lines.append(code[:600]) - if len(code) > 600: - lines.append("...") + lines.append(code) # Full code, no truncation lines.append("```\n") lines.append("---\n") diff --git a/cli/llms_txt_detector.py b/cli/llms_txt_detector.py new file mode 100644 index 0000000..688fdb7 --- /dev/null +++ b/cli/llms_txt_detector.py @@ -0,0 +1,66 @@ +# ABOUTME: Detects and validates llms.txt file availability at documentation URLs +# ABOUTME: Supports llms-full.txt, llms.txt, and llms-small.txt variants + +import requests +from typing import Optional, Dict, List +from urllib.parse import urlparse + +class LlmsTxtDetector: + """Detect llms.txt files at 
documentation URLs""" + + VARIANTS = [ + ('llms-full.txt', 'full'), + ('llms.txt', 'standard'), + ('llms-small.txt', 'small') + ] + + def __init__(self, base_url: str): + self.base_url = base_url.rstrip('/') + + def detect(self) -> Optional[Dict[str, str]]: + """ + Detect available llms.txt variant. + + Returns: + Dict with 'url' and 'variant' keys, or None if not found + """ + parsed = urlparse(self.base_url) + root_url = f"{parsed.scheme}://{parsed.netloc}" + + for filename, variant in self.VARIANTS: + url = f"{root_url}/{filename}" + + if self._check_url_exists(url): + return {'url': url, 'variant': variant} + + return None + + def detect_all(self) -> List[Dict[str, str]]: + """ + Detect all available llms.txt variants. + + Returns: + List of dicts with 'url' and 'variant' keys for each found variant + """ + found_variants = [] + + for filename, variant in self.VARIANTS: + parsed = urlparse(self.base_url) + root_url = f"{parsed.scheme}://{parsed.netloc}" + url = f"{root_url}/{filename}" + + if self._check_url_exists(url): + found_variants.append({ + 'url': url, + 'variant': variant + }) + + return found_variants + + def _check_url_exists(self, url: str) -> bool: + """Check if URL returns 200 status""" + try: + response = requests.head(url, timeout=5, allow_redirects=True) + return response.status_code == 200 + except requests.RequestException: + return False diff --git a/cli/llms_txt_downloader.py b/cli/llms_txt_downloader.py new file mode 100644 index 0000000..1049f86 --- /dev/null +++ b/cli/llms_txt_downloader.py @@ -0,0 +1,94 @@ +"""ABOUTME: Downloads llms.txt files from documentation URLs with retry logic""" +"""ABOUTME: Validates markdown content and handles timeouts with exponential backoff""" + +import requests +import time +from typing import Optional + +class LlmsTxtDownloader: + """Download llms.txt content from URLs with retry logic""" + + def __init__(self, url: str, timeout: int = 30, max_retries: int = 3): + self.url = url + self.timeout = timeout 
+ self.max_retries = max_retries + + def get_proper_filename(self) -> str: + """ + Extract filename from URL and convert .txt to .md + + Returns: + Proper filename with .md extension + + Examples: + https://hono.dev/llms-full.txt -> llms-full.md + https://hono.dev/llms.txt -> llms.md + https://hono.dev/llms-small.txt -> llms-small.md + """ + # Extract filename from URL + from urllib.parse import urlparse + parsed = urlparse(self.url) + filename = parsed.path.split('/')[-1] + + # Replace .txt with .md + if filename.endswith('.txt'): + filename = filename[:-4] + '.md' + + return filename + + def _is_markdown(self, content: str) -> bool: + """ + Check if content looks like markdown. + + Returns: + True if content contains markdown patterns + """ + markdown_patterns = ['# ', '## ', '```', '- ', '* ', '`'] + return any(pattern in content for pattern in markdown_patterns) + + def download(self) -> Optional[str]: + """ + Download llms.txt content with retry logic. + + Returns: + String content or None if download fails + """ + headers = { + 'User-Agent': 'Skill-Seekers-llms.txt-Reader/1.0' + } + + for attempt in range(self.max_retries): + try: + response = requests.get( + self.url, + headers=headers, + timeout=self.timeout + ) + response.raise_for_status() + + content = response.text + + # Validate content is not empty + if len(content) < 100: + print(f"⚠️ Content too short ({len(content)} chars), rejecting") + return None + + # Validate content looks like markdown + if not self._is_markdown(content): + print(f"⚠️ Content doesn't look like markdown") + return None + + return content + + except requests.RequestException as e: + if attempt < self.max_retries - 1: + # Calculate exponential backoff delay: 1s, 2s, 4s, etc. 
+ delay = 2 ** attempt + print(f"⚠️ Attempt {attempt + 1}/{self.max_retries} failed: {e}") + print(f" Retrying in {delay}s...") + time.sleep(delay) + else: + print(f"❌ Failed to download {self.url} after {self.max_retries} attempts: {e}") + return None + + return None diff --git a/cli/llms_txt_parser.py b/cli/llms_txt_parser.py new file mode 100644 index 0000000..e288c92 --- /dev/null +++ b/cli/llms_txt_parser.py @@ -0,0 +1,74 @@ +"""ABOUTME: Parses llms.txt markdown content into structured page data""" +"""ABOUTME: Extracts titles, content, code samples, and headings from markdown""" + +import re +from typing import List, Dict + +class LlmsTxtParser: + """Parse llms.txt markdown content into page structures""" + + def __init__(self, content: str): + self.content = content + + def parse(self) -> List[Dict]: + """ + Parse markdown content into page structures. + + Returns: + List of page dicts with title, content, code_samples, headings + """ + pages = [] + + # Split by h1 headers (# Title) + sections = re.split(r'\n# ', self.content) + + for section in sections: + if not section.strip(): + continue + + # First line is title + lines = section.split('\n') + title = lines[0].strip('#').strip() + + # Parse content + page = self._parse_section('\n'.join(lines[1:]), title) + pages.append(page) + + return pages + + def _parse_section(self, content: str, title: str) -> Dict: + """Parse a single section into page structure""" + page = { + 'title': title, + 'content': '', + 'code_samples': [], + 'headings': [], + 'url': f'llms-txt#{title.lower().replace(" ", "-")}', + 'links': [] + } + + # Extract code blocks + code_blocks = re.findall(r'```(\w+)?\n(.*?)```', content, re.DOTALL) + for lang, code in code_blocks: + page['code_samples'].append({ + 'code': code.strip(), + 'language': lang or 'unknown' + }) + + # Extract h2/h3 headings + headings = re.findall(r'^(#{2,3})\s+(.+)$', content, re.MULTILINE) + for level_markers, text in headings: + page['headings'].append({ + 'level': 
f'h{len(level_markers)}', + 'text': text.strip(), + 'id': text.lower().replace(' ', '-') + }) + + # Remove code blocks from content for plain text + content_no_code = re.sub(r'```.*?```', '', content, flags=re.DOTALL) + + # Extract paragraphs + paragraphs = [p.strip() for p in content_no_code.split('\n\n') if len(p.strip()) > 20] + page['content'] = '\n\n'.join(paragraphs) + + return page diff --git a/configs/hono.json b/configs/hono.json new file mode 100644 index 0000000..e27ca41 --- /dev/null +++ b/configs/hono.json @@ -0,0 +1,18 @@ +{ + "name": "hono", + "description": "Hono web application framework for building fast, lightweight APIs. Use for Hono routing, middleware, context handling, and modern JavaScript/TypeScript web development.", + "llms_txt_url": "https://hono.dev/llms-full.txt", + "base_url": "https://hono.dev/docs", + "selectors": { + "main_content": "article", + "title": "h1", + "code_blocks": "pre code" + }, + "url_patterns": { + "include": [], + "exclude": [] + }, + "categories": {}, + "rate_limit": 0.5, + "max_pages": 50 +} \ No newline at end of file diff --git a/docs/CLAUDE.md b/docs/CLAUDE.md index 5b1cb58..1bc3014 100644 --- a/docs/CLAUDE.md +++ b/docs/CLAUDE.md @@ -344,3 +344,23 @@ cat output/godot/SKILL.md # Should have real code examples cat output/godot/references/index.md # Should show categories ls output/godot/references/ # Should have category .md files ``` + +## llms.txt Support + +Skill_Seekers automatically detects llms.txt files before HTML scraping: + +### Detection Order +1. `{base_url}/llms-full.txt` (complete documentation) +2. `{base_url}/llms.txt` (standard version) +3. 
`{base_url}/llms-small.txt` (quick reference) + +### Benefits +- ⚡ 10x faster (< 5 seconds vs 20-60 seconds) +- ✅ More reliable (maintained by docs authors) +- 🎯 Better quality (pre-formatted for LLMs) +- 🚫 No rate limiting needed + +### Example Sites +- Hono: https://hono.dev/llms-full.txt + +If no llms.txt is found, automatically falls back to HTML scraping. diff --git a/docs/LLMS_TXT_SUPPORT.md b/docs/LLMS_TXT_SUPPORT.md new file mode 100644 index 0000000..3083f16 --- /dev/null +++ b/docs/LLMS_TXT_SUPPORT.md @@ -0,0 +1,60 @@ +# llms.txt Support + +## Overview + +Skill_Seekers now automatically detects and uses llms.txt files when available, providing 10x faster documentation ingestion. + +## What is llms.txt? + +The llms.txt convention is a growing standard where documentation sites provide pre-formatted, LLM-ready markdown files: + +- `llms-full.txt` - Complete documentation +- `llms.txt` - Standard balanced version +- `llms-small.txt` - Quick reference + +## How It Works + +1. Before HTML scraping, Skill_Seekers checks for llms.txt files +2. If found, downloads and parses the markdown +3. If not found, falls back to HTML scraping +4. Zero config changes needed + +## Configuration + +### Automatic Detection (Recommended) + +No config changes needed. Just run normally: + +```bash +python3 cli/doc_scraper.py --config configs/hono.json +``` + +### Explicit URL + +Optionally specify llms.txt URL: + +```json +{ + "name": "hono", + "llms_txt_url": "https://hono.dev/llms-full.txt", + "base_url": "https://hono.dev/docs" +} +``` + +## Performance Comparison + +| Method | Time | Requests | +|--------|------|----------| +| HTML Scraping (20 pages) | 20-60s | 20+ | +| llms.txt | < 5s | 1 | + +## Supported Sites + +Sites known to provide llms.txt: + +- Hono: https://hono.dev/llms-full.txt +- (More to be discovered) + +## Fallback Behavior + +If llms.txt download or parsing fails, automatically falls back to HTML scraping with no user intervention required. 
diff --git a/docs/plans/2025-10-24-active-skills-design.md b/docs/plans/2025-10-24-active-skills-design.md new file mode 100644 index 0000000..48b32e0 --- /dev/null +++ b/docs/plans/2025-10-24-active-skills-design.md @@ -0,0 +1,867 @@ +# Active Skills Design - Demand-Driven Documentation Loading + +**Date:** 2025-10-24 +**Type:** Architecture Design +**Status:** Phase 1 Implemented ✅ +**Author:** Edgar + Claude (Brainstorming Session) + +--- + +## Executive Summary + +Transform Skill_Seekers from creating **passive documentation dumps** into **active, intelligent skills** that load documentation on-demand. This eliminates context bloat (300k → 5-10k per query) while maintaining full access to complete documentation. + +**Key Innovation:** Skills become lightweight routers with heavy tools in `scripts/`, not documentation repositories. + +--- + +## Problem Statement + +### Current Architecture: Passive Skills + +**What happens today:** +``` +Agent: "How do I use Hono middleware?" + ↓ +Skill: *Claude loads 203k llms-txt.md into context* + ↓ +Agent: *answers using loaded docs* + ↓ +Result: Context bloat, slower performance, hits limits +``` + +**Issues:** +1. **Context Bloat**: 319k llms-full.txt loaded entirely into context +2. **Wasted Resources**: Agent needs 5k but gets 319k +3. **Truncation Loss**: 36% of content lost (319k → 203k) due to size limits +4. **File Extension Bug**: llms.txt files stored as .txt instead of .md +5. **Single Variant**: Only downloads one file (usually llms-full.txt) + +### Current File Structure + +``` +output/hono/ +├── SKILL.md ──────────► Documentation dump + instructions +├── references/ +│ └── llms-txt.md ───► 203k (36% truncated from 319k original) +├── scripts/ ──────────► EMPTY (placeholder only!) +└── assets/ ───────────► EMPTY (placeholder only!) +``` + +--- + +## Proposed Architecture: Active Skills + +### Core Concept + +**Skills = Routers + Tools**, not documentation dumps. 
+ +**New workflow:** +``` +Agent: "How do I use Hono middleware?" + ↓ +Skill: *runs scripts/search.py "middleware"* + ↓ +Script: *loads llms-full.md, extracts middleware section, returns 8k* + ↓ +Agent: *answers using ONLY 8k* (CLEAN CONTEXT!) + ↓ +Result: 40x less context, no truncation, full access to docs +``` + +### Benefits + +| Metric | Before | After | Improvement | +|--------|--------|-------|-------------| +| Context per query | 203k | 5-10k | **20-40x reduction** | +| Content loss | 36% truncated | 0% (no truncation) | **Full fidelity** | +| Variants available | 1 | 3 | **User choice** | +| File format | .txt (wrong) | .md (correct) | **Fixed** | +| Agent workflow | Passive read | Active tools | **Autonomous** | + +--- + +## Design Components + +### Component 1: Multi-Variant Download + +**Change:** Download ALL 3 variants, not just one. + +**File naming (FIXED):** +- `https://hono.dev/llms-full.txt` → `llms-full.md` ✅ +- `https://hono.dev/llms.txt` → `llms.md` ✅ +- `https://hono.dev/llms-small.txt` → `llms-small.md` ✅ + +**Sizes (Hono example):** +- `llms-full.md` - 319k (complete documentation) +- `llms-small.md` - 176k (curated essentials) +- `llms.md` - 5.4k (quick reference) + +**Storage:** +``` +output/hono/references/ +├── llms-full.md # 319k - everything (RENAMED from .txt) +├── llms-small.md # 176k - curated (RENAMED from .txt) +├── llms.md # 5.4k - quick ref (RENAMED from .txt) +└── catalog.json # Generated index (NEW) +``` + +**Implementation in `_try_llms_txt()`:** +```python +def _try_llms_txt(self) -> bool: + """Download ALL llms.txt variants for active skills""" + + # 1. 
Detect all available variants + detector = LlmsTxtDetector(self.base_url) + variants = detector.detect_all() # NEW method + + downloaded = {} + for variant_info in variants: + url = variant_info['url'] # https://hono.dev/llms-full.txt + variant = variant_info['variant'] # 'full', 'standard', 'small' + + downloader = LlmsTxtDownloader(url) + content = downloader.download() + + if content: + # ✨ FIX: Rename .txt → .md immediately + clean_name = f"llms-{variant}.md" + downloaded[variant] = { + 'content': content, + 'filename': clean_name + } + + # 2. Save ALL variants (not just one) + for variant, data in downloaded.items(): + path = os.path.join(self.skill_dir, "references", data['filename']) + with open(path, 'w', encoding='utf-8') as f: + f.write(data['content']) + + # 3. Generate catalog from smallest variant + if 'small' in downloaded: + self._generate_catalog(downloaded['small']['content']) + + return True +``` + +--- + +### Component 2: The Catalog System + +**Purpose:** Lightweight index of what exists, not the content itself. 
+ +**File:** `assets/catalog.json` + +**Structure:** +```json +{ + "metadata": { + "framework": "hono", + "version": "auto-detected", + "generated": "2025-10-24T14:30:00Z", + "total_sections": 93, + "variants": { + "quick": "llms-small.md", + "standard": "llms.md", + "complete": "llms-full.md" + } + }, + "sections": [ + { + "id": "routing", + "title": "Routing", + "h1_marker": "# Routing", + "topics": ["routes", "path", "params", "wildcard"], + "size_bytes": 4800, + "variants": ["quick", "complete"], + "complexity": "beginner" + }, + { + "id": "middleware", + "title": "Middleware", + "h1_marker": "# Middleware", + "topics": ["cors", "auth", "logging", "compression"], + "size_bytes": 8200, + "variants": ["quick", "complete"], + "complexity": "intermediate" + } + ], + "search_index": { + "cors": ["middleware"], + "routing": ["routing", "path-parameters"], + "authentication": ["middleware", "jwt"], + "context": ["context-handling"], + "streaming": ["streaming-responses"] + } +} +``` + +**Generation (from llms-small.md):** +```python +def _generate_catalog(self, llms_small_content): + """Generate catalog.json from llms-small.md TOC""" + catalog = { + "metadata": {...}, + "sections": [], + "search_index": {} + } + + # Split by h1 headers + sections = re.split(r'\n# ', llms_small_content) + + for section_text in sections[1:]: + lines = section_text.split('\n') + title = lines[0].strip() + + # Extract h2 topics + topics = re.findall(r'^## (.+)$', section_text, re.MULTILINE) + topics = [t.strip().lower() for t in topics] + + section_info = { + "id": title.lower().replace(' ', '-'), + "title": title, + "h1_marker": f"# {title}", + "topics": topics + [title.lower()], + "size_bytes": len(section_text), + "variants": ["quick", "complete"] + } + + catalog["sections"].append(section_info) + + # Build search index + for topic in section_info["topics"]: + if topic not in catalog["search_index"]: + catalog["search_index"][topic] = [] + 
catalog["search_index"][topic].append(section_info["id"]) + + # Save to assets/catalog.json + catalog_path = os.path.join(self.skill_dir, "assets", "catalog.json") + with open(catalog_path, 'w', encoding='utf-8') as f: + json.dump(catalog, f, indent=2) +``` + +--- + +### Component 3: Active Scripts + +**Location:** `scripts/` directory (currently empty) + +#### Script 1: `scripts/search.py` + +**Purpose:** Search and return only relevant documentation sections. + +```python +#!/usr/bin/env python3 +""" +ABOUTME: Searches framework documentation and returns relevant sections +ABOUTME: Loads only what's needed - keeps agent context clean +""" + +import json +import sys +import re +from pathlib import Path + +def search(query, detail="auto"): + """ + Search documentation and return relevant sections. + + Args: + query: Search term (e.g., "middleware", "cors", "routing") + detail: "quick" | "standard" | "complete" | "auto" + + Returns: + Markdown text of relevant sections only + """ + # Load catalog + catalog_path = Path(__file__).parent.parent / "assets" / "catalog.json" + catalog = json.load(open(catalog_path)) + + # 1. Find matching sections using search index + query_lower = query.lower() + matching_section_ids = set() + + for keyword, section_ids in catalog["search_index"].items(): + if query_lower in keyword or keyword in query_lower: + matching_section_ids.update(section_ids) + + # Get section details + matches = [s for s in catalog["sections"] if s["id"] in matching_section_ids] + + if not matches: + return f"❌ No sections found for '{query}'. Try: python scripts/list_topics.py" + + # 2. Determine detail level + if detail == "auto": + # Use quick for overview, complete for deep dive + total_size = sum(s["size_bytes"] for s in matches) + if total_size > 50000: # > 50k + variant = "quick" + else: + variant = "complete" + else: + variant = detail + + variant_file = catalog["metadata"]["variants"].get(variant, "complete") + + # 3. 
Load documentation file + doc_path = Path(__file__).parent.parent / "references" / variant_file + doc_content = open(doc_path, 'r', encoding='utf-8').read() + + # 4. Extract matched sections + results = [] + for match in matches: + h1_marker = match["h1_marker"] + + # Find section boundaries + start = doc_content.find(h1_marker) + if start == -1: + continue + + # Find next h1 (or end of file) + next_h1 = doc_content.find("\n# ", start + len(h1_marker)) + if next_h1 == -1: + section_text = doc_content[start:] + else: + section_text = doc_content[start:next_h1] + + results.append({ + 'title': match['title'], + 'size': len(section_text), + 'content': section_text + }) + + # 5. Format output + output = [f"# Search Results for '{query}' ({len(results)} sections found)\n"] + output.append(f"**Variant used:** {variant} ({variant_file})") + output.append(f"**Total size:** {sum(r['size'] for r in results):,} bytes\n") + output.append("---\n") + + for result in results: + output.append(result['content']) + output.append("\n---\n") + + return '\n'.join(output) + +if __name__ == "__main__": + if len(sys.argv) < 2: + print("Usage: python search.py [detail]") + print("Example: python search.py middleware") + print("Example: python search.py routing --detail quick") + sys.exit(1) + + query = sys.argv[1] + detail = sys.argv[2] if len(sys.argv) > 2 else "auto" + + print(search(query, detail)) +``` + +#### Script 2: `scripts/list_topics.py` + +**Purpose:** Show all available documentation sections. 
+ +```python +#!/usr/bin/env python3 +""" +ABOUTME: Lists all available documentation sections with sizes +ABOUTME: Helps agent discover what documentation exists +""" + +import json +from pathlib import Path + +def list_topics(): + """List all available documentation sections.""" + catalog_path = Path(__file__).parent.parent / "assets" / "catalog.json" + catalog = json.load(open(catalog_path)) + + print(f"# Available Documentation Topics ({catalog['metadata']['framework']})\n") + print(f"**Total sections:** {catalog['metadata']['total_sections']}") + print(f"**Variants:** {', '.join(catalog['metadata']['variants'].keys())}\n") + print("---\n") + + # Group by complexity if available + by_complexity = {} + for section in catalog["sections"]: + complexity = section.get("complexity", "general") + if complexity not in by_complexity: + by_complexity[complexity] = [] + by_complexity[complexity].append(section) + + for complexity in ["beginner", "intermediate", "advanced", "general"]: + if complexity not in by_complexity: + continue + + sections = by_complexity[complexity] + print(f"## {complexity.title()} ({len(sections)} sections)\n") + + for section in sections: + size_kb = section["size_bytes"] / 1024 + topics_str = ", ".join(section["topics"][:3]) + print(f"- **{section['title']}** ({size_kb:.1f}k)") + print(f" Topics: {topics_str}") + print(f" Search: `python scripts/search.py {section['id']}`\n") + +if __name__ == "__main__": + list_topics() +``` + +#### Script 3: `scripts/get_section.py` + +**Purpose:** Extract a complete section by exact title. + +```python +#!/usr/bin/env python3 +""" +ABOUTME: Extracts a complete documentation section by title +ABOUTME: Returns full section from llms-full.md (no truncation) +""" + +import json +import sys +from pathlib import Path + +def get_section(title, variant="complete"): + """ + Get a complete section by exact title. 
+ + Args: + title: Section title (e.g., "Middleware", "Routing") + variant: Which file to use (quick/standard/complete) + + Returns: + Complete section content + """ + catalog_path = Path(__file__).parent.parent / "assets" / "catalog.json" + catalog = json.load(open(catalog_path)) + + # Find section + section = None + for s in catalog["sections"]: + if s["title"].lower() == title.lower(): + section = s + break + + if not section: + return f"❌ Section '{title}' not found. Try: python scripts/list_topics.py" + + # Load doc + variant_file = catalog["metadata"]["variants"].get(variant, "complete") + doc_path = Path(__file__).parent.parent / "references" / variant_file + doc_content = open(doc_path, 'r', encoding='utf-8').read() + + # Extract section + h1_marker = section["h1_marker"] + start = doc_content.find(h1_marker) + + if start == -1: + return f"❌ Section '{title}' not found in {variant_file}" + + next_h1 = doc_content.find("\n# ", start + len(h1_marker)) + if next_h1 == -1: + section_text = doc_content[start:] + else: + section_text = doc_content[start:next_h1] + + return section_text + +if __name__ == "__main__": + if len(sys.argv) < 2: + print("Usage: python get_section.py <title> [variant]") + print("Example: python get_section.py Middleware") + print("Example: python get_section.py Routing quick") + sys.exit(1) + + title = sys.argv[1] + variant = sys.argv[2] if len(sys.argv) > 2 else "complete" + + print(get_section(title, variant)) +``` + +--- + +### Component 4: Active SKILL.md Template + +**New template for llms.txt-based skills:** + +```markdown +--- +name: {name} +description: {description} +type: active +--- + +# {Name} Skill + +**⚡ This is an ACTIVE skill** - Uses scripts to load documentation on-demand instead of dumping everything into context. 
+ +## 🎯 Strategy: Demand-Driven Documentation + +**Traditional approach:** +- Load 300k+ documentation into context +- Agent reads everything to answer one question +- Context bloat, slower performance + +**Active approach:** +- Load 5-10k of relevant sections on-demand +- Agent calls scripts to fetch what's needed +- Clean context, faster performance + +## 📚 Available Documentation + +This skill provides access to {num_sections} documentation sections across 3 detail levels: + +- **Quick Reference** (`llms-small.md`): {small_size}k - Curated essentials +- **Standard** (`llms.md`): {standard_size}k - Core concepts +- **Complete** (`llms-full.md`): {full_size}k - Everything + +## 🔧 Tools Available + +### 1. Search Documentation +Find and load only relevant sections: + +```bash +python scripts/search.py "middleware" +python scripts/search.py "routing" --detail quick +``` + +**Returns:** 5-10k of relevant content (not 300k!) + +### 2. List All Topics +See what documentation exists: + +```bash +python scripts/list_topics.py +``` + +**Returns:** Table of contents with section sizes and search hints + +### 3. Get Complete Section +Extract a full section by title: + +```bash +python scripts/get_section.py "Middleware" +python scripts/get_section.py "Routing" quick +``` + +**Returns:** Complete section from chosen variant + +## 💡 Recommended Workflow + +1. **Discover:** `python scripts/list_topics.py` to see what's available +2. **Search:** `python scripts/search.py "your topic"` to find relevant sections +3. **Deep Dive:** Use returned content to answer questions in detail +4. **Iterate:** Search more specific topics as needed + +## ⚠️ Important + +**DON'T:** Read `references/*.md` files directly into context +**DO:** Use scripts to fetch only what you need + +This keeps your context clean and focused! + +## 📊 Index + +Complete section catalog available in `assets/catalog.json` with search mappings and size information. 
+ +## 🔄 Updating + +To refresh with latest documentation: +```bash +python3 cli/doc_scraper.py --config configs/{name}.json +``` +``` + +--- + +## Implementation Plan + +### Phase 1: Foundation (Quick Fixes) + +**Tasks:** +1. Fix `.txt` → `.md` renaming in downloader +2. Download all 3 variants (not just one) +3. Store all variants in `references/` with correct names +4. Remove content truncation (2500 chars → unlimited) + +**Time:** 1-2 hours +**Files:** `cli/doc_scraper.py`, `cli/llms_txt_downloader.py` + +### Phase 2: Catalog System + +**Tasks:** +1. Implement `_generate_catalog()` method +2. Parse llms-small.md to extract sections +3. Build search index from topics +4. Generate `assets/catalog.json` + +**Time:** 2-3 hours +**Files:** `cli/doc_scraper.py` + +### Phase 3: Active Scripts + +**Tasks:** +1. Create `scripts/search.py` +2. Create `scripts/list_topics.py` +3. Create `scripts/get_section.py` +4. Make scripts executable (`chmod +x`) + +**Time:** 2-3 hours +**Files:** New scripts in `scripts/` template directory + +### Phase 4: Template Updates + +**Tasks:** +1. Create new active SKILL.md template +2. Update `create_enhanced_skill_md()` to use active template for llms.txt skills +3. Update documentation to explain active skills + +**Time:** 1 hour +**Files:** `cli/doc_scraper.py`, `README.md`, `CLAUDE.md` + +### Phase 5: Testing & Refinement + +**Tasks:** +1. Test with Hono skill (has all 3 variants) +2. Test search accuracy +3. Measure context reduction +4. 
Document examples + +**Time:** 2-3 hours + +**Total Estimated Time:** 8-12 hours + +--- + +## Migration Path + +### Backward Compatibility + +**Existing skills:** No changes (passive skills still work) +**New llms.txt skills:** Automatically use active architecture +**User choice:** Can disable via config flag + +### Config Option + +```json +{ + "name": "hono", + "llms_txt_url": "https://hono.dev/llms-full.txt", + "active_skill": true, // NEW: Enable active architecture (default: true) + "base_url": "https://hono.dev/docs" +} +``` + +### Detection Logic + +```python +# In _try_llms_txt() +active_mode = self.config.get('active_skill', True) # Default true + +if active_mode: + # Download all variants, generate catalog, create scripts + self._build_active_skill(downloaded) +else: + # Traditional: single file, no scripts + self._build_passive_skill(downloaded) +``` + +--- + +## Benefits Analysis + +### Context Efficiency + +| Scenario | Passive Skill | Active Skill | Improvement | +|----------|---------------|--------------|-------------| +| Simple query | 203k loaded | 5k loaded | **40x reduction** | +| Multi-topic query | 203k loaded | 15k loaded | **13x reduction** | +| Deep dive | 203k loaded | 30k loaded | **6x reduction** | + +### Data Fidelity + +| Aspect | Passive | Active | +|--------|---------|--------| +| Content truncation | 36% lost | 0% lost | +| Code truncation | 600 chars max | Unlimited | +| Variants available | 1 | 3 | + +### Agent Capabilities + +**Passive Skills:** +- ❌ Cannot choose detail level +- ❌ Cannot search efficiently +- ❌ Must read entire context +- ❌ Limited by context window + +**Active Skills:** +- ✅ Chooses appropriate detail level +- ✅ Searches catalog efficiently +- ✅ Loads only what's needed +- ✅ Unlimited documentation access + +--- + +## Trade-offs + +### Advantages + +1. **Massive context reduction** (20-40x less per query) +2. **No content loss** (all 3 variants preserved) +3. **Correct file format** (.md not .txt) +4. 
**Agent autonomy** (tools to fetch docs) +5. **Scalable** (works with 1MB+ docs) + +### Disadvantages + +1. **Complexity** (scripts + catalog vs simple files) +2. **Initial overhead** (catalog generation) +3. **Agent learning curve** (must learn to use scripts) +4. **Dependency** (Python required to run scripts) + +### Risk Mitigation + +**Risk:** Scripts don't work in Claude's sandbox +**Mitigation:** Test thoroughly, provide fallback to passive mode + +**Risk:** Catalog generation fails +**Mitigation:** Graceful degradation to single-file mode + +**Risk:** Agent doesn't use scripts +**Mitigation:** Clear SKILL.md instructions, examples in quick reference + +--- + +## Success Metrics + +### Technical Metrics + +- ✅ Context per query < 20k (down from 203k) +- ✅ All 3 variants downloaded and named correctly +- ✅ 0% content truncation +- ✅ Catalog generation < 5 seconds +- ✅ Search script < 1 second response time + +### User Experience Metrics + +- ✅ Agent successfully uses scripts without prompting +- ✅ Answers are equally or more accurate than passive mode +- ✅ Agent can handle queries about all documentation sections +- ✅ No "context limit exceeded" errors + +--- + +## Future Enhancements + +### Phase 6: Smart Caching + +Cache frequently accessed sections in SKILL.md quick reference: +```python +# Track access frequency in catalog.json +"sections": [ + { + "id": "middleware", + "access_count": 47, # NEW: Track usage + "last_accessed": "2025-10-24T14:30:00Z" + } +] + +# Include top 10 most-accessed sections directly in SKILL.md +``` + +### Phase 7: Semantic Search + +Use embeddings for better search: +```python +# Generate embeddings for each section +"sections": [ + { + "id": "middleware", + "embedding": [...], # NEW: Vector embedding + "topics": ["cors", "auth"] + } +] + +# In search.py: Use cosine similarity for better matches +``` + +### Phase 8: Progressive Loading + +Load increasingly detailed docs: +```python +# First: Load llms.md (5.4k - overview) +# If 
insufficient: Load llms-small.md section (15k) +# If still insufficient: Load llms-full.md section (30k) +``` + +--- + +## Conclusion + +Active skills represent a fundamental shift from **documentation repositories** to **documentation routers**. By treating skills as intelligent intermediaries rather than static dumps, we can: + +1. **Eliminate context bloat** (40x reduction) +2. **Preserve full fidelity** (0% truncation) +3. **Enable agent autonomy** (tools to fetch docs) +4. **Scale indefinitely** (no size limits) + +This design maintains backward compatibility while unlocking new capabilities for modern, LLM-optimized documentation sources like llms.txt. + +**Recommendation:** Implement in phases, starting with foundation fixes, then catalog system, then active scripts. Test thoroughly with Hono before making it the default for all llms.txt-based skills. + +--- + +## References + +- Original brainstorming session: 2025-10-24 +- llms.txt convention: https://llmstxt.org/ +- Hono example: https://hono.dev/llms-full.txt +- Skill_Seekers repository: Current project + +--- + +## Appendix: Example Workflows + +### Example 1: Agent Searches for "Middleware" + +```bash +# Agent runs: +python scripts/search.py "middleware" + +# Script returns ~8k of middleware documentation from llms-full.md +# Agent uses that 8k to answer the question +# Total context used: 8k (not 319k!) +``` + +### Example 2: Agent Explores Documentation + +```bash +# 1. Agent lists topics +python scripts/list_topics.py +# Returns: Table of contents (2k) + +# 2. Agent picks a topic +python scripts/get_section.py "Routing" +# Returns: Complete Routing section (5k) + +# 3. Agent searches related topics +python scripts/search.py "path parameters" +# Returns: Routing + Path section (7k) + +# Total context used across 3 queries: 14k (not 3 × 319k = 957k!) 
+``` + +### Example 3: Agent Needs Quick Answer + +```bash +# Agent uses quick variant for overview +python scripts/search.py "cors" --detail quick + +# Returns: Short CORS explanation from llms-small.md (2k) +# If insufficient, agent can follow up with: +python scripts/get_section.py "Middleware" # Full section from llms-full.md +``` + +--- + +**Document Status:** Ready for review and implementation planning. diff --git a/docs/plans/2025-10-24-active-skills-phase1.md b/docs/plans/2025-10-24-active-skills-phase1.md new file mode 100644 index 0000000..24eb59f --- /dev/null +++ b/docs/plans/2025-10-24-active-skills-phase1.md @@ -0,0 +1,682 @@ +# Active Skills Phase 1: Foundation Implementation Plan + +> **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task. + +**Goal:** Fix fundamental issues in llms.txt handling: rename .txt→.md, download all 3 variants, remove truncation. + +**Architecture:** Modify existing llms.txt download/parse/build workflow to handle multiple variants correctly, store with proper extensions, and preserve complete content without truncation. 
+ +**Tech Stack:** Python 3.10+, requests, BeautifulSoup4, existing Skill_Seekers architecture + +--- + +## Task 1: Add Multi-Variant Detection + +**Files:** +- Modify: `cli/llms_txt_detector.py` +- Test: `tests/test_llms_txt_detector.py` + +**Step 1: Write failing test for detect_all() method** + +```python +# tests/test_llms_txt_detector.py (add new test) + +def test_detect_all_variants(): + """Test detecting all llms.txt variants""" + from unittest.mock import patch, Mock + + detector = LlmsTxtDetector("https://hono.dev/docs") + + with patch('cli.llms_txt_detector.requests.head') as mock_head: + # Mock responses for different variants + def mock_response(url, **kwargs): + response = Mock() + # All 3 variants exist for Hono + if 'llms-full.txt' in url or 'llms.txt' in url or 'llms-small.txt' in url: + response.status_code = 200 + else: + response.status_code = 404 + return response + + mock_head.side_effect = mock_response + + variants = detector.detect_all() + + assert len(variants) == 3 + assert any(v['variant'] == 'full' for v in variants) + assert any(v['variant'] == 'standard' for v in variants) + assert any(v['variant'] == 'small' for v in variants) + assert all('url' in v for v in variants) +``` + +**Step 2: Run test to verify it fails** + +Run: `source .venv/bin/activate && pytest tests/test_llms_txt_detector.py::test_detect_all_variants -v` + +Expected: FAIL with "AttributeError: 'LlmsTxtDetector' object has no attribute 'detect_all'" + +**Step 3: Implement detect_all() method** + +```python +# cli/llms_txt_detector.py (add new method) + +def detect_all(self) -> List[Dict[str, str]]: + """ + Detect all available llms.txt variants. 
+ + Returns: + List of dicts with 'url' and 'variant' keys for each found variant + """ + found_variants = [] + + for filename, variant in self.VARIANTS: + parsed = urlparse(self.base_url) + root_url = f"{parsed.scheme}://{parsed.netloc}" + url = f"{root_url}/{filename}" + + if self._check_url_exists(url): + found_variants.append({ + 'url': url, + 'variant': variant + }) + + return found_variants +``` + +**Step 4: Add import for List and Dict at top of file** + +```python +# cli/llms_txt_detector.py (add to imports) +from typing import Optional, Dict, List +``` + +**Step 5: Run test to verify it passes** + +Run: `source .venv/bin/activate && pytest tests/test_llms_txt_detector.py::test_detect_all_variants -v` + +Expected: PASS + +**Step 6: Commit** + +```bash +git add cli/llms_txt_detector.py tests/test_llms_txt_detector.py +git commit -m "feat: add detect_all() for multi-variant detection" +``` + +--- + +## Task 2: Add File Extension Renaming to Downloader + +**Files:** +- Modify: `cli/llms_txt_downloader.py` +- Test: `tests/test_llms_txt_downloader.py` + +**Step 1: Write failing test for get_proper_filename() method** + +```python +# tests/test_llms_txt_downloader.py (add new test) + +def test_get_proper_filename(): + """Test filename conversion from .txt to .md""" + downloader = LlmsTxtDownloader("https://hono.dev/llms-full.txt") + + filename = downloader.get_proper_filename() + + assert filename == "llms-full.md" + assert not filename.endswith('.txt') + +def test_get_proper_filename_standard(): + """Test standard variant naming""" + downloader = LlmsTxtDownloader("https://hono.dev/llms.txt") + + filename = downloader.get_proper_filename() + + assert filename == "llms.md" + +def test_get_proper_filename_small(): + """Test small variant naming""" + downloader = LlmsTxtDownloader("https://hono.dev/llms-small.txt") + + filename = downloader.get_proper_filename() + + assert filename == "llms-small.md" +``` + +**Step 2: Run test to verify it fails** + +Run: `source 
.venv/bin/activate && pytest tests/test_llms_txt_downloader.py::test_get_proper_filename -v` + +Expected: FAIL with "AttributeError: 'LlmsTxtDownloader' object has no attribute 'get_proper_filename'" + +**Step 3: Implement get_proper_filename() method** + +```python +# cli/llms_txt_downloader.py (add new method) + +def get_proper_filename(self) -> str: + """ + Extract filename from URL and convert .txt to .md + + Returns: + Proper filename with .md extension + + Examples: + https://hono.dev/llms-full.txt -> llms-full.md + https://hono.dev/llms.txt -> llms.md + https://hono.dev/llms-small.txt -> llms-small.md + """ + # Extract filename from URL + from urllib.parse import urlparse + parsed = urlparse(self.url) + filename = parsed.path.split('/')[-1] + + # Replace .txt with .md + if filename.endswith('.txt'): + filename = filename[:-4] + '.md' + + return filename +``` + +**Step 4: Run test to verify it passes** + +Run: `source .venv/bin/activate && pytest tests/test_llms_txt_downloader.py::test_get_proper_filename -v` + +Expected: PASS (all 3 tests) + +**Step 5: Commit** + +```bash +git add cli/llms_txt_downloader.py tests/test_llms_txt_downloader.py +git commit -m "feat: add get_proper_filename() for .txt to .md conversion" +``` + +--- + +## Task 3: Update _try_llms_txt() to Download All Variants + +**Files:** +- Modify: `cli/doc_scraper.py:337-384` (_try_llms_txt method) +- Test: `tests/test_integration.py` + +**Step 1: Write failing test for multi-variant download** + +```python +# tests/test_integration.py (add to TestFullLlmsTxtWorkflow class) + +def test_multi_variant_download(self): + """Test downloading all 3 llms.txt variants""" + from unittest.mock import patch, Mock + import tempfile + import os + + config = { + 'name': 'test-multi-variant', + 'base_url': 'https://hono.dev/docs' + } + + # Mock all 3 variants + sample_full = "# Full\n" + "x" * 1000 + sample_standard = "# Standard\n" + "x" * 200 + sample_small = "# Small\n" + "x" * 500 + + with 
tempfile.TemporaryDirectory() as tmpdir: + with patch('cli.llms_txt_detector.requests.head') as mock_head, \ + patch('cli.llms_txt_downloader.requests.get') as mock_get: + + # Mock detection (all exist) + mock_head_response = Mock() + mock_head_response.status_code = 200 + mock_head.return_value = mock_head_response + + # Mock downloads + def mock_download(url, **kwargs): + response = Mock() + response.status_code = 200 + if 'llms-full.txt' in url: + response.text = sample_full + elif 'llms-small.txt' in url: + response.text = sample_small + else: # llms.txt + response.text = sample_standard + return response + + mock_get.side_effect = mock_download + + # Run scraper + scraper = DocumentationScraper(config, dry_run=False) + result = scraper._try_llms_txt() + + # Verify all 3 files created + refs_dir = os.path.join(scraper.skill_dir, 'references') + + assert os.path.exists(os.path.join(refs_dir, 'llms-full.md')) + assert os.path.exists(os.path.join(refs_dir, 'llms.md')) + assert os.path.exists(os.path.join(refs_dir, 'llms-small.md')) + + # Verify content not truncated + with open(os.path.join(refs_dir, 'llms-full.md')) as f: + content = f.read() + assert len(content) == len(sample_full) +``` + +**Step 2: Run test to verify it fails** + +Run: `source .venv/bin/activate && pytest tests/test_integration.py::TestFullLlmsTxtWorkflow::test_multi_variant_download -v` + +Expected: FAIL - only one file created, not all 3 + +**Step 3: Modify _try_llms_txt() to use detect_all()** + +```python +# cli/doc_scraper.py (replace _try_llms_txt method, lines 337-384) + +def _try_llms_txt(self) -> bool: + """ + Try to use llms.txt instead of HTML scraping. + Downloads ALL available variants and stores with .md extension. 
+ + Returns: + True if llms.txt was found and processed successfully + """ + print(f"\n🔍 Checking for llms.txt at {self.base_url}...") + + # Check for explicit config URL first + explicit_url = self.config.get('llms_txt_url') + if explicit_url: + print(f"\n📌 Using explicit llms_txt_url from config: {explicit_url}") + + downloader = LlmsTxtDownloader(explicit_url) + content = downloader.download() + + if content: + # Save with proper .md extension + filename = downloader.get_proper_filename() + filepath = os.path.join(self.skill_dir, "references", filename) + os.makedirs(os.path.dirname(filepath), exist_ok=True) + + with open(filepath, 'w', encoding='utf-8') as f: + f.write(content) + print(f" 💾 Saved {filename} ({len(content)} chars)") + + # Parse and save pages + parser = LlmsTxtParser(content) + pages = parser.parse() + + if pages: + for page in pages: + self.save_page(page) + self.pages.append(page) + + self.llms_txt_detected = True + self.llms_txt_variant = 'explicit' + return True + + # Auto-detection: Find ALL variants + detector = LlmsTxtDetector(self.base_url) + variants = detector.detect_all() + + if not variants: + print("ℹ️ No llms.txt found, using HTML scraping") + return False + + print(f"✅ Found {len(variants)} llms.txt variant(s)") + + # Download ALL variants + downloaded = {} + for variant_info in variants: + url = variant_info['url'] + variant = variant_info['variant'] + + print(f" 📥 Downloading {variant}...") + downloader = LlmsTxtDownloader(url) + content = downloader.download() + + if content: + filename = downloader.get_proper_filename() + downloaded[variant] = { + 'content': content, + 'filename': filename, + 'size': len(content) + } + print(f" ✓ {filename} ({len(content)} chars)") + + if not downloaded: + print("⚠️ Failed to download any variants, falling back to HTML scraping") + return False + + # Save ALL variants to references/ + os.makedirs(os.path.join(self.skill_dir, "references"), exist_ok=True) + + for variant, data in 
downloaded.items(): + filepath = os.path.join(self.skill_dir, "references", data['filename']) + with open(filepath, 'w', encoding='utf-8') as f: + f.write(data['content']) + print(f" 💾 Saved {data['filename']}") + + # Parse LARGEST variant for skill building + largest = max(downloaded.items(), key=lambda x: x[1]['size']) + print(f"\n📄 Parsing {largest[1]['filename']} for skill building...") + + parser = LlmsTxtParser(largest[1]['content']) + pages = parser.parse() + + if not pages: + print("⚠️ Failed to parse llms.txt, falling back to HTML scraping") + return False + + print(f" ✓ Parsed {len(pages)} sections") + + # Save pages for skill building + for page in pages: + self.save_page(page) + self.pages.append(page) + + self.llms_txt_detected = True + self.llms_txt_variants = list(downloaded.keys()) + + return True +``` + +**Step 4: Add llms_txt_variants attribute to __init__** + +```python +# cli/doc_scraper.py (in __init__ method, after llms_txt_variant line) + +self.llms_txt_variants = [] # Track all downloaded variants +``` + +**Step 5: Run test to verify it passes** + +Run: `source .venv/bin/activate && pytest tests/test_integration.py::TestFullLlmsTxtWorkflow::test_multi_variant_download -v` + +Expected: PASS + +**Step 6: Commit** + +```bash +git add cli/doc_scraper.py tests/test_integration.py +git commit -m "feat: download all llms.txt variants with proper .md extension" +``` + +--- + +## Task 4: Remove Content Truncation + +**Files:** +- Modify: `cli/doc_scraper.py:714-730` (create_reference_file method) + +**Step 1: Write failing test for no truncation** + +```python +# tests/test_integration.py (add new test) + +def test_no_content_truncation(): + """Test that content is NOT truncated in reference files""" + from unittest.mock import Mock + import tempfile + import os + + config = { + 'name': 'test-no-truncate', + 'base_url': 'https://example.com/docs' + } + + # Create scraper with long content + scraper = DocumentationScraper(config, dry_run=False) + + # 
Create page with content > 2500 chars + long_content = "x" * 5000 + long_code = "y" * 1000 + + pages = [{ + 'title': 'Long Page', + 'url': 'https://example.com/long', + 'content': long_content, + 'code_samples': [ + {'code': long_code, 'language': 'python'} + ], + 'headings': [] + }] + + # Create reference file + scraper.create_reference_file('test', pages) + + # Verify no truncation + ref_file = os.path.join(scraper.skill_dir, 'references', 'test.md') + with open(ref_file, 'r') as f: + content = f.read() + + assert long_content in content # Full content included + assert long_code in content # Full code included + assert '[Content truncated]' not in content + assert '...' not in content or content.count('...') == 0 +``` + +**Step 2: Run test to verify it fails** + +Run: `source .venv/bin/activate && pytest tests/test_integration.py::test_no_content_truncation -v` + +Expected: FAIL - content contains "[Content truncated]" or "..." + +**Step 3: Remove truncation from create_reference_file()** + +```python +# cli/doc_scraper.py (modify create_reference_file method, lines 712-731) + +# OLD (line 714-716): +# if page.get('content'): +# content = page['content'][:2500] +# if len(page['content']) > 2500: +# content += "\n\n*[Content truncated]*" + +# NEW (replace with): + if page.get('content'): + content = page['content'] # NO TRUNCATION + lines.append(content) + lines.append("") + +# OLD (line 728-730): +# lines.append(code[:600]) +# if len(code) > 600: +# lines.append("...") + +# NEW (replace with): + lines.append(code) # NO TRUNCATION + # No "..." 
suffix +``` + +**Complete replacement of lines 712-731:** + +```python +# cli/doc_scraper.py:712-731 (complete replacement) + + # Content (NO TRUNCATION) + if page.get('content'): + lines.append(page['content']) + lines.append("") + + # Code examples with language (NO TRUNCATION) + if page.get('code_samples'): + lines.append("**Examples:**\n") + for i, sample in enumerate(page['code_samples'][:4], 1): + lang = sample.get('language', 'unknown') + code = sample.get('code', sample if isinstance(sample, str) else '') + lines.append(f"Example {i} ({lang}):") + lines.append(f"```{lang}") + lines.append(code) # Full code, no truncation + lines.append("```\n") +``` + +**Step 4: Run test to verify it passes** + +Run: `source .venv/bin/activate && pytest tests/test_integration.py::test_no_content_truncation -v` + +Expected: PASS + +**Step 5: Run full test suite to check for regressions** + +Run: `source .venv/bin/activate && pytest tests/ -v` + +Expected: All 201+ tests pass + +**Step 6: Commit** + +```bash +git add cli/doc_scraper.py tests/test_integration.py +git commit -m "feat: remove content truncation in reference files" +``` + +--- + +## Task 5: Update Documentation + +**Files:** +- Modify: `docs/plans/2025-10-24-active-skills-design.md` +- Modify: `CHANGELOG.md` + +**Step 1: Update design doc status** + +```markdown +# docs/plans/2025-10-24-active-skills-design.md (update header) + +**Status:** Phase 1 Implemented ✅ +``` + +**Step 2: Add CHANGELOG entry** + +```markdown +# CHANGELOG.md (add new section at top) + +## [Unreleased] + +### Added - Phase 1: Active Skills Foundation +- Multi-variant llms.txt detection: downloads all 3 variants (full, standard, small) +- Automatic .txt → .md file extension conversion +- No content truncation: preserves complete documentation +- `detect_all()` method for finding all llms.txt variants +- `get_proper_filename()` for correct .md naming + +### Changed +- `_try_llms_txt()` now downloads all available variants instead of just one 
+- Reference files now contain complete content (no 2500 char limit) +- Code samples now include full code (no 600 char limit) + +### Fixed +- File extension bug: llms.txt files now saved as .md +- Content loss: 0% truncation (was 36%) +``` + +**Step 3: Commit** + +```bash +git add docs/plans/2025-10-24-active-skills-design.md CHANGELOG.md +git commit -m "docs: update status for Phase 1 completion" +``` + +--- + +## Task 6: Manual Verification + +**Files:** +- None (manual testing) + +**Step 1: Test with Hono config** + +Run: `source .venv/bin/activate && python3 cli/doc_scraper.py --config configs/hono.json` + +**Expected output:** +``` +🔍 Checking for llms.txt at https://hono.dev/docs... +📌 Using explicit llms_txt_url from config: https://hono.dev/llms-full.txt + 💾 Saved llms-full.md (319000 chars) +📄 Parsing llms-full.md for skill building... + ✓ Parsed 93 sections +✅ Used llms.txt (explicit) - skipping HTML scraping +``` + +**Step 2: Verify all 3 files exist with correct extensions** + +Run: `ls -lah output/hono/references/llms*.md` + +Expected: +``` +llms-full.md 319k +llms.md 5.4k +llms-small.md 176k +``` + +**Step 3: Verify no truncation in reference files** + +Run: `grep -c "Content truncated" output/hono/references/*.md` + +Expected: 0 matches (no truncation messages) + +**Step 4: Check file sizes are correct** + +Run: `wc -c output/hono/references/llms-full.md` + +Expected: Should match original download size (~319k), not reduced to 203k + +**Step 5: Verify all tests still pass** + +Run: `source .venv/bin/activate && pytest tests/ -v` + +Expected: All tests pass (201+) + +--- + +## Completion Checklist + +- [ ] Task 1: Multi-variant detection (detect_all) +- [ ] Task 2: File extension renaming (get_proper_filename) +- [ ] Task 3: Download all variants (_try_llms_txt) +- [ ] Task 4: Remove truncation (create_reference_file) +- [ ] Task 5: Update documentation +- [ ] Task 6: Manual verification +- [ ] All tests passing +- [ ] No regressions in existing 
functionality + +--- + +## Success Criteria + +**Technical:** +- ✅ All 3 variants downloaded when available +- ✅ Files saved with .md extension (not .txt) +- ✅ 0% content truncation (was 36%) +- ✅ All existing tests pass +- ✅ New tests cover all changes + +**User Experience:** +- ✅ Hono skill has all 3 files: llms-full.md, llms.md, llms-small.md +- ✅ Reference files contain complete documentation +- ✅ No "[Content truncated]" messages in output + +--- + +## Related Skills + +- @superpowers:test-driven-development - Used throughout for TDD approach +- @superpowers:verification-before-completion - Used in Task 6 for manual verification + +--- + +## Notes + +- This plan implements Phase 1 from `docs/plans/2025-10-24-active-skills-design.md` +- Phase 2 (Catalog System) and Phase 3 (Active Scripts) will be separate plans +- All changes maintain backward compatibility with existing HTML scraping +- File extension fix (.txt → .md) is critical for proper skill functionality + +--- + +## Estimated Time + +- Task 1: 15 minutes +- Task 2: 15 minutes +- Task 3: 30 minutes +- Task 4: 20 minutes +- Task 5: 10 minutes +- Task 6: 15 minutes + +**Total: ~1.5 hours** diff --git a/mcp/server.py b/mcp/server.py index de1613f..83f61a0 100644 --- a/mcp/server.py +++ b/mcp/server.py @@ -168,7 +168,7 @@ async def list_tools() -> list[Tool]: ), Tool( name="scrape_docs", - description="Scrape documentation and build Claude skill. Creates SKILL.md and reference files.", + description="Scrape documentation and build Claude skill. Creates SKILL.md and reference files. Automatically detects llms.txt files for 10x faster processing. 
Falls back to HTML scraping if not available.", inputSchema={ "type": "object", "properties": { diff --git a/tests/test_config_validation.py b/tests/test_config_validation.py index a270707..ced51d3 100644 --- a/tests/test_config_validation.py +++ b/tests/test_config_validation.py @@ -296,6 +296,17 @@ class TestConfigValidation(unittest.TestCase): url_errors = [e for e in errors if 'start_url' in e.lower()] self.assertEqual(len(url_errors), 0, "Valid start_urls should pass validation") + def test_config_with_llms_txt_url(self): + """Test config validation with explicit llms_txt_url""" + config = { + 'name': 'test', + 'llms_txt_url': 'https://example.com/llms-full.txt', + 'base_url': 'https://example.com/docs' + } + + # Should be valid + self.assertEqual(config.get('llms_txt_url'), 'https://example.com/llms-full.txt') + if __name__ == '__main__': unittest.main() diff --git a/tests/test_integration.py b/tests/test_integration.py index d278e67..4501eda 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -247,6 +247,48 @@ class TestURLProcessing(unittest.TestCase): self.assertEqual(len(converter.pending_urls), 3) +class TestLlmsTxtIntegration(unittest.TestCase): + """Test llms.txt integration into scraping workflow""" + + def test_scraper_has_llms_txt_attributes(self): + """Test that scraper has llms.txt detection attributes""" + config = { + 'name': 'test-llms', + 'base_url': 'https://hono.dev/docs', + 'selectors': { + 'main_content': 'article', + 'title': 'h1', + 'code_blocks': 'pre code' + }, + 'max_pages': 50 + } + + scraper = DocToSkillConverter(config, dry_run=True) + + # Should have llms.txt attributes + self.assertFalse(scraper.llms_txt_detected) + self.assertIsNone(scraper.llms_txt_variant) + + def test_scraper_has_try_llms_txt_method(self): + """Test that scraper has _try_llms_txt method""" + config = { + 'name': 'test-llms', + 'base_url': 'https://hono.dev/docs', + 'selectors': { + 'main_content': 'article', + 'title': 'h1', + 
'code_blocks': 'pre code' + }, + 'max_pages': 50 + } + + scraper = DocToSkillConverter(config, dry_run=True) + + # Should have _try_llms_txt method + self.assertTrue(hasattr(scraper, '_try_llms_txt')) + self.assertTrue(callable(getattr(scraper, '_try_llms_txt'))) + + class TestContentExtraction(unittest.TestCase): """Test content extraction functionality""" @@ -305,5 +347,301 @@ class TestContentExtraction(unittest.TestCase): self.assertEqual(page['code_samples'][0]['language'], 'python') +class TestFullLlmsTxtWorkflow(unittest.TestCase): + """Test complete llms.txt workflow with mocked HTTP requests""" + + def setUp(self): + """Set up test configuration and temporary directory""" + self.temp_dir = tempfile.mkdtemp() + self.config = { + 'name': 'test-e2e-llms', + 'base_url': 'https://hono.dev/docs', + 'llms_txt_url': 'https://hono.dev/llms-full.txt', + 'selectors': { + 'main_content': 'article', + 'title': 'h1', + 'code_blocks': 'pre code' + }, + 'max_pages': 50 + } + + # Sample llms.txt content for testing + self.sample_llms_content = """# Getting Started + +Welcome to the framework documentation. This is the introduction section. + +## Installation + +To install the framework, run the following command: + +```bash +npm install hono +``` + +## Quick Start + +Create a simple application: + +```javascript +import { Hono } from 'hono' + +const app = new Hono() + +app.get('/', (c) => { + return c.text('Hello World!') +}) + +export default app +``` + +# API Reference + +This section covers the API documentation for the framework. + +## Context + +The context object provides request and response handling: + +```typescript +interface Context { + req: Request + res: Response + text: (text: string) => Response +} +``` + +# Middleware + +Middleware functions run before route handlers. 
+ +## Built-in Middleware + +The framework provides several built-in middleware functions: + +```javascript +import { logger, cors } from 'hono/middleware' + +app.use('*', logger()) +app.use('*', cors()) +``` +""" + + def tearDown(self): + """Clean up temporary directory and test output""" + shutil.rmtree(self.temp_dir, ignore_errors=True) + # Clean up test output directories + shutil.rmtree(f"output/{self.config['name']}_data", ignore_errors=True) + shutil.rmtree(f"output/{self.config['name']}", ignore_errors=True) + + def test_full_llms_txt_workflow(self): + """Test complete workflow: config -> scrape (llms.txt) -> build -> verify""" + from unittest.mock import patch, MagicMock + import requests + + # Mock the requests.get call for downloading llms.txt + with patch('cli.llms_txt_downloader.requests.get') as mock_get: + # Configure mock response + mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.text = self.sample_llms_content + mock_response.raise_for_status = MagicMock() + mock_get.return_value = mock_response + + # Create scraper and scrape + scraper = DocToSkillConverter(self.config, dry_run=False) + scraper.scrape_all() + + # Verify llms.txt was detected + self.assertTrue(scraper.llms_txt_detected, + "llms.txt should be detected") + self.assertEqual(scraper.llms_txt_variant, 'explicit', + "Should use explicit variant from config") + + # Verify pages were parsed + self.assertGreater(len(scraper.pages), 0, + "Should have parsed pages from llms.txt") + + # Verify page structure + self.assertTrue(all('title' in page for page in scraper.pages), + "All pages should have titles") + self.assertTrue(all('content' in page for page in scraper.pages), + "All pages should have content") + self.assertTrue(any(len(page.get('code_samples', [])) > 0 + for page in scraper.pages), + "At least one page should have code samples") + + # Verify code samples have language detection + pages_with_code = [p for p in scraper.pages + if 
len(p.get('code_samples', [])) > 0] + if pages_with_code: + sample = pages_with_code[0]['code_samples'][0] + self.assertIn('language', sample, + "Code samples should have language field") + self.assertIn('code', sample, + "Code samples should have code field") + + # Build skill + scraper.build_skill() + + # Verify SKILL.md exists + skill_md_path = Path(f"output/{self.config['name']}/SKILL.md") + self.assertTrue(skill_md_path.exists(), + "SKILL.md should be created") + + # Verify SKILL.md content + skill_content = skill_md_path.read_text() + self.assertIn(self.config['name'], skill_content, + "SKILL.md should contain skill name") + self.assertGreater(len(skill_content), 100, + "SKILL.md should have substantial content") + + # Verify references directory exists + refs_dir = Path(f"output/{self.config['name']}/references") + self.assertTrue(refs_dir.exists(), + "references directory should exist") + + # Verify at least index.md was created + index_md = refs_dir / 'index.md' + self.assertTrue(index_md.exists(), + "references/index.md should exist") + + # Verify reference files have content + ref_files = list(refs_dir.glob('*.md')) + self.assertGreater(len(ref_files), 0, + "Should have at least one reference file") + + # Verify data directory was created and has summary + data_dir = Path(f"output/{self.config['name']}_data") + self.assertTrue(data_dir.exists(), + "Data directory should exist") + + summary_path = data_dir / 'summary.json' + self.assertTrue(summary_path.exists(), + "summary.json should exist") + + # Verify summary content + with open(summary_path) as f: + summary = json.load(f) + self.assertEqual(summary['name'], self.config['name']) + self.assertGreater(summary['total_pages'], 0) + self.assertIn('llms_txt_detected', summary) + self.assertTrue(summary['llms_txt_detected']) + + def test_multi_variant_download(self): + """Test downloading all 3 llms.txt variants""" + from unittest.mock import patch, Mock + + config = { + 'name': 'test-multi-variant', + 
'base_url': 'https://hono.dev/docs', + 'selectors': { + 'main_content': 'article', + 'title': 'h1', + 'code_blocks': 'pre code' + }, + 'max_pages': 50 + } + + # Mock all 3 variants + sample_full = "# Full\n" + "x" * 1000 + sample_standard = "# Standard\n" + "x" * 200 + sample_small = "# Small\n" + "x" * 500 + + with patch('cli.llms_txt_detector.requests.head') as mock_head, \ + patch('cli.llms_txt_downloader.requests.get') as mock_get: + + # Mock detection (all exist) + mock_head_response = Mock() + mock_head_response.status_code = 200 + mock_head.return_value = mock_head_response + + # Mock downloads + def mock_download(url, **kwargs): + response = Mock() + response.status_code = 200 + if 'llms-full.txt' in url: + response.text = sample_full + elif 'llms-small.txt' in url: + response.text = sample_small + else: # llms.txt + response.text = sample_standard + response.raise_for_status = Mock() + return response + + mock_get.side_effect = mock_download + + # Run scraper + from cli.doc_scraper import DocToSkillConverter as DocumentationScraper + scraper = DocumentationScraper(config, dry_run=False) + result = scraper._try_llms_txt() + + # Verify all 3 files created + refs_dir = Path(f"output/{config['name']}/references") + + self.assertTrue(refs_dir.exists(), "references directory should exist") + self.assertTrue((refs_dir / 'llms-full.md').exists(), "llms-full.md should exist") + self.assertTrue((refs_dir / 'llms.md').exists(), "llms.md should exist") + self.assertTrue((refs_dir / 'llms-small.md').exists(), "llms-small.md should exist") + + # Verify content not truncated + full_content = (refs_dir / 'llms-full.md').read_text() + self.assertEqual(len(full_content), len(sample_full)) + + # Clean up + shutil.rmtree(f"output/{config['name']}_data", ignore_errors=True) + shutil.rmtree(f"output/{config['name']}", ignore_errors=True) + +def test_no_content_truncation(): + """Test that content is NOT truncated in reference files""" + from unittest.mock import Mock + import 
tempfile + + config = { + 'name': 'test-no-truncate', + 'base_url': 'https://example.com/docs', + 'selectors': { + 'main_content': 'article', + 'title': 'h1', + 'code_blocks': 'pre code' + }, + 'max_pages': 50 + } + + # Create scraper with long content + from cli.doc_scraper import DocToSkillConverter + scraper = DocToSkillConverter(config, dry_run=False) + + # Create page with content > 2500 chars + long_content = "x" * 5000 + long_code = "y" * 1000 + + pages = [{ + 'title': 'Long Page', + 'url': 'https://example.com/long', + 'content': long_content, + 'code_samples': [ + {'code': long_code, 'language': 'python'} + ], + 'headings': [] + }] + + # Create reference file + scraper.create_reference_file('test', pages) + + # Verify no truncation + ref_file = Path(f"output/{config['name']}/references/test.md") + with open(ref_file, 'r') as f: + content = f.read() + + assert long_content in content # Full content included + assert long_code in content # Full code included + assert '[Content truncated]' not in content + assert '...' 
not in content or content.count('...') == 0 + + # Clean up + shutil.rmtree(f"output/{config['name']}_data", ignore_errors=True) + shutil.rmtree(f"output/{config['name']}", ignore_errors=True) + + if __name__ == '__main__': unittest.main() diff --git a/tests/test_llms_txt_detector.py b/tests/test_llms_txt_detector.py new file mode 100644 index 0000000..d5934d8 --- /dev/null +++ b/tests/test_llms_txt_detector.py @@ -0,0 +1,77 @@ +import pytest +from unittest.mock import patch, Mock +from cli.llms_txt_detector import LlmsTxtDetector + +def test_detect_llms_txt_variants(): + """Test detection of llms.txt file variants""" + detector = LlmsTxtDetector("https://hono.dev/docs") + + with patch('cli.llms_txt_detector.requests.head') as mock_head: + mock_response = Mock() + mock_response.status_code = 200 + mock_head.return_value = mock_response + + variants = detector.detect() + + assert variants is not None + assert variants['url'] == 'https://hono.dev/llms-full.txt' + assert variants['variant'] == 'full' + mock_head.assert_called() + +def test_detect_no_llms_txt(): + """Test detection when no llms.txt file exists""" + detector = LlmsTxtDetector("https://example.com/docs") + + with patch('cli.llms_txt_detector.requests.head') as mock_head: + mock_response = Mock() + mock_response.status_code = 404 + mock_head.return_value = mock_response + + variants = detector.detect() + + assert variants is None + assert mock_head.call_count == 3 # Should try all three variants + +def test_url_parsing_with_complex_paths(): + """Test URL parsing handles non-standard paths correctly""" + detector = LlmsTxtDetector("https://example.com/docs/v2/guide") + + with patch('cli.llms_txt_detector.requests.head') as mock_head: + mock_response = Mock() + mock_response.status_code = 200 + mock_head.return_value = mock_response + + variants = detector.detect() + + assert variants is not None + assert variants['url'] == 'https://example.com/llms-full.txt' + mock_head.assert_called_with( + 
'https://example.com/llms-full.txt', + timeout=5, + allow_redirects=True + ) + +def test_detect_all_variants(): + """Test detecting all llms.txt variants""" + detector = LlmsTxtDetector("https://hono.dev/docs") + + with patch('cli.llms_txt_detector.requests.head') as mock_head: + # Mock responses for different variants + def mock_response(url, **kwargs): + response = Mock() + # All 3 variants exist for Hono + if 'llms-full.txt' in url or 'llms.txt' in url or 'llms-small.txt' in url: + response.status_code = 200 + else: + response.status_code = 404 + return response + + mock_head.side_effect = mock_response + + variants = detector.detect_all() + + assert len(variants) == 3 + assert any(v['variant'] == 'full' for v in variants) + assert any(v['variant'] == 'standard' for v in variants) + assert any(v['variant'] == 'small' for v in variants) + assert all('url' in v for v in variants) diff --git a/tests/test_llms_txt_downloader.py b/tests/test_llms_txt_downloader.py new file mode 100644 index 0000000..3aaf48c --- /dev/null +++ b/tests/test_llms_txt_downloader.py @@ -0,0 +1,170 @@ +import pytest +from unittest.mock import patch, Mock +import requests +from cli.llms_txt_downloader import LlmsTxtDownloader + +def test_successful_download(): + """Test successful download with valid markdown content""" + downloader = LlmsTxtDownloader("https://example.com/llms.txt") + + mock_response = Mock() + mock_response.text = "# Header\n\nSome content with markdown patterns.\n\n## Subheader\n\n- List item\n- Another item\n\n```python\ncode_block()\n```\n" + "x" * 200 + mock_response.raise_for_status = Mock() + + with patch('requests.get', return_value=mock_response) as mock_get: + content = downloader.download() + + assert content is not None + assert len(content) > 100 + assert isinstance(content, str) + assert "# Header" in content + mock_get.assert_called_once() + +def test_timeout_with_retry(): + """Test timeout scenario with retry logic""" + downloader = 
LlmsTxtDownloader("https://example.com/llms.txt", max_retries=2) + + with patch('requests.get', side_effect=requests.Timeout("Connection timeout")) as mock_get: + with patch('time.sleep') as mock_sleep: # Mock sleep to speed up test + content = downloader.download() + + assert content is None + assert mock_get.call_count == 2 # Should retry once (2 total attempts) + assert mock_sleep.call_count == 1 # Should sleep once between retries + +def test_empty_content_rejection(): + """Test rejection of content shorter than 100 chars""" + downloader = LlmsTxtDownloader("https://example.com/llms.txt") + + mock_response = Mock() + mock_response.text = "# Short" + mock_response.raise_for_status = Mock() + + with patch('requests.get', return_value=mock_response): + content = downloader.download() + + assert content is None + +def test_non_markdown_rejection(): + """Test rejection of content that doesn't look like markdown""" + downloader = LlmsTxtDownloader("https://example.com/llms.txt") + + mock_response = Mock() + mock_response.text = "Plain text without any markdown patterns at all. 
" * 10 + mock_response.raise_for_status = Mock() + + with patch('requests.get', return_value=mock_response): + content = downloader.download() + + assert content is None + +def test_http_error_handling(): + """Test handling of HTTP errors (404, 500, etc.)""" + downloader = LlmsTxtDownloader("https://example.com/llms.txt", max_retries=2) + + mock_response = Mock() + mock_response.raise_for_status.side_effect = requests.HTTPError("404 Not Found") + + with patch('requests.get', return_value=mock_response) as mock_get: + with patch('time.sleep'): + content = downloader.download() + + assert content is None + assert mock_get.call_count == 2 # Should retry once + +def test_exponential_backoff(): + """Test that exponential backoff delays are correct""" + downloader = LlmsTxtDownloader("https://example.com/llms.txt", max_retries=3) + + with patch('requests.get', side_effect=requests.Timeout("Connection timeout")): + with patch('time.sleep') as mock_sleep: + content = downloader.download() + + assert content is None + # Should sleep with delays: 1s, 2s (2^0, 2^1) + assert mock_sleep.call_count == 2 + mock_sleep.assert_any_call(1) # First retry delay + mock_sleep.assert_any_call(2) # Second retry delay + +def test_markdown_validation(): + """Test markdown pattern detection""" + downloader = LlmsTxtDownloader("https://example.com/llms.txt") + + # Test various markdown patterns + assert downloader._is_markdown("# Header") + assert downloader._is_markdown("## Subheader") + assert downloader._is_markdown("```code```") + assert downloader._is_markdown("- list item") + assert downloader._is_markdown("* bullet point") + assert downloader._is_markdown("`inline code`") + + # Test non-markdown content + assert not downloader._is_markdown("Plain text without any markdown patterns") + +def test_custom_timeout(): + """Test custom timeout parameter""" + downloader = LlmsTxtDownloader("https://example.com/llms.txt", timeout=10) + + mock_response = Mock() + mock_response.text = "# 
Header\n\nContent " * 50 + mock_response.raise_for_status = Mock() + + with patch('requests.get', return_value=mock_response) as mock_get: + content = downloader.download() + + assert content is not None + # Verify timeout was passed to requests.get + call_kwargs = mock_get.call_args[1] + assert call_kwargs['timeout'] == 10 + +def test_custom_max_retries(): + """Test custom max_retries parameter""" + downloader = LlmsTxtDownloader("https://example.com/llms.txt", max_retries=5) + + with patch('requests.get', side_effect=requests.Timeout("Connection timeout")) as mock_get: + with patch('time.sleep'): + content = downloader.download() + + assert content is None + assert mock_get.call_count == 5 # Should attempt 5 times + +def test_user_agent_header(): + """Test that custom user agent is set""" + downloader = LlmsTxtDownloader("https://example.com/llms.txt") + + mock_response = Mock() + mock_response.text = "# Header\n\nContent " * 50 + mock_response.raise_for_status = Mock() + + with patch('requests.get', return_value=mock_response) as mock_get: + content = downloader.download() + + assert content is not None + # Verify custom user agent was passed + call_kwargs = mock_get.call_args[1] + assert call_kwargs['headers']['User-Agent'] == 'Skill-Seekers-llms.txt-Reader/1.0' + +def test_get_proper_filename(): + """Test filename conversion from .txt to .md""" + downloader = LlmsTxtDownloader("https://hono.dev/llms-full.txt") + + filename = downloader.get_proper_filename() + + assert filename == "llms-full.md" + assert not filename.endswith('.txt') + +def test_get_proper_filename_standard(): + """Test standard variant naming""" + downloader = LlmsTxtDownloader("https://hono.dev/llms.txt") + + filename = downloader.get_proper_filename() + + assert filename == "llms.md" + +def test_get_proper_filename_small(): + """Test small variant naming""" + downloader = LlmsTxtDownloader("https://hono.dev/llms-small.txt") + + filename = downloader.get_proper_filename() + + assert filename 
== "llms-small.md" diff --git a/tests/test_llms_txt_parser.py b/tests/test_llms_txt_parser.py new file mode 100644 index 0000000..8e8c7fa --- /dev/null +++ b/tests/test_llms_txt_parser.py @@ -0,0 +1,34 @@ +import pytest +from cli.llms_txt_parser import LlmsTxtParser + +def test_parse_markdown_sections(): + """Test parsing markdown into page sections""" + sample_content = """# Getting Started + +Welcome to the docs. + +## Installation + +Run: npm install + +## Usage + +Import the library: + +```javascript +import { app } from 'framework' +``` + +# API Reference + +Main API documentation here. +""" + + parser = LlmsTxtParser(sample_content) + pages = parser.parse() + + assert len(pages) >= 2 + assert pages[0]['title'] == 'Getting Started' + assert pages[1]['title'] == 'API Reference' + assert len(pages[0]['code_samples']) == 1 + assert pages[0]['code_samples'][0]['language'] == 'javascript'