feat(#191): Smart generation of skill descriptions

Implements hybrid smart extraction + improved fallback templates for
skill descriptions across all scrapers.

Changes:
- github_scraper.py:
  * Added extract_description_from_readme() helper
  * Extracts from README first paragraph (60 lines)
  * Updates description after README extraction
  * Fallback: "Use when working with {name}"
  * Updated 3 locations (GitHubScraper, GitHubToSkillConverter, main)

- doc_scraper.py:
  * Added infer_description_from_docs() helper
  * Extracts from meta tags or first paragraph (65 lines)
  * Tries: meta description, og:description, first content paragraph
  * Fallback: "Use when working with {name}"
  * Updated 2 locations (create_enhanced_skill_md, get_configuration)

- pdf_scraper.py:
  * Added infer_description_from_pdf() helper
  * Extracts from PDF metadata (subject, title)
  * Fallback: "Use when referencing {name} documentation"
  * Updated 3 locations (PDFToSkillConverter, main x2)

- generate_router.py:
  * Updated 2 locations with improved router descriptions
  * "Use when working with {name} development and programming"

All changes:
- Apply only to newly generated skills (existing skills are not modified)
- No API calls (free/offline)
- Smart extraction when metadata/README available
- Improved "Use when..." fallbacks instead of generic templates
- 612 tests passing (100%)

Fixes #191

Co-authored-by: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
yusyus
2025-12-28 19:00:26 +03:00
parent e32f2fd977
commit 74bae4b49f
4 changed files with 227 additions and 11 deletions

View File

@@ -68,6 +68,73 @@ def setup_logging(verbose: bool = False, quiet: bool = False) -> None:
)
def _clean_description_text(raw: str, min_length: int, max_length: int = 150) -> Optional[str]:
    """
    Normalize candidate description text extracted from a page.

    Collapses every run of whitespace (including newlines) into a single
    space so the result is always a single line, rejects text that is not
    longer than ``min_length`` characters, truncates to ``max_length``
    characters (ending in '...'), and lowercases the result.

    Args:
        raw: Text taken from a meta tag or paragraph
        min_length: Text must be strictly longer than this to be accepted
        max_length: Maximum length of the returned text

    Returns:
        Cleaned lowercase text, or None if the text is too short
    """
    # str.split() with no argument splits on any whitespace run and drops
    # leading/trailing whitespace, so this both strips and collapses.
    text = ' '.join(raw.split())
    if len(text) <= min_length:
        return None
    if len(text) > max_length:
        text = text[:max_length - 3] + '...'
    return text.lower()


def infer_description_from_docs(base_url: str, first_page_content: Optional[str] = None, name: str = '') -> str:
    """
    Infer skill description from documentation metadata or first page content.

    Tries multiple strategies, in order:
    1. Extract meta description tag from first page
    2. Extract OpenGraph (og:description) meta tag from first page
    3. Extract first meaningful paragraph from the main content area
    4. Fall back to improved template

    Extracted text is whitespace-normalized to a single line, limited to
    150 characters and lowercased before being embedded in the result.

    Args:
        base_url: Documentation base URL (only used for the fallback when
            ``name`` is empty)
        first_page_content: HTML content of first page (optional)
        name: Skill name

    Returns:
        Description string suitable for "Use when..." format
    """
    # If we have first page content, try to extract description
    if first_page_content:
        try:
            soup = BeautifulSoup(first_page_content, 'html.parser')

            # Strategies 1 and 2: <meta name="description">, then og:description
            for attrs in ({'name': 'description'}, {'property': 'og:description'}):
                meta_tag = soup.find('meta', attrs)
                if meta_tag and meta_tag.get('content'):
                    desc = _clean_description_text(meta_tag['content'], min_length=20)
                    if desc:
                        return f'Use when {desc}'

            # Strategy 3: Extract first meaningful paragraph from main content
            # Look for common documentation main content areas
            main_content = None
            for selector in ['article', 'main', 'div[role="main"]', 'div.content', 'div.doc-content']:
                main_content = soup.select_one(selector)
                if main_content:
                    break

            if main_content:
                skip_markers = ('table of contents', 'on this page', 'navigation')
                # Only the first few paragraphs are considered
                for p in main_content.find_all('p', limit=5):
                    text = _clean_description_text(p.get_text(), min_length=30)
                    # Skip empty, very short, or navigation-like paragraphs
                    if text and not any(marker in text for marker in skip_markers):
                        return f'Use when working with {text}'
        except Exception as e:
            # Best effort only: any parsing problem falls through to the template
            logger.debug("Could not infer description from page content: %s", e)

    # Improved fallback template
    if name:
        return f'Use when working with {name}'
    return f'Use when working with documentation at {urlparse(base_url).netloc}'
class DocToSkillConverter:
def __init__(self, config: Dict[str, Any], dry_run: bool = False, resume: bool = False) -> None:
self.config = config
@@ -999,7 +1066,17 @@ class DocToSkillConverter:
def create_enhanced_skill_md(self, categories: Dict[str, List[Dict[str, Any]]], quick_ref: List[Dict[str, str]]) -> None:
"""Create SKILL.md with actual examples (IMPROVED)"""
description = self.config.get('description', f'Comprehensive assistance with {self.name}')
# Try to infer description if not in config
if 'description' not in self.config:
# Get first page HTML content to infer description
first_page_html = None
for pages in categories.values():
if pages:
first_page_html = pages[0].get('raw_html', '')
break
description = infer_description_from_docs(self.base_url, first_page_html, self.name)
else:
description = self.config['description']
# Extract actual code examples from docs
example_codes = []
@@ -1024,7 +1101,7 @@ description: {description}
# {self.name.title()} Skill
Comprehensive assistance with {self.name} development, generated from official documentation.
{description.capitalize()}, generated from official documentation.
## When to Use This Skill
@@ -1511,7 +1588,7 @@ def get_configuration(args: argparse.Namespace) -> Dict[str, Any]:
else:
config = {
'name': args.name,
'description': args.description or f'Comprehensive assistance with {args.name}',
'description': args.description or f'Use when working with {args.name}',
'base_url': args.url,
'selectors': {
'main_content': "div[role='main']",