feat(#191): Smart description generation for skill descriptions

Implements hybrid smart extraction + improved fallback templates for skill descriptions across all scrapers. Changes: - github_scraper.py: * Added extract_description_from_readme() helper * Extracts from README first paragraph (60 lines) * Updates description after README extraction * Fallback: "Use when working with {name}" * Updated 3 locations (GitHubScraper, GitHubToSkillConverter, main) - doc_scraper.py: * Added infer_description_from_docs() helper * Extracts from meta tags or first paragraph (65 lines) * Tries: meta description, og:description, first content paragraph * Fallback: "Use when working with {name}" * Updated 2 locations (create_enhanced_skill_md, get_configuration) - pdf_scraper.py: * Added infer_description_from_pdf() helper * Extracts from PDF metadata (subject, title) * Fallback: "Use when referencing {name} documentation" * Updated 3 locations (PDFToSkillConverter, main x2) - generate_router.py: * Updated 2 locations with improved router descriptions * "Use when working with {name} development and programming" All changes: - Only apply to NEW skill generations (don't modify existing) - No API calls (free/offline) - Smart extraction when metadata/README available - Improved "Use when..." fallbacks instead of generic templates - 612 tests passing (100%) Fixes #191 Co-authored-by: Claude Sonnet 4.5 <noreply@anthropic.com>
2025-12-28 19:00:26 +03:00
parent e32f2fd977
commit 74bae4b49f
4 changed files with 227 additions and 11 deletions
--- a/src/skill_seekers/cli/doc_scraper.py
+++ b/src/skill_seekers/cli/doc_scraper.py
@@ -68,6 +68,73 @@ def setup_logging(verbose: bool = False, quiet: bool = False) -> None:
    )


+def infer_description_from_docs(base_url: str, first_page_content: Optional[str] = None, name: str = '') -> str:
+    """
+    Infer skill description from documentation metadata or first page content.
+
+    Tries multiple strategies:
+    1. Extract meta description tag from first page
+    2. Extract first meaningful paragraph from content
+    3. Fall back to improved template
+
+    Args:
+        base_url: Documentation base URL
+        first_page_content: HTML content of first page (optional)
+        name: Skill name
+
+    Returns:
+        Description string suitable for "Use when..." format
+    """
+    # If we have first page content, try to extract description
+    if first_page_content:
+        try:
+            soup = BeautifulSoup(first_page_content, 'html.parser')
+
+            # Strategy 1: Try meta description tag
+            meta_desc = soup.find('meta', {'name': 'description'})
+            if meta_desc and meta_desc.get('content'):
+                desc = meta_desc['content'].strip()
+                if len(desc) > 20:  # Meaningful length
+                    # Clean and format
+                    if len(desc) > 150:
+                        desc = desc[:147] + '...'
+                    return f'Use when {desc.lower()}'
+
+            # Strategy 2: Try OpenGraph description
+            og_desc = soup.find('meta', {'property': 'og:description'})
+            if og_desc and og_desc.get('content'):
+                desc = og_desc['content'].strip()
+                if len(desc) > 20:
+                    if len(desc) > 150:
+                        desc = desc[:147] + '...'
+                    return f'Use when {desc.lower()}'
+
+            # Strategy 3: Extract first meaningful paragraph from main content
+            # Look for common documentation main content areas
+            main_content = None
+            for selector in ['article', 'main', 'div[role="main"]', 'div.content', 'div.doc-content']:
+                main_content = soup.select_one(selector)
+                if main_content:
+                    break
+
+            if main_content:
+                # Find first paragraph
+                for p in main_content.find_all('p', limit=5):
+                    text = p.get_text().strip()
+                    # Skip empty, very short, or navigation-like paragraphs
+                    if len(text) > 30 and not any(skip in text.lower() for skip in ['table of contents', 'on this page', 'navigation']):
+                        # Clean and format
+                        if len(text) > 150:
+                            text = text[:147] + '...'
+                        return f'Use when working with {text.lower()}'
+
+        except Exception as e:
+            logger.debug(f"Could not infer description from page content: {e}")
+
+    # Improved fallback template
+    return f'Use when working with {name}' if name else f'Use when working with documentation at {urlparse(base_url).netloc}'
+
+
 class DocToSkillConverter:
    def __init__(self, config: Dict[str, Any], dry_run: bool = False, resume: bool = False) -> None:
        self.config = config
@@ -999,7 +1066,17 @@ class DocToSkillConverter:
    
    def create_enhanced_skill_md(self, categories: Dict[str, List[Dict[str, Any]]], quick_ref: List[Dict[str, str]]) -> None:
        """Create SKILL.md with actual examples (IMPROVED)"""
-        description = self.config.get('description', f'Comprehensive assistance with {self.name}')
+        # Try to infer description if not in config
+        if 'description' not in self.config:
+            # Get first page HTML content to infer description
+            first_page_html = None
+            for pages in categories.values():
+                if pages:
+                    first_page_html = pages[0].get('raw_html', '')
+                    break
+            description = infer_description_from_docs(self.base_url, first_page_html, self.name)
+        else:
+            description = self.config['description']
        
        # Extract actual code examples from docs
        example_codes = []
@@ -1024,7 +1101,7 @@ description: {description}

 # {self.name.title()} Skill

-Comprehensive assistance with {self.name} development, generated from official documentation.
+{description.capitalize()}, generated from official documentation.

 ## When to Use This Skill

@@ -1511,7 +1588,7 @@ def get_configuration(args: argparse.Namespace) -> Dict[str, Any]:
    else:
        config = {
            'name': args.name,
-            'description': args.description or f'Comprehensive assistance with {args.name}',
+            'description': args.description or f'Use when working with {args.name}',
            'base_url': args.url,
            'selectors': {
                'main_content': "div[role='main']",