feat(#191): Smart description generation for skill descriptions

Implements hybrid smart extraction + improved fallback templates for skill descriptions across all scrapers.

Changes:

- github_scraper.py:
  * Added extract_description_from_readme() helper
  * Extracts from README first paragraph (60 lines)
  * Updates description after README extraction
  * Fallback: "Use when working with {name}"
  * Updated 3 locations (GitHubScraper, GitHubToSkillConverter, main)

- doc_scraper.py:
  * Added infer_description_from_docs() helper
  * Extracts from meta tags or first paragraph (65 lines)
  * Tries: meta description, og:description, first content paragraph
  * Fallback: "Use when working with {name}"
  * Updated 2 locations (create_enhanced_skill_md, get_configuration)

- pdf_scraper.py:
  * Added infer_description_from_pdf() helper
  * Extracts from PDF metadata (subject, title)
  * Fallback: "Use when referencing {name} documentation"
  * Updated 3 locations (PDFToSkillConverter, main x2)

- generate_router.py:
  * Updated 2 locations with improved router descriptions
  * "Use when working with {name} development and programming"

All changes:
- Only apply to NEW skill generations (don't modify existing)
- No API calls (free/offline)
- Smart extraction when metadata/README available
- Improved "Use when..." fallbacks instead of generic templates
- 612 tests passing (100%)

Fixes #191

Co-authored-by: Claude Sonnet 4.5 <noreply@anthropic.com>
2025-12-28 19:00:26 +03:00
parent e32f2fd977
commit 74bae4b49f
4 changed files with 227 additions and 11 deletions
--- a/src/skill_seekers/cli/pdf_scraper.py
+++ b/src/skill_seekers/cli/pdf_scraper.py
@@ -22,6 +22,41 @@ from pathlib import Path
 from .pdf_extractor_poc import PDFExtractor


+def infer_description_from_pdf(pdf_metadata: dict = None, name: str = '') -> str:
+    """
+    Infer skill description from PDF metadata or document properties.
+
+    Tries to extract meaningful description from:
+    1. PDF metadata fields (subject first, then title)
+    2. Falls back to improved template
+
+    Args:
+        pdf_metadata: PDF metadata dictionary with title, subject, etc.
+        name: Skill name for fallback
+
+    Returns:
+        Description string suitable for "Use when..." format
+    """
+    if pdf_metadata:
+        # Try to use subject field (often contains description)
+        if 'subject' in pdf_metadata and pdf_metadata['subject']:
+            desc = str(pdf_metadata['subject']).strip()
+            if len(desc) > 20:
+                if len(desc) > 150:
+                    desc = desc[:147] + '...'
+                return f'Use when {desc.lower()}'
+
+        # Try title field if meaningful
+        if 'title' in pdf_metadata and pdf_metadata['title']:
+            title = str(pdf_metadata['title']).strip()
+            # Skip if it's just the filename
+            if len(title) > 10 and not title.endswith('.pdf'):
+                return f'Use when working with {title.lower()}'
+
+    # Improved fallback
+    return f'Use when referencing {name} documentation' if name else 'Use when referencing this documentation'
+
+
 class PDFToSkillConverter:
    """Convert PDF documentation to Claude skill"""

@@ -29,7 +64,8 @@ class PDFToSkillConverter:
        self.config = config
        self.name = config['name']
        self.pdf_path = config.get('pdf_path', '')
-        self.description = config.get('description', f'Documentation skill for {self.name}')
+        # Set initial description (will be improved after extraction if metadata available)
+        self.description = config.get('description', f'Use when referencing {self.name} documentation')

        # Paths
        self.skill_dir = f"output/{self.name}"
@@ -363,7 +399,7 @@ def main():
        name = Path(args.from_json).stem.replace('_extracted', '')
        config = {
            'name': name,
-            'description': args.description or f'Documentation skill for {name}'
+            'description': args.description or f'Use when referencing {name} documentation'
        }
        converter = PDFToSkillConverter(config)
        converter.load_extracted_data(args.from_json)
@@ -376,7 +412,7 @@ def main():
        config = {
            'name': args.name,
            'pdf_path': args.pdf,
-            'description': args.description or f'Documentation skill for {args.name}',
+            'description': args.description or f'Use when referencing {args.name} documentation',
            'extract_options': {
                'chunk_size': 10,
                'min_quality': 5.0,