def _condense_description(text: str, max_length: int = 150) -> str:
    """Collapse whitespace runs in *text* and truncate it to *max_length* characters."""
    condensed = ' '.join(text.split())
    if len(condensed) > max_length:
        condensed = condensed[:max_length - 3] + '...'
    return condensed


def infer_description_from_docs(base_url: str, first_page_content: Optional[str] = None, name: str = '') -> str:
    """
    Infer a skill description from the first documentation page.

    Strategies, tried in order:
    1. ``<meta name="description">`` content
    2. ``<meta property="og:description">`` content
    3. First meaningful ``<p>`` inside a common main-content container
    4. Fallback template built from ``name`` or the host of ``base_url``

    Extracted text is collapsed onto a single line (the result is written
    into a one-line ``description:`` field) and capped at 150 characters.

    Args:
        base_url: Documentation base URL
        first_page_content: HTML content of first page (optional)
        name: Skill name

    Returns:
        Description string in "Use when..." format
    """
    if first_page_content:
        try:
            soup = BeautifulSoup(first_page_content, 'html.parser')

            # Strategies 1 and 2: meta description, then OpenGraph description
            for attrs in ({'name': 'description'}, {'property': 'og:description'}):
                tag = soup.find('meta', attrs)
                content = tag.get('content') if tag else None
                if not content:
                    continue
                desc = _condense_description(content)
                if len(desc) > 20:  # Meaningful length
                    return f'Use when {desc.lower()}'

            # Strategy 3: first meaningful paragraph in a common main-content area
            main_content = None
            for selector in ('article', 'main', 'div[role="main"]', 'div.content', 'div.doc-content'):
                main_content = soup.select_one(selector)
                if main_content:
                    break

            if main_content:
                skip_markers = ('table of contents', 'on this page', 'navigation')
                for p in main_content.find_all('p', limit=5):
                    text = ' '.join(p.get_text().split())
                    # Skip empty, very short, or navigation-like paragraphs
                    if len(text) <= 30:
                        continue
                    lowered = text.lower()
                    if any(marker in lowered for marker in skip_markers):
                        continue
                    return f'Use when working with {_condense_description(text).lower()}'

        except Exception as e:
            # Best effort: any parsing problem falls through to the template below
            logger.debug("Could not infer description from page content: %s", e)

    if name:
        return f'Use when working with {name}'

    # urlparse() leaves netloc empty for scheme-less URLs such as
    # "docs.example.com/guide"; use the first path segment as the host then.
    host = urlparse(base_url).netloc or base_url.strip().split('/')[0]
    if host:
        return f'Use when working with documentation at {host}'
    return 'Use when working with this documentation'
{description} # {self.name.title()} Skill -Comprehensive assistance with {self.name} development, generated from official documentation. +{description.capitalize()}, generated from official documentation. ## When to Use This Skill @@ -1511,7 +1588,7 @@ def get_configuration(args: argparse.Namespace) -> Dict[str, Any]: else: config = { 'name': args.name, - 'description': args.description or f'Comprehensive assistance with {args.name}', + 'description': args.description or f'Use when working with {args.name}', 'base_url': args.url, 'selectors': { 'main_content': "div[role='main']", diff --git a/src/skill_seekers/cli/generate_router.py b/src/skill_seekers/cli/generate_router.py index 0d4ef84..e3f37b8 100644 --- a/src/skill_seekers/cli/generate_router.py +++ b/src/skill_seekers/cli/generate_router.py @@ -73,7 +73,7 @@ class RouterGenerator: ## When to Use This Skill -{self.base_config.get('description', f'Use for {self.router_name} development and programming.')} +{self.base_config.get('description', f'Use when working with {self.router_name} development and programming')} This is a router skill that directs your questions to specialized sub-skills for efficient, focused assistance. @@ -156,7 +156,7 @@ Simply ask your question and mention the topic. 
def extract_description_from_readme(readme_content: str, repo_name: str) -> str:
    """
    Extract a meaningful description from README content for skill description.

    Scans the README for the first line of real prose (longer than 20
    characters), skipping fenced code blocks, headings, images, badges,
    HTML, short link-only lines and decoration-only lines (setext heading
    underlines, horizontal rules). Markdown formatting is stripped, the
    text is reduced to its first sentence (when that is under 200
    characters), capped at 150 characters and lower-cased.

    Args:
        readme_content: README.md content
        repo_name: Repository name (e.g., 'facebook/react')

    Returns:
        "Use when ..." description, or "Use when working with <project>"
        when the README is empty or no usable paragraph is found
    """
    fallback = f'Use when working with {repo_name.split("/")[-1]}'
    if not readme_content:
        return fallback

    try:
        meaningful_paragraph = None
        in_code_block = False

        for line in readme_content.split('\n'):
            stripped = line.strip()

            # Track fenced code blocks; the fence line itself is never prose
            if stripped.startswith('```'):
                in_code_block = not in_code_block
                continue

            if in_code_block or not stripped:
                continue

            # Skip headings, images, HTML and badges
            if stripped.startswith(('#', '!', '<', '[![')):
                continue

            # Skip lines that are just links or badges
            if stripped.startswith('[') and '](' in stripped and len(stripped) < 100:
                continue

            # Skip decoration-only lines (e.g. "=====", "-----", "*****"):
            # they carry no words and would yield a meaningless description
            if not any(ch.isalnum() for ch in stripped):
                continue

            if len(stripped) > 20:  # Meaningful length
                meaningful_paragraph = stripped
                break

        if meaningful_paragraph:
            # Remove markdown formatting
            clean = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', meaningful_paragraph)  # Links
            clean = re.sub(r'[*_`]', '', clean)  # Bold, italic, code
            clean = re.sub(r'<[^>]+>', '', clean)  # HTML tags
            clean = clean.strip()  # Stripping markup can expose leading/trailing spaces

            # Prefer the first sentence when it is reasonably short
            if '. ' in clean:
                first_sentence = clean.split('. ')[0] + '.'
                if len(first_sentence) < 200:
                    clean = first_sentence

            if len(clean) > 150:
                clean = clean[:147] + '...'

            # Only format when something survived the cleanup; otherwise
            # fall through to the project-name fallback
            if clean:
                lowered = clean.lower()
                # Text that already opens with an action word reads fine after "Use when"
                action_words = ('build', 'create', 'develop', 'work', 'use', 'implement', 'manage')
                if lowered.startswith(action_words):
                    return f'Use when {lowered}'
                return f'Use when working with {lowered}'

    except Exception as e:
        # Best effort: unexpected input falls back to the template
        logger.debug("Could not extract description from README: %s", e)

    return fallback
def infer_description_from_pdf(pdf_metadata: dict = None, name: str = '') -> str:
    """
    Infer skill description from PDF metadata.

    Tries, in order:
    1. The ``subject`` field, when longer than 20 characters
    2. The ``title`` field, when longer than 10 characters and not a
       ``.pdf`` file name (checked case-insensitively)
    3. A fallback template built from ``name``

    Metadata text is collapsed onto a single line and capped at 150
    characters before being lower-cased into the description.

    Args:
        pdf_metadata: PDF metadata dictionary with title, subject, etc.
        name: Skill name for fallback

    Returns:
        Description string in "Use when..." format
    """
    def _tidy(value) -> str:
        # Collapse whitespace runs (metadata may contain line breaks) and cap the length
        text = ' '.join(str(value).split())
        if len(text) > 150:
            text = text[:147] + '...'
        return text

    if pdf_metadata:
        # Subject field often contains a description
        subject = pdf_metadata.get('subject')
        if subject:
            desc = _tidy(subject)
            if len(desc) > 20:
                return f'Use when {desc.lower()}'

        # Title field, unless it is just the file name
        title = pdf_metadata.get('title')
        if title:
            title_text = _tidy(title)
            if len(title_text) > 10 and not title_text.lower().endswith('.pdf'):
                return f'Use when working with {title_text.lower()}'

    if name:
        return f'Use when referencing {name} documentation'
    return 'Use when referencing this documentation'