fix: Add empty list checks and enhance docstrings (PR #243 review fixes)
Two critical improvements from PR #243 code review:

## Fix 1: Empty List Edge Case Handling

Added early return checks to prevent creating empty index files:

**Files Modified:**
- src/skill_seekers/cli/unified_skill_builder.py

**Changes:**
- _generate_docs_references: Skip if docs_list empty
- _generate_github_references: Skip if github_list empty
- _generate_pdf_references: Skip if pdf_list empty

**Impact:** Prevents "Combined from 0 sources" index files, which look odd.

## Fix 2: Enhanced Method Docstrings

Added comprehensive parameter types and return value documentation:

**Files Modified:**
- src/skill_seekers/cli/llms_txt_parser.py
  - extract_urls: Added detailed examples and behavior notes
  - _clean_url: Added malformed URL pattern examples
- src/skill_seekers/cli/doc_scraper.py
  - _extract_markdown_content: Full return dict structure documented
  - _extract_html_as_markdown: Extraction strategy and fallback behavior

**Impact:** Improved developer experience with detailed API documentation.

## Testing

All tests passing:
- ✅ 32/32 PR #243 tests (markdown parsing + multi-source)
- ✅ 975/975 core tests
- 159 skipped (optional dependencies)
- 4 failed (missing anthropic — expected)

Co-authored-by: Code Review <claude-sonnet-4.5@anthropic.com>
This commit is contained in:
@@ -350,14 +350,34 @@ class DocToSkillConverter:
|
||||
return page
|
||||
|
||||
def _extract_markdown_content(self, content: str, url: str) -> Dict[str, Any]:
|
||||
"""Extract content from a Markdown file.
|
||||
"""Extract structured content from a Markdown file.
|
||||
|
||||
Parses markdown files from llms.txt URLs to extract:
|
||||
- Title from first h1 heading
|
||||
- Headings (h2-h6, excluding h1)
|
||||
- Code blocks with language detection
|
||||
- Internal .md links for BFS crawling
|
||||
- Content paragraphs (>20 chars)
|
||||
|
||||
Auto-detects HTML content and falls back to _extract_html_as_markdown.
|
||||
|
||||
Args:
|
||||
content: Raw markdown content (or HTML if server returned HTML)
|
||||
url: Source URL
|
||||
content: Raw markdown content string (or HTML if server returned HTML)
|
||||
url: Source URL for resolving relative links
|
||||
|
||||
Returns:
|
||||
Page dict with title, content, code_samples, headings, links
|
||||
Dict with keys:
|
||||
- url: str - Source URL
|
||||
- title: str - Extracted from first # heading
|
||||
- content: str - Paragraphs joined with double newlines
|
||||
- headings: List[Dict] - {'level': 'h2', 'text': str, 'id': str}
|
||||
- code_samples: List[Dict] - {'code': str, 'language': str}
|
||||
- links: List[str] - Absolute URLs to other .md files
|
||||
- patterns: List - Empty (reserved for future use)
|
||||
|
||||
Note:
|
||||
Only .md links are extracted to avoid client-side rendered HTML pages.
|
||||
Anchor fragments (#section) are stripped from links.
|
||||
"""
|
||||
import re
|
||||
|
||||
@@ -434,12 +454,34 @@ class DocToSkillConverter:
|
||||
def _extract_html_as_markdown(self, html_content: str, url: str) -> Dict[str, Any]:
|
||||
"""Extract content from HTML and convert to markdown-like structure.
|
||||
|
||||
Fallback method when .md URL returns HTML content instead of markdown.
|
||||
Uses BeautifulSoup to extract structured data from HTML elements.
|
||||
|
||||
Extraction strategy:
|
||||
1. Title from <title> tag
|
||||
2. Main content from <main>, <article>, [role="main"], or <body>
|
||||
3. Headings (h1-h6) with text and id attributes
|
||||
4. Code blocks from <pre><code> or <pre> tags
|
||||
5. Text content from paragraphs
|
||||
|
||||
Args:
|
||||
html_content: Raw HTML content
|
||||
url: Source URL
|
||||
html_content: Raw HTML content string
|
||||
url: Source URL (for reference in result dict)
|
||||
|
||||
Returns:
|
||||
Page dict with title, content, code_samples, headings, links
|
||||
Dict with keys:
|
||||
- url: str - Source URL
|
||||
- title: str - From <title> tag, cleaned
|
||||
- content: str - Text content from main area
|
||||
- headings: List[Dict] - {'level': 'h2', 'text': str, 'id': str}
|
||||
- code_samples: List[Dict] - {'code': str, 'language': str}
|
||||
- links: List - Empty (HTML links not extracted to avoid client-side routes)
|
||||
- patterns: List - Empty (reserved for future use)
|
||||
|
||||
Note:
|
||||
Prefers <main> or <article> tags for content area.
|
||||
Falls back to <body> if no semantic content container found.
|
||||
Language detection uses detect_language() method.
|
||||
"""
|
||||
page = {
|
||||
'url': url,
|
||||
|
||||
@@ -16,8 +16,19 @@ class LlmsTxtParser:
|
||||
"""
|
||||
Extract all URLs from the llms.txt content.
|
||||
|
||||
Supports both markdown-style links [text](url) and bare URLs.
|
||||
Resolves relative URLs using base_url if provided.
|
||||
Filters out malformed URLs with invalid anchor patterns.
|
||||
|
||||
Returns:
|
||||
List of unique URLs found in the content
|
||||
List of unique, cleaned URLs found in the content.
|
||||
Returns empty list if no valid URLs found.
|
||||
|
||||
Note:
|
||||
- Markdown links: [Getting Started](./docs/guide.md)
|
||||
- Bare URLs: https://example.com/api.md
|
||||
- Relative paths resolved with base_url
|
||||
- Invalid anchors (#section/path.md) are stripped
|
||||
"""
|
||||
urls = set()
|
||||
|
||||
@@ -48,11 +59,23 @@ class LlmsTxtParser:
|
||||
"""
|
||||
Clean and validate URL, removing invalid anchor patterns.
|
||||
|
||||
Detects and strips malformed anchors that contain path separators.
|
||||
Valid: https://example.com/page.md#section
|
||||
Invalid: https://example.com/page#section/index.html.md
|
||||
|
||||
Args:
|
||||
url: URL to clean
|
||||
url: URL to clean (absolute or relative)
|
||||
|
||||
Returns:
|
||||
Cleaned URL or empty string if invalid
|
||||
Cleaned URL with malformed anchors stripped.
|
||||
Returns base URL if anchor contains '/' (malformed).
|
||||
Returns original URL if anchor is valid or no anchor present.
|
||||
|
||||
Example:
|
||||
>>> parser._clean_url("https://ex.com/page#sec/path.md")
|
||||
"https://ex.com/page"
|
||||
>>> parser._clean_url("https://ex.com/page.md#section")
|
||||
"https://ex.com/page.md#section"
|
||||
"""
|
||||
# Skip URLs with path after anchor (e.g., #section/index.html.md)
|
||||
# These are malformed and return duplicate HTML content
|
||||
|
||||
@@ -287,6 +287,10 @@ This skill combines knowledge from multiple sources:
|
||||
|
||||
def _generate_docs_references(self, docs_list: List[Dict]):
|
||||
"""Generate references from multiple documentation sources."""
|
||||
# Skip if no documentation sources
|
||||
if not docs_list:
|
||||
return
|
||||
|
||||
docs_dir = os.path.join(self.skill_dir, 'references', 'documentation')
|
||||
os.makedirs(docs_dir, exist_ok=True)
|
||||
|
||||
@@ -347,6 +351,10 @@ This skill combines knowledge from multiple sources:
|
||||
|
||||
def _generate_github_references(self, github_list: List[Dict]):
|
||||
"""Generate references from multiple GitHub sources."""
|
||||
# Skip if no GitHub sources
|
||||
if not github_list:
|
||||
return
|
||||
|
||||
github_dir = os.path.join(self.skill_dir, 'references', 'github')
|
||||
os.makedirs(github_dir, exist_ok=True)
|
||||
|
||||
@@ -429,6 +437,10 @@ This skill combines knowledge from multiple sources:
|
||||
|
||||
def _generate_pdf_references(self, pdf_list: List[Dict]):
|
||||
"""Generate references from PDF sources."""
|
||||
# Skip if no PDF sources
|
||||
if not pdf_list:
|
||||
return
|
||||
|
||||
pdf_dir = os.path.join(self.skill_dir, 'references', 'pdf')
|
||||
os.makedirs(pdf_dir, exist_ok=True)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user