Two critical improvements from PR #243 code review: ## Fix 1: Empty List Edge Case Handling Added early return checks to prevent creating empty index files: **Files Modified:** - src/skill_seekers/cli/unified_skill_builder.py **Changes:** - _generate_docs_references: Skip if docs_list empty - _generate_github_references: Skip if github_list empty - _generate_pdf_references: Skip if pdf_list empty **Impact:** Prevents "Combined from 0 sources" index files which look odd. ## Fix 2: Enhanced Method Docstrings Added comprehensive parameter types and return value documentation: **Files Modified:** - src/skill_seekers/cli/llms_txt_parser.py - extract_urls: Added detailed examples and behavior notes - _clean_url: Added malformed URL pattern examples - src/skill_seekers/cli/doc_scraper.py - _extract_markdown_content: Full return dict structure documented - _extract_html_as_markdown: Extraction strategy and fallback behavior **Impact:** Improved developer experience with detailed API documentation. ## Testing All tests passing: - ✅ 32/32 PR #243 tests (markdown parsing + multi-source) - ✅ 975/975 core tests - 159 skipped (optional dependencies) - 4 failed (missing anthropic - expected) Co-authored-by: Code Review <claude-sonnet-4.5@anthropic.com>
153 lines
5.2 KiB
Python
153 lines
5.2 KiB
Python
"""ABOUTME: Parses llms.txt markdown content into structured page data"""
|
|
"""ABOUTME: Extracts titles, content, code samples, and headings from markdown"""
|
|
|
|
import re
from typing import Dict, List, Optional
from urllib.parse import urljoin
|
|
|
|
class LlmsTxtParser:
    """Parse llms.txt markdown content into page structures.

    The parser works on the raw markdown text passed to the constructor and
    offers two independent views of it:

    * ``extract_urls`` - every URL referenced by the document.
    * ``parse`` - one page dict per h1 section (title, plain-text content,
      code samples, h2/h3 headings).
    """

    # Inline markdown link: [text](target)
    _MD_LINK_RE = re.compile(r'\[([^\]]*)\]\(([^)]+)\)')
    # Bare http(s) URL, ending at whitespace or a closing delimiter
    _BARE_URL_RE = re.compile(r'https?://[^\s\)\]<>"\']+')
    # Triple-backtick span; group 1 is everything between the two fences
    _FENCE_RE = re.compile(r'```(.*?)```', re.DOTALL)
    # An h1 marker at the start of any line except the very first one
    _H1_BREAK_RE = re.compile(r'\n# ')
    # h2 / h3 heading line (marker and text must be on the same line)
    _SUBHEADING_RE = re.compile(r'^(#{2,3})[ \t]+(.+)$', re.MULTILINE)

    def __init__(self, content: str, base_url: Optional[str] = None):
        """Store the markdown to parse.

        Args:
            content: Raw llms.txt markdown text.
            base_url: Optional base used to resolve relative markdown links
                in ``extract_urls``. Relative links are skipped without it.
        """
        self.content = content
        self.base_url = base_url

    def extract_urls(self) -> List[str]:
        """Extract all URLs from the llms.txt content.

        Supports both markdown-style links ``[text](url)`` and bare URLs.
        Relative link targets are resolved against ``base_url`` when one was
        provided; pure anchor links (``#section``) are ignored. An optional
        markdown link title (``[text](url "title")``) is discarded. Every
        URL is passed through ``_clean_url`` to strip malformed anchors.

        Returns:
            List of unique, cleaned URLs in first-seen order (markdown links
            first, then bare URLs). Empty list if no valid URLs are found.
        """
        # A dict is used as an insertion-ordered set so the result is
        # deterministic across runs (a plain set is not, for strings).
        found: Dict[str, None] = {}

        for _, target in self._MD_LINK_RE.findall(self.content):
            # Keep only the destination; drop a trailing "title" if present.
            pieces = target.split(None, 1)
            if not pieces:
                continue
            destination = pieces[0]

            if destination.startswith(('http://', 'https://')):
                candidate = destination
            elif self.base_url and not destination.startswith('#'):
                candidate = urljoin(self.base_url, destination)
            else:
                continue

            cleaned = self._clean_url(candidate)
            if cleaned:
                found[cleaned] = None

        for raw in self._BARE_URL_RE.findall(self.content):
            # Sentence punctuation directly after a URL is not part of it.
            cleaned = self._clean_url(raw.rstrip('.,;:'))
            if cleaned:
                found[cleaned] = None

        return list(found)

    def _clean_url(self, url: str) -> str:
        """Clean a URL by removing a malformed anchor.

        An anchor that contains a path separator is malformed (such URLs
        return duplicate HTML content), so it is cut off entirely.

        Valid:   https://example.com/page.md#section
        Invalid: https://example.com/page#section/index.html.md

        Args:
            url: URL to clean (absolute or relative).

        Returns:
            The part before ``#`` if the anchor contains ``/``; otherwise
            the URL unchanged.

        Example:
            >>> parser._clean_url("https://ex.com/page#sec/path.md")
            'https://ex.com/page'
            >>> parser._clean_url("https://ex.com/page.md#section")
            'https://ex.com/page.md#section'
        """
        base, hash_sign, fragment = url.partition('#')
        if hash_sign and '/' in fragment:
            return base
        return url

    def parse(self) -> List[Dict]:
        """Parse markdown content into page structures.

        The content is split into sections at h1 headers (``# Title``).
        Lines that merely look like h1 headers inside fenced code blocks
        (for example shell or Python comments) do not start a new section.

        Returns:
            List of page dicts with title, content, code_samples, headings,
            url and links, one per non-blank section.
        """
        pages = []
        for section in self._split_h1_sections():
            if not section.strip():
                continue
            # First line is the title; the remainder is the section body.
            title_line, _, body = section.partition('\n')
            title = title_line.strip('#').strip()
            pages.append(self._parse_section(body, title))
        return pages

    def _split_h1_sections(self) -> List[str]:
        """Split the content at h1 markers that lie outside fenced code."""
        text = self.content
        fenced = [m.span() for m in self._FENCE_RE.finditer(text)]

        sections = []
        section_start = 0
        idx = 0  # index of the first fenced span that may still cover a marker
        for marker in self._H1_BREAK_RE.finditer(text):
            pos = marker.start()
            # Both sequences are ordered, so spans ending before this marker
            # can never matter again.
            while idx < len(fenced) and fenced[idx][1] <= pos:
                idx += 1
            if idx < len(fenced) and fenced[idx][0] <= pos:
                continue  # marker is inside a code block: not a heading
            sections.append(text[section_start:pos])
            section_start = marker.end()
        sections.append(text[section_start:])
        return sections

    def _parse_section(self, content: str, title: str) -> Dict:
        """Parse a single section into a page structure.

        Args:
            content: Section body (everything after the title line).
            title: Section title, also used to build the page ``url``.

        Returns:
            Dict with keys ``title``, ``content`` (paragraphs longer than
            20 characters, code removed, joined by blank lines),
            ``code_samples`` (dicts with ``code`` and ``language``),
            ``headings`` (dicts with ``level``, ``text`` and ``id`` for
            h2/h3 headings outside code blocks), ``url`` and ``links``
            (always an empty list here).
        """
        page = {
            'title': title,
            'content': '',
            'code_samples': [],
            'headings': [],
            'url': f'llms-txt#{title.lower().replace(" ", "-")}',
            'links': [],
        }

        # Fences are paired strictly in order, so an unusual info string
        # (``c++``, ``python title=x``) can no longer cause the text between
        # two blocks to be mistaken for code.
        for fence in self._FENCE_RE.finditer(content):
            info, newline, code = fence.group(1).partition('\n')
            if not newline:
                # Single-line ```span```: not a fenced block, no code sample.
                continue
            info_words = info.split()
            page['code_samples'].append({
                'code': code.strip(),
                'language': info_words[0] if info_words else 'unknown',
            })

        # Plain text view: everything except the code spans.
        prose = self._FENCE_RE.sub('', content)

        # Headings are read from the prose only, so '## ...' lines inside
        # code samples are not reported as headings.
        for markers, text in self._SUBHEADING_RE.findall(prose):
            label = text.strip()
            page['headings'].append({
                'level': f'h{len(markers)}',
                'text': label,
                'id': label.lower().replace(' ', '-'),
            })

        # Keep only blocks long enough to be real paragraphs.
        paragraphs = [
            block.strip()
            for block in prose.split('\n\n')
            if len(block.strip()) > 20
        ]
        page['content'] = '\n\n'.join(paragraphs)

        return page
|