Files
skill-seekers-reference/src/skill_seekers/cli/llms_txt_parser.py
yusyus b25a6f7f53 fix: centralize bracket-encoding to prevent 'Invalid IPv6 URL' on all code paths (#284)
The original fix (741daf1) only patched LlmsTxtParser._clean_url(),
which covers URLs extracted directly from llms.txt content. But URLs
discovered from .md files during BFS crawl (_extract_markdown_content)
and from HTML pages (extract_content) bypass _clean_url() entirely.
When those pages contain links with square brackets (e.g.
/api/[v1]/users), httpx raises 'Invalid IPv6 URL' on fetch.

Fix: add a shared sanitize_url() utility in cli/utils.py that
percent-encodes [ and ] in path/query components, and apply it at
every URL ingestion point:

- _enqueue_url(): main chokepoint — all discovered URLs pass through
- scrape_page(): safety net for start_urls that skip _enqueue_url
- scrape_page_async(): same for async mode
- dry-run sync/async paths: direct fetches that also bypass _enqueue_url

LlmsTxtParser._clean_url() now delegates bracket-encoding to the
shared sanitize_url() (DRY), keeping only its malformed-anchor
stripping logic.

Added 16 tests: sanitize_url unit tests, _clean_url bracket tests,
_enqueue_url sanitization tests, and integration test verifying
markdown content with bracket URLs is handled safely.

Fixes #284
2026-03-14 23:53:47 +03:00

161 lines
5.6 KiB
Python

"""ABOUTME: Parses llms.txt markdown content into structured page data"""
import re
from urllib.parse import urljoin
from skill_seekers.cli.utils import sanitize_url
class LlmsTxtParser:
    """Parse llms.txt markdown content into page structures"""

    def __init__(self, content: str, base_url: str = None):
        # Raw llms.txt markdown text to parse.
        self.content = content
        # Base used to resolve relative links; None disables resolution.
        self.base_url = base_url

    def extract_urls(self) -> list[str]:
        """
        Collect every URL referenced by the llms.txt content.

        Handles both markdown-style links [text](url) and bare
        http(s):// URLs. Relative markdown targets are resolved against
        base_url when one was supplied; pure-anchor links (#section) are
        ignored. Every candidate passes through _clean_url before being
        kept.

        Returns:
            Unique cleaned URLs (order unspecified); empty list when
            nothing valid is found.
        """
        collected: set[str] = set()

        def _keep(candidate: str) -> None:
            # Central funnel: only cleaned, non-empty URLs survive.
            cleaned = self._clean_url(candidate)
            if cleaned:
                collected.add(cleaned)

        # Markdown links: [text](url)
        for _text, target in re.findall(r"\[([^\]]*)\]\(([^)]+)\)", self.content):
            if target.startswith("http"):
                _keep(target)
            elif self.base_url and not target.startswith("#"):
                # Relative path: resolve against the configured base.
                _keep(urljoin(self.base_url, target))

        # Bare URLs anywhere in the text; trim trailing punctuation.
        for raw in re.findall(r'https?://[^\s\)\]<>"\']+', self.content):
            _keep(raw.rstrip(".,;:"))

        return list(collected)

    def _clean_url(self, url: str) -> str:
        """
        Normalize a URL: strip malformed anchors, encode brackets.

        An anchor containing a path separator (e.g.
        page#section/index.html.md) is malformed — such links resolve to
        duplicate HTML content — so everything from '#' onward is
        dropped. The result is then run through the shared sanitize_url
        helper, which percent-encodes '[' and ']' so httpx does not
        misread them as an IPv6 literal (fixes #284).

        Args:
            url: Absolute or relative URL to normalize.

        Returns:
            The sanitized URL, with the anchor removed when malformed.

        Example:
            >>> parser._clean_url("https://ex.com/page#sec/path.md")
            "https://ex.com/page"
            >>> parser._clean_url("https://ex.com/page.md#section")
            "https://ex.com/page.md#section"
            >>> parser._clean_url("https://ex.com/api/[v1]/users")
            "https://ex.com/api/%5Bv1%5D/users"
        """
        hash_at = url.find("#")
        # A '/' after the anchor marks a malformed link; keep only the
        # portion before '#'.
        if hash_at != -1 and "/" in url[hash_at + 1:]:
            url = url[:hash_at]
        # Shared bracket-encoding (#284), applied on every exit path.
        return sanitize_url(url)

    def parse(self) -> list[dict]:
        """
        Turn the markdown content into a list of page structures.

        The document is split at level-1 headings (a newline followed by
        '# '); the first line of each chunk becomes the page title and
        the remainder is handed to _parse_section.

        Returns:
            One page dict per non-empty section (title, content,
            code_samples, headings, url, links).
        """
        pages: list[dict] = []
        for chunk in re.split(r"\n# ", self.content):
            if not chunk.strip():
                continue
            # First line carries the title; the rest is the body.
            first_line, _, body = chunk.partition("\n")
            pages.append(self._parse_section(body, first_line.strip("#").strip()))
        return pages

    def _parse_section(self, content: str, title: str) -> dict:
        """
        Build a single page dict from one markdown section.

        Extracts fenced code blocks (with optional language tag), h2/h3
        headings, and plain-text paragraphs longer than 20 characters
        (code fences are removed before paragraph extraction).

        Args:
            content: Section body, without the leading h1 line.
            title: Section title taken from the h1 line.

        Returns:
            Page dict with title, content, code_samples, headings, a
            synthetic slug url, and an (empty) links list.
        """
        page: dict = {
            "title": title,
            "content": "",
            "code_samples": [],
            "headings": [],
            # Synthetic URL: slugified title under the llms-txt pseudo-scheme.
            "url": f"llms-txt#{title.lower().replace(' ', '-')}",
            "links": [],
        }

        # Fenced code blocks, keeping the (optional) language tag.
        for lang, snippet in re.findall(r"```(\w+)?\n(.*?)```", content, re.DOTALL):
            page["code_samples"].append(
                {"code": snippet.strip(), "language": lang if lang else "unknown"}
            )

        # h2/h3 headings -> level/text/id records.
        for markers, heading in re.findall(r"^(#{2,3})\s+(.+)$", content, re.MULTILINE):
            page["headings"].append(
                {
                    "level": f"h{len(markers)}",
                    "text": heading.strip(),
                    "id": heading.lower().replace(" ", "-"),
                }
            )

        # Drop code fences, then keep only paragraphs of meaningful length.
        prose = re.sub(r"```.*?```", "", content, flags=re.DOTALL)
        page["content"] = "\n\n".join(
            part.strip() for part in prose.split("\n\n") if len(part.strip()) > 20
        )
        return page