run ruff
This commit is contained in:
@@ -1,10 +1,11 @@
|
||||
"""ABOUTME: Parses llms.txt markdown content into structured page data"""
|
||||
|
||||
"""ABOUTME: Extracts titles, content, code samples, and headings from markdown"""
|
||||
|
||||
import re
|
||||
from typing import List, Dict
|
||||
from urllib.parse import urljoin
|
||||
|
||||
|
||||
class LlmsTxtParser:
|
||||
"""Parse llms.txt markdown content into page structures"""
|
||||
|
||||
@@ -12,7 +13,7 @@ class LlmsTxtParser:
|
||||
self.content = content
|
||||
self.base_url = base_url
|
||||
|
||||
def extract_urls(self) -> List[str]:
|
||||
def extract_urls(self) -> list[str]:
|
||||
"""
|
||||
Extract all URLs from the llms.txt content.
|
||||
|
||||
@@ -33,13 +34,13 @@ class LlmsTxtParser:
|
||||
urls = set()
|
||||
|
||||
# Match markdown links: [text](url)
|
||||
md_links = re.findall(r'\[([^\]]*)\]\(([^)]+)\)', self.content)
|
||||
md_links = re.findall(r"\[([^\]]*)\]\(([^)]+)\)", self.content)
|
||||
for _, url in md_links:
|
||||
if url.startswith('http'):
|
||||
if url.startswith("http"):
|
||||
clean_url = self._clean_url(url)
|
||||
if clean_url:
|
||||
urls.add(clean_url)
|
||||
elif self.base_url and not url.startswith('#'):
|
||||
elif self.base_url and not url.startswith("#"):
|
||||
clean_url = self._clean_url(urljoin(self.base_url, url))
|
||||
if clean_url:
|
||||
urls.add(clean_url)
|
||||
@@ -48,7 +49,7 @@ class LlmsTxtParser:
|
||||
bare_urls = re.findall(r'https?://[^\s\)\]<>"\']+', self.content)
|
||||
for url in bare_urls:
|
||||
# Clean trailing punctuation
|
||||
url = url.rstrip('.,;:')
|
||||
url = url.rstrip(".,;:")
|
||||
clean_url = self._clean_url(url)
|
||||
if clean_url:
|
||||
urls.add(clean_url)
|
||||
@@ -79,16 +80,16 @@ class LlmsTxtParser:
|
||||
"""
|
||||
# Skip URLs with path after anchor (e.g., #section/index.html.md)
|
||||
# These are malformed and return duplicate HTML content
|
||||
if '#' in url:
|
||||
anchor_pos = url.index('#')
|
||||
after_anchor = url[anchor_pos + 1:]
|
||||
if "#" in url:
|
||||
anchor_pos = url.index("#")
|
||||
after_anchor = url[anchor_pos + 1 :]
|
||||
# If there's a path separator after anchor, it's invalid
|
||||
if '/' in after_anchor:
|
||||
if "/" in after_anchor:
|
||||
# Extract the base URL without the malformed anchor
|
||||
return url[:anchor_pos]
|
||||
return url
|
||||
|
||||
def parse(self) -> List[Dict]:
|
||||
def parse(self) -> list[dict]:
|
||||
"""
|
||||
Parse markdown content into page structures.
|
||||
|
||||
@@ -98,55 +99,50 @@ class LlmsTxtParser:
|
||||
pages = []
|
||||
|
||||
# Split by h1 headers (# Title)
|
||||
sections = re.split(r'\n# ', self.content)
|
||||
sections = re.split(r"\n# ", self.content)
|
||||
|
||||
for section in sections:
|
||||
if not section.strip():
|
||||
continue
|
||||
|
||||
# First line is title
|
||||
lines = section.split('\n')
|
||||
title = lines[0].strip('#').strip()
|
||||
lines = section.split("\n")
|
||||
title = lines[0].strip("#").strip()
|
||||
|
||||
# Parse content
|
||||
page = self._parse_section('\n'.join(lines[1:]), title)
|
||||
page = self._parse_section("\n".join(lines[1:]), title)
|
||||
pages.append(page)
|
||||
|
||||
return pages
|
||||
|
||||
def _parse_section(self, content: str, title: str) -> Dict:
|
||||
def _parse_section(self, content: str, title: str) -> dict:
|
||||
"""Parse a single section into page structure"""
|
||||
page = {
|
||||
'title': title,
|
||||
'content': '',
|
||||
'code_samples': [],
|
||||
'headings': [],
|
||||
'url': f'llms-txt#{title.lower().replace(" ", "-")}',
|
||||
'links': []
|
||||
"title": title,
|
||||
"content": "",
|
||||
"code_samples": [],
|
||||
"headings": [],
|
||||
"url": f"llms-txt#{title.lower().replace(' ', '-')}",
|
||||
"links": [],
|
||||
}
|
||||
|
||||
# Extract code blocks
|
||||
code_blocks = re.findall(r'```(\w+)?\n(.*?)```', content, re.DOTALL)
|
||||
code_blocks = re.findall(r"```(\w+)?\n(.*?)```", content, re.DOTALL)
|
||||
for lang, code in code_blocks:
|
||||
page['code_samples'].append({
|
||||
'code': code.strip(),
|
||||
'language': lang or 'unknown'
|
||||
})
|
||||
page["code_samples"].append({"code": code.strip(), "language": lang or "unknown"})
|
||||
|
||||
# Extract h2/h3 headings
|
||||
headings = re.findall(r'^(#{2,3})\s+(.+)$', content, re.MULTILINE)
|
||||
headings = re.findall(r"^(#{2,3})\s+(.+)$", content, re.MULTILINE)
|
||||
for level_markers, text in headings:
|
||||
page['headings'].append({
|
||||
'level': f'h{len(level_markers)}',
|
||||
'text': text.strip(),
|
||||
'id': text.lower().replace(' ', '-')
|
||||
})
|
||||
page["headings"].append(
|
||||
{"level": f"h{len(level_markers)}", "text": text.strip(), "id": text.lower().replace(" ", "-")}
|
||||
)
|
||||
|
||||
# Remove code blocks from content for plain text
|
||||
content_no_code = re.sub(r'```.*?```', '', content, flags=re.DOTALL)
|
||||
content_no_code = re.sub(r"```.*?```", "", content, flags=re.DOTALL)
|
||||
|
||||
# Extract paragraphs
|
||||
paragraphs = [p.strip() for p in content_no_code.split('\n\n') if len(p.strip()) > 20]
|
||||
page['content'] = '\n\n'.join(paragraphs)
|
||||
paragraphs = [p.strip() for p in content_no_code.split("\n\n") if len(p.strip()) > 20]
|
||||
page["content"] = "\n\n".join(paragraphs)
|
||||
|
||||
return page
|
||||
|
||||
Reference in New Issue
Block a user