Applied black formatting to files modified in linting fixes: Source files (8): - config_extractor.py - doc_scraper.py - how_to_guide_builder.py - llms_txt_downloader.py - llms_txt_parser.py - pattern_recognizer.py - test_example_extractor.py - unified_codebase_analyzer.py Test files (8): - test_architecture_scenarios.py - test_async_scraping.py - test_github_scraper.py - test_guide_enhancer.py - test_install_agent.py - test_issue_219_e2e.py - test_llms_txt_downloader.py - test_skip_llms_txt.py All formatting issues resolved. Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
105 lines
3.3 KiB
Python
105 lines
3.3 KiB
Python
"""ABOUTME: Downloads llms.txt files from documentation URLs with retry logic"""
|
|
|
|
import time
|
|
|
|
import requests
|
|
|
|
|
|
class LlmsTxtDownloader:
|
|
"""Download llms.txt content from URLs with retry logic"""
|
|
|
|
def __init__(self, url: str, timeout: int = 30, max_retries: int = 3):
|
|
self.url = url
|
|
self.timeout = timeout
|
|
self.max_retries = max_retries
|
|
|
|
def get_proper_filename(self) -> str:
|
|
"""
|
|
Extract filename from URL and convert .txt to .md
|
|
|
|
Returns:
|
|
Proper filename with .md extension
|
|
|
|
Examples:
|
|
https://hono.dev/llms-full.txt -> llms-full.md
|
|
https://hono.dev/llms.txt -> llms.md
|
|
https://hono.dev/llms-small.txt -> llms-small.md
|
|
"""
|
|
# Extract filename from URL
|
|
from urllib.parse import urlparse
|
|
|
|
parsed = urlparse(self.url)
|
|
filename = parsed.path.split("/")[-1]
|
|
|
|
# Replace .txt with .md
|
|
if filename.endswith(".txt"):
|
|
filename = filename[:-4] + ".md"
|
|
|
|
return filename
|
|
|
|
def _is_markdown(self, content: str) -> bool:
|
|
"""
|
|
Check if content looks like markdown (not HTML).
|
|
|
|
Returns:
|
|
True if content contains markdown patterns and is NOT HTML
|
|
"""
|
|
# First, reject HTML content (common redirect trap)
|
|
content_start = content.strip()[:500].lower()
|
|
html_indicators = [
|
|
"<!doctype html",
|
|
"<html",
|
|
"<!doctype",
|
|
"<head>",
|
|
"<meta charset",
|
|
]
|
|
if any(indicator in content_start for indicator in html_indicators):
|
|
return False
|
|
|
|
# Then check for markdown patterns
|
|
markdown_patterns = ["# ", "## ", "```", "- ", "* ", "`"]
|
|
return any(pattern in content for pattern in markdown_patterns)
|
|
|
|
def download(self) -> str | None:
|
|
"""
|
|
Download llms.txt content with retry logic.
|
|
|
|
Returns:
|
|
String content or None if download fails
|
|
"""
|
|
headers = {"User-Agent": "Skill-Seekers-llms.txt-Reader/1.0"}
|
|
|
|
for attempt in range(self.max_retries):
|
|
try:
|
|
response = requests.get(self.url, headers=headers, timeout=self.timeout)
|
|
response.raise_for_status()
|
|
|
|
content = response.text
|
|
|
|
# Validate content is not empty
|
|
if len(content) < 100:
|
|
print(f"⚠️ Content too short ({len(content)} chars), rejecting")
|
|
return None
|
|
|
|
# Validate content looks like markdown
|
|
if not self._is_markdown(content):
|
|
print("⚠️ Content doesn't look like markdown")
|
|
return None
|
|
|
|
return content
|
|
|
|
except requests.RequestException as e:
|
|
if attempt < self.max_retries - 1:
|
|
# Calculate exponential backoff delay: 1s, 2s, 4s, etc.
|
|
delay = 2**attempt
|
|
print(f"⚠️ Attempt {attempt + 1}/{self.max_retries} failed: {e}")
|
|
print(f" Retrying in {delay}s...")
|
|
time.sleep(delay)
|
|
else:
|
|
print(
|
|
f"❌ Failed to download {self.url} after {self.max_retries} attempts: {e}"
|
|
)
|
|
return None
|
|
|
|
return None
|