Files
skill-seekers-reference/src/skill_seekers/cli/llms_txt_downloader.py
yusyus 9d43956b1d style: Run black formatter on 16 files
Applied black formatting to files modified in linting fixes:

Source files (8):
- config_extractor.py
- doc_scraper.py
- how_to_guide_builder.py
- llms_txt_downloader.py
- llms_txt_parser.py
- pattern_recognizer.py
- test_example_extractor.py
- unified_codebase_analyzer.py

Test files (8):
- test_architecture_scenarios.py
- test_async_scraping.py
- test_github_scraper.py
- test_guide_enhancer.py
- test_install_agent.py
- test_issue_219_e2e.py
- test_llms_txt_downloader.py
- test_skip_llms_txt.py

All formatting issues resolved.

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-01-17 23:56:24 +03:00

105 lines
3.3 KiB
Python

"""ABOUTME: Downloads llms.txt files from documentation URLs with retry logic"""
import time
import requests
class LlmsTxtDownloader:
"""Download llms.txt content from URLs with retry logic"""
def __init__(self, url: str, timeout: int = 30, max_retries: int = 3):
self.url = url
self.timeout = timeout
self.max_retries = max_retries
def get_proper_filename(self) -> str:
"""
Extract filename from URL and convert .txt to .md
Returns:
Proper filename with .md extension
Examples:
https://hono.dev/llms-full.txt -> llms-full.md
https://hono.dev/llms.txt -> llms.md
https://hono.dev/llms-small.txt -> llms-small.md
"""
# Extract filename from URL
from urllib.parse import urlparse
parsed = urlparse(self.url)
filename = parsed.path.split("/")[-1]
# Replace .txt with .md
if filename.endswith(".txt"):
filename = filename[:-4] + ".md"
return filename
def _is_markdown(self, content: str) -> bool:
"""
Check if content looks like markdown (not HTML).
Returns:
True if content contains markdown patterns and is NOT HTML
"""
# First, reject HTML content (common redirect trap)
content_start = content.strip()[:500].lower()
html_indicators = [
"<!doctype html",
"<html",
"<!doctype",
"<head>",
"<meta charset",
]
if any(indicator in content_start for indicator in html_indicators):
return False
# Then check for markdown patterns
markdown_patterns = ["# ", "## ", "```", "- ", "* ", "`"]
return any(pattern in content for pattern in markdown_patterns)
def download(self) -> str | None:
"""
Download llms.txt content with retry logic.
Returns:
String content or None if download fails
"""
headers = {"User-Agent": "Skill-Seekers-llms.txt-Reader/1.0"}
for attempt in range(self.max_retries):
try:
response = requests.get(self.url, headers=headers, timeout=self.timeout)
response.raise_for_status()
content = response.text
# Validate content is not empty
if len(content) < 100:
print(f"⚠️ Content too short ({len(content)} chars), rejecting")
return None
# Validate content looks like markdown
if not self._is_markdown(content):
print("⚠️ Content doesn't look like markdown")
return None
return content
except requests.RequestException as e:
if attempt < self.max_retries - 1:
# Calculate exponential backoff delay: 1s, 2s, 4s, etc.
delay = 2**attempt
print(f"⚠️ Attempt {attempt + 1}/{self.max_retries} failed: {e}")
print(f" Retrying in {delay}s...")
time.sleep(delay)
else:
print(
f"❌ Failed to download {self.url} after {self.max_retries} attempts: {e}"
)
return None
return None