fix: add retries, markdown validation, and test mocking to downloader
- Implement retry logic with exponential backoff (default: 3 retries)
- Add markdown validation to check for markdown patterns
- Replace flaky HTTP tests with comprehensive mocking
- Add 10 test cases covering all scenarios:
  - Successful download
  - Timeout with retry
  - Empty content rejection (<100 chars)
  - Non-markdown rejection
  - HTTP error handling
  - Exponential backoff validation
  - Markdown pattern detection
  - Custom timeout parameter
  - Custom max_retries parameter
  - User agent header verification

All tests now pass reliably (10/10) without making real HTTP requests.
This commit is contained in:
@@ -1,43 +1,71 @@
|
||||
"""ABOUTME: Downloads llms.txt files from documentation URLs with error handling"""
|
||||
"""ABOUTME: Handles timeouts, retries, and validates content before returning"""
|
||||
"""ABOUTME: Downloads llms.txt files from documentation URLs with retry logic"""
|
||||
"""ABOUTME: Validates markdown content and handles timeouts with exponential backoff"""
|
||||
|
||||
import requests
|
||||
import time
|
||||
from typing import Optional
|
||||
|
||||
class LlmsTxtDownloader:
|
||||
"""Download llms.txt content from URLs"""
|
||||
"""Download llms.txt content from URLs with retry logic"""
|
||||
|
||||
def __init__(self, url: str, timeout: int = 30):
|
||||
def __init__(self, url: str, timeout: int = 30, max_retries: int = 3):
|
||||
self.url = url
|
||||
self.timeout = timeout
|
||||
self.max_retries = max_retries
|
||||
|
||||
def _is_markdown(self, content: str) -> bool:
|
||||
"""
|
||||
Check if content looks like markdown.
|
||||
|
||||
Returns:
|
||||
True if content contains markdown patterns
|
||||
"""
|
||||
markdown_patterns = ['# ', '## ', '```', '- ', '* ', '`']
|
||||
return any(pattern in content for pattern in markdown_patterns)
|
||||
|
||||
def download(self) -> Optional[str]:
|
||||
"""
|
||||
Download llms.txt content.
|
||||
Download llms.txt content with retry logic.
|
||||
|
||||
Returns:
|
||||
String content or None if download fails
|
||||
"""
|
||||
try:
|
||||
headers = {
|
||||
'User-Agent': 'Skill-Seekers-llms.txt-Reader/1.0'
|
||||
}
|
||||
headers = {
|
||||
'User-Agent': 'Skill-Seekers-llms.txt-Reader/1.0'
|
||||
}
|
||||
|
||||
response = requests.get(
|
||||
self.url,
|
||||
headers=headers,
|
||||
timeout=self.timeout
|
||||
)
|
||||
response.raise_for_status()
|
||||
for attempt in range(self.max_retries):
|
||||
try:
|
||||
response = requests.get(
|
||||
self.url,
|
||||
headers=headers,
|
||||
timeout=self.timeout
|
||||
)
|
||||
response.raise_for_status()
|
||||
|
||||
content = response.text
|
||||
content = response.text
|
||||
|
||||
# Validate content is not empty and looks like markdown
|
||||
if len(content) < 100:
|
||||
return None
|
||||
# Validate content is not empty
|
||||
if len(content) < 100:
|
||||
print(f"⚠️ Content too short ({len(content)} chars), rejecting")
|
||||
return None
|
||||
|
||||
return content
|
||||
# Validate content looks like markdown
|
||||
if not self._is_markdown(content):
|
||||
print(f"⚠️ Content doesn't look like markdown")
|
||||
return None
|
||||
|
||||
except requests.RequestException as e:
|
||||
print(f"❌ Failed to download {self.url}: {e}")
|
||||
return None
|
||||
return content
|
||||
|
||||
except requests.RequestException as e:
|
||||
if attempt < self.max_retries - 1:
|
||||
# Calculate exponential backoff delay: 1s, 2s, 4s, etc.
|
||||
delay = 2 ** attempt
|
||||
print(f"⚠️ Attempt {attempt + 1}/{self.max_retries} failed: {e}")
|
||||
print(f" Retrying in {delay}s...")
|
||||
time.sleep(delay)
|
||||
else:
|
||||
print(f"❌ Failed to download {self.url} after {self.max_retries} attempts: {e}")
|
||||
return None
|
||||
|
||||
return None
|
||||
|
||||
Reference in New Issue
Block a user