fix: add retries, markdown validation, and test mocking to downloader

- Implement retry logic with exponential backoff (default: 3 retries)
- Add markdown validation that rejects content containing no markdown patterns
- Replace flaky HTTP tests with comprehensive mocking
- Add 10 test cases covering all scenarios:
  - Successful download
  - Timeout with retry
  - Empty or too-short content rejection (<100 chars)
  - Non-markdown rejection
  - HTTP error handling
  - Exponential backoff validation
  - Markdown pattern detection
  - Custom timeout parameter
  - Custom max_retries parameter
  - User agent header verification

All tests now pass reliably (10/10) without making real HTTP requests.
This commit is contained in:
Edgar I.
2025-10-24 13:29:21 +04:00
parent 3dd928b34b
commit e88a4b0fcc
2 changed files with 189 additions and 28 deletions

View File

@@ -1,43 +1,71 @@
"""ABOUTME: Downloads llms.txt files from documentation URLs with error handling"""
"""ABOUTME: Handles timeouts, retries, and validates content before returning"""
"""ABOUTME: Downloads llms.txt files from documentation URLs with retry logic"""
"""ABOUTME: Validates markdown content and handles timeouts with exponential backoff"""
import requests
import time
from typing import Optional
class LlmsTxtDownloader:
    """Download llms.txt content from URLs with retry logic.

    Failed requests are retried with exponential backoff. A response body is
    returned only when it is at least 100 characters long and contains at
    least one markdown-looking substring.
    """

    # Substrings whose presence makes content count as markdown.
    _MARKDOWN_PATTERNS = ('# ', '## ', '```', '- ', '* ', '`')
    # Bodies shorter than this many characters are rejected.
    _MIN_CONTENT_LENGTH = 100
    _USER_AGENT = 'Skill-Seekers-llms.txt-Reader/1.0'

    def __init__(self, url: str, timeout: int = 30, max_retries: int = 3):
        """
        Store the download settings.

        Args:
            url: Address of the llms.txt file to fetch.
            timeout: Value passed as ``timeout`` to each ``requests.get`` call.
            max_retries: Total number of request attempts made by ``download``.
        """
        self.url = url
        self.timeout = timeout
        self.max_retries = max_retries

    def _is_markdown(self, content: str) -> bool:
        """
        Check if content looks like markdown.

        This is a plain substring test, not a parser: a single occurrence of
        any pattern anywhere in the text is enough.

        Returns:
            True if content contains markdown patterns
        """
        return any(pattern in content for pattern in self._MARKDOWN_PATTERNS)

    def download(self) -> Optional[str]:
        """
        Download llms.txt content with retry logic.

        Each attempt that raises ``requests.RequestException`` is followed by
        a sleep of ``2 ** attempt`` seconds (1s, 2s, 4s, ...) before the next
        attempt; there is no sleep after the final attempt. Content that is
        fetched successfully but fails validation is rejected immediately
        and is not retried.

        Returns:
            String content, or None if every attempt fails, the content is
            shorter than 100 characters, or it contains no markdown patterns.
            If ``max_retries`` is less than 1 no request is made and None is
            returned.
        """
        headers = {
            'User-Agent': self._USER_AGENT
        }

        for attempt in range(self.max_retries):
            # Keep the try body limited to the calls that can raise
            # RequestException; validation happens after it.
            try:
                response = requests.get(
                    self.url,
                    headers=headers,
                    timeout=self.timeout
                )
                response.raise_for_status()
            except requests.RequestException as e:
                if attempt < self.max_retries - 1:
                    # Calculate exponential backoff delay: 1s, 2s, 4s, etc.
                    delay = 2 ** attempt
                    print(f"⚠️ Attempt {attempt + 1}/{self.max_retries} failed: {e}")
                    print(f" Retrying in {delay}s...")
                    time.sleep(delay)
                    continue
                print(f"❌ Failed to download {self.url} after {self.max_retries} attempts: {e}")
                return None

            content = response.text

            # Validate content is not empty
            if len(content) < self._MIN_CONTENT_LENGTH:
                print(f"⚠️ Content too short ({len(content)} chars), rejecting")
                return None

            # Validate content looks like markdown
            if not self._is_markdown(content):
                print("⚠️ Content doesn't look like markdown")
                return None

            return content

        # Only reached when max_retries < 1, so the loop body never ran.
        return None