fix: add retries, markdown validation, and test mocking to downloader
- Implement retry logic with exponential backoff (default: 3 retries) - Add markdown validation to check for markdown patterns - Replace flaky HTTP tests with comprehensive mocking - Add 10 test cases covering all scenarios: - Successful download - Timeout with retry - Empty content rejection (<100 chars) - Non-markdown rejection - HTTP error handling - Exponential backoff validation - Markdown pattern detection - Custom timeout parameter - Custom max_retries parameter - User agent header verification All tests now pass reliably (10/10) without making real HTTP requests.
This commit is contained in:
@@ -1,43 +1,71 @@
|
|||||||
"""ABOUTME: Downloads llms.txt files from documentation URLs with error handling"""
|
"""ABOUTME: Downloads llms.txt files from documentation URLs with retry logic"""
|
||||||
"""ABOUTME: Handles timeouts, retries, and validates content before returning"""
|
"""ABOUTME: Validates markdown content and handles timeouts with exponential backoff"""
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
|
import time
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
class LlmsTxtDownloader:
|
class LlmsTxtDownloader:
|
||||||
"""Download llms.txt content from URLs"""
|
"""Download llms.txt content from URLs with retry logic"""
|
||||||
|
|
||||||
def __init__(self, url: str, timeout: int = 30):
|
def __init__(self, url: str, timeout: int = 30, max_retries: int = 3):
|
||||||
self.url = url
|
self.url = url
|
||||||
self.timeout = timeout
|
self.timeout = timeout
|
||||||
|
self.max_retries = max_retries
|
||||||
|
|
||||||
|
def _is_markdown(self, content: str) -> bool:
|
||||||
|
"""
|
||||||
|
Check if content looks like markdown.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
True if content contains markdown patterns
|
||||||
|
"""
|
||||||
|
markdown_patterns = ['# ', '## ', '```', '- ', '* ', '`']
|
||||||
|
return any(pattern in content for pattern in markdown_patterns)
|
||||||
|
|
||||||
def download(self) -> Optional[str]:
|
def download(self) -> Optional[str]:
|
||||||
"""
|
"""
|
||||||
Download llms.txt content.
|
Download llms.txt content with retry logic.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
String content or None if download fails
|
String content or None if download fails
|
||||||
"""
|
"""
|
||||||
try:
|
headers = {
|
||||||
headers = {
|
'User-Agent': 'Skill-Seekers-llms.txt-Reader/1.0'
|
||||||
'User-Agent': 'Skill-Seekers-llms.txt-Reader/1.0'
|
}
|
||||||
}
|
|
||||||
|
|
||||||
response = requests.get(
|
for attempt in range(self.max_retries):
|
||||||
self.url,
|
try:
|
||||||
headers=headers,
|
response = requests.get(
|
||||||
timeout=self.timeout
|
self.url,
|
||||||
)
|
headers=headers,
|
||||||
response.raise_for_status()
|
timeout=self.timeout
|
||||||
|
)
|
||||||
|
response.raise_for_status()
|
||||||
|
|
||||||
content = response.text
|
content = response.text
|
||||||
|
|
||||||
# Validate content is not empty and looks like markdown
|
# Validate content is not empty
|
||||||
if len(content) < 100:
|
if len(content) < 100:
|
||||||
return None
|
print(f"⚠️ Content too short ({len(content)} chars), rejecting")
|
||||||
|
return None
|
||||||
|
|
||||||
return content
|
# Validate content looks like markdown
|
||||||
|
if not self._is_markdown(content):
|
||||||
|
print(f"⚠️ Content doesn't look like markdown")
|
||||||
|
return None
|
||||||
|
|
||||||
except requests.RequestException as e:
|
return content
|
||||||
print(f"❌ Failed to download {self.url}: {e}")
|
|
||||||
return None
|
except requests.RequestException as e:
|
||||||
|
if attempt < self.max_retries - 1:
|
||||||
|
# Calculate exponential backoff delay: 1s, 2s, 4s, etc.
|
||||||
|
delay = 2 ** attempt
|
||||||
|
print(f"⚠️ Attempt {attempt + 1}/{self.max_retries} failed: {e}")
|
||||||
|
print(f" Retrying in {delay}s...")
|
||||||
|
time.sleep(delay)
|
||||||
|
else:
|
||||||
|
print(f"❌ Failed to download {self.url} after {self.max_retries} attempts: {e}")
|
||||||
|
return None
|
||||||
|
|
||||||
|
return None
|
||||||
|
|||||||
@@ -1,12 +1,145 @@
|
|||||||
import pytest
|
import pytest
|
||||||
|
from unittest.mock import patch, Mock
|
||||||
|
import requests
|
||||||
from cli.llms_txt_downloader import LlmsTxtDownloader
|
from cli.llms_txt_downloader import LlmsTxtDownloader
|
||||||
|
|
||||||
def test_download_llms_txt():
|
def test_successful_download():
|
||||||
"""Test downloading llms.txt content"""
|
"""Test successful download with valid markdown content"""
|
||||||
downloader = LlmsTxtDownloader("https://hono.dev/llms-full.txt")
|
downloader = LlmsTxtDownloader("https://example.com/llms.txt")
|
||||||
|
|
||||||
content = downloader.download()
|
mock_response = Mock()
|
||||||
|
mock_response.text = "# Header\n\nSome content with markdown patterns.\n\n## Subheader\n\n- List item\n- Another item\n\n```python\ncode_block()\n```\n" + "x" * 200
|
||||||
|
mock_response.raise_for_status = Mock()
|
||||||
|
|
||||||
|
with patch('requests.get', return_value=mock_response) as mock_get:
|
||||||
|
content = downloader.download()
|
||||||
|
|
||||||
assert content is not None
|
assert content is not None
|
||||||
assert len(content) > 100 # Should have substantial content
|
assert len(content) > 100
|
||||||
assert isinstance(content, str)
|
assert isinstance(content, str)
|
||||||
|
assert "# Header" in content
|
||||||
|
mock_get.assert_called_once()
|
||||||
|
|
||||||
|
def test_timeout_with_retry():
|
||||||
|
"""Test timeout scenario with retry logic"""
|
||||||
|
downloader = LlmsTxtDownloader("https://example.com/llms.txt", max_retries=2)
|
||||||
|
|
||||||
|
with patch('requests.get', side_effect=requests.Timeout("Connection timeout")) as mock_get:
|
||||||
|
with patch('time.sleep') as mock_sleep: # Mock sleep to speed up test
|
||||||
|
content = downloader.download()
|
||||||
|
|
||||||
|
assert content is None
|
||||||
|
assert mock_get.call_count == 2 # Should retry once (2 total attempts)
|
||||||
|
assert mock_sleep.call_count == 1 # Should sleep once between retries
|
||||||
|
|
||||||
|
def test_empty_content_rejection():
|
||||||
|
"""Test rejection of content shorter than 100 chars"""
|
||||||
|
downloader = LlmsTxtDownloader("https://example.com/llms.txt")
|
||||||
|
|
||||||
|
mock_response = Mock()
|
||||||
|
mock_response.text = "# Short"
|
||||||
|
mock_response.raise_for_status = Mock()
|
||||||
|
|
||||||
|
with patch('requests.get', return_value=mock_response):
|
||||||
|
content = downloader.download()
|
||||||
|
|
||||||
|
assert content is None
|
||||||
|
|
||||||
|
def test_non_markdown_rejection():
|
||||||
|
"""Test rejection of content that doesn't look like markdown"""
|
||||||
|
downloader = LlmsTxtDownloader("https://example.com/llms.txt")
|
||||||
|
|
||||||
|
mock_response = Mock()
|
||||||
|
mock_response.text = "Plain text without any markdown patterns at all. " * 10
|
||||||
|
mock_response.raise_for_status = Mock()
|
||||||
|
|
||||||
|
with patch('requests.get', return_value=mock_response):
|
||||||
|
content = downloader.download()
|
||||||
|
|
||||||
|
assert content is None
|
||||||
|
|
||||||
|
def test_http_error_handling():
|
||||||
|
"""Test handling of HTTP errors (404, 500, etc.)"""
|
||||||
|
downloader = LlmsTxtDownloader("https://example.com/llms.txt", max_retries=2)
|
||||||
|
|
||||||
|
mock_response = Mock()
|
||||||
|
mock_response.raise_for_status.side_effect = requests.HTTPError("404 Not Found")
|
||||||
|
|
||||||
|
with patch('requests.get', return_value=mock_response) as mock_get:
|
||||||
|
with patch('time.sleep'):
|
||||||
|
content = downloader.download()
|
||||||
|
|
||||||
|
assert content is None
|
||||||
|
assert mock_get.call_count == 2 # Should retry once
|
||||||
|
|
||||||
|
def test_exponential_backoff():
|
||||||
|
"""Test that exponential backoff delays are correct"""
|
||||||
|
downloader = LlmsTxtDownloader("https://example.com/llms.txt", max_retries=3)
|
||||||
|
|
||||||
|
with patch('requests.get', side_effect=requests.Timeout("Connection timeout")):
|
||||||
|
with patch('time.sleep') as mock_sleep:
|
||||||
|
content = downloader.download()
|
||||||
|
|
||||||
|
assert content is None
|
||||||
|
# Should sleep with delays: 1s, 2s (2^0, 2^1)
|
||||||
|
assert mock_sleep.call_count == 2
|
||||||
|
mock_sleep.assert_any_call(1) # First retry delay
|
||||||
|
mock_sleep.assert_any_call(2) # Second retry delay
|
||||||
|
|
||||||
|
def test_markdown_validation():
|
||||||
|
"""Test markdown pattern detection"""
|
||||||
|
downloader = LlmsTxtDownloader("https://example.com/llms.txt")
|
||||||
|
|
||||||
|
# Test various markdown patterns
|
||||||
|
assert downloader._is_markdown("# Header")
|
||||||
|
assert downloader._is_markdown("## Subheader")
|
||||||
|
assert downloader._is_markdown("```code```")
|
||||||
|
assert downloader._is_markdown("- list item")
|
||||||
|
assert downloader._is_markdown("* bullet point")
|
||||||
|
assert downloader._is_markdown("`inline code`")
|
||||||
|
|
||||||
|
# Test non-markdown content
|
||||||
|
assert not downloader._is_markdown("Plain text without any markdown patterns")
|
||||||
|
|
||||||
|
def test_custom_timeout():
|
||||||
|
"""Test custom timeout parameter"""
|
||||||
|
downloader = LlmsTxtDownloader("https://example.com/llms.txt", timeout=10)
|
||||||
|
|
||||||
|
mock_response = Mock()
|
||||||
|
mock_response.text = "# Header\n\nContent " * 50
|
||||||
|
mock_response.raise_for_status = Mock()
|
||||||
|
|
||||||
|
with patch('requests.get', return_value=mock_response) as mock_get:
|
||||||
|
content = downloader.download()
|
||||||
|
|
||||||
|
assert content is not None
|
||||||
|
# Verify timeout was passed to requests.get
|
||||||
|
call_kwargs = mock_get.call_args[1]
|
||||||
|
assert call_kwargs['timeout'] == 10
|
||||||
|
|
||||||
|
def test_custom_max_retries():
|
||||||
|
"""Test custom max_retries parameter"""
|
||||||
|
downloader = LlmsTxtDownloader("https://example.com/llms.txt", max_retries=5)
|
||||||
|
|
||||||
|
with patch('requests.get', side_effect=requests.Timeout("Connection timeout")) as mock_get:
|
||||||
|
with patch('time.sleep'):
|
||||||
|
content = downloader.download()
|
||||||
|
|
||||||
|
assert content is None
|
||||||
|
assert mock_get.call_count == 5 # Should attempt 5 times
|
||||||
|
|
||||||
|
def test_user_agent_header():
|
||||||
|
"""Test that custom user agent is set"""
|
||||||
|
downloader = LlmsTxtDownloader("https://example.com/llms.txt")
|
||||||
|
|
||||||
|
mock_response = Mock()
|
||||||
|
mock_response.text = "# Header\n\nContent " * 50
|
||||||
|
mock_response.raise_for_status = Mock()
|
||||||
|
|
||||||
|
with patch('requests.get', return_value=mock_response) as mock_get:
|
||||||
|
content = downloader.download()
|
||||||
|
|
||||||
|
assert content is not None
|
||||||
|
# Verify custom user agent was passed
|
||||||
|
call_kwargs = mock_get.call_args[1]
|
||||||
|
assert call_kwargs['headers']['User-Agent'] == 'Skill-Seekers-llms.txt-Reader/1.0'
|
||||||
|
|||||||
Reference in New Issue
Block a user