From e88a4b0fccd98dc2154b971770e047cd184fbe67 Mon Sep 17 00:00:00 2001 From: "Edgar I." Date: Fri, 24 Oct 2025 13:29:21 +0400 Subject: [PATCH] fix: add retries, markdown validation, and test mocking to downloader - Implement retry logic with exponential backoff (default: 3 retries) - Add markdown validation to check for markdown patterns - Replace flaky HTTP tests with comprehensive mocking - Add 10 test cases covering all scenarios: - Successful download - Timeout with retry - Empty content rejection (<100 chars) - Non-markdown rejection - HTTP error handling - Exponential backoff validation - Markdown pattern detection - Custom timeout parameter - Custom max_retries parameter - User agent header verification All tests now pass reliably (10/10) without making real HTTP requests. --- cli/llms_txt_downloader.py | 74 +++++++++++----- tests/test_llms_txt_downloader.py | 143 ++++++++++++++++++++++++++++-- 2 files changed, 189 insertions(+), 28 deletions(-) diff --git a/cli/llms_txt_downloader.py b/cli/llms_txt_downloader.py index 1ce60ca..d618149 100644 --- a/cli/llms_txt_downloader.py +++ b/cli/llms_txt_downloader.py @@ -1,43 +1,71 @@ -"""ABOUTME: Downloads llms.txt files from documentation URLs with error handling""" -"""ABOUTME: Handles timeouts, retries, and validates content before returning""" +"""ABOUTME: Downloads llms.txt files from documentation URLs with retry logic""" +"""ABOUTME: Validates markdown content and handles timeouts with exponential backoff""" import requests +import time from typing import Optional class LlmsTxtDownloader: - """Download llms.txt content from URLs""" + """Download llms.txt content from URLs with retry logic""" - def __init__(self, url: str, timeout: int = 30): + def __init__(self, url: str, timeout: int = 30, max_retries: int = 3): self.url = url self.timeout = timeout + self.max_retries = max_retries + + def _is_markdown(self, content: str) -> bool: + """ + Check if content looks like markdown. + + Returns: + True if content contains markdown patterns + """ + markdown_patterns = ['# ', '## ', '```', '- ', '* ', '`'] + return any(pattern in content for pattern in markdown_patterns) def download(self) -> Optional[str]: """ - Download llms.txt content. + Download llms.txt content with retry logic. Returns: String content or None if download fails """ - try: - headers = { - 'User-Agent': 'Skill-Seekers-llms.txt-Reader/1.0' - } + headers = { + 'User-Agent': 'Skill-Seekers-llms.txt-Reader/1.0' + } - response = requests.get( - self.url, - headers=headers, - timeout=self.timeout - ) - response.raise_for_status() + for attempt in range(self.max_retries): + try: + response = requests.get( + self.url, + headers=headers, + timeout=self.timeout + ) + response.raise_for_status() - content = response.text + content = response.text - # Validate content is not empty and looks like markdown - if len(content) < 100: - return None + # Validate content is not empty + if len(content) < 100: + print(f"⚠️ Content too short ({len(content)} chars), rejecting") + return None - return content + # Validate content looks like markdown + if not self._is_markdown(content): + print(f"⚠️ Content doesn't look like markdown") + return None - except requests.RequestException as e: - print(f"❌ Failed to download {self.url}: {e}") - return None + return content + + except requests.RequestException as e: + if attempt < self.max_retries - 1: + # Calculate exponential backoff delay: 1s, 2s, 4s, etc. + delay = 2 ** attempt + print(f"⚠️ Attempt {attempt + 1}/{self.max_retries} failed: {e}") + print(f" Retrying in {delay}s...") + time.sleep(delay) + else: + print(f"❌ Failed to download {self.url} after {self.max_retries} attempts: {e}") + return None + + return None diff --git a/tests/test_llms_txt_downloader.py b/tests/test_llms_txt_downloader.py index a902b57..c1b15b4 100644 --- a/tests/test_llms_txt_downloader.py +++ b/tests/test_llms_txt_downloader.py @@ -1,12 +1,145 @@ import pytest +from unittest.mock import patch, Mock +import requests from cli.llms_txt_downloader import LlmsTxtDownloader -def test_download_llms_txt(): - """Test downloading llms.txt content""" - downloader = LlmsTxtDownloader("https://hono.dev/llms-full.txt") +def test_successful_download(): + """Test successful download with valid markdown content""" + downloader = LlmsTxtDownloader("https://example.com/llms.txt") - content = downloader.download() + mock_response = Mock() + mock_response.text = "# Header\n\nSome content with markdown patterns.\n\n## Subheader\n\n- List item\n- Another item\n\n```python\ncode_block()\n```\n" + "x" * 200 + mock_response.raise_for_status = Mock() + + with patch('requests.get', return_value=mock_response) as mock_get: + content = downloader.download() assert content is not None - assert len(content) > 100 # Should have substantial content + assert len(content) > 100 assert isinstance(content, str) + assert "# Header" in content + mock_get.assert_called_once() + +def test_timeout_with_retry(): + """Test timeout scenario with retry logic""" + downloader = LlmsTxtDownloader("https://example.com/llms.txt", max_retries=2) + + with patch('requests.get', side_effect=requests.Timeout("Connection timeout")) as mock_get: + with patch('time.sleep') as mock_sleep: # Mock sleep to speed up test + content = downloader.download() + + assert content is None + assert mock_get.call_count == 2 # Should retry once (2 total attempts) + assert mock_sleep.call_count == 1 # Should sleep once between retries + +def test_empty_content_rejection(): + """Test rejection of content shorter than 100 chars""" + downloader = LlmsTxtDownloader("https://example.com/llms.txt") + + mock_response = Mock() + mock_response.text = "# Short" + mock_response.raise_for_status = Mock() + + with patch('requests.get', return_value=mock_response): + content = downloader.download() + + assert content is None + +def test_non_markdown_rejection(): + """Test rejection of content that doesn't look like markdown""" + downloader = LlmsTxtDownloader("https://example.com/llms.txt") + + mock_response = Mock() + mock_response.text = "Plain text without any markdown patterns at all. " * 10 + mock_response.raise_for_status = Mock() + + with patch('requests.get', return_value=mock_response): + content = downloader.download() + + assert content is None + +def test_http_error_handling(): + """Test handling of HTTP errors (404, 500, etc.)""" + downloader = LlmsTxtDownloader("https://example.com/llms.txt", max_retries=2) + + mock_response = Mock() + mock_response.raise_for_status.side_effect = requests.HTTPError("404 Not Found") + + with patch('requests.get', return_value=mock_response) as mock_get: + with patch('time.sleep'): + content = downloader.download() + + assert content is None + assert mock_get.call_count == 2 # Should retry once + +def test_exponential_backoff(): + """Test that exponential backoff delays are correct""" + downloader = LlmsTxtDownloader("https://example.com/llms.txt", max_retries=3) + + with patch('requests.get', side_effect=requests.Timeout("Connection timeout")): + with patch('time.sleep') as mock_sleep: + content = downloader.download() + + assert content is None + # Should sleep with delays: 1s, 2s (2^0, 2^1) + assert mock_sleep.call_count == 2 + mock_sleep.assert_any_call(1) # First retry delay + mock_sleep.assert_any_call(2) # Second retry delay + +def test_markdown_validation(): + """Test markdown pattern detection""" + downloader = LlmsTxtDownloader("https://example.com/llms.txt") + + # Test various markdown patterns + assert downloader._is_markdown("# Header") + assert downloader._is_markdown("## Subheader") + assert downloader._is_markdown("```code```") + assert downloader._is_markdown("- list item") + assert downloader._is_markdown("* bullet point") + assert downloader._is_markdown("`inline code`") + + # Test non-markdown content + assert not downloader._is_markdown("Plain text without any markdown patterns") + +def test_custom_timeout(): + """Test custom timeout parameter""" + downloader = LlmsTxtDownloader("https://example.com/llms.txt", timeout=10) + + mock_response = Mock() + mock_response.text = "# Header\n\nContent " * 50 + mock_response.raise_for_status = Mock() + + with patch('requests.get', return_value=mock_response) as mock_get: + content = downloader.download() + + assert content is not None + # Verify timeout was passed to requests.get + call_kwargs = mock_get.call_args[1] + assert call_kwargs['timeout'] == 10 + +def test_custom_max_retries(): + """Test custom max_retries parameter""" + downloader = LlmsTxtDownloader("https://example.com/llms.txt", max_retries=5) + + with patch('requests.get', side_effect=requests.Timeout("Connection timeout")) as mock_get: + with patch('time.sleep'): + content = downloader.download() + + assert content is None + assert mock_get.call_count == 5 # Should attempt 5 times + +def test_user_agent_header(): + """Test that custom user agent is set""" + downloader = LlmsTxtDownloader("https://example.com/llms.txt") + + mock_response = Mock() + mock_response.text = "# Header\n\nContent " * 50 + mock_response.raise_for_status = Mock() + + with patch('requests.get', return_value=mock_response) as mock_get: + content = downloader.download() + + assert content is not None + # Verify custom user agent was passed + call_kwargs = mock_get.call_args[1] + assert call_kwargs['headers']['User-Agent'] == 'Skill-Seekers-llms.txt-Reader/1.0'