From e88a4b0fccd98dc2154b971770e047cd184fbe67 Mon Sep 17 00:00:00 2001
From: "Edgar I." <eibrahimov@gmail.com>
Date: Fri, 24 Oct 2025 13:29:21 +0400
Subject: [PATCH] fix: add retries, markdown validation, and test mocking to
 downloader

- Implement retry logic with exponential backoff (default: 3 retries)
- Add markdown validation to check for markdown patterns
- Replace flaky HTTP tests with comprehensive mocking
- Add 10 test cases covering all scenarios:
  - Successful download
  - Timeout with retry
  - Empty content rejection (<100 chars)
  - Non-markdown rejection
  - HTTP error handling
  - Exponential backoff validation
  - Markdown pattern detection
  - Custom timeout parameter
  - Custom max_retries parameter
  - User agent header verification

All tests now pass reliably (10/10) without making real HTTP requests.
---
 cli/llms_txt_downloader.py        |  74 +++++++++++-----
 tests/test_llms_txt_downloader.py | 143 ++++++++++++++++++++++++++++--
 2 files changed, 189 insertions(+), 28 deletions(-)

diff --git a/cli/llms_txt_downloader.py b/cli/llms_txt_downloader.py
index 1ce60ca..d618149 100644
--- a/cli/llms_txt_downloader.py
+++ b/cli/llms_txt_downloader.py
@@ -1,43 +1,71 @@
-"""ABOUTME: Downloads llms.txt files from documentation URLs with error handling"""
-"""ABOUTME: Handles timeouts, retries, and validates content before returning"""
+"""ABOUTME: Downloads llms.txt files from documentation URLs with retry logic"""
+"""ABOUTME: Validates markdown content and handles timeouts with exponential backoff"""
 
 import requests
+import time
 from typing import Optional
 
 class LlmsTxtDownloader:
-    """Download llms.txt content from URLs"""
+    """Download llms.txt content from URLs with retry logic"""
 
-    def __init__(self, url: str, timeout: int = 30):
+    def __init__(self, url: str, timeout: int = 30, max_retries: int = 3):
         self.url = url
         self.timeout = timeout
+        self.max_retries = max_retries
+
+    def _is_markdown(self, content: str) -> bool:
+        """
+        Check if content looks like markdown.
+
+        Returns:
+            True if content contains markdown patterns
+        """
+        markdown_patterns = ['# ', '## ', '```', '- ', '* ', '`']
+        return any(pattern in content for pattern in markdown_patterns)
 
     def download(self) -> Optional[str]:
         """
-        Download llms.txt content.
+        Download llms.txt content with retry logic.
 
         Returns:
             String content or None if download fails
         """
-        try:
-            headers = {
-                'User-Agent': 'Skill-Seekers-llms.txt-Reader/1.0'
-            }
+        headers = {
+            'User-Agent': 'Skill-Seekers-llms.txt-Reader/1.0'
+        }
 
-            response = requests.get(
-                self.url,
-                headers=headers,
-                timeout=self.timeout
-            )
-            response.raise_for_status()
+        for attempt in range(self.max_retries):
+            try:
+                response = requests.get(
+                    self.url,
+                    headers=headers,
+                    timeout=self.timeout
+                )
+                response.raise_for_status()
 
-            content = response.text
+                content = response.text
 
-            # Validate content is not empty and looks like markdown
-            if len(content) < 100:
-                return None
+                # Validate content is not empty
+                if len(content) < 100:
+                    print(f"⚠️  Content too short ({len(content)} chars), rejecting")
+                    return None
 
-            return content
+                # Validate content looks like markdown
+                if not self._is_markdown(content):
+                    print(f"⚠️  Content doesn't look like markdown")
+                    return None
 
-        except requests.RequestException as e:
-            print(f"❌ Failed to download {self.url}: {e}")
-            return None
+                return content
+
+            except requests.RequestException as e:
+                if attempt < self.max_retries - 1:
+                    # Calculate exponential backoff delay: 1s, 2s, 4s, etc.
+                    delay = 2 ** attempt
+                    print(f"⚠️  Attempt {attempt + 1}/{self.max_retries} failed: {e}")
+                    print(f"   Retrying in {delay}s...")
+                    time.sleep(delay)
+                else:
+                    print(f"❌ Failed to download {self.url} after {self.max_retries} attempts: {e}")
+                    return None
+
+        return None
diff --git a/tests/test_llms_txt_downloader.py b/tests/test_llms_txt_downloader.py
index a902b57..c1b15b4 100644
--- a/tests/test_llms_txt_downloader.py
+++ b/tests/test_llms_txt_downloader.py
@@ -1,12 +1,145 @@
 import pytest
+from unittest.mock import patch, Mock
+import requests
 from cli.llms_txt_downloader import LlmsTxtDownloader
 
-def test_download_llms_txt():
-    """Test downloading llms.txt content"""
-    downloader = LlmsTxtDownloader("https://hono.dev/llms-full.txt")
+def test_successful_download():
+    """Test successful download with valid markdown content"""
+    downloader = LlmsTxtDownloader("https://example.com/llms.txt")
 
-    content = downloader.download()
+    mock_response = Mock()
+    mock_response.text = "# Header\n\nSome content with markdown patterns.\n\n## Subheader\n\n- List item\n- Another item\n\n```python\ncode_block()\n```\n" + "x" * 200
+    mock_response.raise_for_status = Mock()
+
+    with patch('requests.get', return_value=mock_response) as mock_get:
+        content = downloader.download()
 
     assert content is not None
-    assert len(content) > 100  # Should have substantial content
+    assert len(content) > 100
     assert isinstance(content, str)
+    assert "# Header" in content
+    mock_get.assert_called_once()
+
+def test_timeout_with_retry():
+    """Test timeout scenario with retry logic"""
+    downloader = LlmsTxtDownloader("https://example.com/llms.txt", max_retries=2)
+
+    with patch('requests.get', side_effect=requests.Timeout("Connection timeout")) as mock_get:
+        with patch('time.sleep') as mock_sleep:  # Mock sleep to speed up test
+            content = downloader.download()
+
+    assert content is None
+    assert mock_get.call_count == 2  # Should retry once (2 total attempts)
+    assert mock_sleep.call_count == 1  # Should sleep once between retries
+
+def test_empty_content_rejection():
+    """Test rejection of content shorter than 100 chars"""
+    downloader = LlmsTxtDownloader("https://example.com/llms.txt")
+
+    mock_response = Mock()
+    mock_response.text = "# Short"
+    mock_response.raise_for_status = Mock()
+
+    with patch('requests.get', return_value=mock_response):
+        content = downloader.download()
+
+    assert content is None
+
+def test_non_markdown_rejection():
+    """Test rejection of content that doesn't look like markdown"""
+    downloader = LlmsTxtDownloader("https://example.com/llms.txt")
+
+    mock_response = Mock()
+    mock_response.text = "Plain text without any markdown patterns at all. " * 10
+    mock_response.raise_for_status = Mock()
+
+    with patch('requests.get', return_value=mock_response):
+        content = downloader.download()
+
+    assert content is None
+
+def test_http_error_handling():
+    """Test handling of HTTP errors (404, 500, etc.)"""
+    downloader = LlmsTxtDownloader("https://example.com/llms.txt", max_retries=2)
+
+    mock_response = Mock()
+    mock_response.raise_for_status.side_effect = requests.HTTPError("404 Not Found")
+
+    with patch('requests.get', return_value=mock_response) as mock_get:
+        with patch('time.sleep'):
+            content = downloader.download()
+
+    assert content is None
+    assert mock_get.call_count == 2  # Should retry once
+
+def test_exponential_backoff():
+    """Test that exponential backoff delays are correct"""
+    downloader = LlmsTxtDownloader("https://example.com/llms.txt", max_retries=3)
+
+    with patch('requests.get', side_effect=requests.Timeout("Connection timeout")):
+        with patch('time.sleep') as mock_sleep:
+            content = downloader.download()
+
+    assert content is None
+    # Should sleep with delays: 1s, 2s (2^0, 2^1)
+    assert mock_sleep.call_count == 2
+    mock_sleep.assert_any_call(1)  # First retry delay
+    mock_sleep.assert_any_call(2)  # Second retry delay
+
+def test_markdown_validation():
+    """Test markdown pattern detection"""
+    downloader = LlmsTxtDownloader("https://example.com/llms.txt")
+
+    # Test various markdown patterns
+    assert downloader._is_markdown("# Header")
+    assert downloader._is_markdown("## Subheader")
+    assert downloader._is_markdown("```code```")
+    assert downloader._is_markdown("- list item")
+    assert downloader._is_markdown("* bullet point")
+    assert downloader._is_markdown("`inline code`")
+
+    # Test non-markdown content
+    assert not downloader._is_markdown("Plain text without any markdown patterns")
+
+def test_custom_timeout():
+    """Test custom timeout parameter"""
+    downloader = LlmsTxtDownloader("https://example.com/llms.txt", timeout=10)
+
+    mock_response = Mock()
+    mock_response.text = "# Header\n\nContent " * 50
+    mock_response.raise_for_status = Mock()
+
+    with patch('requests.get', return_value=mock_response) as mock_get:
+        content = downloader.download()
+
+    assert content is not None
+    # Verify timeout was passed to requests.get
+    call_kwargs = mock_get.call_args[1]
+    assert call_kwargs['timeout'] == 10
+
+def test_custom_max_retries():
+    """Test custom max_retries parameter"""
+    downloader = LlmsTxtDownloader("https://example.com/llms.txt", max_retries=5)
+
+    with patch('requests.get', side_effect=requests.Timeout("Connection timeout")) as mock_get:
+        with patch('time.sleep'):
+            content = downloader.download()
+
+    assert content is None
+    assert mock_get.call_count == 5  # Should attempt 5 times
+
+def test_user_agent_header():
+    """Test that custom user agent is set"""
+    downloader = LlmsTxtDownloader("https://example.com/llms.txt")
+
+    mock_response = Mock()
+    mock_response.text = "# Header\n\nContent " * 50
+    mock_response.raise_for_status = Mock()
+
+    with patch('requests.get', return_value=mock_response) as mock_get:
+        content = downloader.download()
+
+    assert content is not None
+    # Verify custom user agent was passed
+    call_kwargs = mock_get.call_args[1]
+    assert call_kwargs['headers']['User-Agent'] == 'Skill-Seekers-llms.txt-Reader/1.0'