fix: add retries, markdown validation, and test mocking to downloader

- Implement retry logic with exponential backoff (default: 3 retries)
- Add markdown validation to check for markdown patterns
- Replace flaky HTTP tests with comprehensive mocking
- Add 10 test cases covering all scenarios:
  - Successful download
  - Timeout with retry
  - Empty content rejection (<100 chars)
  - Non-markdown rejection
  - HTTP error handling
  - Exponential backoff validation
  - Markdown pattern detection
  - Custom timeout parameter
  - Custom max_retries parameter
  - User agent header verification

All tests now pass reliably (10/10) without making real HTTP requests.
2025-10-24 13:29:21 +04:00
parent 3dd928b34b
commit e88a4b0fcc
2 changed files with 189 additions and 28 deletions
--- a/cli/llms_txt_downloader.py
+++ b/cli/llms_txt_downloader.py
@@ -1,43 +1,71 @@
-"""ABOUTME: Downloads llms.txt files from documentation URLs with error handling"""
-"""ABOUTME: Handles timeouts, retries, and validates content before returning"""
+"""ABOUTME: Downloads llms.txt files from documentation URLs with retry logic"""
+"""ABOUTME: Validates markdown content and handles timeouts with exponential backoff"""

 import requests
+import time
 from typing import Optional

 class LlmsTxtDownloader:
-    """Download llms.txt content from URLs"""
+    """Download llms.txt content from URLs with retry logic"""

-    def __init__(self, url: str, timeout: int = 30):
+    def __init__(self, url: str, timeout: int = 30, max_retries: int = 3):
        self.url = url
        self.timeout = timeout
+        self.max_retries = max_retries
+
+    def _is_markdown(self, content: str) -> bool:
+        """
+        Check if content looks like markdown.
+
+        Returns:
+            True if content contains markdown patterns
+        """
+        markdown_patterns = ['# ', '## ', '```', '- ', '* ', '`']
+        return any(pattern in content for pattern in markdown_patterns)

    def download(self) -> Optional[str]:
        """
-        Download llms.txt content.
+        Download llms.txt content with retry logic.

        Returns:
            String content or None if download fails
        """
-        try:
-            headers = {
-                'User-Agent': 'Skill-Seekers-llms.txt-Reader/1.0'
-            }
+        headers = {
+            'User-Agent': 'Skill-Seekers-llms.txt-Reader/1.0'
+        }

-            response = requests.get(
-                self.url,
-                headers=headers,
-                timeout=self.timeout
-            )
-            response.raise_for_status()
+        for attempt in range(self.max_retries):
+            try:
+                response = requests.get(
+                    self.url,
+                    headers=headers,
+                    timeout=self.timeout
+                )
+                response.raise_for_status()

-            content = response.text
+                content = response.text

-            # Validate content is not empty and looks like markdown
-            if len(content) < 100:
-                return None
+                # Validate content is not empty
+                if len(content) < 100:
+                    print(f"⚠️  Content too short ({len(content)} chars), rejecting")
+                    return None

-            return content
+                # Validate content looks like markdown
+                if not self._is_markdown(content):
+                    print(f"⚠️  Content doesn't look like markdown")
+                    return None

-        except requests.RequestException as e:
-            print(f"❌ Failed to download {self.url}: {e}")
-            return None
+                return content
+
+            except requests.RequestException as e:
+                if attempt < self.max_retries - 1:
+                    # Calculate exponential backoff delay: 1s, 2s, 4s, etc.
+                    delay = 2 ** attempt
+                    print(f"⚠️  Attempt {attempt + 1}/{self.max_retries} failed: {e}")
+                    print(f"   Retrying in {delay}s...")
+                    time.sleep(delay)
+                else:
+                    print(f"❌ Failed to download {self.url} after {self.max_retries} attempts: {e}")
+                    return None
+
+        return None