290 lines
9.9 KiB
Python
290 lines
9.9 KiB
Python
from unittest.mock import Mock, patch
|
|
|
|
import requests
|
|
|
|
from skill_seekers.cli.llms_txt_downloader import LlmsTxtDownloader
|
|
|
|
|
|
def test_successful_download():
|
|
"""Test successful download with valid markdown content"""
|
|
downloader = LlmsTxtDownloader("https://example.com/llms.txt")
|
|
|
|
mock_response = Mock()
|
|
mock_response.text = (
|
|
"# Header\n\nSome content with markdown patterns.\n\n## Subheader\n\n- List item\n- Another item\n\n```python\ncode_block()\n```\n"
|
|
+ "x" * 200
|
|
)
|
|
mock_response.raise_for_status = Mock()
|
|
|
|
with patch("requests.get", return_value=mock_response) as mock_get:
|
|
content = downloader.download()
|
|
|
|
assert content is not None
|
|
assert len(content) > 100
|
|
assert isinstance(content, str)
|
|
assert "# Header" in content
|
|
mock_get.assert_called_once()
|
|
|
|
|
|
def test_timeout_with_retry():
|
|
"""Test timeout scenario with retry logic"""
|
|
downloader = LlmsTxtDownloader("https://example.com/llms.txt", max_retries=2)
|
|
|
|
with patch("requests.get", side_effect=requests.Timeout("Connection timeout")) as mock_get:
|
|
with patch("time.sleep") as mock_sleep: # Mock sleep to speed up test
|
|
content = downloader.download()
|
|
|
|
assert content is None
|
|
assert mock_get.call_count == 2 # Should retry once (2 total attempts)
|
|
assert mock_sleep.call_count == 1 # Should sleep once between retries
|
|
|
|
|
|
def test_empty_content_rejection():
|
|
"""Test rejection of content shorter than 100 chars"""
|
|
downloader = LlmsTxtDownloader("https://example.com/llms.txt")
|
|
|
|
mock_response = Mock()
|
|
mock_response.text = "# Short"
|
|
mock_response.raise_for_status = Mock()
|
|
|
|
with patch("requests.get", return_value=mock_response):
|
|
content = downloader.download()
|
|
|
|
assert content is None
|
|
|
|
|
|
def test_non_markdown_rejection():
|
|
"""Test rejection of content that doesn't look like markdown"""
|
|
downloader = LlmsTxtDownloader("https://example.com/llms.txt")
|
|
|
|
mock_response = Mock()
|
|
mock_response.text = "Plain text without any markdown patterns at all. " * 10
|
|
mock_response.raise_for_status = Mock()
|
|
|
|
with patch("requests.get", return_value=mock_response):
|
|
content = downloader.download()
|
|
|
|
assert content is None
|
|
|
|
|
|
def test_http_error_handling():
|
|
"""Test handling of HTTP errors (404, 500, etc.)"""
|
|
downloader = LlmsTxtDownloader("https://example.com/llms.txt", max_retries=2)
|
|
|
|
mock_response = Mock()
|
|
mock_response.raise_for_status.side_effect = requests.HTTPError("404 Not Found")
|
|
|
|
with patch("requests.get", return_value=mock_response) as mock_get, patch("time.sleep"):
|
|
content = downloader.download()
|
|
|
|
assert content is None
|
|
assert mock_get.call_count == 2 # Should retry once
|
|
|
|
|
|
def test_exponential_backoff():
|
|
"""Test that exponential backoff delays are correct"""
|
|
downloader = LlmsTxtDownloader("https://example.com/llms.txt", max_retries=3)
|
|
|
|
with patch("requests.get", side_effect=requests.Timeout("Connection timeout")):
|
|
with patch("time.sleep") as mock_sleep:
|
|
content = downloader.download()
|
|
|
|
assert content is None
|
|
# Should sleep with delays: 1s, 2s (2^0, 2^1)
|
|
assert mock_sleep.call_count == 2
|
|
mock_sleep.assert_any_call(1) # First retry delay
|
|
mock_sleep.assert_any_call(2) # Second retry delay
|
|
|
|
|
|
def test_markdown_validation():
|
|
"""Test markdown pattern detection"""
|
|
downloader = LlmsTxtDownloader("https://example.com/llms.txt")
|
|
|
|
# Test various markdown patterns
|
|
assert downloader._is_markdown("# Header")
|
|
assert downloader._is_markdown("## Subheader")
|
|
assert downloader._is_markdown("```code```")
|
|
assert downloader._is_markdown("- list item")
|
|
assert downloader._is_markdown("* bullet point")
|
|
assert downloader._is_markdown("`inline code`")
|
|
|
|
# Test non-markdown content
|
|
assert not downloader._is_markdown("Plain text without any markdown patterns")
|
|
|
|
|
|
def test_custom_timeout():
|
|
"""Test custom timeout parameter"""
|
|
downloader = LlmsTxtDownloader("https://example.com/llms.txt", timeout=10)
|
|
|
|
mock_response = Mock()
|
|
mock_response.text = "# Header\n\nContent " * 50
|
|
mock_response.raise_for_status = Mock()
|
|
|
|
with patch("requests.get", return_value=mock_response) as mock_get:
|
|
content = downloader.download()
|
|
|
|
assert content is not None
|
|
# Verify timeout was passed to requests.get
|
|
call_kwargs = mock_get.call_args[1]
|
|
assert call_kwargs["timeout"] == 10
|
|
|
|
|
|
def test_custom_max_retries():
|
|
"""Test custom max_retries parameter"""
|
|
downloader = LlmsTxtDownloader("https://example.com/llms.txt", max_retries=5)
|
|
|
|
with (
|
|
patch("requests.get", side_effect=requests.Timeout("Connection timeout")) as mock_get,
|
|
patch("time.sleep"),
|
|
):
|
|
content = downloader.download()
|
|
|
|
assert content is None
|
|
assert mock_get.call_count == 5 # Should attempt 5 times
|
|
|
|
|
|
def test_user_agent_header():
|
|
"""Test that custom user agent is set"""
|
|
downloader = LlmsTxtDownloader("https://example.com/llms.txt")
|
|
|
|
mock_response = Mock()
|
|
mock_response.text = "# Header\n\nContent " * 50
|
|
mock_response.raise_for_status = Mock()
|
|
|
|
with patch("requests.get", return_value=mock_response) as mock_get:
|
|
content = downloader.download()
|
|
|
|
assert content is not None
|
|
# Verify custom user agent was passed
|
|
call_kwargs = mock_get.call_args[1]
|
|
assert call_kwargs["headers"]["User-Agent"] == "Skill-Seekers-llms.txt-Reader/1.0"
|
|
|
|
|
|
def test_get_proper_filename():
|
|
"""Test filename conversion from .txt to .md"""
|
|
downloader = LlmsTxtDownloader("https://hono.dev/llms-full.txt")
|
|
|
|
filename = downloader.get_proper_filename()
|
|
|
|
assert filename == "llms-full.md"
|
|
assert not filename.endswith(".txt")
|
|
|
|
|
|
def test_get_proper_filename_standard():
|
|
"""Test standard variant naming"""
|
|
downloader = LlmsTxtDownloader("https://hono.dev/llms.txt")
|
|
|
|
filename = downloader.get_proper_filename()
|
|
|
|
assert filename == "llms.md"
|
|
|
|
|
|
def test_get_proper_filename_small():
|
|
"""Test small variant naming"""
|
|
downloader = LlmsTxtDownloader("https://hono.dev/llms-small.txt")
|
|
|
|
filename = downloader.get_proper_filename()
|
|
|
|
assert filename == "llms-small.md"
|
|
|
|
|
|
def test_is_markdown_rejects_html_doctype():
|
|
"""Test that HTML with DOCTYPE is rejected (prevents redirect trap)"""
|
|
downloader = LlmsTxtDownloader("https://example.com/llms.txt")
|
|
|
|
html = (
|
|
"<!DOCTYPE html><html><head><title>Product Page</title></head><body>Content</body></html>"
|
|
)
|
|
assert not downloader._is_markdown(html)
|
|
|
|
# Test case-insensitive
|
|
html_uppercase = "<!DOCTYPE HTML><HTML><BODY>Content</BODY></HTML>"
|
|
assert not downloader._is_markdown(html_uppercase)
|
|
|
|
|
|
def test_is_markdown_rejects_html_tag():
|
|
"""Test that HTML with <html> tag is rejected (prevents redirect trap)"""
|
|
downloader = LlmsTxtDownloader("https://example.com/llms.txt")
|
|
|
|
html = '<html><head><meta charset="utf-8"></head><body>Content</body></html>'
|
|
assert not downloader._is_markdown(html)
|
|
|
|
# Test with just opening tag
|
|
html_partial = "<html><head>Some content"
|
|
assert not downloader._is_markdown(html_partial)
|
|
|
|
|
|
def test_is_markdown_rejects_html_meta():
|
|
"""Test that HTML with <meta> or <head> tags is rejected"""
|
|
downloader = LlmsTxtDownloader("https://example.com/llms.txt")
|
|
|
|
html_with_head = "<head><title>Page</title></head><body>Content</body>"
|
|
assert not downloader._is_markdown(html_with_head)
|
|
|
|
html_with_meta = '<meta charset="utf-8"><meta name="viewport" content="width=device-width">'
|
|
assert not downloader._is_markdown(html_with_meta)
|
|
|
|
|
|
def test_is_markdown_accepts_markdown_with_html_words():
|
|
"""Test that markdown mentioning 'html' word is still accepted"""
|
|
downloader = LlmsTxtDownloader("https://example.com/llms.txt")
|
|
|
|
markdown = "# Guide\n\nLearn about html tags in markdown. You can write HTML inside markdown."
|
|
assert downloader._is_markdown(markdown)
|
|
|
|
# Test with actual markdown patterns
|
|
markdown_with_code = "# HTML Tutorial\n\n```html\n<div>example</div>\n```\n\n## More content"
|
|
assert downloader._is_markdown(markdown_with_code)
|
|
|
|
|
|
def test_html_detection_only_scans_first_500_chars():
|
|
"""Test that HTML detection only scans first 500 characters for performance"""
|
|
downloader = LlmsTxtDownloader("https://example.com/llms.txt")
|
|
|
|
# HTML tag after 500 chars should not be detected
|
|
safe_markdown = "# Header\n\n" + ("Valid markdown content. " * 50) + "\n\n<!DOCTYPE html>"
|
|
# This should pass because <!DOCTYPE html> is beyond first 500 chars
|
|
if len(safe_markdown[:500]) < len("<!DOCTYPE html>"):
|
|
# If the HTML is within 500 chars, adjust test
|
|
assert not downloader._is_markdown(safe_markdown)
|
|
else:
|
|
# HTML beyond 500 chars should not trigger rejection
|
|
assert downloader._is_markdown(safe_markdown)
|
|
|
|
|
|
def test_html_redirect_trap_scenario():
|
|
"""Test real-world scenario: llms.txt redirects to HTML product page"""
|
|
downloader = LlmsTxtDownloader("https://example.com/llms.txt")
|
|
|
|
# Simulate Claude Code redirect scenario (302 to HTML page)
|
|
html_product_page = """<!DOCTYPE html>
|
|
<html lang="en">
|
|
<head>
|
|
<meta charset="UTF-8">
|
|
<title>Claude Code - Product Page</title>
|
|
</head>
|
|
<body>
|
|
<h1>Claude Code</h1>
|
|
<p>Product information...</p>
|
|
</body>
|
|
</html>"""
|
|
|
|
# Should reject this HTML even though it has <h1> tag (looks like markdown "# ")
|
|
assert not downloader._is_markdown(html_product_page)
|
|
|
|
|
|
def test_download_rejects_html_redirect():
|
|
"""Test that download() properly rejects HTML redirects"""
|
|
downloader = LlmsTxtDownloader("https://example.com/llms.txt")
|
|
|
|
mock_response = Mock()
|
|
# Simulate server returning HTML instead of markdown
|
|
mock_response.text = "<!DOCTYPE html><html><body><h1>Product Page</h1></body></html>"
|
|
mock_response.raise_for_status = Mock()
|
|
|
|
with patch("requests.get", return_value=mock_response):
|
|
content = downloader.download()
|
|
|
|
# Should return None (rejected as non-markdown)
|
|
assert content is None
|