test: Add comprehensive HTML detection tests for llms.txt downloader (PR #244 review fix)

Added 7 test cases to verify HTML redirect trap prevention:
- test_is_markdown_rejects_html_doctype() - DOCTYPE rejection (case-insensitive)
- test_is_markdown_rejects_html_tag() - <html> tag rejection
- test_is_markdown_rejects_html_meta() - <meta> and <head> tag rejection
- test_is_markdown_accepts_markdown_with_html_words() - Edge case: markdown mentioning "html"
- test_html_detection_only_scans_first_500_chars() - Performance optimization verification
- test_html_redirect_trap_scenario() - Real-world Claude Code redirect scenario
- test_download_rejects_html_redirect() - End-to-end download rejection

Addresses minor observation from PR #244 review:
- Ensures HTML detection logic is fully covered
- Prevents regression of redirect trap fixes
- Validates 500-char scanning optimization

Test Results: 20/20 llms_txt_downloader tests passing
Overall: 982/982 tests passing (4 expected failures - missing anthropic package)

Related: PR #244 (Claude Code documentation config update)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
yusyus
2026-01-11 14:16:44 +03:00
parent 9042e1680c
commit 6008f13127

View File

@@ -168,3 +168,95 @@ def test_get_proper_filename_small():
filename = downloader.get_proper_filename()
assert filename == "llms-small.md"
def test_is_markdown_rejects_html_doctype():
"""Test that HTML with DOCTYPE is rejected (prevents redirect trap)"""
downloader = LlmsTxtDownloader("https://example.com/llms.txt")
html = '<!DOCTYPE html><html><head><title>Product Page</title></head><body>Content</body></html>'
assert not downloader._is_markdown(html)
# Test case-insensitive
html_uppercase = '<!DOCTYPE HTML><HTML><BODY>Content</BODY></HTML>'
assert not downloader._is_markdown(html_uppercase)
def test_is_markdown_rejects_html_tag():
"""Test that HTML with <html> tag is rejected (prevents redirect trap)"""
downloader = LlmsTxtDownloader("https://example.com/llms.txt")
html = '<html><head><meta charset="utf-8"></head><body>Content</body></html>'
assert not downloader._is_markdown(html)
# Test with just opening tag
html_partial = '<html><head>Some content'
assert not downloader._is_markdown(html_partial)
def test_is_markdown_rejects_html_meta():
"""Test that HTML with <meta> or <head> tags is rejected"""
downloader = LlmsTxtDownloader("https://example.com/llms.txt")
html_with_head = '<head><title>Page</title></head><body>Content</body>'
assert not downloader._is_markdown(html_with_head)
html_with_meta = '<meta charset="utf-8"><meta name="viewport" content="width=device-width">'
assert not downloader._is_markdown(html_with_meta)
def test_is_markdown_accepts_markdown_with_html_words():
"""Test that markdown mentioning 'html' word is still accepted"""
downloader = LlmsTxtDownloader("https://example.com/llms.txt")
markdown = '# Guide\n\nLearn about html tags in markdown. You can write HTML inside markdown.'
assert downloader._is_markdown(markdown)
# Test with actual markdown patterns
markdown_with_code = '# HTML Tutorial\n\n```html\n<div>example</div>\n```\n\n## More content'
assert downloader._is_markdown(markdown_with_code)
def test_html_detection_only_scans_first_500_chars():
"""Test that HTML detection only scans first 500 characters for performance"""
downloader = LlmsTxtDownloader("https://example.com/llms.txt")
# HTML tag after 500 chars should not be detected
safe_markdown = '# Header\n\n' + ('Valid markdown content. ' * 50) + '\n\n<!DOCTYPE html>'
# This should pass because <!DOCTYPE html> is beyond first 500 chars
if len(safe_markdown[:500]) < len('<!DOCTYPE html>'):
# If the HTML is within 500 chars, adjust test
assert not downloader._is_markdown(safe_markdown)
else:
# HTML beyond 500 chars should not trigger rejection
assert downloader._is_markdown(safe_markdown)
def test_html_redirect_trap_scenario():
"""Test real-world scenario: llms.txt redirects to HTML product page"""
downloader = LlmsTxtDownloader("https://example.com/llms.txt")
# Simulate Claude Code redirect scenario (302 to HTML page)
html_product_page = '''<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>Claude Code - Product Page</title>
</head>
<body>
<h1>Claude Code</h1>
<p>Product information...</p>
</body>
</html>'''
# Should reject this HTML even though it has <h1> tag (looks like markdown "# ")
assert not downloader._is_markdown(html_product_page)
def test_download_rejects_html_redirect():
"""Test that download() properly rejects HTML redirects"""
downloader = LlmsTxtDownloader("https://example.com/llms.txt")
mock_response = Mock()
# Simulate server returning HTML instead of markdown
mock_response.text = '<!DOCTYPE html><html><body><h1>Product Page</h1></body></html>'
mock_response.raise_for_status = Mock()
with patch('requests.get', return_value=mock_response):
content = downloader.download()
# Should return None (rejected as non-markdown)
assert content is None