test: Add comprehensive HTML detection tests for llms.txt downloader (PR #244 review fix)

Added 7 test cases to verify HTML redirect trap prevention:
- test_is_markdown_rejects_html_doctype() - DOCTYPE rejection (case-insensitive)
- test_is_markdown_rejects_html_tag() - <html> tag rejection
- test_is_markdown_rejects_html_meta() - <meta> and <head> tag rejection
- test_is_markdown_accepts_markdown_with_html_words() - Edge case: markdown mentioning "html"
- test_html_detection_only_scans_first_500_chars() - Performance optimization verification
- test_html_redirect_trap_scenario() - Real-world Claude Code redirect scenario
- test_download_rejects_html_redirect() - End-to-end download rejection

Addresses minor observation from PR #244 review:
- Ensures HTML detection logic is fully covered
- Prevents regression of redirect trap fixes
- Validates 500-char scanning optimization

Test Results: 20/20 llms_txt_downloader tests passing
Overall: 982/982 tests passing (4 expected failures - missing anthropic package)

Related: PR #244 (Claude Code documentation config update)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-01-11 14:16:44 +03:00
parent 9042e1680c
commit 6008f13127
1 changed files with 92 additions and 0 deletions
--- a/tests/test_llms_txt_downloader.py
+++ b/tests/test_llms_txt_downloader.py
@@ -168,3 +168,95 @@ def test_get_proper_filename_small():
    filename = downloader.get_proper_filename()

    assert filename == "llms-small.md"
+
+def test_is_markdown_rejects_html_doctype():
+    """A document that opens with a DOCTYPE declaration must not pass as markdown."""
+    dl = LlmsTxtDownloader("https://example.com/llms.txt")
+
+    lowercase_page = '<!DOCTYPE html><html><head><title>Product Page</title></head><body>Content</body></html>'
+    uppercase_page = '<!DOCTYPE HTML><HTML><BODY>Content</BODY></HTML>'
+
+    # Both spellings are checked: rejection is expected regardless of letter case
+    assert not dl._is_markdown(lowercase_page)
+    assert not dl._is_markdown(uppercase_page)
+
+def test_is_markdown_rejects_html_tag():
+    """A document that starts with an <html> element must not pass as markdown."""
+    dl = LlmsTxtDownloader("https://example.com/llms.txt")
+
+    complete_page = '<html><head><meta charset="utf-8"></head><body>Content</body></html>'
+    # Truncated page: only opening tags are present, nothing is closed
+    truncated_page = '<html><head>Some content'
+
+    assert not dl._is_markdown(complete_page)
+    assert not dl._is_markdown(truncated_page)
+
+def test_is_markdown_rejects_html_meta():
+    """Fragments beginning with <head> or <meta> tags must not pass as markdown."""
+    dl = LlmsTxtDownloader("https://example.com/llms.txt")
+
+    head_fragment = '<head><title>Page</title></head><body>Content</body>'
+    meta_fragment = '<meta charset="utf-8"><meta name="viewport" content="width=device-width">'
+
+    assert not dl._is_markdown(head_fragment)
+    assert not dl._is_markdown(meta_fragment)
+
+def test_is_markdown_accepts_markdown_with_html_words():
+    """Markdown that merely talks about HTML must still be accepted."""
+    dl = LlmsTxtDownloader("https://example.com/llms.txt")
+
+    # Plain prose that contains the word "html" in lower and upper case
+    prose = '# Guide\n\nLearn about html tags in markdown. You can write HTML inside markdown.'
+    # Markdown headings plus a fenced html code block holding a real tag
+    fenced = '# HTML Tutorial\n\n```html\n<div>example</div>\n```\n\n## More content'
+    assert dl._is_markdown(prose)
+    assert dl._is_markdown(fenced)
+
+def test_html_detection_only_scans_first_500_chars():
+    """Test that an HTML marker beyond the first 500 characters is ignored.
+
+    Markdown that only mentions a DOCTYPE far into the document is expected
+    to be accepted, because HTML detection is limited to the document head.
+    """
+    downloader = LlmsTxtDownloader("https://example.com/llms.txt")
+
+    marker = '<!DOCTYPE html>'
+    safe_markdown = '# Header\n\n' + ('Valid markdown content. ' * 50) + '\n\n' + marker
+    # Guard the fixture itself: the marker must start outside the 500-char window
+    assert safe_markdown.find(marker) >= 500
+    assert downloader._is_markdown(safe_markdown)
+
+def test_html_redirect_trap_scenario():
+    """An llms.txt URL that ends up serving a full HTML product page is rejected."""
+    # Mimics the Claude Code redirect case (302 from llms.txt to an HTML page)
+    landing_page = '''<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <title>Claude Code - Product Page</title>
+</head>
+<body>
+    <h1>Claude Code</h1>
+    <p>Product information...</p>
+</body>
+</html>'''
+
+    dl = LlmsTxtDownloader("https://example.com/llms.txt")
+
+    # The <h1> heading in the body must not make the page pass as markdown
+    assert not dl._is_markdown(landing_page)
+
+def test_download_rejects_html_redirect():
+    """download() must return None when the response body is HTML, not markdown."""
+    dl = LlmsTxtDownloader("https://example.com/llms.txt")
+
+    # Fake response carrying an HTML body; its status check is a no-op Mock
+    html_body = '<!DOCTYPE html><html><body><h1>Product Page</h1></body></html>'
+    fake_response = Mock()
+    fake_response.text = html_body
+    fake_response.raise_for_status = Mock()
+
+    with patch('requests.get', return_value=fake_response):
+        result = dl.download()
+
+    assert result is None