From 60fefb6c0b89140e0fcfd7bde0ecd0f35b85dcdb Mon Sep 17 00:00:00 2001
From: "Edgar I."
Date: Fri, 24 Oct 2025 13:18:30 +0400
Subject: [PATCH] fix: improve URL parsing and add test mocking for llms.txt detector

---
Notes (commentary only; not part of the commit message):
  * detect() now builds each candidate URL from the scheme and netloc
    returned by urlparse(self.base_url) instead of splitting base_url
    on '/docs'.
  * The tests patch cli.llms_txt_detector.requests.head, so they run
    without network access.

 cli/llms_txt_detector.py        |  7 +++--
 tests/test_llms_txt_detector.py | 49 +++++++++++++++++++++++++++++----
 2 files changed, 49 insertions(+), 7 deletions(-)

diff --git a/cli/llms_txt_detector.py b/cli/llms_txt_detector.py
index 4d702b9..6a4a832 100644
--- a/cli/llms_txt_detector.py
+++ b/cli/llms_txt_detector.py
@@ -3,6 +3,7 @@
 
 import requests
 from typing import Optional, Dict
+from urllib.parse import urlparse
 
 class LlmsTxtDetector:
     """Detect llms.txt files at documentation URLs"""
@@ -23,9 +24,11 @@ class LlmsTxtDetector:
         Returns:
             Dict with 'url' and 'variant' keys, or None if not found
         """
+        parsed = urlparse(self.base_url)
+        root_url = f"{parsed.scheme}://{parsed.netloc}"
+
         for filename, variant in self.VARIANTS:
-            # Try at base URL root
-            url = f"{self.base_url.split('/docs')[0]}/{filename}"
+            url = f"{root_url}/{filename}"
             if self._check_url_exists(url):
                 return {'url': url, 'variant': variant}
 
diff --git a/tests/test_llms_txt_detector.py b/tests/test_llms_txt_detector.py
index 832a45c..fc9481e 100644
--- a/tests/test_llms_txt_detector.py
+++ b/tests/test_llms_txt_detector.py
@@ -1,13 +1,52 @@
 import pytest
+from unittest.mock import patch, Mock
 from cli.llms_txt_detector import LlmsTxtDetector
 
 def test_detect_llms_txt_variants():
     """Test detection of llms.txt file variants"""
     detector = LlmsTxtDetector("https://hono.dev/docs")
 
-    # Mock responses
-    variants = detector.detect()
+    with patch('cli.llms_txt_detector.requests.head') as mock_head:
+        mock_response = Mock()
+        mock_response.status_code = 200
+        mock_head.return_value = mock_response
 
-    assert variants is not None
-    assert 'url' in variants
-    assert 'variant' in variants  # 'full', 'standard', 'small'
+        variants = detector.detect()
+
+        assert variants is not None
+        assert variants['url'] == 'https://hono.dev/llms-full.txt'
+        assert variants['variant'] == 'full'
+        mock_head.assert_called()
+
+def test_detect_no_llms_txt():
+    """Test detection when no llms.txt file exists"""
+    detector = LlmsTxtDetector("https://example.com/docs")
+
+    with patch('cli.llms_txt_detector.requests.head') as mock_head:
+        mock_response = Mock()
+        mock_response.status_code = 404
+        mock_head.return_value = mock_response
+
+        variants = detector.detect()
+
+        assert variants is None
+        assert mock_head.call_count == 3  # Should try all three variants
+
+def test_url_parsing_with_complex_paths():
+    """Test URL parsing handles non-standard paths correctly"""
+    detector = LlmsTxtDetector("https://example.com/docs/v2/guide")
+
+    with patch('cli.llms_txt_detector.requests.head') as mock_head:
+        mock_response = Mock()
+        mock_response.status_code = 200
+        mock_head.return_value = mock_response
+
+        variants = detector.detect()
+
+        assert variants is not None
+        assert variants['url'] == 'https://example.com/llms-full.txt'
+        mock_head.assert_called_with(
+            'https://example.com/llms-full.txt',
+            timeout=5,
+            allow_redirects=True
+        )