Files
skill-seekers-reference/tests/test_skip_llms_txt.py
yusyus 9666938eb0 fix: Resolve 21 ruff linting errors (SIM102, SIM117, B904, SIM113, B007)
Fixed all 21 linting errors identified in GitHub Actions:

SIM102 (7 errors - nested if statements):
- config_extractor.py:468 - Combined nested conditions
- config_validator.py (was B904, already fixed)
- pattern_recognizer.py:430,538,916 - Combined nested conditions
- test_example_extractor.py:365,412,460 - Combined nested conditions
- unified_skill_builder.py:1070 - Combined nested conditions

SIM117 (9 errors - multiple with statements):
- test_install_agent.py:418 - Combined with statements
- test_issue_219_e2e.py:278 - Combined with statements
- test_llms_txt_downloader.py:33,88 - Combined with statements
- test_skip_llms_txt.py:75,98,121,148,172,304 - Combined with statements

B904 (1 error - exception handling):
- config_validator.py:62 - Added 'from e' to exception chain

SIM113 (1 error - enumerate usage):
- doc_scraper.py:1068 - Removed unused 'completed' counter variable

B007 (1 error - unused loop variable):
- pdf_scraper.py:167 - Changed 'keywords' to '_' for unused variable

All changes improve code quality without altering functionality.
Tests: 1214 passed, 167 skipped (4 pre-existing failures unrelated)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-01-17 23:54:22 +03:00

305 lines
12 KiB
Python

"""Tests for skip_llms_txt configuration option.
This config option allows users to explicitly skip llms.txt detection and fetching,
which is useful when:
- A site's llms.txt is incomplete or incorrect
- You need specific pages not in llms.txt
- You want to force HTML scraping
"""
import os
import tempfile
import unittest
from unittest.mock import patch
from skill_seekers.cli.doc_scraper import DocToSkillConverter
class TestSkipLlmsTxtConfig(unittest.TestCase):
    """Check how the ``skip_llms_txt`` config key surfaces on the converter."""

    @staticmethod
    def _dry_run_converter(**extra):
        """Build a dry-run converter from the minimal config plus *extra* keys."""
        settings = {
            "name": "test",
            "base_url": "https://example.com/",
            "selectors": {"main_content": "article"},
        }
        settings.update(extra)
        return DocToSkillConverter(settings, dry_run=True)

    def test_default_skip_llms_txt_is_false(self):
        """Omitting the key leaves ``skip_llms_txt`` falsy."""
        scraper = self._dry_run_converter()
        self.assertFalse(scraper.skip_llms_txt)

    def test_skip_llms_txt_can_be_set_true(self):
        """An explicit ``True`` is reflected as a truthy attribute."""
        scraper = self._dry_run_converter(skip_llms_txt=True)
        self.assertTrue(scraper.skip_llms_txt)

    def test_skip_llms_txt_can_be_set_false(self):
        """An explicit ``False`` is reflected as a falsy attribute."""
        scraper = self._dry_run_converter(skip_llms_txt=False)
        self.assertFalse(scraper.skip_llms_txt)
class TestSkipLlmsTxtSyncBehavior(unittest.TestCase):
    """Check when the synchronous ``scrape_all`` path consults llms.txt."""

    @staticmethod
    def _settings(skip):
        """Return the minimal config with ``skip_llms_txt`` set to *skip*."""
        return {
            "name": "test",
            "base_url": "https://example.com/",
            "selectors": {"main_content": "article"},
            "skip_llms_txt": skip,
        }

    def test_llms_txt_tried_when_not_skipped(self):
        """``_try_llms_txt`` is called exactly once when skipping is off."""
        settings = self._settings(False)
        start_dir = os.getcwd()
        # Run from a throwaway directory (presumably so anything the converter
        # writes stays out of the repo); cwd is restored before it is removed.
        with tempfile.TemporaryDirectory() as scratch:
            try:
                os.chdir(scratch)
                scraper = DocToSkillConverter(settings, dry_run=False)
                llms_patch = patch.object(scraper, "_try_llms_txt", return_value=False)
                page_patch = patch.object(scraper, "scrape_page")
                summary_patch = patch.object(scraper, "save_summary")
                with llms_patch as llms_probe, page_patch, summary_patch:
                    scraper.scrape_all()
                    llms_probe.assert_called_once()
            finally:
                os.chdir(start_dir)

    def test_llms_txt_skipped_when_skip_true(self):
        """``_try_llms_txt`` is never called when skipping is on."""
        settings = self._settings(True)
        start_dir = os.getcwd()
        with tempfile.TemporaryDirectory() as scratch:
            try:
                os.chdir(scratch)
                scraper = DocToSkillConverter(settings, dry_run=False)
                llms_patch = patch.object(scraper, "_try_llms_txt")
                page_patch = patch.object(scraper, "scrape_page")
                summary_patch = patch.object(scraper, "save_summary")
                with llms_patch as llms_probe, page_patch, summary_patch:
                    scraper.scrape_all()
                    llms_probe.assert_not_called()
            finally:
                os.chdir(start_dir)

    def test_llms_txt_skipped_in_dry_run_mode(self):
        """``_try_llms_txt`` is never called in dry-run mode, even with skip off."""
        settings = self._settings(False)  # Even when False
        start_dir = os.getcwd()
        with tempfile.TemporaryDirectory() as scratch:
            try:
                os.chdir(scratch)
                scraper = DocToSkillConverter(settings, dry_run=True)
                llms_patch = patch.object(scraper, "_try_llms_txt")
                summary_patch = patch.object(scraper, "save_summary")
                with llms_patch as llms_probe, summary_patch:
                    scraper.scrape_all()
                    llms_probe.assert_not_called()
            finally:
                os.chdir(start_dir)
class TestSkipLlmsTxtAsyncBehavior(unittest.TestCase):
    """Check when ``scrape_all`` consults llms.txt with ``async_mode`` enabled."""

    @staticmethod
    def _settings(skip):
        """Return the minimal async config with ``skip_llms_txt`` set to *skip*."""
        return {
            "name": "test",
            "base_url": "https://example.com/",
            "selectors": {"main_content": "article"},
            "async_mode": True,
            "skip_llms_txt": skip,
        }

    def test_async_llms_txt_tried_when_not_skipped(self):
        """``_try_llms_txt`` is called exactly once when skipping is off."""
        settings = self._settings(False)
        start_dir = os.getcwd()
        # Run from a throwaway directory (presumably so anything the converter
        # writes stays out of the repo); cwd is restored before it is removed.
        with tempfile.TemporaryDirectory() as scratch:
            try:
                os.chdir(scratch)
                scraper = DocToSkillConverter(settings, dry_run=False)
                llms_patch = patch.object(scraper, "_try_llms_txt", return_value=False)
                page_patch = patch.object(scraper, "scrape_page_async", return_value=None)
                summary_patch = patch.object(scraper, "save_summary")
                with llms_patch as llms_probe, page_patch, summary_patch:
                    scraper.scrape_all()
                    llms_probe.assert_called_once()
            finally:
                os.chdir(start_dir)

    def test_async_llms_txt_skipped_when_skip_true(self):
        """``_try_llms_txt`` is never called when skipping is on."""
        settings = self._settings(True)
        start_dir = os.getcwd()
        with tempfile.TemporaryDirectory() as scratch:
            try:
                os.chdir(scratch)
                scraper = DocToSkillConverter(settings, dry_run=False)
                llms_patch = patch.object(scraper, "_try_llms_txt")
                page_patch = patch.object(scraper, "scrape_page_async", return_value=None)
                summary_patch = patch.object(scraper, "save_summary")
                with llms_patch as llms_probe, page_patch, summary_patch:
                    scraper.scrape_all()
                    llms_probe.assert_not_called()
            finally:
                os.chdir(start_dir)
class TestSkipLlmsTxtWithRealConfig(unittest.TestCase):
    """Exercise ``skip_llms_txt`` with configs shaped like real-world ones."""

    def test_telegram_bots_config_pattern(self):
        """A telegram-bots style config keeps both its skip flag and its name."""
        bots_root = "https://core.telegram.org/bots"
        css_selectors = {
            "main_content": "#dev_page_content, main, article",
            "title": "h1, title",
            "code_blocks": "pre code, pre",
        }
        settings = {
            "name": "telegram-bots",
            "description": "Telegram bot documentation",
            "base_url": bots_root,
            "skip_llms_txt": True,  # Telegram doesn't have useful llms.txt
            "start_urls": [bots_root, "https://core.telegram.org/bots/api"],
            "selectors": css_selectors,
        }

        scraper = DocToSkillConverter(settings, dry_run=True)

        self.assertTrue(scraper.skip_llms_txt)
        self.assertEqual(scraper.name, "telegram-bots")

    def test_skip_llms_txt_with_multiple_start_urls(self):
        """Three start URLs yield three pending URLs while the skip flag is set."""
        seed_urls = [
            "https://example.com/docs/",
            "https://example.com/api/",
            "https://example.com/guide/",
        ]
        settings = {
            "name": "test-multi",
            "base_url": "https://example.com/",
            "selectors": {"main_content": "article"},
            "skip_llms_txt": True,
            "start_urls": seed_urls,
        }

        scraper = DocToSkillConverter(settings, dry_run=True)

        self.assertTrue(scraper.skip_llms_txt)
        # Per the original author's note, start_urls are queued in the
        # pending_urls deque, so its length should match the seed list.
        self.assertEqual(len(scraper.pending_urls), len(seed_urls))
class TestSkipLlmsTxtEdgeCases(unittest.TestCase):
    """Edge cases for ``skip_llms_txt``: invalid values and the skip path."""

    def _assert_invalid_value_warns(self, bad_value, rendered):
        """Assert *bad_value* is rejected with a warning and a falsy fallback.

        Builds a dry-run converter with ``skip_llms_txt`` set to *bad_value*
        and checks that:

        * the ``skill_seekers.cli.doc_scraper`` logger emits a WARNING-or-higher
          record containing both ``"Invalid value"`` and *rendered*;
        * the resulting ``converter.skip_llms_txt`` is falsy.

        Args:
            bad_value: The invalid config value under test.
            rendered: Substring expected in the warning text for that value.
        """
        settings = {
            "name": "test",
            "base_url": "https://example.com/",
            "selectors": {"main_content": "article"},
            "skip_llms_txt": bad_value,  # Invalid type
        }
        with self.assertLogs("skill_seekers.cli.doc_scraper", level="WARNING") as captured:
            converter = DocToSkillConverter(settings, dry_run=True)
        self.assertFalse(converter.skip_llms_txt)
        # Include the captured lines so a failure shows what was actually logged.
        self.assertTrue(
            any("Invalid value" in line and rendered in line for line in captured.output),
            msg=f"no 'Invalid value' warning mentioning {rendered!r} in {captured.output!r}",
        )

    def test_skip_llms_txt_with_int_zero_logs_warning(self):
        """Integer 0 logs a warning and yields a falsy ``skip_llms_txt``."""
        self._assert_invalid_value_warns(0, "0")

    def test_skip_llms_txt_with_int_one_logs_warning(self):
        """Integer 1 logs a warning and yields a falsy ``skip_llms_txt``."""
        self._assert_invalid_value_warns(1, "1")

    def test_skip_llms_txt_with_string_logs_warning(self):
        """The string "true" logs a warning and yields a falsy ``skip_llms_txt``."""
        self._assert_invalid_value_warns("true", "true")

    def test_skip_llms_txt_with_none_logs_warning(self):
        """``None`` logs a warning and yields a falsy ``skip_llms_txt``."""
        self._assert_invalid_value_warns(None, "None")

    def test_scraping_proceeds_when_llms_txt_skipped(self):
        """With llms.txt skipped, ``scrape_all`` still attempts a page scrape."""
        settings = {
            "name": "test",
            "base_url": "https://example.com/",
            "selectors": {"main_content": "article"},
            "skip_llms_txt": True,
        }
        original_cwd = os.getcwd()
        # Run from a throwaway directory (presumably so anything the converter
        # writes stays out of the repo); cwd is restored before it is removed.
        with tempfile.TemporaryDirectory() as tmpdir:
            try:
                os.chdir(tmpdir)
                converter = DocToSkillConverter(settings, dry_run=False)

                # Record every URL handed to scrape_page.
                scraped_urls = []

                def record_scrape(url):
                    scraped_urls.append(url)
                    return None

                page_patch = patch.object(converter, "scrape_page", side_effect=record_scrape)
                summary_patch = patch.object(converter, "save_summary")
                with page_patch, summary_patch:
                    converter.scrape_all()

                # assertGreater reports the actual count on failure, unlike
                # assertTrue(len(...) > 0) which only says "False is not true".
                self.assertGreater(
                    len(scraped_urls),
                    0,
                    msg="scrape_page was never called although llms.txt was skipped",
                )
            finally:
                os.chdir(original_cwd)
# Allow running this test module directly as a script; unittest.main()
# discovers and runs the TestCase classes defined above.
if __name__ == "__main__":
    unittest.main()