This commit is contained in:
Pablo Estevez
2026-01-17 17:29:21 +00:00
parent c89f059712
commit 5ed767ff9a
144 changed files with 14142 additions and 16488 deletions

View File

@@ -4,14 +4,11 @@ Tests for async scraping functionality
Tests the async/await implementation for parallel web scraping
"""
# Standard library imports (deduplicated; the mangled diff merge listed
# `os`, `unittest`, and `unittest.mock` twice).  All originally imported
# names are retained so later, unseen code cannot lose a dependency.
import asyncio
import os
import sys
import tempfile
import unittest
from collections import deque
from pathlib import Path
from unittest.mock import AsyncMock, MagicMock, Mock, patch

# Project import: the converter under test.
from skill_seekers.cli.doc_scraper import DocToSkillConverter
@@ -30,10 +27,10 @@ class TestAsyncConfiguration(unittest.TestCase):
def test_async_mode_default_false(self):
"""Test async mode is disabled by default"""
config = {
'name': 'test',
'base_url': 'https://example.com/',
'selectors': {'main_content': 'article'},
'max_pages': 10
"name": "test",
"base_url": "https://example.com/",
"selectors": {"main_content": "article"},
"max_pages": 10,
}
with tempfile.TemporaryDirectory() as tmpdir:
@@ -47,11 +44,11 @@ class TestAsyncConfiguration(unittest.TestCase):
def test_async_mode_enabled_from_config(self):
"""Test async mode can be enabled via config"""
config = {
'name': 'test',
'base_url': 'https://example.com/',
'selectors': {'main_content': 'article'},
'max_pages': 10,
'async_mode': True
"name": "test",
"base_url": "https://example.com/",
"selectors": {"main_content": "article"},
"max_pages": 10,
"async_mode": True,
}
with tempfile.TemporaryDirectory() as tmpdir:
@@ -65,11 +62,11 @@ class TestAsyncConfiguration(unittest.TestCase):
def test_async_mode_with_workers(self):
"""Test async mode works with multiple workers"""
config = {
'name': 'test',
'base_url': 'https://example.com/',
'selectors': {'main_content': 'article'},
'workers': 4,
'async_mode': True
"name": "test",
"base_url": "https://example.com/",
"selectors": {"main_content": "article"},
"workers": 4,
"async_mode": True,
}
with tempfile.TemporaryDirectory() as tmpdir:
@@ -95,34 +92,26 @@ class TestAsyncScrapeMethods(unittest.TestCase):
def test_scrape_page_async_exists(self):
    """Test scrape_page_async method exists and is a coroutine function."""
    # Minimal config with only the keys the converter requires.
    config = {"name": "test", "base_url": "https://example.com/", "selectors": {"main_content": "article"}}
    with tempfile.TemporaryDirectory() as tmpdir:
        try:
            # Work inside a throwaway directory so dry-run artifacts are discarded.
            os.chdir(tmpdir)
            converter = DocToSkillConverter(config, dry_run=True)
            # The async entry point must exist and actually be awaitable.
            self.assertTrue(hasattr(converter, "scrape_page_async"))
            self.assertTrue(asyncio.iscoroutinefunction(converter.scrape_page_async))
        finally:
            # Restore the working directory (self.original_cwd is presumably
            # captured in setUp — not visible in this view).
            os.chdir(self.original_cwd)
def test_scrape_all_async_exists(self):
    """Test scrape_all_async method exists and is a coroutine function."""
    # Minimal config with only the keys the converter requires.
    config = {"name": "test", "base_url": "https://example.com/", "selectors": {"main_content": "article"}}
    with tempfile.TemporaryDirectory() as tmpdir:
        try:
            # Work inside a throwaway directory so dry-run artifacts are discarded.
            os.chdir(tmpdir)
            converter = DocToSkillConverter(config, dry_run=True)
            # The batch async entry point must exist and actually be awaitable.
            self.assertTrue(hasattr(converter, "scrape_all_async"))
            self.assertTrue(asyncio.iscoroutinefunction(converter.scrape_all_async))
        finally:
            # Restore the working directory (self.original_cwd is presumably
            # captured in setUp — not visible in this view).
            os.chdir(self.original_cwd)
@@ -142,11 +131,11 @@ class TestAsyncRouting(unittest.TestCase):
def test_scrape_all_routes_to_async_when_enabled(self):
"""Test scrape_all calls async version when async_mode=True"""
config = {
'name': 'test',
'base_url': 'https://example.com/',
'selectors': {'main_content': 'article'},
'async_mode': True,
'max_pages': 1
"name": "test",
"base_url": "https://example.com/",
"selectors": {"main_content": "article"},
"async_mode": True,
"max_pages": 1,
}
with tempfile.TemporaryDirectory() as tmpdir:
@@ -155,7 +144,7 @@ class TestAsyncRouting(unittest.TestCase):
converter = DocToSkillConverter(config, dry_run=True)
# Mock scrape_all_async to verify it gets called
with patch.object(converter, 'scrape_all_async', new_callable=AsyncMock) as mock_async:
with patch.object(converter, "scrape_all_async", new_callable=AsyncMock) as mock_async:
converter.scrape_all()
# Verify async version was called
mock_async.assert_called_once()
@@ -165,11 +154,11 @@ class TestAsyncRouting(unittest.TestCase):
def test_scrape_all_uses_sync_when_async_disabled(self):
"""Test scrape_all uses sync version when async_mode=False"""
config = {
'name': 'test',
'base_url': 'https://example.com/',
'selectors': {'main_content': 'article'},
'async_mode': False,
'max_pages': 1
"name": "test",
"base_url": "https://example.com/",
"selectors": {"main_content": "article"},
"async_mode": False,
"max_pages": 1,
}
with tempfile.TemporaryDirectory() as tmpdir:
@@ -178,8 +167,8 @@ class TestAsyncRouting(unittest.TestCase):
converter = DocToSkillConverter(config, dry_run=True)
# Mock scrape_all_async to verify it does NOT get called
with patch.object(converter, 'scrape_all_async', new_callable=AsyncMock) as mock_async:
with patch.object(converter, '_try_llms_txt', return_value=False):
with patch.object(converter, "scrape_all_async", new_callable=AsyncMock) as mock_async:
with patch.object(converter, "_try_llms_txt", return_value=False):
converter.scrape_all()
# Verify async version was NOT called
mock_async.assert_not_called()
@@ -201,11 +190,11 @@ class TestAsyncDryRun(unittest.TestCase):
def test_async_dry_run_completes(self):
"""Test async dry run completes without errors"""
config = {
'name': 'test',
'base_url': 'https://example.com/',
'selectors': {'main_content': 'article'},
'async_mode': True,
'max_pages': 5
"name": "test",
"base_url": "https://example.com/",
"selectors": {"main_content": "article"},
"async_mode": True,
"max_pages": 5,
}
with tempfile.TemporaryDirectory() as tmpdir:
@@ -214,7 +203,7 @@ class TestAsyncDryRun(unittest.TestCase):
converter = DocToSkillConverter(config, dry_run=True)
# Mock _try_llms_txt to skip llms.txt detection
with patch.object(converter, '_try_llms_txt', return_value=False):
with patch.object(converter, "_try_llms_txt", return_value=False):
# Should complete without errors
converter.scrape_all()
# Verify dry run mode was used
@@ -237,12 +226,12 @@ class TestAsyncErrorHandling(unittest.TestCase):
def test_async_handles_http_errors(self):
"""Test async scraping handles HTTP errors gracefully"""
config = {
'name': 'test',
'base_url': 'https://example.com/',
'selectors': {'main_content': 'article'},
'async_mode': True,
'workers': 2,
'max_pages': 1
"name": "test",
"base_url": "https://example.com/",
"selectors": {"main_content": "article"},
"async_mode": True,
"workers": 2,
"max_pages": 1,
}
with tempfile.TemporaryDirectory() as tmpdir:
@@ -258,9 +247,9 @@ class TestAsyncErrorHandling(unittest.TestCase):
async with httpx.AsyncClient() as client:
# Mock client.get to raise exception
with patch.object(client, 'get', side_effect=httpx.HTTPError("Test error")):
with patch.object(client, "get", side_effect=httpx.HTTPError("Test error")):
# Should not raise exception, just log error
await converter.scrape_page_async('https://example.com/test', semaphore, client)
await converter.scrape_page_async("https://example.com/test", semaphore, client)
# Run async test
asyncio.run(run_test())
@@ -275,11 +264,11 @@ class TestAsyncPerformance(unittest.TestCase):
def test_async_uses_semaphore_for_concurrency_control(self):
"""Test async mode uses semaphore instead of threading lock"""
config = {
'name': 'test',
'base_url': 'https://example.com/',
'selectors': {'main_content': 'article'},
'async_mode': True,
'workers': 4
"name": "test",
"base_url": "https://example.com/",
"selectors": {"main_content": "article"},
"async_mode": True,
"workers": 4,
}
original_cwd = os.getcwd()
@@ -301,10 +290,10 @@ class TestAsyncLlmsTxtIntegration(unittest.TestCase):
def test_async_respects_llms_txt(self):
"""Test async mode respects llms.txt and skips HTML scraping"""
config = {
'name': 'test',
'base_url': 'https://example.com/',
'selectors': {'main_content': 'article'},
'async_mode': True
"name": "test",
"base_url": "https://example.com/",
"selectors": {"main_content": "article"},
"async_mode": True,
}
original_cwd = os.getcwd()
@@ -314,8 +303,8 @@ class TestAsyncLlmsTxtIntegration(unittest.TestCase):
converter = DocToSkillConverter(config, dry_run=False)
# Mock _try_llms_txt to return True (llms.txt found)
with patch.object(converter, '_try_llms_txt', return_value=True):
with patch.object(converter, 'save_summary'):
with patch.object(converter, "_try_llms_txt", return_value=True):
with patch.object(converter, "save_summary"):
converter.scrape_all()
# If llms.txt succeeded, async scraping should be skipped
# Verify by checking that pages were not scraped
@@ -324,5 +313,5 @@ class TestAsyncLlmsTxtIntegration(unittest.TestCase):
os.chdir(original_cwd)
# Script entry point: run the test suite when executed directly.
# (The markerless diff left both the old and new guard lines; keep one.)
if __name__ == "__main__":
    unittest.main()