run ruff
This commit is contained in:
@@ -4,14 +4,11 @@ Tests for async scraping functionality
|
||||
Tests the async/await implementation for parallel web scraping
|
||||
"""
|
||||
|
||||
import sys
|
||||
import os
|
||||
import unittest
|
||||
import asyncio
|
||||
import os
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from unittest.mock import Mock, patch, AsyncMock, MagicMock
|
||||
from collections import deque
|
||||
import unittest
|
||||
from unittest.mock import AsyncMock, patch
|
||||
|
||||
from skill_seekers.cli.doc_scraper import DocToSkillConverter
|
||||
|
||||
@@ -30,10 +27,10 @@ class TestAsyncConfiguration(unittest.TestCase):
|
||||
def test_async_mode_default_false(self):
|
||||
"""Test async mode is disabled by default"""
|
||||
config = {
|
||||
'name': 'test',
|
||||
'base_url': 'https://example.com/',
|
||||
'selectors': {'main_content': 'article'},
|
||||
'max_pages': 10
|
||||
"name": "test",
|
||||
"base_url": "https://example.com/",
|
||||
"selectors": {"main_content": "article"},
|
||||
"max_pages": 10,
|
||||
}
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
@@ -47,11 +44,11 @@ class TestAsyncConfiguration(unittest.TestCase):
|
||||
def test_async_mode_enabled_from_config(self):
|
||||
"""Test async mode can be enabled via config"""
|
||||
config = {
|
||||
'name': 'test',
|
||||
'base_url': 'https://example.com/',
|
||||
'selectors': {'main_content': 'article'},
|
||||
'max_pages': 10,
|
||||
'async_mode': True
|
||||
"name": "test",
|
||||
"base_url": "https://example.com/",
|
||||
"selectors": {"main_content": "article"},
|
||||
"max_pages": 10,
|
||||
"async_mode": True,
|
||||
}
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
@@ -65,11 +62,11 @@ class TestAsyncConfiguration(unittest.TestCase):
|
||||
def test_async_mode_with_workers(self):
|
||||
"""Test async mode works with multiple workers"""
|
||||
config = {
|
||||
'name': 'test',
|
||||
'base_url': 'https://example.com/',
|
||||
'selectors': {'main_content': 'article'},
|
||||
'workers': 4,
|
||||
'async_mode': True
|
||||
"name": "test",
|
||||
"base_url": "https://example.com/",
|
||||
"selectors": {"main_content": "article"},
|
||||
"workers": 4,
|
||||
"async_mode": True,
|
||||
}
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
@@ -95,34 +92,26 @@ class TestAsyncScrapeMethods(unittest.TestCase):
|
||||
|
||||
def test_scrape_page_async_exists(self):
|
||||
"""Test scrape_page_async method exists"""
|
||||
config = {
|
||||
'name': 'test',
|
||||
'base_url': 'https://example.com/',
|
||||
'selectors': {'main_content': 'article'}
|
||||
}
|
||||
config = {"name": "test", "base_url": "https://example.com/", "selectors": {"main_content": "article"}}
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
try:
|
||||
os.chdir(tmpdir)
|
||||
converter = DocToSkillConverter(config, dry_run=True)
|
||||
self.assertTrue(hasattr(converter, 'scrape_page_async'))
|
||||
self.assertTrue(hasattr(converter, "scrape_page_async"))
|
||||
self.assertTrue(asyncio.iscoroutinefunction(converter.scrape_page_async))
|
||||
finally:
|
||||
os.chdir(self.original_cwd)
|
||||
|
||||
def test_scrape_all_async_exists(self):
|
||||
"""Test scrape_all_async method exists"""
|
||||
config = {
|
||||
'name': 'test',
|
||||
'base_url': 'https://example.com/',
|
||||
'selectors': {'main_content': 'article'}
|
||||
}
|
||||
config = {"name": "test", "base_url": "https://example.com/", "selectors": {"main_content": "article"}}
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
try:
|
||||
os.chdir(tmpdir)
|
||||
converter = DocToSkillConverter(config, dry_run=True)
|
||||
self.assertTrue(hasattr(converter, 'scrape_all_async'))
|
||||
self.assertTrue(hasattr(converter, "scrape_all_async"))
|
||||
self.assertTrue(asyncio.iscoroutinefunction(converter.scrape_all_async))
|
||||
finally:
|
||||
os.chdir(self.original_cwd)
|
||||
@@ -142,11 +131,11 @@ class TestAsyncRouting(unittest.TestCase):
|
||||
def test_scrape_all_routes_to_async_when_enabled(self):
|
||||
"""Test scrape_all calls async version when async_mode=True"""
|
||||
config = {
|
||||
'name': 'test',
|
||||
'base_url': 'https://example.com/',
|
||||
'selectors': {'main_content': 'article'},
|
||||
'async_mode': True,
|
||||
'max_pages': 1
|
||||
"name": "test",
|
||||
"base_url": "https://example.com/",
|
||||
"selectors": {"main_content": "article"},
|
||||
"async_mode": True,
|
||||
"max_pages": 1,
|
||||
}
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
@@ -155,7 +144,7 @@ class TestAsyncRouting(unittest.TestCase):
|
||||
converter = DocToSkillConverter(config, dry_run=True)
|
||||
|
||||
# Mock scrape_all_async to verify it gets called
|
||||
with patch.object(converter, 'scrape_all_async', new_callable=AsyncMock) as mock_async:
|
||||
with patch.object(converter, "scrape_all_async", new_callable=AsyncMock) as mock_async:
|
||||
converter.scrape_all()
|
||||
# Verify async version was called
|
||||
mock_async.assert_called_once()
|
||||
@@ -165,11 +154,11 @@ class TestAsyncRouting(unittest.TestCase):
|
||||
def test_scrape_all_uses_sync_when_async_disabled(self):
|
||||
"""Test scrape_all uses sync version when async_mode=False"""
|
||||
config = {
|
||||
'name': 'test',
|
||||
'base_url': 'https://example.com/',
|
||||
'selectors': {'main_content': 'article'},
|
||||
'async_mode': False,
|
||||
'max_pages': 1
|
||||
"name": "test",
|
||||
"base_url": "https://example.com/",
|
||||
"selectors": {"main_content": "article"},
|
||||
"async_mode": False,
|
||||
"max_pages": 1,
|
||||
}
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
@@ -178,8 +167,8 @@ class TestAsyncRouting(unittest.TestCase):
|
||||
converter = DocToSkillConverter(config, dry_run=True)
|
||||
|
||||
# Mock scrape_all_async to verify it does NOT get called
|
||||
with patch.object(converter, 'scrape_all_async', new_callable=AsyncMock) as mock_async:
|
||||
with patch.object(converter, '_try_llms_txt', return_value=False):
|
||||
with patch.object(converter, "scrape_all_async", new_callable=AsyncMock) as mock_async:
|
||||
with patch.object(converter, "_try_llms_txt", return_value=False):
|
||||
converter.scrape_all()
|
||||
# Verify async version was NOT called
|
||||
mock_async.assert_not_called()
|
||||
@@ -201,11 +190,11 @@ class TestAsyncDryRun(unittest.TestCase):
|
||||
def test_async_dry_run_completes(self):
|
||||
"""Test async dry run completes without errors"""
|
||||
config = {
|
||||
'name': 'test',
|
||||
'base_url': 'https://example.com/',
|
||||
'selectors': {'main_content': 'article'},
|
||||
'async_mode': True,
|
||||
'max_pages': 5
|
||||
"name": "test",
|
||||
"base_url": "https://example.com/",
|
||||
"selectors": {"main_content": "article"},
|
||||
"async_mode": True,
|
||||
"max_pages": 5,
|
||||
}
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
@@ -214,7 +203,7 @@ class TestAsyncDryRun(unittest.TestCase):
|
||||
converter = DocToSkillConverter(config, dry_run=True)
|
||||
|
||||
# Mock _try_llms_txt to skip llms.txt detection
|
||||
with patch.object(converter, '_try_llms_txt', return_value=False):
|
||||
with patch.object(converter, "_try_llms_txt", return_value=False):
|
||||
# Should complete without errors
|
||||
converter.scrape_all()
|
||||
# Verify dry run mode was used
|
||||
@@ -237,12 +226,12 @@ class TestAsyncErrorHandling(unittest.TestCase):
|
||||
def test_async_handles_http_errors(self):
|
||||
"""Test async scraping handles HTTP errors gracefully"""
|
||||
config = {
|
||||
'name': 'test',
|
||||
'base_url': 'https://example.com/',
|
||||
'selectors': {'main_content': 'article'},
|
||||
'async_mode': True,
|
||||
'workers': 2,
|
||||
'max_pages': 1
|
||||
"name": "test",
|
||||
"base_url": "https://example.com/",
|
||||
"selectors": {"main_content": "article"},
|
||||
"async_mode": True,
|
||||
"workers": 2,
|
||||
"max_pages": 1,
|
||||
}
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
@@ -258,9 +247,9 @@ class TestAsyncErrorHandling(unittest.TestCase):
|
||||
|
||||
async with httpx.AsyncClient() as client:
|
||||
# Mock client.get to raise exception
|
||||
with patch.object(client, 'get', side_effect=httpx.HTTPError("Test error")):
|
||||
with patch.object(client, "get", side_effect=httpx.HTTPError("Test error")):
|
||||
# Should not raise exception, just log error
|
||||
await converter.scrape_page_async('https://example.com/test', semaphore, client)
|
||||
await converter.scrape_page_async("https://example.com/test", semaphore, client)
|
||||
|
||||
# Run async test
|
||||
asyncio.run(run_test())
|
||||
@@ -275,11 +264,11 @@ class TestAsyncPerformance(unittest.TestCase):
|
||||
def test_async_uses_semaphore_for_concurrency_control(self):
|
||||
"""Test async mode uses semaphore instead of threading lock"""
|
||||
config = {
|
||||
'name': 'test',
|
||||
'base_url': 'https://example.com/',
|
||||
'selectors': {'main_content': 'article'},
|
||||
'async_mode': True,
|
||||
'workers': 4
|
||||
"name": "test",
|
||||
"base_url": "https://example.com/",
|
||||
"selectors": {"main_content": "article"},
|
||||
"async_mode": True,
|
||||
"workers": 4,
|
||||
}
|
||||
|
||||
original_cwd = os.getcwd()
|
||||
@@ -301,10 +290,10 @@ class TestAsyncLlmsTxtIntegration(unittest.TestCase):
|
||||
def test_async_respects_llms_txt(self):
|
||||
"""Test async mode respects llms.txt and skips HTML scraping"""
|
||||
config = {
|
||||
'name': 'test',
|
||||
'base_url': 'https://example.com/',
|
||||
'selectors': {'main_content': 'article'},
|
||||
'async_mode': True
|
||||
"name": "test",
|
||||
"base_url": "https://example.com/",
|
||||
"selectors": {"main_content": "article"},
|
||||
"async_mode": True,
|
||||
}
|
||||
|
||||
original_cwd = os.getcwd()
|
||||
@@ -314,8 +303,8 @@ class TestAsyncLlmsTxtIntegration(unittest.TestCase):
|
||||
converter = DocToSkillConverter(config, dry_run=False)
|
||||
|
||||
# Mock _try_llms_txt to return True (llms.txt found)
|
||||
with patch.object(converter, '_try_llms_txt', return_value=True):
|
||||
with patch.object(converter, 'save_summary'):
|
||||
with patch.object(converter, "_try_llms_txt", return_value=True):
|
||||
with patch.object(converter, "save_summary"):
|
||||
converter.scrape_all()
|
||||
# If llms.txt succeeded, async scraping should be skipped
|
||||
# Verify by checking that pages were not scraped
|
||||
@@ -324,5 +313,5 @@ class TestAsyncLlmsTxtIntegration(unittest.TestCase):
|
||||
os.chdir(original_cwd)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
|
||||
Reference in New Issue
Block a user