This commit is contained in:
Pablo Estevez
2026-01-17 17:29:21 +00:00
parent c89f059712
commit 5ed767ff9a
144 changed files with 14142 additions and 16488 deletions

View File

@@ -4,14 +4,11 @@ Tests for async scraping functionality
Tests the async/await implementation for parallel web scraping
"""
# Standard library imports (deduplicated; the mangled diff merge listed
# `os`, `unittest`, and `unittest.mock` twice).  All originally imported
# names are retained so later, unseen code cannot lose a dependency.
import asyncio
import os
import sys
import tempfile
import unittest
from collections import deque
from pathlib import Path
from unittest.mock import AsyncMock, MagicMock, Mock, patch

# Project import: the converter under test.
from skill_seekers.cli.doc_scraper import DocToSkillConverter
@@ -30,10 +27,10 @@ class TestAsyncConfiguration(unittest.TestCase):
def test_async_mode_default_false(self):
"""Test async mode is disabled by default"""
config = {
'name': 'test',
'base_url': 'https://example.com/',
'selectors': {'main_content': 'article'},
'max_pages': 10
"name": "test",
"base_url": "https://example.com/",
"selectors": {"main_content": "article"},
"max_pages": 10,
}
with tempfile.TemporaryDirectory() as tmpdir:
@@ -47,11 +44,11 @@ class TestAsyncConfiguration(unittest.TestCase):
def test_async_mode_enabled_from_config(self):
"""Test async mode can be enabled via config"""
config = {
'name': 'test',
'base_url': 'https://example.com/',
'selectors': {'main_content': 'article'},
'max_pages': 10,
'async_mode': True
"name": "test",
"base_url": "https://example.com/",
"selectors": {"main_content": "article"},
"max_pages": 10,
"async_mode": True,
}
with tempfile.TemporaryDirectory() as tmpdir:
@@ -65,11 +62,11 @@ class TestAsyncConfiguration(unittest.TestCase):
def test_async_mode_with_workers(self):
"""Test async mode works with multiple workers"""
config = {
'name': 'test',
'base_url': 'https://example.com/',
'selectors': {'main_content': 'article'},
'workers': 4,
'async_mode': True
"name": "test",
"base_url": "https://example.com/",
"selectors": {"main_content": "article"},
"workers": 4,
"async_mode": True,
}
with tempfile.TemporaryDirectory() as tmpdir:
@@ -95,34 +92,26 @@ class TestAsyncScrapeMethods(unittest.TestCase):
def test_scrape_page_async_exists(self):
    """Test scrape_page_async method exists and is a coroutine function."""
    # Minimal config with only the keys the converter requires.
    config = {"name": "test", "base_url": "https://example.com/", "selectors": {"main_content": "article"}}
    with tempfile.TemporaryDirectory() as tmpdir:
        try:
            # Work inside a throwaway directory so dry-run artifacts are discarded.
            os.chdir(tmpdir)
            converter = DocToSkillConverter(config, dry_run=True)
            # The async entry point must exist and actually be awaitable.
            self.assertTrue(hasattr(converter, "scrape_page_async"))
            self.assertTrue(asyncio.iscoroutinefunction(converter.scrape_page_async))
        finally:
            # Restore the working directory (self.original_cwd is presumably
            # captured in setUp — not visible in this view).
            os.chdir(self.original_cwd)
def test_scrape_all_async_exists(self):
    """Test scrape_all_async method exists and is a coroutine function."""
    # Minimal config with only the keys the converter requires.
    config = {"name": "test", "base_url": "https://example.com/", "selectors": {"main_content": "article"}}
    with tempfile.TemporaryDirectory() as tmpdir:
        try:
            # Work inside a throwaway directory so dry-run artifacts are discarded.
            os.chdir(tmpdir)
            converter = DocToSkillConverter(config, dry_run=True)
            # The batch async entry point must exist and actually be awaitable.
            self.assertTrue(hasattr(converter, "scrape_all_async"))
            self.assertTrue(asyncio.iscoroutinefunction(converter.scrape_all_async))
        finally:
            # Restore the working directory (self.original_cwd is presumably
            # captured in setUp — not visible in this view).
            os.chdir(self.original_cwd)
@@ -142,11 +131,11 @@ class TestAsyncRouting(unittest.TestCase):
def test_scrape_all_routes_to_async_when_enabled(self):
"""Test scrape_all calls async version when async_mode=True"""
config = {
'name': 'test',
'base_url': 'https://example.com/',
'selectors': {'main_content': 'article'},
'async_mode': True,
'max_pages': 1
"name": "test",
"base_url": "https://example.com/",
"selectors": {"main_content": "article"},
"async_mode": True,
"max_pages": 1,
}
with tempfile.TemporaryDirectory() as tmpdir:
@@ -155,7 +144,7 @@ class TestAsyncRouting(unittest.TestCase):
converter = DocToSkillConverter(config, dry_run=True)
# Mock scrape_all_async to verify it gets called
with patch.object(converter, 'scrape_all_async', new_callable=AsyncMock) as mock_async:
with patch.object(converter, "scrape_all_async", new_callable=AsyncMock) as mock_async:
converter.scrape_all()
# Verify async version was called
mock_async.assert_called_once()
@@ -165,11 +154,11 @@ class TestAsyncRouting(unittest.TestCase):
def test_scrape_all_uses_sync_when_async_disabled(self):
"""Test scrape_all uses sync version when async_mode=False"""
config = {
'name': 'test',
'base_url': 'https://example.com/',
'selectors': {'main_content': 'article'},
'async_mode': False,
'max_pages': 1
"name": "test",
"base_url": "https://example.com/",
"selectors": {"main_content": "article"},
"async_mode": False,
"max_pages": 1,
}
with tempfile.TemporaryDirectory() as tmpdir:
@@ -178,8 +167,8 @@ class TestAsyncRouting(unittest.TestCase):
converter = DocToSkillConverter(config, dry_run=True)
# Mock scrape_all_async to verify it does NOT get called
with patch.object(converter, 'scrape_all_async', new_callable=AsyncMock) as mock_async:
with patch.object(converter, '_try_llms_txt', return_value=False):
with patch.object(converter, "scrape_all_async", new_callable=AsyncMock) as mock_async:
with patch.object(converter, "_try_llms_txt", return_value=False):
converter.scrape_all()
# Verify async version was NOT called
mock_async.assert_not_called()
@@ -201,11 +190,11 @@ class TestAsyncDryRun(unittest.TestCase):
def test_async_dry_run_completes(self):
"""Test async dry run completes without errors"""
config = {
'name': 'test',
'base_url': 'https://example.com/',
'selectors': {'main_content': 'article'},
'async_mode': True,
'max_pages': 5
"name": "test",
"base_url": "https://example.com/",
"selectors": {"main_content": "article"},
"async_mode": True,
"max_pages": 5,
}
with tempfile.TemporaryDirectory() as tmpdir:
@@ -214,7 +203,7 @@ class TestAsyncDryRun(unittest.TestCase):
converter = DocToSkillConverter(config, dry_run=True)
# Mock _try_llms_txt to skip llms.txt detection
with patch.object(converter, '_try_llms_txt', return_value=False):
with patch.object(converter, "_try_llms_txt", return_value=False):
# Should complete without errors
converter.scrape_all()
# Verify dry run mode was used
@@ -237,12 +226,12 @@ class TestAsyncErrorHandling(unittest.TestCase):
def test_async_handles_http_errors(self):
"""Test async scraping handles HTTP errors gracefully"""
config = {
'name': 'test',
'base_url': 'https://example.com/',
'selectors': {'main_content': 'article'},
'async_mode': True,
'workers': 2,
'max_pages': 1
"name": "test",
"base_url": "https://example.com/",
"selectors": {"main_content": "article"},
"async_mode": True,
"workers": 2,
"max_pages": 1,
}
with tempfile.TemporaryDirectory() as tmpdir:
@@ -258,9 +247,9 @@ class TestAsyncErrorHandling(unittest.TestCase):
async with httpx.AsyncClient() as client:
# Mock client.get to raise exception
with patch.object(client, 'get', side_effect=httpx.HTTPError("Test error")):
with patch.object(client, "get", side_effect=httpx.HTTPError("Test error")):
# Should not raise exception, just log error
await converter.scrape_page_async('https://example.com/test', semaphore, client)
await converter.scrape_page_async("https://example.com/test", semaphore, client)
# Run async test
asyncio.run(run_test())
@@ -275,11 +264,11 @@ class TestAsyncPerformance(unittest.TestCase):
def test_async_uses_semaphore_for_concurrency_control(self):
"""Test async mode uses semaphore instead of threading lock"""
config = {
'name': 'test',
'base_url': 'https://example.com/',
'selectors': {'main_content': 'article'},
'async_mode': True,
'workers': 4
"name": "test",
"base_url": "https://example.com/",
"selectors": {"main_content": "article"},
"async_mode": True,
"workers": 4,
}
original_cwd = os.getcwd()
@@ -301,10 +290,10 @@ class TestAsyncLlmsTxtIntegration(unittest.TestCase):
def test_async_respects_llms_txt(self):
"""Test async mode respects llms.txt and skips HTML scraping"""
config = {
'name': 'test',
'base_url': 'https://example.com/',
'selectors': {'main_content': 'article'},
'async_mode': True
"name": "test",
"base_url": "https://example.com/",
"selectors": {"main_content": "article"},
"async_mode": True,
}
original_cwd = os.getcwd()
@@ -314,8 +303,8 @@ class TestAsyncLlmsTxtIntegration(unittest.TestCase):
converter = DocToSkillConverter(config, dry_run=False)
# Mock _try_llms_txt to return True (llms.txt found)
with patch.object(converter, '_try_llms_txt', return_value=True):
with patch.object(converter, 'save_summary'):
with patch.object(converter, "_try_llms_txt", return_value=True):
with patch.object(converter, "save_summary"):
converter.scrape_all()
# If llms.txt succeeded, async scraping should be skipped
# Verify by checking that pages were not scraped
@@ -324,5 +313,5 @@ class TestAsyncLlmsTxtIntegration(unittest.TestCase):
os.chdir(original_cwd)
# Script entry point: run the test suite when executed directly.
# (The markerless diff left both the old and new guard lines; keep one.)
if __name__ == "__main__":
    unittest.main()