run ruff
This commit is contained in:
@@ -3,15 +3,9 @@
|
||||
Tests for parallel scraping, unlimited mode, and rate limiting features (PR #144)
|
||||
"""
|
||||
|
||||
import sys
|
||||
import os
|
||||
import unittest
|
||||
import tempfile
|
||||
import json
|
||||
import time
|
||||
from pathlib import Path
|
||||
from unittest.mock import Mock, patch, MagicMock
|
||||
from collections import deque
|
||||
import unittest
|
||||
|
||||
from skill_seekers.cli.doc_scraper import DocToSkillConverter
|
||||
|
||||
@@ -30,41 +24,41 @@ class TestParallelScrapingConfiguration(unittest.TestCase):
|
||||
def test_single_worker_default(self):
|
||||
"""Test default is single-worker mode"""
|
||||
config = {
|
||||
'name': 'test',
|
||||
'base_url': 'https://example.com/',
|
||||
'selectors': {'main_content': 'article'},
|
||||
'max_pages': 10
|
||||
"name": "test",
|
||||
"base_url": "https://example.com/",
|
||||
"selectors": {"main_content": "article"},
|
||||
"max_pages": 10,
|
||||
}
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
os.chdir(tmpdir)
|
||||
converter = DocToSkillConverter(config, dry_run=True)
|
||||
self.assertEqual(converter.workers, 1)
|
||||
self.assertFalse(hasattr(converter, 'lock'))
|
||||
self.assertFalse(hasattr(converter, "lock"))
|
||||
|
||||
def test_multiple_workers_creates_lock(self):
|
||||
"""Test multiple workers creates thread lock"""
|
||||
config = {
|
||||
'name': 'test',
|
||||
'base_url': 'https://example.com/',
|
||||
'selectors': {'main_content': 'article'},
|
||||
'max_pages': 10,
|
||||
'workers': 4
|
||||
"name": "test",
|
||||
"base_url": "https://example.com/",
|
||||
"selectors": {"main_content": "article"},
|
||||
"max_pages": 10,
|
||||
"workers": 4,
|
||||
}
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
os.chdir(tmpdir)
|
||||
converter = DocToSkillConverter(config, dry_run=True)
|
||||
self.assertEqual(converter.workers, 4)
|
||||
self.assertTrue(hasattr(converter, 'lock'))
|
||||
self.assertTrue(hasattr(converter, "lock"))
|
||||
|
||||
def test_workers_from_config(self):
|
||||
"""Test workers parameter is read from config"""
|
||||
config = {
|
||||
'name': 'test',
|
||||
'base_url': 'https://example.com/',
|
||||
'selectors': {'main_content': 'article'},
|
||||
'workers': 8
|
||||
"name": "test",
|
||||
"base_url": "https://example.com/",
|
||||
"selectors": {"main_content": "article"},
|
||||
"workers": 8,
|
||||
}
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
@@ -87,43 +81,39 @@ class TestUnlimitedMode(unittest.TestCase):
|
||||
def test_unlimited_with_none(self):
|
||||
"""Test max_pages: None enables unlimited mode"""
|
||||
config = {
|
||||
'name': 'test',
|
||||
'base_url': 'https://example.com/',
|
||||
'selectors': {'main_content': 'article'},
|
||||
'max_pages': None
|
||||
"name": "test",
|
||||
"base_url": "https://example.com/",
|
||||
"selectors": {"main_content": "article"},
|
||||
"max_pages": None,
|
||||
}
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
os.chdir(tmpdir)
|
||||
converter = DocToSkillConverter(config, dry_run=True)
|
||||
self.assertIsNone(converter.config.get('max_pages'))
|
||||
self.assertIsNone(converter.config.get("max_pages"))
|
||||
|
||||
def test_unlimited_with_minus_one(self):
|
||||
"""Test max_pages: -1 enables unlimited mode"""
|
||||
config = {
|
||||
'name': 'test',
|
||||
'base_url': 'https://example.com/',
|
||||
'selectors': {'main_content': 'article'},
|
||||
'max_pages': -1
|
||||
"name": "test",
|
||||
"base_url": "https://example.com/",
|
||||
"selectors": {"main_content": "article"},
|
||||
"max_pages": -1,
|
||||
}
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
os.chdir(tmpdir)
|
||||
converter = DocToSkillConverter(config, dry_run=True)
|
||||
self.assertEqual(converter.config.get('max_pages'), -1)
|
||||
self.assertEqual(converter.config.get("max_pages"), -1)
|
||||
|
||||
def test_limited_mode_default(self):
|
||||
"""Test default max_pages is limited"""
|
||||
config = {
|
||||
'name': 'test',
|
||||
'base_url': 'https://example.com/',
|
||||
'selectors': {'main_content': 'article'}
|
||||
}
|
||||
config = {"name": "test", "base_url": "https://example.com/", "selectors": {"main_content": "article"}}
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
os.chdir(tmpdir)
|
||||
converter = DocToSkillConverter(config, dry_run=True)
|
||||
max_pages = converter.config.get('max_pages', 500)
|
||||
max_pages = converter.config.get("max_pages", 500)
|
||||
self.assertIsNotNone(max_pages)
|
||||
self.assertGreater(max_pages, 0)
|
||||
|
||||
@@ -142,43 +132,39 @@ class TestRateLimiting(unittest.TestCase):
|
||||
def test_rate_limit_from_config(self):
|
||||
"""Test rate_limit is read from config"""
|
||||
config = {
|
||||
'name': 'test',
|
||||
'base_url': 'https://example.com/',
|
||||
'selectors': {'main_content': 'article'},
|
||||
'rate_limit': 0.1
|
||||
"name": "test",
|
||||
"base_url": "https://example.com/",
|
||||
"selectors": {"main_content": "article"},
|
||||
"rate_limit": 0.1,
|
||||
}
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
os.chdir(tmpdir)
|
||||
converter = DocToSkillConverter(config, dry_run=True)
|
||||
self.assertEqual(converter.config.get('rate_limit'), 0.1)
|
||||
self.assertEqual(converter.config.get("rate_limit"), 0.1)
|
||||
|
||||
def test_rate_limit_default(self):
|
||||
"""Test default rate_limit is 0.5"""
|
||||
config = {
|
||||
'name': 'test',
|
||||
'base_url': 'https://example.com/',
|
||||
'selectors': {'main_content': 'article'}
|
||||
}
|
||||
config = {"name": "test", "base_url": "https://example.com/", "selectors": {"main_content": "article"}}
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
os.chdir(tmpdir)
|
||||
converter = DocToSkillConverter(config, dry_run=True)
|
||||
self.assertEqual(converter.config.get('rate_limit', 0.5), 0.5)
|
||||
self.assertEqual(converter.config.get("rate_limit", 0.5), 0.5)
|
||||
|
||||
def test_zero_rate_limit_disables(self):
|
||||
"""Test rate_limit: 0 disables rate limiting"""
|
||||
config = {
|
||||
'name': 'test',
|
||||
'base_url': 'https://example.com/',
|
||||
'selectors': {'main_content': 'article'},
|
||||
'rate_limit': 0
|
||||
"name": "test",
|
||||
"base_url": "https://example.com/",
|
||||
"selectors": {"main_content": "article"},
|
||||
"rate_limit": 0,
|
||||
}
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
os.chdir(tmpdir)
|
||||
converter = DocToSkillConverter(config, dry_run=True)
|
||||
self.assertEqual(converter.config.get('rate_limit'), 0)
|
||||
self.assertEqual(converter.config.get("rate_limit"), 0)
|
||||
|
||||
|
||||
class TestThreadSafety(unittest.TestCase):
|
||||
@@ -195,10 +181,10 @@ class TestThreadSafety(unittest.TestCase):
|
||||
def test_lock_protects_visited_urls(self):
|
||||
"""Test visited_urls operations are protected by lock"""
|
||||
config = {
|
||||
'name': 'test',
|
||||
'base_url': 'https://example.com/',
|
||||
'selectors': {'main_content': 'article'},
|
||||
'workers': 4
|
||||
"name": "test",
|
||||
"base_url": "https://example.com/",
|
||||
"selectors": {"main_content": "article"},
|
||||
"workers": 4,
|
||||
}
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
@@ -206,25 +192,26 @@ class TestThreadSafety(unittest.TestCase):
|
||||
converter = DocToSkillConverter(config, dry_run=True)
|
||||
|
||||
# Verify lock exists
|
||||
self.assertTrue(hasattr(converter, 'lock'))
|
||||
self.assertTrue(hasattr(converter, "lock"))
|
||||
|
||||
# Verify it's a threading.Lock
|
||||
import threading
|
||||
|
||||
self.assertIsInstance(converter.lock, type(threading.Lock()))
|
||||
|
||||
def test_single_worker_no_lock(self):
|
||||
"""Test single worker doesn't create unnecessary lock"""
|
||||
config = {
|
||||
'name': 'test',
|
||||
'base_url': 'https://example.com/',
|
||||
'selectors': {'main_content': 'article'},
|
||||
'workers': 1
|
||||
"name": "test",
|
||||
"base_url": "https://example.com/",
|
||||
"selectors": {"main_content": "article"},
|
||||
"workers": 1,
|
||||
}
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
os.chdir(tmpdir)
|
||||
converter = DocToSkillConverter(config, dry_run=True)
|
||||
self.assertFalse(hasattr(converter, 'lock'))
|
||||
self.assertFalse(hasattr(converter, "lock"))
|
||||
|
||||
|
||||
class TestScrapingModes(unittest.TestCase):
|
||||
@@ -241,69 +228,69 @@ class TestScrapingModes(unittest.TestCase):
|
||||
def test_single_threaded_limited(self):
|
||||
"""Test traditional single-threaded limited mode"""
|
||||
config = {
|
||||
'name': 'test',
|
||||
'base_url': 'https://example.com/',
|
||||
'selectors': {'main_content': 'article'},
|
||||
'max_pages': 10,
|
||||
'workers': 1
|
||||
"name": "test",
|
||||
"base_url": "https://example.com/",
|
||||
"selectors": {"main_content": "article"},
|
||||
"max_pages": 10,
|
||||
"workers": 1,
|
||||
}
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
os.chdir(tmpdir)
|
||||
converter = DocToSkillConverter(config, dry_run=True)
|
||||
self.assertEqual(converter.workers, 1)
|
||||
self.assertEqual(converter.config.get('max_pages'), 10)
|
||||
self.assertEqual(converter.config.get("max_pages"), 10)
|
||||
|
||||
def test_parallel_limited(self):
|
||||
"""Test parallel scraping with page limit"""
|
||||
config = {
|
||||
'name': 'test',
|
||||
'base_url': 'https://example.com/',
|
||||
'selectors': {'main_content': 'article'},
|
||||
'max_pages': 100,
|
||||
'workers': 4
|
||||
"name": "test",
|
||||
"base_url": "https://example.com/",
|
||||
"selectors": {"main_content": "article"},
|
||||
"max_pages": 100,
|
||||
"workers": 4,
|
||||
}
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
os.chdir(tmpdir)
|
||||
converter = DocToSkillConverter(config, dry_run=True)
|
||||
self.assertEqual(converter.workers, 4)
|
||||
self.assertEqual(converter.config.get('max_pages'), 100)
|
||||
self.assertTrue(hasattr(converter, 'lock'))
|
||||
self.assertEqual(converter.config.get("max_pages"), 100)
|
||||
self.assertTrue(hasattr(converter, "lock"))
|
||||
|
||||
def test_parallel_unlimited(self):
|
||||
"""Test parallel scraping with unlimited pages"""
|
||||
config = {
|
||||
'name': 'test',
|
||||
'base_url': 'https://example.com/',
|
||||
'selectors': {'main_content': 'article'},
|
||||
'max_pages': None,
|
||||
'workers': 8
|
||||
"name": "test",
|
||||
"base_url": "https://example.com/",
|
||||
"selectors": {"main_content": "article"},
|
||||
"max_pages": None,
|
||||
"workers": 8,
|
||||
}
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
os.chdir(tmpdir)
|
||||
converter = DocToSkillConverter(config, dry_run=True)
|
||||
self.assertEqual(converter.workers, 8)
|
||||
self.assertIsNone(converter.config.get('max_pages'))
|
||||
self.assertTrue(hasattr(converter, 'lock'))
|
||||
self.assertIsNone(converter.config.get("max_pages"))
|
||||
self.assertTrue(hasattr(converter, "lock"))
|
||||
|
||||
def test_fast_scraping_mode(self):
|
||||
"""Test fast scraping with low rate limit and workers"""
|
||||
config = {
|
||||
'name': 'test',
|
||||
'base_url': 'https://example.com/',
|
||||
'selectors': {'main_content': 'article'},
|
||||
'rate_limit': 0.1,
|
||||
'workers': 8,
|
||||
'max_pages': 1000
|
||||
"name": "test",
|
||||
"base_url": "https://example.com/",
|
||||
"selectors": {"main_content": "article"},
|
||||
"rate_limit": 0.1,
|
||||
"workers": 8,
|
||||
"max_pages": 1000,
|
||||
}
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
os.chdir(tmpdir)
|
||||
converter = DocToSkillConverter(config, dry_run=True)
|
||||
self.assertEqual(converter.workers, 8)
|
||||
self.assertEqual(converter.config.get('rate_limit'), 0.1)
|
||||
self.assertEqual(converter.config.get("rate_limit"), 0.1)
|
||||
|
||||
|
||||
class TestDryRunWithNewFeatures(unittest.TestCase):
|
||||
@@ -320,10 +307,10 @@ class TestDryRunWithNewFeatures(unittest.TestCase):
|
||||
def test_dry_run_with_parallel(self):
|
||||
"""Test dry-run with parallel workers"""
|
||||
config = {
|
||||
'name': 'test',
|
||||
'base_url': 'https://example.com/',
|
||||
'selectors': {'main_content': 'article'},
|
||||
'workers': 4
|
||||
"name": "test",
|
||||
"base_url": "https://example.com/",
|
||||
"selectors": {"main_content": "article"},
|
||||
"workers": 4,
|
||||
}
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
@@ -335,18 +322,18 @@ class TestDryRunWithNewFeatures(unittest.TestCase):
|
||||
def test_dry_run_with_unlimited(self):
|
||||
"""Test dry-run with unlimited mode"""
|
||||
config = {
|
||||
'name': 'test',
|
||||
'base_url': 'https://example.com/',
|
||||
'selectors': {'main_content': 'article'},
|
||||
'max_pages': None
|
||||
"name": "test",
|
||||
"base_url": "https://example.com/",
|
||||
"selectors": {"main_content": "article"},
|
||||
"max_pages": None,
|
||||
}
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
os.chdir(tmpdir)
|
||||
converter = DocToSkillConverter(config, dry_run=True)
|
||||
self.assertTrue(converter.dry_run)
|
||||
self.assertIsNone(converter.config.get('max_pages'))
|
||||
self.assertIsNone(converter.config.get("max_pages"))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
|
||||
Reference in New Issue
Block a user