The scraper previously reported len(visited_urls) as "Scraped N pages" even when save_page() silently skipped pages with empty content (<50 chars). For JavaScript SPA sites this meant "Scraped 190 pages" followed by "No scraped data found!" with no explanation. Changes: - Added pages_saved/pages_skipped counters to DocToSkillConverter - save_page() now increments pages_skipped on skip, pages_saved on save - New _log_scrape_completion() reports "(N saved, M skipped)" breakdown - SPA detection warns when all/most pages have empty content - build_skill() error now explains empty content cause when pages skipped - Updated both sync and async scrape completion paths - 14 new tests across 4 test classes (counting, messages, SPA, build) Fixes #320 Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
195
tests/test_scrape_count.py
Normal file
195
tests/test_scrape_count.py
Normal file
@@ -0,0 +1,195 @@
|
||||
"""Tests for accurate scrape page counting (#320) and SPA detection (#321)."""
|
||||
|
||||
import logging
|
||||
import os
|
||||
|
||||
import pytest
|
||||
|
||||
from skill_seekers.cli.doc_scraper import DocToSkillConverter
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def converter(tmp_path):
|
||||
"""Create a converter with tmp output directory."""
|
||||
config = {
|
||||
"name": "test-site",
|
||||
"base_url": "https://example.com",
|
||||
"selectors": {"title": "title", "code_blocks": "pre code"},
|
||||
"url_patterns": {"include": [], "exclude": []},
|
||||
"rate_limit": 0,
|
||||
"max_pages": 100,
|
||||
}
|
||||
conv = DocToSkillConverter(config)
|
||||
# Override paths to use tmp_path
|
||||
conv.data_dir = str(tmp_path / "test-site_data")
|
||||
conv.skill_dir = str(tmp_path / "test-site")
|
||||
os.makedirs(os.path.join(conv.data_dir, "pages"), exist_ok=True)
|
||||
return conv
|
||||
|
||||
|
||||
class TestPageCounting:
|
||||
"""Tests for pages_saved and pages_skipped counters."""
|
||||
|
||||
def test_initial_counters_are_zero(self, converter):
|
||||
assert converter.pages_saved == 0
|
||||
assert converter.pages_skipped == 0
|
||||
|
||||
def test_save_page_increments_saved_counter(self, converter):
|
||||
page = {
|
||||
"url": "https://example.com/page1",
|
||||
"title": "Test Page",
|
||||
"content": "This is a test page with enough content to pass the 50 char minimum threshold for saving.",
|
||||
}
|
||||
converter.save_page(page)
|
||||
assert converter.pages_saved == 1
|
||||
assert converter.pages_skipped == 0
|
||||
|
||||
def test_save_page_increments_skipped_for_empty_content(self, converter):
|
||||
page = {
|
||||
"url": "https://example.com/empty",
|
||||
"title": "Empty",
|
||||
"content": "",
|
||||
}
|
||||
converter.save_page(page)
|
||||
assert converter.pages_saved == 0
|
||||
assert converter.pages_skipped == 1
|
||||
|
||||
def test_save_page_increments_skipped_for_short_content(self, converter):
|
||||
page = {
|
||||
"url": "https://example.com/short",
|
||||
"title": "Short",
|
||||
"content": "Too short",
|
||||
}
|
||||
converter.save_page(page)
|
||||
assert converter.pages_saved == 0
|
||||
assert converter.pages_skipped == 1
|
||||
|
||||
def test_save_page_increments_skipped_for_missing_content(self, converter):
|
||||
page = {
|
||||
"url": "https://example.com/none",
|
||||
"title": "No Content",
|
||||
}
|
||||
converter.save_page(page)
|
||||
assert converter.pages_saved == 0
|
||||
assert converter.pages_skipped == 1
|
||||
|
||||
def test_multiple_saves_track_correctly(self, converter):
|
||||
good_page = {
|
||||
"url": "https://example.com/good",
|
||||
"title": "Good Page",
|
||||
"content": "This page has plenty of content to pass the 50 character minimum threshold for saving pages.",
|
||||
}
|
||||
empty_page = {
|
||||
"url": "https://example.com/empty",
|
||||
"title": "Empty",
|
||||
"content": "",
|
||||
}
|
||||
converter.save_page(good_page)
|
||||
converter.save_page(empty_page)
|
||||
converter.save_page(good_page) # same URL, still counts
|
||||
assert converter.pages_saved == 2
|
||||
assert converter.pages_skipped == 1
|
||||
|
||||
|
||||
class TestCompletionMessages:
|
||||
"""Tests for scrape completion log messages."""
|
||||
|
||||
def test_completion_message_shows_saved_and_skipped(self, converter, caplog):
|
||||
"""The completion message should report saved/skipped breakdown."""
|
||||
converter.visited_urls = {"url1", "url2", "url3"}
|
||||
converter.pages_saved = 2
|
||||
converter.pages_skipped = 1
|
||||
|
||||
with caplog.at_level(logging.INFO):
|
||||
converter._log_scrape_completion()
|
||||
|
||||
assert "2 saved" in caplog.text
|
||||
assert "1 skipped" in caplog.text
|
||||
|
||||
def test_completion_message_no_skipped(self, converter, caplog):
|
||||
"""When nothing is skipped, don't mention skipped count."""
|
||||
converter.visited_urls = {"url1", "url2"}
|
||||
converter.pages_saved = 2
|
||||
converter.pages_skipped = 0
|
||||
|
||||
with caplog.at_level(logging.INFO):
|
||||
converter._log_scrape_completion()
|
||||
|
||||
assert "2 saved" in caplog.text
|
||||
assert "skipped" not in caplog.text.lower()
|
||||
|
||||
|
||||
class TestSPADetection:
|
||||
"""Tests for SPA site detection warnings (#321)."""
|
||||
|
||||
def test_spa_warning_when_zero_saved(self, converter, caplog):
|
||||
"""Warn when 0 pages saved out of many visited."""
|
||||
converter.visited_urls = {f"url{i}" for i in range(50)}
|
||||
converter.pages_saved = 0
|
||||
converter.pages_skipped = 50
|
||||
|
||||
with caplog.at_level(logging.WARNING):
|
||||
converter._log_scrape_completion()
|
||||
|
||||
assert "JavaScript" in caplog.text
|
||||
|
||||
def test_spa_warning_when_mostly_skipped(self, converter, caplog):
|
||||
"""Warn when >80% of pages are skipped."""
|
||||
converter.visited_urls = {f"url{i}" for i in range(100)}
|
||||
converter.pages_saved = 10
|
||||
converter.pages_skipped = 90
|
||||
|
||||
with caplog.at_level(logging.WARNING):
|
||||
converter._log_scrape_completion()
|
||||
|
||||
assert "JavaScript" in caplog.text
|
||||
|
||||
def test_no_spa_warning_when_mostly_saved(self, converter, caplog):
|
||||
"""No warning when most pages are saved."""
|
||||
converter.visited_urls = {f"url{i}" for i in range(100)}
|
||||
converter.pages_saved = 80
|
||||
converter.pages_skipped = 20
|
||||
|
||||
with caplog.at_level(logging.WARNING):
|
||||
converter._log_scrape_completion()
|
||||
|
||||
assert "JavaScript" not in caplog.text
|
||||
|
||||
def test_no_spa_warning_for_small_scrapes(self, converter, caplog):
|
||||
"""Don't warn for very small scrapes (< 5 pages)."""
|
||||
converter.visited_urls = {"url1", "url2", "url3"}
|
||||
converter.pages_saved = 0
|
||||
converter.pages_skipped = 3
|
||||
|
||||
with caplog.at_level(logging.WARNING):
|
||||
converter._log_scrape_completion()
|
||||
|
||||
# Small scrape - could just be a few bad pages, not SPA
|
||||
assert "JavaScript" not in caplog.text
|
||||
|
||||
|
||||
class TestBuildSkillError:
|
||||
"""Tests for improved build_skill error message."""
|
||||
|
||||
def test_build_skill_error_suggests_cause(self, converter, caplog):
|
||||
"""build_skill should suggest SPA as cause when 0 pages saved."""
|
||||
converter.pages_saved = 0
|
||||
converter.pages_skipped = 50
|
||||
|
||||
with caplog.at_level(logging.ERROR):
|
||||
result = converter.build_skill()
|
||||
|
||||
assert result is False
|
||||
assert "No scraped data found" in caplog.text
|
||||
assert "empty content" in caplog.text
|
||||
|
||||
def test_build_skill_error_generic_when_no_skips(self, converter, caplog):
|
||||
"""build_skill with no data and no skips = generic error (not SPA)."""
|
||||
converter.pages_saved = 0
|
||||
converter.pages_skipped = 0
|
||||
|
||||
with caplog.at_level(logging.ERROR):
|
||||
result = converter.build_skill()
|
||||
|
||||
assert result is False
|
||||
assert "No scraped data found" in caplog.text
|
||||
Reference in New Issue
Block a user