From d76ab1d9a41885c90558c2f527b38d6567f37c5f Mon Sep 17 00:00:00 2001
From: yusyus
Date: Tue, 24 Mar 2026 22:26:35 +0300
Subject: [PATCH] fix: report accurate saved/skipped page counts and detect SPA sites (#320, #321)

The scraper previously reported len(visited_urls) as "Scraped N pages"
even when save_page() silently skipped pages with empty content
(<50 chars). For JavaScript SPA sites this meant "Scraped 190 pages"
followed by "No scraped data found!" with no explanation.

Changes:
- Added pages_saved/pages_skipped counters to DocToSkillConverter
- save_page() now increments pages_skipped on skip, pages_saved on save
- New _log_scrape_completion() reports "(N saved, M skipped)" breakdown
- SPA detection warns when all/most pages have empty content
- build_skill() error now explains empty content cause when pages skipped
- Updated both sync and async scrape completion paths
- 14 new tests across 4 test classes (counting, messages, SPA, build)

Fixes #320

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 CHANGELOG.md                         |  20 ++-
 src/skill_seekers/cli/doc_scraper.py |  49 ++++++-
 tests/test_scrape_count.py           | 195 +++++++++++++++++++++++++++
 3 files changed, 261 insertions(+), 3 deletions(-)
 create mode 100644 tests/test_scrape_count.py

diff --git a/CHANGELOG.md b/CHANGELOG.md
index c5f4826..e6d5fe3 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -16,12 +16,30 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - **7 new CLI agent install paths**: roo, cline, aider, bolt, kilo, continue, kimi-code (total: 18 agents)
 - **OpenCode skill splitter** - Auto-split large docs into focused sub-skills with router
 - **Bi-directional skill converter** - Import/export between OpenCode and any platform format
-- **GitHub Actions template** for automated skill updates (`templates/github-actions/update-skills.yml`)
+- **Distribution files** for Smithery (`smithery.yaml`), GitHub Actions (`templates/github-actions/update-skills.yml`), and Claude Code Plugin
+- **Full UML architecture documentation** — 14 class diagrams synced from source code via StarUML
+- **StarUML HTML API reference** documentation export
+- **Ecosystem section** in README linking all Skill Seekers repos (PyPI, website, plugin, GitHub Action)
+
+### Fixed
+- **`sanitize_url()` crashes on Python 3.14** due to strict `urlparse` rejecting bracket-containing URLs (#284)
+- **Blindly appending `/index.html.md` to non-.md URLs** — now only appends for URLs that should have it (#277)
+- **Unified scraper temp config** uses unified format for `doc_scraper` instead of raw args (#317)
+- **Unicode arrows in CLI help text** replaced with ASCII for Windows cp1252 compatibility
+- **CLI flags in plugin slash commands** corrected (`create` uses `--preset`, `package` uses `--target`)
+- **MiniMax adaptor** improvements from PR #318 review (#319)
+- **Misleading "Scraped N pages" count** reported visited URLs instead of saved pages — now shows `(N saved, M skipped)` (#320)
+- **"No scraped data found" after successful scrape** on JavaScript SPA sites — now warns that site requires JS rendering (#320, #321)
 
 ### Changed
 - Refactored MiniMax adaptor to inherit from shared OpenAI-compatible base class
 - Platform count: 5 → 12 LLM targets
 - Agent count: 11 → 18 install paths
+- Consolidated `Docs/` into `docs/` (single documentation directory)
+- Removed stale root-level test scripts and junk files
+- Removed stale `UNIFIED_PARSERS.md` superseded by UML architecture
+- Added architecture references to README.md and CONTRIBUTING.md
+- Fixed pre-existing ruff format issues in 5 files
 
 ## [3.3.0] - 2026-03-16
 
diff --git a/src/skill_seekers/cli/doc_scraper.py b/src/skill_seekers/cli/doc_scraper.py
index 502ab30..89f6bbc 100755
--- a/src/skill_seekers/cli/doc_scraper.py
+++ b/src/skill_seekers/cli/doc_scraper.py
@@ -198,6 +198,8 @@ class DocToSkillConverter:
         )  # Track all ever-enqueued URLs for O(1) dedup
         self.pages: list[dict[str, Any]] = []
         self.pages_scraped = 0
+        self.pages_saved = 0
+        self.pages_skipped = 0
 
         # Language detection
         self.language_detector = LanguageDetector(min_confidence=0.15)
@@ -694,9 +696,12 @@ class DocToSkillConverter:
         """Save page data (skip pages with empty content)"""
         # Skip pages with empty or very short content
         if not page.get("content") or len(page.get("content", "")) < 50:
+            self.pages_skipped += 1
             logger.debug("Skipping page with empty/short content: %s", page.get("url", "unknown"))
             return
 
+        self.pages_saved += 1
+
         url_hash = hashlib.md5(page["url"].encode()).hexdigest()[:10]
         safe_title = _SAFE_TITLE_RE.sub("", page["title"])[:50]
         safe_title = _SAFE_TITLE_SEP_RE.sub("_", safe_title)
@@ -1219,7 +1224,7 @@ class DocToSkillConverter:
             )
             logger.info("\n💡 To actually scrape, run without --dry-run")
         else:
-            logger.info("\n✅ Scraped %d pages", len(self.visited_urls))
+            self._log_scrape_completion()
         self.save_summary()
 
     async def scrape_all_async(self) -> None:
@@ -1362,9 +1367,43 @@ class DocToSkillConverter:
             )
             logger.info("\n💡 To actually scrape, run without --dry-run")
         else:
-            logger.info("\n✅ Scraped %d pages (async mode)", len(self.visited_urls))
+            self._log_scrape_completion()
         self.save_summary()
 
+    def _log_scrape_completion(self) -> None:
+        """Log scrape completion with accurate saved/skipped counts."""
+        visited = len(self.visited_urls)
+        if self.pages_skipped > 0:
+            logger.info(
+                "\n✅ Scraped %d pages (%d saved, %d skipped - empty content)",
+                visited,
+                self.pages_saved,
+                self.pages_skipped,
+            )
+        else:
+            logger.info(
+                "\n✅ Scraped %d pages (%d saved)",
+                visited,
+                self.pages_saved,
+            )
+
+        # SPA detection: warn when most pages had empty content
+        if visited >= 5 and self.pages_saved == 0:
+            logger.warning(
+                "⚠️ All %d pages had empty content. This site likely requires "
+                "JavaScript rendering (SPA/React/Vue). Scraping cannot extract "
+                "content from JavaScript-rendered pages.",
+                visited,
+            )
+        elif visited >= 10 and self.pages_skipped > 0:
+            skip_ratio = self.pages_skipped / visited
+            if skip_ratio > 0.8:
+                logger.warning(
+                    "⚠️ %d%% of pages had empty content. This site may use "
+                    "JavaScript rendering for some pages.",
+                    int(skip_ratio * 100),
+                )
+
     def save_summary(self) -> None:
         """Save scraping summary"""
         summary = {
@@ -1731,6 +1770,12 @@ To refresh this skill with updated documentation:
 
         if not pages:
            logger.error("✗ No scraped data found!")
+            if self.pages_skipped > 0:
+                logger.error(
+                    "  %d pages were visited but had empty content. "
" + "The site may require JavaScript rendering (SPA).", + self.pages_skipped, + ) return False logger.info(" ✓ Loaded %d pages\n", len(pages)) diff --git a/tests/test_scrape_count.py b/tests/test_scrape_count.py new file mode 100644 index 0000000..02b4563 --- /dev/null +++ b/tests/test_scrape_count.py @@ -0,0 +1,195 @@ +"""Tests for accurate scrape page counting (#320) and SPA detection (#321).""" + +import logging +import os + +import pytest + +from skill_seekers.cli.doc_scraper import DocToSkillConverter + + +@pytest.fixture +def converter(tmp_path): + """Create a converter with tmp output directory.""" + config = { + "name": "test-site", + "base_url": "https://example.com", + "selectors": {"title": "title", "code_blocks": "pre code"}, + "url_patterns": {"include": [], "exclude": []}, + "rate_limit": 0, + "max_pages": 100, + } + conv = DocToSkillConverter(config) + # Override paths to use tmp_path + conv.data_dir = str(tmp_path / "test-site_data") + conv.skill_dir = str(tmp_path / "test-site") + os.makedirs(os.path.join(conv.data_dir, "pages"), exist_ok=True) + return conv + + +class TestPageCounting: + """Tests for pages_saved and pages_skipped counters.""" + + def test_initial_counters_are_zero(self, converter): + assert converter.pages_saved == 0 + assert converter.pages_skipped == 0 + + def test_save_page_increments_saved_counter(self, converter): + page = { + "url": "https://example.com/page1", + "title": "Test Page", + "content": "This is a test page with enough content to pass the 50 char minimum threshold for saving.", + } + converter.save_page(page) + assert converter.pages_saved == 1 + assert converter.pages_skipped == 0 + + def test_save_page_increments_skipped_for_empty_content(self, converter): + page = { + "url": "https://example.com/empty", + "title": "Empty", + "content": "", + } + converter.save_page(page) + assert converter.pages_saved == 0 + assert converter.pages_skipped == 1 + + def test_save_page_increments_skipped_for_short_content(self, converter): + page = { + "url": "https://example.com/short", + "title": "Short", + "content": "Too short", + } + converter.save_page(page) + assert converter.pages_saved == 0 + assert converter.pages_skipped == 1 + + def test_save_page_increments_skipped_for_missing_content(self, converter): + page = { + "url": "https://example.com/none", + "title": "No Content", + } + converter.save_page(page) + assert converter.pages_saved == 0 + assert converter.pages_skipped == 1 + + def test_multiple_saves_track_correctly(self, converter): + good_page = { + "url": "https://example.com/good", + "title": "Good Page", + "content": "This page has plenty of content to pass the 50 character minimum threshold for saving pages.", + } + empty_page = { + "url": "https://example.com/empty", + "title": "Empty", + "content": "", + } + converter.save_page(good_page) + converter.save_page(empty_page) + converter.save_page(good_page) # same URL, still counts + assert converter.pages_saved == 2 + assert converter.pages_skipped == 1 + + +class TestCompletionMessages: + """Tests for scrape completion log messages.""" + + def test_completion_message_shows_saved_and_skipped(self, converter, caplog): + """The completion message should report saved/skipped breakdown.""" + converter.visited_urls = {"url1", "url2", "url3"} + converter.pages_saved = 2 + converter.pages_skipped = 1 + + with caplog.at_level(logging.INFO): + converter._log_scrape_completion() + + assert "2 saved" in caplog.text + assert "1 skipped" in caplog.text + + def 
+    def test_completion_message_no_skipped(self, converter, caplog):
+        """When nothing is skipped, don't mention skipped count."""
+        converter.visited_urls = {"url1", "url2"}
+        converter.pages_saved = 2
+        converter.pages_skipped = 0
+
+        with caplog.at_level(logging.INFO):
+            converter._log_scrape_completion()
+
+        assert "2 saved" in caplog.text
+        assert "skipped" not in caplog.text.lower()
+
+
+class TestSPADetection:
+    """Tests for SPA site detection warnings (#321)."""
+
+    def test_spa_warning_when_zero_saved(self, converter, caplog):
+        """Warn when 0 pages saved out of many visited."""
+        converter.visited_urls = {f"url{i}" for i in range(50)}
+        converter.pages_saved = 0
+        converter.pages_skipped = 50
+
+        with caplog.at_level(logging.WARNING):
+            converter._log_scrape_completion()
+
+        assert "JavaScript" in caplog.text
+
+    def test_spa_warning_when_mostly_skipped(self, converter, caplog):
+        """Warn when >80% of pages are skipped."""
+        converter.visited_urls = {f"url{i}" for i in range(100)}
+        converter.pages_saved = 10
+        converter.pages_skipped = 90
+
+        with caplog.at_level(logging.WARNING):
+            converter._log_scrape_completion()
+
+        assert "JavaScript" in caplog.text
+
+    def test_no_spa_warning_when_mostly_saved(self, converter, caplog):
+        """No warning when most pages are saved."""
+        converter.visited_urls = {f"url{i}" for i in range(100)}
+        converter.pages_saved = 80
+        converter.pages_skipped = 20
+
+        with caplog.at_level(logging.WARNING):
+            converter._log_scrape_completion()
+
+        assert "JavaScript" not in caplog.text
+
+    def test_no_spa_warning_for_small_scrapes(self, converter, caplog):
+        """Don't warn for very small scrapes (< 5 pages)."""
+        converter.visited_urls = {"url1", "url2", "url3"}
+        converter.pages_saved = 0
+        converter.pages_skipped = 3
+
+        with caplog.at_level(logging.WARNING):
+            converter._log_scrape_completion()
+
+        # Small scrape - could just be a few bad pages, not SPA
+        assert "JavaScript" not in caplog.text
+
+
+class TestBuildSkillError:
+    """Tests for improved build_skill error message."""
+
+    def test_build_skill_error_suggests_cause(self, converter, caplog):
+        """build_skill should suggest SPA as cause when 0 pages saved."""
+        converter.pages_saved = 0
+        converter.pages_skipped = 50
+
+        with caplog.at_level(logging.ERROR):
+            result = converter.build_skill()
+
+        assert result is False
+        assert "No scraped data found" in caplog.text
+        assert "empty content" in caplog.text
+
+    def test_build_skill_error_generic_when_no_skips(self, converter, caplog):
+        """build_skill with no data and no skips = generic error (not SPA)."""
+        converter.pages_saved = 0
+        converter.pages_skipped = 0
+
+        with caplog.at_level(logging.ERROR):
+            result = converter.build_skill()
+
+        assert result is False
+        assert "No scraped data found" in caplog.text
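
Reviewer note (not part of the patch): below is a minimal standalone sketch
of the reporting rules _log_scrape_completion() implements, for trying the
thresholds without a scrape. The free function log_scrape_completion() is
hypothetical — the real code is a DocToSkillConverter method reading
self.visited_urls / self.pages_saved / self.pages_skipped — but the
thresholds and message shapes mirror the diff above.

    import logging

    logger = logging.getLogger("doc_scraper")

    def log_scrape_completion(visited: int, saved: int, skipped: int) -> None:
        # Accurate breakdown instead of the old "Scraped N pages" (#320)
        if skipped > 0:
            logger.info("✅ Scraped %d pages (%d saved, %d skipped - empty content)",
                        visited, saved, skipped)
        else:
            logger.info("✅ Scraped %d pages (%d saved)", visited, saved)

        # SPA heuristics (#321): all pages empty on >=5 visited, or >80%
        # empty on >=10 visited (visited >= 10 guarantees a nonzero divisor)
        if visited >= 5 and saved == 0:
            logger.warning("⚠️ All %d pages had empty content - site likely "
                           "requires JavaScript rendering", visited)
        elif visited >= 10 and skipped / visited > 0.8:
            logger.warning("⚠️ %d%% of pages had empty content - site may use "
                           "JavaScript rendering for some pages",
                           int(skipped / visited * 100))

    if __name__ == "__main__":
        logging.basicConfig(level=logging.INFO, format="%(message)s")
        # The #320 failure mode: 190 pages visited, none saved
        log_scrape_completion(visited=190, saved=0, skipped=190)

Running the sketch prints "✅ Scraped 190 pages (0 saved, 190 skipped -
empty content)" followed by the all-empty SPA warning, which is exactly the
explanation the old "Scraped 190 pages" / "No scraped data found!" pair
was missing.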