fix: report accurate saved/skipped page counts and detect SPA sites (#320, #321)

The scraper previously reported len(visited_urls) as "Scraped N pages"
even when save_page() silently skipped pages with empty content (<50
chars). For JavaScript SPA sites this meant "Scraped 190 pages" followed
by "No scraped data found!" with no explanation.

Changes:
- Added pages_saved/pages_skipped counters to DocToSkillConverter
- save_page() now increments pages_skipped on skip, pages_saved on save
- New _log_scrape_completion() reports "(N saved, M skipped)" breakdown
- SPA detection warns when all/most pages have empty content
- build_skill() error now explains empty content cause when pages skipped
- Updated both sync and async scrape completion paths
- 14 new tests across 4 test classes (counting, messages, SPA, build)

Fixes #320

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
yusyus
2026-03-24 22:26:35 +03:00
parent 8152045e38
commit d76ab1d9a4
3 changed files with 261 additions and 3 deletions

View File

@@ -198,6 +198,8 @@ class DocToSkillConverter:
) # Track all ever-enqueued URLs for O(1) dedup
self.pages: list[dict[str, Any]] = []
self.pages_scraped = 0
self.pages_saved = 0
self.pages_skipped = 0
# Language detection
self.language_detector = LanguageDetector(min_confidence=0.15)
@@ -694,9 +696,12 @@ class DocToSkillConverter:
"""Save page data (skip pages with empty content)"""
# Skip pages with empty or very short content
if not page.get("content") or len(page.get("content", "")) < 50:
self.pages_skipped += 1
logger.debug("Skipping page with empty/short content: %s", page.get("url", "unknown"))
return
self.pages_saved += 1
url_hash = hashlib.md5(page["url"].encode()).hexdigest()[:10]
safe_title = _SAFE_TITLE_RE.sub("", page["title"])[:50]
safe_title = _SAFE_TITLE_SEP_RE.sub("_", safe_title)
@@ -1219,7 +1224,7 @@ class DocToSkillConverter:
)
logger.info("\n💡 To actually scrape, run without --dry-run")
else:
logger.info("\n✅ Scraped %d pages", len(self.visited_urls))
self._log_scrape_completion()
self.save_summary()
async def scrape_all_async(self) -> None:
@@ -1362,9 +1367,43 @@ class DocToSkillConverter:
)
logger.info("\n💡 To actually scrape, run without --dry-run")
else:
logger.info("\n✅ Scraped %d pages (async mode)", len(self.visited_urls))
self._log_scrape_completion()
self.save_summary()
def _log_scrape_completion(self) -> None:
    """Emit the final scrape summary, separating saved pages from skipped ones.

    Reads ``self.visited_urls``, ``self.pages_saved`` and
    ``self.pages_skipped`` (maintained by ``save_page``) and logs an
    accurate breakdown. Also applies two heuristics to warn when the
    target looks like a JavaScript-rendered (SPA) site whose pages
    yield no extractable content.
    """
    total_visited = len(self.visited_urls)

    # Summary line: only mention skips when at least one page was skipped.
    if self.pages_skipped:
        logger.info(
            "\n✅ Scraped %d pages (%d saved, %d skipped - empty content)",
            total_visited,
            self.pages_saved,
            self.pages_skipped,
        )
    else:
        logger.info(
            "\n✅ Scraped %d pages (%d saved)",
            total_visited,
            self.pages_saved,
        )

    # SPA heuristics — thresholds avoid warning on tiny crawls.
    nothing_saved = self.pages_saved == 0
    if total_visited >= 5 and nothing_saved:
        # Every visited page produced empty content: almost certainly a
        # client-side-rendered site.
        logger.warning(
            "⚠️ All %d pages had empty content. This site likely requires "
            "JavaScript rendering (SPA/React/Vue). Scraping cannot extract "
            "content from JavaScript-rendered pages.",
            total_visited,
        )
    elif total_visited >= 10 and self.pages_skipped > 0:
        # Partial emptiness: warn only when the vast majority (>80%) of
        # visited pages were skipped for empty content.
        empty_fraction = self.pages_skipped / total_visited
        if empty_fraction > 0.8:
            logger.warning(
                "⚠️ %d%% of pages had empty content. This site may use "
                "JavaScript rendering for some pages.",
                int(empty_fraction * 100),
            )
def save_summary(self) -> None:
"""Save scraping summary"""
summary = {
@@ -1731,6 +1770,12 @@ To refresh this skill with updated documentation:
if not pages:
logger.error("✗ No scraped data found!")
if self.pages_skipped > 0:
logger.error(
" %d pages were visited but had empty content. "
"The site may require JavaScript rendering (SPA).",
self.pages_skipped,
)
return False
logger.info(" ✓ Loaded %d pages\n", len(pages))