feat: add headless browser rendering for JavaScript SPA sites (#321)

New BrowserRenderer class uses Playwright to render JavaScript-heavy
documentation sites (React, Vue SPAs) that return empty HTML shells
with requests.get(). Activated via --browser flag on web scraping.

- browser_renderer.py: Playwright wrapper with lazy browser launch,
  auto-install Chromium on first use, context manager support
- doc_scraper.py: browser_mode config, _render_with_browser() helper,
  integrated into scrape_page() and scrape_page_async()
- SPA detection warnings now suggest --browser flag
- Optional dep: pip install "skill-seekers[browser]"
- 14 real e2e tests (actual Chromium, no mocks)
- UML updated: Scrapers class diagram (BrowserRenderer + dependency),
  Parsers (DoctorParser), Utilities (Doctor), Components, and new
  Browser Rendering sequence diagram (#20)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
yusyus
2026-03-28 22:06:14 +03:00
parent 006cccabae
commit ea4fed0be4
15 changed files with 17989 additions and 17824 deletions

View File

@@ -173,6 +173,13 @@ UNIVERSAL_ARGUMENTS.update(RAG_ARGUMENTS)
# Web scraping specific (from scrape.py)
WEB_ARGUMENTS: dict[str, dict[str, Any]] = {
"browser": {
"flags": ("--browser",),
"kwargs": {
"action": "store_true",
"help": "Use headless browser (Playwright) to render JavaScript SPA sites",
},
},
"url": {
"flags": ("--url",),
"kwargs": {

View File

@@ -115,6 +115,13 @@ SCRAPE_ARGUMENTS: dict[str, dict[str, Any]] = {
"help": "Disable rate limiting completely (same as --rate-limit 0)",
},
},
"browser": {
"flags": ("--browser",),
"kwargs": {
"action": "store_true",
"help": "Use headless browser (Playwright) to render JavaScript SPA sites. Install: pip install 'skill-seekers[browser]'",
},
},
"interactive_enhancement": {
"flags": ("--interactive-enhancement",),
"kwargs": {

View File

@@ -0,0 +1,151 @@
"""
Browser Renderer — Playwright-based headless browser for JavaScript SPA sites.
When documentation sites use client-side rendering (React, Vue, etc.),
requests.get() returns empty HTML shells. This module uses Playwright
to render JavaScript before extracting content.
Optional dependency: pip install "skill-seekers[browser]"
"""
from __future__ import annotations
import logging
import subprocess
import sys
logger = logging.getLogger(__name__)
def _check_playwright_available() -> bool:
"""Check if playwright package is installed."""
try:
import playwright # noqa: F401
return True
except ImportError:
return False
def _auto_install_chromium() -> bool:
    """Auto-install the Chromium browser on first use.

    Runs ``<python> -m playwright install chromium`` in a subprocess with a
    5-minute timeout (the browser download is large).

    Returns:
        True if install succeeded or Chromium was already installed,
        False on failure (non-zero exit, timeout, or subprocess launch error).
    """
    logger.info("Installing Chromium browser for headless rendering...")
    try:
        result = subprocess.run(
            [sys.executable, "-m", "playwright", "install", "chromium"],
            capture_output=True,
            text=True,
            timeout=300,  # large download; allow up to 5 minutes
        )
    except subprocess.TimeoutExpired as e:
        logger.error("Failed to install Chromium: %s", e)
        return False
    except OSError as e:
        # e.g. the interpreter path is missing or not executable
        logger.error("Failed to install Chromium: %s", e)
        return False
    if result.returncode == 0:
        logger.info("Chromium installed successfully.")
        return True
    logger.error("Chromium install failed: %s", result.stderr)
    return False
class BrowserRenderer:
    """Render JavaScript pages using Playwright headless Chromium.

    Usage:
        renderer = BrowserRenderer()
        html = renderer.render_page("https://docs.discord.com")
        renderer.close()

    Or as context manager:
        with BrowserRenderer() as renderer:
            html = renderer.render_page(url)
    """

    def __init__(self, timeout: int = 30000, wait_until: str = "networkidle"):
        """Initialize renderer.

        Args:
            timeout: Page load timeout in milliseconds (default: 30s)
            wait_until: Playwright wait condition — "networkidle", "load",
                "domcontentloaded"

        Raises:
            ImportError: If the playwright package is not installed.
        """
        if not _check_playwright_available():
            raise ImportError(
                "Playwright is required for --browser mode.\n"
                "Install it with: pip install 'skill-seekers[browser]'\n"
                "Then run: playwright install chromium"
            )
        self._timeout = timeout
        self._wait_until = wait_until
        # Browser resources are launched lazily on first render_page() call.
        self._playwright = None
        self._browser = None
        self._context = None

    def _launch_chromium(self):
        """Launch headless Chromium, attempting an auto-install if missing.

        Returns:
            A running Playwright Browser instance.

        Raises:
            RuntimeError: If Chromium is missing and auto-install fails.
        """
        try:
            return self._playwright.chromium.launch(headless=True)
        except Exception:
            # Browser binaries not installed — try auto-install once.
            logger.warning("Chromium not found. Attempting auto-install...")
            if _auto_install_chromium():
                return self._playwright.chromium.launch(headless=True)
            raise RuntimeError(
                "Could not launch Chromium. Run: playwright install chromium"
            ) from None

    def _ensure_browser(self) -> None:
        """Launch browser if not already running. Auto-installs chromium if needed."""
        if self._browser is not None:
            return
        from playwright.sync_api import sync_playwright

        self._playwright = sync_playwright().start()
        try:
            self._browser = self._launch_chromium()
        except Exception:
            # Don't leak the Playwright driver process on ANY launch failure —
            # including a failed launch after a successful auto-install.
            self._playwright.stop()
            self._playwright = None
            raise
        self._context = self._browser.new_context(
            user_agent="Mozilla/5.0 (Documentation Scraper)"
        )

    def render_page(self, url: str) -> str:
        """Render a page with JavaScript execution and return the HTML.

        Args:
            url: URL to render

        Returns:
            Fully-rendered HTML string after JavaScript execution

        Raises:
            RuntimeError: If browser cannot be launched
            TimeoutError: If page load times out
        """
        self._ensure_browser()
        page = self._context.new_page()
        try:
            page.goto(url, wait_until=self._wait_until, timeout=self._timeout)
            return page.content()
        finally:
            # Always release the page, even on navigation errors/timeouts.
            page.close()

    def close(self) -> None:
        """Shut down browser and Playwright.

        Safe to call multiple times. Resources are released in reverse
        order of acquisition; a failure closing one resource does not
        prevent the remaining ones from being released.
        """
        try:
            if self._context:
                self._context.close()
        finally:
            self._context = None
            try:
                if self._browser:
                    self._browser.close()
            finally:
                self._browser = None
                try:
                    if self._playwright:
                        self._playwright.stop()
                finally:
                    self._playwright = None

    def __enter__(self):
        return self

    def __exit__(self, *args):
        self.close()

View File

@@ -221,6 +221,8 @@ class CreateCommand:
argv.append("--async")
if getattr(self.args, "no_rate_limit", False):
argv.append("--no-rate-limit")
if getattr(self.args, "browser", False):
argv.append("--browser")
# Call doc_scraper with modified argv
logger.debug(f"Calling doc_scraper with argv: {argv}")

View File

@@ -184,6 +184,10 @@ class DocToSkillConverter:
self.llms_txt_variant = None
self.llms_txt_variants: list[str] = [] # Track all downloaded variants
# Browser rendering mode (for JavaScript SPA sites)
self.browser_mode = config.get("browser", False)
self._browser_renderer = None
# Parallel scraping config
self.workers = config.get("workers", 1)
self.async_mode = config.get("async_mode", DEFAULT_ASYNC_MODE)
@@ -712,6 +716,24 @@ class DocToSkillConverter:
with open(filepath, "w", encoding="utf-8") as f:
json.dump(page, f, indent=2, ensure_ascii=False)
def _render_with_browser(self, url: str) -> str:
"""Render a page using headless browser (Playwright).
Lazily initializes the BrowserRenderer on first call.
Args:
url: URL to render
Returns:
Fully-rendered HTML string
"""
if self._browser_renderer is None:
from skill_seekers.cli.browser_renderer import BrowserRenderer
self._browser_renderer = BrowserRenderer()
logger.info("Launched headless browser for JavaScript rendering")
return self._browser_renderer.render_page(url)
def scrape_page(self, url: str) -> None:
"""Scrape a single page with thread-safe operations.
@@ -730,16 +752,22 @@ class DocToSkillConverter:
url = sanitize_url(url)
# Scraping part (no lock needed - independent)
headers = {"User-Agent": "Mozilla/5.0 (Documentation Scraper)"}
response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()
# Check if this is a Markdown file
if self._has_md_extension(url):
page = self._extract_markdown_content(response.text, url)
else:
soup = BeautifulSoup(response.content, "html.parser")
if self.browser_mode and not self._has_md_extension(url):
# Use Playwright headless browser for JavaScript rendering
html = self._render_with_browser(url)
soup = BeautifulSoup(html, "html.parser")
page = self.extract_content(soup, url)
else:
headers = {"User-Agent": "Mozilla/5.0 (Documentation Scraper)"}
response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()
# Check if this is a Markdown file
if self._has_md_extension(url):
page = self._extract_markdown_content(response.text, url)
else:
soup = BeautifulSoup(response.content, "html.parser")
page = self.extract_content(soup, url)
# Thread-safe operations (lock required for workers > 1)
if self.workers > 1:
@@ -788,18 +816,27 @@ class DocToSkillConverter:
# Sanitise brackets before fetching (safety net; see #284)
url = sanitize_url(url)
# Async HTTP request
headers = {"User-Agent": "Mozilla/5.0 (Documentation Scraper)"}
response = await client.get(url, headers=headers, timeout=30.0)
response.raise_for_status()
# Check if this is a Markdown file
if self._has_md_extension(url):
page = self._extract_markdown_content(response.text, url)
else:
# BeautifulSoup parsing (still synchronous, but fast)
soup = BeautifulSoup(response.content, "html.parser")
if self.browser_mode and not self._has_md_extension(url):
# Use Playwright in executor (sync API in async context)
loop = asyncio.get_event_loop()
html = await loop.run_in_executor(
None, self._render_with_browser, url
)
soup = BeautifulSoup(html, "html.parser")
page = self.extract_content(soup, url)
else:
# Async HTTP request
headers = {"User-Agent": "Mozilla/5.0 (Documentation Scraper)"}
response = await client.get(url, headers=headers, timeout=30.0)
response.raise_for_status()
# Check if this is a Markdown file
if self._has_md_extension(url):
page = self._extract_markdown_content(response.text, url)
else:
# BeautifulSoup parsing (still synchronous, but fast)
soup = BeautifulSoup(response.content, "html.parser")
page = self.extract_content(soup, url)
# Async-safe operations (no lock needed - single event loop)
logger.info(" %s", url)
@@ -1370,6 +1407,11 @@ class DocToSkillConverter:
self._log_scrape_completion()
self.save_summary()
# Clean up browser renderer if used
if self._browser_renderer is not None:
self._browser_renderer.close()
self._browser_renderer = None
def _log_scrape_completion(self) -> None:
"""Log scrape completion with accurate saved/skipped counts."""
visited = len(self.visited_urls)
@@ -1391,8 +1433,9 @@ class DocToSkillConverter:
if visited >= 5 and self.pages_saved == 0:
logger.warning(
"⚠️ All %d pages had empty content. This site likely requires "
"JavaScript rendering (SPA/React/Vue). Scraping cannot extract "
"content from JavaScript-rendered pages.",
"JavaScript rendering (SPA/React/Vue).\n"
" Try: skill-seekers create <url> --browser\n"
" Install: pip install 'skill-seekers[browser]'",
visited,
)
elif visited >= 10 and self.pages_skipped > 0:
@@ -1400,7 +1443,8 @@ class DocToSkillConverter:
if skip_ratio > 0.8:
logger.warning(
"⚠️ %d%% of pages had empty content. This site may use "
"JavaScript rendering for some pages.",
"JavaScript rendering for some pages.\n"
" Try: skill-seekers create <url> --browser",
int(skip_ratio * 100),
)
@@ -2212,6 +2256,11 @@ def get_configuration(args: argparse.Namespace) -> dict[str, Any]:
"⚠️ Async mode enabled but workers=1. Consider using --workers 4 for better performance"
)
# Apply CLI override for browser mode
if getattr(args, "browser", False):
config["browser"] = True
logger.info("🌐 Browser mode enabled (Playwright headless Chromium)")
# Apply CLI override for max_pages
if args.max_pages is not None:
old_max = config.get("max_pages", DEFAULT_MAX_PAGES)