feat: add headless browser rendering for JavaScript SPA sites (#321)

New BrowserRenderer class uses Playwright to render JavaScript-heavy
documentation sites (React, Vue SPAs) that return empty HTML shells
with requests.get(). Activated via --browser flag on web scraping.

- browser_renderer.py: Playwright wrapper with lazy browser launch,
  auto-install Chromium on first use, context manager support
- doc_scraper.py: browser_mode config, _render_with_browser() helper,
  integrated into scrape_page() and scrape_page_async()
- SPA detection warnings now suggest --browser flag
- Optional dep: pip install "skill-seekers[browser]"
- 14 real e2e tests (actual Chromium, no mocks)
- UML updated: Scrapers class diagram (BrowserRenderer + dependency),
  Parsers (DoctorParser), Utilities (Doctor), Components, and new
  Browser Rendering sequence diagram (#20)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
yusyus
2026-03-28 22:06:14 +03:00
parent 006cccabae
commit ea4fed0be4
15 changed files with 17989 additions and 17824 deletions

View File

@@ -0,0 +1,152 @@
"""Tests for browser_renderer.py (#321).
Real end-to-end tests using actual Playwright + Chromium.
"""
from __future__ import annotations
from skill_seekers.cli.browser_renderer import (
BrowserRenderer,
_auto_install_chromium,
_check_playwright_available,
)
class TestPlaywrightAvailability:
    """Verify detection and auto-install of the Playwright dependency."""

    def test_playwright_is_available(self):
        """Playwright must be importable in this test environment."""
        available = _check_playwright_available()
        assert available is True

    def test_auto_install_succeeds(self):
        """With Chromium already installed, auto-install is a no-op that reports success."""
        installed = _auto_install_chromium()
        assert installed is True
class TestBrowserRendererReal:
    """Real end-to-end tests with actual Chromium.

    Tests that create a renderer outside a ``with`` block close it in a
    ``finally`` clause, so a failing assertion cannot leak a live Chromium
    process into subsequent tests.
    """

    def test_render_simple_page(self):
        """Render a real page and get HTML back."""
        with BrowserRenderer() as renderer:
            html = renderer.render_page("https://example.com")
            assert "<html" in html.lower()
            assert "Example Domain" in html

    def test_render_returns_js_content(self):
        """Verify that rendered content is captured (not just an empty shell)."""
        with BrowserRenderer() as renderer:
            html = renderer.render_page("https://example.com")
            # example.com has static content, but the point is we get a full,
            # post-load DOM back rather than a bare shell.
            assert len(html) > 500
            assert "<body" in html.lower()

    def test_multiple_pages_reuse_browser(self):
        """Rendering multiple pages should reuse the same browser instance."""
        with BrowserRenderer() as renderer:
            html1 = renderer.render_page("https://example.com")
            first_browser = renderer._browser
            html2 = renderer.render_page("https://example.com")
            assert "Example Domain" in html1
            assert "Example Domain" in html2
            # Actually assert the reuse the test name promises: the second
            # render must be served by the very same lazily-launched browser.
            assert renderer._browser is first_browser

    def test_close_cleans_up(self):
        """After close(), internal state is None."""
        renderer = BrowserRenderer()
        try:
            renderer.render_page("https://example.com")
            assert renderer._browser is not None
        finally:
            # Always close, even if the assertion above fails, so a failing
            # test cannot leak a Chromium process.
            renderer.close()
        assert renderer._browser is None
        assert renderer._context is None
        assert renderer._playwright is None

    def test_context_manager_cleans_up(self):
        """Context manager calls close on exit."""
        with BrowserRenderer() as renderer:
            renderer.render_page("https://example.com")
            assert renderer._browser is not None
        assert renderer._browser is None

    def test_timeout_parameter(self):
        """Custom timeout is respected."""
        renderer = BrowserRenderer(timeout=5000)
        try:
            assert renderer._timeout == 5000
        finally:
            renderer.close()

    def test_wait_until_parameter(self):
        """Custom wait_until is respected."""
        renderer = BrowserRenderer(wait_until="domcontentloaded")
        try:
            assert renderer._wait_until == "domcontentloaded"
        finally:
            renderer.close()
class TestDocScraperBrowserIntegration:
    """Test that doc_scraper correctly accepts browser config."""

    @staticmethod
    def _make_config(**overrides):
        """Return a minimal scraper config dict; keyword overrides are merged in."""
        config = {
            "name": "test",
            "base_url": "https://example.com",
            "selectors": {},
            "url_patterns": {"include": [], "exclude": []},
        }
        config.update(overrides)
        return config

    def test_browser_mode_config_sets_attribute(self):
        """A truthy 'browser' key enables browser_mode on the converter."""
        from skill_seekers.cli.doc_scraper import DocToSkillConverter
        scraper = DocToSkillConverter(self._make_config(browser=True))
        assert scraper.browser_mode is True
        # The renderer is created lazily, not at construction time.
        assert scraper._browser_renderer is None

    def test_browser_mode_default_false(self):
        """Without a 'browser' key, browser_mode defaults to False."""
        from skill_seekers.cli.doc_scraper import DocToSkillConverter
        scraper = DocToSkillConverter(self._make_config())
        assert scraper.browser_mode is False

    def test_render_with_browser_returns_html(self):
        """Test the _render_with_browser helper directly."""
        from skill_seekers.cli.doc_scraper import DocToSkillConverter
        scraper = DocToSkillConverter(self._make_config(browser=True))
        try:
            html = scraper._render_with_browser("https://example.com")
            assert "Example Domain" in html
            # The helper lazily instantiates and caches the renderer.
            assert scraper._browser_renderer is not None
        finally:
            # Clean up even when an assertion fails, so a failing test
            # cannot leak a live Chromium process.
            if scraper._browser_renderer is not None:
                scraper._browser_renderer.close()
class TestBrowserArgument:
    """Ensure the CLI registers the --browser flag."""

    @staticmethod
    def _parse(extra_args):
        """Parse the required scrape arguments plus any extra flags."""
        from skill_seekers.cli.doc_scraper import setup_argument_parser
        base = ["--name", "test", "--url", "https://example.com"]
        return setup_argument_parser().parse_args(base + extra_args)

    def test_scrape_parser_accepts_browser_flag(self):
        """Passing --browser sets args.browser to True."""
        args = self._parse(["--browser"])
        assert args.browser is True

    def test_scrape_parser_browser_default_false(self):
        """Omitting --browser leaves args.browser at its False default."""
        args = self._parse([])
        assert args.browser is False