feat: add headless browser rendering for JavaScript SPA sites (#321)
New BrowserRenderer class uses Playwright to render JavaScript-heavy documentation sites (React, Vue SPAs) that return empty HTML shells when fetched with requests.get(). Activated via the --browser flag on web scraping.

- browser_renderer.py: Playwright wrapper with lazy browser launch, auto-install of Chromium on first use, and context manager support
- doc_scraper.py: browser_mode config and _render_with_browser() helper, integrated into scrape_page() and scrape_page_async()
- SPA detection warnings now suggest the --browser flag
- Optional dependency: pip install "skill-seekers[browser]"
- 14 real e2e tests (actual Chromium, no mocks)
- UML updated: Scrapers class diagram (BrowserRenderer + dependency), Parsers (DoctorParser), Utilities (Doctor), Components, and a new Browser Rendering sequence diagram (#20)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
152
tests/test_browser_renderer.py
Normal file
152
tests/test_browser_renderer.py
Normal file
@@ -0,0 +1,152 @@
|
||||
"""Tests for browser_renderer.py (#321).
|
||||
|
||||
Real end-to-end tests using actual Playwright + Chromium.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from skill_seekers.cli.browser_renderer import (
|
||||
BrowserRenderer,
|
||||
_auto_install_chromium,
|
||||
_check_playwright_available,
|
||||
)
|
||||
|
||||
|
||||
class TestPlaywrightAvailability:
    """Checks for the Playwright detection and Chromium install helpers."""

    def test_playwright_is_available(self):
        """The availability probe reports exactly ``True``."""
        available = _check_playwright_available()
        assert available is True

    def test_auto_install_succeeds(self):
        """The Chromium auto-installer reports exactly ``True``."""
        # Chromium is expected to be installed already, so the call should
        # be a no-op that still reports success.
        installed = _auto_install_chromium()
        assert installed is True
|
||||
|
||||
|
||||
class TestBrowserRendererReal:
    """Real end-to-end tests with actual Chromium.

    Every test that creates a ``BrowserRenderer`` outside a ``with`` block
    releases it in a ``finally`` clause, so a failing assertion or a raising
    ``render_page`` call does not leave the renderer open.
    """

    def test_render_simple_page(self):
        """Render a real page and get HTML back."""
        with BrowserRenderer() as renderer:
            html = renderer.render_page("https://example.com")

        assert "<html" in html.lower()
        assert "Example Domain" in html

    def test_render_returns_js_content(self):
        """Verify the rendered output is a full document, not an empty shell."""
        with BrowserRenderer() as renderer:
            html = renderer.render_page("https://example.com")

        # example.com has static content, but the point is we get real HTML
        assert len(html) > 500
        assert "<body" in html.lower()

    def test_multiple_pages_reuse_browser(self):
        """Rendering multiple pages with one renderer should succeed each time."""
        with BrowserRenderer() as renderer:
            html1 = renderer.render_page("https://example.com")
            html2 = renderer.render_page("https://example.com")

        assert "Example Domain" in html1
        assert "Example Domain" in html2

    def test_close_cleans_up(self):
        """After close(), internal state is None."""
        renderer = BrowserRenderer()
        try:
            renderer.render_page("https://example.com")
            assert renderer._browser is not None
        finally:
            # Close even when rendering or the assertion above fails, so the
            # renderer is not leaked into later tests.
            renderer.close()

        assert renderer._browser is None
        assert renderer._context is None
        assert renderer._playwright is None

    def test_context_manager_cleans_up(self):
        """Context manager calls close on exit."""
        with BrowserRenderer() as renderer:
            renderer.render_page("https://example.com")
            assert renderer._browser is not None

        assert renderer._browser is None

    def test_timeout_parameter(self):
        """Custom timeout is respected."""
        renderer = BrowserRenderer(timeout=5000)
        try:
            assert renderer._timeout == 5000
        finally:
            renderer.close()

    def test_wait_until_parameter(self):
        """Custom wait_until is respected."""
        renderer = BrowserRenderer(wait_until="domcontentloaded")
        try:
            assert renderer._wait_until == "domcontentloaded"
        finally:
            renderer.close()
|
||||
|
||||
|
||||
class TestDocScraperBrowserIntegration:
    """Test that doc_scraper correctly accepts browser config."""

    @staticmethod
    def _make_config(**extra):
        """Return the minimal scraper config used by these tests.

        Args:
            **extra: Additional top-level keys (e.g. ``browser=True``) merged
                into the base config.

        Returns:
            A fresh dict on every call, so tests never share mutable state.
        """
        config = {
            "name": "test",
            "base_url": "https://example.com",
            "selectors": {},
            "url_patterns": {"include": [], "exclude": []},
        }
        config.update(extra)
        return config

    def test_browser_mode_config_sets_attribute(self):
        """``"browser": True`` enables browser mode without creating a renderer."""
        from skill_seekers.cli.doc_scraper import DocToSkillConverter

        scraper = DocToSkillConverter(self._make_config(browser=True))
        assert scraper.browser_mode is True
        assert scraper._browser_renderer is None

    def test_browser_mode_default_false(self):
        """Browser mode is off when the config has no ``"browser"`` key."""
        from skill_seekers.cli.doc_scraper import DocToSkillConverter

        scraper = DocToSkillConverter(self._make_config())
        assert scraper.browser_mode is False

    def test_render_with_browser_returns_html(self):
        """Test the _render_with_browser helper directly."""
        from skill_seekers.cli.doc_scraper import DocToSkillConverter

        scraper = DocToSkillConverter(self._make_config(browser=True))

        try:
            html = scraper._render_with_browser("https://example.com")
            assert "Example Domain" in html
            assert scraper._browser_renderer is not None
        finally:
            # Clean up even when rendering or an assertion fails. The renderer
            # starts out as None, so guard against it never being created.
            if scraper._browser_renderer is not None:
                scraper._browser_renderer.close()
|
||||
|
||||
|
||||
class TestBrowserArgument:
    """Checks that the scraper CLI parser exposes the --browser switch."""

    def test_scrape_parser_accepts_browser_flag(self):
        """Passing --browser sets ``args.browser`` to ``True``."""
        from skill_seekers.cli.doc_scraper import setup_argument_parser

        argv = ["--name", "test", "--url", "https://example.com", "--browser"]
        namespace = setup_argument_parser().parse_args(argv)
        assert namespace.browser is True

    def test_scrape_parser_browser_default_false(self):
        """Omitting --browser leaves ``args.browser`` as ``False``."""
        from skill_seekers.cli.doc_scraper import setup_argument_parser

        argv = ["--name", "test", "--url", "https://example.com"]
        namespace = setup_argument_parser().parse_args(argv)
        assert namespace.browser is False
|
||||
Reference in New Issue
Block a user