Files
claude-skills-reference/engineering/browser-automation/scripts/anti_detection_checker.py
Reza Rezvani 268061b0fd fix: move browser-automation and spec-driven-workflow scripts to scripts/ directory
Validator expects scripts in scripts/ subdirectory, not at skill root.
Moved 6 scripts to match repo convention.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-25 14:53:14 +01:00

521 lines
22 KiB
Python

#!/usr/bin/env python3
"""
Anti-Detection Checker - Audits Playwright scripts for common bot detection vectors.
Analyzes a Playwright automation script and identifies patterns that make the
browser detectable as a bot. Produces a risk score (0-100) with specific
recommendations for each issue found.
Detection vectors checked:
- Headless mode usage
- Default/missing user agent configuration
- Viewport size (default 800x600 is a red flag)
- WebDriver flag (navigator.webdriver)
- Navigator property overrides
- Request throttling / human-like delays
- Cookie/session management
- Proxy configuration
- Error handling patterns
No external dependencies - uses only Python standard library.
"""
import argparse
import json
import os
import re
import sys
from dataclasses import dataclass, asdict
from typing import List, Optional
@dataclass
class Finding:
    """A single detection risk finding.

    Attributes:
        category: Human-readable name of the check (e.g. "User Agent").
        severity: One of "critical", "high", "medium", "low", "info".
        description: What was found and why it matters.
        line: 1-based line number in the audited script, when locatable.
        recommendation: Concrete fix for the issue.
        weight: Points this finding adds to the risk score (0-15).
    """
    category: str
    severity: str  # "critical", "high", "medium", "low", "info"
    description: str
    line: Optional[int]
    recommendation: str
    weight: int  # Points added to risk score (0-15)


# Points each severity level contributes to the 0-100 risk score.
SEVERITY_WEIGHTS = {
    "critical": 15,
    "high": 10,
    "medium": 5,
    "low": 2,
    "info": 0,
}


class AntiDetectionChecker:
    """Analyzes Playwright scripts for bot detection vulnerabilities.

    Runs a battery of regex-based heuristics over the raw script text and
    accumulates ``Finding`` objects. ``get_risk_score()`` folds the finding
    weights into a 0-100 score (higher = more detectable); ``get_summary()``
    produces a JSON-friendly overview.
    """

    def __init__(self, script_content: str, file_path: str = "<stdin>"):
        """Store the script text, its line list, and the display path."""
        self.content = script_content
        self.lines = script_content.split("\n")
        self.file_path = file_path
        self.findings: List[Finding] = []

    def check_all(self) -> List[Finding]:
        """Run all detection checks and return the accumulated findings."""
        self._check_headless_mode()
        self._check_user_agent()
        self._check_viewport()
        self._check_webdriver_flag()
        self._check_navigator_properties()
        self._check_request_delays()
        self._check_error_handling()
        self._check_proxy()
        self._check_session_management()
        self._check_browser_close()
        self._check_stealth_imports()
        return self.findings

    def _find_line(self, pattern: str) -> Optional[int]:
        """Find the first 1-based line number matching a regex pattern."""
        for i, line in enumerate(self.lines, 1):
            if re.search(pattern, line):
                return i
        return None

    def _has_pattern(self, pattern: str) -> bool:
        """Check if pattern exists anywhere in the script."""
        return bool(re.search(pattern, self.content))

    def _check_headless_mode(self) -> None:
        """Check if headless mode is properly configured."""
        if self._has_pattern(r"headless\s*=\s*False"):
            self.findings.append(Finding(
                category="Headless Mode",
                severity="high",
                description="Browser launched in headed mode (headless=False). This is fine for development but should be headless=True in production.",
                line=self._find_line(r"headless\s*=\s*False"),
                recommendation="Use headless=True for production. Toggle via environment variable: headless=os.environ.get('HEADLESS', 'true') == 'true'",
                weight=SEVERITY_WEIGHTS["high"],
            ))
        else:
            # Covers both the implicit default (no "headless" mention --
            # Playwright defaults to True) and an explicit headless=True.
            # Previously the explicit-True case produced no finding at all.
            self.findings.append(Finding(
                category="Headless Mode",
                severity="info",
                description="Headless mode enabled (default or explicit). Good for production.",
                line=None,
                recommendation="No action needed. Headless mode is correct for production.",
                weight=SEVERITY_WEIGHTS["info"],
            ))

    def _check_user_agent(self) -> None:
        """Check if a custom user agent is set (and ideally rotated)."""
        has_ua = self._has_pattern(r"user_agent\s*=") or self._has_pattern(r"userAgent")
        has_ua_list = self._has_pattern(r"USER_AGENTS?\s*=\s*\[")
        has_random_ua = self._has_pattern(r"random\.choice.*(?:USER_AGENT|user_agent|ua)")
        if has_ua_list and has_random_ua:
            # Rotation is checked FIRST so a rotation pool applied in a way
            # that never matches `user_agent=` (e.g. passed inside an options
            # dict as "user_agent": ...) is not flagged as a false critical.
            self.findings.append(Finding(
                category="User Agent",
                severity="info",
                description="User agent rotation detected. Good anti-detection practice.",
                line=self._find_line(r"USER_AGENTS?\s*=\s*\["),
                recommendation="Ensure user agents are recent and match the browser being launched (e.g., Chrome UA for Chromium).",
                weight=SEVERITY_WEIGHTS["info"],
            ))
        elif not has_ua:
            self.findings.append(Finding(
                category="User Agent",
                severity="critical",
                description="No custom user agent configured. Playwright's default user agent contains 'HeadlessChrome' which is trivially detected.",
                line=None,
                recommendation="Set a realistic user agent: context = await browser.new_context(user_agent='Mozilla/5.0 ...')",
                weight=SEVERITY_WEIGHTS["critical"],
            ))
        else:
            self.findings.append(Finding(
                category="User Agent",
                severity="low",
                description="Custom user agent set but no rotation detected. Single user agent is fingerprint-able at scale.",
                line=self._find_line(r"user_agent\s*="),
                recommendation="Rotate through 5-10 recent user agents using random.choice().",
                weight=SEVERITY_WEIGHTS["low"],
            ))

    def _check_viewport(self) -> None:
        """Check viewport configuration."""
        has_viewport = self._has_pattern(r"viewport\s*=\s*\{") or self._has_pattern(r"viewport.*width")
        if not has_viewport:
            self.findings.append(Finding(
                category="Viewport Size",
                severity="high",
                description="No viewport configured. Default Playwright viewport (1280x720) is common among bots. Sites may flag unusual viewport distributions.",
                line=None,
                recommendation="Set a common desktop viewport: viewport={'width': 1920, 'height': 1080}. Vary across runs.",
                weight=SEVERITY_WEIGHTS["high"],
            ))
        else:
            # Pull out the first configured width and sanity-check it.
            match = re.search(r"width['\"]?\s*[:=]\s*(\d+)", self.content)
            if match:
                width = int(match.group(1))
                if width < 1024:
                    self.findings.append(Finding(
                        category="Viewport Size",
                        severity="medium",
                        description=f"Viewport width {width}px is unusually small. Most desktop browsers are 1366px+ wide.",
                        line=self._find_line(r"width.*" + str(width)),
                        recommendation="Use 1366x768 (most common) or 1920x1080. Avoid unusual sizes like 800x600.",
                        weight=SEVERITY_WEIGHTS["medium"],
                    ))
                else:
                    self.findings.append(Finding(
                        category="Viewport Size",
                        severity="info",
                        description=f"Viewport width {width}px is reasonable.",
                        line=self._find_line(r"width.*" + str(width)),
                        recommendation="No action needed.",
                        weight=SEVERITY_WEIGHTS["info"],
                    ))

    def _check_webdriver_flag(self) -> None:
        """Check if navigator.webdriver is being removed."""
        has_webdriver_override = (
            self._has_pattern(r"navigator.*webdriver") or
            self._has_pattern(r"webdriver.*undefined") or
            self._has_pattern(r"add_init_script.*webdriver")
        )
        if not has_webdriver_override:
            self.findings.append(Finding(
                category="WebDriver Flag",
                severity="critical",
                description="navigator.webdriver is not overridden. This is the most common bot detection check. Every major anti-bot service tests this property.",
                line=None,
                recommendation=(
                    "Add init script to remove the flag:\n"
                    "    await page.add_init_script(\"Object.defineProperty(navigator, 'webdriver', {get: () => undefined});\")"
                ),
                weight=SEVERITY_WEIGHTS["critical"],
            ))
        else:
            self.findings.append(Finding(
                category="WebDriver Flag",
                severity="info",
                description="navigator.webdriver override detected.",
                line=self._find_line(r"webdriver"),
                recommendation="No action needed.",
                weight=SEVERITY_WEIGHTS["info"],
            ))

    def _check_navigator_properties(self) -> None:
        """Check for additional navigator property hardening."""
        # Patterns indicating each navigator property is being overridden.
        checks = {
            "plugins": r"navigator.*plugins",
            "languages": r"navigator.*languages",
            "platform": r"navigator.*platform",
        }
        missing = [prop for prop, pattern in checks.items()
                   if not self._has_pattern(pattern)]
        if len(missing) == len(checks):
            self.findings.append(Finding(
                category="Navigator Properties",
                severity="medium",
                description="No navigator property hardening detected. Advanced anti-bot services check plugins, languages, and platform properties.",
                line=None,
                recommendation="Override navigator.plugins, navigator.languages, and navigator.platform via add_init_script() to match realistic browser fingerprints.",
                weight=SEVERITY_WEIGHTS["medium"],
            ))
        elif missing:
            overridden_count = len(checks) - len(missing)
            self.findings.append(Finding(
                category="Navigator Properties",
                severity="low",
                description=f"Partial navigator hardening ({overridden_count}/3 properties). Consider covering all three: plugins, languages, platform.",
                line=None,
                # Name the specific missing properties instead of a generic hint.
                recommendation=f"Add overrides for the missing properties: {', '.join(missing)}.",
                weight=SEVERITY_WEIGHTS["low"],
            ))

    def _check_request_delays(self) -> None:
        """Check for human-like request delays."""
        has_sleep = self._has_pattern(r"asyncio\.sleep") or self._has_pattern(r"wait_for_timeout")
        has_random_delay = (
            self._has_pattern(r"random\.(uniform|randint|random)") and has_sleep
        )
        if not has_sleep:
            self.findings.append(Finding(
                category="Request Timing",
                severity="high",
                description="No delays between actions detected. Machine-speed interactions are the easiest behavior-based detection signal.",
                line=None,
                recommendation="Add random delays between page interactions: await asyncio.sleep(random.uniform(0.5, 2.0))",
                weight=SEVERITY_WEIGHTS["high"],
            ))
        elif not has_random_delay:
            self.findings.append(Finding(
                category="Request Timing",
                severity="medium",
                description="Fixed delays detected but no randomization. Constant timing intervals are detectable patterns.",
                line=self._find_line(r"(asyncio\.sleep|wait_for_timeout)"),
                recommendation="Use random delays: random.uniform(min_seconds, max_seconds) instead of fixed values.",
                weight=SEVERITY_WEIGHTS["medium"],
            ))
        else:
            self.findings.append(Finding(
                category="Request Timing",
                severity="info",
                description="Randomized delays detected between actions.",
                line=self._find_line(r"random\.(uniform|randint)"),
                recommendation="No action needed. Ensure delays are realistic (0.5-3s for browsing, 1-5s for reading).",
                weight=SEVERITY_WEIGHTS["info"],
            ))

    def _check_error_handling(self) -> None:
        """Check for error handling patterns."""
        has_try_except = self._has_pattern(r"try\s*:") and self._has_pattern(r"except")
        has_retry = self._has_pattern(r"retr(y|ies)") or self._has_pattern(r"max_retries|max_attempts")
        if not has_try_except:
            self.findings.append(Finding(
                category="Error Handling",
                severity="medium",
                description="No try/except blocks found. Unhandled errors will crash the automation and leave browser instances running.",
                line=None,
                recommendation="Wrap page interactions in try/except. Handle TimeoutError, network errors, and element-not-found gracefully.",
                weight=SEVERITY_WEIGHTS["medium"],
            ))
        elif not has_retry:
            self.findings.append(Finding(
                category="Error Handling",
                severity="low",
                description="Error handling present but no retry logic detected. Transient failures (network blips, slow loads) will cause data loss.",
                line=None,
                recommendation="Add retry with exponential backoff for network operations and element interactions.",
                weight=SEVERITY_WEIGHTS["low"],
            ))

    def _check_proxy(self) -> None:
        """Check for proxy configuration."""
        has_proxy = self._has_pattern(r"proxy\s*=\s*\{") or self._has_pattern(r"proxy.*server")
        if not has_proxy:
            self.findings.append(Finding(
                category="Proxy",
                severity="low",
                description="No proxy configuration detected. Running from a single IP address is fine for small jobs but will trigger rate limits at scale.",
                line=None,
                recommendation="For high-volume scraping, use rotating proxies: proxy={'server': 'http://proxy:port'}",
                weight=SEVERITY_WEIGHTS["low"],
            ))

    def _check_session_management(self) -> None:
        """Check for session/cookie management."""
        has_storage_state = self._has_pattern(r"storage_state")
        has_cookies = self._has_pattern(r"cookies\(\)") or self._has_pattern(r"add_cookies")
        if not has_storage_state and not has_cookies:
            self.findings.append(Finding(
                category="Session Management",
                severity="low",
                description="No session persistence detected. Each run will start fresh, requiring re-authentication.",
                line=None,
                recommendation="Use storage_state() to save/restore sessions across runs. This avoids repeated logins that may trigger security alerts.",
                weight=SEVERITY_WEIGHTS["low"],
            ))

    def _check_browser_close(self) -> None:
        """Check if browser is properly closed."""
        has_close = self._has_pattern(r"browser\.close\(\)") or self._has_pattern(r"await.*close")
        has_context_manager = self._has_pattern(r"async\s+with\s+async_playwright")
        if not has_close and not has_context_manager:
            self.findings.append(Finding(
                category="Resource Cleanup",
                severity="medium",
                description="No browser.close() or context manager detected. Browser processes will leak on failure.",
                line=None,
                recommendation="Use 'async with async_playwright() as p:' or ensure browser.close() is in a finally block.",
                weight=SEVERITY_WEIGHTS["medium"],
            ))

    def _check_stealth_imports(self) -> None:
        """Check for stealth/anti-detection library usage."""
        has_stealth = self._has_pattern(r"playwright_stealth|stealth_async|undetected")
        if has_stealth:
            self.findings.append(Finding(
                category="Stealth Library",
                severity="info",
                description="Third-party stealth library detected. These provide additional fingerprint evasion but add dependencies.",
                line=self._find_line(r"playwright_stealth|stealth_async|undetected"),
                recommendation="Stealth libraries are helpful but not a silver bullet. Still implement manual checks for user agent, viewport, and timing.",
                weight=SEVERITY_WEIGHTS["info"],
            ))

    def get_risk_score(self) -> int:
        """Calculate overall risk score (0-100). Higher = more detectable."""
        raw_score = sum(f.weight for f in self.findings)
        # Cap at 100 so many small findings can't exceed the scale.
        return min(raw_score, 100)

    def get_risk_level(self) -> str:
        """Get human-readable risk level for the current score."""
        score = self.get_risk_score()
        if score <= 10:
            return "LOW"
        elif score <= 30:
            return "MODERATE"
        elif score <= 50:
            return "HIGH"
        else:
            return "CRITICAL"

    def get_summary(self) -> dict:
        """Get a JSON-friendly summary of the analysis."""
        severity_counts = {"critical": 0, "high": 0, "medium": 0, "low": 0, "info": 0}
        for f in self.findings:
            severity_counts[f.severity] += 1
        return {
            "file": self.file_path,
            "risk_score": self.get_risk_score(),
            "risk_level": self.get_risk_level(),
            "total_findings": len(self.findings),
            "severity_counts": severity_counts,
            # "info" findings are positive confirmations, not action items.
            "actionable_findings": len([f for f in self.findings if f.severity != "info"]),
        }
def format_text_report(checker: AntiDetectionChecker, verbose: bool = False) -> str:
    """Render the checker's findings as a human-readable text report."""
    summary = checker.get_summary()
    counts = summary["severity_counts"]
    rule = "=" * 60
    report = [
        rule,
        " ANTI-DETECTION AUDIT REPORT",
        rule,
        f"File: {summary['file']}",
        f"Risk Score: {summary['risk_score']}/100 ({summary['risk_level']})",
        f"Total Issues: {summary['actionable_findings']} actionable, {counts['info']} info",
        "",
    ]
    # Severity breakdown (only severities that actually occurred).
    report.extend(
        f" {sev.upper():10s} {counts[sev]}"
        for sev in ("critical", "high", "medium", "low")
        if counts[sev] > 0
    )
    report.append("")
    # Findings grouped by severity; info entries only when verbose.
    order = ["critical", "high", "medium", "low"] + (["info"] if verbose else [])
    for sev in order:
        bucket = [f for f in checker.findings if f.severity == sev]
        if not bucket:
            continue
        report.append(f"--- {sev.upper()} ---")
        for finding in bucket:
            suffix = f" (line {finding.line})" if finding.line else ""
            report.append(f" [{finding.category}]{suffix}")
            report.append(f" {finding.description}")
            report.append(f" Fix: {finding.recommendation}")
        report.append("")
    # Closing verdict keyed to the same thresholds as the CLI exit code.
    report.append("-" * 60)
    score = summary["risk_score"]
    if score <= 10:
        verdict = "Result: PASS - Low detection risk."
    elif score <= 30:
        verdict = "Result: PASS with warnings - Address medium/high issues for production use."
    else:
        verdict = "Result: FAIL - High detection risk. Fix critical and high issues before deploying."
    report.append(verdict)
    report.append("")
    return "\n".join(report)
def main():
    """Command-line entry point: parse args, audit the file, report, exit."""
    parser = argparse.ArgumentParser(
        description="Audit a Playwright script for common bot detection vectors.",
        epilog=(
            "Examples:\n"
            " %(prog)s --file scraper.py\n"
            " %(prog)s --file scraper.py --verbose\n"
            " %(prog)s --file scraper.py --json\n"
            "\n"
            "Exit codes:\n"
            " 0 - Low risk (score 0-10)\n"
            " 1 - Moderate to high risk (score 11-50)\n"
            " 2 - Critical risk (score 51+)\n"
        ),
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument("--file", required=True,
                        help="Path to the Playwright script to audit")
    parser.add_argument("--json", action="store_true", dest="json_output", default=False,
                        help="Output results as JSON")
    parser.add_argument("--verbose", action="store_true", default=False,
                        help="Include informational (non-actionable) findings in output")
    opts = parser.parse_args()

    target = os.path.abspath(opts.file)
    if not os.path.isfile(target):
        print(f"Error: File not found: {target}", file=sys.stderr)
        sys.exit(2)
    try:
        with open(target, "r", encoding="utf-8") as handle:
            script_text = handle.read()
    except Exception as exc:
        print(f"Error reading file: {exc}", file=sys.stderr)
        sys.exit(2)
    if not script_text.strip():
        print("Error: File is empty.", file=sys.stderr)
        sys.exit(2)

    checker = AntiDetectionChecker(script_text, target)
    checker.check_all()

    if opts.json_output:
        payload = checker.get_summary()
        findings = [asdict(f) for f in checker.findings]
        if not opts.verbose:
            # Drop positive-confirmation entries unless explicitly requested.
            findings = [f for f in findings if f["severity"] != "info"]
        payload["findings"] = findings
        print(json.dumps(payload, indent=2))
    else:
        print(format_text_report(checker, verbose=opts.verbose))

    # Map the risk score onto the exit codes documented in the epilog.
    score = checker.get_risk_score()
    if score <= 10:
        sys.exit(0)
    elif score <= 50:
        sys.exit(1)
    else:
        sys.exit(2)


if __name__ == "__main__":
    main()