# NOTE: Validator expects scripts in the scripts/ subdirectory, not at the
# skill root; this script was moved there to match the repo convention.
#!/usr/bin/env python3
"""
Anti-Detection Checker - Audits Playwright scripts for common bot detection vectors.

Analyzes a Playwright automation script and identifies patterns that make the
browser detectable as a bot. Produces a risk score (0-100) with specific
recommendations for each issue found.

Detection vectors checked:
- Headless mode usage
- Default/missing user agent configuration
- Viewport size (unusual sizes like 800x600 are a red flag)
- WebDriver flag (navigator.webdriver)
- Navigator property overrides
- Request throttling / human-like delays
- Cookie/session management
- Proxy configuration
- Error handling patterns

No external dependencies - uses only the Python standard library.
"""
|
|
|
|
import argparse
|
|
import json
|
|
import os
|
|
import re
|
|
import sys
|
|
from dataclasses import dataclass, asdict
|
|
from typing import List, Optional
|
|
|
|
|
|
@dataclass
class Finding:
    """A single detection risk finding."""
    # Short category label shown in reports, e.g. "User Agent" or "Viewport Size".
    category: str
    severity: str  # "critical", "high", "medium", "low", "info"
    # Human-readable explanation of the detected risk.
    description: str
    # 1-based line number in the audited script, or None for file-wide findings.
    line: Optional[int]
    # Concrete fix the script author should apply.
    recommendation: str
    weight: int  # Points added to risk score (0-15)
|
|
|
|
|
|
# Points each severity contributes to the overall 0-100 risk score.
# "info" findings are reported but never raise the score.
SEVERITY_WEIGHTS = {
    "critical": 15,
    "high": 10,
    "medium": 5,
    "low": 2,
    "info": 0,
}
|
|
|
|
|
|
class AntiDetectionChecker:
    """Analyzes Playwright scripts for bot detection vulnerabilities.

    All checks are regex heuristics over the raw script text -- the audited
    script is never imported, executed, or parsed as Python.  Each
    ``_check_*`` method appends zero or more ``Finding`` records to
    ``self.findings``; ``get_risk_score()`` folds their weights into a
    0-100 score where higher means more detectable.
    """

    def __init__(self, script_content: str, file_path: str = "<stdin>"):
        """Store the script text and split it into lines for line lookups.

        Args:
            script_content: Full source text of the script to audit.
            file_path: Display name used in reports (defaults to "<stdin>").
        """
        self.content = script_content
        self.lines = script_content.split("\n")
        self.file_path = file_path
        self.findings: List[Finding] = []

    def check_all(self) -> List[Finding]:
        """Run all detection checks and return the accumulated findings."""
        self._check_headless_mode()
        self._check_user_agent()
        self._check_viewport()
        self._check_webdriver_flag()
        self._check_navigator_properties()
        self._check_request_delays()
        self._check_error_handling()
        self._check_proxy()
        self._check_session_management()
        self._check_browser_close()
        self._check_stealth_imports()
        return self.findings

    def _find_line(self, pattern: str) -> Optional[int]:
        """Return the 1-based number of the first line matching *pattern*, or None."""
        for i, line in enumerate(self.lines, 1):
            if re.search(pattern, line):
                return i
        return None

    def _has_pattern(self, pattern: str) -> bool:
        """Check if *pattern* matches anywhere in the script text."""
        return bool(re.search(pattern, self.content))

    def _check_headless_mode(self):
        """Check if headless mode is properly configured.

        An explicit ``headless=True`` produces no finding at all; only
        headed mode or complete absence of the keyword is reported.
        """
        if self._has_pattern(r"headless\s*=\s*False"):
            self.findings.append(Finding(
                category="Headless Mode",
                severity="high",
                description="Browser launched in headed mode (headless=False). This is fine for development but should be headless=True in production.",
                line=self._find_line(r"headless\s*=\s*False"),
                recommendation="Use headless=True for production. Toggle via environment variable: headless=os.environ.get('HEADLESS', 'true') == 'true'",
                weight=SEVERITY_WEIGHTS["high"],
            ))
        elif not self._has_pattern(r"headless"):
            # Default is headless=True in Playwright, which is correct
            self.findings.append(Finding(
                category="Headless Mode",
                severity="info",
                description="Using default headless mode (True). Good for production.",
                line=None,
                recommendation="No action needed. Default headless=True is correct.",
                weight=SEVERITY_WEIGHTS["info"],
            ))

    def _check_user_agent(self):
        """Check if a custom user agent is set (and ideally rotated)."""
        has_ua = self._has_pattern(r"user_agent\s*=") or self._has_pattern(r"userAgent")
        has_ua_list = self._has_pattern(r"USER_AGENTS?\s*=\s*\[")
        has_random_ua = self._has_pattern(r"random\.choice.*(?:USER_AGENT|user_agent|ua)")

        if not has_ua:
            self.findings.append(Finding(
                category="User Agent",
                severity="critical",
                description="No custom user agent configured. Playwright's default user agent contains 'HeadlessChrome' which is trivially detected.",
                line=None,
                recommendation="Set a realistic user agent: context = await browser.new_context(user_agent='Mozilla/5.0 ...')",
                weight=SEVERITY_WEIGHTS["critical"],
            ))
        elif has_ua_list and has_random_ua:
            self.findings.append(Finding(
                category="User Agent",
                severity="info",
                description="User agent rotation detected. Good anti-detection practice.",
                line=self._find_line(r"USER_AGENTS?\s*=\s*\["),
                recommendation="Ensure user agents are recent and match the browser being launched (e.g., Chrome UA for Chromium).",
                weight=SEVERITY_WEIGHTS["info"],
            ))
        else:
            # has_ua is necessarily True here (the first branch failed), so the
            # former `elif has_ua:` guard was redundant -- a plain else suffices.
            self.findings.append(Finding(
                category="User Agent",
                severity="low",
                description="Custom user agent set but no rotation detected. Single user agent is fingerprint-able at scale.",
                line=self._find_line(r"user_agent\s*="),
                recommendation="Rotate through 5-10 recent user agents using random.choice().",
                weight=SEVERITY_WEIGHTS["low"],
            ))

    def _check_viewport(self):
        """Check viewport configuration (presence and a plausible width)."""
        has_viewport = self._has_pattern(r"viewport\s*=\s*\{") or self._has_pattern(r"viewport.*width")

        if not has_viewport:
            self.findings.append(Finding(
                category="Viewport Size",
                severity="high",
                description="No viewport configured. Default Playwright viewport (1280x720) is common among bots. Sites may flag unusual viewport distributions.",
                line=None,
                recommendation="Set a common desktop viewport: viewport={'width': 1920, 'height': 1080}. Vary across runs.",
                weight=SEVERITY_WEIGHTS["high"],
            ))
        else:
            # Check for suspiciously small viewports.
            # NOTE(review): this matches the FIRST width-like assignment in the
            # whole file, which may not be the viewport's width if the script
            # assigns other widths earlier -- acceptable for a heuristic.
            match = re.search(r"width['\"]?\s*[:=]\s*(\d+)", self.content)
            if match:
                width = int(match.group(1))
                if width < 1024:
                    self.findings.append(Finding(
                        category="Viewport Size",
                        severity="medium",
                        description=f"Viewport width {width}px is unusually small. Most desktop browsers are 1366px+ wide.",
                        line=self._find_line(r"width.*" + str(width)),
                        recommendation="Use 1366x768 (most common) or 1920x1080. Avoid unusual sizes like 800x600.",
                        weight=SEVERITY_WEIGHTS["medium"],
                    ))
                else:
                    self.findings.append(Finding(
                        category="Viewport Size",
                        severity="info",
                        description=f"Viewport width {width}px is reasonable.",
                        line=self._find_line(r"width.*" + str(width)),
                        recommendation="No action needed.",
                        weight=SEVERITY_WEIGHTS["info"],
                    ))

    def _check_webdriver_flag(self):
        """Check if navigator.webdriver is being removed."""
        has_webdriver_override = (
            self._has_pattern(r"navigator.*webdriver") or
            self._has_pattern(r"webdriver.*undefined") or
            self._has_pattern(r"add_init_script.*webdriver")
        )

        if not has_webdriver_override:
            self.findings.append(Finding(
                category="WebDriver Flag",
                severity="critical",
                description="navigator.webdriver is not overridden. This is the most common bot detection check. Every major anti-bot service tests this property.",
                line=None,
                recommendation=(
                    "Add init script to remove the flag:\n"
                    "  await page.add_init_script(\"Object.defineProperty(navigator, 'webdriver', {get: () => undefined});\")"
                ),
                weight=SEVERITY_WEIGHTS["critical"],
            ))
        else:
            self.findings.append(Finding(
                category="WebDriver Flag",
                severity="info",
                description="navigator.webdriver override detected.",
                line=self._find_line(r"webdriver"),
                recommendation="No action needed.",
                weight=SEVERITY_WEIGHTS["info"],
            ))

    def _check_navigator_properties(self):
        """Check for additional navigator property hardening.

        Counts how many of plugins/languages/platform appear to be touched
        and reports a finding only when coverage is missing or partial.
        """
        # Property name -> (detection regex, rationale). The rationale strings
        # document each check; the findings below summarize them in aggregate.
        checks = {
            "plugins": (r"navigator.*plugins", "navigator.plugins is empty in headless mode. Real browsers report installed plugins."),
            "languages": (r"navigator.*languages", "navigator.languages should be set to match the user agent locale."),
            "platform": (r"navigator.*platform", "navigator.platform should match the user agent OS."),
        }

        # Only the patterns are needed for counting (keys/rationales unused here).
        overridden_count = sum(
            1 for pattern, _ in checks.values() if self._has_pattern(pattern)
        )

        if overridden_count == 0:
            self.findings.append(Finding(
                category="Navigator Properties",
                severity="medium",
                description="No navigator property hardening detected. Advanced anti-bot services check plugins, languages, and platform properties.",
                line=None,
                recommendation="Override navigator.plugins, navigator.languages, and navigator.platform via add_init_script() to match realistic browser fingerprints.",
                weight=SEVERITY_WEIGHTS["medium"],
            ))
        elif overridden_count < 3:
            self.findings.append(Finding(
                category="Navigator Properties",
                severity="low",
                description=f"Partial navigator hardening ({overridden_count}/3 properties). Consider covering all three: plugins, languages, platform.",
                line=None,
                recommendation="Add overrides for any missing properties among: plugins, languages, platform.",
                weight=SEVERITY_WEIGHTS["low"],
            ))

    def _check_request_delays(self):
        """Check for human-like request delays (and randomization thereof)."""
        has_sleep = self._has_pattern(r"asyncio\.sleep") or self._has_pattern(r"wait_for_timeout")
        has_random_delay = (
            self._has_pattern(r"random\.(uniform|randint|random)") and has_sleep
        )

        if not has_sleep:
            self.findings.append(Finding(
                category="Request Timing",
                severity="high",
                description="No delays between actions detected. Machine-speed interactions are the easiest behavior-based detection signal.",
                line=None,
                recommendation="Add random delays between page interactions: await asyncio.sleep(random.uniform(0.5, 2.0))",
                weight=SEVERITY_WEIGHTS["high"],
            ))
        elif not has_random_delay:
            self.findings.append(Finding(
                category="Request Timing",
                severity="medium",
                description="Fixed delays detected but no randomization. Constant timing intervals are detectable patterns.",
                line=self._find_line(r"(asyncio\.sleep|wait_for_timeout)"),
                recommendation="Use random delays: random.uniform(min_seconds, max_seconds) instead of fixed values.",
                weight=SEVERITY_WEIGHTS["medium"],
            ))
        else:
            self.findings.append(Finding(
                category="Request Timing",
                severity="info",
                description="Randomized delays detected between actions.",
                line=self._find_line(r"random\.(uniform|randint)"),
                recommendation="No action needed. Ensure delays are realistic (0.5-3s for browsing, 1-5s for reading).",
                weight=SEVERITY_WEIGHTS["info"],
            ))

    def _check_error_handling(self):
        """Check for error handling patterns (try/except and retry logic)."""
        has_try_except = self._has_pattern(r"try\s*:") and self._has_pattern(r"except")
        has_retry = self._has_pattern(r"retr(y|ies)") or self._has_pattern(r"max_retries|max_attempts")

        if not has_try_except:
            self.findings.append(Finding(
                category="Error Handling",
                severity="medium",
                description="No try/except blocks found. Unhandled errors will crash the automation and leave browser instances running.",
                line=None,
                recommendation="Wrap page interactions in try/except. Handle TimeoutError, network errors, and element-not-found gracefully.",
                weight=SEVERITY_WEIGHTS["medium"],
            ))
        elif not has_retry:
            self.findings.append(Finding(
                category="Error Handling",
                severity="low",
                description="Error handling present but no retry logic detected. Transient failures (network blips, slow loads) will cause data loss.",
                line=None,
                recommendation="Add retry with exponential backoff for network operations and element interactions.",
                weight=SEVERITY_WEIGHTS["low"],
            ))

    def _check_proxy(self):
        """Check for proxy configuration."""
        has_proxy = self._has_pattern(r"proxy\s*=\s*\{") or self._has_pattern(r"proxy.*server")

        if not has_proxy:
            self.findings.append(Finding(
                category="Proxy",
                severity="low",
                description="No proxy configuration detected. Running from a single IP address is fine for small jobs but will trigger rate limits at scale.",
                line=None,
                recommendation="For high-volume scraping, use rotating proxies: proxy={'server': 'http://proxy:port'}",
                weight=SEVERITY_WEIGHTS["low"],
            ))

    def _check_session_management(self):
        """Check for session/cookie persistence across runs."""
        has_storage_state = self._has_pattern(r"storage_state")
        has_cookies = self._has_pattern(r"cookies\(\)") or self._has_pattern(r"add_cookies")

        if not has_storage_state and not has_cookies:
            self.findings.append(Finding(
                category="Session Management",
                severity="low",
                description="No session persistence detected. Each run will start fresh, requiring re-authentication.",
                line=None,
                recommendation="Use storage_state() to save/restore sessions across runs. This avoids repeated logins that may trigger security alerts.",
                weight=SEVERITY_WEIGHTS["low"],
            ))

    def _check_browser_close(self):
        """Check if the browser is properly closed (explicitly or via context manager)."""
        has_close = self._has_pattern(r"browser\.close\(\)") or self._has_pattern(r"await.*close")
        has_context_manager = self._has_pattern(r"async\s+with\s+async_playwright")

        if not has_close and not has_context_manager:
            self.findings.append(Finding(
                category="Resource Cleanup",
                severity="medium",
                description="No browser.close() or context manager detected. Browser processes will leak on failure.",
                line=None,
                recommendation="Use 'async with async_playwright() as p:' or ensure browser.close() is in a finally block.",
                weight=SEVERITY_WEIGHTS["medium"],
            ))

    def _check_stealth_imports(self):
        """Check for stealth/anti-detection library usage (informational only)."""
        has_stealth = self._has_pattern(r"playwright_stealth|stealth_async|undetected")
        if has_stealth:
            self.findings.append(Finding(
                category="Stealth Library",
                severity="info",
                description="Third-party stealth library detected. These provide additional fingerprint evasion but add dependencies.",
                line=self._find_line(r"playwright_stealth|stealth_async|undetected"),
                recommendation="Stealth libraries are helpful but not a silver bullet. Still implement manual checks for user agent, viewport, and timing.",
                weight=SEVERITY_WEIGHTS["info"],
            ))

    def get_risk_score(self) -> int:
        """Calculate overall risk score (0-100). Higher = more detectable."""
        raw_score = sum(f.weight for f in self.findings)
        # Cap at 100
        return min(raw_score, 100)

    def get_risk_level(self) -> str:
        """Get human-readable risk level bucket for the current score."""
        score = self.get_risk_score()
        if score <= 10:
            return "LOW"
        elif score <= 30:
            return "MODERATE"
        elif score <= 50:
            return "HIGH"
        else:
            return "CRITICAL"

    def get_summary(self) -> dict:
        """Get a summary dict of the analysis (file, score, level, counts)."""
        # Severities are produced only by this module, so direct indexing is safe.
        severity_counts = {"critical": 0, "high": 0, "medium": 0, "low": 0, "info": 0}
        for f in self.findings:
            severity_counts[f.severity] += 1

        return {
            "file": self.file_path,
            "risk_score": self.get_risk_score(),
            "risk_level": self.get_risk_level(),
            "total_findings": len(self.findings),
            "severity_counts": severity_counts,
            # "info" findings are good-practice confirmations, not action items.
            "actionable_findings": len([f for f in self.findings if f.severity != "info"]),
        }
|
|
|
|
|
|
def format_text_report(checker: AntiDetectionChecker, verbose: bool = False) -> str:
    """Render the checker's findings as a human-readable text report.

    Args:
        checker: A checker whose check_all() has already been run.
        verbose: When True, informational findings are listed as well.

    Returns:
        The complete multi-line report as a single string.
    """
    summary = checker.get_summary()
    out: List[str] = []
    add = out.append

    rule = "=" * 60
    add(rule)
    add(" ANTI-DETECTION AUDIT REPORT")
    add(rule)
    add(f"File: {summary['file']}")
    add(f"Risk Score: {summary['risk_score']}/100 ({summary['risk_level']})")
    add(f"Total Issues: {summary['actionable_findings']} actionable, {summary['severity_counts']['info']} info")
    add("")

    # Severity breakdown -- only levels that actually occurred.
    for level in ("critical", "high", "medium", "low"):
        n = summary["severity_counts"][level]
        if n > 0:
            add(f" {level.upper():10s} {n}")
    add("")

    # Findings grouped by severity; info findings shown only in verbose mode.
    shown_levels = ["critical", "high", "medium", "low"]
    if verbose:
        shown_levels.append("info")

    for level in shown_levels:
        group = [f for f in checker.findings if f.severity == level]
        if not group:
            continue

        add(f"--- {level.upper()} ---")
        for finding in group:
            where = f" (line {finding.line})" if finding.line else ""
            add(f" [{finding.category}]{where}")
            add(f" {finding.description}")
            add(f" Fix: {finding.recommendation}")
        add("")

    # Pass/fail verdict mirroring the CLI exit-code thresholds.
    add("-" * 60)
    score = summary["risk_score"]
    if score <= 10:
        verdict = "Result: PASS - Low detection risk."
    elif score <= 30:
        verdict = "Result: PASS with warnings - Address medium/high issues for production use."
    else:
        verdict = "Result: FAIL - High detection risk. Fix critical and high issues before deploying."
    add(verdict)
    add("")

    return "\n".join(out)
|
|
|
|
|
|
def main():
    """CLI entry point: parse arguments, audit the file, print results, and
    exit with a code that reflects the detection risk (0/1/2)."""
    parser = argparse.ArgumentParser(
        description="Audit a Playwright script for common bot detection vectors.",
        epilog=(
            "Examples:\n"
            " %(prog)s --file scraper.py\n"
            " %(prog)s --file scraper.py --verbose\n"
            " %(prog)s --file scraper.py --json\n"
            "\n"
            "Exit codes:\n"
            " 0 - Low risk (score 0-10)\n"
            " 1 - Moderate to high risk (score 11-50)\n"
            " 2 - Critical risk (score 51+)\n"
        ),
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument("--file", required=True,
                        help="Path to the Playwright script to audit")
    parser.add_argument("--json", action="store_true", dest="json_output",
                        default=False, help="Output results as JSON")
    parser.add_argument("--verbose", action="store_true", default=False,
                        help="Include informational (non-actionable) findings in output")

    args = parser.parse_args()

    # Validate and read the target script; any failure exits with code 2.
    target = os.path.abspath(args.file)
    if not os.path.isfile(target):
        print(f"Error: File not found: {target}", file=sys.stderr)
        sys.exit(2)

    try:
        with open(target, "r", encoding="utf-8") as handle:
            source_text = handle.read()
    except Exception as exc:
        print(f"Error reading file: {exc}", file=sys.stderr)
        sys.exit(2)

    if not source_text.strip():
        print("Error: File is empty.", file=sys.stderr)
        sys.exit(2)

    checker = AntiDetectionChecker(source_text, target)
    checker.check_all()

    if args.json_output:
        payload = checker.get_summary()
        findings = [asdict(f) for f in checker.findings]
        if not args.verbose:
            # Hide informational findings unless explicitly requested.
            findings = [f for f in findings if f["severity"] != "info"]
        payload["findings"] = findings
        print(json.dumps(payload, indent=2))
    else:
        print(format_text_report(checker, verbose=args.verbose))

    # Exit code reflects the risk thresholds documented in the epilog.
    score = checker.get_risk_score()
    sys.exit(0 if score <= 10 else 1 if score <= 50 else 2)
|
|
|
|
|
|
# Script entry point guard: run the CLI only when executed directly.
if __name__ == "__main__":
    main()
|