refactor: autoresearch-agent v2.0 — multi-experiment, multi-domain, real-world evaluators
Major rewrite based on deep study of Karpathy's autoresearch repo.
Architecture changes:
- Multi-experiment support: .autoresearch/{domain}/{name}/ structure
- Domain categories: engineering, marketing, content, prompts, custom
- Project-level (git-tracked, shareable) or user-level (~/.autoresearch/) scope
- User chooses scope during setup, not installation
New evaluators (8 ready-to-use):
- Free: benchmark_speed, benchmark_size, test_pass_rate, build_speed, memory_usage
- LLM judge (uses existing subscription): llm_judge_content, llm_judge_prompt, llm_judge_copy
- LLM judges call user's CLI tool (claude/codex/gemini) — no extra API keys needed
Script improvements:
- setup_experiment.py: --domain, --scope, --evaluator, --list, --list-evaluators
- run_experiment.py: --experiment domain/name, --resume, --loop, --single
- log_results.py: --dashboard, --domain, --format csv|markdown|terminal, --output
Results export:
- Terminal (default), CSV, and Markdown formats
- Per-experiment, per-domain, or cross-experiment dashboard view
SKILL.md rewritten:
- Clear activation triggers (when the skill should activate)
- Practical examples for each domain
- Evaluator documentation with cost transparency
- Simplified loop protocol matching Karpathy's original philosophy
This commit is contained in:
56
engineering/autoresearch-agent/evaluators/benchmark_size.py
Normal file
56
engineering/autoresearch-agent/evaluators/benchmark_size.py
Normal file
@@ -0,0 +1,56 @@
|
||||
#!/usr/bin/env python3
"""Measure file, bundle, or Docker image size.

Prints size_bytes / size_kb / size_mb for whichever target option is
configured below. Exits 1 with a message on stderr if the target cannot
be built or measured.

DO NOT MODIFY after experiment starts — this is the fixed evaluator."""

import os
import subprocess
import sys

# --- CONFIGURE ONE OF THESE ---
# Option 1: File size
TARGET_FILE = "dist/main.js"

# Option 2: Directory size (uncomment to use)
# TARGET_DIR = "dist/"

# Option 3: Docker image (uncomment to use)
# DOCKER_IMAGE = "myapp:latest"
# DOCKER_BUILD_CMD = "docker build -t myapp:latest ."

# Option 4: Build first, then measure (uncomment to use)
# BUILD_CMD = "npm run build"
# --- END CONFIG ---

# Build if needed. The option names only exist in globals() when the user
# uncommented them above (a module-level dir() check would be redundant).
if "BUILD_CMD" in globals():
    result = subprocess.run(BUILD_CMD, shell=True, capture_output=True)
    if result.returncode != 0:
        print(f"Build failed: {result.stderr.decode()[:200]}", file=sys.stderr)
        sys.exit(1)

# Measure whichever target option was configured.
if "DOCKER_IMAGE" in globals():
    if "DOCKER_BUILD_CMD" in globals():
        build = subprocess.run(DOCKER_BUILD_CMD, shell=True, capture_output=True)
        if build.returncode != 0:
            print(f"Docker build failed: {build.stderr.decode()[:200]}",
                  file=sys.stderr)
            sys.exit(1)
    result = subprocess.run(
        f"docker image inspect {DOCKER_IMAGE} --format '{{{{.Size}}}}'",
        shell=True, capture_output=True, text=True
    )
    # Guard the int() conversion: a failed inspect leaves stdout empty and
    # would otherwise crash with a raw ValueError traceback.
    size_text = result.stdout.strip()
    if result.returncode != 0 or not size_text.isdigit():
        print(f"Docker inspect failed: {result.stderr[:200]}", file=sys.stderr)
        sys.exit(1)
    size_bytes = int(size_text)
elif "TARGET_DIR" in globals():
    # Recursive sum of every file under the directory.
    size_bytes = sum(
        os.path.getsize(os.path.join(dp, f))
        for dp, _, fns in os.walk(TARGET_DIR) for f in fns
    )
elif os.path.exists(TARGET_FILE):
    size_bytes = os.path.getsize(TARGET_FILE)
else:
    print(f"Target not found: {TARGET_FILE}", file=sys.stderr)
    sys.exit(1)

size_kb = size_bytes / 1024
size_mb = size_bytes / (1024 * 1024)

print(f"size_bytes: {size_bytes}")
print(f"size_kb: {size_kb:.1f}")
print(f"size_mb: {size_mb:.2f}")
|
||||
40
engineering/autoresearch-agent/evaluators/benchmark_speed.py
Normal file
40
engineering/autoresearch-agent/evaluators/benchmark_speed.py
Normal file
@@ -0,0 +1,40 @@
|
||||
#!/usr/bin/env python3
"""Measure execution speed of a target function or command.

Runs COMMAND several times and prints p50_ms / p95_ms / runs. Exits 1 on
any failed or timed-out run.

DO NOT MODIFY after experiment starts — this is the fixed evaluator."""

import statistics
import subprocess
import sys
import time

# --- CONFIGURE THESE ---
COMMAND = "python src/module.py"  # Command to benchmark
RUNS = 5     # Number of runs
WARMUP = 1   # Warmup runs (not counted)
# --- END CONFIG ---

times = []

# Warmup — results discarded so caches settle before timing starts.
for _ in range(WARMUP):
    try:
        subprocess.run(COMMAND, shell=True, capture_output=True, timeout=120)
    except subprocess.TimeoutExpired:
        # Report cleanly rather than crashing with a traceback.
        print("Warmup run timed out after 120s", file=sys.stderr)
        sys.exit(1)

# Benchmark
for i in range(RUNS):
    t0 = time.perf_counter()
    try:
        result = subprocess.run(COMMAND, shell=True, capture_output=True,
                                timeout=120)
    except subprocess.TimeoutExpired:
        print(f"Run {i+1} timed out after 120s", file=sys.stderr)
        sys.exit(1)
    elapsed = (time.perf_counter() - t0) * 1000  # ms

    if result.returncode != 0:
        print(f"Run {i+1} failed (exit {result.returncode})", file=sys.stderr)
        print(f"stderr: {result.stderr.decode()[:200]}", file=sys.stderr)
        sys.exit(1)

    times.append(elapsed)

p50 = statistics.median(times)
# With fewer than 5 samples a 95th percentile is meaningless; use the max.
p95 = sorted(times)[int(len(times) * 0.95)] if len(times) >= 5 else max(times)

print(f"p50_ms: {p50:.2f}")
print(f"p95_ms: {p95:.2f}")
print(f"runs: {RUNS}")
|
||||
39
engineering/autoresearch-agent/evaluators/build_speed.py
Normal file
39
engineering/autoresearch-agent/evaluators/build_speed.py
Normal file
@@ -0,0 +1,39 @@
|
||||
#!/usr/bin/env python3
"""Measure build/compile time.

Runs BUILD_CMD several times (optionally cleaning first) and prints the
median as build_seconds plus the mean as build_avg. Exits 1 on any
failed or timed-out build.

DO NOT MODIFY after experiment starts — this is the fixed evaluator."""

import statistics
import subprocess
import sys
import time

# --- CONFIGURE THESE ---
BUILD_CMD = "npm run build"  # or: docker build -t test .
CLEAN_CMD = ""  # optional: npm run clean (run before each build)
RUNS = 3  # Number of builds to average
# --- END CONFIG ---

times = []

for i in range(RUNS):
    # Clean if configured, so every build starts from the same state.
    if CLEAN_CMD:
        subprocess.run(CLEAN_CMD, shell=True, capture_output=True, timeout=60)

    t0 = time.perf_counter()
    try:
        result = subprocess.run(BUILD_CMD, shell=True, capture_output=True,
                                timeout=600)
    except subprocess.TimeoutExpired:
        # Report cleanly rather than crashing with a traceback.
        print(f"Build {i+1} timed out after 600s", file=sys.stderr)
        sys.exit(1)
    elapsed = time.perf_counter() - t0

    if result.returncode != 0:
        print(f"Build {i+1} failed (exit {result.returncode})", file=sys.stderr)
        print(f"stderr: {result.stderr.decode()[:200]}", file=sys.stderr)
        sys.exit(1)

    times.append(elapsed)

avg = statistics.mean(times)
median = statistics.median(times)

# Median is the headline metric (robust to one slow outlier build).
print(f"build_seconds: {median:.2f}")
print(f"build_avg: {avg:.2f}")
print(f"runs: {RUNS}")
|
||||
@@ -0,0 +1,72 @@
|
||||
#!/usr/bin/env python3
"""LLM judge for content quality (headlines, titles, descriptions).

Uses the user's existing CLI tool (claude, codex, gemini) for evaluation.
Prints the per-criterion score lines plus ctr_score; exits 1 if the CLI
call fails or the output cannot be parsed.

DO NOT MODIFY after experiment starts — this is the fixed evaluator."""

import subprocess
import sys
from pathlib import Path

# --- CONFIGURE THESE ---
TARGET_FILE = "content/titles.md"  # File being optimized
CLI_TOOL = "claude"  # or: codex, gemini
# --- END CONFIG ---

# The judge prompt is FIXED — the agent cannot change how it's evaluated
JUDGE_PROMPT = """You are a content quality evaluator. Score the following content strictly.

Criteria (each scored 1-10):

1. CURIOSITY GAP — Does this make you want to click? Is there an information gap
   that can only be resolved by reading? Generic titles score 1-3. Specific,
   intriguing titles score 7-10.

2. SPECIFICITY — Are there concrete numbers, tools, or details? "How I improved
   performance" = 2. "How I reduced API latency from 800ms to 185ms" = 9.

3. EMOTIONAL PULL — Does it trigger curiosity, surprise, fear of missing out,
   or recognition? Flat titles score 1-3. Emotionally charged score 7-10.

4. SCROLL-STOP POWER — Would this stop someone scrolling through a feed or
   search results? Would they pause on this headline? Rate honestly.

5. SEO KEYWORD PRESENCE — Are searchable, high-intent terms present naturally?
   Keyword-stuffed = 3. Natural integration of search terms = 8-10.

Output EXACTLY this format (nothing else):
curiosity: <score>
specificity: <score>
emotional: <score>
scroll_stop: <score>
seo: <score>
ctr_score: <average of all 5 scores>

Be harsh. Most content is mediocre (4-6 range). Only exceptional content scores 8+."""

target = Path(TARGET_FILE)
if not target.exists():
    # Fail with a clear message instead of a FileNotFoundError traceback.
    print(f"Target not found: {TARGET_FILE}", file=sys.stderr)
    sys.exit(1)
content = target.read_text()
full_prompt = f"{JUDGE_PROMPT}\n\n---\n\nContent to evaluate:\n\n{content}"

# Call the user's CLI tool
try:
    result = subprocess.run(
        [CLI_TOOL, "-p", full_prompt],
        capture_output=True, text=True, timeout=120
    )
except FileNotFoundError:
    print(f"CLI tool not found: {CLI_TOOL}", file=sys.stderr)
    sys.exit(1)
except subprocess.TimeoutExpired:
    print("LLM judge timed out after 120s", file=sys.stderr)
    sys.exit(1)

if result.returncode != 0:
    print(f"LLM judge failed: {result.stderr[:200]}", file=sys.stderr)
    sys.exit(1)

# Parse output — echo only the recognized score lines.
output = result.stdout
for line in output.splitlines():
    line = line.strip()
    if line.startswith(("curiosity:", "specificity:", "emotional:",
                        "scroll_stop:", "seo:", "ctr_score:")):
        print(line)

# Verify ctr_score was found
if "ctr_score:" not in output:
    print("Could not parse ctr_score from LLM output", file=sys.stderr)
    print(f"Raw output: {output[:500]}", file=sys.stderr)
    sys.exit(1)
|
||||
88
engineering/autoresearch-agent/evaluators/llm_judge_copy.py
Normal file
88
engineering/autoresearch-agent/evaluators/llm_judge_copy.py
Normal file
@@ -0,0 +1,88 @@
|
||||
#!/usr/bin/env python3
"""LLM judge for marketing copy (social posts, ads, emails).

Uses the user's existing CLI tool for evaluation. Prints criterion_N and
engagement_score lines; exits 1 if the CLI call fails or the output
cannot be parsed.

DO NOT MODIFY after experiment starts — this is the fixed evaluator."""

import subprocess
import sys
from pathlib import Path

# --- CONFIGURE THESE ---
TARGET_FILE = "posts.md"  # Copy being optimized
CLI_TOOL = "claude"  # or: codex, gemini
PLATFORM = "twitter"  # twitter, linkedin, instagram, email, ad
# --- END CONFIG ---

# Per-platform judging criteria; prompts are FIXED for the experiment.
JUDGE_PROMPTS = {
    "twitter": """Score this Twitter/X post strictly:
1. HOOK (1-10) — Does the first line stop the scroll?
2. VALUE (1-10) — Does it provide insight, entertainment, or utility?
3. ENGAGEMENT (1-10) — Would people reply, retweet, or like?
4. BREVITY (1-10) — Is every word earning its place? No filler?
5. CTA (1-10) — Is there a clear next action (even implicit)?""",

    "linkedin": """Score this LinkedIn post strictly:
1. HOOK (1-10) — Does the first line make you click "see more"?
2. STORYTELLING (1-10) — Is there a narrative arc or just statements?
3. CREDIBILITY (1-10) — Does it demonstrate expertise without bragging?
4. ENGAGEMENT (1-10) — Would professionals comment or share?
5. CTA (1-10) — Does it invite discussion or action?""",

    "instagram": """Score this Instagram caption strictly:
1. HOOK (1-10) — Does the first line grab attention?
2. RELATABILITY (1-10) — Does the audience see themselves in this?
3. VISUAL MATCH (1-10) — Does the copy complement visual content?
4. HASHTAG STRATEGY (1-10) — Are hashtags relevant and not spammy?
5. CTA (1-10) — Does it encourage saves, shares, or comments?""",

    "email": """Score this email subject + preview strictly:
1. OPEN INCENTIVE (1-10) — Would you open this in a crowded inbox?
2. SPECIFICITY (1-10) — Is it concrete or vague?
3. URGENCY (1-10) — Is there a reason to open now vs later?
4. PERSONALIZATION (1-10) — Does it feel written for someone, not everyone?
5. PREVIEW SYNC (1-10) — Does the preview text complement the subject?""",

    "ad": """Score this ad copy strictly:
1. ATTENTION (1-10) — Does it stop someone scrolling past ads?
2. DESIRE (1-10) — Does it create want for the product/service?
3. PROOF (1-10) — Is there credibility (numbers, social proof)?
4. ACTION (1-10) — Is the CTA clear and compelling?
5. OBJECTION HANDLING (1-10) — Does it preempt "why not"?""",
}

# Unknown platforms fall back to the twitter rubric.
platform_prompt = JUDGE_PROMPTS.get(PLATFORM, JUDGE_PROMPTS["twitter"])

JUDGE_PROMPT = f"""{platform_prompt}

Output EXACTLY this format:
criterion_1: <score>
criterion_2: <score>
criterion_3: <score>
criterion_4: <score>
criterion_5: <score>
engagement_score: <average of all 5>

Be harsh. Most copy is mediocre (4-6). Only exceptional copy scores 8+."""

target = Path(TARGET_FILE)
if not target.exists():
    # Fail with a clear message instead of a FileNotFoundError traceback.
    print(f"Target not found: {TARGET_FILE}", file=sys.stderr)
    sys.exit(1)
content = target.read_text()
full_prompt = f"{JUDGE_PROMPT}\n\n---\n\nCopy to evaluate:\n\n{content}"

try:
    result = subprocess.run(
        [CLI_TOOL, "-p", full_prompt],
        capture_output=True, text=True, timeout=120
    )
except FileNotFoundError:
    print(f"CLI tool not found: {CLI_TOOL}", file=sys.stderr)
    sys.exit(1)
except subprocess.TimeoutExpired:
    print("LLM judge timed out after 120s", file=sys.stderr)
    sys.exit(1)

if result.returncode != 0:
    print(f"LLM judge failed: {result.stderr[:200]}", file=sys.stderr)
    sys.exit(1)

# Echo only the recognized score lines from the judge's reply.
output = result.stdout
for line in output.splitlines():
    line = line.strip()
    if line.startswith("engagement_score:") or line.startswith("criterion_"):
        print(line)

if "engagement_score:" not in output:
    print("Could not parse engagement_score from LLM output", file=sys.stderr)
    print(f"Raw: {output[:500]}", file=sys.stderr)
    sys.exit(1)
|
||||
@@ -0,0 +1,99 @@
|
||||
#!/usr/bin/env python3
"""LLM judge for prompt/instruction quality.

Uses the user's existing CLI tool for evaluation: for each test case it
generates an answer with the prompt under test, then asks the CLI tool
to judge the answer against the reference. Prints quality_score (0-100),
cases_tested, and avg_per_case. A failed/timed-out case scores 0 rather
than aborting the whole evaluation.

DO NOT MODIFY after experiment starts — this is the fixed evaluator."""

import json
import subprocess
import sys
from pathlib import Path

# --- CONFIGURE THESE ---
TARGET_FILE = "prompt.md"  # Prompt being optimized
TEST_CASES_FILE = "tests/cases.json"  # Test cases: [{"input": "...", "expected": "..."}]
CLI_TOOL = "claude"  # or: codex, gemini
# --- END CONFIG ---

JUDGE_PROMPT_TEMPLATE = """You are evaluating a system prompt's effectiveness.

SYSTEM PROMPT BEING TESTED:
{prompt}

TEST INPUT:
{input}

EXPECTED OUTPUT (reference):
{expected}

ACTUAL OUTPUT:
{actual}

Score the actual output on these criteria (each 1-10):
1. ACCURACY — Does it match the expected output's intent and facts?
2. COMPLETENESS — Does it cover all required elements?
3. CLARITY — Is it well-structured and easy to understand?
4. INSTRUCTION_FOLLOWING — Does it follow the system prompt's guidelines?

Output EXACTLY: quality_score: <average of all 4>
Nothing else."""


def _call_cli(prompt_text):
    """Run the CLI tool once; return stdout, or None on failure or timeout."""
    try:
        proc = subprocess.run(
            [CLI_TOOL, "-p", prompt_text],
            capture_output=True, text=True, timeout=60
        )
    except (FileNotFoundError, subprocess.TimeoutExpired):
        # Missing binary or a hung call should not crash the evaluation;
        # the caller scores the case 0 instead.
        return None
    if proc.returncode != 0:
        return None
    return proc.stdout


try:
    prompt = Path(TARGET_FILE).read_text()
    test_cases = json.loads(Path(TEST_CASES_FILE).read_text())
except (OSError, json.JSONDecodeError) as err:
    print(f"Could not load inputs: {err}", file=sys.stderr)
    sys.exit(1)

scores = []

for i, case in enumerate(test_cases):
    # Step 1: generate an answer using the prompt under test.
    gen_output = _call_cli(f"{prompt}\n\n{case['input']}")
    if gen_output is None:
        print(f"Generation failed for case {i+1}", file=sys.stderr)
        scores.append(0)
        continue

    actual = gen_output.strip()

    # Step 2: judge the answer. Inputs are truncated to keep the judge
    # prompt bounded regardless of prompt/answer length.
    judge_output = _call_cli(JUDGE_PROMPT_TEMPLATE.format(
        prompt=prompt[:500],
        input=case["input"],
        expected=case.get("expected", "N/A"),
        actual=actual[:500],
    ))
    if judge_output is None:
        scores.append(0)
        continue

    # Step 3: parse the score; an unparseable reply scores 0.
    for line in judge_output.splitlines():
        if "quality_score:" in line:
            try:
                scores.append(float(line.split(":")[-1].strip()))
            except ValueError:
                scores.append(0)
            break
    else:
        scores.append(0)

    print(f"  Case {i+1}/{len(test_cases)}: {scores[-1]:.1f}", file=sys.stderr)

if not scores:
    print("No test cases evaluated", file=sys.stderr)
    sys.exit(1)

avg = sum(scores) / len(scores)
quality = avg * 10  # Scale the 1-10 per-case average to 0-100

print(f"quality_score: {quality:.2f}")
print(f"cases_tested: {len(scores)}")
print(f"avg_per_case: {avg:.2f}")
|
||||
52
engineering/autoresearch-agent/evaluators/memory_usage.py
Normal file
52
engineering/autoresearch-agent/evaluators/memory_usage.py
Normal file
@@ -0,0 +1,52 @@
|
||||
#!/usr/bin/env python3
"""Measure peak memory usage of a command.

DO NOT MODIFY after experiment starts — this is the fixed evaluator."""

import os
import platform
import subprocess
import sys

# --- CONFIGURE THESE ---
COMMAND = "python src/module.py"  # Command to measure
# --- END CONFIG ---

system = platform.system()

if system == "Linux":
    # GNU time's -v flag reports "Maximum resident set size (kbytes)" on stderr.
    proc = subprocess.run(
        f"/usr/bin/time -v {COMMAND}",
        shell=True, capture_output=True, text=True, timeout=300
    )
    for raw_line in proc.stderr.splitlines():
        if "Maximum resident set size" in raw_line:
            peak_kb = int(raw_line.split(":")[-1].strip())
            print(f"peak_mb: {peak_kb / 1024:.1f}")
            print(f"peak_kb: {peak_kb}")
            sys.exit(0)
    print("Could not parse memory from /usr/bin/time output", file=sys.stderr)
    sys.exit(1)

elif system == "Darwin":
    # BSD time's -l flag reports "maximum resident set size" (in bytes) on stderr.
    proc = subprocess.run(
        f"/usr/bin/time -l {COMMAND}",
        shell=True, capture_output=True, text=True, timeout=300
    )
    for raw_line in proc.stderr.splitlines():
        if "maximum resident set size" in raw_line.lower():
            peak_bytes = int(raw_line.strip().split()[0])
            print(f"peak_mb: {peak_bytes / (1024 * 1024):.1f}")
            sys.exit(0)
    print("Could not parse memory from time output", file=sys.stderr)
    sys.exit(1)

else:
    print(f"Unsupported platform: {system}. Use Linux or macOS.", file=sys.stderr)
    sys.exit(1)
|
||||
55
engineering/autoresearch-agent/evaluators/test_pass_rate.py
Normal file
55
engineering/autoresearch-agent/evaluators/test_pass_rate.py
Normal file
@@ -0,0 +1,55 @@
|
||||
#!/usr/bin/env python3
"""Measure test suite pass rate.

Runs TEST_CMD and parses either pytest's summary line or unittest's
"Ran N tests" output. Prints pass_rate / passed / failed / total;
exits 1 if no result could be parsed.

DO NOT MODIFY after experiment starts — this is the fixed evaluator."""

import re
import subprocess
import sys

# --- CONFIGURE THESE ---
TEST_CMD = "pytest tests/ --tb=no -q"  # Test command
# --- END CONFIG ---

result = subprocess.run(TEST_CMD, shell=True, capture_output=True, text=True,
                        timeout=300)
output = result.stdout + "\n" + result.stderr

# Try to parse pytest output: "5 passed, 2 failed in 1.23s"
passed = failed = errors = 0

match = re.search(r"(\d+) passed", output)
if match:
    passed = int(match.group(1))
match = re.search(r"(\d+) failed", output)
if match:
    failed = int(match.group(1))
match = re.search(r"(\d+) error", output)
if match:
    errors = int(match.group(1))

total = passed + failed + errors
if total == 0:
    # Fall back to unittest format: "Ran X tests" then, on failure,
    # "FAILED (failures=Y, errors=Z)" — either part may be absent.
    match = re.search(r"Ran (\d+) test", output)
    if match:
        total = int(match.group(1))
        if result.returncode == 0:
            passed = total
        else:
            fail_match = re.search(r"failures=(\d+)", output)
            if fail_match:
                failed = int(fail_match.group(1))
            err_match = re.search(r"errors=(\d+)", output)
            if err_match:
                errors = int(err_match.group(1))
            # Errored tests did not pass either — subtract both counts.
            passed = total - failed - errors

if total == 0:
    print("Could not parse test results", file=sys.stderr)
    print(f"Output: {output[:500]}", file=sys.stderr)
    sys.exit(1)

rate = passed / total

print(f"pass_rate: {rate:.4f}")
print(f"passed: {passed}")
print(f"failed: {failed}")
print(f"total: {total}")
|
||||
Reference in New Issue
Block a user