- AgentHub: 13 files updated with non-engineering examples (content drafts, research, strategy) — engineering stays primary, cross-domain secondary - AgentHub: 7 slash commands, 5 Python scripts, 3 references, 1 agent, dry_run.py validation (57 checks) - Marketplace: agenthub entry added with cross-domain keywords, engineering POWERFUL updated (25→30), product (12→13), counts synced across all configs - SEO: generate-docs.py now produces keyword-rich <title> tags and meta descriptions using SKILL.md frontmatter — "Claude Code Skills" in site_name propagates to all 276 HTML pages - SEO: per-domain title suffixes (Agent Skill for Codex & OpenClaw, etc.), slug-as-title cleanup, domain label stripping from titles - Broken links: 141→0 warnings — new rewrite_skill_internal_links() converts references/, scripts/, assets/ links to GitHub source URLs; skills/index.md phantom slugs fixed (6 marketing, 7 RA/QM) - Counts synced: 204 skills, 266 tools, 382 refs, 16 agents, 17 commands, 21 plugins — consistent across CLAUDE.md, README.md, docs/index.md, marketplace.json, getting-started.md, mkdocs.yml - Platform sync: Codex 163 skills, Gemini 246 items, OpenClaw compatible Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
316 lines
10 KiB
Python
316 lines
10 KiB
Python
#!/usr/bin/env python3
|
|
"""Rank AgentHub agent results by metric or diff quality.
|
|
|
|
Runs an evaluation command in each agent's worktree, parses a metric,
|
|
and produces a ranked table.
|
|
|
|
Usage:
|
|
python result_ranker.py --session 20260317-143022 \\
|
|
--eval-cmd "pytest bench.py --json" --metric p50_ms --direction lower
|
|
|
|
python result_ranker.py --session 20260317-143022 --diff-summary
|
|
|
|
python result_ranker.py --demo
|
|
"""
|
|
|
|
import argparse
|
|
import json
|
|
import os
|
|
import re
|
|
import subprocess
|
|
import sys
|
|
|
|
|
|
def run_git(*args):
    """Run a git command and return its stripped standard output.

    Args:
        *args: Arguments appended verbatim after ``git``
            (e.g. ``"branch", "--list"``).

    Returns:
        The command's stdout with surrounding whitespace removed, or an empty
        string when git exits non-zero or cannot be started at all.
    """
    try:
        result = subprocess.run(
            ["git", *args],
            capture_output=True, text=True, check=True,
        )
    except (subprocess.CalledProcessError, OSError):
        # Best-effort by design: every caller treats "" as "no data".
        # OSError covers a missing or non-executable git binary, which would
        # otherwise surface as an unhandled traceback.
        return ""
    return result.stdout.strip()
|
|
|
|
|
|
def get_session_config(session_id):
    """Load a session's flat ``key: value`` configuration file.

    Reads ``.agenthub/sessions/<session_id>/config.yaml`` relative to the
    current working directory using a minimal line-based parser (no YAML
    library is involved): each non-comment line containing ``:`` is split on
    the first colon. Nested structures and inline comments are not
    interpreted.

    Args:
        session_id: Name of the session directory to read.

    Returns:
        A dict mapping each key to its string value, with surrounding
        whitespace and enclosing quotes removed.

    Raises:
        SystemExit: With status 1 (after printing to stderr) when the config
            file does not exist.
    """
    config_path = os.path.join(".agenthub", "sessions", session_id, "config.yaml")
    if not os.path.exists(config_path):
        print(f"Error: Session {session_id} not found", file=sys.stderr)
        sys.exit(1)

    config = {}
    # Explicit encoding so parsing does not depend on the platform locale.
    with open(config_path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line.startswith("#") or ":" not in line:
                continue
            key, val = line.split(":", 1)
            val = val.strip().strip('"')
            # Also unwrap a balanced pair of single quotes ('value').
            if len(val) >= 2 and val[0] == val[-1] == "'":
                val = val[1:-1]
            config[key.strip()] = val
    return config
|
|
|
|
|
|
def get_hub_branches(session_id):
    """Return the short names of all ``hub/<session_id>/*`` branches.

    Asks git for the matching branch list (one short ref name per line) and
    returns the non-blank names in the order git printed them. An empty list
    is returned when git produced no output.
    """
    listing = run_git(
        "branch", "--list", f"hub/{session_id}/*",
        "--format=%(refname:short)",
    )
    names = []
    if listing:
        for raw in listing.split("\n"):
            name = raw.strip()
            if name:
                names.append(name)
    return names
|
|
|
|
|
|
def get_worktree_path(branch):
    """Return the worktree directory that has ``branch`` checked out.

    Scans the output of ``git worktree list --porcelain``: each ``worktree``
    line sets the candidate path and the next ``branch`` line decides whether
    that candidate is the one being looked for.

    Args:
        branch: Short branch name (without the ``refs/heads/`` prefix).

    Returns:
        The worktree path as a string, or ``None`` when no listed worktree is
        on that branch or git produced no output.
    """
    output = run_git("worktree", "list", "--porcelain")
    if not output:
        return None

    head_prefix = "refs/heads/"
    current_path = None
    for line in output.split("\n"):
        if line.startswith("worktree "):
            current_path = line[len("worktree "):]
        elif line.startswith("branch ") and current_path:
            ref = line[len("branch "):]
            # Strip only a *leading* refs/heads/: str.replace() would also
            # remove the same text if it appeared later in the branch name.
            short = ref[len(head_prefix):] if ref.startswith(head_prefix) else ref
            if short == branch:
                return current_path
            # This worktree is on another branch; wait for the next entry.
            current_path = None
    return None
|
|
|
|
|
|
def run_eval_in_worktree(worktree_path, eval_cmd, timeout=120):
    """Run the evaluation command inside a worktree.

    Args:
        worktree_path: Directory used as the command's working directory.
        eval_cmd: Command line, interpreted by the system shell.
        timeout: Seconds to wait before giving up (default 120).

    Returns:
        A ``(output, returncode)`` tuple. ``output`` is the command's stripped
        stdout (stderr is captured but discarded). On timeout the tuple is
        ``("TIMEOUT", 1)``; if the process cannot be run or its output cannot
        be decoded, it is ``(<error message>, 1)``.
    """
    # NOTE: shell=True is intentional — eval_cmd is an operator-supplied
    # command line (pipes, flags, quoting). It must never come from
    # untrusted input.
    try:
        result = subprocess.run(
            eval_cmd, shell=True, capture_output=True, text=True,
            cwd=worktree_path, timeout=timeout,
        )
    except subprocess.TimeoutExpired:
        return "TIMEOUT", 1
    except (OSError, ValueError, subprocess.SubprocessError) as exc:
        # OSError: bad cwd / shell not launchable; ValueError: invalid
        # arguments or undecodable output.
        return str(exc), 1
    return result.stdout.strip(), result.returncode
|
|
|
|
|
|
def extract_metric(output, metric_name):
    """Extract a numeric metric from command output.

    Recognises the first occurrence (case-insensitive) of any of:
        - metric_name: 42.5
        - metric_name=42.5
        - "metric_name": 42.5
        - 'metric_name': 42.5

    The metric name is matched literally (regex metacharacters in it are
    escaped) and must not be the tail of a longer identifier, so ``p50_ms``
    does not match inside ``max_p50_ms``. Values may carry a sign and an
    exponent (``-3.5``, ``1.5e3``).

    Args:
        output: Text to search.
        metric_name: Name of the metric to look for.

    Returns:
        The value as a float, or ``None`` when the metric is not present or
        either argument is empty.
    """
    if not output or not metric_name:
        return None

    quote = r"""["']?"""
    number = r"([-+]?(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][-+]?\d+)?)"
    pattern = (
        r"(?<!\w)" + quote + re.escape(metric_name) + quote
        + r"\s*[:=]\s*" + number
    )
    match = re.search(pattern, output, re.IGNORECASE)
    return float(match.group(1)) if match else None
|
|
|
|
|
|
def get_diff_stats(branch, base_branch="main"):
    """Summarise how much ``branch`` changed relative to ``base_branch``.

    Parses the single summary line of
    ``git diff --shortstat <base_branch>...<branch>``.

    Args:
        branch: Branch whose changes are measured.
        base_branch: Branch to compare against (default ``"main"``).

    Returns:
        A dict with integer ``files_changed``, ``insertions``, ``deletions``
        and ``net_lines`` (insertions minus deletions). Every count is 0 when
        git reports nothing (no changes, or the git call failed).
    """
    # Only --shortstat is needed; a separate --stat call would be an extra
    # subprocess whose output is never read.
    shortstat = run_git("diff", "--shortstat", f"{base_branch}...{branch}")

    def _count(pattern):
        # git omits a section entirely when its count is zero.
        found = re.search(pattern, shortstat)
        return int(found.group(1)) if found else 0

    files_changed = _count(r"(\d+) files? changed")
    insertions = _count(r"(\d+) insertions?")
    deletions = _count(r"(\d+) deletions?")

    return {
        "files_changed": files_changed,
        "insertions": insertions,
        "deletions": deletions,
        "net_lines": insertions - deletions,
    }
|
|
|
|
|
|
def rank_by_metric(results, direction="lower"):
    """Order result dicts by ``metric_value`` and stamp each with a ``rank``.

    Entries that have a metric come first, sorted ascending unless
    ``direction`` is ``"higher"``, and are ranked 1..n. Entries whose metric
    is missing or ``None`` follow in their original order and all share rank
    n + 1. The ``rank`` key is written onto the input dicts themselves.
    """
    scored, unscored = [], []
    for entry in results:
        bucket = unscored if entry.get("metric_value") is None else scored
        bucket.append(entry)

    ordered = sorted(
        scored,
        key=lambda entry: entry["metric_value"],
        reverse=(direction == "higher"),
    )
    for position, entry in enumerate(ordered, start=1):
        entry["rank"] = position

    shared_last = len(ordered) + 1
    for entry in unscored:
        entry["rank"] = shared_last

    return ordered + unscored
|
|
|
|
|
|
def run_demo():
    """Print a canned example of the ranking report.

    Everything shown is hard-coded sample data; no git command or evaluation
    is run.
    """
    rule = "=" * 60
    row_fmt = "{:<6} {:<10} {:<10} {:<10} {:<7} {}"
    sample_rows = (
        ("1", "agent-2", "142ms", "-38ms", "2", "Replaced O(n²) with hash map lookup"),
        ("2", "agent-1", "165ms", "-15ms", "3", "Added caching layer"),
        ("3", "agent-3", "190ms", "+10ms", "1", "Minor loop optimizations"),
    )

    report = [
        rule,
        "AgentHub Result Ranker — Demo Mode",
        rule,
        "",
        "Session: 20260317-143022",
        "Eval: pytest bench.py --json",
        "Metric: p50_ms (lower is better)",
        "Baseline: 180ms",
        "",
        row_fmt.format("RANK", "AGENT", "METRIC", "DELTA", "FILES", "SUMMARY"),
        "-" * 75,
    ]
    report.extend(row_fmt.format(*row) for row in sample_rows)
    report.extend([
        "",
        "Winner: agent-2 (142ms, -21% from baseline)",
        "",
        "Next step: Run /hub:merge to merge agent-2's branch",
    ])

    for text in report:
        print(text)
|
|
|
|
|
|
def _agent_id_from_branch(branch):
    """Derive a display name such as ``agent-2`` from a hub branch name."""
    # Accept both hub/<session>/agent-N/<rest> and a bare hub/<session>/agent-N.
    match = re.match(r"hub/[^/]+/agent-(\d+)(?:/|$)", branch)
    if match:
        return f"agent-{match.group(1)}"
    parts = branch.split("/")
    return parts[-2] if len(parts) >= 2 else branch


def main():
    """Command-line entry point: evaluate, rank and report a session's agents.

    For every ``hub/<session>/*`` branch this collects diff statistics and,
    when both an eval command and a metric name are available (from the CLI
    or the session config), runs the command in the branch's worktree and
    extracts the metric. Results are ranked and printed as a table or JSON.

    CLI options take precedence over the session config; ``--direction``
    falls back to the config's ``direction`` and finally to ``"lower"``.

    Raises:
        SystemExit: With status 1 when ``--session`` is missing (or, via
            ``get_session_config``, when the session does not exist).
    """
    parser = argparse.ArgumentParser(
        description="Rank AgentHub agent results"
    )
    parser.add_argument("--session", type=str,
                        help="Session ID to evaluate")
    parser.add_argument("--eval-cmd", type=str,
                        help="Evaluation command to run in each worktree")
    parser.add_argument("--metric", type=str,
                        help="Metric name to extract from eval output")
    # No argparse default here: a default of "lower" is always truthy and
    # would make the session config's direction unreachable.
    parser.add_argument("--direction", choices=["lower", "higher"],
                        default=None,
                        help="Whether lower or higher metric is better")
    parser.add_argument("--baseline", type=float,
                        help="Baseline metric value for delta calculation")
    parser.add_argument("--diff-summary", action="store_true",
                        help="Show diff statistics per agent (no eval cmd needed)")
    parser.add_argument("--format", choices=["table", "json"], default="table",
                        help="Output format (default: table)")
    parser.add_argument("--demo", action="store_true",
                        help="Show demo output")
    args = parser.parse_args()

    if args.demo:
        run_demo()
        return

    if not args.session:
        print("Error: --session is required", file=sys.stderr)
        sys.exit(1)

    config = get_session_config(args.session)
    branches = get_hub_branches(args.session)

    if not branches:
        print(f"No branches found for session {args.session}")
        return

    eval_cmd = args.eval_cmd or config.get("eval_cmd")
    metric = args.metric or config.get("metric")
    direction = args.direction or config.get("direction") or "lower"
    if direction not in ("lower", "higher"):
        # Only a config value can be invalid; argparse validates the CLI one.
        print(f"Warning: unknown direction {direction!r} in session config; "
              "using 'lower'", file=sys.stderr)
        direction = "lower"
    base_branch = config.get("base_branch", "main")

    results = []
    for branch in branches:
        result = {
            "agent": _agent_id_from_branch(branch),
            "branch": branch,
            "metric_value": None,
            "metric_raw": None,
            "diff": get_diff_stats(branch, base_branch),
        }

        if eval_cmd and metric:
            worktree = get_worktree_path(branch)
            if worktree:
                output, returncode = run_eval_in_worktree(worktree, eval_cmd)
                result["metric_raw"] = output
                result["eval_returncode"] = returncode
                # A failed eval run leaves the metric unset.
                if returncode == 0:
                    result["metric_value"] = extract_metric(output, metric)

        results.append(result)

    # Rank
    ranked = rank_by_metric(results, direction)

    # Calculate deltas
    baseline = args.baseline
    if baseline is None and ranked and ranked[0].get("metric_value") is not None:
        # Use worst as baseline if not specified
        values = [r["metric_value"] for r in ranked if r["metric_value"] is not None]
        if values:
            baseline = max(values) if direction == "lower" else min(values)

    for r in ranked:
        if r.get("metric_value") is not None and baseline is not None:
            r["delta"] = r["metric_value"] - baseline
        else:
            r["delta"] = None

    if args.format == "json":
        print(json.dumps({"session": args.session, "results": ranked}, indent=2))
        return

    # Table output
    print(f"Session: {args.session}")
    if eval_cmd:
        print(f"Eval: {eval_cmd}")
    if metric:
        dir_str = "lower is better" if direction == "lower" else "higher is better"
        print(f"Metric: {metric} ({dir_str})")
    # "is not None" so that a legitimate baseline of 0 is still reported.
    if baseline is not None:
        print(f"Baseline: {baseline}")
    print()

    if args.diff_summary or not eval_cmd:
        header = f"{'RANK':<6} {'AGENT':<12} {'FILES':<7} {'ADDED':<8} {'REMOVED':<8} {'NET':<6}"
        print(header)
        print("-" * 50)
        for i, r in enumerate(ranked):
            d = r["diff"]
            print(f"{i+1:<6} {r['agent']:<12} {d['files_changed']:<7} "
                  f"+{d['insertions']:<7} -{d['deletions']:<7} {d['net_lines']:<6}")
    else:
        header = f"{'RANK':<6} {'AGENT':<12} {'METRIC':<12} {'DELTA':<10} {'FILES':<7}"
        print(header)
        print("-" * 50)
        for r in ranked:
            mv = str(r["metric_value"]) if r["metric_value"] is not None else "N/A"
            delta = ""
            if r["delta"] is not None:
                sign = "+" if r["delta"] >= 0 else ""
                delta = f"{sign}{r['delta']:.1f}"
            print(f"{r['rank']:<6} {r['agent']:<12} {mv:<12} {delta:<10} {r['diff']['files_changed']:<7}")

    # Winner
    if ranked and ranked[0].get("metric_value") is not None:
        winner = ranked[0]
        print()
        print(f"Winner: {winner['agent']} ({winner['metric_value']})")
|
|
# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|