Files
claude-skills-reference/engineering/agenthub/scripts/result_ranker.py
Reza Rezvani 2f57ef8948 feat(agenthub): add AgentHub plugin with cross-domain examples, SEO optimization, and docs site fixes
- AgentHub: 13 files updated with non-engineering examples (content drafts,
  research, strategy) — engineering stays primary, cross-domain secondary
- AgentHub: 7 slash commands, 5 Python scripts, 3 references, 1 agent,
  dry_run.py validation (57 checks)
- Marketplace: agenthub entry added with cross-domain keywords, engineering
  POWERFUL updated (25→30), product (12→13), counts synced across all configs
- SEO: generate-docs.py now produces keyword-rich <title> tags and meta
  descriptions using SKILL.md frontmatter — "Claude Code Skills" in site_name
  propagates to all 276 HTML pages
- SEO: per-domain title suffixes (Agent Skill for Codex & OpenClaw, etc.),
  slug-as-title cleanup, domain label stripping from titles
- Broken links: 141→0 warnings — new rewrite_skill_internal_links() converts
  references/, scripts/, assets/ links to GitHub source URLs; skills/index.md
  phantom slugs fixed (6 marketing, 7 RA/QM)
- Counts synced: 204 skills, 266 tools, 382 refs, 16 agents, 17 commands,
  21 plugins — consistent across CLAUDE.md, README.md, docs/index.md,
  marketplace.json, getting-started.md, mkdocs.yml
- Platform sync: Codex 163 skills, Gemini 246 items, OpenClaw compatible

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-17 12:10:46 +01:00

316 lines
10 KiB
Python

#!/usr/bin/env python3
"""Rank AgentHub agent results by metric or diff quality.
Runs an evaluation command in each agent's worktree, parses a metric,
and produces a ranked table.
Usage:
python result_ranker.py --session 20260317-143022 \\
--eval-cmd "pytest bench.py --json" --metric p50_ms --direction lower
python result_ranker.py --session 20260317-143022 --diff-summary
python result_ranker.py --demo
"""
import argparse
import json
import os
import re
import subprocess
import sys
def run_git(*args):
    """Run a git command and return its stripped stdout.

    Args:
        *args: Arguments handed to ``git`` as separate argv entries
            (no shell is involved).

    Returns:
        The command's stdout with surrounding whitespace removed, or an
        empty string when git exits non-zero or cannot be started at all.
    """
    try:
        result = subprocess.run(
            ["git", *args],
            capture_output=True, text=True, check=True,
        )
    except (subprocess.CalledProcessError, OSError):
        # Best-effort helper: callers treat "" as "nothing found", so a
        # failing command and a missing/unlaunchable git executable
        # (FileNotFoundError, PermissionError) are reported the same way.
        return ""
    return result.stdout.strip()
def get_session_config(session_id):
    """Load a session's flat ``key: value`` configuration file.

    The file is read from ``.agenthub/sessions/<session_id>/config.yaml``
    relative to the current working directory. Only simple one-line
    ``key: value`` pairs are understood; lines starting with ``#`` and
    lines without a colon are skipped.

    Args:
        session_id: Name of the session directory to read.

    Returns:
        Dict mapping each key to its string value. One pair of matching
        surrounding quotes (single or double) is removed from a value.

    Exits:
        Prints an error to stderr and exits with status 1 when the config
        file does not exist.
    """
    config_path = os.path.join(".agenthub", "sessions", session_id, "config.yaml")
    if not os.path.exists(config_path):
        print(f"Error: Session {session_id} not found", file=sys.stderr)
        sys.exit(1)
    config = {}
    with open(config_path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line.startswith("#") or ":" not in line:
                continue
            key, val = line.split(":", 1)
            val = val.strip()
            # Unwrap only a matching pair of quotes. Blindly stripping '"'
            # would corrupt unquoted values that merely end in a quote,
            # e.g.  eval_cmd: python -c "print(1)"
            if len(val) >= 2 and val[0] == val[-1] and val[0] in "\"'":
                val = val[1:-1]
            config[key.strip()] = val
    return config
def get_hub_branches(session_id):
    """Return the short names of every ``hub/<session_id>/*`` branch.

    Args:
        session_id: Session whose hub branches should be listed.

    Returns:
        List of branch names; empty when git reports none.
    """
    listing = run_git(
        "branch", "--list", f"hub/{session_id}/*",
        "--format=%(refname:short)",
    )
    names = []
    for raw_name in listing.split("\n"):
        name = raw_name.strip()
        # Blank entries (including the single one produced by empty
        # output) are dropped.
        if name:
            names.append(name)
    return names
def get_worktree_path(branch):
    """Return the path of the worktree that has *branch* checked out.

    Scans the output of ``git worktree list --porcelain``: a
    ``worktree <path>`` line sets the candidate path and the following
    ``branch <ref>`` line is compared against *branch*.

    Args:
        branch: Short branch name, i.e. without the ``refs/heads/`` prefix.

    Returns:
        The worktree path as a string, or None when git produced no output
        or no worktree is on that branch.
    """
    output = run_git("worktree", "list", "--porcelain")
    if not output:
        return None
    heads_prefix = "refs/heads/"
    current_path = None
    for line in output.split("\n"):
        if line.startswith("worktree "):
            current_path = line[len("worktree "):]
        elif line.startswith("branch ") and current_path:
            ref = line[len("branch "):]
            # Drop only the leading namespace; str.replace would also eat
            # a "refs/heads/" that happens to appear inside the name.
            short = ref[len(heads_prefix):] if ref.startswith(heads_prefix) else ref
            if short == branch:
                return current_path
            # This worktree is on another branch; forget its path.
            current_path = None
    return None
def run_eval_in_worktree(worktree_path, eval_cmd, timeout=120):
    """Run the evaluation command inside a worktree.

    Args:
        worktree_path: Directory used as the command's working directory.
        eval_cmd: Command line, executed through the shell so that pipes
            and redirections in the command keep working.
        timeout: Seconds to wait before abandoning the command
            (default 120).

    Returns:
        A ``(output, returncode)`` tuple. For a completed run, ``output``
        is the stripped stdout and ``returncode`` the process exit status.
        On timeout the result is ``("TIMEOUT", 1)``; if the command could
        not be run at all, ``output`` is the exception text and
        ``returncode`` is 1.
    """
    try:
        result = subprocess.run(
            eval_cmd, shell=True, capture_output=True, text=True,
            cwd=worktree_path, timeout=timeout,
        )
    except subprocess.TimeoutExpired:
        return "TIMEOUT", 1
    except Exception as e:
        # Deliberately broad: one broken worktree (missing directory,
        # undecodable output, ...) must not abort ranking of the others.
        return str(e), 1
    return result.stdout.strip(), result.returncode
def extract_metric(output, metric_name):
    """Extract a numeric metric from command output.

    Looks for patterns like:
        - metric_name: 42.5
        - metric_name=42.5
        - "metric_name": 42.5
        - 'metric_name': 42.5

    The name is matched literally (regex metacharacters in it carry no
    special meaning) and case-insensitively. Values may be signed and may
    use scientific notation, e.g. ``-3.5`` or ``1.5e-3``.

    Args:
        output: Text produced by the evaluation command.
        metric_name: Name of the metric to look for.

    Returns:
        The first matching value as a float, trying the unquoted form
        first, then the double-quoted, then the single-quoted one.
        None when nothing matches or either argument is empty.
    """
    if not output or not metric_name:
        return None
    # Escape the name so e.g. "p50.ms" or "time(s)" match themselves only.
    name = re.escape(metric_name)
    # Optional sign, digits with an optional fraction (or a bare
    # fraction), optional exponent. Every match is a valid float literal.
    number = r"([-+]?(?:\d+\.?\d*|\.\d+)(?:e[-+]?\d+)?)"
    patterns = [
        rf"{name}\s*[:=]\s*{number}",
        rf'"{name}"\s*[:=]\s*{number}',
        rf"'{name}'\s*[:=]\s*{number}",
    ]
    for pattern in patterns:
        match = re.search(pattern, output, re.IGNORECASE)
        if match:
            return float(match.group(1))
    return None
def get_diff_stats(branch, base_branch="main"):
    """Get diff statistics for a branch vs base.

    Parses the one-line summary printed by
    ``git diff --shortstat <base_branch>...<branch>``.

    Args:
        branch: Branch whose changes are measured.
        base_branch: Branch to compare against (default ``"main"``).

    Returns:
        Dict with integer ``files_changed``, ``insertions``, ``deletions``
        and ``net_lines`` (insertions minus deletions). Every count is 0
        when git prints no summary or omits that part of it.
    """
    shortstat = run_git("diff", "--shortstat", f"{base_branch}...{branch}")

    def count(pattern):
        # A component missing from the summary counts as zero.
        found = re.search(pattern, shortstat)
        return int(found.group(1)) if found else 0

    files_changed = count(r"(\d+) files? changed")
    insertions = count(r"(\d+) insertions?")
    deletions = count(r"(\d+) deletions?")
    return {
        "files_changed": files_changed,
        "insertions": insertions,
        "deletions": deletions,
        "net_lines": insertions - deletions,
    }
def rank_by_metric(results, direction="lower"):
    """Order result dicts by ``metric_value`` and stamp each with a rank.

    Args:
        results: Result dicts; a missing or None ``metric_value`` marks
            an entry as unscored.
        direction: ``"higher"`` sorts descending; any other value sorts
            ascending.

    Returns:
        A new list with scored entries first (ranked 1..n in sort order)
        followed by the unscored entries in their input order, all
        sharing rank n + 1. The dicts themselves are updated in place
        with a ``"rank"`` key.
    """
    scored, unscored = [], []
    for entry in results:
        bucket = unscored if entry.get("metric_value") is None else scored
        bucket.append(entry)

    ordered = sorted(
        scored,
        key=lambda entry: entry["metric_value"],
        reverse=(direction == "higher"),
    )
    for position, entry in enumerate(ordered, start=1):
        entry["rank"] = position

    shared_rank = len(ordered) + 1
    for entry in unscored:
        entry["rank"] = shared_rank
    return ordered + unscored
def run_demo():
    """Print a canned ranking report that illustrates the table output.

    Nothing is evaluated; every value shown is fixed sample data.
    """
    row_format = "{:<6} {:<10} {:<10} {:<10} {:<7} {}"
    banner = "=" * 60
    sample_rows = [
        ("1", "agent-2", "142ms", "-38ms", "2", "Replaced O(n²) with hash map lookup"),
        ("2", "agent-1", "165ms", "-15ms", "3", "Added caching layer"),
        ("3", "agent-3", "190ms", "+10ms", "1", "Minor loop optimizations"),
    ]

    report = [
        banner,
        "AgentHub Result Ranker — Demo Mode",
        banner,
        "",
        "Session: 20260317-143022",
        "Eval: pytest bench.py --json",
        "Metric: p50_ms (lower is better)",
        "Baseline: 180ms",
        "",
        row_format.format("RANK", "AGENT", "METRIC", "DELTA", "FILES", "SUMMARY"),
        "-" * 75,
    ]
    report.extend(row_format.format(*row) for row in sample_rows)
    report.extend([
        "",
        "Winner: agent-2 (142ms, -21% from baseline)",
        "",
        "Next step: Run /hub:merge to merge agent-2's branch",
    ])

    for text in report:
        print(text)
def main():
    """Command-line entry point: evaluate, rank and report agent branches.

    Flow:
        1. Parse arguments; ``--demo`` prints sample output and returns.
        2. Load the session config and list the session's hub branches.
        3. For each branch collect diff statistics and, when both an eval
           command and a metric name are known, run the command in the
           branch's worktree and extract the metric from its stdout.
        4. Rank the results, compute each delta against the baseline and
           print either JSON or a text table.

    ``--eval-cmd``, ``--metric`` and ``--direction`` fall back to the
    session config values ``eval_cmd``, ``metric`` and ``direction`` when
    not given on the command line.

    Exits with status 1 when ``--session`` is missing (and ``--demo`` was
    not requested).
    """
    parser = argparse.ArgumentParser(
        description="Rank AgentHub agent results"
    )
    parser.add_argument("--session", type=str,
                        help="Session ID to evaluate")
    parser.add_argument("--eval-cmd", type=str,
                        help="Evaluation command to run in each worktree")
    parser.add_argument("--metric", type=str,
                        help="Metric name to extract from eval output")
    # No argparse default here: a default of "lower" is always truthy and
    # would permanently shadow the direction stored in the session config.
    parser.add_argument("--direction", choices=["lower", "higher"],
                        default=None,
                        help="Whether lower or higher metric is better "
                             "(default: session config value, else lower)")
    parser.add_argument("--baseline", type=float,
                        help="Baseline metric value for delta calculation")
    parser.add_argument("--diff-summary", action="store_true",
                        help="Show diff statistics per agent (no eval cmd needed)")
    parser.add_argument("--format", choices=["table", "json"], default="table",
                        help="Output format (default: table)")
    parser.add_argument("--demo", action="store_true",
                        help="Show demo output")
    args = parser.parse_args()

    if args.demo:
        run_demo()
        return
    if not args.session:
        print("Error: --session is required", file=sys.stderr)
        sys.exit(1)

    config = get_session_config(args.session)
    branches = get_hub_branches(args.session)
    if not branches:
        print(f"No branches found for session {args.session}")
        return

    # Command-line values win; the session config fills the gaps.
    eval_cmd = args.eval_cmd or config.get("eval_cmd")
    metric = args.metric or config.get("metric")
    direction = args.direction or config.get("direction") or "lower"
    if direction not in ("lower", "higher"):
        # Only a config value can get here (argparse validates the flag).
        print(f"Warning: unknown direction {direction!r} in session config; "
              "using 'lower'", file=sys.stderr)
        direction = "lower"
    base_branch = config.get("base_branch", "main")

    results = []
    for branch in branches:
        # Extract the agent number. The trailing component is optional so
        # that "hub/<session>/agent-N" is labelled "agent-N" as well
        # instead of falling through to the session id below.
        match = re.match(r"hub/[^/]+/agent-(\d+)(?:/|$)", branch)
        agent_id = f"agent-{match.group(1)}" if match else branch.split("/")[-2]
        result = {
            "agent": agent_id,
            "branch": branch,
            "metric_value": None,
            "metric_raw": None,
            "diff": get_diff_stats(branch, base_branch),
        }
        if eval_cmd and metric:
            worktree = get_worktree_path(branch)
            if worktree:
                output, returncode = run_eval_in_worktree(worktree, eval_cmd)
                result["metric_raw"] = output
                result["eval_returncode"] = returncode
                # Output of a failed run is kept raw but never parsed.
                if returncode == 0:
                    result["metric_value"] = extract_metric(output, metric)
        results.append(result)

    # Rank
    ranked = rank_by_metric(results, direction)

    # Calculate deltas
    baseline = args.baseline
    if baseline is None and ranked and ranked[0].get("metric_value") is not None:
        # Use worst as baseline if not specified
        values = [r["metric_value"] for r in ranked if r["metric_value"] is not None]
        if values:
            baseline = max(values) if direction == "lower" else min(values)
    for r in ranked:
        if r.get("metric_value") is not None and baseline is not None:
            r["delta"] = r["metric_value"] - baseline
        else:
            r["delta"] = None

    if args.format == "json":
        print(json.dumps({"session": args.session, "results": ranked}, indent=2))
        return

    # Table output
    print(f"Session: {args.session}")
    if eval_cmd:
        print(f"Eval: {eval_cmd}")
    if metric:
        dir_str = "lower is better" if direction == "lower" else "higher is better"
        print(f"Metric: {metric} ({dir_str})")
    # "is not None" rather than truthiness: 0.0 is a legitimate baseline.
    if baseline is not None:
        print(f"Baseline: {baseline}")
    print()

    if args.diff_summary or not eval_cmd:
        header = f"{'RANK':<6} {'AGENT':<12} {'FILES':<7} {'ADDED':<8} {'REMOVED':<8} {'NET':<6}"
        print(header)
        print("-" * 50)
        for i, r in enumerate(ranked):
            d = r["diff"]
            print(f"{i+1:<6} {r['agent']:<12} {d['files_changed']:<7} "
                  f"+{d['insertions']:<7} -{d['deletions']:<7} {d['net_lines']:<6}")
    else:
        header = f"{'RANK':<6} {'AGENT':<12} {'METRIC':<12} {'DELTA':<10} {'FILES':<7}"
        print(header)
        print("-" * 50)
        for r in ranked:
            mv = str(r["metric_value"]) if r["metric_value"] is not None else "N/A"
            delta = ""
            if r["delta"] is not None:
                sign = "+" if r["delta"] >= 0 else ""
                delta = f"{sign}{r['delta']:.1f}"
            print(f"{r['rank']:<6} {r['agent']:<12} {mv:<12} {delta:<10} {r['diff']['files_changed']:<7}")

    # Winner
    if ranked and ranked[0].get("metric_value") is not None:
        winner = ranked[0]
        print()
        print(f"Winner: {winner['agent']} ({winner['metric_value']})")
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()