# claude-skills-reference/engineering/agenthub/scripts/result_ranker.py

#!/usr/bin/env python3
"""Rank AgentHub agent results by metric or diff quality.

Runs an evaluation command in each agent's worktree, parses a metric,
and produces a ranked table.

Usage:
    python result_ranker.py --session 20260317-143022 \\
        --eval-cmd "pytest bench.py --json" --metric p50_ms --direction lower

    python result_ranker.py --session 20260317-143022 --diff-summary

    python result_ranker.py --demo
"""

import argparse
import json
import os
import re
import subprocess
import sys


def run_git(*args):
    """Run a git command and return its stripped stdout.

    Args:
        *args: Command-line arguments passed to ``git`` verbatim
            (e.g. ``"branch", "--list"``).

    Returns:
        The command's stdout with surrounding whitespace removed, or an
        empty string when git exits non-zero or cannot be started at all.
    """
    try:
        result = subprocess.run(
            ["git", *args],
            capture_output=True, text=True, check=True,
        )
    except subprocess.CalledProcessError:
        # Callers treat "" as "no data", so a failing git command is not fatal.
        return ""
    except OSError as exc:
        # git is missing or not executable: keep the best-effort contract,
        # but tell the operator why every lookup is coming back empty.
        print(f"Warning: could not run git: {exc}", file=sys.stderr)
        return ""
    return result.stdout.strip()


def get_session_config(session_id):
    """Load a session's flat ``key: value`` config file.

    Reads ``.agenthub/sessions/<session_id>/config.yaml`` relative to the
    current working directory. This is a minimal line parser, not a YAML
    parser: only one-line ``key: value`` pairs are understood. Blank lines,
    lines starting with ``#`` and lines without a colon are skipped.

    Args:
        session_id: Name of the session directory to read.

    Returns:
        dict mapping each key to its string value, with one matching pair
        of surrounding single or double quotes removed.

    Raises:
        SystemExit: With status 1 when the config file does not exist.
    """
    config_path = os.path.join(".agenthub", "sessions", session_id, "config.yaml")
    if not os.path.exists(config_path):
        print(f"Error: Session {session_id} not found", file=sys.stderr)
        sys.exit(1)

    config = {}
    with open(config_path, encoding="utf-8") as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line or line.startswith("#") or ":" not in line:
                continue
            # Split on the first colon only so values may contain colons.
            key, _, val = line.partition(":")
            val = val.strip()
            # Remove exactly one matching pair of enclosing quotes; quotes
            # that are part of the value (e.g. a trailing `-k "fast"`) stay.
            if len(val) >= 2 and val[0] == val[-1] and val[0] in "\"'":
                val = val[1:-1]
            config[key.strip()] = val
    return config


def get_hub_branches(session_id):
    """List the short names of all branches matching ``hub/<session_id>/*``.

    Returns an empty list when git reports no matching branches (or
    produces no output at all).
    """
    listing = run_git(
        "branch", "--list", f"hub/{session_id}/*", "--format=%(refname:short)"
    )
    names = []
    if listing:
        for entry in listing.split("\n"):
            name = entry.strip()
            # Skip blank lines in git's output.
            if name:
                names.append(name)
    return names


def get_worktree_path(branch):
    """Return the path of the worktree that has ``branch`` checked out.

    Scans ``git worktree list --porcelain`` output, pairing each
    ``worktree <path>`` line with the ``branch <ref>`` line that follows it.
    Returns None when git prints nothing or no worktree is on ``branch``.
    """
    porcelain = run_git("worktree", "list", "--porcelain")
    if not porcelain:
        return None

    worktree_tag, branch_tag = "worktree ", "branch "
    pending = None  # path of the worktree record currently being read
    for entry in porcelain.split("\n"):
        if entry.startswith(worktree_tag):
            pending = entry[len(worktree_tag):]
            continue
        if not (pending and entry.startswith(branch_tag)):
            continue
        if entry[len(branch_tag):].replace("refs/heads/", "") == branch:
            return pending
        # This record belongs to another branch; forget its path.
        pending = None
    return None


def run_eval_in_worktree(worktree_path, eval_cmd, timeout=120):
    """Run the evaluation command with a worktree as working directory.

    Args:
        worktree_path: Directory used as the command's cwd.
        eval_cmd: Command line to execute through the shell.
        timeout: Seconds to wait before giving up (default 120).

    Returns:
        A ``(output, returncode)`` tuple. On normal completion ``output`` is
        the stripped stdout. On timeout it is the literal ``"TIMEOUT"`` and
        the code is 1. If the command cannot be launched, ``output`` is the
        error text and the code is 1.
    """
    try:
        # shell=True is deliberate: eval_cmd is an operator-supplied command
        # line (CLI flag or session config) that may use pipes or redirects.
        # Never pass untrusted text here.
        result = subprocess.run(
            eval_cmd, shell=True, capture_output=True, text=True,
            cwd=worktree_path, timeout=timeout,
        )
    except subprocess.TimeoutExpired:
        return "TIMEOUT", 1
    except (OSError, ValueError, subprocess.SubprocessError) as exc:
        # e.g. the worktree directory vanished, or output was undecodable.
        return str(exc), 1
    return result.stdout.strip(), result.returncode


def extract_metric(output, metric_name):
    """Extract a numeric metric from command output.

    Looks for the first occurrence of any of these shapes (case-insensitive,
    ``:`` or ``=`` as separator, optional whitespace around it):

    - metric_name: 42.5
    - "metric_name": 42.5
    - 'metric_name': 42.5

    The value may carry a sign, a decimal point and an exponent
    (``-3.5``, ``.5``, ``1.5e3``).

    Args:
        output: Text to search (typically the eval command's stdout).
        metric_name: Literal metric name; regex metacharacters in it are
            matched literally.

    Returns:
        The value as a float, or None when output or name is empty or no
        pattern matches.
    """
    if not output or not metric_name:
        return None

    # Escape the name so e.g. "a.b" or "latency(ms)" match literally.
    name = re.escape(metric_name)
    number = r"([-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?)"
    # Unquoted form is tried over the whole text first, then quoted forms.
    patterns = [
        rf"{name}\s*[:=]\s*{number}",
        rf'"{name}"\s*[:=]\s*{number}',
        rf"'{name}'\s*[:=]\s*{number}",
    ]
    for pattern in patterns:
        match = re.search(pattern, output, re.IGNORECASE)
        if match:
            try:
                return float(match.group(1))
            except ValueError:
                continue
    return None


def get_diff_stats(branch, base_branch="main"):
    """Summarise how much ``branch`` changed relative to ``base_branch``.

    Parses the one-line ``git diff --shortstat base...branch`` summary.

    Args:
        branch: Branch whose changes are measured.
        base_branch: Branch to compare against (default ``"main"``).

    Returns:
        dict with integer ``files_changed``, ``insertions``, ``deletions``
        and ``net_lines`` (insertions minus deletions). Every count is 0
        when git prints nothing, e.g. no changes or a failed diff.
    """
    # Only --shortstat is needed; a separate --stat call would be discarded.
    shortstat = run_git("diff", "--shortstat", f"{base_branch}...{branch}")

    def count(pattern):
        # git omits the insertions/deletions clause when its count is zero.
        found = re.search(pattern, shortstat)
        return int(found.group(1)) if found else 0

    files_changed = count(r"(\d+) files? changed")
    insertions = count(r"(\d+) insertions?")
    deletions = count(r"(\d+) deletions?")

    return {
        "files_changed": files_changed,
        "insertions": insertions,
        "deletions": deletions,
        "net_lines": insertions - deletions,
    }


def rank_by_metric(results, direction="lower"):
    """Order result dicts by ``metric_value`` and stamp a ``rank`` on each.

    Entries with a metric come first, sorted ascending for ``"lower"`` and
    descending for ``"higher"``, ranked 1..N. Entries whose metric is
    missing or None follow in their original order and all share rank N+1.
    The dicts are modified in place; a new list is returned.
    """
    scored, unscored = [], []
    for entry in results:
        bucket = unscored if entry.get("metric_value") is None else scored
        bucket.append(entry)

    scored.sort(
        key=lambda entry: entry["metric_value"],
        reverse=(direction == "higher"),
    )

    for position, entry in enumerate(scored, start=1):
        entry["rank"] = position

    shared_last = len(scored) + 1
    for entry in unscored:
        entry["rank"] = shared_last

    return scored + unscored


def run_demo():
    """Print a canned sample of the ranking report.

    Nothing is read from git or from a session; all values are fixed.
    """
    rule = "=" * 60
    # Shared column layout for the header and the three sample rows.
    row_layout = "{:<6} {:<10} {:<10} {:<10} {:<7} {}"
    sample_rows = [
        ("1", "agent-2", "142ms", "-38ms", "2", "Replaced O(n²) with hash map lookup"),
        ("2", "agent-1", "165ms", "-15ms", "3", "Added caching layer"),
        ("3", "agent-3", "190ms", "+10ms", "1", "Minor loop optimizations"),
    ]

    report = [
        rule,
        "AgentHub Result Ranker — Demo Mode",
        rule,
        "",
        "Session: 20260317-143022",
        "Eval: pytest bench.py --json",
        "Metric: p50_ms (lower is better)",
        "Baseline: 180ms",
        "",
        row_layout.format("RANK", "AGENT", "METRIC", "DELTA", "FILES", "SUMMARY"),
        "-" * 75,
    ]
    report.extend(row_layout.format(*row) for row in sample_rows)
    report.extend([
        "",
        "Winner: agent-2 (142ms, -21% from baseline)",
        "",
        "Next step: Run /hub:merge to merge agent-2's branch",
    ])

    for text in report:
        print(text)


def _agent_label(branch):
    """Derive a display label such as ``agent-2`` from a hub branch name."""
    # Accept both "hub/<session>/agent-N/<rest>" and a bare "hub/<session>/agent-N".
    match = re.match(r"hub/[^/]+/agent-(\d+)(?:/|$)", branch)
    if match:
        return f"agent-{match.group(1)}"
    parts = branch.split("/")
    # Otherwise use the parent path component, or the whole name if flat.
    return parts[-2] if len(parts) >= 2 else branch


def _resolve_direction(cli_value, config):
    """Pick the ranking direction: CLI flag, then session config, then "lower"."""
    if cli_value:
        return cli_value
    configured = str(config.get("direction", "")).strip().lower()
    if configured in ("lower", "higher"):
        return configured
    if configured:
        print(f"Warning: ignoring invalid direction '{configured}' in session "
              "config; using 'lower'", file=sys.stderr)
    return "lower"


def _print_diff_table(ranked):
    """Print one row of diff statistics per agent, numbered in list order."""
    header = f"{'RANK':<6} {'AGENT':<12} {'FILES':<7} {'ADDED':<8} {'REMOVED':<8} {'NET':<6}"
    print(header)
    print("-" * 50)
    for i, r in enumerate(ranked):
        d = r["diff"]
        print(f"{i+1:<6} {r['agent']:<12} {d['files_changed']:<7} "
              f"+{d['insertions']:<7} -{d['deletions']:<7} {d['net_lines']:<6}")


def _print_metric_table(ranked):
    """Print one row per agent with its metric, delta and files changed."""
    header = f"{'RANK':<6} {'AGENT':<12} {'METRIC':<12} {'DELTA':<10} {'FILES':<7}"
    print(header)
    print("-" * 50)
    for r in ranked:
        mv = str(r["metric_value"]) if r["metric_value"] is not None else "N/A"
        delta = ""
        if r["delta"] is not None:
            sign = "+" if r["delta"] >= 0 else ""
            delta = f"{sign}{r['delta']:.1f}"
        print(f"{r['rank']:<6} {r['agent']:<12} {mv:<12} {delta:<10} {r['diff']['files_changed']:<7}")


def main():
    """Command-line entry point: evaluate, rank and report agent branches.

    Flow:
      1. ``--demo`` prints the canned report and returns.
      2. Otherwise ``--session`` is required (exit status 1 if missing).
      3. ``eval_cmd``, ``metric`` and ``direction`` come from the CLI, then
         from the session config. ``base_branch`` comes from the config only.
      4. Diff stats are collected for every hub branch. When both an eval
         command and a metric are known and the branch has a worktree, the
         command is run there and the metric is parsed from its stdout.
      5. Results are ranked and a delta against the baseline is attached.
         The baseline is ``--baseline`` or, failing that, the worst value
         measured.
      6. Output is JSON (``--format json``) or a text table plus a winner line.
    """
    parser = argparse.ArgumentParser(
        description="Rank AgentHub agent results"
    )
    parser.add_argument("--session", type=str,
                        help="Session ID to evaluate")
    parser.add_argument("--eval-cmd", type=str,
                        help="Evaluation command to run in each worktree")
    parser.add_argument("--metric", type=str,
                        help="Metric name to extract from eval output")
    # No argparse default: a default of "lower" would always win over the
    # session config's direction setting.
    parser.add_argument("--direction", choices=["lower", "higher"],
                        default=None,
                        help="Whether lower or higher metric is better "
                             "(default: session config, else lower)")
    parser.add_argument("--baseline", type=float,
                        help="Baseline metric value for delta calculation")
    parser.add_argument("--diff-summary", action="store_true",
                        help="Show diff statistics per agent (no eval cmd needed)")
    parser.add_argument("--format", choices=["table", "json"], default="table",
                        help="Output format (default: table)")
    parser.add_argument("--demo", action="store_true",
                        help="Show demo output")
    args = parser.parse_args()

    if args.demo:
        run_demo()
        return

    if not args.session:
        print("Error: --session is required", file=sys.stderr)
        sys.exit(1)

    config = get_session_config(args.session)
    branches = get_hub_branches(args.session)

    if not branches:
        print(f"No branches found for session {args.session}")
        return

    eval_cmd = args.eval_cmd or config.get("eval_cmd")
    metric = args.metric or config.get("metric")
    direction = _resolve_direction(args.direction, config)
    base_branch = config.get("base_branch", "main")

    results = []
    for branch in branches:
        result = {
            "agent": _agent_label(branch),
            "branch": branch,
            "metric_value": None,
            "metric_raw": None,
            "diff": get_diff_stats(branch, base_branch),
        }

        if eval_cmd and metric:
            worktree = get_worktree_path(branch)
            if worktree:
                output, returncode = run_eval_in_worktree(worktree, eval_cmd)
                result["metric_raw"] = output
                result["eval_returncode"] = returncode
                # Only trust the metric when the eval command succeeded.
                if returncode == 0:
                    result["metric_value"] = extract_metric(output, metric)

        results.append(result)

    ranked = rank_by_metric(results, direction)

    # Without an explicit baseline, measure deltas against the worst result.
    baseline = args.baseline
    values = [r["metric_value"] for r in ranked if r["metric_value"] is not None]
    if baseline is None and values:
        baseline = max(values) if direction == "lower" else min(values)

    for r in ranked:
        if r.get("metric_value") is not None and baseline is not None:
            r["delta"] = r["metric_value"] - baseline
        else:
            r["delta"] = None

    if args.format == "json":
        print(json.dumps({"session": args.session, "results": ranked}, indent=2))
        return

    # Table output
    print(f"Session: {args.session}")
    if eval_cmd:
        print(f"Eval: {eval_cmd}")
    if metric:
        dir_str = "lower is better" if direction == "lower" else "higher is better"
        print(f"Metric: {metric} ({dir_str})")
    # Compare against None so a legitimate baseline of 0 is still shown.
    if baseline is not None:
        print(f"Baseline: {baseline}")
    print()

    if args.diff_summary or not eval_cmd:
        _print_diff_table(ranked)
    else:
        _print_metric_table(ranked)

    # Scored entries sort first, so ranked[0] is the best one if any scored.
    if ranked and ranked[0].get("metric_value") is not None:
        winner = ranked[0]
        print()
        print(f"Winner: {winner['agent']} ({winner['metric_value']})")


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()