diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json
index 2749d48..185819a 100644
--- a/.claude-plugin/marketplace.json
+++ b/.claude-plugin/marketplace.json
@@ -4,11 +4,11 @@
"name": "Alireza Rezvani",
"url": "https://alirezarezvani.com"
},
- "description": "177 production-ready skill packages for Claude AI across 9 domains: marketing (43), engineering (24+25), C-level advisory (28), regulatory/QMS (12), product (12), project management (6), business growth (4), and finance (2). Includes 254 Python tools, 357 reference documents, 16 agents, and 17 slash commands.",
+ "description": "177 production-ready skill packages for Claude AI across 9 domains: marketing (43), engineering (24+25), C-level advisory (28), regulatory/QMS (12), product (12), project management (6), business growth (4), and finance (2). Includes 254 Python tools, 357 reference documents, 17 agents, and 22 slash commands.",
"homepage": "https://github.com/alirezarezvani/claude-skills",
"repository": "https://github.com/alirezarezvani/claude-skills",
"metadata": {
- "description": "177 production-ready skill packages across 9 domains with 254 Python tools, 357 reference documents, 16 agents, and 17 slash commands. Compatible with Claude Code, Codex CLI, Gemini CLI, and OpenClaw.",
+ "description": "177 production-ready skill packages across 9 domains with 254 Python tools, 357 reference documents, 17 agents, and 22 slash commands. Compatible with Claude Code, Codex CLI, Gemini CLI, and OpenClaw.",
"version": "2.1.2"
},
"plugins": [
@@ -244,6 +244,25 @@
],
"category": "development"
},
+ {
+ "name": "autoresearch-agent",
+ "source": "./engineering/autoresearch-agent",
+ "description": "Autonomous experiment loop — optimize any file by a measurable metric. 5 slash commands (/ar:setup, /ar:run, /ar:loop, /ar:status, /ar:resume), 8 built-in evaluators, configurable loop intervals (10min to monthly).",
+ "version": "2.1.2",
+ "author": {
+ "name": "Alireza Rezvani"
+ },
+ "keywords": [
+ "autoresearch",
+ "optimization",
+ "experiments",
+ "benchmarks",
+ "loop",
+ "metrics",
+ "evaluators"
+ ],
+ "category": "development"
+ },
{
"name": "content-creator",
"source": "./marketing-skill/content-creator",
diff --git a/.gemini/skills-index.json b/.gemini/skills-index.json
index 74f3e97..e0ed693 100644
--- a/.gemini/skills-index.json
+++ b/.gemini/skills-index.json
@@ -1,8 +1,18 @@
{
"version": "1.0.0",
"name": "gemini-cli-skills",
- "total_skills": 218,
+ "total_skills": 229,
"skills": [
+ {
+ "name": "README",
+ "category": "agent",
+ "description": "Agent from personas"
+ },
+ {
+ "name": "TEMPLATE",
+ "category": "agent",
+ "description": "One paragraph describing what this agent does, who it's for, and when to activate it."
+ },
{
"name": "cs-agile-product-owner",
"category": "agent",
@@ -83,6 +93,21 @@
"category": "agent",
"description": "Google Workspace administration agent using the gws CLI. Orchestrates workspace setup, Gmail/Drive/Sheets/Calendar automation, security audits, and recipe execution. Spawn when users need Google Workspace automation, gws CLI help, or workspace administration."
},
+ {
+ "name": "growth-marketer",
+ "category": "agent",
+ "description": "Growth marketing specialist for bootstrapped startups and indie hackers. Builds content engines, optimizes funnels, runs launch sequences, and finds scalable acquisition channels \u2014 all on a budget that makes enterprise marketers cry."
+ },
+ {
+ "name": "solo-founder",
+ "category": "agent",
+ "description": "Your co-founder who doesn't exist yet. Covers product, engineering, marketing, and strategy for one-person startups \u2014 because nobody's stopping you from making bad decisions and somebody should."
+ },
+ {
+ "name": "startup-cto",
+ "category": "agent",
+ "description": "Technical co-founder who's been through two startups and learned what actually matters. Makes architecture decisions, selects tech stacks, builds engineering culture, and prepares for technical due diligence \u2014 all while shipping fast with a small team."
+ },
{
"name": "business-growth-bundle",
"category": "business-growth",
@@ -578,6 +603,11 @@
"category": "engineering-advanced",
"description": "API Test Suite Builder"
},
+ {
+ "name": "autoresearch-agent",
+ "category": "engineering-advanced",
+ "description": "Autonomous experiment loop that optimizes any file by a measurable metric. Inspired by Karpathy's autoresearch. The agent edits a target file, runs a fixed evaluation, keeps improvements (git commit), discards failures (git reset), and loops indefinitely. Use when: user wants to optimize code speed, reduce bundle/image size, improve test pass rate, optimize prompts, improve content quality (headlines, copy, CTR), or run any measurable improvement loop. Requires: a target file, an evaluation command that outputs a metric, and a git repo."
+ },
{
"name": "changelog-generator",
"category": "engineering-advanced",
@@ -628,6 +658,11 @@
"category": "engineering-advanced",
"description": "This skill should be used when the user asks to \"design interview processes\", \"create hiring pipelines\", \"calibrate interview loops\", \"generate interview questions\", \"design competency matrices\", \"analyze interviewer bias\", \"create scoring rubrics\", \"build question banks\", or \"optimize hiring systems\". Use for designing role-specific interview loops, competency assessments, and hiring calibration systems."
},
+ {
+ "name": "loop",
+ "category": "engineering-advanced",
+ "description": "Start an autonomous experiment loop with user-selected interval (10min, 1h, daily, weekly, monthly). Uses CronCreate for scheduling."
+ },
{
"name": "mcp-server-builder",
"category": "engineering-advanced",
@@ -668,6 +703,16 @@
"category": "engineering-advanced",
"description": "Release Manager"
},
+ {
+ "name": "resume",
+ "category": "engineering-advanced",
+ "description": "Resume a paused experiment. Checkout the experiment branch, read results history, continue iterating."
+ },
+ {
+ "name": "run",
+ "category": "engineering-advanced",
+ "description": "Run a single experiment iteration. Edit the target file, evaluate, keep or discard."
+ },
{
"name": "runbook-generator",
"category": "engineering-advanced",
@@ -678,6 +723,11 @@
"category": "engineering-advanced",
"description": "Skill from engineering/skill-tester/assets/sample-skill"
},
+ {
+ "name": "setup",
+ "category": "engineering-advanced",
+ "description": "Set up a new autoresearch experiment interactively. Collects domain, target file, eval command, metric, direction, and evaluator."
+ },
{
"name": "skill-security-auditor",
"category": "engineering-advanced",
@@ -688,6 +738,11 @@
"category": "engineering-advanced",
"description": "Skill Tester"
},
+ {
+ "name": "skills-status",
+ "category": "engineering-advanced",
+ "description": "Show experiment dashboard with results, active loops, and progress."
+ },
{
"name": "tech-debt-tracker",
"category": "engineering-advanced",
@@ -1096,7 +1151,7 @@
],
"categories": {
"agent": {
- "count": 16,
+ "count": 21,
"description": "Agent resources"
},
"business-growth": {
@@ -1116,7 +1171,7 @@
"description": "Engineering resources"
},
"engineering-advanced": {
- "count": 27,
+ "count": 33,
"description": "Engineering-advanced resources"
},
"finance": {
diff --git a/.gemini/skills/autoresearch-agent/SKILL.md b/.gemini/skills/autoresearch-agent/SKILL.md
new file mode 120000
index 0000000..37c22a2
--- /dev/null
+++ b/.gemini/skills/autoresearch-agent/SKILL.md
@@ -0,0 +1 @@
+../../../engineering/autoresearch-agent/SKILL.md
\ No newline at end of file
diff --git a/.gemini/skills/loop/SKILL.md b/.gemini/skills/loop/SKILL.md
new file mode 120000
index 0000000..b66a4bf
--- /dev/null
+++ b/.gemini/skills/loop/SKILL.md
@@ -0,0 +1 @@
+../../../engineering/autoresearch-agent/skills/loop/SKILL.md
\ No newline at end of file
diff --git a/.gemini/skills/resume/SKILL.md b/.gemini/skills/resume/SKILL.md
new file mode 120000
index 0000000..73cc34f
--- /dev/null
+++ b/.gemini/skills/resume/SKILL.md
@@ -0,0 +1 @@
+../../../engineering/autoresearch-agent/skills/resume/SKILL.md
\ No newline at end of file
diff --git a/.gemini/skills/run/SKILL.md b/.gemini/skills/run/SKILL.md
new file mode 120000
index 0000000..fb5123c
--- /dev/null
+++ b/.gemini/skills/run/SKILL.md
@@ -0,0 +1 @@
+../../../engineering/autoresearch-agent/skills/run/SKILL.md
\ No newline at end of file
diff --git a/.gemini/skills/setup/SKILL.md b/.gemini/skills/setup/SKILL.md
new file mode 120000
index 0000000..6fd9979
--- /dev/null
+++ b/.gemini/skills/setup/SKILL.md
@@ -0,0 +1 @@
+../../../engineering/autoresearch-agent/skills/setup/SKILL.md
\ No newline at end of file
diff --git a/.gemini/skills/skills-status/SKILL.md b/.gemini/skills/skills-status/SKILL.md
new file mode 120000
index 0000000..ec526d3
--- /dev/null
+++ b/.gemini/skills/skills-status/SKILL.md
@@ -0,0 +1 @@
+../../../engineering/autoresearch-agent/skills/status/SKILL.md
\ No newline at end of file
diff --git a/CLAUDE.md b/CLAUDE.md
index 41717a8..d0838c5 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -6,7 +6,7 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co
This is a **comprehensive skills library** for Claude AI and Claude Code - reusable, production-ready skill packages that bundle domain expertise, best practices, analysis tools, and strategic frameworks. The repository provides modular skills that teams can download and use directly in their workflows.
-**Current Scope:** 177 production-ready skills across 9 domains with 254 Python automation tools, 357 reference guides, 16 agents, and 17 slash commands.
+**Current Scope:** 177 production-ready skills across 9 domains with 254 Python automation tools, 357 reference guides, 17 agents, and 22 slash commands.
**Key Distinction**: This is NOT a traditional application. It's a library of skill packages meant to be extracted and deployed by users into their own Claude workflows.
@@ -37,7 +37,7 @@ This repository uses **modular documentation**. For domain-specific guidance, se
claude-code-skills/
├── .claude-plugin/ # Plugin registry (marketplace.json)
├── agents/ # 15 cs-* prefixed agents across all domains
-├── commands/ # 17 slash commands (changelog, tdd, saas-health, workspace, prd, sprint-plan, etc.)
+├── commands/ # 22 slash commands (changelog, tdd, saas-health, workspace, prd, sprint-plan, ar:*, etc.)
├── engineering-team/ # 24 core engineering skills + Playwright Pro + Self-Improving Agent
├── engineering/ # 25 POWERFUL-tier advanced skills
├── product-team/ # 12 product skills + Python tools
@@ -150,7 +150,7 @@ See [standards/git/git-workflow-standards.md](standards/git/git-workflow-standar
**Phase 1-2 Complete:** 177 production-ready skills deployed across 9 domains
- Engineering Core (24), Engineering POWERFUL (25), Product (8), Marketing (43), PM (6), C-Level (28), RA/QM (12), Business & Growth (4), Finance (2)
-- 254 Python automation tools, 357 reference guides, 16 agents, 17 commands
+- 254 Python automation tools, 357 reference guides, 17 agents, 22 commands
- Complete enterprise coverage from engineering through regulatory compliance, sales, customer success, and finance
- MkDocs Material docs site with 210+ indexed pages for SEO
diff --git a/README.md b/README.md
index 9f7c990..5e78c8f 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
# Claude Code Skills & Plugins
-**177 production-ready skills, 16 agents, 3 personas, and an orchestration protocol for 11 AI coding tools.**
+**177 production-ready skills, 17 agents, 3 personas, and an orchestration protocol for 11 AI coding tools.**
Reusable expertise packages that give AI coding agents domain knowledge they don't have out of the box — from architecture and security to marketing, compliance, and C-level advisory.
@@ -8,9 +8,9 @@ Reusable expertise packages that give AI coding agents domain knowledge they don
[](https://opensource.org/licenses/MIT)
[](#skills-overview)
-[](#agents)
+[](#agents)
[](#personas)
-[](#commands)
+[](#commands)
[](https://github.com/alirezarezvani/claude-skills/stargazers)
[](https://getskillcheck.com)
diff --git a/docs/agents/index.md b/docs/agents/index.md
index 939e635..fbc9c2a 100644
--- a/docs/agents/index.md
+++ b/docs/agents/index.md
@@ -1,13 +1,13 @@
---
title: "Agents"
-description: "All 16 Claude Code agents — multi-skill orchestrators across domains."
+description: "All 21 Claude Code agents — multi-skill orchestrators across domains."
---
# :material-robot: Agents
-
16 agents that orchestrate skills across domains
+
21 agents that orchestrate skills across domains
@@ -67,6 +67,36 @@ description: "All 16 Claude Code agents — multi-skill orchestrators across dom
Marketing
+- :material-account:{ .lg .middle } **[Persona-Based Agents](readme.md)**
+
+ ---
+
+ Personas
+
+- :material-account:{ .lg .middle } **[Agent Template](template.md)**
+
+ ---
+
+ Personas
+
+- :material-account:{ .lg .middle } **[Growth Marketer Agent Personality](growth-marketer.md)**
+
+ ---
+
+ Personas
+
+- :material-account:{ .lg .middle } **[Solo Founder Agent Personality](solo-founder.md)**
+
+ ---
+
+ Personas
+
+- :material-account:{ .lg .middle } **[Startup CTO Agent Personality](startup-cto.md)**
+
+ ---
+
+ Personas
+
- :material-lightbulb-outline:{ .lg .middle } **[Agile Product Owner Agent](cs-agile-product-owner.md)**
---
diff --git a/docs/index.md b/docs/index.md
index d101f06..b2cad2e 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -1,6 +1,6 @@
---
title: Agent Skills for AI Coding Tools
-description: "177 production-ready skills, 16 agents, 3 personas, and an orchestration protocol for 11 AI coding tools — Claude Code, OpenAI Codex, Gemini CLI, Cursor, Aider, Windsurf, and more."
+description: "177 production-ready skills, 17 agents, 3 personas, and an orchestration protocol for 11 AI coding tools — Claude Code, OpenAI Codex, Gemini CLI, Cursor, Aider, Windsurf, and more."
hide:
- toc
- edit
@@ -14,7 +14,7 @@ hide:
# Agent Skills
-177 production-ready skills, 16 agents, 3 personas, and an orchestration protocol for AI coding tools.
+177 production-ready skills, 17 agents, 3 personas, and an orchestration protocol for AI coding tools.
{ .hero-subtitle }
[Get Started](getting-started.md){ .md-button .md-button--primary }
diff --git a/docs/skills/engineering/autoresearch-agent-loop.md b/docs/skills/engineering/autoresearch-agent-loop.md
new file mode 100644
index 0000000..f7f6851
--- /dev/null
+++ b/docs/skills/engineering/autoresearch-agent-loop.md
@@ -0,0 +1,132 @@
+---
+title: "/ar:loop — Autonomous Experiment Loop"
+description: "/ar:loop — Autonomous Experiment Loop - Claude Code skill from the Engineering - POWERFUL domain."
+---
+
+# /ar:loop — Autonomous Experiment Loop
+
+
+
:material-rocket-launch: Engineering - POWERFUL
+
:material-identifier: `loop`
+
:material-github: Source
+
+
+
+Install: claude /plugin install engineering-advanced-skills
+
+
+
+Start a recurring experiment loop that runs at a user-selected interval.
+
+## Usage
+
+```
+/ar:loop engineering/api-speed # Start loop (prompts for interval)
+/ar:loop engineering/api-speed 10m # Every 10 minutes
+/ar:loop engineering/api-speed 1h # Every hour
+/ar:loop engineering/api-speed daily # Daily at ~9am
+/ar:loop engineering/api-speed weekly # Weekly on Monday ~9am
+/ar:loop engineering/api-speed monthly # Monthly on 1st ~9am
+/ar:loop stop engineering/api-speed # Stop an active loop
+```
+
+## What It Does
+
+### Step 1: Resolve experiment
+
+If no experiment specified, list experiments and let user pick.
+
+### Step 2: Select interval
+
+If interval not provided as argument, present options:
+
+```
+Select loop interval:
+ 1. Every 10 minutes (rapid — stay and watch)
+ 2. Every hour (background — check back later)
+ 3. Daily at ~9am (overnight experiments)
+ 4. Weekly on Monday (long-running experiments)
+ 5. Monthly on 1st (slow experiments)
+```
+
+Map to cron expressions:
+
+| Interval | Cron Expression | Shorthand |
+|----------|----------------|-----------|
+| 10 minutes | `*/10 * * * *` | `10m` |
+| 1 hour | `7 * * * *` | `1h` |
+| Daily | `57 8 * * *` | `daily` |
+| Weekly | `57 8 * * 1` | `weekly` |
+| Monthly | `57 8 1 * *` | `monthly` |
+
+### Step 3: Create the recurring job
+
+Use `CronCreate` with this prompt (fill in the experiment details):
+
+```
+You are running autoresearch experiment "{domain}/{name}".
+
+1. Read .autoresearch/{domain}/{name}/config.cfg for: target, evaluate_cmd, metric, metric_direction
+2. Read .autoresearch/{domain}/{name}/program.md for strategy and constraints
+3. Read .autoresearch/{domain}/{name}/results.tsv for experiment history
+4. Run: git checkout autoresearch/{domain}/{name}
+
+Then do exactly ONE iteration:
+- Review results.tsv: what worked, what failed, what hasn't been tried
+- Edit the target file with ONE change (strategy escalation based on run count)
+- Commit: git add {target} && git commit -m "experiment: {description}"
+- Evaluate: python {skill_path}/scripts/run_experiment.py --experiment {domain}/{name} --single
+- Read the output (KEEP/DISCARD/CRASH)
+
+Rules:
+- ONE change per experiment
+- NEVER modify the evaluator
+- If 5 consecutive crashes in results.tsv, delete this cron job (CronDelete) and alert
+- After every 10 experiments, update Strategy section of program.md
+
+Current best metric: {read from results.tsv or "no baseline yet"}
+Total experiments so far: {count from results.tsv}
+```
+
+### Step 4: Store loop metadata
+
+Write to `.autoresearch/{domain}/{name}/loop.json`:
+
+```json
+{
+ "cron_id": "{id from CronCreate}",
+ "interval": "{user selection}",
+ "started": "{ISO timestamp}",
+ "experiment": "{domain}/{name}"
+}
+```
+
+### Step 5: Confirm to user
+
+```
+Loop started for {domain}/{name}
+ Interval: {interval description}
+ Cron ID: {id}
+ Auto-expires: 3 days (CronCreate limit)
+
+ To check progress: /ar:status
+ To stop the loop: /ar:loop stop {domain}/{name}
+
+ Note: Recurring jobs auto-expire after 3 days.
+ Run /ar:loop again to restart after expiry.
+```
+
+## Stopping a Loop
+
+When user runs `/ar:loop stop {experiment}`:
+
+1. Read `.autoresearch/{domain}/{name}/loop.json` to get the cron ID
+2. Call `CronDelete` with that ID
+3. Delete `loop.json`
+4. Confirm: "Loop stopped for {experiment}. {n} experiments completed."
+
+## Important Limitations
+
+- **3-day auto-expiry**: CronCreate jobs expire after 3 days. For longer experiments, the user must re-run `/ar:loop` to restart. Results persist — the new loop picks up where the old one left off.
+- **One loop per experiment**: Don't start multiple loops for the same experiment.
+- **Concurrent experiments**: Multiple experiments can loop simultaneously ONLY if they're on different git branches (which they are by default — each experiment gets `autoresearch/{domain}/{name}`).
diff --git a/docs/skills/engineering/autoresearch-agent-resume.md b/docs/skills/engineering/autoresearch-agent-resume.md
new file mode 100644
index 0000000..cd88b05
--- /dev/null
+++ b/docs/skills/engineering/autoresearch-agent-resume.md
@@ -0,0 +1,87 @@
+---
+title: "/ar:resume — Resume Experiment"
+description: "/ar:resume — Resume Experiment - Claude Code skill from the Engineering - POWERFUL domain."
+---
+
+# /ar:resume — Resume Experiment
+
+
+
:material-rocket-launch: Engineering - POWERFUL
+
:material-identifier: `resume`
+
:material-github: Source
+
+
+
+Install: claude /plugin install engineering-advanced-skills
+
+
+
+Resume a paused or context-limited experiment. Reads all history and continues where you left off.
+
+## Usage
+
+```
+/ar:resume # List experiments, let user pick
+/ar:resume engineering/api-speed # Resume specific experiment
+```
+
+## What It Does
+
+### Step 1: List experiments if needed
+
+If no experiment specified:
+
+```bash
+python {skill_path}/scripts/setup_experiment.py --list
+```
+
+Show status for each (active/paused/done based on results.tsv age). Let user pick.
+
+### Step 2: Load full context
+
+```bash
+# Checkout the experiment branch
+git checkout autoresearch/{domain}/{name}
+
+# Read config
+cat .autoresearch/{domain}/{name}/config.cfg
+
+# Read strategy
+cat .autoresearch/{domain}/{name}/program.md
+
+# Read full results history
+cat .autoresearch/{domain}/{name}/results.tsv
+
+# Read recent git log for the branch
+git log --oneline -20
+```
+
+### Step 3: Report current state
+
+Summarize for the user:
+
+```
+Resuming: engineering/api-speed
+ Target: src/api/search.py
+ Metric: p50_ms (lower is better)
+ Experiments: 23 total — 8 kept, 12 discarded, 3 crashed
+ Best: 185ms (-42% from baseline of 320ms)
+ Last experiment: "added response caching" → KEEP (185ms)
+
+ Recent patterns:
+ - Caching changes: 3 kept, 1 discarded (consistently helpful)
+ - Algorithm changes: 2 discarded, 1 crashed (high risk, low reward so far)
+ - I/O optimization: 2 kept (promising direction)
+```
+
+### Step 4: Ask next action
+
+```
+How would you like to continue?
+ 1. Single iteration (/ar:run) — I'll make one change and evaluate
+ 2. Start a loop (/ar:loop) — Autonomous with scheduled interval
+ 3. Just show me the results — I'll review and decide
+```
+
+If the user picks loop, hand off to `/ar:loop` with the experiment pre-selected.
+If single, hand off to `/ar:run`.
diff --git a/docs/skills/engineering/autoresearch-agent-run.md b/docs/skills/engineering/autoresearch-agent-run.md
new file mode 100644
index 0000000..b3346b1
--- /dev/null
+++ b/docs/skills/engineering/autoresearch-agent-run.md
@@ -0,0 +1,94 @@
+---
+title: "/ar:run — Single Experiment Iteration"
+description: "/ar:run — Single Experiment Iteration - Claude Code skill from the Engineering - POWERFUL domain."
+---
+
+# /ar:run — Single Experiment Iteration
+
+
+
:material-rocket-launch: Engineering - POWERFUL
+
:material-identifier: `run`
+
:material-github: Source
+
+
+
+Install: claude /plugin install engineering-advanced-skills
+
+
+
+Run exactly ONE experiment iteration: review history, decide a change, edit, commit, evaluate.
+
+## Usage
+
+```
+/ar:run engineering/api-speed # Run one iteration
+/ar:run # List experiments, let user pick
+```
+
+## What It Does
+
+### Step 1: Resolve experiment
+
+If no experiment specified, run `python {skill_path}/scripts/setup_experiment.py --list` and ask the user to pick.
+
+### Step 2: Load context
+
+```bash
+# Read experiment config
+cat .autoresearch/{domain}/{name}/config.cfg
+
+# Read strategy and constraints
+cat .autoresearch/{domain}/{name}/program.md
+
+# Read experiment history
+cat .autoresearch/{domain}/{name}/results.tsv
+
+# Checkout the experiment branch
+git checkout autoresearch/{domain}/{name}
+```
+
+### Step 3: Decide what to try
+
+Review results.tsv:
+- What changes were kept? What pattern do they share?
+- What was discarded? Avoid repeating those approaches.
+- What crashed? Understand why.
+- How many runs so far? (Escalate strategy accordingly)
+
+**Strategy escalation:**
+- Runs 1-5: Low-hanging fruit (obvious improvements)
+- Runs 6-15: Systematic exploration (vary one parameter)
+- Runs 16-30: Structural changes (algorithm swaps)
+- Runs 31+: Radical experiments (completely different approaches)
+
+### Step 4: Make ONE change
+
+Edit only the target file specified in config.cfg. Change one thing. Keep it simple.
+
+### Step 5: Commit and evaluate
+
+```bash
+git add {target}
+git commit -m "experiment: {short description of what changed}"
+
+python {skill_path}/scripts/run_experiment.py \
+ --experiment {domain}/{name} --single
+```
+
+### Step 6: Report result
+
+Read the script output. Tell the user:
+- **KEEP**: "Improvement! {metric}: {value} ({delta} from previous best)"
+- **DISCARD**: "No improvement. {metric}: {value} vs best {best}. Reverted."
+- **CRASH**: "Evaluation failed: {reason}. Reverted."
+
+### Step 7: Self-improvement check
+
+After every 10th experiment (check results.tsv line count), update the Strategy section of program.md with patterns learned.
+
+## Rules
+
+- ONE change per iteration. Don't change 5 things at once.
+- NEVER modify the evaluator (evaluate.py). It's ground truth.
+- Simplicity wins. Equal performance with simpler code is an improvement.
+- No new dependencies.
diff --git a/docs/skills/engineering/autoresearch-agent-setup.md b/docs/skills/engineering/autoresearch-agent-setup.md
new file mode 100644
index 0000000..e54b8a9
--- /dev/null
+++ b/docs/skills/engineering/autoresearch-agent-setup.md
@@ -0,0 +1,87 @@
+---
+title: "/ar:setup — Create New Experiment"
+description: "/ar:setup — Create New Experiment - Claude Code skill from the Engineering - POWERFUL domain."
+---
+
+# /ar:setup — Create New Experiment
+
+
+
:material-rocket-launch: Engineering - POWERFUL
+
:material-identifier: `setup`
+
:material-github: Source
+
+
+
+Install: claude /plugin install engineering-advanced-skills
+
+
+
+Set up a new autoresearch experiment with all required configuration.
+
+## Usage
+
+```
+/ar:setup # Interactive mode
+/ar:setup engineering api-speed src/api.py "pytest bench.py" p50_ms lower
+/ar:setup --list # Show existing experiments
+/ar:setup --list-evaluators # Show available evaluators
+```
+
+## What It Does
+
+### If arguments provided
+
+Pass them directly to the setup script:
+
+```bash
+python {skill_path}/scripts/setup_experiment.py \
+ --domain {domain} --name {name} \
+ --target {target} --eval "{eval_cmd}" \
+ --metric {metric} --direction {direction} \
+ [--evaluator {evaluator}] [--scope {scope}]
+```
+
+### If no arguments (interactive mode)
+
+Collect each parameter one at a time:
+
+1. **Domain** — Ask: "What domain? (engineering, marketing, content, prompts, custom)"
+2. **Name** — Ask: "Experiment name? (e.g., api-speed, blog-titles)"
+3. **Target file** — Ask: "Which file to optimize?" Verify it exists.
+4. **Eval command** — Ask: "How to measure it? (e.g., pytest bench.py, python evaluate.py)"
+5. **Metric** — Ask: "What metric does the eval output? (e.g., p50_ms, ctr_score)"
+6. **Direction** — Ask: "Is lower or higher better?"
+7. **Evaluator** (optional) — Show built-in evaluators. Ask: "Use a built-in evaluator, or your own?"
+8. **Scope** — Ask: "Store in project (.autoresearch/) or user (~/.autoresearch/)?"
+
+Then run `setup_experiment.py` with the collected parameters.
+
+### Listing
+
+```bash
+# Show existing experiments
+python {skill_path}/scripts/setup_experiment.py --list
+
+# Show available evaluators
+python {skill_path}/scripts/setup_experiment.py --list-evaluators
+```
+
+## Built-in Evaluators
+
+| Name | Metric | Use Case |
+|------|--------|----------|
+| `benchmark_speed` | `p50_ms` (lower) | Function/API execution time |
+| `benchmark_size` | `size_bytes` (lower) | File, bundle, Docker image size |
+| `test_pass_rate` | `pass_rate` (higher) | Test suite pass percentage |
+| `build_speed` | `build_seconds` (lower) | Build/compile/Docker build time |
+| `memory_usage` | `peak_mb` (lower) | Peak memory during execution |
+| `llm_judge_content` | `ctr_score` (higher) | Headlines, titles, descriptions |
+| `llm_judge_prompt` | `quality_score` (higher) | System prompts, agent instructions |
+| `llm_judge_copy` | `engagement_score` (higher) | Social posts, ad copy, emails |
+
+## After Setup
+
+Report to the user:
+- Experiment path and branch name
+- Whether the eval command worked and the baseline metric
+- Suggest: "Run `/ar:run {domain}/{name}` to start iterating, or `/ar:loop {domain}/{name}` for autonomous mode."
diff --git a/docs/skills/engineering/autoresearch-agent-status.md b/docs/skills/engineering/autoresearch-agent-status.md
new file mode 100644
index 0000000..a3ac639
--- /dev/null
+++ b/docs/skills/engineering/autoresearch-agent-status.md
@@ -0,0 +1,81 @@
+---
+title: "/ar:status — Experiment Dashboard"
+description: "/ar:status — Experiment Dashboard - Claude Code skill from the Engineering - POWERFUL domain."
+---
+
+# /ar:status — Experiment Dashboard
+
+
+
:material-rocket-launch: Engineering - POWERFUL
+
:material-identifier: `status`
+
:material-github: Source
+
+
+
+Install: claude /plugin install engineering-advanced-skills
+
+
+
+Show experiment results, active loops, and progress across all experiments.
+
+## Usage
+
+```
+/ar:status # Full dashboard
+/ar:status engineering/api-speed # Single experiment detail
+/ar:status --domain engineering # All experiments in a domain
+/ar:status --format markdown # Export as markdown
+/ar:status --format csv --output results.csv # Export as CSV
+```
+
+## What It Does
+
+### Single experiment
+
+```bash
+python {skill_path}/scripts/log_results.py --experiment {domain}/{name}
+```
+
+Also check for active loop:
+```bash
+cat .autoresearch/{domain}/{name}/loop.json 2>/dev/null
+```
+
+If loop.json exists, show:
+```
+Active loop: every {interval} (cron ID: {id}, started: {date})
+```
+
+### Domain view
+
+```bash
+python {skill_path}/scripts/log_results.py --domain {domain}
+```
+
+### Full dashboard
+
+```bash
+python {skill_path}/scripts/log_results.py --dashboard
+```
+
+For each experiment, also check for loop.json and show loop status.
+
+### Export
+
+```bash
+# CSV
+python {skill_path}/scripts/log_results.py --dashboard --format csv --output {file}
+
+# Markdown
+python {skill_path}/scripts/log_results.py --dashboard --format markdown --output {file}
+```
+
+## Output Example
+
+```
+DOMAIN EXPERIMENT RUNS KEPT BEST CHANGE STATUS LOOP
+engineering api-speed 47 14 185ms -42.2% active every 1h
+engineering bundle-size 23 8 412KB -58.3% paused —
+marketing medium-ctr 31 11 8.4/10 +68.0% active daily
+prompts support-tone 15 6 82/100 +46.4% done —
+```
diff --git a/docs/skills/engineering/autoresearch-agent.md b/docs/skills/engineering/autoresearch-agent.md
new file mode 100644
index 0000000..20e54e4
--- /dev/null
+++ b/docs/skills/engineering/autoresearch-agent.md
@@ -0,0 +1,313 @@
+---
+title: "Autoresearch Agent"
+description: "Autoresearch Agent - Claude Code skill from the Engineering - POWERFUL domain."
+---
+
+# Autoresearch Agent
+
+
+
+:material-rocket-launch: Engineering - POWERFUL
+
+:material-identifier: `autoresearch-agent`
+
+:material-github: Source
+
+
+
+Install: claude /plugin install engineering-advanced-skills
+
+
+
+> You sleep. The agent experiments. You wake up to results.
+
+Autonomous experiment loop inspired by [Karpathy's autoresearch](https://github.com/karpathy/autoresearch). The agent edits one file, runs a fixed evaluation, keeps improvements, discards failures, and loops indefinitely.
+
+Not one guess — fifty measured attempts, compounding.
+
+---
+
+## Slash Commands
+
+| Command | What it does |
+|---------|-------------|
+| `/ar:setup` | Set up a new experiment interactively |
+| `/ar:run` | Run a single experiment iteration |
+| `/ar:loop` | Start autonomous loop with configurable interval (10m, 1h, daily, weekly, monthly) |
+| `/ar:status` | Show dashboard and results |
+| `/ar:resume` | Resume a paused experiment |
+
+---
+
+## When This Skill Activates
+
+Recognize these patterns from the user:
+
+- "Make this faster / smaller / better"
+- "Optimize [file] for [metric]"
+- "Improve my [headlines / copy / prompts]"
+- "Run experiments overnight"
+- "I want to get [metric] from X to Y"
+- Any request involving: optimize, benchmark, improve, experiment loop, autoresearch
+
+If the user describes a target file + a way to measure success → this skill applies.
+
+---
+
+## Setup
+
+### First Time — Create the Experiment
+
+Run the setup script. The user decides where experiments live:
+
+**Project-level** (inside repo, git-tracked, shareable with team):
+```bash
+python scripts/setup_experiment.py \
+ --domain engineering \
+ --name api-speed \
+ --target src/api/search.py \
+ --eval "pytest bench.py --tb=no -q" \
+ --metric p50_ms \
+ --direction lower \
+ --scope project
+```
+
+**User-level** (personal, in `~/.autoresearch/`):
+```bash
+python scripts/setup_experiment.py \
+ --domain marketing \
+ --name medium-ctr \
+ --target content/titles.md \
+ --eval "python evaluate.py" \
+ --metric ctr_score \
+ --direction higher \
+ --evaluator llm_judge_content \
+ --scope user
+```
+
+The `--scope` flag determines where `.autoresearch/` lives:
+- `project` (default) → `.autoresearch/` in the repo root. Experiment definitions are git-tracked. Results are gitignored.
+- `user` → `~/.autoresearch/` in the home directory. Everything is personal.
+
+### What Setup Creates
+
+```
+.autoresearch/
+├── config.yaml ← Global settings
+├── .gitignore ← Ignores results.tsv, *.log
+└── {domain}/{experiment-name}/
+ ├── program.md ← Objectives, constraints, strategy
+ ├── config.cfg ← Target, eval cmd, metric, direction
+ ├── results.tsv ← Experiment log (gitignored)
+ └── evaluate.py ← Evaluation script (if --evaluator used)
+```
+
+**results.tsv columns:** `commit | metric | status | description`
+- `commit` — short git hash
+- `metric` — float value or "N/A" for crashes
+- `status` — keep | discard | crash
+- `description` — what changed or why it crashed
+
+### Domains
+
+| Domain | Use Cases |
+|--------|-----------|
+| `engineering` | Code speed, memory, bundle size, test pass rate, build time |
+| `marketing` | Headlines, social copy, email subjects, ad copy, engagement |
+| `content` | Article structure, SEO descriptions, readability, CTR |
+| `prompts` | System prompts, chatbot tone, agent instructions |
+| `custom` | Anything else with a measurable metric |
+
+### If `program.md` Already Exists
+
+The user may have written their own `program.md`. If found in the experiment directory, read it. It overrides the template. Only ask for what's missing.
+
+---
+
+## Agent Protocol
+
+You are the loop. The scripts handle setup and evaluation — you handle the creative work.
+
+### Before Starting
+1. Read `.autoresearch/{domain}/{name}/config.cfg` to get:
+ - `target` — the file you edit
+ - `evaluate_cmd` — the command that measures your changes
+ - `metric` — the metric name to look for in eval output
+ - `metric_direction` — "lower" or "higher" is better
+ - `time_budget_minutes` — max time per evaluation
+2. Read `program.md` for strategy, constraints, and what you can/cannot change
+3. Read `results.tsv` for experiment history (columns: commit, metric, status, description)
+4. Checkout the experiment branch: `git checkout autoresearch/{domain}/{name}`
+
+### Each Iteration
+1. Review results.tsv — what worked? What failed? What hasn't been tried?
+2. Decide ONE change to the target file. One variable per experiment.
+3. Edit the target file
+4. Commit: `git add {target} && git commit -m "experiment: {description}"`
+5. Evaluate: `python scripts/run_experiment.py --experiment {domain}/{name} --single`
+6. Read the output — it prints KEEP, DISCARD, or CRASH with the metric value
+7. Go to step 1
+
+### What the Script Handles (you don't)
+- Running the eval command with timeout
+- Parsing the metric from eval output
+- Comparing to previous best
+- Reverting the commit on failure (`git reset --hard HEAD~1`)
+- Logging the result to results.tsv
+
+### Starting an Experiment
+
+```bash
+# Single iteration (the agent calls this repeatedly)
+python scripts/run_experiment.py --experiment engineering/api-speed --single
+
+# Dry run (test setup before starting)
+python scripts/run_experiment.py --experiment engineering/api-speed --dry-run
+```
+
+### Strategy Escalation
+- Runs 1-5: Low-hanging fruit (obvious improvements, simple optimizations)
+- Runs 6-15: Systematic exploration (vary one parameter at a time)
+- Runs 16-30: Structural changes (algorithm swaps, architecture shifts)
+- Runs 30+: Radical experiments (completely different approaches)
+- If no improvement in 20+ runs: update program.md Strategy section
+
+### Self-Improvement
+After every 10 experiments, review results.tsv for patterns. Update the
+Strategy section of program.md with what you learned (e.g., "caching changes
+consistently improve by 5-10%", "refactoring attempts never improve the metric").
+Future iterations benefit from this accumulated knowledge.
+
+### Stopping
+- Run until interrupted by the user, context limit reached, or goal in program.md is met
+- Before stopping: ensure results.tsv is up to date
+- On context limit: the next session can resume — results.tsv and git log persist
+
+### Rules
+
+- **One change per experiment.** Don't change 5 things at once. You won't know what worked.
+- **Simplicity criterion.** A small improvement that adds ugly complexity is not worth it. Equal performance with simpler code is a win. Removing code that gets same results is the best outcome.
+- **Never modify the evaluator.** `evaluate.py` is the ground truth. Modifying it invalidates all comparisons. Hard stop if you catch yourself doing this.
+- **Timeout.** If a run exceeds 2.5× the time budget, kill it and treat as crash.
+- **Crash handling.** If it's a typo or missing import, fix and re-run. If the idea is fundamentally broken, revert, log "crash", move on. 5 consecutive crashes → pause and alert.
+- **No new dependencies.** Only use what's already available in the project.
+
+---
+
+## Evaluators
+
+Ready-to-use evaluation scripts. Copied into the experiment directory during setup with `--evaluator`.
+
+### Free Evaluators (no API cost)
+
+| Evaluator | Metric | Use Case |
+|-----------|--------|----------|
+| `benchmark_speed` | `p50_ms` (lower) | Function/API execution time |
+| `benchmark_size` | `size_bytes` (lower) | File, bundle, Docker image size |
+| `test_pass_rate` | `pass_rate` (higher) | Test suite pass percentage |
+| `build_speed` | `build_seconds` (lower) | Build/compile/Docker build time |
+| `memory_usage` | `peak_mb` (lower) | Peak memory during execution |
+
+### LLM Judge Evaluators (uses your subscription)
+
+| Evaluator | Metric | Use Case |
+|-----------|--------|----------|
+| `llm_judge_content` | `ctr_score` 0-10 (higher) | Headlines, titles, descriptions |
+| `llm_judge_prompt` | `quality_score` 0-100 (higher) | System prompts, agent instructions |
+| `llm_judge_copy` | `engagement_score` 0-10 (higher) | Social posts, ad copy, emails |
+
+LLM judges call the CLI tool the user is already running (Claude, Codex, Gemini). The evaluation prompt is locked inside `evaluate.py` — the agent cannot modify it. This prevents the agent from gaming its own evaluator.
+
+The user's existing subscription covers the cost:
+- Claude Code Max → unlimited Claude calls for evaluation
+- Codex CLI (ChatGPT Pro) → unlimited Codex calls
+- Gemini CLI (free tier) → free evaluation calls
+
+### Custom Evaluators
+
+If no built-in evaluator fits, the user writes their own `evaluate.py`. Only requirement: it must print `metric_name: value` to stdout.
+
+```python
+#!/usr/bin/env python3
+# My custom evaluator — DO NOT MODIFY after experiment starts
+import subprocess
+result = subprocess.run(["my-benchmark", "--json"], capture_output=True, text=True)
+# Parse and output
+print(f"my_metric: {parse_score(result.stdout)}")
+```
+
+---
+
+## Viewing Results
+
+```bash
+# Single experiment
+python scripts/log_results.py --experiment engineering/api-speed
+
+# All experiments in a domain
+python scripts/log_results.py --domain engineering
+
+# Cross-experiment dashboard
+python scripts/log_results.py --dashboard
+
+# Export formats
+python scripts/log_results.py --experiment engineering/api-speed --format csv --output results.csv
+python scripts/log_results.py --experiment engineering/api-speed --format markdown --output results.md
+python scripts/log_results.py --dashboard --format markdown --output dashboard.md
+```
+
+### Dashboard Output
+
+```
+DOMAIN EXPERIMENT RUNS KEPT BEST Δ FROM START STATUS
+engineering api-speed 47 14 185ms -76.9% active
+engineering bundle-size 23 8 412KB -58.3% paused
+marketing medium-ctr 31 11 8.4/10 +68.0% active
+prompts support-tone 15 6 82/100 +46.4% done
+```
+
+### Export Formats
+
+- **TSV** — default, tab-separated (compatible with spreadsheets)
+- **CSV** — comma-separated, with proper quoting
+- **Markdown** — formatted table, readable in GitHub/docs
+
+---
+
+## Proactive Triggers
+
+Flag these without being asked:
+
+- **No evaluation command works** → Test it before starting the loop. Run once, verify output.
+- **Target file not in git** → `git init && git add . && git commit -m 'initial'` first.
+- **Metric direction unclear** → Ask: is lower or higher better? Must know before starting.
+- **Time budget too short** → If eval takes longer than budget, every run crashes.
+- **Agent modifying evaluate.py** → Hard stop. This invalidates all comparisons.
+- **5 consecutive crashes** → Pause the loop. Alert the user. Don't keep burning cycles.
+- **No improvement in 20+ runs** → Suggest changing strategy in program.md or trying a different approach.
+
+---
+
+## Installation
+
+### One-liner (any tool)
+```bash
+git clone https://github.com/alirezarezvani/claude-skills.git
+cp -r claude-skills/engineering/autoresearch-agent ~/.claude/skills/
+```
+
+### Multi-tool install
+```bash
+./scripts/convert.sh --skill autoresearch-agent --tool codex|gemini|cursor|windsurf|openclaw
+```
+
+### OpenClaw
+```bash
+clawhub install cs-autoresearch-agent
+```
+
+---
+
+## Related Skills
+
+- **self-improving-agent** — improves an agent's own memory/rules over time. NOT for structured experiment loops.
+- **senior-ml-engineer** — ML architecture decisions. Complementary — use for initial design, then autoresearch for optimization.
+- **tdd-guide** — test-driven development. Complementary — tests can be the evaluation function.
+- **skill-security-auditor** — audit skills before publishing. NOT for optimization loops.
diff --git a/docs/skills/engineering/index.md b/docs/skills/engineering/index.md
index 81d68a7..090dba1 100644
--- a/docs/skills/engineering/index.md
+++ b/docs/skills/engineering/index.md
@@ -1,13 +1,13 @@
---
title: "Engineering - POWERFUL Skills"
-description: "All 26 Engineering - POWERFUL skills for Claude Code, Codex CLI, Gemini CLI, and OpenClaw."
+description: "All 32 Engineering - POWERFUL skills for Claude Code, Codex CLI, Gemini CLI, and OpenClaw."
---
# :material-rocket-launch: Engineering - POWERFUL
-
-26 skills in this domain
+
+32 skills in this domain
@@ -41,6 +41,12 @@ description: "All 26 Engineering - POWERFUL skills for Claude Code, Codex CLI, G
Tier: POWERFUL
+- **[Autoresearch Agent](autoresearch-agent.md)** + 5 sub-skills
+
+ ---
+
+ > You sleep. The agent experiments. You wake up to results.
+
- **[Changelog Generator](changelog-generator.md)**
---
@@ -99,7 +105,7 @@ description: "All 26 Engineering - POWERFUL skills for Claude Code, Codex CLI, G
---
- Comprehensive interview system design, competency assessment, and hiring process optimization.
+ Comprehensive interview loop planning and calibration support for role-based hiring systems.
- **[MCP Server Builder](mcp-server-builder.md)**
diff --git a/engineering/autoresearch-agent/.claude-plugin/plugin.json b/engineering/autoresearch-agent/.claude-plugin/plugin.json
new file mode 100644
index 0000000..29b8124
--- /dev/null
+++ b/engineering/autoresearch-agent/.claude-plugin/plugin.json
@@ -0,0 +1,13 @@
+{
+ "name": "autoresearch-agent",
+ "description": "Autonomous experiment loop that optimizes any file by a measurable metric. 5 slash commands, 8 evaluators, configurable loop intervals (10min to monthly).",
+ "version": "2.1.2",
+ "author": {
+ "name": "Alireza Rezvani",
+ "url": "https://alirezarezvani.com"
+ },
+ "homepage": "https://github.com/alirezarezvani/claude-skills/tree/main/engineering/autoresearch-agent",
+ "repository": "https://github.com/alirezarezvani/claude-skills",
+ "license": "MIT",
+ "skills": "./"
+}
diff --git a/engineering/autoresearch-agent/CLAUDE.md b/engineering/autoresearch-agent/CLAUDE.md
new file mode 100644
index 0000000..728d4b4
--- /dev/null
+++ b/engineering/autoresearch-agent/CLAUDE.md
@@ -0,0 +1,66 @@
+# Autoresearch Agent — Claude Code Instructions
+
+This plugin runs autonomous experiment loops that optimize any file by a measurable metric.
+
+## Commands
+
+Use the `/ar:` namespace for all commands:
+
+- `/ar:setup` — Set up a new experiment interactively
+- `/ar:run` — Run a single experiment iteration
+- `/ar:loop` — Start an autonomous loop with user-selected interval
+- `/ar:status` — Show dashboard and results
+- `/ar:resume` — Resume a paused experiment
+
+## How it works
+
+You (the AI agent) are the experiment loop. The scripts handle evaluation and git rollback.
+
+1. You edit the target file with ONE change
+2. You commit it
+3. You call `run_experiment.py --single` — it evaluates and prints KEEP/DISCARD/CRASH
+4. You repeat
+
+Results persist in `results.tsv` and git log. Sessions can be resumed.
+
+## When to use each command
+
+### Starting fresh
+```
+/ar:setup
+```
+Creates the experiment directory, config, program.md, results.tsv, and git branch.
+
+### Running one iteration at a time
+```
+/ar:run engineering/api-speed
+```
+Read history, make one change, evaluate, report result.
+
+### Autonomous background loop
+```
+/ar:loop engineering/api-speed
+```
+Prompts for interval (10min, 1h, daily, weekly, monthly), then creates a recurring job.
+
+### Checking progress
+```
+/ar:status
+```
+Shows the dashboard across all experiments with metrics and trends.
+
+### Resuming after context limit or break
+```
+/ar:resume engineering/api-speed
+```
+Reads results history, checks out the branch, and continues where you left off.
+
+## Agents
+
+- **experiment-runner**: Spawned for each loop iteration. Reads config, results history, decides what to try, edits target, commits, evaluates.
+
+## Key principle
+
+**One change per experiment. Measure everything. Compound improvements.**
+
+The agent never modifies the evaluator. The evaluator is ground truth.
diff --git a/engineering/autoresearch-agent/SKILL.md b/engineering/autoresearch-agent/SKILL.md
index 40f1779..e9efaa1 100644
--- a/engineering/autoresearch-agent/SKILL.md
+++ b/engineering/autoresearch-agent/SKILL.md
@@ -19,6 +19,18 @@ Not one guess — fifty measured attempts, compounding.
---
+## Slash Commands
+
+| Command | What it does |
+|---------|-------------|
+| `/ar:setup` | Set up a new experiment interactively |
+| `/ar:run` | Run a single experiment iteration |
+| `/ar:loop` | Start autonomous loop with configurable interval (10m, 1h, daily, weekly, monthly) |
+| `/ar:status` | Show dashboard and results |
+| `/ar:resume` | Resume a paused experiment |
+
+---
+
## When This Skill Activates
Recognize these patterns from the user:
@@ -82,6 +94,12 @@ The `--scope` flag determines where `.autoresearch/` lives:
└── evaluate.py ← Evaluation script (if --evaluator used)
```
+**results.tsv columns:** `commit | metric | status | description`
+- `commit` — short git hash
+- `metric` — float value or "N/A" for crashes
+- `status` — keep | discard | crash
+- `description` — what changed or why it crashed
+
### Domains
| Domain | Use Cases |
@@ -98,48 +116,67 @@ The user may have written their own `program.md`. If found in the experiment dir
---
-## The Experiment Loop
+## Agent Protocol
+
+You are the loop. The scripts handle setup and evaluation — you handle the creative work.
+
+### Before Starting
+1. Read `.autoresearch/{domain}/{name}/config.cfg` to get:
+ - `target` — the file you edit
+ - `evaluate_cmd` — the command that measures your changes
+ - `metric` — the metric name to look for in eval output
+ - `metric_direction` — "lower" or "higher" is better
+ - `time_budget_minutes` — max time per evaluation
+2. Read `program.md` for strategy, constraints, and what you can/cannot change
+3. Read `results.tsv` for experiment history (columns: commit, metric, status, description)
+4. Checkout the experiment branch: `git checkout autoresearch/{domain}/{name}`
+
+### Each Iteration
+1. Review results.tsv — what worked? What failed? What hasn't been tried?
+2. Decide ONE change to the target file. One variable per experiment.
+3. Edit the target file
+4. Commit: `git add {target} && git commit -m "experiment: {description}"`
+5. Evaluate: `python scripts/run_experiment.py --experiment {domain}/{name} --single`
+6. Read the output — it prints KEEP, DISCARD, or CRASH with the metric value
+7. Go to step 1
+
+### What the Script Handles (you don't)
+- Running the eval command with timeout
+- Parsing the metric from eval output
+- Comparing to previous best
+- Reverting the commit on failure (`git reset --hard HEAD~1`)
+- Logging the result to results.tsv
### Starting an Experiment
```bash
-# Run specific experiment
-python scripts/run_experiment.py --experiment engineering/api-speed --loop
-
-# Single iteration (test setup)
+# Single iteration (the agent calls this repeatedly)
python scripts/run_experiment.py --experiment engineering/api-speed --single
-# Resume last active experiment
-python scripts/run_experiment.py --resume --loop
-
-# Dry run (show what would happen)
+# Dry run (test setup before starting)
python scripts/run_experiment.py --experiment engineering/api-speed --dry-run
```
-### The Loop Protocol
+### Strategy Escalation
+- Runs 1-5: Low-hanging fruit (obvious improvements, simple optimizations)
+- Runs 6-15: Systematic exploration (vary one parameter at a time)
+- Runs 16-30: Structural changes (algorithm swaps, architecture shifts)
+- Runs 30+: Radical experiments (completely different approaches)
+- If no improvement in 20+ runs: update program.md Strategy section
-```
-LOOP FOREVER:
+### Self-Improvement
+After every 10 experiments, review results.tsv for patterns. Update the
+Strategy section of program.md with what you learned (e.g., "caching changes
+consistently improve by 5-10%", "refactoring attempts never improve the metric").
+Future iterations benefit from this accumulated knowledge.
-1. Read program.md for current strategy and constraints
-2. Review git log: what has been tried? What worked? What crashed?
-3. Review results.tsv: current best metric, trend, recent failures
-4. Propose ONE change to the target file
-5. Apply the change
-6. git commit -m "experiment: [short description of what changed]"
-7. Run evaluation: {eval_command} > .autoresearch/{domain}/{name}/run.log 2>&1
-8. Parse metric from run.log (grep for metric_name: value)
-9. Decision:
- - Metric improved → KEEP (advance branch, log "keep")
- - Metric equal or worse → REVERT (git reset --hard, log "discard")
- - Crash/timeout/parse failure → attempt fix once, else REVERT (log "crash")
-10. Append result to results.tsv
-11. Go to 1
-```
+### Stopping
+- Run until interrupted by the user, context limit reached, or goal in program.md is met
+- Before stopping: ensure results.tsv is up to date
+- On context limit: the next session can resume — results.tsv and git log persist
### Rules
-- **NEVER STOP.** The human may be asleep. Run until manually interrupted. If you run out of ideas, read papers, re-read the target, try combining previous near-misses, try radical changes.
- **One change per experiment.** Don't change 5 things at once. You won't know what worked.
- **Simplicity criterion.** A small improvement that adds ugly complexity is not worth it. Equal performance with simpler code is a win. Removing code that gets same results is the best outcome.
- **Never modify the evaluator.** `evaluate.py` is the ground truth. Modifying it invalidates all comparisons. Hard stop if you catch yourself doing this.
@@ -258,7 +295,7 @@ cp -r claude-skills/engineering/autoresearch-agent ~/.claude/skills/
### OpenClaw
```bash
-clawhub install autoresearch-agent
+clawhub install cs-autoresearch-agent
```
---
diff --git a/engineering/autoresearch-agent/agents/experiment-runner.md b/engineering/autoresearch-agent/agents/experiment-runner.md
new file mode 100644
index 0000000..120d81e
--- /dev/null
+++ b/engineering/autoresearch-agent/agents/experiment-runner.md
@@ -0,0 +1,87 @@
+# Experiment Runner Agent
+
+You are an autonomous experimenter. Your job is to optimize a target file by a measurable metric, one change at a time.
+
+## Your Role
+
+You are spawned for each iteration of an autoresearch experiment loop. You:
+1. Read the experiment state (config, strategy, results history)
+2. Decide what to try based on accumulated evidence
+3. Make ONE change to the target file
+4. Commit and evaluate
+5. Report the result
+
+## Process
+
+### 1. Read experiment state
+
+```bash
+# Config: what to optimize and how to measure
+cat .autoresearch/{domain}/{name}/config.cfg
+
+# Strategy: what you can/cannot change, current approach
+cat .autoresearch/{domain}/{name}/program.md
+
+# History: every experiment ever run, with outcomes
+cat .autoresearch/{domain}/{name}/results.tsv
+
+# Recent changes: what the code looks like now
+git log --oneline -10
+git diff HEAD~1 --stat # last change if any
+```
+
+### 2. Analyze results history
+
+From results.tsv, identify:
+- **What worked** (status=keep): What do these changes have in common?
+- **What failed** (status=discard): What approaches should you avoid?
+- **What crashed** (status=crash): Are there fragile areas to be careful with?
+- **Trends**: Is the metric plateauing? Accelerating? Oscillating?
+
+### 3. Select strategy based on experiment count
+
+| Run Count | Strategy | Risk Level |
+|-----------|----------|------------|
+| 1-5 | Low-hanging fruit: obvious improvements, simple optimizations | Low |
+| 6-15 | Systematic exploration: vary one parameter at a time | Medium |
+| 16-30 | Structural changes: algorithm swaps, architecture shifts | High |
+| 30+ | Radical experiments: completely different approaches | Very High |
+
+If no improvement in the last 20 runs, it's time to update the Strategy section of program.md and try something fundamentally different.
+
+### 4. Make ONE change
+
+- Edit only the target file (from config.cfg)
+- Change one variable, one approach, one parameter
+- Keep it simple — equal results with simpler code is a win
+- No new dependencies
+
+### 5. Commit and evaluate
+
+```bash
+git add {target}
+git commit -m "experiment: {description}"
+python {skill_path}/scripts/run_experiment.py --experiment {domain}/{name} --single
+```
+
+### 6. Self-improvement
+
+After every 10th experiment, update program.md's Strategy section:
+- Which approaches consistently work? Double down.
+- Which approaches consistently fail? Stop trying.
+- Any new hypotheses based on the data?
+
+## Hard Rules
+
+- **ONE change per experiment.** Multiple changes = you won't know what worked.
+- **NEVER modify the evaluator.** evaluate.py is the ground truth. Modifying it invalidates all comparisons. If you catch yourself doing this, stop immediately.
+- **5 consecutive crashes → stop.** Alert the user. Don't burn cycles on a broken setup.
+- **Simplicity criterion.** A small improvement that adds ugly complexity is NOT worth it. Removing code that gets same results is the best outcome.
+- **No new dependencies.** Only use what's already available.
+
+## Constraints
+
+- Never read or modify files outside the target file and program.md
+- Never push to remote — all work stays local
+- Never skip the evaluation step — every change must be measured
+- Be concise in commit messages — they become the experiment log
diff --git a/engineering/autoresearch-agent/evaluators/benchmark_size.py b/engineering/autoresearch-agent/evaluators/benchmark_size.py
index 1e5bfb1..648ac9c 100644
--- a/engineering/autoresearch-agent/evaluators/benchmark_size.py
+++ b/engineering/autoresearch-agent/evaluators/benchmark_size.py
@@ -36,7 +36,11 @@ if "DOCKER_IMAGE" in dir() or "DOCKER_IMAGE" in globals():
f"docker image inspect {DOCKER_IMAGE} --format '{{{{.Size}}}}'",
shell=True, capture_output=True, text=True
)
- size_bytes = int(result.stdout.strip())
+ try:
+ size_bytes = int(result.stdout.strip())
+ except ValueError:
+ print(f"Could not parse size from: {result.stdout[:100]}", file=sys.stderr)
+ sys.exit(1)
elif "TARGET_DIR" in dir() or "TARGET_DIR" in globals():
size_bytes = sum(
os.path.getsize(os.path.join(dp, f))
diff --git a/engineering/autoresearch-agent/evaluators/llm_judge_content.py b/engineering/autoresearch-agent/evaluators/llm_judge_content.py
index 795cbec..79bcd4c 100644
--- a/engineering/autoresearch-agent/evaluators/llm_judge_content.py
+++ b/engineering/autoresearch-agent/evaluators/llm_judge_content.py
@@ -43,7 +43,12 @@ ctr_score:
Be harsh. Most content is mediocre (4-6 range). Only exceptional content scores 8+."""
-content = Path(TARGET_FILE).read_text()
+try:
+ content = Path(TARGET_FILE).read_text()
+except FileNotFoundError:
+ print(f"Target file not found: {TARGET_FILE}", file=sys.stderr)
+ sys.exit(1)
+
full_prompt = f"{JUDGE_PROMPT}\n\n---\n\nContent to evaluate:\n\n{content}"
# Call the user's CLI tool
diff --git a/engineering/autoresearch-agent/evaluators/llm_judge_copy.py b/engineering/autoresearch-agent/evaluators/llm_judge_copy.py
index c074feb..c4a9565 100644
--- a/engineering/autoresearch-agent/evaluators/llm_judge_copy.py
+++ b/engineering/autoresearch-agent/evaluators/llm_judge_copy.py
@@ -54,6 +54,9 @@ platform_prompt = JUDGE_PROMPTS.get(PLATFORM, JUDGE_PROMPTS["twitter"])
JUDGE_PROMPT = f"""{platform_prompt}
+IMPORTANT: You MUST use criterion_1 through criterion_5 as labels, NOT the criterion names.
+Do NOT output "hook: 7" — output "criterion_1: 7".
+
Output EXACTLY this format:
criterion_1:
criterion_2:
@@ -64,7 +67,12 @@ engagement_score:
Be harsh. Most copy is mediocre (4-6). Only exceptional copy scores 8+."""
-content = Path(TARGET_FILE).read_text()
+try:
+ content = Path(TARGET_FILE).read_text()
+except FileNotFoundError:
+ print(f"Target file not found: {TARGET_FILE}", file=sys.stderr)
+ sys.exit(1)
+
full_prompt = f"{JUDGE_PROMPT}\n\n---\n\nCopy to evaluate:\n\n{content}"
result = subprocess.run(
@@ -77,12 +85,29 @@ if result.returncode != 0:
sys.exit(1)
output = result.stdout
+found_scores = False
for line in output.splitlines():
line = line.strip()
if line.startswith("engagement_score:") or line.startswith("criterion_"):
print(line)
+ found_scores = True
-if "engagement_score:" not in output:
+# Fallback: if no criterion_ lines found, try parsing any "word: digit" lines
+if not found_scores:
+ import re
+ fallback_scores = []
+ for line in output.splitlines():
+ line = line.strip()
+ match = re.match(r'^(\w[\w\s]*?):\s*(\d+(?:\.\d+)?)\s*$', line)
+ if match and match.group(1).lower() not in ("engagement_score",):
+ fallback_scores.append(float(match.group(2)))
+ print(f"criterion_{len(fallback_scores)}: {match.group(2)}")
+ if fallback_scores:
+ avg = sum(fallback_scores) / len(fallback_scores)
+ print(f"engagement_score: {avg:.1f}")
+ found_scores = True
+
+if "engagement_score:" not in output and not found_scores:
print("Could not parse engagement_score from LLM output", file=sys.stderr)
print(f"Raw: {output[:500]}", file=sys.stderr)
sys.exit(1)
diff --git a/engineering/autoresearch-agent/evaluators/llm_judge_prompt.py b/engineering/autoresearch-agent/evaluators/llm_judge_prompt.py
index 79dfbc5..8bb7fda 100644
--- a/engineering/autoresearch-agent/evaluators/llm_judge_prompt.py
+++ b/engineering/autoresearch-agent/evaluators/llm_judge_prompt.py
@@ -37,8 +37,17 @@ Score the actual output on these criteria (each 1-10):
Output EXACTLY: quality_score:
Nothing else."""
-prompt = Path(TARGET_FILE).read_text()
-test_cases = json.loads(Path(TEST_CASES_FILE).read_text())
+try:
+ prompt = Path(TARGET_FILE).read_text()
+except FileNotFoundError:
+ print(f"Target file not found: {TARGET_FILE}", file=sys.stderr)
+ sys.exit(1)
+
+try:
+ test_cases = json.loads(Path(TEST_CASES_FILE).read_text())
+except FileNotFoundError:
+ print(f"Test cases file not found: {TEST_CASES_FILE}", file=sys.stderr)
+ sys.exit(1)
scores = []
@@ -92,7 +101,7 @@ if not scores:
sys.exit(1)
avg = sum(scores) / len(scores)
-quality = avg * 10 # Scale to 0-100
+quality = avg * 10 # 1-10 scores → 10-100 range
print(f"quality_score: {quality:.2f}")
print(f"cases_tested: {len(scores)}")
diff --git a/engineering/autoresearch-agent/evaluators/memory_usage.py b/engineering/autoresearch-agent/evaluators/memory_usage.py
index 6a19649..faffb1c 100644
--- a/engineering/autoresearch-agent/evaluators/memory_usage.py
+++ b/engineering/autoresearch-agent/evaluators/memory_usage.py
@@ -2,7 +2,6 @@
"""Measure peak memory usage of a command.
DO NOT MODIFY after experiment starts — this is the fixed evaluator."""
-import os
import platform
import subprocess
import sys
@@ -41,8 +40,10 @@ elif system == "Darwin":
if "maximum resident set size" in line.lower():
# macOS reports in bytes
val = int(line.strip().split()[0])
+ kb = val / 1024
mb = val / (1024 * 1024)
print(f"peak_mb: {mb:.1f}")
+ print(f"peak_kb: {int(kb)}")
sys.exit(0)
print("Could not parse memory from time output", file=sys.stderr)
sys.exit(1)
diff --git a/engineering/autoresearch-agent/references/program-template.md b/engineering/autoresearch-agent/references/program-template.md
index 03498d4..90cfe71 100644
--- a/engineering/autoresearch-agent/references/program-template.md
+++ b/engineering/autoresearch-agent/references/program-template.md
@@ -75,8 +75,8 @@ Maximize eval_score on the test suite. Higher is better (0-100).
## Evaluation
- evaluate.py runs the prompt against 20 test cases
-- Each test case is scored 0-5 by GPT-4o
-- eval_score = average * 20 (maps to 0-100)
+- Each test case is scored 1-10 by your CLI tool (Claude, Codex, or Gemini)
+- quality_score = average * 10 (maps to 10-100)
- Run log shows which test cases failed
## Stop When
@@ -144,14 +144,14 @@ Maximize pass_rate on the task evaluation suite. Higher is better (0-1).
- Proactive trigger conditions
## What You Cannot Change
-- scripts/skill_evaluator.py (fixed evaluation)
+- your custom evaluate.py (see Custom Evaluators in SKILL.md)
- Test tasks in tests/ (ground truth benchmark)
- Skill name (used for routing)
- License or metadata
## Evaluation
-- skill_evaluator.py runs SKILL.md against 15 standardized tasks
-- An AI judge scores each task: 0 (fail), 0.5 (partial), 1 (pass)
+- evaluate.py runs SKILL.md against 15 standardized tasks
+- Your CLI tool scores each task: 0 (fail), 0.5 (partial), 1 (pass)
- pass_rate = sum(scores) / 15
## Strategy
diff --git a/engineering/autoresearch-agent/scripts/log_results.py b/engineering/autoresearch-agent/scripts/log_results.py
index 51b4d10..1820272 100644
--- a/engineering/autoresearch-agent/scripts/log_results.py
+++ b/engineering/autoresearch-agent/scripts/log_results.py
@@ -18,6 +18,7 @@ import argparse
import csv
import io
import sys
+import time
from pathlib import Path
@@ -80,7 +81,7 @@ def compute_stats(results, direction):
best = None
pct_change = None
- if baseline and best and baseline != 0:
+ if baseline is not None and best is not None and baseline != 0:
if direction == "lower":
pct_change = (baseline - best) / baseline * 100
else:
@@ -145,18 +146,17 @@ def print_dashboard(root):
direction = config.get("metric_direction", "lower")
stats = compute_stats(results, direction)
+ best_str = f"{stats['best']:.4f}" if stats["best"] is not None else "—"
+ pct_str = f"{stats['pct_change']:+.1f}%" if stats["pct_change"] is not None else "—"
+
# Determine status
status = "idle"
if stats["total"] > 0:
tsv = exp_dir / "results.tsv"
if tsv.exists():
- import time
age_hours = (time.time() - tsv.stat().st_mtime) / 3600
status = "active" if age_hours < 1 else "paused" if age_hours < 24 else "done"
- best_str = f"{stats['best']:.4f}" if stats["best"] is not None else "—"
- pct_str = f"{stats['pct_change']:+.1f}%" if stats["pct_change"] is not None else "—"
-
experiments.append({
"domain": domain_dir.name,
"name": exp_dir.name,
@@ -202,7 +202,7 @@ def export_experiment_csv(experiment_dir, experiment_path):
if stats["baseline"] is not None:
writer.writerow(["# Baseline", f"{stats['baseline']:.6f}"])
if stats["best"] is not None:
- pct = f" ({stats['pct_change']:+.1f}%)" if stats["pct_change"] else ""
+ pct = f" ({stats['pct_change']:+.1f}%)" if stats["pct_change"] is not None else ""
writer.writerow(["# Best", f"{stats['best']:.6f}{pct}"])
writer.writerow(["# Total", stats["total"]])
writer.writerow(["# Keep/Discard/Crash", f"{stats['keeps']}/{stats['discards']}/{stats['crashes']}"])
@@ -216,12 +216,14 @@ def export_experiment_csv(experiment_dir, experiment_path):
return buf.getvalue()
-def export_dashboard_csv(root):
+def export_dashboard_csv(root, domain_filter=None):
"""Export dashboard as CSV string."""
experiments = []
for domain_dir in sorted(root.iterdir()):
if not domain_dir.is_dir() or domain_dir.name.startswith("."):
continue
+ if domain_filter and domain_dir.name != domain_filter:
+ continue
for exp_dir in sorted(domain_dir.iterdir()):
if not exp_dir.is_dir() or not (exp_dir / "config.cfg").exists():
continue
@@ -229,8 +231,8 @@ def export_dashboard_csv(root):
results = load_results(exp_dir)
direction = config.get("metric_direction", "lower")
stats = compute_stats(results, direction)
- best_str = f"{stats['best']:.6f}" if stats["best"] else ""
- pct_str = f"{stats['pct_change']:+.1f}%" if stats["pct_change"] else ""
+ best_str = f"{stats['best']:.6f}" if stats["best"] is not None else ""
+ pct_str = f"{stats['pct_change']:+.1f}%" if stats["pct_change"] is not None else ""
experiments.append([
domain_dir.name, exp_dir.name, config.get("metric", ""),
stats["total"], stats["keeps"], stats["discards"], stats["crashes"],
@@ -262,7 +264,7 @@ def export_experiment_markdown(experiment_dir, experiment_path):
lines.append(f"**Experiments:** {stats['total']} total — {stats['keeps']} kept, {stats['discards']} discarded, {stats['crashes']} crashed\n")
if stats["baseline"] is not None and stats["best"] is not None:
- pct = f" ({stats['pct_change']:+.1f}%)" if stats["pct_change"] else ""
+ pct = f" ({stats['pct_change']:+.1f}%)" if stats["pct_change"] is not None else ""
lines.append(f"**Progress:** `{stats['baseline']:.6f}` → `{stats['best']:.6f}`{pct}\n")
lines.append(f"| Commit | Metric | Status | Description |")
@@ -275,7 +277,7 @@ def export_experiment_markdown(experiment_dir, experiment_path):
return "\n".join(lines)
-def export_dashboard_markdown(root):
+def export_dashboard_markdown(root, domain_filter=None):
"""Export dashboard as Markdown string."""
lines = []
lines.append("# Autoresearch Dashboard\n")
@@ -285,6 +287,8 @@ def export_dashboard_markdown(root):
for domain_dir in sorted(root.iterdir()):
if not domain_dir.is_dir() or domain_dir.name.startswith("."):
continue
+ if domain_filter and domain_dir.name != domain_filter:
+ continue
for exp_dir in sorted(domain_dir.iterdir()):
if not exp_dir.is_dir() or not (exp_dir / "config.cfg").exists():
continue
@@ -292,10 +296,9 @@ def export_dashboard_markdown(root):
results = load_results(exp_dir)
direction = config.get("metric_direction", "lower")
stats = compute_stats(results, direction)
- best = f"`{stats['best']:.4f}`" if stats["best"] else "—"
- pct = f"{stats['pct_change']:+.1f}%" if stats["pct_change"] else "—"
+ best = f"`{stats['best']:.4f}`" if stats["best"] is not None else "—"
+ pct = f"{stats['pct_change']:+.1f}%" if stats["pct_change"] is not None else "—"
- import time
tsv = exp_dir / "results.tsv"
status = "idle"
if tsv.exists() and stats["total"] > 0:
@@ -356,7 +359,7 @@ def main():
# For CSV/MD, fall through to dashboard with domain filter
if args.format != "terminal":
# Use dashboard export filtered to domain
- output_text = export_dashboard_csv(root) if args.format == "csv" else export_dashboard_markdown(root)
+ output_text = export_dashboard_csv(root, domain_filter=args.domain) if args.format == "csv" else export_dashboard_markdown(root, domain_filter=args.domain)
else:
return
diff --git a/engineering/autoresearch-agent/scripts/run_experiment.py b/engineering/autoresearch-agent/scripts/run_experiment.py
index b8264ea..dad29be 100644
--- a/engineering/autoresearch-agent/scripts/run_experiment.py
+++ b/engineering/autoresearch-agent/scripts/run_experiment.py
@@ -2,20 +2,17 @@
"""
autoresearch-agent: Experiment Runner
-Executes the autonomous experiment loop for a specific experiment.
-Reads config from .autoresearch/{domain}/{name}/config.cfg.
+Executes a single experiment iteration. The AI agent is the loop —
+it calls this script repeatedly. The script handles evaluation,
+metric parsing, keep/discard decisions, and git rollback on failure.
Usage:
- python scripts/run_experiment.py --experiment engineering/api-speed --loop
python scripts/run_experiment.py --experiment engineering/api-speed --single
- python scripts/run_experiment.py --experiment marketing/medium-ctr --loop
- python scripts/run_experiment.py --resume --loop
python scripts/run_experiment.py --experiment engineering/api-speed --dry-run
+ python scripts/run_experiment.py --experiment engineering/api-speed --single --description "added caching"
"""
import argparse
-import os
-import signal
import subprocess
import sys
import time
@@ -48,10 +45,11 @@ def load_config(experiment_dir):
return config
-def run_cmd(cmd, cwd=None, timeout=None):
- """Run shell command, return (returncode, stdout, stderr)."""
+def run_git(args, cwd=None, timeout=30):
+ """Run a git command safely (no shell injection). Returns (returncode, stdout, stderr)."""
result = subprocess.run(
- cmd, shell=True, capture_output=True, text=True,
+ ["git"] + args,
+ capture_output=True, text=True,
cwd=cwd, timeout=timeout
)
return result.returncode, result.stdout.strip(), result.stderr.strip()
@@ -59,7 +57,7 @@ def run_cmd(cmd, cwd=None, timeout=None):
def get_current_commit(path):
"""Get short hash of current HEAD."""
- _, commit, _ = run_cmd("git rev-parse --short HEAD", cwd=path)
+ _, commit, _ = run_git(["rev-parse", "--short", "HEAD"], cwd=path)
return commit
@@ -85,17 +83,23 @@ def get_best_metric(experiment_dir, direction):
def run_evaluation(project_root, eval_cmd, time_budget_minutes, log_file):
- """Run evaluation with time limit. Output goes to log_file."""
+ """Run evaluation with time limit. Output goes to log_file.
+
+ Note: shell=True is intentional here — eval_cmd is user-provided and
+ may contain pipes, redirects, or chained commands.
+ """
hard_limit = time_budget_minutes * 60 * 2.5
t0 = time.time()
try:
- code, _, _ = run_cmd(
- f"{eval_cmd} > {log_file} 2>&1",
- cwd=str(project_root),
- timeout=hard_limit
- )
+ with open(log_file, "w") as lf:
+ result = subprocess.run(
+ eval_cmd, shell=True,
+ stdout=lf, stderr=subprocess.STDOUT,
+ cwd=str(project_root),
+ timeout=hard_limit
+ )
elapsed = time.time() - t0
- return code, elapsed
+ return result.returncode, elapsed
except subprocess.TimeoutExpired:
elapsed = time.time() - t0
return -1, elapsed
@@ -141,24 +145,24 @@ def get_experiment_count(experiment_dir):
return max(0, len(tsv.read_text().splitlines()) - 1)
-def get_last_active(root):
- """Find the most recently modified experiment."""
- latest = None
- latest_time = 0
- for domain_dir in root.iterdir():
- if not domain_dir.is_dir() or domain_dir.name.startswith("."):
- continue
- for exp_dir in domain_dir.iterdir():
- if not exp_dir.is_dir():
- continue
- cfg = exp_dir / "config.cfg"
- if cfg.exists() and cfg.stat().st_mtime > latest_time:
- latest_time = cfg.stat().st_mtime
- latest = f"{domain_dir.name}/{exp_dir.name}"
- return latest
+def get_description_from_diff(project_root):
+ """Auto-generate a description from git diff --stat HEAD~1."""
+ code, diff_stat, _ = run_git(["diff", "--stat", "HEAD~1"], cwd=str(project_root))
+ if code == 0 and diff_stat:
+ return diff_stat.split("\n")[0][:50]
+ return "experiment"
-def run_single(project_root, experiment_dir, config, exp_num, dry_run=False):
+def read_last_lines(filepath, n=5):
+ """Read last n lines of a file (replaces tail shell command)."""
+ path = Path(filepath)
+ if not path.exists():
+ return ""
+ lines = path.read_text().splitlines()
+ return "\n".join(lines[-n:])
+
+
+def run_single(project_root, experiment_dir, config, exp_num, dry_run=False, description=None):
"""Run one experiment iteration."""
direction = config.get("metric_direction", "lower")
metric_grep = config.get("metric_grep", "^metric:")
@@ -177,11 +181,9 @@ def run_single(project_root, experiment_dir, config, exp_num, dry_run=False):
print(" [DRY RUN] Would run evaluation and check metric")
return "dry_run"
- # Save state for rollback
- code, pre_commit, _ = run_cmd("git rev-parse HEAD", cwd=str(project_root))
- if code != 0:
- print(" Error: can't get git state")
- return "error"
+ # Auto-generate description if not provided
+ if not description:
+ description = get_description_from_diff(str(project_root))
# Run evaluation
print(f" Running: {eval_cmd} (budget: {time_budget}m)")
@@ -192,17 +194,17 @@ def run_single(project_root, experiment_dir, config, exp_num, dry_run=False):
# Timeout
if ret_code == -1:
print(f" TIMEOUT after {elapsed:.0f}s — discarding")
- run_cmd("git checkout -- .", cwd=str(project_root))
- run_cmd(f"git reset --hard {pre_commit}", cwd=str(project_root))
+ run_git(["checkout", "--", "."], cwd=str(project_root))
+ run_git(["reset", "--hard", "HEAD~1"], cwd=str(project_root))
log_result(experiment_dir, commit, None, "crash", f"timeout_{elapsed:.0f}s")
return "crash"
# Crash
if ret_code != 0:
- _, tail, _ = run_cmd(f"tail -5 {log_file}", cwd=str(project_root))
+ tail = read_last_lines(log_file, 5)
print(f" CRASH (exit {ret_code}) after {elapsed:.0f}s")
print(f" Last output: {tail[:200]}")
- run_cmd(f"git reset --hard {pre_commit}", cwd=str(project_root))
+ run_git(["reset", "--hard", "HEAD~1"], cwd=str(project_root))
log_result(experiment_dir, commit, None, "crash", f"exit_{ret_code}")
return "crash"
@@ -210,7 +212,7 @@ def run_single(project_root, experiment_dir, config, exp_num, dry_run=False):
metric_val = extract_metric(log_file, metric_grep)
if metric_val is None:
print(f" Could not parse {metric_name} from run.log")
- run_cmd(f"git reset --hard {pre_commit}", cwd=str(project_root))
+ run_git(["reset", "--hard", "HEAD~1"], cwd=str(project_root))
log_result(experiment_dir, commit, None, "crash", "metric_parse_failed")
return "crash"
@@ -224,63 +226,23 @@ def run_single(project_root, experiment_dir, config, exp_num, dry_run=False):
# Keep or discard
if is_improvement(metric_val, best, direction):
print(f" KEEP — improvement")
- log_result(experiment_dir, commit, metric_val, "keep",
- f"improved_{metric_name}_{metric_val:.4f}")
+ log_result(experiment_dir, commit, metric_val, "keep", description)
return "keep"
else:
print(f" DISCARD — no improvement")
- run_cmd(f"git reset --hard {pre_commit}", cwd=str(project_root))
- best_str = f"{best:.4f}" if best else "?"
+ run_git(["reset", "--hard", "HEAD~1"], cwd=str(project_root))
+ best_str = f"{best:.4f}" if best is not None else "?"
log_result(experiment_dir, commit, metric_val, "discard",
f"no_improvement_{metric_val:.4f}_vs_{best_str}")
return "discard"
-def print_summary(experiment_dir, config):
- """Print session summary."""
- tsv = experiment_dir / "results.tsv"
- if not tsv.exists():
- return
- lines = tsv.read_text().splitlines()[1:]
- if not lines:
- return
-
- keeps = [l for l in lines if "\tkeep\t" in l]
- discards = [l for l in lines if "\tdiscard\t" in l]
- crashes = [l for l in lines if "\tcrash\t" in l]
- metric_name = config.get("metric", "metric")
- direction = config.get("metric_direction", "lower")
-
- print(f"\n{'=' * 55}")
- print(f" autoresearch — Session Summary")
- print(f" Experiments: {len(lines)} total")
- print(f" Keep: {len(keeps)} | Discard: {len(discards)} | Crash: {len(crashes)}")
-
- if keeps:
- try:
- valid = []
- for l in keeps:
- parts = l.split("\t")
- if parts[1] != "N/A":
- valid.append(float(parts[1]))
- if len(valid) >= 2:
- first, last = valid[0], valid[-1]
- best = min(valid) if direction == "lower" else max(valid)
- pct = ((first - best) / first * 100) if direction == "lower" else ((best - first) / first * 100)
- print(f" {metric_name}: {first:.6f} -> {best:.6f} ({pct:+.1f}%)")
- except (ValueError, IndexError):
- pass
- print(f"{'=' * 55}\n")
-
-
def main():
parser = argparse.ArgumentParser(description="autoresearch-agent runner")
parser.add_argument("--experiment", help="Experiment path: domain/name (e.g. engineering/api-speed)")
- parser.add_argument("--resume", action="store_true", help="Resume last active experiment")
- parser.add_argument("--loop", action="store_true", help="Run forever")
- parser.add_argument("--single", action="store_true", help="Run one experiment")
+ parser.add_argument("--single", action="store_true", help="Run one experiment iteration")
parser.add_argument("--dry-run", action="store_true", help="Show what would happen")
- parser.add_argument("--max-experiments", type=int, default=0, help="Max experiments (0 = unlimited)")
+ parser.add_argument("--description", help="Description of the change (auto-generated from git diff if omitted)")
parser.add_argument("--path", default=".", help="Project root")
args = parser.parse_args()
@@ -291,20 +253,11 @@ def main():
print("No .autoresearch/ found. Run setup_experiment.py first.")
sys.exit(1)
- # Resolve experiment
- experiment_path = args.experiment
- if args.resume:
- experiment_path = get_last_active(root)
- if not experiment_path:
- print("No experiments found to resume.")
- sys.exit(1)
- print(f"Resuming: {experiment_path}")
-
- if not experiment_path:
- print("Specify --experiment domain/name or --resume")
+ if not args.experiment:
+ print("Specify --experiment domain/name")
sys.exit(1)
- experiment_dir = root / experiment_path
+ experiment_dir = root / args.experiment
if not experiment_dir.exists():
print(f"Experiment not found: {experiment_dir}")
print("Run: python scripts/setup_experiment.py --list")
@@ -312,56 +265,15 @@ def main():
config = load_config(experiment_dir)
- domain, name = experiment_path.split("/", 1)
print(f"\n autoresearch-agent")
- print(f" Experiment: {experiment_path}")
+ print(f" Experiment: {args.experiment}")
print(f" Target: {config.get('target', '?')}")
print(f" Metric: {config.get('metric', '?')} ({config.get('metric_direction', '?')} is better)")
print(f" Budget: {config.get('time_budget_minutes', '?')} min/experiment")
- print(f" Mode: {'loop' if args.loop else 'single'}")
+ print(f" Mode: {'dry-run' if args.dry_run else 'single'}")
- if args.single or args.dry_run:
- exp_num = get_experiment_count(experiment_dir) + 1
- run_single(project_root, experiment_dir, config, exp_num, args.dry_run)
- return
-
- if not args.loop:
- print("\nSpecify --loop (forever) or --single (one experiment)")
- sys.exit(1)
-
- # Graceful shutdown
- def handle_interrupt(sig, frame):
- print_summary(experiment_dir, config)
- print("\nStopped by user.")
- sys.exit(0)
-
- signal.signal(signal.SIGINT, handle_interrupt)
- signal.signal(signal.SIGTERM, handle_interrupt)
-
- consecutive_crashes = 0
exp_num = get_experiment_count(experiment_dir) + 1
-
- print(f"\nStarting loop. Ctrl+C to stop.\n")
-
- while True:
- result = run_single(project_root, experiment_dir, config, exp_num, False)
- exp_num += 1
-
- if result == "crash":
- consecutive_crashes += 1
- else:
- consecutive_crashes = 0
-
- if consecutive_crashes >= 5:
- print("\n 5 consecutive crashes. Pausing.")
- print(" Check .autoresearch/{}/run.log".format(experiment_path))
- break
-
- if 0 < args.max_experiments < exp_num:
- print(f"\n Reached max experiments ({args.max_experiments})")
- break
-
- print_summary(experiment_dir, config)
+ run_single(project_root, experiment_dir, config, exp_num, args.dry_run, args.description)
if __name__ == "__main__":
diff --git a/engineering/autoresearch-agent/scripts/setup_experiment.py b/engineering/autoresearch-agent/scripts/setup_experiment.py
index 2029775..ab15a5d 100644
--- a/engineering/autoresearch-agent/scripts/setup_experiment.py
+++ b/engineering/autoresearch-agent/scripts/setup_experiment.py
@@ -19,11 +19,9 @@ Usage:
"""
import argparse
-import os
import shutil
import subprocess
import sys
-import time
from datetime import datetime
from pathlib import Path
@@ -159,13 +157,19 @@ def copy_evaluator(experiment_dir, evaluator_name):
def create_branch(path, domain, name):
"""Create and checkout the experiment branch."""
branch = f"autoresearch/{domain}/{name}"
- code, _, err = run_cmd(f"git checkout -b {branch}", cwd=path)
- if code != 0:
- if "already exists" in err:
+ result = subprocess.run(
+ ["git", "checkout", "-b", branch],
+ cwd=path, capture_output=True, text=True
+ )
+ if result.returncode != 0:
+ if "already exists" in result.stderr:
print(f" Branch '{branch}' already exists. Checking out...")
- run_cmd(f"git checkout {branch}", cwd=path)
+ subprocess.run(
+ ["git", "checkout", branch],
+ cwd=path, capture_output=True, text=True
+ )
return branch
- print(f" Warning: could not create branch: {err}")
+ print(f" Warning: could not create branch: {result.stderr}")
return None
print(f" Created branch: {branch}")
return branch
@@ -229,10 +233,17 @@ def list_evaluators():
# Read first docstring line
desc = ""
for line in f.read_text().splitlines():
- if line.strip().startswith('"""') or line.strip().startswith("'''"):
+ stripped = line.strip()
+ if stripped.startswith('"""') or stripped.startswith("'''"):
+ quote = stripped[:3]
+ # Single-line docstring: """Description."""
+ after_quote = stripped[3:]
+ if after_quote and after_quote.rstrip(quote[0]).strip():
+ desc = after_quote.rstrip('"').rstrip("'").strip()
+ break
continue
- if line.strip() and not line.startswith("#!"):
- desc = line.strip().strip('"').strip("'")
+ if stripped and not line.startswith("#!"):
+ desc = stripped.strip('"').strip("'")
break
print(f" {f.stem:<25} {desc}")
@@ -252,7 +263,6 @@ def main():
help="Where to store experiments: project (./) or user (~/)")
parser.add_argument("--constraints", default="", help="Additional constraints for program.md")
parser.add_argument("--path", default=".", help="Project root path")
- parser.add_argument("--skip-baseline", action="store_true", help="Skip baseline run")
parser.add_argument("--skip-branch", action="store_true", help="Don't create git branch")
parser.add_argument("--list", action="store_true", help="List all experiments")
parser.add_argument("--list-evaluators", action="store_true", help="List available evaluators")
@@ -288,7 +298,11 @@ def main():
print(f" Time: {datetime.now().strftime('%Y-%m-%d %H:%M')}\n")
# Check git
- code, _, _ = run_cmd("git rev-parse --is-inside-work-tree", cwd=str(project_root))
+ result = subprocess.run(
+ ["git", "rev-parse", "--is-inside-work-tree"],
+ cwd=str(project_root), capture_output=True, text=True
+ )
+ code = result.returncode
if code != 0:
print(" Error: not a git repository. Run: git init && git add . && git commit -m 'initial'")
sys.exit(1)
@@ -362,7 +376,7 @@ def main():
if not args.skip_branch:
print(f" Branch: autoresearch/{args.domain}/{args.name}")
print(f"\n To start:")
- print(f" python scripts/run_experiment.py --experiment {args.domain}/{args.name} --loop")
+ print(f" python scripts/run_experiment.py --experiment {args.domain}/{args.name} --single")
if __name__ == "__main__":
diff --git a/engineering/autoresearch-agent/settings.json b/engineering/autoresearch-agent/settings.json
new file mode 100644
index 0000000..cb73087
--- /dev/null
+++ b/engineering/autoresearch-agent/settings.json
@@ -0,0 +1,22 @@
+{
+ "name": "autoresearch-agent",
+ "displayName": "Autoresearch Agent",
+ "version": "2.1.2",
+ "description": "Autonomous experiment loop — optimize any file by a measurable metric.",
+ "author": "Alireza Rezvani",
+ "license": "MIT",
+ "platforms": ["claude-code", "openclaw", "codex"],
+ "category": "engineering",
+ "tags": ["optimization", "experiments", "benchmarks", "autoresearch", "loop", "metrics"],
+ "repository": "https://github.com/alirezarezvani/claude-skills",
+ "commands": {
+ "setup": "/ar:setup",
+ "run": "/ar:run",
+ "loop": "/ar:loop",
+ "status": "/ar:status",
+ "resume": "/ar:resume"
+ },
+ "agents": [
+ "experiment-runner"
+ ]
+}
diff --git a/engineering/autoresearch-agent/skills/loop/SKILL.md b/engineering/autoresearch-agent/skills/loop/SKILL.md
new file mode 100644
index 0000000..cd07d8b
--- /dev/null
+++ b/engineering/autoresearch-agent/skills/loop/SKILL.md
@@ -0,0 +1,122 @@
+---
+name: "loop"
+description: "Start an autonomous experiment loop with user-selected interval (10min, 1h, daily, weekly, monthly). Uses CronCreate for scheduling."
+command: /ar:loop
+---
+
+# /ar:loop — Autonomous Experiment Loop
+
+Start a recurring experiment loop that runs at a user-selected interval.
+
+## Usage
+
+```
+/ar:loop engineering/api-speed # Start loop (prompts for interval)
+/ar:loop engineering/api-speed 10m # Every 10 minutes
+/ar:loop engineering/api-speed 1h # Every hour
+/ar:loop engineering/api-speed daily # Daily at ~9am
+/ar:loop engineering/api-speed weekly # Weekly on Monday ~9am
+/ar:loop engineering/api-speed monthly # Monthly on 1st ~9am
+/ar:loop stop engineering/api-speed # Stop an active loop
+```
+
+## What It Does
+
+### Step 1: Resolve experiment
+
+If no experiment specified, list experiments and let user pick.
+
+### Step 2: Select interval
+
+If interval not provided as argument, present options:
+
+```
+Select loop interval:
+ 1. Every 10 minutes (rapid — stay and watch)
+ 2. Every hour (background — check back later)
+ 3. Daily at ~9am (overnight experiments)
+ 4. Weekly on Monday (long-running experiments)
+ 5. Monthly on 1st (slow experiments)
+```
+
+Map to cron expressions:
+
+| Interval | Cron Expression | Shorthand |
+|----------|----------------|-----------|
+| 10 minutes | `*/10 * * * *` | `10m` |
+| 1 hour | `7 * * * *` | `1h` |
+| Daily | `57 8 * * *` | `daily` |
+| Weekly | `57 8 * * 1` | `weekly` |
+| Monthly | `57 8 1 * *` | `monthly` |
+
+### Step 3: Create the recurring job
+
+Use `CronCreate` with this prompt (fill in the experiment details):
+
+```
+You are running autoresearch experiment "{domain}/{name}".
+
+1. Read .autoresearch/{domain}/{name}/config.cfg for: target, evaluate_cmd, metric, metric_direction
+2. Read .autoresearch/{domain}/{name}/program.md for strategy and constraints
+3. Read .autoresearch/{domain}/{name}/results.tsv for experiment history
+4. Run: git checkout autoresearch/{domain}/{name}
+
+Then do exactly ONE iteration:
+- Review results.tsv: what worked, what failed, what hasn't been tried
+- Edit the target file with ONE change (strategy escalation based on run count)
+- Commit: git add {target} && git commit -m "experiment: {description}"
+- Evaluate: python {skill_path}/scripts/run_experiment.py --experiment {domain}/{name} --single
+- Read the output (KEEP/DISCARD/CRASH)
+
+Rules:
+- ONE change per experiment
+- NEVER modify the evaluator
+- If the last 5 entries in results.tsv are all crashes, delete this cron job (CronDelete) and alert the user
+- After every 10 experiments, update the Strategy section of program.md
+
+Current best metric: {read from results.tsv or "no baseline yet"}
+Total experiments so far: {count from results.tsv}
+```
+
+### Step 4: Store loop metadata
+
+Write to `.autoresearch/{domain}/{name}/loop.json`:
+
+```json
+{
+ "cron_id": "{id from CronCreate}",
+ "interval": "{user selection}",
+ "started": "{ISO timestamp}",
+ "experiment": "{domain}/{name}"
+}
+```
+
+### Step 5: Confirm to user
+
+```
+Loop started for {domain}/{name}
+ Interval: {interval description}
+ Cron ID: {id}
+ Auto-expires: 3 days (CronCreate limit)
+
+ To check progress: /ar:status
+ To stop the loop: /ar:loop stop {domain}/{name}
+
+ Note: Recurring jobs auto-expire after 3 days.
+ Run /ar:loop again to restart after expiry.
+```
+
+## Stopping a Loop
+
+When user runs `/ar:loop stop {experiment}`:
+
+1. Read `.autoresearch/{domain}/{name}/loop.json` to get the cron ID
+2. Call `CronDelete` with that ID
+3. Delete `loop.json`
+4. Confirm: "Loop stopped for {experiment}. {n} experiments completed."
+
+## Important Limitations
+
+- **3-day auto-expiry**: CronCreate jobs expire after 3 days. For longer experiments, the user must re-run `/ar:loop` to restart. Results persist — the new loop picks up where the old one left off.
+- **One loop per experiment**: Don't start multiple loops for the same experiment.
+- **Concurrent experiments**: Each experiment gets its own branch (`autoresearch/{domain}/{name}`), but a single working tree can only have one branch checked out at a time — run concurrent loops from separate clones or `git worktree` checkouts, never from the same directory.
diff --git a/engineering/autoresearch-agent/skills/resume/SKILL.md b/engineering/autoresearch-agent/skills/resume/SKILL.md
new file mode 100644
index 0000000..48bc7f7
--- /dev/null
+++ b/engineering/autoresearch-agent/skills/resume/SKILL.md
@@ -0,0 +1,77 @@
+---
+name: "resume"
+description: "Resume a paused experiment. Checkout the experiment branch, read results history, continue iterating."
+command: /ar:resume
+---
+
+# /ar:resume — Resume Experiment
+
+Resume a paused or context-limited experiment. Reads all history and continues where you left off.
+
+## Usage
+
+```
+/ar:resume # List experiments, let user pick
+/ar:resume engineering/api-speed # Resume specific experiment
+```
+
+## What It Does
+
+### Step 1: List experiments if needed
+
+If no experiment specified:
+
+```bash
+python {skill_path}/scripts/setup_experiment.py --list
+```
+
+Show status for each (active/paused/done based on results.tsv age). Let user pick.
+
+### Step 2: Load full context
+
+```bash
+# Checkout the experiment branch
+git checkout autoresearch/{domain}/{name}
+
+# Read config
+cat .autoresearch/{domain}/{name}/config.cfg
+
+# Read strategy
+cat .autoresearch/{domain}/{name}/program.md
+
+# Read full results history
+cat .autoresearch/{domain}/{name}/results.tsv
+
+# Read recent git log for the branch
+git log --oneline -20
+```
+
+### Step 3: Report current state
+
+Summarize for the user:
+
+```
+Resuming: engineering/api-speed
+ Target: src/api/search.py
+ Metric: p50_ms (lower is better)
+ Experiments: 23 total — 8 kept, 12 discarded, 3 crashed
+ Best: 185ms (-42% from baseline of 320ms)
+ Last experiment: "added response caching" → KEEP (185ms)
+
+ Recent patterns:
+ - Caching changes: 3 kept, 1 discarded (consistently helpful)
+ - Algorithm changes: 2 discarded, 1 crashed (high risk, low reward so far)
+ - I/O optimization: 2 kept (promising direction)
+```
+
+### Step 4: Ask next action
+
+```
+How would you like to continue?
+ 1. Single iteration (/ar:run) — I'll make one change and evaluate
+ 2. Start a loop (/ar:loop) — Autonomous with scheduled interval
+ 3. Just show me the results — I'll review and decide
+```
+
+If the user picks loop, hand off to `/ar:loop` with the experiment pre-selected.
+If single, hand off to `/ar:run`.
diff --git a/engineering/autoresearch-agent/skills/run/SKILL.md b/engineering/autoresearch-agent/skills/run/SKILL.md
new file mode 100644
index 0000000..4a9caff
--- /dev/null
+++ b/engineering/autoresearch-agent/skills/run/SKILL.md
@@ -0,0 +1,84 @@
+---
+name: "run"
+description: "Run a single experiment iteration. Edit the target file, evaluate, keep or discard."
+command: /ar:run
+---
+
+# /ar:run — Single Experiment Iteration
+
+Run exactly ONE experiment iteration: review history, decide a change, edit, commit, evaluate.
+
+## Usage
+
+```
+/ar:run engineering/api-speed # Run one iteration
+/ar:run # List experiments, let user pick
+```
+
+## What It Does
+
+### Step 1: Resolve experiment
+
+If no experiment specified, run `python {skill_path}/scripts/setup_experiment.py --list` and ask the user to pick.
+
+### Step 2: Load context
+
+```bash
+# Read experiment config
+cat .autoresearch/{domain}/{name}/config.cfg
+
+# Read strategy and constraints
+cat .autoresearch/{domain}/{name}/program.md
+
+# Read experiment history
+cat .autoresearch/{domain}/{name}/results.tsv
+
+# Checkout the experiment branch
+git checkout autoresearch/{domain}/{name}
+```
+
+### Step 3: Decide what to try
+
+Review results.tsv:
+- What changes were kept? What pattern do they share?
+- What was discarded? Avoid repeating those approaches.
+- What crashed? Understand why.
+- How many runs so far? (Escalate strategy accordingly)
+
+**Strategy escalation:**
+- Runs 1-5: Low-hanging fruit (obvious improvements)
+- Runs 6-15: Systematic exploration (vary one parameter)
+- Runs 16-30: Structural changes (algorithm swaps)
+- Runs 30+: Radical experiments (completely different approaches)
+
+### Step 4: Make ONE change
+
+Edit only the target file specified in config.cfg. Change one thing. Keep it simple.
+
+### Step 5: Commit and evaluate
+
+```bash
+git add {target}
+git commit -m "experiment: {short description of what changed}"
+
+python {skill_path}/scripts/run_experiment.py \
+ --experiment {domain}/{name} --single
+```
+
+### Step 6: Report result
+
+Read the script output. Tell the user:
+- **KEEP**: "Improvement! {metric}: {value} ({delta} from previous best)"
+- **DISCARD**: "No improvement. {metric}: {value} vs best {best}. Reverted."
+- **CRASH**: "Evaluation failed: {reason}. Reverted."
+
+### Step 7: Self-improvement check
+
+After every 10th experiment (check results.tsv line count), update the Strategy section of program.md with patterns learned.
+
+## Rules
+
+- ONE change per iteration. Don't change 5 things at once.
+- NEVER modify the evaluator (evaluate.py). It's ground truth.
+- Simplicity wins. Equal performance with simpler code is an improvement.
+- No new dependencies.
diff --git a/engineering/autoresearch-agent/skills/setup/SKILL.md b/engineering/autoresearch-agent/skills/setup/SKILL.md
new file mode 100644
index 0000000..15d42d2
--- /dev/null
+++ b/engineering/autoresearch-agent/skills/setup/SKILL.md
@@ -0,0 +1,77 @@
+---
+name: "setup"
+description: "Set up a new autoresearch experiment interactively. Collects domain, target file, eval command, metric, direction, and evaluator."
+command: /ar:setup
+---
+
+# /ar:setup — Create New Experiment
+
+Set up a new autoresearch experiment with all required configuration.
+
+## Usage
+
+```
+/ar:setup # Interactive mode
+/ar:setup engineering api-speed src/api.py "pytest bench.py" p50_ms lower
+/ar:setup --list # Show existing experiments
+/ar:setup --list-evaluators # Show available evaluators
+```
+
+## What It Does
+
+### If arguments provided
+
+Pass them directly to the setup script:
+
+```bash
+python {skill_path}/scripts/setup_experiment.py \
+ --domain {domain} --name {name} \
+ --target {target} --eval "{eval_cmd}" \
+ --metric {metric} --direction {direction} \
+ [--evaluator {evaluator}] [--scope {scope}]
+```
+
+### If no arguments (interactive mode)
+
+Collect each parameter one at a time:
+
+1. **Domain** — Ask: "What domain? (engineering, marketing, content, prompts, custom)"
+2. **Name** — Ask: "Experiment name? (e.g., api-speed, blog-titles)"
+3. **Target file** — Ask: "Which file to optimize?" Verify it exists.
+4. **Eval command** — Ask: "How to measure it? (e.g., pytest bench.py, python evaluate.py)"
+5. **Metric** — Ask: "What metric does the eval output? (e.g., p50_ms, ctr_score)"
+6. **Direction** — Ask: "Is lower or higher better?"
+7. **Evaluator** (optional) — Show built-in evaluators. Ask: "Use a built-in evaluator, or your own?"
+8. **Scope** — Ask: "Store in project (.autoresearch/) or user (~/.autoresearch/)?"
+
+Then run `setup_experiment.py` with the collected parameters.
+
+### Listing
+
+```bash
+# Show existing experiments
+python {skill_path}/scripts/setup_experiment.py --list
+
+# Show available evaluators
+python {skill_path}/scripts/setup_experiment.py --list-evaluators
+```
+
+## Built-in Evaluators
+
+| Name | Metric | Use Case |
+|------|--------|----------|
+| `benchmark_speed` | `p50_ms` (lower) | Function/API execution time |
+| `benchmark_size` | `size_bytes` (lower) | File, bundle, Docker image size |
+| `test_pass_rate` | `pass_rate` (higher) | Test suite pass percentage |
+| `build_speed` | `build_seconds` (lower) | Build/compile/Docker build time |
+| `memory_usage` | `peak_mb` (lower) | Peak memory during execution |
+| `llm_judge_content` | `ctr_score` (higher) | Headlines, titles, descriptions |
+| `llm_judge_prompt` | `quality_score` (higher) | System prompts, agent instructions |
+| `llm_judge_copy` | `engagement_score` (higher) | Social posts, ad copy, emails |
+
+## After Setup
+
+Report to the user:
+- Experiment path and branch name
+- Whether the eval command worked and the baseline metric
+- Suggest: "Run `/ar:run {domain}/{name}` to start iterating, or `/ar:loop {domain}/{name}` for autonomous mode."
diff --git a/engineering/autoresearch-agent/skills/status/SKILL.md b/engineering/autoresearch-agent/skills/status/SKILL.md
new file mode 100644
index 0000000..56b3ed4
--- /dev/null
+++ b/engineering/autoresearch-agent/skills/status/SKILL.md
@@ -0,0 +1,71 @@
+---
+name: "status"
+description: "Show experiment dashboard with results, active loops, and progress."
+command: /ar:status
+---
+
+# /ar:status — Experiment Dashboard
+
+Show experiment results, active loops, and progress across all experiments.
+
+## Usage
+
+```
+/ar:status # Full dashboard
+/ar:status engineering/api-speed # Single experiment detail
+/ar:status --domain engineering # All experiments in a domain
+/ar:status --format markdown # Export as markdown
+/ar:status --format csv --output results.csv # Export as CSV
+```
+
+## What It Does
+
+### Single experiment
+
+```bash
+python {skill_path}/scripts/log_results.py --experiment {domain}/{name}
+```
+
+Also check for active loop:
+```bash
+cat .autoresearch/{domain}/{name}/loop.json 2>/dev/null
+```
+
+If loop.json exists, show:
+```
+Active loop: every {interval} (cron ID: {id}, started: {date})
+```
+
+### Domain view
+
+```bash
+python {skill_path}/scripts/log_results.py --domain {domain}
+```
+
+### Full dashboard
+
+```bash
+python {skill_path}/scripts/log_results.py --dashboard
+```
+
+For each experiment, also check for loop.json and show loop status.
+
+### Export
+
+```bash
+# CSV
+python {skill_path}/scripts/log_results.py --dashboard --format csv --output {file}
+
+# Markdown
+python {skill_path}/scripts/log_results.py --dashboard --format markdown --output {file}
+```
+
+## Output Example
+
+```
+DOMAIN EXPERIMENT RUNS KEPT BEST CHANGE STATUS LOOP
+engineering api-speed 47 14 185ms -76.9% active every 1h
+engineering bundle-size 23 8 412KB -58.3% paused —
+marketing medium-ctr 31 11 8.4/10 +68.0% active daily
+prompts support-tone 15 6 82/100 +46.4% done —
+```
diff --git a/mkdocs.yml b/mkdocs.yml
index ee81ef8..ab3d823 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -1,6 +1,6 @@
site_name: Agent Skills — Skills, Agents & Personas for AI Coding Tools
site_url: https://alirezarezvani.github.io/claude-skills/
-site_description: "177 production-ready skills, 16 agents, 3 personas, and an orchestration protocol for 11 AI coding tools. Reusable expertise for engineering, product, marketing, compliance, and more."
+site_description: "177 production-ready skills, 17 agents, 3 personas, and an orchestration protocol for 11 AI coding tools. Reusable expertise for engineering, product, marketing, compliance, and more."
site_author: Alireza Rezvani
repo_url: https://github.com/alirezarezvani/claude-skills
repo_name: alirezarezvani/claude-skills
@@ -162,6 +162,12 @@ nav:
- "Tech Stack Evaluator": skills/engineering-team/tech-stack-evaluator.md
- Engineering - POWERFUL:
- Overview: skills/engineering/index.md
+ - "Autoresearch Agent": skills/engineering/autoresearch-agent.md
+ - "Autoresearch /ar:setup": skills/engineering/autoresearch-agent-setup.md
+ - "Autoresearch /ar:run": skills/engineering/autoresearch-agent-run.md
+ - "Autoresearch /ar:loop": skills/engineering/autoresearch-agent-loop.md
+ - "Autoresearch /ar:status": skills/engineering/autoresearch-agent-status.md
+ - "Autoresearch /ar:resume": skills/engineering/autoresearch-agent-resume.md
- "Agent Designer": skills/engineering/agent-designer.md
- "Agent Workflow Designer": skills/engineering/agent-workflow-designer.md
- "API Design Reviewer": skills/engineering/api-design-reviewer.md