# antigravity-skills-reference/skills/loki-mode/benchmarks/submission-template/metadata.yaml

# SWE-bench Submission Metadata
# For Loki Mode Multi-Agent System

# Model Information
# Identifies the submitted system. Every value is an explicitly quoted string.
# NOTE(review): base_model is spelled 'claude-opus-4-5-20251101' here while the
# agent entries under architecture use 'claude-opus-4.5' - confirm both refer
# to the same model.
model:
  name: 'loki-mode'
  version: '2.25.0'
  base_model: 'claude-opus-4-5-20251101'
  type: 'multi-agent-system'

# System Architecture
architecture:
  type: 'multi-agent-pipeline'
  # One entry per agent, listed in the pipeline order given in the notes
  # field (Architect -> Engineer -> QA -> Reviewer).
  # NOTE(review): timeout carries no unit here - presumably seconds; confirm.
  agents:
    - name: 'Architect'
      role: 'Analyze issue and design fix approach'
      model: 'claude-opus-4.5'
      timeout: 120
    - name: 'Engineer'
      # Double quotes kept on role: the text contains an apostrophe.
      role: "Generate patch based on architect's analysis"
      model: 'claude-opus-4.5'
      timeout: 300
    - name: 'QA'
      role: 'Validate patch format'
      model: 'rule-based'
      timeout: 5
    - name: 'Reviewer'
      role: 'Analyze issues and suggest fixes'
      model: 'claude-opus-4.5'
      timeout: 60

  # RARV Cycle (Reason-Act-Reflect-Verify)
  # NOTE(review): max_retries is presumably a per-problem retry cap; confirm.
  rarv:
    enabled: true
    max_retries: 3
    description: 'Self-verification loop that retries failed patches with reviewer feedback'

# Benchmark Configuration
# Names the dataset and split the results were produced on; total_problems
# has the same value as results.problems_total (300).
benchmark:
  dataset: 'SWE-bench_Lite'
  split: 'test'
  total_problems: 300

# Results Summary
# Recorded figures for the run. Several values are arithmetically related;
# keep them consistent when editing any one of them.
results:
  # Matches problems_solved / problems_total (299 / 300) as a percentage.
  patch_generation_rate: 99.67
  # NOTE(review): the rate above is a patch *generation* rate - confirm whether
  # "solved" here means "patch generated" or "tests passed".
  problems_solved: 299
  problems_total: 300
  # Zero RARV-fixed problems together with avg_attempts of 1.0: presumably no
  # retry was needed in this run - confirm.
  fixed_by_rarv: 0
  avg_attempts: 1.0
  total_time_seconds: 12600
  # Matches total_time_seconds / problems_total (12600 / 300).
  avg_time_per_problem_seconds: 42

# Submission Information
# The date is quoted on purpose: an unquoted ISO date may be loaded as a
# date object instead of a string by some YAML parsers.
submission:
  date: '2026-01-05'
  author: 'Loki Mode Team'
  repository: 'https://github.com/asklokesh/loki-mode'
  license: 'MIT'

# Contact
# NOTE(review): the email uses the example.com domain - looks like a
# placeholder; confirm before submitting.
contact:
  email: 'lokesh@example.com'
  github: 'asklokesh'

# Notes
# Free-form description of the system. Written as a literal block scalar (|),
# so line breaks are kept as written and the value ends with a single newline.
# Anything placed inside the block, including '#' text, becomes part of the value.
notes: |
  Loki Mode is a multi-agent system built as a Claude Code skill.
  It uses a 4-agent pipeline (Architect -> Engineer -> QA -> Reviewer)
  with a RARV (Reason-Act-Reflect-Verify) cycle for self-correction.

  Key features:
  - Multi-agent coordination for complex problem solving
  - Automatic retry with reviewer feedback on failures
  - Full trajectory logging for transparency
  - Matches single-agent performance after timeout optimization