1001 lines
18 KiB
JSON
1001 lines
18 KiB
JSON
{
|
|
"benchmark": "HumanEval-LokiMode",
|
|
"mode": "multi-agent",
|
|
"version": "1.0",
|
|
"timestamp": "2026-01-05T08:46:10.291133",
|
|
"model": "opus",
|
|
"max_retries": 3,
|
|
"total_problems": 164,
|
|
"problems": [
|
|
{
|
|
"task_id": "HumanEval/0",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/1",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/2",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/3",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/4",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/5",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/6",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/7",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/8",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/9",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/10",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/11",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/12",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/13",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/14",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/15",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/16",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/17",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/18",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/19",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/20",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/21",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/22",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/23",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/24",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/25",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/26",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/27",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/28",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/29",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/30",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/31",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/32",
|
|
"passed": false,
|
|
"attempts": 3,
|
|
"error": "Failed after 3 RARV attempts"
|
|
},
|
|
{
|
|
"task_id": "HumanEval/33",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/34",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/35",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/36",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/37",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/38",
|
|
"passed": true,
|
|
"attempts": 2,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/39",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/40",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/41",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/42",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/43",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/44",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/45",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/46",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/47",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/48",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/49",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/50",
|
|
"passed": false,
|
|
"attempts": 3,
|
|
"error": "Failed after 3 RARV attempts"
|
|
},
|
|
{
|
|
"task_id": "HumanEval/51",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/52",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/53",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/54",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/55",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/56",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/57",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/58",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/59",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/60",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/61",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/62",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/63",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/64",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/65",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/66",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/67",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/68",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/69",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/70",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/71",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/72",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/73",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/74",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/75",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/76",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/77",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/78",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/79",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/80",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/81",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/82",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/83",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/84",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/85",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/86",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/87",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/88",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/89",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/90",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/91",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/92",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/93",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/94",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/95",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/96",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/97",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/98",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/99",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/100",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/101",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/102",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/103",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/104",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/105",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/106",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/107",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/108",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/109",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/110",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/111",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/112",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/113",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/114",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/115",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/116",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/117",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/118",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/119",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/120",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/121",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/122",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/123",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/124",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/125",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/126",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/127",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/128",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/129",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/130",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/131",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/132",
|
|
"passed": true,
|
|
"attempts": 2,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/133",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/134",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/135",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/136",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/137",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/138",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/139",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/140",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/141",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/142",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/143",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/144",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/145",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/146",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/147",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/148",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/149",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/150",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/151",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/152",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/153",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/154",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/155",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/156",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/157",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/158",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/159",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/160",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/161",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/162",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
},
|
|
{
|
|
"task_id": "HumanEval/163",
|
|
"passed": true,
|
|
"attempts": 1,
|
|
"error": null
|
|
}
|
|
],
|
|
"passed": 162,
|
|
"failed": 0,
|
|
"errors": 2,
|
|
"pass_rate": 98.78048780487805,
|
|
"avg_attempts": 1.0365853658536586,
|
|
"elapsed_time": 2704.4724848270416
|
|
} |