Skip to content

Commit

Permalink
Update difficulty.json
Browse files Browse the repository at this point in the history
  • Loading branch information
zzh068 authored Dec 14, 2023
1 parent cb712b4 commit a5275f9
Showing 1 changed file with 291 additions and 1 deletion.
292 changes: 291 additions & 1 deletion agentboard/data/To_Release/difficulty.json
Original file line number Diff line number Diff line change
Expand Up @@ -275,6 +275,296 @@
}
}
},
{
"model": "GPT-3.5-Turbo",
"tasks": {
"Tool-Operation": {
"easy": {
"score": "13.3%",
"accuracy": "35.7%"
},
"hard": {
"score": "4.0%",
"accuracy": "38.0%"
},
"gap": {
"score": "9.3%",
"accuracy": "-2.3%"
}
},
"PDDL": {
"easy": {
"score": "8.3%",
"accuracy": "37.0%"
},
"hard": {
"score": "0.0%",
"accuracy": "6.9%"
},
"gap": {
"score": "8.3%",
"accuracy": "30.1%"
}
},
"AlfWorld": {
"easy": {
"score": "29.2%",
"accuracy": "40.3%"
},
"hard": {
"score": "14.5%",
"accuracy": "34.5%"
},
"gap": {
"score": "14.7%",
"accuracy": "5.8%"
}
},
"BabyAI": {
"easy": {
"score": "46.5%",
"accuracy": "56.0%"
},
"hard": {
"score": "15.4%",
"accuracy": "37.5%"
},
"gap": {
"score": "31.1%",
"accuracy": "18.5%"
}
},
"WebArena": {
"easy": {
"score": "12.0%",
"accuracy": "22.3%"
},
"hard": {
"score": "1.2%",
"accuracy": "26.2%"
},
"gap": {
"score": "10.8%",
"accuracy": "-3.9%"
}
},
"Jericho": {
"easy": {
"score": "10.0%",
"accuracy": "25.8%"
},
"hard": {
"score": "0.0%",
"accuracy": "14.0%"
},
"gap": {
"score": "10.0%",
"accuracy": "11.8%"
}
},
"Tool-Query": {
"easy": {
"score": "67.9%",
"accuracy": "80.8%"
},
"hard": {
"score": "25.0%",
"accuracy": "59.5%"
},
"gap": {
"score": "42.9%",
"accuracy": "21.3%"
}
},
"WebShop": {
"easy": {
"score": "37.9%",
"accuracy": "77.2%"
},
"hard": {
"score": "27.5%",
"accuracy": "74.5%"
},
"gap": {
"score": "10.4%",
"accuracy": "2.7%"
}
},
"ScienceWorld": {
"easy": {
"score": "44.1%",
"accuracy": "50.0%"
},
"hard": {
"score": "3.6%",
"accuracy": "20.9%"
},
"gap": {
"score": "40.5%",
"accuracy": "29.1%"
}
},
"Avg": {
"easy": {
"score": "29.9%",
"accuracy": "47.2%"
},
"hard": {
"score": "10.1%",
"accuracy": "34.7%"
},
"gap": {
"score": "19.8%",
"accuracy": "12.6%"
}
}
}
},
{
"model": "GPT-3.5-Turbo-16k",
"tasks": {
"Tool-Operation": {
"easy": {
"score": "33.3%",
"accuracy": "50.7%"
},
"hard": {
"score": "4.0%",
"accuracy": "32.9%"
},
"gap": {
"score": "29.3%",
"accuracy": "17.8%"
}
},
"PDDL": {
"easy": {
"score": "5.6%",
"accuracy": "37.7%"
},
"hard": {
"score": "0.0%",
"accuracy": "0.0%"
},
"gap": {
"score": "5.6%",
"accuracy": "37.7%"
}
},
"AlfWorld": {
"easy": {
"score": "12.5%",
"accuracy": "22.2%"
},
"hard": {
"score": "2.7%",
"accuracy": "25.9%"
},
"gap": {
"score": "9.8%",
"accuracy": "-3.7%"
}
},
"BabyAI": {
"easy": {
"score": "41.9%",
"accuracy": "50.2%"
},
"hard": {
"score": "7.7%",
"accuracy": "28.2%"
},
"gap": {
"score": "34.2%",
"accuracy": "22.0%"
}
},
"WebArena": {
"easy": {
"score": "18.7%",
"accuracy": "31.5%"
},
"hard": {
"score": "0.6%",
"accuracy": "20.2%"
},
"gap": {
"score": "18.1%",
"accuracy": "11.3%"
}
},
"Jericho": {
"easy": {
"score": "0.0%",
"accuracy": "18.3%"
},
"hard": {
"score": "0.0%",
"accuracy": "13.8%"
},
"gap": {
"score": "0.0%",
"accuracy": "4.5%"
}
},
"Tool-Query": {
"easy": {
"score": "46.4%",
"accuracy": "71.2%"
},
"hard": {
"score": "18.8%",
"accuracy": "48.5%"
},
"gap": {
"score": "27.6%",
"accuracy": "22.7%"
}
},
"WebShop": {
"easy": {
"score": "30.2%",
"accuracy": "75.1%"
},
"hard": {
"score": "21.7%",
"accuracy": "70.2%"
},
"gap": {
"score": "8.5%",
"accuracy": "4.9%"
}
},
"ScienceWorld": {
"easy": {
"score": "0.0%",
"accuracy": "1.0%"
},
"hard": {
"score": "0.0%",
"accuracy": "2.9%"
},
"gap": {
"score": "0.0%",
"accuracy": "-1.9%"
}
},
"Avg": {
"easy": {
"score": "21.0%",
"accuracy": "39.8%"
},
"hard": {
"score": "6.2%",
"accuracy": "27.0%"
},
"gap": {
"score": "14.8%",
"accuracy": "12.8%"
}
}
}
},
{
"model": "Text-Davinci-003",
"tasks": {
Expand Down Expand Up @@ -1421,4 +1711,4 @@
}
}
}
]
]

0 comments on commit a5275f9

Please sign in to comment.