From a5275f97542c5cd62f777963046e3f706192c774 Mon Sep 17 00:00:00 2001 From: zzh068 <128816176+zzh068@users.noreply.github.com> Date: Thu, 14 Dec 2023 13:19:56 +0800 Subject: [PATCH] Update difficulty.json --- agentboard/data/To_Release/difficulty.json | 292 ++++++++++++++++++++- 1 file changed, 291 insertions(+), 1 deletion(-) diff --git a/agentboard/data/To_Release/difficulty.json b/agentboard/data/To_Release/difficulty.json index 44e4ac5..4852d61 100644 --- a/agentboard/data/To_Release/difficulty.json +++ b/agentboard/data/To_Release/difficulty.json @@ -275,6 +275,296 @@ } } }, + { + "model": "GPT-3.5-Turbo", + "tasks": { + "Tool-Operation": { + "easy": { + "score": "13.3%", + "accuracy": "35.7%" + }, + "hard": { + "score": "4.0%", + "accuracy": "38.0%" + }, + "gap": { + "score": "9.3%", + "accuracy": "-2.3%" + } + }, + "PDDL": { + "easy": { + "score": "8.3%", + "accuracy": "37.0%" + }, + "hard": { + "score": "0.0%", + "accuracy": "6.9%" + }, + "gap": { + "score": "8.3%", + "accuracy": "30.1%" + } + }, + "AlfWorld": { + "easy": { + "score": "29.2%", + "accuracy": "40.3%" + }, + "hard": { + "score": "14.5%", + "accuracy": "34.5%" + }, + "gap": { + "score": "14.7%", + "accuracy": "5.8%" + } + }, + "BabyAI": { + "easy": { + "score": "46.5%", + "accuracy": "56.0%" + }, + "hard": { + "score": "15.4%", + "accuracy": "37.5%" + }, + "gap": { + "score": "31.1%", + "accuracy": "18.5%" + } + }, + "WebArena": { + "easy": { + "score": "12.0%", + "accuracy": "22.3%" + }, + "hard": { + "score": "1.2%", + "accuracy": "26.2%" + }, + "gap": { + "score": "10.8%", + "accuracy": "-3.9%" + } + }, + "Jericho": { + "easy": { + "score": "10.0%", + "accuracy": "25.8%" + }, + "hard": { + "score": "0.0%", + "accuracy": "14.0%" + }, + "gap": { + "score": "10.0%", + "accuracy": "11.8%" + } + }, + "Tool-Query": { + "easy": { + "score": "67.9%", + "accuracy": "80.8%" + }, + "hard": { + "score": "25.0%", + "accuracy": "59.5%" + }, + "gap": { + "score": "42.9%", + "accuracy": "21.3%" + } + }, + "WebShop": { + "easy": { + "score": "37.9%", + "accuracy": "77.2%" + }, + "hard": { + "score": "27.5%", + "accuracy": "74.5%" + }, + "gap": { + "score": "10.4%", + "accuracy": "2.7%" + } + }, + "ScienceWorld": { + "easy": { + "score": "44.1%", + "accuracy": "50.0%" + }, + "hard": { + "score": "3.6%", + "accuracy": "20.9%" + }, + "gap": { + "score": "40.5%", + "accuracy": "29.1%" + } + }, + "Avg": { + "easy": { + "score": "29.9%", + "accuracy": "47.2%" + }, + "hard": { + "score": "10.1%", + "accuracy": "34.7%" + }, + "gap": { + "score": "19.8%", + "accuracy": "12.6%" + } + } + } + }, + { + "model": "GPT-3.5-Turbo-16k", + "tasks": { + "Tool-Operation": { + "easy": { + "score": "33.3%", + "accuracy": "50.7%" + }, + "hard": { + "score": "4.0%", + "accuracy": "32.9%" + }, + "gap": { + "score": "29.3%", + "accuracy": "17.8%" + } + }, + "PDDL": { + "easy": { + "score": "5.6%", + "accuracy": "37.7%" + }, + "hard": { + "score": "0.0%", + "accuracy": "0.0%" + }, + "gap": { + "score": "5.6%", + "accuracy": "37.7%" + } + }, + "AlfWorld": { + "easy": { + "score": "12.5%", + "accuracy": "22.2%" + }, + "hard": { + "score": "2.7%", + "accuracy": "25.9%" + }, + "gap": { + "score": "9.8%", + "accuracy": "-3.7%" + } + }, + "BabyAI": { + "easy": { + "score": "41.9%", + "accuracy": "50.2%" + }, + "hard": { + "score": "7.7%", + "accuracy": "28.2%" + }, + "gap": { + "score": "34.2%", + "accuracy": "22.0%" + } + }, + "WebArena": { + "easy": { + "score": "18.7%", + "accuracy": "31.5%" + }, + "hard": { + "score": "0.6%", + "accuracy": "20.2%" + }, + "gap": { + "score": "18.1%", + "accuracy": "11.3%" + } + }, + "Jericho": { + "easy": { + "score": "0.0%", + "accuracy": "18.3%" + }, + "hard": { + "score": "0.0%", + "accuracy": "13.8%" + }, + "gap": { + "score": "0.0%", + "accuracy": "4.5%" + } + }, + "Tool-Query": { + "easy": { + "score": "46.4%", + "accuracy": "71.2%" + }, + "hard": { + "score": "18.8%", + "accuracy": "48.5%" + }, + "gap": { + "score": "27.6%", + "accuracy": "22.7%" + } + }, + "WebShop": { + "easy": { + "score": "30.2%", + "accuracy": "75.1%" + }, + "hard": { + "score": "21.7%", + "accuracy": "70.2%" + }, + "gap": { + "score": "8.5%", + "accuracy": "4.9%" + } + }, + "ScienceWorld": { + "easy": { + "score": "0.0%", + "accuracy": "1.0%" + }, + "hard": { + "score": "0.0%", + "accuracy": "2.9%" + }, + "gap": { + "score": "0.0%", + "accuracy": "-1.9%" + } + }, + "Avg": { + "easy": { + "score": "21.0%", + "accuracy": "39.8%" + }, + "hard": { + "score": "6.2%", + "accuracy": "27.0%" + }, + "gap": { + "score": "14.8%", + "accuracy": "12.8%" + } + } + } + }, { "model": "Text-Davinci-003", "tasks": { @@ -1421,4 +1711,4 @@ } } } -] \ No newline at end of file +]