Skip to content

Commit

Permalink
leaderboard: add results from paper
Browse files Browse the repository at this point in the history
  • Loading branch information
zhudotexe committed Mar 14, 2024
1 parent 4e81b67 commit 58487c0
Show file tree
Hide file tree
Showing 7 changed files with 609 additions and 0 deletions.
87 changes: 87 additions & 0 deletions leaderboard-submissions/results/claude.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
{
"_submission_hash": "",
"_results_hash": "",
"metadata": {
"name": "Claude 2.1",
"authors": "Anthropic",
"url": "https://www.anthropic.com/news/claude-2-1",
"citation": "Anthropic, 2023",
"type": "FOUNDATION",
"context": 200000
},
"closedbook": {
"acc": {
"loose": 0.34087118650894077,
"strict": 0.04143646408839779
},
"rouge": {
"rouge1": {
"precision": 0.5180287739482223,
"recall": 0.3878211545435264,
"fscore": 0.41162891337334295
},
"rouge2": {
"precision": 0.25046659737457144,
"recall": 0.19938401442157735,
"fscore": 0.20843439451844803
},
"rougeL": {
"precision": 0.4368122438044132,
"recall": 0.3228172228944878,
"fscore": 0.34362967985991916
}
},
"bleurt": 0.42636404607383255,
"gpt": 0.11049723756906077
},
"openbook": {
"acc": {
"loose": 0.47120748528585843,
"strict": 0.0856353591160221
},
"rouge": {
"rouge1": {
"precision": 0.2330077534090562,
"recall": 0.5457109366018758,
"fscore": 0.29497641803677077
},
"rouge2": {
"precision": 0.12616194413675985,
"recall": 0.2804704207979029,
"fscore": 0.15747334844988234
},
"rougeL": {
"precision": 0.19843856217260156,
"recall": 0.4759886498517577,
"fscore": 0.2527928960282501
}
},
"bleurt": 0.48486519522594484,
"gpt": 0.21823204419889503
},
"evidenceprovided": {
"acc": {
"loose": 0.6531674930336602,
"strict": 0.2154696132596685
},
"rouge": {
"rouge1": {
"precision": 0.32045520128134836,
"recall": 0.7180304376195379,
"fscore": 0.42300161074719367
},
"rouge2": {
"precision": 0.1979935998869878,
"recall": 0.4389654131557785,
"fscore": 0.26233077626475276
},
"rougeL": {
"precision": 0.2663865768311303,
"recall": 0.6115257311712957,
"fscore": 0.3538037743008641
}
},
"bleurt": 0.5081962022251187,
"gpt": 0.4696132596685083
}
}
87 changes: 87 additions & 0 deletions leaderboard-submissions/results/gpt-3.5-turbo.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
{
"_submission_hash": "",
"_results_hash": "",
"metadata": {
"name": "GPT-3.5-turbo",
"authors": "OpenAI",
"url": "https://platform.openai.com/docs/models/gpt-3-5-turbo",
"citation": "OpenAI, 2023",
"type": "FOUNDATION",
"context": 16384
},
"closedbook": {
"acc": {
"loose": 0.3979994873440785,
"strict": 0.058011049723756904
},
"rouge": {
"rouge1": {
"precision": 0.40734904473273553,
"recall": 0.45162239056919035,
"fscore": 0.4012819044350075
},
"rouge2": {
"precision": 0.22803347570177684,
"recall": 0.2518584000616542,
"fscore": 0.2271839790263482
},
"rougeL": {
"precision": 0.34711466296455995,
"recall": 0.38661536042587596,
"fscore": 0.34195490807715956
}
},
"bleurt": 0.4551408462926169,
"gpt": 0.14502762430939226
},
"openbook": {
"acc": {
"loose": 0.15469947220693514,
"strict": 0.031767955801104975
},
"rouge": {
"rouge1": {
"precision": 0.09699008021536212,
"recall": 0.19469418167728442,
"fscore": 0.11418755334242695
},
"rouge2": {
"precision": 0.040413694872158276,
"recall": 0.08488911320642317,
"fscore": 0.05075429578237907
},
"rougeL": {
"precision": 0.0835343704701655,
"recall": 0.1699066826399081,
"fscore": 0.09861238283404707
}
},
"bleurt": 0.33750931579022775,
"gpt": 0.07596685082872928
},
"evidenceprovided": {
"acc": {
"loose": 0.5165072478945334,
"strict": 0.10220994475138122
},
"rouge": {
"rouge1": {
"precision": 0.463516335636312,
"recall": 0.5689152894435296,
"fscore": 0.45546862566608826
},
"rouge2": {
"precision": 0.24946325633429448,
"recall": 0.31371309343477694,
"fscore": 0.25221468710915207
},
"rougeL": {
"precision": 0.3653434458264324,
"recall": 0.45283802498465286,
"fscore": 0.3581738799002991
}
},
"bleurt": 0.49659434195546154,
"gpt": 0.2430939226519337
}
}
87 changes: 87 additions & 0 deletions leaderboard-submissions/results/gpt-4-turbo.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
{
"_submission_hash": "",
"_results_hash": "",
"metadata": {
"name": "GPT-4-turbo",
"authors": "OpenAI",
"url": "https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo",
"citation": "OpenAI, 2023",
"type": "FOUNDATION",
"context": 128000
},
"closedbook": {
"acc": {
"loose": 0.4597296177579326,
"strict": 0.10082872928176796
},
"rouge": {
"rouge1": {
"precision": 0.5061057938990791,
"recall": 0.5171893148007345,
"fscore": 0.48194841967428786
},
"rouge2": {
"precision": 0.29559659009870354,
"recall": 0.30901380162874503,
"fscore": 0.290250346638784
},
"rougeL": {
"precision": 0.43012909178526687,
"recall": 0.440115792132854,
"fscore": 0.4091321043635482
}
},
"bleurt": 0.49333308180228125,
"gpt": 0.19889502762430938
},
"openbook": {
"acc": {
"loose": 0.4703468836864732,
"strict": 0.10911602209944751
},
"rouge": {
"rouge1": {
"precision": 0.32591691806739764,
"recall": 0.5274025346455309,
"fscore": 0.35589614186761614
},
"rouge2": {
"precision": 0.18949097802681353,
"recall": 0.28817587063251693,
"fscore": 0.20743984742825158
},
"rougeL": {
"precision": 0.28537777495423555,
"recall": 0.46967882341300493,
"fscore": 0.31379331104952
}
},
"bleurt": 0.4865400605212424,
"gpt": 0.26243093922651933
},
"evidenceprovided": {
"acc": {
"loose": 0.6281475667110468,
"strict": 0.1919889502762431
},
"rouge": {
"rouge1": {
"precision": 0.6321127734126731,
"recall": 0.6764341478682872,
"fscore": 0.6144612886380147
},
"rouge2": {
"precision": 0.3984152059157445,
"recall": 0.4292697483707228,
"fscore": 0.39470073987610177
},
"rougeL": {
"precision": 0.536314082392947,
"recall": 0.5794550216005496,
"fscore": 0.5226280415927149
}
},
"bleurt": 0.5809288421923614,
"gpt": 0.4129834254143646
}
}
87 changes: 87 additions & 0 deletions leaderboard-submissions/results/gpt-4.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
{
"_submission_hash": "",
"_results_hash": "",
"metadata": {
"name": "GPT-4",
"authors": "OpenAI",
"url": "https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo",
"citation": "OpenAI, 2023",
"type": "FOUNDATION",
"context": 8192
},
"closedbook": {
"acc": {
"loose": 0.3549958003169331,
"strict": 0.06629834254143646
},
"rouge": {
"rouge1": {
"precision": 0.3081323409240976,
"recall": 0.4026625098759217,
"fscore": 0.313123968945042
},
"rouge2": {
"precision": 0.1696203592964782,
"recall": 0.22556614456096988,
"fscore": 0.1769179306107922
},
"rougeL": {
"precision": 0.2618982426700318,
"recall": 0.3437078118709097,
"fscore": 0.26676820286543684
}
},
"bleurt": 0.4191427961588729,
"gpt": 0.14917127071823205
},
"openbook": {
"acc": {
"loose": 0.3152160636708284,
"strict": 0.05662983425414365
},
"rouge": {
"rouge1": {
"precision": 0.17356455306164578,
"recall": 0.39150580236680255,
"fscore": 0.20783575754357375
},
"rouge2": {
"precision": 0.09085032002525105,
"recall": 0.17549598793784338,
"fscore": 0.10625950758793883
},
"rougeL": {
"precision": 0.1532112048714597,
"recall": 0.34618346407417766,
"fscore": 0.18344705502411385
}
},
"bleurt": 0.4267584370109587,
"gpt": 0.1643646408839779
},
"evidenceprovided": {
"acc": {
"loose": 0.5456098545791228,
"strict": 0.143646408839779
},
"rouge": {
"rouge1": {
"precision": 0.4725312298278398,
"recall": 0.6085356260042157,
"fscore": 0.49998452049151393
},
"rouge2": {
"precision": 0.2815675162719931,
"recall": 0.35623992580949976,
"fscore": 0.3005037957568856
},
"rougeL": {
"precision": 0.38835972782295014,
"recall": 0.5045559933909823,
"fscore": 0.41272370572817085
}
},
"bleurt": 0.5298143146528723,
"gpt": 0.30386740331491713
}
}
Loading

0 comments on commit 58487c0

Please sign in to comment.