diff --git a/leaderboard-submissions/results/claude.json b/leaderboard-submissions/results/claude.json new file mode 100644 index 0000000..675be13 --- /dev/null +++ b/leaderboard-submissions/results/claude.json @@ -0,0 +1,87 @@ +{ + "_submission_hash": "", + "_results_hash": "", + "metadata": { + "name": "Claude 2.1", + "authors": "Anthropic", + "url": "https://www.anthropic.com/news/claude-2-1", + "citation": "Anthropic, 2023", + "type": "FOUNDATION", + "context": 200000 + }, + "closedbook": { + "acc": { + "loose": 0.34087118650894077, + "strict": 0.04143646408839779 + }, + "rouge": { + "rouge1": { + "precision": 0.5180287739482223, + "recall": 0.3878211545435264, + "fscore": 0.41162891337334295 + }, + "rouge2": { + "precision": 0.25046659737457144, + "recall": 0.19938401442157735, + "fscore": 0.20843439451844803 + }, + "rougeL": { + "precision": 0.4368122438044132, + "recall": 0.3228172228944878, + "fscore": 0.34362967985991916 + } + }, + "bleurt": 0.42636404607383255, + "gpt": 0.11049723756906077 + }, + "openbook": { + "acc": { + "loose": 0.47120748528585843, + "strict": 0.0856353591160221 + }, + "rouge": { + "rouge1": { + "precision": 0.2330077534090562, + "recall": 0.5457109366018758, + "fscore": 0.29497641803677077 + }, + "rouge2": { + "precision": 0.12616194413675985, + "recall": 0.2804704207979029, + "fscore": 0.15747334844988234 + }, + "rougeL": { + "precision": 0.19843856217260156, + "recall": 0.4759886498517577, + "fscore": 0.2527928960282501 + } + }, + "bleurt": 0.48486519522594484, + "gpt": 0.21823204419889503 + }, + "evidenceprovided": { + "acc": { + "loose": 0.6531674930336602, + "strict": 0.2154696132596685 + }, + "rouge": { + "rouge1": { + "precision": 0.32045520128134836, + "recall": 0.7180304376195379, + "fscore": 0.42300161074719367 + }, + "rouge2": { + "precision": 0.1979935998869878, + "recall": 0.4389654131557785, + "fscore": 0.26233077626475276 + }, + "rougeL": { + "precision": 0.2663865768311303, + "recall": 0.6115257311712957, + "fscore": 0.3538037743008641 + } + }, + "bleurt": 0.5081962022251187, + "gpt": 0.4696132596685083 + } +} \ No newline at end of file diff --git a/leaderboard-submissions/results/gpt-3.5-turbo.json b/leaderboard-submissions/results/gpt-3.5-turbo.json new file mode 100644 index 0000000..ea1e9dd --- /dev/null +++ b/leaderboard-submissions/results/gpt-3.5-turbo.json @@ -0,0 +1,87 @@ +{ + "_submission_hash": "", + "_results_hash": "", + "metadata": { + "name": "GPT-3.5-turbo", + "authors": "OpenAI", + "url": "https://platform.openai.com/docs/models/gpt-3-5-turbo", + "citation": "OpenAI, 2023", + "type": "FOUNDATION", + "context": 16384 + }, + "closedbook": { + "acc": { + "loose": 0.3979994873440785, + "strict": 0.058011049723756904 + }, + "rouge": { + "rouge1": { + "precision": 0.40734904473273553, + "recall": 0.45162239056919035, + "fscore": 0.4012819044350075 + }, + "rouge2": { + "precision": 0.22803347570177684, + "recall": 0.2518584000616542, + "fscore": 0.2271839790263482 + }, + "rougeL": { + "precision": 0.34711466296455995, + "recall": 0.38661536042587596, + "fscore": 0.34195490807715956 + } + }, + "bleurt": 0.4551408462926169, + "gpt": 0.14502762430939226 + }, + "openbook": { + "acc": { + "loose": 0.15469947220693514, + "strict": 0.031767955801104975 + }, + "rouge": { + "rouge1": { + "precision": 0.09699008021536212, + "recall": 0.19469418167728442, + "fscore": 0.11418755334242695 + }, + "rouge2": { + "precision": 0.040413694872158276, + "recall": 0.08488911320642317, + "fscore": 0.05075429578237907 + }, + "rougeL": { + "precision": 0.0835343704701655, + "recall": 0.1699066826399081, + "fscore": 0.09861238283404707 + } + }, + "bleurt": 0.33750931579022775, + "gpt": 0.07596685082872928 + }, + "evidenceprovided": { + "acc": { + "loose": 0.5165072478945334, + "strict": 0.10220994475138122 + }, + "rouge": { + "rouge1": { + "precision": 0.463516335636312, + "recall": 0.5689152894435296, + "fscore": 0.45546862566608826 + }, + "rouge2": { + "precision": 0.24946325633429448, + "recall": 0.31371309343477694, + "fscore": 0.25221468710915207 + }, + "rougeL": { + "precision": 0.3653434458264324, + "recall": 0.45283802498465286, + "fscore": 0.3581738799002991 + } + }, + "bleurt": 0.49659434195546154, + "gpt": 0.2430939226519337 + } +} \ No newline at end of file diff --git a/leaderboard-submissions/results/gpt-4-turbo.json b/leaderboard-submissions/results/gpt-4-turbo.json new file mode 100644 index 0000000..9e3297c --- /dev/null +++ b/leaderboard-submissions/results/gpt-4-turbo.json @@ -0,0 +1,87 @@ +{ + "_submission_hash": "", + "_results_hash": "", + "metadata": { + "name": "GPT-4-turbo", + "authors": "OpenAI", + "url": "https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo", + "citation": "OpenAI, 2023", + "type": "FOUNDATION", + "context": 128000 + }, + "closedbook": { + "acc": { + "loose": 0.4597296177579326, + "strict": 0.10082872928176796 + }, + "rouge": { + "rouge1": { + "precision": 0.5061057938990791, + "recall": 0.5171893148007345, + "fscore": 0.48194841967428786 + }, + "rouge2": { + "precision": 0.29559659009870354, + "recall": 0.30901380162874503, + "fscore": 0.290250346638784 + }, + "rougeL": { + "precision": 0.43012909178526687, + "recall": 0.440115792132854, + "fscore": 0.4091321043635482 + } + }, + "bleurt": 0.49333308180228125, + "gpt": 0.19889502762430938 + }, + "openbook": { + "acc": { + "loose": 0.4703468836864732, + "strict": 0.10911602209944751 + }, + "rouge": { + "rouge1": { + "precision": 0.32591691806739764, + "recall": 0.5274025346455309, + "fscore": 0.35589614186761614 + }, + "rouge2": { + "precision": 0.18949097802681353, + "recall": 0.28817587063251693, + "fscore": 0.20743984742825158 + }, + "rougeL": { + "precision": 0.28537777495423555, + "recall": 0.46967882341300493, + "fscore": 0.31379331104952 + } + }, + "bleurt": 0.4865400605212424, + "gpt": 0.26243093922651933 + }, + "evidenceprovided": { + "acc": { + "loose": 0.6281475667110468, + "strict": 0.1919889502762431 + }, + "rouge": { + "rouge1": { + "precision": 0.6321127734126731, + "recall": 0.6764341478682872, + "fscore": 0.6144612886380147 + }, + "rouge2": { + "precision": 0.3984152059157445, + "recall": 0.4292697483707228, + "fscore": 0.39470073987610177 + }, + "rougeL": { + "precision": 0.536314082392947, + "recall": 0.5794550216005496, + "fscore": 0.5226280415927149 + } + }, + "bleurt": 0.5809288421923614, + "gpt": 0.4129834254143646 + } +} \ No newline at end of file diff --git a/leaderboard-submissions/results/gpt-4.json b/leaderboard-submissions/results/gpt-4.json new file mode 100644 index 0000000..305e62a --- /dev/null +++ b/leaderboard-submissions/results/gpt-4.json @@ -0,0 +1,87 @@ +{ + "_submission_hash": "", + "_results_hash": "", + "metadata": { + "name": "GPT-4", + "authors": "OpenAI", + "url": "https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo", + "citation": "OpenAI, 2023", + "type": "FOUNDATION", + "context": 8192 + }, + "closedbook": { + "acc": { + "loose": 0.3549958003169331, + "strict": 0.06629834254143646 + }, + "rouge": { + "rouge1": { + "precision": 0.3081323409240976, + "recall": 0.4026625098759217, + "fscore": 0.313123968945042 + }, + "rouge2": { + "precision": 0.1696203592964782, + "recall": 0.22556614456096988, + "fscore": 0.1769179306107922 + }, + "rougeL": { + "precision": 0.2618982426700318, + "recall": 0.3437078118709097, + "fscore": 0.26676820286543684 + } + }, + "bleurt": 0.4191427961588729, + "gpt": 0.14917127071823205 + }, + "openbook": { + "acc": { + "loose": 0.3152160636708284, + "strict": 0.05662983425414365 + }, + "rouge": { + "rouge1": { + "precision": 0.17356455306164578, + "recall": 0.39150580236680255, + "fscore": 0.20783575754357375 + }, + "rouge2": { + "precision": 0.09085032002525105, + "recall": 0.17549598793784338, + "fscore": 0.10625950758793883 + }, + "rougeL": { + "precision": 0.1532112048714597, + "recall": 0.34618346407417766, + "fscore": 0.18344705502411385 + } + }, + "bleurt": 0.4267584370109587, + "gpt": 0.1643646408839779 + }, + "evidenceprovided": { + "acc": { + "loose": 0.5456098545791228, + "strict": 0.143646408839779 + }, + "rouge": { + "rouge1": { + "precision": 0.4725312298278398, + "recall": 0.6085356260042157, + "fscore": 0.49998452049151393 + }, + "rouge2": { + "precision": 0.2815675162719931, + "recall": 0.35623992580949976, + "fscore": 0.3005037957568856 + }, + "rougeL": { + "precision": 0.38835972782295014, + "recall": 0.5045559933909823, + "fscore": 0.41272370572817085 + } + }, + "bleurt": 0.5298143146528723, + "gpt": 0.30386740331491713 + } +} \ No newline at end of file diff --git a/leaderboard-submissions/results/llama-chat.json b/leaderboard-submissions/results/llama-chat.json new file mode 100644 index 0000000..f5209b3 --- /dev/null +++ b/leaderboard-submissions/results/llama-chat.json @@ -0,0 +1,87 @@ +{ + "_submission_hash": "", + "_results_hash": "", + "metadata": { + "name": "LLaMA 2 70B", + "authors": "Meta", + "url": "https://ai.meta.com/research/publications/llama-2-open-foundation-and-fine-tuned-chat-models/", + "citation": "Touvron et al., 2023", + "type": "FOUNDATION", + "context": 4096 + }, + "closedbook": { + "acc": { + "loose": 0.4403229470249054, + "strict": 0.058011049723756904 + }, + "rouge": { + "rouge1": { + "precision": 0.21079994013879805, + "recall": 0.5309704790876308, + "fscore": 0.2847323680812482 + }, + "rouge2": { + "precision": 0.11206570754887477, + "recall": 0.26618658655979804, + "fscore": 0.14873440544450633 + }, + "rougeL": { + "precision": 0.17624114336611502, + "recall": 0.44614569152308986, + "fscore": 0.23787236888912539 + } + }, + "bleurt": 0.44125290332331185, + "gpt": 0.12016574585635359 + }, + "openbook": { + "acc": { + "loose": 0.38953698927856806, + "strict": 0.06353591160220995 + }, + "rouge": { + "rouge1": { + "precision": 0.09966641622516868, + "recall": 0.47799987641310493, + "fscore": 0.15662035695680138 + }, + "rouge2": { + "precision": 0.04823947632622272, + "recall": 0.21575914217888367, + "fscore": 0.07543987012466698 + }, + "rougeL": { + "precision": 0.08324811278648249, + "recall": 0.40852712042925454, + "fscore": 0.13141945954530423 + } + }, + "bleurt": 0.4429991969951938, + "gpt": 0.10773480662983426 + }, + "evidenceprovided": { + "acc": { + "loose": 0.5143679183027113, + "strict": 0.07734806629834254 + }, + "rouge": { + "rouge1": { + "precision": 0.3407245588154435, + "recall": 0.5902473705256917, + "fscore": 0.3762628122180759 + }, + "rouge2": { + "precision": 0.1832699274716267, + "recall": 0.31368060953904164, + "fscore": 0.20593914103223815 + }, + "rougeL": { + "precision": 0.27518702361418795, + "recall": 0.4795536695039635, + "fscore": 0.30410024296144206 + } + }, + "bleurt": 0.47221096980678773, + "gpt": 0.16160220994475138 + } +} \ No newline at end of file diff --git a/leaderboard-submissions/results/mistral-chat.json b/leaderboard-submissions/results/mistral-chat.json new file mode 100644 index 0000000..f2f0b1f --- /dev/null +++ b/leaderboard-submissions/results/mistral-chat.json @@ -0,0 +1,87 @@ +{ + "_submission_hash": "", + "_results_hash": "", + "metadata": { + "name": "Mistral-7B", + "authors": "Mistral AI", + "url": "https://mistral.ai/news/announcing-mistral-7b/", + "citation": "Jiang et al., 2023", + "type": "FOUNDATION", + "context": 32000 + }, + "closedbook": { + "acc": { + "loose": 0.4271172854502475, + "strict": 0.055248618784530384 + }, + "rouge": { + "rouge1": { + "precision": 0.19504405031464528, + "recall": 0.5205896670595586, + "fscore": 0.2603643670617223 + }, + "rouge2": { + "precision": 0.09285001789519015, + "recall": 0.23254410979936643, + "fscore": 0.12283050286793348 + }, + "rougeL": { + "precision": 0.15867921904916787, + "recall": 0.42898549246034995, + "fscore": 0.21227960835014928 + } + }, + "bleurt": 0.4494648841801434, + "gpt": 0.10220994475138122 + }, + "openbook": { + "acc": { + "loose": 0.024145046175432918, + "strict": 0.0013812154696132596 + }, + "rouge": { + "rouge1": { + "precision": 0.007230514490414043, + "recall": 0.029838755315060313, + "fscore": 0.010116056959318522 + }, + "rouge2": { + "precision": 0.0030492661970760036, + "recall": 0.01235636714726415, + "fscore": 0.004309447012284605 + }, + "rougeL": { + "precision": 0.006576325561071797, + "recall": 0.02700577469421076, + "fscore": 0.009108932193761509 + } + }, + "bleurt": 0.1700935731461858, + "gpt": 0.011049723756906077 + }, + "evidenceprovided": { + "acc": { + "loose": 0.539846654625103, + "strict": 0.08839779005524862 + }, + "rouge": { + "rouge1": { + "precision": 0.25362992874397977, + "recall": 0.6206499844406299, + "fscore": 0.3300987746835614 + }, + "rouge2": { + "precision": 0.13136173541137755, + "recall": 0.31550366270027114, + "fscore": 0.17182351464162274 + }, + "rougeL": { + "precision": 0.20309078966310257, + "recall": 0.5001600255631405, + "fscore": 0.2639495371227864 + } + }, + "bleurt": 0.4745529419974069, + "gpt": 0.20165745856353592 + } +} \ No newline at end of file diff --git a/leaderboard-submissions/results/mixtral.json b/leaderboard-submissions/results/mixtral.json new file mode 100644 index 0000000..c708cc5 --- /dev/null +++ b/leaderboard-submissions/results/mixtral.json @@ -0,0 +1,87 @@ +{ + "_submission_hash": "", + "_results_hash": "", + "metadata": { + "name": "Mixtral-8x7B", + "authors": "Mistral AI", + "url": "https://mistral.ai/news/mixtral-of-experts/", + "citation": "Jiang et al., 2024", + "type": "FOUNDATION", + "context": 32000 + }, + "closedbook": { + "acc": { + "loose": 0.4695900993561285, + "strict": 0.08149171270718232 + }, + "rouge": { + "rouge1": { + "precision": 0.2402434133434171, + "recall": 0.5465023664516744, + "fscore": 0.3023042602650845 + }, + "rouge2": { + "precision": 0.1278099874524417, + "recall": 0.26409680478109976, + "fscore": 0.15751696515253602 + }, + "rougeL": { + "precision": 0.20205125579019406, + "recall": 0.46086662623646035, + "fscore": 0.2538329572357096 + } + }, + "bleurt": 0.46641780631565255, + "gpt": 0.18646408839779005 + }, + "openbook": { + "acc": { + "loose": 0.3961719668514913, + "strict": 0.055248618784530384 + }, + "rouge": { + "rouge1": { + "precision": 0.11787584988437909, + "recall": 0.47223446046975565, + "fscore": 0.17315178004485718 + }, + "rouge2": { + "precision": 0.053319584239651184, + "recall": 0.20480681092188244, + "fscore": 0.07820162065687233 + }, + "rougeL": { + "precision": 0.10000718238197719, + "recall": 0.4077342141256453, + "fscore": 0.147182462986372 + } + }, + "bleurt": 0.4493722631107213, + "gpt": 0.1477900552486188 + }, + "evidenceprovided": { + "acc": { + "loose": 0.5760681994469418, + "strict": 0.13535911602209943 + }, + "rouge": { + "rouge1": { + "precision": 0.337280859260002, + "recall": 0.6511326541771959, + "fscore": 0.40933745120007703 + }, + "rouge2": { + "precision": 0.1943735281050035, + "recall": 0.3485360713472991, + "fscore": 0.23141031987255675 + }, + "rougeL": { + "precision": 0.282396718505277, + "recall": 0.5503517413117182, + "fscore": 0.34298526601178303 + } + }, + "bleurt": 0.508856576296259, + "gpt": 0.28314917127071826 + } +} \ No newline at end of file