Commit cc39b33

Merge branch 'main' into leaderboard-gemma
zhudotexe committed Apr 17, 2024
2 parents d8366a1 + 43c1fd8

Showing 2 changed files with 13 additions and 8 deletions.
19 changes: 12 additions & 7 deletions leaderboard-submissions/hydrate.py
@@ -1,3 +1,4 @@
+import asyncio
 import hashlib
 import json
 import os
@@ -9,6 +10,7 @@

 import fanoutqa
 from fanoutqa.eval import evaluate
+from fanoutqa.eval.scorer import Scorer

 # prevent manipulation of results - the results must be generated by this script or else the hash will not match
 LEADERBOARD_SALT = os.getenv("LEADERBOARD_SALT", "supersecret").encode()
@@ -55,7 +57,7 @@ def read_jsonl_answers(fp: Path) -> List[dict]:


 # ==== main ====
-def hydrate_all():
+async def hydrate_all():
     """Main entrypoint - ensure all metadata submissions have valid associated results files"""
     exit_code = 0
     written_files = []
@@ -78,7 +80,7 @@ def hydrate_all():
         print(f"Open-book generations path: {OB_PATH / check_result.metadata.openbook_generations}")
         print(f"Evidence-provided generations path: {EP_PATH / check_result.metadata.evidenceprovided_generations}")
         try:
-            result_fp = eval_submission(metadata_fp, check_result)
+            result_fp = await eval_submission(metadata_fp, check_result)
             written_files.append(result_fp)
         except Exception as e:
             # if invalid, log a check annotation and mark job failure
@@ -140,7 +142,7 @@ def check_submission(metadata_fp: Path) -> CheckResult:
     return CheckResult(metadata_data, True, the_hash)


-def eval_submission(metadata_fp: Path, check_result: CheckResult):
+async def eval_submission(metadata_fp: Path, check_result: CheckResult):
     """Read in the answers and generations and eval them all, then write the results file."""
     # dummy = {
     #     "acc": {"loose": 0.0, "strict": 0.0},
@@ -158,15 +160,18 @@ def eval_submission(metadata_fp: Path, check_result: CheckResult):

     print("Evaluating closed book answers...")
     closedbook_answers = read_jsonl_answers(CB_PATH / check_result.metadata.closedbook_generations)
-    closedbook_results = asdict(evaluate(questions, closedbook_answers))
+    closedbook_scorer = Scorer(questions, closedbook_answers)
+    closedbook_results = asdict(await closedbook_scorer.score())

     print("Evaluating open book answers...")
     openbook_answers = read_jsonl_answers(OB_PATH / check_result.metadata.openbook_generations)
-    openbook_results = asdict(evaluate(questions, openbook_answers))
+    openbook_scorer = Scorer(questions, openbook_answers)
+    openbook_results = asdict(await openbook_scorer.score())

     print("Evaluating evidence provided answers...")
     evidenceprovided_answers = read_jsonl_answers(EP_PATH / check_result.metadata.evidenceprovided_generations)
-    evidenceprovided_results = asdict(evaluate(questions, evidenceprovided_answers))
+    evidenceprovided_scorer = Scorer(questions, evidenceprovided_answers)
+    evidenceprovided_results = asdict(await evidenceprovided_scorer.score())

     # hash the results to prevent score manipulation
     results_hash = hashlib.sha256()
@@ -203,5 +208,5 @@ def eval_submission(metadata_fp: Path, check_result: CheckResult):


 if __name__ == "__main__":
-    ec = hydrate_all()
+    ec = asyncio.run(hydrate_all())
     exit(ec)
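
The hydrate.py change is a sync-to-async migration: the synchronous evaluate(questions, answers) call is replaced by constructing a Scorer(questions, answers) and awaiting its score() coroutine, which forces eval_submission, hydrate_all, and the __main__ block to become async as well. Below is a minimal sketch of the new call pattern. It assumes only what the diff shows (the Scorer constructor, an awaitable score() that returns a dataclass of metrics, and the salted SHA-256 used to fingerprint results); score_and_fingerprint is a hypothetical helper, and the exact payload hydrate.py feeds into the digest is not visible in these hunks.

import asyncio
import hashlib
import os
from dataclasses import asdict

from fanoutqa.eval.scorer import Scorer

# same salt scheme as hydrate.py: results are only reproducible by the trusted script
LEADERBOARD_SALT = os.getenv("LEADERBOARD_SALT", "supersecret").encode()


async def score_and_fingerprint(questions, answers):
    # new async API: build a Scorer, then await its score() coroutine
    scorer = Scorer(questions, answers)
    results = asdict(await scorer.score())  # score() returns a dataclass of metrics
    # salted SHA-256 over the results (illustrative serialization; the exact
    # payload hashed by hydrate.py is outside the hunks shown above)
    digest = hashlib.sha256(LEADERBOARD_SALT)
    digest.update(repr(sorted(results.items())).encode())
    return results, digest.hexdigest()


def score_sync(questions, answers):
    # sync callers bridge into the coroutine the same way the __main__ block does
    return asyncio.run(score_and_fingerprint(questions, answers))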
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -37,7 +37,7 @@ retrieval = [
 ]

 eval = [
-    "kani[openai]~=0.7.2",
+    "kani[openai]>=1.0.0rc0,<2.0.0",
     "rouge-score~=0.1.2",
 ]
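
For context on the pyproject.toml change: ~=0.7.2 is a compatible-release pin that only admits kani 0.7.x, while the new >=1.0.0rc0,<2.0.0 range accepts the kani 1.0 release candidates and any later 1.x release. Because the lower bound names a pre-release, pip will consider pre-release versions when resolving this requirement, which it otherwise excludes by default. A quick check of the new specifier with the packaging library (a sketch for illustration, not part of the repo):

from packaging.specifiers import SpecifierSet
from packaging.version import Version

# the new pin from the eval extra
spec = SpecifierSet(">=1.0.0rc0,<2.0.0", prereleases=True)

print(Version("1.0.0rc1") in spec)  # True - release candidates are now admitted
print(Version("1.2.0") in spec)     # True - any 1.x release qualifies
print(Version("0.7.2") in spec)     # False - the old 0.7.x series no longer does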
