Commit cc39b33

Merge branch 'main' into leaderboard-gemma
zhudotexe committed Apr 17, 2024
2 parents d8366a1 + 43c1fd8

Showing 2 changed files with 13 additions and 8 deletions.
19 changes: 12 additions & 7 deletions leaderboard-submissions/hydrate.py
@@ -1,3 +1,4 @@
+import asyncio
 import hashlib
 import json
 import os
@@ -9,6 +10,7 @@

 import fanoutqa
 from fanoutqa.eval import evaluate
+from fanoutqa.eval.scorer import Scorer

 # prevent manipulation of results - the results must be generated by this script or else the hash will not match
 LEADERBOARD_SALT = os.getenv("LEADERBOARD_SALT", "supersecret").encode()
@@ -55,7 +57,7 @@ def read_jsonl_answers(fp: Path) -> List[dict]:


 # ==== main ====
-def hydrate_all():
+async def hydrate_all():
     """Main entrypoint - ensure all metadata submissions have valid associated results files"""
     exit_code = 0
     written_files = []
@@ -78,7 +80,7 @@ def hydrate_all():
         print(f"Open-book generations path: {OB_PATH / check_result.metadata.openbook_generations}")
         print(f"Evidence-provided generations path: {EP_PATH / check_result.metadata.evidenceprovided_generations}")
         try:
-            result_fp = eval_submission(metadata_fp, check_result)
+            result_fp = await eval_submission(metadata_fp, check_result)
             written_files.append(result_fp)
         except Exception as e:
             # if invalid, log a check annotation and mark job failure
@@ -140,7 +142,7 @@ def check_submission(metadata_fp: Path) -> CheckResult:
     return CheckResult(metadata_data, True, the_hash)


-def eval_submission(metadata_fp: Path, check_result: CheckResult):
+async def eval_submission(metadata_fp: Path, check_result: CheckResult):
     """Read in the answers and generations and eval them all, then write the results file."""
     # dummy = {
     #     "acc": {"loose": 0.0, "strict": 0.0},
@@ -158,15 +160,18 @@ def eval_submission(metadata_fp: Path, check_result: CheckResult):

     print("Evaluating closed book answers...")
     closedbook_answers = read_jsonl_answers(CB_PATH / check_result.metadata.closedbook_generations)
-    closedbook_results = asdict(evaluate(questions, closedbook_answers))
+    closedbook_scorer = Scorer(questions, closedbook_answers)
+    closedbook_results = asdict(await closedbook_scorer.score())

     print("Evaluating open book answers...")
     openbook_answers = read_jsonl_answers(OB_PATH / check_result.metadata.openbook_generations)
-    openbook_results = asdict(evaluate(questions, openbook_answers))
+    openbook_scorer = Scorer(questions, openbook_answers)
+    openbook_results = asdict(await openbook_scorer.score())

     print("Evaluating evidence provided answers...")
     evidenceprovided_answers = read_jsonl_answers(EP_PATH / check_result.metadata.evidenceprovided_generations)
-    evidenceprovided_results = asdict(evaluate(questions, evidenceprovided_answers))
+    evidenceprovided_scorer = Scorer(questions, evidenceprovided_answers)
+    evidenceprovided_results = asdict(await evidenceprovided_scorer.score())

     # hash the results to prevent score manipulation
     results_hash = hashlib.sha256()
@@ -203,5 +208,5 @@ def eval_submission(metadata_fp: Path, check_result: CheckResult):


 if __name__ == "__main__":
-    ec = hydrate_all()
+    ec = asyncio.run(hydrate_all())
     exit(ec)
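
The hydrate.py change is a sync-to-async migration: the synchronous evaluate(questions, answers) call is replaced by constructing a Scorer(questions, answers) and awaiting its score() coroutine, which forces eval_submission, hydrate_all, and the __main__ block to become async as well. Below is a minimal sketch of the new call pattern. It assumes only what the diff shows (the Scorer constructor, an awaitable score() that returns a dataclass of metrics, and the salted SHA-256 used to fingerprint results); score_and_fingerprint is a hypothetical helper, and the exact payload hydrate.py feeds into the digest is not visible in these hunks.

import asyncio
import hashlib
import os
from dataclasses import asdict

from fanoutqa.eval.scorer import Scorer

# same salt scheme as hydrate.py: results are only reproducible by the trusted script
LEADERBOARD_SALT = os.getenv("LEADERBOARD_SALT", "supersecret").encode()


async def score_and_fingerprint(questions, answers):
    # new async API: build a Scorer, then await its score() coroutine
    scorer = Scorer(questions, answers)
    results = asdict(await scorer.score())  # score() returns a dataclass of metrics
    # salted SHA-256 over the results (illustrative serialization; the exact
    # payload hashed by hydrate.py is outside the hunks shown above)
    digest = hashlib.sha256(LEADERBOARD_SALT)
    digest.update(repr(sorted(results.items())).encode())
    return results, digest.hexdigest()


def score_sync(questions, answers):
    # sync callers bridge into the coroutine the same way the __main__ block does
    return asyncio.run(score_and_fingerprint(questions, answers))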
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -37,7 +37,7 @@ retrieval = [
 ]

 eval = [
-    "kani[openai]~=0.7.2",
+    "kani[openai]>=1.0.0rc0,<2.0.0",
     "rouge-score~=0.1.2",
 ]
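
For context on the pyproject.toml change: ~=0.7.2 is a compatible-release pin that only admits kani 0.7.x, while the new >=1.0.0rc0,<2.0.0 range accepts the kani 1.0 release candidates and any later 1.x release. Because the lower bound names a pre-release, pip will consider pre-release versions when resolving this requirement, which it otherwise excludes by default. A quick check of the new specifier with the packaging library (a sketch for illustration, not part of the repo):

from packaging.specifiers import SpecifierSet
from packaging.version import Version

# the new pin from the eval extra
spec = SpecifierSet(">=1.0.0rc0,<2.0.0", prereleases=True)

print(Version("1.0.0rc1") in spec)  # True - release candidates are now admitted
print(Version("1.2.0") in spec)     # True - any 1.x release qualifies
print(Version("0.7.2") in spec)     # False - the old 0.7.x series no longer does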
