diff --git a/leaderboard-submissions/hydrate.py b/leaderboard-submissions/hydrate.py
index e625e04..633f7be 100644
--- a/leaderboard-submissions/hydrate.py
+++ b/leaderboard-submissions/hydrate.py
@@ -9,7 +9,6 @@
 from typing import List, Literal, Optional
 
 import fanoutqa
-from fanoutqa.eval import evaluate
 from fanoutqa.eval.scorer import Scorer
 
 # prevent manipulation of results - the results must be generated by this script or else the hash will not match
@@ -32,9 +31,11 @@ class SubmissionMetadata:
     citation: str
     type: Literal["FOUNDATION", "FINETUNE", "PROMPT", "OTHER"]
     context: int
+    is_trained_for_function_calling: bool
     closedbook_generations: str
     openbook_generations: str
     evidenceprovided_generations: str
+    details: Optional[str] = None
 
 
 CheckResult = namedtuple("CheckResult", "metadata needs_eval submission_hash")
@@ -134,6 +135,7 @@ def check_submission(metadata_fp: Path) -> CheckResult:
     try:
         result_data = json.load(f)
         result_hash = result_data["_submission_hash"]
+        # result file exists and hash matches
         if result_hash == the_hash.hexdigest():
             # if so, no eval needed!
             return CheckResult(metadata_data, False, the_hash)
@@ -194,6 +196,8 @@ async def eval_submission(metadata_fp: Path, check_result: CheckResult):
             "citation": check_result.metadata.citation,
             "type": check_result.metadata.type,
             "context": check_result.metadata.context,
+            "is_trained_for_function_calling": check_result.metadata.is_trained_for_function_calling,
+            "details": check_result.metadata.details,
         },
         # results
         "closedbook": closedbook_results,