-
-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #4 from zhudotexe/leaderboard-submissions
Add leaderboard submission action
- Loading branch information
Showing
21 changed files
with
1,038 additions
and
187 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
name: Leaderboard Submission

# Since we need write access to GH, we use pull_request_target here.
# The eval workflow will need to clone the PR head branch alongside the base branch, then copy over only the changed
# files to run base eval code on (for security).
on:
  pull_request_target:
    paths:
      - leaderboard-submissions/metadata/*.json
      - leaderboard-submissions/*generations/*.jsonl

permissions:
  pull-requests: write
  contents: write

jobs:
  evaluate:
    runs-on: ubuntu-latest
    env:
      GH_TOKEN: ${{ github.token }}

    steps:
      # Clone the main repo (the trusted base-branch eval code runs from here)
      - uses: actions/checkout@v4

      # Clone the PR head to _pr_submission.
      # NOTE(review): submissions arrive from forks, and under pull_request_target a bare
      # `ref: github.head_ref` is resolved against the BASE repository only - the head
      # repository must be named explicitly or fork PRs fail to check out. Confirm the
      # later push-back step has credentials for the fork as well.
      - uses: actions/checkout@v4
        with:
          path: _pr_submission
          repository: ${{ github.event.pull_request.head.repo.full_name }}
          ref: ${{ github.head_ref }}

      # copy submission files over to main repo
      # results files go to a new dir to prevent weird commit behaviour when committing results
      - name: Copy submission files to eval workspace
        run: |
          cp -r _pr_submission/leaderboard-submissions/closedbook-generations/. leaderboard-submissions/closedbook-generations
          cp -r _pr_submission/leaderboard-submissions/openbook-generations/. leaderboard-submissions/openbook-generations
          cp -r _pr_submission/leaderboard-submissions/evidenceprovided-generations/. leaderboard-submissions/evidenceprovided-generations
          cp -r _pr_submission/leaderboard-submissions/metadata/. leaderboard-submissions/metadata
          # -rf: pr-results is a directory (and may not exist yet); a bare `rm` on a
          # directory fails and would abort this step under the runner's `bash -e`
          rm -rf leaderboard-submissions/pr-results
          cp -r _pr_submission/leaderboard-submissions/results/. leaderboard-submissions/pr-results

      - name: Download test set answers
        run: wget -q ${{ secrets.TEST_ANSWERS_URL }} -O fanoutqa-test-answers.json

      # set up in local workdir and hydrate results
      - name: Set up Python 3.10
        uses: actions/setup-python@v5
        with:
          python-version: '3.10'
          cache: 'pip'

      - name: Install library for eval
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt
          wget -nv https://storage.googleapis.com/bleurt-oss-21/BLEURT-20.zip
          unzip BLEURT-20.zip
          rm BLEURT-20.zip

      - name: Run eval script
        id: eval
        env:
          LEADERBOARD_SALT: ${{ secrets.LEADERBOARD_SALT }}
          FANOUTQA_OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
        run: python leaderboard-submissions/hydrate.py

      # NOTE(review): `gh pr comment --edit-last` errors when the bot has no prior comment
      # on the PR (first run) - verify the runner's gh version tolerates this, or guard it
      - name: Add PR comment (failure)
        if: failure()
        run: gh pr comment ${{ github.event.number }} --edit-last -b "It looks like this eval run failed. Please check the [workflow logs](https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}) to see what went wrong, then push a new commit to your PR to rerun the eval."

      - name: Add PR comment (success)
        if: steps.eval.outputs.changed > 0
        env:
          RUN_LINK: https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}
        run: python leaderboard-submissions/gh-print-new-results.py ${{ steps.eval.outputs.written-results }} | gh pr comment ${{ github.event.number }} --edit-last -F -

      - name: Commit results files to PR
        if: steps.eval.outputs.changed > 0
        run: |
          cp ${{ steps.eval.outputs.written-results }} _pr_submission/leaderboard-submissions/results/
          cd _pr_submission
          git config user.name github-actions
          git config user.email [email protected]
          git add leaderboard-submissions/results
          git commit -m "leaderboard: add eval results"
          git push
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -155,3 +155,4 @@ cython_debug/ | |
**.DS_Store
BLEURT-20/
docs/_extra/leaderboard/
fanoutqa-test-answers.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -172,10 +172,9 @@ and return an `EvaluationScore` object, which has attributes matching the follow | |
|
||
### Test Set Evaluation | ||
|
||
To evaluate your model on the hidden test set, please email your generations | ||
to [[email protected]](mailto:[email protected]) with the subject "FanOutQA Test Evaluation". Your generations | ||
should be in the form of a JSONL file, with each line being a JSON object with the following schema for each test | ||
question: | ||
To evaluate your model on the hidden test set, first generate answers for each question in the test set. | ||
Your generations should be in the form of a JSONL file, with each line being a JSON object with the following schema for | ||
each test question: | ||
|
||
```json | ||
{ | ||
|
@@ -184,13 +183,33 @@ question: | |
} | ||
``` | ||
|
||
In the email body, please include details about your system, including: | ||
You will also need to write a metadata file for your model. Your metadata file should use this template: | ||
|
||
- the name of your system | ||
- the list of authors | ||
- a link to your paper and recommended short citation, if applicable | ||
- the context length of your model | ||
- whether your model is a new foundation model, a fine-tune, a prompting approach, or other | ||
```json | ||
{ | ||
"name": "The name of your model", | ||
"authors": "The list of authors, in natural language (e.g. `Andrew Zhu, Alyssa Hwang, Liam Dugan, and Chris Callison-Burch`)", | ||
"url": "A link to your model's website, if applicable (null otherwise)", | ||
"citation": "The list of authors and year, in citation format (e.g. `Zhu et al., 2024`)", | ||
"type": "FOUNDATION | FINETUNE | PROMPT | OTHER", | ||
"context": "The context length of the model your system uses (as an int)", | ||
"closedbook_generations": "YOUR-SYSTEM-NAME.jsonl", | ||
"openbook_generations": "YOUR-SYSTEM-NAME.jsonl", | ||
"evidenceprovided_generations": "YOUR-SYSTEM-NAME.jsonl" | ||
} | ||
``` | ||
|
||
Then, fork this repository. Add your generation files to | ||
`leaderboard-submissions/[SETTING]-generations/YOUR-SYSTEM-NAME.jsonl` | ||
and your metadata file to `leaderboard-submissions/metadata/YOUR-SYSTEM-NAME.json` and make a pull request. | ||
|
||
If you do not want to release the generations of your model, please email these files to | ||
[[email protected]](mailto:[email protected]) instead and we will add your model to the leaderboards without | ||
pushing the generations. | ||
|
||
Finally, our GitHub bot will automatically run metrics on the submitted generations and commit a metrics file to | ||
`leaderboard-submissions/results/YOUR-SYSTEM-NAME.jsonl`. If all looks well, a maintainer will merge the PR and your | ||
model will appear on the leaderboards! | ||
|
||
## Additional Resources | ||
|
||
|
Empty file.
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
"""Given a list of newly-written results files as argv, output a nice little Markdown summary of them.""" | ||
|
||
import json | ||
import os | ||
import sys | ||
|
||
RUN_LINK = os.getenv("RUN_LINK") | ||
|
||
print(f"Eval run succeeded! Link to run: [link]({RUN_LINK})\n\nHere are the results of the submission(s):\n") | ||
|
||
for fp in sys.argv[1:]: | ||
with open(fp) as f: | ||
results = json.load(f) | ||
|
||
metadata = results["metadata"] | ||
cb = results["closedbook"] | ||
ob = results["openbook"] | ||
ep = results["evidenceprovided"] | ||
|
||
print( | ||
f"# {metadata['name']}\n" | ||
f"*[{metadata['citation']}]({metadata['url']})*\n\n" | ||
"## Closed Book\n\n" | ||
f"- **Loose**: {cb['acc']['loose']:.3}\n" | ||
f"- **Strict**: {cb['acc']['strict']:.3}\n" | ||
f"- **ROUGE-1**: {cb['rouge']['rouge1']['fscore']:.3}\n" | ||
f"- **ROUGE-2**: {cb['rouge']['rouge2']['fscore']:.3}\n" | ||
f"- **ROUGE-L**: {cb['rouge']['rougeL']['fscore']:.3}\n" | ||
f"- **BLEURT**: {cb['bleurt']:.3}\n" | ||
f"- **GPT Judge**: {cb['gpt']:.3}\n\n" | ||
"## Open Book\n\n" | ||
f"- **Loose**: {ob['acc']['loose']:.3}\n" | ||
f"- **Strict**: {ob['acc']['strict']:.3}\n" | ||
f"- **ROUGE-1**: {ob['rouge']['rouge1']['fscore']:.3}\n" | ||
f"- **ROUGE-2**: {ob['rouge']['rouge2']['fscore']:.3}\n" | ||
f"- **ROUGE-L**: {ob['rouge']['rougeL']['fscore']:.3}\n" | ||
f"- **BLEURT**: {ob['bleurt']:.3}\n" | ||
f"- **GPT Judge**: {ob['gpt']:.3}\n\n" | ||
"## Evidence Provided\n\n" | ||
f"- **Loose**: {ep['acc']['loose']:.3}\n" | ||
f"- **Strict**: {ep['acc']['strict']:.3}\n" | ||
f"- **ROUGE-1**: {ep['rouge']['rouge1']['fscore']:.3}\n" | ||
f"- **ROUGE-2**: {ep['rouge']['rouge2']['fscore']:.3}\n" | ||
f"- **ROUGE-L**: {ep['rouge']['rougeL']['fscore']:.3}\n" | ||
f"- **BLEURT**: {ep['bleurt']:.3}\n" | ||
f"- **GPT Judge**: {ep['gpt']:.3}\n" | ||
) | ||
|
||
print( | ||
"If all looks well, a maintainer will come by soon to merge this PR and your entry/entries will appear on the" | ||
" leaderboard. If you need to make any changes, feel free to push new commits to this PR. Thanks for submitting to" | ||
" FanOutQA!" | ||
) |
Oops, something went wrong.