-
-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #4 from zhudotexe/leaderboard-submissions
Add leaderboard submission action
- Loading branch information
Showing
21 changed files
with
1,038 additions
and
187 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
name: Leaderboard Submission

# Since we need write access to GH, we use pull_request_target here.
# The eval workflow will need to clone the PR head branch alongside the base branch, then copy over only the changed
# files to run base eval code on (for security).
on:
  pull_request_target:
    paths:
      - leaderboard-submissions/metadata/*.json
      - leaderboard-submissions/*generations/*.jsonl

permissions:
  pull-requests: write
  contents: write

jobs:
  evaluate:
    runs-on: ubuntu-latest
    env:
      GH_TOKEN: ${{ github.token }}

    steps:
      # Clone the main repo (the trusted base-branch eval code runs from here)
      - uses: actions/checkout@v4

      # Clone the PR head to _pr_submission.
      # NOTE(review): submissions arrive from forks, and under pull_request_target a bare
      # `ref: github.head_ref` is resolved against the BASE repository only - the head
      # repository must be named explicitly or fork PRs fail to check out. Confirm the
      # later push-back step has credentials for the fork as well.
      - uses: actions/checkout@v4
        with:
          path: _pr_submission
          repository: ${{ github.event.pull_request.head.repo.full_name }}
          ref: ${{ github.head_ref }}

      # copy submission files over to main repo
      # results files go to a new dir to prevent weird commit behaviour when committing results
      - name: Copy submission files to eval workspace
        run: |
          cp -r _pr_submission/leaderboard-submissions/closedbook-generations/. leaderboard-submissions/closedbook-generations
          cp -r _pr_submission/leaderboard-submissions/openbook-generations/. leaderboard-submissions/openbook-generations
          cp -r _pr_submission/leaderboard-submissions/evidenceprovided-generations/. leaderboard-submissions/evidenceprovided-generations
          cp -r _pr_submission/leaderboard-submissions/metadata/. leaderboard-submissions/metadata
          # -rf: pr-results is a directory (and may not exist yet); a bare `rm` on a
          # directory fails and would abort this step under the runner's `bash -e`
          rm -rf leaderboard-submissions/pr-results
          cp -r _pr_submission/leaderboard-submissions/results/. leaderboard-submissions/pr-results

      - name: Download test set answers
        run: wget -q ${{ secrets.TEST_ANSWERS_URL }} -O fanoutqa-test-answers.json

      # set up in local workdir and hydrate results
      - name: Set up Python 3.10
        uses: actions/setup-python@v5
        with:
          python-version: '3.10'
          cache: 'pip'

      - name: Install library for eval
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt
          wget -nv https://storage.googleapis.com/bleurt-oss-21/BLEURT-20.zip
          unzip BLEURT-20.zip
          rm BLEURT-20.zip

      - name: Run eval script
        id: eval
        env:
          LEADERBOARD_SALT: ${{ secrets.LEADERBOARD_SALT }}
          FANOUTQA_OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
        run: python leaderboard-submissions/hydrate.py

      # NOTE(review): `gh pr comment --edit-last` errors when the bot has no prior comment
      # on the PR (first run) - verify the runner's gh version tolerates this, or guard it
      - name: Add PR comment (failure)
        if: failure()
        run: gh pr comment ${{ github.event.number }} --edit-last -b "It looks like this eval run failed. Please check the [workflow logs](https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}) to see what went wrong, then push a new commit to your PR to rerun the eval."

      - name: Add PR comment (success)
        if: steps.eval.outputs.changed > 0
        env:
          RUN_LINK: https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}
        run: python leaderboard-submissions/gh-print-new-results.py ${{ steps.eval.outputs.written-results }} | gh pr comment ${{ github.event.number }} --edit-last -F -

      - name: Commit results files to PR
        if: steps.eval.outputs.changed > 0
        run: |
          cp ${{ steps.eval.outputs.written-results }} _pr_submission/leaderboard-submissions/results/
          cd _pr_submission
          git config user.name github-actions
          git config user.email [email protected]
          git add leaderboard-submissions/results
          git commit -m "leaderboard: add eval results"
          git push
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -155,3 +155,4 @@ cython_debug/ | |
**.DS_Store
BLEURT-20/
docs/_extra/leaderboard/
fanoutqa-test-answers.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -172,10 +172,9 @@ and return an `EvaluationScore` object, which has attributes matching the follow | |
|
||
### Test Set Evaluation | ||
|
||
To evaluate your model on the hidden test set, please email your generations | ||
to [[email protected]](mailto:[email protected]) with the subject "FanOutQA Test Evaluation". Your generations | ||
should be in the form of a JSONL file, with each line being a JSON object with the following schema for each test | ||
question: | ||
To evaluate your model on the hidden test set, first generate answers for each question in the test set. | ||
Your generations should be in the form of a JSONL file, with each line being a JSON object with the following schema for | ||
each test question: | ||
|
||
```json | ||
{ | ||
|
@@ -184,13 +183,33 @@ question: | |
} | ||
``` | ||
|
||
In the email body, please include details about your system, including: | ||
You will also need to write a metadata file for your model. Your metadata file should use this template: | ||
|
||
- the name of your system | ||
- the list of authors | ||
- a link to your paper and recommended short citation, if applicable | ||
- the context length of your model | ||
- whether your model is a new foundation model, a fine-tune, a prompting approach, or other | ||
```json | ||
{ | ||
"name": "The name of your model", | ||
"authors": "The list of authors, in natural language (e.g. `Andrew Zhu, Alyssa Hwang, Liam Dugan, and Chris Callison-Burch`)", | ||
"url": "A link to your model's website, if applicable (null otherwise)", | ||
"citation": "The list of authors and year, in citation format (e.g. `Zhu et al., 2024`)", | ||
"type": "FOUNDATION | FINETUNE | PROMPT | OTHER", | ||
"context": "The context length of the model your system uses (as an int)", | ||
"closedbook_generations": "YOUR-SYSTEM-NAME.jsonl", | ||
"openbook_generations": "YOUR-SYSTEM-NAME.jsonl", | ||
"evidenceprovided_generations": "YOUR-SYSTEM-NAME.jsonl" | ||
} | ||
``` | ||
|
||
Then, fork this repository. Add your generation files to | ||
`leaderboard-submissions/[SETTING]-generations/YOUR-SYSTEM-NAME.jsonl` | ||
and your metadata file to `leaderboard-submissions/metadata/YOUR-SYSTEM-NAME.json` and make a pull request. | ||
|
||
If you do not want to release the generations of your model, please email these files to | ||
[[email protected]](mailto:[email protected]) instead and we will add your model to the leaderboards without | ||
pushing the generations. | ||
|
||
Finally, our GitHub bot will automatically run metrics on the submitted generations and commit a metrics file to | ||
`leaderboard-submissions/results/YOUR-SYSTEM-NAME.jsonl`. If all looks well, a maintainer will merge the PR and your | ||
model will appear on the leaderboards! | ||
|
||
## Additional Resources | ||
|
||
|
Empty file.
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
"""Given a list of newly-written results files as argv, output a nice little Markdown summary of them.""" | ||
|
||
import json | ||
import os | ||
import sys | ||
|
||
RUN_LINK = os.getenv("RUN_LINK") | ||
|
||
print(f"Eval run succeeded! Link to run: [link]({RUN_LINK})\n\nHere are the results of the submission(s):\n") | ||
|
||
for fp in sys.argv[1:]: | ||
with open(fp) as f: | ||
results = json.load(f) | ||
|
||
metadata = results["metadata"] | ||
cb = results["closedbook"] | ||
ob = results["openbook"] | ||
ep = results["evidenceprovided"] | ||
|
||
print( | ||
f"# {metadata['name']}\n" | ||
f"*[{metadata['citation']}]({metadata['url']})*\n\n" | ||
"## Closed Book\n\n" | ||
f"- **Loose**: {cb['acc']['loose']:.3}\n" | ||
f"- **Strict**: {cb['acc']['strict']:.3}\n" | ||
f"- **ROUGE-1**: {cb['rouge']['rouge1']['fscore']:.3}\n" | ||
f"- **ROUGE-2**: {cb['rouge']['rouge2']['fscore']:.3}\n" | ||
f"- **ROUGE-L**: {cb['rouge']['rougeL']['fscore']:.3}\n" | ||
f"- **BLEURT**: {cb['bleurt']:.3}\n" | ||
f"- **GPT Judge**: {cb['gpt']:.3}\n\n" | ||
"## Open Book\n\n" | ||
f"- **Loose**: {ob['acc']['loose']:.3}\n" | ||
f"- **Strict**: {ob['acc']['strict']:.3}\n" | ||
f"- **ROUGE-1**: {ob['rouge']['rouge1']['fscore']:.3}\n" | ||
f"- **ROUGE-2**: {ob['rouge']['rouge2']['fscore']:.3}\n" | ||
f"- **ROUGE-L**: {ob['rouge']['rougeL']['fscore']:.3}\n" | ||
f"- **BLEURT**: {ob['bleurt']:.3}\n" | ||
f"- **GPT Judge**: {ob['gpt']:.3}\n\n" | ||
"## Evidence Provided\n\n" | ||
f"- **Loose**: {ep['acc']['loose']:.3}\n" | ||
f"- **Strict**: {ep['acc']['strict']:.3}\n" | ||
f"- **ROUGE-1**: {ep['rouge']['rouge1']['fscore']:.3}\n" | ||
f"- **ROUGE-2**: {ep['rouge']['rouge2']['fscore']:.3}\n" | ||
f"- **ROUGE-L**: {ep['rouge']['rougeL']['fscore']:.3}\n" | ||
f"- **BLEURT**: {ep['bleurt']:.3}\n" | ||
f"- **GPT Judge**: {ep['gpt']:.3}\n" | ||
) | ||
|
||
print( | ||
"If all looks well, a maintainer will come by soon to merge this PR and your entry/entries will appear on the" | ||
" leaderboard. If you need to make any changes, feel free to push new commits to this PR. Thanks for submitting to" | ||
" FanOutQA!" | ||
) |
Oops, something went wrong.