-
Notifications
You must be signed in to change notification settings - Fork 8
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Added github action to evaluate on private dataset
- Loading branch information
Showing
12 changed files
with
274 additions
and
143 deletions.
There are no files selected for viewing
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,4 @@ | ||
name: ci | ||
name: Deploy to github pages | ||
on: | ||
push: | ||
branches: | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
# GitHub Actions workflow: run the private evaluation for prompt submissions
# opened/updated via pull request, and (eventually) update the leaderboard.
name: Update leaderboard.

on:
  pull_request:
    types: [opened, reopened, synchronize]

jobs:
  private_evaluation:
    runs-on: ubuntu-latest
    steps:
      # Detect whether the PR touches anything under the submissions dir;
      # all later steps are gated on this filter's `src` output.
      - name: Check if there are any changes in submissions dir
        # NOTE(review): the release tag was mangled by email obfuscation in the
        # scraped page ("dorny/[email protected]") — confirm the intended tag.
        uses: dorny/paths-filter@v3
        id: changes
        with:
          filters: |
            src:
              - 'session_2/challenge/submissions/**'
          list-files: "shell"

      # Exit early if no changes in the submissions directory
      - name: Print changed files
        run: |
          echo '${{ toJSON(steps.changes.outputs) }}'

      # Install evaluation dependencies and run the evals.
      - name: Checkout code
        if: ${{ (steps.changes.outputs.src == 'true') }}
        uses: actions/checkout@v4

      - name: Install Python
        if: ${{ (steps.changes.outputs.src == 'true') }}
        uses: actions/setup-python@v5
        with:
          python-version: "3.10"

      - name: Install dependencies
        if: ${{ (steps.changes.outputs.src == 'true') }}
        run: |
          python -m pip install --upgrade pip
          pip install -r session_2/challenge/requirements.txt

      - name: Run leaderboard update script
        id: leaderboard-update
        if: ${{ (steps.changes.outputs.src == 'true') }}
        run: |
          cd session_2/challenge
          python -m scripts.leaderboard --github_user="${{ github.actor }}" --prompt="${{ steps.changes.outputs.src_files }}"

      # # Commit the updated leaderboard
      # - name: Commit Updated Leaderboard
      #   id: commit-leaderboard
      #   run: |
      #     git config --global user.name "GitHub Actions"
      #     # NOTE(review): email below was obfuscated in the scraped page
      #     # ("[email protected]") — restore the intended address before enabling.
      #     git config --global user.email "github-actions@github.com"
      #     git add leaderboard.md
      #     git commit -m "Update leaderboard"
      #     git push origin HEAD:${{ github.ref }}

      # # Print the commit SHA for reference
      # - name: Print Commit SHA
      #   run: echo "Commit SHA: ${{ steps.commit-leaderboard.outputs.commit_sha }}"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
"""Utilities to load evaluation datasets.""" | ||
|
||
import glob | ||
import os | ||
|
||
|
||
def load_sample_test_set(samples_dir: str) -> list[tuple[str, bool]]:
    """Loads sample job descriptions and expected answers for local testing.

    Each ``*.txt`` file under ``samples_dir`` holds one job description.
    The expected label is encoded in the filename suffix:
    ``*_yes.txt`` -> True, ``*_no.txt`` -> False.

    Args:
        samples_dir: Directory containing the sample ``.txt`` files.

    Returns:
        A list of ``(job_description, expected_answer)`` tuples.

    Raises:
        ValueError: If a filename ends with neither ``_yes.txt`` nor
            ``_no.txt``.
    """
    sample_files = glob.glob(os.path.join(samples_dir, "*.txt"))
    sample_inputs = []
    for filepath in sample_files:
        # Use a context manager so the file is closed promptly instead of
        # relying on garbage collection.
        with open(filepath, "r") as f:
            content = f.read()
        filename = os.path.basename(filepath).lower()
        if filename.endswith("_yes.txt"):
            target = True
        elif filename.endswith("_no.txt"):
            target = False
        else:
            raise ValueError(
                "File %s must end with _yes.txt or _no.txt" % filepath
            )
        # Bug fix: the original re-derived `target` after the suffix check via
        # `"yes" in filename`, which mislabeled e.g. "yes_role_no.txt" as True
        # and made the suffix logic dead. The suffix-based label is
        # authoritative.
        sample_inputs.append((content, target))
    return sample_inputs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
"""Library function for evaluating a prompt on a particular dataset.""" | ||
|
||
import logging | ||
|
||
import tqdm | ||
from scripts import model, registry | ||
from submissions import * # noqa: F401, F403 | ||
from submissions import baseline # noqa: F401 | ||
|
||
|
||
def evaluate(dataset: list[tuple[str, bool]], prompt_name: str):
    """Evaluates the prompt submission.

    Runs the prompt registered under ``prompt_name`` against every
    ``(job_description, target)`` pair in ``dataset`` and returns the
    accuracy as a percentage in [0, 100].
    """
    # Free gpt4-backed model used to answer the built prompts.
    llm = model.G4fModel()

    # Prompt submission previously registered under this name.
    handler = registry.get(name=prompt_name)

    # Score every sample, logging each prompt/response pair for debugging.
    num_correct = 0
    for sample_no, (description, expected) in enumerate(tqdm.tqdm(dataset)):
        built_prompt = handler.build_prompt(job_description=description)
        raw_response = llm.generate(prompt=built_prompt)
        predicted = handler.parse_response(model_response=raw_response)
        is_match = predicted == expected
        if is_match:
            num_correct += 1
        status = "[PASS]" if is_match else "[FAIL]"

        logging.debug(
            "No=%d. target=%s prediction=%s %s\n[prompt]\n%s\n[response]\n%s"
            % (sample_no, expected, predicted, status, built_prompt, raw_response)
        )
    return num_correct / len(dataset) * 100
Oops, something went wrong.