Added github action to evaluate on private dataset
Showing 9 changed files with 234 additions and 127 deletions.
File renamed without changes.
@@ -0,0 +1,29 @@
name: Evaluate prompt engineering challenge results on a private dataset.

on:
  pull_request:
    types: [opened, reopened, synchronize]

jobs:
  private_evaluation:
    runs-on: ubuntu-latest
    steps:
      - name: Check if there are any changes in submissions dir
        uses: dorny/[email protected]
        id: changes
        with:
          filters: |
            src:
              - 'session_2/challenge/submissions/**'
      - name: Clone private dataset.
        if: steps.changes.outputs.src == 'true'
        uses: InFoCusp/[email protected]
        with:
          owner: 'hetulvp'
          repository: 'promp-engineering-challange-private-dataset'
          access-token: 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

      - name: Access cloned repository content
        run: |
          ls -la promp-engineering-challange-private-dataset
@@ -1,4 +1,4 @@
-name: ci
+name: Deploy to github pages
 on:
   push:
     branches:
@@ -0,0 +1,24 @@
"""Utilities to load evaluation datasets."""

import glob
import os


def load_sample_test_set(samples_dir: str) -> list[tuple[str, bool]]:
    """Loads sample job descriptions and answers for local testing."""
    sample_files = glob.glob(os.path.join(samples_dir, "*.txt"))
    sample_inputs = []
    for filepath in sample_files:
        # Read the job description text.
        with open(filepath, "r") as f:
            content = f.read()
        # The expected answer is encoded in the file name suffix.
        filename = os.path.basename(filepath).lower()
        if filename.endswith("_yes.txt"):
            target = True
        elif filename.endswith("_no.txt"):
            target = False
        else:
            raise ValueError(
                "File %s must end with _yes.txt or _no.txt" % filepath
            )
        sample_inputs.append((content, target))
    return sample_inputs
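A minimal usage sketch for the loader above, assuming a local samples directory whose files follow the _yes.txt / _no.txt naming convention; the module path and directory name here are illustrative assumptions, not part of this commit:

# Hypothetical usage; "scripts.dataset" and the samples_dir path are assumed.
from scripts import dataset

sample_set = dataset.load_sample_test_set(samples_dir="session_2/challenge/samples")
for job_description, target in sample_set:
    # target is True for *_yes.txt files and False for *_no.txt files.
    print(target, job_description[:60])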
@@ -0,0 +1,36 @@
"""Library function for evaluating a prompt on a particular dataset."""

import logging

import tqdm
from scripts import model, registry
from submissions import *  # noqa: F401, F403
from submissions import baseline  # noqa: F401


def evaluate(dataset: list[tuple[str, bool]], prompt_name: str):
    """Evaluates the prompt submission."""
    # Loads a free gpt4 model.
    llm = model.G4fModel()

    # Loads a prompt submission.
    prompt_handler = registry.get(name=prompt_name)

    # Generate results for the dataset.
    correct_pred = 0
    for idx, (job_description, target) in enumerate(tqdm.tqdm(dataset)):
        prompt = prompt_handler.build_prompt(job_description=job_description)
        response = llm.generate(prompt=prompt)
        prediction = prompt_handler.parse_response(model_response=response)
        if prediction == target:
            correct_pred += 1
            result = "[PASS]"
        else:
            result = "[FAIL]"

        logging.debug(
            "No=%d. target=%s prediction=%s %s\n[prompt]\n%s\n[response]\n%s"
            % (idx, target, prediction, result, prompt, response)
        )
    acc = correct_pred / len(dataset) * 100
    return acc
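Putting the two new files together, a sketch of how the private evaluation step might call this library end to end; the module names, dataset path, and prompt name below are assumptions for illustration, not taken from this commit:

# Hypothetical wiring of the two modules added in this commit.
from scripts import dataset, eval  # assumed module names for the files above

private_set = dataset.load_sample_test_set(
    samples_dir="promp-engineering-challange-private-dataset/samples"  # assumed layout
)
accuracy = eval.evaluate(dataset=private_set, prompt_name="baseline")  # prompt name assumed
print("Accuracy: %.2f%%" % accuracy)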