Added GitHub Action to evaluate on a private dataset
hetulvp committed Apr 20, 2024
1 parent 002f879 commit ba68719
Showing 9 changed files with 234 additions and 127 deletions.
29 changes: 29 additions & 0 deletions .github/workflows/evaluate_on_private_dataset.yaml
@@ -0,0 +1,29 @@
name: Evaluate prompt engineering challenge results on a private dataset.

on:
  pull_request:
    types: [opened, reopened, synchronize]

jobs:
  private_evaluation:
    runs-on: ubuntu-latest
    steps:
      - name: Check if there are any changes in submissions dir
        uses: dorny/[email protected]
        id: changes
        with:
          filters: |
            src:
              - 'session_2/challenge/submissions/**'
      - name: Clone private dataset.
        if: steps.changes.outputs.src == 'true'
        uses: GuillaumeFalourd/[email protected]
        with:
          owner: hetulvp
          repository: promp-engineering-challange-private-dataset
          access-token: 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

      - name: Access cloned repository content
        run: |
          ls -la promp-engineering-challange-private-dataset
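The workflow above stops at listing the cloned private repository; presumably a follow-up step (in one of the changed files not expanded on this page) runs the actual evaluation against it. Below is a minimal sketch of what such an entry point could look like, reusing the dataset and evaluate_lib modules added in this commit. The module layout, flag names, and dataset directory are assumptions for illustration, not part of the commit:

```python
"""Hypothetical private-dataset entry point (sketch only, names assumed)."""

from collections.abc import Sequence

from absl import app, flags
from scripts import dataset, evaluate_lib

_PROMPT = flags.DEFINE_string("prompt", None, "Name of the prompt to evaluate.")
_DATASET_DIR = flags.DEFINE_string(
    "dataset_dir",
    "promp-engineering-challange-private-dataset",
    "Directory with the cloned private *_yes.txt / *_no.txt samples.",
)


def main(argv: Sequence[str]) -> None:
    del argv  # Unused.
    # Reuse the same loader and scoring loop as the public sample set.
    private_inputs = dataset.load_sample_test_set(samples_dir=_DATASET_DIR.value)
    acc = evaluate_lib.evaluate(dataset=private_inputs, prompt_name=_PROMPT.value)
    print("Private accuracy: [%.3f] %%" % acc)


if __name__ == "__main__":
    flags.mark_flag_as_required("prompt")
    app.run(main)
```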
2 changes: 1 addition & 1 deletion .github/workflows/github_pages.yaml
@@ -1,4 +1,4 @@
name: ci
name: Deploy to github pages
on:
  push:
    branches:
15 changes: 8 additions & 7 deletions session_2/challenge/leaderboard.md
@@ -12,13 +12,14 @@ Check [participation guide](how_to_participate.md).
<center>

<!-- leader-board-begins -->
| Rank | Profile Image | GitHub Username | Solution | Accuracy % |
|-------:|:------------------------------------------------------------------------------------------------|:-------------------------------------------|:-----------------------------------------------------------------------------------------------------------------------------------------|-------------:|
| 1 | <img src="https://github.com/hetul-patel.png" width="50px" height="50px" class="profile-image"> | [New User](https://github.com/new_user) | [New Solution](https://github.com/new_solution) | 99.5 |
| 2 | <img src="https://github.com/hetul-patel.png" width="50px" height="50px" class="profile-image"> | [Username 2](https://github.com/username2) | [Baseline](https://github.com/infocusp/llm_seminar_series/blob/hetul/prompting-leader-board/session_2/challenge/submissions/baseline.py) | 95 |
| 3 | <img src="https://github.com/hetul-patel.png" width="50px" height="50px" class="profile-image"> | [Username 4](https://github.com/username4) | [Baseline](https://github.com/infocusp/llm_seminar_series/blob/hetul/prompting-leader-board/session_2/challenge/submissions/baseline.py) | 95 |
| 4 | <img src="https://github.com/hetul-patel.png" width="50px" height="50px" class="profile-image"> | [Username 3](https://github.com/username3) | [Baseline](https://github.com/infocusp/llm_seminar_series/blob/hetul/prompting-leader-board/session_2/challenge/submissions/baseline.py) | 10 |
| 5 | <img src="https://github.com/hetul-patel.png" width="50px" height="50px" class="profile-image"> | [Username 1](https://github.com/username1) | [Baseline](https://github.com/infocusp/llm_seminar_series/blob/hetul/prompting-leader-board/session_2/challenge/submissions/baseline.py) | 0 |
| Rank | Profile Image | GitHub Username | Solution | Accuracy % |
|-------:|:------------------------------------------------------------------------------------------------|:----------------------------------------------|:-----------------------------------------------------------------------------------------------------------------------------------------|-------------:|
| 1 | <img src="https://github.com/hetul-patel.png" width="50px" height="50px" class="profile-image"> | [New User](https://github.com/new_user) | [New Solution](https://github.com/new_solution) | 99.5 |
| 2 | <img src="https://github.com/hetul-patel.png" width="50px" height="50px" class="profile-image"> | [Username 2](https://github.com/username2) | [Baseline](https://github.com/infocusp/llm_seminar_series/blob/hetul/prompting-leader-board/session_2/challenge/submissions/baseline.py) | 95 |
| 3 | <img src="https://github.com/hetul-patel.png" width="50px" height="50px" class="profile-image"> | [Username 4](https://github.com/username4) | [Baseline](https://github.com/infocusp/llm_seminar_series/blob/hetul/prompting-leader-board/session_2/challenge/submissions/baseline.py) | 95 |
| 4 | <img src="https://github.com/hetul-patel.png" width="50px" height="50px" class="profile-image"> | [hetul-patel](https://github.com/hetul-patel) | [baseline](https://github.com/infocusp/llm_seminar_series/blob/main/session_2/challenge/submissions/baseline.py) | 50 |
| 5 | <img src="https://github.com/hetul-patel.png" width="50px" height="50px" class="profile-image"> | [Username 3](https://github.com/username3) | [Baseline](https://github.com/infocusp/llm_seminar_series/blob/hetul/prompting-leader-board/session_2/challenge/submissions/baseline.py) | 10 |
| 6 | <img src="https://github.com/hetul-patel.png" width="50px" height="50px" class="profile-image"> | [Username 1](https://github.com/username1) | [Baseline](https://github.com/infocusp/llm_seminar_series/blob/hetul/prompting-leader-board/session_2/challenge/submissions/baseline.py) | 0 |
<!-- leader-board-ends -->

</center>
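The table sits between the `<!-- leader-board-begins -->` and `<!-- leader-board-ends -->` markers, which suggests it is regenerated by tooling rather than edited by hand. A rough sketch of how such a regeneration step could splice a fresh table between the markers; the file path and helper function here are illustrative assumptions, not the repository's actual script:

```python
import re

LEADERBOARD_PATH = "session_2/challenge/leaderboard.md"  # assumed location
BEGIN = "<!-- leader-board-begins -->"
END = "<!-- leader-board-ends -->"


def splice_leaderboard(new_table_md: str) -> None:
    """Replaces the markdown table between the begin/end markers."""
    with open(LEADERBOARD_PATH, "r") as f:
        content = f.read()
    pattern = re.compile(re.escape(BEGIN) + r".*?" + re.escape(END), re.DOTALL)
    new_block = BEGIN + "\n" + new_table_md.rstrip() + "\n" + END
    with open(LEADERBOARD_PATH, "w") as f:
        f.write(pattern.sub(lambda _: new_block, content))
```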
24 changes: 24 additions & 0 deletions session_2/challenge/scripts/dataset.py
@@ -0,0 +1,24 @@
"""Utilities to load evaluation datasets."""

import glob
import os


def load_sample_test_set(samples_dir: str) -> list[tuple[str, bool]]:
    """Loads sample job descriptions and answers for local testing."""
    sample_files = glob.glob(os.path.join(samples_dir, "*.txt"))
    sample_inputs = []
    for filepath in sample_files:
        with open(filepath, "r") as f:
            content = f.read()
        filename = os.path.basename(filepath).lower()
        if filename.endswith("_yes.txt"):
            target = True
        elif filename.endswith("_no.txt"):
            target = False
        else:
            raise ValueError(
                "File %s must end with _yes.txt or _no.txt" % filepath
            )
        sample_inputs.append((content, target))
    return sample_inputs
61 changes: 9 additions & 52 deletions session_2/challenge/scripts/evaluate.py
@@ -21,15 +21,11 @@ def build_prompt(self, job_description: str) -> str:
python3 -m scripts.evaluate --prompt=baseline
"""

import glob
import logging
import os
from collections.abc import Sequence

import tqdm
from absl import app, flags
from scripts import model, registry
from submissions import baseline # noqa: F401
from scripts import dataset, evaluate_lib

_PROMPT = flags.DEFINE_string(
    "prompt", None, "Name of the prompt to evaluate."
@@ -39,52 +39,12 @@ def build_prompt(self, job_description: str) -> str:
    "debug", True, "Prints prompt and response if true."
)

_SAMPLES_DIR = "sample_inputs"


def load_sample_test_set() -> list[tuple[str, bool]]:
    """Loads sample job descriptions and answers for local testing."""
    sample_files = glob.glob(os.path.join(_SAMPLES_DIR, "*.txt"))
    sample_inputs = []
    for filepath in sample_files:
        content = open(filepath, "r").read()
        filename = os.path.basename(filepath).lower()
        if filename.endswith("_yes.txt"):
            target = True
        elif filename.endswith("_no.txt"):
            target = False
        else:
            raise ValueError(
                "File %s must end with yes.txt or no.txt" % filepath
            )
        target = True if "yes" in filename.lower() else False
        sample_inputs.append((content, target))
    return sample_inputs


def evaluate(prompt_name: str):
    """Evaluates the prompt submission."""
    # Loads a free gpt4 model.
    llm = model.G4fModel()

    # Loads a prompt submission.
    prompt_handler = registry.get(name=prompt_name)

    # Generate results for the dataset.
    dataset = load_sample_test_set()
    correct_pred = 0
    for idx, (job_description, target) in enumerate(tqdm.tqdm(dataset)):
        prompt = prompt_handler.build_prompt(job_description=job_description)
        logging.debug("[prompt %d]\n%s", idx, prompt)
        response = llm.generate(prompt=prompt)
        logging.debug("[response %d]\n%s", idx, response)
        output = prompt_handler.parse_response(model_response=response)
        logging.debug("[target %d]\n%s", idx, target)
        logging.debug("[prediction %d]\n%s", idx, output)
        if output == target:
            correct_pred += 1

    print("Accuracy: [%.3f] %%" % (correct_pred / len(dataset) * 100))  # noqa: T201

def evaluate_on_sample_dataset(prompt_name: str):
    """Evaluates the prompt on a sample_dataset."""
    sample_inputs = dataset.load_sample_test_set(samples_dir="sample_inputs")
    acc = evaluate_lib.evaluate(dataset=sample_inputs, prompt_name=prompt_name)
    print("Accuracy: [%.3f] %%" % acc)  # noqa: T201


def main(argv: Sequence[str]) -> None:
@@ -95,8 +51,9 @@ def main(argv: Sequence[str]) -> None:
        logging.getLogger().setLevel(logging.DEBUG)
    else:
        logging.getLogger().setLevel(logging.INFO)
    evaluate(prompt_name=_PROMPT.value)
    evaluate_on_sample_dataset(prompt_name=_PROMPT.value)


if __name__ == "__main__":
    flags.mark_flag_as_required("prompt")
    app.run(main)
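With this refactor the local CLI is unchanged: `python3 -m scripts.evaluate --prompt=baseline` still scores a submission against the bundled sample_inputs, while the shared scoring loop now lives in evaluate_lib, presumably so the private-dataset workflow can reuse it.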
36 changes: 36 additions & 0 deletions session_2/challenge/scripts/evaluate_lib.py
@@ -0,0 +1,36 @@
"""Library function for evaluating a prompt on a particular dataset."""

import logging

import tqdm
from scripts import model, registry
from submissions import * # noqa: F401, F403
from submissions import baseline # noqa: F401


def evaluate(dataset: list[tuple[str, bool]], prompt_name: str):
    """Evaluates the prompt submission."""
    # Loads a free gpt4 model.
    llm = model.G4fModel()

    # Loads a prompt submission.
    prompt_handler = registry.get(name=prompt_name)

    # Generate results for the dataset.
    correct_pred = 0
    for idx, (job_description, target) in enumerate(tqdm.tqdm(dataset)):
        prompt = prompt_handler.build_prompt(job_description=job_description)
        response = llm.generate(prompt=prompt)
        prediction = prompt_handler.parse_response(model_response=response)
        if prediction == target:
            correct_pred += 1
            result = "[PASS]"
        else:
            result = "[FAIL]"

        logging.debug(
            "No=%d. target=%s prediction=%s %s\n[prompt]\n%s\n[response]\n%s"
            % (idx, target, prediction, result, prompt, response)
        )
    acc = correct_pred / len(dataset) * 100
    return acc
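evaluate_lib is dataset-agnostic: it only needs (text, target) pairs and a registered prompt name. A small usage sketch follows; the toy pairs are made up for illustration (real callers pass the loaded sample or private sets), and the call still queries the g4f-backed model, so it needs network access:

```python
from scripts import evaluate_lib

# Toy (job_description, target) pairs purely for illustration.
toy_dataset = [
    ("Job description the prompt should label True.", True),
    ("Job description the prompt should label False.", False),
]
accuracy = evaluate_lib.evaluate(dataset=toy_dataset, prompt_name="baseline")
print("Accuracy: [%.3f] %%" % accuracy)
```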