Added GitHub Action to evaluate on a private dataset
hetulvp committed Apr 20, 2024
1 parent 002f879 commit ba68719
Showing 9 changed files with 234 additions and 127 deletions.
29 changes: 29 additions & 0 deletions .github/workflows/evaluate_on_private_dataset.yaml
@@ -0,0 +1,29 @@
name: Evaluate prompt engineering challenge results on a private dataset.

on:
  pull_request:
    types: [opened, reopened, synchronize]

jobs:
  private_evaluation:
    runs-on: ubuntu-latest
    steps:
      - name: Check if there are any changes in submissions dir
        uses: dorny/[email protected]
        id: changes
        with:
          filters: |
            src:
              - 'session_2/challenge/submissions/**'
      - name: Clone private dataset.
        if: steps.changes.outputs.src == 'true'
        uses: GuillaumeFalourd/[email protected]
        with:
          owner: hetulvp
          repository: promp-engineering-challange-private-dataset
          access-token: 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

      - name: Access cloned repository content
        run: |
          ls -la promp-engineering-challange-private-dataset
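The workflow above stops at listing the cloned private repository; presumably a follow-up step (in one of the changed files not expanded on this page) runs the actual evaluation against it. Below is a minimal sketch of what such an entry point could look like, reusing the dataset and evaluate_lib modules added in this commit. The module layout, flag names, and dataset directory are assumptions for illustration, not part of the commit:

```python
"""Hypothetical private-dataset entry point (sketch only, names assumed)."""

from collections.abc import Sequence

from absl import app, flags
from scripts import dataset, evaluate_lib

_PROMPT = flags.DEFINE_string("prompt", None, "Name of the prompt to evaluate.")
_DATASET_DIR = flags.DEFINE_string(
    "dataset_dir",
    "promp-engineering-challange-private-dataset",
    "Directory with the cloned private *_yes.txt / *_no.txt samples.",
)


def main(argv: Sequence[str]) -> None:
    del argv  # Unused.
    # Reuse the same loader and scoring loop as the public sample set.
    private_inputs = dataset.load_sample_test_set(samples_dir=_DATASET_DIR.value)
    acc = evaluate_lib.evaluate(dataset=private_inputs, prompt_name=_PROMPT.value)
    print("Private accuracy: [%.3f] %%" % acc)


if __name__ == "__main__":
    flags.mark_flag_as_required("prompt")
    app.run(main)
```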
2 changes: 1 addition & 1 deletion .github/workflows/github_pages.yaml
@@ -1,4 +1,4 @@
name: ci
name: Deploy to github pages
on:
  push:
    branches:
15 changes: 8 additions & 7 deletions session_2/challenge/leaderboard.md
@@ -12,13 +12,14 @@ Check [participation guide](how_to_participate.md).
<center>

<!-- leader-board-begins -->
| Rank | Profile Image | GitHub Username | Solution | Accuracy % |
|-------:|:------------------------------------------------------------------------------------------------|:-------------------------------------------|:-----------------------------------------------------------------------------------------------------------------------------------------|-------------:|
| 1 | <img src="https://github.com/hetul-patel.png" width="50px" height="50px" class="profile-image"> | [New User](https://github.com/new_user) | [New Solution](https://github.com/new_solution) | 99.5 |
| 2 | <img src="https://github.com/hetul-patel.png" width="50px" height="50px" class="profile-image"> | [Username 2](https://github.com/username2) | [Baseline](https://github.com/infocusp/llm_seminar_series/blob/hetul/prompting-leader-board/session_2/challenge/submissions/baseline.py) | 95 |
| 3 | <img src="https://github.com/hetul-patel.png" width="50px" height="50px" class="profile-image"> | [Username 4](https://github.com/username4) | [Baseline](https://github.com/infocusp/llm_seminar_series/blob/hetul/prompting-leader-board/session_2/challenge/submissions/baseline.py) | 95 |
| 4 | <img src="https://github.com/hetul-patel.png" width="50px" height="50px" class="profile-image"> | [Username 3](https://github.com/username3) | [Baseline](https://github.com/infocusp/llm_seminar_series/blob/hetul/prompting-leader-board/session_2/challenge/submissions/baseline.py) | 10 |
| 5 | <img src="https://github.com/hetul-patel.png" width="50px" height="50px" class="profile-image"> | [Username 1](https://github.com/username1) | [Baseline](https://github.com/infocusp/llm_seminar_series/blob/hetul/prompting-leader-board/session_2/challenge/submissions/baseline.py) | 0 |
| Rank | Profile Image | GitHub Username | Solution | Accuracy % |
|-------:|:------------------------------------------------------------------------------------------------|:----------------------------------------------|:-----------------------------------------------------------------------------------------------------------------------------------------|-------------:|
| 1 | <img src="https://github.com/hetul-patel.png" width="50px" height="50px" class="profile-image"> | [New User](https://github.com/new_user) | [New Solution](https://github.com/new_solution) | 99.5 |
| 2 | <img src="https://github.com/hetul-patel.png" width="50px" height="50px" class="profile-image"> | [Username 2](https://github.com/username2) | [Baseline](https://github.com/infocusp/llm_seminar_series/blob/hetul/prompting-leader-board/session_2/challenge/submissions/baseline.py) | 95 |
| 3 | <img src="https://github.com/hetul-patel.png" width="50px" height="50px" class="profile-image"> | [Username 4](https://github.com/username4) | [Baseline](https://github.com/infocusp/llm_seminar_series/blob/hetul/prompting-leader-board/session_2/challenge/submissions/baseline.py) | 95 |
| 4 | <img src="https://github.com/hetul-patel.png" width="50px" height="50px" class="profile-image"> | [hetul-patel](https://github.com/hetul-patel) | [baseline](https://github.com/infocusp/llm_seminar_series/blob/main/session_2/challenge/submissions/baseline.py) | 50 |
| 5 | <img src="https://github.com/hetul-patel.png" width="50px" height="50px" class="profile-image"> | [Username 3](https://github.com/username3) | [Baseline](https://github.com/infocusp/llm_seminar_series/blob/hetul/prompting-leader-board/session_2/challenge/submissions/baseline.py) | 10 |
| 6 | <img src="https://github.com/hetul-patel.png" width="50px" height="50px" class="profile-image"> | [Username 1](https://github.com/username1) | [Baseline](https://github.com/infocusp/llm_seminar_series/blob/hetul/prompting-leader-board/session_2/challenge/submissions/baseline.py) | 0 |
<!-- leader-board-ends -->

</center>
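The table sits between the `<!-- leader-board-begins -->` and `<!-- leader-board-ends -->` markers, which suggests it is regenerated by tooling rather than edited by hand. A rough sketch of how such a regeneration step could splice a fresh table between the markers; the file path and helper function here are illustrative assumptions, not the repository's actual script:

```python
import re

LEADERBOARD_PATH = "session_2/challenge/leaderboard.md"  # assumed location
BEGIN = "<!-- leader-board-begins -->"
END = "<!-- leader-board-ends -->"


def splice_leaderboard(new_table_md: str) -> None:
    """Replaces the markdown table between the begin/end markers."""
    with open(LEADERBOARD_PATH, "r") as f:
        content = f.read()
    pattern = re.compile(re.escape(BEGIN) + r".*?" + re.escape(END), re.DOTALL)
    new_block = BEGIN + "\n" + new_table_md.rstrip() + "\n" + END
    with open(LEADERBOARD_PATH, "w") as f:
        f.write(pattern.sub(lambda _: new_block, content))
```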
24 changes: 24 additions & 0 deletions session_2/challenge/scripts/dataset.py
@@ -0,0 +1,24 @@
"""Utilities to load evaluation datasets."""

import glob
import os


def load_sample_test_set(samples_dir: str) -> list[tuple[str, bool]]:
    """Loads sample job descriptions and answers for local testing."""
    sample_files = glob.glob(os.path.join(samples_dir, "*.txt"))
    sample_inputs = []
    for filepath in sample_files:
        with open(filepath, "r") as f:
            content = f.read()
        filename = os.path.basename(filepath).lower()
        if filename.endswith("_yes.txt"):
            target = True
        elif filename.endswith("_no.txt"):
            target = False
        else:
            raise ValueError(
                "File %s must end with _yes.txt or _no.txt" % filepath
            )
        sample_inputs.append((content, target))
    return sample_inputs
61 changes: 9 additions & 52 deletions session_2/challenge/scripts/evaluate.py
@@ -21,15 +21,11 @@ def build_prompt(self, job_description: str) -> str:
python3 -m scripts.evaluate --prompt=baseline
"""

import glob
import logging
import os
from collections.abc import Sequence

import tqdm
from absl import app, flags
from scripts import model, registry
from submissions import baseline # noqa: F401
from scripts import dataset, evaluate_lib

_PROMPT = flags.DEFINE_string(
    "prompt", None, "Name of the prompt to evaluate."
@@ -39,52 +39,12 @@ def build_prompt(self, job_description: str) -> str:
    "debug", True, "Prints prompt and response if true."
)

_SAMPLES_DIR = "sample_inputs"


def load_sample_test_set() -> list[tuple[str, bool]]:
    """Loads sample job descriptions and answers for local testing."""
    sample_files = glob.glob(os.path.join(_SAMPLES_DIR, "*.txt"))
    sample_inputs = []
    for filepath in sample_files:
        content = open(filepath, "r").read()
        filename = os.path.basename(filepath).lower()
        if filename.endswith("_yes.txt"):
            target = True
        elif filename.endswith("_no.txt"):
            target = False
        else:
            raise ValueError(
                "File %s must end with yes.txt or no.txt" % filepath
            )
        target = True if "yes" in filename.lower() else False
        sample_inputs.append((content, target))
    return sample_inputs


def evaluate(prompt_name: str):
    """Evaluates the prompt submission."""
    # Loads a free gpt4 model.
    llm = model.G4fModel()

    # Loads a prompt submission.
    prompt_handler = registry.get(name=prompt_name)

    # Generate results for the dataset.
    dataset = load_sample_test_set()
    correct_pred = 0
    for idx, (job_description, target) in enumerate(tqdm.tqdm(dataset)):
        prompt = prompt_handler.build_prompt(job_description=job_description)
        logging.debug("[prompt %d]\n%s", idx, prompt)
        response = llm.generate(prompt=prompt)
        logging.debug("[response %d]\n%s", idx, response)
        output = prompt_handler.parse_response(model_response=response)
        logging.debug("[target %d]\n%s", idx, target)
        logging.debug("[prediction %d]\n%s", idx, output)
        if output == target:
            correct_pred += 1

    print("Accuracy: [%.3f] %%" % (correct_pred / len(dataset) * 100))  # noqa: T201

def evaluate_on_sample_dataset(prompt_name: str):
    """Evaluates the prompt on a sample_dataset."""
    sample_inputs = dataset.load_sample_test_set(samples_dir="sample_inputs")
    acc = evaluate_lib.evaluate(dataset=sample_inputs, prompt_name=prompt_name)
    print("Accuracy: [%.3f] %%" % acc)  # noqa: T201


def main(argv: Sequence[str]) -> None:
@@ -95,8 +51,9 @@ def main(argv: Sequence[str]) -> None:
        logging.getLogger().setLevel(logging.DEBUG)
    else:
        logging.getLogger().setLevel(logging.INFO)
    evaluate(prompt_name=_PROMPT.value)
    evaluate_on_sample_dataset(prompt_name=_PROMPT.value)


if __name__ == "__main__":
    flags.mark_flag_as_required("prompt")
    app.run(main)
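With this refactor the local CLI is unchanged: `python3 -m scripts.evaluate --prompt=baseline` still scores a submission against the bundled sample_inputs, while the shared scoring loop now lives in evaluate_lib, presumably so the private-dataset workflow can reuse it.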
36 changes: 36 additions & 0 deletions session_2/challenge/scripts/evaluate_lib.py
@@ -0,0 +1,36 @@
"""Library function for evaluating a prompt on a particular dataset."""

import logging

import tqdm
from scripts import model, registry
from submissions import * # noqa: F401, F403
from submissions import baseline # noqa: F401


def evaluate(dataset: list[tuple[str, bool]], prompt_name: str):
    """Evaluates the prompt submission."""
    # Loads a free gpt4 model.
    llm = model.G4fModel()

    # Loads a prompt submission.
    prompt_handler = registry.get(name=prompt_name)

    # Generate results for the dataset.
    correct_pred = 0
    for idx, (job_description, target) in enumerate(tqdm.tqdm(dataset)):
        prompt = prompt_handler.build_prompt(job_description=job_description)
        response = llm.generate(prompt=prompt)
        prediction = prompt_handler.parse_response(model_response=response)
        if prediction == target:
            correct_pred += 1
            result = "[PASS]"
        else:
            result = "[FAIL]"

        logging.debug(
            "No=%d. target=%s prediction=%s %s\n[prompt]\n%s\n[response]\n%s"
            % (idx, target, prediction, result, prompt, response)
        )
    acc = correct_pred / len(dataset) * 100
    return acc
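evaluate_lib is dataset-agnostic: it only needs (text, target) pairs and a registered prompt name. A small usage sketch follows; the toy pairs are made up for illustration (real callers pass the loaded sample or private sets), and the call still queries the g4f-backed model, so it needs network access:

```python
from scripts import evaluate_lib

# Toy (job_description, target) pairs purely for illustration.
toy_dataset = [
    ("Job description the prompt should label True.", True),
    ("Job description the prompt should label False.", False),
]
accuracy = evaluate_lib.evaluate(dataset=toy_dataset, prompt_name="baseline")
print("Accuracy: [%.3f] %%" % accuracy)
```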