diff --git a/.github/workflows/check_star_for_challange_submission.yaml b/.github/workflows/check_star_for_challenge_submission.yaml
similarity index 100%
rename from .github/workflows/check_star_for_challange_submission.yaml
rename to .github/workflows/check_star_for_challenge_submission.yaml
diff --git a/.github/workflows/evaluate_on_private_dataset.yaml b/.github/workflows/evaluate_on_private_dataset.yaml
new file mode 100644
index 0000000..462efbe
--- /dev/null
+++ b/.github/workflows/evaluate_on_private_dataset.yaml
@@ -0,0 +1,30 @@
+name: Evaluate prompt engineering challenge submissions on a private dataset
+
+on:
+ pull_request:
+ types: [opened, reopened, synchronize]
+
+jobs:
+ private_evaluation:
+ runs-on: ubuntu-latest
+ steps:
+ - name: Check if there are any changes in submissions dir
+ uses: dorny/paths-filter@v3.0.2
+ id: changes
+ with:
+ filters: |
+ src:
+ - 'session_2/challenge/submissions/**'
+
+ - name: Clone private dataset.
+ if: steps.changes.outputs.src == 'true'
+ uses: GuillaumeFalourd/clone-github-repo-action@v2.3
+ with:
+ owner: 'hetulvp'
+ repository: 'promp-engineering-challange-private-dataset'
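+        # Token for the private dataset repo is read from a repository secret (secret name assumed below); a raw PAT must never be committed.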
+        access-token: ${{ secrets.PRIVATE_DATASET_ACCESS_TOKEN }}
+
+ - name: Access cloned repository content
+ run: |
+ ls -la promp-engineering-challange-private-dataset
\ No newline at end of file
diff --git a/.github/workflows/github_pages.yaml b/.github/workflows/github_pages.yaml
index e2da726..f65b651 100644
--- a/.github/workflows/github_pages.yaml
+++ b/.github/workflows/github_pages.yaml
@@ -1,4 +1,4 @@
-name: ci
+name: Deploy to GitHub Pages
on:
push:
branches:
diff --git a/session_2/challenge/leaderboard.md b/session_2/challenge/leaderboard.md
index e9937bd..f3dce9a 100644
--- a/session_2/challenge/leaderboard.md
+++ b/session_2/challenge/leaderboard.md
@@ -12,13 +12,14 @@ Check [participation guide](how_to_participate.md).
-| Rank | Profile Image | GitHub Username | Solution | Accuracy % |
-|-------:|:------------------------------------------------------------------------------------------------|:-------------------------------------------|:-----------------------------------------------------------------------------------------------------------------------------------------|-------------:|
-| 1 | | [New User](https://github.com/new_user) | [New Solution](https://github.com/new_solution) | 99.5 |
-| 2 | | [Username 2](https://github.com/username2) | [Baseline](https://github.com/infocusp/llm_seminar_series/blob/hetul/prompting-leader-board/session_2/challenge/submissions/baseline.py) | 95 |
-| 3 | | [Username 4](https://github.com/username4) | [Baseline](https://github.com/infocusp/llm_seminar_series/blob/hetul/prompting-leader-board/session_2/challenge/submissions/baseline.py) | 95 |
-| 4 | | [Username 3](https://github.com/username3) | [Baseline](https://github.com/infocusp/llm_seminar_series/blob/hetul/prompting-leader-board/session_2/challenge/submissions/baseline.py) | 10 |
-| 5 | | [Username 1](https://github.com/username1) | [Baseline](https://github.com/infocusp/llm_seminar_series/blob/hetul/prompting-leader-board/session_2/challenge/submissions/baseline.py) | 0 |
+| Rank | Profile Image | GitHub Username | Solution | Accuracy % |
+|-------:|:------------------------------------------------------------------------------------------------|:----------------------------------------------|:-----------------------------------------------------------------------------------------------------------------------------------------|-------------:|
+| 1 | | [New User](https://github.com/new_user) | [New Solution](https://github.com/new_solution) | 99.5 |
+| 2 | | [Username 2](https://github.com/username2) | [Baseline](https://github.com/infocusp/llm_seminar_series/blob/hetul/prompting-leader-board/session_2/challenge/submissions/baseline.py) | 95 |
+| 3 | | [Username 4](https://github.com/username4) | [Baseline](https://github.com/infocusp/llm_seminar_series/blob/hetul/prompting-leader-board/session_2/challenge/submissions/baseline.py) | 95 |
+| 4 | | [hetul-patel](https://github.com/hetul-patel) | [baseline](https://github.com/infocusp/llm_seminar_series/blob/main/session_2/challenge/submissions/baseline.py) | 50 |
+| 5 | | [Username 3](https://github.com/username3) | [Baseline](https://github.com/infocusp/llm_seminar_series/blob/hetul/prompting-leader-board/session_2/challenge/submissions/baseline.py) | 10 |
+| 6 | | [Username 1](https://github.com/username1) | [Baseline](https://github.com/infocusp/llm_seminar_series/blob/hetul/prompting-leader-board/session_2/challenge/submissions/baseline.py) | 0 |
\ No newline at end of file
diff --git a/session_2/challenge/scripts/dataset.py b/session_2/challenge/scripts/dataset.py
new file mode 100644
index 0000000..74bd476
--- /dev/null
+++ b/session_2/challenge/scripts/dataset.py
@@ -0,0 +1,25 @@
+"""Utilities to load evaluation datasets."""
+
+import glob
+import os
+
+
+def load_sample_test_set(samples_dir: str) -> list[tuple[str, bool]]:
+ """Loads sample job descriptions and answers for local testing."""
+ sample_files = glob.glob(os.path.join(samples_dir, "*.txt"))
+ sample_inputs = []
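+    # The ground-truth label is encoded in the filename suffix: *_yes.txt -> True, *_no.txt -> False.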
+ for filepath in sample_files:
+        with open(filepath, "r") as f:
+            content = f.read()
+ filename = os.path.basename(filepath).lower()
+ if filename.endswith("_yes.txt"):
+ target = True
+ elif filename.endswith("_no.txt"):
+ target = False
+ else:
+            raise ValueError(
+                "File %s must end with _yes.txt or _no.txt" % filepath
+            )
+ sample_inputs.append((content, target))
+ return sample_inputs
diff --git a/session_2/challenge/scripts/evaluate.py b/session_2/challenge/scripts/evaluate.py
index c2fe3a9..1ca6baa 100644
--- a/session_2/challenge/scripts/evaluate.py
+++ b/session_2/challenge/scripts/evaluate.py
@@ -21,15 +21,11 @@ def build_prompt(self, job_description: str) -> str:
python3 -m scripts.evaluate --prompt=baseline
"""
-import glob
import logging
-import os
from collections.abc import Sequence
-import tqdm
from absl import app, flags
-from scripts import model, registry
-from submissions import baseline # noqa: F401
+from scripts import dataset, evaluate_lib
_PROMPT = flags.DEFINE_string(
"prompt", None, "Name of the prompt to evaluate."
@@ -39,52 +35,12 @@ def build_prompt(self, job_description: str) -> str:
"debug", True, "Prints prompt and response if true."
)
-_SAMPLES_DIR = "sample_inputs"
-
-
-def load_sample_test_set() -> list[tuple[str, bool]]:
- """Loads sample job descriptions and answers for local testing."""
- sample_files = glob.glob(os.path.join(_SAMPLES_DIR, "*.txt"))
- sample_inputs = []
- for filepath in sample_files:
- content = open(filepath, "r").read()
- filename = os.path.basename(filepath).lower()
- if filename.endswith("_yes.txt"):
- target = True
- elif filename.endswith("_no.txt"):
- target = False
- else:
- raise ValueError(
- "File %s must end with yes.txt or no.txt" % filepath
- )
- target = True if "yes" in filename.lower() else False
- sample_inputs.append((content, target))
- return sample_inputs
-
-
-def evaluate(prompt_name: str):
- """Evaluates the prompt submission."""
- # Loads a free gpt4 model.
- llm = model.G4fModel()
-
- # Loads a prompt submission.
- prompt_handler = registry.get(name=prompt_name)
-
- # Generate results for the dataset.
- dataset = load_sample_test_set()
- correct_pred = 0
- for idx, (job_description, target) in enumerate(tqdm.tqdm(dataset)):
- prompt = prompt_handler.build_prompt(job_description=job_description)
- logging.debug("[prompt %d]\n%s", idx, prompt)
- response = llm.generate(prompt=prompt)
- logging.debug("[response %d]\n%s", idx, response)
- output = prompt_handler.parse_response(model_response=response)
- logging.debug("[target %d]\n%s", idx, target)
- logging.debug("[prediction %d]\n%s", idx, output)
- if output == target:
- correct_pred += 1
-
- print("Accuracy: [%.3f] %%" % (correct_pred / len(dataset) * 100)) # noqa: T201
+
+def evaluate_on_sample_dataset(prompt_name: str):
+    """Evaluates the prompt on the sample dataset."""
+ sample_inputs = dataset.load_sample_test_set(samples_dir="sample_inputs")
+ acc = evaluate_lib.evaluate(dataset=sample_inputs, prompt_name=prompt_name)
+ print("Accuracy: [%.3f] %%" % acc) # noqa: T201
def main(argv: Sequence[str]) -> None:
@@ -95,8 +51,9 @@ def main(argv: Sequence[str]) -> None:
logging.getLogger().setLevel(logging.DEBUG)
else:
logging.getLogger().setLevel(logging.INFO)
- evaluate(prompt_name=_PROMPT.value)
+ evaluate_on_sample_dataset(prompt_name=_PROMPT.value)
if __name__ == "__main__":
+ flags.mark_flag_as_required("prompt")
app.run(main)
diff --git a/session_2/challenge/scripts/evaluate_lib.py b/session_2/challenge/scripts/evaluate_lib.py
new file mode 100644
index 0000000..355979e
--- /dev/null
+++ b/session_2/challenge/scripts/evaluate_lib.py
@@ -0,0 +1,37 @@
+"""Library function for evaluating a prompt on a particular dataset."""
+
+import logging
+
+import tqdm
+from scripts import model, registry
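+# Importing the submission modules registers their prompt classes with the registry.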
+from submissions import * # noqa: F401, F403
+from submissions import baseline # noqa: F401
+
+
+def evaluate(dataset: list[tuple[str, bool]], prompt_name: str) -> float:
+    """Evaluates the prompt submission and returns the accuracy in percent."""
+ # Loads a free gpt4 model.
+ llm = model.G4fModel()
+
+ # Loads a prompt submission.
+ prompt_handler = registry.get(name=prompt_name)
+
+ # Generate results for the dataset.
+ correct_pred = 0
+ for idx, (job_description, target) in enumerate(tqdm.tqdm(dataset)):
+ prompt = prompt_handler.build_prompt(job_description=job_description)
+ response = llm.generate(prompt=prompt)
+ prediction = prompt_handler.parse_response(model_response=response)
+ if prediction == target:
+ correct_pred += 1
+ result = "[PASS]"
+ else:
+ result = "[FAIL]"
+
+        logging.debug(
+            "No=%d. target=%s prediction=%s %s\n[prompt]\n%s\n[response]\n%s",
+            idx, target, prediction, result, prompt, response,
+        )
+ acc = correct_pred / len(dataset) * 100
+ return acc
diff --git a/session_2/challenge/scripts/leaderboard.py b/session_2/challenge/scripts/leaderboard.py
index 08e8631..802269f 100644
--- a/session_2/challenge/scripts/leaderboard.py
+++ b/session_2/challenge/scripts/leaderboard.py
@@ -1,76 +1,128 @@
-"""Generates leaderboard."""
+"""Updates the public leaderboard after evaluating the given submission."""
+import logging
import re
+from collections.abc import Sequence
import pandas as pd
+from absl import app, flags
+from scripts import dataset, evaluate_lib, registry
-# Read the markdown table into a DataFrame
-with open("session_2/challenge/leaderboard.md", "r") as file:
- content = file.read()
-
-start_marker = "\n"
-start_index = content.find(start_marker)
-end_index = content.find("\n")
-table_content = content[start_index:end_index]
-
-
-# Extract rows using regex
-rows = re.findall(
- r"\|([^|]+)\|([^|]+)\|([^|]+)\|([^|]+)\|([^|]+)\|", table_content
-)[2:]
-
-# Create a DataFrame from the extracted rows
-df = pd.DataFrame(
- rows,
- columns=[
- "Rank",
- "Profile Image",
- "GitHub Username",
- "Solution",
- "Accuracy %",
- ],
+_PROMPT = flags.DEFINE_string(
+ "prompt", None, "Name of the submitted prompt to evaluate."
)
-# Strip extra spaces before and after text in each cell
-df = df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)
-
-# Convert "Rank" column to integer and "Accuracy %" column to float
-df["Rank"] = df["Rank"].astype(int)
-df["Accuracy %"] = df["Accuracy %"].astype(float)
-
-# Add a new entry to the DataFrame
-new_entry = {
- "Rank": len(df) + 1,
- "Profile Image": '',
- "GitHub Username": "[New User](https://github.com/new_user)",
- "Solution": "[New Solution](https://github.com/new_solution)",
- "Accuracy %": 99.5,
-} # Example accuracy value
-
-df = pd.concat([df, pd.DataFrame([new_entry])], ignore_index=True)
-
-# Keep only the highest submission for each user
-highest_indices = df.groupby("GitHub Username")["Accuracy %"].idxmax()
-df_highest = df.loc[highest_indices]
-
-# Sort the DataFrame by "Accuracy %" column in descending order
-df_sorted = df_highest.sort_values(
- by="Accuracy %", ascending=False
-).reset_index(drop=True)
-
-# Update the "Rank" column after sorting
-df_sorted["Rank"] = df_sorted.index + 1
-
-# Convert the DataFrame back to markdown format
-markdown_table = df_sorted.to_markdown(index=False)
-
-# Replace the existing table in the markdown file with the sorted table
-new_content = (
- content[: start_index + len(start_marker)]
- + markdown_table
- + content[end_index:]
+_GITHUB_USER = flags.DEFINE_string(
+ "github_user", None, "Github username to add an entry in leaderboard."
)
-# Write the updated content back to the markdown file
-with open("session_2/challenge/leaderboard.md", "w") as file:
- file.write(new_content)
+
+_LEADERBOARD = "leaderboard.md"  # path to the current leaderboard file
+
+
+def generate_leaderboard(prompt_name: str, accuracy: float, github_user: str):
+ """Generates leaderboard."""
+ # Read the markdown table into a DataFrame
+    with open(_LEADERBOARD, "r") as file:
+ content = file.read()
+
+    start_marker = "<!-- leader-board-begins -->\n"  # marker text assumed; must match leaderboard.md
+    start_index = content.find(start_marker)
+    end_index = content.find("<!-- leader-board-ends -->")  # marker text assumed; must match leaderboard.md
+ table_content = content[start_index:end_index]
+
+    # Extract table rows with a regex; the first two matches are the header and separator rows.
+ rows = re.findall(
+ r"\|([^|]+)\|([^|]+)\|([^|]+)\|([^|]+)\|([^|]+)\|", table_content
+ )[2:]
+
+ # Create a DataFrame from the extracted rows
+ df = pd.DataFrame(
+ rows,
+ columns=[
+ "Rank",
+ "Profile Image",
+ "GitHub Username",
+ "Solution",
+ "Accuracy %",
+ ],
+ )
+
+ # Strip extra spaces before and after text in each cell
+ df = df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)
+
+ # Convert "Rank" column to integer and "Accuracy %" column to float
+ df["Rank"] = df["Rank"].astype(int)
+ df["Accuracy %"] = df["Accuracy %"].astype(float)
+
+ # Add a new entry to the DataFrame
+ prompt_file = registry.get_filename(name=prompt_name)
+ new_entry = {
+ "Rank": len(df) + 1,
+        "Profile Image": f'<img src="https://github.com/{github_user}.png" width="50" height="50">',  # avatar markup assumed
+ "GitHub Username": f"[{github_user}](https://github.com/{github_user})",
+ "Solution": f"[{prompt_name}](https://github.com/infocusp/llm_seminar_series/blob/main/session_2/challenge/submissions/{prompt_file})",
+ "Accuracy %": accuracy,
+ }
+
+ df = pd.concat([df, pd.DataFrame([new_entry])], ignore_index=True)
+
+ # Keep only the highest submission for each user
+ highest_indices = df.groupby("GitHub Username")["Accuracy %"].idxmax()
+ df_highest = df.loc[highest_indices]
+
+ # Sort the DataFrame by "Accuracy %" column in descending order
+ df_sorted = df_highest.sort_values(
+ by="Accuracy %", ascending=False
+ ).reset_index(drop=True)
+
+ # Update the "Rank" column after sorting
+ df_sorted["Rank"] = df_sorted.index + 1
+
+ # Convert the DataFrame back to markdown format
+ markdown_table = df_sorted.to_markdown(index=False)
+
+ # Replace the existing table in the markdown file with the sorted table
+ new_content = (
+ content[: start_index + len(start_marker)]
+ + markdown_table
+ + content[end_index:]
+ )
+
+ # Write the updated content back to the markdown file
+    with open(_LEADERBOARD, "w") as file:
+ file.write(new_content)
+
+ logging.info(
+ "Submission by %s with prompt %s updated in the leaderboard.",
+ github_user,
+ prompt_name,
+ )
+
+
+def update_leaderboard(prompt_name: str, github_user: str):
+    """Updates the public leaderboard by evaluating the given submission on the sample dataset."""
+ sample_dataset = dataset.load_sample_test_set(samples_dir="sample_inputs")
+ acc = evaluate_lib.evaluate(
+ dataset=sample_dataset, prompt_name=prompt_name
+ )
+ generate_leaderboard(
+ prompt_name=prompt_name, accuracy=acc, github_user=github_user
+ )
+
+
+def main(argv: Sequence[str]) -> None:
+ """Entrypoint."""
+ if len(argv) > 1:
+ raise app.UsageError("Too many command-line arguments.")
+ logging.getLogger().setLevel(logging.INFO)
+ update_leaderboard(
+ prompt_name=_PROMPT.value, github_user=_GITHUB_USER.value
+ )
+
+
+if __name__ == "__main__":
+ flags.mark_flag_as_required("prompt")
+ flags.mark_flag_as_required("github_user")
+ app.run(main)
diff --git a/session_2/challenge/scripts/registry.py b/session_2/challenge/scripts/registry.py
index 3659dec..fc661d1 100644
--- a/session_2/challenge/scripts/registry.py
+++ b/session_2/challenge/scripts/registry.py
@@ -1,5 +1,6 @@
"""Registry of all the submitted prompts."""
+import os
from typing import Type
from scripts import base
@@ -28,3 +29,10 @@ def get(name: str) -> base.PromptSubmission:
def get_all() -> list[Type[base.PromptSubmission]]:
"""Returns all the submissions."""
return list(_SUBMISSIONS_REGISTRY.values())
+
+
+def get_filename(name: str) -> str:
+    """Returns the filename of the module that defines the registered submission."""
+    submission = get(name=name)
+    filename = submission.__class__.__module__.split(".")[-1] + ".py"
+ return filename