diff --git a/.github/workflows/check_star_for_challange_submission.yaml b/.github/workflows/check_star_for_challenge_submission.yaml
similarity index 100%
rename from .github/workflows/check_star_for_challange_submission.yaml
rename to .github/workflows/check_star_for_challenge_submission.yaml
diff --git a/.github/workflows/evaluate_on_private_dataset.yaml b/.github/workflows/evaluate_on_private_dataset.yaml
new file mode 100644
index 0000000..462efbe
--- /dev/null
+++ b/.github/workflows/evaluate_on_private_dataset.yaml
@@ -0,0 +1,29 @@
+name: Evaluate prompt engineering challenge results on a private dataset.
+
+on:
+  pull_request:
+    types: [opened, reopened, synchronize]
+
+jobs:
+  private_evaluation:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Check if there are any changes in submissions dir
+        uses: dorny/paths-filter@v3.0.2
+        id: changes
+        with:
+          filters: |
+            src:
+              - 'session_2/challenge/submissions/**'
+
+      - name: Clone private dataset.
+        if: steps.changes.outputs.src == 'true'
+        uses: GuillaumeFalourd/clone-github-repo-action@v2.3
+        with:
+          owner: 'hetulvp'
+          repository: 'promp-engineering-challange-private-dataset'
+          access-token: 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
+
+      - name: Access cloned repository content
+        run: |
+          ls -la promp-engineering-challange-private-dataset
\ No newline at end of file
diff --git a/.github/workflows/github_pages.yaml b/.github/workflows/github_pages.yaml
index e2da726..f65b651 100644
--- a/.github/workflows/github_pages.yaml
+++ b/.github/workflows/github_pages.yaml
@@ -1,4 +1,4 @@
-name: ci
+name: Deploy to github pages
 on:
   push:
     branches:
diff --git a/session_2/challenge/leaderboard.md b/session_2/challenge/leaderboard.md
index e9937bd..f3dce9a 100644
--- a/session_2/challenge/leaderboard.md
+++ b/session_2/challenge/leaderboard.md
@@ -12,13 +12,14 @@ Check [participation guide](how_to_participate.md).
 
 <div class="center-table" markdown>
 
-| Rank | Profile Image | GitHub Username | Solution | Accuracy % |
-|-------:|:------------------------------------------------------------------------------------------------|:-------------------------------------------|:-----------------------------------------------------------------------------------------------------------------------------------------|-------------:|
-| 1 |  | [New User](https://github.com/new_user) | [New Solution](https://github.com/new_solution) | 99.5 |
-| 2 |  | [Username 2](https://github.com/username2) | [Baseline](https://github.com/infocusp/llm_seminar_series/blob/hetul/prompting-leader-board/session_2/challenge/submissions/baseline.py) | 95 |
-| 3 |  | [Username 4](https://github.com/username4) | [Baseline](https://github.com/infocusp/llm_seminar_series/blob/hetul/prompting-leader-board/session_2/challenge/submissions/baseline.py) | 95 |
-| 4 |  | [Username 3](https://github.com/username3) | [Baseline](https://github.com/infocusp/llm_seminar_series/blob/hetul/prompting-leader-board/session_2/challenge/submissions/baseline.py) | 10 |
-| 5 |  | [Username 1](https://github.com/username1) | [Baseline](https://github.com/infocusp/llm_seminar_series/blob/hetul/prompting-leader-board/session_2/challenge/submissions/baseline.py) | 0 |
+| Rank | Profile Image | GitHub Username | Solution | Accuracy % |
+|-------:|:------------------------------------------------------------------------------------------------|:----------------------------------------------|:-----------------------------------------------------------------------------------------------------------------------------------------|-------------:|
+| 1 |  | [New User](https://github.com/new_user) | [New Solution](https://github.com/new_solution) | 99.5 |
+| 2 |  | [Username 2](https://github.com/username2) | [Baseline](https://github.com/infocusp/llm_seminar_series/blob/hetul/prompting-leader-board/session_2/challenge/submissions/baseline.py) | 95 |
+| 3 |  | [Username 4](https://github.com/username4) | [Baseline](https://github.com/infocusp/llm_seminar_series/blob/hetul/prompting-leader-board/session_2/challenge/submissions/baseline.py) | 95 |
+| 4 |  | [hetul-patel](https://github.com/hetul-patel) | [baseline](https://github.com/infocusp/llm_seminar_series/blob/main/session_2/challenge/submissions/baseline.py) | 50 |
+| 5 |  | [Username 3](https://github.com/username3) | [Baseline](https://github.com/infocusp/llm_seminar_series/blob/hetul/prompting-leader-board/session_2/challenge/submissions/baseline.py) | 10 |
+| 6 |  | [Username 1](https://github.com/username1) | [Baseline](https://github.com/infocusp/llm_seminar_series/blob/hetul/prompting-leader-board/session_2/challenge/submissions/baseline.py) | 0 |
 
 
 </div>
\ No newline at end of file
diff --git a/session_2/challenge/scripts/dataset.py b/session_2/challenge/scripts/dataset.py
new file mode 100644
index 0000000..74bd476
--- /dev/null
+++ b/session_2/challenge/scripts/dataset.py
@@ -0,0 +1,23 @@
+"""Utilities to load evaluation datasets."""
+
+import glob
+import os
+
+
+def load_sample_test_set(samples_dir: str) -> list[tuple[str, bool]]:
+    """Loads sample job descriptions and answers for local testing."""
+    sample_files = glob.glob(os.path.join(samples_dir, "*.txt"))
+    sample_inputs = []
+    for filepath in sample_files:
+        content = open(filepath, "r").read()
+        filename = os.path.basename(filepath).lower()
+        if filename.endswith("_yes.txt"):
+            target = True
+        elif filename.endswith("_no.txt"):
+            target = False
+        else:
+            raise ValueError(
+                "File %s must end with yes.txt or no.txt" % filepath
+            )
+        sample_inputs.append((content, target))
+    return sample_inputs
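
Note: a minimal local usage sketch of the loader above. It assumes the code is run from session_2/challenge (the working directory that scripts.evaluate uses), where sample_inputs/ holds *.txt files whose names end in _yes.txt or _no.txt; the label is derived purely from that filename suffix.

    from scripts import dataset

    # Each item is (job_description_text, target_label): files ending in
    # "_yes.txt" load as True, "_no.txt" as False.
    samples = dataset.load_sample_test_set(samples_dir="sample_inputs")
    for text, label in samples:
        print(label, text[:60].replace("\n", " "))
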
diff --git a/session_2/challenge/scripts/evaluate.py b/session_2/challenge/scripts/evaluate.py
index c2fe3a9..1ca6baa 100644
--- a/session_2/challenge/scripts/evaluate.py
+++ b/session_2/challenge/scripts/evaluate.py
@@ -21,15 +21,11 @@ def build_prompt(self, job_description: str) -> str:
     python3 -m scripts.evaluate --prompt=baseline
 """
 
-import glob
 import logging
-import os
 from collections.abc import Sequence
 
-import tqdm
 from absl import app, flags
-from scripts import model, registry
-from submissions import baseline  # noqa: F401
+from scripts import dataset, evaluate_lib
 _PROMPT = flags.DEFINE_string(
     "prompt", None, "Name of the prompt to evaluate."
 )
@@ -39,52 +35,12 @@ def build_prompt(self, job_description: str) -> str:
     "debug", True, "Prints prompt and response if true."
 )
 
-_SAMPLES_DIR = "sample_inputs"
-
-
-def load_sample_test_set() -> list[tuple[str, bool]]:
-    """Loads sample job descriptions and answers for local testing."""
-    sample_files = glob.glob(os.path.join(_SAMPLES_DIR, "*.txt"))
-    sample_inputs = []
-    for filepath in sample_files:
-        content = open(filepath, "r").read()
-        filename = os.path.basename(filepath).lower()
-        if filename.endswith("_yes.txt"):
-            target = True
-        elif filename.endswith("_no.txt"):
-            target = False
-        else:
-            raise ValueError(
-                "File %s must end with yes.txt or no.txt" % filepath
-            )
-        target = True if "yes" in filename.lower() else False
-        sample_inputs.append((content, target))
-    return sample_inputs
-
-
-def evaluate(prompt_name: str):
-    """Evaluates the prompt submission."""
-    # Loads a free gpt4 model.
-    llm = model.G4fModel()
-
-    # Loads a prompt submission.
-    prompt_handler = registry.get(name=prompt_name)
-
-    # Generate results for the dataset.
-    dataset = load_sample_test_set()
-    correct_pred = 0
-    for idx, (job_description, target) in enumerate(tqdm.tqdm(dataset)):
-        prompt = prompt_handler.build_prompt(job_description=job_description)
-        logging.debug("[prompt %d]\n%s", idx, prompt)
-        response = llm.generate(prompt=prompt)
-        logging.debug("[response %d]\n%s", idx, response)
-        output = prompt_handler.parse_response(model_response=response)
-        logging.debug("[target %d]\n%s", idx, target)
-        logging.debug("[prediction %d]\n%s", idx, output)
-        if output == target:
-            correct_pred += 1
-
-    print("Accuracy: [%.3f] %%" % (correct_pred / len(dataset) * 100))  # noqa: T201
+
+def evaluate_on_sample_dataset(prompt_name: str):
+    """Evaluates the prompt on a sample_dataset."""
+    sample_inputs = dataset.load_sample_test_set(samples_dir="sample_inputs")
+    acc = evaluate_lib.evaluate(dataset=sample_inputs, prompt_name=prompt_name)
+    print("Accuracy: [%.3f] %%" % acc)  # noqa: T201
 
 
 def main(argv: Sequence[str]) -> None:
@@ -95,8 +51,9 @@ def main(argv: Sequence[str]) -> None:
         logging.getLogger().setLevel(logging.DEBUG)
     else:
         logging.getLogger().setLevel(logging.INFO)
-    evaluate(prompt_name=_PROMPT.value)
+    evaluate_on_sample_dataset(prompt_name=_PROMPT.value)
 
 
 if __name__ == "__main__":
+    flags.mark_flag_as_required("prompt")
     app.run(main)
diff --git a/session_2/challenge/scripts/evaluate_lib.py b/session_2/challenge/scripts/evaluate_lib.py
new file mode 100644
index 0000000..355979e
--- /dev/null
+++ b/session_2/challenge/scripts/evaluate_lib.py
@@ -0,0 +1,36 @@
+"""Library function for evaluating a prompt on a particular dataset."""
+
+import logging
+
+import tqdm
+from scripts import model, registry
+from submissions import *  # noqa: F401, F403
+from submissions import baseline  # noqa: F401
+
+
+def evaluate(dataset: list[tuple[str, bool]], prompt_name: str):
+    """Evaluates the prompt submission."""
+    # Loads a free gpt4 model.
+    llm = model.G4fModel()
+
+    # Loads a prompt submission.
+    prompt_handler = registry.get(name=prompt_name)
+
+    # Generate results for the dataset.
+    correct_pred = 0
+    for idx, (job_description, target) in enumerate(tqdm.tqdm(dataset)):
+        prompt = prompt_handler.build_prompt(job_description=job_description)
+        response = llm.generate(prompt=prompt)
+        prediction = prompt_handler.parse_response(model_response=response)
+        if prediction == target:
+            correct_pred += 1
+            result = "[PASS]"
+        else:
+            result = "[FAIL]"
+
+        logging.debug(
+            "No=%d. target=%s prediction=%s %s\n[prompt]\n%s\n[response]\n%s"
+            % (idx, target, prediction, result, prompt, response)
+        )
+    acc = correct_pred / len(dataset) * 100
+    return acc
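
Note: a sketch of how the refactor is meant to compose: scripts/evaluate.py now just loads the sample dataset and delegates to evaluate_lib.evaluate, so the same call can be pointed at any other (job_description, target) list, for example the private dataset. The two toy examples below are made up for illustration; running this still needs a registered prompt name (the diff registers "baseline") and whatever network access the g4f-backed model requires.

    from scripts import evaluate_lib

    # Made-up dataset in the same (job_description, target) shape that
    # dataset.load_sample_test_set() returns.
    toy_dataset = [
        ("Job description that should be classified as True.", True),
        ("Job description that should be classified as False.", False),
    ]

    # "baseline" is the prompt name registered by submissions/baseline.py.
    accuracy = evaluate_lib.evaluate(dataset=toy_dataset, prompt_name="baseline")
    print("Accuracy: [%.3f] %%" % accuracy)
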
diff --git a/session_2/challenge/scripts/leaderboard.py b/session_2/challenge/scripts/leaderboard.py
index 08e8631..802269f 100644
--- a/session_2/challenge/scripts/leaderboard.py
+++ b/session_2/challenge/scripts/leaderboard.py
@@ -1,76 +1,128 @@
-"""Generates leaderboard."""
+"""Updates the public leaderboard after evaluating given submission."""
 
+import logging
 import re
+from collections.abc import Sequence
 
 import pandas as pd
+from absl import app, flags
+from scripts import dataset, evaluate_lib, registry
 
-# Read the markdown table into a DataFrame
-with open("session_2/challenge/leaderboard.md", "r") as file:
-    content = file.read()
-
-start_marker = "\n"
-start_index = content.find(start_marker)
-end_index = content.find("\n")
-table_content = content[start_index:end_index]
-
-
-# Extract rows using regex
-rows = re.findall(
-    r"\|([^|]+)\|([^|]+)\|([^|]+)\|([^|]+)\|([^|]+)\|", table_content
-)[2:]
-
-# Create a DataFrame from the extracted rows
-df = pd.DataFrame(
-    rows,
-    columns=[
-        "Rank",
-        "Profile Image",
-        "GitHub Username",
-        "Solution",
-        "Accuracy %",
-    ],
+_PROMPT = flags.DEFINE_string(
+    "prompt", None, "Name of the submitted prompt to evaluate."
 )
 
-# Strip extra spaces before and after text in each cell
-df = df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)
-
-# Convert "Rank" column to integer and "Accuracy %" column to float
-df["Rank"] = df["Rank"].astype(int)
-df["Accuracy %"] = df["Accuracy %"].astype(float)
-
-# Add a new entry to the DataFrame
-new_entry = {
-    "Rank": len(df) + 1,
-    "Profile Image": '',
-    "GitHub Username": "[New User](https://github.com/new_user)",
-    "Solution": "[New Solution](https://github.com/new_solution)",
-    "Accuracy %": 99.5,
-}  # Example accuracy value
-
-df = pd.concat([df, pd.DataFrame([new_entry])], ignore_index=True)
-
-# Keep only the highest submission for each user
-highest_indices = df.groupby("GitHub Username")["Accuracy %"].idxmax()
-df_highest = df.loc[highest_indices]
-
-# Sort the DataFrame by "Accuracy %" column in descending order
-df_sorted = df_highest.sort_values(
-    by="Accuracy %", ascending=False
-).reset_index(drop=True)
-
-# Update the "Rank" column after sorting
-df_sorted["Rank"] = df_sorted.index + 1
-
-# Convert the DataFrame back to markdown format
-markdown_table = df_sorted.to_markdown(index=False)
-
-# Replace the existing table in the markdown file with the sorted table
-new_content = (
-    content[: start_index + len(start_marker)]
-    + markdown_table
-    + content[end_index:]
+_GITHUB_USER = flags.DEFINE_string(
+    "github_user", None, "Github username to add an entry in leaderboard."
 )
 
-# Write the updated content back to the markdown file
-with open("session_2/challenge/leaderboard.md", "w") as file:
-    file.write(new_content)
+
+_LEADERBOARD = "leaderboard.md"  # current leaderboard
+
+
+def generate_leaderboard(prompt_name: str, accuracy: float, github_user: str):
+    """Generates leaderboard."""
+    # Read the markdown table into a DataFrame
+    with open(_LEADERBOARD, "r") as file:
+        content = file.read()
+
+    start_marker = "\n"
+    start_index = content.find(start_marker)
+    end_index = content.find("\n")
+    table_content = content[start_index:end_index]
+
+    # Extract rows using regex
+    rows = re.findall(
+        r"\|([^|]+)\|([^|]+)\|([^|]+)\|([^|]+)\|([^|]+)\|", table_content
+    )[2:]
+
+    # Create a DataFrame from the extracted rows
+    df = pd.DataFrame(
+        rows,
+        columns=[
+            "Rank",
+            "Profile Image",
+            "GitHub Username",
+            "Solution",
+            "Accuracy %",
+        ],
+    )
+
+    # Strip extra spaces before and after text in each cell
+    df = df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)
+
+    # Convert "Rank" column to integer and "Accuracy %" column to float
+    df["Rank"] = df["Rank"].astype(int)
+    df["Accuracy %"] = df["Accuracy %"].astype(float)
+
+    # Add a new entry to the DataFrame
+    prompt_file = registry.get_filename(name=prompt_name)
+    new_entry = {
+        "Rank": len(df) + 1,
+        "Profile Image": f'',
+        "GitHub Username": f"[{github_user}](https://github.com/{github_user})",
+        "Solution": f"[{prompt_name}](https://github.com/infocusp/llm_seminar_series/blob/main/session_2/challenge/submissions/{prompt_file})",
+        "Accuracy %": accuracy,
+    }
+
+    df = pd.concat([df, pd.DataFrame([new_entry])], ignore_index=True)
+
+    # Keep only the highest submission for each user
+    highest_indices = df.groupby("GitHub Username")["Accuracy %"].idxmax()
+    df_highest = df.loc[highest_indices]
+
+    # Sort the DataFrame by "Accuracy %" column in descending order
+    df_sorted = df_highest.sort_values(
+        by="Accuracy %", ascending=False
+    ).reset_index(drop=True)
+
+    # Update the "Rank" column after sorting
+    df_sorted["Rank"] = df_sorted.index + 1
+
+    # Convert the DataFrame back to markdown format
+    markdown_table = df_sorted.to_markdown(index=False)
+
+    # Replace the existing table in the markdown file with the sorted table
+    new_content = (
+        content[: start_index + len(start_marker)]
+        + markdown_table
+        + content[end_index:]
+    )
+
+    # Write the updated content back to the markdown file
+    with open(_LEADERBOARD, "w") as file:
+        file.write(new_content)
+
+    logging.info(
+        "Submission by %s with prompt %s updated in the leaderboard.",
+        github_user,
+        prompt_name,
+    )
+
+
+def update_leaderboard(prompt_name: str, github_user: str):
+    """Generates a public leaderboard by evaluating given submission."""
+    sample_dataset = dataset.load_sample_test_set(samples_dir="sample_inputs")
+    acc = evaluate_lib.evaluate(
+        dataset=sample_dataset, prompt_name=prompt_name
+    )
+    generate_leaderboard(
+        prompt_name=prompt_name, accuracy=acc, github_user=github_user
+    )
+
+
+def main(argv: Sequence[str]) -> None:
+    """Entrypoint."""
+    if len(argv) > 1:
+        raise app.UsageError("Too many command-line arguments.")
+    logging.getLogger().setLevel(logging.INFO)
+    update_leaderboard(
+        prompt_name=_PROMPT.value, github_user=_GITHUB_USER.value
+    )
+
+
+if __name__ == "__main__":
+    flags.mark_flag_as_required("prompt")
+    flags.mark_flag_as_required("github_user")
+    app.run(main)
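
Note: a self-contained sketch (pandas only, with made-up usernames and scores) of the parsing and ranking steps inside generate_leaderboard above: the row regex plus the [2:] slice drops the header and separator rows, and groupby(...).idxmax() keeps each user's best submission before re-ranking by accuracy.

    import re

    import pandas as pd

    toy_table = """
    | Rank | Profile Image | GitHub Username | Solution | Accuracy % |
    |-----:|:--------------|:----------------|:---------|-----------:|
    | 1 |  | [alice](https://github.com/alice) | [v1](a) | 80 |
    | 2 |  | [bob](https://github.com/bob) | [v1](b) | 70 |
    | 3 |  | [alice](https://github.com/alice) | [v2](c) | 60 |
    """

    # Same row regex as leaderboard.py; [2:] skips the header and separator rows.
    rows = re.findall(
        r"\|([^|]+)\|([^|]+)\|([^|]+)\|([^|]+)\|([^|]+)\|", toy_table
    )[2:]
    df = pd.DataFrame(
        rows,
        columns=["Rank", "Profile Image", "GitHub Username", "Solution", "Accuracy %"],
    )
    df = df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)
    df["Accuracy %"] = df["Accuracy %"].astype(float)

    # Keep each user's best run, then re-rank by accuracy.
    best = df.loc[df.groupby("GitHub Username")["Accuracy %"].idxmax()]
    best = best.sort_values(by="Accuracy %", ascending=False).reset_index(drop=True)
    best["Rank"] = best.index + 1
    print(best[["Rank", "GitHub Username", "Accuracy %"]])
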
diff --git a/session_2/challenge/scripts/registry.py b/session_2/challenge/scripts/registry.py
index 3659dec..fc661d1 100644
--- a/session_2/challenge/scripts/registry.py
+++ b/session_2/challenge/scripts/registry.py
@@ -1,5 +1,6 @@
 """Registry of all the submitted prompts."""
 
+import os
 from typing import Type
 
 from scripts import base
@@ -28,3 +29,10 @@ def get(name: str) -> base.PromptSubmission:
 def get_all() -> list[Type[base.PromptSubmission]]:
     """Returns all the submissions."""
     return list(_SUBMISSIONS_REGISTRY.values())
+
+
+def get_filename(name: str) -> str:
+    """Returns the name of the file containing class with registered name."""
+    klass = get(name=name)
+    filename = klass.__class__.__module__.split(".")[-1] + ".py"
+    return filename
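
Note: a sketch of the convention registry.get_filename relies on. Per its annotation, get() returns a PromptSubmission instance, so instance.__class__.__module__ names the module that defines the registered class, and the last dotted component plus ".py" is the submission file name used in the leaderboard link. FakeSubmission below is a stand-in; in this repo the real case is the baseline submission defined in submissions/baseline.py.

    class FakeSubmission:
        """Stand-in for a registered prompt submission class."""

    # Pretend the class was defined in submissions/baseline.py, which is what
    # __module__ would report for the real baseline submission.
    FakeSubmission.__module__ = "submissions.baseline"

    instance = FakeSubmission()  # what registry.get() is assumed to hand back
    filename = instance.__class__.__module__.split(".")[-1] + ".py"
    print(filename)  # baseline.py
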