From 2461e6a46d4219500f4c5b49ee9c8f37bcc61852 Mon Sep 17 00:00:00 2001 From: Hetul Patel Date: Sat, 20 Apr 2024 15:17:11 +0530 Subject: [PATCH] Added github action to evaluate on private dataset --- ... check_star_for_challenge_submission.yaml} | 2 +- .github/workflows/github_pages.yaml | 2 +- .github/workflows/update_leaderboard.yaml | 82 ++++++++ session_2/challenge/how_to_participate.md | 16 +- session_2/challenge/leaderboard.md | 15 +- session_2/challenge/scripts/dataset.py | 24 +++ session_2/challenge/scripts/evaluate.py | 61 +----- session_2/challenge/scripts/evaluate_lib.py | 36 ++++ session_2/challenge/scripts/leaderboard.py | 192 ++++++++++++------ session_2/challenge/scripts/model.py | 2 +- session_2/challenge/scripts/registry.py | 5 +- session_2/challenge/submissions/baseline.py | 2 +- 12 files changed, 295 insertions(+), 144 deletions(-) rename .github/workflows/{check_star_for_challange_submission.yaml => check_star_for_challenge_submission.yaml} (93%) create mode 100644 .github/workflows/update_leaderboard.yaml create mode 100644 session_2/challenge/scripts/dataset.py create mode 100644 session_2/challenge/scripts/evaluate_lib.py diff --git a/.github/workflows/check_star_for_challange_submission.yaml b/.github/workflows/check_star_for_challenge_submission.yaml similarity index 93% rename from .github/workflows/check_star_for_challange_submission.yaml rename to .github/workflows/check_star_for_challenge_submission.yaml index e515d4f..d58f8c5 100644 --- a/.github/workflows/check_star_for_challange_submission.yaml +++ b/.github/workflows/check_star_for_challenge_submission.yaml @@ -20,6 +20,6 @@ jobs: id: check-star - if: ${{ (steps.changes.outputs.src == 'true') && (steps.check-star.outputs.is-stargazer != 'true') }} - uses: actions/github-script@v6 + uses: actions/github-script@v7 with: script: core.setFailed('⭐ Please, star this repository!') \ No newline at end of file diff --git a/.github/workflows/github_pages.yaml b/.github/workflows/github_pages.yaml index e2da726..f65b651 100644 --- a/.github/workflows/github_pages.yaml +++ b/.github/workflows/github_pages.yaml @@ -1,4 +1,4 @@ -name: ci +name: Deploy to github pages on: push: branches: diff --git a/.github/workflows/update_leaderboard.yaml b/.github/workflows/update_leaderboard.yaml new file mode 100644 index 0000000..c1fde3f --- /dev/null +++ b/.github/workflows/update_leaderboard.yaml @@ -0,0 +1,82 @@ +name: Update leaderboard. 
+
+on:
+  pull_request:
+    types: [opened, reopened, synchronize]
+
+jobs:
+  leaderboard_evaluation:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Check if there are any changes in submissions dir
+        uses: dorny/paths-filter@v3.0.2
+        id: changes
+        with:
+          filters: |
+            src:
+              - 'session_2/challenge/submissions/**'
+          list-files: "shell"
+
+      - name: Print changed files
+        run: |
+          echo '${{ toJSON(steps.changes.outputs) }}'
+
+      - if: ${{ (steps.changes.outputs.src_count > 1) }}
+        uses: actions/github-script@v7
+        with:
+          script: core.setFailed('Only one submission is allowed per pull request.')
+
+      # Update the leaderboard only if a single file is changed in the submissions dir
+      - if: ${{ (steps.changes.outputs.src == 'true') && (steps.changes.outputs.src_count == 1) }}
+        name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          repository: ${{ github.event.pull_request.head.repo.full_name }}
+          ref: ${{ github.event.pull_request.head.ref }}
+
+      - if: ${{ (steps.changes.outputs.src == 'true') && (steps.changes.outputs.src_count == 1) }}
+        name: Install Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.10"
+
+      - if: ${{ (steps.changes.outputs.src == 'true') && (steps.changes.outputs.src_count == 1) }}
+        name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r session_2/challenge/requirements.txt
+
+      - if: ${{ (steps.changes.outputs.src == 'true') && (steps.changes.outputs.src_count == 1) }}
+        name: Run leaderboard update script
+        id: leaderboard-update
+        run: |
+          cd session_2/challenge
+          filename=$(basename "${{ steps.changes.outputs.src_files }}")
+          filename_without_extension="${filename%.*}" # Remove extension
+          python -m scripts.leaderboard --github_user="${{ github.actor }}" --prompt="$filename_without_extension"
+
+      - name: Commit changes
+        uses: EndBug/add-and-commit@v9
+        with:
+          author_name: GitHub Actions
+          author_email: actions@github.com
+          message: 'Updated leaderboard'
+          add: 'session_2/challenge/leaderboard.md'
+
+      # # Commit the updated leaderboard
+      # - if: ${{ (steps.changes.outputs.src == 'true') && (steps.changes.outputs.src_count == 1) }}
+      #   name: Commit updated leaderboard
+      #   id: commit-leaderboard
+      #   run: |
+      #     git config --global user.name "GitHub Actions"
+      #     git config --global user.email "actions@github.com"
+      #     git add session_2/challenge/leaderboard.md
+      #     git commit -m "Update leaderboard"
+      #     git push -f origin HEAD:${{ github.ref }}
+
+
+      # # Print the commit SHA for reference
+      # - if: ${{ (steps.changes.outputs.src == 'true') && (steps.changes.outputs.src_count == 1) }}
+      #   name: Print Commit SHA
+      #   run: |
+      #     echo "Commit SHA: ${{ steps.commit-leaderboard.outputs.commit_sha }}"
\ No newline at end of file
diff --git a/session_2/challenge/how_to_participate.md b/session_2/challenge/how_to_participate.md
index 740cbf4..a0c0c94 100644
--- a/session_2/challenge/how_to_participate.md
+++ b/session_2/challenge/how_to_participate.md
@@ -20,19 +20,18 @@
    ```
 
 3. To submit your own prompt, make a copy of `submissions/baseline.py` and
-   change the name of the prompt from `baseline` to something else which
+   change the name of the file from `baseline` to something else which
    describes your prompt. E.g,
 
    ```python
    # file: submissions/name_of_your_prompt.py
 
-   @registry.register("name_of_your_prompt")
+   @registry.register()
    class NameOfYourPrompt(base.PromptSubmission):
        ...
    ```
 
-   Also change the class name and register it with a new name (can be same as the
-   filename.)
+   Also change the class name.
 
 4. Update the `build_prompt` and `parse_response` method.
 
@@ -62,11 +61,4 @@
    your prompt.
 
 8. Congratulations 🎉, once a repo maintainer approves your submission and merges
-   your PR, your rank based on a private test set will be published on the
-   public leader board.
-
-!!! note
-    You can test your prompt on your own samples by adding new files under
-    `sample_inputs` dir. The file name must ends with `"yes.txt"` if the JD is
-    for a fresher, otherwise it should end with `"no.txt"`. Do not commit
-    these files.
\ No newline at end of file
+   your PR, your rank will be published on the public leaderboard.
diff --git a/session_2/challenge/leaderboard.md b/session_2/challenge/leaderboard.md
index e9937bd..b17a344 100644
--- a/session_2/challenge/leaderboard.md
+++ b/session_2/challenge/leaderboard.md
@@ -12,13 +12,14 @@ Check [participation guide](how_to_participate.md).
-| Rank | Profile Image | GitHub Username | Solution | Accuracy % | -|-------:|:------------------------------------------------------------------------------------------------|:-------------------------------------------|:-----------------------------------------------------------------------------------------------------------------------------------------|-------------:| -| 1 | | [New User](https://github.com/new_user) | [New Solution](https://github.com/new_solution) | 99.5 | -| 2 | | [Username 2](https://github.com/username2) | [Baseline](https://github.com/infocusp/llm_seminar_series/blob/hetul/prompting-leader-board/session_2/challenge/submissions/baseline.py) | 95 | -| 3 | | [Username 4](https://github.com/username4) | [Baseline](https://github.com/infocusp/llm_seminar_series/blob/hetul/prompting-leader-board/session_2/challenge/submissions/baseline.py) | 95 | -| 4 | | [Username 3](https://github.com/username3) | [Baseline](https://github.com/infocusp/llm_seminar_series/blob/hetul/prompting-leader-board/session_2/challenge/submissions/baseline.py) | 10 | -| 5 | | [Username 1](https://github.com/username1) | [Baseline](https://github.com/infocusp/llm_seminar_series/blob/hetul/prompting-leader-board/session_2/challenge/submissions/baseline.py) | 0 | +| Rank | Profile Image | GitHub Username | Solution | Accuracy % | +|-------:|:------------------------------------------------------------------------------------------------|:----------------------------------------------|:-----------------------------------------------------------------------------------------------------------------------------------------|-------------:| +| 1 | | [New User](https://github.com/new_user) | [New Solution](https://github.com/new_solution) | 99.5 | +| 2 | | [Username 2](https://github.com/username2) | [Baseline](https://github.com/infocusp/llm_seminar_series/blob/hetul/prompting-leader-board/session_2/challenge/submissions/baseline.py) | 95 | +| 3 | | [Username 4](https://github.com/username4) | [Baseline](https://github.com/infocusp/llm_seminar_series/blob/hetul/prompting-leader-board/session_2/challenge/submissions/baseline.py) | 95 | +| 4 | | [hetul-patel](https://github.com/hetul-patel) | [baseline](https://github.com/infocusp/llm_seminar_series/blob/main/session_2/challenge/submissions/baseline.py) | 50 | +| 6 | | [Username 3](https://github.com/username3) | [Baseline](https://github.com/infocusp/llm_seminar_series/blob/hetul/prompting-leader-board/session_2/challenge/submissions/baseline.py) | 10 | +| 7 | | [Username 1](https://github.com/username1) | [Baseline](https://github.com/infocusp/llm_seminar_series/blob/hetul/prompting-leader-board/session_2/challenge/submissions/baseline.py) | 0 |
\ No newline at end of file
diff --git a/session_2/challenge/scripts/dataset.py b/session_2/challenge/scripts/dataset.py
new file mode 100644
index 0000000..74bd476
--- /dev/null
+++ b/session_2/challenge/scripts/dataset.py
@@ -0,0 +1,24 @@
+"""Utilities to load evaluation datasets."""
+
+import glob
+import os
+
+
+def load_sample_test_set(samples_dir: str) -> list[tuple[str, bool]]:
+    """Loads sample job descriptions and answers for local testing."""
+    sample_files = glob.glob(os.path.join(samples_dir, "*.txt"))
+    sample_inputs = []
+    for filepath in sample_files:
+        with open(filepath, "r") as sample_file:
+            content = sample_file.read()
+        filename = os.path.basename(filepath).lower()
+        if filename.endswith("_yes.txt"):
+            target = True
+        elif filename.endswith("_no.txt"):
+            target = False
+        else:
+            raise ValueError(
+                "File %s must end with _yes.txt or _no.txt" % filepath
+            )
+        sample_inputs.append((content, target))
+    return sample_inputs
diff --git a/session_2/challenge/scripts/evaluate.py b/session_2/challenge/scripts/evaluate.py
index c2fe3a9..1ca6baa 100644
--- a/session_2/challenge/scripts/evaluate.py
+++ b/session_2/challenge/scripts/evaluate.py
@@ -21,15 +21,11 @@ def build_prompt(self, job_description: str) -> str:
 python3 -m scripts.evaluate --prompt=baseline
 """
 
-import glob
 import logging
-import os
 from collections.abc import Sequence
 
-import tqdm
 from absl import app, flags
-from scripts import model, registry
-from submissions import baseline  # noqa: F401
+from scripts import dataset, evaluate_lib
 
 _PROMPT = flags.DEFINE_string(
     "prompt", None, "Name of the prompt to evaluate."
@@ -39,52 +35,12 @@ def build_prompt(self, job_description: str) -> str:
     "debug", True, "Prints prompt and response if true."
 )
 
-_SAMPLES_DIR = "sample_inputs"
-
-
-def load_sample_test_set() -> list[tuple[str, bool]]:
-    """Loads sample job descriptions and answers for local testing."""
-    sample_files = glob.glob(os.path.join(_SAMPLES_DIR, "*.txt"))
-    sample_inputs = []
-    for filepath in sample_files:
-        content = open(filepath, "r").read()
-        filename = os.path.basename(filepath).lower()
-        if filename.endswith("_yes.txt"):
-            target = True
-        elif filename.endswith("_no.txt"):
-            target = False
-        else:
-            raise ValueError(
-                "File %s must end with yes.txt or no.txt" % filepath
-            )
-        target = True if "yes" in filename.lower() else False
-        sample_inputs.append((content, target))
-    return sample_inputs
-
-
-def evaluate(prompt_name: str):
-    """Evaluates the prompt submission."""
-    # Loads a free gpt4 model.
-    llm = model.G4fModel()
-
-    # Loads a prompt submission.
-    prompt_handler = registry.get(name=prompt_name)
-
-    # Generate results for the dataset.
- dataset = load_sample_test_set() - correct_pred = 0 - for idx, (job_description, target) in enumerate(tqdm.tqdm(dataset)): - prompt = prompt_handler.build_prompt(job_description=job_description) - logging.debug("[prompt %d]\n%s", idx, prompt) - response = llm.generate(prompt=prompt) - logging.debug("[response %d]\n%s", idx, response) - output = prompt_handler.parse_response(model_response=response) - logging.debug("[target %d]\n%s", idx, target) - logging.debug("[prediction %d]\n%s", idx, output) - if output == target: - correct_pred += 1 - - print("Accuracy: [%.3f] %%" % (correct_pred / len(dataset) * 100)) # noqa: T201 + +def evaluate_on_sample_dataset(prompt_name: str): + """Evaluates the prompt on a sample_dataset.""" + sample_inputs = dataset.load_sample_test_set(samples_dir="sample_inputs") + acc = evaluate_lib.evaluate(dataset=sample_inputs, prompt_name=prompt_name) + print("Accuracy: [%.3f] %%" % acc) # noqa: T201 def main(argv: Sequence[str]) -> None: @@ -95,8 +51,9 @@ def main(argv: Sequence[str]) -> None: logging.getLogger().setLevel(logging.DEBUG) else: logging.getLogger().setLevel(logging.INFO) - evaluate(prompt_name=_PROMPT.value) + evaluate_on_sample_dataset(prompt_name=_PROMPT.value) if __name__ == "__main__": + flags.mark_flag_as_required("prompt") app.run(main) diff --git a/session_2/challenge/scripts/evaluate_lib.py b/session_2/challenge/scripts/evaluate_lib.py new file mode 100644 index 0000000..355979e --- /dev/null +++ b/session_2/challenge/scripts/evaluate_lib.py @@ -0,0 +1,36 @@ +"""Library function for evaluating a prompt on a particular dataset.""" + +import logging + +import tqdm +from scripts import model, registry +from submissions import * # noqa: F401, F403 +from submissions import baseline # noqa: F401 + + +def evaluate(dataset: list[tuple[str, bool]], prompt_name: str): + """Evaluates the prompt submission.""" + # Loads a free gpt4 model. + llm = model.G4fModel() + + # Loads a prompt submission. + prompt_handler = registry.get(name=prompt_name) + + # Generate results for the dataset. + correct_pred = 0 + for idx, (job_description, target) in enumerate(tqdm.tqdm(dataset)): + prompt = prompt_handler.build_prompt(job_description=job_description) + response = llm.generate(prompt=prompt) + prediction = prompt_handler.parse_response(model_response=response) + if prediction == target: + correct_pred += 1 + result = "[PASS]" + else: + result = "[FAIL]" + + logging.debug( + "No=%d. target=%s prediction=%s %s\n[prompt]\n%s\n[response]\n%s" + % (idx, target, prediction, result, prompt, response) + ) + acc = correct_pred / len(dataset) * 100 + return acc diff --git a/session_2/challenge/scripts/leaderboard.py b/session_2/challenge/scripts/leaderboard.py index 08e8631..c6b25e1 100644 --- a/session_2/challenge/scripts/leaderboard.py +++ b/session_2/challenge/scripts/leaderboard.py @@ -1,76 +1,134 @@ -"""Generates leaderboard.""" +"""Updates the public leaderboard after evaluating given submission. 
+Sample command:
+python -m scripts.leaderboard \
+    --github_user=your_github_user \
+    --prompt=baseline
+"""
+
+import logging
 import re
+from collections.abc import Sequence
 
 import pandas as pd
+from absl import app, flags
+from scripts import dataset, evaluate_lib
 
-# Read the markdown table into a DataFrame
-with open("session_2/challenge/leaderboard.md", "r") as file:
-    content = file.read()
-
-start_marker = "\n"
-start_index = content.find(start_marker)
-end_index = content.find("\n")
-table_content = content[start_index:end_index]
-
-
-# Extract rows using regex
-rows = re.findall(
-    r"\|([^|]+)\|([^|]+)\|([^|]+)\|([^|]+)\|([^|]+)\|", table_content
-)[2:]
-
-# Create a DataFrame from the extracted rows
-df = pd.DataFrame(
-    rows,
-    columns=[
-        "Rank",
-        "Profile Image",
-        "GitHub Username",
-        "Solution",
-        "Accuracy %",
-    ],
+_PROMPT = flags.DEFINE_string(
+    "prompt", None, "Name of the submitted prompt to evaluate."
 )
 
-# Strip extra spaces before and after text in each cell
-df = df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)
-
-# Convert "Rank" column to integer and "Accuracy %" column to float
-df["Rank"] = df["Rank"].astype(int)
-df["Accuracy %"] = df["Accuracy %"].astype(float)
-
-# Add a new entry to the DataFrame
-new_entry = {
-    "Rank": len(df) + 1,
-    "Profile Image": '',
-    "GitHub Username": "[New User](https://github.com/new_user)",
-    "Solution": "[New Solution](https://github.com/new_solution)",
-    "Accuracy %": 99.5,
-}  # Example accuracy value
-
-df = pd.concat([df, pd.DataFrame([new_entry])], ignore_index=True)
-
-# Keep only the highest submission for each user
-highest_indices = df.groupby("GitHub Username")["Accuracy %"].idxmax()
-df_highest = df.loc[highest_indices]
-
-# Sort the DataFrame by "Accuracy %" column in descending order
-df_sorted = df_highest.sort_values(
-    by="Accuracy %", ascending=False
-).reset_index(drop=True)
-
-# Update the "Rank" column after sorting
-df_sorted["Rank"] = df_sorted.index + 1
-
-# Convert the DataFrame back to markdown format
-markdown_table = df_sorted.to_markdown(index=False)
-
-# Replace the existing table in the markdown file with the sorted table
-new_content = (
-    content[: start_index + len(start_marker)]
-    + markdown_table
-    + content[end_index:]
+_GITHUB_USER = flags.DEFINE_string(
+    "github_user", None, "GitHub username to add an entry in the leaderboard."
) -# Write the updated content back to the markdown file -with open("session_2/challenge/leaderboard.md", "w") as file: - file.write(new_content) + +_LEADERBORAD = "leaderboard.md" # current leaderboard + + +def generate_leaderboard(prompt_name: str, accuracy: float, github_user: str): + """Generates leaderboard.""" + # Read the markdown table into a DataFrame + with open(_LEADERBORAD, "r") as file: + content = file.read() + + start_marker = "\n" + start_index = content.find(start_marker) + end_index = content.find("\n") + table_content = content[start_index:end_index] + + # Extract rows using regex + rows = re.findall( + r"\|([^|]+)\|([^|]+)\|([^|]+)\|([^|]+)\|([^|]+)\|", table_content + )[2:] + + # Create a DataFrame from the extracted rows + df = pd.DataFrame( + rows, + columns=[ + "Rank", + "Profile Image", + "GitHub Username", + "Solution", + "Accuracy %", + ], + ) + + # Strip extra spaces before and after text in each cell + df = df.apply(lambda x: x.str.strip() if x.dtype == "object" else x) + + # Convert "Rank" column to integer and "Accuracy %" column to float + df["Rank"] = df["Rank"].astype(int) + df["Accuracy %"] = df["Accuracy %"].astype(float) + + # Add a new entry to the DataFrame + repo_url = "https://github.com/infocusp/llm_seminar_series/blob/main/session_2/challenge/submissions" + new_entry = { + "Rank": len(df) + 1, + "Profile Image": f'', + "GitHub Username": f"[{github_user}](https://github.com/{github_user})", + "Solution": f"[{prompt_name}]({repo_url}/{prompt_name}.py)", + "Accuracy %": accuracy, + } + + df = pd.concat([df, pd.DataFrame([new_entry])], ignore_index=True) + + # Keep only the highest submission for each user + highest_indices = df.groupby("GitHub Username")["Accuracy %"].idxmax() + df_highest = df.loc[highest_indices] + + # Sort the DataFrame by "Accuracy %" column in descending order + df_sorted = df_highest.sort_values( + by="Accuracy %", ascending=False + ).reset_index(drop=True) + + # Update the "Rank" column after sorting + df_sorted["Rank"] = df_sorted.index + 1 + + # Convert the DataFrame back to markdown format + markdown_table = df_sorted.to_markdown(index=False) + + # Replace the existing table in the markdown file with the sorted table + new_content = ( + content[: start_index + len(start_marker)] + + markdown_table + + content[end_index:] + ) + + # Write the updated content back to the markdown file + with open(_LEADERBORAD, "w") as file: + file.write(new_content) + + logging.info( + "Submission by %s with prompt %s updated in the leaderboard.", + github_user, + prompt_name, + ) + + +def update_leaderboard(prompt_name: str, github_user: str): + """Generates a public leaderboard by evaluating given submission.""" + sample_dataset = dataset.load_sample_test_set(samples_dir="sample_inputs") + acc = evaluate_lib.evaluate( + dataset=sample_dataset, prompt_name=prompt_name + ) + generate_leaderboard( + prompt_name=prompt_name, accuracy=acc, github_user=github_user + ) + + +def main(argv: Sequence[str]) -> None: + """Entrypoint.""" + if len(argv) > 1: + raise app.UsageError("Too many command-line arguments.") + logging.getLogger().setLevel(logging.INFO) + update_leaderboard( + prompt_name=_PROMPT.value, github_user=_GITHUB_USER.value + ) + + +if __name__ == "__main__": + flags.mark_flag_as_required("prompt") + flags.mark_flag_as_required("github_user") + app.run(main) diff --git a/session_2/challenge/scripts/model.py b/session_2/challenge/scripts/model.py index ba6c450..67a554e 100644 --- a/session_2/challenge/scripts/model.py +++ 
b/session_2/challenge/scripts/model.py @@ -1,6 +1,6 @@ """Model inference.""" -import g4f +import g4f # noqa: F401 class Model: diff --git a/session_2/challenge/scripts/registry.py b/session_2/challenge/scripts/registry.py index 3659dec..02d1c1c 100644 --- a/session_2/challenge/scripts/registry.py +++ b/session_2/challenge/scripts/registry.py @@ -7,10 +7,11 @@ _SUBMISSIONS_REGISTRY: dict[str, Type[base.PromptSubmission]] = {} -def register(name: str): - """Returns a decorator that registers a submission with the given name.""" +def register(): + """Returns a decorator that registers a submission with its file as key.""" def _register(klass: Type[base.PromptSubmission]): + name = klass.__module__.split(".")[-1] _SUBMISSIONS_REGISTRY[name] = klass return klass diff --git a/session_2/challenge/submissions/baseline.py b/session_2/challenge/submissions/baseline.py index 1f76d67..c8f6b3e 100644 --- a/session_2/challenge/submissions/baseline.py +++ b/session_2/challenge/submissions/baseline.py @@ -3,7 +3,7 @@ from scripts import base, registry -@registry.register("baseline") +@registry.register() class Baseline(base.PromptSubmission): """Baseline submission."""
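
With the registry change above, a submission is keyed by its module (file) name instead of an explicit string passed to `registry.register`. As a rough illustration of the scheme, a new entry under `session_2/challenge/submissions/` might look like the sketch below; the file name `my_prompt.py`, the class `MyPrompt`, and the prompt/parsing logic are hypothetical examples, not part of this patch.

```python
# file: submissions/my_prompt.py  (hypothetical example, not part of this patch)
"""Example submission registered under its module name."""

from scripts import base, registry


@registry.register()  # registered as "my_prompt", i.e. the module/file name
class MyPrompt(base.PromptSubmission):
    """Asks for a strict YES/NO verdict and parses it back into a bool."""

    def build_prompt(self, job_description: str) -> str:
        # Keep the instruction explicit so parse_response stays trivial.
        return (
            "Is the following job description suitable for a fresher? "
            "Answer only YES or NO.\n\n" + job_description
        )

    def parse_response(self, model_response: str) -> bool:
        # True when the model answers YES, False otherwise.
        return "yes" in model_response.lower()
```

Such a file could then be checked locally with `python3 -m scripts.evaluate --prompt=my_prompt`, which follows the same code path the `update_leaderboard` workflow uses via `scripts.leaderboard`.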