From fe2297b6e61359d82054fc6890d4286f9328cd5a Mon Sep 17 00:00:00 2001 From: Maram Hasanain Date: Mon, 28 Aug 2023 15:14:50 +0300 Subject: [PATCH] Updated NewsCat_ASND_BLOOMZ_ZeroShot.py Much better performance with this version --- .../NewsCat_ASND_BLOOMZ_ZeroShot.py | 53 +++++++++---------- 1 file changed, 26 insertions(+), 27 deletions(-) diff --git a/assets/benchmark_v1/news_categorization/NewsCat_ASND_BLOOMZ_ZeroShot.py b/assets/benchmark_v1/news_categorization/NewsCat_ASND_BLOOMZ_ZeroShot.py index 48f34c4c..eb2eebf9 100644 --- a/assets/benchmark_v1/news_categorization/NewsCat_ASND_BLOOMZ_ZeroShot.py +++ b/assets/benchmark_v1/news_categorization/NewsCat_ASND_BLOOMZ_ZeroShot.py @@ -1,12 +1,9 @@ import os -import random from llmebench.datasets import NewsCatASNDDataset from llmebench.models import BLOOMPetalModel from llmebench.tasks import NewsCatASNDTask -random.seed(1333) - def config(): return { @@ -40,21 +37,14 @@ def config(): def prompt(input_sample): - arr = input_sample.split() - - if len(arr) > 1000: - article = " ".join(arr[:1000]) - else: - article = " ".join(arr) - prompt_string = ( - f"You are an expert news editor and know how to categorize news articles.\n\n" + f"You are an expert news editor and know how to categorize news tweets.\n\n" f"Categorize the following tweet into one of the following categories: " f"crime-war-conflict, spiritual, health, politics, human-rights-press-freedom, " f"education, business-and-economy, art-and-entertainment, others, " f"science-and-technology, sports, environment\n" f"Provide only label and in English.\n\n" - f"\ntweet: {article}" + f"\ntweet: {input_sample}" f"\ncategory: \n" ) @@ -66,21 +56,30 @@ def post_process(response): label = label.replace("", "") label = label.replace("", "") - label_fixed = label.lower() - label_fixed = label_fixed.replace("category: ", "") - label_fixed = label_fixed.replace("science/physics", "tech") - label_fixed = label_fixed.replace("health/nutrition", "medical") - if len(label_fixed.split("\s+")) > 1: - label_fixed = label_fixed.split("\s+")[0] - label_fixed = random.choice(label_fixed.split("/")).strip() - if "science/physics" in label_fixed: - label_fixed = label_fixed.replace("science/physics", "tech") - elif "science and technology" in label: - label_fixed = "tech" - elif label_fixed.startswith("culture"): - label_fixed = label_fixed.split("(")[0] - - label_fixed = label_fixed.replace("culture.", "culture") + if "crime-war-conflict" in label or "war" in label: + label_fixed = "crime-war-conflict" + elif "spiritual" in label: + label_fixed = "spiritual" + elif "health" in label: + label_fixed = "health" + elif "politics" in label: + label_fixed = "politics" + elif "human-rights-press-freedom" in label: + label_fixed = "human-rights-press-freedom" + elif "education" in label: + label_fixed = "education" + elif "business-and-economy" in label: + label_fixed = "business-and-economy" + elif "art-and-entertainment" in label or "entertainment" in label: + label_fixed = "art-and-entertainment" + elif "others" in label: + label_fixed = "others" + elif "science-and-technology" in label or "science" in label: + label_fixed = "science-and-technology" + elif "sports" in label: + label_fixed = "sports" + elif "environment" in label: + label_fixed = "environment" else: label_fixed = None