From fe2297b6e61359d82054fc6890d4286f9328cd5a Mon Sep 17 00:00:00 2001
From: Maram Hasanain <maramhasanain@gmail.com>
Date: Mon, 28 Aug 2023 15:14:50 +0300
Subject: [PATCH] Updated NewsCat_ASND_BLOOMZ_ZeroShot.py

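Much better performance with this version: the prompt now passes the
tweet through without the 1000-word truncation, and post-processing maps
the model output onto the category labels listed in the prompt by
substring matching instead of splitting on "/" and picking at random.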
---
 .../NewsCat_ASND_BLOOMZ_ZeroShot.py           | 53 +++++++++----------
 1 file changed, 26 insertions(+), 27 deletions(-)
diff --git a/assets/benchmark_v1/news_categorization/NewsCat_ASND_BLOOMZ_ZeroShot.py b/assets/benchmark_v1/news_categorization/NewsCat_ASND_BLOOMZ_ZeroShot.py
index 48f34c4c..eb2eebf9 100644
--- a/assets/benchmark_v1/news_categorization/NewsCat_ASND_BLOOMZ_ZeroShot.py
+++ b/assets/benchmark_v1/news_categorization/NewsCat_ASND_BLOOMZ_ZeroShot.py
@@ -1,12 +1,9 @@
 import os
-import random
 
 from llmebench.datasets import NewsCatASNDDataset
 from llmebench.models import BLOOMPetalModel
 from llmebench.tasks import NewsCatASNDTask
 
-random.seed(1333)
-
 
 def config():
     return {
@@ -40,21 +37,14 @@ def config():
 
 
 def prompt(input_sample):
-    arr = input_sample.split()
-
-    if len(arr) > 1000:
-        article = " ".join(arr[:1000])
-    else:
-        article = " ".join(arr)
-
     prompt_string = (
-        f"You are an expert news editor and know how to categorize news articles.\n\n"
+        f"You are an expert news editor and know how to categorize news tweets.\n\n"
         f"Categorize the following tweet into one of the following categories: "
         f"crime-war-conflict, spiritual, health, politics, human-rights-press-freedom, "
         f"education, business-and-economy, art-and-entertainment, others, "
         f"science-and-technology, sports, environment\n"
         f"Provide only label and in English.\n\n"
-        f"\ntweet: {article}"
+        f"\ntweet: {input_sample}"
         f"\ncategory: \n"
     )
 
@@ -66,21 +56,30 @@ def post_process(response):
     label = label.replace("<s>", "")
     label = label.replace("</s>", "")
 
-    label_fixed = label.lower()
-    label_fixed = label_fixed.replace("category: ", "")
-    label_fixed = label_fixed.replace("science/physics", "tech")
-    label_fixed = label_fixed.replace("health/nutrition", "medical")
-    if len(label_fixed.split("\s+")) > 1:
-        label_fixed = label_fixed.split("\s+")[0]
-    label_fixed = random.choice(label_fixed.split("/")).strip()
-    if "science/physics" in label_fixed:
-        label_fixed = label_fixed.replace("science/physics", "tech")
-    elif "science and technology" in label:
-        label_fixed = "tech"
-    elif label_fixed.startswith("culture"):
-        label_fixed = label_fixed.split("(")[0]
-
-        label_fixed = label_fixed.replace("culture.", "culture")
+    if "crime-war-conflict" in label or "war" in label:
+        label_fixed = "crime-war-conflict"
+    elif "spiritual" in label:
+        label_fixed = "spiritual"
+    elif "health" in label:
+        label_fixed = "health"
+    elif "politics" in label:
+        label_fixed = "politics"
+    elif "human-rights-press-freedom" in label:
+        label_fixed = "human-rights-press-freedom"
+    elif "education" in label:
+        label_fixed = "education"
+    elif "business-and-economy" in label:
+        label_fixed = "business-and-economy"
+    elif "art-and-entertainment" in label or "entertainment" in label:
+        label_fixed = "art-and-entertainment"
+    elif "others" in label:
+        label_fixed = "others"
+    elif "science-and-technology" in label or "science" in label:
+        label_fixed = "science-and-technology"
+    elif "sports" in label:
+        label_fixed = "sports"
+    elif "environment" in label:
+        label_fixed = "environment"
     else:
         label_fixed = None