def post_process(response):
    """Translate the model's Arabic category answer into an English label.

    The text at ``response["choices"][0]["message"]["content"]`` is scanned
    for known Arabic category keywords. The keyword groups are tried in a
    fixed order and the first group with any keyword present in the text
    decides the result.

    Args:
        response: Chat-completion style mapping holding the model answer.

    Returns:
        The English label of the first matching keyword group, or
        ``"others"`` when no keyword occurs in the answer.
    """
    answer = response["choices"][0]["message"]["content"]

    # Ordered (keywords, label) pairs: order matters because matching is by
    # substring and the first hit wins.
    keyword_table = (
        (("جريمة-حرب-صراع", "صراع-حرب"), "crime-war-conflict"),
        (("روحي", "ديني"), "spiritual"),
        (("صحة",), "health"),
        (("سياسة",), "politics"),
        (("حقوق-الإنسان-حرية-الصحافة",), "human-rights-press-freedom"),
        (("تعليم",), "education"),
        (("أعمال-و-اقتصاد", "أعمال", "اقتصاد"), "business-and-economy"),
        (("فن-و-ترفيه", "ترفيه"), "art-and-entertainment"),
        (("أخرى",), "others"),
        (("علم-و-تكنولوجيا", "علوم", "تكنولوجيا"), "science-and-technology"),
        (("رياضة",), "sports"),
        (("بيئة",), "environment"),
    )

    for keywords, english_label in keyword_table:
        if any(keyword in answer for keyword in keywords):
            return english_label

    # Nothing recognised: fall back to the catch-all class.
    return "others"
def post_process(response):
    """Normalise the model's free-text answer into a category string.

    Steps, applied to ``response["choices"][0]["message"]["content"]``:

    1. Lower-case the text, drop a ``"category: "`` prefix and map the
       aliases ``science/physics`` -> ``tech`` and ``health/nutrition`` ->
       ``medical``.
    2. If the raw answer contains the Arabic word for politics, use
       ``"politics"``.
    3. Keep only the first whitespace-delimited word.
    4. If that word offers slash-separated alternatives, pick one with
       ``random.choice`` (so the result depends on the ``random`` state).
    5. Clean up ``culture`` variants such as ``culture(...)`` or
       ``culture.``.

    Args:
        response: Chat-completion style mapping holding the model answer.

    Returns:
        The normalised label string; an empty string for an empty answer.
        The value is not validated against the class labels.
    """
    label = response["choices"][0]["message"]["content"]

    label_fixed = label.lower()
    label_fixed = label_fixed.replace("category: ", "")
    label_fixed = label_fixed.replace("science/physics", "tech")
    label_fixed = label_fixed.replace("health/nutrition", "medical")

    # "السياسة" contains "سياسة", so a single substring test covers both.
    if "سياسة" in label:
        label_fixed = "politics"

    # Keep the first word only. str.split() takes a literal separator, not a
    # regex, so the previous split on "\s+" never split anything; calling
    # split() without arguments splits on any run of whitespace.
    words = label_fixed.split()
    label_fixed = words[0] if words else ""

    # Answers such as "politics/finance": choose one alternative.
    label_fixed = random.choice(label_fixed.split("/")).strip()

    if label_fixed.startswith("culture"):
        # Drop a parenthesised explanation and any whitespace left before it.
        label_fixed = label_fixed.split("(")[0].strip()

    label_fixed = label_fixed.replace("culture.", "culture")

    return label_fixed
"sports", + "tech", + "finance", + "culture", + ], + "max_tries": 30, + }, + } + + +def prompt(input_sample): + base_prompt = ( + f'Categorize the news "article" into one of the following categories: politics, religion, medical, sports, tech, finance, culture\n\n' + f"article: {input_sample}\n" + f"category: \n" + ) + return [ + { + "role": "user", + "content": base_prompt, + }, + ] + + +def post_process(response): + label = response["choices"][0]["message"]["content"] + + label_fixed = label.lower() + label_fixed = label_fixed.replace("category: ", "") + label_fixed = label_fixed.replace("science/physics", "tech") + label_fixed = label_fixed.replace("health/nutrition", "medical") + if "سياسة" in label or "السياسة" in label: + label_fixed = "politics" + if len(label_fixed.split("\s+")) > 1: + label_fixed = label_fixed.split("\s+")[0] + label_fixed = random.choice(label_fixed.split("/")).strip() + if "science/physics" in label_fixed: + label_fixed = label_fixed.replace("science/physics", "tech") + if label_fixed.startswith("culture"): + label_fixed = label_fixed.split("(")[0] + + label_fixed = label_fixed.replace("culture.", "culture") + + return label_fixed diff --git a/assets/ar/news_categorization/SANADAlKhaleej_JAIS13b_ZeroShot.py b/assets/ar/news_categorization/SANADAlKhaleej_JAIS13b_ZeroShot.py new file mode 100644 index 00000000..d8beaddc --- /dev/null +++ b/assets/ar/news_categorization/SANADAlKhaleej_JAIS13b_ZeroShot.py @@ -0,0 +1,78 @@ +import random + +from llmebench.datasets import SANADAlKhaleejDataset +from llmebench.models import FastChatModel +from llmebench.tasks import NewsCategorizationTask + + +random.seed(1333) + + +def metadata(): + return { + "author": "Arabic Language Technologies, QCRI, HBKU", + "model": "JAIS-13b", + "description": "Locally hosted JAIS-13b-chat model using FastChat.", + "scores": {"Macro-F1": ""}, + } + + +def config(): + return { + "dataset": SANADAlKhaleejDataset, + "task": NewsCategorizationTask, + "model": 
def post_process(response):
    """Normalise the model's free-text answer into one of the class labels.

    The answer at ``response["choices"][0]["message"]["content"]`` is
    lower-cased, a ``"category: "`` prefix is dropped and the aliases
    ``science/physics`` -> ``tech`` and ``health/nutrition`` -> ``medical``
    are applied. An answer containing the Arabic word for politics becomes
    ``"politics"``. The cleaned text is then resolved as follows:

    * an exact class label (from ``config()["model_args"]["class_labels"]``)
      is returned as is;
    * text starting with ``culture`` is trimmed of a parenthesised
      explanation and a trailing period;
    * slash-separated alternatives are resolved with ``random.choice``,
      preferring alternatives that are valid class labels;
    * anything else yields ``None``.

    Args:
        response: Chat-completion style mapping holding the model answer.

    Returns:
        The resolved label string, or ``None`` when the answer could not be
        interpreted.
    """
    label = response["choices"][0]["message"]["content"]
    label_list = config()["model_args"]["class_labels"]

    label_fixed = label.lower()
    label_fixed = label_fixed.replace("category: ", "")
    label_fixed = label_fixed.replace("science/physics", "tech")
    label_fixed = label_fixed.replace("health/nutrition", "medical")

    # "السياسة" contains "سياسة", so a single substring test covers both.
    if "سياسة" in label:
        label_fixed = "politics"

    # Strip once up front so every branch below works on the same text.
    label_fixed = label_fixed.strip()

    if label_fixed in label_list:
        return label_fixed

    if label_fixed.startswith("culture"):
        # "culture (arts)" / "culture." -> "culture"; strip the blank that a
        # parenthesised explanation leaves behind.
        label_fixed = label_fixed.split("(")[0]
        return label_fixed.replace("culture.", "culture").strip()

    # Test the normalised text: slashes that belonged to an alias have
    # already been replaced and must not trigger a random pick.
    if "/" in label_fixed:
        candidates = [part.strip() for part in label_fixed.split("/")]
        valid = [candidate for candidate in candidates if candidate in label_list]
        # Prefer alternatives that are real labels; otherwise keep the
        # previous behaviour of choosing among all alternatives.
        return random.choice(valid or candidates)

    return None