Add Checkworthiness scripts for multiple models and shot configurations #365

Open · wants to merge 1 commit into main
@@ -0,0 +1,92 @@
from llmebench.datasets import CT22CheckworthinessDataset
from llmebench.models import OpenAIModel
from llmebench.tasks import CheckworthinessTask


def metadata():
    return {
        "author": "Arabic Language Technologies, QCRI, HBKU",
        "model": "gpt-4-32k (version 0314)",
        "description": "GPT-4 32k model hosted on Azure, using the ChatCompletion API, API version '2023-03-15-preview'. Three examples were selected per test sample via MaxMarginalRelevance for few-shot learning.",
        "scores": {"F1 (POS)": "0.554"},
    }


def config():
    return {
        "dataset": CT22CheckworthinessDataset,
        "task": CheckworthinessTask,
        "model": OpenAIModel,
        "model_args": {
            "class_labels": ["0", "1"],
            "max_tries": 30,
        },
        "general_args": {"test_split": "ar", "fewshot": {"train_split": "ar"}},
    }


def few_shot_prompt(input_sample, base_prompt, examples):
    out_prompt = base_prompt + "\n"
    # "اليك بعض الامثلة" = "Here are some examples:"
    out_prompt = out_prompt + "اليك بعض الامثلة:\n\n"
    for index, example in enumerate(examples):
        # "لا" = "no", "نعم" = "yes"
        label = "لا" if example["label"] == "0" else "نعم"

        # "مثال" = "Example", "التغريدة" = "tweet", "التصنيف" = "label"
        out_prompt = (
            out_prompt
            + "مثال "
            + str(index)
            + ":"
            + "\n"
            + "التغريدة: "
            + example["input"]
            + "\nالتصنيف: "
            + label
            + "\n\n"
        )

    # Append the sentence we want the model to predict for, but leave the label blank
    out_prompt = out_prompt + "التغريدة: " + input_sample + "\nالتصنيف: \n"

    return out_prompt


def prompt(input_sample, examples):
    # "Does this 'tweet' contain a claim worth verifying? Answer with yes or no.
    # Provide only the label, without explanation."
    base_prompt = 'هل تحتوي هذه "التغريدة" على ادعاء يستحق التحقق منه؟ أجب بـ نعم أو لا. قدم التصنيف فقط دون شرح.'
    return [
        {
            "role": "system",
            # "You are an expert in analyzing and classifying tweets."
            "content": "أنت خبير في تحليل وتصنيف التغريدات.",
        },
        {
            "role": "user",
            "content": few_shot_prompt(input_sample, base_prompt, examples),
        },
    ]


def post_process(response):
    label = response["choices"][0]["message"]["content"]

    label = label.replace("label:", "").strip().lower()

    # Negative phrases: "لا يستحق التحقق" and variants = "not worth verifying",
    # "لا" = "no". These checks run first so replies that also contain a
    # positive substring are still routed to the negative class.
    if (
        "لا_يستحق_التحقق" in label
        or "لا يستحق التحقق" in label
        or "ليس يستحق التحقق" in label
        or "لا تستحق التحقق" in label
        or "no" in label
        or "لا" in label
        or "not" in label
    ):
        return "0"
    # Positive phrases: "نعم" = "yes", "يستحق التحقق" = "worth verifying"
    elif (
        "yes" in label
        or "نعم" in label
        or "يستحق التحقق" in label
        or "checkworthy" in label
    ):
        return "1"
    else:
        return None
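
A quick sanity check of post_process (not part of the PR; the dicts below merely mimic the shape of a ChatCompletion response):

sample = {"choices": [{"message": {"content": "نعم"}}]}
assert post_process(sample) == "1"  # "نعم" ("yes") maps to the positive class

sample = {"choices": [{"message": {"content": "لا، لا تستحق التحقق"}}]}
assert post_process(sample) == "0"  # any negative phrase maps to "0"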
@@ -0,0 +1,92 @@
from llmebench.datasets import CT22CheckworthinessDataset
from llmebench.models import OpenAIModel
from llmebench.tasks import CheckworthinessTask


def metadata():
    return {
        "author": "Arabic Language Technologies, QCRI, HBKU",
        "model": "gpt-4-32k (version 0314)",
        "description": "GPT-4 32k model hosted on Azure, using the ChatCompletion API, API version '2023-03-15-preview'. Three examples were selected per test sample via MaxMarginalRelevance for few-shot learning.",
        "scores": {"F1 (POS)": "0.554"},
    }


def config():
    return {
        "dataset": CT22CheckworthinessDataset,
        "task": CheckworthinessTask,
        "model": OpenAIModel,
        "model_args": {
            "class_labels": ["0", "1"],
            "max_tries": 30,
        },
        "general_args": {"test_split": "ar", "fewshot": {"train_split": "ar"}},
    }


def few_shot_prompt(input_sample, base_prompt, examples):
    out_prompt = base_prompt + "\n"
    out_prompt = out_prompt + "Here are some examples:\n\n"
    for index, example in enumerate(examples):
        label = "no" if example["label"] == "0" else "yes"

        out_prompt = (
            out_prompt
            + "Example "
            + str(index)
            + ":"
            + "\n"
            + "tweet: "
            + example["input"]
            + "\nlabel: "
            + label
            + "\n\n"
        )

    # Append the sentence we want the model to predict for, but leave the label blank
    out_prompt = out_prompt + "tweet: " + input_sample + "\nlabel: \n"

    return out_prompt


def prompt(input_sample, examples):
    base_prompt = 'Annotate the "tweet" into "one" of the following categories: checkworthy or not_checkworthy. Provide only label.'
    return [
        {
            "role": "system",
            "content": "You can analyze and classify tweets.",
        },
        {
            "role": "user",
            "content": few_shot_prompt(input_sample, base_prompt, examples),
        },
    ]


def post_process(response):
    label = response["choices"][0]["message"]["content"]

    label = label.replace("label:", "").strip().lower()

    if (
        "لا_يستحق_التحقق" in label
        or "لا يستحق التحقق" in label
        or "ليس يستحق التحقق" in label
        or "لا تستحق التحقق" in label
        or "no" in label
        or "لا" in label
        or "not" in label
    ):
        return "0"
    elif (
        "yes" in label
        or "نعم" in label
        or "يستحق التحقق" in label
        or "checkworthy" in label
    ):
        return "1"
    else:
        return None
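
To make the few-shot assembly concrete, here is what few_shot_prompt produces; the tweets and the "<base prompt>" placeholder are invented for illustration, not drawn from CT22:

toy_examples = [
    {"input": "what a lovely morning", "label": "0"},
    {"input": "the city cut its school budget by 20% this year", "label": "1"},
]
text = few_shot_prompt("our mayor doubled transit funding", "<base prompt>", toy_examples)
# text now reads:
# <base prompt>
# Here are some examples:
#
# Example 0:
# tweet: what a lovely morning
# label: no
#
# Example 1:
# tweet: the city cut its school budget by 20% this year
# label: yes
#
# tweet: our mayor doubled transit funding
# label: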
@@ -0,0 +1,91 @@
from llmebench.datasets import CT22CheckworthinessDataset
from llmebench.models import OpenAIModel
from llmebench.tasks import CheckworthinessTask


def metadata():
    return {
        "author": "Arabic Language Technologies, QCRI, HBKU",
        "model": "gpt-4-32k (version 0314)",
        "description": "GPT-4 32k model hosted on Azure, using the ChatCompletion API, API version '2023-03-15-preview'. Three examples were selected per test sample via MaxMarginalRelevance for few-shot learning.",
        "scores": {"F1 (POS)": "0.554"},
    }


def config():
    return {
        "dataset": CT22CheckworthinessDataset,
        "task": CheckworthinessTask,
        "model": OpenAIModel,
        "model_args": {
            "class_labels": ["0", "1"],
            "max_tries": 30,
        },
        "general_args": {"test_split": "ar", "fewshot": {"train_split": "ar"}},
    }


def few_shot_prompt(input_sample, base_prompt, examples):
    out_prompt = base_prompt + "\n"
    # "اليك بعض الامثلة" = "Here are some examples:"
    out_prompt = out_prompt + "اليك بعض الامثلة:\n\n"
    for index, example in enumerate(examples):
        label = "not_checkworthy" if example["label"] == "0" else "checkworthy"

        # "مثال" = "Example", "التغريدة" = "tweet", "التصنيف" = "label"
        out_prompt = (
            out_prompt
            + "مثال "
            + str(index)
            + ":"
            + "\n"
            + "التغريدة: "
            + example["input"]
            + "\nالتصنيف: "
            + label
            + "\n\n"
        )

    # Append the sentence we want the model to predict for, but leave the label blank
    out_prompt = out_prompt + "التغريدة: " + input_sample + "\nالتصنيف: \n"

    return out_prompt


def prompt(input_sample, examples):
    # "Does this 'tweet' contain a claim worth verifying?
    # Answer with checkworthy or not_checkworthy."
    base_prompt = 'هل تحتوي هذه "التغريدة" على ادعاء يستحق التحقق منه؟ أجب بـ checkworthy أو not_checkworthy'
    return [
        {
            "role": "system",
            # "You are an expert in analyzing and classifying tweets."
            "content": "أنت خبير في تحليل وتصنيف التغريدات.",
        },
        {
            "role": "user",
            "content": few_shot_prompt(input_sample, base_prompt, examples),
        },
    ]


def post_process(response):
    label = response["choices"][0]["message"]["content"]

    label = label.replace("label:", "").strip().lower()

    if (
        "لا_يستحق_التحقق" in label
        or "لا يستحق التحقق" in label
        or "ليس يستحق التحقق" in label
        or "لا تستحق التحقق" in label
        or "no" in label
        or "لا" in label
        or "not" in label
    ):
        return "0"
    elif (
        "yes" in label
        or "نعم" in label
        or "يستحق التحقق" in label
        or "checkworthy" in label
    ):
        return "1"
    else:
        return None
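
One subtlety in post_process: "not_checkworthy" contains "checkworthy" as a substring, so the negative branch has to run before the positive one; swapping the branches would misroute every negative reply. A minimal check (mine, not part of the PR):

assert post_process({"choices": [{"message": {"content": "not_checkworthy"}}]}) == "0"
assert post_process({"choices": [{"message": {"content": "checkworthy"}}]}) == "1"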
@@ -0,0 +1,71 @@
from llmebench.datasets import CT22CheckworthinessDataset
from llmebench.models import OpenAIModel
from llmebench.tasks import CheckworthinessTask


def metadata():
    return {
        "author": "Arabic Language Technologies, QCRI, HBKU",
        "model": "gpt-4-32k (version 0314)",
        "description": "GPT-4 32k model hosted on Azure, using the ChatCompletion API, API version '2023-03-15-preview'. Zero-shot.",
        "scores": {"F1 (POS)": "0.560"},
    }


def config():
    return {
        "dataset": CT22CheckworthinessDataset,
        "task": CheckworthinessTask,
        "model": OpenAIModel,
        "model_args": {
            "class_labels": ["0", "1"],
            "max_tries": 30,
        },
        "general_args": {"test_split": "ar"},
    }


def prompt(input_sample):
    # "Does this 'tweet' contain a claim worth verifying? Answer with yes or no.
    # Provide only the label, without explanation."
    prompt_string = (
        'هل تحتوي هذه "التغريدة" على ادعاء يستحق التحقق منه؟ أجب بـ نعم أو لا. قدم التصنيف فقط دون شرح.\n\n'
        f"التغريدة: {input_sample}\n"
        "التصنيف: \n"
    )
    return [
        {
            "role": "system",
            # "You are an expert in analyzing and classifying tweets."
            "content": "أنت خبير في تحليل وتصنيف التغريدات.",
        },
        {
            "role": "user",
            "content": prompt_string,
        },
    ]


def post_process(response):
    label = response["choices"][0]["message"]["content"]

    label = label.replace("label:", "").strip().lower()

    if (
        "لا_يستحق_التحقق" in label
        or "لا يستحق التحقق" in label
        or "ليس يستحق التحقق" in label
        or "لا تستحق التحقق" in label
        or "no" in label
        or "لا" in label
        or "not" in label
    ):
        return "0"
    elif (
        "yes" in label
        or "نعم" in label
        or "يستحق التحقق" in label
        or "checkworthy" in label
    ):
        return "1"
    else:
        return None
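
When a reply matches none of the phrases above, post_process returns None; if I recall the framework's behavior correctly, LLMeBench then counts the sample as failed post-processing rather than assigning it a label. For example, with a hypothetical off-format reply:

off_format = {"choices": [{"message": {"content": "ربما"}}]}  # "ربما" = "maybe"
assert post_process(off_format) is None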