-
Notifications
You must be signed in to change notification settings - Fork 18
/
CT22Attentionworthy.py
66 lines (59 loc) · 2.59 KB
/
CT22Attentionworthy.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
import pandas as pd
from llmebench.datasets.dataset_base import DatasetBase
from llmebench.tasks import TaskType
class CT22AttentionworthyDataset(DatasetBase):
    """Loader for the CheckThat! 2022 "attentionworthy" tweet classification data.

    Each sample pairs a tweet's text with one of the class labels listed in
    ``metadata()["class_labels"]``. The data files are tab-separated and are
    read with pandas.
    """

    def __init__(self, **kwargs):
        """Forward all keyword arguments unchanged to ``DatasetBase``."""
        super().__init__(**kwargs)

    @staticmethod
    def get_data_sample():
        """Return a minimal example of the ``input``/``label`` sample shape."""
        return {"input": "some tweet", "label": "no_not_interesting"}

    @staticmethod
    def metadata():
        """Return the static description of this dataset.

        The returned dict holds the languages, BibTeX citation, source link,
        license, per-language split file names, task type and the list of
        class labels.
        """
        return {
            "language": ["ar", "bg", "nl", "en", "tr"],
            # The citation is a regular (non-raw) string, so every LaTeX
            # accent command needs a doubled backslash to survive as a literal
            # backslash; a lone \' would collapse to a plain quote character.
            # Continuation lines are kept flush-left so the string content is
            # not altered by source indentation.
            "citation": """@InProceedings{clef-checkthat:2022:task1,
author = {Nakov, Preslav and Barr\\'{o}n-Cede\\~{n}o, Alberto and Da San Martino, Giovanni and Alam, Firoj and M\\'{\\i}guez, Rub\\'{e}n and Caselli, Tommaso and Kutlu, Mucahid and Zaghouani, Wajdi and Li, Chengkai and Shaar, Shaden and Mubarak, Hamdy and Nikolov, Alex and Kartal, Yavuz Selim and Beltr\\'{a}n, Javier},
title = "Overview of the {CLEF}-2022 {CheckThat}! Lab Task 1 on Identifying Relevant Claims in Tweets",
year = {2022},
booktitle = "Working Notes of {CLEF} 2022---Conference and Labs of the Evaluation Forum",
series = {CLEF~'2022},
address = {Bologna, Italy},
}""",
            "link": "https://gitlab.com/checkthat_lab/clef2022-checkthat-lab/clef2022-checkthat-lab",
            "license": "Research Purpose Only",
            "splits": {
                "ar": {
                    "test": "CT22_arabic_1D_attentionworthy_test_gold.tsv",
                    "train": "CT22_arabic_1D_attentionworthy_train.tsv",
                }
            },
            "task_type": TaskType.Classification,
            "class_labels": [
                "yes_discusses_action_taken",
                "harmful",
                "yes_discusses_cure",
                "yes_asks_question",
                "no_not_interesting",
                "yes_other",
                "yes_blame_authorities",
                "yes_contains_advice",
                "yes_calls_for_action",
            ],
        }

    def load_data(self, data_path):
        """Read a tab-separated split file and return its rows as sample dicts.

        Args:
            data_path: Location of the TSV file; it is first passed through
                ``self.resolve_path`` (defined outside this class —
                presumably resolves it against the dataset directory; verify
                in ``DatasetBase``).

        Returns:
            A list with one dict per row, containing:
                ``input``: the ``tweet_text`` column value,
                ``label``: the ``class_label`` column value, stringified and
                    lower-cased,
                ``input_id``: the ``tweet_id`` column value,
                ``line_number``: the row's DataFrame index.

        Raises:
            KeyError: if one of the expected columns is missing from the file.
        """
        data_path = self.resolve_path(data_path)
        raw_data = pd.read_csv(data_path, sep="\t")

        # iterrows() is kept (rather than a vectorised export) so each value
        # keeps the same per-row type and the index is available as
        # line_number.
        return [
            {
                "input": row["tweet_text"],
                # str() guards against non-string cells before lower-casing.
                "label": str(row["class_label"]).lower(),
                "input_id": row["tweet_id"],
                "line_number": index,
            }
            for index, row in raw_data.iterrows()
        ]