-
Notifications
You must be signed in to change notification settings - Fork 18
/
Aqmar.py
129 lines (121 loc) · 4.68 KB
/
Aqmar.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
from pathlib import Path
from llmebench.datasets.dataset_base import DatasetBase
from llmebench.tasks import TaskType
class AqmarDataset(DatasetBase):
    """Dataset loader for the AQMAR Arabic named-entity corpus.

    The corpus is stored as one text file per article. Each non-blank line
    holds a whitespace-separated token and its tag, and blank lines separate
    sentences. The articles are partitioned into a dev and a test split by
    file name; both lists can be overridden through keyword arguments.
    """

    def __init__(self, **kwargs):
        """Initialise the dataset and the per-split file lists.

        Args:
            **kwargs: Forwarded unchanged to ``DatasetBase``. Two optional
                keys are also read here: ``dev_filenames`` and
                ``test_filenames``, each a list of article file names that
                replaces the corresponding default list.
        """
        super().__init__(**kwargs)
        self.dev_filenames = kwargs.get(
            "dev_filenames",
            [
                "Damascus.txt",
                "Atom.txt",
                "Raul_Gonzales.txt",
                "Linux.txt",
                "Imam_Hussein_Shrine.txt",
                "Nuclear_Power.txt",
                "Real_Madrid.txt",
                "Solaris.txt",
            ],
        )
        self.test_filenames = kwargs.get(
            "test_filenames",
            [
                "Crusades.txt",
                "Islamic_Golden_Age.txt",
                "Islamic_History.txt",
                "Ibn_Tolun_Mosque.txt",
                "Ummaya_Mosque.txt",
                "Enrico_Fermi.txt",
                "Light.txt",
                "Periodic_Table.txt",
                "Physics.txt",
                "Razi.txt",
                "Summer_Olympics2004.txt",
                "Christiano_Ronaldo.txt",
                "Football.txt",
                "Portugal_football_team.txt",
                "Soccer_Worldcup.txt",
                "Computer.txt",
                "Computer_Software.txt",
                "Internet.txt",
                "Richard_Stallman.txt",
                "X_window_system.txt",
            ],
        )

    @staticmethod
    def metadata():
        """Return static metadata describing the dataset.

        Returns:
            dict: language code, BibTeX citation, homepage link, license,
            the available splits (both point at the same corpus directory and
            differ only in their ``split`` name), the task type and the list
            of class labels.
        """
        return {
            "language": "ar",
            "citation": """@inproceedings{mohit-etal-2012-recall,
title = \"Recall-Oriented Learning of Named Entities in {A}rabic {W}ikipedia\",
author = \"Mohit, Behrang and
Schneider, Nathan and
Bhowmick, Rishav and
Oflazer, Kemal and
Smith, Noah A.\",
booktitle = \"Proceedings of the 13th Conference of the {E}uropean Chapter of the Association for Computational Linguistics\",
month = apr,
year = \"2012\",
address = \"Avignon, France\",
publisher = \"Association for Computational Linguistics\",
url = \"https://aclanthology.org/E12-1017\",
pages = \"162--173\",
}""",
            "link": "http://www.cs.cmu.edu/~ark/AQMAR/",
            "license": "CC BY-SA 3.0",
            "splits": {
                "test": {
                    "split": "test",
                    "path": "AQMAR_Arabic_NER_corpus-1.0",
                },
                "dev": {
                    "split": "dev",
                    "path": "AQMAR_Arabic_NER_corpus-1.0",
                },
            },
            "task_type": TaskType.SequenceLabeling,
            "class_labels": [
                "B-PERS",
                "I-PERS",
                "B-LOC",
                "I-LOC",
                "B-ORG",
                "I-ORG",
                "B-MISC",
                "I-MISC",
                "O",
            ],
        }

    @staticmethod
    def get_data_sample():
        """Return one illustrative sample with ``input`` and ``label`` keys.

        Both values are space-separated strings (tokens and tags).
        """
        # NOTE(review): the label below uses "B-PER" while metadata() lists
        # "B-PERS" in class_labels -- confirm which spelling the corpus uses.
        return {
            "input": ".كانت السبب الرئيس في سقوط البيزنطيين بسبب الدمار الذي كانت تخلفه الحملات الأولى المارة في بيزنطة ( مدينة القسطنطينية ) عاصمة الإمبراطورية البيزنطية وتحول حملات لاحقة نحوها",
            "label": "O O O O O B-PER O O O O O O O O O B-LOC O O B-LOC O O B-LOC I-LOC O O O O O",
        }

    def load_data(self, data_path, no_labels=False):
        """Load every sentence of the requested split.

        Args:
            data_path: Mapping with a ``"split"`` key (``"dev"`` selects the
                dev file list, any other value selects the test file list)
                and a ``"path"`` key naming the corpus directory.
            no_labels: Accepted for interface compatibility; not used here.

        Returns:
            list[dict]: One dict per sentence with the keys ``input``
            (space-joined tokens), ``label`` (space-joined tags) and
            ``line_number`` (0-based index, within its file, of the blank
            line that terminated the sentence).

        Raises:
            IndexError: If a non-blank line has fewer than two
                whitespace-separated fields.
        """
        split = data_path["split"]
        corpus_dir = data_path["path"]

        filenames = self.dev_filenames if split == "dev" else self.test_filenames

        data = []
        for fname in filenames:
            # resolve_path is provided outside this class (not visible here).
            path = self.resolve_path(Path(corpus_dir) / fname)
            # The corpus is Arabic text: decode explicitly as UTF-8 instead of
            # relying on the platform's locale-dependent default encoding.
            with open(path, "r", encoding="utf-8") as reader:
                data.extend(self._read_sentences(reader))
        return data

    @staticmethod
    def _read_sentences(lines):
        """Yield one sample dict per sentence found in an iterable of lines."""
        tokens = []
        labels = []
        line_idx = -1  # stays -1 when the input has no lines at all
        for line_idx, line in enumerate(lines):
            fields = line.split()
            if fields:
                tokens.append(fields[0])
                labels.append(fields[1])
                continue
            # Blank line: sentence boundary. Repeated or leading blank lines
            # carry no tokens and must not produce empty samples.
            if tokens:
                yield {
                    "input": " ".join(tokens),
                    "label": " ".join(labels),
                    "line_number": line_idx,
                }
                tokens = []
                labels = []
        # A file that does not end with a blank line still holds a pending
        # sentence; emit it, numbered as if the separator followed the last
        # line.
        if tokens:
            yield {
                "input": " ".join(tokens),
                "label": " ".join(labels),
                "line_number": line_idx + 1,
            }