# evaluation_engine.py (forked from GEM-benchmark/NL-Augmenter)

from evaluation import (
    evaluate_ner_tagging,
    evaluate_text_generation,
    evaluate_question_answering,
    evaluate_text_classification,
)
from interfaces.QuestionAnswerOperation import QuestionAnswerOperation
from interfaces.SentenceOperation import SentenceOperation
from interfaces.TaggingOperation import TaggingOperation
from tasks.TaskTypes import TaskType

"""
This is the evaluation engine.
It is currently implemented for SentenceOperation transformations, e.g.:
    python evaluate.py -t ButterFingersPerturbation
"""
def evaluate(
    implementation,
    task_type,
    language="en",
    model=None,
    dataset=None,
    percentage_of_examples=None,
    evaluate_filter=False,
):
    # The evaluation engine effectively does the following:
    # (1) loads a standard model and a test set (the model's original test set would be the best choice),
    # (2) executes the perturbation to generate a perturbed test set,
    # (3) runs the model against it and evaluates the model's performance (displayed nicely :P),
    # (4) writes a neat README.
    task_type = get_task_type(implementation, task_type)
    execute_model(
        implementation,
        evaluate_filter=evaluate_filter,
        task_type=task_type,
        locale=language,
        model_name=model,
        dataset=dataset,
        percentage_of_examples=percentage_of_examples,
    )
    return


def evaluate_mt(
    implementation,
    task_type,
    src_locale="en",
    tgt_locale="en",
    model=None,
    dataset=None,
    percent_of_examples=None,
    evaluate_filter=False,
):
    # TODO
    return


def get_task_type(implementation, task_type):
    if task_type is None:
        print(
            f"Undefined task type, switching to default task "
            f"{implementation.tasks[0].name}"
        )
        return str(implementation.tasks[0]).split(".")[1]
    return task_type
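
# For illustration: if implementation.tasks[0] is TaskType.TEXT_CLASSIFICATION, then
# str(implementation.tasks[0]) is "TaskType.TEXT_CLASSIFICATION" and split(".")[1] yields
# "TEXT_CLASSIFICATION", the key that TaskType[task_type] looks up in execute_model below.
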
def execute_model(
    implementation,
    task_type,
    locale="en",
    model_name=None,
    dataset=None,
    percentage_of_examples=20,
    evaluate_filter=False,
):
    interface = implementation.__bases__[0]  # e.g. SentenceOperation
    impl = implementation()
    if locale == "en":
        if (
            isinstance(impl, SentenceOperation)
            and TaskType[task_type] == TaskType.TEXT_CLASSIFICATION
        ):
            return evaluate_text_classification.evaluate(
                impl,
                evaluate_filter,
                model_name,
                dataset,
                split=f"test[:{percentage_of_examples}%]",
            )

        elif (
            isinstance(impl, QuestionAnswerOperation)
            and TaskType[task_type] == TaskType.QUESTION_ANSWERING
        ):
            return evaluate_question_answering.evaluate(
                impl,
                evaluate_filter,
                model_name,
                dataset,
                split=f"validation[:{percentage_of_examples}%]",
            )

        elif (
            isinstance(impl, SentenceOperation)
            and TaskType[task_type] == TaskType.TEXT_TO_TEXT_GENERATION
        ):
            return evaluate_text_generation.evaluate(
                impl,
                evaluate_filter,
                model_name,
                dataset,
                split=f"test[:{percentage_of_examples}%]",
            )

        elif (
            isinstance(impl, TaggingOperation)
            and TaskType[task_type] == TaskType.TEXT_TAGGING
        ):
            return evaluate_ner_tagging.evaluate(
                impl,
                evaluate_filter,
                model_name,
                dataset,
                split=f"test[:{percentage_of_examples}%]",
            )
        # Other if/else cases should be added here.
        else:
            print(
                f"No default evaluation model exists for the interface {interface} in the locale {locale}. "
                f"It's okay to skip the evaluation for the purpose of the PR. If you are interested in evaluating "
                f"your perturbation on a task and a dataset, "
                f"the right place to do it would be to add a new class in the evaluation folder "
                f"and call it from execute_model. That's it!"
            )
    else:
        print(
            f"No default evaluation model exists in the locale {locale}. "
            f"It's okay to skip the evaluation for the purpose of the PR. If you are interested in evaluating "
            f"your perturbation on a task and a dataset, "
            f"the right place to do it would be to add a new class in the evaluation folder "
            f"and call it from execute_model. That's it!"
        )
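
# A minimal sketch (hypothetical, not a module that exists in this repo) of what a new
# evaluation module under evaluation/ could look like, mirroring the call signature used
# above; the file name and body are assumptions:
#
#   # evaluation/evaluate_my_task.py
#   def evaluate(operation, evaluate_filter, model_name, dataset, split):
#       # Load model_name and the dataset split, apply the operation (its generate or
#       # filter method, depending on evaluate_filter) to each example, score the model
#       # on the original and perturbed data, and return both scores.
#       ...
#
# Then add a matching elif branch in execute_model above that dispatches to it.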