-
Notifications
You must be signed in to change notification settings - Fork 0
/
test.py
181 lines (172 loc) · 8.5 KB
/
test.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
### testing framework for outputs
### sample output:
'''10 - 4 = 6 (left: 5 6 6)
5 * 6 = 30 (left: 6 30)
30 - 6 = 24 (left: 24)
Answer: (5 * (10 - 4)) - 6 = 24
'''
'''
We should test the accuracy based on two metrics:
(1)
[1.1] check that the numbers used in a step appear in the "left:" list
of the previous step
[1.2] check that the "left:" list of the current step still contains
the numbers that have not been used yet
[1.3] check that the result of the step's expression is correct.
(2) check if last step outputs 24
'''
################## BEGIN CLASS DEFINITION ##################
## class that is an evaluator
class Gameof24OutputTester():
    """Evaluate a step-by-step answer to a Game of 24 puzzle.

    The expected response is three step lines followed by an answer line:

        10 - 4 = 6 (left: 5 6 6)
        5 * 6 = 30 (left: 6 30)
        30 - 6 = 24 (left: 24)
        Answer: (5 * (10 - 4)) - 6 = 24

    Each step is checked against three conditions:
      [1.1] the operands appear in the "left:" list of the previous step,
      [1.2] the "left:" list of the step still holds the unused puzzle numbers,
      [1.3] the arithmetic of the step is correct,
    and the answer line must evaluate to 24 using exactly the puzzle numbers.

    SECURITY NOTE: step and answer expressions are evaluated with ``eval``.
    Only run this class on responses from a trusted source.
    """

    # marker that introduces the remaining numbers on a step line
    _LEFT_MARKER = "(left:"
    # binary operators allowed between the two operands of a step
    _OPERATORS = "+-*/"
    # maps every operator/bracket character of an answer to a blank so that
    # "5*(10-4)" tokenizes into separate numbers instead of fusing digits
    _SEPARATOR_TABLE = str.maketrans(dict.fromkeys("+-*/()[]=", " "))

    def __init__(self, puzzle: str, response: str, solution: list = None):
        """Store the puzzle and normalize the raw response into lines.

        Args:
            puzzle: whitespace-separated puzzle numbers, e.g. "4 5 6 10".
            response: raw multi-line model output.
            solution: optional reference solution; it is copied, never mutated.
        """
        # original puzzle
        self.puzzle = puzzle.split()
        # lists to store unused and used numbers
        self.unused_nums = self.puzzle.copy()
        self.used_nums = []
        # None (not a shared mutable []) is the default; always keep a copy
        self.solution = list(solution) if solution is not None else []

        # preprocess response string: one stripped entry per line
        lines = [line.strip() for line in response.strip().split("\n")]
        if lines[0] == "Steps:":
            lines.pop(0)
        # the response may have consisted of the "Steps:" header only
        if lines:
            lines[-1] = lines[-1].replace('###', '').strip()
            lines[-1] = lines[-1].replace("Final", "").strip()
        # drop everything that trails the answer line
        while lines and not lines[-1].startswith("Answer:"):
            lines.pop()
        self.response = lines

    def get_left(self, rline: str) -> list:
        """Return the numbers listed in the "(left: ...)" part of a step line.

        Numbers may be separated by blanks and/or commas. An empty list is
        returned when the line has no "(left:" marker.
        """
        marker_pos = rline.find(self._LEFT_MARKER)
        if marker_pos == -1:
            return []
        start = marker_pos + len(self._LEFT_MARKER)
        # look for the closing parenthesis *after* the marker only, so that
        # parentheses inside the expression cannot be picked up by mistake
        end = rline.find(")", start)
        if end == -1:
            end = len(rline)
        return rline[start:end].replace(",", " ").split()

    def get_nums_and_expr(self, rline: str) -> tuple:
        """Split a step line into its two operands and an evaluable expression.

        Returns:
            (num1, num2, expr) where expr is the text before "(left:" with
            every "=" turned into "==", e.g. ("10", "4", "10 - 4 == 6").

        Raises:
            ValueError: if the line has no "(left:" marker or no binary
                operator on the left-hand side of "=".
        """
        marker_pos = rline.find(self._LEFT_MARKER)
        if marker_pos == -1:
            raise ValueError(f"step has no '{self._LEFT_MARKER}' section: {rline!r}")
        expr = rline[:marker_pos].strip()
        lhs = expr.split("=")[0].strip()
        # start at index 1 so the sign of a negative first operand is not
        # mistaken for the operator; the first operator found afterwards is
        # the binary one, which also keeps a sign on the second operand
        op_pos = next(
            (i for i in range(1, len(lhs)) if lhs[i] in self._OPERATORS), -1
        )
        if op_pos == -1:
            raise ValueError(f"step has no binary operator: {rline!r}")
        num1 = lhs[:op_pos].strip()
        num2 = lhs[op_pos + 1:].strip()
        return (num1, num2, expr.replace("=", "=="))

    def check_used_in_left_of_previous(self, thisStep: str, prevStep: str) -> bool:
        """Condition 1.1: both operands of thisStep are in prevStep's left list.

        Each entry of the left list can satisfy one operand only, so "3 * 3"
        needs two 3s to be available.
        """
        first_num, second_num, _ = self.get_nums_and_expr(rline=thisStep)
        available = self.get_left(rline=prevStep)
        for operand in (first_num, second_num):
            if operand not in available:
                return False
            available.remove(operand)
        return True

    def check_unused_in_left_of_current(self, step: str) -> bool:
        """Condition 1.2: every still-unused puzzle number is in step's left list.

        Duplicates are counted: two unused 6s need two 6s in the left list.
        """
        remaining = self.get_left(rline=step)
        for number in self.unused_nums:
            if number not in remaining:
                return False
            remaining.remove(number)
        return True

    def eval_regular_line(self, rline: str, first: bool, prevLine: str = "") -> list:
        """Evaluate one step line (not the answer line).

        Ex: 5 * 6 = 30 (left: 6 30)

        Args:
            rline: the step line to evaluate.
            first: True for the first step, for which 1.1 is not checked.
            prevLine: the preceding step line (ignored when first is True).

        Returns:
            [cond 1.1, cond 1.2, cond 1.3]; all False if the line is malformed.
            Side effect: operands that are unused puzzle numbers are moved
            from self.unused_nums to self.used_nums.
        """
        # conditions we met, in the order described in the class docstring
        conditions = [first, False, False]
        try:
            n1, n2, expr = self.get_nums_and_expr(rline=rline)
            # unused_nums only ever shrinks from a copy of the puzzle, so
            # membership there implies the operand is a puzzle number
            for operand in (n1, n2):
                if operand in self.unused_nums:
                    self.unused_nums.remove(operand)
                    self.used_nums.append(operand)
            conditions[1] = self.check_unused_in_left_of_current(step=rline)
            # SECURITY: eval runs text taken from the response being graded;
            # do not feed this class untrusted input.
            conditions[2] = eval(expr)
            # the first line has no previous step to compare against
            if not first:
                conditions[0] = self.check_used_in_left_of_previous(
                    thisStep=rline, prevStep=prevLine
                )
        except Exception:
            # malformed line (unparsable, bad arithmetic syntax, ...): fail
            # all three checks but let KeyboardInterrupt/SystemExit through
            conditions = [False] * 3
        return conditions

    def eval_answer(self, ans: str) -> bool:
        """Evaluate the answer line.

        Ex: Answer: (5 * (10 - 4)) - 6 = 24

        Returns:
            True only if the expression evaluates to 24 and uses exactly the
            puzzle numbers, each once. Unparsable answers yield False.
        """
        cleaned = (
            ans.replace("Answer:", "").replace("###", "").replace("Final", "").strip()
        )
        # 1. evaluate the expression to check correctness.
        try:
            # SECURITY: eval runs text taken from the response being graded;
            # do not feed this class untrusted input.
            expr_result = eval(cleaned.replace("=", "==") + " == 24")
            if hasattr(expr_result, "__iter__") and len(expr_result) == 1:
                expr_result = expr_result.pop()
        except Exception:
            expr_result = False
        # 2. the expression can only use each number once, and cannot use
        #    extraneous values. Operators become blanks so that an answer
        #    written without spaces still tokenizes into separate numbers;
        #    the target value 24 itself is not one of the operands.
        tokens = cleaned.translate(self._SEPARATOR_TABLE).split()
        try:
            nums_in_expr = sorted(int(tok) for tok in tokens if tok != "24")
        except ValueError:
            # words or decimals in the answer: not a valid use of the puzzle
            return False
        expected = sorted(int(num) for num in self.puzzle)
        return bool(expr_result) and nums_in_expr == expected

    def eval_response(self):
        """Main procedure: evaluate the last three steps and the answer line.

        Returns:
            (out, failed, message) where out maps each check description to
            True/False/None (None = not evaluated), plus "Failure Step Number"
            and "Failed Step"; failed tells whether any check failed; message
            is "OK", "fundamental error", or an explanatory note.
        """
        out = {"Numbers used are in the left of the previous step": None,
               "The left of the current step contains the numbers not used": None,
               "The output of the expression is correct": None,
               "Last step outputs 24": None,
               "Failure Step Number": "Correct",
               "Failed Step": ""
               }
        # start from a clean slate so repeated calls give the same verdict
        self.unused_nums = self.puzzle.copy()
        self.used_nums = []

        # if the number of lines is below 4, or the number of lines excluding
        # the answer is not a multiple of 3, the response is structurally
        # wrong. Report a "fundamental error" so it can be distinguished from
        # the other kind of failure. This is counted as failing on step 1.
        if len(self.response) < 4 or (len(self.response) - 1) % 3 != 0:
            for check in ("Numbers used are in the left of the previous step",
                          "The left of the current step contains the numbers not used",
                          "The output of the expression is correct",
                          "Last step outputs 24"):
                out[check] = False
            out["Failure Step Number"] = "1"
            return out, True, "fundamental error"

        # structurally fine: look at the last 4 lines ONLY.
        self.response = self.response[-4:]
        *steps, answer = self.response

        outFlag = False
        for i, step in enumerate(steps):
            # the first step has no previous line, so 1.1 is not checked
            c11, c12, c13 = self.eval_regular_line(
                rline=step,
                first=(i == 0),
                prevLine=steps[i - 1] if i != 0 else "",
            )
            out["Numbers used are in the left of the previous step"] = c11
            out["The left of the current step contains the numbers not used"] = c12
            out["The output of the expression is correct"] = c13
            # if any condition fails, note the line number and step and stop
            if not (c11 and c12 and c13):
                out["Failure Step Number"] = str(i + 1)
                out["Failed Step"] = step
                outFlag = True
                break

        # the answer is only judged when every step before it passed
        if not outFlag:
            final_correct = self.eval_answer(ans=answer)
            out["Last step outputs 24"] = final_correct
            if not final_correct:
                out["Failure Step Number"] = str(len(self.response))
                out["Failed Step"] = answer
                outFlag = True

        message = ("Note: Conditions with a check result of 'None' were not evaluated because of an intermediate failure.\n"
                   "Alternatively, the final answer may not have been possible to calculate according to the model.") \
            if outFlag else "OK"
        return out, outFlag, message
################## END CLASS DEFINITION ##################