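"""
Aggregate word-in-context substitute predictions for SemEval-2020 Task 1
lexical semantic change detection.

The script loads the gold graded and binary change labels, reads per-word
substitute predictions generated by a language model (Llama-2-7b-hf by
default), and scores graded change as the mean pairwise Jaccard distance
between the substitute lists of the two time periods, reporting the Spearman
correlation against the gold scores. Disabled variants based on Jensen-Shannon
distance and rank-biased overlap are kept in commented-out blocks below.
"""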
import json
import os
import pickle

import numpy as np
from scipy.spatial.distance import jensenshannon
from scipy.stats import spearmanr, kendalltau
from sklearn.metrics import f1_score, dcg_score
from bayes_opt import BayesianOptimization

# jensenshannon and f1_score are only used by the commented-out Jensen-Shannon
# variant below; BayesianOptimization, dcg_score, kendalltau and pickle appear
# unused, presumably left over from experimentation.
language = 'English'
language_short = language[:3].lower()

# Gold graded change scores (SemEval-2020 Task 1, subtask 2).
D = {}
with open(f'/mimer/NOBACKUP/groups/cik_data/datasets/LSC/SemEval-{language}/semeval2020_ulscd_{language_short}/truth/graded.txt') as f:
    for line in f:
        word, label = line.split()
        D[word] = float(label)

# Gold binary change labels (subtask 1).
D2 = {}
with open(f'/mimer/NOBACKUP/groups/cik_data/datasets/LSC/SemEval-{language}/semeval2020_ulscd_{language_short}/truth/binary.txt') as f:
    for line in f:
        word, label = line.split()
        D2[word] = int(label)
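# Both truth files are whitespace-separated, one "<word> <label>" pair per
# line; e.g. a graded.txt line might look like "attack_nn 0.32" and a
# binary.txt line like "attack_nn 1" (illustrative values, not from the data).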
language = language.lower()
word2sub = {}
# Models with predictions available: mGPT, Llama-2-13b-hf, Llama-2-7b-hf,
# bloom-7b1; iterate over os.listdir('semeval_predictions') to use them all.
for model in ['Llama-2-7b-hf']:
    languages = os.listdir(f'semeval_predictions/{model}')
    if language in languages:
        for fname in os.listdir(f'semeval_predictions/{model}/{language}'):
            # File names follow the pattern <word>_<time>.<ext>, where <word>
            # may itself contain underscores.
            tempname = fname.split('.')[0]
            word = '_'.join(tempname.split('_')[:-1])
            time = tempname.split('_')[-1]
            word2sub.setdefault(word, {}).setdefault(time, {}).setdefault(model, [])
            with open(f'semeval_predictions/{model}/{language}/{fname}') as f:
                for line in f:
                    word2sub[word][time][model].append(json.loads(line))
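# word2sub now maps word -> time period -> model -> list of prediction
# records, e.g. (hypothetical record; only the 'output' field is used below):
#   word2sub['attack_nn']['1']['Llama-2-7b-hf'][0] == {'output': '...'}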
def comp_function(**kwargs):
    # **kwargs is a leftover hook for hyperparameter search (see the
    # commented-out return at the bottom); the current version takes no
    # parameters.
    model_rep = {}  # only consumed by the commented-out Jensen-Shannon variant
    words_set = {}
    for word in word2sub:
        word_orig = word.split()[0]
        for time in sorted(word2sub[word]):
            for model in word2sub[word][time]:
                if model not in model_rep:
                    model_rep[model] = {}
                for sentence in word2sub[word][time][model]:
                    if word not in words_set:
                        words_set[word] = {}
                    if time not in words_set[word]:
                        words_set[word][time] = []
                    tokens_set = []
                    predictions = sentence['output'].replace('<pad>', '').replace('[PAD]', '').split('<|answer|>')[1]
                    if '<|end|>' in predictions:
                        predictions = predictions.split('<|end|>')[0].split('<|s|>')
                    else:
                        # No explicit end marker: keep at most five substitutes.
                        predictions = predictions.split('<|s|>')[:5]
                    for pred in predictions:
                        token = pred.strip()
                        # Keep alphabetic substitutes of length >= 3 that do
                        # not contain the target lemma itself.
                        if len(token) >= 3 and token.replace(' ', '').isalpha() and word_orig.split('_')[0] not in token:
                            tokens_set.append(token)
                    words_set[word][time].append(tokens_set)
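    # The parsing above assumes generations shaped roughly like the following
    # (an illustrative string, not taken from actual model output):
    #   "<prompt> <|answer|> assault <|s|> raid <|s|> strike <|end|> ..."
    # which, provided none of the substitutes contains the target lemma,
    # yields tokens_set == ['assault', 'raid', 'strike'].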
    def jaccard(list1, list2):
        intersection = len(set(list1).intersection(list2))
        union = (len(list1) + len(list2)) - intersection
        return float(intersection) / union
    def rbo(l1, l2, p=0.98):
        """
        Calculates the rank-biased overlap (RBO) score between two ranked
        lists (Webber et al., 2010); presumably an alternative to the Jaccard
        measure, currently unused.
        l1 -- ranked list 1
        l2 -- ranked list 2
        """
        if l1 is None:
            l1 = []
        if l2 is None:
            l2 = []
        sl, ll = sorted([(len(l1), l1), (len(l2), l2)])
        s, S = sl  # length and contents of the shorter list
        l, L = ll  # length and contents of the longer list
        if s == 0:
            return 0
        # Calculate the overlaps at ranks 1 through l (the longer list).
        ss = set()  # elements of the shorter list seen up to depth i
        ls = set()  # elements of the longer list seen up to depth i
        x_d = {0: 0}
        sum1 = 0.0
        for i in range(l):
            x = L[i]
            y = S[i] if i < s else None
            d = i + 1
            if x == y:
                # Same element on both lists: the overlap grows by one.
                x_d[d] = x_d[d - 1] + 1.0
            else:
                # Otherwise track each side and count cross-occurrences.
                ls.add(x)
                if y is not None:
                    ss.add(y)
                x_d[d] = x_d[d - 1] + (1.0 if x in ss else 0.0) + (1.0 if y in ls else 0.0)
            sum1 += x_d[d] / d * pow(p, d)
        sum2 = 0.0
        for i in range(l - s):
            d = s + i + 1
            sum2 += x_d[d] * (d - s) / (d * s) * pow(p, d)
        sum3 = ((x_d[l] - x_d[s]) / l + x_d[s] / s) * pow(p, l)
        # Extrapolated RBO, Equation 32 in Webber et al. (2010).
        rbo_ext = (1 - p) / p * (sum1 + sum2) + sum3
        return rbo_ext
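    # Sanity checks (hypothetical inputs, verifiable by hand): identical lists
    # give exactly 1.0 and fully disjoint equal-length lists give 0.0, e.g.
    #   rbo(['a', 'b'], ['a', 'b']) == 1.0
    #   rbo(['a'], ['b']) == 0.0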
"""
truth = []
predict = []
for word in words_set:
set1 = set([s for s1 in words_set[word]['1'] for s in s1])
set2 = set([s for s2 in words_set[word]['2'] for s in s2])
print(word)
print(jaccard(set1,set2))
predict.append(jaccard(set1,set2))
truth.append(D[word])
print(spearmanr(truth,predict))
"""
    # Active variant: graded change as the mean pairwise Jaccard distance
    # between substitute lists from the two time periods, using only pairs
    # where both lists have at least five substitutes.
    truth = []
    predict = []
    for word in words_set:
        scores = []
        for s1 in words_set[word]['1']:
            for s2 in words_set[word]['2']:
                if len(s1) >= 5 and len(s2) >= 5:
                    scores.append(jaccard(s1, s2))
                    # Alternative (disabled): rank correlation over the lists
                    # truncated to a common length m = min(len(s1), len(s2)):
                    # if not np.isnan(spearmanr(s1[:m], s2[:m]).statistic):
                    #     scores.append(spearmanr(s1[:m], s2[:m]).statistic)
        if not scores:
            # No sentence pair yielded two full lists; skip the word rather
            # than dividing by zero below.
            continue
        print(word)
        print(1 - (sum(scores) / len(scores)))
        predict.append(1 - (sum(scores) / len(scores)))
        truth.append(D[word])
    print(spearmanr(truth, predict))
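    # Worked example with hypothetical lists: for
    #   s1 = ['assault', 'raid', 'strike', 'onslaught', 'offensive']
    #   s2 = ['assault', 'raid', 'strike', 'critique', 'review']
    # the intersection has 3 elements and the union 7, so jaccard(s1, s2) is
    # 3/7 and the distance is 1 - 3/7 ≈ 0.571; larger mean distances indicate
    # more semantic change.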
"""
truth = []
predict = []
binary_predict = []
binary_truth = []
for model in model_rep:
for word in sorted(model_rep[model]):
word_orig = word.split()[0]
p0 = [0.]* len(model_rep[model][word])
p1 = [0.]* len(model_rep[model][word])
for j,rep in enumerate(model_rep[model][word]):
if rep in time_counts[model][word][0]:
p0[j] = time_counts[model][word][0][rep]
else:
p0[j] = 0.
if rep in time_counts[model][word][1]:
p1[j] = time_counts[model][word][1][rep]
else:
p1[j] = 0.
p0 = np.array(p0)/np.sum(p0)
p1 = np.array(p1)/np.sum(p1)
predict.append(jensenshannon(p0,p1))
binary_predict.append(0)
for j in range(len(p0)):
if p0[j] < 0.001 and p1[j] > 0.05:
binary_predict[-1] = 1
elif p1[j] < 0.001 and p0[j] > 0.05:
binary_predict[-1] = 1
binary_truth.append(D2[word])
truth.append(D[word])
spvalue = spearmanr(truth,predict).statistic
f1 = f1_score(binary_truth,binary_predict,average='weighted')
"""
"""
print(spvalue)
print(f1)
for j,word in enumerate(sorted(model_rep[model])):
print(word,predict[j])
"""
    # Combined objective for hyperparameter search, disabled along with the
    # Jensen-Shannon variant that defines f1 and spvalue:
    # return (1 / 2) * f1 + (1 / 2) * spvalue


comp_function()
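# The **kwargs signature and the commented-out return suggest comp_function
# was once tuned with bayes_opt. A minimal sketch of how that would look,
# assuming a single hypothetical parameter `p` and requiring the return
# statement above to be re-enabled (the bounds are illustrative, not values
# from this script):
#
#   optimizer = BayesianOptimization(f=comp_function, pbounds={'p': (0.9, 1.0)})
#   optimizer.maximize(init_points=2, n_iter=10)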