-
Notifications
You must be signed in to change notification settings - Fork 3
/
split_files.py
76 lines (72 loc) · 2.13 KB
/
split_files.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import pdb
def _rf_record(question, answers, labels, stoplist):
    """Format one merged ``.rf`` output line (newline included) for a question."""
    q_tokens = [tok for tok in question.split() if tok not in stoplist]
    a_tokens = [tok for answer in answers for tok in answer]
    label = 1 if 1 in labels else 0
    return "\t".join([" ".join(q_tokens), " ".join(a_tokens), str(label)]) + "\n"


def gen_qa_pairs(names, stoplist):
    """Merge all candidate answers of each question into a single record.

    For every prefix ``n`` in ``names`` the file ``n + '.txt'`` is read and
    ``n + '.rf'`` is written.  Each input line is expected to hold three
    tab-separated fields: question, answer, integer label.  Consecutive
    lines sharing the same question form one group; for each group one line
    ``question tokens<TAB>answer tokens<TAB>label`` is written, where

    * tokens are whitespace-separated words not contained in ``stoplist``,
    * the answer tokens of all rows of the group are concatenated in order,
    * the label is 1 if any row of the group is labelled 1, otherwise 0.

    Blank input lines are skipped, and an empty input file produces an
    empty output file.

    Args:
        names: iterable of path prefixes (without the ``.txt`` extension).
        stoplist: container of tokens to drop (only ``in`` tests are used).

    Raises:
        IndexError: if a non-blank line has fewer than three fields.
        ValueError: if the third field is not an integer.
    """
    for n in names:
        # ``with`` guarantees both handles are closed even if a row is bad.
        with open(n + '.txt') as fi, open(n + '.rf', 'w') as fo:
            question = None  # None means "no group started yet"
            answers, labels = [], []
            for line in fi:
                stripped = line.strip()
                if not stripped:
                    continue
                fields = stripped.split("\t")
                if fields[0] != question:
                    # A new question begins: emit the group collected so far.
                    if question is not None:
                        fo.write(_rf_record(question, answers, labels, stoplist))
                    question = fields[0]
                    answers, labels = [], []
                answers.append(
                    [tok for tok in fields[1].split() if tok not in stoplist])
                labels.append(int(fields[2]))
            # Emit the final group, but only if any row was read at all.
            if question is not None:
                fo.write(_rf_record(question, answers, labels, stoplist))
def _bal_record(question, first_answer, stoplist):
    """Format the single ``.bal`` line for a question, or ``""`` if it has no answer."""
    if first_answer[1]:
        label = 1
    elif first_answer[0]:
        label = 0
    else:
        return ""
    q_tokens = [tok for tok in question.split() if tok not in stoplist]
    a_tokens = [tok for tok in first_answer[label].split() if tok not in stoplist]
    return "\t".join([" ".join(q_tokens), " ".join(a_tokens), str(label)]) + "\n"


def gen_qa_pairs_bal(names, stoplist):
    """Write one question/answer sample per question.

    For every prefix ``n`` in ``names`` the file ``n + '.txt'`` is read and
    ``n + '.bal'`` is written.  Each input line is expected to hold three
    tab-separated fields: question, answer, label (0 or 1).  Consecutive
    lines sharing the same question form one group.  For each group the
    first non-empty answer seen for each label is remembered, and exactly
    one line is written: the remembered label-1 answer if there is one,
    otherwise the remembered label-0 answer.  The output format is
    ``question tokens<TAB>answer tokens<TAB>label`` with tokens contained
    in ``stoplist`` removed.

    The last group of the file is written as well (it used to be dropped
    because nothing was flushed after the read loop).  Blank input lines
    are skipped.

    Args:
        names: iterable of path prefixes (without the ``.txt`` extension).
        stoplist: container of tokens to drop (only ``in`` tests are used).

    Raises:
        IndexError: if a non-blank line has fewer than three fields.
        ValueError: if the third field is not an integer.
        KeyError: if the label is an integer other than 0 or 1.
    """
    for n in names:
        # ``with`` guarantees both handles are closed even if a row is bad.
        with open(n + '.txt') as fi, open(n + '.bal', 'w') as fo:
            question = None  # None means "no group started yet"
            first_answer = {0: '', 1: ''}
            for line in fi:
                stripped = line.strip()
                if not stripped:
                    continue
                fields = stripped.split("\t")
                if fields[0] != question:
                    # A new question begins: emit the sample of the previous one.
                    if question is not None:
                        fo.write(_bal_record(question, first_answer, stoplist))
                    question = fields[0]
                    first_answer = {0: '', 1: ''}
                label = int(fields[2])
                # Keep only the first non-empty answer seen for each label.
                if not first_answer[label]:
                    first_answer[label] = fields[1]
            # Emit the final group, which has no following question to trigger it.
            if question is not None:
                fo.write(_bal_record(question, first_answer, stoplist))
def genstoplist(stopfile):
    """Load a stop-word file into a set.

    Every line of ``stopfile`` contributes one entry, with leading and
    trailing whitespace removed.  Blank lines therefore contribute the
    empty string.

    Args:
        stopfile: path of a text file with one stop word per line.

    Returns:
        A ``set`` of the stripped lines.
    """
    with open(stopfile) as sf:
        # Build the set directly rather than via an intermediate list.
        return {line.strip() for line in sf}
def preproc(stopfile='data/short-stopwords.txt',
            names=('data/old/WikiQASent-train',
                   'data/old/WikiQASent-dev',
                   'data/old/WikiQASent-test')):
    """Generate the ``.bal`` files for a set of datasets.

    Loads the stop words with ``genstoplist`` and passes them, together with
    the dataset prefixes, to ``gen_qa_pairs_bal``.  Called without arguments
    it processes the same stop-word file and the same three prefixes that
    were previously hard-coded.

    Args:
        stopfile: path of the stop-word file.
        names: iterable of dataset path prefixes (without ``.txt``).
    """
    slist = genstoplist(stopfile)
    # Hand over a list, as the original call did.
    gen_qa_pairs_bal(list(names), slist)
# Module-level call: the preprocessing runs whenever this file is executed
# or imported.
preproc()