forked from Eerie16/deep-learning-morph-analyzer
-
Notifications
You must be signed in to change notification settings - Fork 0
/
make_encoders.py
59 lines (57 loc) · 1.68 KB
/
make_encoders.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
# Build one LabelEncoder per morphological tag category from the training
# treebank and pickle the list of encoders to 'tag_encoders.pickle' in the
# current working directory.
#
# The pickled list holds six encoders in this order:
#     [pos, gender, number, person, case, tam]
# Every encoder is also fitted on the extra class "Unk".
#
# To inspect the result afterwards:
#     with open('tag_encoders.pickle', 'rb') as f:
#         encoders = pickle.load(f)
#     print(*[e.classes_ for e in encoders], sep="\n")
import os
import pickle

from sklearn.preprocessing import LabelEncoder

DATASET_DIR = "datasets/HDTB_pre_release_version-0.05/IntraChunk/CoNLL/utf/news_articles_and_heritage/Training/"
# The dataset directory is resolved relative to this script's location.
BASE_DIR = os.path.join(os.path.dirname(__file__), DATASET_DIR)

encoders = [LabelEncoder() for _ in range(6)]

pos = []
gender = []
number = []
person = []
case = []
tam = []

# Key found in the feature column -> list that collects its values.
# Features with any other key are ignored.
feature_values = {
    'case': case,
    'num': number,
    'pers': person,
    'gen': gender,
    'tam': tam,
}

# List and open files from the same directory (BASE_DIR); listing from the
# CWD-relative DATASET_DIR only worked when the script was run from its own
# directory.
files = [f for f in os.listdir(BASE_DIR) if os.path.isfile(os.path.join(BASE_DIR, f))]

for file_name in files:
    # The dataset path contains a 'utf' component, so decode explicitly as
    # UTF-8 instead of relying on the platform default encoding.
    with open(os.path.join(BASE_DIR, file_name), 'r', encoding='utf-8') as conll_file:
        for line in conll_file:
            inp = line.rstrip('\n').split('\t')
            # Columns 3 (POS) and 5 (features) are read below, so at least
            # six tab-separated columns are required; blank separator lines
            # and short/malformed lines are skipped.
            if len(inp) < 6:
                continue
            pos.append(inp[3])
            features = inp[5]
            # A single-character feature column carries no key-value pairs.
            if len(features) <= 1:
                continue
            for feature in features.split('|'):
                # Split on the first hyphen only, so a value that itself
                # contains '-' is kept whole and a feature without any '-'
                # yields an empty value instead of raising.
                key, _, val = feature.partition('-')
                if val == '':
                    continue
                values = feature_values.get(key)
                if values is not None:
                    values.append(val)

tag_lists = (pos, gender, number, person, case, tam)

# Add the "Unk" class to every category before fitting.
for values in tag_lists:
    values.append("Unk")

for encoder, values in zip(encoders, tag_lists):
    encoder.fit(values)

print(encoders[5].classes_)

# Open the output only once everything succeeded, so a failure above does
# not leave behind a truncated pickle file.
with open('tag_encoders.pickle', 'wb') as encoders_file:
    pickle.dump(encoders, encoders_file)