subword_embedding.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import numpy as np
import jieba
from utility import *  # provides remove_punctuation (and other helpers)
class Subword_Embedding:
    """
    Extract sub-words via the FMM and BMM algorithms,
    then get the word embedding of each sub-word.
    """

    def __init__(self, sub_list, pre_trained, standard_synonym):
        self.sub_list = sub_list                  # sub-word dictionary, sorted longest-first
        self.pre_trained = pre_trained            # pre-trained lookup (dict-like: token -> np.ndarray)
        self.standard_synonym = standard_synonym  # standard and synonym terms to vectorize
    def FMM(self, term: str):
        """
        Forward Maximum Matching (FMM):
        scan left-to-right, greedily taking the longest dictionary match.
        """
        start = 0
        while start != self.len_term:
            index = start + self.max_len
            if index > self.len_term:
                index = self.len_term
            for _ in range(self.max_len):
                # Accept a dictionary sub-word, or fall back to a single character
                if (term[start:index] in self.sub_list) or (len(term[start:index]) == 1):
                    self.standard_subs.append(term[start:index])
                    start = index
                    break
                index -= 1
    def BMM(self, term: str):
        """
        Backward Maximum Matching (BMM):
        scan right-to-left, greedily taking the longest dictionary match.
        """
        start = self.len_term
        while start != 0:
            index = start - self.max_len
            if index < 0:
                index = 0
            for _ in range(self.max_len):
                # Accept a dictionary sub-word, or fall back to a single character
                if (term[index:start] in self.sub_list) or (len(term[index:start]) == 1):
                    self.standard_subs.append(term[index:start])
                    start = index
                    break
                index += 1
    def get_subword(self, term: str, is_print: bool) -> list:
        """
        Get the sub-word(s) of a term through
        Forward Maximum Matching (FMM) and
        Backward Maximum Matching (BMM).
        Reference: https://zhuanlan.zhihu.com/p/103392455
        :param term: the input term
        :param is_print: whether to print the resulting sub-words
        :return: the de-duplicated list of sub-words
        """
        self.standard_subs = []
        self.len_term = len(term)
        # Assumes sub_list is sorted longest-first, so its head gives the window size
        self.max_len = len(self.sub_list[0])
        if self.len_term != 0:
            self.FMM(term=term)
            self.BMM(term=term)
            # Remove repeated sub-words
            self.standard_subs = list(set(self.standard_subs))
        else:
            self.standard_subs.append('')
        if is_print:
            print('Sub-word(s) are ', self.standard_subs)
        return self.standard_subs
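
    # Illustrative example (hypothetical dictionary, not part of the original code):
    # with sub_list = ['机器人', '机器', '学习'] (longest entry first, so max_len = 3),
    # get_subword('机器学习') runs FMM ('机器' + '学习') and BMM ('学习' + '机器')
    # and returns the de-duplicated set, e.g. ['机器', '学习'].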
    def jieba_subword(self, term: str, negative: int) -> tuple:
        """
        Embed the jieba tokenization and the sub-word tokenization,
        flagging and stripping negation characters.
        """
        neg_word = ['非', '不', '无', '否', '假']
        jieba_token = jieba.lcut(term, HMM=True)
        subword_token = self.get_subword(term=term, is_print=False)
        tokens = jieba_token + subword_token
        for token in tokens:
            if token in neg_word:
                # Negation found: flag it and strip the character from the term
                negative = 1
                term = term.replace(token, '')
            else:
                temp_vec = self.pre_trained.get(token)
                if temp_vec is not None and token != '':
                    temp_vec = temp_vec.tolist()
                    if temp_vec not in self.grams:
                        self.grams.append(temp_vec)
        return term, negative
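
    # Illustrative behaviour (hypothetical inputs): if tokenization of term = '不良'
    # yields the single character '不', jieba_subword flags negative = 1, strips
    # '不' from the term, and returns ('良', 1); get_embedding then negates the
    # averaged vector to approximate the negated meaning.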
    # def jieba_subword(self, term: str) -> str:
    #     """
    #     Jieba tokenization embedding and sub-word embedding
    #     (earlier version, without negation handling).
    #     """
    #     jieba_token = jieba.lcut(term, HMM=True)
    #     subword_token = self.get_subword(term=term, is_print=False)
    #     tokens = jieba_token + subword_token
    #
    #     for token in tokens:
    #         temp_vec = self.pre_trained.get(token)
    #         if temp_vec is not None and token != '':
    #             temp_vec = temp_vec.tolist()
    #             if temp_vec not in self.grams:
    #                 self.grams.append(temp_vec)
    #     return term
    def n_gram(self, term: str):
        """
        Collect the embeddings of every contiguous n-gram of the term.
        """
        index = 0
        for _ in range(len(term)):
            # All substrings starting at `index` (empty slices are skipped below)
            temp_grams = [term[index:t + 1] for t in range(len(term))]
            for temp_gram in temp_grams:
                temp_vec = self.pre_trained.get(temp_gram)
                if temp_vec is not None and temp_gram != '':
                    temp_vec = temp_vec.tolist()
                    if temp_vec not in self.grams:
                        self.grams.append(temp_vec)
            index += 1
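
    # Illustrative behaviour: for term = 'abc', n_gram probes the substrings
    # 'a', 'ab', 'abc', 'b', 'bc', 'c' against the pre-trained lookup and keeps
    # each distinct vector it finds.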
    def get_embedding(self, term: str) -> list:
        """
        Get a word's embedding: return the pre-trained vector directly,
        or fall back to averaging sub-word and n-gram embeddings.
        """
        vec = self.pre_trained.get(term)
        if vec is not None:
            return vec.tolist()
        else:
            # Out-of-vocabulary: build the vector from some standard-defined rules
            self.grams = []
            negative = 0
            term, negative = self.jieba_subword(term=term, negative=negative)  # jieba and sub-word embedding
            # term = self.jieba_subword(term=term)  # jieba and sub-word embedding
            self.n_gram(term=term)  # n-gram embedding
            # Average the collected embeddings; negate the result for negated terms
            if negative == 1:
                outvec = None if self.grams == [] else (np.mean(self.grams, axis=0) * -1).tolist()
            else:
                outvec = None if self.grams == [] else np.mean(self.grams, axis=0).tolist()
            # outvec = None if self.grams == [] else np.mean(self.grams, axis=0).tolist()
            return outvec
    def load_standard_vector(self) -> tuple:
        """
        Load the word vector for each term.
        Notice: we use the sub-words to get the word embedding
        instead of the original word!
        """
        output_vec = []
        output_term = []
        # Iterate over the standard and synonym terms
        for i in self.standard_synonym:
            temp_out = []
            # Remove punctuation from the synonym term
            i = remove_punctuation(term=i)
            # Get the sub-words of this term
            subs = self.get_subword(term=i, is_print=False)
            # Embed each sub-word and average the results
            for sub in subs:
                outvec = self.get_embedding(term=sub)
                if outvec is not None and outvec != []:
                    temp_out.append(outvec)
            if temp_out != []:
                temp_out = np.mean(temp_out, axis=0).tolist()
            output_vec.append(temp_out)
            output_term.append(i)
        return output_vec, output_term
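

# A minimal usage sketch (not part of the original file). The inputs below are
# hypothetical: a toy sub-word dictionary sorted longest-first and a toy
# pre-trained lookup mapping tokens to np.ndarray vectors. It assumes
# utility.remove_punctuation is available, as the imports above require.
if __name__ == '__main__':
    toy_pre_trained = {
        '机器': np.array([0.1, 0.2]),
        '学习': np.array([0.3, 0.4]),
    }
    embedder = Subword_Embedding(
        sub_list=['机器人', '机器', '学习'],  # longest entry first -> max_len = 3
        pre_trained=toy_pre_trained,
        standard_synonym=['机器学习', '学习！'],
    )
    vecs, terms = embedder.load_standard_vector()
    print(terms)  # cleaned terms, e.g. ['机器学习', '学习']
    print(vecs)   # one averaged vector (or [] when nothing is embeddable) per term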