tokenizeChinese.py
#encoding=utf-8
# Python 2.7 script.
# Tokenizing Chinese text takes two steps: (1) split each Chinese character
# (identified by its Unicode code point) into its own token; (2) tokenize the
# non-Chinese parts, following the mteval script.
# Shujian Huang [email protected]
import codecs
import re
import sys
def isChineseChar(uchar):
    """Return True if the character should be split out as a Chinese token."""
    if uchar >= u'\u3400' and uchar <= u'\u4db5':  # CJK Unified Ideographs Extension A, release 3.0
        return True
    elif uchar >= u'\u4e00' and uchar <= u'\u9fa5':  # CJK Unified Ideographs, release 1.1
        return True
    elif uchar >= u'\u9fa6' and uchar <= u'\u9fbb':  # CJK Unified Ideographs, release 4.1
        return True
    elif uchar >= u'\uf900' and uchar <= u'\ufa2d':  # CJK Compatibility Ideographs, release 1.1
        return True
    elif uchar >= u'\ufa30' and uchar <= u'\ufa6a':  # CJK Compatibility Ideographs, release 3.2
        return True
    elif uchar >= u'\ufa70' and uchar <= u'\ufad9':  # CJK Compatibility Ideographs, release 4.1
        return True
    # supplementary-plane code points need \U escapes (8 hex digits)
    elif uchar >= u'\U00020000' and uchar <= u'\U0002A6D6':  # CJK Unified Ideographs Extension B, release 3.1
        return True
    elif uchar >= u'\U0002F800' and uchar <= u'\U0002FA1D':  # CJK Compatibility Supplement, release 3.1
        return True
    elif uchar >= u'\uff00' and uchar <= u'\uffef':  # Halfwidth and Fullwidth Forms: fullwidth ASCII and punctuation, halfwidth Katakana and Hangul
        return True
    elif uchar >= u'\u2e80' and uchar <= u'\u2eff':  # CJK Radicals Supplement
        return True
    elif uchar >= u'\u3000' and uchar <= u'\u303f':  # CJK Symbols and Punctuation
        return True
    elif uchar >= u'\u31c0' and uchar <= u'\u31ef':  # CJK Strokes
        return True
    elif uchar >= u'\u2f00' and uchar <= u'\u2fdf':  # Kangxi Radicals
        return True
    elif uchar >= u'\u2ff0' and uchar <= u'\u2fff':  # Ideographic Description Characters
        return True
    elif uchar >= u'\u3100' and uchar <= u'\u312f':  # Bopomofo (phonetic symbols)
        return True
    elif uchar >= u'\u31a0' and uchar <= u'\u31bf':  # Bopomofo Extended (Taiwanese and Hakka expansion)
        return True
    elif uchar >= u'\ufe10' and uchar <= u'\ufe1f':  # Vertical Forms
        return True
    elif uchar >= u'\ufe30' and uchar <= u'\ufe4f':  # CJK Compatibility Forms
        return True
    elif uchar >= u'\u2600' and uchar <= u'\u26ff':  # Miscellaneous Symbols
        return True
    elif uchar >= u'\u2700' and uchar <= u'\u27bf':  # Dingbats
        return True
    elif uchar >= u'\u3200' and uchar <= u'\u32ff':  # Enclosed CJK Letters and Months
        return True
    elif uchar >= u'\u3300' and uchar <= u'\u33ff':  # CJK Compatibility
        return True
    else:
        return False
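# For illustration (these calls are not part of the original script): CJK
# characters and fullwidth punctuation count as "Chinese", plain ASCII does not.
#   isChineseChar(u'中')  -> True   (U+4E2D, CJK Unified Ideographs)
#   isChineseChar(u',')  -> True   (U+FF0C, Halfwidth and Fullwidth Forms)
#   isChineseChar(u'A')   -> False  (plain ASCII)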
def tokenizeString(sentence, lc=False):
    """
    :param sentence: input sentence (a unicode string, already decoded)
    :param lc: lowercase flag. default=False
    :return: tokenized sentence, without a trailing newline
    """
    sentence = sentence.strip()
    # step 1: put a space around every Chinese character
    sentence_in_chars = ""
    for c in sentence:
        if isChineseChar(c):
            sentence_in_chars += " "
            sentence_in_chars += c
            sentence_in_chars += " "
        else:
            sentence_in_chars += c
    sentence = sentence_in_chars
    if lc:
        sentence = sentence.lower()
    # step 2: tokenize the non-Chinese part, following the mteval script
    # tokenize punctuation
    sentence = re.sub(r'([\{-\~\[-\` -\&\(-\+\:-\@\/])', r' \1 ', sentence)
    # tokenize period and comma unless preceded by a digit
    sentence = re.sub(r'([^0-9])([\.,])', r'\1 \2 ', sentence)
    # tokenize period and comma unless followed by a digit
    sentence = re.sub(r'([\.,])([^0-9])', r' \1 \2', sentence)
    # tokenize dash when preceded by a digit
    sentence = re.sub(r'([0-9])(-)', r'\1 \2 ', sentence)
    # one space only between words
    sentence = re.sub(r'\s+', r' ', sentence)
    # no leading space
    sentence = re.sub(r'^\s+', r'', sentence)
    # no trailing space
    sentence = re.sub(r'\s+$', r'', sentence)
    return sentence
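# A quick illustration (not from the original script): Chinese characters are
# split apart, while ASCII words, numbers, and digit-internal periods stay
# intact.
#   tokenizeString(u"我们在2024年测试ABC,def.")
#   -> u"我 们 在 2024 年 测 试 ABC , def ."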
def tokenizeSentence(sentences):
    """
    :param sentences: iterable of input lines
    :return: list of tokenized lines; the markup of SGML <seg> lines is preserved
    """
    output = []
    # SGML segment lines look like:
    # <seg id="1">-28 "老欧洲" Chef Found , 就是背井离乡来到旧金山追求财富的巴西人 Mall</seg>
    for sentence in sentences:
        if sentence.startswith("<seg"):
            # tokenize only the text between the opening and closing tags
            start = sentence.find(">") + 1
            end = sentence.rfind("<")
            new_sentence = sentence[:start] + tokenizeString(sentence[start:end]) + sentence[end:]
        else:
            new_sentence = tokenizeString(sentence)
        output.append(new_sentence)
    return output
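# Illustrative call (not from the original script): the <seg> markup survives,
# only the enclosed text is tokenized.
#   tokenizeSentence([u'<seg id="1">我们测试</seg>'])
#   -> [u'<seg id="1">我 们 测 试</seg>']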
if __name__ == '__main__':
    # read the UTF-8 file named on the command line, tokenize each line,
    # and write the result to stdout (one tokenized line per input line)
    file_r = codecs.open(sys.argv[1], 'r', encoding='utf-8')
    for line in tokenizeSentence(file_r.readlines()):
        sys.stdout.write(line.encode('utf-8') + '\n')
    file_r.close()
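# Example invocation (hypothetical file names):
#   python tokenizeChinese.py input.zh > output.tok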