"""
Created on July 27 2017
Wikipedia database dump tokenizer
"""
import os, logging, sys, datetime, time
import glob, io, codecs
import nltk
import pickle
from nltk.probability import FreqDist
start_time = time.time()
timestamp = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO
)
# Specify paths
dump_path = '/media/khgkim/data/khgkim/compling/processed/'
token_path = '/media/khgkim/data/khgkim/compling/'
filename = 'tokenizer_tokens.txt'
filename2 = 'tokenizer_tokens2.txt'
filename3 = 'freqDist.pickle'
os.chdir(dump_path)
# Counter for line-length control
lineCount = 0
if not os.path.isfile(token_path + filename):
    print(timestamp)
    tokensFile = codecs.open(token_path + filename, "a+", "utf-8")
    fDistTotal = FreqDist()
    # For each directory of processed dump files
    for directory in glob.glob("*"):
        print("directory: " + directory)
        # Get all wiki articles in the directory
        for files in os.listdir(directory):
            with io.open(directory + '/' + files, encoding='utf-8') as s:
                string = s.read()                    # read each article into a string
                tokens = nltk.word_tokenize(string)  # tokenize the article
                for word in tokens:
                    tokensFile.write(word + " ")     # write word to tokens file
                    lineCount += 1
                    if lineCount >= 60:              # keep lines under 60 words
                        tokensFile.write('\n')
                        lineCount = 0
                fDistTotal = fDistTotal + FreqDist(tokens)  # add the article's freq dist to the total
                del tokens
                del string
    tokensFile.close()
    print("Finished tokenizer_tokens.txt in %s minutes ---" % ((time.time() - start_time) / 60))
    # Replace rare tokens with UNK based on frequency and save to tokenizer_tokens2.txt
    f = codecs.open(token_path + filename2, "a+", "utf-8")
    with codecs.open(token_path + filename, "r", "utf-8") as t:  # open the tokens file for reading, not appending
        for line in t:
            for word in nltk.word_tokenize(line):
                if fDistTotal[word] <= 10:  # if frequency <= 10, replace with UNK
                    f.write("UNK" + " ")
                else:
                    f.write(word + " ")
            f.write('\n')  # keep the same line breaks as tokenizer_tokens.txt
    f.close()
    f = open(token_path + filename3, "wb")
    pickle.dump(fDistTotal, f)  # save total freq dist to file
    f.close()
else:
    print("Output message: File tokenizer_tokens.txt already exists!")
    sys.exit()
# Print execution time
print("--- Execution time: %s minutes ---" % ((time.time() - start_time) / 60))