-
Notifications
You must be signed in to change notification settings - Fork 1
/
loadDataStreamed.py
92 lines (66 loc) · 2.46 KB
/
loadDataStreamed.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import csv
import os
from subprocess import call
from google_ngram_downloader import readline_google_store
import argparse
# Command line: two positional arguments, <letter> and <second_letters>.
parser = argparse.ArgumentParser(
    description='download and process google files',
)
parser.add_argument(
    'letter',
    help='first letter of type',
)
parser.add_argument(
    'second_letters',
    help='letters to combine first letter with',
)
args = parser.parse_args()

# Keep both values as module-level names; the driver loop at the bottom of
# the file reads them directly.
letter, second_letters = args.letter, args.second_letters
def addPair(result, word1, word2, score):
    """Add ``score`` to the running total stored for the pair (word1, word2).

    Each word is cut at its first underscore (presumably the part-of-speech
    suffix used in the Google n-gram data, e.g. ``_NOUN`` - confirm).  If
    either word has an underscore within its first three characters the pair
    is dropped and ``result`` is left untouched.

    Args:
        result: Nested mapping ``{first: {second: total}}``, updated in place.
            Keys are the UTF-8 encoded words.
        word1: First word of the pair.
        word2: Second word of the pair.
        score: Amount to add to the pair's total.

    Returns:
        None.
    """
    stems = []
    for token in (word1, word2):
        cut = token.find("_")
        if 0 <= cut < 3:
            # Underscore at index 0, 1 or 2: the pair is not recorded.
            return
        stems.append(token if cut < 0 else token[:cut])

    # Encode only once both words are known to be kept.
    outer, inner = [stem.encode('utf-8') for stem in stems]

    row = result.setdefault(outer, {})
    if inner in row:
        row[inner] += score
    else:
        row[inner] = score
# Switch the working directory to ./pydata (relative to wherever the script
# is launched).  parse() opens its CSV output by bare file name, so the
# results are written into this directory.  os.chdir raises if the directory
# does not exist.
os.chdir("./pydata")
def parse(name):
    """Aggregate word-pair counts for one Google Books 3-gram index into a CSV.

    Streams the English 3-gram records for index ``name`` through
    ``readline_google_store`` and, for every record, offers the three token
    pairs (1st-2nd, 2nd-3rd, 1st-3rd) to ``addPair`` together with the
    record's ``match_count``.  A pair is only offered when both raw tokens
    pass ``valid`` and are longer than ``MIN_LENGTH`` characters; ``addPair``
    may still drop it.  The totals are then written to
    ``googlebooks-eng-all-3gram-20120701-<name>_result.csv`` in the current
    working directory, one ``first,second,total`` row per pair.

    Args:
        name: Index of the 3-gram file to process (the script passes
            two-character strings built as ``letter + c``).

    Returns:
        None.
    """
    print("PARSING", name)

    base = "googlebooks-eng-all-3gram-20120701-" + name
    out_path = base + "_result.csv"

    # Exclusive bound: a token needs at least MIN_LENGTH + 1 characters.  The
    # length is measured on the raw token, before addPair cuts it at "_".
    MIN_LENGTH = 2
    # Same order in which the pairs have always been added; the order decides
    # dict insertion order and therefore the row order of the CSV.
    pair_indices = ((0, 1), (1, 2), (0, 2))

    def valid(word):
        """Return False for tokens containing '$', '.', '/' or 'NUM'."""
        return not any(mark in word for mark in ('$', '.', '/', 'NUM'))

    # Remove stale output up front.  Both spellings are cleared:
    # "_result.csv" is what this function writes, "_results.csv" is the name
    # the previous `rm` clean-up targeted.  Failures (typically "file not
    # found") are ignored on purpose, as the exit status of `rm` was.
    for stale in (out_path, base + "_results.csv"):
        try:
            os.remove(stale)
        except OSError:
            pass

    result = {}
    # Each item yielded by readline_google_store is a 3-tuple; only the record
    # iterator is needed here (the first two items are the file name and URL
    # according to the library README).
    _fname, _url, records = next(
        readline_google_store(ngram_len=3, indices=[name], lang='eng'))

    for record in records:
        tokens = record.ngram.split()
        if len(tokens) < 3:
            # A malformed record would otherwise raise IndexError and throw
            # away everything aggregated so far.
            continue
        match_count = int(record.match_count)
        usable = [valid(tok) and len(tok) > MIN_LENGTH for tok in tokens[:3]]
        for i, j in pair_indices:
            if usable[i] and usable[j]:
                addPair(result, tokens[i], tokens[j], match_count)

    # NOTE(review): addPair stores UTF-8 *encoded* keys.  On Python 3 the csv
    # module would render those as b'...' literals (and recommends opening
    # the file with newline='') - confirm which interpreter this targets.
    with open(out_path, 'w') as fp:
        writer = csv.writer(fp, delimiter=',')
        for key in result:
            partners = result[key]
            writer.writerows(
                [key, other, partners[other]] for other in partners)
# "all" is shorthand for the underscore plus every lowercase ASCII letter;
# any other value is used as given, one character per parse() call.
second_letters = (
    "_abcdefghijklmnopqrstuvwxyz" if second_letters == "all" else second_letters
)
for c in second_letters:
    index_name = letter + c
    parse(index_name)