-
Notifications
You must be signed in to change notification settings - Fork 1
/
loadData.py
99 lines (74 loc) · 2.95 KB
/
loadData.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
import csv
import os
from subprocess import call
import argparse
# Command-line interface. Both arguments are positional and required; they are
# parsed at module level, so the values are available to the code below.
parser = argparse.ArgumentParser(
    description='download and process google files',
)
# First character of every shard name that will be processed.
parser.add_argument('letter', help='first letter of type')
# Characters that are appended, one at a time, to `letter` to form the shard
# names; the literal value "all" is expanded further down in this script.
parser.add_argument(
    'second_letters',
    help='letters to combine first letter with',
)
args = parser.parse_args()
letter, second_letters = args.letter, args.second_letters
def addPair(result, word1, word2, score):
    """Add *score* to the running total for the ordered pair (word1, word2).

    Each word is cut at its first underscore before being stored (presumably
    to strip a part-of-speech suffix such as ``_NOUN`` -- confirm against the
    input data). If fewer than three characters precede that underscore in
    either word, the pair is discarded and *result* is left untouched.

    Args:
        result: Nested mapping ``{word1: {word2: total_score}}``; updated in
            place.
        word1: First word of the pair (outer key).
        word2: Second word of the pair (inner key).
        score: Amount to add to the pair's total.

    Returns:
        None. The only effect is the in-place update of *result*.
    """
    stripped = []
    for word in (word1, word2):
        cut = word.find("_")
        if 0 <= cut < 3:
            # Underscore within the first three characters: the part in front
            # of it is too short to keep, so drop the whole pair.
            return
        if cut != -1:
            word = word[:cut]
        stripped.append(word)

    # Only touch `result` once both words have been accepted, so a rejected
    # pair never leaves an empty inner dict behind.
    first, second = stripped
    totals = result.setdefault(first, {})
    if second in totals:
        totals[second] += score
    else:
        totals[second] = score
# Work inside ./pydata: parse() below uses bare relative file names, so the
# downloaded shards and the result CSVs all land in this directory. This runs
# at module level, and os.chdir raises if the directory does not exist.
os.chdir("./pydata")
def _remove_if_exists(path):
    """Delete the regular file at *path*; do nothing when there is none."""
    if os.path.isfile(path):
        os.remove(path)


def _is_valid_token(word):
    """Return False for tokens containing '$', '.', '/' or the text 'NUM'."""
    return not any(marker in word for marker in ('$', '.', '/', 'NUM'))


def parse(name):
    """Download one Google Books 3-gram shard and write its word-pair totals.

    Steps, all relative to the current working directory:

    1. Delete leftovers of a previous run for this shard (archive,
       uncompressed file, result CSV).
    2. Fetch ``googlebooks-eng-all-3gram-20120701-<name>.gz`` with ``wget``
       and unpack it with ``gunzip``.
    3. Read the tab-separated file. Column 0 is split on whitespace into the
       three tokens of the n-gram and column 2 is read as the integer match
       count. For each of the pairs (1st, 2nd), (2nd, 3rd) and (1st, 3rd) the
       count is passed to ``addPair`` when both tokens pass
       ``_is_valid_token`` and are longer than two characters.
    4. Write every accumulated ``word1,word2,total`` row to
       ``googlebooks-eng-all-3gram-20120701-<name>_result.csv``.
    5. Delete the uncompressed shard.

    All pair totals for the shard are held in memory until step 4.

    Args:
        name: Shard suffix appended to the file-name stem.

    Returns:
        None.

    Raises:
        IOError/OSError: if the uncompressed shard cannot be opened (for
            example because the download or the unpacking failed) or the
            result file cannot be written.
        ValueError: if column 2 of a row is not an integer.
    """
    print("PARSING", name)

    min_length = 2                      # tokens must be strictly longer
    max_rows = 2 * 1000 * 1000 * 1000   # upper bound on rows read per shard

    shard = "googlebooks-eng-all-3gram-20120701-" + name
    archive = shard + ".gz"
    result_path = shard + "_result.csv"

    # Clear leftovers so wget/gunzip start from a clean slate. The result file
    # removed here is the same one written below (the old cleanup targeted a
    # differently spelled "_results.csv" that was never produced).
    for stale in (archive, shard, result_path):
        _remove_if_exists(stale)

    # Exit statuses are not checked; a failed download or unpack surfaces as
    # an error from open() just below.
    call(["wget", "http://storage.googleapis.com/books/ngrams/books/" + archive])
    call(["gunzip", archive])

    result = {}
    with open(shard) as tsv:
        for index, row in enumerate(csv.reader(tsv, dialect="excel-tab")):
            if index >= max_rows:
                break
            # Skip rows that lack the match-count column or do not contain
            # three tokens instead of failing with IndexError.
            if len(row) < 3:
                continue
            tokens = row[0].split()
            if len(tokens) < 3:
                continue
            first, second, third = tokens[:3]
            match_count = int(row[2])

            # Length is measured on the raw token, before addPair cuts it at
            # the first underscore.
            ok_first, ok_second, ok_third = [
                _is_valid_token(token) and len(token) > min_length
                for token in (first, second, third)
            ]
            if ok_first and ok_second:
                addPair(result, first, second, match_count)
            if ok_second and ok_third:
                addPair(result, second, third, match_count)
            if ok_first and ok_third:
                addPair(result, first, third, match_count)

    with open(result_path, 'w') as fp:
        writer = csv.writer(fp, delimiter=',')
        for key, totals in result.items():
            writer.writerows(
                [key, other, total] for other, total in totals.items()
            )

    # The uncompressed shard is no longer needed once the CSV is written.
    _remove_if_exists(shard)
# "all" is shorthand for every second character: an underscore followed by the
# lowercase letters a-z. Any other value is used exactly as given.
second_letters = (
    "_abcdefghijklmnopqrstuvwxyz" if second_letters == "all" else second_letters
)

# Process one shard per second character; shard names are the first letter
# followed by that character.
for c in second_letters:
    parse(letter + c)