from utils import *  # provides retrieve_sentences, retrieve_wordcount_max, word_bias, sentence_filter
import json
import sys
import unicodedata
import firebase  # assumed to be a local module that initializes the Firebase app
from firebase_admin import db
from unidecode import unidecode
from pycorenlp import StanfordCoreNLP

# NOTE: Python 2 script -- the isinstance(..., unicode) checks below rely on the Py2 `unicode` type.
'''
training on training.json; results placed in dict5.json
'''
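# Usage (inferred from the mode handling below; run with Python 2):
#   python main.py       -> pull rated reviews from Firebase into training.json
#   python main.py 1     -> count words and write per-word rating biases
#   python main.py 2     -> lemmatize all reviews into a text8-style GloVe corpus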
training_data = 'Pre_Post_Processing/Preprocessing/Training_Data/training.json'
vocab_count = 'Pre_Post_Processing/vocab2.txt'
word_biases = 'Pre_Post_Processing/Preprocessing/Training_Results/dict5.json'
testing_data = 'Pre_Post_Processing/Preprocessing/Test_Data/data.txt'
testing_results = 'Pre_Post_Processing/Preprocessing/Results/dataout.txt'
glove_training = 'Pre_Post_Processing/text8.txt'
keywords = 'Pre_Post_Processing/keywords.json'
testing_set = {}
testing_set['collection'] = []
mode = sys.argv[1] if len(sys.argv) > 1 else ''  # originally sys.argv[1]; guarded so running with no argument selects the default mode

if mode == '':
    print("\n###DATABASE:###")
    print("Getting ISBNs of training books")
    ISBN = list(db.reference().child('ISBN_numbers').order_by_child('rating').start_at(0).end_at(100).get().keys())
    print("Getting training set and parsing into file")
    count = 0
    for i in range(len(ISBN)):
        sys.stdout.write("reading: %d\r" % i)
        sys.stdout.flush()
        for j in range(100):
            child = 'review_' + str(j)
            string1 = db.reference().child('ISBN_numbers').child(ISBN[i]).child(child).get()
            if isinstance(string1, unicode):
                string1 = unidecode(string1)
            if string1 is not None:
                testing_set['collection'].append({})
                testing_set['collection'][count]['rating'] = int(float(db.reference().child('ISBN_numbers').child(ISBN[i]).child('rating').get()))
                testing_set['collection'][count]['review'] = string1
                count += 1
    print("Number of reviews used: " + str(count))
    print("\nDumping data into datafiles")
    with open(training_data, 'w') as data_file:
        json.dump(testing_set, data_file)
    print("Number of books used: " + str(len(ISBN)))
if mode == '1':
    print("\n###TRAINING SEMANTIC:###")
    print("Retrieving sentences from:\t" + training_data)
    sentence_list = retrieve_sentences(training_data)
    print("Writing wordcounts into:\t" + vocab_count)
    [word_count, word_num] = retrieve_wordcount_max(sentence_list, vocab_count)  # also writes the counts to a text file for GloVe purposes
    print("Number of reviews used: " + str(word_num))
    print("Writing word biases into:\t" + word_biases)
    word_bias(word_count, word_biases, 5)
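# `word_bias` comes from utils; the sketch below only illustrates what such a
# helper might compute, assuming word_count maps each word to a list of
# (count, rating) pairs and the final argument is a neutral rating. All names
# and data shapes here are assumptions, not the actual utils implementation.
def _word_bias_sketch(word_count, out_path, neutral_rating):
    """Write each word's count-weighted average rating offset to a JSON file."""
    biases = {}
    for word, pairs in word_count.items():
        total = sum(c for c, _ in pairs)
        weighted = sum(c * r for c, r in pairs)
        biases[word] = weighted / float(total) - neutral_rating
    with open(out_path, 'w') as f:
        json.dump(biases, f)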
if mode == '2':
    print("\n###TRAINING GLOVE:###")
    print("Getting ISBNs of all books")
    text8string = ''
    separator = 15  # number of placeholder tokens inserted between reviews
    total_reviews = 0
    results = db.reference().child('ISBN_numbers').get()
    ISBN = list(results.keys())
    print("filling all book reviews")
    wordcount = 0
    current_book = 0
    nlp = StanfordCoreNLP('http://localhost:9000')
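    # This assumes a Stanford CoreNLP server is already listening on port 9000,
    # e.g. started from the CoreNLP distribution with:
    #   java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000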
    while wordcount < 17000000 and current_book < len(ISBN):
        for x in range(40):
            child = 'review_' + str(x)
            str1 = db.reference().child('ISBN_numbers').child(ISBN[current_book]).child(child).get()
            if str1:
                ascii_text = unicodedata.normalize('NFKD', str1).encode('ascii', 'ignore')  # convert unicode to str
                output = nlp.annotate(ascii_text, properties={'annotators': 'tokenize,ssplit,pos,lemma', 'outputFormat': 'json'})  # invoke CoreNLP to convert each word to its lemma
                str1 = ''  # rebuild the review as a sentence of lemmas
                if output['sentences']:
                    for sentence in output['sentences']:  # for each word in each sentence
                        for word in sentence['tokens']:
                            str1 += word['lemma'] + ' '  # place the lemma in the sentence, followed by a space
                    str1 = str1[:-1]  # drop the trailing space
                if isinstance(str1, unicode):
                    str1 = unidecode(str1)
                total_reviews += 1
                if isinstance(str1, str):
                    str1 = str1.lower()
                    str1 = ''.join([c if c.isalnum() else ' ' for c in str1])  # replace punctuation with spaces
                    # spell out digits; the original replace() calls discarded their return values
                    for digit, name in [('1', 'one '), ('2', 'two '), ('3', 'three '),
                                        ('4', 'four '), ('5', 'five '), ('6', 'six '),
                                        ('7', 'seven '), ('8', 'eight '), ('9', 'nine '),
                                        ('0', 'zero ')]:
                        str1 = str1.replace(digit, name)
                    # separate reviews with placeholder tokens so co-occurrence windows do not span reviews
                    text8string += str1 + ' ' + separator * 'placeholder_string '
                    text8string = text8string.replace('  ', ' ')  # collapse double spaces (assumed intent; the result was previously discarded)
                wordcount += len(str1.split(' '))
        current_book += 1
        sys.stdout.write("Appending book number: %d  new wordcount: %d\r" % (current_book, wordcount))
        sys.stdout.flush()
    with open(glove_training, 'w') as glove_text:
        print("Writing all " + str(total_reviews) + " reviews into:\t" + glove_training)
        glove_text.write(text8string)
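    # From here the corpus can be fed to the GloVe reference implementation
    # (https://github.com/stanfordnlp/GloVe): its demo pipeline runs the
    # vocab_count, cooccur, shuffle, and glove binaries over a text8-style
    # file to produce the vectors.txt consumed in the post-processing below.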
print("\n###FILTERING:###")
'''
testing on data.txt using dict5.json; results placed in dataout.txt
'''
#print("Writing filtered document into:\t"+testing_results)
#sentence_filter(testing_data,testing_results,word_biases)
'''
Post processing: employing the GloVe algorithm
'''
#CC_mat, decoding_list = fill_co_occurrence_matrix('Preprocessing/Results/dataout.json','Postprocessing/cooccur.json')
#prob_mat = probability_word1_given_word2(CC_mat)
#synonyms_dict = create_cluster(['plot','character','theme','dialogue','style'],'vectors.txt',50)
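
# A minimal sketch of the commented-out probability step above, assuming the
# co-occurrence matrix is a square count matrix with rows indexing word1 and
# columns indexing word2; names and data shapes here are assumptions, not the
# actual utils implementation.
def _probability_word1_given_word2_sketch(cc_mat):
    import numpy as np
    counts = np.asarray(cc_mat, dtype=float)
    # P(word1 | word2) = count(word1, word2) / count(word2):
    # divide each column by its total so every column sums to 1.
    col_sums = counts.sum(axis=0, keepdims=True)
    col_sums[col_sums == 0] = 1.0  # guard against empty columns
    return counts / col_sums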