-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtest.py
92 lines (85 loc) · 3.11 KB
/
test.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import pickle
import csv
import re
from train import removePunctuation
from train import removeStopWords
from train import pos_prior
from train import neg_prior
from train import sorted_chisquared_dic
import math
import time
# Script-wide state used by the classification pass below.
# `flag` is False until the first CSV row has been consumed.
flag = False
# Running count of classified rows.
testing_example_count = 0
# `token_list` collects every token seen; `output_list` collects the
# [Id, Category] rows that are later written to result.csv.
token_list, output_list = [], []
# Per-class tallies of predicted labels.
total_test_pos_reviews = total_test_neg_reviews = 0
# Added to the vocabulary size in the smoothing denominators.
# NOTE(review): neither value is reassigned anywhere in this script, so both
# stay 0 -- confirm whether per-class token totals were meant to be loaded.
totalfreqpos = totalfreqneg = 0
# Import the trained model files.
# Each file is opened in its own `with` block so the handle is closed as soon
# as the pickled object has been read, instead of being left open for the
# lifetime of the process.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load model files produced by a trusted training run.
with open('pos_dictionary.txt', 'r') as f:
    pos_feature_dic = pickle.load(f)
with open('neg_dictionary.txt', 'r') as f:
    neg_feature_dic = pickle.load(f)
with open('vocabulary.txt', 'r') as f:
    freq_dic = pickle.load(f)
print 'Please enter the Path of the testing file'
testing_file_path=raw_input()
#Classification
start=time.time()
with open ('test.csv','rb') as testing_file:
testing_data=csv.reader(testing_file)
for row in testing_data:
token_pos_prob_list=[]
token_neg_prob_list=[]
outputList=[]
if flag==False:
flag=True
i=0
else:
testing_example_count+=1
word_list_split=re.split('\s+',row[0].lower())
word_list_minus_punct=removePunctuation(word_list_split)
word_list_minus_stop=removeStopWords(word_list_minus_punct)
for word in word_list_minus_stop:
token_list.append(word)
if word in pos_feature_dic:
pos_prob_token=(float)(pos_feature_dic[word][0]+1)/(len(freq_dic)+totalfreqpos)
token_pos_prob_list.append(math.log(pos_prob_token))
else:
token_pos_prob_list.append(math.log((float)(1)/(len(freq_dic)+totalfreqpos)))
if word in neg_feature_dic:
neg_prob_token=(float)(neg_feature_dic[word][0]+1)/(len(freq_dic)+totalfreqneg)
token_neg_prob_list.append(math.log(neg_prob_token))
else:
token_neg_prob_list.append(math.log((float)(1)/(len(freq_dic)+totalfreqneg)))
pos_prob_total=0
pos_prob_total = reduce(lambda a,d: a + d,token_pos_prob_list)
pos_prob_total = pos_prob_total + math.log(pos_prior)
neg_prob_total = math.log(neg_prior) + (reduce(lambda a, d:a + d,token_neg_prob_list))
if pos_prob_total>neg_prob_total:
review=1
total_test_pos_reviews+=1
else:
review=0
total_test_neg_reviews+=1
i+=1
output_list.append([i, review])
# Write the header plus one [Id, Category] row per classified review.
# The `with` block closes the file, so all rows are flushed to disk before
# the script reports that the results have been stored.
with open("result.csv", "wb") as result_file:
    c = csv.writer(result_file)
    c.writerow(['Id', 'Category'])
    c.writerows(output_list)
# print 'SORTED CHI-SQUARED VALUE DICTIONARY IS',sorted_chisquared_dic
#
#
# print 'POSITIVE DICTIONARY IS'
# for key,value in pos_feature_dic.iteritems():
# print key,value
#
# print 'NEGATIVE DICTIONARY IS'
# for key,value in neg_feature_dic.iteritems():
# print key,value
print 'Positive: ',total_test_pos_reviews,'Negative: ',total_test_neg_reviews
print 'Testing Completed...The classified results have been stored in the result.csv file'
print 'The time taken to train was:',time.time()-start