-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmultiforest.py
129 lines (118 loc) · 4.11 KB
/
multiforest.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
import csv
import logging
import pickle
from collections import Counter

from numpy import genfromtxt, savetxt, array
from sklearn import cross_validation
from sklearn.ensemble import RandomForestClassifier
# Route every record at DEBUG and above to RFC.log, stamped with the time,
# the logger name and the level; the script logs through the 'RFC' logger.
logging.basicConfig(
    format='%(asctime)s %(name)s %(levelname)s %(message)s',
    level=logging.DEBUG,
    filename='RFC.log',
)
logger = logging.getLogger('RFC')
def _read_csv_rows(path):
    """Return all data rows of the CSV file at *path*, dropping the header.

    Raises:
        ValueError: if the file has no rows after the header line.
    """
    with open(path, 'r') as handle:
        reader = csv.reader(handle)
        next(reader, None)  # the first line is a header, not data
        rows = list(reader)
    if not rows:
        raise ValueError(path + ' contains no data rows')
    return rows


def _tally_votes(per_model_predictions, index):
    """Count how many models predicted each label for sample *index*."""
    return Counter(preds[index] for preds in per_model_predictions)


def main():
    """Train one random forest per CV fold and write majority-vote predictions.

    Steps, in order:
      1. Read ``train.csv``; column 0 of each row is the label, the
         remaining columns are the features.
      2. Fit a separate 1000-tree forest on the training part of each of
         the 5 folds, log its score on the held-out part, and pickle it to
         ``forest<N>.pkl``.
      3. Let the forests vote on every training row and log/print how many
         votes agree with the known label.
      4. Let the forests vote on every row of ``test.csv`` and write the
         winning label, one per line, to ``multisubmit1000.txt``.

    Raises:
        ValueError: if ``train.csv`` or ``test.csv`` has no data rows.
    """
    logger.debug("Startin")
    rows = _read_csv_rows('train.csv')
    target = array([row[0] for row in rows])
    train = array([row[1:] for row in rows])

    logger.debug("About to create RFC")
    # NOTE(review): this is the legacy sklearn.cross_validation KFold API,
    # kept because that is what the file imports.
    cv = cross_validation.KFold(len(train), n_folds=5)
    forests = []
    logger.debug("About to fit RFC")
    for count, (traincv, testcv) in enumerate(cv):
        logger.debug("fitting")
        # fit() hands back the estimator it was called on, so re-fitting one
        # shared instance would leave `forests` holding the same model five
        # times. Build a fresh classifier for every fold instead.
        forest = RandomForestClassifier(n_estimators=1000, n_jobs=2)
        forest.fit(train[traincv], target[traincv])
        forests.append(forest)
        logger.debug("fit")

        # Score the fold on the rows it was not trained on.
        held_out = forest.predict(train[testcv])
        good = sum(
            1 for guess, truth in zip(held_out, target[testcv])
            if guess == truth
        )
        logger.debug('round done, good: %d, bad: %d',
                     good, len(testcv) - good)

        with open('forest' + str(count) + '.pkl', 'wb') as dump_file:
            pickle.dump(forest, dump_file)

    logger.debug('testing accuracy')
    testpredictions = [forest.predict(train) for forest in forests]
    good = 0
    bad = 0
    for i, expected in enumerate(target):
        result = _tally_votes(testpredictions, i).most_common(1)[0][0]
        if result == expected:
            good += 1
        else:
            bad += 1
            logger.debug('Bad guess, target: %s, guess: %s', expected, result)
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print(good)
    print(bad)
    logger.debug('results, good: %d, bad: %d', good, bad)

    logger.debug('doing real results now')
    logger.debug('opening test file')
    datatest = _read_csv_rows('test.csv')
    logger.debug('starting real predicting')
    fullresults = []
    for forest in forests:
        logger.debug('doing a prediction')
        fullresults.append(forest.predict(datatest))

    logger.debug('compiling results')
    results = []
    for i in range(len(datatest)):
        counter = _tally_votes(fullresults, i)
        print(counter)
        result = counter.most_common(1)[0][0]
        print(result)
        results.append(result)

    with open('multisubmit1000.txt', 'w') as outfile:
        outfile.writelines(str(res) + '\n' for res in results)
# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()