from Bayes import Bayes
import math
import operator
import mpmath


class NaiveBayes(Bayes):
    def __init__(self):
        self.model = []

    '''
    Trains the model, filling it as follows:
    self.model = array of attributes (e.g. race, position, etc.) where each index points to a dictionary
        attrDict (categorical) {key = attribute category (e.g. white, black, hispanic), value = probability dictionary}
            probabilityDict = {key = classification (e.g. gets loan, deferred, doesn't get loan), value = P(attrCategory|classification)}
        attrDict (numerical) {key = "mean" or "std", value = meanDict or stdDict}
            meanDict = {key = classification, value = conditional mean given this classification}
            stdDict = {key = classification, value = conditional std given this classification}
    dataSet (DataSet) - the dataset
    model (Bayesian model object) - the model to train
    '''
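    # Illustrative only: a hypothetical trained model for one categorical
    # attribute ("race"), one numerical attribute ("age"), and binary classes
    # 0/1. The actual keys and values depend entirely on the training data;
    # this sketch just makes the nesting described above concrete.
    #
    # self.model = [
    #     {"white": {0: 0.55, 1: 0.45}, "black": {0: 0.30, 1: 0.25}},  # categorical attrDict
    #     {"mean": {0: 34.2, 1: 41.7}, "std": {0: 6.1, 1: 8.3}},       # numerical attrDict
    #     {0: 0.62, 1: 0.38},                                          # P(C) dictionary (always last)
    # ]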
    def train(self, dataSet, model):
        dataFrame = dataSet.trainDataFrame
        groundTruth = dataSet.trueLabels
        classificationList = dataFrame[groundTruth].unique()
        # to ensure that we don't train twice
        if bool(model):
            print("Error: Model not empty.")
            return
        # for each of the attributes in the dataset (a1...an)
        for attribute in dataSet.trainHeaders:
            # create the outermost dictionary of the model (key = attribute category, value = another dictionary)
            attrDict = {}
            # numerical attribute
            if attribute in dataSet.getNumericalColumns("train"):
                # for each numerical attribute, create dicts to hold the mean and standard deviation
                meanDict = {}
                stdDict = {}
                # for each of the possible classifications (e.g. lieutenant, captain, etc.)
                for classification in classificationList:
                    # skip the ground-truth column itself
                    if groundTruth == attribute:
                        continue
                    # calculate the conditional mean and standard deviation given each classification
                    mean = self.calculateConditionalMean(dataFrame, attribute, groundTruth, classification)
                    std = self.calculateConditionalStandardDeviation(dataFrame, attribute, groundTruth, classification)
                    meanDict[classification] = mean
                    stdDict[classification] = std
                # store both in the attribute's dictionary
                attrDict["mean"] = meanDict
                attrDict["std"] = stdDict
            # categorical attribute
            else:
                # array of the unique values for the given attribute
                attrCategories = self.getAttributeCategories(dataFrame, attribute)
                attrCategories = attrCategories.tolist()
                rares = self.getRares(dataFrame, attribute)
                if len(rares) > 0:
                    attrCategories.append("rare")
                for attrCategory in attrCategories:
                    # rare categories are pooled under the single "rare" bucket
                    if attrCategory in rares:
                        continue
                    # key = classification, value = P(attrCategory|classification)
                    probabilityDict = {}
                    # for each of the possible classifications (e.g. 1 or 0)
                    for classification in classificationList:
                        # skip the ground-truth column itself
                        if groundTruth == attribute:
                            continue
                        if attrCategory == "rare":
                            crossAttributeProbability = self.getRareProb(dataFrame, groundTruth, classification, attribute, rares)
                        else:
                            # the value part of the dictionary: P(a|C)
                            crossAttributeProbability = self.calculateCrossAttributeProbability(dataFrame, groundTruth, classification, attribute, attrCategory)
                        probabilityDict[classification] = crossAttributeProbability
                    # outermost dictionary
                    attrDict[attrCategory] = probabilityDict
            model.append(attrDict)
        # construct a dictionary that holds the probability of each classification C_x (e.g. lieutenant, captain)
        classificationProbabilitiesDict = {}
        # for each of the possible classifications
        for Cx in classificationList:
            # probability of the particular classification:
            # P = (# people with this particular classification) / (total # of people)
            probOfCx = self.attributeCategoryProbability(dataFrame, dataSet.trueLabels, Cx)
            classificationProbabilitiesDict[Cx] = probOfCx
        # append it to the end of the outermost model array
        model.append(classificationProbabilitiesDict)
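    # The conditional-statistics helpers used above are defined in the Bayes
    # base class, which is not shown in this file. As an assumed sketch only,
    # the conditional mean and std of an attribute given a classification
    # could plausibly be computed with pandas like so:
    #
    #     subset = dataFrame[dataFrame[groundTruth] == classification]
    #     mean = subset[attribute].mean()
    #     std = subset[attribute].std()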
    '''
    Pretty-prints the Bayesian model
    dataSet (DataSet) - the dataset
    model (Bayesian model object) - the model to print
    '''
    def printModel(self, dataSet, model):
        # loop over the outermost model array up to the second-to-last element;
        # the last element holds the dictionary of classification probabilities
        for i in range(0, len(model) - 1):
            print("Attribute: ", dataSet.trainHeaders[i])
            for attrCategory in model[i].keys():
                if attrCategory == 'mean' or attrCategory == 'std':  # numerical type
                    if attrCategory == 'mean':
                        print("\t Numerical Data: Conditional mean")
                    elif attrCategory == 'std':
                        print("\t Numerical Data: Conditional standard deviation")
                    for classification in model[i][attrCategory].keys():
                        print("\t \t Classification and mean/std: ", classification, ", ", model[i][attrCategory][classification])
                else:  # categorical type
                    print("\t Attribute Category: ", attrCategory)
                    for classification in model[i][attrCategory].keys():
                        print("\t \t Classification & Probability: ", classification, ", ", model[i][attrCategory][classification])
        print("Classification Probabilities: ")
        classificationProbs = model[-1]
        for Cx in classificationProbs.keys():
            print("\t Classification: ", Cx)
            print("\t Probability: ", classificationProbs[Cx])
    '''
    Given the attributes of an entry in a dataset and our trained model, classify calculates P(classification|attributes)
    for every possible classification, then assigns each entry the most probable classification.
    Appends a new column of classifications to the dataset under the header "Bayes Classification"
    dataSet (DataSet) - the dataset
    testOrTrain (str) - a string that denotes whether we are classifying the train or test set
    Returns: the classified DataFrame
    Note: the testOrTrain parameter exists only because of inheritance; this function will only ever classify the test set.
    '''
    def classify(self, dataSet, testOrTrain):
        dataFrame = dataSet.testDataFrame
        groundTruth = dataSet.trueLabels
        # the last model element is the dictionary of classification probabilities P(C)
        classificationProbs = self.model[-1]
        # new column for the data frame where our classifications are going to go
        classificationColumn = []
        # for each of the rows (people) in the dataset
        for row in dataFrame.iterrows():
            # dictionary {key = classification, value = complete Bayesian probability}
            bayesianDict = {}
            # dictionary {key = classification, value = log of the numerator probability}
            numeratorDict = {}
            # reset it for every row
            denominatorSum = 0
            # iterate through the possible outcomes of the class variable
            for classification in classificationProbs.keys():
                # start the numerator sum with log P(C) for the current classification
                # (we will be adding the logs of all the other attribute probabilities to this)
                numeratorDict[classification] = math.log(classificationProbs[classification])
                # loop through the outer array of the model, skipping the last element
                # because it isn't an attribute -- it's the classification probabilities dictionary
                for j, attributeDict in enumerate(self.model[:-1]):
                    # if we run into the blank ground truth column, skip it
                    if dataSet.testHeaders[j] == dataSet.trueLabels:
                        continue
                    # value of the given attribute for the current row
                    attrValue = row[1].iloc[j]
                    if dataSet.testHeaders[j] in dataSet.getNumericalColumns("test"):
                        meanDict = attributeDict["mean"]
                        stdDict = attributeDict["std"]
                        # numerator factor: P(attrValue|classification), modeled as a Gaussian
                        bayesNumerator = self.calculateGaussianProbability(meanDict[classification], stdDict[classification], attrValue)
                    else:
                        if attrValue in attributeDict:
                            bayesNumerator = attributeDict[attrValue][classification]
                        elif "rare" in attributeDict:
                            bayesNumerator = attributeDict["rare"][classification]
                        else:
                            # unseen category with no "rare" bucket: contribute nothing (log 1 == 0)
                            bayesNumerator = 1
                    try:
                        numeratorDict[classification] += math.log(bayesNumerator)
                    except ValueError:
                        # log(0) is undefined; skip zero-probability factors
                        pass
            # normalize with the log-sum-exp trick: subtract the max log-probability
            # before exponentiating so the tiny probabilities don't underflow
            maxLogProb = max(numeratorDict.values())
            for key in numeratorDict.keys():
                denominatorSum += mpmath.exp(numeratorDict[key] - maxLogProb)
            for key in numeratorDict.keys():
                bayesianDict[key] = mpmath.exp(numeratorDict[key] - maxLogProb) / denominatorSum
            # the final classification is the most probable one
            maxClassification = max(bayesianDict.items(), key=operator.itemgetter(1))[0]
            classificationColumn.append(maxClassification)
        # set the new column equal to the array of classifications
        dataFrame["Bayes Classification"] = classificationColumn
        dataSet.resetHeaders(testOrTrain)
        return dataFrame
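

# A minimal, self-contained sketch of the log-sum-exp normalization that
# classify() performs above. Nothing here depends on the DataSet or Bayes
# classes; the log-probabilities are made-up example values.
if __name__ == "__main__":
    # hypothetical unnormalized log(P(C) * prod P(a|C)) for two classes
    logNumerators = {0: -310.2, 1: -305.9}
    # subtracting the max before exponentiating keeps exp() from underflowing
    maxLog = max(logNumerators.values())
    denom = sum(math.exp(v - maxLog) for v in logNumerators.values())
    posteriors = {c: math.exp(v - maxLog) / denom for c, v in logNumerators.items()}
    print(posteriors)  # approximately {0: 0.0134, 1: 0.9866}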