-
Notifications
You must be signed in to change notification settings - Fork 1
/
Bayes.py
154 lines (139 loc) · 4.93 KB
/
Bayes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
from mpmath import *
class Bayes:
    """Counting and probability helpers that operate on a pandas DataFrame.

    The class keeps no state: every method receives the DataFrame it works
    on. Given the class name these are presumably the building blocks of a
    naive Bayes classifier (confirm against the callers).
    """

    def __init__(self):
        """Create the helper; there is nothing to initialise."""
        pass

    def countIntersection(self, dataFrame, a1, a1Val, a2, a2Val):
        """Count the rows that have both a1Val and a2Val.

        dataFrame - the DataFrame object
        a1 - column name of attribute 1
        a1Val - value of attribute 1
        a2 - column name of attribute 2
        a2Val - value of attribute 2
        (Ex: a1= sex, a1Val= Female)

        Returns the count as an int. Returns 0 when no row matches or when
        either column does not exist.
        """
        try:
            matches = (dataFrame[a1] == a1Val) & (dataFrame[a2] == a2Val)
        except KeyError:
            # Unknown column name: keep the best-effort "nothing matched".
            return 0
        return int(matches.sum())

    def countAttr(self, dataFrame, a, aVal):
        """Count the rows that have aVal.

        dataFrame - the DataFrame object
        a - column name of attribute
        aVal - value of attribute
        (Ex: a= sex, aVal= Female)

        Returns 0 when the column does not exist.
        """
        try:
            return dataFrame.loc[dataFrame[a] == aVal, a].count()
        except KeyError:
            # Unknown column name: keep the best-effort "nothing matched".
            return 0

    def attributeCategoryProbability(self, dataFrame, a, value):
        """Return the probability of a specific attribute value in a column
        (num rows with the given value / num rows total).

        dataFrame - the DataFrame object
        a - column name
        value - value of attribute

        Unlike countAttr, an unknown column is not swallowed here, and an
        empty DataFrame is not guarded against (zero denominator).
        """
        matching = dataFrame.loc[dataFrame[a] == value, a].count()
        return matching / len(dataFrame.index)

    def getAttributeCategories(self, dataFrame, a):
        """Return an array of the unique categories in a column.

        dataFrame - the DataFrame object
        a - name of attribute
        """
        return dataFrame[a].unique()

    def getRares(self, dataFrame, a):
        """Find all "rare" values of a column, where "rare" values are any
        value that appears in less than or equal to 1% of the counted rows.

        dataFrame - the DataFrame object
        a - column name of attribute

        Returns a list containing all rare values, in the order the values
        first appear in the column.
        """
        # Count each distinct value exactly once.
        attributeCounts = {
            val: self.countAttr(dataFrame, a, val)
            for val in dataFrame[a].unique()
        }
        # The total must be a number: it is scaled to obtain the 1% cutoff.
        cutoff = .01 * sum(attributeCounts.values())
        return [val for val, count in attributeCounts.items() if count <= cutoff]

    def getRareProb(self, dataFrame, groundTruth, classification, attribute, rares):
        """Find the overall probability of rare values for a classification:
        (rows with the classification and any rare value) / (rows with the
        classification).

        dataFrame - the DataFrame object
        groundTruth - column name of the ground truth column
        classification - a given classification (a value in the groundTruth column)
        attribute - column name containing the current rare values
        rares - a list of rare values for the column

        There is no guard for a classification that matches no rows (zero
        denominator).
        """
        numerator = sum(
            self.countIntersection(dataFrame, attribute, val, groundTruth, classification)
            for val in rares
        )
        return numerator / self.countAttr(dataFrame, groundTruth, classification)

    def calculateMean(self, dataFrame, a):
        """Compute the mean of attribute a.

        dataFrame - the DataFrame object
        a - the column name for the attribute
        """
        return dataFrame[a].mean()

    def calculateConditionalMean(self, dataFrame, a, groundTruth, gTValue):
        """Compute the conditional mean of attribute a given the
        classification (ground truth value).

        dataFrame - the DataFrame object
        a - the column name for the attribute
        groundTruth - the name of the groundTruth column
        gTValue - a given classification (a value in the groundTruth column)

        Returns: the conditional mean
        Raises KeyError when gTValue does not occur in the groundTruth column.
        """
        # Group by the bare column name so the scalar key stays valid for
        # get_group (a length-1 list grouper expects a length-1 tuple key).
        return dataFrame.groupby(groundTruth).get_group(gTValue)[a].mean()

    def calculateStandardDeviation(self, dataFrame, a):
        """Compute standard deviation of attribute a.

        dataFrame - the DataFrame object
        a - the column name for the attribute

        Returns: the standard deviation
        """
        return dataFrame[a].std()

    def calculateConditionalStandardDeviation(self, dataFrame, a, groundTruth, gTValue):
        """Compute conditional standard deviation of attribute a given the
        classification.

        dataFrame - the DataFrame object
        a - the column name for the attribute
        groundTruth - the name of the groundTruth column
        gTValue - a given classification (a value in the groundTruth column)

        Returns: the conditional standard deviation
        Raises KeyError when gTValue does not occur in the groundTruth column.
        """
        # Same scalar-key grouping as calculateConditionalMean.
        return dataFrame.groupby(groundTruth).get_group(gTValue)[a].std()

    def calculateCrossAttributeProbability(self, dataFrame, b, bValue, a, aValue):
        """Compute cross attribute probability --
        P(a | b) = (number of rows that have both aValue and bValue) / number of rows with bValue

        dataFrame - the DataFrame object
        b - the column name for attribute b
        bValue - value of attribute b
        a - the column name for attribute a
        aValue - value of attribute a

        Returns: the cross attribute probability
        There is no guard for a bValue that matches no rows (zero denominator).
        """
        both = self.countIntersection(dataFrame, a, aValue, b, bValue)
        return both / self.countAttr(dataFrame, b, bValue)

    def calculateGaussianProbability(self, mean, std, value):
        """Compute Gaussian probability: the normal density
        1 / sqrt(2*pi*std^2) * e^(-(value - mean)^2 / (2*std^2)).

        mean - a mean
        std - standard deviation (must be non-zero; zero divides by zero)
        value - current numerical attribute value

        Returns: Gaussian probability
        sqrt, pi and e are the names provided by the module-level import.
        """
        exponent = ((value - mean) * (value - mean)) / ((2*(std*std)))
        gaussian = (1 / sqrt(2*pi*(std * std))) * (e **(-exponent))
        return gaussian