-
Notifications
You must be signed in to change notification settings - Fork 1
/
RepairData.py
227 lines (199 loc) · 10.7 KB
/
RepairData.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
import pandas as pd
from DataSet import DataSet
from statistics import median
class RepairData:
    '''
    Repairs the numerical columns of a DataSet so that, within each column, every protected
    attribute value ends up with the same distribution of values.

    For each repaired column the values are split by protected attribute value, each split is cut
    into self.maxBuckets quantile buckets, and every value is replaced with the median (across the
    protected attribute values) of the per-bucket medians for the bucket it fell into.
    The original DataSet is kept untouched in self.dataSetOriginal; the repaired values are written
    to self.dataSetCopy.
    '''

    def __init__(self):
        # dataSetOriginal, dataSetCopy and maxBuckets are set later by setDataSetVariables.
        pass

    def setDataSetVariables(self, dataSet):
        '''
        Sets the instance variables for a DataSet. Also copies the provided DataSet and saves it
        as dataSetCopy.
        dataSet (DataSet) - a DataSet object
        '''
        self.dataSetOriginal = dataSet
        self.dataSetCopy = dataSet.copyDataSet()
        self.maxBuckets = self.getMaxBuckets()

    def getMaxBuckets(self):
        '''
        Finds the protected attribute value with the fewest individuals.
        Returns: the count of individuals having that protected attribute value
        '''
        original = self.dataSetOriginal
        groupSizes = original.dataFrame[original.protectedAttribute].value_counts()
        return min(groupSizes)

    def makeDistributions(self, nonProtectedAttribute):
        '''
        Splits one column of the original data by protected attribute value.
        nonProtectedAttribute (str) - the name of the numerical, non-protected attribute we want
            to get a distribution for
        Returns: a list with one list of column values per protected attribute value, and a list
            of the protected attribute values in the same order
        '''
        frame = self.dataSetOriginal.dataFrame
        protectedColumn = frame[self.dataSetOriginal.protectedAttribute]
        attributeValues = list(protectedColumn.unique())
        attributeDistributions = [
            frame.loc[protectedColumn == value, nonProtectedAttribute].tolist()
            for value in attributeValues
        ]
        return attributeDistributions, attributeValues

    def bucketize(self, distributions):
        '''
        Takes the list of distributions from makeDistributions and puts the values into
        self.maxBuckets quantile buckets per distribution.
        distributions (list of lists) - the values from a single column separated by a
            protectedAttribute value
        Returns: the list of distributions organized by bucket, and a parallel list holding the
            [minimum, maximum] of each bucket
        Raises: ValueError if a bucket ends up with no values
        '''
        bucketCount = self.maxBuckets
        bucketList = []
        minMaxList = []
        for groupIndex, values in enumerate(distributions):
            # With labels=False, qcut yields the bucket index for each value,
            # e.g. [0, 1, 2, 3, 0] assigns the first and last items to bucket 0.
            assignments = pd.qcut(values, bucketCount, labels=False)
            buckets = [[] for _ in range(bucketCount)]
            for value, bucketIndex in zip(values, assignments):
                buckets[bucketIndex].append(value)
            for bucketIndex, bucket in enumerate(buckets):
                if not bucket:
                    # min()/max() below would fail anyway; fail with the location instead.
                    raise ValueError("No items in bucket: i = " + str(groupIndex) +
                                     ", j = " + str(bucketIndex))
            bucketList.append(buckets)
            minMaxList.append([[min(bucket), max(bucket)] for bucket in buckets])
        return bucketList, minMaxList

    def findMedianDistribution(self, bucketList):
        '''
        Takes in bucketized values and returns a median distribution.
        bucketList (list of list of list of floats) - a list of distributions of a protected
            attribute's values, organized by bucket
        Returns: a list holding, for each bucket position, the median across distributions of
            that bucket's median
        '''
        bucketMedians = [[median(bucket) for bucket in buckets] for buckets in bucketList]
        # zip(*...) regroups the medians by bucket position across all distributions.
        return [median(acrossGroups) for acrossGroups in zip(*bucketMedians)]

    def modifyData(self, columnName, medianDistribution, bucketList, minMaxList, attributeValues):
        '''
        Updates the copied DataSet with modified values: every value in the column is replaced
        with the median distribution value of the bucket it belongs to.
        columnName (string) - a column header
        medianDistribution (list of floats) - a one-dimensional list containing the median values
            for each bucket in bucketList
        bucketList (list of list of list of floats) - a list of distributions of a protected
            attribute's values, organized by bucket
        minMaxList (list of list of list of floats) - a list of lists of the minimum and maximum
            in each bucket
        attributeValues (list of strings) - a list of all possible values for the current
            protected attribute
        '''
        df = self.dataSetCopy.dataFrame
        protectedAttribute = self.dataSetCopy.protectedAttribute
        # Walk the actual row labels so a non-default index does not break the lookups.
        for rowLabel in df.index:
            groupIndex = attributeValues.index(df.at[rowLabel, protectedAttribute])
            bucket = self.getBucket(df.at[rowLabel, columnName], groupIndex, bucketList, minMaxList)
            df.at[rowLabel, columnName] = medianDistribution[bucket]

    def getBucket(self, value, indexForProtectedAttributeValue, bucketList, minMaxList):
        '''
        Finds the index of the pre-filled bucket containing the given value.
        value (float) - the value to find
        indexForProtectedAttributeValue (int) - the index within bucketList for a given protected
            attribute value
        bucketList (list of list of list of floats) - a list of distributions of a protected
            attribute's values, organized by bucket
        minMaxList (list of list of list of floats) - a list of lists of the minimum and maximum
            in each bucket
        Returns: the index of the bucket containing the given value
        Raises: ValueError if no bucket contains the value
        '''
        bucketedDistribution = bucketList[indexForProtectedAttributeValue]
        minMaxSublist = minMaxList[indexForProtectedAttributeValue]
        lastIndex = len(bucketedDistribution) - 1
        return self.getBucketHelper(value, 0, lastIndex, bucketedDistribution, minMaxSublist)

    def getBucketHelper(self, value, start, stop, bucketedDistribution, minMaxSublist):
        '''
        Binary search helper for getBucket. Relies on the buckets being ordered by value range.
        value (float) - the value to find
        start (int) - the index of where to start searching
        stop (int) - the index of where to stop searching (inclusive)
        bucketedDistribution (list of list of floats) - a distribution of a protected attribute
            value, organized by bucket (kept for interface compatibility; not read)
        minMaxSublist (list of list of floats) - a list of the minimum and maximum in each bucket
        Returns: the index of the bucket containing the given value
        Raises: ValueError if the value lies outside every bucket between start and stop
        '''
        low, high = start, stop
        while low <= high:
            middleIndex = (low + high) // 2
            bucketMin, bucketMax = minMaxSublist[middleIndex]
            if value > bucketMax:
                low = middleIndex + 1
            elif value < bucketMin:
                # Exclude middleIndex itself, otherwise the range never shrinks.
                high = middleIndex - 1
            else:
                return middleIndex
        raise ValueError("Value " + str(value) + " is not contained in any bucket")

    def createDataSet(self, fileName, protectedAttribute, groundTruth, noiseScale):
        '''
        Creates a DataSet object from a file, adds random noise to each of its numerical columns
        and stores it through setDataSetVariables.
        fileName (string) - a file name
        protectedAttribute (string) - the name of the protected attribute
        groundTruth (string) - passed through to DataSet.loadData; presumably identifies the
            ground truth of each row (TODO confirm against DataSet)
        noiseScale (float) - the standard deviation of the normal distribution used to add noise
            to the data
        '''
        data = DataSet()
        data.loadData(fileName, protectedAttribute, groundTruth)
        for column in data.getNumericalColumns("main"):
            data.addRandomNoise(column, noiseScale)
        self.setDataSetVariables(data)

    def chooseColumnsForFeldman(self, dataSet, dataName):
        '''
        Select columns for Feldman repair.
        dataSet (DataSet) - the dataset
        dataName (str) - the name for a particular dataset (e.g. "Jury" or "Restaurant")
        Returns: a list of the columns to repair for the given dataset, or the string
            "Invalid dataset name." when dataName is not recognized
        '''
        columns = dataSet.getNumericalColumns("main")
        if dataName == "Restaurant":
            return ["ZIPCODE", "Latitude", "Longitude", "Community Board", "Council District", "Census Tract"]
        if dataName == "Jury":
            # We should not repair on "trial_id", so there are no columns to repair
            return []
        if dataName in ("Portuguese", "Credit", "Income", "Ricci", "German"):
            # We can repair on all numerical columns
            return columns
        return "Invalid dataset name."

    def repairColumn(self, columnName):
        '''
        Repairs the data in a single column of the copied DataSet.
        columnName (string) - a column header
        '''
        distributions, attributeValues = self.makeDistributions(columnName)
        bucketList, minMaxList = self.bucketize(distributions)
        medianDistribution = self.findMedianDistribution(bucketList)
        self.modifyData(columnName, medianDistribution, bucketList, minMaxList, attributeValues)

    def runRepair(self, fileName, protectedAttribute, groundTruth, dataName, noiseScale=.01):
        '''
        Makes a DataSet object from a file, then repairs the data.
        fileName (string) - a file name
        protectedAttribute (string) - the name of the protected attribute
        groundTruth (string) - passed through to DataSet.loadData; presumably identifies the
            ground truth of each row (TODO confirm against DataSet)
        dataName (str) - the name for a particular dataset (e.g. "Jury" or "Restaurant")
        noiseScale (float, optional) - the standard deviation of the normal distribution used to
            add noise to the data
        Raises: ValueError if dataName is not a recognized dataset name
        '''
        self.createDataSet(fileName, protectedAttribute, groundTruth, noiseScale)
        repairColumns = self.chooseColumnsForFeldman(self.dataSetCopy, dataName)
        if isinstance(repairColumns, str):
            # chooseColumnsForFeldman signals an unknown name with a message string; iterating
            # over it would try to repair one "column" per character.
            raise ValueError(repairColumns)
        print("Columns to repair: ", repairColumns)
        for column in repairColumns:
            self.repairColumn(column)