-
Notifications
You must be signed in to change notification settings - Fork 1
/
pipeline.py
81 lines (69 loc) · 3.16 KB
/
pipeline.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
from DataSet import DataSet
from RepairData import RepairData
from NaiveBayes import NaiveBayes
from ModifiedBayes import ModifiedBayes
from TwoBayes import TwoBayes
from Metrics import Metrics
from classifierForDI import detectDI
def pipeline(fileName, nameForFiles, protectedAttribute, trueLabels, feldman, bayes, dataName):
    """
    Run the full pipeline on one dataset: DI detection, optional Feldman
    repair, train/test split, Bayes classification, and metrics.

    Parameters:
    fileName (str) - The path to the file whose data should be loaded
    nameForFiles (str) - The name to assign to files written by the pipeline
    protectedAttribute (str) - The header name for the column containing the protectedAttribute data in the dataset
    trueLabels (str) - The header name for the column containing the true classifications in the dataset
    feldman (bool or str) - If True (or the legacy string "yes"), run the Feldman repair algorithm.
        Any other value skips the repair
    bayes (str) - If "naive", run Naive Bayes. If "modified", run Modified Bayes.
        Any other value (conventionally "two") runs Two Bayes
    dataName (str) - The name for the dataset (e.g. "Restaurants" or "Jury")

    Notes:
    Results (e.g. DI detector, metrics results) will be written to the results/ directory
    Pickled objects will be written to the pickledObjects/ directory
    CSVs of data will be written to the dataCSVs/ directory
    These directories are not created here; they are expected to exist already.
    """
    # Load data into DataSet
    ds = DataSet()
    ds.loadData(fileName, protectedAttribute, trueLabels)

    # The documented contract is a bool, but the original check only accepted
    # the string "yes"; honour both so neither kind of caller is silently ignored.
    runFeldman = feldman is True or feldman == "yes"

    # Open a file for writing results. The context manager guarantees the
    # file is flushed and closed even if a later pipeline stage raises.
    with open("results/" + nameForFiles + ".txt", "w") as f:
        print("Starting DI detection")
        DIresult = detectDI(ds)
        f.write("DI results on original data: " + DIresult)

        # Feldman repair algorithm
        currDataSet = ds
        if runFeldman:
            print("Starting Feldman")
            repair = RepairData()
            repair.runRepair(ds.fileName, ds.protectedAttribute, ds.trueLabels, dataName, noiseScale=.01)

            # Persist the Feldman-repaired data (pickle + CSV)
            repair.dataSetCopy.savePickle("pickledObjects/repairedData/" + nameForFiles)
            repair.dataSetCopy.saveToCsv("dataCSVs/repairedData/" + nameForFiles + ".csv")

            # All later stages operate on the repaired copy
            currDataSet = repair.dataSetCopy

            print("Starting post-Feldman DI detection")
            postFeldmanDIresult = detectDI(repair.dataSetCopy)
            f.write("DI results after Feldman: " + postFeldmanDIresult)

        # Split data into test and training set
        currDataSet.splitIntoTrainTest()
        print("Split into test train")

        # Bayes
        if bayes == "naive":
            print("Starting Naive Bayes")
            bayesObject = NaiveBayes()
            bayesObject.train(currDataSet, bayesObject.model)
            bayesObject.classify(currDataSet, "test")
            print("Completed Naive Bayes")
        elif bayes == "modified":
            bayesObject = ModifiedBayes()
            bayesObject.train(currDataSet, 1)
            bayesObject.classify(currDataSet, "test")
        else:
            # Fallback branch: anything other than "naive"/"modified" uses Two Bayes
            bayesObject = TwoBayes()
            bayesObject.train(currDataSet, 1)
            bayesObject.classify(currDataSet, "test")

        # Persist the classified data (pickle + CSV)
        currDataSet.savePickle("pickledObjects/classifiedData/" + nameForFiles)
        currDataSet.saveToCsv("dataCSVs/classifiedData/" + nameForFiles + ".csv")

        # Metrics (written into the still-open results file)
        print("Starting metrics")
        metrics = Metrics()
        metrics.runAllMetrics(f, currDataSet, bayes, bayesObject)
        print("Completed metrics")