#!/usr/bin/python3
import sys
import pickle
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
sys.path.append("../tools/")
from feature_format import featureFormat, targetFeatureSplit
### Task 1: Select what features you'll use.
### features_list is a list of strings, each of which is a feature name.
### The first feature must be "poi".
features_list = ['poi', 'salary']  # starter list; superseded by the fuller list below
### Load the dictionary containing the dataset
with open("final_project_dataset.pkl", "rb") as f:
    data_dict = pickle.load(f)
### Task 2: Remove outliers
features = ["salary", "bonus"]
data_dict.pop("TOTAL")
# Candidate outlier removals explored during analysis (left disabled):
# data_dict.pop("LAY KENNETH L")
# data_dict.pop("SKILLING JEFFREY K")
# data_dict.pop("FREVERT MARK A")
# data_dict.pop("LAVORATO JOHN J")
data = featureFormat(data_dict, features)
# Exploration snippet that printed the names behind the top-3 salaries and
# bonuses (left disabled):
# sal, bon = [], []
# for i in data:
#     sal.append(i[0])
#     bon.append(i[1])
# sal = sorted(sal, reverse=True)[0:3]
# bon = sorted(bon, reverse=True)[0:3]
# for k, v in data_dict.items():
#     if (v["salary"] in sal) or (v["bonus"] in bon):
#         print(k)
for point in data:
    salary = point[0]
    bonus = point[1]
    plt.scatter(salary, bonus)
plt.xlabel("salary")
plt.ylabel("bonus")
plt.show()
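# A minimal sketch (not part of the original flow): map the extreme points in
# the scatter plot back to names, so outlier keys can be popped above by name.
# Assumes the usual dataset layout where missing values are the string 'NaN'.
top_salaries = sorted(
    ((v["salary"], k) for k, v in data_dict.items() if v["salary"] != 'NaN'),
    reverse=True,
)[:5]
for sal, name in top_salaries:
    print(name, sal)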
### Task 3: Create new feature(s)
def computeFraction(poi_messages, all_messages):
    """ given a number of messages to/from a POI (numerator)
        and the number of all messages to/from a person (denominator),
        return the fraction of that person's messages
        that are from/to a POI
    """
    # 'NaNNaN' occurs when two 'NaN' strings are concatenated while building
    # the additive feature (all emails with a POI) below
    if all_messages == 'NaNNaN':
        all_messages = 'NaN'
    if poi_messages == 'NaNNaN':
        poi_messages = 'NaN'
    if all_messages == 'NaN' or poi_messages == 'NaN':
        return 0
    if all_messages == 0:
        return 0
    return 1. * poi_messages / all_messages
for name in data_dict:
    poi_msg_to = data_dict[name]['from_poi_to_this_person']
    all_msg_to = data_dict[name]['to_messages']
    data_dict[name]['fraction_from_poi'] = computeFraction(poi_msg_to, all_msg_to)
    poi_msg_from = data_dict[name]['from_this_person_to_poi']
    all_msg_from = data_dict[name]['from_messages']
    data_dict[name]['fraction_to_poi'] = computeFraction(poi_msg_from, all_msg_from)
    # when email data is missing, both operands are the string 'NaN', so the
    # sums below become 'NaNNaN', which computeFraction maps back to 0
    poi_msg_all = poi_msg_to + poi_msg_from
    all_msg_all = all_msg_to + all_msg_from
    data_dict[name]['fraction_emails_with_poi'] = computeFraction(poi_msg_all, all_msg_all)
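# Quick sanity check (illustrative, not part of the original script): print the
# engineered fractions for one well-known employee; "SKILLING JEFFREY K" is a
# key that exists in this dataset (it appears in the outlier notes above).
print(data_dict["SKILLING JEFFREY K"]["fraction_from_poi"],
      data_dict["SKILLING JEFFREY K"]["fraction_to_poi"],
      data_dict["SKILLING JEFFREY K"]["fraction_emails_with_poi"])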
### Store to my_dataset for easy export below.
features_list = ["poi","salary","fraction_from_poi","fraction_emails_with_poi","fraction_to_poi", "shared_receipt_with_poi"]
my_dataset = data_dict
data = featureFormat(my_dataset, features_list, sort_keys = True)
labels, features = targetFeatureSplit(data)
### Task 4: Try a variety of classifiers
from sklearn.model_selection import train_test_split
features_train, features_test, labels_train, labels_test = train_test_split(
    features, labels, test_size=0.1, random_state=42)
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=12, min_samples_split=5)
clf.fit(features_train, labels_train)
pred = clf.predict(features_test)
acc = accuracy_score(labels_test, pred)
print("RF accuracy = ", acc)
#####
from sklearn.model_selection import KFold  # sklearn.cross_validation was removed
kf = KFold(n_splits=3)
for train_indices, test_indices in kf.split(features):
    # make training and testing sets
    features_train = [features[ii] for ii in train_indices]
    features_test = [features[ii] for ii in test_indices]
    labels_train = [labels[ii] for ii in train_indices]
    labels_test = [labels[ii] for ii in test_indices]
from sklearn.tree import DecisionTreeClassifier
# note: the loop above leaves features_*/labels_* holding the last fold's split
clf = DecisionTreeClassifier(min_samples_split=5)
clf = clf.fit(features_train, labels_train)
pred = clf.predict(features_test)
acc = accuracy_score(labels_test, pred)
print("DT accuracy = ", acc)
# precision: ratio of true positives out of all predicted positives
# (true positives + false positives)
print('precision = ', precision_score(labels_test, pred))
# recall: ratio of true positives out of true positives and false negatives
print('recall = ', recall_score(labels_test, pred))
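# A more robust estimate than a single fold (illustrative sketch): POI labels
# are heavily imbalanced, so StratifiedKFold keeps the POI ratio similar in
# every fold; cross_val_score averages accuracy across the splits.
from sklearn.model_selection import cross_val_score, StratifiedKFold
scores = cross_val_score(DecisionTreeClassifier(min_samples_split=5),
                         features, labels,
                         cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=42))
print("DT cross-validated accuracy:", scores.mean())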
#####
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
clf.fit(features_train, labels_train)
pred = clf.predict(features_test)
accuracy = accuracy_score(labels_test, pred)
print("NB accuracy = ", accuracy)
print('precision = ', precision_score(labels_test, pred))
print('recall = ', recall_score(labels_test, pred))
from sklearn.metrics import classification_report
print(classification_report(labels_test, pred))  # y_true first, then y_pred
param_grid = {
    'C': [1e3, 5e3, 1e4, 5e4, 1e5],
    'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],
}
# for sklearn version 0.16 or prior, the class_weight parameter value is 'auto'
####
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
clf = GridSearchCV(SVC(kernel='rbf', class_weight='balanced'), param_grid)
clf.fit(features_train, labels_train)
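# Optional diagnostic (not in the original script): a fitted GridSearchCV
# exposes best_params_, the winning parameter combination from the grid.
print("best SVM params:", clf.best_params_)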
pred = clf.predict(features_test)
print ("SVM ccuracy = ", acc)
print ('precision = ', precision_score(labels_test,pred))
print ('recall = ', recall_score(labels_test,pred))
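# Sketch under an assumption worth stating: RBF-kernel SVMs are sensitive to
# feature scale, and nothing above scales the features. A Pipeline with
# MinMaxScaler lets GridSearchCV fit the scaler inside each CV split; the
# names svm_pipe/clf_scaled/scaled_grid are illustrative, not from the original.
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
svm_pipe = Pipeline([('scale', MinMaxScaler()),
                     ('svc', SVC(kernel='rbf', class_weight='balanced'))])
scaled_grid = {'svc__C': param_grid['C'], 'svc__gamma': param_grid['gamma']}
clf_scaled = GridSearchCV(svm_pipe, scaled_grid)
clf_scaled.fit(features_train, labels_train)
print("scaled SVM accuracy:",
      accuracy_score(labels_test, clf_scaled.predict(features_test)))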
pickle.dump(clf, open("my_classifier.pkl", "wb"))
pickle.dump(my_dataset, open("my_dataset.pkl", "wb"))
pickle.dump(features_list, open("my_feature_list.pkl", "wb"))
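# Optional round-trip check (illustrative): reload one pickle to confirm the
# export is readable by whatever downstream script consumes these files.
with open("my_classifier.pkl", "rb") as f:
    print(type(pickle.load(f)).__name__)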