forked from liehendi11/IFN645
-
Notifications
You must be signed in to change notification settings - Fork 0
/
dm_tools.py
57 lines (41 loc) · 1.85 KB
/
dm_tools.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
import numpy as np
import pandas as pd
def data_prep():
# read the veteran dataset
df = pd.read_csv('datasets/veteran.csv')
# change DemCluster from interval/integer to nominal/str
df['DemCluster'] = df['DemCluster'].astype(str)
# change DemHomeOwner into binary 0/1 variable
dem_home_owner_map = {'U':0, 'H': 1}
df['DemHomeOwner'] = df['DemHomeOwner'].map(dem_home_owner_map)
# denote errorneous values in DemMidIncome
mask = df['DemMedIncome'] < 1
df.loc[mask, 'DemMedIncome'] = np.nan
# impute missing values in DemAge with its mean
df['DemAge'].fillna(df['DemAge'].mean(), inplace=True)
# impute med income using mean
df['DemMedIncome'].fillna(df['DemMedIncome'].mean(), inplace=True)
# impute gift avg card 36 using mean
df['GiftAvgCard36'].fillna(df['GiftAvgCard36'].mean(), inplace=True)
# drop ID and the unused target variable
df.drop(['ID', 'TargetD'], axis=1, inplace=True)
df = pd.get_dummies(df)
return df
def analyse_feature_importance(dm_model, feature_names, n_to_display=20):
# grab feature importances from the model
importances = dm_model.feature_importances_
# sort them out in descending order
indices = np.argsort(importances)
indices = np.flip(indices, axis=0)
# limit to 20 features, you can leave this out to print out everything
indices = indices[:n_to_display]
for i in indices:
print(feature_names[i], ':', importances[i])
def visualize_decision_tree(dm_model, feature_names, save_name):
import pydot
from io import StringIO
from sklearn.tree import export_graphviz
dotfile = StringIO()
export_graphviz(dm_model, out_file=dotfile, feature_names=feature_names)
graph = pydot.graph_from_dot_data(dotfile.getvalue())
graph[0].write_png(save_name) # saved in the following file