-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathabsenteeism_module.py
149 lines (112 loc) · 6.62 KB
/
absenteeism_module.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
# coding: utf-8
# In[1]:
# import all libraries needed
import numpy as np
import pandas as pd
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
# the custom scaler class
class CustomScaler(BaseEstimator, TransformerMixin):
    """Standardize only a chosen subset of DataFrame columns.

    The columns named in ``columns`` are passed through a wrapped
    ``StandardScaler``; every other column is returned untouched. The
    column order of the input frame is preserved in the output.

    Parameters
    ----------
    columns : list
        Labels of the columns to scale.
    copy, with_mean, with_std : bool
        Forwarded to the wrapped ``StandardScaler``.
    """

    def __init__(self, columns, copy=True, with_mean=True, with_std=True):
        # StandardScaler's constructor parameters are passed by keyword;
        # positional use is rejected by current scikit-learn releases.
        self.scaler = StandardScaler(copy=copy, with_mean=with_mean, with_std=with_std)
        self.columns = columns
        # Keep the constructor arguments as attributes so that
        # BaseEstimator.get_params() (used by repr and clone) can read them.
        self.copy = copy
        self.with_mean = with_mean
        self.with_std = with_std
        self.mean_ = None
        self.var_ = None

    def fit(self, X, y=None):
        """Fit the wrapped scaler on ``X[self.columns]`` and return ``self``."""
        selected = X[self.columns]
        self.scaler.fit(selected, y)
        # Per-column statistics, spelled out explicitly: np.mean / np.var on a
        # DataFrame forward axis=None, whose meaning differs across pandas
        # versions. ddof=0 matches np.var's population variance.
        self.mean_ = np.array(selected.mean(axis=0))
        self.var_ = np.array(selected.var(axis=0, ddof=0))
        return self

    def transform(self, X, y=None, copy=None):
        """Return ``X`` with ``self.columns`` standardized.

        ``y`` and ``copy`` are accepted for signature compatibility and are
        not used.
        """
        init_col_order = X.columns
        # Reuse X's index: with a default RangeIndex here, concat(axis=1)
        # would align on labels and misplace rows for any X whose index is
        # not 0..n-1 (e.g. after filtering or a train/test split).
        X_scaled = pd.DataFrame(
            self.scaler.transform(X[self.columns]),
            columns=self.columns,
            index=X.index,
        )
        X_not_scaled = X.loc[:, ~X.columns.isin(self.columns)]
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]
# create the special class that we are going to use from here on to predict new data
class absenteeism_model():
    """Score new absenteeism data with a previously saved model and scaler.

    Typical use: construct with the paths of the pickled model and scaler,
    call ``load_and_clean_data`` with a CSV file, then call one of the
    ``predicted_*`` methods.
    """

    # Labels assigned to the frame once 'ID', 'Reason for Absence' and any
    # target column are removed and the four reason-group flags are appended.
    _RAW_COLUMNS = ['Date', 'Transportation Expense', 'Distance to Work', 'Age',
                    'Daily Work Load Average', 'Body Mass Index', 'Education',
                    'Children', 'Pet',
                    'Reason_1', 'Reason_2', 'Reason_3', 'Reason_4']

    # Final feature columns, in the order handed to the scaler and the model.
    _MODEL_COLUMNS = ['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month Value',
                      'Transportation Expense', 'Age', 'Body Mass Index',
                      'Education', 'Children', 'Pet']

    def __init__(self, model_file, scaler_file):
        """Load the pickled model and scaler.

        Parameters
        ----------
        model_file : str
            Path of the pickled model; stored as ``self.reg``.
        scaler_file : str
            Path of the pickled scaler; stored as ``self.scaler``.
        """
        # Open the paths the caller passed in (not fixed file names).
        # NOTE: pickle.load can execute arbitrary code; only load files that
        # come from a trusted source.
        with open(model_file, 'rb') as model_fh, open(scaler_file, 'rb') as scaler_fh:
            self.reg = pickle.load(model_fh)
            self.scaler = pickle.load(scaler_fh)
        self.data = None

    @staticmethod
    def _reason_flag(dummies, low, high=None):
        """Return a 0/1 Series: 1 where any reason code in [low, high] is set."""
        selected = [code for code in dummies.columns
                    if code >= low and (high is None or code <= high)]
        if not selected:
            # No code of this group occurs in the data: the flag is all zeros.
            return pd.Series(0, index=dummies.index)
        return dummies[selected].max(axis=1).astype(int)

    def load_and_clean_data(self, data_file):
        """Read a CSV file and preprocess it into the model's feature layout.

        Sets ``self.df_with_predictions`` (copy of the raw file),
        ``self.preprocessed_data`` (the unscaled features) and ``self.data``
        (the output of ``self.scaler.transform``).

        Parameters
        ----------
        data_file : str or file-like
            Comma-separated input accepted by ``pandas.read_csv``.
        """
        df = pd.read_csv(data_file, delimiter=',')
        # keep an untouched copy of the raw input for later use
        self.df_with_predictions = df.copy()

        df = df.drop(['ID'], axis=1)
        # new data normally has no target column; discard it if it is there
        df = df.drop(columns=['Absenteeism Time in Hours'], errors='ignore')

        # One dummy per reason code that occurs in the data. drop_first is
        # deliberately NOT used: it drops the lowest code *present*, which is
        # a real reason whenever code 0 is missing from a batch. Code 0 is
        # excluded by the group bounds below instead.
        reason_columns = pd.get_dummies(df['Reason for Absence'])
        reason_flags = [
            self._reason_flag(reason_columns, 1, 14),
            self._reason_flag(reason_columns, 15, 17),
            self._reason_flag(reason_columns, 18, 21),
            self._reason_flag(reason_columns, 22),
        ]

        # the grouped flags replace the original 'Reason for Absence' column
        df = df.drop(['Reason for Absence'], axis=1)
        df = pd.concat([df] + reason_flags, axis=1)
        # positional renaming: relies on the input file's column order
        df.columns = self._RAW_COLUMNS

        # the month is the only date-derived feature the model keeps
        dates = pd.to_datetime(df['Date'], format='%d/%m/%Y')
        df['Month Value'] = dates.dt.month

        # map 'Education' to a dummy: 1 -> 0, anything in {2, 3, 4} -> 1
        df['Education'] = df['Education'].map({1: 0, 2: 1, 3: 1, 4: 1})

        # keep only the model's features, in order, and replace missing values
        df = df[self._MODEL_COLUMNS].fillna(value=0)

        # exposed so callers can inspect the unscaled, preprocessed data
        self.preprocessed_data = df.copy()
        # scaled features used by the prediction methods
        self.data = self.scaler.transform(df)

    def predicted_probability(self):
        """Return the predicted probability of class 1 for each row.

        Returns ``None`` if no data has been loaded yet.
        """
        if self.data is not None:
            pred = self.reg.predict_proba(self.data)[:, 1]
            return pred

    def predicted_output_category(self):
        """Return the model's predicted class for each row.

        Returns ``None`` if no data has been loaded yet.
        """
        if self.data is not None:
            pred_outputs = self.reg.predict(self.data)
            return pred_outputs

    def predicted_outputs(self):
        """Add 'Probability' and 'Prediction' columns to the preprocessed data.

        Returns ``self.preprocessed_data`` with the two columns added, or
        ``None`` if no data has been loaded yet.
        """
        if self.data is not None:
            self.preprocessed_data['Probability'] = self.reg.predict_proba(self.data)[:, 1]
            self.preprocessed_data['Prediction'] = self.reg.predict(self.data)
            return self.preprocessed_data