ibc_example.py

import pandas as pd
from sklearn.model_selection import ParameterGrid, train_test_split

from src.utils import calculate_passthroughs, calculate_duration, train_model
from semantic import dataloader, featuretransformer, modelbuilder, Pipeline
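
# The `semantic` decorators below are used as follows (API assumed from how
# they are used in this file, not from library documentation): `output`
# names the object(s) a step produces, `input` names the objects a step
# consumes, and `pertain` tags a step so that Pipeline.search() can select
# a subgraph of related steps, presumably wired together by matching names.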

# Each loader reads one raw parquet extract; the `output` name is how
# downstream transformers refer to the returned dataframe.
@dataloader(output=['atacq_df'])
def fetch_atacq_data():
    atacq_df = pd.read_parquet('./data/raw/atacq_df.parquet')
    return atacq_df


@dataloader(output=['eventlog_df'])
def fetch_eventlog_data():
    eventlog_df = pd.read_parquet('./data/raw/eventlog_df.parquet')
    return eventlog_df


@dataloader(output=['production_df'])
def fetch_production_data():
    production_df = pd.read_parquet('./data/raw/production_df.parquet')
    return production_df


@dataloader(output=['sto_df'])
def fetch_sto_data():
    sto_df = pd.read_parquet('./data/raw/sto_df.parquet')
    return sto_df


@dataloader(output=['tracking_df'])
def fetch_tracking_data():
    tracking_df = pd.read_parquet('./data/raw/tracking_df.parquet')
    return tracking_df


# Registration points bracketing the IBC section of the line; tagged with
# pertain=['ibc'] so Pipeline.search(pertain="ibc") picks this step up.
@dataloader(output=['ibc_reg_points'], pertain=['ibc'])
def fetch_ibc_regpoints():
    reg_points = ('20905', '20907')
    return reg_points


@featuretransformer(input=['tracking_df', 'atacq_df', 'ibc_reg_points'],
                    output=['body_duration_df'], pertain=['ibc'])
def ibc_calculate_passthroughs(tracking_df, atacq_df, reg_points):
    # Build a one-row frame describing the begin/end registration points
    tracking_points_df = pd.DataFrame(data={'begin': [reg_points[0]], 'end': [reg_points[1]]})
    passthrough_dict = calculate_passthroughs(tracking_df, atacq_df, tracking_points_df)
    body_duration_dict = calculate_duration(passthrough_dict)
    # calculate_duration returns a dict keyed by the (begin, end) point pair
    body_duration_df = body_duration_dict[reg_points]
    return body_duration_df
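
# Assumed shape of body_duration_df, inferred from its use in the next
# transformer: a two-level index (body number, passthrough) plus
# 'begin_timestamp', 'end_timestamp', 'diff_min', and 'fault' columns.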


@featuretransformer(input=['body_duration_df', 'tracking_df'], output=['feat_label_df'])
def ibc_extract_features(body_duration_df, tracking_df):
    # Remove all rows where the end timestamp precedes the begin timestamp
    body_duration_df = body_duration_df.loc[body_duration_df['end_timestamp'] > body_duration_df['begin_timestamp']]
    # Join tracking features onto the body duration frame
    renamed_body_duration_df = body_duration_df.reset_index().rename(
        columns={'level_0': 'bodynumber', 'level_1': 'passthrough'})
    tracking_feat_df = pd.merge(
        tracking_df.reset_index(), renamed_body_duration_df, how='inner',
        left_on=['bodyNumber', 'timestamp'], right_on=['bodynumber', 'begin_timestamp'],
    )[['timestamp', 'primerColorCode', 'bodyType', 'flowCode', 'diff_min']]
    # Remove duplicates and index the features by timestamp
    features_df = tracking_feat_df.drop_duplicates(keep='first').set_index('timestamp').sort_index()
    # Attach the fault label, keyed by the begin timestamp of each passthrough
    label_df = body_duration_df.loc[body_duration_df['begin_timestamp'].isin(features_df.index)][['begin_timestamp', 'fault']]
    time_label_df = label_df.set_index('begin_timestamp')
    feat_label_df = pd.concat([features_df, time_label_df], axis=1)
    return feat_label_df
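
# The resulting feat_label_df is indexed by timestamp and pairs the tracking
# features with the 'fault' label; begin timestamps without a matching
# tracking record are dropped by the inner merge above.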


@modelbuilder(input=['feat_label_df'])
def ibc_model(features_df):
    # Hold out the last 20% of rows as an out-of-sample test set
    # (shuffle=False preserves the time ordering)
    train_feat_df, test_feat_df = train_test_split(features_df, test_size=0.20, shuffle=False)
    # Embargo: drop the final rows of the train set to leave a gap before
    # the test set, reducing leakage across the split boundary
    embargo_size = 10
    train_feat_df = train_feat_df.iloc[:-embargo_size]
    sel_feats = list(train_feat_df.drop(columns=['fault']).columns)
    print(sel_feats)
    sel_train_feat_df = train_feat_df[sel_feats + ['fault']]
    # Held out for later out-of-sample evaluation; not used in this function
    sel_test_feat_df = test_feat_df[sel_feats + ['fault']]
    # Select hyperparameters with a cross-validated grid search
    # (CatBoost-style parameter names, assumed from how train_model uses them)
    best_score = 0
    best_params = None
    best_model_list = None
    parameter_grid = ParameterGrid({'iterations': [1000], 'depth': [8], 'od_type': ['Iter'],
                                    'od_wait': [50], 'eval_metric': ['BalancedAccuracy']})
    for cur_params in parameter_grid:
        print(cur_params)
        cur_score, cur_model_list = train_model(sel_train_feat_df, cur_params, verbose=True)
        print()  # blank line between grid points
        if cur_score > best_score:
            best_score = cur_score
            best_params = cur_params
            best_model_list = cur_model_list
    print(best_score)
    return best_score, best_params, best_model_list
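

# Assumed behaviour of the semantic Pipeline, inferred from this file:
# search(pertain="ibc") gathers the steps tagged pertain=['ibc'] together
# with the untagged steps they depend on, and execute() runs the loaders,
# transformers, and model builder in dependency order.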
p = Pipeline()
p.search(pertain="ibc")
model = p.execute()
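
# `model` holds whatever the model-builder step returns, i.e. the
# (best_score, best_params, best_model_list) tuple from ibc_model above
# (assumed; the library may wrap the result differently).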