forked from sohviluukkonen/kinase-modelling
-
Notifications
You must be signed in to change notification settings - Fork 0
/
run_models.py
147 lines (124 loc) · 5.83 KB
/
run_models.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
import os
import sys
import numpy as np
import pandas as pd
from src.chemprop import train_chemprop_MT, train_chemprop_STs, predict_chemprop_MT, predict_chemprop_STs
from src.rf import train_RF_STs, predict_RF_STs, train_optim_RF_STs
from src.xgb import train_XGB_STs, predict_XGB_STs, train_optim_XGB_STs
from src.pyboost import train_PB_MT, predict_PB_MT, train_optim_PB_MT
from src.utils import mkdirs
from src.inputs import create_imputed_inputs
# Seed PyTorch, the standard-library `random` module and NumPy with one shared
# value at import time (intended to make repeated runs comparable).
import random
import torch

seed = 2022
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
def run_model(model : str,
              dataset : str = 'kinase200',
              split : str = 'Random',
              mode : str = 'ST',
              params : str = 'Default',
              param_path : str = None,
              imputation : str = None):
    """
    Train a model on one dataset split and write its test-set predictions.

    Input data are read from ``ModelInputs/{dataset}/{split}/Original`` (or
    ``.../Imputed{imputation}`` when an imputation method is given), models are
    stored under ``Models/{model}/{mode}/{dataset}/{split}/{params}`` and the
    predictions are written to
    ``Predictions/{model}/{mode}/{dataset}/{split}/{params}/predictions.csv``.
    When imputation is used, ``_Imputed{imputation}`` is appended to the model
    name used in those paths.

    Parameters
    ----------
    model : str
        Name of the model to run: 'RF', 'XGB', 'PB' or a name starting
        with 'CP' (Chemprop).
    dataset : str
        Name of the dataset to use.
    split : str
        Name of the split to use.
    mode : str
        Name of the mode to use: ST or MT.
    params : str
        Name of the parameter set to use. 'Default' trains with default
        parameters; any other value selects the optimising training routine
        for RF, XGB and PB.
    param_path : str
        Path to a parameter file (passed on to Chemprop training). When given,
        ``params`` is forced to 'HyperOpt'.
    imputation : str
        Name of the imputation method to use (multitask models only).

    Raises
    ------
    SystemExit
        If the requested model / mode / imputation combination is not
        implemented, or if the model name is not recognised.
    """
    # If param_path is given, set params to HyperOpt instead of the given name
    if param_path:
        params = 'HyperOpt'

    # Dispatch on the bare model name: the imputation suffix added below only
    # affects the output paths and must not hide the model from the checks.
    base_model = model
    if imputation:
        if mode == 'ST':
            sys.exit('Imputation can only be used with multitask models')
        model = model + '_Imputed' + imputation

    # Create paths
    data_path = f'ModelInputs/{dataset}/{split}'
    model_path = f'Models/{model}/{mode}/{dataset}/{split}/{params}'
    preds_path = f'Predictions/{model}/{mode}/{dataset}/{split}/{params}/predictions.csv'

    # Modify data_path depending on imputation method
    if imputation:
        data_path = f'{data_path}/Imputed{imputation}'
    else:
        data_path = f'{data_path}/Original'

    print(f'Data path: {data_path}')
    print(f'Model path: {model_path}')
    print(f'Predictions path: {preds_path}')

    train_csv = f'{data_path}/train.csv'
    valid_csv = f'{data_path}/valid.csv'
    test_csv = f'{data_path}/test.csv'

    # Random Forest
    if base_model == 'RF':
        if mode == 'MT' or imputation:
            sys.exit('For RF, only ST model is implemented')
        if params == 'Default':
            train_RF_STs(train_csv, model_path)
        else:
            train_optim_RF_STs(train_csv, valid_csv, model_path)
        predict_RF_STs(test_csv, model_path, preds_path)

    # XGBoost
    elif base_model == 'XGB':
        if mode == 'MT' or imputation:
            sys.exit('For XGB, only ST model is implemented')
        if params == 'Default':
            train_XGB_STs(train_csv, model_path)
        else:
            train_optim_XGB_STs(train_csv, valid_csv, model_path)
        predict_XGB_STs(test_csv, model_path, preds_path)

    # Chemprop
    elif base_model.startswith('CP'):
        if mode == 'ST':
            # Train before predicting, like every other branch; predicting
            # alone requires models that already exist in model_path.
            train_chemprop_STs(train_csv, valid_csv, test_csv, model_path, param_path)
            predict_chemprop_STs(test_csv, model_path, preds_path)
        else:
            train_chemprop_MT(train_csv, valid_csv, test_csv, model_path, param_path)
            predict_chemprop_MT(test_csv, model_path, preds_path)

    # PyBoost
    elif base_model == 'PB':
        if mode == 'ST' or imputation:
            sys.exit('For PyBoost, only MT model is implemented')
        if params == 'Default':
            train_PB_MT(train_csv, model_path)
        else:
            train_optim_PB_MT(train_csv, valid_csv, model_path)
        predict_PB_MT(test_csv, model_path, preds_path)

    # Fail loudly instead of silently doing nothing on a mistyped model name
    else:
        sys.exit(f'Unknown model: {base_model}')
if __name__ == '__main__':

    # Run the whole benchmark: every dataset on every split.
    for dataset in ['kinase1000', 'kinase200']:
        for split in ['RGES', 'DGBC']:

            # Baselines with default parameters. RF runs first because the
            # RF-based imputation below is pointed at its model directory.
            for model_name, task_mode in (('RF', 'ST'),
                                          ('XGB', 'ST'),
                                          ('PB', 'MT'),
                                          ('CP', 'ST'),
                                          ('CP', 'MT')):
                run_model(model_name, dataset, split, task_mode)

            # Multitask Chemprop on mean-imputed inputs
            create_imputed_inputs(
                f'data/datasets/{dataset}_{split}.csv.gz',
                f'ModelInputs/{dataset}/{split}/ImputedMean',
                'Mean',
            )
            run_model('CP', dataset, split, 'MT', imputation='Mean')

            # Multitask Chemprop on RF-imputed inputs
            create_imputed_inputs(
                f'data/datasets/{dataset}_{split}.csv.gz',
                f'ModelInputs/{dataset}/{split}/ImputedRF',
                'RF',
                f'Models/RF/ST/{dataset}/{split}/Default/',
            )
            run_model('CP', dataset, split, 'MT', imputation='RF')

            # Optimized parameters: full set on kinase200, reduced set on kinase1000
            if dataset == 'kinase200':
                for model_name, task_mode in (('RF', 'ST'),
                                              ('XGB', 'ST'),
                                              ('PB', 'MT')):
                    run_model(model_name, dataset, split, task_mode, 'HyperOpt')
                for task_mode in ('ST', 'MT'):
                    run_model('CP', dataset, split, task_mode, param_path='cp_params.json')
                for method in ('Mean', 'RF'):
                    run_model('CP', dataset, split, 'MT', param_path='cp_params.json', imputation=method)
            elif dataset == 'kinase1000':
                run_model('PB', dataset, split, 'MT', 'HyperOpt')
                run_model('CP', dataset, split, 'MT', param_path='cp_params.json')