# make_overview_parameters.py
import os
import pickle
from itertools import product

import numpy as np
import pandas as pd

from algorithms import GPonRealSpace, RandomForest, KNN
from data.train_test_split import (
    AbstractTrainTestSplitter,
    BioSplitter,
    BlockPostionSplitter,
    FractionalRandomSplitter,
    PositionSplitter,
    RandomSplitter,
)
from gpflow.kernels import SquaredExponential
from util.mlflow.constants import (
    ESM,
    EVE_DENSITY,
    GP_L_VAR,
    GP_LEN,
    GP_VAR,
    MLL,
    MSE,
    ONE_HOT,
    SPEARMAN_RHO,
    TRANSFORMER,
    VAE,
    VAE_AUX,
    VAE_RAND,
)
from util.mlflow.convenience_functions import (
    get_mlflow_results,
    get_mlflow_results_artifacts,
)

def extract_parameters(
    datasets: list,
    algos: list,
    reps: list,
    metrics: list,
    train_test_splitter: AbstractTrainTestSplitter,
) -> pd.DataFrame:
    """
    Collect per-algorithm metrics from MLflow into one overview DataFrame.

    NOTE: extracting and visualizing parameters (tables and figures) is part of
    the MLflow results UI, so this method is deprecated.
    """
    results_dict = get_mlflow_results(
        datasets, algos, reps, metrics, train_test_splitter
    )
    # The splitter is passed as a class and instantiated per dataset.
    cv_name = train_test_splitter(datasets[0]).get_name()
    sub_metrics = ["mu", "std"]
    # Columns: one (algorithm, metric, mu/std) triple per reported value.
    header_multi_index = pd.MultiIndex.from_tuples(
        (
            (a, m, sm)
            for a in results_dict.get(datasets[0]).keys()
            for m in metrics
            for sm in sub_metrics
        ),
        names=["algo", "metric", "sub_m"],
    )
    # Rows: one (dataset, representation, cv-protocol) triple per experiment.
    multi_index = pd.MultiIndex.from_tuples(
        ((d, r, cv_name) for d in datasets for r in reps),
        names=["data", "representation", "cv"],
    )
    df = pd.DataFrame(columns=header_multi_index, index=multi_index)
    for alg in results_dict.get(datasets[0]).keys():
        for metric in metrics:
            for dat in datasets:
                for rep in reps:
                    entry = (
                        results_dict.get(dat).get(alg).get(rep).get(None).get(metric)
                    )
                    # Fill the (mu, std) sub-columns with rounded mean and std.
                    df.loc[(dat, rep, cv_name), (alg, metric)] = (
                        np.round(np.mean(entry), 2),
                        np.round(np.std(entry), 2),
                    )
    return df
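
# Example usage (a sketch; assumes a populated MLflow tracking store and the
# project imports above; kept as comments since this function is deprecated):
#
#     df = extract_parameters(
#         datasets=["1FQG"],
#         algos=[GPonRealSpace().get_name()],
#         reps=[ONE_HOT],
#         metrics=[MSE, SPEARMAN_RHO],
#         train_test_splitter=RandomSplitter,
#     )
#     print(df)
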
def extract_cv_information(
    datasets: list, representations: list, protocols: list, cached_results=True
) -> pd.DataFrame:
    """
    Build an overview table of train/test split sizes ("[n_train;n_test]")
    per dataset, representation, and splitting protocol.
    """
    algos = [GPonRealSpace().get_name()]  # splits are the same across all algos
    # NOTE: user-specific cache location; adjust for your environment.
    cached_filename = f"/Users/rcml/protein_regression/results/cache/metric_comparison_d={'_'.join(datasets)}_a={'_'.join(algos)}_r={'_'.join(representations)}_s={'_'.join([s.get_name() for s in protocols[:5]])}.pkl"
    if cached_results and os.path.exists(cached_filename):
        with open(cached_filename, "rb") as infile:
            mlflow_results = pickle.load(infile)
    else:
        mlflow_results = {}
        for protocol in protocols:
            mlflow_protocol_results = get_mlflow_results_artifacts(
                datasets=datasets,
                algos=algos,
                reps=representations,
                metrics=[None],
                train_test_splitter=protocol,
            )  # TODO: what metrics?
            mlflow_results[protocol.get_name()] = mlflow_protocol_results
        if cached_results:
            with open(cached_filename, "wb") as outfile:
                pickle.dump(mlflow_results, outfile)
    protocol_names = [protocol.get_name() for protocol in protocols]
    results_iterator = product(datasets, representations, protocol_names)
    # Columns: (dataset, representation); rows: (protocol, split index).
    header_multi_index = pd.MultiIndex.from_tuples(
        (d, r) for d, r in product(datasets, representations)
    )
    row_multi_index = pd.MultiIndex.from_tuples(
        (p, idx) for p, idx in product(protocol_names, [0])
    )
    df = pd.DataFrame(columns=header_multi_index, index=row_multi_index)
    for d, r, p in results_iterator:
        entries = mlflow_results.get(p).get(d).get(algos[0]).get(r).get(None)
        if entries:
            for e in entries:
                n_train = len(entries[e].get("train_trues"))
                n_test = len(entries[e].get("trues"))
                df.loc[(p, e), (d, r)] = f"[{n_train};{n_test}]"
        else:
            df.loc[(p, 0), (d, r)] = np.nan
    return df
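
# A minimal sketch (hypothetical helper, not part of this module) of how the
# "[n_train;n_test]" cell strings built above could be parsed back into integers:
#
#     def parse_split_sizes(cell: str) -> tuple:
#         n_train, n_test = cell.strip("[]").split(";")
#         return int(n_train), int(n_test)
#
#     parse_split_sizes("[350;150]")  # -> (350, 150)
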
if __name__ == "__main__":
    ### MAKE CV TABLE
    datasets = ["1FQG", "UBQT", "TIMB", "MTH3", "BRCA"]
    representations = [
        ONE_HOT,
        EVE_DENSITY,
    ]  # splits are the same for ESM/ProtBert/one-hot, but differ for EVE derivatives
    train_test_splitter = [
        RandomSplitter("1FQG"),
        PositionSplitter("1FQG"),
    ]  # NOTE: the fractional splitter is not included here
    cv_df = extract_cv_information(
        datasets=datasets,
        representations=representations,
        protocols=train_test_splitter,
    )
    # main overview table
    print(cv_df.to_latex(bold_rows=True, multicolumn_format="r"))
    # BioSplitter overview - separate table
    cv_df = extract_cv_information(
        datasets=["TOXI"],
        representations=[ONE_HOT],
        protocols=[
            BioSplitter("TOXI", 1, 1),
            BioSplitter("TOXI", 1, 2),
            BioSplitter("TOXI", 2, 2),
            BioSplitter("TOXI", 2, 3),
            BioSplitter("TOXI", 3, 3),
            BioSplitter("TOXI", 3, 4),
        ],
    )
    print(cv_df.to_latex(bold_rows=True, multicolumn_format="r"))