From a3c357901b2a6769e4614b2456dc04431477cc77 Mon Sep 17 00:00:00 2001 From: Miquel Duran-Frigola Date: Wed, 5 Apr 2023 14:45:44 +0200 Subject: [PATCH 1/8] add ersilia embedding --- config/example_params.json | 1 + install_linux.sh | 5 +- zairachem/descriptors/eosce.py | 63 ++++++ .../from_ersilia_embedding/__init__.py | 1 + .../from_ersilia_embedding/estimate.py | 183 ++++++++++++++++++ .../estimators/from_ersilia_embedding/pipe.py | 9 + zairachem/estimators/pipe.py | 12 ++ .../tools/fpsim2/FPSim2/tests/test_io.py | 19 +- .../macest/macest/classification/models.py | 8 +- .../tools/macest/macest/model_selection.py | 2 +- .../tools/macest/macest/regression/models.py | 4 +- .../tools/macest/macest/regression/plots.py | 2 +- .../molmap/bidd-molmap/molmap/model/cbks2.py | 2 +- zairachem/vars.py | 1 + 14 files changed, 291 insertions(+), 21 deletions(-) create mode 100644 zairachem/descriptors/eosce.py create mode 100644 zairachem/estimators/from_ersilia_embedding/__init__.py create mode 100644 zairachem/estimators/from_ersilia_embedding/estimate.py create mode 100644 zairachem/estimators/from_ersilia_embedding/pipe.py diff --git a/config/example_params.json b/config/example_params.json index 4bea76bc..47fce1d4 100644 --- a/config/example_params.json +++ b/config/example_params.json @@ -13,6 +13,7 @@ "flaml-individual-descriptors", "autogluon-manifolds", "kerastuner-reference-embedding", + "kerastuner-eosce-embedding", "molmap" ] } \ No newline at end of file diff --git a/install_linux.sh b/install_linux.sh index 5f083277..c53f2a0a 100644 --- a/install_linux.sh +++ b/install_linux.sh @@ -31,7 +31,7 @@ python3 -m pip install autogluon.tabular[all]==0.5.2 python3 -m pip install "xgboost==1.3.3" python3 -m pip install "SQLAlchemy<1.4.0" -# install zairachem +# install extra dependencies python3 -m pip install git+https://github.com/chembl/FPSim2.git@0.3.0 python3 -m pip install -q -U keras-tuner==1.1.3 @@ -39,6 +39,9 @@ python3 -m pip install -q -U keras-tuner==1.1.3 python3 -m pip install git+https://github.com/ersilia-os/ersilia.git ersilia --help +# install ersilia compound embedding +python3 -m pip install git+https://github.com/ersilia-os/compound-embedding-lite.git + # install isaura python3 -m pip install git+https://github.com/ersilia-os/isaura.git@ce293244ad0bdd6d7d4f796d2a84b17208a87b56 diff --git a/zairachem/descriptors/eosce.py b/zairachem/descriptors/eosce.py new file mode 100644 index 00000000..90e1da6c --- /dev/null +++ b/zairachem/descriptors/eosce.py @@ -0,0 +1,63 @@ +import os +import pandas as pd +import h5py + +from eosce.models import ErsiliaCompoundEmbedding +from ..utils.matrices import Hdf5 +from .. import ZairaBase + +from ..setup import SMILES_COLUMN +from ..vars import DATA_SUBFOLDER, DATA_FILENAME, DESCRIPTORS_SUBFOLDER + +EOSCE_FILE_NAME = "eosce.h5" + + +class EosceEmbedder(ZairaBase): + def __init__(self): + ZairaBase.__init__(self) + self.model = ErsiliaCompoundEmbedding() + + def calculate(self, smiles_list, output_h5): + X = self.model.transform(smiles_list) + if output_h5 is None: + return X + keys = ["key-{0}".format(i) for i in range(len(smiles_list))] + features = ["feat-{0}".format(i) for i in range(X.shape[1])] + inputs = smiles_list + with h5py.File(output_h5, "w") as f: + f.create_dataset("Keys", data=keys) + f.create_dataset("Features", data=features) + f.create_dataset("Inputs", data=inputs) + f.create_dataset("Values", data=X) + + +class EosceLoader(ZairaBase): + def __init__(self): + ZairaBase.__init__(self) + self.path = self.get_output_dir() + + def open(self, eos_id): + path = os.path.join(self.path, DESCRIPTORS_SUBFOLDER, eos_id, EOSCE_FILE_NAME) + return Hdf5(path) + + +class EosceDescriptors(ZairaBase): + def __init__(self): + ZairaBase.__init__(self) + self.path = self.get_output_dir() + self.input_csv = os.path.join(self.path, DATA_SUBFOLDER, DATA_FILENAME) + self.smiles_list = self._get_smiles_list() + + def _get_smiles_list(self): + df = pd.read_csv(self.input_csv) + return list(df[SMILES_COLUMN]) + + def output_h5_filename(self): + path = os.path.join(self.path, DESCRIPTORS_SUBFOLDER) + os.makedirs(path, exist_ok=True) + return os.path.join(path, EOSCE_FILE_NAME) + + def run(self): + output_h5 = self.output_h5_filename() + ref = EosceEmbedder() + ref.calculate(self.smiles_list, output_h5) diff --git a/zairachem/estimators/from_ersilia_embedding/__init__.py b/zairachem/estimators/from_ersilia_embedding/__init__.py new file mode 100644 index 00000000..1d7b9c64 --- /dev/null +++ b/zairachem/estimators/from_ersilia_embedding/__init__.py @@ -0,0 +1 @@ +ESTIMATORS_FAMILY_SUBFOLDER = "ersilia_embedding" diff --git a/zairachem/estimators/from_ersilia_embedding/estimate.py b/zairachem/estimators/from_ersilia_embedding/estimate.py new file mode 100644 index 00000000..be6c099e --- /dev/null +++ b/zairachem/estimators/from_ersilia_embedding/estimate.py @@ -0,0 +1,183 @@ +import os +import numpy as np +import pandas as pd +import h5py + +from ... import ZairaBase +from ..base import BaseEstimator, BaseOutcomeAssembler +from ...automl.kerastuner import KerasTunerEstimator +from ...vars import ( + DATA_FILENAME, + DATA_SUBFOLDER, + DESCRIPTORS_SUBFOLDER, + ESTIMATORS_SUBFOLDER, +) +from . import ESTIMATORS_FAMILY_SUBFOLDER +from .. import RESULTS_MAPPED_FILENAME, RESULTS_UNMAPPED_FILENAME + + +class XGetter(ZairaBase): + def __init__(self, path): + ZairaBase.__init__(self) + self.path = path + self.X = [] + self.columns = [] + + def _get_eosce_descriptor(self): + with h5py.File( + os.path.join(self.path, DESCRIPTORS_SUBFOLDER, "eosce.h5"), "r" + ) as f: + X_ = f["Values"][:] + self.X += [X_] + self.columns += [ + "feat-{0}".format(x.decode("utf-8")) for x in f["Features"][:] + ] + + def get(self): + self._get_eosce_descriptor() + X = np.hstack(self.X) + df = pd.DataFrame(X, columns=self.columns) + df.to_csv( + os.path.join( + self.path, + ESTIMATORS_SUBFOLDER, + ESTIMATORS_FAMILY_SUBFOLDER, + DATA_FILENAME, + ), + index=False, + ) + return df + + +class Fitter(BaseEstimator): + def __init__(self, path): + BaseEstimator.__init__(self, path=path) + self.trained_path = os.path.join( + self.get_output_dir(), ESTIMATORS_SUBFOLDER, ESTIMATORS_FAMILY_SUBFOLDER + ) + self.x_getter = XGetter + + def _get_X(self): + df = self.x_getter(path=self.path).get() + return df + + def _get_y(self, task): + df = pd.read_csv(os.path.join(self.path, DATA_SUBFOLDER, DATA_FILENAME)) + return np.array(df[task]) + + def _get_Y(self): + Y = [] + columns = [] + for t in self._get_reg_tasks(): + y = self._get_y(t) + Y += [y] + columns += [t] + for t in self._get_clf_tasks(): + y = self._get_y(t) + Y += [y] + columns += [t] + Y = np.array(Y).T + df = pd.DataFrame(Y, columns=columns) + return df + + def run(self, time_budget_sec=None): + self.reset_time() + if time_budget_sec is None: + time_budget_sec = self._estimate_time_budget() + else: + time_budget_sec = time_budget_sec + train_idxs = self.get_train_indices(path=self.path) + df_X = self._get_X() + df_Y = self._get_Y() + df = pd.concat([df_X, df_Y], axis=1) + labels = list(df_Y.columns) + self.logger.debug("Starting KerasTuner estimation") + estimator = KerasTunerEstimator(save_path=self.trained_path) + self.logger.debug("Fitting") + estimator.fit(data=df.iloc[train_idxs, :], labels=labels) + estimator.save() + estimator = estimator.load() + results = estimator.run(df) + self.update_elapsed_time() + return results + + +class Predictor(BaseEstimator): + def __init__(self, path): + BaseEstimator.__init__(self, path=path) + self.trained_path = os.path.join( + self.get_trained_dir(), ESTIMATORS_SUBFOLDER, ESTIMATORS_FAMILY_SUBFOLDER + ) + self.x_getter = XGetter + + def run(self): + self.reset_time() + df = self.x_getter(path=self.path).get() + model = KerasTunerEstimator(save_path=self.trained_path).load() + results = model.run(df) + self.update_elapsed_time() + return results + + +class Assembler(BaseOutcomeAssembler): + def __init__(self, path=None): + BaseOutcomeAssembler.__init__(self, path=path) + + def run(self, df): + df_c = self._get_compounds() + df_y = df + df = pd.concat([df_c, df_y], axis=1) + df.to_csv( + os.path.join( + self.path, + ESTIMATORS_SUBFOLDER, + ESTIMATORS_FAMILY_SUBFOLDER, + RESULTS_UNMAPPED_FILENAME, + ), + index=False, + ) + mappings = self._get_mappings() + df = self._remap(df, mappings) + df.to_csv( + os.path.join( + self.path, + ESTIMATORS_SUBFOLDER, + ESTIMATORS_FAMILY_SUBFOLDER, + RESULTS_MAPPED_FILENAME, + ), + index=False, + ) + + +class Estimator(ZairaBase): + def __init__(self, path=None): + ZairaBase.__init__(self) + if path is None: + self.path = self.get_output_dir() + else: + self.path = path + path_ = os.path.join( + self.path, ESTIMATORS_SUBFOLDER, ESTIMATORS_FAMILY_SUBFOLDER + ) + if not os.path.exists(path_): + os.makedirs(path_, exist_ok=True) + if not self.is_predict(): + self.logger.debug("Starting kerastuner fitter") + self.estimator = Fitter(path=self.path) + else: + self.logger.debug("Starting kerastuner predictor") + self.estimator = Predictor(path=self.path) + self.assembler = Assembler(path=self.path) + + def run(self, time_budget_sec=None): + if time_budget_sec is not None: + self.time_budget_sec = int(time_budget_sec) + else: + self.time_budget_sec = None + if not self.is_predict(): + self.logger.debug("Mode: fit") + results = self.estimator.run() + else: + self.logger.debug("Mode: predict") + results = self.estimator.run() + self.assembler.run(results) diff --git a/zairachem/estimators/from_ersilia_embedding/pipe.py b/zairachem/estimators/from_ersilia_embedding/pipe.py new file mode 100644 index 00000000..89adfda8 --- /dev/null +++ b/zairachem/estimators/from_ersilia_embedding/pipe.py @@ -0,0 +1,9 @@ +from .estimate import Estimator + + +class EosceEmbeddingPipeline(object): + def __init__(self, path): + self.e = Estimator(path=path) + + def run(self, time_budget_sec=None): + self.e.run(time_budget_sec=time_budget_sec) diff --git a/zairachem/estimators/pipe.py b/zairachem/estimators/pipe.py index dddd61f6..1ff45b17 100644 --- a/zairachem/estimators/pipe.py +++ b/zairachem/estimators/pipe.py @@ -9,6 +9,7 @@ from .from_individual_full_descriptors.pipe import IndividualFullDescriptorPipeline from .from_manifolds.pipe import ManifoldPipeline from .from_reference_embedding.pipe import ReferenceEmbeddingPipeline +from .from_ersilia_embedding.pipe import EosceEmbeddingPipeline from .from_molmap.pipe import MolMapPipeline from .evaluate import SimpleEvaluator @@ -107,6 +108,16 @@ def _reference_pipeline(self, time_budget_sec): p.run(time_budget_sec=time_budget_sec) step.update() + def _eosce_pipeline(self, time_budget_sec): + if "kerastuner-eosce-embedding" not in self._estimators_to_use: + return + step = PipelineStep("eosce_pipeline", self.output_dir) + if not step.is_done(): + self.logger.debug("Ersilia compound embedding pipeline") + p = EosceEmbeddingPipeline(path=self.path) + p.run(time_budget_sec=time_budget_sec) + step.update() + def _molmap_pipeline(self, time_budget_sec): if self.is_lazy(): self.logger.info("Lazy mode skips molmap") @@ -136,5 +147,6 @@ def run(self, time_budget_sec=None): self._individual_estimator_pipeline(time_budget_sec) self._manifolds_pipeline(time_budget_sec) self._reference_pipeline(time_budget_sec) + self._eosce_pipeline(time_budget_sec) self._molmap_pipeline(time_budget_sec) self._simple_evaluation() diff --git a/zairachem/tools/fpsim2/FPSim2/tests/test_io.py b/zairachem/tools/fpsim2/FPSim2/tests/test_io.py index 3fc45548..8e681658 100644 --- a/zairachem/tools/fpsim2/FPSim2/tests/test_io.py +++ b/zairachem/tools/fpsim2/FPSim2/tests/test_io.py @@ -62,17 +62,14 @@ def test_get_mol_supplier(): def test_get_bounds_range(): - assert ( - get_bounds_range( - query=fps[0], - threshold=0.7, - a=0, - b=0, - ranges=popcnt_bins, - search_type="tanimoto", - ) - == (0, 10) - ) + assert get_bounds_range( + query=fps[0], + threshold=0.7, + a=0, + b=0, + ranges=popcnt_bins, + search_type="tanimoto", + ) == (0, 10) def test_build_fp(): diff --git a/zairachem/tools/macest/macest/classification/models.py b/zairachem/tools/macest/macest/classification/models.py index ef8b5381..82874d7f 100644 --- a/zairachem/tools/macest/macest/classification/models.py +++ b/zairachem/tools/macest/macest/classification/models.py @@ -207,7 +207,7 @@ def calc_dist_to_neighbours( x_star, k=self._num_neighbours, num_threads=num_threads_available ) ) - class_dist = neighbours[:, 1, :].clip(min=10 ** -15) + class_dist = neighbours[:, 1, :].clip(min=10**-15) class_ind = neighbours[:, 0, :].astype(int) if self.training_preds_by_class is None: raise ValueError("training_preds_by_class has already been cached") @@ -249,7 +249,7 @@ def calc_linear_distance_error_func( :return: A weighted sum of average error and average distance """ dist = self._alpha * np.average( - local_distance.clip(min=10 ** -15), + local_distance.clip(min=10**-15), weights=np.arange(local_distance.shape[1], 0, -1), axis=1, ) @@ -277,7 +277,7 @@ def predict_proba( class_dist, _, class_error = self.calc_dist_to_neighbours(x_star, cls) dist, error = self.calc_linear_distance_error_func(class_dist, class_error) - av_dist_func[i, :] = dist.clip(min=10 ** -10) + error + av_dist_func[i, :] = dist.clip(min=10**-10) + error relative_conf = self._calc_relative_distance_softmax_normalisation(av_dist_func) if change_conflicts: relative_conf = self._renormalise_conf_with_empirical_constant( @@ -308,7 +308,7 @@ def predict_confidence_of_point_prediction( point_prediction_confidence = class_confidence[ np.arange(len(class_confidence)), point_prediction - ].clip(max=1 - 10 ** -15) + ].clip(max=1 - 10**-15) return point_prediction_confidence def _calc_relative_distance_softmax_normalisation( diff --git a/zairachem/tools/macest/macest/model_selection.py b/zairachem/tools/macest/macest/model_selection.py index a137997b..8579c293 100644 --- a/zairachem/tools/macest/macest/model_selection.py +++ b/zairachem/tools/macest/macest/model_selection.py @@ -32,7 +32,7 @@ def __init__( self.shuffle = shuffle self.random_state = random_state self.pp_train_graph_cal_split = pp_train_graph_cal_split - if abs(np.array(self.pp_train_graph_cal_split).sum() - 1.0) > 10 ** -6: + if abs(np.array(self.pp_train_graph_cal_split).sum() - 1.0) > 10**-6: raise ValueError("split of training data must sum to 1") def split( diff --git a/zairachem/tools/macest/macest/regression/models.py b/zairachem/tools/macest/macest/regression/models.py index 90c99910..f82d095d 100644 --- a/zairachem/tools/macest/macest/regression/models.py +++ b/zairachem/tools/macest/macest/regression/models.py @@ -235,7 +235,7 @@ def calc_error_weighted_dist(self, x_star: np.ndarray) -> np.ndarray: "Need to remove pre-cached training neighbour data from training" ) - error_weighted_poly = self._alpha * error_weighted_dist ** self._beta + error_weighted_poly = self._alpha * error_weighted_dist**self._beta return error_weighted_poly def std_on_y_star(self, x_star: np.ndarray) -> np.ndarray: @@ -326,7 +326,7 @@ def calculate_prediction_interval_width( return np.diff(intervals) def sample_prediction( - self, x_star: np.ndarray, nsamples: int = 10 ** 3 + self, x_star: np.ndarray, nsamples: int = 10**3 ) -> np.ndarray: """ Draw samples from any predicted distribution to get a distribution of predictions. diff --git a/zairachem/tools/macest/macest/regression/plots.py b/zairachem/tools/macest/macest/regression/plots.py index 9179b400..191f586f 100644 --- a/zairachem/tools/macest/macest/regression/plots.py +++ b/zairachem/tools/macest/macest/regression/plots.py @@ -58,7 +58,7 @@ def plot_pred_with_conf( :return: None """ - pred_dist = conf_model.sample_prediction(x_star, nsamples=10 ** 4) + pred_dist = conf_model.sample_prediction(x_star, nsamples=10**4) point_pred = np.mean(pred_dist) plt.figure(figsize=(12, 6)) diff --git a/zairachem/tools/molmap/bidd-molmap/molmap/model/cbks2.py b/zairachem/tools/molmap/bidd-molmap/molmap/model/cbks2.py index f831578e..870ef037 100644 --- a/zairachem/tools/molmap/bidd-molmap/molmap/model/cbks2.py +++ b/zairachem/tools/molmap/bidd-molmap/molmap/model/cbks2.py @@ -11,7 +11,7 @@ def r2_score(x, y): pcc, _ = pearsonr(x, y) - return pcc ** 2 + return pcc**2 def prc_auc_score(y_true, y_score): diff --git a/zairachem/vars.py b/zairachem/vars.py index b1039c66..84c633cf 100644 --- a/zairachem/vars.py +++ b/zairachem/vars.py @@ -53,6 +53,7 @@ "flaml-individual-descriptors", "autogluon-manifolds", "kerastuner-reference-embedding", + "kerastuner-eosce-embedding", "molmap", ] From 9a3a5293976ef58634ea296c6ec6b3ca6c6f564d Mon Sep 17 00:00:00 2001 From: GemmaTuron Date: Thu, 6 Apr 2023 08:44:19 +0200 Subject: [PATCH 2/8] bugfixes eosce --- zairachem/descriptors/describe.py | 8 ++++++++ zairachem/descriptors/eosce.py | 4 ++-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/zairachem/descriptors/describe.py b/zairachem/descriptors/describe.py index 2fff69f5..c2075463 100644 --- a/zairachem/descriptors/describe.py +++ b/zairachem/descriptors/describe.py @@ -3,6 +3,7 @@ from .raw import RawDescriptors from .treated import TreatedDescriptors from .reference import ReferenceDescriptors, SimpleDescriptors +from .eosce import EosceDescriptors from .manifolds import Manifolds from .. import ZairaBase @@ -47,6 +48,12 @@ def _reference_descriptors(self): ReferenceDescriptors().run() step.update() + def _eosce_descriptors(self): + step = PipelineStep("eosce_descriptors", self.output_dir) + if not step.is_done(): + EosceDescriptors().run() + step.update() + def _manifolds(self): step = PipelineStep("manifolds", self.output_dir) if not step.is_done(): @@ -58,5 +65,6 @@ def run(self): self._raw_descriptions() self._treated_descriptions() self._reference_descriptors() + self._eosce_descriptors() self._manifolds() self.update_elapsed_time() diff --git a/zairachem/descriptors/eosce.py b/zairachem/descriptors/eosce.py index 90e1da6c..21194f37 100644 --- a/zairachem/descriptors/eosce.py +++ b/zairachem/descriptors/eosce.py @@ -2,7 +2,7 @@ import pandas as pd import h5py -from eosce.models import ErsiliaCompoundEmbedding +from eosce.models import ErsiliaCompoundEmbeddings from ..utils.matrices import Hdf5 from .. import ZairaBase @@ -15,7 +15,7 @@ class EosceEmbedder(ZairaBase): def __init__(self): ZairaBase.__init__(self) - self.model = ErsiliaCompoundEmbedding() + self.model = ErsiliaCompoundEmbeddings() def calculate(self, smiles_list, output_h5): X = self.model.transform(smiles_list) From f567d245e3d262e3658ef113df136bafbc396a31 Mon Sep 17 00:00:00 2001 From: GemmaTuron Date: Thu, 6 Apr 2023 18:10:32 +0200 Subject: [PATCH 3/8] small bugfix --- zairachem/vars.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/zairachem/vars.py b/zairachem/vars.py index 84c633cf..e0f0fc39 100644 --- a/zairachem/vars.py +++ b/zairachem/vars.py @@ -41,11 +41,13 @@ # Ersilia Model Hub ERSILIA_HUB_DEFAULT_MODELS = [ - "morgan-counts", + #"morgan-counts", "cc-signaturizer", + "image-mol-embeddings", + "rdkit-fingerprint", "grover-embedding", "mordred", -] # molbert was removed +] DEFAULT_ESTIMATORS = [ "baseline-classic", From 5abbd09918e598dd7b5a9b02703e47549cdf15fc Mon Sep 17 00:00:00 2001 From: GemmaTuron Date: Sat, 8 Apr 2023 12:14:19 +0200 Subject: [PATCH 4/8] tabpfn balanced class --- zairachem/automl/binarytabpfn.py | 133 ++++++++++ .../assemble.py | 6 +- .../estimate.py | 6 +- .../performance.py | 6 +- .../__init__.py | 1 + .../assemble.py | 101 ++++++++ .../estimate.py | 185 ++++++++++++++ .../performance.py | 233 ++++++++++++++++++ .../pipe.py | 15 ++ zairachem/estimators/pipe.py | 17 ++ .../tools/fpsim2/FPSim2/tests/test_io.py | 19 +- .../macest/macest/classification/models.py | 8 +- .../tools/macest/macest/model_selection.py | 2 +- .../tools/macest/macest/regression/models.py | 4 +- .../tools/macest/macest/regression/plots.py | 2 +- .../molmap/bidd-molmap/molmap/model/cbks2.py | 2 +- zairachem/vars.py | 5 +- 17 files changed, 717 insertions(+), 28 deletions(-) create mode 100644 zairachem/automl/binarytabpfn.py create mode 100644 zairachem/estimators/from_individual_full_descriptors_tabpfn/__init__.py create mode 100644 zairachem/estimators/from_individual_full_descriptors_tabpfn/assemble.py create mode 100644 zairachem/estimators/from_individual_full_descriptors_tabpfn/estimate.py create mode 100644 zairachem/estimators/from_individual_full_descriptors_tabpfn/performance.py create mode 100644 zairachem/estimators/from_individual_full_descriptors_tabpfn/pipe.py diff --git a/zairachem/automl/binarytabpfn.py b/zairachem/automl/binarytabpfn.py new file mode 100644 index 00000000..c24a3b87 --- /dev/null +++ b/zairachem/automl/binarytabpfn.py @@ -0,0 +1,133 @@ +import numpy as np +from lol import LOL +import random +import collections +from tabpfn import TabPFNClassifier +from imblearn.combine import SMOTETomek +from imblearn.over_sampling import KMeansSMOTE +from imblearn.under_sampling import EditedNearestNeighbours +import joblib + + +class TabPFNBinaryClassifier(object): + def __init__(self, device="cpu", N_ensemble_configurations=4): + self.device = device + self.N_ensemble_configurations = N_ensemble_configurations + self.max_samples = 1000 + + def _get_balanced_datasets(self, X, y): + smp = SMOTETomek(sampling_strategy="auto") + X_0, y_0 = smp.fit_resample(X, y) + smp = KMeansSMOTE(sampling_strategy="auto") + X_1, y_1 = smp.fit_resample(X, y) + smp = EditedNearestNeighbours(sampling_strategy="auto") + X_2, y_2 = smp.fit_resample(X, y) + results = [(X_0, y_0), (X_1, y_1), (X_2, y_2)] + return results + + def _cap_samples(self, X, y): + if X.shape[0] <= self.max_samples: + return [(X, y)] + idxs = [i for i in range(X.shape[0])] + R = [] + for _ in range(3): + smp_idxs = random.sample(idxs, self.max_samples) + X_, y_ = X[smp_idxs], y[smp_idxs] + if np.sum(y_) == 0: + continue + R += [(X_, y_)] + return R + + def _get_ensemble(self, X, y): + R = [] + for X_0, y_0 in self._get_balanced_datasets(X, y): + for X_1, y_1 in self._cap_samples(X_0, y_0): + R += [(X_1, y_1)] + return R + + def fit(self, X, y): + self.reducer = LOL(n_components=100) + self.reducer.fit(X, y) + X = self.reducer.transform(X) + self.ensemble = self._get_ensemble(X, y) + + def predict_proba(self, X): + model = TabPFNClassifier( + device=self.device, N_ensemble_configurations=self.N_ensemble_configurations + ) + X = self.reducer.transform(X) + R = [] + for X_tr, y_tr in self.ensemble: + # print(X_tr.shape, np.sum(y_tr)) + model.fit(X_tr, y_tr) + R += [model.predict_proba(X)[:, 1]] + model.remove_models_from_memory() + R = np.array(R).T + y_h1 = np.mean(R, axis=1) + y_h0 = 1 - y_h1 + y_h = np.array([y_h0, y_h1]).T + return y_h + + def save(self, file_name): + data = { + "device": self.device, + "N_ensemble_configurations": self.N_ensemble_configurations, + "reducer": self.reducer, + "ensemble": self.ensemble, + } + joblib.dump(data, file_name) + + def load(self, file_name): + data = joblib.load(file_name) + model = TabPFNBinaryClassifier( + device=data["device"], + N_ensemble_configurations=data["N_ensemble_configurations"], + ) + model.ensemble = data["ensemble"] + model.reducer = data["reducer"] + return TabPFNClassifierArtifact(model, 0.5) + + +class Binarizer(object): + def __init__(self, threshold): + self.threshold = threshold + + def binarize(self, y_hat): + y_bin = [] + for y in y_hat: + if y > self.threshold: + y_bin += [1] + else: + y_bin += [0] + return np.array(y_bin, dtype=np.uint8) + + +class TabPFNClassifierArtifact(object): + def __init__(self, model, threshold): + self.model = model + self.threshold = threshold + if threshold is not None: + self.binarizer = Binarizer(self.threshold) + else: + self.binarizer = None + + def predict_proba(self, X): + return self.model.predict_proba(X)[:, 1] + + def predict(self, X): + if self.binarizer is not None: + y_hat = self.predict_proba(X) + y_bin = self.binarizer.binarize(y_hat) + else: + y_bin = self.model.predict(X) + return y_bin + + def run(self, X, y=None): + results = collections.OrderedDict() + results["main"] = { + "idxs": None, + "y": y, + "y_hat": self.predict_proba(X), + "b_hat": self.predict(X), + } + return results diff --git a/zairachem/estimators/from_individual_full_descriptors/assemble.py b/zairachem/estimators/from_individual_full_descriptors/assemble.py index 5b2b7cbc..ba4acda1 100644 --- a/zairachem/estimators/from_individual_full_descriptors/assemble.py +++ b/zairachem/estimators/from_individual_full_descriptors/assemble.py @@ -80,7 +80,7 @@ def _get_model_ids(self): os.path.join(path_trained, DESCRIPTORS_SUBFOLDER, "done_eos.json"), "r" ) as f: model_ids = list(json.load(f)) - model_ids_successfull = [] + model_ids_successful = [] for model_id in model_ids: if os.path.isfile( os.path.join( @@ -91,8 +91,8 @@ def _get_model_ids(self): "y_hat.joblib", ) ): - model_ids_successfull += [model_id] - return model_ids_successfull + model_ids_successful += [model_id] + return model_ids_successful def run(self): model_ids = self._get_model_ids() diff --git a/zairachem/estimators/from_individual_full_descriptors/estimate.py b/zairachem/estimators/from_individual_full_descriptors/estimate.py index a047383e..e96906a6 100644 --- a/zairachem/estimators/from_individual_full_descriptors/estimate.py +++ b/zairachem/estimators/from_individual_full_descriptors/estimate.py @@ -244,13 +244,13 @@ def _get_model_ids(self): os.path.join(path_trained, DESCRIPTORS_SUBFOLDER, "done_eos.json"), "r" ) as f: model_ids = list(json.load(f)) - model_ids_successfull = [] + model_ids_successful = [] for model_id in model_ids: if os.path.isfile( os.path.join(path, DESCRIPTORS_SUBFOLDER, model_id, "treated.h5") ): - model_ids_successfull += [model_id] - return model_ids_successfull + model_ids_successful += [model_id] + return model_ids_successful def run(self, time_budget_sec=None): model_ids = self._get_model_ids() diff --git a/zairachem/estimators/from_individual_full_descriptors/performance.py b/zairachem/estimators/from_individual_full_descriptors/performance.py index 4bcd91b5..d067456f 100644 --- a/zairachem/estimators/from_individual_full_descriptors/performance.py +++ b/zairachem/estimators/from_individual_full_descriptors/performance.py @@ -212,7 +212,7 @@ def _get_model_ids(self): os.path.join(path_trained, DESCRIPTORS_SUBFOLDER, "done_eos.json"), "r" ) as f: model_ids = list(json.load(f)) - model_ids_successfull = [] + model_ids_successful = [] for model_id in model_ids: if os.path.isfile( os.path.join( @@ -223,8 +223,8 @@ def _get_model_ids(self): "y_hat.joblib", ) ): - model_ids_successfull += [model_id] - return model_ids_successfull + model_ids_successful += [model_id] + return model_ids_successful def run(self): model_ids = self._get_model_ids() diff --git a/zairachem/estimators/from_individual_full_descriptors_tabpfn/__init__.py b/zairachem/estimators/from_individual_full_descriptors_tabpfn/__init__.py new file mode 100644 index 00000000..856581cd --- /dev/null +++ b/zairachem/estimators/from_individual_full_descriptors_tabpfn/__init__.py @@ -0,0 +1 @@ +ESTIMATORS_FAMILY_SUBFOLDER = "individual_full_descriptors_tabpfn" diff --git a/zairachem/estimators/from_individual_full_descriptors_tabpfn/assemble.py b/zairachem/estimators/from_individual_full_descriptors_tabpfn/assemble.py new file mode 100644 index 00000000..ba4acda1 --- /dev/null +++ b/zairachem/estimators/from_individual_full_descriptors_tabpfn/assemble.py @@ -0,0 +1,101 @@ +import pandas as pd +import json +import os +import joblib +import collections + +from . import ESTIMATORS_FAMILY_SUBFOLDER +from ... import ZairaBase +from ...vars import DESCRIPTORS_SUBFOLDER, ESTIMATORS_SUBFOLDER +from .. import Y_HAT_FILE, RESULTS_UNMAPPED_FILENAME, RESULTS_MAPPED_FILENAME +from ..base import BaseOutcomeAssembler + + +class IndividualOutcomeAssembler(BaseOutcomeAssembler): + def __init__(self, path=None, model_id=None): + BaseOutcomeAssembler.__init__(self, path=path) + self.model_id = model_id + + def _get_y_hat(self): + results = joblib.load( + os.path.join( + self.path, + ESTIMATORS_SUBFOLDER, + ESTIMATORS_FAMILY_SUBFOLDER, + self.model_id, + Y_HAT_FILE, + ) + ) + data = collections.OrderedDict() + for c, r in results.items(): + r = r["main"] + data[c] = r["y_hat"] + if "b_hat" in r: + data[c + "_bin"] = r["b_hat"] + return pd.DataFrame(data) + + def run(self): + df_c = self._get_compounds() + df_y = self._get_y_hat() + df = pd.concat([df_c, df_y], axis=1) + df.to_csv( + os.path.join( + self.path, + ESTIMATORS_SUBFOLDER, + ESTIMATORS_FAMILY_SUBFOLDER, + self.model_id, + RESULTS_UNMAPPED_FILENAME, + ), + index=False, + ) + mappings = self._get_mappings() + df = self._remap(df, mappings) + df.to_csv( + os.path.join( + self.path, + ESTIMATORS_SUBFOLDER, + ESTIMATORS_FAMILY_SUBFOLDER, + self.model_id, + RESULTS_MAPPED_FILENAME, + ), + index=False, + ) + + +class OutcomeAssembler(ZairaBase): + def __init__(self, path=None): + ZairaBase.__init__(self) + self.path = path + + def _get_model_ids(self): + if self.path is None: + path = self.get_output_dir() + else: + path = self.path + if self.is_predict(): + path_trained = self.get_trained_dir() + else: + path_trained = path + with open( + os.path.join(path_trained, DESCRIPTORS_SUBFOLDER, "done_eos.json"), "r" + ) as f: + model_ids = list(json.load(f)) + model_ids_successful = [] + for model_id in model_ids: + if os.path.isfile( + os.path.join( + path, + ESTIMATORS_SUBFOLDER, + ESTIMATORS_FAMILY_SUBFOLDER, + model_id, + "y_hat.joblib", + ) + ): + model_ids_successful += [model_id] + return model_ids_successful + + def run(self): + model_ids = self._get_model_ids() + for model_id in model_ids: + o = IndividualOutcomeAssembler(path=self.path, model_id=model_id) + o.run() diff --git a/zairachem/estimators/from_individual_full_descriptors_tabpfn/estimate.py b/zairachem/estimators/from_individual_full_descriptors_tabpfn/estimate.py new file mode 100644 index 00000000..6f9c9f4f --- /dev/null +++ b/zairachem/estimators/from_individual_full_descriptors_tabpfn/estimate.py @@ -0,0 +1,185 @@ +import os +import json +import h5py +import pandas as pd +import numpy as np +import collections +import joblib + +from ...descriptors.treated import TREATED_FILE_NAME + +from ... import ZairaBase +from ...automl.binarytabpfn import TabPFNBinaryClassifier + +from ...vars import ( + DESCRIPTORS_SUBFOLDER, + DATA_SUBFOLDER, + DATA_FILENAME, + ESTIMATORS_SUBFOLDER, +) +from ..base import BaseEstimator + +from .. import Y_HAT_FILE +from . import ESTIMATORS_FAMILY_SUBFOLDER + + +class BaseEstimatorIndividual(BaseEstimator): + def __init__(self, path, model_id): + BaseEstimator.__init__(self, path=path) + path_ = os.path.join( + self.path, ESTIMATORS_SUBFOLDER, ESTIMATORS_FAMILY_SUBFOLDER, model_id + ) + if not os.path.exists(path_): + os.makedirs(path_) + self.model_id = model_id + + def _get_X(self): + f = os.path.join( + self.path, DESCRIPTORS_SUBFOLDER, self.model_id, TREATED_FILE_NAME + ) + with h5py.File(f, "r") as f: + X = f["Values"][:] + return X + + +class Fitter(BaseEstimatorIndividual): + def __init__(self, path, model_id): + BaseEstimatorIndividual.__init__(self, path=path, model_id=model_id) + self.trained_path = os.path.join( + self.get_output_dir(), ESTIMATORS_SUBFOLDER, ESTIMATORS_FAMILY_SUBFOLDER + ) + + def _get_flds(self): + # for now only auxiliary folds are used + col = [f for f in self.schema["folds"] if "_aux" in f][0] + df = pd.read_csv(os.path.join(self.path, DATA_SUBFOLDER, DATA_FILENAME)) + return np.array(df[col]) + + def _get_y(self, task): + # for now iterate task by task + df = pd.read_csv(os.path.join(self.path, DATA_SUBFOLDER, DATA_FILENAME)) + return np.array(df[task]) + + def run(self, time_budget_sec=60): + self.reset_time() + if time_budget_sec is None: + time_budget_sec = self._estimate_time_budget() + else: + time_budget_sec = time_budget_sec + tasks = collections.OrderedDict() + X = self._get_X() + train_idxs = self.get_train_indices(path=self.path) + valid_idxs = self.get_validation_indices(path=self.path) + for t in self._get_clf_tasks(): + y = self._get_y(t) + model = TabPFNBinaryClassifier() + model.fit(X[train_idxs], y[train_idxs]) + file_name = os.path.join(self.trained_path, self.model_id, t + ".joblib") + model.save(file_name) + model = model.load(file_name) + tasks[t] = model.run(X, y) + _valid_task = model.run(X[valid_idxs], y[valid_idxs]) + tasks[t]["valid"] = _valid_task["main"] + self.update_elapsed_time() + return tasks + + +class Predictor(BaseEstimatorIndividual): + def __init__(self, path, model_id): + BaseEstimatorIndividual.__init__(self, path=path, model_id=model_id) + self.trained_path = os.path.join( + self.get_trained_dir(), ESTIMATORS_SUBFOLDER, ESTIMATORS_FAMILY_SUBFOLDER + ) + + def _get_y(self, task): + # for now iterate task by task + df = pd.read_csv(os.path.join(self.path, DATA_SUBFOLDER, DATA_FILENAME)) + columns = set(df.columns) + if task in columns: + return np.array(df[task]) + else: + return None + + def run(self): + self.reset_time() + tasks = collections.OrderedDict() + X = self._get_X() + for t in self._get_clf_tasks(): + y = self._get_y(t) + model = TabPFNBinaryClassifier() + file_name = os.path.join(self.trained_path, self.model_id, t + ".joblib") + model = model.load(file_name) + tasks[t] = model.run(X, y) + self.update_elapsed_time() + return tasks + + +class IndividualEstimator(ZairaBase): + def __init__(self, path=None, model_id=None): + ZairaBase.__init__(self) + self.model_id = model_id + if path is None: + self.path = self.get_output_dir() + else: + self.path = path + if not self.is_predict(): + self.estimator = Fitter(path=self.path, model_id=self.model_id) + else: + self.estimator = Predictor(path=self.path, model_id=self.model_id) + + def run(self, time_budget_sec=None): + if time_budget_sec is not None: + self.time_budget_sec = int(time_budget_sec) + else: + self.time_budget_sec = None + if not self.is_predict(): + results = self.estimator.run(time_budget_sec=self.time_budget_sec) + else: + results = self.estimator.run() + joblib.dump( + results, + os.path.join( + self.path, + ESTIMATORS_SUBFOLDER, + ESTIMATORS_FAMILY_SUBFOLDER, + self.model_id, + Y_HAT_FILE, + ), + ) + + +class Estimator(ZairaBase): + def __init__(self, path=None): + ZairaBase.__init__(self) + self.path = path + + def _get_model_ids(self): + if self.path is None: + path = self.get_output_dir() + else: + path = self.path + if self.is_predict(): + path_trained = self.get_trained_dir() + else: + path_trained = path + with open( + os.path.join(path_trained, DESCRIPTORS_SUBFOLDER, "done_eos.json"), "r" + ) as f: + model_ids = list(json.load(f)) + model_ids_successful = [] + for model_id in model_ids: + if os.path.isfile( + os.path.join(path, DESCRIPTORS_SUBFOLDER, model_id, "treated.h5") + ): + model_ids_successful += [model_id] + return model_ids_successful + + def run(self, time_budget_sec=None): + model_ids = self._get_model_ids() + if time_budget_sec is not None: + tbs = max(int(time_budget_sec / len(model_ids)), 1) + else: + tbs = None + for model_id in model_ids: + estimator = IndividualEstimator(path=self.path, model_id=model_id) + estimator.run(time_budget_sec=tbs) diff --git a/zairachem/estimators/from_individual_full_descriptors_tabpfn/performance.py b/zairachem/estimators/from_individual_full_descriptors_tabpfn/performance.py new file mode 100644 index 00000000..d067456f --- /dev/null +++ b/zairachem/estimators/from_individual_full_descriptors_tabpfn/performance.py @@ -0,0 +1,233 @@ +import os +import json +import numpy as np +import pandas as pd +import joblib +import collections + +from sklearn import metrics + +from .. import Y_HAT_FILE +from ... import ZairaBase + +from . import ESTIMATORS_FAMILY_SUBFOLDER +from ...vars import ( + DATA_SUBFOLDER, + DESCRIPTORS_SUBFOLDER, + ESTIMATORS_SUBFOLDER, + DATA_FILENAME, +) + +from .. import CLF_REPORT_FILENAME, REG_REPORT_FILENAME + + +class BasePerformance(ZairaBase): + def __init__(self, path=None, model_id=None): + ZairaBase.__init__(self) + if path is None: + self.path = self.get_output_dir() + else: + self.path = path + self.model_id = model_id + + def _get_y_hat_dict(self): + return joblib.load( + os.path.join( + self.path, + ESTIMATORS_SUBFOLDER, + ESTIMATORS_FAMILY_SUBFOLDER, + self.model_id, + Y_HAT_FILE, + ) + ) + + +class ClassificationPerformance(BasePerformance): + def __init__(self, path, model_id): + BasePerformance.__init__(self, path=path, model_id=model_id) + self.results = self._get_y_hat_dict() + self._prefix = self._get_prefix() + self.results = self.results[self._prefix] + + def _get_prefix(self): + for c in list(self.results.keys()): + if "clf_" in c: + return c + + def _try_metric(self, fun, t, p): + try: + return float(fun(t, p)) + except: + return None + + def _calculate(self, key): + r = self.results[key] + y_true = np.array(r["y"]) + y_pred = np.array(r["y_hat"]) + b_pred = np.array(r["b_hat"]) + try: + confu = metrics.confusion_matrix(y_true, b_pred, labels=[0, 1]) + except: + confu = np.array([[-1, -1], [-1, -1]]) + report = { + "roc_auc_score": self._try_metric(metrics.roc_auc_score, y_true, y_pred), + "precision_score": self._try_metric( + metrics.precision_score, y_true, b_pred + ), + "recall_score": self._try_metric(metrics.recall_score, y_true, b_pred), + "tp": int(confu[1, 1]), + "tn": int(confu[0, 0]), + "fp": int(confu[0, 1]), + "fn": int(confu[1, 0]), + "y_true": [int(y) for y in y_true], + "y_pred": [float(y) for y in y_pred], + "b_pred": [int(y) for y in b_pred], + } + return report + + def calculate(self): + report = collections.OrderedDict() + for k in self.results.keys(): + report[k] = self._calculate(k) + return report + + +class RegressionPerformance(BasePerformance): + def __init__(self, path, model_id): + BasePerformance.__init__(self, path=path, model_id=model_id) + self.results = self._get_y_hat_dict() + self._prefix = self._get_prefix() + self.results = self.results[self._prefix] + + def _get_prefix(self): + for c in list(self.results.keys()): + if "reg_" in c: + return c + + def _calculate(self, key): + r = self.results[key] + y_true = np.array(r["y"]) + y_pred = np.array(r["y_hat"]) + report = { + "r2_score": float(metrics.r2_score(y_true, y_pred)), + "mean_absolute_error": float(metrics.mean_absolute_error(y_true, y_pred)), + "mean_squared_error": float(metrics.mean_squared_error(y_true, y_pred)), + "y_true": [float(y) for y in y_true], + "y_pred": [float(y) for y in y_pred], + } + return report + + def calculate(self): + report = collections.OrderedDict() + for k in self.results.keys(): + report[k] = self._calculate(k) + return report + + +class IndividualPerformanceReporter(ZairaBase): + def __init__(self, path=None, model_id=None): + ZairaBase.__init__(self) + if path is None: + self.path = self.get_output_dir() + else: + self.path = path + self.has_tasks = self._has_tasks() + if self._has_clf_tasks(): + self.clf = ClassificationPerformance(path=path, model_id=model_id) + else: + self.clf = None + if self._has_reg_tasks(): + self.reg = RegressionPerformance(path=path, model_id=model_id) + else: + self.reg = None + self.model_id = model_id + + def _has_tasks(self): + df = pd.read_csv(os.path.join(self.path, DATA_SUBFOLDER, DATA_FILENAME)) + for c in list(df.columns): + if "clf_" in c or "reg_" in c: + return True + return False + + def _has_reg_tasks(self): + df = pd.read_csv(os.path.join(self.path, DATA_SUBFOLDER, DATA_FILENAME)) + for c in list(df.columns): + if "reg_" in c and "_skip" not in c and "_aux" not in c: + return True + return False + + def _has_clf_tasks(self): + df = pd.read_csv(os.path.join(self.path, DATA_SUBFOLDER, DATA_FILENAME)) + for c in list(df.columns): + if "clf_" in c and "_skip" not in c and "_aux" not in c: + return True + return False + + def run(self): + if not self.has_tasks: + return + if self.clf is not None: + clf_rep = self.clf.calculate() + with open( + os.path.join( + self.path, + ESTIMATORS_SUBFOLDER, + ESTIMATORS_FAMILY_SUBFOLDER, + self.model_id, + CLF_REPORT_FILENAME, + ), + "w", + ) as f: + json.dump(clf_rep, f, indent=4) + if self.reg is not None: + reg_rep = self.reg.calculate() + with open( + os.path.join( + self.path, + ESTIMATORS_SUBFOLDER, + ESTIMATORS_FAMILY_SUBFOLDER, + self.model_id, + REG_REPORT_FILENAME, + ), + "w", + ) as f: + json.dump(reg_rep, f, indent=4) + + +class PerformanceReporter(ZairaBase): + def __init__(self, path=None): + ZairaBase.__init__(self) + self.path = path + + def _get_model_ids(self): + if self.path is None: + path = self.get_output_dir() + else: + path = self.path + if self.is_predict(): + path_trained = self.get_trained_dir() + else: + path_trained = path + with open( + os.path.join(path_trained, DESCRIPTORS_SUBFOLDER, "done_eos.json"), "r" + ) as f: + model_ids = list(json.load(f)) + model_ids_successful = [] + for model_id in model_ids: + if os.path.isfile( + os.path.join( + path, + ESTIMATORS_SUBFOLDER, + ESTIMATORS_FAMILY_SUBFOLDER, + model_id, + "y_hat.joblib", + ) + ): + model_ids_successful += [model_id] + return model_ids_successful + + def run(self): + model_ids = self._get_model_ids() + for model_id in model_ids: + p = IndividualPerformanceReporter(path=self.path, model_id=model_id) + p.run() diff --git a/zairachem/estimators/from_individual_full_descriptors_tabpfn/pipe.py b/zairachem/estimators/from_individual_full_descriptors_tabpfn/pipe.py new file mode 100644 index 00000000..8e397fbf --- /dev/null +++ b/zairachem/estimators/from_individual_full_descriptors_tabpfn/pipe.py @@ -0,0 +1,15 @@ +from .estimate import Estimator +from .assemble import OutcomeAssembler +from .performance import PerformanceReporter + + +class IndividualFullDescriptorTabPFNPipeline(object): + def __init__(self, path): + self.e = Estimator(path=path) + self.a = OutcomeAssembler(path=path) + self.p = PerformanceReporter(path=path) + + def run(self, time_budget_sec=None): + self.e.run(time_budget_sec=time_budget_sec) + self.a.run() + self.p.run() diff --git a/zairachem/estimators/pipe.py b/zairachem/estimators/pipe.py index 1ff45b17..fe658a71 100644 --- a/zairachem/estimators/pipe.py +++ b/zairachem/estimators/pipe.py @@ -7,6 +7,9 @@ from .from_classic.pipe import ClassicPipeline from .from_fingerprint.pipe import FingerprintPipeline from .from_individual_full_descriptors.pipe import IndividualFullDescriptorPipeline +from .from_individual_full_descriptors_tabpfn.pipe import ( + IndividualFullDescriptorTabPFNPipeline, +) from .from_manifolds.pipe import ManifoldPipeline from .from_reference_embedding.pipe import ReferenceEmbeddingPipeline from .from_ersilia_embedding.pipe import EosceEmbeddingPipeline @@ -82,6 +85,19 @@ def _individual_estimator_pipeline(self, time_budget_sec): p.run(time_budget_sec=time_budget_sec) step.update() + def _individual_estimator_tabpfn_pipeline(self, time_budget_sec): + if self.is_lazy(): + self.logger.info("Lazy mode skips individual descriptors with tabpfn") + return + if "tabpfn-individual-descriptors" not in self._estimators_to_use: + return + step = PipelineStep("individual_estimator_pipeline_tabpfn", self.output_dir) + if not step.is_done(): + self.logger.debug("Running individual estimator pipeline") + p = IndividualFullDescriptorTabPFNPipeline(path=self.path) + p.run(time_budget_sec=time_budget_sec) + step.update() + def _manifolds_pipeline(self, time_budget_sec): if self.is_lazy(): self.logger.info("Lazy mode skips manifolds") @@ -144,6 +160,7 @@ def _simple_evaluation(self): def run(self, time_budget_sec=None): self._classic_estimator_pipeline(time_budget_sec) self._fingerprint_estimator_pipeline(time_budget_sec) + self._individual_estimator_tabpfn_pipeline(time_budget_sec) self._individual_estimator_pipeline(time_budget_sec) self._manifolds_pipeline(time_budget_sec) self._reference_pipeline(time_budget_sec) diff --git a/zairachem/tools/fpsim2/FPSim2/tests/test_io.py b/zairachem/tools/fpsim2/FPSim2/tests/test_io.py index 8e681658..3fc45548 100644 --- a/zairachem/tools/fpsim2/FPSim2/tests/test_io.py +++ b/zairachem/tools/fpsim2/FPSim2/tests/test_io.py @@ -62,14 +62,17 @@ def test_get_mol_supplier(): def test_get_bounds_range(): - assert get_bounds_range( - query=fps[0], - threshold=0.7, - a=0, - b=0, - ranges=popcnt_bins, - search_type="tanimoto", - ) == (0, 10) + assert ( + get_bounds_range( + query=fps[0], + threshold=0.7, + a=0, + b=0, + ranges=popcnt_bins, + search_type="tanimoto", + ) + == (0, 10) + ) def test_build_fp(): diff --git a/zairachem/tools/macest/macest/classification/models.py b/zairachem/tools/macest/macest/classification/models.py index 82874d7f..ef8b5381 100644 --- a/zairachem/tools/macest/macest/classification/models.py +++ b/zairachem/tools/macest/macest/classification/models.py @@ -207,7 +207,7 @@ def calc_dist_to_neighbours( x_star, k=self._num_neighbours, num_threads=num_threads_available ) ) - class_dist = neighbours[:, 1, :].clip(min=10**-15) + class_dist = neighbours[:, 1, :].clip(min=10 ** -15) class_ind = neighbours[:, 0, :].astype(int) if self.training_preds_by_class is None: raise ValueError("training_preds_by_class has already been cached") @@ -249,7 +249,7 @@ def calc_linear_distance_error_func( :return: A weighted sum of average error and average distance """ dist = self._alpha * np.average( - local_distance.clip(min=10**-15), + local_distance.clip(min=10 ** -15), weights=np.arange(local_distance.shape[1], 0, -1), axis=1, ) @@ -277,7 +277,7 @@ def predict_proba( class_dist, _, class_error = self.calc_dist_to_neighbours(x_star, cls) dist, error = self.calc_linear_distance_error_func(class_dist, class_error) - av_dist_func[i, :] = dist.clip(min=10**-10) + error + av_dist_func[i, :] = dist.clip(min=10 ** -10) + error relative_conf = self._calc_relative_distance_softmax_normalisation(av_dist_func) if change_conflicts: relative_conf = self._renormalise_conf_with_empirical_constant( @@ -308,7 +308,7 @@ def predict_confidence_of_point_prediction( point_prediction_confidence = class_confidence[ np.arange(len(class_confidence)), point_prediction - ].clip(max=1 - 10**-15) + ].clip(max=1 - 10 ** -15) return point_prediction_confidence def _calc_relative_distance_softmax_normalisation( diff --git a/zairachem/tools/macest/macest/model_selection.py b/zairachem/tools/macest/macest/model_selection.py index 8579c293..a137997b 100644 --- a/zairachem/tools/macest/macest/model_selection.py +++ b/zairachem/tools/macest/macest/model_selection.py @@ -32,7 +32,7 @@ def __init__( self.shuffle = shuffle self.random_state = random_state self.pp_train_graph_cal_split = pp_train_graph_cal_split - if abs(np.array(self.pp_train_graph_cal_split).sum() - 1.0) > 10**-6: + if abs(np.array(self.pp_train_graph_cal_split).sum() - 1.0) > 10 ** -6: raise ValueError("split of training data must sum to 1") def split( diff --git a/zairachem/tools/macest/macest/regression/models.py b/zairachem/tools/macest/macest/regression/models.py index f82d095d..90c99910 100644 --- a/zairachem/tools/macest/macest/regression/models.py +++ b/zairachem/tools/macest/macest/regression/models.py @@ -235,7 +235,7 @@ def calc_error_weighted_dist(self, x_star: np.ndarray) -> np.ndarray: "Need to remove pre-cached training neighbour data from training" ) - error_weighted_poly = self._alpha * error_weighted_dist**self._beta + error_weighted_poly = self._alpha * error_weighted_dist ** self._beta return error_weighted_poly def std_on_y_star(self, x_star: np.ndarray) -> np.ndarray: @@ -326,7 +326,7 @@ def calculate_prediction_interval_width( return np.diff(intervals) def sample_prediction( - self, x_star: np.ndarray, nsamples: int = 10**3 + self, x_star: np.ndarray, nsamples: int = 10 ** 3 ) -> np.ndarray: """ Draw samples from any predicted distribution to get a distribution of predictions. diff --git a/zairachem/tools/macest/macest/regression/plots.py b/zairachem/tools/macest/macest/regression/plots.py index 191f586f..9179b400 100644 --- a/zairachem/tools/macest/macest/regression/plots.py +++ b/zairachem/tools/macest/macest/regression/plots.py @@ -58,7 +58,7 @@ def plot_pred_with_conf( :return: None """ - pred_dist = conf_model.sample_prediction(x_star, nsamples=10**4) + pred_dist = conf_model.sample_prediction(x_star, nsamples=10 ** 4) point_pred = np.mean(pred_dist) plt.figure(figsize=(12, 6)) diff --git a/zairachem/tools/molmap/bidd-molmap/molmap/model/cbks2.py b/zairachem/tools/molmap/bidd-molmap/molmap/model/cbks2.py index 870ef037..f831578e 100644 --- a/zairachem/tools/molmap/bidd-molmap/molmap/model/cbks2.py +++ b/zairachem/tools/molmap/bidd-molmap/molmap/model/cbks2.py @@ -11,7 +11,7 @@ def r2_score(x, y): pcc, _ = pearsonr(x, y) - return pcc**2 + return pcc ** 2 def prc_auc_score(y_true, y_score): diff --git a/zairachem/vars.py b/zairachem/vars.py index e0f0fc39..bc6812c1 100644 --- a/zairachem/vars.py +++ b/zairachem/vars.py @@ -41,18 +41,19 @@ # Ersilia Model Hub ERSILIA_HUB_DEFAULT_MODELS = [ - #"morgan-counts", + # "morgan-counts", "cc-signaturizer", "image-mol-embeddings", "rdkit-fingerprint", "grover-embedding", "mordred", -] +] DEFAULT_ESTIMATORS = [ "baseline-classic", "baseline-fingerprint", "flaml-individual-descriptors", + "tabpfn-individual-descriptors", "autogluon-manifolds", "kerastuner-reference-embedding", "kerastuner-eosce-embedding", From 340b52a2827008eed00b6c1f51ab9273703e1ffd Mon Sep 17 00:00:00 2001 From: gemmaturon Date: Sun, 9 Apr 2023 11:16:41 +0200 Subject: [PATCH 5/8] small bugfix --- install_linux.sh | 6 ++++++ zairachem/automl/binarytabpfn.py | 21 +++++++++++++++------ 2 files changed, 21 insertions(+), 6 deletions(-) diff --git a/install_linux.sh b/install_linux.sh index c53f2a0a..abfd1fce 100644 --- a/install_linux.sh +++ b/install_linux.sh @@ -54,5 +54,11 @@ python3 -m pip install git+https://github.com/ersilia-os/lazy-qsar.git # install melloddy-tuner python3 -m pip install git+https://github.com/melloddy/MELLODDY-TUNER.git@2.1.3 +# install tabpfn +python3 -m pip install tabpfn==0.1.8 + +# install imblearn +python3 -m pip install imbalanced-learn==0.10.1 + # install zairachem python3 -m pip install -e . diff --git a/zairachem/automl/binarytabpfn.py b/zairachem/automl/binarytabpfn.py index c24a3b87..aad81454 100644 --- a/zairachem/automl/binarytabpfn.py +++ b/zairachem/automl/binarytabpfn.py @@ -16,12 +16,21 @@ def __init__(self, device="cpu", N_ensemble_configurations=4): self.max_samples = 1000 def _get_balanced_datasets(self, X, y): - smp = SMOTETomek(sampling_strategy="auto") - X_0, y_0 = smp.fit_resample(X, y) - smp = KMeansSMOTE(sampling_strategy="auto") - X_1, y_1 = smp.fit_resample(X, y) - smp = EditedNearestNeighbours(sampling_strategy="auto") - X_2, y_2 = smp.fit_resample(X, y) + try: + smp = SMOTETomek(sampling_strategy="auto") + X_0, y_0 = smp.fit_resample(X, y) + except: + X_0, y_0 = X, y + try: + smp = KMeansSMOTE(sampling_strategy="auto") + X_1, y_1 = smp.fit_resample(X, y) + except: + X_1, y_1 = X, y + try: + smp = EditedNearestNeighbours(sampling_strategy="auto") + X_2, y_2 = smp.fit_resample(X, y) + except: + X_2, y_2 = X, y results = [(X_0, y_0), (X_1, y_1), (X_2, y_2)] return results From 5abd23780f9e38fab4583194c8a295c9bb10bdf5 Mon Sep 17 00:00:00 2001 From: gemmaturon Date: Fri, 14 Apr 2023 16:55:22 +0200 Subject: [PATCH 6/8] molfeat-chemgpt --- zairachem/vars.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/zairachem/vars.py b/zairachem/vars.py index bc6812c1..fc229d10 100644 --- a/zairachem/vars.py +++ b/zairachem/vars.py @@ -43,7 +43,7 @@ ERSILIA_HUB_DEFAULT_MODELS = [ # "morgan-counts", "cc-signaturizer", - "image-mol-embeddings", + "molfeat-chemgpt", "rdkit-fingerprint", "grover-embedding", "mordred", From 8af2372225e4a955619f006100e311bad436eb85 Mon Sep 17 00:00:00 2001 From: gemmaturon Date: Mon, 24 Apr 2023 19:31:42 +0200 Subject: [PATCH 7/8] bagger not constrained to 1 --- zairachem/pool/bagger.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/zairachem/pool/bagger.py b/zairachem/pool/bagger.py index e1a3e7c9..759248eb 100644 --- a/zairachem/pool/bagger.py +++ b/zairachem/pool/bagger.py @@ -230,7 +230,7 @@ def _predict_scaling(self, df_X): w = w + 1 y_hats += [np.average(Y_hat, axis=1, weights=w)] y_hats += [np.mean(Y_hat, axis=1)] - y_hats = np.clip(np.array(y_hats), 0, 1) + y_hats = np.array(y_hats) y_hat = np.mean(y_hats, axis=0) return y_hat From ef4c0135bfffe55d1a397684f32e96db33ca212f Mon Sep 17 00:00:00 2001 From: gemmaturon Date: Wed, 26 Apr 2023 19:48:03 +0200 Subject: [PATCH 8/8] new rescaling --- zairachem/pool/bagger.py | 40 +++++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/zairachem/pool/bagger.py b/zairachem/pool/bagger.py index 759248eb..535c33d0 100644 --- a/zairachem/pool/bagger.py +++ b/zairachem/pool/bagger.py @@ -5,9 +5,10 @@ import joblib import h5py import collections +from scipy.special import expit from sklearn.linear_model import LogisticRegressionCV, LinearRegression from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor -from sklearn.preprocessing import RobustScaler +from sklearn.preprocessing import RobustScaler, PowerTransformer from sklearn.metrics import roc_curve, auc, r2_score from .. import ZairaBase @@ -155,8 +156,8 @@ def save(self, filename): class PoolClassifier(object): - def __init__(self, path, mode="scaling"): - assert mode in ["scaling", "median", "model"] + def __init__(self, path, mode="weighting"): + assert mode in ["weighting", "median", "model"] self.path = path if not os.path.exists(self.path): os.makedirs(self.path, exist_ok=True) @@ -168,7 +169,8 @@ def _get_model_filename(self, n): def _fit_just_median(self, df_X, df_y): return np.median(np.array(df_X), axis=1) - def _fit_scaling(self, df_X, df_y): + def _fit_weighting(self, df_X, df_y): + y = np.array(df_y).ravel() cols = list(df_X.columns) X = np.array(df_X) p25 = np.percentile(X.ravel(), 25) @@ -177,10 +179,13 @@ def _fit_scaling(self, df_X, df_y): scale = (p25, p50, p75) for c in cols: X = np.array(df_X[c]).reshape(-1, 1) - mdl = RobustScaler() - mdl.fit(X) + mdl0 = PowerTransformer() + mdl0.fit(X) + X = mdl0.transform(X) + mdl1 = LogisticRegressionCV() + mdl1.fit(X, y) filename = self._get_model_filename(c) - joblib.dump(mdl, filename) + joblib.dump((mdl0, mdl1), filename) filename = self._get_model_filename("overall") joblib.dump(scale, filename) filename = self._get_model_filename("weighting") @@ -188,7 +193,7 @@ def _fit_scaling(self, df_X, df_y): ws.distance_to_leads() ws.importance() ws.save(filename) - return self._predict_scaling(df_X) + return self._predict_weighting(df_X) def _fit_model(self, df_X, df_y): y = np.array(df_y).ravel() @@ -204,22 +209,19 @@ def _fit_model(self, df_X, df_y): def _predict_just_median(self, df_X): return np.median(np.array(df_X), axis=1) - def _predict_scaling(self, df_X): + def _predict_weighting(self, df_X): cols = list(df_X.columns) Y_hat = [] for c in cols: filename = self._get_model_filename(c) if os.path.exists(filename): - mdl = joblib.load(filename) + mdl0, mdl1 = joblib.load(filename) X = np.array(df_X[c]).reshape(-1, 1) - y_hat = mdl.transform(X).ravel() + X = mdl0.transform(X) + y_hat = mdl1.predict_proba(X)[:,1] Y_hat += [y_hat] Y_hat = np.array(Y_hat).T filename = self._get_model_filename("overall") - scale = joblib.load(filename) - iqr = scale[-1] - scale[0] - med = scale[1] - Y_hat = Y_hat * iqr + med filename = self._get_model_filename("weighting") weights = joblib.load(filename) wvals = weights["weights"] @@ -248,16 +250,16 @@ def _predict_model(self, df_X): return np.median(Y_hat, axis=1) def fit(self, df_X, df_y): - if self.mode == "scaling": - return self._fit_scaling(df_X, df_y) + if self.mode == "weighting": + return self._fit_weighting(df_X, df_y) if self.mode == "median": return self._fit_just_median(df_X, df_y) if self.mdoe == "model": return self._fit_model(df_X, df_y) def predict(self, df_X): - if self.mode == "scaling": - return self._predict_scaling(df_X) + if self.mode == "weighting": + return self._predict_weighting(df_X) if self.mode == "median": return self._predict_just_median(df_X) if self.mode == "model":