From a3c357901b2a6769e4614b2456dc04431477cc77 Mon Sep 17 00:00:00 2001
From: Miquel Duran-Frigola <miquelduranfrigola@gmail.com>
Date: Wed, 5 Apr 2023 14:45:44 +0200
Subject: [PATCH 1/8] add ersilia embedding

---
 config/example_params.json                    |   1 +
 install_linux.sh                              |   5 +-
 zairachem/descriptors/eosce.py                |  63 ++++++
 .../from_ersilia_embedding/__init__.py        |   1 +
 .../from_ersilia_embedding/estimate.py        | 183 ++++++++++++++++++
 .../estimators/from_ersilia_embedding/pipe.py |   9 +
 zairachem/estimators/pipe.py                  |  12 ++
 .../tools/fpsim2/FPSim2/tests/test_io.py      |  19 +-
 .../macest/macest/classification/models.py    |   8 +-
 .../tools/macest/macest/model_selection.py    |   2 +-
 .../tools/macest/macest/regression/models.py  |   4 +-
 .../tools/macest/macest/regression/plots.py   |   2 +-
 .../molmap/bidd-molmap/molmap/model/cbks2.py  |   2 +-
 zairachem/vars.py                             |   1 +
 14 files changed, 291 insertions(+), 21 deletions(-)
 create mode 100644 zairachem/descriptors/eosce.py
 create mode 100644 zairachem/estimators/from_ersilia_embedding/__init__.py
 create mode 100644 zairachem/estimators/from_ersilia_embedding/estimate.py
 create mode 100644 zairachem/estimators/from_ersilia_embedding/pipe.py

diff --git a/config/example_params.json b/config/example_params.json
index 4bea76bc..47fce1d4 100644
--- a/config/example_params.json
+++ b/config/example_params.json
@@ -13,6 +13,7 @@
         "flaml-individual-descriptors",
         "autogluon-manifolds",
         "kerastuner-reference-embedding",
+        "kerastuner-eosce-embedding",
         "molmap"
     ]
 }
\ No newline at end of file
diff --git a/install_linux.sh b/install_linux.sh
index 5f083277..c53f2a0a 100644
--- a/install_linux.sh
+++ b/install_linux.sh
@@ -31,7 +31,7 @@ python3 -m pip install autogluon.tabular[all]==0.5.2
 python3 -m pip install "xgboost==1.3.3"
 python3 -m pip install "SQLAlchemy<1.4.0"
 
-# install zairachem
+# install extra dependencies
 python3 -m pip install git+https://github.com/chembl/FPSim2.git@0.3.0
 python3 -m pip install -q -U keras-tuner==1.1.3
 
@@ -39,6 +39,9 @@ python3 -m pip install -q -U keras-tuner==1.1.3
 python3 -m pip install git+https://github.com/ersilia-os/ersilia.git
 ersilia --help
 
+# install ersilia compound embedding
+python3 -m pip install git+https://github.com/ersilia-os/compound-embedding-lite.git
+
 # install isaura
 python3 -m pip install git+https://github.com/ersilia-os/isaura.git@ce293244ad0bdd6d7d4f796d2a84b17208a87b56
 
diff --git a/zairachem/descriptors/eosce.py b/zairachem/descriptors/eosce.py
new file mode 100644
index 00000000..90e1da6c
--- /dev/null
+++ b/zairachem/descriptors/eosce.py
@@ -0,0 +1,63 @@
+import os
+import pandas as pd
+import h5py
+
+from eosce.models import ErsiliaCompoundEmbedding
+from ..utils.matrices import Hdf5
+from .. import ZairaBase
+
+from ..setup import SMILES_COLUMN
+from ..vars import DATA_SUBFOLDER, DATA_FILENAME, DESCRIPTORS_SUBFOLDER
+
+EOSCE_FILE_NAME = "eosce.h5"
+
+
+class EosceEmbedder(ZairaBase):
+    def __init__(self):
+        ZairaBase.__init__(self)
+        self.model = ErsiliaCompoundEmbedding()
+
+    def calculate(self, smiles_list, output_h5):
+        X = self.model.transform(smiles_list)
+        if output_h5 is None:
+            return X
+        keys = ["key-{0}".format(i) for i in range(len(smiles_list))]
+        features = ["feat-{0}".format(i) for i in range(X.shape[1])]
+        inputs = smiles_list
+        with h5py.File(output_h5, "w") as f:
+            f.create_dataset("Keys", data=keys)
+            f.create_dataset("Features", data=features)
+            f.create_dataset("Inputs", data=inputs)
+            f.create_dataset("Values", data=X)
+
+
+class EosceLoader(ZairaBase):
+    def __init__(self):
+        ZairaBase.__init__(self)
+        self.path = self.get_output_dir()
+
+    def open(self, eos_id):
+        path = os.path.join(self.path, DESCRIPTORS_SUBFOLDER, eos_id, EOSCE_FILE_NAME)
+        return Hdf5(path)
+
+
+class EosceDescriptors(ZairaBase):
+    """Compute the eosce embedding of the input SMILES and store it as HDF5."""
+
+    def __init__(self):
+        ZairaBase.__init__(self)
+        self.path = self.get_output_dir()
+        self.input_csv = os.path.join(self.path, DATA_SUBFOLDER, DATA_FILENAME)
+        self.smiles_list = self._get_smiles_list()
+
+    def _get_smiles_list(self):
+        return list(pd.read_csv(self.input_csv)[SMILES_COLUMN])
+
+    def output_h5_filename(self):
+        folder = os.path.join(self.path, DESCRIPTORS_SUBFOLDER)
+        os.makedirs(folder, exist_ok=True)  # the descriptors folder may not exist yet
+        return os.path.join(folder, EOSCE_FILE_NAME)
+
+    def run(self):
+        output_h5 = self.output_h5_filename()
+        EosceEmbedder().calculate(self.smiles_list, output_h5)
diff --git a/zairachem/estimators/from_ersilia_embedding/__init__.py b/zairachem/estimators/from_ersilia_embedding/__init__.py
new file mode 100644
index 00000000..1d7b9c64
--- /dev/null
+++ b/zairachem/estimators/from_ersilia_embedding/__init__.py
@@ -0,0 +1 @@
+ESTIMATORS_FAMILY_SUBFOLDER = "ersilia_embedding"
diff --git a/zairachem/estimators/from_ersilia_embedding/estimate.py b/zairachem/estimators/from_ersilia_embedding/estimate.py
new file mode 100644
index 00000000..be6c099e
--- /dev/null
+++ b/zairachem/estimators/from_ersilia_embedding/estimate.py
@@ -0,0 +1,183 @@
+import os
+import numpy as np
+import pandas as pd
+import h5py
+
+from ... import ZairaBase
+from ..base import BaseEstimator, BaseOutcomeAssembler
+from ...automl.kerastuner import KerasTunerEstimator
+from ...vars import (
+    DATA_FILENAME,
+    DATA_SUBFOLDER,
+    DESCRIPTORS_SUBFOLDER,
+    ESTIMATORS_SUBFOLDER,
+)
+from . import ESTIMATORS_FAMILY_SUBFOLDER
+from .. import RESULTS_MAPPED_FILENAME, RESULTS_UNMAPPED_FILENAME
+
+
+class XGetter(ZairaBase):
+    """Load the eosce descriptor matrix and expose it as a DataFrame."""
+
+    def __init__(self, path):
+        ZairaBase.__init__(self)
+        self.path = path
+        self.X = []
+        self.columns = []
+
+    def _get_eosce_descriptor(self):
+        """Append the stored values and their column names to the accumulators."""
+        h5_file = os.path.join(self.path, DESCRIPTORS_SUBFOLDER, "eosce.h5")
+        with h5py.File(h5_file, "r") as f:
+            self.X.append(f["Values"][:])
+            # Feature names are decoded from bytes before being prefixed.
+            names = [raw.decode("utf-8") for raw in f["Features"][:]]
+        self.columns += ["feat-{0}".format(name) for name in names]
+
+    def get(self):
+        """Return the feature DataFrame and write a CSV copy for this family."""
+        self._get_eosce_descriptor()
+        X = np.hstack(self.X)
+        df = pd.DataFrame(X, columns=self.columns)
+        out_csv = os.path.join(
+            self.path,
+            ESTIMATORS_SUBFOLDER,
+            ESTIMATORS_FAMILY_SUBFOLDER,
+            DATA_FILENAME,
+        )
+        df.to_csv(out_csv, index=False)
+        return df
+
+
+class Fitter(BaseEstimator):
+    """Train a KerasTuner model on the Ersilia compound embedding."""
+
+    def __init__(self, path):
+        BaseEstimator.__init__(self, path=path)
+        self.trained_path = os.path.join(
+            self.get_output_dir(), ESTIMATORS_SUBFOLDER, ESTIMATORS_FAMILY_SUBFOLDER
+        )
+        self.x_getter = XGetter
+
+    def _get_X(self):
+        """Return the feature DataFrame produced by ``self.x_getter``."""
+        return self.x_getter(path=self.path).get()
+
+    def _get_y(self, task):
+        """Return column ``task`` of the data file as a NumPy array."""
+        df = pd.read_csv(os.path.join(self.path, DATA_SUBFOLDER, DATA_FILENAME))
+        return np.array(df[task])
+
+    def _get_Y(self):
+        """Return the task columns, regression tasks first, as a DataFrame."""
+        columns = []
+        for get_tasks in (self._get_reg_tasks, self._get_clf_tasks):
+            columns += list(get_tasks())
+        Y = np.array([self._get_y(task) for task in columns]).T
+        return pd.DataFrame(Y, columns=columns)
+
+    def run(self, time_budget_sec=None):
+        """Fit on the training rows, then predict on every row.
+
+        Returns whatever ``KerasTunerEstimator.run`` returns for the full table.
+        NOTE(review): the resolved time budget is not forwarded to KerasTuner.
+        """
+        self.reset_time()
+        if time_budget_sec is None:
+            time_budget_sec = self._estimate_time_budget()
+        train_idxs = self.get_train_indices(path=self.path)
+        df_X = self._get_X()
+        df_Y = self._get_Y()
+        df = pd.concat([df_X, df_Y], axis=1)
+        labels = list(df_Y.columns)
+        self.logger.debug("Starting KerasTuner estimation")
+        estimator = KerasTunerEstimator(save_path=self.trained_path)
+        self.logger.debug("Fitting")
+        estimator.fit(data=df.iloc[train_idxs, :], labels=labels)
+        estimator.save()
+        # Predictions come from the estimator returned by load(), over all rows.
+        estimator = estimator.load()
+        results = estimator.run(df)
+        self.update_elapsed_time()
+        return results
+
+
+class Predictor(BaseEstimator):
+    def __init__(self, path):
+        BaseEstimator.__init__(self, path=path)
+        self.x_getter = XGetter
+        self.trained_path = os.path.join(
+            self.get_trained_dir(), ESTIMATORS_SUBFOLDER, ESTIMATORS_FAMILY_SUBFOLDER
+        )
+
+    def run(self):
+        """Load the trained KerasTuner estimator and run it on the features."""
+        self.reset_time()
+        features = self.x_getter(path=self.path).get()
+        results = KerasTunerEstimator(save_path=self.trained_path).load().run(features)
+        self.update_elapsed_time()
+        return results
+
+
+class Assembler(BaseOutcomeAssembler):
+    """Concatenate compounds with predictions and write the result files."""
+
+    def __init__(self, path=None):
+        BaseOutcomeAssembler.__init__(self, path=path)
+
+    def run(self, df):
+        """Write the results twice: as predicted, and after remapping.
+
+        ``df`` holds the predictions; the columns returned by
+        ``_get_compounds()`` are placed to its left.
+        """
+        compounds = self._get_compounds()
+        unmapped = pd.concat([compounds, df], axis=1)
+        # Both files are written to the same family folder.
+        out_dir = os.path.join(
+            self.path, ESTIMATORS_SUBFOLDER, ESTIMATORS_FAMILY_SUBFOLDER
+        )
+        # First table: the concatenation before any remapping.
+        unmapped.to_csv(
+            os.path.join(out_dir, RESULTS_UNMAPPED_FILENAME), index=False
+        )
+        # Second table: remapped with helpers defined outside this class.
+        mappings = self._get_mappings()
+        mapped = self._remap(unmapped, mappings)
+        mapped.to_csv(
+            os.path.join(out_dir, RESULTS_MAPPED_FILENAME), index=False
+        )
+
+
+class Estimator(ZairaBase):
+    """Fit or predict with the Ersilia-embedding family, then assemble results."""
+
+    def __init__(self, path=None):
+        ZairaBase.__init__(self)
+        self.path = self.get_output_dir() if path is None else path
+        # exist_ok=True already tolerates an existing folder, so no exists() check.
+        os.makedirs(
+            os.path.join(self.path, ESTIMATORS_SUBFOLDER, ESTIMATORS_FAMILY_SUBFOLDER),
+            exist_ok=True,
+        )
+        if not self.is_predict():
+            self.logger.debug("Starting kerastuner fitter")
+            self.estimator = Fitter(path=self.path)
+        else:
+            self.logger.debug("Starting kerastuner predictor")
+            self.estimator = Predictor(path=self.path)
+        self.assembler = Assembler(path=self.path)
+
+    def run(self, time_budget_sec=None):
+        """Run the estimator and hand its results to the assembler.
+
+        The budget is coerced to ``int`` and only forwarded when fitting.
+        """
+        self.time_budget_sec = None if time_budget_sec is None else int(time_budget_sec)
+        if not self.is_predict():
+            self.logger.debug("Mode: fit")
+            results = self.estimator.run(time_budget_sec=self.time_budget_sec)
+        else:
+            self.logger.debug("Mode: predict")
+            results = self.estimator.run()
+        self.assembler.run(results)
diff --git a/zairachem/estimators/from_ersilia_embedding/pipe.py b/zairachem/estimators/from_ersilia_embedding/pipe.py
new file mode 100644
index 00000000..89adfda8
--- /dev/null
+++ b/zairachem/estimators/from_ersilia_embedding/pipe.py
@@ -0,0 +1,9 @@
+from .estimate import Estimator
+
+
+class EosceEmbeddingPipeline(object):
+    def __init__(self, path):
+        self.e = Estimator(path=path)
+
+    def run(self, time_budget_sec=None):
+        self.e.run(time_budget_sec=time_budget_sec)
diff --git a/zairachem/estimators/pipe.py b/zairachem/estimators/pipe.py
index dddd61f6..1ff45b17 100644
--- a/zairachem/estimators/pipe.py
+++ b/zairachem/estimators/pipe.py
@@ -9,6 +9,7 @@
 from .from_individual_full_descriptors.pipe import IndividualFullDescriptorPipeline
 from .from_manifolds.pipe import ManifoldPipeline
 from .from_reference_embedding.pipe import ReferenceEmbeddingPipeline
+from .from_ersilia_embedding.pipe import EosceEmbeddingPipeline
 from .from_molmap.pipe import MolMapPipeline
 from .evaluate import SimpleEvaluator
 
@@ -107,6 +108,16 @@ def _reference_pipeline(self, time_budget_sec):
             p.run(time_budget_sec=time_budget_sec)
             step.update()
 
+    def _eosce_pipeline(self, time_budget_sec):
+        if "kerastuner-eosce-embedding" not in self._estimators_to_use:
+            return
+        step = PipelineStep("eosce_pipeline", self.output_dir)
+        if not step.is_done():
+            self.logger.debug("Ersilia compound embedding pipeline")
+            p = EosceEmbeddingPipeline(path=self.path)
+            p.run(time_budget_sec=time_budget_sec)
+            step.update()
+
     def _molmap_pipeline(self, time_budget_sec):
         if self.is_lazy():
             self.logger.info("Lazy mode skips molmap")
@@ -136,5 +147,6 @@ def run(self, time_budget_sec=None):
         self._individual_estimator_pipeline(time_budget_sec)
         self._manifolds_pipeline(time_budget_sec)
         self._reference_pipeline(time_budget_sec)
+        self._eosce_pipeline(time_budget_sec)
         self._molmap_pipeline(time_budget_sec)
         self._simple_evaluation()
diff --git a/zairachem/tools/fpsim2/FPSim2/tests/test_io.py b/zairachem/tools/fpsim2/FPSim2/tests/test_io.py
index 3fc45548..8e681658 100644
--- a/zairachem/tools/fpsim2/FPSim2/tests/test_io.py
+++ b/zairachem/tools/fpsim2/FPSim2/tests/test_io.py
@@ -62,17 +62,14 @@ def test_get_mol_supplier():
 
 
 def test_get_bounds_range():
-    assert (
-        get_bounds_range(
-            query=fps[0],
-            threshold=0.7,
-            a=0,
-            b=0,
-            ranges=popcnt_bins,
-            search_type="tanimoto",
-        )
-        == (0, 10)
-    )
+    assert get_bounds_range(
+        query=fps[0],
+        threshold=0.7,
+        a=0,
+        b=0,
+        ranges=popcnt_bins,
+        search_type="tanimoto",
+    ) == (0, 10)
 
 
 def test_build_fp():
diff --git a/zairachem/tools/macest/macest/classification/models.py b/zairachem/tools/macest/macest/classification/models.py
index ef8b5381..82874d7f 100644
--- a/zairachem/tools/macest/macest/classification/models.py
+++ b/zairachem/tools/macest/macest/classification/models.py
@@ -207,7 +207,7 @@ def calc_dist_to_neighbours(
                     x_star, k=self._num_neighbours, num_threads=num_threads_available
                 )
             )
-            class_dist = neighbours[:, 1, :].clip(min=10 ** -15)
+            class_dist = neighbours[:, 1, :].clip(min=10**-15)
             class_ind = neighbours[:, 0, :].astype(int)
             if self.training_preds_by_class is None:
                 raise ValueError("training_preds_by_class has already been cached")
@@ -249,7 +249,7 @@ def calc_linear_distance_error_func(
         :return: A weighted sum of average error and average distance
         """
         dist = self._alpha * np.average(
-            local_distance.clip(min=10 ** -15),
+            local_distance.clip(min=10**-15),
             weights=np.arange(local_distance.shape[1], 0, -1),
             axis=1,
         )
@@ -277,7 +277,7 @@ def predict_proba(
             class_dist, _, class_error = self.calc_dist_to_neighbours(x_star, cls)
 
             dist, error = self.calc_linear_distance_error_func(class_dist, class_error)
-            av_dist_func[i, :] = dist.clip(min=10 ** -10) + error
+            av_dist_func[i, :] = dist.clip(min=10**-10) + error
         relative_conf = self._calc_relative_distance_softmax_normalisation(av_dist_func)
         if change_conflicts:
             relative_conf = self._renormalise_conf_with_empirical_constant(
@@ -308,7 +308,7 @@ def predict_confidence_of_point_prediction(
 
         point_prediction_confidence = class_confidence[
             np.arange(len(class_confidence)), point_prediction
-        ].clip(max=1 - 10 ** -15)
+        ].clip(max=1 - 10**-15)
         return point_prediction_confidence
 
     def _calc_relative_distance_softmax_normalisation(
diff --git a/zairachem/tools/macest/macest/model_selection.py b/zairachem/tools/macest/macest/model_selection.py
index a137997b..8579c293 100644
--- a/zairachem/tools/macest/macest/model_selection.py
+++ b/zairachem/tools/macest/macest/model_selection.py
@@ -32,7 +32,7 @@ def __init__(
         self.shuffle = shuffle
         self.random_state = random_state
         self.pp_train_graph_cal_split = pp_train_graph_cal_split
-        if abs(np.array(self.pp_train_graph_cal_split).sum() - 1.0) > 10 ** -6:
+        if abs(np.array(self.pp_train_graph_cal_split).sum() - 1.0) > 10**-6:
             raise ValueError("split of training data must sum to 1")
 
     def split(
diff --git a/zairachem/tools/macest/macest/regression/models.py b/zairachem/tools/macest/macest/regression/models.py
index 90c99910..f82d095d 100644
--- a/zairachem/tools/macest/macest/regression/models.py
+++ b/zairachem/tools/macest/macest/regression/models.py
@@ -235,7 +235,7 @@ def calc_error_weighted_dist(self, x_star: np.ndarray) -> np.ndarray:
                 "Need to remove pre-cached training neighbour data from training"
             )
 
-        error_weighted_poly = self._alpha * error_weighted_dist ** self._beta
+        error_weighted_poly = self._alpha * error_weighted_dist**self._beta
         return error_weighted_poly
 
     def std_on_y_star(self, x_star: np.ndarray) -> np.ndarray:
@@ -326,7 +326,7 @@ def calculate_prediction_interval_width(
         return np.diff(intervals)
 
     def sample_prediction(
-        self, x_star: np.ndarray, nsamples: int = 10 ** 3
+        self, x_star: np.ndarray, nsamples: int = 10**3
     ) -> np.ndarray:
         """
         Draw samples from any predicted distribution to get a distribution of predictions.
diff --git a/zairachem/tools/macest/macest/regression/plots.py b/zairachem/tools/macest/macest/regression/plots.py
index 9179b400..191f586f 100644
--- a/zairachem/tools/macest/macest/regression/plots.py
+++ b/zairachem/tools/macest/macest/regression/plots.py
@@ -58,7 +58,7 @@ def plot_pred_with_conf(
 
     :return: None
     """
-    pred_dist = conf_model.sample_prediction(x_star, nsamples=10 ** 4)
+    pred_dist = conf_model.sample_prediction(x_star, nsamples=10**4)
 
     point_pred = np.mean(pred_dist)
     plt.figure(figsize=(12, 6))
diff --git a/zairachem/tools/molmap/bidd-molmap/molmap/model/cbks2.py b/zairachem/tools/molmap/bidd-molmap/molmap/model/cbks2.py
index f831578e..870ef037 100644
--- a/zairachem/tools/molmap/bidd-molmap/molmap/model/cbks2.py
+++ b/zairachem/tools/molmap/bidd-molmap/molmap/model/cbks2.py
@@ -11,7 +11,7 @@
 
 def r2_score(x, y):
     pcc, _ = pearsonr(x, y)
-    return pcc ** 2
+    return pcc**2
 
 
 def prc_auc_score(y_true, y_score):
diff --git a/zairachem/vars.py b/zairachem/vars.py
index b1039c66..84c633cf 100644
--- a/zairachem/vars.py
+++ b/zairachem/vars.py
@@ -53,6 +53,7 @@
     "flaml-individual-descriptors",
     "autogluon-manifolds",
     "kerastuner-reference-embedding",
+    "kerastuner-eosce-embedding",
     "molmap",
 ]
 

From 9a3a5293976ef58634ea296c6ec6b3ca6c6f564d Mon Sep 17 00:00:00 2001
From: GemmaTuron <gemma@ersilia.io>
Date: Thu, 6 Apr 2023 08:44:19 +0200
Subject: [PATCH 2/8] bugfixes eosce

---
 zairachem/descriptors/describe.py | 8 ++++++++
 zairachem/descriptors/eosce.py    | 4 ++--
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/zairachem/descriptors/describe.py b/zairachem/descriptors/describe.py
index 2fff69f5..c2075463 100644
--- a/zairachem/descriptors/describe.py
+++ b/zairachem/descriptors/describe.py
@@ -3,6 +3,7 @@
 from .raw import RawDescriptors
 from .treated import TreatedDescriptors
 from .reference import ReferenceDescriptors, SimpleDescriptors
+from .eosce import EosceDescriptors
 from .manifolds import Manifolds
 
 from .. import ZairaBase
@@ -47,6 +48,12 @@ def _reference_descriptors(self):
                 ReferenceDescriptors().run()
             step.update()
 
+    def _eosce_descriptors(self):
+        step = PipelineStep("eosce_descriptors", self.output_dir)
+        if not step.is_done():
+            EosceDescriptors().run()
+            step.update()
+
     def _manifolds(self):
         step = PipelineStep("manifolds", self.output_dir)
         if not step.is_done():
@@ -58,5 +65,6 @@ def run(self):
         self._raw_descriptions()
         self._treated_descriptions()
         self._reference_descriptors()
+        self._eosce_descriptors()
         self._manifolds()
         self.update_elapsed_time()
diff --git a/zairachem/descriptors/eosce.py b/zairachem/descriptors/eosce.py
index 90e1da6c..21194f37 100644
--- a/zairachem/descriptors/eosce.py
+++ b/zairachem/descriptors/eosce.py
@@ -2,7 +2,7 @@
 import pandas as pd
 import h5py
 
-from eosce.models import ErsiliaCompoundEmbedding
+from eosce.models import ErsiliaCompoundEmbeddings
 from ..utils.matrices import Hdf5
 from .. import ZairaBase
 
@@ -15,7 +15,7 @@
 class EosceEmbedder(ZairaBase):
     def __init__(self):
         ZairaBase.__init__(self)
-        self.model = ErsiliaCompoundEmbedding()
+        self.model = ErsiliaCompoundEmbeddings()
 
     def calculate(self, smiles_list, output_h5):
         X = self.model.transform(smiles_list)

From f567d245e3d262e3658ef113df136bafbc396a31 Mon Sep 17 00:00:00 2001
From: GemmaTuron <gemma@ersilia.io>
Date: Thu, 6 Apr 2023 18:10:32 +0200
Subject: [PATCH 3/8] small bugfix

---
 zairachem/vars.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/zairachem/vars.py b/zairachem/vars.py
index 84c633cf..e0f0fc39 100644
--- a/zairachem/vars.py
+++ b/zairachem/vars.py
@@ -41,11 +41,13 @@
 # Ersilia Model Hub
 
 ERSILIA_HUB_DEFAULT_MODELS = [
-    "morgan-counts",
+    #"morgan-counts",
     "cc-signaturizer",
+    "image-mol-embeddings",
+    "rdkit-fingerprint",
     "grover-embedding",
     "mordred",
-]  # molbert was removed
+] 
 
 DEFAULT_ESTIMATORS = [
     "baseline-classic",

From 5abbd09918e598dd7b5a9b02703e47549cdf15fc Mon Sep 17 00:00:00 2001
From: GemmaTuron <gemma@ersilia.io>
Date: Sat, 8 Apr 2023 12:14:19 +0200
Subject: [PATCH 4/8] tabpfn balanced class

---
 zairachem/automl/binarytabpfn.py              | 133 ++++++++++
 .../assemble.py                               |   6 +-
 .../estimate.py                               |   6 +-
 .../performance.py                            |   6 +-
 .../__init__.py                               |   1 +
 .../assemble.py                               | 101 ++++++++
 .../estimate.py                               | 185 ++++++++++++++
 .../performance.py                            | 233 ++++++++++++++++++
 .../pipe.py                                   |  15 ++
 zairachem/estimators/pipe.py                  |  17 ++
 .../tools/fpsim2/FPSim2/tests/test_io.py      |  19 +-
 .../macest/macest/classification/models.py    |   8 +-
 .../tools/macest/macest/model_selection.py    |   2 +-
 .../tools/macest/macest/regression/models.py  |   4 +-
 .../tools/macest/macest/regression/plots.py   |   2 +-
 .../molmap/bidd-molmap/molmap/model/cbks2.py  |   2 +-
 zairachem/vars.py                             |   5 +-
 17 files changed, 717 insertions(+), 28 deletions(-)
 create mode 100644 zairachem/automl/binarytabpfn.py
 create mode 100644 zairachem/estimators/from_individual_full_descriptors_tabpfn/__init__.py
 create mode 100644 zairachem/estimators/from_individual_full_descriptors_tabpfn/assemble.py
 create mode 100644 zairachem/estimators/from_individual_full_descriptors_tabpfn/estimate.py
 create mode 100644 zairachem/estimators/from_individual_full_descriptors_tabpfn/performance.py
 create mode 100644 zairachem/estimators/from_individual_full_descriptors_tabpfn/pipe.py

diff --git a/zairachem/automl/binarytabpfn.py b/zairachem/automl/binarytabpfn.py
new file mode 100644
index 00000000..c24a3b87
--- /dev/null
+++ b/zairachem/automl/binarytabpfn.py
@@ -0,0 +1,133 @@
+import numpy as np
+from lol import LOL
+import random
+import collections
+from tabpfn import TabPFNClassifier
+from imblearn.combine import SMOTETomek
+from imblearn.over_sampling import KMeansSMOTE
+from imblearn.under_sampling import EditedNearestNeighbours
+import joblib
+
+
+class TabPFNBinaryClassifier(object):
+    """Binary TabPFN classifier averaged over class-balanced resamples."""
+
+    def __init__(self, device="cpu", N_ensemble_configurations=4):
+        self.device = device
+        self.N_ensemble_configurations = N_ensemble_configurations
+        self.max_samples = 1000
+
+    def _get_balanced_datasets(self, X, y):
+        """Resample ``(X, y)`` with three imbalanced-learn samplers."""
+        samplers = (
+            SMOTETomek(sampling_strategy="auto"),
+            KMeansSMOTE(sampling_strategy="auto"),
+            EditedNearestNeighbours(sampling_strategy="auto"),
+        )
+        return [tuple(sampler.fit_resample(X, y)) for sampler in samplers]
+
+    def _cap_samples(self, X, y):
+        """Return ``(X, y)`` as is, or up to three random ``max_samples`` subsets."""
+        n_rows = X.shape[0]
+        if n_rows <= self.max_samples:
+            return [(X, y)]
+        subsets = []
+        for _ in range(3):
+            picked = random.sample(range(n_rows), self.max_samples)
+            X_, y_ = X[picked], y[picked]
+            # Subsets whose labels sum to zero are discarded.
+            if np.sum(y_) != 0:
+                subsets.append((X_, y_))
+        return subsets
+
+    def _get_ensemble(self, X, y):
+        """Balance first, then cap; return the flat list of training sets."""
+        balanced = self._get_balanced_datasets(X, y)
+        return [pair for X_b, y_b in balanced for pair in self._cap_samples(X_b, y_b)]
+
+    def fit(self, X, y):
+        """Fit the LOL reducer and store the reduced training sets."""
+        self.reducer = LOL(n_components=100)
+        self.reducer.fit(X, y)
+        self.ensemble = self._get_ensemble(self.reducer.transform(X), y)
+
+    def predict_proba(self, X):
+        """Return columns ``[1 - p, p]``; ``p`` is the mean positive-class score."""
+        model = TabPFNClassifier(
+            device=self.device, N_ensemble_configurations=self.N_ensemble_configurations
+        )
+        X = self.reducer.transform(X)
+        positives = []
+        # The model is refitted on each stored training set at prediction time.
+        for X_tr, y_tr in self.ensemble:
+            model.fit(X_tr, y_tr)
+            positives.append(model.predict_proba(X)[:, 1])
+            model.remove_models_from_memory()
+        y_h1 = np.mean(np.array(positives).T, axis=1)
+        return np.array([1 - y_h1, y_h1]).T
+
+    def save(self, file_name):
+        """Dump the settings, reducer and ensemble to ``file_name`` with joblib."""
+        data = {
+            "device": self.device,
+            "N_ensemble_configurations": self.N_ensemble_configurations,
+            "reducer": self.reducer,
+            "ensemble": self.ensemble,
+        }
+        joblib.dump(data, file_name)
+
+    def load(self, file_name):
+        """Rebuild the classifier from ``file_name``; wrap it with threshold 0.5."""
+        data = joblib.load(file_name)
+        model = TabPFNBinaryClassifier(
+            device=data["device"],
+            N_ensemble_configurations=data["N_ensemble_configurations"],
+        )
+        model.ensemble = data["ensemble"]
+        model.reducer = data["reducer"]
+        return TabPFNClassifierArtifact(model, 0.5)
+
+
+class Binarizer(object):
+    """Turn scores into 0/1 labels using a strict ``>`` threshold."""
+
+    def __init__(self, threshold):
+        self.threshold = threshold
+
+    def binarize(self, y_hat):
+        """Return a ``uint8`` array holding 1 where ``y_hat > threshold``, else 0."""
+        # One vectorized comparison replaces the per-element Python loop;
+        # scores equal to the threshold (and NaN) still map to 0.
+        scores = np.asarray(y_hat)
+        return (scores > self.threshold).astype(np.uint8)
+
+
+class TabPFNClassifierArtifact(object):
+    """Prediction wrapper pairing a fitted model with an optional threshold."""
+
+    def __init__(self, model, threshold):
+        self.model = model
+        self.threshold = threshold
+        self.binarizer = None if threshold is None else Binarizer(threshold)
+
+    def predict_proba(self, X):
+        """Return the positive-class column of the model's probabilities."""
+        return self.model.predict_proba(X)[:, 1]
+
+    def predict(self, X):
+        """Return labels: thresholded probabilities, or ``model.predict``."""
+        if self.binarizer is None:
+            return self.model.predict(X)
+        return self.binarizer.binarize(self.predict_proba(X))
+
+    def run(self, X, y=None):
+        """Return ``{"main": {...}}`` with scores and binary calls for ``X``."""
+        # Score once and reuse the result instead of calling predict_proba() twice.
+        y_hat = self.predict_proba(X)
+        if self.binarizer is None:
+            b_hat = self.model.predict(X)
+        else:
+            b_hat = self.binarizer.binarize(y_hat)
+        results = collections.OrderedDict()
+        results["main"] = {"idxs": None, "y": y, "y_hat": y_hat, "b_hat": b_hat}
+        return results
diff --git a/zairachem/estimators/from_individual_full_descriptors/assemble.py b/zairachem/estimators/from_individual_full_descriptors/assemble.py
index 5b2b7cbc..ba4acda1 100644
--- a/zairachem/estimators/from_individual_full_descriptors/assemble.py
+++ b/zairachem/estimators/from_individual_full_descriptors/assemble.py
@@ -80,7 +80,7 @@ def _get_model_ids(self):
             os.path.join(path_trained, DESCRIPTORS_SUBFOLDER, "done_eos.json"), "r"
         ) as f:
             model_ids = list(json.load(f))
-        model_ids_successfull = []
+        model_ids_successful = []
         for model_id in model_ids:
             if os.path.isfile(
                 os.path.join(
@@ -91,8 +91,8 @@ def _get_model_ids(self):
                     "y_hat.joblib",
                 )
             ):
-                model_ids_successfull += [model_id]
-        return model_ids_successfull
+                model_ids_successful += [model_id]
+        return model_ids_successful
 
     def run(self):
         model_ids = self._get_model_ids()
diff --git a/zairachem/estimators/from_individual_full_descriptors/estimate.py b/zairachem/estimators/from_individual_full_descriptors/estimate.py
index a047383e..e96906a6 100644
--- a/zairachem/estimators/from_individual_full_descriptors/estimate.py
+++ b/zairachem/estimators/from_individual_full_descriptors/estimate.py
@@ -244,13 +244,13 @@ def _get_model_ids(self):
             os.path.join(path_trained, DESCRIPTORS_SUBFOLDER, "done_eos.json"), "r"
         ) as f:
             model_ids = list(json.load(f))
-        model_ids_successfull = []
+        model_ids_successful = []
         for model_id in model_ids:
             if os.path.isfile(
                 os.path.join(path, DESCRIPTORS_SUBFOLDER, model_id, "treated.h5")
             ):
-                model_ids_successfull += [model_id]
-        return model_ids_successfull
+                model_ids_successful += [model_id]
+        return model_ids_successful
 
     def run(self, time_budget_sec=None):
         model_ids = self._get_model_ids()
diff --git a/zairachem/estimators/from_individual_full_descriptors/performance.py b/zairachem/estimators/from_individual_full_descriptors/performance.py
index 4bcd91b5..d067456f 100644
--- a/zairachem/estimators/from_individual_full_descriptors/performance.py
+++ b/zairachem/estimators/from_individual_full_descriptors/performance.py
@@ -212,7 +212,7 @@ def _get_model_ids(self):
             os.path.join(path_trained, DESCRIPTORS_SUBFOLDER, "done_eos.json"), "r"
         ) as f:
             model_ids = list(json.load(f))
-        model_ids_successfull = []
+        model_ids_successful = []
         for model_id in model_ids:
             if os.path.isfile(
                 os.path.join(
@@ -223,8 +223,8 @@ def _get_model_ids(self):
                     "y_hat.joblib",
                 )
             ):
-                model_ids_successfull += [model_id]
-        return model_ids_successfull
+                model_ids_successful += [model_id]
+        return model_ids_successful
 
     def run(self):
         model_ids = self._get_model_ids()
diff --git a/zairachem/estimators/from_individual_full_descriptors_tabpfn/__init__.py b/zairachem/estimators/from_individual_full_descriptors_tabpfn/__init__.py
new file mode 100644
index 00000000..856581cd
--- /dev/null
+++ b/zairachem/estimators/from_individual_full_descriptors_tabpfn/__init__.py
@@ -0,0 +1 @@
+ESTIMATORS_FAMILY_SUBFOLDER = "individual_full_descriptors_tabpfn"
diff --git a/zairachem/estimators/from_individual_full_descriptors_tabpfn/assemble.py b/zairachem/estimators/from_individual_full_descriptors_tabpfn/assemble.py
new file mode 100644
index 00000000..ba4acda1
--- /dev/null
+++ b/zairachem/estimators/from_individual_full_descriptors_tabpfn/assemble.py
@@ -0,0 +1,101 @@
+import pandas as pd
+import json
+import os
+import joblib
+import collections
+
+from . import ESTIMATORS_FAMILY_SUBFOLDER
+from ... import ZairaBase
+from ...vars import DESCRIPTORS_SUBFOLDER, ESTIMATORS_SUBFOLDER
+from .. import Y_HAT_FILE, RESULTS_UNMAPPED_FILENAME, RESULTS_MAPPED_FILENAME
+from ..base import BaseOutcomeAssembler
+
+
+class IndividualOutcomeAssembler(BaseOutcomeAssembler):  # turns one model's stored predictions into CSV tables
+    def __init__(self, path=None, model_id=None):
+        BaseOutcomeAssembler.__init__(self, path=path)
+        self.model_id = model_id
+
+    def _get_y_hat(self):  # DataFrame with one column per task, plus "<task>_bin" when b_hat exists
+        results = joblib.load(
+            os.path.join(
+                self.path,
+                ESTIMATORS_SUBFOLDER,
+                ESTIMATORS_FAMILY_SUBFOLDER,
+                self.model_id,
+                Y_HAT_FILE,
+            )
+        )
+        data = collections.OrderedDict()
+        for c, r in results.items():
+            r = r["main"]  # only the "main" entry of each task is exported
+            data[c] = r["y_hat"]
+            if "b_hat" in r:
+                data[c + "_bin"] = r["b_hat"]
+        return pd.DataFrame(data)
+
+    def run(self):  # writes the unmapped table first, then the remapped one
+        df_c = self._get_compounds()
+        df_y = self._get_y_hat()
+        df = pd.concat([df_c, df_y], axis=1)  # compounds and predictions side by side
+        df.to_csv(
+            os.path.join(
+                self.path,
+                ESTIMATORS_SUBFOLDER,
+                ESTIMATORS_FAMILY_SUBFOLDER,
+                self.model_id,
+                RESULTS_UNMAPPED_FILENAME,
+            ),
+            index=False,
+        )
+        mappings = self._get_mappings()
+        df = self._remap(df, mappings)  # inherited helper, not defined in this class
+        df.to_csv(
+            os.path.join(
+                self.path,
+                ESTIMATORS_SUBFOLDER,
+                ESTIMATORS_FAMILY_SUBFOLDER,
+                self.model_id,
+                RESULTS_MAPPED_FILENAME,
+            ),
+            index=False,
+        )
+
+
+class OutcomeAssembler(ZairaBase):  # runs IndividualOutcomeAssembler for every model with predictions
+    def __init__(self, path=None):
+        ZairaBase.__init__(self)
+        self.path = path
+
+    def _get_model_ids(self):  # ids listed in done_eos.json that have a y_hat.joblib under path
+        if self.path is None:
+            path = self.get_output_dir()
+        else:
+            path = self.path
+        if self.is_predict():  # in predict mode done_eos.json is read from the trained dir
+            path_trained = self.get_trained_dir()
+        else:
+            path_trained = path
+        with open(
+            os.path.join(path_trained, DESCRIPTORS_SUBFOLDER, "done_eos.json"), "r"
+        ) as f:
+            model_ids = list(json.load(f))
+        model_ids_successful = []
+        for model_id in model_ids:
+            if os.path.isfile(  # keep only models whose estimator wrote its output
+                os.path.join(
+                    path,
+                    ESTIMATORS_SUBFOLDER,
+                    ESTIMATORS_FAMILY_SUBFOLDER,
+                    model_id,
+                    "y_hat.joblib",
+                )
+            ):
+                model_ids_successful += [model_id]
+        return model_ids_successful
+
+    def run(self):  # assembles the result tables model by model
+        model_ids = self._get_model_ids()
+        for model_id in model_ids:
+            o = IndividualOutcomeAssembler(path=self.path, model_id=model_id)
+            o.run()
diff --git a/zairachem/estimators/from_individual_full_descriptors_tabpfn/estimate.py b/zairachem/estimators/from_individual_full_descriptors_tabpfn/estimate.py
new file mode 100644
index 00000000..6f9c9f4f
--- /dev/null
+++ b/zairachem/estimators/from_individual_full_descriptors_tabpfn/estimate.py
@@ -0,0 +1,185 @@
+import os
+import json
+import h5py
+import pandas as pd
+import numpy as np
+import collections
+import joblib
+
+from ...descriptors.treated import TREATED_FILE_NAME
+
+from ... import ZairaBase
+from ...automl.binarytabpfn import TabPFNBinaryClassifier
+
+from ...vars import (
+    DESCRIPTORS_SUBFOLDER,
+    DATA_SUBFOLDER,
+    DATA_FILENAME,
+    ESTIMATORS_SUBFOLDER,
+)
+from ..base import BaseEstimator
+
+from .. import Y_HAT_FILE
+from . import ESTIMATORS_FAMILY_SUBFOLDER
+
+
+class BaseEstimatorIndividual(BaseEstimator):  # shared setup for the per-descriptor Fitter/Predictor
+    def __init__(self, path, model_id):
+        BaseEstimator.__init__(self, path=path)
+        path_ = os.path.join(
+            self.path, ESTIMATORS_SUBFOLDER, ESTIMATORS_FAMILY_SUBFOLDER, model_id
+        )
+        # exist_ok avoids the check-then-create race of os.path.exists + makedirs
+        os.makedirs(path_, exist_ok=True)
+        self.model_id = model_id
+
+    def _get_X(self):  # returns the "Values" dataset of this model's treated descriptor file
+        file_name = os.path.join(
+            self.path, DESCRIPTORS_SUBFOLDER, self.model_id, TREATED_FILE_NAME
+        )
+        with h5py.File(file_name, "r") as f:  # path and handle no longer share a name
+            X = f["Values"][:]
+        return X
+
+
+class Fitter(BaseEstimatorIndividual):  # trains one TabPFN classifier per classification task
+    def __init__(self, path, model_id):
+        BaseEstimatorIndividual.__init__(self, path=path, model_id=model_id)
+        self.trained_path = os.path.join(
+            self.get_output_dir(), ESTIMATORS_SUBFOLDER, ESTIMATORS_FAMILY_SUBFOLDER
+        )
+
+    def _get_flds(self):
+        # for now only auxiliary folds are used (first "_aux" entry of the schema folds)
+        col = [f for f in self.schema["folds"] if "_aux" in f][0]
+        df = pd.read_csv(os.path.join(self.path, DATA_SUBFOLDER, DATA_FILENAME))
+        return np.array(df[col])
+
+    def _get_y(self, task):
+        # for now iterate task by task; the data file is re-read on every call
+        df = pd.read_csv(os.path.join(self.path, DATA_SUBFOLDER, DATA_FILENAME))
+        return np.array(df[task])
+
+    def run(self, time_budget_sec=60):  # returns an OrderedDict: task -> model.run output
+        self.reset_time()
+        if time_budget_sec is None:
+            time_budget_sec = self._estimate_time_budget()
+        # NOTE: the budget is resolved for signature parity only; it is not
+        # forwarded to TabPFNBinaryClassifier anywhere below.
+        tasks = collections.OrderedDict()
+        X = self._get_X()
+        train_idxs = self.get_train_indices(path=self.path)
+        valid_idxs = self.get_validation_indices(path=self.path)
+        for t in self._get_clf_tasks():
+            y = self._get_y(t)
+            model = TabPFNBinaryClassifier()
+            model.fit(X[train_idxs], y[train_idxs])  # fit on the training rows only
+            file_name = os.path.join(self.trained_path, self.model_id, t + ".joblib")
+            model.save(file_name)
+            model = model.load(file_name)  # predictions come from the reloaded artifact
+            tasks[t] = model.run(X, y)
+            _valid_task = model.run(X[valid_idxs], y[valid_idxs])
+            tasks[t]["valid"] = _valid_task["main"]  # validation-only results kept under "valid"
+        self.update_elapsed_time()
+        return tasks
+
+
+class Predictor(BaseEstimatorIndividual):  # applies the saved per-task classifiers to new data
+    def __init__(self, path, model_id):
+        BaseEstimatorIndividual.__init__(self, path=path, model_id=model_id)
+        self.trained_path = os.path.join(
+            self.get_trained_dir(), ESTIMATORS_SUBFOLDER, ESTIMATORS_FAMILY_SUBFOLDER
+        )
+
+    def _get_y(self, task):
+        # returns the task column as an array, or None when the data file lacks it
+        df = pd.read_csv(os.path.join(self.path, DATA_SUBFOLDER, DATA_FILENAME))
+        columns = set(df.columns)
+        if task in columns:
+            return np.array(df[task])
+        else:
+            return None
+
+    def run(self):  # returns an OrderedDict: task -> model.run output
+        self.reset_time()
+        tasks = collections.OrderedDict()
+        X = self._get_X()
+        for t in self._get_clf_tasks():
+            y = self._get_y(t)  # may be None, see _get_y
+            model = TabPFNBinaryClassifier()
+            file_name = os.path.join(self.trained_path, self.model_id, t + ".joblib")
+            model = model.load(file_name)  # model saved under the trained directory
+            tasks[t] = model.run(X, y)
+        self.update_elapsed_time()
+        return tasks
+
+
+class IndividualEstimator(ZairaBase):  # picks Fitter or Predictor for one model and stores its results
+    def __init__(self, path=None, model_id=None):
+        ZairaBase.__init__(self)
+        self.model_id = model_id
+        if path is None:
+            self.path = self.get_output_dir()
+        else:
+            self.path = path
+        if not self.is_predict():  # fit mode -> Fitter, predict mode -> Predictor
+            self.estimator = Fitter(path=self.path, model_id=self.model_id)
+        else:
+            self.estimator = Predictor(path=self.path, model_id=self.model_id)
+
+    def run(self, time_budget_sec=None):  # runs the estimator and dumps its output to Y_HAT_FILE
+        if time_budget_sec is not None:
+            self.time_budget_sec = int(time_budget_sec)
+        else:
+            self.time_budget_sec = None
+        if not self.is_predict():
+            results = self.estimator.run(time_budget_sec=self.time_budget_sec)
+        else:
+            results = self.estimator.run()  # Predictor.run takes no time budget
+        joblib.dump(
+            results,
+            os.path.join(
+                self.path,
+                ESTIMATORS_SUBFOLDER,
+                ESTIMATORS_FAMILY_SUBFOLDER,
+                self.model_id,
+                Y_HAT_FILE,
+            ),
+        )
+
+
+class Estimator(ZairaBase):  # runs IndividualEstimator for every descriptor with a treated.h5
+    def __init__(self, path=None):
+        ZairaBase.__init__(self)
+        self.path = path
+
+    def _get_model_ids(self):  # ids listed in done_eos.json whose treated.h5 exists under path
+        if self.path is None:
+            path = self.get_output_dir()
+        else:
+            path = self.path
+        if self.is_predict():  # in predict mode done_eos.json is read from the trained dir
+            path_trained = self.get_trained_dir()
+        else:
+            path_trained = path
+        with open(
+            os.path.join(path_trained, DESCRIPTORS_SUBFOLDER, "done_eos.json"), "r"
+        ) as f:
+            model_ids = list(json.load(f))
+        model_ids_successful = []
+        for model_id in model_ids:
+            if os.path.isfile(
+                os.path.join(path, DESCRIPTORS_SUBFOLDER, model_id, "treated.h5")
+            ):
+                model_ids_successful += [model_id]
+        return model_ids_successful
+
+    def run(self, time_budget_sec=None):  # splits the budget evenly, at least 1 second per model
+        model_ids = self._get_model_ids()
+        if time_budget_sec is not None and model_ids:  # empty list would divide by zero
+            tbs = max(int(time_budget_sec / len(model_ids)), 1)
+        else:
+            tbs = None
+        for model_id in model_ids:
+            estimator = IndividualEstimator(path=self.path, model_id=model_id)
+            estimator.run(time_budget_sec=tbs)
diff --git a/zairachem/estimators/from_individual_full_descriptors_tabpfn/performance.py b/zairachem/estimators/from_individual_full_descriptors_tabpfn/performance.py
new file mode 100644
index 00000000..d067456f
--- /dev/null
+++ b/zairachem/estimators/from_individual_full_descriptors_tabpfn/performance.py
@@ -0,0 +1,233 @@
+import os
+import json
+import numpy as np
+import pandas as pd
+import joblib
+import collections
+
+from sklearn import metrics
+
+from .. import Y_HAT_FILE
+from ... import ZairaBase
+
+from . import ESTIMATORS_FAMILY_SUBFOLDER
+from ...vars import (
+    DATA_SUBFOLDER,
+    DESCRIPTORS_SUBFOLDER,
+    ESTIMATORS_SUBFOLDER,
+    DATA_FILENAME,
+)
+
+from .. import CLF_REPORT_FILENAME, REG_REPORT_FILENAME
+
+
+class BasePerformance(ZairaBase):  # shared loader for the stored predictions of one model
+    def __init__(self, path=None, model_id=None):
+        ZairaBase.__init__(self)
+        if path is None:  # fall back to the output directory when no path is given
+            self.path = self.get_output_dir()
+        else:
+            self.path = path
+        self.model_id = model_id
+
+    def _get_y_hat_dict(self):  # returns whatever object was dumped to Y_HAT_FILE for this model
+        return joblib.load(
+            os.path.join(
+                self.path,
+                ESTIMATORS_SUBFOLDER,
+                ESTIMATORS_FAMILY_SUBFOLDER,
+                self.model_id,
+                Y_HAT_FILE,
+            )
+        )
+
+
+class ClassificationPerformance(BasePerformance):  # classification metrics for one model
+    def __init__(self, path, model_id):
+        BasePerformance.__init__(self, path=path, model_id=model_id)
+        self.results = self._get_y_hat_dict()
+        self._prefix = self._get_prefix()
+        self.results = self.results[self._prefix]  # narrow to the selected task's entries
+
+    def _get_prefix(self):  # first result key containing "clf_"; None when there is none
+        for c in list(self.results.keys()):
+            if "clf_" in c:
+                return c
+
+    def _try_metric(self, fun, t, p):  # fun(t, p) as a float, or None when it raises
+        try:
+            return float(fun(t, p))
+        except Exception:  # not a bare except: let KeyboardInterrupt/SystemExit through
+            return None
+
+    def _calculate(self, key):  # JSON-serialisable report for one entry of self.results
+        r = self.results[key]
+        y_true = np.array(r["y"])
+        y_pred = np.array(r["y_hat"])
+        b_pred = np.array(r["b_hat"])
+        try:
+            confu = metrics.confusion_matrix(y_true, b_pred, labels=[0, 1])
+        except Exception:  # -1 placeholders when the confusion matrix cannot be built
+            confu = np.array([[-1, -1], [-1, -1]])
+        report = {
+            "roc_auc_score": self._try_metric(metrics.roc_auc_score, y_true, y_pred),
+            "precision_score": self._try_metric(
+                metrics.precision_score, y_true, b_pred
+            ),
+            "recall_score": self._try_metric(metrics.recall_score, y_true, b_pred),
+            "tp": int(confu[1, 1]),
+            "tn": int(confu[0, 0]),
+            "fp": int(confu[0, 1]),
+            "fn": int(confu[1, 0]),
+            "y_true": [int(y) for y in y_true],
+            "y_pred": [float(y) for y in y_pred],
+            "b_pred": [int(y) for y in b_pred],
+        }
+        return report
+
+    def calculate(self):  # OrderedDict: result key -> report from _calculate
+        report = collections.OrderedDict()
+        for k in self.results.keys():
+            report[k] = self._calculate(k)
+        return report
+
+
+class RegressionPerformance(BasePerformance):  # regression metrics for one model
+    def __init__(self, path, model_id):
+        BasePerformance.__init__(self, path=path, model_id=model_id)
+        self.results = self._get_y_hat_dict()
+        self._prefix = self._get_prefix()
+        self.results = self.results[self._prefix]  # narrow to the selected task's entries
+
+    def _get_prefix(self):  # first result key containing "reg_"; None when there is none
+        for c in list(self.results.keys()):
+            if "reg_" in c:
+                return c
+
+    def _calculate(self, key):  # JSON-serialisable report for one entry of self.results
+        r = self.results[key]
+        y_true = np.array(r["y"])
+        y_pred = np.array(r["y_hat"])
+        report = {
+            "r2_score": float(metrics.r2_score(y_true, y_pred)),
+            "mean_absolute_error": float(metrics.mean_absolute_error(y_true, y_pred)),
+            "mean_squared_error": float(metrics.mean_squared_error(y_true, y_pred)),
+            "y_true": [float(y) for y in y_true],
+            "y_pred": [float(y) for y in y_pred],
+        }
+        return report
+
+    def calculate(self):  # OrderedDict: result key -> report from _calculate
+        report = collections.OrderedDict()
+        for k in self.results.keys():
+            report[k] = self._calculate(k)
+        return report
+
+
+class IndividualPerformanceReporter(ZairaBase):  # writes the clf/reg report JSON files of one model
+    def __init__(self, path=None, model_id=None):
+        ZairaBase.__init__(self)
+        if path is None:
+            self.path = self.get_output_dir()
+        else:
+            self.path = path
+        self.has_tasks = self._has_tasks()
+        if self._has_clf_tasks():
+            self.clf = ClassificationPerformance(path=path, model_id=model_id)  # raw path; None is resolved by BasePerformance
+        else:
+            self.clf = None
+        if self._has_reg_tasks():
+            self.reg = RegressionPerformance(path=path, model_id=model_id)  # raw path; None is resolved by BasePerformance
+        else:
+            self.reg = None
+        self.model_id = model_id
+
+    def _has_tasks(self):  # True when any data column name contains "clf_" or "reg_"
+        df = pd.read_csv(os.path.join(self.path, DATA_SUBFOLDER, DATA_FILENAME))
+        for c in list(df.columns):
+            if "clf_" in c or "reg_" in c:
+                return True
+        return False
+
+    def _has_reg_tasks(self):  # True for a "reg_" column without "_skip" or "_aux" in its name
+        df = pd.read_csv(os.path.join(self.path, DATA_SUBFOLDER, DATA_FILENAME))
+        for c in list(df.columns):
+            if "reg_" in c and "_skip" not in c and "_aux" not in c:
+                return True
+        return False
+
+    def _has_clf_tasks(self):  # True for a "clf_" column without "_skip" or "_aux" in its name
+        df = pd.read_csv(os.path.join(self.path, DATA_SUBFOLDER, DATA_FILENAME))
+        for c in list(df.columns):
+            if "clf_" in c and "_skip" not in c and "_aux" not in c:
+                return True
+        return False
+
+    def run(self):  # no-op when the data file has no task columns
+        if not self.has_tasks:
+            return
+        if self.clf is not None:
+            clf_rep = self.clf.calculate()
+            with open(
+                os.path.join(
+                    self.path,
+                    ESTIMATORS_SUBFOLDER,
+                    ESTIMATORS_FAMILY_SUBFOLDER,
+                    self.model_id,
+                    CLF_REPORT_FILENAME,
+                ),
+                "w",
+            ) as f:
+                json.dump(clf_rep, f, indent=4)
+        if self.reg is not None:
+            reg_rep = self.reg.calculate()
+            with open(
+                os.path.join(
+                    self.path,
+                    ESTIMATORS_SUBFOLDER,
+                    ESTIMATORS_FAMILY_SUBFOLDER,
+                    self.model_id,
+                    REG_REPORT_FILENAME,
+                ),
+                "w",
+            ) as f:
+                json.dump(reg_rep, f, indent=4)
+
+
+class PerformanceReporter(ZairaBase):  # runs IndividualPerformanceReporter for every model with predictions
+    def __init__(self, path=None):
+        ZairaBase.__init__(self)
+        self.path = path
+
+    def _get_model_ids(self):  # ids listed in done_eos.json that have a y_hat.joblib under path
+        if self.path is None:
+            path = self.get_output_dir()
+        else:
+            path = self.path
+        if self.is_predict():  # in predict mode done_eos.json is read from the trained dir
+            path_trained = self.get_trained_dir()
+        else:
+            path_trained = path
+        with open(
+            os.path.join(path_trained, DESCRIPTORS_SUBFOLDER, "done_eos.json"), "r"
+        ) as f:
+            model_ids = list(json.load(f))
+        model_ids_successful = []
+        for model_id in model_ids:
+            if os.path.isfile(  # keep only models whose estimator wrote its output
+                os.path.join(
+                    path,
+                    ESTIMATORS_SUBFOLDER,
+                    ESTIMATORS_FAMILY_SUBFOLDER,
+                    model_id,
+                    "y_hat.joblib",
+                )
+            ):
+                model_ids_successful += [model_id]
+        return model_ids_successful
+
+    def run(self):  # writes the performance reports model by model
+        model_ids = self._get_model_ids()
+        for model_id in model_ids:
+            p = IndividualPerformanceReporter(path=self.path, model_id=model_id)
+            p.run()
diff --git a/zairachem/estimators/from_individual_full_descriptors_tabpfn/pipe.py b/zairachem/estimators/from_individual_full_descriptors_tabpfn/pipe.py
new file mode 100644
index 00000000..8e397fbf
--- /dev/null
+++ b/zairachem/estimators/from_individual_full_descriptors_tabpfn/pipe.py
@@ -0,0 +1,15 @@
+from .estimate import Estimator
+from .assemble import OutcomeAssembler
+from .performance import PerformanceReporter
+
+
+class IndividualFullDescriptorTabPFNPipeline(object):  # estimate -> assemble -> report, in that order
+    def __init__(self, path):
+        self.e = Estimator(path=path)
+        self.a = OutcomeAssembler(path=path)
+        self.p = PerformanceReporter(path=path)
+
+    def run(self, time_budget_sec=None):
+        self.e.run(time_budget_sec=time_budget_sec)  # only the estimator takes the time budget
+        self.a.run()
+        self.p.run()
diff --git a/zairachem/estimators/pipe.py b/zairachem/estimators/pipe.py
index 1ff45b17..fe658a71 100644
--- a/zairachem/estimators/pipe.py
+++ b/zairachem/estimators/pipe.py
@@ -7,6 +7,9 @@
 from .from_classic.pipe import ClassicPipeline
 from .from_fingerprint.pipe import FingerprintPipeline
 from .from_individual_full_descriptors.pipe import IndividualFullDescriptorPipeline
+from .from_individual_full_descriptors_tabpfn.pipe import (
+    IndividualFullDescriptorTabPFNPipeline,
+)
 from .from_manifolds.pipe import ManifoldPipeline
 from .from_reference_embedding.pipe import ReferenceEmbeddingPipeline
 from .from_ersilia_embedding.pipe import EosceEmbeddingPipeline
@@ -82,6 +85,19 @@ def _individual_estimator_pipeline(self, time_budget_sec):
             p.run(time_budget_sec=time_budget_sec)
             step.update()
 
+    def _individual_estimator_tabpfn_pipeline(self, time_budget_sec):  # TabPFN step over individual descriptors
+        if self.is_lazy():
+            self.logger.info("Lazy mode skips individual descriptors with tabpfn")
+            return
+        if "tabpfn-individual-descriptors" not in self._estimators_to_use:  # estimator not selected
+            return
+        step = PipelineStep("individual_estimator_pipeline_tabpfn", self.output_dir)
+        if not step.is_done():  # skip work already recorded as done for this step
+            self.logger.debug("Running individual estimator pipeline with tabpfn")
+            p = IndividualFullDescriptorTabPFNPipeline(path=self.path)
+            p.run(time_budget_sec=time_budget_sec)
+            step.update()
+
     def _manifolds_pipeline(self, time_budget_sec):
         if self.is_lazy():
             self.logger.info("Lazy mode skips manifolds")
@@ -144,6 +160,7 @@ def _simple_evaluation(self):
     def run(self, time_budget_sec=None):
         self._classic_estimator_pipeline(time_budget_sec)
         self._fingerprint_estimator_pipeline(time_budget_sec)
+        self._individual_estimator_tabpfn_pipeline(time_budget_sec)
         self._individual_estimator_pipeline(time_budget_sec)
         self._manifolds_pipeline(time_budget_sec)
         self._reference_pipeline(time_budget_sec)
diff --git a/zairachem/tools/fpsim2/FPSim2/tests/test_io.py b/zairachem/tools/fpsim2/FPSim2/tests/test_io.py
index 8e681658..3fc45548 100644
--- a/zairachem/tools/fpsim2/FPSim2/tests/test_io.py
+++ b/zairachem/tools/fpsim2/FPSim2/tests/test_io.py
@@ -62,14 +62,17 @@ def test_get_mol_supplier():
 
 
 def test_get_bounds_range():
-    assert get_bounds_range(
-        query=fps[0],
-        threshold=0.7,
-        a=0,
-        b=0,
-        ranges=popcnt_bins,
-        search_type="tanimoto",
-    ) == (0, 10)
+    assert (
+        get_bounds_range(
+            query=fps[0],
+            threshold=0.7,
+            a=0,
+            b=0,
+            ranges=popcnt_bins,
+            search_type="tanimoto",
+        )
+        == (0, 10)
+    )
 
 
 def test_build_fp():
diff --git a/zairachem/tools/macest/macest/classification/models.py b/zairachem/tools/macest/macest/classification/models.py
index 82874d7f..ef8b5381 100644
--- a/zairachem/tools/macest/macest/classification/models.py
+++ b/zairachem/tools/macest/macest/classification/models.py
@@ -207,7 +207,7 @@ def calc_dist_to_neighbours(
                     x_star, k=self._num_neighbours, num_threads=num_threads_available
                 )
             )
-            class_dist = neighbours[:, 1, :].clip(min=10**-15)
+            class_dist = neighbours[:, 1, :].clip(min=10 ** -15)
             class_ind = neighbours[:, 0, :].astype(int)
             if self.training_preds_by_class is None:
                 raise ValueError("training_preds_by_class has already been cached")
@@ -249,7 +249,7 @@ def calc_linear_distance_error_func(
         :return: A weighted sum of average error and average distance
         """
         dist = self._alpha * np.average(
-            local_distance.clip(min=10**-15),
+            local_distance.clip(min=10 ** -15),
             weights=np.arange(local_distance.shape[1], 0, -1),
             axis=1,
         )
@@ -277,7 +277,7 @@ def predict_proba(
             class_dist, _, class_error = self.calc_dist_to_neighbours(x_star, cls)
 
             dist, error = self.calc_linear_distance_error_func(class_dist, class_error)
-            av_dist_func[i, :] = dist.clip(min=10**-10) + error
+            av_dist_func[i, :] = dist.clip(min=10 ** -10) + error
         relative_conf = self._calc_relative_distance_softmax_normalisation(av_dist_func)
         if change_conflicts:
             relative_conf = self._renormalise_conf_with_empirical_constant(
@@ -308,7 +308,7 @@ def predict_confidence_of_point_prediction(
 
         point_prediction_confidence = class_confidence[
             np.arange(len(class_confidence)), point_prediction
-        ].clip(max=1 - 10**-15)
+        ].clip(max=1 - 10 ** -15)
         return point_prediction_confidence
 
     def _calc_relative_distance_softmax_normalisation(
diff --git a/zairachem/tools/macest/macest/model_selection.py b/zairachem/tools/macest/macest/model_selection.py
index 8579c293..a137997b 100644
--- a/zairachem/tools/macest/macest/model_selection.py
+++ b/zairachem/tools/macest/macest/model_selection.py
@@ -32,7 +32,7 @@ def __init__(
         self.shuffle = shuffle
         self.random_state = random_state
         self.pp_train_graph_cal_split = pp_train_graph_cal_split
-        if abs(np.array(self.pp_train_graph_cal_split).sum() - 1.0) > 10**-6:
+        if abs(np.array(self.pp_train_graph_cal_split).sum() - 1.0) > 10 ** -6:
             raise ValueError("split of training data must sum to 1")
 
     def split(
diff --git a/zairachem/tools/macest/macest/regression/models.py b/zairachem/tools/macest/macest/regression/models.py
index f82d095d..90c99910 100644
--- a/zairachem/tools/macest/macest/regression/models.py
+++ b/zairachem/tools/macest/macest/regression/models.py
@@ -235,7 +235,7 @@ def calc_error_weighted_dist(self, x_star: np.ndarray) -> np.ndarray:
                 "Need to remove pre-cached training neighbour data from training"
             )
 
-        error_weighted_poly = self._alpha * error_weighted_dist**self._beta
+        error_weighted_poly = self._alpha * error_weighted_dist ** self._beta
         return error_weighted_poly
 
     def std_on_y_star(self, x_star: np.ndarray) -> np.ndarray:
@@ -326,7 +326,7 @@ def calculate_prediction_interval_width(
         return np.diff(intervals)
 
     def sample_prediction(
-        self, x_star: np.ndarray, nsamples: int = 10**3
+        self, x_star: np.ndarray, nsamples: int = 10 ** 3
     ) -> np.ndarray:
         """
         Draw samples from any predicted distribution to get a distribution of predictions.
diff --git a/zairachem/tools/macest/macest/regression/plots.py b/zairachem/tools/macest/macest/regression/plots.py
index 191f586f..9179b400 100644
--- a/zairachem/tools/macest/macest/regression/plots.py
+++ b/zairachem/tools/macest/macest/regression/plots.py
@@ -58,7 +58,7 @@ def plot_pred_with_conf(
 
     :return: None
     """
-    pred_dist = conf_model.sample_prediction(x_star, nsamples=10**4)
+    pred_dist = conf_model.sample_prediction(x_star, nsamples=10 ** 4)
 
     point_pred = np.mean(pred_dist)
     plt.figure(figsize=(12, 6))
diff --git a/zairachem/tools/molmap/bidd-molmap/molmap/model/cbks2.py b/zairachem/tools/molmap/bidd-molmap/molmap/model/cbks2.py
index 870ef037..f831578e 100644
--- a/zairachem/tools/molmap/bidd-molmap/molmap/model/cbks2.py
+++ b/zairachem/tools/molmap/bidd-molmap/molmap/model/cbks2.py
@@ -11,7 +11,7 @@
 
 def r2_score(x, y):
     pcc, _ = pearsonr(x, y)
-    return pcc**2
+    return pcc ** 2
 
 
 def prc_auc_score(y_true, y_score):
diff --git a/zairachem/vars.py b/zairachem/vars.py
index e0f0fc39..bc6812c1 100644
--- a/zairachem/vars.py
+++ b/zairachem/vars.py
@@ -41,18 +41,19 @@
 # Ersilia Model Hub
 
 ERSILIA_HUB_DEFAULT_MODELS = [
-    #"morgan-counts",
+    # "morgan-counts",
     "cc-signaturizer",
     "image-mol-embeddings",
     "rdkit-fingerprint",
     "grover-embedding",
     "mordred",
-] 
+]
 
 DEFAULT_ESTIMATORS = [
     "baseline-classic",
     "baseline-fingerprint",
     "flaml-individual-descriptors",
+    "tabpfn-individual-descriptors",
     "autogluon-manifolds",
     "kerastuner-reference-embedding",
     "kerastuner-eosce-embedding",

From 340b52a2827008eed00b6c1f51ab9273703e1ffd Mon Sep 17 00:00:00 2001
From: gemmaturon <gemma@ersilia.io>
Date: Sun, 9 Apr 2023 11:16:41 +0200
Subject: [PATCH 5/8] small bugfix

---
 install_linux.sh                 |  6 ++++++
 zairachem/automl/binarytabpfn.py | 21 +++++++++++++++------
 2 files changed, 21 insertions(+), 6 deletions(-)

diff --git a/install_linux.sh b/install_linux.sh
index c53f2a0a..abfd1fce 100644
--- a/install_linux.sh
+++ b/install_linux.sh
@@ -54,5 +54,11 @@ python3 -m pip install git+https://github.com/ersilia-os/lazy-qsar.git
 # install melloddy-tuner
 python3 -m pip install git+https://github.com/melloddy/MELLODDY-TUNER.git@2.1.3
 
+# install tabpfn
+python3 -m pip install tabpfn==0.1.8
+
+# install imblearn
+python3 -m pip install imbalanced-learn==0.10.1
+
 # install zairachem
 python3 -m pip install -e .
diff --git a/zairachem/automl/binarytabpfn.py b/zairachem/automl/binarytabpfn.py
index c24a3b87..aad81454 100644
--- a/zairachem/automl/binarytabpfn.py
+++ b/zairachem/automl/binarytabpfn.py
@@ -16,12 +16,21 @@ def __init__(self, device="cpu", N_ensemble_configurations=4):
         self.max_samples = 1000
 
     def _get_balanced_datasets(self, X, y):
-        smp = SMOTETomek(sampling_strategy="auto")
-        X_0, y_0 = smp.fit_resample(X, y)
-        smp = KMeansSMOTE(sampling_strategy="auto")
-        X_1, y_1 = smp.fit_resample(X, y)
-        smp = EditedNearestNeighbours(sampling_strategy="auto")
-        X_2, y_2 = smp.fit_resample(X, y)
+        try:  # each resampler is best-effort: on failure the data is used unchanged
+            smp = SMOTETomek(sampling_strategy="auto")
+            X_0, y_0 = smp.fit_resample(X, y)
+        except Exception:  # not a bare except: let KeyboardInterrupt/SystemExit through
+            X_0, y_0 = X, y
+        try:
+            smp = KMeansSMOTE(sampling_strategy="auto")
+            X_1, y_1 = smp.fit_resample(X, y)
+        except Exception:  # same fallback as above
+            X_1, y_1 = X, y
+        try:
+            smp = EditedNearestNeighbours(sampling_strategy="auto")
+            X_2, y_2 = smp.fit_resample(X, y)
+        except Exception:  # same fallback as above
+            X_2, y_2 = X, y
         results = [(X_0, y_0), (X_1, y_1), (X_2, y_2)]
         return results
 

From 5abd23780f9e38fab4583194c8a295c9bb10bdf5 Mon Sep 17 00:00:00 2001
From: gemmaturon <gemma@ersilia.io>
Date: Fri, 14 Apr 2023 16:55:22 +0200
Subject: [PATCH 6/8] molfeat-chemgpt

---
 zairachem/vars.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/zairachem/vars.py b/zairachem/vars.py
index bc6812c1..fc229d10 100644
--- a/zairachem/vars.py
+++ b/zairachem/vars.py
@@ -43,7 +43,7 @@
 ERSILIA_HUB_DEFAULT_MODELS = [
     # "morgan-counts",
     "cc-signaturizer",
-    "image-mol-embeddings",
+    "molfeat-chemgpt",
     "rdkit-fingerprint",
     "grover-embedding",
     "mordred",

From 8af2372225e4a955619f006100e311bad436eb85 Mon Sep 17 00:00:00 2001
From: gemmaturon <gemma@ersilia.io>
Date: Mon, 24 Apr 2023 19:31:42 +0200
Subject: [PATCH 7/8] bagger not constrained to 1

---
 zairachem/pool/bagger.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/zairachem/pool/bagger.py b/zairachem/pool/bagger.py
index e1a3e7c9..759248eb 100644
--- a/zairachem/pool/bagger.py
+++ b/zairachem/pool/bagger.py
@@ -230,7 +230,7 @@ def _predict_scaling(self, df_X):
                 w = w + 1
             y_hats += [np.average(Y_hat, axis=1, weights=w)]
         y_hats += [np.mean(Y_hat, axis=1)]
-        y_hats = np.clip(np.array(y_hats), 0, 1)
+        y_hats = np.array(y_hats)
         y_hat = np.mean(y_hats, axis=0)
         return y_hat
 

From ef4c0135bfffe55d1a397684f32e96db33ca212f Mon Sep 17 00:00:00 2001
From: gemmaturon <gemma@ersilia.io>
Date: Wed, 26 Apr 2023 19:48:03 +0200
Subject: [PATCH 8/8] new rescaling

---
 zairachem/pool/bagger.py | 40 +++++++++++++++++++++-------------------
 1 file changed, 21 insertions(+), 19 deletions(-)

diff --git a/zairachem/pool/bagger.py b/zairachem/pool/bagger.py
index 759248eb..535c33d0 100644
--- a/zairachem/pool/bagger.py
+++ b/zairachem/pool/bagger.py
@@ -5,9 +5,10 @@
 import joblib
 import h5py
 import collections
+from scipy.special import expit
 from sklearn.linear_model import LogisticRegressionCV, LinearRegression
 from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
-from sklearn.preprocessing import RobustScaler
+from sklearn.preprocessing import RobustScaler, PowerTransformer
 from sklearn.metrics import roc_curve, auc, r2_score
 
 from .. import ZairaBase
@@ -155,8 +156,8 @@ def save(self, filename):
 
 
 class PoolClassifier(object):
-    def __init__(self, path, mode="scaling"):
-        assert mode in ["scaling", "median", "model"]
+    def __init__(self, path, mode="weighting"):
+        assert mode in ["weighting", "median", "model"]
         self.path = path
         if not os.path.exists(self.path):
             os.makedirs(self.path, exist_ok=True)
@@ -168,7 +169,8 @@ def _get_model_filename(self, n):
     def _fit_just_median(self, df_X, df_y):
         return np.median(np.array(df_X), axis=1)
 
-    def _fit_scaling(self, df_X, df_y):
+    def _fit_weighting(self, df_X, df_y):
+        y = np.array(df_y).ravel()
         cols = list(df_X.columns)
         X = np.array(df_X)
         p25 = np.percentile(X.ravel(), 25)
@@ -177,10 +179,13 @@ def _fit_scaling(self, df_X, df_y):
         scale = (p25, p50, p75)
         for c in cols:
             X = np.array(df_X[c]).reshape(-1, 1)
-            mdl = RobustScaler()
-            mdl.fit(X)
+            mdl0 = PowerTransformer()
+            mdl0.fit(X)
+            X = mdl0.transform(X)
+            mdl1 = LogisticRegressionCV()
+            mdl1.fit(X, y)
             filename = self._get_model_filename(c)
-            joblib.dump(mdl, filename)
+            joblib.dump((mdl0, mdl1), filename)
         filename = self._get_model_filename("overall")
         joblib.dump(scale, filename)
         filename = self._get_model_filename("weighting")
@@ -188,7 +193,7 @@ def _fit_scaling(self, df_X, df_y):
         ws.distance_to_leads()
         ws.importance()
         ws.save(filename)
-        return self._predict_scaling(df_X)
+        return self._predict_weighting(df_X)
 
     def _fit_model(self, df_X, df_y):
         y = np.array(df_y).ravel()
@@ -204,22 +209,19 @@ def _fit_model(self, df_X, df_y):
     def _predict_just_median(self, df_X):
         return np.median(np.array(df_X), axis=1)
 
-    def _predict_scaling(self, df_X):
+    def _predict_weighting(self, df_X):
         cols = list(df_X.columns)
         Y_hat = []
         for c in cols:
             filename = self._get_model_filename(c)
             if os.path.exists(filename):
-                mdl = joblib.load(filename)
+                mdl0, mdl1 = joblib.load(filename)
                 X = np.array(df_X[c]).reshape(-1, 1)
-                y_hat = mdl.transform(X).ravel()
+                X = mdl0.transform(X)
+                y_hat = mdl1.predict_proba(X)[:,1]
                 Y_hat += [y_hat]
         Y_hat = np.array(Y_hat).T
         filename = self._get_model_filename("overall")
-        scale = joblib.load(filename)
-        iqr = scale[-1] - scale[0]
-        med = scale[1]
-        Y_hat = Y_hat * iqr + med
         filename = self._get_model_filename("weighting")
         weights = joblib.load(filename)
         wvals = weights["weights"]
@@ -248,16 +250,16 @@ def _predict_model(self, df_X):
         return np.median(Y_hat, axis=1)
 
     def fit(self, df_X, df_y):
-        if self.mode == "scaling":
-            return self._fit_scaling(df_X, df_y)
+        if self.mode == "weighting":
+            return self._fit_weighting(df_X, df_y)
         if self.mode == "median":
             return self._fit_just_median(df_X, df_y)
-        if self.mdoe == "model":
+        if self.mode == "model":
             return self._fit_model(df_X, df_y)
 
     def predict(self, df_X):
-        if self.mode == "scaling":
-            return self._predict_scaling(df_X)
+        if self.mode == "weighting":
+            return self._predict_weighting(df_X)
         if self.mode == "median":
             return self._predict_just_median(df_X)
         if self.mode == "model":