Feature: data packages + several changes for NeurIPS camera ready (#281)

* Implements data packages and TDC data packages
* Adds data packages for foldx and rasp
* Adds raw data for the foldx and rasp data packages
* Updates the set-up to include txt files
* Updates data packages for protein tasks
* Modifies the message passing for ehrlich holo re n initial points
* Runs formatting
* Adds a hotfix for ehrlich holo env
* Passes a numpy array instead
* Passes numpy arrays for the initial solutions
* Goes back to having the initial sampler be a function
* Adds test for the random data package
* removes an outdated comment in test
* bumps version
MachineLearningLifeScience · Oct 28, 2024 · 4a9f06e · 4a9f06e
1 parent 51ebfc4
commit 4a9f06e
Show file tree

Hide file tree

Showing 68 changed files with 28,585 additions and 82 deletions.
diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "poli"
-version = "1.0.0.dev12"
+version = "1.0.0.dev13"
 description = "poli, a library of discrete objective functions"
 readme = "README.md"
 authors = [{name="Miguel González-Duque", email="[email protected]"}, {name="Simon Bartels"}]
@@ -79,7 +79,7 @@ profile = "black"
 exclude = ["src/poli/core/util/proteins/rasp/inner_rasp", "src/poli/objective_repository/gfp_cbas"]
 
 [tool.bumpversion]
-current_version = "1.0.0.dev12"
+current_version = "1.0.0.dev13"
 parse = """(?x)
     (?P<major>0|[1-9]\\d*)\\.
     (?P<minor>0|[1-9]\\d*)\\.

diff --git a/setup.cfg b/setup.cfg
@@ -1,6 +1,6 @@
 [metadata]
 name = poli
-version = "1.0.0.dev12"
+version = "1.0.0.dev13"
 author = Miguel González-Duque
 author_email = [email protected]
 description = A library of discrete objective functions
@@ -22,4 +22,4 @@ include_package_data = True
 where=src
 
 [options.package_data]
-* = *.sht, *.yml, *.jar, *.pt, *.json, *.pdb, *.npz, *.npy, *.h5, *.fa, *.csv
+* = *.sht, *.yml, *.jar, *.pt, *.json, *.pdb, *.npz, *.npy, *.h5, *.fa, *.csv, *.txt
diff --git a/src/poli/__init__.py b/src/poli/__init__.py
@@ -1,6 +1,6 @@
 """poli, a library for discrete black-box objective functions."""
 
-__version__ = "1.0.0.dev12"
+__version__ = "1.0.0.dev13"
 from .core.util.isolation.instancing import instance_function_as_isolated_process
 
 # from .core import get_problems

diff --git a/src/poli/core/chemistry/data_packages/__init__.py b/src/poli/core/chemistry/data_packages/__init__.py
@@ -0,0 +1,5 @@
+from .random_molecules_data_package import RandomMoleculesDataPackage
+
+__all__ = [
+    "RandomMoleculesDataPackage",
+]
diff --git a/src/poli/core/chemistry/data_packages/five_thousand_smiles.npz b/src/poli/core/chemistry/data_packages/five_thousand_smiles.npz
diff --git a/src/poli/core/chemistry/data_packages/random_molecules_data_package.py b/src/poli/core/chemistry/data_packages/random_molecules_data_package.py
@@ -0,0 +1,91 @@
+"""
+This script contains a data package that is frequently used in
+small molecule optimization: sampling random molecules from Zinc250k.
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Callable, Literal
+
+import numpy as np
+
+from poli.core.data_package import DataPackage
+from poli.core.util.chemistry.string_to_molecule import translate_smiles_to_selfies
+
+
+class RandomMoleculesDataPackage(DataPackage):
+    """
+    Returns an unsupervised data package with random molecules from Zinc250k.
+
+    We subsampled 5000 smiles from Zinc250k and stored them in a numpy file,
+    and this data package samples n_molecules from this set without
+    replacement. No supervised data is provided (supervised_data is None).
+
+    Parameters
+    ----------
+    string_representation : Literal["SMILES", "SELFIES"]
+        The string representation of the molecules (case-insensitive).
+        Any value other than "SELFIES" leaves the molecules as SMILES.
+    n_molecules : int, optional
+        The number of molecules to sample from the dataset, by default 10.
+        Must be at most 5000.
+    seed : int, optional
+        The seed for the random number generator, by default None.
+        If provided, the molecules are sampled with a dedicated generator
+        seeded with this value, leaving numpy's global random state untouched.
+        If None, numpy's global random state is used.
+    tokenize_with : Callable[[str], list[str]], optional
+        A function that tokenizes the molecules, by default None.
+        If provided, we tokenize the molecules with this function and
+        right-pad the token lists with "" to a common length.
+    """
+
+    def __init__(
+        self,
+        string_representation: Literal["SMILES", "SELFIES"],
+        n_molecules: int = 10,
+        seed: int | None = None,
+        tokenize_with: Callable[[str], list[str]] | None = None,
+    ):
+        assert (
+            n_molecules <= 5000
+        ), "This data package has been implemented for up to 5000 random molecules."
+        CHEMISTRY_DATA_PACKAGES_DIR = Path(__file__).parent
+        # NOTE: allow_pickle=True is acceptable only because this file ships
+        # with the package; never point this loader at untrusted data.
+        five_thousand_molecules = np.load(
+            CHEMISTRY_DATA_PACKAGES_DIR / "five_thousand_smiles.npz",
+            allow_pickle=True,
+        )["x"]
+
+        if string_representation.upper() == "SELFIES":
+            five_thousand_molecules_ = translate_smiles_to_selfies(
+                five_thousand_molecules,
+                strict=True,
+            )
+            five_thousand_molecules = np.array(five_thousand_molecules_)
+
+        # RandomState(seed) draws the same samples as np.random.seed(seed)
+        # followed by np.random.choice, without reseeding the global state.
+        random_state = np.random if seed is None else np.random.RandomState(seed)
+        unsupervised_data = random_state.choice(
+            five_thousand_molecules, (n_molecules,), replace=False
+        )
+        supervised_data = None
+
+        if tokenize_with is not None:
+            tokenized_molecules = [
+                tokenize_with(mol) for mol in unsupervised_data if mol is not None
+            ]
+            # Token lists usually differ in length; pad them with "" so that
+            # np.array gets a rectangular input instead of a ragged one.
+            longest = max((len(tokens) for tokens in tokenized_molecules), default=0)
+            unsupervised_data = np.array(
+                [
+                    [*tokens, *([""] * (longest - len(tokens)))]
+                    for tokens in tokenized_molecules
+                ]
+            )
+
+        super().__init__(unsupervised_data, supervised_data)
diff --git a/src/poli/core/chemistry/tdc_black_box.py b/src/poli/core/chemistry/tdc_black_box.py
@@ -100,6 +100,7 @@ def __init__(
         self.oracle_name = oracle_name
         self.alphabet = alphabet
         self.max_sequence_length = max_sequence_length
+        self.string_representation = string_representation
 
         from_smiles = string_representation.upper() == "SMILES"
         self.inner_function = get_inner_function(

diff --git a/src/poli/core/chemistry/tdc_problem.py b/src/poli/core/chemistry/tdc_problem.py
@@ -0,0 +1,28 @@
+from poli.core.chemistry.data_packages import RandomMoleculesDataPackage
+from poli.core.chemistry.tdc_black_box import TDCBlackBox
+from poli.core.problem import Problem
+
+
+class TDCProblem(Problem):
+    """
+    A problem whose black box is a TDCBlackBox.
+
+    If no data package is passed, a RandomMoleculesDataPackage is built
+    from the black box's string representation and used instead.
+    """
+
+    def __init__(
+        self, black_box: TDCBlackBox, x0, data_package=None, strict_validation=True
+    ):
+        # Default to random molecules in the black box's own representation.
+        package = (
+            RandomMoleculesDataPackage(black_box.string_representation)
+            if data_package is None
+            else data_package
+        )
+        super().__init__(
+            black_box=black_box,
+            x0=x0,
+            data_package=package,
+            strict_validation=strict_validation,
+        )
diff --git a/src/poli/core/data_package.py b/src/poli/core/data_package.py
@@ -0,0 +1,49 @@
+"""
+Implements a data package, containing unsupervised and
+supervised data.
+"""
+
+from __future__ import annotations
+
+import numpy as np
+
+
+class DataPackage:
+    """
+    Container for the data that accompanies a problem.
+
+    Parameters
+    ----------
+    unsupervised_data : np.ndarray | None
+        Inputs without labels, or None when there are none.
+    supervised_data : tuple[np.ndarray, np.ndarray] | None
+        A pair of arrays (inputs and their labels), or None when there
+        is none.
+    """
+
+    def __init__(
+        self,
+        unsupervised_data: np.ndarray | None,
+        supervised_data: tuple[np.ndarray, np.ndarray] | None,
+    ):
+        self.unsupervised_data = unsupervised_data
+        self.supervised_data = supervised_data
+
+    def __repr__(self) -> str:
+        # np.shape also accepts array-likes (e.g. nested lists), so the repr
+        # does not fail when the data is not an ndarray.
+        unsupervised_data = (
+            np.shape(self.unsupervised_data)
+            if self.unsupervised_data is not None
+            else None
+        )
+        # Only the shape of the first array of the supervised pair is shown.
+        supervised_data = (
+            np.shape(self.supervised_data[0])
+            if self.supervised_data is not None
+            else None
+        )
+        return f"DataPackage(unsupervised_data={unsupervised_data}, supervised_data={supervised_data})"
+
+    def __str__(self) -> str:
+        return self.__repr__()
diff --git a/src/poli/core/problem.py b/src/poli/core/problem.py
@@ -15,9 +15,12 @@
   allowed.
 """
 
+from __future__ import annotations
+
 import numpy as np
 
 from poli.core.abstract_black_box import AbstractBlackBox
+from poli.core.data_package import DataPackage
 from poli.core.util.algorithm_observer_wrapper import AlgorithmObserverWrapper
 from poli.core.util.default_observer import DefaultObserver
 
@@ -27,10 +30,12 @@ def __init__(
         self,
         black_box: AbstractBlackBox,
         x0: np.ndarray,
+        data_package: DataPackage | None = None,
         strict_validation: bool = True,
     ):
         self.black_box: AbstractBlackBox = black_box
         self.x0: np.ndarray = x0
+        self.data_package: DataPackage | None = data_package
         self.black_box_information = black_box.info
         if strict_validation:
             self._validate()
@@ -64,7 +69,7 @@ def info(self):
         return self.black_box.info
 
     def __str__(self):
-        return f"<Problem: {self.black_box.info.get_problem_name()} x0: {self.x0}>"
+        return f"<Problem: {self.black_box.info.get_problem_name()} x0: {self.x0} data_package: {self.data_package}>"
 
     def __repr__(self):
-        return f"<Problem: {self.black_box.info.get_problem_name()} x0: {self.x0} discrete={self.is_discrete()} deterministic={self.is_deterministic()}>"
+        return f"<Problem: {self.black_box.info.get_problem_name()} x0: {self.x0} data_package: {self.data_package} discrete={self.is_discrete()} deterministic={self.is_deterministic()}>"
diff --git a/src/poli/core/proteins/data_packages/__init__.py b/src/poli/core/proteins/data_packages/__init__.py
@@ -0,0 +1,7 @@
+from .foldx_supervised_data_packages import RFPFoldXStabilitySupervisedDataPackage
+from .rasp_supervised_data_packages import RFPRaspSupervisedDataPackage
+
+__all__ = [
+    "RFPRaspSupervisedDataPackage",
+    "RFPFoldXStabilitySupervisedDataPackage",
+]
diff --git a/src/poli/core/proteins/data_packages/foldx_supervised_data_packages.py b/src/poli/core/proteins/data_packages/foldx_supervised_data_packages.py
@@ -0,0 +1,35 @@
+from pathlib import Path
+
+import numpy as np
+
+from poli.core.data_package import DataPackage
+
+
+class RFPFoldXStabilitySupervisedDataPackage(DataPackage):
+    """
+    Supervised data package pairing the bundled RFP sequences with the
+    scores stored in "rfp_foldx_scores.txt".
+
+    Both files are read from the directory of this module. Each sequence is
+    split into characters and right-padded with "" up to the length of the
+    longest sequence; the scores are reshaped into a column.
+    """
+
+    def __init__(self):
+        PROTEIN_DATA_PACKAGES_DIR = Path(__file__).parent
+        sequences = np.loadtxt(
+            PROTEIN_DATA_PACKAGES_DIR / "rfp_sequences.txt", dtype=str
+        )
+        foldx_scores = np.loadtxt(PROTEIN_DATA_PACKAGES_DIR / "rfp_foldx_scores.txt")
+        padding_token = ""
+        max_sequence_length = max(len(sequence) for sequence in sequences)
+        unsupervised_data = np.array(
+            [
+                list(sequence) + [padding_token] * (max_sequence_length - len(sequence))
+                for sequence in sequences
+            ]
+        )
+        # The padded sequences double as the inputs of the supervised pair.
+        supervised_data = unsupervised_data, foldx_scores.reshape(-1, 1)
+
+        super().__init__(unsupervised_data, supervised_data)
diff --git a/src/poli/core/proteins/data_packages/rasp_supervised_data_packages.py b/src/poli/core/proteins/data_packages/rasp_supervised_data_packages.py
@@ -0,0 +1,30 @@
+from pathlib import Path
+
+import numpy as np
+
+from poli.core.data_package import DataPackage
+
+
+class RFPRaspSupervisedDataPackage(DataPackage):
+    """
+    Supervised data package built from "rfp_sequences.txt" and
+    "rfp_rasp_scores.txt", both read from the directory of this module.
+
+    Sequences become rows of single characters, right-padded with "" to the
+    longest sequence; scores are reshaped into a single column.
+    """
+
+    def __init__(self):
+        data_dir = Path(__file__).parent
+        sequences = np.loadtxt(data_dir / "rfp_sequences.txt", dtype=str)
+        scores = np.loadtxt(data_dir / "rfp_rasp_scores.txt").reshape(-1, 1)
+
+        # Pad every sequence with empty tokens up to the longest one.
+        longest = max(map(len, sequences))
+        rows = []
+        for sequence in sequences:
+            n_missing = longest - len(sequence)
+            rows.append(list(sequence) + [""] * n_missing)
+        tokens = np.array(rows)
+
+        super().__init__(tokens, (tokens, scores))