-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Feature: data packages + several changes for NeurIPS camera ready (#281)
* Implements data packages and TDC data packages * Adds data packages for foldx and rasp * Adds raw data for the foldx and rasp data packages * Updates the set-up to include txt files * Updates data packages for protein tasks * Modifies the message passing for ehrlich holo re n initial points * Runs formatting * Adds a hotfix for ehrlich holo env * Passes a numpy array instead * Passes numpy arrays for the initial solutions * Goes back to having the initial sampler be a function * Adds test for the random data package * removes an outdated comment in test * bumps version
- Loading branch information
1 parent
51ebfc4
commit 4a9f06e
Showing
68 changed files
with
28,585 additions
and
82 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" | |
|
||
[project] | ||
name = "poli" | ||
version = "1.0.0.dev12" | ||
version = "1.0.0.dev13" | ||
description = "poli, a library of discrete objective functions" | ||
readme = "README.md" | ||
authors = [{name="Miguel González-Duque", email="[email protected]"}, {name="Simon Bartels"}] | ||
|
@@ -79,7 +79,7 @@ profile = "black" | |
exclude = ["src/poli/core/util/proteins/rasp/inner_rasp", "src/poli/objective_repository/gfp_cbas"] | ||
|
||
[tool.bumpversion] | ||
current_version = "1.0.0.dev12" | ||
current_version = "1.0.0.dev13" | ||
parse = """(?x) | ||
(?P<major>0|[1-9]\\d*)\\. | ||
(?P<minor>0|[1-9]\\d*)\\. | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,6 @@ | ||
[metadata] | ||
name = poli | ||
version = "1.0.0.dev12" | ||
version = "1.0.0.dev13" | ||
author = Miguel González-Duque | ||
author_email = [email protected] | ||
description = A library of discrete objective functions | ||
|
@@ -22,4 +22,4 @@ include_package_data = True | |
where=src | ||
|
||
[options.package_data] | ||
* = *.sht, *.yml, *.jar, *.pt, *.json, *.pdb, *.npz, *.npy, *.h5, *.fa, *.csv | ||
* = *.sht, *.yml, *.jar, *.pt, *.json, *.pdb, *.npz, *.npy, *.h5, *.fa, *.csv, *.txt |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
from .random_molecules_data_package import RandomMoleculesDataPackage | ||
|
||
__all__ = [ | ||
"RandomMoleculesDataPackage", | ||
] |
Binary file not shown.
74 changes: 74 additions & 0 deletions
74
src/poli/core/chemistry/data_packages/random_molecules_data_package.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
""" | ||
This script contains a data package that is frequently used in | ||
small molecule optimization: sampling random molecules from Zinc250k. | ||
""" | ||
|
||
from __future__ import annotations | ||
|
||
from pathlib import Path | ||
from typing import Callable, Literal | ||
|
||
import numpy as np | ||
|
||
from poli.core.data_package import DataPackage | ||
from poli.core.util.chemistry.string_to_molecule import translate_smiles_to_selfies | ||
|
||
|
||
class RandomMoleculesDataPackage(DataPackage):
    """
    Unsupervised data package with random molecules from Zinc250k.

    We subsampled 5000 smiles from Zinc250k and stored them in a numpy file
    (``five_thousand_smiles.npz``, next to this module), and this data
    package samples ``n_molecules`` from this set without replacement.
    No supervised data is provided (``supervised_data`` is ``None``).

    Parameters
    ----------
    string_representation : Literal["SMILES", "SELFIES"]
        The string representation of the molecules (compared
        case-insensitively). For "SELFIES", the stored SMILES are
        translated before sampling.
    n_molecules : int, optional
        The number of molecules to sample from the dataset, by default 10.
        Must be at most 5000.
    seed : int, optional
        The seed for the random number generator, by default None.
        If provided, we seed numpy's global random number generator
        with this seed.
    tokenize_with : Callable[[str], list[str]], optional
        A function that tokenizes the molecules, by default None.
        If provided, we tokenize the molecules with this function and
        right-pad the token lists with the empty string ``""`` so that
        they stack into a rectangular array.
    """

    def __init__(
        self,
        string_representation: Literal["SMILES", "SELFIES"],
        n_molecules: int = 10,
        seed: int | None = None,
        tokenize_with: Callable[[str], list[str]] | None = None,
    ):
        assert (
            n_molecules <= 5000
        ), "This data package has been implemented for up to 5000 random molecules."
        CHEMISTRY_DATA_PACKAGES_DIR = Path(__file__).parent
        # NOTE(review): allow_pickle=True unpickles arbitrary objects. This is
        # acceptable only because the file is shipped alongside this module;
        # never point this at an externally supplied file.
        five_thousand_molecules = np.load(
            CHEMISTRY_DATA_PACKAGES_DIR / "five_thousand_smiles.npz",
            allow_pickle=True,
        )["x"]

        if string_representation.upper() == "SELFIES":
            five_thousand_molecules = np.array(
                translate_smiles_to_selfies(five_thousand_molecules, strict=True)
            )

        if seed is not None:
            # Seeding the global generator is the documented behaviour.
            np.random.seed(seed)

        unsupervised_data = np.random.choice(
            five_thousand_molecules, (n_molecules,), replace=False
        )
        supervised_data = None

        if tokenize_with is not None:
            tokenized = [
                list(tokenize_with(mol))
                for mol in unsupervised_data
                if mol is not None
            ]
            # Molecules generally tokenize to different lengths; a ragged
            # nested list cannot be turned into an array (NumPy >= 1.24
            # raises), so pad every row to the longest one.
            padding_token = ""
            max_length = max((len(tokens) for tokens in tokenized), default=0)
            unsupervised_data = np.array(
                [
                    tokens + [padding_token] * (max_length - len(tokens))
                    for tokens in tokenized
                ]
            )

        super().__init__(unsupervised_data, supervised_data)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
from poli.core.chemistry.data_packages import RandomMoleculesDataPackage | ||
from poli.core.chemistry.tdc_black_box import TDCBlackBox | ||
from poli.core.problem import Problem | ||
|
||
|
||
class TDCProblem(Problem):
    """
    Problem wrapping a TDC black box.

    When no data package is supplied, a ``RandomMoleculesDataPackage`` is
    built using the black box's ``string_representation``. All other
    arguments are forwarded unchanged to ``Problem``.
    """

    def __init__(
        self, black_box: TDCBlackBox, x0, data_package=None, strict_validation=True
    ):
        # Fall back to the default molecule data package only when the
        # caller did not provide one.
        chosen_package = (
            RandomMoleculesDataPackage(black_box.string_representation)
            if data_package is None
            else data_package
        )

        super().__init__(
            black_box=black_box,
            x0=x0,
            data_package=chosen_package,
            strict_validation=strict_validation,
        )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
""" | ||
Implements a data package, containing unsupervised and | ||
supervised data. | ||
""" | ||
|
||
from __future__ import annotations | ||
|
||
import numpy as np | ||
|
||
|
||
class DataPackage:
    """
    Container for an optional unsupervised dataset and an optional
    supervised dataset (a pair of arrays).

    Either entry may be ``None``. The textual representation shows the
    shape of the unsupervised array and the shape of the first element of
    the supervised pair.
    """

    def __init__(
        self,
        unsupervised_data: np.ndarray | None,
        supervised_data: tuple[np.ndarray, np.ndarray] | None,
    ):
        self.unsupervised_data = unsupervised_data
        self.supervised_data = supervised_data

    def __repr__(self) -> str:
        # Report shapes rather than contents; missing data shows as None.
        unsupervised_data = None
        if self.unsupervised_data is not None:
            unsupervised_data = self.unsupervised_data.shape

        supervised_data = None
        if self.supervised_data is not None:
            supervised_data = self.supervised_data[0].shape

        return f"DataPackage(unsupervised_data={unsupervised_data}, supervised_data={supervised_data})"

    def __str__(self) -> str:
        return repr(self)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
from .foldx_supervised_data_packages import RFPFoldXStabilitySupervisedDataPackage | ||
from .rasp_supervised_data_packages import RFPRaspSupervisedDataPackage | ||
|
||
__all__ = [ | ||
"RFPRaspSupervisedDataPackage", | ||
"RFPFoldXStabilitySupervisedDataPackage", | ||
] |
25 changes: 25 additions & 0 deletions
25
src/poli/core/proteins/data_packages/foldx_supervised_data_packages.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
from pathlib import Path | ||
|
||
import numpy as np | ||
|
||
from poli.core.data_package import DataPackage | ||
|
||
|
||
class RFPFoldXStabilitySupervisedDataPackage(DataPackage):
    """
    Supervised data package pairing the sequences in ``rfp_sequences.txt``
    with the scores in ``rfp_foldx_scores.txt`` (both read from the
    directory containing this module).

    Each sequence is split into single characters and right-padded with
    the empty string ``""`` up to the length of the longest sequence. The
    resulting character array is used both as the unsupervised data and as
    the inputs of the supervised pair; the scores are reshaped into a
    single column.
    """

    def __init__(self):
        PROTEIN_DATA_PACKAGES_DIR = Path(__file__).parent
        # ndmin=1 keeps a one-line file iterable: without it loadtxt returns
        # a 0-d array for a single sequence.
        sequences = np.loadtxt(
            PROTEIN_DATA_PACKAGES_DIR / "rfp_sequences.txt", dtype=str, ndmin=1
        )
        foldx_scores = np.loadtxt(PROTEIN_DATA_PACKAGES_DIR / "rfp_foldx_scores.txt")
        padding_token = ""
        max_sequence_length = max(len(sequence) for sequence in sequences)
        unsupervised_data = np.array(
            [
                list(sequence) + [padding_token] * (max_sequence_length - len(sequence))
                for sequence in sequences
            ]
        )
        supervised_data = unsupervised_data, foldx_scores.reshape(-1, 1)

        super().__init__(unsupervised_data, supervised_data)
25 changes: 25 additions & 0 deletions
25
src/poli/core/proteins/data_packages/rasp_supervised_data_packages.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
from pathlib import Path | ||
|
||
import numpy as np | ||
|
||
from poli.core.data_package import DataPackage | ||
|
||
|
||
class RFPRaspSupervisedDataPackage(DataPackage):
    """
    Supervised data package pairing the sequences in ``rfp_sequences.txt``
    with the scores in ``rfp_rasp_scores.txt`` (both read from the
    directory containing this module).

    Sequences are split into characters and right-padded with ``""`` to a
    common length; the scores are reshaped into one column.
    """

    def __init__(self):
        data_dir = Path(__file__).parent
        raw_sequences = np.loadtxt(data_dir / "rfp_sequences.txt", dtype=str)
        scores = np.loadtxt(data_dir / "rfp_rasp_scores.txt")

        # Pad every character list up to the longest sequence so the rows
        # stack into a rectangular array.
        target_length = max(map(len, raw_sequences))
        rows = []
        for raw_sequence in raw_sequences:
            characters = list(raw_sequence)
            characters.extend([""] * (target_length - len(characters)))
            rows.append(characters)
        padded_sequences = np.array(rows)

        super().__init__(
            padded_sequences,
            (padded_sequences, scores.reshape(-1, 1)),
        )
Oops, something went wrong.