Skip to content

Commit

Permalink
Feature: data packages + several changes for NeurIPS camera ready (#281)
Browse files Browse the repository at this point in the history
* Implements data packages and TDC data packages

* Adds data packages for foldx and rasp

* Adds raw data for the foldx and rasp data packages

* Updates the set-up to include txt files

* Updates data packages for protein tasks

* Modifies the message passing for Ehrlich holo regarding the n initial points

* Runs formatting

* Adds a hotfix for ehrlich holo env

* Passes a numpy array instead

* Passes numpy arrays for the initial solutions

* Goes back to having the initial sampler be a function

* Adds test for the random data package

* removes an outdated comment in test

* bumps version
  • Loading branch information
miguelgondu authored Oct 28, 2024
1 parent 51ebfc4 commit 4a9f06e
Show file tree
Hide file tree
Showing 68 changed files with 28,585 additions and 82 deletions.
4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "poli"
version = "1.0.0.dev12"
version = "1.0.0.dev13"
description = "poli, a library of discrete objective functions"
readme = "README.md"
authors = [{name="Miguel González-Duque", email="[email protected]"}, {name="Simon Bartels"}]
Expand Down Expand Up @@ -79,7 +79,7 @@ profile = "black"
exclude = ["src/poli/core/util/proteins/rasp/inner_rasp", "src/poli/objective_repository/gfp_cbas"]

[tool.bumpversion]
current_version = "1.0.0.dev12"
current_version = "1.0.0.dev13"
parse = """(?x)
(?P<major>0|[1-9]\\d*)\\.
(?P<minor>0|[1-9]\\d*)\\.
Expand Down
4 changes: 2 additions & 2 deletions setup.cfg
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[metadata]
name = poli
version = "1.0.0.dev12"
version = "1.0.0.dev13"
author = Miguel González-Duque
author_email = [email protected]
description = A library of discrete objective functions
Expand All @@ -22,4 +22,4 @@ include_package_data = True
where=src

[options.package_data]
* = *.sht, *.yml, *.jar, *.pt, *.json, *.pdb, *.npz, *.npy, *.h5, *.fa, *.csv
* = *.sht, *.yml, *.jar, *.pt, *.json, *.pdb, *.npz, *.npy, *.h5, *.fa, *.csv, *.txt
2 changes: 1 addition & 1 deletion src/poli/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
"""poli, a library for discrete black-box objective functions."""

__version__ = "1.0.0.dev12"
__version__ = "1.0.0.dev13"
from .core.util.isolation.instancing import instance_function_as_isolated_process

# from .core import get_problems
Expand Down
5 changes: 5 additions & 0 deletions src/poli/core/chemistry/data_packages/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
from .random_molecules_data_package import RandomMoleculesDataPackage

__all__ = [
"RandomMoleculesDataPackage",
]
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
"""
This module contains a data package that is frequently used in
small molecule optimization: sampling random molecules from Zinc250k.
"""

from __future__ import annotations

from pathlib import Path
from typing import Callable, Literal

import numpy as np

from poli.core.data_package import DataPackage
from poli.core.util.chemistry.string_to_molecule import translate_smiles_to_selfies


class RandomMoleculesDataPackage(DataPackage):
    """
    Unsupervised data package with random molecules from Zinc250k.

    We subsampled 5000 SMILES from Zinc250k and stored them in a numpy file
    (``five_thousand_smiles.npz``, located next to this module), and this
    data package samples ``n_molecules`` from this set without replacement.
    No labels are attached: ``supervised_data`` is always ``None``.

    Parameters
    ----------
    string_representation : Literal["SMILES", "SELFIES"]
        The string representation of the molecules. The check is
        case-insensitive; "SELFIES" translates the stored SMILES, any
        other value leaves them as stored.
    n_molecules : int, optional
        The number of molecules to sample from the dataset, by default 10.
        Must be at most 5000.
    seed : int, optional
        The seed for the random number generator, by default None.
        If provided, we seed numpy's *global* random number generator
        with this seed.
    tokenize_with : Callable[[str], list[str]], optional
        A function that tokenizes the molecules, by default None.
        If provided, we tokenize the molecules with this function.
    """

    def __init__(
        self,
        string_representation: Literal["SMILES", "SELFIES"],
        n_molecules: int = 10,
        seed: int | None = None,
        tokenize_with: Callable[[str], list[str]] | None = None,
    ):
        assert (
            n_molecules <= 5000
        ), "This data package has been implemented for up to 5000 random molecules."
        CHEMISTRY_DATA_PACKAGES_DIR = Path(__file__).parent

        # The NpzFile keeps the archive open until closed, so read the array
        # inside a context manager instead of leaking the file handle.
        # NOTE(review): allow_pickle=True is only acceptable because this
        # archive ships with the package; never point this at untrusted data.
        with np.load(
            CHEMISTRY_DATA_PACKAGES_DIR / "five_thousand_smiles.npz",
            allow_pickle=True,
        ) as archive:
            five_thousand_molecules = archive["x"]

        if string_representation.upper() == "SELFIES":
            five_thousand_molecules_ = translate_smiles_to_selfies(
                five_thousand_molecules,
                strict=True,
            )
            five_thousand_molecules = np.array(five_thousand_molecules_)

        # Seeding the global generator (rather than a local one) is kept on
        # purpose: it preserves the samples callers get for a given seed.
        if seed is not None:
            np.random.seed(seed)

        unsupervised_data = np.random.choice(
            five_thousand_molecules, (n_molecules,), replace=False
        )
        supervised_data = None

        if tokenize_with is not None:
            unsupervised_data = np.array(
                [tokenize_with(mol) for mol in unsupervised_data if mol is not None]
            )

        super().__init__(unsupervised_data, supervised_data)
1 change: 1 addition & 0 deletions src/poli/core/chemistry/tdc_black_box.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,7 @@ def __init__(
self.oracle_name = oracle_name
self.alphabet = alphabet
self.max_sequence_length = max_sequence_length
self.string_representation = string_representation

from_smiles = string_representation.upper() == "SMILES"
self.inner_function = get_inner_function(
Expand Down
18 changes: 18 additions & 0 deletions src/poli/core/chemistry/tdc_problem.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
from poli.core.chemistry.data_packages import RandomMoleculesDataPackage
from poli.core.chemistry.tdc_black_box import TDCBlackBox
from poli.core.problem import Problem


class TDCProblem(Problem):
    """
    A problem built around a TDC black box.

    When no data package is given, a ``RandomMoleculesDataPackage`` is
    created using the black box's ``string_representation``.
    """

    def __init__(
        self, black_box: TDCBlackBox, x0, data_package=None, strict_validation=True
    ):
        # Only build the default package when the caller did not supply one.
        package = (
            data_package
            if data_package is not None
            else RandomMoleculesDataPackage(black_box.string_representation)
        )

        super().__init__(
            black_box=black_box,
            x0=x0,
            data_package=package,
            strict_validation=strict_validation,
        )
30 changes: 30 additions & 0 deletions src/poli/core/data_package.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
"""
Implements a data package, containing unsupervised and
supervised data.
"""

from __future__ import annotations

import numpy as np


class DataPackage:
def __init__(
self,
unsupervised_data: np.ndarray | None,
supervised_data: tuple[np.ndarray, np.ndarray] | None,
):
self.unsupervised_data = unsupervised_data
self.supervised_data = supervised_data

def __repr__(self) -> str:
unsupervised_data = (
self.unsupervised_data.shape if self.unsupervised_data is not None else None
)
supervised_data = (
self.supervised_data[0].shape if self.supervised_data is not None else None
)
return f"DataPackage(unsupervised_data={unsupervised_data}, supervised_data={supervised_data})"

def __str__(self) -> str:
return self.__repr__()
9 changes: 7 additions & 2 deletions src/poli/core/problem.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,12 @@
allowed.
"""

from __future__ import annotations

import numpy as np

from poli.core.abstract_black_box import AbstractBlackBox
from poli.core.data_package import DataPackage
from poli.core.util.algorithm_observer_wrapper import AlgorithmObserverWrapper
from poli.core.util.default_observer import DefaultObserver

Expand All @@ -27,10 +30,12 @@ def __init__(
self,
black_box: AbstractBlackBox,
x0: np.ndarray,
data_package: DataPackage | None = None,
strict_validation: bool = True,
):
self.black_box: AbstractBlackBox = black_box
self.x0: np.ndarray = x0
self.data_package: DataPackage | None = data_package
self.black_box_information = black_box.info
if strict_validation:
self._validate()
Expand Down Expand Up @@ -64,7 +69,7 @@ def info(self):
return self.black_box.info

def __str__(self):
return f"<Problem: {self.black_box.info.get_problem_name()} x0: {self.x0}>"
return f"<Problem: {self.black_box.info.get_problem_name()} x0: {self.x0} data_package: {self.data_package}>"

def __repr__(self):
return f"<Problem: {self.black_box.info.get_problem_name()} x0: {self.x0} discrete={self.is_discrete()} deterministic={self.is_deterministic()}>"
return f"<Problem: {self.black_box.info.get_problem_name()} x0: {self.x0} data_package: {self.data_package} discrete={self.is_discrete()} deterministic={self.is_deterministic()}>"
7 changes: 7 additions & 0 deletions src/poli/core/proteins/data_packages/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
from .foldx_supervised_data_packages import RFPFoldXStabilitySupervisedDataPackage
from .rasp_supervised_data_packages import RFPRaspSupervisedDataPackage

__all__ = [
"RFPRaspSupervisedDataPackage",
"RFPFoldXStabilitySupervisedDataPackage",
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
from pathlib import Path

import numpy as np

from poli.core.data_package import DataPackage


class RFPFoldXStabilitySupervisedDataPackage(DataPackage):
    """
    Supervised data package pairing RFP sequences with FoldX scores.

    Sequences are read from ``rfp_sequences.txt`` and scores from
    ``rfp_foldx_scores.txt``, both located next to this module
    (presumably FoldX stability scores, going by the class and file
    names). Each sequence is split into single characters and right-padded
    with the empty string up to the length of the longest sequence. The
    resulting array is used both as the unsupervised data and as the inputs
    of the supervised pair; the scores are reshaped into a column.
    """

    def __init__(self):
        PROTEIN_DATA_PACKAGES_DIR = Path(__file__).parent
        sequences = np.loadtxt(
            PROTEIN_DATA_PACKAGES_DIR / "rfp_sequences.txt", dtype=str
        )
        foldx_scores = np.loadtxt(PROTEIN_DATA_PACKAGES_DIR / "rfp_foldx_scores.txt")
        padding_token = ""
        max_sequence_length = max(len(sequence) for sequence in sequences)
        # Pad every tokenized sequence to a common length so the rows stack
        # into a rectangular array.
        unsupervised_data = np.array(
            [
                list(sequence) + [padding_token] * (max_sequence_length - len(sequence))
                for sequence in sequences
            ]
        )
        supervised_data = unsupervised_data, foldx_scores.reshape(-1, 1)

        super().__init__(unsupervised_data, supervised_data)
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
from pathlib import Path

import numpy as np

from poli.core.data_package import DataPackage


class RFPRaspSupervisedDataPackage(DataPackage):
    """
    Supervised data package pairing RFP sequences with RaSP scores.

    Reads ``rfp_sequences.txt`` and ``rfp_rasp_scores.txt`` from this
    module's directory, splits each sequence into characters, right-pads
    with the empty string to the longest sequence, and stores the scores
    as a column next to the padded inputs.
    """

    def __init__(self):
        data_dir = Path(__file__).parent
        raw_sequences = np.loadtxt(data_dir / "rfp_sequences.txt", dtype=str)
        scores = np.loadtxt(data_dir / "rfp_rasp_scores.txt")

        pad = ""
        longest = max(map(len, raw_sequences))

        # Tokenize each sequence and pad it out to the common length.
        padded_rows = []
        for raw_sequence in raw_sequences:
            tokens = list(raw_sequence)
            tokens.extend([pad] * (longest - len(raw_sequence)))
            padded_rows.append(tokens)

        inputs = np.array(padded_rows)
        labels = scores.reshape(-1, 1)

        super().__init__(inputs, (inputs, labels))
Loading

0 comments on commit 4a9f06e

Please sign in to comment.