From bd3fcf982c5be95e363da3bbdaf83324b125910d Mon Sep 17 00:00:00 2001 From: mcneela Date: Fri, 1 Mar 2024 16:07:08 -0500 Subject: [PATCH 01/47] started splitting datasets into 'interaction' and 'potential' --- src/openqdc/datasets/interaction/des370k.py | 122 ++++++++++++++++++ .../datasets/{ => potential}/__init__.py | 0 src/openqdc/datasets/{ => potential}/ani.py | 0 src/openqdc/datasets/{ => potential}/base.py | 0 src/openqdc/datasets/{ => potential}/comp6.py | 0 src/openqdc/datasets/{ => potential}/dess.py | 0 src/openqdc/datasets/{ => potential}/dummy.py | 0 src/openqdc/datasets/{ => potential}/gdml.py | 0 src/openqdc/datasets/{ => potential}/geom.py | 0 .../datasets/{ => potential}/iso_17.py | 0 .../datasets/{ => potential}/molecule3d.py | 0 .../datasets/{ => potential}/nabladft.py | 0 .../datasets/{ => potential}/orbnet_denali.py | 0 src/openqdc/datasets/{ => potential}/pcqm.py | 0 src/openqdc/datasets/{ => potential}/qm7x.py | 0 src/openqdc/datasets/{ => potential}/qmugs.py | 0 .../datasets/{ => potential}/sn2_rxn.py | 0 .../{ => potential}/solvated_peptides.py | 0 src/openqdc/datasets/{ => potential}/spice.py | 0 src/openqdc/datasets/{ => potential}/tmqm.py | 0 .../datasets/{ => potential}/transition1x.py | 0 .../{ => potential}/waterclusters3_30.py | 0 22 files changed, 122 insertions(+) create mode 100644 src/openqdc/datasets/interaction/des370k.py rename src/openqdc/datasets/{ => potential}/__init__.py (100%) rename src/openqdc/datasets/{ => potential}/ani.py (100%) rename src/openqdc/datasets/{ => potential}/base.py (100%) rename src/openqdc/datasets/{ => potential}/comp6.py (100%) rename src/openqdc/datasets/{ => potential}/dess.py (100%) rename src/openqdc/datasets/{ => potential}/dummy.py (100%) rename src/openqdc/datasets/{ => potential}/gdml.py (100%) rename src/openqdc/datasets/{ => potential}/geom.py (100%) rename src/openqdc/datasets/{ => potential}/iso_17.py (100%) rename src/openqdc/datasets/{ => potential}/molecule3d.py (100%) rename src/openqdc/datasets/{ => potential}/nabladft.py (100%) rename src/openqdc/datasets/{ => potential}/orbnet_denali.py (100%) rename src/openqdc/datasets/{ => potential}/pcqm.py (100%) rename src/openqdc/datasets/{ => potential}/qm7x.py (100%) rename src/openqdc/datasets/{ => potential}/qmugs.py (100%) rename src/openqdc/datasets/{ => potential}/sn2_rxn.py (100%) rename src/openqdc/datasets/{ => potential}/solvated_peptides.py (100%) rename src/openqdc/datasets/{ => potential}/spice.py (100%) rename src/openqdc/datasets/{ => potential}/tmqm.py (100%) rename src/openqdc/datasets/{ => potential}/transition1x.py (100%) rename src/openqdc/datasets/{ => potential}/waterclusters3_30.py (100%) diff --git a/src/openqdc/datasets/interaction/des370k.py b/src/openqdc/datasets/interaction/des370k.py new file mode 100644 index 0000000..e9b6a21 --- /dev/null +++ b/src/openqdc/datasets/interaction/des370k.py @@ -0,0 +1,122 @@ +import torch +import pandas as pd + +from typing import Dict, List +from collections import defaultdict, Counter +from torch.utils.data import Dataset + +class Dimer: + def __init__( + self, + smiles_0: str, + smiles_1: str, + charge_0: int, + charge_1: int, + n_atoms_0: int, + n_atoms_1: int, + pos: torch.Tensor, + sapt_energies: List[float], + ) -> None: + self.smiles_0 = smiles_0 + self.smiles_1 = smiles_1 + self.charge_1 = charge_0 + self.charge_1 = charge_1 + self.n_atoms_0 = n_atoms_0 + self.n_atoms_1 = n_atoms_1 + self.pos = pos + self.sapt_energies = sapt_energies + ( + self.sapt_es, + self.sapt_ex, + self.sapt_exs2, + self.sapt_ind, + self.sapt_exind, + self.sapt_disp, + self.sapt_exdisp_os, + self.sapt_exdisp_ss, + self.sapt_delta_HF, + self.sapt_all + ) = tuple(sapt_energies) + + def __str__(self) -> str: + return f"Dimer(smiles_0='{self.smiles_0}', smiles_1='{self.smiles_1}')" + + def __repr__(self) -> str: + return str(self) + + +class DES370K(Dataset): + def __init__(self, filepath="data/des370k.csv") -> None: + self.df = pd.read_csv(filepath) + self._atom_types = defaultdict(int) + self.data = [] + self._preprocess() + + def _preprocess(self) -> None: + for idx, row in self.df.iterrows(): + smiles0, smiles1 = row["smiles0"], row["smiles1"] + charge0, charge1 = row["charge0"], row["charge1"] + natoms0, natoms1 = row["natoms0"], row["natoms1"] + pos = torch.tensor(list(map(float, row["xyz"].split()))).view(-1, 3) + sapt_energies = [row[col] for col in self.df.columns if "sapt" in col] + dimer = Dimer( + smiles0, smiles1, + charge0, charge1, + natoms0, natoms1, + pos, sapt_energies + ) + self.data.append(dimer) + + # get atom types + elems = row["elements"].split() + counts = Counter(set(elems)) + for key in counts: + self._atom_types[key] += counts[key] + + # convert defaultdict to regular dict + self._atom_types = dict(self._atom_types) + + def __str__(self) -> str: + return f"DES370K(n_atoms={self.num_atoms}, n_molecules={self.num_molecules}, atom_types={self.species})" + + def __repr__(self) -> str: + return str(self) + + @property + def atom_types(self) -> Dict[str, int]: + """ + Returns a dictionary of + (element, count) pairs. + """ + return self._atom_types + + @property + def num_molecules(self) -> int: + return self.df.shape[0] + + @property + def num_atoms(self) -> int: + """ + Returns the total number of atoms in + the dataset. + """ + if not hasattr(self, "_num_atoms"): + self._num_atoms = sum(self.atom_types.values()) + return self._num_atoms + + @property + def species(self) -> List[str]: + """ + Returns a list of the unique atom + species contained in the dataset. + """ + if not hasattr(self, "_species"): + self._species = list(self.atom_types.keys()) + return self._species + + def atom_count(self, element: str) -> int: + """ + Returns the count of a given + element in the dataset. + """ + return self.atom_types[element] diff --git a/src/openqdc/datasets/__init__.py b/src/openqdc/datasets/potential/__init__.py similarity index 100% rename from src/openqdc/datasets/__init__.py rename to src/openqdc/datasets/potential/__init__.py diff --git a/src/openqdc/datasets/ani.py b/src/openqdc/datasets/potential/ani.py similarity index 100% rename from src/openqdc/datasets/ani.py rename to src/openqdc/datasets/potential/ani.py diff --git a/src/openqdc/datasets/base.py b/src/openqdc/datasets/potential/base.py similarity index 100% rename from src/openqdc/datasets/base.py rename to src/openqdc/datasets/potential/base.py diff --git a/src/openqdc/datasets/comp6.py b/src/openqdc/datasets/potential/comp6.py similarity index 100% rename from src/openqdc/datasets/comp6.py rename to src/openqdc/datasets/potential/comp6.py diff --git a/src/openqdc/datasets/dess.py b/src/openqdc/datasets/potential/dess.py similarity index 100% rename from src/openqdc/datasets/dess.py rename to src/openqdc/datasets/potential/dess.py diff --git a/src/openqdc/datasets/dummy.py b/src/openqdc/datasets/potential/dummy.py similarity index 100% rename from src/openqdc/datasets/dummy.py rename to src/openqdc/datasets/potential/dummy.py diff --git a/src/openqdc/datasets/gdml.py b/src/openqdc/datasets/potential/gdml.py similarity index 100% rename from src/openqdc/datasets/gdml.py rename to src/openqdc/datasets/potential/gdml.py diff --git a/src/openqdc/datasets/geom.py b/src/openqdc/datasets/potential/geom.py similarity index 100% rename from src/openqdc/datasets/geom.py rename to src/openqdc/datasets/potential/geom.py diff --git a/src/openqdc/datasets/iso_17.py b/src/openqdc/datasets/potential/iso_17.py similarity index 100% rename from src/openqdc/datasets/iso_17.py rename to src/openqdc/datasets/potential/iso_17.py diff --git a/src/openqdc/datasets/molecule3d.py b/src/openqdc/datasets/potential/molecule3d.py similarity index 100% rename from src/openqdc/datasets/molecule3d.py rename to src/openqdc/datasets/potential/molecule3d.py diff --git a/src/openqdc/datasets/nabladft.py b/src/openqdc/datasets/potential/nabladft.py similarity index 100% rename from src/openqdc/datasets/nabladft.py rename to src/openqdc/datasets/potential/nabladft.py diff --git a/src/openqdc/datasets/orbnet_denali.py b/src/openqdc/datasets/potential/orbnet_denali.py similarity index 100% rename from src/openqdc/datasets/orbnet_denali.py rename to src/openqdc/datasets/potential/orbnet_denali.py diff --git a/src/openqdc/datasets/pcqm.py b/src/openqdc/datasets/potential/pcqm.py similarity index 100% rename from src/openqdc/datasets/pcqm.py rename to src/openqdc/datasets/potential/pcqm.py diff --git a/src/openqdc/datasets/qm7x.py b/src/openqdc/datasets/potential/qm7x.py similarity index 100% rename from src/openqdc/datasets/qm7x.py rename to src/openqdc/datasets/potential/qm7x.py diff --git a/src/openqdc/datasets/qmugs.py b/src/openqdc/datasets/potential/qmugs.py similarity index 100% rename from src/openqdc/datasets/qmugs.py rename to src/openqdc/datasets/potential/qmugs.py diff --git a/src/openqdc/datasets/sn2_rxn.py b/src/openqdc/datasets/potential/sn2_rxn.py similarity index 100% rename from src/openqdc/datasets/sn2_rxn.py rename to src/openqdc/datasets/potential/sn2_rxn.py diff --git a/src/openqdc/datasets/solvated_peptides.py b/src/openqdc/datasets/potential/solvated_peptides.py similarity index 100% rename from src/openqdc/datasets/solvated_peptides.py rename to src/openqdc/datasets/potential/solvated_peptides.py diff --git a/src/openqdc/datasets/spice.py b/src/openqdc/datasets/potential/spice.py similarity index 100% rename from src/openqdc/datasets/spice.py rename to src/openqdc/datasets/potential/spice.py diff --git a/src/openqdc/datasets/tmqm.py b/src/openqdc/datasets/potential/tmqm.py similarity index 100% rename from src/openqdc/datasets/tmqm.py rename to src/openqdc/datasets/potential/tmqm.py diff --git a/src/openqdc/datasets/transition1x.py b/src/openqdc/datasets/potential/transition1x.py similarity index 100% rename from src/openqdc/datasets/transition1x.py rename to src/openqdc/datasets/potential/transition1x.py diff --git a/src/openqdc/datasets/waterclusters3_30.py b/src/openqdc/datasets/potential/waterclusters3_30.py similarity index 100% rename from src/openqdc/datasets/waterclusters3_30.py rename to src/openqdc/datasets/potential/waterclusters3_30.py From a800ea5836027297d59f10b981dc73d28d3d1df0 Mon Sep 17 00:00:00 2001 From: mcneela Date: Fri, 1 Mar 2024 16:24:09 -0500 Subject: [PATCH 02/47] add num_unique_molecules property --- src/openqdc/datasets/interaction/des370k.py | 25 ++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/src/openqdc/datasets/interaction/des370k.py b/src/openqdc/datasets/interaction/des370k.py index e9b6a21..2884439 100644 --- a/src/openqdc/datasets/interaction/des370k.py +++ b/src/openqdc/datasets/interaction/des370k.py @@ -49,6 +49,7 @@ class DES370K(Dataset): def __init__(self, filepath="data/des370k.csv") -> None: self.df = pd.read_csv(filepath) self._atom_types = defaultdict(int) + self.smiles = set() self.data = [] self._preprocess() @@ -67,6 +68,10 @@ def _preprocess(self) -> None: ) self.data.append(dimer) + # keep track of unique smiles strings + self.smiles.add(smiles0) + self.smiles.add(smiles1) + # get atom types elems = row["elements"].split() counts = Counter(set(elems)) @@ -77,7 +82,9 @@ def _preprocess(self) -> None: self._atom_types = dict(self._atom_types) def __str__(self) -> str: - return f"DES370K(n_atoms={self.num_atoms}, n_molecules={self.num_molecules}, atom_types={self.species})" + return f"DES370K(n_atoms={self.num_atoms},\ + n_molecules={self.num_molecules},\ + atom_types={self.species})" def __repr__(self) -> str: return str(self) @@ -91,8 +98,20 @@ def atom_types(self) -> Dict[str, int]: return self._atom_types @property - def num_molecules(self) -> int: - return self.df.shape[0] + def num_dimers(self) -> int: + """ + Returns the number of + dimers in the dataset. + """ + return len(self.data) + + @property + def num_unique_molecules(self) -> int: + """ + Returns the number of unique + molecules in the dataset. + """ + return len(self.smiles) @property def num_atoms(self) -> int: From 9d6fca6ff6f514a0f7d5bcad4ecec3188c448336 Mon Sep 17 00:00:00 2001 From: mcneela Date: Fri, 1 Mar 2024 16:26:58 -0500 Subject: [PATCH 03/47] added logging --- src/openqdc/datasets/interaction/des370k.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/openqdc/datasets/interaction/des370k.py b/src/openqdc/datasets/interaction/des370k.py index 2884439..127ec89 100644 --- a/src/openqdc/datasets/interaction/des370k.py +++ b/src/openqdc/datasets/interaction/des370k.py @@ -3,6 +3,8 @@ from typing import Dict, List from collections import defaultdict, Counter + +from loguru import logger from torch.utils.data import Dataset class Dimer: @@ -47,6 +49,7 @@ def __repr__(self) -> str: class DES370K(Dataset): def __init__(self, filepath="data/des370k.csv") -> None: + self.filepath = filepath self.df = pd.read_csv(filepath) self._atom_types = defaultdict(int) self.smiles = set() @@ -54,6 +57,7 @@ def __init__(self, filepath="data/des370k.csv") -> None: self._preprocess() def _preprocess(self) -> None: + logger.info(f"Reading data from {self.filepath}") for idx, row in self.df.iterrows(): smiles0, smiles1 = row["smiles0"], row["smiles1"] charge0, charge1 = row["charge0"], row["charge1"] From 794e63fc1faff3f8d1e03d485799688352bc43db Mon Sep 17 00:00:00 2001 From: mcneela Date: Fri, 1 Mar 2024 16:31:20 -0500 Subject: [PATCH 04/47] started base interaction dataset --- src/openqdc/datasets/interaction/base.py | 553 +++++++++++++++++++++++ 1 file changed, 553 insertions(+) create mode 100644 src/openqdc/datasets/interaction/base.py diff --git a/src/openqdc/datasets/interaction/base.py b/src/openqdc/datasets/interaction/base.py new file mode 100644 index 0000000..de43887 --- /dev/null +++ b/src/openqdc/datasets/interaction/base.py @@ -0,0 +1,553 @@ +from typing import Dict, List, Optional, Union +from openqdc.utils.io import ( + copy_exists, + dict_to_atoms, + get_local_cache, + load_hdf5_file, + load_pkl, + pull_locally, + push_remote, + set_cache_dir, +) + +from loguru import logger + +class BaseInteractionDataset: + __energy_methods__ = [] + __force_methods__ = [] + energy_target_names = [] + force_target_names = [] + __isolated_atom_energies__ = [] + + __energy_unit__ = "hartree" + __distance_unit__ = "ang" + __forces_unit__ = "hartree/ang" + __fn_energy__ = lambda x: x + __fn_distance__ = lambda x: x + __fn_forces__ = lambda x: x + __average_nb_atoms__ = None + __stats__ = {} + + def __init__( + self, + energy_unit: Optional[str] = None, + distance_unit: Optional[str] = None, + overwrite_local_cache: bool = False, + cache_dir: Optional[str] = None, + ) -> None: + set_cache_dir(cache_dir) + self.data = None + if not self.is_preprocessed(): + raise DatasetNotAvailableError(self.__name__) + else: + self.read_preprocess(overwrite_local_cache=overwrite_local_cache) + self._post_init(overwrite_local_cache, energy_unit, distance_unit) + + def _post_init( + self, + overwrite_local_cache: bool = False, + energy_unit: Optional[str] = None, + distance_unit: Optional[str] = None, + ) -> None: + self._set_units(None, None) + self._set_isolated_atom_energies() + self._precompute_statistics(overwrite_local_cache=overwrite_local_cache) + self._set_units(energy_unit, distance_unit) + self._convert_data() + self._set_isolated_atom_energies() + + def _convert_data(self): + logger.info( + f"Converting {self.__name__} data to the following units:\n\ + Energy: {self.energy_unit},\n\ + Distance: {self.distance_unit},\n\ + Forces: {self.force_unit if self.__force_methods__ else 'None'}" + ) + for key in self.data_keys: + self.data[key] = self._convert_on_loading(self.data[key], key) + + def _precompute_statistics(self, overwrite_local_cache: bool = False): + local_path = p_join(self.preprocess_path, "stats.pkl") + if self.is_preprocessed_statistics() and not overwrite_local_cache: + stats = load_pkl(local_path) + logger.info("Loaded precomputed statistics") + else: + logger.info("Precomputing relevant statistics") + ( + inter_E_mean, + inter_E_std, + formation_E_mean, + formation_E_std, + total_E_mean, + total_E_std, + ) = self._precompute_E() + forces_dict = self._precompute_F() + stats = { + "formation": {"energy": {"mean": formation_E_mean, "std": formation_E_std}, "forces": forces_dict}, + "inter": {"energy": {"mean": inter_E_mean, "std": inter_E_std}, "forces": forces_dict}, + "total": {"energy": {"mean": total_E_mean, "std": total_E_std}, "forces": forces_dict}, + } + with open(local_path, "wb") as f: + pkl.dump(stats, f) + self._compute_average_nb_atoms() + self.__stats__ = stats + + def _compute_average_nb_atoms(self): + self.__average_nb_atoms__ = np.mean(self.data["n_atoms"]) + + def _precompute_E(self): + splits_idx = self.data["position_idx_range"][:, 1] + s = np.array(self.data["atomic_inputs"][:, :2], dtype=int) + s[:, 1] += IsolatedAtomEnergyFactory.max_charge + matrixs = [matrix[s[:, 0], s[:, 1]] for matrix in self.__isolated_atom_energies__] + converted_energy_data = self.data["energies"] + # calculation per molecule formation energy statistics + E = [] + for i, matrix in enumerate(matrixs): + c = np.cumsum(np.append([0], matrix))[splits_idx] + c[1:] = c[1:] - c[:-1] + E.append(converted_energy_data[:, i] - c) + E = np.array(E).T + inter_E_mean = np.nanmean(E / self.data["n_atoms"][:, None], axis=0) + inter_E_std = np.nanstd(E / self.data["n_atoms"][:, None], axis=0) + formation_E_mean = np.nanmean(E, axis=0) + formation_E_std = np.nanstd(E, axis=0) + total_E_mean = np.nanmean(converted_energy_data, axis=0) + total_E_std = np.nanstd(converted_energy_data, axis=0) + + return ( + np.atleast_2d(inter_E_mean), + np.atleast_2d(inter_E_std), + np.atleast_2d(formation_E_mean), + np.atleast_2d(formation_E_std), + np.atleast_2d(total_E_mean), + np.atleast_2d(total_E_std), + ) + + def _precompute_F(self): + if len(self.__force_methods__) == 0: + return NOT_DEFINED + converted_force_data = self.convert_forces(self.data["forces"]) + force_mean = np.nanmean(converted_force_data, axis=0) + force_std = np.nanstd(converted_force_data, axis=0) + force_rms = np.sqrt(np.nanmean(converted_force_data**2, axis=0)) + return { + "mean": np.atleast_2d(force_mean.mean(axis=0)), + "std": np.atleast_2d(force_std.mean(axis=0)), + "components": {"rms": force_rms, "std": force_std, "mean": force_mean}, + } + + @property + def numbers(self): + if hasattr(self, "_numbers"): + return self._numbers + self._numbers = pd.unique(self.data["atomic_inputs"][..., 0]).astype(np.int32) + return self._numbers + + @property + def chemical_species(self): + return np.array(chemical_symbols)[self.numbers] + + @property + def energy_unit(self): + return self.__energy_unit__ + + @property + def distance_unit(self): + return self.__distance_unit__ + + @property + def force_unit(self): + return self.__forces_unit__ + + @property + def root(self): + return p_join(get_local_cache(), self.__name__) + + @property + def preprocess_path(self): + path = p_join(self.root, "preprocessed") + os.makedirs(path, exist_ok=True) + return path + + @property + def data_keys(self): + keys = list(self.data_types.keys()) + if len(self.__force_methods__) == 0: + keys.remove("forces") + return keys + + @property + def data_types(self): + return { + "atomic_inputs": np.float32, + "position_idx_range": np.int32, + "energies": np.float32, + "forces": np.float32, + } + + @property + def data_shapes(self): + return { + "atomic_inputs": (-1, NB_ATOMIC_FEATURES), + "position_idx_range": (-1, 2), + "energies": (-1, len(self.energy_target_names)), + "forces": (-1, 3, len(self.force_target_names)), + } + + @property + def atoms_per_molecules(self): + try: + if hasattr(self, "_n_atoms"): + return self._n_atoms + self._n_atoms = self.data["n_atoms"] + return self._n_atoms + except: # noqa + return None + + def _set_units(self, en, ds): + old_en, old_ds = self.energy_unit, self.distance_unit + en = en if en is not None else old_en + ds = ds if ds is not None else old_ds + + # if en is None: + self.set_energy_unit(en) + # if ds is not None: + self.set_distance_unit(ds) + if self.__force_methods__: + self.__forces_unit__ = self.energy_unit + "/" + self.distance_unit + self.__class__.__fn_forces__ = get_conversion(old_en + "/" + old_ds, self.__forces_unit__) + + def _set_isolated_atom_energies(self): + if self.__energy_methods__ is None: + logger.error("No energy methods defined for this dataset.") + f = get_conversion("hartree", self.__energy_unit__) + self.__isolated_atom_energies__ = f( + np.array([IsolatedAtomEnergyFactory.get_matrix(en_method) for en_method in self.__energy_methods__]) + ) + + def convert_energy(self, x): + return self.__class__.__fn_energy__(x) + + def convert_distance(self, x): + return self.__class__.__fn_distance__(x) + + def convert_forces(self, x): + return self.__class__.__fn_forces__(x) + + def set_energy_unit(self, value: str): + """ + Set a new energy unit for the dataset. + """ + old_unit = self.energy_unit + self.__energy_unit__ = value + self.__class__.__fn_energy__ = get_conversion(old_unit, value) + + def set_distance_unit(self, value: str): + """ + Set a new distance unit for the dataset. + """ + old_unit = self.distance_unit + self.__distance_unit__ = value + self.__class__.__fn_distance__ = get_conversion(old_unit, value) + + def read_raw_entries(self): + raise NotImplementedError + + def collate_list(self, list_entries): + # concatenate entries + res = {key: np.concatenate([r[key] for r in list_entries if r is not None], axis=0) for key in list_entries[0]} + + csum = np.cumsum(res.get("n_atoms")) + x = np.zeros((csum.shape[0], 2), dtype=np.int32) + x[1:, 0], x[:, 1] = csum[:-1], csum + res["position_idx_range"] = x + + return res + + def save_preprocess(self, data_dict): + # save memmaps + logger.info("Preprocessing data and saving it to cache.") + for key in self.data_keys: + local_path = p_join(self.preprocess_path, f"{key}.mmap") + out = np.memmap(local_path, mode="w+", dtype=data_dict[key].dtype, shape=data_dict[key].shape) + out[:] = data_dict.pop(key)[:] + out.flush() + push_remote(local_path, overwrite=True) + + # save smiles and subset + local_path = p_join(self.preprocess_path, "props.pkl") + for key in ["name", "subset"]: + data_dict[key] = np.unique(data_dict[key], return_inverse=True) + + with open(local_path, "wb") as f: + pkl.dump(data_dict, f) + push_remote(local_path, overwrite=True) + + def _convert_on_loading(self, x, key): + if key == "energies": + return self.convert_energy(x) + elif key == "forces": + return self.convert_forces(x) + elif key == "atomic_inputs": + x = np.array(x, dtype=np.float32) + x[:, -3:] = self.convert_distance(x[:, -3:]) + return x + else: + return x + + def read_preprocess(self, overwrite_local_cache=False): + logger.info("Reading preprocessed data") + logger.info( + f"{self.__name__} data with the following units:\n\ + Energy: {self.energy_unit},\n\ + Distance: {self.distance_unit},\n\ + Forces: {self.force_unit if self.__force_methods__ else 'None'}" + ) + self.data = {} + for key in self.data_keys: + filename = p_join(self.preprocess_path, f"{key}.mmap") + pull_locally(filename, overwrite=overwrite_local_cache) + self.data[key] = np.memmap(filename, mode="r", dtype=self.data_types[key]).reshape(self.data_shapes[key]) + + filename = p_join(self.preprocess_path, "props.pkl") + pull_locally(filename, overwrite=overwrite_local_cache) + with open(filename, "rb") as f: + tmp = pkl.load(f) + for key in ["name", "subset", "n_atoms"]: + x = tmp.pop(key) + if len(x) == 2: + self.data[key] = x[0][x[1]] + else: + self.data[key] = x + + for key in self.data: + logger.info(f"Loaded {key} with shape {self.data[key].shape}, dtype {self.data[key].dtype}") + + def is_preprocessed(self): + predicats = [copy_exists(p_join(self.preprocess_path, f"{key}.mmap")) for key in self.data_keys] + predicats += [copy_exists(p_join(self.preprocess_path, "props.pkl"))] + return all(predicats) + + def is_preprocessed_statistics(self): + return bool(copy_exists(p_join(self.preprocess_path, "stats.pkl"))) + + def preprocess(self, overwrite=False): + if overwrite or not self.is_preprocessed(): + entries = self.read_raw_entries() + res = self.collate_list(entries) + self.save_preprocess(res) + + def save_xyz(self, idx: int, path: Optional[str] = None, name=None): + """ + Save the entry at index idx as an extxyz file. + """ + if path is None: + path = os.getcwd() + at = self.get_ase_atoms(idx, ext=True) + if name is not None: + name = at.info["name"] + write_extxyz(p_join(path, f"{name}.xyz"), at) + + def get_ase_atoms(self, idx: int, ext=True): + """ + Get the ASE atoms object for the entry at index idx. + + Parameters + ---------- + idx : int + Index of the entry. + ext : bool, optional + Whether to include additional informations + """ + entry = self[idx] + # _ = entry.pop("forces") + at = dict_to_atoms(entry, ext=ext) + return at + + @requires_package("dscribe") + @requires_package("datamol") + def soap_descriptors( + self, + n_samples: Optional[Union[List[int], int]] = None, + return_idxs: bool = True, + progress: bool = True, + **soap_kwargs, + ) -> Dict[str, np.ndarray]: + """ + Compute the SOAP descriptors for the dataset. + + Parameters + ---------- + n_samples : Optional[Union[List[int],int]], optional + Number of samples to use for the computation, by default None. If None, all the dataset is used. + If a list of integers is provided, the descriptors are computed for each of the specified idx of samples. + return_idxs : bool, optional + Whether to return the indices of the samples used, by default True. + progress : bool, optional + Whether to show a progress bar, by default True. + **soap_kwargs : dict + Keyword arguments to pass to the SOAP descriptor. + By defaut, the following values are used: + - r_cut : 5.0 + - n_max : 8 + - l_max : 6 + - average : "inner" + - periodic : False + - compression : {"mode" : "mu1nu1"} + + Returns + ------- + Dict[str, np.ndarray] + Dictionary containing the following keys: + - soap : np.ndarray of shape (N, M) containing the SOAP descriptors for the dataset + - soap_kwargs : dict containing the keyword arguments used for the SOAP descriptor + - idxs : np.ndarray of shape (N,) containing the indices of the samples used + + """ + import datamol as dm + from dscribe.descriptors import SOAP + + if n_samples is None: + idxs = list(range(len(self))) + elif isinstance(n_samples, int): + idxs = np.random.choice(len(self), size=n_samples, replace=False) + else: # list, set, np.ndarray + idxs = n_samples + datum = {} + r_cut = soap_kwargs.pop("r_cut", 5.0) + n_max = soap_kwargs.pop("n_max", 8) + l_max = soap_kwargs.pop("l_max", 6) + average = soap_kwargs.pop("average", "inner") + periodic = soap_kwargs.pop("periodic", False) + compression = soap_kwargs.pop("compression", {"mode": "mu1nu1"}) + soap = SOAP( + species=self.chemical_species, + periodic=periodic, + r_cut=r_cut, + n_max=n_max, + l_max=l_max, + average=average, + compression=compression, + ) + datum["soap_kwargs"] = { + "r_cut": r_cut, + "n_max": n_max, + "l_max": l_max, + "average": average, + "compression": compression, + "species": self.chemical_species, + "periodic": periodic, + **soap_kwargs, + } + + def wrapper(idx): + entry = self.get_ase_atoms(idx, ext=False) + return soap.create(entry, centers=entry.positions) + + descr = dm.parallelized(wrapper, idxs, progress=progress, scheduler="threads", n_jobs=-1) + datum["soap"] = np.vstack(descr) + if return_idxs: + datum["idxs"] = idxs + return datum + + def __len__(self): + return self.data["energies"].shape[0] + + def __smiles_converter__(self, x): + """util function to convert string to smiles: useful if the smiles is + encoded in a different format than its display format + """ + return x + + def __getitem__(self, idx: int): + shift = IsolatedAtomEnergyFactory.max_charge + p_start, p_end = self.data["position_idx_range"][idx] + input = self.data["atomic_inputs"][p_start:p_end] + z, c, positions, energies = ( + np.array(input[:, 0], dtype=np.int32), + np.array(input[:, 1], dtype=np.int32), + np.array(input[:, -3:], dtype=np.float32), + np.array(self.data["energies"][idx], dtype=np.float32), + ) + name = self.__smiles_converter__(self.data["name"][idx]) + subset = self.data["subset"][idx] + + if "forces" in self.data: + forces = np.array(self.data["forces"][p_start:p_end], dtype=np.float32) + else: + forces = None + return Bunch( + positions=positions, + atomic_numbers=z, + charges=c, + e0=self.__isolated_atom_energies__[..., z, c + shift].T, + energies=energies, + name=name, + subset=subset, + forces=forces, + ) + + def __str__(self): + return f"{self.__name__}" + + def __repr__(self): + return f"{self.__name__}" + + @property + def _stats(self): + return self.__stats__ + + @property + def average_n_atoms(self): + """ + Average number of atoms in a molecule in the dataset. + """ + if self.__average_nb_atoms__ is None: + raise StatisticsNotAvailableError(self.__name__) + return self.__average_nb_atoms__ + + def get_statistics(self, normalization: str = "formation", return_none: bool = True): + """ + Get the statistics of the dataset. + normalization : str, optional + Type of energy, by default "formation", must be one of ["formation", "total", "inter"] + return_none : bool, optional + Whether to return None if the statistics for the forces are not available, by default True + Otherwise, the statistics for the forces are set to 0.0 + """ + stats = deepcopy(self._stats) + if len(stats) == 0: + raise StatisticsNotAvailableError(self.__name__) + if normalization not in POSSIBLE_NORMALIZATION: + raise NormalizationNotAvailableError(normalization) + selected_stats = stats[normalization] + if len(self.__force_methods__) == 0 and not return_none: + selected_stats.update( + { + "forces": { + "mean": np.array([0.0]), + "std": np.array([0.0]), + "components": { + "mean": np.array([[0.0], [0.0], [0.0]]), + "std": np.array([[0.0], [0.0], [0.0]]), + "rms": np.array([[0.0], [0.0], [0.0]]), + }, + } + } + ) + # cycle trough dict to convert units + for key in selected_stats: + if key == "forces": + for key2 in selected_stats[key]: + if key2 != "components": + selected_stats[key][key2] = self.convert_forces(selected_stats[key][key2]) + else: + for key2 in selected_stats[key]["components"]: + selected_stats[key]["components"][key2] = self.convert_forces( + selected_stats[key]["components"][key2] + ) + else: + for key2 in selected_stats[key]: + selected_stats[key][key2] = self.convert_energy(selected_stats[key][key2]) + return selected_stats \ No newline at end of file From 0db476578c9e7f4f9ee2c068eb9926c2df62b891 Mon Sep 17 00:00:00 2001 From: mcneela Date: Mon, 4 Mar 2024 10:01:58 -0500 Subject: [PATCH 05/47] add interaction __init__ file and revise potential __init__ file --- src/openqdc/datasets/interaction/__init__.py | 46 ++++++++++++++++++++ src/openqdc/datasets/potential/__init__.py | 44 +++++++++---------- 2 files changed, 68 insertions(+), 22 deletions(-) create mode 100644 src/openqdc/datasets/interaction/__init__.py diff --git a/src/openqdc/datasets/interaction/__init__.py b/src/openqdc/datasets/interaction/__init__.py new file mode 100644 index 0000000..782b972 --- /dev/null +++ b/src/openqdc/datasets/interaction/__init__.py @@ -0,0 +1,46 @@ +import importlib +import os +from typing import TYPE_CHECKING # noqa F401 + +# The below lazy import logic is coming from openff-toolkit: +# https://github.com/openforcefield/openff-toolkit/blob/b52879569a0344878c40248ceb3bd0f90348076a/openff/toolkit/__init__.py#L44 + +# Dictionary of objects to lazily import; maps the object's name to its module path + +_lazy_imports_obj = { + "DES370K": "openqdc.datasets.interaction.des370k", +} + +_lazy_imports_mod = {} + + +def __getattr__(name): + """Lazily import objects from _lazy_imports_obj or _lazy_imports_mod + + Note that this method is only called by Python if the name cannot be found + in the current module.""" + obj_mod = _lazy_imports_obj.get(name) + if obj_mod is not None: + mod = importlib.import_module(obj_mod) + return mod.__dict__[name] + + lazy_mod = _lazy_imports_mod.get(name) + if lazy_mod is not None: + return importlib.import_module(lazy_mod) + + raise AttributeError(f"module {__name__!r} has no attribute {name!r}") + + +def __dir__(): + """Add _lazy_imports_obj and _lazy_imports_mod to dir()""" + keys = (*globals().keys(), *_lazy_imports_obj.keys(), *_lazy_imports_mod.keys()) + return sorted(keys) + + +if TYPE_CHECKING or os.environ.get("OPENQDC_DISABLE_LAZY_LOADING", "0") == "1": + + from .des370k import DES370K + + __all__ = [ + "DES370K" + ] diff --git a/src/openqdc/datasets/potential/__init__.py b/src/openqdc/datasets/potential/__init__.py index d989935..1513c28 100644 --- a/src/openqdc/datasets/potential/__init__.py +++ b/src/openqdc/datasets/potential/__init__.py @@ -8,28 +8,28 @@ # Dictionary of objects to lazily import; maps the object's name to its module path _lazy_imports_obj = { - "ANI1": "openqdc.datasets.ani", - "ANI1CCX": "openqdc.datasets.ani", - "ANI1X": "openqdc.datasets.ani", - "Spice": "openqdc.datasets.spice", - "GEOM": "openqdc.datasets.geom", - "QMugs": "openqdc.datasets.qmugs", - "ISO17": "openqdc.datasets.iso_17", - "COMP6": "openqdc.datasets.comp6", - "GDML": "openqdc.datasets.gdml", - "Molecule3D": "openqdc.datasets.molecule3d", - "OrbnetDenali": "openqdc.datasets.orbnet_denali", - "SN2RXN": "openqdc.datasets.sn2_rxn", - "QM7X": "openqdc.datasets.qm7x", - "DESS": "openqdc.datasets.dess", - "NablaDFT": "openqdc.datasets.nabladft", - "SolvatedPeptides": "openqdc.datasets.solvated_peptides", - "WaterClusters": "openqdc.datasets.waterclusters3_30", - "TMQM": "openqdc.datasets.tmqm", - "Dummy": "openqdc.datasets.dummy", - "PCQM_B3LYP": "openqdc.datasets.pcqm", - "PCQM_PM6": "openqdc.datasets.pcqm", - "Transition1X": "openqdc.datasets.transition1x", + "ANI1": "openqdc.datasets.potential.ani", + "ANI1CCX": "openqdc.datasets.potential.ani", + "ANI1X": "openqdc.datasets.potential.ani", + "Spice": "openqdc.datasets.potential.spice", + "GEOM": "openqdc.datasets.potential.geom", + "QMugs": "openqdc.datasets.potential.qmugs", + "ISO17": "openqdc.datasets.potential.iso_17", + "COMP6": "openqdc.datasets.potential.comp6", + "GDML": "openqdc.datasets.potential.gdml", + "Molecule3D": "openqdc.datasets.potential.molecule3d", + "OrbnetDenali": "openqdc.datasets.potential.orbnet_denali", + "SN2RXN": "openqdc.datasets.potential.sn2_rxn", + "QM7X": "openqdc.datasets.potential.qm7x", + "DESS": "openqdc.datasets.potential.dess", + "NablaDFT": "openqdc.datasets.potential.nabladft", + "SolvatedPeptides": "openqdc.datasets.potential.solvated_peptides", + "WaterClusters": "openqdc.datasets.potential.waterclusters3_30", + "TMQM": "openqdc.datasets.potential.tmqm", + "Dummy": "openqdc.datasets.potential.dummy", + "PCQM_B3LYP": "openqdc.datasets.potential.pcqm", + "PCQM_PM6": "openqdc.datasets.potential.pcqm", + "Transition1X": "openqdc.datasets.potential.transition1x", } _lazy_imports_mod = {} From 6e5a00287c8ea3f7fe1276c27a5cd0cc409e9b5c Mon Sep 17 00:00:00 2001 From: mcneela Date: Mon, 4 Mar 2024 10:02:53 -0500 Subject: [PATCH 06/47] add des370k interaction to config_factory.py --- src/openqdc/raws/config_factory.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/openqdc/raws/config_factory.py b/src/openqdc/raws/config_factory.py index c8dddba..df54307 100644 --- a/src/openqdc/raws/config_factory.py +++ b/src/openqdc/raws/config_factory.py @@ -90,6 +90,13 @@ class DataConfigFactory: }, ) + des370k_interaction = dict( + dataset_name="des370k_interaction", + links={ + "DES370K.zip": "https://zenodo.org/record/5676266/files/DES370K.zip", + } + ) + tmqm = dict( dataset_name="tmqm", links={ From 8e1e0031f08462e5ac254f2bfbe07981758ac893 Mon Sep 17 00:00:00 2001 From: mcneela Date: Mon, 4 Mar 2024 10:18:29 -0500 Subject: [PATCH 07/47] have BaseInteractionDataset inherit BaseDataset --- src/openqdc/datasets/interaction/base.py | 538 +---------------------- 1 file changed, 8 insertions(+), 530 deletions(-) diff --git a/src/openqdc/datasets/interaction/base.py b/src/openqdc/datasets/interaction/base.py index de43887..cb7c812 100644 --- a/src/openqdc/datasets/interaction/base.py +++ b/src/openqdc/datasets/interaction/base.py @@ -9,25 +9,13 @@ push_remote, set_cache_dir, ) +from openqdc.datasets.potential.base import BaseDataset from loguru import logger -class BaseInteractionDataset: - __energy_methods__ = [] - __force_methods__ = [] - energy_target_names = [] - force_target_names = [] - __isolated_atom_energies__ = [] - - __energy_unit__ = "hartree" - __distance_unit__ = "ang" - __forces_unit__ = "hartree/ang" - __fn_energy__ = lambda x: x - __fn_distance__ = lambda x: x - __fn_forces__ = lambda x: x - __average_nb_atoms__ = None - __stats__ = {} +import numpy as np +class BaseInteractionDataset(BaseDataset): def __init__( self, energy_unit: Optional[str] = None, @@ -35,519 +23,9 @@ def __init__( overwrite_local_cache: bool = False, cache_dir: Optional[str] = None, ) -> None: - set_cache_dir(cache_dir) - self.data = None - if not self.is_preprocessed(): - raise DatasetNotAvailableError(self.__name__) - else: - self.read_preprocess(overwrite_local_cache=overwrite_local_cache) - self._post_init(overwrite_local_cache, energy_unit, distance_unit) - - def _post_init( - self, - overwrite_local_cache: bool = False, - energy_unit: Optional[str] = None, - distance_unit: Optional[str] = None, - ) -> None: - self._set_units(None, None) - self._set_isolated_atom_energies() - self._precompute_statistics(overwrite_local_cache=overwrite_local_cache) - self._set_units(energy_unit, distance_unit) - self._convert_data() - self._set_isolated_atom_energies() - - def _convert_data(self): - logger.info( - f"Converting {self.__name__} data to the following units:\n\ - Energy: {self.energy_unit},\n\ - Distance: {self.distance_unit},\n\ - Forces: {self.force_unit if self.__force_methods__ else 'None'}" - ) - for key in self.data_keys: - self.data[key] = self._convert_on_loading(self.data[key], key) - - def _precompute_statistics(self, overwrite_local_cache: bool = False): - local_path = p_join(self.preprocess_path, "stats.pkl") - if self.is_preprocessed_statistics() and not overwrite_local_cache: - stats = load_pkl(local_path) - logger.info("Loaded precomputed statistics") - else: - logger.info("Precomputing relevant statistics") - ( - inter_E_mean, - inter_E_std, - formation_E_mean, - formation_E_std, - total_E_mean, - total_E_std, - ) = self._precompute_E() - forces_dict = self._precompute_F() - stats = { - "formation": {"energy": {"mean": formation_E_mean, "std": formation_E_std}, "forces": forces_dict}, - "inter": {"energy": {"mean": inter_E_mean, "std": inter_E_std}, "forces": forces_dict}, - "total": {"energy": {"mean": total_E_mean, "std": total_E_std}, "forces": forces_dict}, - } - with open(local_path, "wb") as f: - pkl.dump(stats, f) - self._compute_average_nb_atoms() - self.__stats__ = stats - - def _compute_average_nb_atoms(self): - self.__average_nb_atoms__ = np.mean(self.data["n_atoms"]) - - def _precompute_E(self): - splits_idx = self.data["position_idx_range"][:, 1] - s = np.array(self.data["atomic_inputs"][:, :2], dtype=int) - s[:, 1] += IsolatedAtomEnergyFactory.max_charge - matrixs = [matrix[s[:, 0], s[:, 1]] for matrix in self.__isolated_atom_energies__] - converted_energy_data = self.data["energies"] - # calculation per molecule formation energy statistics - E = [] - for i, matrix in enumerate(matrixs): - c = np.cumsum(np.append([0], matrix))[splits_idx] - c[1:] = c[1:] - c[:-1] - E.append(converted_energy_data[:, i] - c) - E = np.array(E).T - inter_E_mean = np.nanmean(E / self.data["n_atoms"][:, None], axis=0) - inter_E_std = np.nanstd(E / self.data["n_atoms"][:, None], axis=0) - formation_E_mean = np.nanmean(E, axis=0) - formation_E_std = np.nanstd(E, axis=0) - total_E_mean = np.nanmean(converted_energy_data, axis=0) - total_E_std = np.nanstd(converted_energy_data, axis=0) - - return ( - np.atleast_2d(inter_E_mean), - np.atleast_2d(inter_E_std), - np.atleast_2d(formation_E_mean), - np.atleast_2d(formation_E_std), - np.atleast_2d(total_E_mean), - np.atleast_2d(total_E_std), - ) - - def _precompute_F(self): - if len(self.__force_methods__) == 0: - return NOT_DEFINED - converted_force_data = self.convert_forces(self.data["forces"]) - force_mean = np.nanmean(converted_force_data, axis=0) - force_std = np.nanstd(converted_force_data, axis=0) - force_rms = np.sqrt(np.nanmean(converted_force_data**2, axis=0)) - return { - "mean": np.atleast_2d(force_mean.mean(axis=0)), - "std": np.atleast_2d(force_std.mean(axis=0)), - "components": {"rms": force_rms, "std": force_std, "mean": force_mean}, - } - - @property - def numbers(self): - if hasattr(self, "_numbers"): - return self._numbers - self._numbers = pd.unique(self.data["atomic_inputs"][..., 0]).astype(np.int32) - return self._numbers - - @property - def chemical_species(self): - return np.array(chemical_symbols)[self.numbers] - - @property - def energy_unit(self): - return self.__energy_unit__ - - @property - def distance_unit(self): - return self.__distance_unit__ - - @property - def force_unit(self): - return self.__forces_unit__ - - @property - def root(self): - return p_join(get_local_cache(), self.__name__) - - @property - def preprocess_path(self): - path = p_join(self.root, "preprocessed") - os.makedirs(path, exist_ok=True) - return path - - @property - def data_keys(self): - keys = list(self.data_types.keys()) - if len(self.__force_methods__) == 0: - keys.remove("forces") - return keys - - @property - def data_types(self): - return { - "atomic_inputs": np.float32, - "position_idx_range": np.int32, - "energies": np.float32, - "forces": np.float32, - } - - @property - def data_shapes(self): - return { - "atomic_inputs": (-1, NB_ATOMIC_FEATURES), - "position_idx_range": (-1, 2), - "energies": (-1, len(self.energy_target_names)), - "forces": (-1, 3, len(self.force_target_names)), - } - - @property - def atoms_per_molecules(self): - try: - if hasattr(self, "_n_atoms"): - return self._n_atoms - self._n_atoms = self.data["n_atoms"] - return self._n_atoms - except: # noqa - return None - - def _set_units(self, en, ds): - old_en, old_ds = self.energy_unit, self.distance_unit - en = en if en is not None else old_en - ds = ds if ds is not None else old_ds - - # if en is None: - self.set_energy_unit(en) - # if ds is not None: - self.set_distance_unit(ds) - if self.__force_methods__: - self.__forces_unit__ = self.energy_unit + "/" + self.distance_unit - self.__class__.__fn_forces__ = get_conversion(old_en + "/" + old_ds, self.__forces_unit__) - - def _set_isolated_atom_energies(self): - if self.__energy_methods__ is None: - logger.error("No energy methods defined for this dataset.") - f = get_conversion("hartree", self.__energy_unit__) - self.__isolated_atom_energies__ = f( - np.array([IsolatedAtomEnergyFactory.get_matrix(en_method) for en_method in self.__energy_methods__]) - ) - - def convert_energy(self, x): - return self.__class__.__fn_energy__(x) - - def convert_distance(self, x): - return self.__class__.__fn_distance__(x) - - def convert_forces(self, x): - return self.__class__.__fn_forces__(x) - - def set_energy_unit(self, value: str): - """ - Set a new energy unit for the dataset. - """ - old_unit = self.energy_unit - self.__energy_unit__ = value - self.__class__.__fn_energy__ = get_conversion(old_unit, value) - - def set_distance_unit(self, value: str): - """ - Set a new distance unit for the dataset. - """ - old_unit = self.distance_unit - self.__distance_unit__ = value - self.__class__.__fn_distance__ = get_conversion(old_unit, value) - - def read_raw_entries(self): - raise NotImplementedError - - def collate_list(self, list_entries): - # concatenate entries - res = {key: np.concatenate([r[key] for r in list_entries if r is not None], axis=0) for key in list_entries[0]} - - csum = np.cumsum(res.get("n_atoms")) - x = np.zeros((csum.shape[0], 2), dtype=np.int32) - x[1:, 0], x[:, 1] = csum[:-1], csum - res["position_idx_range"] = x - - return res - - def save_preprocess(self, data_dict): - # save memmaps - logger.info("Preprocessing data and saving it to cache.") - for key in self.data_keys: - local_path = p_join(self.preprocess_path, f"{key}.mmap") - out = np.memmap(local_path, mode="w+", dtype=data_dict[key].dtype, shape=data_dict[key].shape) - out[:] = data_dict.pop(key)[:] - out.flush() - push_remote(local_path, overwrite=True) - - # save smiles and subset - local_path = p_join(self.preprocess_path, "props.pkl") - for key in ["name", "subset"]: - data_dict[key] = np.unique(data_dict[key], return_inverse=True) - - with open(local_path, "wb") as f: - pkl.dump(data_dict, f) - push_remote(local_path, overwrite=True) - - def _convert_on_loading(self, x, key): - if key == "energies": - return self.convert_energy(x) - elif key == "forces": - return self.convert_forces(x) - elif key == "atomic_inputs": - x = np.array(x, dtype=np.float32) - x[:, -3:] = self.convert_distance(x[:, -3:]) - return x - else: - return x - - def read_preprocess(self, overwrite_local_cache=False): - logger.info("Reading preprocessed data") - logger.info( - f"{self.__name__} data with the following units:\n\ - Energy: {self.energy_unit},\n\ - Distance: {self.distance_unit},\n\ - Forces: {self.force_unit if self.__force_methods__ else 'None'}" - ) - self.data = {} - for key in self.data_keys: - filename = p_join(self.preprocess_path, f"{key}.mmap") - pull_locally(filename, overwrite=overwrite_local_cache) - self.data[key] = np.memmap(filename, mode="r", dtype=self.data_types[key]).reshape(self.data_shapes[key]) - - filename = p_join(self.preprocess_path, "props.pkl") - pull_locally(filename, overwrite=overwrite_local_cache) - with open(filename, "rb") as f: - tmp = pkl.load(f) - for key in ["name", "subset", "n_atoms"]: - x = tmp.pop(key) - if len(x) == 2: - self.data[key] = x[0][x[1]] - else: - self.data[key] = x - - for key in self.data: - logger.info(f"Loaded {key} with shape {self.data[key].shape}, dtype {self.data[key].dtype}") - - def is_preprocessed(self): - predicats = [copy_exists(p_join(self.preprocess_path, f"{key}.mmap")) for key in self.data_keys] - predicats += [copy_exists(p_join(self.preprocess_path, "props.pkl"))] - return all(predicats) - - def is_preprocessed_statistics(self): - return bool(copy_exists(p_join(self.preprocess_path, "stats.pkl"))) - - def preprocess(self, overwrite=False): - if overwrite or not self.is_preprocessed(): - entries = self.read_raw_entries() - res = self.collate_list(entries) - self.save_preprocess(res) - - def save_xyz(self, idx: int, path: Optional[str] = None, name=None): - """ - Save the entry at index idx as an extxyz file. - """ - if path is None: - path = os.getcwd() - at = self.get_ase_atoms(idx, ext=True) - if name is not None: - name = at.info["name"] - write_extxyz(p_join(path, f"{name}.xyz"), at) - - def get_ase_atoms(self, idx: int, ext=True): - """ - Get the ASE atoms object for the entry at index idx. - - Parameters - ---------- - idx : int - Index of the entry. - ext : bool, optional - Whether to include additional informations - """ - entry = self[idx] - # _ = entry.pop("forces") - at = dict_to_atoms(entry, ext=ext) - return at - - @requires_package("dscribe") - @requires_package("datamol") - def soap_descriptors( - self, - n_samples: Optional[Union[List[int], int]] = None, - return_idxs: bool = True, - progress: bool = True, - **soap_kwargs, - ) -> Dict[str, np.ndarray]: - """ - Compute the SOAP descriptors for the dataset. - - Parameters - ---------- - n_samples : Optional[Union[List[int],int]], optional - Number of samples to use for the computation, by default None. If None, all the dataset is used. - If a list of integers is provided, the descriptors are computed for each of the specified idx of samples. - return_idxs : bool, optional - Whether to return the indices of the samples used, by default True. - progress : bool, optional - Whether to show a progress bar, by default True. - **soap_kwargs : dict - Keyword arguments to pass to the SOAP descriptor. - By defaut, the following values are used: - - r_cut : 5.0 - - n_max : 8 - - l_max : 6 - - average : "inner" - - periodic : False - - compression : {"mode" : "mu1nu1"} - - Returns - ------- - Dict[str, np.ndarray] - Dictionary containing the following keys: - - soap : np.ndarray of shape (N, M) containing the SOAP descriptors for the dataset - - soap_kwargs : dict containing the keyword arguments used for the SOAP descriptor - - idxs : np.ndarray of shape (N,) containing the indices of the samples used - - """ - import datamol as dm - from dscribe.descriptors import SOAP - - if n_samples is None: - idxs = list(range(len(self))) - elif isinstance(n_samples, int): - idxs = np.random.choice(len(self), size=n_samples, replace=False) - else: # list, set, np.ndarray - idxs = n_samples - datum = {} - r_cut = soap_kwargs.pop("r_cut", 5.0) - n_max = soap_kwargs.pop("n_max", 8) - l_max = soap_kwargs.pop("l_max", 6) - average = soap_kwargs.pop("average", "inner") - periodic = soap_kwargs.pop("periodic", False) - compression = soap_kwargs.pop("compression", {"mode": "mu1nu1"}) - soap = SOAP( - species=self.chemical_species, - periodic=periodic, - r_cut=r_cut, - n_max=n_max, - l_max=l_max, - average=average, - compression=compression, - ) - datum["soap_kwargs"] = { - "r_cut": r_cut, - "n_max": n_max, - "l_max": l_max, - "average": average, - "compression": compression, - "species": self.chemical_species, - "periodic": periodic, - **soap_kwargs, - } - - def wrapper(idx): - entry = self.get_ase_atoms(idx, ext=False) - return soap.create(entry, centers=entry.positions) - - descr = dm.parallelized(wrapper, idxs, progress=progress, scheduler="threads", n_jobs=-1) - datum["soap"] = np.vstack(descr) - if return_idxs: - datum["idxs"] = idxs - return datum - - def __len__(self): - return self.data["energies"].shape[0] - - def __smiles_converter__(self, x): - """util function to convert string to smiles: useful if the smiles is - encoded in a different format than its display format - """ - return x - - def __getitem__(self, idx: int): - shift = IsolatedAtomEnergyFactory.max_charge - p_start, p_end = self.data["position_idx_range"][idx] - input = self.data["atomic_inputs"][p_start:p_end] - z, c, positions, energies = ( - np.array(input[:, 0], dtype=np.int32), - np.array(input[:, 1], dtype=np.int32), - np.array(input[:, -3:], dtype=np.float32), - np.array(self.data["energies"][idx], dtype=np.float32), + super().__init__( + energy_unit=energy_unit, + distance_unit=distance_unit, + overwrite_local_cache=overwrite_local_cache, + cache_dir=cache_dir ) - name = self.__smiles_converter__(self.data["name"][idx]) - subset = self.data["subset"][idx] - - if "forces" in self.data: - forces = np.array(self.data["forces"][p_start:p_end], dtype=np.float32) - else: - forces = None - return Bunch( - positions=positions, - atomic_numbers=z, - charges=c, - e0=self.__isolated_atom_energies__[..., z, c + shift].T, - energies=energies, - name=name, - subset=subset, - forces=forces, - ) - - def __str__(self): - return f"{self.__name__}" - - def __repr__(self): - return f"{self.__name__}" - - @property - def _stats(self): - return self.__stats__ - - @property - def average_n_atoms(self): - """ - Average number of atoms in a molecule in the dataset. - """ - if self.__average_nb_atoms__ is None: - raise StatisticsNotAvailableError(self.__name__) - return self.__average_nb_atoms__ - - def get_statistics(self, normalization: str = "formation", return_none: bool = True): - """ - Get the statistics of the dataset. - normalization : str, optional - Type of energy, by default "formation", must be one of ["formation", "total", "inter"] - return_none : bool, optional - Whether to return None if the statistics for the forces are not available, by default True - Otherwise, the statistics for the forces are set to 0.0 - """ - stats = deepcopy(self._stats) - if len(stats) == 0: - raise StatisticsNotAvailableError(self.__name__) - if normalization not in POSSIBLE_NORMALIZATION: - raise NormalizationNotAvailableError(normalization) - selected_stats = stats[normalization] - if len(self.__force_methods__) == 0 and not return_none: - selected_stats.update( - { - "forces": { - "mean": np.array([0.0]), - "std": np.array([0.0]), - "components": { - "mean": np.array([[0.0], [0.0], [0.0]]), - "std": np.array([[0.0], [0.0], [0.0]]), - "rms": np.array([[0.0], [0.0], [0.0]]), - }, - } - } - ) - # cycle trough dict to convert units - for key in selected_stats: - if key == "forces": - for key2 in selected_stats[key]: - if key2 != "components": - selected_stats[key][key2] = self.convert_forces(selected_stats[key][key2]) - else: - for key2 in selected_stats[key]["components"]: - selected_stats[key]["components"][key2] = self.convert_forces( - selected_stats[key]["components"][key2] - ) - else: - for key2 in selected_stats[key]: - selected_stats[key][key2] = self.convert_energy(selected_stats[key][key2]) - return selected_stats \ No newline at end of file From d68bae6529c1343adb719f03a83d7fa7e9b4003e Mon Sep 17 00:00:00 2001 From: mcneela Date: Mon, 4 Mar 2024 12:08:48 -0500 Subject: [PATCH 08/47] implemented read_raw_entries for DES370K --- src/openqdc/datasets/interaction/des370k.py | 169 +++++++++----------- 1 file changed, 73 insertions(+), 96 deletions(-) diff --git a/src/openqdc/datasets/interaction/des370k.py b/src/openqdc/datasets/interaction/des370k.py index 127ec89..4c79fcc 100644 --- a/src/openqdc/datasets/interaction/des370k.py +++ b/src/openqdc/datasets/interaction/des370k.py @@ -1,11 +1,12 @@ -import torch +import os +import numpy as np import pandas as pd +from tqdm import tqdm from typing import Dict, List -from collections import defaultdict, Counter from loguru import logger -from torch.utils.data import Dataset +from openqdc.datasets.interaction import BaseInteractionDataset class Dimer: def __init__( @@ -16,7 +17,7 @@ def __init__( charge_1: int, n_atoms_0: int, n_atoms_1: int, - pos: torch.Tensor, + pos: np.array, sapt_energies: List[float], ) -> None: self.smiles_0 = smiles_0 @@ -47,99 +48,75 @@ def __repr__(self) -> str: return str(self) -class DES370K(Dataset): - def __init__(self, filepath="data/des370k.csv") -> None: - self.filepath = filepath - self.df = pd.read_csv(filepath) - self._atom_types = defaultdict(int) - self.smiles = set() - self.data = [] - self._preprocess() - - def _preprocess(self) -> None: +class DES370K(BaseInteractionDataset): + __name__ = "des370k_interaction" + __energy_unit__ = "hartree" + __distance_unit__ = "ang" + __forces_unit__ = "hartree/ang" + __energy_methods__ = [ + "mp2/cc-pvdz", + "mp2/cc-pvqz", + "mp2/cc-pvtz", + "mp2/cbs", + "ccsd(t)/cc-pvdz", + "ccsd(t)/cbs", # cbs + "ccsd(t)/nn", # nn + "sapt0/aug-cc-pwcvxz", + "sapt0/aug-cc-pwcvxz", + "sapt0/aug-cc-pwcvxz", + "sapt0/aug-cc-pwcvxz", + "sapt0/aug-cc-pwcvxz", + "sapt0/aug-cc-pwcvxz", + "sapt0/aug-cc-pwcvxz", + "sapt0/aug-cc-pwcvxz", + "sapt0/aug-cc-pwcvxz", + "sapt0/aug-cc-pwcvxz", + ] + + energy_target_names = [ + "cc_MP2_all", + "qz_MP2_all", + "tz_MP2_all", + "cbs_MP2_all", + "cc_CCSD(T)_all", + "cbs_CCSD(T)_all", + "nn_CCSD(T)_all", + "sapt_all", + "sapt_es", + "sapt_ex", + "sapt_exs2", + "sapt_ind", + "sapt_exind", + "sapt_disp", + "sapt_exdisp_os", + "sapt_exdisp_ss", + "sapt_delta_HF", + ] + + def read_raw_entries(self) -> List[Dict]: + self.filepath = os.path.join(self.root, "DES370K.csv") logger.info(f"Reading data from {self.filepath}") - for idx, row in self.df.iterrows(): + df = pd.read_csv(self.filepath) + data = [] + for idx, row in tqdm(df.iterrows(), total=df.shape[0]): smiles0, smiles1 = row["smiles0"], row["smiles1"] - charge0, charge1 = row["charge0"], row["charge1"] natoms0, natoms1 = row["natoms0"], row["natoms1"] - pos = torch.tensor(list(map(float, row["xyz"].split()))).view(-1, 3) - sapt_energies = [row[col] for col in self.df.columns if "sapt" in col] - dimer = Dimer( - smiles0, smiles1, - charge0, charge1, - natoms0, natoms1, - pos, sapt_energies + pos = np.array(list(map(float, row["xyz"].split()))).reshape(-1, 3) + pos0 = pos[:natoms0] + pos1 = pos[natoms0:] + # sapt_components = {col: row[col] for col in df.columns if "sapt" in col} + item = dict( + mol0=dict( + smiles=smiles0, + atomic_inputs=pos0, + n_atoms=natoms0, + ), + mol1=dict( + smiles=smiles1, + atomic_inputs=pos1, + n_atoms=natoms1, + ), + targets=row[self.energy_target_names].values, ) - self.data.append(dimer) - - # keep track of unique smiles strings - self.smiles.add(smiles0) - self.smiles.add(smiles1) - - # get atom types - elems = row["elements"].split() - counts = Counter(set(elems)) - for key in counts: - self._atom_types[key] += counts[key] - - # convert defaultdict to regular dict - self._atom_types = dict(self._atom_types) - - def __str__(self) -> str: - return f"DES370K(n_atoms={self.num_atoms},\ - n_molecules={self.num_molecules},\ - atom_types={self.species})" - - def __repr__(self) -> str: - return str(self) - - @property - def atom_types(self) -> Dict[str, int]: - """ - Returns a dictionary of - (element, count) pairs. - """ - return self._atom_types - - @property - def num_dimers(self) -> int: - """ - Returns the number of - dimers in the dataset. - """ - return len(self.data) - - @property - def num_unique_molecules(self) -> int: - """ - Returns the number of unique - molecules in the dataset. - """ - return len(self.smiles) - - @property - def num_atoms(self) -> int: - """ - Returns the total number of atoms in - the dataset. - """ - if not hasattr(self, "_num_atoms"): - self._num_atoms = sum(self.atom_types.values()) - return self._num_atoms - - @property - def species(self) -> List[str]: - """ - Returns a list of the unique atom - species contained in the dataset. - """ - if not hasattr(self, "_species"): - self._species = list(self.atom_types.keys()) - return self._species - - def atom_count(self, element: str) -> int: - """ - Returns the count of a given - element in the dataset. - """ - return self.atom_types[element] + data.append(item) + return data From 5e94d6727236dd3dd2737a516dc79b6dafbbdf27 Mon Sep 17 00:00:00 2001 From: mcneela Date: Mon, 4 Mar 2024 14:04:40 -0500 Subject: [PATCH 09/47] finished implementation of DES370K interaction --- src/openqdc/datasets/interaction/__init__.py | 6 +- src/openqdc/datasets/interaction/base.py | 12 ++++ src/openqdc/datasets/interaction/des370k.py | 76 +++++++++----------- src/openqdc/datasets/potential/base.py | 5 ++ src/openqdc/datasets/potential/dess.py | 3 +- src/openqdc/datasets/potential/tmqm.py | 2 +- 6 files changed, 55 insertions(+), 49 deletions(-) diff --git a/src/openqdc/datasets/interaction/__init__.py b/src/openqdc/datasets/interaction/__init__.py index 782b972..62392e3 100644 --- a/src/openqdc/datasets/interaction/__init__.py +++ b/src/openqdc/datasets/interaction/__init__.py @@ -8,6 +8,7 @@ # Dictionary of objects to lazily import; maps the object's name to its module path _lazy_imports_obj = { + "BaseInteractionDataset": "openqdc.datasets.interaction.base", "DES370K": "openqdc.datasets.interaction.des370k", } @@ -38,9 +39,10 @@ def __dir__(): if TYPE_CHECKING or os.environ.get("OPENQDC_DISABLE_LAZY_LOADING", "0") == "1": - + from .base import BaseInteractionDataset from .des370k import DES370K __all__ = [ - "DES370K" + "BaseInteractionDataset", + "DES370K", ] diff --git a/src/openqdc/datasets/interaction/base.py b/src/openqdc/datasets/interaction/base.py index cb7c812..e266658 100644 --- a/src/openqdc/datasets/interaction/base.py +++ b/src/openqdc/datasets/interaction/base.py @@ -29,3 +29,15 @@ def __init__( overwrite_local_cache=overwrite_local_cache, cache_dir=cache_dir ) + + def collate_list(self, list_entries: List[Dict]): + # concatenate entries + res = {key: np.concatenate([r[key] for r in list_entries if r is not None], axis=0) \ + for key in list_entries[0] if not isinstance(list_entries[0][key], dict)} + + csum = np.cumsum(res.get("n_atoms")) + x = np.zeros((csum.shape[0], 2), dtype=np.int32) + x[1:, 0], x[:, 1] = csum[:-1], csum + res["position_idx_range"] = x + + return res diff --git a/src/openqdc/datasets/interaction/des370k.py b/src/openqdc/datasets/interaction/des370k.py index 4c79fcc..da6a9b9 100644 --- a/src/openqdc/datasets/interaction/des370k.py +++ b/src/openqdc/datasets/interaction/des370k.py @@ -2,50 +2,12 @@ import numpy as np import pandas as pd -from tqdm import tqdm from typing import Dict, List +from tqdm import tqdm from loguru import logger from openqdc.datasets.interaction import BaseInteractionDataset - -class Dimer: - def __init__( - self, - smiles_0: str, - smiles_1: str, - charge_0: int, - charge_1: int, - n_atoms_0: int, - n_atoms_1: int, - pos: np.array, - sapt_energies: List[float], - ) -> None: - self.smiles_0 = smiles_0 - self.smiles_1 = smiles_1 - self.charge_1 = charge_0 - self.charge_1 = charge_1 - self.n_atoms_0 = n_atoms_0 - self.n_atoms_1 = n_atoms_1 - self.pos = pos - self.sapt_energies = sapt_energies - ( - self.sapt_es, - self.sapt_ex, - self.sapt_exs2, - self.sapt_ind, - self.sapt_exind, - self.sapt_disp, - self.sapt_exdisp_os, - self.sapt_exdisp_ss, - self.sapt_delta_HF, - self.sapt_all - ) = tuple(sapt_energies) - - def __str__(self) -> str: - return f"Dimer(smiles_0='{self.smiles_0}', smiles_1='{self.smiles_1}')" - - def __repr__(self) -> str: - return str(self) +from openqdc.utils.molecule import atom_table class DES370K(BaseInteractionDataset): @@ -95,28 +57,54 @@ class DES370K(BaseInteractionDataset): def read_raw_entries(self) -> List[Dict]: self.filepath = os.path.join(self.root, "DES370K.csv") - logger.info(f"Reading data from {self.filepath}") + logger.info(f"Reading DES370K interaction data from {self.filepath}") df = pd.read_csv(self.filepath) data = [] for idx, row in tqdm(df.iterrows(), total=df.shape[0]): smiles0, smiles1 = row["smiles0"], row["smiles1"] + charge0, charge1 = row["charge0"], row["charge1"] natoms0, natoms1 = row["natoms0"], row["natoms1"] pos = np.array(list(map(float, row["xyz"].split()))).reshape(-1, 3) pos0 = pos[:natoms0] pos1 = pos[natoms0:] - # sapt_components = {col: row[col] for col in df.columns if "sapt" in col} + + elements = row["elements"].split() + elements0 = np.array(elements[:natoms0]) + elements1 = np.array(elements[natoms0:]) + + atomic_nums = np.expand_dims(np.array([atom_table.GetAtomicNumber(x) for x in elements]), axis=1) + atomic_nums0 = np.array(atomic_nums[:natoms0]) + atomic_nums1 = np.array(atomic_nums[natoms0:]) + + charges = np.expand_dims(np.array([charge0] * natoms0 + [charge1] * natoms1), axis=1) + + atomic_inputs = np.concatenate((atomic_nums, charges, pos), axis=-1, dtype=np.float32) + atomic_inputs0 = atomic_inputs[:natoms0, :] + atomic_inputs1 = atomic_inputs[natoms0:, :] + item = dict( mol0=dict( smiles=smiles0, - atomic_inputs=pos0, + atomic_inputs=atomic_inputs0, n_atoms=natoms0, + charge=charge0, + elements=elements0, + atomic_nums=atomic_nums0, + pos=pos0, ), mol1=dict( smiles=smiles1, - atomic_inputs=pos1, + atomic_inputs=atomic_inputs1, n_atoms=natoms1, + charge=charge1, + elements=elements1, + atomic_nums=atomic_nums1, + pos=pos1, ), targets=row[self.energy_target_names].values, + subset=np.array(["DES370K"]), + n_atoms=np.array([natoms0 + natoms1]), + atomic_inputs=atomic_inputs, ) data.append(item) return data diff --git a/src/openqdc/datasets/potential/base.py b/src/openqdc/datasets/potential/base.py index adf9ae6..447985a 100644 --- a/src/openqdc/datasets/potential/base.py +++ b/src/openqdc/datasets/potential/base.py @@ -125,6 +125,10 @@ def _post_init( self._convert_data() self._set_isolated_atom_energies() + @classmethod + def no_init(cls): + return cls.__new__(cls) + def _convert_data(self): logger.info( f"Converting {self.__name__} data to the following units:\n\ @@ -325,6 +329,7 @@ def read_raw_entries(self): def collate_list(self, list_entries): # concatenate entries + logger.info(f"list entries: {type(list_entries)}") res = {key: np.concatenate([r[key] for r in list_entries if r is not None], axis=0) for key in list_entries[0]} csum = np.cumsum(res.get("n_atoms")) diff --git a/src/openqdc/datasets/potential/dess.py b/src/openqdc/datasets/potential/dess.py index 80b1e1c..fd36e07 100644 --- a/src/openqdc/datasets/potential/dess.py +++ b/src/openqdc/datasets/potential/dess.py @@ -5,7 +5,7 @@ import pandas as pd from tqdm import tqdm -from openqdc.datasets.base import BaseDataset +from openqdc.datasets.potential.base import BaseDataset from openqdc.utils.molecule import get_atomic_number_and_charge @@ -58,7 +58,6 @@ class DESS(BaseDataset): "nn_CCSD(T)_all", "sapt_all", ] - # ['qz_MP2_all', 'tz_MP2_all', 'cbs_MP2_all', 'sapt_all', 'nn_CCSD(T)_all'] partitions = ["DES370K", "DES5M"] diff --git a/src/openqdc/datasets/potential/tmqm.py b/src/openqdc/datasets/potential/tmqm.py index 8952aaa..b613929 100644 --- a/src/openqdc/datasets/potential/tmqm.py +++ b/src/openqdc/datasets/potential/tmqm.py @@ -5,7 +5,7 @@ import pandas as pd from tqdm import tqdm -from openqdc.datasets.base import BaseDataset +from openqdc.datasets.potential.base import BaseDataset from openqdc.utils.molecule import atom_table From 3c9508beb6e6d73e0832a553e9456baed16db5ec Mon Sep 17 00:00:00 2001 From: mcneela Date: Mon, 4 Mar 2024 14:16:43 -0500 Subject: [PATCH 10/47] finished implementation of DES370K interaction --- src/openqdc/datasets/interaction/des370k.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/openqdc/datasets/interaction/des370k.py b/src/openqdc/datasets/interaction/des370k.py index da6a9b9..824da46 100644 --- a/src/openqdc/datasets/interaction/des370k.py +++ b/src/openqdc/datasets/interaction/des370k.py @@ -82,6 +82,10 @@ def read_raw_entries(self) -> List[Dict]: atomic_inputs0 = atomic_inputs[:natoms0, :] atomic_inputs1 = atomic_inputs[natoms0:, :] + energies = np.array(row[self.energy_target_names].values).astype(np.float32)[None, :] + + name = np.array([smiles0 + "." + smiles1]) + item = dict( mol0=dict( smiles=smiles0, @@ -101,10 +105,11 @@ def read_raw_entries(self) -> List[Dict]: atomic_nums=atomic_nums1, pos=pos1, ), - targets=row[self.energy_target_names].values, + energies=energies, subset=np.array(["DES370K"]), - n_atoms=np.array([natoms0 + natoms1]), + n_atoms=np.array([natoms0 + natoms1], dtype=np.int32), atomic_inputs=atomic_inputs, + name=name, ) data.append(item) return data From 768fb2e39ebd56b30bac5b9b68df4dac51015c6d Mon Sep 17 00:00:00 2001 From: mcneela Date: Mon, 4 Mar 2024 14:23:47 -0500 Subject: [PATCH 11/47] update BaseDataset import path --- src/openqdc/datasets/potential/comp6.py | 2 +- src/openqdc/datasets/potential/dummy.py | 2 +- src/openqdc/datasets/potential/gdml.py | 2 +- src/openqdc/datasets/potential/geom.py | 2 +- src/openqdc/datasets/potential/iso_17.py | 2 +- src/openqdc/datasets/potential/molecule3d.py | 2 +- src/openqdc/datasets/potential/nabladft.py | 2 +- src/openqdc/datasets/potential/orbnet_denali.py | 2 +- src/openqdc/datasets/potential/pcqm.py | 2 +- src/openqdc/datasets/potential/qm7x.py | 2 +- src/openqdc/datasets/potential/qmugs.py | 2 +- src/openqdc/datasets/potential/sn2_rxn.py | 2 +- src/openqdc/datasets/potential/solvated_peptides.py | 2 +- src/openqdc/datasets/potential/spice.py | 2 +- src/openqdc/datasets/potential/transition1x.py | 2 +- src/openqdc/datasets/potential/waterclusters3_30.py | 2 +- 16 files changed, 16 insertions(+), 16 deletions(-) diff --git a/src/openqdc/datasets/potential/comp6.py b/src/openqdc/datasets/potential/comp6.py index 7b6890b..ac37094 100644 --- a/src/openqdc/datasets/potential/comp6.py +++ b/src/openqdc/datasets/potential/comp6.py @@ -1,6 +1,6 @@ from os.path import join as p_join -from openqdc.datasets.base import BaseDataset, read_qc_archive_h5 +from openqdc.datasets.potential.base import BaseDataset, read_qc_archive_h5 class COMP6(BaseDataset): diff --git a/src/openqdc/datasets/potential/dummy.py b/src/openqdc/datasets/potential/dummy.py index c87e03d..b4bd2e8 100644 --- a/src/openqdc/datasets/potential/dummy.py +++ b/src/openqdc/datasets/potential/dummy.py @@ -2,7 +2,7 @@ from numpy import array from sklearn.utils import Bunch -from openqdc.datasets.base import BaseDataset +from openqdc.datasets.potential.base import BaseDataset from openqdc.utils.atomization_energies import IsolatedAtomEnergyFactory from openqdc.utils.constants import NOT_DEFINED diff --git a/src/openqdc/datasets/potential/gdml.py b/src/openqdc/datasets/potential/gdml.py index e40b3fa..67f6bde 100644 --- a/src/openqdc/datasets/potential/gdml.py +++ b/src/openqdc/datasets/potential/gdml.py @@ -1,6 +1,6 @@ from os.path import join as p_join -from openqdc.datasets.base import BaseDataset, read_qc_archive_h5 +from openqdc.datasets.potential.base import BaseDataset, read_qc_archive_h5 class GDML(BaseDataset): diff --git a/src/openqdc/datasets/potential/geom.py b/src/openqdc/datasets/potential/geom.py index c016a9f..8b87ead 100644 --- a/src/openqdc/datasets/potential/geom.py +++ b/src/openqdc/datasets/potential/geom.py @@ -4,7 +4,7 @@ import datamol as dm import numpy as np -from openqdc.datasets.base import BaseDataset +from openqdc.datasets.potential.base import BaseDataset from openqdc.utils import load_json, load_pkl from openqdc.utils.molecule import get_atomic_number_and_charge diff --git a/src/openqdc/datasets/potential/iso_17.py b/src/openqdc/datasets/potential/iso_17.py index 4553ec1..1dd5d56 100644 --- a/src/openqdc/datasets/potential/iso_17.py +++ b/src/openqdc/datasets/potential/iso_17.py @@ -1,6 +1,6 @@ from os.path import join as p_join -from openqdc.datasets.base import BaseDataset, read_qc_archive_h5 +from openqdc.datasets.potential.base import BaseDataset, read_qc_archive_h5 class ISO17(BaseDataset): diff --git a/src/openqdc/datasets/potential/molecule3d.py b/src/openqdc/datasets/potential/molecule3d.py index dc47e53..834525a 100644 --- a/src/openqdc/datasets/potential/molecule3d.py +++ b/src/openqdc/datasets/potential/molecule3d.py @@ -8,7 +8,7 @@ from rdkit import Chem from tqdm import tqdm -from openqdc.datasets.base import BaseDataset +from openqdc.datasets.potential.base import BaseDataset from openqdc.utils.molecule import get_atomic_number_and_charge diff --git a/src/openqdc/datasets/potential/nabladft.py b/src/openqdc/datasets/potential/nabladft.py index 0555cdc..703fdc5 100644 --- a/src/openqdc/datasets/potential/nabladft.py +++ b/src/openqdc/datasets/potential/nabladft.py @@ -6,7 +6,7 @@ import numpy as np import pandas as pd -from openqdc.datasets.base import BaseDataset +from openqdc.datasets.potential.base import BaseDataset from openqdc.utils.molecule import z_to_formula from openqdc.utils.package_utils import requires_package diff --git a/src/openqdc/datasets/potential/orbnet_denali.py b/src/openqdc/datasets/potential/orbnet_denali.py index 614e252..14c19ef 100644 --- a/src/openqdc/datasets/potential/orbnet_denali.py +++ b/src/openqdc/datasets/potential/orbnet_denali.py @@ -5,7 +5,7 @@ import numpy as np import pandas as pd -from openqdc.datasets.base import BaseDataset +from openqdc.datasets.potential.base import BaseDataset from openqdc.utils.molecule import atom_table diff --git a/src/openqdc/datasets/potential/pcqm.py b/src/openqdc/datasets/potential/pcqm.py index 543c494..c86d2b6 100644 --- a/src/openqdc/datasets/potential/pcqm.py +++ b/src/openqdc/datasets/potential/pcqm.py @@ -10,7 +10,7 @@ import pandas as pd from loguru import logger -from openqdc.datasets.base import BaseDataset +from openqdc.datasets.potential.base import BaseDataset from openqdc.utils.io import get_local_cache, push_remote diff --git a/src/openqdc/datasets/potential/qm7x.py b/src/openqdc/datasets/potential/qm7x.py index eb8b015..361707f 100644 --- a/src/openqdc/datasets/potential/qm7x.py +++ b/src/openqdc/datasets/potential/qm7x.py @@ -3,7 +3,7 @@ import numpy as np from tqdm import tqdm -from openqdc.datasets.base import BaseDataset +from openqdc.datasets.potential.base import BaseDataset from openqdc.utils.io import load_hdf5_file diff --git a/src/openqdc/datasets/potential/qmugs.py b/src/openqdc/datasets/potential/qmugs.py index c75f8b5..b7e0a86 100644 --- a/src/openqdc/datasets/potential/qmugs.py +++ b/src/openqdc/datasets/potential/qmugs.py @@ -5,7 +5,7 @@ import datamol as dm import numpy as np -from openqdc.datasets.base import BaseDataset +from openqdc.datasets.potential.base import BaseDataset from openqdc.utils.molecule import get_atomic_number_and_charge diff --git a/src/openqdc/datasets/potential/sn2_rxn.py b/src/openqdc/datasets/potential/sn2_rxn.py index abcbd62..f8e35cc 100644 --- a/src/openqdc/datasets/potential/sn2_rxn.py +++ b/src/openqdc/datasets/potential/sn2_rxn.py @@ -1,6 +1,6 @@ from os.path import join as p_join -from openqdc.datasets.base import BaseDataset, read_qc_archive_h5 +from openqdc.datasets.potential.base import BaseDataset, read_qc_archive_h5 class SN2RXN(BaseDataset): diff --git a/src/openqdc/datasets/potential/solvated_peptides.py b/src/openqdc/datasets/potential/solvated_peptides.py index 216ecdd..77d8318 100644 --- a/src/openqdc/datasets/potential/solvated_peptides.py +++ b/src/openqdc/datasets/potential/solvated_peptides.py @@ -1,6 +1,6 @@ from os.path import join as p_join -from openqdc.datasets.base import BaseDataset, read_qc_archive_h5 +from openqdc.datasets.potential.base import BaseDataset, read_qc_archive_h5 class SolvatedPeptides(BaseDataset): diff --git a/src/openqdc/datasets/potential/spice.py b/src/openqdc/datasets/potential/spice.py index 0b90912..d891f6b 100644 --- a/src/openqdc/datasets/potential/spice.py +++ b/src/openqdc/datasets/potential/spice.py @@ -4,7 +4,7 @@ import numpy as np from tqdm import tqdm -from openqdc.datasets.base import BaseDataset +from openqdc.datasets.potential.base import BaseDataset from openqdc.utils import load_hdf5_file from openqdc.utils.molecule import get_atomic_number_and_charge diff --git a/src/openqdc/datasets/potential/transition1x.py b/src/openqdc/datasets/potential/transition1x.py index 0285ec9..c266325 100644 --- a/src/openqdc/datasets/potential/transition1x.py +++ b/src/openqdc/datasets/potential/transition1x.py @@ -3,7 +3,7 @@ import numpy as np from tqdm import tqdm -from openqdc.datasets.base import BaseDataset +from openqdc.datasets.potential.base import BaseDataset from openqdc.utils.constants import NB_ATOMIC_FEATURES from openqdc.utils.io import load_hdf5_file diff --git a/src/openqdc/datasets/potential/waterclusters3_30.py b/src/openqdc/datasets/potential/waterclusters3_30.py index 6aa5748..ce0e5de 100644 --- a/src/openqdc/datasets/potential/waterclusters3_30.py +++ b/src/openqdc/datasets/potential/waterclusters3_30.py @@ -4,7 +4,7 @@ import numpy as np from tqdm import tqdm -from openqdc.datasets.base import BaseDataset +from openqdc.datasets.potential.base import BaseDataset from openqdc.utils.constants import MAX_ATOMIC_NUMBER from openqdc.utils.molecule import atom_table From 8aeadd84259223d1fc7daa48d83f6e869122c010 Mon Sep 17 00:00:00 2001 From: mcneela Date: Tue, 5 Mar 2024 10:36:34 -0500 Subject: [PATCH 12/47] added Metcalf dataset --- src/openqdc/datasets/interaction/__init__.py | 2 + src/openqdc/datasets/interaction/base.py | 2 + src/openqdc/datasets/interaction/metcalf.py | 91 ++++++++++++++++++++ 3 files changed, 95 insertions(+) create mode 100644 src/openqdc/datasets/interaction/metcalf.py diff --git a/src/openqdc/datasets/interaction/__init__.py b/src/openqdc/datasets/interaction/__init__.py index 62392e3..a920396 100644 --- a/src/openqdc/datasets/interaction/__init__.py +++ b/src/openqdc/datasets/interaction/__init__.py @@ -10,6 +10,7 @@ _lazy_imports_obj = { "BaseInteractionDataset": "openqdc.datasets.interaction.base", "DES370K": "openqdc.datasets.interaction.des370k", + "Metcalf": "openqdc.datasets.interaction.metcalf", } _lazy_imports_mod = {} @@ -45,4 +46,5 @@ def __dir__(): __all__ = [ "BaseInteractionDataset", "DES370K", + "Metcalf", ] diff --git a/src/openqdc/datasets/interaction/base.py b/src/openqdc/datasets/interaction/base.py index e266658..c73ce7c 100644 --- a/src/openqdc/datasets/interaction/base.py +++ b/src/openqdc/datasets/interaction/base.py @@ -32,10 +32,12 @@ def __init__( def collate_list(self, list_entries: List[Dict]): # concatenate entries + print(list_entries[0]) res = {key: np.concatenate([r[key] for r in list_entries if r is not None], axis=0) \ for key in list_entries[0] if not isinstance(list_entries[0][key], dict)} csum = np.cumsum(res.get("n_atoms")) + print(csum) x = np.zeros((csum.shape[0], 2), dtype=np.int32) x[1:, 0], x[:, 1] = csum[:-1], csum res["position_idx_range"] = x diff --git a/src/openqdc/datasets/interaction/metcalf.py b/src/openqdc/datasets/interaction/metcalf.py new file mode 100644 index 0000000..3a8e714 --- /dev/null +++ b/src/openqdc/datasets/interaction/metcalf.py @@ -0,0 +1,91 @@ +import os +import numpy as np + +from typing import Dict, List + +from tqdm import tqdm +from rdkit import Chem +from loguru import logger +from openqdc.datasets.interaction import BaseInteractionDataset +from openqdc.utils.molecule import atom_table + +class Metcalf(BaseInteractionDataset): + __name__ = "metcalf" + __energy_unit__ = "hartree" + __distance_unit__ = "ang" + __forces_unit__ = None + __energy_methods__ = [ + "SAPT0/jun-cc-pVDZ" + ] + energy_target_names = [ + "total energy", + "electrostatic energy", + "exchange energy", + "induction energy", + "dispersion energy", + ] + + def read_raw_entries(self) -> List[Dict]: + """ + SMILES strings are inferred from the + .xyz files using the RDKit xyz2mol function. + More details here: + + https://github.com/jensengroup/xyz2mol + + """ + data = [] + for dirname in os.listdir(self.root): + xyz_dir = os.path.join(self.root, dirname) + if not os.path.isdir(xyz_dir): + continue + subset = np.array([dirname.split("-")[0].lower()]) # training, validation, or test + for filename in os.listdir(xyz_dir): + if not filename.endswith(".xyz"): + continue + lines = list(map(lambda x: x.strip(), open(os.path.join(xyz_dir, filename), "r").readlines())) + line_two = lines[1].split(",") + energies = np.array([line_two[1:6]], dtype=np.float32) + num_atoms = np.array([int(lines[0])]) + num_atoms0 = int(line_two[-1]) + num_atoms1 = num_atoms[0] - num_atoms0 + + elem_xyz = np.array([x.split() for x in lines[2:]]) + # elements = np.expand_dims(elem_xyz[:, 0], axis=0) + elements = elem_xyz[:, 0] + xyz = elem_xyz[:, 1:].astype(np.float32) + # xyz0_fname = os.path.join(xyz_dir, f"{filename}_0_tmp.xyz") + # with open(xyz0_fname, "w") as xyz_0_file: + # lines_to_write = [str(num_atoms0) + "\n"] + ["charge=0=\n"] + list(map(lambda x: " ".join(x) + "\n", elem_xyz[:num_atoms0].tolist())) + # lines_to_write[-1] = lines_to_write[-1][:-1] + # print(lines_to_write) + # xyz_0_file.writelines(lines_to_write) # writelines doesn't actually add a newline to each string (weird) + + # xyz1_fname = os.path.join(xyz_dir, f"{filename}_1_tmp.xyz") + # with open(xyz1_fname, "w") as xyz_1_file: + # lines_to_write = [str(num_atoms1) + "\n"] + ["charge=0=\n"] + list(map(lambda x: " ".join(x) + "\n", elem_xyz[num_atoms0:].tolist())) + # lines_to_write[-1] = lines_to_write[-1][:-1] + # xyz_1_file.writelines(lines_to_write) + + # smiles0 = Chem.MolToSmiles(Chem.MolFromXYZFile(xyz0_fname)) + # smiles1 = Chem.MolToSmiles(Chem.MolFromXYZFile(xyz1_fname)) + # + atomic_nums = np.expand_dims(np.array([atom_table.GetAtomicNumber(x) for x in elements]), axis=1) + charges = np.expand_dims(np.array([0] * num_atoms[0]), axis=1) + + atomic_inputs = np.concatenate((atomic_nums, charges, xyz), axis=-1, dtype=np.float32) + + item = dict( + # elements=elements, + n_atoms=num_atoms, + subset=subset, + energies=energies, + positions=xyz, + atomic_inputs=atomic_inputs, + name=np.array([""]) + ) + data.append(item) + return data + + + From 9cf6034b43cfa6fd8989bc58a849453d1671872a Mon Sep 17 00:00:00 2001 From: mcneela Date: Tue, 5 Mar 2024 10:59:06 -0500 Subject: [PATCH 13/47] updated DES370K based on Prudencio's comments --- src/openqdc/datasets/interaction/des370k.py | 37 ++++++--------------- 1 file changed, 10 insertions(+), 27 deletions(-) diff --git a/src/openqdc/datasets/interaction/des370k.py b/src/openqdc/datasets/interaction/des370k.py index 824da46..78b3bde 100644 --- a/src/openqdc/datasets/interaction/des370k.py +++ b/src/openqdc/datasets/interaction/des370k.py @@ -24,15 +24,15 @@ class DES370K(BaseInteractionDataset): "ccsd(t)/cbs", # cbs "ccsd(t)/nn", # nn "sapt0/aug-cc-pwcvxz", - "sapt0/aug-cc-pwcvxz", - "sapt0/aug-cc-pwcvxz", - "sapt0/aug-cc-pwcvxz", - "sapt0/aug-cc-pwcvxz", - "sapt0/aug-cc-pwcvxz", - "sapt0/aug-cc-pwcvxz", - "sapt0/aug-cc-pwcvxz", - "sapt0/aug-cc-pwcvxz", - "sapt0/aug-cc-pwcvxz", + "sapt0/aug-cc-pwcvxz_es", + "sapt0/aug-cc-pwcvxz_ex", + "sapt0/aug-cc-pwcvxz_exs2", + "sapt0/aug-cc-pwcvxz_ind", + "sapt0/aug-cc-pwcvxz_exind", + "sapt0/aug-cc-pwcvxz_disp", + "sapt0/aug-cc-pwcvxz_exdisp_os", + "sapt0/aug-cc-pwcvxz_exdisp_ss", + "sapt0/aug-cc-pwcvxz_delta_HF", ] energy_target_names = [ @@ -87,27 +87,10 @@ def read_raw_entries(self) -> List[Dict]: name = np.array([smiles0 + "." + smiles1]) item = dict( - mol0=dict( - smiles=smiles0, - atomic_inputs=atomic_inputs0, - n_atoms=natoms0, - charge=charge0, - elements=elements0, - atomic_nums=atomic_nums0, - pos=pos0, - ), - mol1=dict( - smiles=smiles1, - atomic_inputs=atomic_inputs1, - n_atoms=natoms1, - charge=charge1, - elements=elements1, - atomic_nums=atomic_nums1, - pos=pos1, - ), energies=energies, subset=np.array(["DES370K"]), n_atoms=np.array([natoms0 + natoms1], dtype=np.int32), + n_atoms_first=np.array([natoms0], dtype=np.int32), atomic_inputs=atomic_inputs, name=name, ) From 6206665a601d2ba4192ccc6292fc85086b3cf989 Mon Sep 17 00:00:00 2001 From: mcneela Date: Tue, 5 Mar 2024 15:09:08 -0500 Subject: [PATCH 14/47] added const molecule_groups lookup for DES370K dataset --- src/openqdc/utils/molecule.py | 43 +++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/src/openqdc/utils/molecule.py b/src/openqdc/utils/molecule.py index 82a58d2..4aeea1b 100644 --- a/src/openqdc/utils/molecule.py +++ b/src/openqdc/utils/molecule.py @@ -8,6 +8,49 @@ atom_table = Chem.GetPeriodicTable() +# molecule group classification for DES datasets +molecule_groups = { + "acids": set(["CCC(=O)O", "CC(=O)O", "OC=O", "OC(=O)CC(=O)O"]), + "alcohols": set(["CCCO", "CCC(O)C", "CCO", "CC(O)C", "CO", "OC1CCCC1", "OC1CCCCC1", "OCCCCO", "OCCCO", "OCCO"]), + "alkanes": set(["C1CCCC1", "C1CCCCC1", "C", "CC1CCCC1", "CC1CCCCC1", "CC", "CCC", "CC(C)C", "CCCC", "CC(C)(C)C", "CCC(C)C", "CCCCC", "CCC(C)(C)C", "CCCCCC"]), + "alkenes": set(["C=C", "CC=C", "CC=CC", "CC(=C)C", "CCC=C", "CC=C(C)C", "CCC=CC", "CCC(=C)C", "CC(=C(C)C)C", "CCC=C(C)C", "CCC(=CC)C", "CCC(=C(C)C)C"]), + "amides": set(["CCCNC=O", "CCC(=O)N", "CCC(=O)NC", "CCC(=O)N(C)C", "CCC(=O)N(CC)C", "CCNC=O", "CCNC(=O)C", "CCN(C=O)CC", "CCN(C(=O)C)C", "CCNC(=O)CC", \ + "CCN(C(=O)C)CC", "CC(=O)N", "CC(=O)N(C)C", "CNC=O", "CNC(=O)C", "CN(C=O)CC", "CNC(=O)CC(=O)N", "CNC(=O)CC(=O)NC", \ + "CNC(=O)CNC=O", "CNC(=O)CNC(=O)C", "CNC(=O)C(NC(=O)C)C", "NC=O, NC(=O)CC(=O)N", "O=CN(C)C", "O=CNCCC(=O)N", "O=CNCCC(=O)NC", "O=CNCCNC=O", "O=CNCC(=O)N"]), + "amines": set(["C1CCCN1", "C1CCCNC1", "CCCN", "CCCNC", "CCCN(C)C", "CCN", "CCN(C)C", "CCNCC", "CCN(CC)C", "CN", "CNC", "CN(C)C", "CNCC", "CNCCCN", "CNCCCNC", "CNCCN", "CNCCNC", "N", "NCCCN", "NCCN"]), + "ammoniums": set(["CC[NH3+]", "C[N+](C)(C)C", "C[NH2+]C", "C[NH3+]", "C[NH+](C)C", "[NH4+]"]), + "benzene": set(["c1ccccc1", "Cc1ccccc1", "CCc1ccccc1"]), + "carboxylates": set(["[O-]C=O", "[O-]C(=O)C", "[O-]C(=O)CC"]), + "esters": set(["CCCOC=O", "CCC(=O)OC", "CCOC(=O)CC", "CCOC(=O)C", "CCOC=O", "COC(=O)C", "COC=O", "O=COCCCOC=O", "O=COCCOC=O", "O=COCOC=O"]), + "ethers": set(["C1CCCO1", "C1CCCOC1", "C1CCOCO1", "C1OCCO1", "CCCOC", "CCCOCOC", "CCOCC", "COCCCOC", "COCC", "COCCOC", "COC", "COCOCC", "COCOC", "O1CCOCC1", "O1COCOC1"]), + "guanidiums": set(["CCNC(=[NH2+])N", "CNC(=[NH2+])N", "NC(=[NH2+])N"]), + "imidazolium": set(["c1[nH]cc[nH+]1", "Cc1c[nH]c[nH+]1", "CCc1c[nH]c[nH+]1"]), + "ketones": set(["CCC(=O)CC", "CCC(=O)C", "CCC=O", "CC(=O)C", "CC=O", "C=O"]), + "monoatomics": set(["[Ar]", "[Br-]", "[Ca+2]", "[Cl-]", "[F-]", "[He]", "[I-]", "[K+]", "[Kr]", "[Li+]", "[Mg+2]", "[Na+]", "[Ne]", "[Xe]"]), + "other": set(["Brc1ccc(cc1)Br", "Brc1ccccc1", "BrC(Br)Br", "BrCBr", "BrCCBr", "CBr", "CC(Br)Br", "CCBr", "CCCC#CC", "CCCC(Cl)(Cl)Cl", "CCCC(Cl)Cl", \ + "CCCCCl", "CCC#CC", "CCCC#C", "CCCC(F)(F)F", "CCCC(F)F", "CCCCF", "CCC(Cl)(Cl)Cl", "CCC(Cl)Cl", "CCCCl", "CCCC#N", "CC#CC", "CCC#C", "CCC(F)(F)F", \ + "CCC(F)F", "CCCF", "CC(Cl)(Cl)Cl", "CC(Cl)Cl", "CCCl", "CCC#N", "CC#C", "CC(F)(F)F", "CC(F)F", "CCF", "CC(I)I", "CCI", "CCl", "CC#N", "CCOP(=O)(OC)OC", \ + "CCOP(=O)(OC)[O-]", "CCOP(=O)(OC)O", "C#C", "CF", "CI", "Clc1ccc(cc1)Cl", "Clc1cccc(c1)Cl", "Clc1ccccc1Cl", "Clc1ccccc1", "Clc1cc(Cl)c(c(c1Cl)Cl)Cl", \ + "Clc1cc(Cl)cc(c1)Cl", "Clc1c(Cl)c(Cl)c(c(c1Cl)Cl)Cl", "ClC(C(Cl)(Cl)Cl)(Cl)Cl", "ClC(C(Cl)(Cl)Cl)Cl", "ClCC(Cl)(Cl)Cl", "ClCC(Cl)Cl", "ClCCCl", "ClC(Cl)Cl", \ + "ClCCl", "CNCCCOC=O", "CNCCCOC", "CNCCC(=O)NC", "CNCCC(=O)N", "CNCCC(=O)O", "CNCCCO", "CNCCCSC", "CNCCCS", "CNCCNC=O", "CNCCOC=O", "CNCCOC", "CNCC(=O)NC", \ + "CNCC(=O)N", "CNCC(=O)O", "CNCCO", "CNCCSC", "CNCCS", "CNC(=O)CCN", "CNC(=O)CC(=O)O", "CNC(=O)CCO", "CNC(=O)CCS", "CNC(=O)CN", "CNC(=O)COC=O", "CNC(=O)CO", \ + "CNCOC=O", "CNCOC", "CNC(=O)CS", "CNCSC", "C#N", "COCCCN", "COCCCOC=O", "COCCC(=O)NC", "COCCC(=O)N", "COCCC(=O)O", "COCCCO", "COCCCSC", "COCCCS", "COCCNC=O", \ + "COCCN", "COCCOC=O", "COCC(=O)NC", "COCC(=O)N", "COCC(=O)O", "COCCO", "COCCSC", "COCCS", "COCNC=O", "COCN", "COCOC=O", "COCO", "COCSC", "COCS", "COP(=O)(OC)OC", \ + "COP(=O)(OC)[O-]", "COP(=O)(OC)O", "COP(=O)(O)O", "COP(=O)(OP(=O)(O)O)[O-]", "CSCCCNC=O", "CSCCCN", "CSCCCOC=O", "CSCCC(=O)N", "CSCCC(=O)O", "CSCCCO", "CSCCN", "CSCCOC=O", \ + "CSCC(=O)NC", "CSCC(=O)N", "CSCC(=O)O", "CSCCO", "CSCNC=O", "CSCN", "CSCOC=O", "CSCO", "Fc1ccc(cc1)F", "Fc1cccc(c1)F", "Fc1ccccc1F", "Fc1ccccc1", "Fc1cc(F)c(c(c1F)F)F", \ + "Fc1cc(F)cc(c1)F", "Fc1c(F)c(F)c(c(c1F)F)F", "FC(C(F)(F)F)(F)F", "FC(C(F)(F)F)F", "FCC(F)(F)F", "FCC(F)F", "FCCF", "FC(F)F", "FCF", "ICCI", "ICI", "NCCCOC=O", "NCCC(=O)N", \ + "NCCC(=O)O", "NCCCO", "NCCCS", "NCCNC=O", "NCCOC=O", "NCC(=O)N", "NCC(=O)O", "NCCO", "NCCS", "NC(=O)CC(=O)O","NC(=O)CCO", "NC(=O)CCS", "NC(=O)CO", "NCOC=O", "NC(=O)CS", \ + "OCCCNC=O", "OCCCOC=O", "OCCC(=O)O", "OCCCS", "OCCNC=O", "OCCOC=O", "OCC(=O)O", "OCCS", "O=CNCCC(=O)O", "O=CNCCOC=O", "O=CNCC(=O)O", "O=CNCOC=O", "O=COCCC(=O)NC", \ + "O=COCCC(=O)N", "O=COCCC(=O)O", "O=COCC(=O)N", "O=COCC(=O)O", "OC(=O)CCS", "OCOC=O", "OC(=O)CS", "OP(=O)(O)O", "[O-]P(=O)(OP(=O)(OC)O)O", "SCCCOC=O", "SCCNC=O", "SCCOC=O", \ + "SCOC=O", "[H][H]"]), + "phenol": set(["Cc1ccc(cc1)O", "CCc1ccc(cc1)O", "Oc1ccccc1"]), + "pyridine": set(["c1cccnc1", "c1ccncn1", "n1ccncc1"]), + "pyrrole": set(["c1ccc2c(c1)[nH]cc2", "c1ccc[nH]1", "c1ncc[nH]1", "Cc1cnc[nH]1", "Cc1c[nH]c2c1cccc2", "Cc1c[nH]cn1", "CCc1cnc[nH]1", "CCc1c[nH]c2c1cccc2", "CCc1c[nH]cn1"]), + "sulfides": set(["C1CCCS1", "C1CCCSC1", "C1CCSCS1", "C1CCSSC1", "C1CSSC1", "C1SCCS1", "CCCSCSC", "CCCSC", "CCCSSC", "CCSCC", "CCSSCC", "CCSSC", "CSCCCSC", "CSCCSC", "CSCC", \ + "CSCSCC", "CSCSC", "CSC", "CSSC", "S1CCSCC1", "S1CSCSC1"]), + "thiols": set(["CCCSS", "CCCS", "CCSS", "CCS", "CSCCCS", "CSCCS", "CSCS", "CSS", "CS", "SCCCS", "SCCS", "SS", "S"]), + "water": set(["O"]), +} def z_to_formula(z): u, c = np.unique(z, return_counts=True) From 5cb57d95a2e70ba59cb90748eecda6bda15b3547 Mon Sep 17 00:00:00 2001 From: mcneela Date: Tue, 5 Mar 2024 15:31:18 -0500 Subject: [PATCH 15/47] updated subsets for DES370K --- src/openqdc/datasets/interaction/des370k.py | 19 +++++++++++++++++-- src/openqdc/utils/molecule.py | 2 +- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/src/openqdc/datasets/interaction/des370k.py b/src/openqdc/datasets/interaction/des370k.py index 78b3bde..ed5b7fb 100644 --- a/src/openqdc/datasets/interaction/des370k.py +++ b/src/openqdc/datasets/interaction/des370k.py @@ -5,9 +5,10 @@ from typing import Dict, List from tqdm import tqdm +from rdkit import Chem from loguru import logger from openqdc.datasets.interaction import BaseInteractionDataset -from openqdc.utils.molecule import atom_table +from openqdc.utils.molecule import atom_table, molecule_groups class DES370K(BaseInteractionDataset): @@ -85,10 +86,24 @@ def read_raw_entries(self) -> List[Dict]: energies = np.array(row[self.energy_target_names].values).astype(np.float32)[None, :] name = np.array([smiles0 + "." + smiles1]) + canon_smiles0 = Chem.MolToSmiles(Chem.MolFromSmiles(smiles0)) + canon_smiles1 = Chem.MolToSmiles(Chem.MolFromSmiles(smiles1)) + + subsets = [] + # for smiles in [canon_smiles0, canon_smiles1]: + for smiles in [smiles0, smiles1]: + found = False + for functional_group, smiles_set in molecule_groups.items(): + if smiles in smiles_set: + subsets.append(functional_group) + found = True + if not found: + logger.info(f"molecule group lookup failed for {smiles}") + item = dict( energies=energies, - subset=np.array(["DES370K"]), + subset=np.array([subsets]), n_atoms=np.array([natoms0 + natoms1], dtype=np.int32), n_atoms_first=np.array([natoms0], dtype=np.int32), atomic_inputs=atomic_inputs, diff --git a/src/openqdc/utils/molecule.py b/src/openqdc/utils/molecule.py index 4aeea1b..e7a3710 100644 --- a/src/openqdc/utils/molecule.py +++ b/src/openqdc/utils/molecule.py @@ -16,7 +16,7 @@ "alkenes": set(["C=C", "CC=C", "CC=CC", "CC(=C)C", "CCC=C", "CC=C(C)C", "CCC=CC", "CCC(=C)C", "CC(=C(C)C)C", "CCC=C(C)C", "CCC(=CC)C", "CCC(=C(C)C)C"]), "amides": set(["CCCNC=O", "CCC(=O)N", "CCC(=O)NC", "CCC(=O)N(C)C", "CCC(=O)N(CC)C", "CCNC=O", "CCNC(=O)C", "CCN(C=O)CC", "CCN(C(=O)C)C", "CCNC(=O)CC", \ "CCN(C(=O)C)CC", "CC(=O)N", "CC(=O)N(C)C", "CNC=O", "CNC(=O)C", "CN(C=O)CC", "CNC(=O)CC(=O)N", "CNC(=O)CC(=O)NC", \ - "CNC(=O)CNC=O", "CNC(=O)CNC(=O)C", "CNC(=O)C(NC(=O)C)C", "NC=O, NC(=O)CC(=O)N", "O=CN(C)C", "O=CNCCC(=O)N", "O=CNCCC(=O)NC", "O=CNCCNC=O", "O=CNCC(=O)N"]), + "CNC(=O)CNC=O", "CNC(=O)CNC(=O)C", "CNC(=O)C(NC(=O)C)C", "NC=O", "NC(=O)CC(=O)N", "O=CN(C)C", "O=CNCCC(=O)N", "O=CNCCC(=O)NC", "O=CNCCNC=O", "O=CNCC(=O)N"]), "amines": set(["C1CCCN1", "C1CCCNC1", "CCCN", "CCCNC", "CCCN(C)C", "CCN", "CCN(C)C", "CCNCC", "CCN(CC)C", "CN", "CNC", "CN(C)C", "CNCC", "CNCCCN", "CNCCCNC", "CNCCN", "CNCCNC", "N", "NCCCN", "NCCN"]), "ammoniums": set(["CC[NH3+]", "C[N+](C)(C)C", "C[NH2+]C", "C[NH3+]", "C[NH+](C)C", "[NH4+]"]), "benzene": set(["c1ccccc1", "Cc1ccccc1", "CCc1ccccc1"]), From e18b7104c6b9bc07580eb030b2b7dc18029641a3 Mon Sep 17 00:00:00 2001 From: mcneela Date: Tue, 5 Mar 2024 15:41:35 -0500 Subject: [PATCH 16/47] added download url for des5m_interaction --- src/openqdc/raws/config_factory.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/openqdc/raws/config_factory.py b/src/openqdc/raws/config_factory.py index df54307..28f115e 100644 --- a/src/openqdc/raws/config_factory.py +++ b/src/openqdc/raws/config_factory.py @@ -97,6 +97,13 @@ class DataConfigFactory: } ) + des5m_interaction = dict( + dataset_name="des5m_interaction", + links={ + "DES5M.zip": "https://zenodo.org/records/5706002/files/DESS5M.zip?download=1", + } + ) + tmqm = dict( dataset_name="tmqm", links={ From 54cadbf8e21b46fe0467b41c9ba1b2af13c59fc8 Mon Sep 17 00:00:00 2001 From: mcneela Date: Tue, 5 Mar 2024 15:48:37 -0500 Subject: [PATCH 17/47] updated README with new datasets --- README.md | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index c143e95..4e3b6e2 100644 --- a/README.md +++ b/README.md @@ -30,7 +30,7 @@ pytest 6. QM Level of Theory --> -We provide support for the following publicly available QM Datasets. +We provide support for the following publicly available QM Potential Energy Datasets. | Dataset | # Molecules | # Conformers | Average Conformers per Molecule | Force Labels | Atom Types | QM Level of Theory | Off-Equilibrium Conformations| | --- | --- | --- | --- | --- | --- | --- | --- | @@ -46,3 +46,11 @@ We provide support for the following publicly available QM Datasets. | [OrbNet Denali](https://arxiv.org/abs/2107.00299) | 212,905 | 2,300,000 | 11 | No | 16 | GFN1-xTB | Yes | | [SN2RXN](https://pubs.acs.org/doi/10.1021/acs.jctc.9b00181) | 39 | 452709 | 11,600 | Yes | 6 | DSD-BLYP-D3(BJ)/def2-TZVP | | | [QM7X](https://www.nature.com/articles/s41597-021-00812-2) | 6,950 | 4,195,237 | 603 | Yes | 7 | PBE0+MBD | Yes | + +We also provide support for the following publicly available QM Noncovalent Interaction Energy Datasets. + +| Dataset | +| --- | +| [DES370K](https://www.nature.com/articles/s41597-021-00833-x) | +| [DES5M](https://www.nature.com/articles/s41597-021-00833-x) | +| [Metcalf](https://pubs.aip.org/aip/jcp/article/152/7/074103/1059677/Approaches-for-machine-learning-intermolecular) | \ No newline at end of file From a922ef7568013236d852a366eb5e5bc1e579aae8 Mon Sep 17 00:00:00 2001 From: mcneela Date: Tue, 5 Mar 2024 17:05:01 -0500 Subject: [PATCH 18/47] Added DES5M dataset --- src/openqdc/datasets/interaction/__init__.py | 2 + src/openqdc/datasets/interaction/des5m.py | 104 +++++++++++++++++++ src/openqdc/utils/molecule.py | 2 + 3 files changed, 108 insertions(+) create mode 100644 src/openqdc/datasets/interaction/des5m.py diff --git a/src/openqdc/datasets/interaction/__init__.py b/src/openqdc/datasets/interaction/__init__.py index a920396..aa33b66 100644 --- a/src/openqdc/datasets/interaction/__init__.py +++ b/src/openqdc/datasets/interaction/__init__.py @@ -10,6 +10,7 @@ _lazy_imports_obj = { "BaseInteractionDataset": "openqdc.datasets.interaction.base", "DES370K": "openqdc.datasets.interaction.des370k", + "DES5M": "openqdc.datasets.interaction.des5m", "Metcalf": "openqdc.datasets.interaction.metcalf", } @@ -46,5 +47,6 @@ def __dir__(): __all__ = [ "BaseInteractionDataset", "DES370K", + "DES5M", "Metcalf", ] diff --git a/src/openqdc/datasets/interaction/des5m.py b/src/openqdc/datasets/interaction/des5m.py new file mode 100644 index 0000000..7fb5d50 --- /dev/null +++ b/src/openqdc/datasets/interaction/des5m.py @@ -0,0 +1,104 @@ +import os +import numpy as np +import pandas as pd + +from typing import Dict, List + +from tqdm import tqdm +from rdkit import Chem +from loguru import logger +from openqdc.datasets.interaction import DES370K +from openqdc.utils.molecule import atom_table, molecule_groups + + +class DES5M(DES370K): + __name__ = "des5m_interaction" + __energy_methods__ = [ + "mp2/cc-pvqz", + "mp2/cc-pvtz", + "mp2/cbs", + "ccsd(t)/nn", # nn + "sapt0/aug-cc-pwcvxz", + "sapt0/aug-cc-pwcvxz_es", + "sapt0/aug-cc-pwcvxz_ex", + "sapt0/aug-cc-pwcvxz_exs2", + "sapt0/aug-cc-pwcvxz_ind", + "sapt0/aug-cc-pwcvxz_exind", + "sapt0/aug-cc-pwcvxz_disp", + "sapt0/aug-cc-pwcvxz_exdisp_os", + "sapt0/aug-cc-pwcvxz_exdisp_ss", + "sapt0/aug-cc-pwcvxz_delta_HF", + ] + + energy_target_names = [ + "qz_MP2_all", + "tz_MP2_all", + "cbs_MP2_all", + "nn_CCSD(T)_all", + "sapt_all", + "sapt_es", + "sapt_ex", + "sapt_exs2", + "sapt_ind", + "sapt_exind", + "sapt_disp", + "sapt_exdisp_os", + "sapt_exdisp_ss", + "sapt_delta_HF", + ] + + def read_raw_entries(self) -> List[Dict]: + self.filepath = os.path.join(self.root, "DES5M.csv") + logger.info(f"Reading DES5M interaction data from {self.filepath}") + df = pd.read_csv(self.filepath) + data = [] + for idx, row in tqdm(df.iterrows(), total=df.shape[0]): + smiles0, smiles1 = row["smiles0"], row["smiles1"] + charge0, charge1 = row["charge0"], row["charge1"] + natoms0, natoms1 = row["natoms0"], row["natoms1"] + pos = np.array(list(map(float, row["xyz"].split()))).reshape(-1, 3) + pos0 = pos[:natoms0] + pos1 = pos[natoms0:] + + elements = row["elements"].split() + elements0 = np.array(elements[:natoms0]) + elements1 = np.array(elements[natoms0:]) + + atomic_nums = np.expand_dims(np.array([atom_table.GetAtomicNumber(x) for x in elements]), axis=1) + atomic_nums0 = np.array(atomic_nums[:natoms0]) + atomic_nums1 = np.array(atomic_nums[natoms0:]) + + charges = np.expand_dims(np.array([charge0] * natoms0 + [charge1] * natoms1), axis=1) + + atomic_inputs = np.concatenate((atomic_nums, charges, pos), axis=-1, dtype=np.float32) + atomic_inputs0 = atomic_inputs[:natoms0, :] + atomic_inputs1 = atomic_inputs[natoms0:, :] + + energies = np.array(row[self.energy_target_names].values).astype(np.float32)[None, :] + + name = np.array([smiles0 + "." + smiles1]) + canon_smiles0 = Chem.MolToSmiles(Chem.MolFromSmiles(smiles0)) + canon_smiles1 = Chem.MolToSmiles(Chem.MolFromSmiles(smiles1)) + + subsets = [] + # for smiles in [canon_smiles0, canon_smiles1]: + for smiles in [smiles0, smiles1]: + found = False + for functional_group, smiles_set in molecule_groups.items(): + if smiles in smiles_set: + subsets.append(functional_group) + found = True + if not found: + logger.info(f"molecule group lookup failed for {smiles}") + + + item = dict( + energies=energies, + subset=np.array([subsets]), + n_atoms=np.array([natoms0 + natoms1], dtype=np.int32), + n_atoms_first=np.array([natoms0], dtype=np.int32), + atomic_inputs=atomic_inputs, + name=name, + ) + data.append(item) + return data diff --git a/src/openqdc/utils/molecule.py b/src/openqdc/utils/molecule.py index e7a3710..bddaaab 100644 --- a/src/openqdc/utils/molecule.py +++ b/src/openqdc/utils/molecule.py @@ -50,6 +50,8 @@ "CSCSCC", "CSCSC", "CSC", "CSSC", "S1CCSCC1", "S1CSCSC1"]), "thiols": set(["CCCSS", "CCCS", "CCSS", "CCS", "CSCCCS", "CSCCS", "CSCS", "CSS", "CS", "SCCCS", "SCCS", "SS", "S"]), "water": set(["O"]), + "flourane": set(["F"]), + "hydrogen chloride": set(["Cl"]), } def z_to_formula(z): From 2146058503c91a7b6685618b6a9084050179f106 Mon Sep 17 00:00:00 2001 From: mcneela Date: Tue, 5 Mar 2024 21:56:37 -0500 Subject: [PATCH 19/47] added des_s66 dataset --- src/openqdc/datasets/interaction/__init__.py | 2 + src/openqdc/datasets/interaction/des5m.py | 2 - src/openqdc/datasets/interaction/dess66.py | 102 +++++++++++++++++++ src/openqdc/raws/config_factory.py | 11 ++ 4 files changed, 115 insertions(+), 2 deletions(-) create mode 100644 src/openqdc/datasets/interaction/dess66.py diff --git a/src/openqdc/datasets/interaction/__init__.py b/src/openqdc/datasets/interaction/__init__.py index aa33b66..c7cddb4 100644 --- a/src/openqdc/datasets/interaction/__init__.py +++ b/src/openqdc/datasets/interaction/__init__.py @@ -12,6 +12,7 @@ "DES370K": "openqdc.datasets.interaction.des370k", "DES5M": "openqdc.datasets.interaction.des5m", "Metcalf": "openqdc.datasets.interaction.metcalf", + "DESS66": "openqdc.datasets.interaction.dess66", } _lazy_imports_mod = {} @@ -49,4 +50,5 @@ def __dir__(): "DES370K", "DES5M", "Metcalf", + "DESS66", ] diff --git a/src/openqdc/datasets/interaction/des5m.py b/src/openqdc/datasets/interaction/des5m.py index 7fb5d50..e79d690 100644 --- a/src/openqdc/datasets/interaction/des5m.py +++ b/src/openqdc/datasets/interaction/des5m.py @@ -77,8 +77,6 @@ def read_raw_entries(self) -> List[Dict]: energies = np.array(row[self.energy_target_names].values).astype(np.float32)[None, :] name = np.array([smiles0 + "." + smiles1]) - canon_smiles0 = Chem.MolToSmiles(Chem.MolFromSmiles(smiles0)) - canon_smiles1 = Chem.MolToSmiles(Chem.MolFromSmiles(smiles1)) subsets = [] # for smiles in [canon_smiles0, canon_smiles1]: diff --git a/src/openqdc/datasets/interaction/dess66.py b/src/openqdc/datasets/interaction/dess66.py new file mode 100644 index 0000000..13f5b9c --- /dev/null +++ b/src/openqdc/datasets/interaction/dess66.py @@ -0,0 +1,102 @@ +import os +import numpy as np +import pandas as pd + +from typing import Dict, List + +from tqdm import tqdm +from rdkit import Chem +from loguru import logger +from openqdc.datasets.interaction import BaseInteractionDataset +from openqdc.utils.molecule import atom_table, molecule_groups + + +class DESS66(BaseInteractionDataset): + __name__ = "des_s66" + __energy_unit__ = "hartree" + __distance_unit__ = "ang" + __forces_unit__ = "hartree/ang" + __energy_methods__ = [ + "mp2/cc-pvdz", + "mp2/cc-pvqz", + "mp2/cc-pvtz", + "mp2/cbs", + "ccsd(t)/cc-pvdz", + "ccsd(t)/cbs", # cbs + "ccsd(t)/nn", # nn + "sapt0/aug-cc-pwcvxz", + "sapt0/aug-cc-pwcvxz_es", + "sapt0/aug-cc-pwcvxz_ex", + "sapt0/aug-cc-pwcvxz_exs2", + "sapt0/aug-cc-pwcvxz_ind", + "sapt0/aug-cc-pwcvxz_exind", + "sapt0/aug-cc-pwcvxz_disp", + "sapt0/aug-cc-pwcvxz_exdisp_os", + "sapt0/aug-cc-pwcvxz_exdisp_ss", + "sapt0/aug-cc-pwcvxz_delta_HF", + ] + + energy_target_names = [ + "cc_MP2_all", + "qz_MP2_all", + "tz_MP2_all", + "cbs_MP2_all", + "cc_CCSD(T)_all", + "cbs_CCSD(T)_all", + "nn_CCSD(T)_all", + "sapt_all", + "sapt_es", + "sapt_ex", + "sapt_exs2", + "sapt_ind", + "sapt_exind", + "sapt_disp", + "sapt_exdisp_os", + "sapt_exdisp_ss", + "sapt_delta_HF", + ] + + def read_raw_entries(self) -> List[Dict]: + self.filepath = os.path.join(self.root, "DESS66.csv") + logger.info(f"Reading DESS66 interaction data from {self.filepath}") + df = pd.read_csv(self.filepath) + data = [] + for idx, row in tqdm(df.iterrows(), total=df.shape[0]): + smiles0, smiles1 = row["smiles0"], row["smiles1"] + charge0, charge1 = row["charge0"], row["charge1"] + natoms0, natoms1 = row["natoms0"], row["natoms1"] + pos = np.array(list(map(float, row["xyz"].split()))).reshape(-1, 3) + pos0 = pos[:natoms0] + pos1 = pos[natoms0:] + + elements = row["elements"].split() + elements0 = np.array(elements[:natoms0]) + elements1 = np.array(elements[natoms0:]) + + atomic_nums = np.expand_dims(np.array([atom_table.GetAtomicNumber(x) for x in elements]), axis=1) + atomic_nums0 = np.array(atomic_nums[:natoms0]) + atomic_nums1 = np.array(atomic_nums[natoms0:]) + + charges = np.expand_dims(np.array([charge0] * natoms0 + [charge1] * natoms1), axis=1) + + atomic_inputs = np.concatenate((atomic_nums, charges, pos), axis=-1, dtype=np.float32) + atomic_inputs0 = atomic_inputs[:natoms0, :] + atomic_inputs1 = atomic_inputs[natoms0:, :] + + energies = np.array(row[self.energy_target_names].values).astype(np.float32)[None, :] + + name = np.array([smiles0 + "." + smiles1]) + + + subset = row["system_name"] + + item = dict( + energies=energies, + subset=np.array([subset]), + n_atoms=np.array([natoms0 + natoms1], dtype=np.int32), + n_atoms_first=np.array([natoms0], dtype=np.int32), + atomic_inputs=atomic_inputs, + name=name, + ) + data.append(item) + return data diff --git a/src/openqdc/raws/config_factory.py b/src/openqdc/raws/config_factory.py index 28f115e..5e01a91 100644 --- a/src/openqdc/raws/config_factory.py +++ b/src/openqdc/raws/config_factory.py @@ -144,6 +144,17 @@ class DataConfigFactory: dataset_name="transition1x", links={"Transition1x.h5": "https://figshare.com/ndownloader/files/36035789"}, ) + + # l7 = dict( + # dataset_name="l7", + # links={"l7.zip": "http://www.begdb.org/moldown.php?id=40"} + # ) + + des_s66 = dict( + dataset_name="des_s66", + links={"DESS66.zip": "https://zenodo.org/records/5676284/files/DESS66.zip?download=1"}, + ) + available_datasets = [k for k in locals().keys() if not k.startswith("__")] def __init__(self): From 4d9a4bab09a5a32770e4190da40bef28bc0fd0c3 Mon Sep 17 00:00:00 2001 From: mcneela Date: Tue, 5 Mar 2024 22:02:08 -0500 Subject: [PATCH 20/47] added DESS66x8 dataset --- src/openqdc/datasets/interaction/__init__.py | 2 + src/openqdc/datasets/interaction/dess66x8.py | 102 +++++++++++++++++++ src/openqdc/raws/config_factory.py | 5 + 3 files changed, 109 insertions(+) create mode 100644 src/openqdc/datasets/interaction/dess66x8.py diff --git a/src/openqdc/datasets/interaction/__init__.py b/src/openqdc/datasets/interaction/__init__.py index c7cddb4..b848fc7 100644 --- a/src/openqdc/datasets/interaction/__init__.py +++ b/src/openqdc/datasets/interaction/__init__.py @@ -13,6 +13,7 @@ "DES5M": "openqdc.datasets.interaction.des5m", "Metcalf": "openqdc.datasets.interaction.metcalf", "DESS66": "openqdc.datasets.interaction.dess66", + "DESS66x8": "openqdc.datasets.interaction.dess66x8", } _lazy_imports_mod = {} @@ -51,4 +52,5 @@ def __dir__(): "DES5M", "Metcalf", "DESS66", + "DESS66x8", ] diff --git a/src/openqdc/datasets/interaction/dess66x8.py b/src/openqdc/datasets/interaction/dess66x8.py new file mode 100644 index 0000000..80e3bad --- /dev/null +++ b/src/openqdc/datasets/interaction/dess66x8.py @@ -0,0 +1,102 @@ +import os +import numpy as np +import pandas as pd + +from typing import Dict, List + +from tqdm import tqdm +from rdkit import Chem +from loguru import logger +from openqdc.datasets.interaction import BaseInteractionDataset +from openqdc.utils.molecule import atom_table, molecule_groups + + +class DESS66x8(BaseInteractionDataset): + __name__ = "des_s66x8" + __energy_unit__ = "hartree" + __distance_unit__ = "ang" + __forces_unit__ = "hartree/ang" + __energy_methods__ = [ + "mp2/cc-pvdz", + "mp2/cc-pvqz", + "mp2/cc-pvtz", + "mp2/cbs", + "ccsd(t)/cc-pvdz", + "ccsd(t)/cbs", # cbs + "ccsd(t)/nn", # nn + "sapt0/aug-cc-pwcvxz", + "sapt0/aug-cc-pwcvxz_es", + "sapt0/aug-cc-pwcvxz_ex", + "sapt0/aug-cc-pwcvxz_exs2", + "sapt0/aug-cc-pwcvxz_ind", + "sapt0/aug-cc-pwcvxz_exind", + "sapt0/aug-cc-pwcvxz_disp", + "sapt0/aug-cc-pwcvxz_exdisp_os", + "sapt0/aug-cc-pwcvxz_exdisp_ss", + "sapt0/aug-cc-pwcvxz_delta_HF", + ] + + energy_target_names = [ + "cc_MP2_all", + "qz_MP2_all", + "tz_MP2_all", + "cbs_MP2_all", + "cc_CCSD(T)_all", + "cbs_CCSD(T)_all", + "nn_CCSD(T)_all", + "sapt_all", + "sapt_es", + "sapt_ex", + "sapt_exs2", + "sapt_ind", + "sapt_exind", + "sapt_disp", + "sapt_exdisp_os", + "sapt_exdisp_ss", + "sapt_delta_HF", + ] + + def read_raw_entries(self) -> List[Dict]: + self.filepath = os.path.join(self.root, "DESS66x8.csv") + logger.info(f"Reading DESS66x8 interaction data from {self.filepath}") + df = pd.read_csv(self.filepath) + data = [] + for idx, row in tqdm(df.iterrows(), total=df.shape[0]): + smiles0, smiles1 = row["smiles0"], row["smiles1"] + charge0, charge1 = row["charge0"], row["charge1"] + natoms0, natoms1 = row["natoms0"], row["natoms1"] + pos = np.array(list(map(float, row["xyz"].split()))).reshape(-1, 3) + pos0 = pos[:natoms0] + pos1 = pos[natoms0:] + + elements = row["elements"].split() + elements0 = np.array(elements[:natoms0]) + elements1 = np.array(elements[natoms0:]) + + atomic_nums = np.expand_dims(np.array([atom_table.GetAtomicNumber(x) for x in elements]), axis=1) + atomic_nums0 = np.array(atomic_nums[:natoms0]) + atomic_nums1 = np.array(atomic_nums[natoms0:]) + + charges = np.expand_dims(np.array([charge0] * natoms0 + [charge1] * natoms1), axis=1) + + atomic_inputs = np.concatenate((atomic_nums, charges, pos), axis=-1, dtype=np.float32) + atomic_inputs0 = atomic_inputs[:natoms0, :] + atomic_inputs1 = atomic_inputs[natoms0:, :] + + energies = np.array(row[self.energy_target_names].values).astype(np.float32)[None, :] + + name = np.array([smiles0 + "." + smiles1]) + + + subset = row["system_name"] + + item = dict( + energies=energies, + subset=np.array([subset]), + n_atoms=np.array([natoms0 + natoms1], dtype=np.int32), + n_atoms_first=np.array([natoms0], dtype=np.int32), + atomic_inputs=atomic_inputs, + name=name, + ) + data.append(item) + return data diff --git a/src/openqdc/raws/config_factory.py b/src/openqdc/raws/config_factory.py index 5e01a91..312faf0 100644 --- a/src/openqdc/raws/config_factory.py +++ b/src/openqdc/raws/config_factory.py @@ -155,6 +155,11 @@ class DataConfigFactory: links={"DESS66.zip": "https://zenodo.org/records/5676284/files/DESS66.zip?download=1"}, ) + des_s66x8 = dict( + dataset_name="des_s66x8", + links={"DESS66x8.zip": "https://zenodo.org/records/5676284/files/DESS66x8.zip?download=1"} + ) + available_datasets = [k for k in locals().keys() if not k.startswith("__")] def __init__(self): From c2229e38be0d76680155323250ef95d5ce78143d Mon Sep 17 00:00:00 2001 From: mcneela Date: Wed, 6 Mar 2024 09:56:24 -0500 Subject: [PATCH 21/47] small update to __init__ file --- src/openqdc/datasets/interaction/__init__.py | 4 ++++ src/openqdc/datasets/interaction/des370k.py | 2 -- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/openqdc/datasets/interaction/__init__.py b/src/openqdc/datasets/interaction/__init__.py index b848fc7..bf266c2 100644 --- a/src/openqdc/datasets/interaction/__init__.py +++ b/src/openqdc/datasets/interaction/__init__.py @@ -45,6 +45,10 @@ def __dir__(): if TYPE_CHECKING or os.environ.get("OPENQDC_DISABLE_LAZY_LOADING", "0") == "1": from .base import BaseInteractionDataset from .des370k import DES370K + from .des5m import DES5M + from .metcalf import Metcalf + from .dess66 import DESS66 + from .dess66x8 import DESS66x8 __all__ = [ "BaseInteractionDataset", diff --git a/src/openqdc/datasets/interaction/des370k.py b/src/openqdc/datasets/interaction/des370k.py index ed5b7fb..65a5835 100644 --- a/src/openqdc/datasets/interaction/des370k.py +++ b/src/openqdc/datasets/interaction/des370k.py @@ -86,8 +86,6 @@ def read_raw_entries(self) -> List[Dict]: energies = np.array(row[self.energy_target_names].values).astype(np.float32)[None, :] name = np.array([smiles0 + "." + smiles1]) - canon_smiles0 = Chem.MolToSmiles(Chem.MolFromSmiles(smiles0)) - canon_smiles1 = Chem.MolToSmiles(Chem.MolFromSmiles(smiles1)) subsets = [] # for smiles in [canon_smiles0, canon_smiles1]: From 9349454ec47a76fc9bd07109189f6edfd1fc42fe Mon Sep 17 00:00:00 2001 From: mcneela Date: Wed, 6 Mar 2024 10:52:40 -0500 Subject: [PATCH 22/47] added L7 dataset --- src/openqdc/datasets/interaction/L7.py | 73 ++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) create mode 100644 src/openqdc/datasets/interaction/L7.py diff --git a/src/openqdc/datasets/interaction/L7.py b/src/openqdc/datasets/interaction/L7.py new file mode 100644 index 0000000..a0d657f --- /dev/null +++ b/src/openqdc/datasets/interaction/L7.py @@ -0,0 +1,73 @@ +import os +import numpy as np +import pandas as pd + +from typing import Dict, List + +from tqdm import tqdm +from rdkit import Chem +from ruamel.yaml import YAML +from loguru import logger +from openqdc.datasets.interaction import BaseInteractionDataset +from openqdc.utils.molecule import atom_table, molecule_groups + + +class L7(BaseInteractionDataset): + __name__ = "L7" + __energy_unit__ = "hartree" + __distance_unit__ = "ang" + __forces_unit__ = "hartree/ang" + __energy_methods__ = [ + "CSD(T) | QCISD(T)", + "DLPNO-CCSD(T)", + "MP2/CBS", + "MP2C/CBS", + "fixed", + "DLPNO-CCSD(T0)", + "LNO-CCSD(T)", + "FN-DMC", + ] + + energy_target_names = [] + + def read_raw_entries(self) -> List[Dict]: + yaml_fpath = os.path.join(self.root, "l7.yaml") + logger.info(f"Reading L7 interaction data from {self.root}") + yaml_file = open(yaml_fpath, "r") + yaml = YAML() + data = [] + data_dict = yaml.load(yaml_file) + charge0 = int(data_dict["description"]["global_setup"]["molecule_a"]["charge"]) + charge1 = int(data_dict["description"]["global_setup"]["molecule_b"]["charge"]) + + for idx, item in enumerate(data_dict["items"]): + energies = [] + name = np.array([item["shortname"]]) + fname = item["geometry"].split(":")[1] + energies.append(item["reference_value"]) + xyz_file = open(os.path.join(self.root, f"{fname}.xyz"), "r") + lines = list(map(lambda x: x.strip().split(), xyz_file.readlines())) + lines.pop(1) + n_atoms = np.array([int(lines[0][0])], dtype=np.int32) + n_atoms_first = np.array([int(item["setup"]["molecule_a"]["selection"].split("-")[1])], dtype=np.int32) + subset = np.array([item["group"]]) + energies += [float(val[idx]) for val in list(data_dict["alternative_reference"].values())] + energies = np.array([energies], dtype=np.float32) + pos = np.array(lines[1:])[:, 1:].astype(np.float32) + elems = np.array(lines[1:])[:, 0] + atomic_nums = np.expand_dims(np.array([atom_table.GetAtomicNumber(x) for x in elems]), axis=1) + natoms0 = n_atoms_first[0] + natoms1 = n_atoms[0] - natoms0 + charges = np.expand_dims(np.array([charge0] * natoms0 + [charge1] * natoms1), axis=1) + atomic_inputs = np.concatenate((atomic_nums, charges, pos), axis=-1, dtype=np.float32) + + item = dict( + energies=energies, + subset=subset, + n_atoms=n_atoms, + n_atoms_first=n_atoms_first, + atomic_inputs=atomic_inputs, + name=name, + ) + data.append(item) + return data From c3bdc640d7039d00350fb6fc26f0449b81e9133b Mon Sep 17 00:00:00 2001 From: mcneela Date: Wed, 6 Mar 2024 11:10:52 -0500 Subject: [PATCH 23/47] added X40 dataset --- src/openqdc/datasets/interaction/X40.py | 70 +++++++++++++++++++++++++ 1 file changed, 70 insertions(+) create mode 100644 src/openqdc/datasets/interaction/X40.py diff --git a/src/openqdc/datasets/interaction/X40.py b/src/openqdc/datasets/interaction/X40.py new file mode 100644 index 0000000..f89e6dd --- /dev/null +++ b/src/openqdc/datasets/interaction/X40.py @@ -0,0 +1,70 @@ +import os +import numpy as np +import pandas as pd + +from typing import Dict, List + +from tqdm import tqdm +from rdkit import Chem +from ruamel.yaml import YAML +from loguru import logger +from openqdc.datasets.interaction import BaseInteractionDataset +from openqdc.utils.molecule import atom_table, molecule_groups + + +class X40(BaseInteractionDataset): + __name__ = "X40" + __energy_unit__ = "hartree" + __distance_unit__ = "ang" + __forces_unit__ = "hartree/ang" + __energy_methods__ = [ + "default", + "MP2/CBS", + "dCCSD(T)/haDZ", + "dCCSD(T)/haTZ", + "MP2.5/CBS(aDZ)", + ] + + energy_target_names = [] + + def read_raw_entries(self) -> List[Dict]: + yaml_fpath = os.path.join(self.root, "x40.yaml") + logger.info(f"Reading X40 interaction data from {self.root}") + yaml_file = open(yaml_fpath, "r") + yaml = YAML() + data = [] + data_dict = yaml.load(yaml_file) + charge0 = int(data_dict["description"]["global_setup"]["molecule_a"]["charge"]) + charge1 = int(data_dict["description"]["global_setup"]["molecule_b"]["charge"]) + + for idx, item in enumerate(data_dict["items"]): + energies = [] + name = np.array([item["shortname"]]) + energies.append(float(item["reference_value"])) + xyz_file = open(os.path.join(self.root, f"{item['shortname']}.xyz"), "r") + lines = list(map(lambda x: x.strip().split(), xyz_file.readlines())) + setup = lines.pop(1) + n_atoms = np.array([int(lines[0][0])], dtype=np.int32) + n_atoms_first = setup[0].split("-")[1] + n_atoms_first = np.array([int(n_atoms_first)], dtype=np.int32) + subset = np.array([item["group"]]) + energies += [float(val[idx]) for val in list(data_dict["alternative_reference"].values())] + energies = np.array([energies], dtype=np.float32) + pos = np.array(lines[1:])[:, 1:].astype(np.float32) + elems = np.array(lines[1:])[:, 0] + atomic_nums = np.expand_dims(np.array([atom_table.GetAtomicNumber(x) for x in elems]), axis=1) + natoms0 = n_atoms_first[0] + natoms1 = n_atoms[0] - natoms0 + charges = np.expand_dims(np.array([charge0] * natoms0 + [charge1] * natoms1), axis=1) + atomic_inputs = np.concatenate((atomic_nums, charges, pos), axis=-1, dtype=np.float32) + + item = dict( + energies=energies, + subset=subset, + n_atoms=n_atoms, + n_atoms_first=n_atoms_first, + atomic_inputs=atomic_inputs, + name=name, + ) + data.append(item) + return data From 23c0739c177b56168d84039e95b12e257f60de7a Mon Sep 17 00:00:00 2001 From: mcneela Date: Wed, 6 Mar 2024 11:13:28 -0500 Subject: [PATCH 24/47] add new datasets to __init__.py --- src/openqdc/datasets/interaction/__init__.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/openqdc/datasets/interaction/__init__.py b/src/openqdc/datasets/interaction/__init__.py index bf266c2..3d6f17e 100644 --- a/src/openqdc/datasets/interaction/__init__.py +++ b/src/openqdc/datasets/interaction/__init__.py @@ -14,6 +14,8 @@ "Metcalf": "openqdc.datasets.interaction.metcalf", "DESS66": "openqdc.datasets.interaction.dess66", "DESS66x8": "openqdc.datasets.interaction.dess66x8", + "L7": "openqdc.datasets.interaction.L7", + "X40": "openqdc.datasets.interaction.X40", } _lazy_imports_mod = {} @@ -49,6 +51,8 @@ def __dir__(): from .metcalf import Metcalf from .dess66 import DESS66 from .dess66x8 import DESS66x8 + from .L7 import L7 + from .X40 import X40 __all__ = [ "BaseInteractionDataset", @@ -57,4 +61,6 @@ def __dir__(): "Metcalf", "DESS66", "DESS66x8", + "L7", + "X40", ] From 74f87a6934c9ddcf847e80bda884ba764a934b8d Mon Sep 17 00:00:00 2001 From: mcneela Date: Thu, 7 Mar 2024 10:35:00 -0500 Subject: [PATCH 25/47] added splinter dataset --- src/openqdc/datasets/interaction/__init__.py | 3 + src/openqdc/datasets/interaction/splinter.py | 107 +++++++++++++++++++ 2 files changed, 110 insertions(+) create mode 100644 src/openqdc/datasets/interaction/splinter.py diff --git a/src/openqdc/datasets/interaction/__init__.py b/src/openqdc/datasets/interaction/__init__.py index 3d6f17e..1906b7f 100644 --- a/src/openqdc/datasets/interaction/__init__.py +++ b/src/openqdc/datasets/interaction/__init__.py @@ -16,6 +16,7 @@ "DESS66x8": "openqdc.datasets.interaction.dess66x8", "L7": "openqdc.datasets.interaction.L7", "X40": "openqdc.datasets.interaction.X40", + "Splinter": "openqdc.datasets.interaction.splinter", } _lazy_imports_mod = {} @@ -53,6 +54,7 @@ def __dir__(): from .dess66x8 import DESS66x8 from .L7 import L7 from .X40 import X40 + from .splinter import Splinter __all__ = [ "BaseInteractionDataset", @@ -63,4 +65,5 @@ def __dir__(): "DESS66x8", "L7", "X40", + "Splinter", ] diff --git a/src/openqdc/datasets/interaction/splinter.py b/src/openqdc/datasets/interaction/splinter.py new file mode 100644 index 0000000..ff39268 --- /dev/null +++ b/src/openqdc/datasets/interaction/splinter.py @@ -0,0 +1,107 @@ +import os +import numpy as np +import pandas as pd + +from typing import Dict, List + +from tqdm import tqdm +from rdkit import Chem +from loguru import logger +from openqdc.datasets.interaction import BaseInteractionDataset +from openqdc.utils.molecule import atom_table, molecule_groups + + +class Splinter(BaseInteractionDataset): + __name__ = "splinter" + __energy_methods__ = [ + "sapt0/jun-cc-pV(D+d)Z_unscaled", + "sapt0/jun-cc-pV(D+d)Z_es_unscaled", + "sapt0/jun-cc-pV(D+d)Z_ex_unscaled", + "sapt0/jun-cc-pV(D+d)Z_ind_unscaled", + "sapt0/jun-cc-pV(D+d)Z_disp_unscaled", + "sapt0/jun-cc-pV(D+d)Z_scaled", + "sapt0/jun-cc-pV(D+d)Z_es_scaled", + "sapt0/jun-cc-pV(D+d)Z_ex_scaled", + "sapt0/jun-cc-pV(D+d)Z_ind_scaled", + "sapt0/jun-cc-pV(D+d)Z_disp_scaled", + "sapt0/aug-cc-pV(D+d)Z_unscaled", + "sapt0/aug-cc-pV(D+d)Z_es_unscaled", + "sapt0/aug-cc-pV(D+d)Z_ex_unscaled", + "sapt0/aug-cc-pV(D+d)Z_ind_unscaled", + "sapt0/aug-cc-pV(D+d)Z_disp_unscaled", + "sapt0/aug-cc-pV(D+d)Z_scaled", + "sapt0/aug-cc-pV(D+d)Z_es_scaled", + "sapt0/aug-cc-pV(D+d)Z_ex_scaled", + "sapt0/aug-cc-pV(D+d)Z_ind_scaled", + "sapt0/aug-cc-pV(D+d)Z_disp_scaled", + ] + + energy_target_names = [] + + def read_raw_entries(self) -> List[Dict]: + logger.info(f"Reading Splinter interaction data from {self.root}") + data = [] + i = 0 + for root, dirs, files in tqdm(os.walk(self.root), total=1709042): # total is currently an approximation + i += 1 + if not files: + continue + for filename in files: + if not filename.endswith(".xyz"): + continue + filepath = os.path.join(root, filename) + filein = open(filepath, "r") + lines = list(map(lambda x: x.strip(), filein.readlines())) + n_atoms = np.array([int(lines[0])], dtype=np.int32) + metadata = lines[1].split(",") + try: + ( + protein_monomer_name, + protein_interaction_site_type, + ligand_monomer_name, + ligand_interaction_site_type, + index, r, theta_P, tau_P, theta_L, + tau_L, tau_PL + ) = metadata[0].split("_") + index, r, theta_P, tau_P, theta_L, tau_L, tau_PL = list(map(float, [index, r, theta_P, tau_P, theta_L, tau_L, tau_PL])) + except: + (protein_monomer_name, + protein_interaction_site_type, + ligand_monomer_name, + ligand_interaction_site_type, + index, _) = metadata[0].split("_") + r, theta_P, tau_P, theta_L, tau_L, tau_PL = [None] * 6 + energies = np.array([list(map(float, metadata[4:-1]))]).astype(np.float32) + n_atoms_first = np.array([int(metadata[-1])], dtype=np.int32) + total_charge, charge0, charge1 = list(map(int, metadata[1:4])) + lines = list(map(lambda x: x.split(), lines[2:])) + pos = np.array(lines)[:, 1:].astype(np.float32) + elems = np.array(lines)[:, 0] + atomic_nums = np.expand_dims(np.array([atom_table.GetAtomicNumber(x) for x in elems]), axis=1) + natoms0 = n_atoms_first[0] + natoms1 = n_atoms[0] - natoms0 + charges = np.expand_dims(np.array([charge0] * natoms0 + [charge1] * natoms1), axis=1) + atomic_inputs = np.concatenate((atomic_nums, charges, pos), axis=-1, dtype=np.float32) + subset = np.array([root.split("/")[-1]]) + + item = dict( + energies=energies, + subset=subset, + n_atoms=n_atoms, + n_atoms_first=n_atoms_first, + atomic_inputs=atomic_inputs, + protein_monomer_name= np.array([protein_monomer_name]), + protein_interaction_site_type=np.array([protein_interaction_site_type]), + ligand_monomer_name=np.array([ligand_monomer_name]), + ligand_interaction_site_type=np.array([ligand_interaction_site_type]), + index = np.array([index]), + r=np.array([r]), + theta_P=np.array([theta_P]), + tau_P=np.array([tau_P]), + theta_L=np.array([theta_L]), + tau_L=np.array([tau_L]), + tau_PL=np.array([tau_PL]), + ) + data.append(item) + logger.info(f"Processed {i} files in total") + return data From f046ea9b56076066b420eeedba12341039e42174 Mon Sep 17 00:00:00 2001 From: Danny McNeela Date: Thu, 7 Mar 2024 16:29:31 +0000 Subject: [PATCH 26/47] fixed a couple splinter things --- src/openqdc/datasets/interaction/splinter.py | 121 ++++++++++--------- 1 file changed, 61 insertions(+), 60 deletions(-) diff --git a/src/openqdc/datasets/interaction/splinter.py b/src/openqdc/datasets/interaction/splinter.py index ff39268..7d7dbd0 100644 --- a/src/openqdc/datasets/interaction/splinter.py +++ b/src/openqdc/datasets/interaction/splinter.py @@ -42,66 +42,67 @@ def read_raw_entries(self) -> List[Dict]: logger.info(f"Reading Splinter interaction data from {self.root}") data = [] i = 0 - for root, dirs, files in tqdm(os.walk(self.root), total=1709042): # total is currently an approximation - i += 1 - if not files: - continue - for filename in files: - if not filename.endswith(".xyz"): - continue - filepath = os.path.join(root, filename) - filein = open(filepath, "r") - lines = list(map(lambda x: x.strip(), filein.readlines())) - n_atoms = np.array([int(lines[0])], dtype=np.int32) - metadata = lines[1].split(",") - try: - ( - protein_monomer_name, - protein_interaction_site_type, - ligand_monomer_name, - ligand_interaction_site_type, - index, r, theta_P, tau_P, theta_L, - tau_L, tau_PL - ) = metadata[0].split("_") - index, r, theta_P, tau_P, theta_L, tau_L, tau_PL = list(map(float, [index, r, theta_P, tau_P, theta_L, tau_L, tau_PL])) - except: - (protein_monomer_name, - protein_interaction_site_type, - ligand_monomer_name, - ligand_interaction_site_type, - index, _) = metadata[0].split("_") - r, theta_P, tau_P, theta_L, tau_L, tau_PL = [None] * 6 - energies = np.array([list(map(float, metadata[4:-1]))]).astype(np.float32) - n_atoms_first = np.array([int(metadata[-1])], dtype=np.int32) - total_charge, charge0, charge1 = list(map(int, metadata[1:4])) - lines = list(map(lambda x: x.split(), lines[2:])) - pos = np.array(lines)[:, 1:].astype(np.float32) - elems = np.array(lines)[:, 0] - atomic_nums = np.expand_dims(np.array([atom_table.GetAtomicNumber(x) for x in elems]), axis=1) - natoms0 = n_atoms_first[0] - natoms1 = n_atoms[0] - natoms0 - charges = np.expand_dims(np.array([charge0] * natoms0 + [charge1] * natoms1), axis=1) - atomic_inputs = np.concatenate((atomic_nums, charges, pos), axis=-1, dtype=np.float32) - subset = np.array([root.split("/")[-1]]) + with tqdm(total=1706343) as progress_bar: + for root, dirs, files in os.walk(self.root): # total is currently an approximation + for filename in files: + if not filename.endswith(".xyz"): + continue + i += 1 + filepath = os.path.join(root, filename) + filein = open(filepath, "r") + lines = list(map(lambda x: x.strip(), filein.readlines())) + n_atoms = np.array([int(lines[0])], dtype=np.int32) + metadata = lines[1].split(",") + try: + ( + protein_monomer_name, + protein_interaction_site_type, + ligand_monomer_name, + ligand_interaction_site_type, + index, r, theta_P, tau_P, theta_L, + tau_L, tau_PL + ) = metadata[0].split("_") + index, r, theta_P, tau_P, theta_L, tau_L, tau_PL = list(map(float, [index, r, theta_P, tau_P, theta_L, tau_L, tau_PL])) + except: + (protein_monomer_name, + protein_interaction_site_type, + ligand_monomer_name, + ligand_interaction_site_type, + index, _) = metadata[0].split("_") + r, theta_P, tau_P, theta_L, tau_L, tau_PL = [None] * 6 + energies = np.array([list(map(float, metadata[4:-1]))]).astype(np.float32) + n_atoms_first = np.array([int(metadata[-1])], dtype=np.int32) + total_charge, charge0, charge1 = list(map(int, metadata[1:4])) + lines = list(map(lambda x: x.split(), lines[2:])) + pos = np.array(lines)[:, 1:].astype(np.float32) + elems = np.array(lines)[:, 0] + atomic_nums = np.expand_dims(np.array([atom_table.GetAtomicNumber(x) for x in elems]), axis=1) + natoms0 = n_atoms_first[0] + natoms1 = n_atoms[0] - natoms0 + charges = np.expand_dims(np.array([charge0] * natoms0 + [charge1] * natoms1), axis=1) + atomic_inputs = np.concatenate((atomic_nums, charges, pos), axis=-1, dtype=np.float32) + subset = np.array([root.split("/")[-1]]) - item = dict( - energies=energies, - subset=subset, - n_atoms=n_atoms, - n_atoms_first=n_atoms_first, - atomic_inputs=atomic_inputs, - protein_monomer_name= np.array([protein_monomer_name]), - protein_interaction_site_type=np.array([protein_interaction_site_type]), - ligand_monomer_name=np.array([ligand_monomer_name]), - ligand_interaction_site_type=np.array([ligand_interaction_site_type]), - index = np.array([index]), - r=np.array([r]), - theta_P=np.array([theta_P]), - tau_P=np.array([tau_P]), - theta_L=np.array([theta_L]), - tau_L=np.array([tau_L]), - tau_PL=np.array([tau_PL]), - ) - data.append(item) + item = dict( + energies=energies, + subset=subset, + n_atoms=n_atoms, + n_atoms_first=n_atoms_first, + atomic_inputs=atomic_inputs, + protein_monomer_name= np.array([protein_monomer_name]), + protein_interaction_site_type=np.array([protein_interaction_site_type]), + ligand_monomer_name=np.array([ligand_monomer_name]), + ligand_interaction_site_type=np.array([ligand_interaction_site_type]), + index = np.array([index]), + r=np.array([r]), + theta_P=np.array([theta_P]), + tau_P=np.array([tau_P]), + theta_L=np.array([theta_L]), + tau_L=np.array([tau_L]), + tau_PL=np.array([tau_PL]), + name=np.array([protein_monomer_name + "." + ligand_monomer_name]) + ) + data.append(item) + progress_bar.update(1) logger.info(f"Processed {i} files in total") return data From 3c84ee910ecf1731ef51529fbdeed0103e3dcc71 Mon Sep 17 00:00:00 2001 From: Danny McNeela Date: Thu, 7 Mar 2024 16:44:31 +0000 Subject: [PATCH 27/47] update default data shapes for interaction datasets --- src/openqdc/datasets/interaction/base.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/openqdc/datasets/interaction/base.py b/src/openqdc/datasets/interaction/base.py index c73ce7c..8de065e 100644 --- a/src/openqdc/datasets/interaction/base.py +++ b/src/openqdc/datasets/interaction/base.py @@ -10,6 +10,9 @@ set_cache_dir, ) from openqdc.datasets.potential.base import BaseDataset +from openqdc.utils.constants import ( + NB_ATOMIC_FEATURES +) from loguru import logger @@ -43,3 +46,12 @@ def collate_list(self, list_entries: List[Dict]): res["position_idx_range"] = x return res + + @property + def data_shapes(self): + return { + "atomic_inputs": (-1, NB_ATOMIC_FEATURES), + "position_idx_range": (-1, 2), + "energies": (-1, len(self.__energy_methods__)), + "forces": (-1, 3, len(self.force_target_names)), + } From 04c81aea5c4745cfd36ef53c24a1e516fe2b14fc Mon Sep 17 00:00:00 2001 From: mcneela Date: Thu, 7 Mar 2024 11:51:06 -0500 Subject: [PATCH 28/47] updated test_dummy.py with new import structure --- tests/test_dummy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_dummy.py b/tests/test_dummy.py index 65fe9b6..f82376c 100644 --- a/tests/test_dummy.py +++ b/tests/test_dummy.py @@ -1,6 +1,6 @@ """Path hack to make tests work.""" -from openqdc.datasets.dummy import Dummy # noqa: E402 +from openqdc.datasets.potential.dummy import Dummy # noqa: E402 from openqdc.utils.atomization_energies import ( ISOLATED_ATOM_ENERGIES, IsolatedAtomEnergyFactory, From 11e28581713ce2c584a6c7e8caa9ae7bb963babd Mon Sep 17 00:00:00 2001 From: mcneela Date: Thu, 7 Mar 2024 11:53:02 -0500 Subject: [PATCH 29/47] fix test_import.py --- tests/test_import.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_import.py b/tests/test_import.py index 0736e30..0d3344a 100644 --- a/tests/test_import.py +++ b/tests/test_import.py @@ -1,2 +1,2 @@ def test_open_qdc(): - import openQDC # noqa + import openqdc # noqa From 78f042338dd5c0e8dd8760dd1da7abd0d7bbf409 Mon Sep 17 00:00:00 2001 From: mcneela Date: Fri, 8 Mar 2024 10:00:56 -0500 Subject: [PATCH 30/47] code cleanup for the linter --- src/openqdc/datasets/interaction/L7.py | 13 ++--- src/openqdc/datasets/interaction/X40.py | 13 ++--- src/openqdc/datasets/interaction/__init__.py | 6 +-- src/openqdc/datasets/interaction/base.py | 31 ++++------- src/openqdc/datasets/interaction/des370k.py | 20 ++------ src/openqdc/datasets/interaction/des5m.py | 20 ++------ src/openqdc/datasets/interaction/dess66.py | 22 +++----- src/openqdc/datasets/interaction/dess66x8.py | 22 +++----- src/openqdc/datasets/interaction/metcalf.py | 39 +++----------- src/openqdc/datasets/interaction/splinter.py | 54 +++++++++++--------- 10 files changed, 83 insertions(+), 157 deletions(-) diff --git a/src/openqdc/datasets/interaction/L7.py b/src/openqdc/datasets/interaction/L7.py index a0d657f..db6fa3c 100644 --- a/src/openqdc/datasets/interaction/L7.py +++ b/src/openqdc/datasets/interaction/L7.py @@ -1,15 +1,12 @@ import os -import numpy as np -import pandas as pd - from typing import Dict, List -from tqdm import tqdm -from rdkit import Chem -from ruamel.yaml import YAML +import numpy as np from loguru import logger +from ruamel.yaml import YAML + from openqdc.datasets.interaction import BaseInteractionDataset -from openqdc.utils.molecule import atom_table, molecule_groups +from openqdc.utils.molecule import atom_table class L7(BaseInteractionDataset): @@ -47,7 +44,7 @@ def read_raw_entries(self) -> List[Dict]: energies.append(item["reference_value"]) xyz_file = open(os.path.join(self.root, f"{fname}.xyz"), "r") lines = list(map(lambda x: x.strip().split(), xyz_file.readlines())) - lines.pop(1) + lines.pop(1) n_atoms = np.array([int(lines[0][0])], dtype=np.int32) n_atoms_first = np.array([int(item["setup"]["molecule_a"]["selection"].split("-")[1])], dtype=np.int32) subset = np.array([item["group"]]) diff --git a/src/openqdc/datasets/interaction/X40.py b/src/openqdc/datasets/interaction/X40.py index f89e6dd..826e455 100644 --- a/src/openqdc/datasets/interaction/X40.py +++ b/src/openqdc/datasets/interaction/X40.py @@ -1,15 +1,12 @@ import os -import numpy as np -import pandas as pd - from typing import Dict, List -from tqdm import tqdm -from rdkit import Chem -from ruamel.yaml import YAML +import numpy as np from loguru import logger +from ruamel.yaml import YAML + from openqdc.datasets.interaction import BaseInteractionDataset -from openqdc.utils.molecule import atom_table, molecule_groups +from openqdc.utils.molecule import atom_table class X40(BaseInteractionDataset): @@ -43,7 +40,7 @@ def read_raw_entries(self) -> List[Dict]: energies.append(float(item["reference_value"])) xyz_file = open(os.path.join(self.root, f"{item['shortname']}.xyz"), "r") lines = list(map(lambda x: x.strip().split(), xyz_file.readlines())) - setup = lines.pop(1) + setup = lines.pop(1) n_atoms = np.array([int(lines[0][0])], dtype=np.int32) n_atoms_first = setup[0].split("-")[1] n_atoms_first = np.array([int(n_atoms_first)], dtype=np.int32) diff --git a/src/openqdc/datasets/interaction/__init__.py b/src/openqdc/datasets/interaction/__init__.py index 1906b7f..f3cdfa0 100644 --- a/src/openqdc/datasets/interaction/__init__.py +++ b/src/openqdc/datasets/interaction/__init__.py @@ -47,14 +47,14 @@ def __dir__(): if TYPE_CHECKING or os.environ.get("OPENQDC_DISABLE_LAZY_LOADING", "0") == "1": from .base import BaseInteractionDataset - from .des370k import DES370K from .des5m import DES5M - from .metcalf import Metcalf + from .des370k import DES370K from .dess66 import DESS66 from .dess66x8 import DESS66x8 from .L7 import L7 - from .X40 import X40 + from .metcalf import Metcalf from .splinter import Splinter + from .X40 import X40 __all__ = [ "BaseInteractionDataset", diff --git a/src/openqdc/datasets/interaction/base.py b/src/openqdc/datasets/interaction/base.py index 8de065e..c06c4bc 100644 --- a/src/openqdc/datasets/interaction/base.py +++ b/src/openqdc/datasets/interaction/base.py @@ -1,23 +1,11 @@ -from typing import Dict, List, Optional, Union -from openqdc.utils.io import ( - copy_exists, - dict_to_atoms, - get_local_cache, - load_hdf5_file, - load_pkl, - pull_locally, - push_remote, - set_cache_dir, -) -from openqdc.datasets.potential.base import BaseDataset -from openqdc.utils.constants import ( - NB_ATOMIC_FEATURES -) - -from loguru import logger +from typing import Dict, List, Optional import numpy as np +from openqdc.datasets.potential.base import BaseDataset +from openqdc.utils.constants import NB_ATOMIC_FEATURES + + class BaseInteractionDataset(BaseDataset): def __init__( self, @@ -30,14 +18,17 @@ def __init__( energy_unit=energy_unit, distance_unit=distance_unit, overwrite_local_cache=overwrite_local_cache, - cache_dir=cache_dir + cache_dir=cache_dir, ) def collate_list(self, list_entries: List[Dict]): # concatenate entries print(list_entries[0]) - res = {key: np.concatenate([r[key] for r in list_entries if r is not None], axis=0) \ - for key in list_entries[0] if not isinstance(list_entries[0][key], dict)} + res = { + key: np.concatenate([r[key] for r in list_entries if r is not None], axis=0) + for key in list_entries[0] + if not isinstance(list_entries[0][key], dict) + } csum = np.cumsum(res.get("n_atoms")) print(csum) diff --git a/src/openqdc/datasets/interaction/des370k.py b/src/openqdc/datasets/interaction/des370k.py index 65a5835..4b866e7 100644 --- a/src/openqdc/datasets/interaction/des370k.py +++ b/src/openqdc/datasets/interaction/des370k.py @@ -1,12 +1,11 @@ import os -import numpy as np -import pandas as pd - from typing import Dict, List -from tqdm import tqdm -from rdkit import Chem +import numpy as np +import pandas as pd from loguru import logger +from tqdm import tqdm + from openqdc.datasets.interaction import BaseInteractionDataset from openqdc.utils.molecule import atom_table, molecule_groups @@ -66,22 +65,14 @@ def read_raw_entries(self) -> List[Dict]: charge0, charge1 = row["charge0"], row["charge1"] natoms0, natoms1 = row["natoms0"], row["natoms1"] pos = np.array(list(map(float, row["xyz"].split()))).reshape(-1, 3) - pos0 = pos[:natoms0] - pos1 = pos[natoms0:] - + elements = row["elements"].split() - elements0 = np.array(elements[:natoms0]) - elements1 = np.array(elements[natoms0:]) atomic_nums = np.expand_dims(np.array([atom_table.GetAtomicNumber(x) for x in elements]), axis=1) - atomic_nums0 = np.array(atomic_nums[:natoms0]) - atomic_nums1 = np.array(atomic_nums[natoms0:]) charges = np.expand_dims(np.array([charge0] * natoms0 + [charge1] * natoms1), axis=1) atomic_inputs = np.concatenate((atomic_nums, charges, pos), axis=-1, dtype=np.float32) - atomic_inputs0 = atomic_inputs[:natoms0, :] - atomic_inputs1 = atomic_inputs[natoms0:, :] energies = np.array(row[self.energy_target_names].values).astype(np.float32)[None, :] @@ -98,7 +89,6 @@ def read_raw_entries(self) -> List[Dict]: if not found: logger.info(f"molecule group lookup failed for {smiles}") - item = dict( energies=energies, subset=np.array([subsets]), diff --git a/src/openqdc/datasets/interaction/des5m.py b/src/openqdc/datasets/interaction/des5m.py index e79d690..256df0b 100644 --- a/src/openqdc/datasets/interaction/des5m.py +++ b/src/openqdc/datasets/interaction/des5m.py @@ -1,12 +1,11 @@ import os -import numpy as np -import pandas as pd - from typing import Dict, List -from tqdm import tqdm -from rdkit import Chem +import numpy as np +import pandas as pd from loguru import logger +from tqdm import tqdm + from openqdc.datasets.interaction import DES370K from openqdc.utils.molecule import atom_table, molecule_groups @@ -57,22 +56,14 @@ def read_raw_entries(self) -> List[Dict]: charge0, charge1 = row["charge0"], row["charge1"] natoms0, natoms1 = row["natoms0"], row["natoms1"] pos = np.array(list(map(float, row["xyz"].split()))).reshape(-1, 3) - pos0 = pos[:natoms0] - pos1 = pos[natoms0:] - + elements = row["elements"].split() - elements0 = np.array(elements[:natoms0]) - elements1 = np.array(elements[natoms0:]) atomic_nums = np.expand_dims(np.array([atom_table.GetAtomicNumber(x) for x in elements]), axis=1) - atomic_nums0 = np.array(atomic_nums[:natoms0]) - atomic_nums1 = np.array(atomic_nums[natoms0:]) charges = np.expand_dims(np.array([charge0] * natoms0 + [charge1] * natoms1), axis=1) atomic_inputs = np.concatenate((atomic_nums, charges, pos), axis=-1, dtype=np.float32) - atomic_inputs0 = atomic_inputs[:natoms0, :] - atomic_inputs1 = atomic_inputs[natoms0:, :] energies = np.array(row[self.energy_target_names].values).astype(np.float32)[None, :] @@ -89,7 +80,6 @@ def read_raw_entries(self) -> List[Dict]: if not found: logger.info(f"molecule group lookup failed for {smiles}") - item = dict( energies=energies, subset=np.array([subsets]), diff --git a/src/openqdc/datasets/interaction/dess66.py b/src/openqdc/datasets/interaction/dess66.py index 13f5b9c..a0d361b 100644 --- a/src/openqdc/datasets/interaction/dess66.py +++ b/src/openqdc/datasets/interaction/dess66.py @@ -1,14 +1,13 @@ import os -import numpy as np -import pandas as pd - from typing import Dict, List -from tqdm import tqdm -from rdkit import Chem +import numpy as np +import pandas as pd from loguru import logger +from tqdm import tqdm + from openqdc.datasets.interaction import BaseInteractionDataset -from openqdc.utils.molecule import atom_table, molecule_groups +from openqdc.utils.molecule import atom_table class DESS66(BaseInteractionDataset): @@ -66,28 +65,19 @@ def read_raw_entries(self) -> List[Dict]: charge0, charge1 = row["charge0"], row["charge1"] natoms0, natoms1 = row["natoms0"], row["natoms1"] pos = np.array(list(map(float, row["xyz"].split()))).reshape(-1, 3) - pos0 = pos[:natoms0] - pos1 = pos[natoms0:] - + elements = row["elements"].split() - elements0 = np.array(elements[:natoms0]) - elements1 = np.array(elements[natoms0:]) atomic_nums = np.expand_dims(np.array([atom_table.GetAtomicNumber(x) for x in elements]), axis=1) - atomic_nums0 = np.array(atomic_nums[:natoms0]) - atomic_nums1 = np.array(atomic_nums[natoms0:]) charges = np.expand_dims(np.array([charge0] * natoms0 + [charge1] * natoms1), axis=1) atomic_inputs = np.concatenate((atomic_nums, charges, pos), axis=-1, dtype=np.float32) - atomic_inputs0 = atomic_inputs[:natoms0, :] - atomic_inputs1 = atomic_inputs[natoms0:, :] energies = np.array(row[self.energy_target_names].values).astype(np.float32)[None, :] name = np.array([smiles0 + "." + smiles1]) - subset = row["system_name"] item = dict( diff --git a/src/openqdc/datasets/interaction/dess66x8.py b/src/openqdc/datasets/interaction/dess66x8.py index 80e3bad..575d601 100644 --- a/src/openqdc/datasets/interaction/dess66x8.py +++ b/src/openqdc/datasets/interaction/dess66x8.py @@ -1,14 +1,13 @@ import os -import numpy as np -import pandas as pd - from typing import Dict, List -from tqdm import tqdm -from rdkit import Chem +import numpy as np +import pandas as pd from loguru import logger +from tqdm import tqdm + from openqdc.datasets.interaction import BaseInteractionDataset -from openqdc.utils.molecule import atom_table, molecule_groups +from openqdc.utils.molecule import atom_table class DESS66x8(BaseInteractionDataset): @@ -66,28 +65,19 @@ def read_raw_entries(self) -> List[Dict]: charge0, charge1 = row["charge0"], row["charge1"] natoms0, natoms1 = row["natoms0"], row["natoms1"] pos = np.array(list(map(float, row["xyz"].split()))).reshape(-1, 3) - pos0 = pos[:natoms0] - pos1 = pos[natoms0:] - + elements = row["elements"].split() - elements0 = np.array(elements[:natoms0]) - elements1 = np.array(elements[natoms0:]) atomic_nums = np.expand_dims(np.array([atom_table.GetAtomicNumber(x) for x in elements]), axis=1) - atomic_nums0 = np.array(atomic_nums[:natoms0]) - atomic_nums1 = np.array(atomic_nums[natoms0:]) charges = np.expand_dims(np.array([charge0] * natoms0 + [charge1] * natoms1), axis=1) atomic_inputs = np.concatenate((atomic_nums, charges, pos), axis=-1, dtype=np.float32) - atomic_inputs0 = atomic_inputs[:natoms0, :] - atomic_inputs1 = atomic_inputs[natoms0:, :] energies = np.array(row[self.energy_target_names].values).astype(np.float32)[None, :] name = np.array([smiles0 + "." + smiles1]) - subset = row["system_name"] item = dict( diff --git a/src/openqdc/datasets/interaction/metcalf.py b/src/openqdc/datasets/interaction/metcalf.py index 3a8e714..3553d90 100644 --- a/src/openqdc/datasets/interaction/metcalf.py +++ b/src/openqdc/datasets/interaction/metcalf.py @@ -1,22 +1,18 @@ import os -import numpy as np - from typing import Dict, List -from tqdm import tqdm -from rdkit import Chem -from loguru import logger +import numpy as np + from openqdc.datasets.interaction import BaseInteractionDataset from openqdc.utils.molecule import atom_table + class Metcalf(BaseInteractionDataset): __name__ = "metcalf" __energy_unit__ = "hartree" __distance_unit__ = "ang" __forces_unit__ = None - __energy_methods__ = [ - "SAPT0/jun-cc-pVDZ" - ] + __energy_methods__ = ["SAPT0/jun-cc-pVDZ"] energy_target_names = [ "total energy", "electrostatic energy", @@ -39,7 +35,7 @@ def read_raw_entries(self) -> List[Dict]: xyz_dir = os.path.join(self.root, dirname) if not os.path.isdir(xyz_dir): continue - subset = np.array([dirname.split("-")[0].lower()]) # training, validation, or test + subset = np.array([dirname.split("-")[0].lower()]) # training, validation, or test for filename in os.listdir(xyz_dir): if not filename.endswith(".xyz"): continue @@ -47,45 +43,22 @@ def read_raw_entries(self) -> List[Dict]: line_two = lines[1].split(",") energies = np.array([line_two[1:6]], dtype=np.float32) num_atoms = np.array([int(lines[0])]) - num_atoms0 = int(line_two[-1]) - num_atoms1 = num_atoms[0] - num_atoms0 elem_xyz = np.array([x.split() for x in lines[2:]]) - # elements = np.expand_dims(elem_xyz[:, 0], axis=0) elements = elem_xyz[:, 0] xyz = elem_xyz[:, 1:].astype(np.float32) - # xyz0_fname = os.path.join(xyz_dir, f"{filename}_0_tmp.xyz") - # with open(xyz0_fname, "w") as xyz_0_file: - # lines_to_write = [str(num_atoms0) + "\n"] + ["charge=0=\n"] + list(map(lambda x: " ".join(x) + "\n", elem_xyz[:num_atoms0].tolist())) - # lines_to_write[-1] = lines_to_write[-1][:-1] - # print(lines_to_write) - # xyz_0_file.writelines(lines_to_write) # writelines doesn't actually add a newline to each string (weird) - - # xyz1_fname = os.path.join(xyz_dir, f"{filename}_1_tmp.xyz") - # with open(xyz1_fname, "w") as xyz_1_file: - # lines_to_write = [str(num_atoms1) + "\n"] + ["charge=0=\n"] + list(map(lambda x: " ".join(x) + "\n", elem_xyz[num_atoms0:].tolist())) - # lines_to_write[-1] = lines_to_write[-1][:-1] - # xyz_1_file.writelines(lines_to_write) - - # smiles0 = Chem.MolToSmiles(Chem.MolFromXYZFile(xyz0_fname)) - # smiles1 = Chem.MolToSmiles(Chem.MolFromXYZFile(xyz1_fname)) - # atomic_nums = np.expand_dims(np.array([atom_table.GetAtomicNumber(x) for x in elements]), axis=1) charges = np.expand_dims(np.array([0] * num_atoms[0]), axis=1) atomic_inputs = np.concatenate((atomic_nums, charges, xyz), axis=-1, dtype=np.float32) item = dict( - # elements=elements, n_atoms=num_atoms, subset=subset, energies=energies, positions=xyz, atomic_inputs=atomic_inputs, - name=np.array([""]) + name=np.array([""]), ) data.append(item) return data - - - diff --git a/src/openqdc/datasets/interaction/splinter.py b/src/openqdc/datasets/interaction/splinter.py index 7d7dbd0..2841cf7 100644 --- a/src/openqdc/datasets/interaction/splinter.py +++ b/src/openqdc/datasets/interaction/splinter.py @@ -1,14 +1,12 @@ import os -import numpy as np -import pandas as pd - from typing import Dict, List -from tqdm import tqdm -from rdkit import Chem +import numpy as np from loguru import logger +from tqdm import tqdm + from openqdc.datasets.interaction import BaseInteractionDataset -from openqdc.utils.molecule import atom_table, molecule_groups +from openqdc.utils.molecule import atom_table class Splinter(BaseInteractionDataset): @@ -43,33 +41,43 @@ def read_raw_entries(self) -> List[Dict]: data = [] i = 0 with tqdm(total=1706343) as progress_bar: - for root, dirs, files in os.walk(self.root): # total is currently an approximation + for root, dirs, files in os.walk(self.root): # total is currently an approximation for filename in files: if not filename.endswith(".xyz"): continue i += 1 filepath = os.path.join(root, filename) - filein = open(filepath, "r") + filein = open(filepath, "r") lines = list(map(lambda x: x.strip(), filein.readlines())) n_atoms = np.array([int(lines[0])], dtype=np.int32) metadata = lines[1].split(",") try: ( - protein_monomer_name, - protein_interaction_site_type, - ligand_monomer_name, + protein_monomer_name, + protein_interaction_site_type, + ligand_monomer_name, ligand_interaction_site_type, - index, r, theta_P, tau_P, theta_L, - tau_L, tau_PL + index, + r, + theta_P, + tau_P, + theta_L, + tau_L, + tau_PL, ) = metadata[0].split("_") - index, r, theta_P, tau_P, theta_L, tau_L, tau_PL = list(map(float, [index, r, theta_P, tau_P, theta_L, tau_L, tau_PL])) - except: - (protein_monomer_name, - protein_interaction_site_type, - ligand_monomer_name, + index, r, theta_P, tau_P, theta_L, tau_L, tau_PL = list( + map(float, [index, r, theta_P, tau_P, theta_L, tau_L, tau_PL]) + ) + except ValueError: + ( + protein_monomer_name, + protein_interaction_site_type, + ligand_monomer_name, ligand_interaction_site_type, - index, _) = metadata[0].split("_") - r, theta_P, tau_P, theta_L, tau_L, tau_PL = [None] * 6 + index, + _, + ) = metadata[0].split("_") + r, theta_P, tau_P, theta_L, tau_L, tau_PL = [None] * 6 energies = np.array([list(map(float, metadata[4:-1]))]).astype(np.float32) n_atoms_first = np.array([int(metadata[-1])], dtype=np.int32) total_charge, charge0, charge1 = list(map(int, metadata[1:4])) @@ -89,18 +97,18 @@ def read_raw_entries(self) -> List[Dict]: n_atoms=n_atoms, n_atoms_first=n_atoms_first, atomic_inputs=atomic_inputs, - protein_monomer_name= np.array([protein_monomer_name]), + protein_monomer_name=np.array([protein_monomer_name]), protein_interaction_site_type=np.array([protein_interaction_site_type]), ligand_monomer_name=np.array([ligand_monomer_name]), ligand_interaction_site_type=np.array([ligand_interaction_site_type]), - index = np.array([index]), + index=np.array([index]), r=np.array([r]), theta_P=np.array([theta_P]), tau_P=np.array([tau_P]), theta_L=np.array([theta_L]), tau_L=np.array([tau_L]), tau_PL=np.array([tau_PL]), - name=np.array([protein_monomer_name + "." + ligand_monomer_name]) + name=np.array([protein_monomer_name + "." + ligand_monomer_name]), ) data.append(item) progress_bar.update(1) From bd58fdf627b272e0e3fe7aac669c4f1a209a8555 Mon Sep 17 00:00:00 2001 From: mcneela Date: Fri, 8 Mar 2024 10:18:40 -0500 Subject: [PATCH 31/47] fix ani import --- src/openqdc/datasets/potential/ani.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/openqdc/datasets/potential/ani.py b/src/openqdc/datasets/potential/ani.py index 3f1b92b..34ab904 100644 --- a/src/openqdc/datasets/potential/ani.py +++ b/src/openqdc/datasets/potential/ani.py @@ -1,7 +1,7 @@ import os from os.path import join as p_join -from openqdc.datasets.base import BaseDataset, read_qc_archive_h5 +from openqdc.datasets.potential.base import BaseDataset, read_qc_archive_h5 from openqdc.utils.io import get_local_cache From 4bc3a49268baab66fac7f93037fb07bf288940d1 Mon Sep 17 00:00:00 2001 From: mcneela Date: Fri, 8 Mar 2024 10:40:50 -0500 Subject: [PATCH 32/47] fix base dataset import --- openqdc/datasets/interaction/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openqdc/datasets/interaction/base.py b/openqdc/datasets/interaction/base.py index c06c4bc..71c8e84 100644 --- a/openqdc/datasets/interaction/base.py +++ b/openqdc/datasets/interaction/base.py @@ -2,7 +2,7 @@ import numpy as np -from openqdc.datasets.potential.base import BaseDataset +from openqdc.datasets.base import BaseDataset from openqdc.utils.constants import NB_ATOMIC_FEATURES From b046eeac3d52f79549c8bb99c924ae883c9dc8ff Mon Sep 17 00:00:00 2001 From: mcneela Date: Fri, 8 Mar 2024 10:42:33 -0500 Subject: [PATCH 33/47] black formatting --- openqdc/raws/config_factory.py | 6 +- openqdc/utils/molecule.py | 403 ++++++++++++++++++++++++++++++--- tests/test_import.py | 2 +- 3 files changed, 379 insertions(+), 32 deletions(-) diff --git a/openqdc/raws/config_factory.py b/openqdc/raws/config_factory.py index 312faf0..c9530cf 100644 --- a/openqdc/raws/config_factory.py +++ b/openqdc/raws/config_factory.py @@ -94,14 +94,14 @@ class DataConfigFactory: dataset_name="des370k_interaction", links={ "DES370K.zip": "https://zenodo.org/record/5676266/files/DES370K.zip", - } + }, ) des5m_interaction = dict( dataset_name="des5m_interaction", links={ "DES5M.zip": "https://zenodo.org/records/5706002/files/DESS5M.zip?download=1", - } + }, ) tmqm = dict( @@ -157,7 +157,7 @@ class DataConfigFactory: des_s66x8 = dict( dataset_name="des_s66x8", - links={"DESS66x8.zip": "https://zenodo.org/records/5676284/files/DESS66x8.zip?download=1"} + links={"DESS66x8.zip": "https://zenodo.org/records/5676284/files/DESS66x8.zip?download=1"}, ) available_datasets = [k for k in locals().keys() if not k.startswith("__")] diff --git a/openqdc/utils/molecule.py b/openqdc/utils/molecule.py index 47f9ded..6b48cc5 100644 --- a/openqdc/utils/molecule.py +++ b/openqdc/utils/molecule.py @@ -14,48 +14,395 @@ molecule_groups = { "acids": set(["CCC(=O)O", "CC(=O)O", "OC=O", "OC(=O)CC(=O)O"]), "alcohols": set(["CCCO", "CCC(O)C", "CCO", "CC(O)C", "CO", "OC1CCCC1", "OC1CCCCC1", "OCCCCO", "OCCCO", "OCCO"]), - "alkanes": set(["C1CCCC1", "C1CCCCC1", "C", "CC1CCCC1", "CC1CCCCC1", "CC", "CCC", "CC(C)C", "CCCC", "CC(C)(C)C", "CCC(C)C", "CCCCC", "CCC(C)(C)C", "CCCCCC"]), - "alkenes": set(["C=C", "CC=C", "CC=CC", "CC(=C)C", "CCC=C", "CC=C(C)C", "CCC=CC", "CCC(=C)C", "CC(=C(C)C)C", "CCC=C(C)C", "CCC(=CC)C", "CCC(=C(C)C)C"]), - "amides": set(["CCCNC=O", "CCC(=O)N", "CCC(=O)NC", "CCC(=O)N(C)C", "CCC(=O)N(CC)C", "CCNC=O", "CCNC(=O)C", "CCN(C=O)CC", "CCN(C(=O)C)C", "CCNC(=O)CC", \ - "CCN(C(=O)C)CC", "CC(=O)N", "CC(=O)N(C)C", "CNC=O", "CNC(=O)C", "CN(C=O)CC", "CNC(=O)CC(=O)N", "CNC(=O)CC(=O)NC", \ - "CNC(=O)CNC=O", "CNC(=O)CNC(=O)C", "CNC(=O)C(NC(=O)C)C", "NC=O", "NC(=O)CC(=O)N", "O=CN(C)C", "O=CNCCC(=O)N", "O=CNCCC(=O)NC", "O=CNCCNC=O", "O=CNCC(=O)N"]), - "amines": set(["C1CCCN1", "C1CCCNC1", "CCCN", "CCCNC", "CCCN(C)C", "CCN", "CCN(C)C", "CCNCC", "CCN(CC)C", "CN", "CNC", "CN(C)C", "CNCC", "CNCCCN", "CNCCCNC", "CNCCN", "CNCCNC", "N", "NCCCN", "NCCN"]), + "alkanes": set( + [ + "C1CCCC1", + "C1CCCCC1", + "C", + "CC1CCCC1", + "CC1CCCCC1", + "CC", + "CCC", + "CC(C)C", + "CCCC", + "CC(C)(C)C", + "CCC(C)C", + "CCCCC", + "CCC(C)(C)C", + "CCCCCC", + ] + ), + "alkenes": set( + [ + "C=C", + "CC=C", + "CC=CC", + "CC(=C)C", + "CCC=C", + "CC=C(C)C", + "CCC=CC", + "CCC(=C)C", + "CC(=C(C)C)C", + "CCC=C(C)C", + "CCC(=CC)C", + "CCC(=C(C)C)C", + ] + ), + "amides": set( + [ + "CCCNC=O", + "CCC(=O)N", + "CCC(=O)NC", + "CCC(=O)N(C)C", + "CCC(=O)N(CC)C", + "CCNC=O", + "CCNC(=O)C", + "CCN(C=O)CC", + "CCN(C(=O)C)C", + "CCNC(=O)CC", + "CCN(C(=O)C)CC", + "CC(=O)N", + "CC(=O)N(C)C", + "CNC=O", + "CNC(=O)C", + "CN(C=O)CC", + "CNC(=O)CC(=O)N", + "CNC(=O)CC(=O)NC", + "CNC(=O)CNC=O", + "CNC(=O)CNC(=O)C", + "CNC(=O)C(NC(=O)C)C", + "NC=O", + "NC(=O)CC(=O)N", + "O=CN(C)C", + "O=CNCCC(=O)N", + "O=CNCCC(=O)NC", + "O=CNCCNC=O", + "O=CNCC(=O)N", + ] + ), + "amines": set( + [ + "C1CCCN1", + "C1CCCNC1", + "CCCN", + "CCCNC", + "CCCN(C)C", + "CCN", + "CCN(C)C", + "CCNCC", + "CCN(CC)C", + "CN", + "CNC", + "CN(C)C", + "CNCC", + "CNCCCN", + "CNCCCNC", + "CNCCN", + "CNCCNC", + "N", + "NCCCN", + "NCCN", + ] + ), "ammoniums": set(["CC[NH3+]", "C[N+](C)(C)C", "C[NH2+]C", "C[NH3+]", "C[NH+](C)C", "[NH4+]"]), "benzene": set(["c1ccccc1", "Cc1ccccc1", "CCc1ccccc1"]), "carboxylates": set(["[O-]C=O", "[O-]C(=O)C", "[O-]C(=O)CC"]), - "esters": set(["CCCOC=O", "CCC(=O)OC", "CCOC(=O)CC", "CCOC(=O)C", "CCOC=O", "COC(=O)C", "COC=O", "O=COCCCOC=O", "O=COCCOC=O", "O=COCOC=O"]), - "ethers": set(["C1CCCO1", "C1CCCOC1", "C1CCOCO1", "C1OCCO1", "CCCOC", "CCCOCOC", "CCOCC", "COCCCOC", "COCC", "COCCOC", "COC", "COCOCC", "COCOC", "O1CCOCC1", "O1COCOC1"]), + "esters": set( + [ + "CCCOC=O", + "CCC(=O)OC", + "CCOC(=O)CC", + "CCOC(=O)C", + "CCOC=O", + "COC(=O)C", + "COC=O", + "O=COCCCOC=O", + "O=COCCOC=O", + "O=COCOC=O", + ] + ), + "ethers": set( + [ + "C1CCCO1", + "C1CCCOC1", + "C1CCOCO1", + "C1OCCO1", + "CCCOC", + "CCCOCOC", + "CCOCC", + "COCCCOC", + "COCC", + "COCCOC", + "COC", + "COCOCC", + "COCOC", + "O1CCOCC1", + "O1COCOC1", + ] + ), "guanidiums": set(["CCNC(=[NH2+])N", "CNC(=[NH2+])N", "NC(=[NH2+])N"]), "imidazolium": set(["c1[nH]cc[nH+]1", "Cc1c[nH]c[nH+]1", "CCc1c[nH]c[nH+]1"]), "ketones": set(["CCC(=O)CC", "CCC(=O)C", "CCC=O", "CC(=O)C", "CC=O", "C=O"]), - "monoatomics": set(["[Ar]", "[Br-]", "[Ca+2]", "[Cl-]", "[F-]", "[He]", "[I-]", "[K+]", "[Kr]", "[Li+]", "[Mg+2]", "[Na+]", "[Ne]", "[Xe]"]), - "other": set(["Brc1ccc(cc1)Br", "Brc1ccccc1", "BrC(Br)Br", "BrCBr", "BrCCBr", "CBr", "CC(Br)Br", "CCBr", "CCCC#CC", "CCCC(Cl)(Cl)Cl", "CCCC(Cl)Cl", \ - "CCCCCl", "CCC#CC", "CCCC#C", "CCCC(F)(F)F", "CCCC(F)F", "CCCCF", "CCC(Cl)(Cl)Cl", "CCC(Cl)Cl", "CCCCl", "CCCC#N", "CC#CC", "CCC#C", "CCC(F)(F)F", \ - "CCC(F)F", "CCCF", "CC(Cl)(Cl)Cl", "CC(Cl)Cl", "CCCl", "CCC#N", "CC#C", "CC(F)(F)F", "CC(F)F", "CCF", "CC(I)I", "CCI", "CCl", "CC#N", "CCOP(=O)(OC)OC", \ - "CCOP(=O)(OC)[O-]", "CCOP(=O)(OC)O", "C#C", "CF", "CI", "Clc1ccc(cc1)Cl", "Clc1cccc(c1)Cl", "Clc1ccccc1Cl", "Clc1ccccc1", "Clc1cc(Cl)c(c(c1Cl)Cl)Cl", \ - "Clc1cc(Cl)cc(c1)Cl", "Clc1c(Cl)c(Cl)c(c(c1Cl)Cl)Cl", "ClC(C(Cl)(Cl)Cl)(Cl)Cl", "ClC(C(Cl)(Cl)Cl)Cl", "ClCC(Cl)(Cl)Cl", "ClCC(Cl)Cl", "ClCCCl", "ClC(Cl)Cl", \ - "ClCCl", "CNCCCOC=O", "CNCCCOC", "CNCCC(=O)NC", "CNCCC(=O)N", "CNCCC(=O)O", "CNCCCO", "CNCCCSC", "CNCCCS", "CNCCNC=O", "CNCCOC=O", "CNCCOC", "CNCC(=O)NC", \ - "CNCC(=O)N", "CNCC(=O)O", "CNCCO", "CNCCSC", "CNCCS", "CNC(=O)CCN", "CNC(=O)CC(=O)O", "CNC(=O)CCO", "CNC(=O)CCS", "CNC(=O)CN", "CNC(=O)COC=O", "CNC(=O)CO", \ - "CNCOC=O", "CNCOC", "CNC(=O)CS", "CNCSC", "C#N", "COCCCN", "COCCCOC=O", "COCCC(=O)NC", "COCCC(=O)N", "COCCC(=O)O", "COCCCO", "COCCCSC", "COCCCS", "COCCNC=O", \ - "COCCN", "COCCOC=O", "COCC(=O)NC", "COCC(=O)N", "COCC(=O)O", "COCCO", "COCCSC", "COCCS", "COCNC=O", "COCN", "COCOC=O", "COCO", "COCSC", "COCS", "COP(=O)(OC)OC", \ - "COP(=O)(OC)[O-]", "COP(=O)(OC)O", "COP(=O)(O)O", "COP(=O)(OP(=O)(O)O)[O-]", "CSCCCNC=O", "CSCCCN", "CSCCCOC=O", "CSCCC(=O)N", "CSCCC(=O)O", "CSCCCO", "CSCCN", "CSCCOC=O", \ - "CSCC(=O)NC", "CSCC(=O)N", "CSCC(=O)O", "CSCCO", "CSCNC=O", "CSCN", "CSCOC=O", "CSCO", "Fc1ccc(cc1)F", "Fc1cccc(c1)F", "Fc1ccccc1F", "Fc1ccccc1", "Fc1cc(F)c(c(c1F)F)F", \ - "Fc1cc(F)cc(c1)F", "Fc1c(F)c(F)c(c(c1F)F)F", "FC(C(F)(F)F)(F)F", "FC(C(F)(F)F)F", "FCC(F)(F)F", "FCC(F)F", "FCCF", "FC(F)F", "FCF", "ICCI", "ICI", "NCCCOC=O", "NCCC(=O)N", \ - "NCCC(=O)O", "NCCCO", "NCCCS", "NCCNC=O", "NCCOC=O", "NCC(=O)N", "NCC(=O)O", "NCCO", "NCCS", "NC(=O)CC(=O)O","NC(=O)CCO", "NC(=O)CCS", "NC(=O)CO", "NCOC=O", "NC(=O)CS", \ - "OCCCNC=O", "OCCCOC=O", "OCCC(=O)O", "OCCCS", "OCCNC=O", "OCCOC=O", "OCC(=O)O", "OCCS", "O=CNCCC(=O)O", "O=CNCCOC=O", "O=CNCC(=O)O", "O=CNCOC=O", "O=COCCC(=O)NC", \ - "O=COCCC(=O)N", "O=COCCC(=O)O", "O=COCC(=O)N", "O=COCC(=O)O", "OC(=O)CCS", "OCOC=O", "OC(=O)CS", "OP(=O)(O)O", "[O-]P(=O)(OP(=O)(OC)O)O", "SCCCOC=O", "SCCNC=O", "SCCOC=O", \ - "SCOC=O", "[H][H]"]), + "monoatomics": set( + [ + "[Ar]", + "[Br-]", + "[Ca+2]", + "[Cl-]", + "[F-]", + "[He]", + "[I-]", + "[K+]", + "[Kr]", + "[Li+]", + "[Mg+2]", + "[Na+]", + "[Ne]", + "[Xe]", + ] + ), + "other": set( + [ + "Brc1ccc(cc1)Br", + "Brc1ccccc1", + "BrC(Br)Br", + "BrCBr", + "BrCCBr", + "CBr", + "CC(Br)Br", + "CCBr", + "CCCC#CC", + "CCCC(Cl)(Cl)Cl", + "CCCC(Cl)Cl", + "CCCCCl", + "CCC#CC", + "CCCC#C", + "CCCC(F)(F)F", + "CCCC(F)F", + "CCCCF", + "CCC(Cl)(Cl)Cl", + "CCC(Cl)Cl", + "CCCCl", + "CCCC#N", + "CC#CC", + "CCC#C", + "CCC(F)(F)F", + "CCC(F)F", + "CCCF", + "CC(Cl)(Cl)Cl", + "CC(Cl)Cl", + "CCCl", + "CCC#N", + "CC#C", + "CC(F)(F)F", + "CC(F)F", + "CCF", + "CC(I)I", + "CCI", + "CCl", + "CC#N", + "CCOP(=O)(OC)OC", + "CCOP(=O)(OC)[O-]", + "CCOP(=O)(OC)O", + "C#C", + "CF", + "CI", + "Clc1ccc(cc1)Cl", + "Clc1cccc(c1)Cl", + "Clc1ccccc1Cl", + "Clc1ccccc1", + "Clc1cc(Cl)c(c(c1Cl)Cl)Cl", + "Clc1cc(Cl)cc(c1)Cl", + "Clc1c(Cl)c(Cl)c(c(c1Cl)Cl)Cl", + "ClC(C(Cl)(Cl)Cl)(Cl)Cl", + "ClC(C(Cl)(Cl)Cl)Cl", + "ClCC(Cl)(Cl)Cl", + "ClCC(Cl)Cl", + "ClCCCl", + "ClC(Cl)Cl", + "ClCCl", + "CNCCCOC=O", + "CNCCCOC", + "CNCCC(=O)NC", + "CNCCC(=O)N", + "CNCCC(=O)O", + "CNCCCO", + "CNCCCSC", + "CNCCCS", + "CNCCNC=O", + "CNCCOC=O", + "CNCCOC", + "CNCC(=O)NC", + "CNCC(=O)N", + "CNCC(=O)O", + "CNCCO", + "CNCCSC", + "CNCCS", + "CNC(=O)CCN", + "CNC(=O)CC(=O)O", + "CNC(=O)CCO", + "CNC(=O)CCS", + "CNC(=O)CN", + "CNC(=O)COC=O", + "CNC(=O)CO", + "CNCOC=O", + "CNCOC", + "CNC(=O)CS", + "CNCSC", + "C#N", + "COCCCN", + "COCCCOC=O", + "COCCC(=O)NC", + "COCCC(=O)N", + "COCCC(=O)O", + "COCCCO", + "COCCCSC", + "COCCCS", + "COCCNC=O", + "COCCN", + "COCCOC=O", + "COCC(=O)NC", + "COCC(=O)N", + "COCC(=O)O", + "COCCO", + "COCCSC", + "COCCS", + "COCNC=O", + "COCN", + "COCOC=O", + "COCO", + "COCSC", + "COCS", + "COP(=O)(OC)OC", + "COP(=O)(OC)[O-]", + "COP(=O)(OC)O", + "COP(=O)(O)O", + "COP(=O)(OP(=O)(O)O)[O-]", + "CSCCCNC=O", + "CSCCCN", + "CSCCCOC=O", + "CSCCC(=O)N", + "CSCCC(=O)O", + "CSCCCO", + "CSCCN", + "CSCCOC=O", + "CSCC(=O)NC", + "CSCC(=O)N", + "CSCC(=O)O", + "CSCCO", + "CSCNC=O", + "CSCN", + "CSCOC=O", + "CSCO", + "Fc1ccc(cc1)F", + "Fc1cccc(c1)F", + "Fc1ccccc1F", + "Fc1ccccc1", + "Fc1cc(F)c(c(c1F)F)F", + "Fc1cc(F)cc(c1)F", + "Fc1c(F)c(F)c(c(c1F)F)F", + "FC(C(F)(F)F)(F)F", + "FC(C(F)(F)F)F", + "FCC(F)(F)F", + "FCC(F)F", + "FCCF", + "FC(F)F", + "FCF", + "ICCI", + "ICI", + "NCCCOC=O", + "NCCC(=O)N", + "NCCC(=O)O", + "NCCCO", + "NCCCS", + "NCCNC=O", + "NCCOC=O", + "NCC(=O)N", + "NCC(=O)O", + "NCCO", + "NCCS", + "NC(=O)CC(=O)O", + "NC(=O)CCO", + "NC(=O)CCS", + "NC(=O)CO", + "NCOC=O", + "NC(=O)CS", + "OCCCNC=O", + "OCCCOC=O", + "OCCC(=O)O", + "OCCCS", + "OCCNC=O", + "OCCOC=O", + "OCC(=O)O", + "OCCS", + "O=CNCCC(=O)O", + "O=CNCCOC=O", + "O=CNCC(=O)O", + "O=CNCOC=O", + "O=COCCC(=O)NC", + "O=COCCC(=O)N", + "O=COCCC(=O)O", + "O=COCC(=O)N", + "O=COCC(=O)O", + "OC(=O)CCS", + "OCOC=O", + "OC(=O)CS", + "OP(=O)(O)O", + "[O-]P(=O)(OP(=O)(OC)O)O", + "SCCCOC=O", + "SCCNC=O", + "SCCOC=O", + "SCOC=O", + "[H][H]", + ] + ), "phenol": set(["Cc1ccc(cc1)O", "CCc1ccc(cc1)O", "Oc1ccccc1"]), "pyridine": set(["c1cccnc1", "c1ccncn1", "n1ccncc1"]), - "pyrrole": set(["c1ccc2c(c1)[nH]cc2", "c1ccc[nH]1", "c1ncc[nH]1", "Cc1cnc[nH]1", "Cc1c[nH]c2c1cccc2", "Cc1c[nH]cn1", "CCc1cnc[nH]1", "CCc1c[nH]c2c1cccc2", "CCc1c[nH]cn1"]), - "sulfides": set(["C1CCCS1", "C1CCCSC1", "C1CCSCS1", "C1CCSSC1", "C1CSSC1", "C1SCCS1", "CCCSCSC", "CCCSC", "CCCSSC", "CCSCC", "CCSSCC", "CCSSC", "CSCCCSC", "CSCCSC", "CSCC", \ - "CSCSCC", "CSCSC", "CSC", "CSSC", "S1CCSCC1", "S1CSCSC1"]), + "pyrrole": set( + [ + "c1ccc2c(c1)[nH]cc2", + "c1ccc[nH]1", + "c1ncc[nH]1", + "Cc1cnc[nH]1", + "Cc1c[nH]c2c1cccc2", + "Cc1c[nH]cn1", + "CCc1cnc[nH]1", + "CCc1c[nH]c2c1cccc2", + "CCc1c[nH]cn1", + ] + ), + "sulfides": set( + [ + "C1CCCS1", + "C1CCCSC1", + "C1CCSCS1", + "C1CCSSC1", + "C1CSSC1", + "C1SCCS1", + "CCCSCSC", + "CCCSC", + "CCCSSC", + "CCSCC", + "CCSSCC", + "CCSSC", + "CSCCCSC", + "CSCCSC", + "CSCC", + "CSCSCC", + "CSCSC", + "CSC", + "CSSC", + "S1CCSCC1", + "S1CSCSC1", + ] + ), "thiols": set(["CCCSS", "CCCS", "CCSS", "CCS", "CSCCCS", "CSCCS", "CSCS", "CSS", "CS", "SCCCS", "SCCS", "SS", "S"]), "water": set(["O"]), "flourane": set(["F"]), "hydrogen chloride": set(["Cl"]), } + def z_to_formula(z): u, c = np.unique(z, return_counts=True) idxs = np.argsort(u) diff --git a/tests/test_import.py b/tests/test_import.py index 0d3344a..8c272b1 100644 --- a/tests/test_import.py +++ b/tests/test_import.py @@ -1,2 +1,2 @@ def test_open_qdc(): - import openqdc # noqa + import openqdc # noqa From fe540444dd97ae2cf07ce1a0b815cdb1da71c51c Mon Sep 17 00:00:00 2001 From: mcneela Date: Fri, 8 Mar 2024 10:45:34 -0500 Subject: [PATCH 34/47] ran precommit --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 405bd7b..c0db26d 100644 --- a/README.md +++ b/README.md @@ -85,8 +85,8 @@ We provide support for the following publicly available QM Potential Energy Data We also provide support for the following publicly available QM Noncovalent Interaction Energy Datasets. -| Dataset | -| --- | +| Dataset | +| --- | | [DES370K](https://www.nature.com/articles/s41597-021-00833-x) | | [DES5M](https://www.nature.com/articles/s41597-021-00833-x) | -| [Metcalf](https://pubs.aip.org/aip/jcp/article/152/7/074103/1059677/Approaches-for-machine-learning-intermolecular) | \ No newline at end of file +| [Metcalf](https://pubs.aip.org/aip/jcp/article/152/7/074103/1059677/Approaches-for-machine-learning-intermolecular) | From ef2528ca3c57f33874277dbe076ffb7d1e21e3dd Mon Sep 17 00:00:00 2001 From: mcneela Date: Fri, 8 Mar 2024 10:48:47 -0500 Subject: [PATCH 35/47] removed DES from datasets/__init__.py --- openqdc/datasets/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/openqdc/datasets/__init__.py b/openqdc/datasets/__init__.py index 9291d77..c86b8b3 100644 --- a/openqdc/datasets/__init__.py +++ b/openqdc/datasets/__init__.py @@ -24,7 +24,6 @@ "ani1ccx": ANI1CCX, "ani1x": ANI1X, "comp6": COMP6, - "des": DES, "gdml": GDML, "geom": GEOM, "iso17": ISO17, From c0ef5b168e96ef10b0e3f4e8c5bbac85842ae22e Mon Sep 17 00:00:00 2001 From: mcneela Date: Fri, 8 Mar 2024 10:50:39 -0500 Subject: [PATCH 36/47] removed DES from datasets/__init__.py --- openqdc/datasets/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/openqdc/datasets/__init__.py b/openqdc/datasets/__init__.py index c86b8b3..b8ac166 100644 --- a/openqdc/datasets/__init__.py +++ b/openqdc/datasets/__init__.py @@ -1,5 +1,4 @@ from .base import BaseDataset # noqa -from .interaction import DES # noqa from .potential.ani import ANI1, ANI1CCX, ANI1X # noqa from .potential.comp6 import COMP6 # noqa from .potential.dummy import Dummy # noqa From ad552962c7bc84132e72b280bf1d18f45eac724a Mon Sep 17 00:00:00 2001 From: mcneela Date: Fri, 8 Mar 2024 11:04:13 -0500 Subject: [PATCH 37/47] fix X40 energy methods --- openqdc/datasets/interaction/X40.py | 2 +- openqdc/datasets/interaction/des370k.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/openqdc/datasets/interaction/X40.py b/openqdc/datasets/interaction/X40.py index 826e455..09d65aa 100644 --- a/openqdc/datasets/interaction/X40.py +++ b/openqdc/datasets/interaction/X40.py @@ -15,7 +15,7 @@ class X40(BaseInteractionDataset): __distance_unit__ = "ang" __forces_unit__ = "hartree/ang" __energy_methods__ = [ - "default", + "CCSD(T)/CBS", "MP2/CBS", "dCCSD(T)/haDZ", "dCCSD(T)/haTZ", diff --git a/openqdc/datasets/interaction/des370k.py b/openqdc/datasets/interaction/des370k.py index 4b866e7..9cbf736 100644 --- a/openqdc/datasets/interaction/des370k.py +++ b/openqdc/datasets/interaction/des370k.py @@ -79,7 +79,6 @@ def read_raw_entries(self) -> List[Dict]: name = np.array([smiles0 + "." + smiles1]) subsets = [] - # for smiles in [canon_smiles0, canon_smiles1]: for smiles in [smiles0, smiles1]: found = False for functional_group, smiles_set in molecule_groups.items(): From 0a51e7c2fe43341b85f12187158bbbe6ae7d084e Mon Sep 17 00:00:00 2001 From: mcneela Date: Fri, 8 Mar 2024 11:24:24 -0500 Subject: [PATCH 38/47] added interaction dataset docstrings --- openqdc/datasets/interaction/L7.py | 12 ++++++++++++ openqdc/datasets/interaction/X40.py | 13 ++++++++++++ openqdc/datasets/interaction/des370k.py | 10 ++++++++++ openqdc/datasets/interaction/des5m.py | 10 ++++++++++ openqdc/datasets/interaction/dess66.py | 15 ++++++++++++++ openqdc/datasets/interaction/dess66x8.py | 16 +++++++++++++++ openqdc/datasets/interaction/metcalf.py | 25 ++++++++++++++++-------- openqdc/datasets/interaction/splinter.py | 10 ++++++++++ 8 files changed, 103 insertions(+), 8 deletions(-) diff --git a/openqdc/datasets/interaction/L7.py b/openqdc/datasets/interaction/L7.py index db6fa3c..8c76c03 100644 --- a/openqdc/datasets/interaction/L7.py +++ b/openqdc/datasets/interaction/L7.py @@ -10,6 +10,18 @@ class L7(BaseInteractionDataset): + """ + The L7 interaction energy dataset as described in: + + Accuracy of Quantum Chemical Methods for Large Noncovalent Complexes + Robert Sedlak, Tomasz Janowski, Michal Pitoňák, Jan Řezáč, Peter Pulay, and Pavel Hobza + Journal of Chemical Theory and Computation 2013 9 (8), 3364-3374 + DOI: 10.1021/ct400036b + + Data was downloaded and extracted from: + http://cuby4.molecular.cz/dataset_l7.html + """ + __name__ = "L7" __energy_unit__ = "hartree" __distance_unit__ = "ang" diff --git a/openqdc/datasets/interaction/X40.py b/openqdc/datasets/interaction/X40.py index 09d65aa..f38f22a 100644 --- a/openqdc/datasets/interaction/X40.py +++ b/openqdc/datasets/interaction/X40.py @@ -10,6 +10,19 @@ class X40(BaseInteractionDataset): + """ + X40 interaction dataset of 40 dimer pairs as + introduced in the following paper: + + Benchmark Calculations of Noncovalent Interactions of Halogenated Molecules + Jan Řezáč, Kevin E. Riley, and Pavel Hobza + Journal of Chemical Theory and Computation 2012 8 (11), 4285-4292 + DOI: 10.1021/ct300647k + + Dataset retrieved and processed from: + http://cuby4.molecular.cz/dataset_x40.html + """ + __name__ = "X40" __energy_unit__ = "hartree" __distance_unit__ = "ang" diff --git a/openqdc/datasets/interaction/des370k.py b/openqdc/datasets/interaction/des370k.py index 9cbf736..b98b296 100644 --- a/openqdc/datasets/interaction/des370k.py +++ b/openqdc/datasets/interaction/des370k.py @@ -11,6 +11,16 @@ class DES370K(BaseInteractionDataset): + """ + DE Shaw Research interaction energy of over 370K + small molecule dimers as described in the paper: + + Quantum chemical benchmark databases of gold-standard dimer interaction energies. + Donchev, A.G., Taube, A.G., Decolvenaere, E. et al. + Sci Data 8, 55 (2021). + https://doi.org/10.1038/s41597-021-00833-x + """ + __name__ = "des370k_interaction" __energy_unit__ = "hartree" __distance_unit__ = "ang" diff --git a/openqdc/datasets/interaction/des5m.py b/openqdc/datasets/interaction/des5m.py index 256df0b..40e2ca1 100644 --- a/openqdc/datasets/interaction/des5m.py +++ b/openqdc/datasets/interaction/des5m.py @@ -11,6 +11,16 @@ class DES5M(DES370K): + """ + DE Shaw Research interaction energy calculations for + over 5M small molecule dimers as described in the paper: + + Quantum chemical benchmark databases of gold-standard dimer interaction energies. + Donchev, A.G., Taube, A.G., Decolvenaere, E. et al. + Sci Data 8, 55 (2021). + https://doi.org/10.1038/s41597-021-00833-x + """ + __name__ = "des5m_interaction" __energy_methods__ = [ "mp2/cc-pvqz", diff --git a/openqdc/datasets/interaction/dess66.py b/openqdc/datasets/interaction/dess66.py index a0d361b..7b53057 100644 --- a/openqdc/datasets/interaction/dess66.py +++ b/openqdc/datasets/interaction/dess66.py @@ -11,6 +11,21 @@ class DESS66(BaseInteractionDataset): + """ + DE Shaw Research interaction energy + estimates of all 66 conformers from + the original S66 dataset as described + in the paper: + + Quantum chemical benchmark databases of gold-standard dimer interaction energies. + Donchev, A.G., Taube, A.G., Decolvenaere, E. et al. + Sci Data 8, 55 (2021). + https://doi.org/10.1038/s41597-021-00833-x + + Data was downloaded from Zenodo: + https://zenodo.org/records/5676284 + """ + __name__ = "des_s66" __energy_unit__ = "hartree" __distance_unit__ = "ang" diff --git a/openqdc/datasets/interaction/dess66x8.py b/openqdc/datasets/interaction/dess66x8.py index 575d601..1291c7a 100644 --- a/openqdc/datasets/interaction/dess66x8.py +++ b/openqdc/datasets/interaction/dess66x8.py @@ -11,6 +11,22 @@ class DESS66x8(BaseInteractionDataset): + """ + DE Shaw Research interaction energy + estimates of all 528 conformers from + the original S66x8 dataset as described + in the paper: + + Quantum chemical benchmark databases of gold-standard dimer interaction energies. + Donchev, A.G., Taube, A.G., Decolvenaere, E. et al. + Sci Data 8, 55 (2021). + https://doi.org/10.1038/s41597-021-00833-x + + Data was downloaded from Zenodo: + + https://zenodo.org/records/5676284 + """ + __name__ = "des_s66x8" __energy_unit__ = "hartree" __distance_unit__ = "ang" diff --git a/openqdc/datasets/interaction/metcalf.py b/openqdc/datasets/interaction/metcalf.py index 3553d90..caddc8f 100644 --- a/openqdc/datasets/interaction/metcalf.py +++ b/openqdc/datasets/interaction/metcalf.py @@ -8,6 +8,23 @@ class Metcalf(BaseInteractionDataset): + """ + Hydrogen-bonded dimers of NMA with 126 molecules as described in: + + Approaches for machine learning intermolecular interaction energies and + application to energy components from symmetry adapted perturbation theory. + Derek P. Metcalf, Alexios Koutsoukas, Steven A. Spronk, Brian L. Claus, + Deborah A. Loughney, Stephen R. Johnson, Daniel L. Cheney, C. David Sherrill; + J. Chem. Phys. 21 February 2020; 152 (7): 074103. + https://doi.org/10.1063/1.5142636 + + Further details: + "Hydrogen-bonded dimers involving N-methylacetamide (NMA) and 126 molecules + (46 donors and 80 acceptors; Figs. 2 and 3) were used. Optimized geometries + for the 126 individual monomers were obtained and paired with NMA in broad + arrays of spatial configurations to generate thousands of complexes for training. + """ + __name__ = "metcalf" __energy_unit__ = "hartree" __distance_unit__ = "ang" @@ -22,14 +39,6 @@ class Metcalf(BaseInteractionDataset): ] def read_raw_entries(self) -> List[Dict]: - """ - SMILES strings are inferred from the - .xyz files using the RDKit xyz2mol function. - More details here: - - https://github.com/jensengroup/xyz2mol - - """ data = [] for dirname in os.listdir(self.root): xyz_dir = os.path.join(self.root, dirname) diff --git a/openqdc/datasets/interaction/splinter.py b/openqdc/datasets/interaction/splinter.py index 2841cf7..25611c2 100644 --- a/openqdc/datasets/interaction/splinter.py +++ b/openqdc/datasets/interaction/splinter.py @@ -10,6 +10,16 @@ class Splinter(BaseInteractionDataset): + """ + A dataset of over 1.7 million protein-ligand + interactions as described in the paper: + + A quantum chemical interaction energy dataset for accurately modeling protein-ligand interactions. + Spronk, S.A., Glick, Z.L., Metcalf, D.P. et al. + Sci Data 10, 619 (2023). + https://doi.org/10.1038/s41597-023-02443-1 + """ + __name__ = "splinter" __energy_methods__ = [ "sapt0/jun-cc-pV(D+d)Z_unscaled", From b6c3a6a581312f3cd5e08e474eec3e3af7e366ce Mon Sep 17 00:00:00 2001 From: mcneela Date: Fri, 8 Mar 2024 11:29:55 -0500 Subject: [PATCH 39/47] update readme with all interaction datasets --- README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index c0db26d..267a83a 100644 --- a/README.md +++ b/README.md @@ -78,11 +78,6 @@ We provide support for the following publicly available QM Potential Energy Data # Interaction energy -| Dataset | # Molecules | # Conformers | Average Conformers per Molecule | Force Labels | Atom Types | QM Level of Theory | Off-Equilibrium Conformations| -| --- | --- | --- | --- | --- | --- | --- | --- | -| [DES370K](https://www.nature.com/articles/s41597-021-00833-x) | 3,700 | 370,000 | 100 | No | 20 | CCSD(T) | Yes | -| [DES5M](https://www.nature.com/articles/s41597-021-00833-x) | 3,700 | 5,000,000 | 1351 | No | 20 | SNS-MP2 | Yes | - We also provide support for the following publicly available QM Noncovalent Interaction Energy Datasets. | Dataset | @@ -90,3 +85,8 @@ We also provide support for the following publicly available QM Noncovalent Inte | [DES370K](https://www.nature.com/articles/s41597-021-00833-x) | | [DES5M](https://www.nature.com/articles/s41597-021-00833-x) | | [Metcalf](https://pubs.aip.org/aip/jcp/article/152/7/074103/1059677/Approaches-for-machine-learning-intermolecular) | +| [DESS66](https://www.nature.com/articles/s41597-021-00833-x) | +| [DESS66x8](https://www.nature.com/articles/s41597-021-00833-x) | +| [Splinter](https://www.nature.com/articles/s41597-023-02443-1) | +| [X40](https://pubs.acs.org/doi/10.1021/ct300647k) | +| [L7](https://pubs.acs.org/doi/10.1021/ct400036b) | From 07f70b8c51cbf20a813bd9c85b5e0e563524c2ff Mon Sep 17 00:00:00 2001 From: mcneela Date: Fri, 8 Mar 2024 11:34:27 -0500 Subject: [PATCH 40/47] update metcalf __energy_methods__ --- openqdc/datasets/interaction/metcalf.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/openqdc/datasets/interaction/metcalf.py b/openqdc/datasets/interaction/metcalf.py index caddc8f..5e1cd73 100644 --- a/openqdc/datasets/interaction/metcalf.py +++ b/openqdc/datasets/interaction/metcalf.py @@ -29,7 +29,13 @@ class Metcalf(BaseInteractionDataset): __energy_unit__ = "hartree" __distance_unit__ = "ang" __forces_unit__ = None - __energy_methods__ = ["SAPT0/jun-cc-pVDZ"] + __energy_methods__ = [ + "SAPT0/jun-cc-pVDZ", + "SAPT0/jun-cc-pVDZ_es", + "SAPT0/jun-cc-pVDZ_ex", + "SAPT0/jun-cc-pVDZ_ind", + "SAPT0/jun-cc-pVDZ_disp", + ] energy_target_names = [ "total energy", "electrostatic energy", From 1443450de7f036238b1793c85ec4a1c4b03f4838 Mon Sep 17 00:00:00 2001 From: mcneela Date: Fri, 8 Mar 2024 11:44:39 -0500 Subject: [PATCH 41/47] refactored des370k and des5m --- openqdc/datasets/interaction/des370k.py | 17 +++++--- openqdc/datasets/interaction/des5m.py | 54 ++----------------------- 2 files changed, 16 insertions(+), 55 deletions(-) diff --git a/openqdc/datasets/interaction/des370k.py b/openqdc/datasets/interaction/des370k.py index b98b296..e97710c 100644 --- a/openqdc/datasets/interaction/des370k.py +++ b/openqdc/datasets/interaction/des370k.py @@ -65,10 +65,14 @@ class DES370K(BaseInteractionDataset): "sapt_delta_HF", ] - def read_raw_entries(self) -> List[Dict]: - self.filepath = os.path.join(self.root, "DES370K.csv") - logger.info(f"Reading DES370K interaction data from {self.filepath}") - df = pd.read_csv(self.filepath) + _filename = "DES370K.csv" + _short_name = "DES370K" + + @classmethod + def _read_raw_entries(cls) -> List[Dict]: + filepath = os.path.join(cls.root, cls._filename) + logger.info(f"Reading {cls._short_name} interaction data from {filepath}") + df = pd.read_csv(filepath) data = [] for idx, row in tqdm(df.iterrows(), total=df.shape[0]): smiles0, smiles1 = row["smiles0"], row["smiles1"] @@ -84,7 +88,7 @@ def read_raw_entries(self) -> List[Dict]: atomic_inputs = np.concatenate((atomic_nums, charges, pos), axis=-1, dtype=np.float32) - energies = np.array(row[self.energy_target_names].values).astype(np.float32)[None, :] + energies = np.array(row[cls.energy_target_names].values).astype(np.float32)[None, :] name = np.array([smiles0 + "." + smiles1]) @@ -108,3 +112,6 @@ def read_raw_entries(self) -> List[Dict]: ) data.append(item) return data + + def read_raw_entries(self) -> List[Dict]: + return DES370K._read_raw_entries() diff --git a/openqdc/datasets/interaction/des5m.py b/openqdc/datasets/interaction/des5m.py index 40e2ca1..ea0d929 100644 --- a/openqdc/datasets/interaction/des5m.py +++ b/openqdc/datasets/interaction/des5m.py @@ -1,13 +1,6 @@ -import os from typing import Dict, List -import numpy as np -import pandas as pd -from loguru import logger -from tqdm import tqdm - from openqdc.datasets.interaction import DES370K -from openqdc.utils.molecule import atom_table, molecule_groups class DES5M(DES370K): @@ -56,47 +49,8 @@ class DES5M(DES370K): "sapt_delta_HF", ] - def read_raw_entries(self) -> List[Dict]: - self.filepath = os.path.join(self.root, "DES5M.csv") - logger.info(f"Reading DES5M interaction data from {self.filepath}") - df = pd.read_csv(self.filepath) - data = [] - for idx, row in tqdm(df.iterrows(), total=df.shape[0]): - smiles0, smiles1 = row["smiles0"], row["smiles1"] - charge0, charge1 = row["charge0"], row["charge1"] - natoms0, natoms1 = row["natoms0"], row["natoms1"] - pos = np.array(list(map(float, row["xyz"].split()))).reshape(-1, 3) - - elements = row["elements"].split() - - atomic_nums = np.expand_dims(np.array([atom_table.GetAtomicNumber(x) for x in elements]), axis=1) - - charges = np.expand_dims(np.array([charge0] * natoms0 + [charge1] * natoms1), axis=1) + _filename = "DES5M.csv" + _short_name = "DES5M" - atomic_inputs = np.concatenate((atomic_nums, charges, pos), axis=-1, dtype=np.float32) - - energies = np.array(row[self.energy_target_names].values).astype(np.float32)[None, :] - - name = np.array([smiles0 + "." + smiles1]) - - subsets = [] - # for smiles in [canon_smiles0, canon_smiles1]: - for smiles in [smiles0, smiles1]: - found = False - for functional_group, smiles_set in molecule_groups.items(): - if smiles in smiles_set: - subsets.append(functional_group) - found = True - if not found: - logger.info(f"molecule group lookup failed for {smiles}") - - item = dict( - energies=energies, - subset=np.array([subsets]), - n_atoms=np.array([natoms0 + natoms1], dtype=np.int32), - n_atoms_first=np.array([natoms0], dtype=np.int32), - atomic_inputs=atomic_inputs, - name=name, - ) - data.append(item) - return data + def read_raw_entries(self) -> List[Dict]: + return DES5M._read_raw_entries() From 802b70b5d7028e4d2e7bfa92867ec1429f535b92 Mon Sep 17 00:00:00 2001 From: mcneela Date: Fri, 8 Mar 2024 12:23:29 -0500 Subject: [PATCH 42/47] update base interaction dataset to add n_atoms_first property --- openqdc/datasets/interaction/base.py | 43 +++++++++++++++++++++++++ openqdc/datasets/interaction/des370k.py | 11 +++++-- openqdc/datasets/interaction/des5m.py | 2 +- openqdc/datasets/interaction/metcalf.py | 1 + 4 files changed, 53 insertions(+), 4 deletions(-) diff --git a/openqdc/datasets/interaction/base.py b/openqdc/datasets/interaction/base.py index 71c8e84..27c2f88 100644 --- a/openqdc/datasets/interaction/base.py +++ b/openqdc/datasets/interaction/base.py @@ -1,8 +1,10 @@ from typing import Dict, List, Optional import numpy as np +from sklearn.utils import Bunch from openqdc.datasets.base import BaseDataset +from openqdc.utils.atomization_energies import IsolatedAtomEnergyFactory from openqdc.utils.constants import NB_ATOMIC_FEATURES @@ -45,4 +47,45 @@ def data_shapes(self): "position_idx_range": (-1, 2), "energies": (-1, len(self.__energy_methods__)), "forces": (-1, 3, len(self.force_target_names)), + "n_atoms_first": (-1,), } + + @property + def data_types(self): + return { + "atomic_inputs": np.float32, + "position_idx_range": np.int32, + "energies": np.float32, + "forces": np.float32, + "n_atoms_first": np.int32, + } + + def __getitem__(self, idx: int): + shift = IsolatedAtomEnergyFactory.max_charge + p_start, p_end = self.data["position_idx_range"][idx] + input = self.data["atomic_inputs"][p_start:p_end] + z, c, positions, energies = ( + np.array(input[:, 0], dtype=np.int32), + np.array(input[:, 1], dtype=np.int32), + np.array(input[:, -3:], dtype=np.float32), + np.array(self.data["energies"][idx], dtype=np.float32), + ) + name = self.__smiles_converter__(self.data["name"][idx]) + subset = self.data["subset"][idx] + n_atoms_first = self.data["n_atoms_first"][idx] + + if "forces" in self.data: + forces = np.array(self.data["forces"][p_start:p_end], dtype=np.float32) + else: + forces = None + return Bunch( + positions=positions, + atomic_numbers=z, + charges=c, + e0=self.__isolated_atom_energies__[..., z, c + shift].T, + energies=energies, + name=name, + subset=subset, + forces=forces, + n_atoms_first=n_atoms_first, + ) diff --git a/openqdc/datasets/interaction/des370k.py b/openqdc/datasets/interaction/des370k.py index e97710c..382b84c 100644 --- a/openqdc/datasets/interaction/des370k.py +++ b/openqdc/datasets/interaction/des370k.py @@ -7,6 +7,7 @@ from tqdm import tqdm from openqdc.datasets.interaction import BaseInteractionDataset +from openqdc.utils.io import get_local_cache from openqdc.utils.molecule import atom_table, molecule_groups @@ -66,12 +67,16 @@ class DES370K(BaseInteractionDataset): ] _filename = "DES370K.csv" - _short_name = "DES370K" + _name = "des370k_interaction" + + @classmethod + def _root(cls): + return os.path.join(get_local_cache(), cls._name) @classmethod def _read_raw_entries(cls) -> List[Dict]: - filepath = os.path.join(cls.root, cls._filename) - logger.info(f"Reading {cls._short_name} interaction data from {filepath}") + filepath = os.path.join(cls._root(), cls._filename) + logger.info(f"Reading {cls._name} interaction data from {filepath}") df = pd.read_csv(filepath) data = [] for idx, row in tqdm(df.iterrows(), total=df.shape[0]): diff --git a/openqdc/datasets/interaction/des5m.py b/openqdc/datasets/interaction/des5m.py index ea0d929..5b027f4 100644 --- a/openqdc/datasets/interaction/des5m.py +++ b/openqdc/datasets/interaction/des5m.py @@ -50,7 +50,7 @@ class DES5M(DES370K): ] _filename = "DES5M.csv" - _short_name = "DES5M" + _name = "des5m_interaction" def read_raw_entries(self) -> List[Dict]: return DES5M._read_raw_entries() diff --git a/openqdc/datasets/interaction/metcalf.py b/openqdc/datasets/interaction/metcalf.py index 5e1cd73..c9921da 100644 --- a/openqdc/datasets/interaction/metcalf.py +++ b/openqdc/datasets/interaction/metcalf.py @@ -74,6 +74,7 @@ def read_raw_entries(self) -> List[Dict]: positions=xyz, atomic_inputs=atomic_inputs, name=np.array([""]), + n_atoms_first=np.array([-1]), ) data.append(item) return data From e969b54b0dab138285edc1d68cc05a1fb0dc574d Mon Sep 17 00:00:00 2001 From: mcneela Date: Tue, 12 Mar 2024 13:56:12 -0400 Subject: [PATCH 43/47] update L7 and X40 to use python base yaml package --- openqdc/datasets/interaction/L7.py | 59 ++++++++++++++++++++++++----- openqdc/datasets/interaction/X40.py | 18 ++++----- 2 files changed, 58 insertions(+), 19 deletions(-) diff --git a/openqdc/datasets/interaction/L7.py b/openqdc/datasets/interaction/L7.py index 8c76c03..fc2f899 100644 --- a/openqdc/datasets/interaction/L7.py +++ b/openqdc/datasets/interaction/L7.py @@ -2,13 +2,53 @@ from typing import Dict, List import numpy as np +import yaml from loguru import logger -from ruamel.yaml import YAML from openqdc.datasets.interaction import BaseInteractionDataset from openqdc.utils.molecule import atom_table +class DataItemYAMLObj: + def __init__(self, name, shortname, geometry, reference_value, setup, group, tags): + self.name = name + self.shortname = shortname + self.geometry = geometry + self.reference_value = reference_value + self.setup = setup + self.group = group + self.tags = tags + + +class DataSetYAMLObj: + def __init__(self, name, references, text, method_energy, groups_by, groups, global_setup): + self.name = name + self.references = references + self.text = text + self.method_energy = method_energy + self.groups_by = groups_by + self.groups = groups + self.global_setup = global_setup + + +def data_item_constructor(loader: yaml.SafeLoader, node: yaml.nodes.MappingNode): + """Construct an employee.""" + return DataItemYAMLObj(**loader.construct_mapping(node)) + + +def dataset_constructor(loader: yaml.SafeLoader, node: yaml.nodes.MappingNode): + """Construct an employee.""" + return DataSetYAMLObj(**loader.construct_mapping(node)) + + +def get_loader(): + """Add constructors to PyYAML loader.""" + loader = yaml.SafeLoader + loader.add_constructor("!ruby/object:ProtocolDataset::DataSetItem", data_item_constructor) + loader.add_constructor("!ruby/object:ProtocolDataset::DataSetDescription", dataset_constructor) + return loader + + class L7(BaseInteractionDataset): """ The L7 interaction energy dataset as described in: @@ -43,23 +83,22 @@ def read_raw_entries(self) -> List[Dict]: yaml_fpath = os.path.join(self.root, "l7.yaml") logger.info(f"Reading L7 interaction data from {self.root}") yaml_file = open(yaml_fpath, "r") - yaml = YAML() data = [] - data_dict = yaml.load(yaml_file) - charge0 = int(data_dict["description"]["global_setup"]["molecule_a"]["charge"]) - charge1 = int(data_dict["description"]["global_setup"]["molecule_b"]["charge"]) + data_dict = yaml.load(yaml_file, Loader=get_loader()) + charge0 = int(data_dict["description"].global_setup["molecule_a"]["charge"]) + charge1 = int(data_dict["description"].global_setup["molecule_b"]["charge"]) for idx, item in enumerate(data_dict["items"]): energies = [] - name = np.array([item["shortname"]]) - fname = item["geometry"].split(":")[1] - energies.append(item["reference_value"]) + name = np.array([item.shortname]) + fname = item.geometry.split(":")[1] + energies.append(item.reference_value) xyz_file = open(os.path.join(self.root, f"{fname}.xyz"), "r") lines = list(map(lambda x: x.strip().split(), xyz_file.readlines())) lines.pop(1) n_atoms = np.array([int(lines[0][0])], dtype=np.int32) - n_atoms_first = np.array([int(item["setup"]["molecule_a"]["selection"].split("-")[1])], dtype=np.int32) - subset = np.array([item["group"]]) + n_atoms_first = np.array([int(item.setup["molecule_a"]["selection"].split("-")[1])], dtype=np.int32) + subset = np.array([item.group]) energies += [float(val[idx]) for val in list(data_dict["alternative_reference"].values())] energies = np.array([energies], dtype=np.float32) pos = np.array(lines[1:])[:, 1:].astype(np.float32) diff --git a/openqdc/datasets/interaction/X40.py b/openqdc/datasets/interaction/X40.py index f38f22a..3888469 100644 --- a/openqdc/datasets/interaction/X40.py +++ b/openqdc/datasets/interaction/X40.py @@ -2,10 +2,11 @@ from typing import Dict, List import numpy as np +import yaml from loguru import logger -from ruamel.yaml import YAML from openqdc.datasets.interaction import BaseInteractionDataset +from openqdc.datasets.interaction.L7 import get_loader from openqdc.utils.molecule import atom_table @@ -41,23 +42,22 @@ def read_raw_entries(self) -> List[Dict]: yaml_fpath = os.path.join(self.root, "x40.yaml") logger.info(f"Reading X40 interaction data from {self.root}") yaml_file = open(yaml_fpath, "r") - yaml = YAML() data = [] - data_dict = yaml.load(yaml_file) - charge0 = int(data_dict["description"]["global_setup"]["molecule_a"]["charge"]) - charge1 = int(data_dict["description"]["global_setup"]["molecule_b"]["charge"]) + data_dict = yaml.load(yaml_file, Loader=get_loader()) + charge0 = int(data_dict["description"].global_setup["molecule_a"]["charge"]) + charge1 = int(data_dict["description"].global_setup["molecule_b"]["charge"]) for idx, item in enumerate(data_dict["items"]): energies = [] - name = np.array([item["shortname"]]) - energies.append(float(item["reference_value"])) - xyz_file = open(os.path.join(self.root, f"{item['shortname']}.xyz"), "r") + name = np.array([item.shortname]) + energies.append(float(item.reference_value)) + xyz_file = open(os.path.join(self.root, f"{item.shortname}.xyz"), "r") lines = list(map(lambda x: x.strip().split(), xyz_file.readlines())) setup = lines.pop(1) n_atoms = np.array([int(lines[0][0])], dtype=np.int32) n_atoms_first = setup[0].split("-")[1] n_atoms_first = np.array([int(n_atoms_first)], dtype=np.int32) - subset = np.array([item["group"]]) + subset = np.array([item.group]) energies += [float(val[idx]) for val in list(data_dict["alternative_reference"].values())] energies = np.array([energies], dtype=np.float32) pos = np.array(lines[1:])[:, 1:].astype(np.float32) From 5725fed90b5ec5c00df1992d433f1009e2cfb664 Mon Sep 17 00:00:00 2001 From: mcneela Date: Wed, 13 Mar 2024 11:23:16 -0400 Subject: [PATCH 44/47] modify interaction/base.py to save keys other than force/energy in props.pkl --- openqdc/datasets/interaction/base.py | 52 ++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/openqdc/datasets/interaction/base.py b/openqdc/datasets/interaction/base.py index 27c2f88..3abe01f 100644 --- a/openqdc/datasets/interaction/base.py +++ b/openqdc/datasets/interaction/base.py @@ -1,11 +1,15 @@ +import pickle as pkl +from os.path import join as p_join from typing import Dict, List, Optional import numpy as np +from loguru import logger from sklearn.utils import Bunch from openqdc.datasets.base import BaseDataset from openqdc.utils.atomization_energies import IsolatedAtomEnergyFactory from openqdc.utils.constants import NB_ATOMIC_FEATURES +from openqdc.utils.io import pull_locally, push_remote class BaseInteractionDataset(BaseDataset): @@ -89,3 +93,51 @@ def __getitem__(self, idx: int): forces=forces, n_atoms_first=n_atoms_first, ) + + def save_preprocess(self, data_dict): + # save memmaps + logger.info("Preprocessing data and saving it to cache.") + for key in self.data_keys: + local_path = p_join(self.preprocess_path, f"{key}.mmap") + out = np.memmap(local_path, mode="w+", dtype=data_dict[key].dtype, shape=data_dict[key].shape) + out[:] = data_dict.pop(key)[:] + out.flush() + push_remote(local_path, overwrite=True) + + # save all other keys in props.pkl + local_path = p_join(self.preprocess_path, "props.pkl") + for key in data_dict: + if key not in self.data_keys: + data_dict[key] = np.unique(data_dict[key], return_inverse=True) + + with open(local_path, "wb") as f: + pkl.dump(data_dict, f) + push_remote(local_path, overwrite=True) + + def read_preprocess(self, overwrite_local_cache=False): + logger.info("Reading preprocessed data.") + logger.info( + f"Dataset {self.__name__} with the following units:\n\ + Energy: {self.energy_unit},\n\ + Distance: {self.distance_unit},\n\ + Forces: {self.force_unit if self.__force_methods__ else 'None'}" + ) + self.data = {} + for key in self.data_keys: + filename = p_join(self.preprocess_path, f"{key}.mmap") + pull_locally(filename, overwrite=overwrite_local_cache) + self.data[key] = np.memmap(filename, mode="r", dtype=self.data_types[key]).reshape(self.data_shapes[key]) + + filename = p_join(self.preprocess_path, "props.pkl") + pull_locally(filename, overwrite=overwrite_local_cache) + with open(filename, "rb") as f: + tmp = pkl.load(f) + for key in set(tmp.keys()) - set(self.data_keys): + x = tmp.pop(key) + if len(x) == 2: + self.data[key] = x[0][x[1]] + else: + self.data[key] = x + + for key in self.data: + logger.info(f"Loaded {key} with shape {self.data[key].shape}, dtype {self.data[key].dtype}") From 6c6b2867425938018b10e81769d56ad8bf1dae89 Mon Sep 17 00:00:00 2001 From: mcneela Date: Wed, 13 Mar 2024 12:59:42 -0400 Subject: [PATCH 45/47] fix base dataset issue --- openqdc/datasets/interaction/base.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/openqdc/datasets/interaction/base.py b/openqdc/datasets/interaction/base.py index 3abe01f..42bf573 100644 --- a/openqdc/datasets/interaction/base.py +++ b/openqdc/datasets/interaction/base.py @@ -51,7 +51,6 @@ def data_shapes(self): "position_idx_range": (-1, 2), "energies": (-1, len(self.__energy_methods__)), "forces": (-1, 3, len(self.force_target_names)), - "n_atoms_first": (-1,), } @property @@ -61,7 +60,6 @@ def data_types(self): "position_idx_range": np.int32, "energies": np.float32, "forces": np.float32, - "n_atoms_first": np.int32, } def __getitem__(self, idx: int): @@ -108,7 +106,9 @@ def save_preprocess(self, data_dict): local_path = p_join(self.preprocess_path, "props.pkl") for key in data_dict: if key not in self.data_keys: - data_dict[key] = np.unique(data_dict[key], return_inverse=True) + x = data_dict[key] + x[np.where(x is None)] = np.inf + data_dict[key] = np.unique(x, return_inverse=True) with open(local_path, "wb") as f: pkl.dump(data_dict, f) From 46c5ebe0235707b0f075dbba7cea961c4337f3a4 Mon Sep 17 00:00:00 2001 From: mcneela Date: Wed, 13 Mar 2024 14:36:43 -0400 Subject: [PATCH 46/47] fix circular imports --- openqdc/datasets/interaction/L7.py | 2 +- openqdc/datasets/interaction/X40.py | 2 +- openqdc/datasets/interaction/des370k.py | 2 +- openqdc/datasets/interaction/des5m.py | 2 +- openqdc/datasets/interaction/dess66.py | 2 +- openqdc/datasets/interaction/dess66x8.py | 2 +- openqdc/datasets/interaction/metcalf.py | 2 +- openqdc/datasets/interaction/splinter.py | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/openqdc/datasets/interaction/L7.py b/openqdc/datasets/interaction/L7.py index fc2f899..4e6ad01 100644 --- a/openqdc/datasets/interaction/L7.py +++ b/openqdc/datasets/interaction/L7.py @@ -5,7 +5,7 @@ import yaml from loguru import logger -from openqdc.datasets.interaction import BaseInteractionDataset +from openqdc.datasets.interaction.base import BaseInteractionDataset from openqdc.utils.molecule import atom_table diff --git a/openqdc/datasets/interaction/X40.py b/openqdc/datasets/interaction/X40.py index 3888469..08f4037 100644 --- a/openqdc/datasets/interaction/X40.py +++ b/openqdc/datasets/interaction/X40.py @@ -5,7 +5,7 @@ import yaml from loguru import logger -from openqdc.datasets.interaction import BaseInteractionDataset +from openqdc.datasets.interaction.base import BaseInteractionDataset from openqdc.datasets.interaction.L7 import get_loader from openqdc.utils.molecule import atom_table diff --git a/openqdc/datasets/interaction/des370k.py b/openqdc/datasets/interaction/des370k.py index 382b84c..75198fd 100644 --- a/openqdc/datasets/interaction/des370k.py +++ b/openqdc/datasets/interaction/des370k.py @@ -6,7 +6,7 @@ from loguru import logger from tqdm import tqdm -from openqdc.datasets.interaction import BaseInteractionDataset +from openqdc.datasets.interaction.base import BaseInteractionDataset from openqdc.utils.io import get_local_cache from openqdc.utils.molecule import atom_table, molecule_groups diff --git a/openqdc/datasets/interaction/des5m.py b/openqdc/datasets/interaction/des5m.py index 5b027f4..46d9ba1 100644 --- a/openqdc/datasets/interaction/des5m.py +++ b/openqdc/datasets/interaction/des5m.py @@ -1,6 +1,6 @@ from typing import Dict, List -from openqdc.datasets.interaction import DES370K +from openqdc.datasets.interaction.des370k import DES370K class DES5M(DES370K): diff --git a/openqdc/datasets/interaction/dess66.py b/openqdc/datasets/interaction/dess66.py index 7b53057..ae3ce81 100644 --- a/openqdc/datasets/interaction/dess66.py +++ b/openqdc/datasets/interaction/dess66.py @@ -6,7 +6,7 @@ from loguru import logger from tqdm import tqdm -from openqdc.datasets.interaction import BaseInteractionDataset +from openqdc.datasets.interaction.base import BaseInteractionDataset from openqdc.utils.molecule import atom_table diff --git a/openqdc/datasets/interaction/dess66x8.py b/openqdc/datasets/interaction/dess66x8.py index 1291c7a..2e97221 100644 --- a/openqdc/datasets/interaction/dess66x8.py +++ b/openqdc/datasets/interaction/dess66x8.py @@ -6,7 +6,7 @@ from loguru import logger from tqdm import tqdm -from openqdc.datasets.interaction import BaseInteractionDataset +from openqdc.datasets.interaction.base import BaseInteractionDataset from openqdc.utils.molecule import atom_table diff --git a/openqdc/datasets/interaction/metcalf.py b/openqdc/datasets/interaction/metcalf.py index c9921da..041964a 100644 --- a/openqdc/datasets/interaction/metcalf.py +++ b/openqdc/datasets/interaction/metcalf.py @@ -3,7 +3,7 @@ import numpy as np -from openqdc.datasets.interaction import BaseInteractionDataset +from openqdc.datasets.interaction.base import BaseInteractionDataset from openqdc.utils.molecule import atom_table diff --git a/openqdc/datasets/interaction/splinter.py b/openqdc/datasets/interaction/splinter.py index 25611c2..06b6d86 100644 --- a/openqdc/datasets/interaction/splinter.py +++ b/openqdc/datasets/interaction/splinter.py @@ -5,7 +5,7 @@ from loguru import logger from tqdm import tqdm -from openqdc.datasets.interaction import BaseInteractionDataset +from openqdc.datasets.interaction.base import BaseInteractionDataset from openqdc.utils.molecule import atom_table From cb9987cdca2d45eb83464e3ba229059865c4b7d5 Mon Sep 17 00:00:00 2001 From: Daniel McNeela Date: Thu, 14 Mar 2024 17:01:59 -0500 Subject: [PATCH 47/47] removed print statements --- openqdc/datasets/interaction/base.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/openqdc/datasets/interaction/base.py b/openqdc/datasets/interaction/base.py index 42bf573..bd3ab70 100644 --- a/openqdc/datasets/interaction/base.py +++ b/openqdc/datasets/interaction/base.py @@ -29,7 +29,6 @@ def __init__( def collate_list(self, list_entries: List[Dict]): # concatenate entries - print(list_entries[0]) res = { key: np.concatenate([r[key] for r in list_entries if r is not None], axis=0) for key in list_entries[0] @@ -37,7 +36,6 @@ def collate_list(self, list_entries: List[Dict]): } csum = np.cumsum(res.get("n_atoms")) - print(csum) x = np.zeros((csum.shape[0], 2), dtype=np.int32) x[1:, 0], x[:, 1] = csum[:-1], csum res["position_idx_range"] = x