diff --git a/README.md b/README.md index 776b8ea..267a83a 100644 --- a/README.md +++ b/README.md @@ -53,7 +53,7 @@ openqdc download Spice QMugs 6. QM Level of Theory --> -We provide support for the following publicly available QM Datasets. +We provide support for the following publicly available QM Potential Energy Datasets. # Potential Energy @@ -78,7 +78,15 @@ We provide support for the following publicly available QM Datasets. # Interaction energy -| Dataset | # Molecules | # Conformers | Average Conformers per Molecule | Force Labels | Atom Types | QM Level of Theory | Off-Equilibrium Conformations| -| --- | --- | --- | --- | --- | --- | --- | --- | -| [DES370K](https://www.nature.com/articles/s41597-021-00833-x) | 3,700 | 370,000 | 100 | No | 20 | CCSD(T) | Yes | -| [DES5M](https://www.nature.com/articles/s41597-021-00833-x) | 3,700 | 5,000,000 | 1351 | No | 20 | SNS-MP2 | Yes | +We also provide support for the following publicly available QM Noncovalent Interaction Energy Datasets. 
+ +| Dataset | +| --- | +| [DES370K](https://www.nature.com/articles/s41597-021-00833-x) | +| [DES5M](https://www.nature.com/articles/s41597-021-00833-x) | +| [Metcalf](https://pubs.aip.org/aip/jcp/article/152/7/074103/1059677/Approaches-for-machine-learning-intermolecular) | +| [DESS66](https://www.nature.com/articles/s41597-021-00833-x) | +| [DESS66x8](https://www.nature.com/articles/s41597-021-00833-x) | +| [Splinter](https://www.nature.com/articles/s41597-023-02443-1) | +| [X40](https://pubs.acs.org/doi/10.1021/ct300647k) | +| [L7](https://pubs.acs.org/doi/10.1021/ct400036b) | diff --git a/openqdc/datasets/__init__.py b/openqdc/datasets/__init__.py index c8b2469..3e7db7b 100644 --- a/openqdc/datasets/__init__.py +++ b/openqdc/datasets/__init__.py @@ -1,25 +1,4 @@ -from .base import BaseDataset # noqa from .interaction import AVAILABLE_INTERACTION_DATASETS # noqa -from .interaction import DES # noqa from .potential import AVAILABLE_POTENTIAL_DATASETS # noqa -from .potential.ani import ANI1, ANI1CCX, ANI1X # noqa -from .potential.comp6 import COMP6 # noqa -from .potential.dummy import Dummy # noqa -from .potential.gdml import GDML # noqa -from .potential.geom import GEOM # noqa -from .potential.iso_17 import ISO17 # noqa -from .potential.molecule3d import Molecule3D # noqa -from .potential.multixcqm9 import MultixcQM9 # noqa -from .potential.nabladft import NablaDFT # noqa -from .potential.orbnet_denali import OrbnetDenali # noqa -from .potential.pcqm import PCQM_B3LYP, PCQM_PM6 # noqa -from .potential.qm7x import QM7X # noqa -from .potential.qmugs import QMugs # noqa -from .potential.sn2_rxn import SN2RXN # noqa -from .potential.solvated_peptides import SolvatedPeptides # noqa -from .potential.spice import Spice # noqa -from .potential.tmqm import TMQM # noqa -from .potential.transition1x import Transition1X # noqa -from .potential.waterclusters3_30 import WaterClusters # noqa AVAILABLE_DATASETS = {**AVAILABLE_POTENTIAL_DATASETS, 
**AVAILABLE_INTERACTION_DATASETS} diff --git a/openqdc/datasets/interaction/L7.py b/openqdc/datasets/interaction/L7.py new file mode 100644 index 0000000..4e6ad01 --- /dev/null +++ b/openqdc/datasets/interaction/L7.py @@ -0,0 +1,121 @@ +import os +from typing import Dict, List + +import numpy as np +import yaml +from loguru import logger + +from openqdc.datasets.interaction.base import BaseInteractionDataset +from openqdc.utils.molecule import atom_table + + +class DataItemYAMLObj: + def __init__(self, name, shortname, geometry, reference_value, setup, group, tags): + self.name = name + self.shortname = shortname + self.geometry = geometry + self.reference_value = reference_value + self.setup = setup + self.group = group + self.tags = tags + + +class DataSetYAMLObj: + def __init__(self, name, references, text, method_energy, groups_by, groups, global_setup): + self.name = name + self.references = references + self.text = text + self.method_energy = method_energy + self.groups_by = groups_by + self.groups = groups + self.global_setup = global_setup + + +def data_item_constructor(loader: yaml.SafeLoader, node: yaml.nodes.MappingNode): + """Construct an employee.""" + return DataItemYAMLObj(**loader.construct_mapping(node)) + + +def dataset_constructor(loader: yaml.SafeLoader, node: yaml.nodes.MappingNode): + """Construct an employee.""" + return DataSetYAMLObj(**loader.construct_mapping(node)) + + +def get_loader(): + """Add constructors to PyYAML loader.""" + loader = yaml.SafeLoader + loader.add_constructor("!ruby/object:ProtocolDataset::DataSetItem", data_item_constructor) + loader.add_constructor("!ruby/object:ProtocolDataset::DataSetDescription", dataset_constructor) + return loader + + +class L7(BaseInteractionDataset): + """ + The L7 interaction energy dataset as described in: + + Accuracy of Quantum Chemical Methods for Large Noncovalent Complexes + Robert Sedlak, Tomasz Janowski, Michal Pitoňák, Jan Řezáč, Peter Pulay, and Pavel Hobza + Journal of Chemical 
Theory and Computation 2013 9 (8), 3364-3374 + DOI: 10.1021/ct400036b + + Data was downloaded and extracted from: + http://cuby4.molecular.cz/dataset_l7.html + """ + + __name__ = "L7" + __energy_unit__ = "hartree" + __distance_unit__ = "ang" + __forces_unit__ = "hartree/ang" + __energy_methods__ = [ + "CSD(T) | QCISD(T)", + "DLPNO-CCSD(T)", + "MP2/CBS", + "MP2C/CBS", + "fixed", + "DLPNO-CCSD(T0)", + "LNO-CCSD(T)", + "FN-DMC", + ] + + energy_target_names = [] + + def read_raw_entries(self) -> List[Dict]: + yaml_fpath = os.path.join(self.root, "l7.yaml") + logger.info(f"Reading L7 interaction data from {self.root}") + yaml_file = open(yaml_fpath, "r") + data = [] + data_dict = yaml.load(yaml_file, Loader=get_loader()) + charge0 = int(data_dict["description"].global_setup["molecule_a"]["charge"]) + charge1 = int(data_dict["description"].global_setup["molecule_b"]["charge"]) + + for idx, item in enumerate(data_dict["items"]): + energies = [] + name = np.array([item.shortname]) + fname = item.geometry.split(":")[1] + energies.append(item.reference_value) + xyz_file = open(os.path.join(self.root, f"{fname}.xyz"), "r") + lines = list(map(lambda x: x.strip().split(), xyz_file.readlines())) + lines.pop(1) + n_atoms = np.array([int(lines[0][0])], dtype=np.int32) + n_atoms_first = np.array([int(item.setup["molecule_a"]["selection"].split("-")[1])], dtype=np.int32) + subset = np.array([item.group]) + energies += [float(val[idx]) for val in list(data_dict["alternative_reference"].values())] + energies = np.array([energies], dtype=np.float32) + pos = np.array(lines[1:])[:, 1:].astype(np.float32) + elems = np.array(lines[1:])[:, 0] + atomic_nums = np.expand_dims(np.array([atom_table.GetAtomicNumber(x) for x in elems]), axis=1) + natoms0 = n_atoms_first[0] + natoms1 = n_atoms[0] - natoms0 + charges = np.expand_dims(np.array([charge0] * natoms0 + [charge1] * natoms1), axis=1) + atomic_inputs = np.concatenate((atomic_nums, charges, pos), axis=-1, dtype=np.float32) + + item = dict( + 
energies=energies, + subset=subset, + n_atoms=n_atoms, + n_atoms_first=n_atoms_first, + atomic_inputs=atomic_inputs, + name=name, + ) + data.append(item) + return data diff --git a/openqdc/datasets/interaction/X40.py b/openqdc/datasets/interaction/X40.py new file mode 100644 index 0000000..08f4037 --- /dev/null +++ b/openqdc/datasets/interaction/X40.py @@ -0,0 +1,80 @@ +import os +from typing import Dict, List + +import numpy as np +import yaml +from loguru import logger + +from openqdc.datasets.interaction.base import BaseInteractionDataset +from openqdc.datasets.interaction.L7 import get_loader +from openqdc.utils.molecule import atom_table + + +class X40(BaseInteractionDataset): + """ + X40 interaction dataset of 40 dimer pairs as + introduced in the following paper: + + Benchmark Calculations of Noncovalent Interactions of Halogenated Molecules + Jan Řezáč, Kevin E. Riley, and Pavel Hobza + Journal of Chemical Theory and Computation 2012 8 (11), 4285-4292 + DOI: 10.1021/ct300647k + + Dataset retrieved and processed from: + http://cuby4.molecular.cz/dataset_x40.html + """ + + __name__ = "X40" + __energy_unit__ = "hartree" + __distance_unit__ = "ang" + __forces_unit__ = "hartree/ang" + __energy_methods__ = [ + "CCSD(T)/CBS", + "MP2/CBS", + "dCCSD(T)/haDZ", + "dCCSD(T)/haTZ", + "MP2.5/CBS(aDZ)", + ] + + energy_target_names = [] + + def read_raw_entries(self) -> List[Dict]: + yaml_fpath = os.path.join(self.root, "x40.yaml") + logger.info(f"Reading X40 interaction data from {self.root}") + yaml_file = open(yaml_fpath, "r") + data = [] + data_dict = yaml.load(yaml_file, Loader=get_loader()) + charge0 = int(data_dict["description"].global_setup["molecule_a"]["charge"]) + charge1 = int(data_dict["description"].global_setup["molecule_b"]["charge"]) + + for idx, item in enumerate(data_dict["items"]): + energies = [] + name = np.array([item.shortname]) + energies.append(float(item.reference_value)) + xyz_file = open(os.path.join(self.root, f"{item.shortname}.xyz"), "r") + 
lines = list(map(lambda x: x.strip().split(), xyz_file.readlines())) + setup = lines.pop(1) + n_atoms = np.array([int(lines[0][0])], dtype=np.int32) + n_atoms_first = setup[0].split("-")[1] + n_atoms_first = np.array([int(n_atoms_first)], dtype=np.int32) + subset = np.array([item.group]) + energies += [float(val[idx]) for val in list(data_dict["alternative_reference"].values())] + energies = np.array([energies], dtype=np.float32) + pos = np.array(lines[1:])[:, 1:].astype(np.float32) + elems = np.array(lines[1:])[:, 0] + atomic_nums = np.expand_dims(np.array([atom_table.GetAtomicNumber(x) for x in elems]), axis=1) + natoms0 = n_atoms_first[0] + natoms1 = n_atoms[0] - natoms0 + charges = np.expand_dims(np.array([charge0] * natoms0 + [charge1] * natoms1), axis=1) + atomic_inputs = np.concatenate((atomic_nums, charges, pos), axis=-1, dtype=np.float32) + + item = dict( + energies=energies, + subset=subset, + n_atoms=n_atoms, + n_atoms_first=n_atoms_first, + atomic_inputs=atomic_inputs, + name=name, + ) + data.append(item) + return data diff --git a/openqdc/datasets/interaction/__init__.py b/openqdc/datasets/interaction/__init__.py index abf99a6..82154a5 100644 --- a/openqdc/datasets/interaction/__init__.py +++ b/openqdc/datasets/interaction/__init__.py @@ -1,3 +1,21 @@ -from .des import DES +from .base import BaseInteractionDataset +from .des5m import DES5M +from .des370k import DES370K +from .dess66 import DESS66 +from .dess66x8 import DESS66x8 +from .L7 import L7 +from .metcalf import Metcalf +from .splinter import Splinter +from .X40 import X40 -AVAILABLE_INTERACTION_DATASETS = {"des": DES} +AVAILABLE_INTERACTION_DATASETS = { + "base": BaseInteractionDataset, + "des5m": DES5M, + "des370k": DES370K, + "dess66": DESS66, + "dess66x8": DESS66x8, + "l7": L7, + "metcalf": Metcalf, + "splinter": Splinter, + "x40": X40, +} diff --git a/openqdc/datasets/interaction/base.py b/openqdc/datasets/interaction/base.py new file mode 100644 index 0000000..bd3ab70 --- /dev/null +++ 
b/openqdc/datasets/interaction/base.py @@ -0,0 +1,141 @@ +import pickle as pkl +from os.path import join as p_join +from typing import Dict, List, Optional + +import numpy as np +from loguru import logger +from sklearn.utils import Bunch + +from openqdc.datasets.base import BaseDataset +from openqdc.utils.atomization_energies import IsolatedAtomEnergyFactory +from openqdc.utils.constants import NB_ATOMIC_FEATURES +from openqdc.utils.io import pull_locally, push_remote + + +class BaseInteractionDataset(BaseDataset): + def __init__( + self, + energy_unit: Optional[str] = None, + distance_unit: Optional[str] = None, + overwrite_local_cache: bool = False, + cache_dir: Optional[str] = None, + ) -> None: + super().__init__( + energy_unit=energy_unit, + distance_unit=distance_unit, + overwrite_local_cache=overwrite_local_cache, + cache_dir=cache_dir, + ) + + def collate_list(self, list_entries: List[Dict]): + # concatenate entries + res = { + key: np.concatenate([r[key] for r in list_entries if r is not None], axis=0) + for key in list_entries[0] + if not isinstance(list_entries[0][key], dict) + } + + csum = np.cumsum(res.get("n_atoms")) + x = np.zeros((csum.shape[0], 2), dtype=np.int32) + x[1:, 0], x[:, 1] = csum[:-1], csum + res["position_idx_range"] = x + + return res + + @property + def data_shapes(self): + return { + "atomic_inputs": (-1, NB_ATOMIC_FEATURES), + "position_idx_range": (-1, 2), + "energies": (-1, len(self.__energy_methods__)), + "forces": (-1, 3, len(self.force_target_names)), + } + + @property + def data_types(self): + return { + "atomic_inputs": np.float32, + "position_idx_range": np.int32, + "energies": np.float32, + "forces": np.float32, + } + + def __getitem__(self, idx: int): + shift = IsolatedAtomEnergyFactory.max_charge + p_start, p_end = self.data["position_idx_range"][idx] + input = self.data["atomic_inputs"][p_start:p_end] + z, c, positions, energies = ( + np.array(input[:, 0], dtype=np.int32), + np.array(input[:, 1], dtype=np.int32), + 
np.array(input[:, -3:], dtype=np.float32), + np.array(self.data["energies"][idx], dtype=np.float32), + ) + name = self.__smiles_converter__(self.data["name"][idx]) + subset = self.data["subset"][idx] + n_atoms_first = self.data["n_atoms_first"][idx] + + if "forces" in self.data: + forces = np.array(self.data["forces"][p_start:p_end], dtype=np.float32) + else: + forces = None + return Bunch( + positions=positions, + atomic_numbers=z, + charges=c, + e0=self.__isolated_atom_energies__[..., z, c + shift].T, + energies=energies, + name=name, + subset=subset, + forces=forces, + n_atoms_first=n_atoms_first, + ) + + def save_preprocess(self, data_dict): + # save memmaps + logger.info("Preprocessing data and saving it to cache.") + for key in self.data_keys: + local_path = p_join(self.preprocess_path, f"{key}.mmap") + out = np.memmap(local_path, mode="w+", dtype=data_dict[key].dtype, shape=data_dict[key].shape) + out[:] = data_dict.pop(key)[:] + out.flush() + push_remote(local_path, overwrite=True) + + # save all other keys in props.pkl + local_path = p_join(self.preprocess_path, "props.pkl") + for key in data_dict: + if key not in self.data_keys: + x = data_dict[key] + x[np.where(x is None)] = np.inf + data_dict[key] = np.unique(x, return_inverse=True) + + with open(local_path, "wb") as f: + pkl.dump(data_dict, f) + push_remote(local_path, overwrite=True) + + def read_preprocess(self, overwrite_local_cache=False): + logger.info("Reading preprocessed data.") + logger.info( + f"Dataset {self.__name__} with the following units:\n\ + Energy: {self.energy_unit},\n\ + Distance: {self.distance_unit},\n\ + Forces: {self.force_unit if self.__force_methods__ else 'None'}" + ) + self.data = {} + for key in self.data_keys: + filename = p_join(self.preprocess_path, f"{key}.mmap") + pull_locally(filename, overwrite=overwrite_local_cache) + self.data[key] = np.memmap(filename, mode="r", dtype=self.data_types[key]).reshape(self.data_shapes[key]) + + filename = p_join(self.preprocess_path, 
"props.pkl") + pull_locally(filename, overwrite=overwrite_local_cache) + with open(filename, "rb") as f: + tmp = pkl.load(f) + for key in set(tmp.keys()) - set(self.data_keys): + x = tmp.pop(key) + if len(x) == 2: + self.data[key] = x[0][x[1]] + else: + self.data[key] = x + + for key in self.data: + logger.info(f"Loaded {key} with shape {self.data[key].shape}, dtype {self.data[key].dtype}") diff --git a/openqdc/datasets/interaction/des.py b/openqdc/datasets/interaction/des.py deleted file mode 100644 index 337a8e8..0000000 --- a/openqdc/datasets/interaction/des.py +++ /dev/null @@ -1,93 +0,0 @@ -from os.path import join as p_join - -import datamol as dm -import numpy as np -import pandas as pd -from tqdm import tqdm - -from openqdc.datasets.base import BaseDataset -from openqdc.utils.molecule import get_atomic_number_and_charge - - -def read_mol(mol_path, smiles, subset, targets): - try: - with open(mol_path, "r") as f: - mol_block = f.read() - mol = dm.read_molblock(mol_block, remove_hs=False, fail_if_invalid=True) - - x = get_atomic_number_and_charge(mol) - positions = mol.GetConformer().GetPositions() - - res = dict( - name=np.array([smiles]), - subset=np.array([subset]), - energies=np.array(targets).astype(np.float32)[None, :], - atomic_inputs=np.concatenate((x, positions), axis=-1, dtype=np.float32), - n_atoms=np.array([x.shape[0]], dtype=np.int32), - ) - except Exception as e: - print(f"Skipping: {mol_path} due to {e}") - res = None - - return res - - -class DES(BaseDataset): - __name__ = "des" - __energy_unit__ = "hartree" - __distance_unit__ = "ang" - __forces_unit__ = "hartree/ang" - __energy_methods__ = [ - "mp2/cc-pvdz", - "mp2/cc-pvqz", - "mp2/cc-pvtz", - "mp2/cbs", - "ccsd(t)/cc-pvdz", - "ccsd(t)/cbs", # cbs - "ccsd(t)/nn", # nn - "sapt0/aug-cc-pwcvxz", - ] - - energy_target_names = [ - "cc_MP2_all", - "qz_MP2_all", - "tz_MP2_all", - "cbs_MP2_all", - "cc_CCSD(T)_all", - "cbs_CCSD(T)_all", - "nn_CCSD(T)_all", - "sapt_all", - ] - # ['qz_MP2_all', 
'tz_MP2_all', 'cbs_MP2_all', 'sapt_all', 'nn_CCSD(T)_all'] - - partitions = ["DES370K", "DES5M"] - - def _read_raw_(self, part): - df = pd.read_csv(p_join(self.root, f"{part}.csv")) - for col in self.energy_target_names: - if col not in df.columns: - df[col] = np.nan - smiles = (df["smiles0"] + "." + df["smiles1"]).tolist() - subsets = (f"{part}_" + df["group_orig"]).tolist() - targets = df[self.energy_target_names].values - paths = ( - p_join(self.root, "geometries/") - + df["system_id"].astype(str) - + f"/{part}_" - + df["geom_id"].astype(str) - + ".mol" - ) - - inputs = [ - dict(smiles=smiles[i], subset=subsets[i], targets=targets[i], mol_path=paths[i]) - for i in tqdm(range(len(smiles))) - ] - f = lambda xs: [read_mol(**x) for x in xs] - samples = dm.parallelized_with_batches( - f, inputs, n_jobs=-1, progress=True, batch_size=1024, scheduler="threads" - ) - return samples - - def read_raw_entries(self): - samples = sum([self._read_raw_(partition) for partition in self.partitions], []) - return samples diff --git a/openqdc/datasets/interaction/des370k.py b/openqdc/datasets/interaction/des370k.py new file mode 100644 index 0000000..75198fd --- /dev/null +++ b/openqdc/datasets/interaction/des370k.py @@ -0,0 +1,122 @@ +import os +from typing import Dict, List + +import numpy as np +import pandas as pd +from loguru import logger +from tqdm import tqdm + +from openqdc.datasets.interaction.base import BaseInteractionDataset +from openqdc.utils.io import get_local_cache +from openqdc.utils.molecule import atom_table, molecule_groups + + +class DES370K(BaseInteractionDataset): + """ + DE Shaw Research interaction energy of over 370K + small molecule dimers as described in the paper: + + Quantum chemical benchmark databases of gold-standard dimer interaction energies. + Donchev, A.G., Taube, A.G., Decolvenaere, E. et al. + Sci Data 8, 55 (2021). 
+ https://doi.org/10.1038/s41597-021-00833-x + """ + + __name__ = "des370k_interaction" + __energy_unit__ = "hartree" + __distance_unit__ = "ang" + __forces_unit__ = "hartree/ang" + __energy_methods__ = [ + "mp2/cc-pvdz", + "mp2/cc-pvqz", + "mp2/cc-pvtz", + "mp2/cbs", + "ccsd(t)/cc-pvdz", + "ccsd(t)/cbs", # cbs + "ccsd(t)/nn", # nn + "sapt0/aug-cc-pwcvxz", + "sapt0/aug-cc-pwcvxz_es", + "sapt0/aug-cc-pwcvxz_ex", + "sapt0/aug-cc-pwcvxz_exs2", + "sapt0/aug-cc-pwcvxz_ind", + "sapt0/aug-cc-pwcvxz_exind", + "sapt0/aug-cc-pwcvxz_disp", + "sapt0/aug-cc-pwcvxz_exdisp_os", + "sapt0/aug-cc-pwcvxz_exdisp_ss", + "sapt0/aug-cc-pwcvxz_delta_HF", + ] + + energy_target_names = [ + "cc_MP2_all", + "qz_MP2_all", + "tz_MP2_all", + "cbs_MP2_all", + "cc_CCSD(T)_all", + "cbs_CCSD(T)_all", + "nn_CCSD(T)_all", + "sapt_all", + "sapt_es", + "sapt_ex", + "sapt_exs2", + "sapt_ind", + "sapt_exind", + "sapt_disp", + "sapt_exdisp_os", + "sapt_exdisp_ss", + "sapt_delta_HF", + ] + + _filename = "DES370K.csv" + _name = "des370k_interaction" + + @classmethod + def _root(cls): + return os.path.join(get_local_cache(), cls._name) + + @classmethod + def _read_raw_entries(cls) -> List[Dict]: + filepath = os.path.join(cls._root(), cls._filename) + logger.info(f"Reading {cls._name} interaction data from {filepath}") + df = pd.read_csv(filepath) + data = [] + for idx, row in tqdm(df.iterrows(), total=df.shape[0]): + smiles0, smiles1 = row["smiles0"], row["smiles1"] + charge0, charge1 = row["charge0"], row["charge1"] + natoms0, natoms1 = row["natoms0"], row["natoms1"] + pos = np.array(list(map(float, row["xyz"].split()))).reshape(-1, 3) + + elements = row["elements"].split() + + atomic_nums = np.expand_dims(np.array([atom_table.GetAtomicNumber(x) for x in elements]), axis=1) + + charges = np.expand_dims(np.array([charge0] * natoms0 + [charge1] * natoms1), axis=1) + + atomic_inputs = np.concatenate((atomic_nums, charges, pos), axis=-1, dtype=np.float32) + + energies = 
np.array(row[cls.energy_target_names].values).astype(np.float32)[None, :] + + name = np.array([smiles0 + "." + smiles1]) + + subsets = [] + for smiles in [smiles0, smiles1]: + found = False + for functional_group, smiles_set in molecule_groups.items(): + if smiles in smiles_set: + subsets.append(functional_group) + found = True + if not found: + logger.info(f"molecule group lookup failed for {smiles}") + + item = dict( + energies=energies, + subset=np.array([subsets]), + n_atoms=np.array([natoms0 + natoms1], dtype=np.int32), + n_atoms_first=np.array([natoms0], dtype=np.int32), + atomic_inputs=atomic_inputs, + name=name, + ) + data.append(item) + return data + + def read_raw_entries(self) -> List[Dict]: + return DES370K._read_raw_entries() diff --git a/openqdc/datasets/interaction/des5m.py b/openqdc/datasets/interaction/des5m.py new file mode 100644 index 0000000..46d9ba1 --- /dev/null +++ b/openqdc/datasets/interaction/des5m.py @@ -0,0 +1,56 @@ +from typing import Dict, List + +from openqdc.datasets.interaction.des370k import DES370K + + +class DES5M(DES370K): + """ + DE Shaw Research interaction energy calculations for + over 5M small molecule dimers as described in the paper: + + Quantum chemical benchmark databases of gold-standard dimer interaction energies. + Donchev, A.G., Taube, A.G., Decolvenaere, E. et al. + Sci Data 8, 55 (2021). 
+ https://doi.org/10.1038/s41597-021-00833-x + """ + + __name__ = "des5m_interaction" + __energy_methods__ = [ + "mp2/cc-pvqz", + "mp2/cc-pvtz", + "mp2/cbs", + "ccsd(t)/nn", # nn + "sapt0/aug-cc-pwcvxz", + "sapt0/aug-cc-pwcvxz_es", + "sapt0/aug-cc-pwcvxz_ex", + "sapt0/aug-cc-pwcvxz_exs2", + "sapt0/aug-cc-pwcvxz_ind", + "sapt0/aug-cc-pwcvxz_exind", + "sapt0/aug-cc-pwcvxz_disp", + "sapt0/aug-cc-pwcvxz_exdisp_os", + "sapt0/aug-cc-pwcvxz_exdisp_ss", + "sapt0/aug-cc-pwcvxz_delta_HF", + ] + + energy_target_names = [ + "qz_MP2_all", + "tz_MP2_all", + "cbs_MP2_all", + "nn_CCSD(T)_all", + "sapt_all", + "sapt_es", + "sapt_ex", + "sapt_exs2", + "sapt_ind", + "sapt_exind", + "sapt_disp", + "sapt_exdisp_os", + "sapt_exdisp_ss", + "sapt_delta_HF", + ] + + _filename = "DES5M.csv" + _name = "des5m_interaction" + + def read_raw_entries(self) -> List[Dict]: + return DES5M._read_raw_entries() diff --git a/openqdc/datasets/interaction/dess66.py b/openqdc/datasets/interaction/dess66.py new file mode 100644 index 0000000..ae3ce81 --- /dev/null +++ b/openqdc/datasets/interaction/dess66.py @@ -0,0 +1,107 @@ +import os +from typing import Dict, List + +import numpy as np +import pandas as pd +from loguru import logger +from tqdm import tqdm + +from openqdc.datasets.interaction.base import BaseInteractionDataset +from openqdc.utils.molecule import atom_table + + +class DESS66(BaseInteractionDataset): + """ + DE Shaw Research interaction energy + estimates of all 66 conformers from + the original S66 dataset as described + in the paper: + + Quantum chemical benchmark databases of gold-standard dimer interaction energies. + Donchev, A.G., Taube, A.G., Decolvenaere, E. et al. + Sci Data 8, 55 (2021). 
+ https://doi.org/10.1038/s41597-021-00833-x + + Data was downloaded from Zenodo: + https://zenodo.org/records/5676284 + """ + + __name__ = "des_s66" + __energy_unit__ = "hartree" + __distance_unit__ = "ang" + __forces_unit__ = "hartree/ang" + __energy_methods__ = [ + "mp2/cc-pvdz", + "mp2/cc-pvqz", + "mp2/cc-pvtz", + "mp2/cbs", + "ccsd(t)/cc-pvdz", + "ccsd(t)/cbs", # cbs + "ccsd(t)/nn", # nn + "sapt0/aug-cc-pwcvxz", + "sapt0/aug-cc-pwcvxz_es", + "sapt0/aug-cc-pwcvxz_ex", + "sapt0/aug-cc-pwcvxz_exs2", + "sapt0/aug-cc-pwcvxz_ind", + "sapt0/aug-cc-pwcvxz_exind", + "sapt0/aug-cc-pwcvxz_disp", + "sapt0/aug-cc-pwcvxz_exdisp_os", + "sapt0/aug-cc-pwcvxz_exdisp_ss", + "sapt0/aug-cc-pwcvxz_delta_HF", + ] + + energy_target_names = [ + "cc_MP2_all", + "qz_MP2_all", + "tz_MP2_all", + "cbs_MP2_all", + "cc_CCSD(T)_all", + "cbs_CCSD(T)_all", + "nn_CCSD(T)_all", + "sapt_all", + "sapt_es", + "sapt_ex", + "sapt_exs2", + "sapt_ind", + "sapt_exind", + "sapt_disp", + "sapt_exdisp_os", + "sapt_exdisp_ss", + "sapt_delta_HF", + ] + + def read_raw_entries(self) -> List[Dict]: + self.filepath = os.path.join(self.root, "DESS66.csv") + logger.info(f"Reading DESS66 interaction data from {self.filepath}") + df = pd.read_csv(self.filepath) + data = [] + for idx, row in tqdm(df.iterrows(), total=df.shape[0]): + smiles0, smiles1 = row["smiles0"], row["smiles1"] + charge0, charge1 = row["charge0"], row["charge1"] + natoms0, natoms1 = row["natoms0"], row["natoms1"] + pos = np.array(list(map(float, row["xyz"].split()))).reshape(-1, 3) + + elements = row["elements"].split() + + atomic_nums = np.expand_dims(np.array([atom_table.GetAtomicNumber(x) for x in elements]), axis=1) + + charges = np.expand_dims(np.array([charge0] * natoms0 + [charge1] * natoms1), axis=1) + + atomic_inputs = np.concatenate((atomic_nums, charges, pos), axis=-1, dtype=np.float32) + + energies = np.array(row[self.energy_target_names].values).astype(np.float32)[None, :] + + name = np.array([smiles0 + "." 
+ smiles1]) + + subset = row["system_name"] + + item = dict( + energies=energies, + subset=np.array([subset]), + n_atoms=np.array([natoms0 + natoms1], dtype=np.int32), + n_atoms_first=np.array([natoms0], dtype=np.int32), + atomic_inputs=atomic_inputs, + name=name, + ) + data.append(item) + return data diff --git a/openqdc/datasets/interaction/dess66x8.py b/openqdc/datasets/interaction/dess66x8.py new file mode 100644 index 0000000..2e97221 --- /dev/null +++ b/openqdc/datasets/interaction/dess66x8.py @@ -0,0 +1,108 @@ +import os +from typing import Dict, List + +import numpy as np +import pandas as pd +from loguru import logger +from tqdm import tqdm + +from openqdc.datasets.interaction.base import BaseInteractionDataset +from openqdc.utils.molecule import atom_table + + +class DESS66x8(BaseInteractionDataset): + """ + DE Shaw Research interaction energy + estimates of all 528 conformers from + the original S66x8 dataset as described + in the paper: + + Quantum chemical benchmark databases of gold-standard dimer interaction energies. + Donchev, A.G., Taube, A.G., Decolvenaere, E. et al. + Sci Data 8, 55 (2021). 
+ https://doi.org/10.1038/s41597-021-00833-x + + Data was downloaded from Zenodo: + + https://zenodo.org/records/5676284 + """ + + __name__ = "des_s66x8" + __energy_unit__ = "hartree" + __distance_unit__ = "ang" + __forces_unit__ = "hartree/ang" + __energy_methods__ = [ + "mp2/cc-pvdz", + "mp2/cc-pvqz", + "mp2/cc-pvtz", + "mp2/cbs", + "ccsd(t)/cc-pvdz", + "ccsd(t)/cbs", # cbs + "ccsd(t)/nn", # nn + "sapt0/aug-cc-pwcvxz", + "sapt0/aug-cc-pwcvxz_es", + "sapt0/aug-cc-pwcvxz_ex", + "sapt0/aug-cc-pwcvxz_exs2", + "sapt0/aug-cc-pwcvxz_ind", + "sapt0/aug-cc-pwcvxz_exind", + "sapt0/aug-cc-pwcvxz_disp", + "sapt0/aug-cc-pwcvxz_exdisp_os", + "sapt0/aug-cc-pwcvxz_exdisp_ss", + "sapt0/aug-cc-pwcvxz_delta_HF", + ] + + energy_target_names = [ + "cc_MP2_all", + "qz_MP2_all", + "tz_MP2_all", + "cbs_MP2_all", + "cc_CCSD(T)_all", + "cbs_CCSD(T)_all", + "nn_CCSD(T)_all", + "sapt_all", + "sapt_es", + "sapt_ex", + "sapt_exs2", + "sapt_ind", + "sapt_exind", + "sapt_disp", + "sapt_exdisp_os", + "sapt_exdisp_ss", + "sapt_delta_HF", + ] + + def read_raw_entries(self) -> List[Dict]: + self.filepath = os.path.join(self.root, "DESS66x8.csv") + logger.info(f"Reading DESS66x8 interaction data from {self.filepath}") + df = pd.read_csv(self.filepath) + data = [] + for idx, row in tqdm(df.iterrows(), total=df.shape[0]): + smiles0, smiles1 = row["smiles0"], row["smiles1"] + charge0, charge1 = row["charge0"], row["charge1"] + natoms0, natoms1 = row["natoms0"], row["natoms1"] + pos = np.array(list(map(float, row["xyz"].split()))).reshape(-1, 3) + + elements = row["elements"].split() + + atomic_nums = np.expand_dims(np.array([atom_table.GetAtomicNumber(x) for x in elements]), axis=1) + + charges = np.expand_dims(np.array([charge0] * natoms0 + [charge1] * natoms1), axis=1) + + atomic_inputs = np.concatenate((atomic_nums, charges, pos), axis=-1, dtype=np.float32) + + energies = np.array(row[self.energy_target_names].values).astype(np.float32)[None, :] + + name = np.array([smiles0 + "." 
+ smiles1]) + + subset = row["system_name"] + + item = dict( + energies=energies, + subset=np.array([subset]), + n_atoms=np.array([natoms0 + natoms1], dtype=np.int32), + n_atoms_first=np.array([natoms0], dtype=np.int32), + atomic_inputs=atomic_inputs, + name=name, + ) + data.append(item) + return data diff --git a/openqdc/datasets/interaction/metcalf.py b/openqdc/datasets/interaction/metcalf.py new file mode 100644 index 0000000..041964a --- /dev/null +++ b/openqdc/datasets/interaction/metcalf.py @@ -0,0 +1,80 @@ +import os +from typing import Dict, List + +import numpy as np + +from openqdc.datasets.interaction.base import BaseInteractionDataset +from openqdc.utils.molecule import atom_table + + +class Metcalf(BaseInteractionDataset): + """ + Hydrogen-bonded dimers of NMA with 126 molecules as described in: + + Approaches for machine learning intermolecular interaction energies and + application to energy components from symmetry adapted perturbation theory. + Derek P. Metcalf, Alexios Koutsoukas, Steven A. Spronk, Brian L. Claus, + Deborah A. Loughney, Stephen R. Johnson, Daniel L. Cheney, C. David Sherrill; + J. Chem. Phys. 21 February 2020; 152 (7): 074103. + https://doi.org/10.1063/1.5142636 + + Further details: + "Hydrogen-bonded dimers involving N-methylacetamide (NMA) and 126 molecules + (46 donors and 80 acceptors; Figs. 2 and 3) were used. Optimized geometries + for the 126 individual monomers were obtained and paired with NMA in broad + arrays of spatial configurations to generate thousands of complexes for training. 
+ """ + + __name__ = "metcalf" + __energy_unit__ = "hartree" + __distance_unit__ = "ang" + __forces_unit__ = None + __energy_methods__ = [ + "SAPT0/jun-cc-pVDZ", + "SAPT0/jun-cc-pVDZ_es", + "SAPT0/jun-cc-pVDZ_ex", + "SAPT0/jun-cc-pVDZ_ind", + "SAPT0/jun-cc-pVDZ_disp", + ] + energy_target_names = [ + "total energy", + "electrostatic energy", + "exchange energy", + "induction energy", + "dispersion energy", + ] + + def read_raw_entries(self) -> List[Dict]: + data = [] + for dirname in os.listdir(self.root): + xyz_dir = os.path.join(self.root, dirname) + if not os.path.isdir(xyz_dir): + continue + subset = np.array([dirname.split("-")[0].lower()]) # training, validation, or test + for filename in os.listdir(xyz_dir): + if not filename.endswith(".xyz"): + continue + lines = list(map(lambda x: x.strip(), open(os.path.join(xyz_dir, filename), "r").readlines())) + line_two = lines[1].split(",") + energies = np.array([line_two[1:6]], dtype=np.float32) + num_atoms = np.array([int(lines[0])]) + + elem_xyz = np.array([x.split() for x in lines[2:]]) + elements = elem_xyz[:, 0] + xyz = elem_xyz[:, 1:].astype(np.float32) + atomic_nums = np.expand_dims(np.array([atom_table.GetAtomicNumber(x) for x in elements]), axis=1) + charges = np.expand_dims(np.array([0] * num_atoms[0]), axis=1) + + atomic_inputs = np.concatenate((atomic_nums, charges, xyz), axis=-1, dtype=np.float32) + + item = dict( + n_atoms=num_atoms, + subset=subset, + energies=energies, + positions=xyz, + atomic_inputs=atomic_inputs, + name=np.array([""]), + n_atoms_first=np.array([-1]), + ) + data.append(item) + return data diff --git a/openqdc/datasets/interaction/splinter.py b/openqdc/datasets/interaction/splinter.py new file mode 100644 index 0000000..06b6d86 --- /dev/null +++ b/openqdc/datasets/interaction/splinter.py @@ -0,0 +1,126 @@ +import os +from typing import Dict, List + +import numpy as np +from loguru import logger +from tqdm import tqdm + +from openqdc.datasets.interaction.base import 
class Splinter(BaseInteractionDataset):
    """
    A dataset of over 1.7 million protein-ligand
    interactions as described in the paper:

    A quantum chemical interaction energy dataset for accurately modeling protein-ligand interactions.
    Spronk, S.A., Glick, Z.L., Metcalf, D.P. et al.
    Sci Data 10, 619 (2023).
    https://doi.org/10.1038/s41597-023-02443-1
    """

    __name__ = "splinter"
    __energy_methods__ = [
        "sapt0/jun-cc-pV(D+d)Z_unscaled",
        "sapt0/jun-cc-pV(D+d)Z_es_unscaled",
        "sapt0/jun-cc-pV(D+d)Z_ex_unscaled",
        "sapt0/jun-cc-pV(D+d)Z_ind_unscaled",
        "sapt0/jun-cc-pV(D+d)Z_disp_unscaled",
        "sapt0/jun-cc-pV(D+d)Z_scaled",
        "sapt0/jun-cc-pV(D+d)Z_es_scaled",
        "sapt0/jun-cc-pV(D+d)Z_ex_scaled",
        "sapt0/jun-cc-pV(D+d)Z_ind_scaled",
        "sapt0/jun-cc-pV(D+d)Z_disp_scaled",
        "sapt0/aug-cc-pV(D+d)Z_unscaled",
        "sapt0/aug-cc-pV(D+d)Z_es_unscaled",
        "sapt0/aug-cc-pV(D+d)Z_ex_unscaled",
        "sapt0/aug-cc-pV(D+d)Z_ind_unscaled",
        "sapt0/aug-cc-pV(D+d)Z_disp_unscaled",
        "sapt0/aug-cc-pV(D+d)Z_scaled",
        "sapt0/aug-cc-pV(D+d)Z_es_scaled",
        "sapt0/aug-cc-pV(D+d)Z_ex_scaled",
        "sapt0/aug-cc-pV(D+d)Z_ind_scaled",
        "sapt0/aug-cc-pV(D+d)Z_disp_scaled",
    ]

    energy_target_names = []

    def read_raw_entries(self) -> List[Dict]:
        """Walk ``self.root`` recursively and parse every ``.xyz`` file found.

        The files are parsed as follows:

        * line 1: the total number of atoms,
        * line 2: comma-separated metadata. Field 0 is an underscore-separated
          label, fields 1 to 3 are the integer total charge and the charges of
          the two monomers, the last field is the atom count of the first
          monomer, and everything in between is read as the energies,
        * remaining lines: whitespace-separated rows whose first column is the
          element symbol and whose other columns are stored as positions.

        Returns:
            A list with one dict per file, holding the energies, subset, atom
            counts, per-atom inputs and the values parsed from the label.
        """
        logger.info(f"Reading Splinter interaction data from {self.root}")
        data = []
        i = 0
        with tqdm(total=1706343) as progress_bar:  # total is currently an approximation
            for root, _dirs, files in os.walk(self.root):
                for filename in files:
                    if not filename.endswith(".xyz"):
                        continue
                    i += 1
                    filepath = os.path.join(root, filename)
                    # Close each handle immediately: the walk visits a very
                    # large number of files and must not keep them open.
                    with open(filepath, "r") as filein:
                        lines = [line.strip() for line in filein]
                    n_atoms = np.array([int(lines[0])], dtype=np.int32)
                    metadata = lines[1].split(",")
                    try:
                        # Long label: four names followed by seven numeric fields.
                        (
                            protein_monomer_name,
                            protein_interaction_site_type,
                            ligand_monomer_name,
                            ligand_interaction_site_type,
                            index,
                            r,
                            theta_P,
                            tau_P,
                            theta_L,
                            tau_L,
                            tau_PL,
                        ) = metadata[0].split("_")
                        index, r, theta_P, tau_P, theta_L, tau_L, tau_PL = list(
                            map(float, [index, r, theta_P, tau_P, theta_L, tau_L, tau_PL])
                        )
                    except ValueError:
                        # Short label: four names, an index (left as a string)
                        # and one ignored field; the geometric values are absent.
                        (
                            protein_monomer_name,
                            protein_interaction_site_type,
                            ligand_monomer_name,
                            ligand_interaction_site_type,
                            index,
                            _,
                        ) = metadata[0].split("_")
                        r, theta_P, tau_P, theta_L, tau_L, tau_PL = [None] * 6
                    energies = np.array([list(map(float, metadata[4:-1]))]).astype(np.float32)
                    n_atoms_first = np.array([int(metadata[-1])], dtype=np.int32)
                    # total_charge is parsed alongside the monomer charges but not stored.
                    total_charge, charge0, charge1 = list(map(int, metadata[1:4]))
                    lines = list(map(lambda x: x.split(), lines[2:]))
                    pos = np.array(lines)[:, 1:].astype(np.float32)
                    elems = np.array(lines)[:, 0]
                    atomic_nums = np.expand_dims(np.array([atom_table.GetAtomicNumber(x) for x in elems]), axis=1)
                    natoms0 = n_atoms_first[0]
                    natoms1 = n_atoms[0] - natoms0
                    # Each atom carries the charge of the monomer it belongs to.
                    charges = np.expand_dims(np.array([charge0] * natoms0 + [charge1] * natoms1), axis=1)
                    atomic_inputs = np.concatenate((atomic_nums, charges, pos), axis=-1, dtype=np.float32)
                    # The subset is the name of the directory holding the file;
                    # basename works with any path separator, unlike split("/").
                    subset = np.array([os.path.basename(root)])

                    item = dict(
                        energies=energies,
                        subset=subset,
                        n_atoms=n_atoms,
                        n_atoms_first=n_atoms_first,
                        atomic_inputs=atomic_inputs,
                        protein_monomer_name=np.array([protein_monomer_name]),
                        protein_interaction_site_type=np.array([protein_interaction_site_type]),
                        ligand_monomer_name=np.array([ligand_monomer_name]),
                        ligand_interaction_site_type=np.array([ligand_interaction_site_type]),
                        index=np.array([index]),
                        r=np.array([r]),
                        theta_P=np.array([theta_P]),
                        tau_P=np.array([tau_P]),
                        theta_L=np.array([theta_L]),
                        tau_L=np.array([tau_L]),
                        tau_PL=np.array([tau_PL]),
                        name=np.array([protein_monomer_name + "." + ligand_monomer_name]),
                    )
                    data.append(item)
                    progress_bar.update(1)
        logger.info(f"Processed {i} files in total")
        return data
"CCC(C)C", + "CCCCC", + "CCC(C)(C)C", + "CCCCCC", + ] + ), + "alkenes": set( + [ + "C=C", + "CC=C", + "CC=CC", + "CC(=C)C", + "CCC=C", + "CC=C(C)C", + "CCC=CC", + "CCC(=C)C", + "CC(=C(C)C)C", + "CCC=C(C)C", + "CCC(=CC)C", + "CCC(=C(C)C)C", + ] + ), + "amides": set( + [ + "CCCNC=O", + "CCC(=O)N", + "CCC(=O)NC", + "CCC(=O)N(C)C", + "CCC(=O)N(CC)C", + "CCNC=O", + "CCNC(=O)C", + "CCN(C=O)CC", + "CCN(C(=O)C)C", + "CCNC(=O)CC", + "CCN(C(=O)C)CC", + "CC(=O)N", + "CC(=O)N(C)C", + "CNC=O", + "CNC(=O)C", + "CN(C=O)CC", + "CNC(=O)CC(=O)N", + "CNC(=O)CC(=O)NC", + "CNC(=O)CNC=O", + "CNC(=O)CNC(=O)C", + "CNC(=O)C(NC(=O)C)C", + "NC=O", + "NC(=O)CC(=O)N", + "O=CN(C)C", + "O=CNCCC(=O)N", + "O=CNCCC(=O)NC", + "O=CNCCNC=O", + "O=CNCC(=O)N", + ] + ), + "amines": set( + [ + "C1CCCN1", + "C1CCCNC1", + "CCCN", + "CCCNC", + "CCCN(C)C", + "CCN", + "CCN(C)C", + "CCNCC", + "CCN(CC)C", + "CN", + "CNC", + "CN(C)C", + "CNCC", + "CNCCCN", + "CNCCCNC", + "CNCCN", + "CNCCNC", + "N", + "NCCCN", + "NCCN", + ] + ), + "ammoniums": set(["CC[NH3+]", "C[N+](C)(C)C", "C[NH2+]C", "C[NH3+]", "C[NH+](C)C", "[NH4+]"]), + "benzene": set(["c1ccccc1", "Cc1ccccc1", "CCc1ccccc1"]), + "carboxylates": set(["[O-]C=O", "[O-]C(=O)C", "[O-]C(=O)CC"]), + "esters": set( + [ + "CCCOC=O", + "CCC(=O)OC", + "CCOC(=O)CC", + "CCOC(=O)C", + "CCOC=O", + "COC(=O)C", + "COC=O", + "O=COCCCOC=O", + "O=COCCOC=O", + "O=COCOC=O", + ] + ), + "ethers": set( + [ + "C1CCCO1", + "C1CCCOC1", + "C1CCOCO1", + "C1OCCO1", + "CCCOC", + "CCCOCOC", + "CCOCC", + "COCCCOC", + "COCC", + "COCCOC", + "COC", + "COCOCC", + "COCOC", + "O1CCOCC1", + "O1COCOC1", + ] + ), + "guanidiums": set(["CCNC(=[NH2+])N", "CNC(=[NH2+])N", "NC(=[NH2+])N"]), + "imidazolium": set(["c1[nH]cc[nH+]1", "Cc1c[nH]c[nH+]1", "CCc1c[nH]c[nH+]1"]), + "ketones": set(["CCC(=O)CC", "CCC(=O)C", "CCC=O", "CC(=O)C", "CC=O", "C=O"]), + "monoatomics": set( + [ + "[Ar]", + "[Br-]", + "[Ca+2]", + "[Cl-]", + "[F-]", + "[He]", + "[I-]", + "[K+]", + "[Kr]", + "[Li+]", + "[Mg+2]", + "[Na+]", + 
"[Ne]", + "[Xe]", + ] + ), + "other": set( + [ + "Brc1ccc(cc1)Br", + "Brc1ccccc1", + "BrC(Br)Br", + "BrCBr", + "BrCCBr", + "CBr", + "CC(Br)Br", + "CCBr", + "CCCC#CC", + "CCCC(Cl)(Cl)Cl", + "CCCC(Cl)Cl", + "CCCCCl", + "CCC#CC", + "CCCC#C", + "CCCC(F)(F)F", + "CCCC(F)F", + "CCCCF", + "CCC(Cl)(Cl)Cl", + "CCC(Cl)Cl", + "CCCCl", + "CCCC#N", + "CC#CC", + "CCC#C", + "CCC(F)(F)F", + "CCC(F)F", + "CCCF", + "CC(Cl)(Cl)Cl", + "CC(Cl)Cl", + "CCCl", + "CCC#N", + "CC#C", + "CC(F)(F)F", + "CC(F)F", + "CCF", + "CC(I)I", + "CCI", + "CCl", + "CC#N", + "CCOP(=O)(OC)OC", + "CCOP(=O)(OC)[O-]", + "CCOP(=O)(OC)O", + "C#C", + "CF", + "CI", + "Clc1ccc(cc1)Cl", + "Clc1cccc(c1)Cl", + "Clc1ccccc1Cl", + "Clc1ccccc1", + "Clc1cc(Cl)c(c(c1Cl)Cl)Cl", + "Clc1cc(Cl)cc(c1)Cl", + "Clc1c(Cl)c(Cl)c(c(c1Cl)Cl)Cl", + "ClC(C(Cl)(Cl)Cl)(Cl)Cl", + "ClC(C(Cl)(Cl)Cl)Cl", + "ClCC(Cl)(Cl)Cl", + "ClCC(Cl)Cl", + "ClCCCl", + "ClC(Cl)Cl", + "ClCCl", + "CNCCCOC=O", + "CNCCCOC", + "CNCCC(=O)NC", + "CNCCC(=O)N", + "CNCCC(=O)O", + "CNCCCO", + "CNCCCSC", + "CNCCCS", + "CNCCNC=O", + "CNCCOC=O", + "CNCCOC", + "CNCC(=O)NC", + "CNCC(=O)N", + "CNCC(=O)O", + "CNCCO", + "CNCCSC", + "CNCCS", + "CNC(=O)CCN", + "CNC(=O)CC(=O)O", + "CNC(=O)CCO", + "CNC(=O)CCS", + "CNC(=O)CN", + "CNC(=O)COC=O", + "CNC(=O)CO", + "CNCOC=O", + "CNCOC", + "CNC(=O)CS", + "CNCSC", + "C#N", + "COCCCN", + "COCCCOC=O", + "COCCC(=O)NC", + "COCCC(=O)N", + "COCCC(=O)O", + "COCCCO", + "COCCCSC", + "COCCCS", + "COCCNC=O", + "COCCN", + "COCCOC=O", + "COCC(=O)NC", + "COCC(=O)N", + "COCC(=O)O", + "COCCO", + "COCCSC", + "COCCS", + "COCNC=O", + "COCN", + "COCOC=O", + "COCO", + "COCSC", + "COCS", + "COP(=O)(OC)OC", + "COP(=O)(OC)[O-]", + "COP(=O)(OC)O", + "COP(=O)(O)O", + "COP(=O)(OP(=O)(O)O)[O-]", + "CSCCCNC=O", + "CSCCCN", + "CSCCCOC=O", + "CSCCC(=O)N", + "CSCCC(=O)O", + "CSCCCO", + "CSCCN", + "CSCCOC=O", + "CSCC(=O)NC", + "CSCC(=O)N", + "CSCC(=O)O", + "CSCCO", + "CSCNC=O", + "CSCN", + "CSCOC=O", + "CSCO", + "Fc1ccc(cc1)F", + "Fc1cccc(c1)F", + "Fc1ccccc1F", + 
"Fc1ccccc1", + "Fc1cc(F)c(c(c1F)F)F", + "Fc1cc(F)cc(c1)F", + "Fc1c(F)c(F)c(c(c1F)F)F", + "FC(C(F)(F)F)(F)F", + "FC(C(F)(F)F)F", + "FCC(F)(F)F", + "FCC(F)F", + "FCCF", + "FC(F)F", + "FCF", + "ICCI", + "ICI", + "NCCCOC=O", + "NCCC(=O)N", + "NCCC(=O)O", + "NCCCO", + "NCCCS", + "NCCNC=O", + "NCCOC=O", + "NCC(=O)N", + "NCC(=O)O", + "NCCO", + "NCCS", + "NC(=O)CC(=O)O", + "NC(=O)CCO", + "NC(=O)CCS", + "NC(=O)CO", + "NCOC=O", + "NC(=O)CS", + "OCCCNC=O", + "OCCCOC=O", + "OCCC(=O)O", + "OCCCS", + "OCCNC=O", + "OCCOC=O", + "OCC(=O)O", + "OCCS", + "O=CNCCC(=O)O", + "O=CNCCOC=O", + "O=CNCC(=O)O", + "O=CNCOC=O", + "O=COCCC(=O)NC", + "O=COCCC(=O)N", + "O=COCCC(=O)O", + "O=COCC(=O)N", + "O=COCC(=O)O", + "OC(=O)CCS", + "OCOC=O", + "OC(=O)CS", + "OP(=O)(O)O", + "[O-]P(=O)(OP(=O)(OC)O)O", + "SCCCOC=O", + "SCCNC=O", + "SCCOC=O", + "SCOC=O", + "[H][H]", + ] + ), + "phenol": set(["Cc1ccc(cc1)O", "CCc1ccc(cc1)O", "Oc1ccccc1"]), + "pyridine": set(["c1cccnc1", "c1ccncn1", "n1ccncc1"]), + "pyrrole": set( + [ + "c1ccc2c(c1)[nH]cc2", + "c1ccc[nH]1", + "c1ncc[nH]1", + "Cc1cnc[nH]1", + "Cc1c[nH]c2c1cccc2", + "Cc1c[nH]cn1", + "CCc1cnc[nH]1", + "CCc1c[nH]c2c1cccc2", + "CCc1c[nH]cn1", + ] + ), + "sulfides": set( + [ + "C1CCCS1", + "C1CCCSC1", + "C1CCSCS1", + "C1CCSSC1", + "C1CSSC1", + "C1SCCS1", + "CCCSCSC", + "CCCSC", + "CCCSSC", + "CCSCC", + "CCSSCC", + "CCSSC", + "CSCCCSC", + "CSCCSC", + "CSCC", + "CSCSCC", + "CSCSC", + "CSC", + "CSSC", + "S1CCSCC1", + "S1CSCSC1", + ] + ), + "thiols": set(["CCCSS", "CCCS", "CCSS", "CCS", "CSCCCS", "CSCCS", "CSCS", "CSS", "CS", "SCCCS", "SCCS", "SS", "S"]), + "water": set(["O"]), + "flourane": set(["F"]), + "hydrogen chloride": set(["Cl"]), +} + def z_to_formula(z): u, c = np.unique(z, return_counts=True) diff --git a/tests/test_dummy.py b/tests/test_dummy.py index 702427f..f82376c 100644 --- a/tests/test_dummy.py +++ b/tests/test_dummy.py @@ -1,6 +1,6 @@ """Path hack to make tests work.""" -from openqdc.datasets import Dummy # noqa: E402 +from 
openqdc.datasets.potential.dummy import Dummy # noqa: E402 from openqdc.utils.atomization_energies import ( ISOLATED_ATOM_ENERGIES, IsolatedAtomEnergyFactory,