diff --git a/README.md b/README.md index 246fe30..7ed8e5b 100644 --- a/README.md +++ b/README.md @@ -52,17 +52,30 @@ openqdc download --datasets Spice QMugs We provide support for the following publicly available QM Datasets. +# Potential Energy + | Dataset | # Molecules | # Conformers | Average Conformers per Molecule | Force Labels | Atom Types | QM Level of Theory | Off-Equilibrium Conformations| | --- | --- | --- | --- | --- | --- | --- | --- | +| [ANI](https://pubs.rsc.org/en/content/articlelanding/2017/SC/C6SC05720A) | 57,462 | 20,000,000 | 348 | No | 4 | ωB97x:6-31G(d) | Yes | | [GEOM](https://www.nature.com/articles/s41597-022-01288-4) | 450,000 | 37,000,000 | 82 | No | 18 | GFN2-xTB | No | | [Molecule3D](https://arxiv.org/abs/2110.01717) | 3,899,647 | 3,899,647 | 1 | No | 5 | B3LYP/6-31G* | No | | [NablaDFT](https://pubs.rsc.org/en/content/articlelanding/2022/CP/D2CP03966D) | 1,000,000 | 5,000,000 | 5 | No | 6 | ωB97X-D/def2-SVP | | +| [OrbNet Denali](https://arxiv.org/abs/2107.00299) | 212,905 | 2,300,000 | 11 | No | 16 | GFN1-xTB | Yes | +| [PCQM_PM6](https://pubs.acs.org/doi/abs/10.1021/acs.jcim.0c00740) | | | 1| No| | PM6 | No +| [PCQM_B3LYP](https://arxiv.org/abs/2305.18454) | 85,938,443|85,938,443 | 1| No| | B3LYP/6-31G* | No | [QMugs](https://www.nature.com/articles/s41597-022-01390-7) | 665,000 | 2,000,000 | 3 | No | 10 | GFN2-xTB, ωB97X-D/def2-SVP | No | +| [QM7X](https://www.nature.com/articles/s41597-021-00812-2) | 6,950 | 4,195,237 | 603 | Yes | 7 | PBE0+MBD | Yes | +| [SN2RXN](https://pubs.acs.org/doi/10.1021/acs.jctc.9b00181) | 39 | 452709 | 11,600 | Yes | 6 | DSD-BLYP-D3(BJ)/def2-TZVP | | +| [SolvatedPeptides](https://doi.org/10.1021/acs.jctc.9b00181) | | 2,731,180 | | Yes | | revPBE-D3(BJ)/def2-TZVP | | | [Spice](https://arxiv.org/abs/2209.10702) | 19,238 | 1,132,808 | 59 | Yes | 15 | ωB97M-D3(BJ)/def2-TZVPPD | Yes | -| [ANI](https://pubs.rsc.org/en/content/articlelanding/2017/SC/C6SC05720A) | 57,462 | 20,000,000 | 348 | No | 4 | 
ωB97x:6-31G(d) | Yes | -| [tmQM](https://pubs.acs.org/doi/10.1021/acs.jcim.0c01041) | 86,665 | | | No | | TPSSh-D3BJ/def2-SVP | | +| [tmQM](https://pubs.acs.org/doi/10.1021/acs.jcim.0c01041) | 86,665 | 86,665| 1| No | | TPSSh-D3BJ/def2-SVP | | +| [Transition1X](https://www.nature.com/articles/s41597-022-01870-w) | | 9,654,813| | Yes | | ωB97x/6-31G(d) | Yes | +| [WaterClusters](https://doi.org/10.1063/1.5128378) | 1 | 4,464,740| | No | 2 | TTM2.1-F | Yes| + + +# Interaction Energy + +| Dataset | # Molecules | # Conformers | Average Conformers per Molecule | Force Labels | Atom Types | QM Level of Theory | Off-Equilibrium Conformations| +| --- | --- | --- | --- | --- | --- | --- | --- | | [DES370K](https://www.nature.com/articles/s41597-021-00833-x) | 3,700 | 370,000 | 100 | No | 20 | CCSD(T) | Yes | | [DES5M](https://www.nature.com/articles/s41597-021-00833-x) | 3,700 | 5,000,000 | 1351 | No | 20 | SNS-MP2 | Yes | -| [OrbNet Denali](https://arxiv.org/abs/2107.00299) | 212,905 | 2,300,000 | 11 | No | 16 | GFN1-xTB | Yes | -| [SN2RXN](https://pubs.acs.org/doi/10.1021/acs.jctc.9b00181) | 39 | 452709 | 11,600 | Yes | 6 | DSD-BLYP-D3(BJ)/def2-TZVP | | -| [QM7X](https://www.nature.com/articles/s41597-021-00812-2) | 6,950 | 4,195,237 | 603 | Yes | 7 | PBE0+MBD | Yes | diff --git a/docs/tutorials/usage.ipynb b/docs/tutorials/usage.ipynb index 1ad74cd..49f957c 100644 --- a/docs/tutorials/usage.ipynb +++ b/docs/tutorials/usage.ipynb @@ -4,12 +4,16 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# OpenQDC Hands On tutorial\n", + "# OpenQDC Hands-on Tutorial\n", "\n", "## Instantiate and GO!\n", "\n", - "If you don't have the dataset downloaded it will be downloaded automatically and cached. You just instantiate the class and you are ready to go.\n", - "Change of units are done automatically on loading based on the units in the dataset." + "If you don't have the dataset downloaded, it will be downloaded automatically and cached. 
You just instantiate the class and you are ready to go.\n", + "Change of units is done automatically upon loading based on the units of the dataset.\n", + "\n", + "Supported energy units: [\"kcal/mol\", \"kj/mol\", \"hartree\", \"ev\"]\n", + "\n", + "Supported distance units: [\"ang\", \"nm\", \"bohr\"]" ] }, { @@ -53,7 +57,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Items from the dataset object class are obtained through the get method.\n", + "### Items from the dataset object class are obtained through the \"get\" method.\n", "\n", "The dictionary of the item contains different important keys:\n", "- 'positions' : numpy array of the 3d atomic positions (n x 3)\n", @@ -63,7 +67,7 @@ "- 'energies': potential energy of the molecule (n_level_of_theries)\n", "- 'name': name or smiles (is present) of the molecule\n", "- 'subset': subset of the dataset the molecule belongs to\n", - "- 'forces': if presentes the forces on the atoms (n x 3 x n_level_of_theories_forces)" + "- 'forces': if present, the forces on the atoms (n x 3 x n_level_of_theories_forces)" ] }, { @@ -257,9 +261,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Alternatevely we can also retrieve the data from the dataset object class as ase.Atoms using the get_ase_atoms\n", - "\n", - "The dictionary of the item contains different important keys:" + "### Alternatively, we can also retrieve the data from the dataset object class as ase.Atoms using the get_ase_atoms!" ] }, { @@ -444,7 +446,7 @@ "source": [ "### Iterators \n", "\n", - "The method as_iter(atoms=False) returns an iterator over the dataset. If atoms is True the iterator returns the data as ase.Atoms objects. Otherwise it returns the dictionary of the item." + "The method as_iter(atoms=False) returns an iterator over the dataset. If atoms is True, the iterator returns the data as ase.Atoms objects. Otherwise, it returns the dictionary of the item." 
] }, { @@ -651,7 +653,11 @@ "source": [ "### Isolated atoms energies [e0s]\n", "\n", - "The isolated atoms energies are automatically used inside the datasets for the correct level of theory but you can also use them directly by accessing the IsolatedAtomEnergyFactor class." + "The potential energy of the system can be decomposed into the sum of isolated atom energies and the formation energy.\n", + "\n", + "$U(A_1, A_2, ...) = \\sum_{i=1}^N e_0(A_i) + e(A_1, A_2, ...)$\n", + "\n", + "The isolated atoms energies are automatically used inside the datasets for the correct level of theory, but you can also use them directly by accessing the IsolatedAtomEnergyFactor class." ] }, { diff --git a/openqdc/datasets/ani.py b/openqdc/datasets/ani.py index 3f1b92b..ac1725d 100644 --- a/openqdc/datasets/ani.py +++ b/openqdc/datasets/ani.py @@ -104,7 +104,7 @@ def __smiles_converter__(self, x): class ANI1X(ANI1): """ - The ANI-1X dataset consists of ANI-1 molecules + some molecules added using active learning which leads to + The ANI-1X dataset consists of ANI-1 molecules + some molecules added using active learning, which leads to a total of 5,496,771 conformers with 63,865 unique molecules. 
Usage diff --git a/openqdc/datasets/base.py b/openqdc/datasets/base.py index c65fdbc..33c2c74 100644 --- a/openqdc/datasets/base.py +++ b/openqdc/datasets/base.py @@ -1,3 +1,5 @@ +"""The BaseDataset defining shared functionality between all datasets.""" + import os import pickle as pkl from copy import deepcopy @@ -40,7 +42,7 @@ from openqdc.utils.units import get_conversion -def extract_entry( +def _extract_entry( df: pd.DataFrame, i: int, subset: str, @@ -73,11 +75,12 @@ def extract_entry( def read_qc_archive_h5( raw_path: str, subset: str, energy_target_names: List[str], force_target_names: List[str] ) -> List[Dict[str, np.ndarray]]: + """Extracts data from the HDF5 archive file.""" data = load_hdf5_file(raw_path) data_t = {k2: data[k1][k2][:] for k1 in data.keys() for k2 in data[k1].keys()} n = len(data_t["molecule_id"]) - samples = [extract_entry(data_t, i, subset, energy_target_names, force_target_names) for i in tqdm(range(n))] + samples = [_extract_entry(data_t, i, subset, energy_target_names, force_target_names) for i in tqdm(range(n))] return samples @@ -108,6 +111,19 @@ def __init__( overwrite_local_cache: bool = False, cache_dir: Optional[str] = None, ) -> None: + """ + + Parameters + ---------- + energy_unit + Energy unit to convert dataset to. Supported units: ["kcal/mol", "kj/mol", "hartree", "ev"] + distance_unit + Distance unit to convert dataset to. Supported units: ["ang", "nm", "bohr"] + overwrite_local_cache + Whether to overwrite the locally cached dataset. + cache_dir + Cache directory location. 
Defaults to "~/.cache/openqdc" + """ set_cache_dir(cache_dir) self.data = None if not self.is_preprocessed(): diff --git a/openqdc/datasets/iso_17.py b/openqdc/datasets/iso_17.py index 4553ec1..ef40409 100644 --- a/openqdc/datasets/iso_17.py +++ b/openqdc/datasets/iso_17.py @@ -6,8 +6,8 @@ class ISO17(BaseDataset): """ ISO17 dataset consists of the largest set of isomers from the QM9 dataset that consists of a fixed - composition of atoms (C7O2H10) arranged in different chemically valid structures. It consists of consist - of 129 molecules each containing 5,000 conformational geometries, energies and forces with a resolution + composition of atoms (C7O2H10) arranged in different chemically valid structures. It consists + of 129 molecules, each containing 5,000 conformational geometries, energies and forces with a resolution of 1 femtosecond in the molecular dynamics trajectories. The simulations were carried out using the Perdew-Burke-Ernzerhof (PBE) functional and the Tkatchenko-Scheffler (TS) van der Waals correction method. diff --git a/openqdc/datasets/molecule3d.py b/openqdc/datasets/molecule3d.py index dc47e53..418b8ec 100644 --- a/openqdc/datasets/molecule3d.py +++ b/openqdc/datasets/molecule3d.py @@ -67,7 +67,7 @@ def _read_sdf(sdf_path: str, properties_path: str) -> List[Dict[str, np.ndarray] class Molecule3D(BaseDataset): """ Molecule3D dataset consists of 3,899,647 molecules with ground state geometries and energies - calculated at B3LYP/6-31G* level of theory. The molecules are extracted from the + calculated at the B3LYP/6-31G* level of theory. The molecules are extracted from the PubChem database and cleaned by removing invalid molecule files. 
Usage: diff --git a/openqdc/datasets/qm7x.py b/openqdc/datasets/qm7x.py index eb8b015..e027ab3 100644 --- a/openqdc/datasets/qm7x.py +++ b/openqdc/datasets/qm7x.py @@ -33,6 +33,25 @@ def read_mol(mol_h5, mol_name, energy_target_names, force_target_names): class QM7X(BaseDataset): + """ + QM7X is a collection of almost 4.2 million conformers from 6,950 unique molecules. It contains DFT + energy and force labels at the PBE0+MBD level of theory. It consists of structures for molecules with + up to seven heavy (C, N, O, S, Cl) atoms from the GDB13 database. For each molecule, (meta-)stable + equilibrium structures including constitutional/structural isomers and stereoisomers are + searched using density-functional tight binding (DFTB). Then, for each (meta-)stable structure, 100 + off-equilibrium structures are obtained and labeled with PBE0+MBD. + + Usage: + ```python + from openqdc.datasets import QM7X + dataset = QM7X() + ``` + + References: + - https://arxiv.org/abs/2006.15139 + - https://zenodo.org/records/4288677 + """ + __name__ = "qm7x" __energy_methods__ = ["pbe0/mbd", "dft3b"] diff --git a/openqdc/datasets/sn2_rxn.py b/openqdc/datasets/sn2_rxn.py index abcbd62..9c513ea 100644 --- a/openqdc/datasets/sn2_rxn.py +++ b/openqdc/datasets/sn2_rxn.py @@ -4,6 +4,23 @@ class SN2RXN(BaseDataset): + """ + This dataset probes chemical reactions of methyl halides with halide anions, i.e. + X- + CH3Y -> CH3X + Y-, and contains structures for all possible combinations of + X,Y = F, Cl, Br, I. It contains energy and forces for 452709 conformations calculated + at the DSD-BLYP-D3(BJ)/def2-TZVP level of theory. 
+ + Usage: + ```python + from openqdc.datasets import SN2RXN + dataset = SN2RXN() + ``` + + References: + - https://doi.org/10.1021/acs.jctc.9b00181 + - https://zenodo.org/records/2605341 + """ + __name__ = "sn2_rxn" __energy_methods__ = [ @@ -33,30 +50,6 @@ def __smiles_converter__(self, x): def read_raw_entries(self): raw_path = p_join(self.root, "sn2_rxn.h5") - - # raw_path = p_join(self.root, "sn2_reactions.npz") - # data = np.load(raw_path) - - # # as example for accessing individual entries, print the data for entry idx=0 - # idx = 0 - # print("Data for entry " + str(idx)+":") - # print("Number of atoms") - # print(data["N"][idx]) - # print("Energy [eV]") - # print(data["E"][idx]) - # print("Total charge") - # print(data["Q"][idx]) - # print("Dipole moment vector (with respect to [0.0 0.0 0.0]) [eA]") - # print(data["D"][idx,:]) - # print("Nuclear charges") - # print(data["Z"][idx,:data["N"][idx]]) - # print("Cartesian coordinates [A]") - # print(data["R"][idx,:data["N"][idx],:]) - # print("Forces [eV/A]") - # print(data["F"][idx,:data["N"][idx],:]) - - # exit() - samples = read_qc_archive_h5(raw_path, "sn2_rxn", self.energy_target_names, self.force_target_names) return samples diff --git a/openqdc/datasets/solvated_peptides.py b/openqdc/datasets/solvated_peptides.py index 216ecdd..095680e 100644 --- a/openqdc/datasets/solvated_peptides.py +++ b/openqdc/datasets/solvated_peptides.py @@ -4,6 +4,23 @@ class SolvatedPeptides(BaseDataset): + """ + The solvated protein fragments dataset probes many-body intermolecular + interactions between "protein fragments" and water molecules. + It contains energy and forces for 2731180 structures calculated + at the revPBE-D3(BJ)/def2-TZVP level of theory. 
+ + Usage: + ```python + from openqdc.datasets import SolvatedPeptides + dataset = SolvatedPeptides() + ``` + + References: + - https://doi.org/10.1021/acs.jctc.9b00181 + - https://zenodo.org/records/2605372 + """ + __name__ = "solvated_peptides" __energy_methods__ = [ diff --git a/openqdc/datasets/spice.py b/openqdc/datasets/spice.py index 0b90912..77a4d87 100644 --- a/openqdc/datasets/spice.py +++ b/openqdc/datasets/spice.py @@ -34,9 +34,9 @@ def read_record(r): class Spice(BaseDataset): """ - Spice Dataset consists of 1.1 million conformations for a diverse set of 19k unique molecules consisting of + The Spice dataset consists of 1.1 million conformations for a diverse set of 19k unique molecules consisting of small molecules, dimers, dipeptides, and solvated amino acids. It consists of both forces and energies calculated - at {\omega}B97M-D3(BJ)/def2-TZVPPD level of theory. + at the ωB97M-D3(BJ)/def2-TZVPPD level of theory. Usage: ```python diff --git a/openqdc/datasets/tmqm.py b/openqdc/datasets/tmqm.py index 8952aaa..8695b45 100644 --- a/openqdc/datasets/tmqm.py +++ b/openqdc/datasets/tmqm.py @@ -45,6 +45,23 @@ def read_xyz(fname, e_map): class TMQM(BaseDataset): + """ + The tmQM dataset contains the geometries of a large transition metal-organic + compound space with a large variety of organic ligands and 30 transition metals. + It contains energy labels for 86,665 mononuclear complexes calculated + at the TPSSh-D3BJ/def2-SVP DFT level of theory. 
+ + Usage: + ```python + from openqdc.datasets import TMQM + dataset = TMQM() + ``` + + References: + - https://pubs.acs.org/doi/10.1021/acs.jcim.0c01041 + - https://github.com/bbskjelstad/tmqm + """ + __name__ = "tmqm" __energy_methods__ = ["tpssh/def2-tzvp"] diff --git a/openqdc/datasets/transition1x.py b/openqdc/datasets/transition1x.py index 0285ec9..734e98b 100644 --- a/openqdc/datasets/transition1x.py +++ b/openqdc/datasets/transition1x.py @@ -37,6 +37,22 @@ def read_record(r, group): class Transition1X(BaseDataset): + """ + The Transition1x dataset contains structures from 10k organic reaction pathways of various types. + It contains DFT energy and force labels for 9.6 million conformers calculated at the + wB97x/6-31G(d) level of theory. + + Usage: + ```python + from openqdc.datasets import Transition1X + dataset = Transition1X() + ``` + + References: + - https://www.nature.com/articles/s41597-022-01870-w + - https://gitlab.com/matschreiner/Transition1x + """ + __name__ = "transition1x" __energy_methods__ = [ diff --git a/openqdc/datasets/waterclusters3_30.py b/openqdc/datasets/waterclusters3_30.py index 6aa5748..507f1fe 100644 --- a/openqdc/datasets/waterclusters3_30.py +++ b/openqdc/datasets/waterclusters3_30.py @@ -49,6 +49,23 @@ def read_xyz(fname, n_waters): class WaterClusters(BaseDataset): + """ + The WaterClusters dataset contains putative minima and low energy networks for water + clusters of sizes n = 3 - 30. The cluster structures are derived and labeled with + the TTM2.1-F ab-initio based interaction potential for water. + It contains approximately 4.5 million structures. 
+ + Usage: + ```python + from openqdc.datasets import WaterClusters + dataset = WaterClusters() + ``` + + References: + - https://doi.org/10.1063/1.5128378 + - https://sites.uw.edu/wdbase/database-of-water-clusters/ + """ + __name__ = "waterclusters3_30" # Energy in hartree, all zeros by default diff --git a/openqdc/raws/pubchemqc.py b/openqdc/raws/pubchemqc.py index 4c32232..5a6a8e5 100644 --- a/openqdc/raws/pubchemqc.py +++ b/openqdc/raws/pubchemqc.py @@ -1,3 +1,5 @@ +"""Download functionalities for PubChemQC.""" + import hashlib import os import pickle as pkl diff --git a/openqdc/utils/atomization_energies.py b/openqdc/utils/atomization_energies.py index 2320396..b9ef590 100644 --- a/openqdc/utils/atomization_energies.py +++ b/openqdc/utils/atomization_energies.py @@ -1,3 +1,5 @@ +"""Look-up tables for isolated atom energies.""" + from typing import Dict, Tuple import numpy as np diff --git a/openqdc/utils/io.py b/openqdc/utils/io.py index b9a434c..18928d1 100644 --- a/openqdc/utils/io.py +++ b/openqdc/utils/io.py @@ -1,4 +1,4 @@ -"""IO utilities for mlip package""" +"""IO utilities.""" import json import os @@ -6,6 +6,7 @@ import fsspec import h5py +from aiohttp import ClientTimeout from ase.atoms import Atoms from fsspec.callbacks import TqdmCallback from fsspec.implementations.local import LocalFileSystem @@ -16,6 +17,8 @@ gcp_filesys_public = fsspec.filesystem("https") # public API for download local_filesys = LocalFileSystem() +gcp_filesys_public.client_kwargs = {"timeout": ClientTimeout(total=3600, connect=1000)} + _OPENQDC_CACHE_DIR = ( "~/.cache/openqdc" if "OPENQDC_CACHE_DIR" not in os.environ else os.path.normpath(os.environ["OPENQDC_CACHE_DIR"]) ) diff --git a/openqdc/utils/molecule.py b/openqdc/utils/molecule.py index 82a58d2..2edadc5 100644 --- a/openqdc/utils/molecule.py +++ b/openqdc/utils/molecule.py @@ -1,3 +1,5 @@ +"""Computations on molecular graphs.""" + from typing import Any import numpy as np diff --git a/openqdc/utils/preprocess.py 
b/openqdc/utils/preprocess.py index bf93c89..0b88866 100644 --- a/openqdc/utils/preprocess.py +++ b/openqdc/utils/preprocess.py @@ -1,3 +1,5 @@ +"""Dataset preprocessing.""" + import click import numpy as np from loguru import logger diff --git a/openqdc/utils/units.py b/openqdc/utils/units.py index f79ebce..12c13f9 100644 --- a/openqdc/utils/units.py +++ b/openqdc/utils/units.py @@ -1,3 +1,13 @@ +""" +Unit conversion utils. + +Energy units: + ["kcal/mol", "kj/mol", "hartree", "ev"] + +Distance units: + ["ang", "nm", "bohr"] +""" + from typing import Callable from openqdc.utils.exceptions import ConversionAlreadyDefined, ConversionNotDefinedError @@ -6,11 +16,28 @@ class Conversion: + """ + Conversion from one unit system to another. + + Attributes + ---------- + name + A human-readable name for the conversion + fn: + The callable to compute the conversion + """ + def __init__(self, in_unit: str, out_unit: str, func: Callable[[float], float]): """ - Args: - name: A human-readable name for the metric - fn: The callable to actually compute the metric + + Parameters + ---------- + in_unit + String defining the units of the current values + out_unit + String defining the target units + func + The callable to compute the conversion """ name = "convert_" + in_unit.lower().strip() + "_to_" + out_unit.lower().strip() @@ -57,7 +84,7 @@ def get_conversion(in_unit: str, out_unit: str): Conversion("kj/mol", "kcal/mol", lambda x: x * 0.239006) Conversion("kj/mol", "hartree", lambda x: x * 0.000380879) -# bohr conversion +# distance conversions Conversion("bohr", "ang", lambda x: x * 0.52917721092) Conversion("ang", "bohr", lambda x: x / 0.52917721092) Conversion("ang", "nm", lambda x: x * 0.1) @@ -75,7 +102,7 @@ def get_conversion(in_unit: str, out_unit: str): Conversion("hartree/ang", "kcal/mol/ang", lambda x: get_conversion("hartree", "kcal/mol")(x)) Conversion("hartree/ang", "hartree/bohr", lambda x: get_conversion("bohr", "ang")(x)) Conversion("hartree/bohr", "hartree/ang", 
lambda x: get_conversion("ang", "bohr")(x)) -Conversion("kcal/mol/bohr", "Hartree/bohr", lambda x: get_conversion("kcal/mol", "hartree")(x)) +Conversion("kcal/mol/bohr", "hartree/bohr", lambda x: get_conversion("kcal/mol", "hartree")(x)) Conversion("ev/ang", "hartree/ang", lambda x: get_conversion("ev", "hartree")(x)) Conversion("ev/bohr", "hartree/bohr", lambda x: get_conversion("ev", "hartree")(x)) Conversion("ev/bohr", "ev/ang", lambda x: get_conversion("ang", "bohr")(x)) diff --git a/pyproject.toml b/pyproject.toml index dfd19c5..d0bc622 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -47,6 +47,7 @@ fallback_version = "dev" [tool.isort] profile = "black" +src_paths = ["openqdc"] [tool.setuptools.packages.find] where = ["."]