diff --git a/README.md b/README.md index cd3f517..ffaf7a4 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,25 @@ -# openQDC - -Open Quantum Data Commons +
+ +
+ +

+ openQDC - Open Quantum Data Commons
+

+

+ + Docs + | + + Homepage + +

+ +--- + +[![license](https://licensebuttons.net/l/by-nc/4.0/80x15.png)](https://github.com/valence-labs/openQDC/blob/main/LICENSE) ### Installing openQDC + ```bash git clone git@github.com:OpenDrugDiscovery/openQDC.git cd openQDC @@ -57,41 +74,55 @@ We provide support for the following publicly available QM Potential Energy Data # Potential Energy -| Dataset | # Molecules | # Conformers | Average Conformers per Molecule | Force Labels | Atom Types | QM Level of Theory | Off-Equilibrium Conformations| -| --- | --- | --- | --- | --- | --- | --- | --- | -| [ANI](https://pubs.rsc.org/en/content/articlelanding/2017/SC/C6SC05720A) | 57,462 | 20,000,000 | 348 | No | 4 | ωB97x:6-31G(d) | Yes | -| [GEOM](https://www.nature.com/articles/s41597-022-01288-4) | 450,000 | 37,000,000 | 82 | No | 18 | GFN2-xTB | No | -| [Molecule3D](https://arxiv.org/abs/2110.01717) | 3,899,647 | 3,899,647 | 1 | No | 5 | B3LYP/6-31G* | No | -| [NablaDFT](https://pubs.rsc.org/en/content/articlelanding/2022/CP/D2CP03966D) | 1,000,000 | 5,000,000 | 5 | No | 6 | ωB97X-D/def2-SVP | | -| [OrbNet Denali](https://arxiv.org/abs/2107.00299) | 212,905 | 2,300,000 | 11 | No | 16 | GFN1-xTB | Yes | -| [PCQM_PM6](https://pubs.acs.org/doi/abs/10.1021/acs.jcim.0c00740) | | | 1| No| | PM6 | No -| [PCQM_B3LYP](https://arxiv.org/abs/2305.18454) | 85,938,443|85,938,443 | 1| No| | B3LYP/6-31G* | No -| [QMugs](https://www.nature.com/articles/s41597-022-01390-7) | 665,000 | 2,000,000 | 3 | No | 10 | GFN2-xTB, ωB97X-D/def2-SVP | No | -| [QM7X](https://www.nature.com/articles/s41597-021-00812-2) | 6,950 | 4,195,237 | 603 | Yes | 7 | PBE0+MBD | Yes | -| [SN2RXN](https://pubs.acs.org/doi/10.1021/acs.jctc.9b00181) | 39 | 452709 | 11,600 | Yes | 6 | DSD-BLYP-D3(BJ)/def2-TZVP | | -| [SolvatedPeptides](https://doi.org/10.1021/acs.jctc.9b00181) | | 2,731,180 | | Yes | | revPBE-D3(BJ)/def2-TZVP | | -| [Spice](https://arxiv.org/abs/2209.10702) | 19,238 | 1,132,808 | 59 | Yes | 15 | ωB97M-D3(BJ)/def2-TZVPPD | Yes | -| 
[tmQM](https://pubs.acs.org/doi/10.1021/acs.jcim.0c01041) | 86,665 | 86,665| 1| No | | TPSSh-D3BJ/def2-SVP | | -| [Transition1X](https://www.nature.com/articles/s41597-022-01870-w) | | 9,654,813| | Yes | | ωB97x/6–31 G(d) | Yes | -| [WaterClusters](https://doi.org/10.1063/1.5128378) | 1 | 4,464,740| | No | 2 | TTM2.1-F | Yes| - +| Dataset | # Molecules | # Conformers | Average Conformers per Molecule | Force Labels | Atom Types | QM Level of Theory | Off-Equilibrium Conformations | +| ----------------------------------------------------------------------------- | ----------- | ------------ | ------------------------------- | ------------ | ---------- | -------------------------- | ----------------------------- | +| [ANI](https://pubs.rsc.org/en/content/articlelanding/2017/SC/C6SC05720A) | 57,462 | 20,000,000 | 348 | No | 4 | ωB97x:6-31G(d) | Yes | +| [GEOM](https://www.nature.com/articles/s41597-022-01288-4) | 450,000 | 37,000,000 | 82 | No | 18 | GFN2-xTB | No | +| [Molecule3D](https://arxiv.org/abs/2110.01717) | 3,899,647 | 3,899,647 | 1 | No | 5 | B3LYP/6-31G\* | No | +| [NablaDFT](https://pubs.rsc.org/en/content/articlelanding/2022/CP/D2CP03966D) | 1,000,000 | 5,000,000 | 5 | No | 6 | ωB97X-D/def2-SVP | | +| [OrbNet Denali](https://arxiv.org/abs/2107.00299) | 212,905 | 2,300,000 | 11 | No | 16 | GFN1-xTB | Yes | +| [PCQM_PM6](https://pubs.acs.org/doi/abs/10.1021/acs.jcim.0c00740) | | | 1 | No | | PM6 | No | +| [PCQM_B3LYP](https://arxiv.org/abs/2305.18454) | 85,938,443 | 85,938,443 | 1 | No | | B3LYP/6-31G\* | No | +| [QMugs](https://www.nature.com/articles/s41597-022-01390-7) | 665,000 | 2,000,000 | 3 | No | 10 | GFN2-xTB, ωB97X-D/def2-SVP | No | +| [QM7X](https://www.nature.com/articles/s41597-021-00812-2) | 6,950 | 4,195,237 | 603 | Yes | 7 | PBE0+MBD | Yes | +| [SN2RXN](https://pubs.acs.org/doi/10.1021/acs.jctc.9b00181) | 39 | 452709 | 11,600 | Yes | 6 | DSD-BLYP-D3(BJ)/def2-TZVP | | +| [SolvatedPeptides](https://doi.org/10.1021/acs.jctc.9b00181) | | 
2,731,180 | | Yes | | revPBE-D3(BJ)/def2-TZVP | | +| [Spice](https://arxiv.org/abs/2209.10702) | 19,238 | 1,132,808 | 59 | Yes | 15 | ωB97M-D3(BJ)/def2-TZVPPD | Yes | +| [tmQM](https://pubs.acs.org/doi/10.1021/acs.jcim.0c01041) | 86,665 | 86,665 | 1 | No | | TPSSh-D3BJ/def2-SVP | | +| [Transition1X](https://www.nature.com/articles/s41597-022-01870-w) | | 9,654,813 | | Yes | | ωB97x/6–31 G(d) | Yes | +| [WaterClusters](https://doi.org/10.1063/1.5128378) | 1 | 4,464,740 | | No | 2 | TTM2.1-F | Yes | # Interaction energy We also provide support for the following publicly available QM Noncovalent Interaction Energy Datasets. -| Dataset | -| --- | -| [DES370K](https://www.nature.com/articles/s41597-021-00833-x) | -| [DES5M](https://www.nature.com/articles/s41597-021-00833-x) | +| Dataset | +| ------------------------------------------------------------------------------------------------------------------- | +| [DES370K](https://www.nature.com/articles/s41597-021-00833-x) | +| [DES5M](https://www.nature.com/articles/s41597-021-00833-x) | | [Metcalf](https://pubs.aip.org/aip/jcp/article/152/7/074103/1059677/Approaches-for-machine-learning-intermolecular) | -| [DESS66](https://www.nature.com/articles/s41597-021-00833-x) | -| [DESS66x8](https://www.nature.com/articles/s41597-021-00833-x) | -| [Splinter](https://www.nature.com/articles/s41597-023-02443-1) | -| [X40](https://pubs.acs.org/doi/10.1021/ct300647k) | -| [L7](https://pubs.acs.org/doi/10.1021/ct400036b) | +| [DESS66](https://www.nature.com/articles/s41597-021-00833-x) | +| [DESS66x8](https://www.nature.com/articles/s41597-021-00833-x) | +| [Splinter](https://www.nature.com/articles/s41597-023-02443-1) | +| [X40](https://pubs.acs.org/doi/10.1021/ct300647k) | +| [L7](https://pubs.acs.org/doi/10.1021/ct400036b) | + +# CI Status + +The CI runs tests and performs code quality checks for the following combinations: + +- The three major platforms: Windows, OSX and Linux. +- The four latest Python versions. 
+ +| | `main` | +| --------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| Lib build & Testing | [![test](https://github.com/valence-labs/openQDC/actions/workflows/test.yml/badge.svg)](https://github.com/valence-labs/openQDC/actions/workflows/test.yml) | +| Code Sanity (linting and type analysis) | [![code-check](https://github.com/valence-labs/openQDC/actions/workflows/code-check.yml/badge.svg)](https://github.com/valence-labs/openQDC/actions/workflows/code-check.yml) | +| Documentation Build | [![doc](https://github.com/valence-labs/openQDC/actions/workflows/doc.yml/badge.svg)](https://github.com/valence-labs/openQDC/actions/workflows/doc.yml) | +| Pre-Commit | [![pre-commit](https://github.com/valence-labs/openQDC/actions/workflows/pre-commit-ci.yml/badge.svg)](https://github.com/valence-labs/openQDC/actions/workflows/pre-commit-ci.yml) | # How to cite + All data presented in the OpenQDC are already published in scientific journals, full reference to the respective paper is attached to each dataset class. When citing data obtained from OpenQDC, you should cite both the original paper(s) the data come from and our paper on OpenQDC itself. 
The reference is: ADD REF HERE LATER diff --git a/docs/API/e0_dispatcher.md b/docs/API/e0_dispatcher.md new file mode 100644 index 0000000..e70b60a --- /dev/null +++ b/docs/API/e0_dispatcher.md @@ -0,0 +1 @@ +::: openqdc.datasets.energies diff --git a/docs/API/properties.md b/docs/API/properties.md new file mode 100644 index 0000000..3b6238b --- /dev/null +++ b/docs/API/properties.md @@ -0,0 +1,3 @@ +# Defined properties for datasets + +::: openqdc.datasets.properties diff --git a/docs/API/statistics.md b/docs/API/statistics.md new file mode 100644 index 0000000..b073963 --- /dev/null +++ b/docs/API/statistics.md @@ -0,0 +1 @@ +::: openqdc.datasets.statistics diff --git a/docs/assets/logo-title.png b/docs/assets/logo-title.png new file mode 100644 index 0000000..959eaaa Binary files /dev/null and b/docs/assets/logo-title.png differ diff --git a/docs/index.md b/docs/index.md index db497b1..65b1ea9 100644 --- a/docs/index.md +++ b/docs/index.md @@ -9,17 +9,17 @@ OpenQDC is a python library to work with quantum datasets. It's a package aimed - 🧠 Performance matters: read and write multiple formats (memmap, zarr, xyz, etc). - 📈 Data: have access to 1.5+ billion datapoints -Visit our website at TOFILL . +Visit our website at https://openqdc.io . ## Installation Use mamba: ```bash -mamba install -c conda-forge openqdc +conda install -c conda-forge openqdc ``` -_**Tips:** You can replace `mamba` by `conda`._ +_**Tips:** You can replace `conda` by `mamba`._ _**Note:** We highly recommend using a [Conda Python distribution](https://github.com/conda-forge/miniforge) to install OpenQDC. The package is also pip installable if you need it: `pip install openqdc`._ @@ -58,7 +58,7 @@ dataset.calculate_descriptors( ## How to cite -Please cite OpenQDC if you use it in your research: [![DOI](zenodo_badge)](zenodo_link). +Please cite OpenQDC if you use it in your research (publication pending). 
## Compatibilities diff --git a/docs/usage.md b/docs/usage.md index af62f45..02874c4 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -37,6 +37,14 @@ for data in dataset.as_iter(atoms=True): break ``` +or if you want to just iterate over the data: + +```python +for data in dataset: + print(data) # dict of arrays + break +``` + ## Lazy loading OpenQDC uses lazy loading to dynamically expose all its API without imposing a long import time during `import openqdc as qdc`. In case of trouble you can always disable lazy loading by setting the environment variable `OPENQDC_DISABLE_LAZY_LOADING` to `1`. diff --git a/env.yml b/env.yml index 87a9cca..e8be694 100644 --- a/env.yml +++ b/env.yml @@ -14,6 +14,7 @@ dependencies: - s3fs - pydantic - python-dotenv + - httpx # Scientific diff --git a/mkdocs.yml b/mkdocs.yml index 67d7b04..cdeb8cb 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -1,8 +1,8 @@ site_name: "OpenQDC" -site_description: "I don't know... Something about data and Quantum stuff I guess :D" +site_description: "Harness the power of quantum chemistry in one line of code." 
repo_url: "https://github.com/valence-labs/openQDC" repo_name: "openQDC" -copyright: Copyright 2023 Valence Labs +copyright: Copyright 2024 Valence Labs site_url: "https://github.com/valence-labs/openQDC" remote_branch: "gh-pages" @@ -25,7 +25,11 @@ nav: - API: - QM methods: API/methods.md - Normalization regressor: API/regressor.md - - Main class: API/basedataset.md + - Main classes: + - BaseDataset: API/basedataset.md + - Available Properties: API/properties.md + - e0 Dispatcher: API/e0_dispatcher.md + - Statistics: API/statistics.md - Format loading: API/formats.md - Datasets: - Potential Energy: diff --git a/openqdc/__init__.py b/openqdc/__init__.py index 051aeef..56ba46c 100644 --- a/openqdc/__init__.py +++ b/openqdc/__init__.py @@ -21,6 +21,7 @@ def get_project_root(): "ANI1CCX_V2": "openqdc.datasets.potential.ani", "ANI1X": "openqdc.datasets.potential.ani", "ANI2X": "openqdc.datasets.potential.ani", + "BPA": "openqdc.datasets.potential.bpa", "Spice": "openqdc.datasets.potential.spice", "SpiceV2": "openqdc.datasets.potential.spice", "SpiceVL2": "openqdc.datasets.potential.spice", @@ -118,6 +119,7 @@ def __dir__(): # POTENTIAL from .datasets.potential.alchemy import Alchemy from .datasets.potential.ani import ANI1, ANI1CCX, ANI1CCX_V2, ANI1X, ANI2X + from .datasets.potential.bpa import BPA from .datasets.potential.comp6 import COMP6 from .datasets.potential.dummy import Dummy, PredefinedDataset from .datasets.potential.gdml import GDML diff --git a/openqdc/datasets/base.py b/openqdc/datasets/base.py index 8a48012..1e0c716 100644 --- a/openqdc/datasets/base.py +++ b/openqdc/datasets/base.py @@ -237,7 +237,13 @@ def force_methods(self): return list(compress(self.energy_methods, self.force_mask)) @property - def e0s_dispatcher(self): + def e0s_dispatcher(self) -> AtomEnergies: + """ + Property to get the object that dispatched the isolated atom energies of the QM methods. + + Returns: + Object wrapping the isolated atom energies of the QM methods. 
+ """ if not hasattr(self, "_e0s_dispatcher"): # Automatically fetch/compute formation or regression energies self._e0s_dispatcher = AtomEnergies(self, **self.regressor_kwargs) diff --git a/openqdc/datasets/energies.py b/openqdc/datasets/energies.py index 676f65c..4ca61c8 100644 --- a/openqdc/datasets/energies.py +++ b/openqdc/datasets/energies.py @@ -1,7 +1,7 @@ from abc import ABC, abstractmethod from dataclasses import dataclass, field from os.path import join as p_join -from typing import Dict, Union +from typing import Any, Dict, Optional, Tuple, Union import numpy as np from loguru import logger @@ -14,22 +14,24 @@ POSSIBLE_ENERGIES = ["formation", "regression", "null"] -def dispatch_factory(data, **kwargs) -> "IsolatedEnergyInterface": +def dispatch_factory(data: Any, **kwargs: Dict) -> "IsolatedEnergyInterface": """ Factory function that select the correct energy class for the fetching/calculation of isolated atom energies. - Parameters - ---------- - data : openqdc.datasets.Dataset - Dataset object that contains the information - about the isolated atom energies. Info will be passed - by references - kwargs : dict - Additional arguments that will be passed to the - selected energy class. Mostly used for regression - to pass the regressor_kwargs. + Parameters: + data : openqdc.datasets.Dataset + Dataset object that contains the information + about the isolated atom energies. Info will be passed + by references + kwargs : dict + Additional arguments that will be passed to the + selected energy class. Mostly used for regression + to pass the regressor_kwargs. + + Returns: + Initialized IsolatedEnergyInterface-like object """ if data.energy_type == "formation": return PhysicalEnergy(data, **kwargs) @@ -100,26 +102,16 @@ class AtomEnergies: """ def __init__(self, data, **kwargs) -> None: - """ - Parameters - ---------- - data : openqdc.datasets.Dataset - Dataset object that contains the information - about the isolated atom energies. 
Info will be passed - by references - kwargs : dict - Additional arguments that will be passed to the - selected energy class. Mostly used for regression - to pass the regressor_kwargs. - """ - self.atom_energies = data.energy_type self.factory = dispatch_factory(data, **kwargs) @property def e0s_matrix(self) -> np.ndarray: """ - Returns the isolated atom energies matrixes + Return the isolated atom energies matrix + + Returns: + Matrix Array with the isolated atom energies """ return self.factory.e0_matrix @@ -127,6 +119,9 @@ def e0s_matrix(self) -> np.ndarray: def e0s_dict(self) -> Dict[AtomSpecies, AtomEnergy]: """ Return the isolated atom energies dictionary + + Returns: + Dictionary with the isolated atom energies """ return self.factory.e0_dict @@ -142,10 +137,18 @@ def __getitem__(self, item: AtomSpecies) -> AtomEnergy: Item can be written as tuple(Symbol, charge), tuple(Chemical number, charge). If no charge is passed, it will be automatically set to 0. + Examples: - AtomEnergies[6], AtomEnergies[6,1], - AtomEnergies["C",1], AtomEnergies[(6,1)] + AtomEnergies[6], AtomEnergies[6,1], \n + AtomEnergies["C",1], AtomEnergies[(6,1)], \n AtomEnergies[("C,1)] + + Parameters: + item: + AtomSpecies object or tuple with the atom symbol and charge + + Returns: + AtomEnergy object with the isolated atom energy """ try: atom, charge = item[0], item[1] @@ -168,16 +171,15 @@ class IsolatedEnergyInterface(ABC): def __init__(self, data, **kwargs): """ - Parameters - ---------- - data : openqdc.datasets.Dataset - Dataset object that contains the information - about the isolated atom energies. Info will be passed - by references - kwargs : dict - Additional arguments that will be passed to the - selected energy class. Mostly used for regression - to pass the regressor_kwargs. + Parameters: + data : openqdc.datasets.Dataset + Dataset object that contains the information + about the isolated atom energies. 
Info will be passed + by references + kwargs : dict + Additional arguments that will be passed to the + selected energy class. Mostly used for regression + to pass the regressor_kwargs. """ self._e0_matrixs = [] self._e0_dict = None @@ -204,6 +206,9 @@ def __len__(self): def e0_matrix(self) -> np.ndarray: """ Return the isolated atom energies matrixes + + Returns: + Matrix Array with the isolated atom energies """ return np.array(self._e0_matrixs) @@ -211,6 +216,9 @@ def e0_matrix(self) -> np.ndarray: def e0_dict(self) -> Dict: """ Return the isolated atom energies dict + + Returns: + Dictionary with the isolated atom energies """ return self._e0s_dict @@ -276,11 +284,15 @@ def _post_init(self): self._set_lin_atom_species_dict(E0s, cov) self._set_linear_e0s() - def _compute_regression_e0s(self): + def _compute_regression_e0s(self) -> Tuple[np.ndarray, Optional[np.ndarray]]: """ Try to compute the regressed isolated atom energies. raise an error if the regression fails. return the regressed isolated atom energies and the uncertainty values. + + Returns: + Tuple with the regressed isolated atom energies and the uncertainty values of the regression + if available. """ try: E0s, cov = self.regressor.solve() @@ -305,7 +317,7 @@ def _set_lin_atom_species_dict(self, E0s, covs) -> None: def _set_linear_e0s(self) -> None: """ Transform the e0s dictionary into the correct e0s - matrix format + matrix format. 
""" new_e0s = [np.zeros((max(self.data.numbers) + 1, MAX_CHARGE_NUMBER)) for _ in range(len(self))] for z, e0 in self._e0s_dict.items(): diff --git a/openqdc/datasets/potential/__init__.py b/openqdc/datasets/potential/__init__.py index e0207b7..7beba8c 100644 --- a/openqdc/datasets/potential/__init__.py +++ b/openqdc/datasets/potential/__init__.py @@ -1,5 +1,6 @@ from .alchemy import Alchemy from .ani import ANI1, ANI1CCX, ANI1CCX_V2, ANI1X, ANI2X +from .bpa import BPA from .comp6 import COMP6 from .dummy import Dummy, PredefinedDataset from .gdml import GDML @@ -34,6 +35,7 @@ "ANI1CCX_V2": ANI1CCX_V2, "ANI1X": ANI1X, "ANI2X": ANI2X, + "BPA": BPA, "COMP6": COMP6, "GDML": GDML, "GEOM": GEOM, diff --git a/openqdc/datasets/potential/ani.py b/openqdc/datasets/potential/ani.py index 81f106f..ed43aae 100644 --- a/openqdc/datasets/potential/ani.py +++ b/openqdc/datasets/potential/ani.py @@ -60,7 +60,7 @@ class ANI1(BaseDataset): __name__ = "ani1" __energy_methods__ = [ - PotentialMethod.WB97X_6_31G_D, # "wb97x/6-31g(d)" + PotentialMethod.WB97X_6_31G_D, ] energy_target_names = [ @@ -121,14 +121,14 @@ class ANI1X(ANI1): __forces_unit__ = "hartree/ang" __energy_methods__ = [ - PotentialMethod.NONE, # "hf/cc-pvdz", - PotentialMethod.NONE, # "hf/cc-pvqz", - PotentialMethod.NONE, # "hf/cc-pvtz", - PotentialMethod.NONE, # "mp2/cc-pvdz", - PotentialMethod.NONE, # "mp2/cc-pvqz", - PotentialMethod.NONE, # "mp2/cc-pvtz", - PotentialMethod.NONE, # "wb97x/6-31g(d)", - PotentialMethod.NONE, # "wb97x/cc-pvtz", + PotentialMethod.HF_CC_PVDZ, + PotentialMethod.HF_CC_PVQZ, + PotentialMethod.HF_CC_PVTZ, + PotentialMethod.MP2_CC_PVDZ, + PotentialMethod.MP2_CC_PVQZ, + PotentialMethod.MP2_CC_PVTZ, + PotentialMethod.WB97X_6_31G_D, + PotentialMethod.WB97X_CC_PVTZ, ] energy_target_names = [ @@ -179,10 +179,10 @@ class ANI1CCX(ANI1): __forces_unit__ = "hartree/ang" __energy_methods__ = [ - PotentialMethod.NONE, # "ccsd(t)/cbs", - PotentialMethod.NONE, # "ccsd(t)/cc-pvdz", - PotentialMethod.NONE, # 
"ccsd(t)/cc-pvtz", - PotentialMethod.NONE, # "tccsd(t)/cc-pvdz", + PotentialMethod.CCSD_T_CBS, # "ccsd(t)/cbs", + PotentialMethod.CCSD_T_CC_PVDZ, # "ccsd(t)/cc-pvdz", + PotentialMethod.CCSD_T_CC_PVTZ, # "ccsd(t)/cc-pvtz", + PotentialMethod.TCSSD_T_CC_PVDZ, # "tccsd(t)/cc-pvdz", ] energy_target_names = [ diff --git a/openqdc/datasets/potential/bpa.py b/openqdc/datasets/potential/bpa.py new file mode 100644 index 0000000..1681710 --- /dev/null +++ b/openqdc/datasets/potential/bpa.py @@ -0,0 +1,74 @@ +from typing import Any, Dict, List + +import numpy as np +from ase.atoms import Atoms + +from openqdc import BaseDataset +from openqdc.methods import PotentialMethod + + +def read_bpa_record(subset: str, atoms: Atoms) -> Dict[str, Any]: + return dict( + name=np.array([str(atoms.symbols)]), + subset=subset, + energies=np.array([atoms.get_potential_energy()], dtype=np.float64), + forces=atoms.get_forces().reshape(-1, 3, 1).astype(np.float32), + atomic_inputs=np.column_stack((atoms.numbers, atoms.get_initial_charges(), atoms.positions)).astype(np.float32), + n_atoms=np.array([len(atoms)], dtype=np.int32), + split=np.array([subset.item().split("_")[0]]), + ) + + +class BPA(BaseDataset): + """ + BPA (or 3BPA) dataset is a dataset consisting of a flexible druglike + molecule 3-(benzyloxy)pyridin-2-amine. This dataset features + complex dihedral potential energy surface with many local minima, + which can be challenging to approximate using classical or ML force fields. + The configuration were sampled from short (0.5 ps) MD simulations using the ANI-1x force field to + perturb the toward lower potential energies. Furthermore, long 25 ps MD simulation were performed at + three different temperatures (300, 600, and 1200 K) using the Langevin thermostat and a 1 fs time step. + The final configurations were re-evaluated using ORCA at the DFT level of + theory using the ωB97X exchange correlation functional and the 6-31G(d) basis set. 
+ + Usage: + ```python + from openqdc.datasets import BPA + dataset = BPA() + ``` + + + References: + https://pubs.acs.org/doi/10.1021/acs.jctc.1c00647 + """ + + __name__ = "BPA" + __energy_unit__ = "ev" + __forces_unit__ = "ev/ang" + __distance_unit__ = "ang" + __force_mask__ = [True] + __energy_methods__ = [PotentialMethod.WB97X_6_31G_D] + __links__ = {"BPA.zip": "https://figshare.com/ndownloader/files/31325990"} + + def read_raw_entries(self) -> List[Dict]: + import os.path as osp + from glob import glob + + from ase.io import iread + + files = glob(osp.join(self.root, "dataset_3BPA", "*.xyz")) + files = [f for f in files if "iso_atoms.xyz" not in f] + all_records = [] + + for file in files: + subset = np.array([osp.basename(file).split(".")[0]]) + + for atoms in iread(file, format="extxyz"): + all_records.append(read_bpa_record(subset, atoms)) + + return all_records + + def __getitem__(self, idx): + data = super().__getitem__(idx) + data.__setattr__("split", self._convert_array(self.data["split"][idx])) + return data diff --git a/openqdc/datasets/potential/proteinfragments.py b/openqdc/datasets/potential/proteinfragments.py index d628975..337c988 100644 --- a/openqdc/datasets/potential/proteinfragments.py +++ b/openqdc/datasets/potential/proteinfragments.py @@ -120,13 +120,13 @@ class ProteinFragments(BaseDataset): __name__ = "proteinfragments" # PBE0/def2-TZVPP+MBD __energy_methods__ = [ - PotentialMethod.WB97X_6_31G_D, # "wb97x/6-31g(d)" + PotentialMethod.PBE0_MBD_DEF2_TZVPP, ] energy_target_names = [ - "ωB97x:6-31G(d) Energy", + "PBE0+MBD/def2-TZVPP", ] - # PBE0/def2-TZVPP+MBD + __energy_unit__ = "ev" __distance_unit__ = "ang" __forces_unit__ = "ev/ang" diff --git a/openqdc/datasets/potential/vqm24.py b/openqdc/datasets/potential/vqm24.py index 1710e1d..aca19ae 100644 --- a/openqdc/datasets/potential/vqm24.py +++ b/openqdc/datasets/potential/vqm24.py @@ -59,11 +59,11 @@ class VQM24(BaseDataset): __name__ = "vqm24" __energy_methods__ = [ - 
PotentialMethod.WB97X_6_31G_D, # "wb97x/6-31g(d)" + PotentialMethod.WB97X_D3_CC_PVDZ, # "wB97x-D3/cc-pVDZ." ] energy_target_names = [ - "ωB97x:6-31G(d) Energy", + "wB97x-D3/cc-pVDZ", ] # ωB97X-D3/cc-pVDZ __energy_unit__ = "hartree" diff --git a/openqdc/datasets/properties.py b/openqdc/datasets/properties.py index 81bc568..9874330 100644 --- a/openqdc/datasets/properties.py +++ b/openqdc/datasets/properties.py @@ -1,3 +1,5 @@ +from typing import Tuple + import numpy as np import pandas as pd @@ -29,35 +31,62 @@ def _compute_average_nb_atoms(self): self.__average_nb_atoms__ = np.mean(self.data["n_atoms"]) @property - def average_n_atoms(self): + def average_n_atoms(self) -> int: """ Average number of atoms in a molecule in the dataset. + + Returns: + Average number of atoms in a molecule in the dataset. """ if self.__average_nb_atoms__ is None: raise StatisticsNotAvailableError(self.__name__) return self.__average_nb_atoms__ @property - def numbers(self): + def numbers(self) -> np.ndarray: + """ + Unique atomic numbers in the dataset + + Returns: + Array of the unique atomic numbers in the dataset + """ if hasattr(self, "_numbers"): return self._numbers self._numbers = pd.unique(self.data["atomic_inputs"][..., 0]).astype(np.int32) return self._numbers @property - def charges(self): + def charges(self) -> np.ndarray: + """ + Unique charges in the dataset + + Returns: + Array of the unique charges in the dataset + """ if hasattr(self, "_charges"): return self._charges self._charges = np.unique(self.data["atomic_inputs"][..., :2], axis=0).astype(np.int32) return self._charges @property - def min_max_charges(self): + def min_max_charges(self) -> Tuple[int, int]: + """ + Minimum and maximum charges in the dataset + + Returns: + (min_charge, max_charge) + """ if hasattr(self, "_min_max_charges"): return self._min_max_charges self._min_max_charges = np.min(self.charges[:, 1]), np.max(self.charges[:, 1]) return self._min_max_charges @property - def chemical_species(self): + 
def chemical_species(self) -> np.ndarray: + """ + Chemical symbols in the dataset + + Returns: + Array of the chemical symbols in the dataset + """ return np.array(ATOM_SYMBOLS)[self.numbers] diff --git a/openqdc/datasets/statistics.py b/openqdc/datasets/statistics.py index 6b1adeb..8b9ae13 100644 --- a/openqdc/datasets/statistics.py +++ b/openqdc/datasets/statistics.py @@ -2,7 +2,7 @@ from copy import deepcopy from dataclasses import asdict, dataclass from os.path import join as p_join -from typing import Callable, Dict, Optional +from typing import Any, Callable, Dict, Optional import numpy as np from loguru import logger @@ -16,15 +16,22 @@ class StatisticsResults: to provide general methods. """ - def to_dict(self): + def to_dict(self) -> Dict: """ Convert the class to a dictionary + + Returns: + Dictionary representation of the class """ return asdict(self) def transform(self, func: Callable): """ Apply a function to all the attributes of the class + + Parameters: + func: + Function to apply to the attributes """ for k, v in self.to_dict().items(): if v is not None: @@ -56,18 +63,19 @@ class ForceStatistics(StatisticsResults): class StatisticManager: """ - Manager class to share the state between all + Manager class that automatically handle the shared state between the statistic calculators """ - def __init__(self, dataset, recompute: bool = False, *statistic_calculators: "AbstractStatsCalculator"): + def __init__(self, dataset: Any, recompute: bool = False, *statistic_calculators: "AbstractStatsCalculator"): """ - dataset : openqdc.datasets.base.BaseDataset - The dataset object to compute the statistics - recompute : bool, default = False - Flag to recompute the statistics - *statistic_calculators : AbstractStatsCalculator - statistic calculators to run + Parameters: + dataset : openqdc.datasets.base.BaseDataset + The dataset object to compute the statistics + recompute: + Flag to recompute the statistics + *statistic_calculators: + List of statistic 
calculators to run """ self._state = {} self._results = {} @@ -80,6 +88,9 @@ def __init__(self, dataset, recompute: bool = False, *statistic_calculators: "Ab def state(self) -> Dict: """ Return the dictionary state of the manager + + Returns: + State of the StatisticManager """ return self._state @@ -95,25 +106,40 @@ def reset_results(self): """ self._results = {} - def get_state(self, key: Optional[str] = None): + def get_state(self, key: Optional[str] = None) -> Optional[Any]: """ - key : str, default = None Return the value of the key in the state dictionary + + Parameters: + key: str, default = None + Returns: + the value of the key in the state dictionary or the whole state dictionary if key is None """ if key is None: return self._state return self._state.get(key, None) - def has_state(self, key: str): + def has_state(self, key: str) -> bool: """ Check is state has key + + Parameters: + key: + Key to check in the state dictionary + + Returns: + True if the key is in the state dictionary """ return key in self._state def get_results(self, as_dict: bool = False): """ Aggregate results from all the calculators + + Parameters: + as_dict: + Flag to return the results as a dictionary """ results = deepcopy(self._results) if as_dict: @@ -155,26 +181,27 @@ def __init__( forces: Optional[np.ndarray] = None, ): """ - name : str - Name of the dataset for saving and loading. - energy_type : str, default = None - Type of the energy for the computation of the statistics. Used for loading and saving. 
- force_recompute : bool, default = False - Flag to force the recomputation of the statistics - energies : np.ndarray, default = None - Energies of the dataset - n_atoms : np.ndarray, default = None - Number of atoms in the dataset - atom_species : np.ndarray, default = None - Atomic species of the dataset - position_idx_range : np.ndarray, default = None - Position index range of the dataset - e0_matrix : np.ndarray, default = None - Isolated atom energies matrix of the dataset - atom_charges : np.ndarray, default = None - Atomic charges of the dataset - forces : np.ndarray, default = None - Forces of the dataset + Parameters: + name : + Name of the dataset for saving and loading. + energy_type : + Type of the energy for the computation of the statistics. Used for loading and saving. + force_recompute : + Flag to force the recomputation of the statistics + energies : + Energies of the dataset + n_atoms : + Number of atoms in the dataset + atom_species : + Atomic species of the dataset + position_idx_range : + Position index range of the dataset + e0_matrix : + Isolated atom energies matrix of the dataset + atom_charges : + Atomic charges of the dataset + forces : + Forces of the dataset """ self.name = name self.energy_type = energy_type diff --git a/openqdc/methods/enums.py b/openqdc/methods/enums.py index a4b958b..dfdb795 100644 --- a/openqdc/methods/enums.py +++ b/openqdc/methods/enums.py @@ -71,6 +71,7 @@ class CORRECTION(StrEnum): D4 = "d4" # Grimmes -D4 correction (we don t have any so feel free to not add this one) GCP = "gcp" # Geometrical Counter-Poise Correction CP = "cp" # Counter-Poise Correction + MBD = "mbd" # Many-Body Dispersion Correction/vdw-TS correction VWN = "vwn" # VWN5 = "vwn5" # NONE = "" @@ -119,6 +120,7 @@ class Functional(Enum): HCTH_407 = "hcth-407" HCTH_93 = "hcth-93" HF = "hf" + HF_R2SCAN_DC4 = "hf-r2scan-dc4" KCIS_MODIFIED = "kcis-modified" KCIS_ORIGINAL = "kcis-original" KMLYP_VWN5 = "kmlyp", CORRECTION.VWN5 @@ -153,23 +155,42 
@@ class Functional(Enum): PBE_D = "pbe", CORRECTION.D PBE_D3_BJ = "pbe", CORRECTION.D3BJ PBE0 = "pbe0" + PBE0_MBD = "pbe0+mbd" PBESOL = "pbesol" PKZB = "pkzb" PKZBX_KCISCOR = "pkzbx-kciscor" PM6 = "pm6" PW91 = "pw91" QCISDT = "qcisd(t)" + R2_SCAN = "r2Scan" + R2_SCAN_HF = "r2Scan@hf" + R2_SCAN_R2_SCAN50 = "r2Scan@r2Scan50" + R2_SCAN50 = "r2Scan50" + R2_SCAN100 = "r2Scan100" + R2_SCAN10 = "r2Scan10" + R2_SCAN20 = "r2Scan20" + R2_SCAN25 = "r2Scan25" + R2_SCAN30 = "r2Scan30" + R2_SCAN40 = "r2Scan40" + R2_SCAN60 = "r2Scan60" + R2_SCAN70 = "r2Scan70" + R2_SCAN80 = "r2Scan80" + R2_SCAN90 = "r2Scan90" REVPBE = "revpbe" REVPBE_D3_BJ = "revpbe", CORRECTION.D3BJ REVTPSS = "revtpss" RGE2 = "rge2" RPBE = "rpbe" SAPT0 = "sapt0" + SCAN = "scan" + SCAN_HF = "scan@hf" + SCAN_R2SCAN50 = "scan@r2scan50" SSB_D = "ssb", CORRECTION.D SVWN = "svwn" TMGGA = "t-mgga" TAU_HCTH = "tau-hcth" TAU_HCTH_HYBRID = "tau-hcth-hybrid" + TCSSD_T = "tccsd(t)" TPSS = "tpss" TPSSD = "tpss", CORRECTION.D TPSSH = "tpssh" @@ -295,8 +316,11 @@ class PotentialMethod(QmMethod): # SPLIT FOR INTERACTIO ENERGIES AND FIX MD1 BP86_D_DZP = Functional.BP86_D, BasisSet.DZP BP86_D_SZ = Functional.BP86_D, BasisSet.SZ BP86_D_TZP = Functional.BP86_D, BasisSet.TZP + CCSD_T_CBS = Functional.CCSDT, BasisSet.CBS + CCSD_T_CC_PVTZ = Functional.CCSDT, BasisSet.CC_PVTZ CCSD_T_CC_PVDZ = Functional.CCSDT, BasisSet.CC_PVDZ CCSD_CC_PVDZ = Functional.CCSD, BasisSet.CC_PVDZ + DFT3B = Functional.DFT3B, BasisSet.NONE DSD_BLYP_D3_BJ_DEF2_TZVP = Functional.DSD_BLYP_D3_BJ, BasisSet.DEF2_TZVP FT97_DZP = Functional.FT97, BasisSet.DZP @@ -317,6 +341,18 @@ class PotentialMethod(QmMethod): # SPLIT FOR INTERACTIO ENERGIES AND FIX MD1 HCTH_93_SZ = Functional.HCTH_93, BasisSet.SZ HCTH_93_TZP = Functional.HCTH_93, BasisSet.TZP HF_DEF2_TZVP = Functional.HF, BasisSet.DEF2_TZVP + HF_CC_PVDZ = ( + Functional.HF, + BasisSet.CC_PVDZ, + ) + HF_CC_PVQZ = ( + Functional.HF, + BasisSet.CC_PVQZ, + ) + HF_CC_PVTZ = ( + Functional.HF, + BasisSet.CC_PVTZ, + ) 
KCIS_MODIFIED_DZP = Functional.KCIS_MODIFIED, BasisSet.DZP KCIS_MODIFIED_SZ = Functional.KCIS_MODIFIED, BasisSet.SZ KCIS_MODIFIED_TZP = Functional.KCIS_MODIFIED, BasisSet.TZP @@ -350,6 +386,9 @@ class PotentialMethod(QmMethod): # SPLIT FOR INTERACTIO ENERGIES AND FIX MD1 M06_DZP = Functional.M06, BasisSet.DZP M06_SZ = Functional.M06, BasisSet.SZ M06_TZP = Functional.M06, BasisSet.TZP + MP2_CC_PVDZ = Functional.MP2, BasisSet.CC_PVDZ + MP2_CC_PVQZ = Functional.MP2, BasisSet.CC_PVQZ + MP2_CC_PVTZ = Functional.MP2, BasisSet.CC_PVTZ MPBE_DZP = Functional.MPBE, BasisSet.DZP MPBE_SZ = Functional.MPBE, BasisSet.SZ MPBE_TZP = Functional.MPBE, BasisSet.TZP @@ -407,6 +446,7 @@ class PotentialMethod(QmMethod): # SPLIT FOR INTERACTIO ENERGIES AND FIX MD1 PBE0_DEF2_TZVP = Functional.PBE0, BasisSet.DEF2_TZVP PBE0_SZ = Functional.PBE0, BasisSet.SZ PBE0_TZP = Functional.PBE0, BasisSet.TZP + PBE0_MBD_DEF2_TZVPP = Functional.PBE0_MBD, BasisSet.DEF2_TZVPPD PBESOL_DZP = Functional.PBESOL, BasisSet.DZP PBESOL_SZ = Functional.PBESOL, BasisSet.SZ PBESOL_TZP = Functional.PBESOL, BasisSet.TZP @@ -446,6 +486,7 @@ class PotentialMethod(QmMethod): # SPLIT FOR INTERACTIO ENERGIES AND FIX MD1 TAU_HCTH_DZP = Functional.TAU_HCTH, BasisSet.DZP TAU_HCTH_SZ = Functional.TAU_HCTH, BasisSet.SZ TAU_HCTH_TZP = Functional.TAU_HCTH, BasisSet.TZP + TCSSD_T_CC_PVDZ = Functional.TCSSD_T, BasisSet.CC_PVDZ TPSSD_DZP = Functional.TPSSD, BasisSet.DZP TPSSD_SZ = Functional.TPSSD, BasisSet.SZ TPSSD_TZP = Functional.TPSSD, BasisSet.TZP @@ -469,7 +510,9 @@ class PotentialMethod(QmMethod): # SPLIT FOR INTERACTIO ENERGIES AND FIX MD1 WB97M_D3BJ_DEF2_TZVPPD = Functional.WB97M_D3BJ, BasisSet.DEF2_TZVPPD WB97X_D_DEF2_SVP = Functional.WB97X_D, BasisSet.DEF2_SVP WB97X_D3_DEF2_TZVP = Functional.WB97X_D3, BasisSet.DEF2_TZVP + WB97X_D3_CC_PVDZ = Functional.WB97X_D3, BasisSet.CC_PVDZ WB97X_6_31G_D = Functional.WB97X, BasisSet.GSTAR + WB97X_CC_PVTZ = Functional.WB97X, BasisSet.CC_PVTZ X3LYP_VWN5_DZP = Functional.X3LYP_VWN5, 
BasisSet.DZP X3LYP_VWN5_SZ = Functional.X3LYP_VWN5, BasisSet.SZ X3LYP_VWN5_TZP = Functional.X3LYP_VWN5, BasisSet.TZP diff --git a/pyproject.toml b/pyproject.toml index 37e9d08..96a955d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -49,6 +49,7 @@ dependencies = [ "zarr", "python-dotenv", "s3fs", + "httpx", ]