diff --git a/openqdc/__init__.py b/openqdc/__init__.py
index 4a000e4..5ced217 100644
--- a/openqdc/__init__.py
+++ b/openqdc/__init__.py
@@ -19,6 +19,7 @@ def get_project_root():
     "ANI1CCX": "openqdc.datasets.potential.ani",
     "ANI1CCX_V2": "openqdc.datasets.potential.ani",
     "ANI1X": "openqdc.datasets.potential.ani",
+    "ANI2": "openqdc.datasets.potential.ani",
     "Spice": "openqdc.datasets.potential.spice",
     "SpiceV2": "openqdc.datasets.potential.spice",
     "SpiceVL2": "openqdc.datasets.potential.spice",
@@ -100,7 +101,7 @@ def __dir__():
     from .datasets.interaction.metcalf import Metcalf
     from .datasets.interaction.splinter import Splinter
     from .datasets.interaction.x40 import X40
-    from .datasets.potential.ani import ANI1, ANI1CCX, ANI1CCX_V2, ANI1X
+    from .datasets.potential.ani import ANI1, ANI1CCX, ANI1CCX_V2, ANI1X, ANI2
     from .datasets.potential.comp6 import COMP6
     from .datasets.potential.dummy import Dummy
     from .datasets.potential.gdml import GDML
diff --git a/openqdc/datasets/potential/__init__.py b/openqdc/datasets/potential/__init__.py
index 5e473a8..0edbb8b 100644
--- a/openqdc/datasets/potential/__init__.py
+++ b/openqdc/datasets/potential/__init__.py
@@ -1,4 +1,4 @@
-from .ani import ANI1, ANI1CCX, ANI1CCX_V2, ANI1X
+from .ani import ANI1, ANI1CCX, ANI1CCX_V2, ANI1X, ANI2
 from .comp6 import COMP6
 from .dummy import Dummy
 from .gdml import GDML
@@ -24,6 +24,7 @@
     "ani1": ANI1,
     "ani1ccx": ANI1CCX,
     "ani1ccxv2": ANI1CCX_V2,
+    "ani2": ANI2,
     "ani1x": ANI1X,
     "comp6": COMP6,
     "gdml": GDML,
diff --git a/openqdc/datasets/potential/ani.py b/openqdc/datasets/potential/ani.py
index 70fc882..7a2e04d 100644
--- a/openqdc/datasets/potential/ani.py
+++ b/openqdc/datasets/potential/ani.py
@@ -1,12 +1,42 @@
 import os
 from os.path import join as p_join
 
+import numpy as np
+
 from openqdc.datasets.base import BaseDataset
 from openqdc.methods import PotentialMethod
-from openqdc.utils import read_qc_archive_h5
+from openqdc.utils import load_hdf5_file, read_qc_archive_h5
 from openqdc.utils.io import get_local_cache
 
 
+def read_ani2_h5(raw_path):
+    h5f = load_hdf5_file(raw_path)
+    samples = []
+    for _, props in h5f.items():
+        samples.append(extract_ani2_entries(props))
+    return samples
+
+
+def extract_ani2_entries(properties):
+    coordinates = properties["coordinates"]
+    species = properties["species"]
+    forces = properties["forces"]
+    energies = properties["energies"]
+    n_atoms = coordinates.shape[1]
+    n_entries = coordinates.shape[0]
+    flattened_coordinates = coordinates[:].reshape((-1, 3))
+    xs = np.stack((species[:].flatten(), np.zeros(flattened_coordinates.shape[0])), axis=-1)
+    res = dict(
+        name=np.array(["ANI2"] * n_entries),
+        subset=np.array([str(n_atoms)] * n_entries),
+        energies=energies[:].reshape((-1, 1)).astype(np.float64),
+        atomic_inputs=np.concatenate((xs, flattened_coordinates), axis=-1, dtype=np.float32),
+        n_atoms=np.array([n_atoms] * n_entries, dtype=np.int32),
+        forces=forces[:].reshape(-1, 3, 1).astype(np.float32),
+    )
+    return res
+
+
 class ANI1(BaseDataset):
     """
     The ANI-1 dataset is a collection of 22 x 10^6 structural conformations from 57,000 distinct small
@@ -176,3 +206,55 @@ class ANI1CCX_V2(ANI1CCX):
 
     __energy_methods__ = ANI1CCX.__energy_methods__ + [PotentialMethod.PM6, PotentialMethod.GFN2_XTB]
     energy_target_names = ANI1CCX.energy_target_names + ["PM6", "GFN2"]
+
+
+class ANI2(ANI1):
+    """
+    The ANI-2x dataset extends ANI-1x to the elements S, F and Cl in addition to H, C, N and O.
+    The release used here provides energies and forces computed at the wB97X/6-31G(d) level of
+    theory.
+    """
+
+    __name__ = "ani2"
+    __energy_unit__ = "hartree"
+    __distance_unit__ = "ang"
+    __forces_unit__ = "hartree/ang"
+
+    __energy_methods__ = [
+        # PotentialMethod.NONE,  # "b973c/def2mtzvp",
+        PotentialMethod.WB97X_6_31G_D,  # "wb97x/631gd", # PAPER DATASET
+        # PotentialMethod.NONE,  # "wb97md3bj/def2tzvpp",
+        # PotentialMethod.NONE,  # "wb97mv/def2tzvpp",
+        # PotentialMethod.NONE,  # "wb97x/def2tzvpp",
+    ]
+
+    energy_target_names = [
+        # "b973c/def2mtzvp",
+        "wb97x/631gd",
+        # "wb97md3bj/def2tzvpp",
+        # "wb97mv/def2tzvpp",
+        # "wb97x/def2tzvpp",
+    ]
+
+    force_target_names = ["wb97x/631gd"]  # "b973c/def2mtzvp",
+
+    __force_mask__ = [True]
+    __links__ = {  # "ANI-2x-B973c-def2mTZVP.tar.gz": "https://zenodo.org/records/10108942/files/ANI-2x-B973c-def2mTZVP.tar.gz?download=1",  # noqa
+        # "ANI-2x-wB97MD3BJ-def2TZVPP.tar.gz": "https://zenodo.org/records/10108942/files/ANI-2x-wB97MD3BJ-def2TZVPP.tar.gz?download=1",  # noqa
+        # "ANI-2x-wB97MV-def2TZVPP.tar.gz": "https://zenodo.org/records/10108942/files/ANI-2x-wB97MV-def2TZVPP.tar.gz?download=1",  # noqa
+        "ANI-2x-wB97X-631Gd.tar.gz": "https://zenodo.org/records/10108942/files/ANI-2x-wB97X-631Gd.tar.gz?download=1",  # noqa
+        # "ANI-2x-wB97X-def2TZVPP.tar.gz": "https://zenodo.org/records/10108942/files/ANI-2x-wB97X-def2TZVPP.tar.gz?download=1",  # noqa
+    }
+
+    def __smiles_converter__(self, x):
+        """util function to convert string to smiles: useful if the smiles is
+        encoded in a different format than its display format
+        """
+        return x
+
+    def read_raw_entries(self):
+        samples = []
+        for lvl_theory in self.__links__.keys():
+            raw_path = p_join(self.root, "final_h5", f"{lvl_theory.split('.')[0]}.h5")
+            samples.extend(read_ani2_h5(raw_path))
+        return samples
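
A minimal usage sketch of the new dataset class (not part of the patch). The import path follows the __init__ changes above; download and caching of the linked archive and the preprocessing via read_raw_entries() are assumed to be handled by BaseDataset, as for the other ANI datasets.

# Minimal usage sketch; constructor behaviour is an assumption based on BaseDataset.
from openqdc.datasets.potential import ANI2

dataset = ANI2()     # assumed to fetch and preprocess the wB97X/6-31G(d) release on first use
print(len(dataset))  # number of conformations, assuming BaseDataset implements __len__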