valence-labs · prtos · Jun 8, 2024 · Jun 4, 2024
diff --git a/openqdc/__init__.py b/openqdc/__init__.py
@@ -19,6 +19,7 @@ def get_project_root():
     "ANI1CCX": "openqdc.datasets.potential.ani",
     "ANI1CCX_V2": "openqdc.datasets.potential.ani",
     "ANI1X": "openqdc.datasets.potential.ani",
+    "ANI2": "openqdc.datasets.potential.ani",
     "Spice": "openqdc.datasets.potential.spice",
     "SpiceV2": "openqdc.datasets.potential.spice",
     "SpiceVL2": "openqdc.datasets.potential.spice",
@@ -100,7 +101,7 @@ def __dir__():
     from .datasets.interaction.metcalf import Metcalf
     from .datasets.interaction.splinter import Splinter
     from .datasets.interaction.x40 import X40
-    from .datasets.potential.ani import ANI1, ANI1CCX, ANI1CCX_V2, ANI1X
+    from .datasets.potential.ani import ANI1, ANI1CCX, ANI1CCX_V2, ANI1X, ANI2
     from .datasets.potential.comp6 import COMP6
     from .datasets.potential.dummy import Dummy
     from .datasets.potential.gdml import GDML

diff --git a/openqdc/datasets/potential/__init__.py b/openqdc/datasets/potential/__init__.py
@@ -1,4 +1,4 @@
-from .ani import ANI1, ANI1CCX, ANI1CCX_V2, ANI1X
+from .ani import ANI1, ANI1CCX, ANI1CCX_V2, ANI1X, ANI2
 from .comp6 import COMP6
 from .dummy import Dummy
 from .gdml import GDML
@@ -24,6 +24,7 @@
     "ani1": ANI1,
     "ani1ccx": ANI1CCX,
     "ani1ccxv2": ANI1CCX_V2,
+    "ani2": ANI2,
     "ani1x": ANI1X,
     "comp6": COMP6,
     "gdml": GDML,

diff --git a/openqdc/datasets/potential/ani.py b/openqdc/datasets/potential/ani.py
@@ -1,12 +1,42 @@
 import os
 from os.path import join as p_join
 
+import numpy as np
+
 from openqdc.datasets.base import BaseDataset
 from openqdc.methods import PotentialMethod
-from openqdc.utils import read_qc_archive_h5
+from openqdc.utils import load_hdf5_file, read_qc_archive_h5
 from openqdc.utils.io import get_local_cache
 
 
+def read_ani2_h5(raw_path):
+    """Open the HDF5 file at `raw_path` and return a list with one sample dict per top-level item."""
+    # NOTE(review): the object returned by load_hdf5_file is never closed here; confirm whether
+    # that helper (or its caller) owns the handle before adding an explicit close.
+    archive = load_hdf5_file(raw_path)
+    return [extract_ani2_entries(group) for _, group in archive.items()]
+
+
+def extract_ani2_entries(properties):
+    """Convert one entry of the ANI-2 HDF5 file into a dict of flat per-conformer / per-atom arrays.
+
+    Reads "coordinates" (treated as (n_entries, n_atoms, 3)), "species", "forces" and "energies".
+    """
+    coords, species, forces, energies = (properties[k] for k in ("coordinates", "species", "forces", "energies"))
+    n_atoms, n_entries = coords.shape[1], coords.shape[0]
+    xyz = coords[:].reshape((-1, 3))
+    # Per-atom rows: species, an all-zero column (presumably charge — confirm), then x, y, z.
+    species_and_zero = np.column_stack((species[:].flatten(), np.zeros(xyz.shape[0])))
+    return {
+        "name": np.array(["ANI2"] * n_entries),
+        "subset": np.array([str(n_atoms)] * n_entries),
+        "energies": energies[:].reshape((-1, 1)).astype(np.float64),
+        "atomic_inputs": np.concatenate((species_and_zero, xyz), axis=-1, dtype=np.float32),
+        "n_atoms": np.full(n_entries, n_atoms, dtype=np.int32),
+        "forces": forces[:].reshape(-1, 3, 1).astype(np.float32),
+    }
+
+
 class ANI1(BaseDataset):
     """
     The ANI-1 dataset is a collection of 22 x 10^6 structural conformations from 57,000 distinct small
@@ -176,3 +206,51 @@ class ANI1CCX_V2(ANI1CCX):
 
     __energy_methods__ = ANI1CCX.__energy_methods__ + [PotentialMethod.PM6, PotentialMethod.GFN2_XTB]
     energy_target_names = ANI1CCX.energy_target_names + ["PM6", "GFN2"]
+
+
+class ANI2(ANI1):
+    """ANI-2x potential dataset restricted to the wB97X/6-31G(d) level of theory.
+
+    Only the "ANI-2x-wB97X-631Gd" archive is enabled; the other levels of theory published in the
+    same Zenodo record (10108942) are kept below as commented-out entries.
+    """
+
+    __name__ = "ani2"
+    __energy_unit__ = "hartree"
+    __distance_unit__ = "ang"
+    __forces_unit__ = "hartree/ang"
+
+    __energy_methods__ = [
+        # PotentialMethod.NONE,  # "b973c/def2mtzvp",
+        PotentialMethod.WB97X_6_31G_D,  # "wb97x/631gd", # PAPER DATASET
+        # PotentialMethod.NONE,  # "wb97md3bj/def2tzvpp",
+        # PotentialMethod.NONE,  # "wb97mv/def2tzvpp",
+        # PotentialMethod.NONE,  # "wb97x/def2tzvpp",
+    ]
+
+    energy_target_names = [
+        # "b973c/def2mtzvp",
+        "wb97x/631gd",
+        # "wb97md3bj/def2tzvpp",
+        # "wb97mv/def2tzvpp",
+        # "wb97x/def2tzvpp",
+    ]
+
+    force_target_names = ["wb97x/631gd"]  # "b973c/def2mtzvp",
+
+    __force_mask__ = [True]
+    __links__ = {  # "ANI-2x-B973c-def2mTZVP.tar.gz": "https://zenodo.org/records/10108942/files/ANI-2x-B973c-def2mTZVP.tar.gz?download=1",  # noqa
+        # "ANI-2x-wB97MD3BJ-def2TZVPP.tar.gz": "https://zenodo.org/records/10108942/files/ANI-2x-wB97MD3BJ-def2TZVPP.tar.gz?download=1", # noqa
+        # "ANI-2x-wB97MV-def2TZVPP.tar.gz": "https://zenodo.org/records/10108942/files/ANI-2x-wB97MV-def2TZVPP.tar.gz?download=1", # noqa
+        "ANI-2x-wB97X-631Gd.tar.gz": "https://zenodo.org/records/10108942/files/ANI-2x-wB97X-631Gd.tar.gz?download=1",  # noqa
+        # "ANI-2x-wB97X-def2TZVPP.tar.gz": "https://zenodo.org/records/10108942/files/ANI-2x-wB97X-def2TZVPP.tar.gz?download=1", # noqa
+    }
+
+    def __smiles_converter__(self, x):
+        """Identity conversion: return `x` as-is, with no decoding."""
+        return x
+
+    def read_raw_entries(self):
+        """Read each archive named in `__links__` from `<root>/final_h5/<stem>.h5` into one flat sample list."""
+        h5_paths = (p_join(self.root, "final_h5", f"{fname.split('.')[0]}.h5") for fname in self.__links__)
+        return [sample for h5_path in h5_paths for sample in read_ani2_h5(h5_path)]