Merge branch 'downloader_add' into issue_90

valence-labs · Jun 8, 2024 · 0566260 · 0566260
2 parents f0a499b + 032aa71
commit 0566260
Show file tree

Hide file tree

Showing 9 changed files with 101 additions and 20 deletions.
diff --git a/openqdc/__init__.py b/openqdc/__init__.py
@@ -19,6 +19,7 @@ def get_project_root():
     "ANI1CCX": "openqdc.datasets.potential.ani",
     "ANI1CCX_V2": "openqdc.datasets.potential.ani",
     "ANI1X": "openqdc.datasets.potential.ani",
+    "ANI2": "openqdc.datasets.potential.ani",
     "Spice": "openqdc.datasets.potential.spice",
     "SpiceV2": "openqdc.datasets.potential.spice",
     "SpiceVL2": "openqdc.datasets.potential.spice",
@@ -100,7 +101,7 @@ def __dir__():
     from .datasets.interaction.metcalf import Metcalf
     from .datasets.interaction.splinter import Splinter
     from .datasets.interaction.x40 import X40
-    from .datasets.potential.ani import ANI1, ANI1CCX, ANI1CCX_V2, ANI1X
+    from .datasets.potential.ani import ANI1, ANI1CCX, ANI1CCX_V2, ANI1X, ANI2
     from .datasets.potential.comp6 import COMP6
     from .datasets.potential.dummy import Dummy
     from .datasets.potential.gdml import GDML

diff --git a/openqdc/datasets/potential/__init__.py b/openqdc/datasets/potential/__init__.py
@@ -1,4 +1,4 @@
-from .ani import ANI1, ANI1CCX, ANI1CCX_V2, ANI1X
+from .ani import ANI1, ANI1CCX, ANI1CCX_V2, ANI1X, ANI2
 from .comp6 import COMP6
 from .dummy import Dummy
 from .gdml import GDML

diff --git a/openqdc/datasets/potential/ani.py b/openqdc/datasets/potential/ani.py
@@ -1,12 +1,42 @@
 import os
 from os.path import join as p_join
 
+import numpy as np
+
 from openqdc.datasets.base import BaseDataset
 from openqdc.methods import PotentialMethod
-from openqdc.utils import read_qc_archive_h5
+from openqdc.utils import load_hdf5_file, read_qc_archive_h5
 from openqdc.utils.io import get_local_cache
 
 
+def read_ani2_h5(raw_path):  # Build one sample dict per top-level entry of the HDF5 file at raw_path.
+    h5f = load_hdf5_file(raw_path)  # NOTE(review): the handle is never closed in this function — confirm load_hdf5_file's lifetime semantics
+    samples = []
+    for _, props in h5f.items():  # entry names are ignored; only the stored properties are used
+        samples.append(extract_ani2_entries(props))
+    return samples  # list of dicts in the layout built by extract_ani2_entries
+
+
+def extract_ani2_entries(properties):  # Flatten one HDF5 entry (coordinates/species/forces/energies) into a sample dict of numpy arrays.
+    coordinates = properties["coordinates"]  # assumes layout (n_conformers, n_atoms, 3) — TODO confirm against the raw files
+    species = properties["species"]
+    forces = properties["forces"]
+    energies = properties["energies"]
+    n_atoms = coordinates.shape[1]
+    n_entries = coordinates.shape[0]
+    flattened_coordinates = coordinates[:].reshape((-1, 3))  # one xyz row per atom, all conformers concatenated
+    xs = np.stack((species[:].flatten(), np.zeros(flattened_coordinates.shape[0])), axis=-1)  # two columns: species value, then zeros (presumably a charge placeholder — TODO confirm); np.stack needs one species value per coordinate row
+    res = dict(
+        name=np.array(["ANI2"] * n_entries),  # same constant label for every conformer
+        subset=np.array([str(n_atoms)] * n_entries),  # subset label is the atom count rendered as a string
+        energies=energies[:].reshape((-1, 1)).astype(np.float64),  # column vector: one energy per row
+        atomic_inputs=np.concatenate((xs, flattened_coordinates), axis=-1, dtype=np.float32),  # rows of [species, 0, x, y, z] cast to float32
+        n_atoms=np.array([n_atoms] * n_entries, dtype=np.int32),  # every conformer in this entry shares the same atom count
+        forces=forces[:].reshape(-1, 3, 1).astype(np.float32),  # per-atom force vectors with a trailing axis of length 1 (presumably one slot per force target — TODO confirm)
+    )
+    return res
+
+
 class ANI1(BaseDataset):
     """
     The ANI-1 dataset is a collection of 22 x 10^6 structural conformations from 57,000 distinct small
@@ -176,3 +206,51 @@ class ANI1CCX_V2(ANI1CCX):
 
     __energy_methods__ = ANI1CCX.__energy_methods__ + [PotentialMethod.PM6, PotentialMethod.GFN2_XTB]
     energy_target_names = ANI1CCX.energy_target_names + ["PM6", "GFN2"]
+
+
+class ANI2(ANI1):  # Extends ANI1; only the wB97X/6-31G(d) entries below are active, the other levels of theory are commented out.
+    """ANI-2x dataset definition: units, enabled energy/force targets, download links and raw-file reader."""
+
+    __name__ = "ani2"
+    __energy_unit__ = "hartree"
+    __distance_unit__ = "ang"
+    __forces_unit__ = "hartree/ang"
+
+    __energy_methods__ = [
+        # PotentialMethod.NONE,  # "b973c/def2mtzvp",
+        PotentialMethod.WB97X_6_31G_D,  # "wb97x/631gd", # PAPER DATASET
+        # PotentialMethod.NONE,  # "wb97md3bj/def2tzvpp",
+        # PotentialMethod.NONE,  # "wb97mv/def2tzvpp",
+        # PotentialMethod.NONE,  # "wb97x/def2tzvpp",
+    ]
+
+    energy_target_names = [
+        # "b973c/def2mtzvp",
+        "wb97x/631gd",
+        # "wb97md3bj/def2tzvpp",
+        # "wb97mv/def2tzvpp",
+        # "wb97x/def2tzvpp",
+    ]
+
+    force_target_names = ["wb97x/631gd"]  # "b973c/def2mtzvp",
+
+    __force_mask__ = [True]  # one flag for the single enabled method (presumably marks that it provides forces — TODO confirm)
+    __links__ = {  # "ANI-2x-B973c-def2mTZVP.tar.gz": "https://zenodo.org/records/10108942/files/ANI-2x-B973c-def2mTZVP.tar.gz?download=1",  # noqa
+        # "ANI-2x-wB97MD3BJ-def2TZVPP.tar.gz": "https://zenodo.org/records/10108942/files/ANI-2x-wB97MD3BJ-def2TZVPP.tar.gz?download=1", # noqa
+        # "ANI-2x-wB97MV-def2TZVPP.tar.gz": "https://zenodo.org/records/10108942/files/ANI-2x-wB97MV-def2TZVPP.tar.gz?download=1", # noqa
+        "ANI-2x-wB97X-631Gd.tar.gz": "https://zenodo.org/records/10108942/files/ANI-2x-wB97X-631Gd.tar.gz?download=1",  # noqa
+        # "ANI-2x-wB97X-def2TZVPP.tar.gz": "https://zenodo.org/records/10108942/files/ANI-2x-wB97X-def2TZVPP.tar.gz?download=1", # noqa
+    }
+
+    def __smiles_converter__(self, x):
+        """util function to convert string to smiles: useful if the smiles is
+        encoded in a different format than its display format
+        """
+        return x  # identity: the stored value is returned unchanged
+
+    def read_raw_entries(self):  # Read the samples of every archive listed in __links__.
+        samples = []
+        for lvl_theory in self.__links__.keys():  # keys are archive file names such as "ANI-2x-wB97X-631Gd.tar.gz"
+            raw_path = p_join(self.root, "final_h5", f"{lvl_theory.split('.')[0]}.h5")  # text before the first "." plus ".h5", under <root>/final_h5
+            samples.extend(read_ani2_h5(raw_path))  # one sample dict per HDF5 entry, appended in file order
+        return samples
diff --git a/openqdc/datasets/potential/comp6.py b/openqdc/datasets/potential/comp6.py
@@ -26,8 +26,8 @@ class COMP6(BaseDataset):
 
     # watchout that forces are stored as -grad(E)
     __energy_unit__ = "kcal/mol"
-    __distance_unit__ = "bohr"  # bohr
-    __forces_unit__ = "kcal/mol/bohr"
+    __distance_unit__ = "ang"  # angstrom
+    __forces_unit__ = "kcal/mol/ang"
 
     __energy_methods__ = [
         PotentialMethod.WB97X_6_31G_D,  # "wb97x/6-31g*",

diff --git a/openqdc/datasets/potential/gdml.py b/openqdc/datasets/potential/gdml.py
@@ -54,8 +54,8 @@ class GDML(BaseDataset):
     ]
 
     __energy_unit__ = "kcal/mol"
-    __distance_unit__ = "bohr"
-    __forces_unit__ = "kcal/mol/bohr"
+    __distance_unit__ = "ang"
+    __forces_unit__ = "kcal/mol/ang"
     __links__ = {
         "gdb7_9.hdf5.gz": "https://zenodo.org/record/3588361/files/208.hdf5.gz",
         "gdb10_13.hdf5.gz": "https://zenodo.org/record/3588364/files/209.hdf5.gz",

diff --git a/openqdc/datasets/potential/iso_17.py b/openqdc/datasets/potential/iso_17.py
@@ -40,8 +40,8 @@ class ISO17(BaseDataset):
     ]
 
     __energy_unit__ = "ev"
-    __distance_unit__ = "bohr"  # bohr
-    __forces_unit__ = "ev/bohr"
+    __distance_unit__ = "ang"
+    __forces_unit__ = "ev/ang"
     __links__ = {"iso_17.hdf5.gz": "https://zenodo.org/record/3585907/files/216.hdf5.gz"}
 
     def __smiles_converter__(self, x):

diff --git a/openqdc/datasets/potential/qm7x.py b/openqdc/datasets/potential/qm7x.py
@@ -57,7 +57,7 @@ class QM7X(BaseDataset):
 
     __energy_methods__ = [PotentialMethod.PBE0_DEF2_TZVP, PotentialMethod.DFT3B]  # "pbe0/def2-tzvp", "dft3b"]
 
-    energy_target_names = ["ePBE0", "eMBD"]
+    energy_target_names = ["ePBE0+MBD", "eDFTB+MBD"]
 
     __force_mask__ = [True, True]
 

diff --git a/openqdc/datasets/potential/solvated_peptides.py b/openqdc/datasets/potential/solvated_peptides.py
@@ -41,9 +41,9 @@ class SolvatedPeptides(BaseDataset):
     ]
 
     # TO CHECK
-    __energy_unit__ = "hartree"
-    __distance_unit__ = "bohr"
-    __forces_unit__ = "hartree/bohr"
+    __energy_unit__ = "ev"
+    __distance_unit__ = "ang"
+    __forces_unit__ = "ev/ang"
     __links__ = {"solvated_peptides.hdf5.gz": "https://zenodo.org/record/3585804/files/213.hdf5.gz"}
 
     def __smiles_converter__(self, x):

diff --git a/openqdc/datasets/potential/waterclusters3_30.py b/openqdc/datasets/potential/waterclusters3_30.py
@@ -1,3 +1,4 @@
+import zipfile
 from io import StringIO
 from os.path import join as p_join
 
@@ -8,8 +9,6 @@
 from openqdc.methods import PotentialMethod
 from openqdc.utils.constants import ATOM_TABLE, MAX_ATOMIC_NUMBER
 
-# we could use ase.io.read to read extxyz files
-
 
 def content_to_xyz(content, n_waters):
     content = content.strip()
@@ -80,12 +79,15 @@ class WaterClusters(BaseDataset):
 
     def read_raw_entries(self):
         samples = []
+        parent_folder = p_join(self.root, "W3-W30_all_geoms_TTM2.1-F/")  # folder holding the per-size archives and their extracted files
         for i in range(3, 31):
-            raw_path = p_join(self.root, f"W3-W30_all_geoms_TTM2.1-F/W{i}_geoms_all.xyz")
-            data = read_xyz(
-                raw_path,
-                i,
-            )
+            name = f"W{i}_geoms_all"  # shared stem of the .zip archive and the .xyz file for index i
+            zip_path = p_join(parent_folder, f"{name}.zip")
+            xyz_path = p_join(parent_folder, f"{name}.xyz")  # assumes the archive holds {name}.xyz at its top level — TODO confirm
+            with zipfile.ZipFile(zip_path, "r") as zip_ref:  # unpacked on every call; there is no check for an already-extracted .xyz
+                zip_ref.extractall(parent_folder)
+
+            data = read_xyz(xyz_path, i)  # i is forwarded as read_xyz's second argument
             samples += data
 
         return samples