Skip to content

Commit

Permalink
Merge pull request #96 from OpenDrugDiscovery/ani2
Browse files Browse the repository at this point in the history
ANI2
  • Loading branch information
prtos authored Jun 8, 2024
2 parents 25e2a2f + a6fc1c0 commit 032aa71
Show file tree
Hide file tree
Showing 3 changed files with 83 additions and 3 deletions.
3 changes: 2 additions & 1 deletion openqdc/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ def get_project_root():
"ANI1CCX": "openqdc.datasets.potential.ani",
"ANI1CCX_V2": "openqdc.datasets.potential.ani",
"ANI1X": "openqdc.datasets.potential.ani",
"ANI2": "openqdc.datasets.potential.ani",
"Spice": "openqdc.datasets.potential.spice",
"SpiceV2": "openqdc.datasets.potential.spice",
"SpiceVL2": "openqdc.datasets.potential.spice",
Expand Down Expand Up @@ -100,7 +101,7 @@ def __dir__():
from .datasets.interaction.metcalf import Metcalf
from .datasets.interaction.splinter import Splinter
from .datasets.interaction.x40 import X40
from .datasets.potential.ani import ANI1, ANI1CCX, ANI1CCX_V2, ANI1X
from .datasets.potential.ani import ANI1, ANI1CCX, ANI1CCX_V2, ANI1X, ANI2
from .datasets.potential.comp6 import COMP6
from .datasets.potential.dummy import Dummy
from .datasets.potential.gdml import GDML
Expand Down
3 changes: 2 additions & 1 deletion openqdc/datasets/potential/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from .ani import ANI1, ANI1CCX, ANI1CCX_V2, ANI1X
from .ani import ANI1, ANI1CCX, ANI1CCX_V2, ANI1X, ANI2
from .comp6 import COMP6
from .dummy import Dummy
from .gdml import GDML
Expand All @@ -24,6 +24,7 @@
"ani1": ANI1,
"ani1ccx": ANI1CCX,
"ani1ccxv2": ANI1CCX_V2,
"ani2": ANI2,
"ani1x": ANI1X,
"comp6": COMP6,
"gdml": GDML,
Expand Down
80 changes: 79 additions & 1 deletion openqdc/datasets/potential/ani.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,42 @@
import os
from os.path import join as p_join

import numpy as np

from openqdc.datasets.base import BaseDataset
from openqdc.methods import PotentialMethod
from openqdc.utils import read_qc_archive_h5
from openqdc.utils import load_hdf5_file, read_qc_archive_h5
from openqdc.utils.io import get_local_cache


def read_ani2_h5(raw_path):
    """Read an ANI-2x HDF5 file and return one sample dict per top-level entry.

    The file at ``raw_path`` is opened with ``load_hdf5_file``; every value
    yielded by its ``items()`` is passed to ``extract_ani2_entries`` and the
    resulting dicts are collected, in iteration order, into a list.
    """
    archive = load_hdf5_file(raw_path)
    # Keys are not needed: only the per-entry property groups are converted.
    return [extract_ani2_entries(group) for _, group in archive.items()]


def extract_ani2_entries(properties):
    """Convert one group of ANI-2x conformers into a flat sample dictionary.

    Parameters
    ----------
    properties
        Mapping exposing sliceable ``"coordinates"``, ``"species"``,
        ``"forces"`` and ``"energies"`` entries (presumably an h5py group --
        confirm against ``load_hdf5_file``). ``coordinates`` must have the
        conformer axis first and the atom axis second; its flattened form is
        interpreted as rows of 3 values.

    Returns
    -------
    dict
        ``name``          -- ``"ANI2"`` repeated once per conformer.
        ``subset``        -- ``str(n_atoms)`` repeated once per conformer.
        ``energies``      -- float64 array reshaped to ``(-1, 1)``.
        ``atomic_inputs`` -- float32 array with 5 columns per atom:
                             species, a zero-filled column, then the 3
                             coordinates.
        ``n_atoms``       -- int32 array, ``n_atoms`` repeated per conformer.
        ``forces``        -- float32 array reshaped to ``(-1, 3, 1)``.
    """
    coordinates = properties["coordinates"]
    species = properties["species"]
    forces = properties["forces"]
    energies = properties["energies"]

    n_entries, n_atoms = coordinates.shape[:2]
    positions = coordinates[:].reshape((-1, 3))

    # One row per atom: the species value followed by a zero-filled column
    # (presumably the per-atom charge slot of ``atomic_inputs`` -- confirm
    # against the base dataset layout).
    species_and_zeros = np.stack(
        (species[:].flatten(), np.zeros(positions.shape[0])),
        axis=-1,
    )

    return dict(
        # np.full keeps a string dtype even when the group holds zero
        # conformers; np.array([...] * 0) would silently fall back to an empty
        # float64 array. For non-empty groups the result is identical.
        name=np.full(n_entries, "ANI2"),
        subset=np.full(n_entries, str(n_atoms)),
        energies=energies[:].reshape((-1, 1)).astype(np.float64),
        atomic_inputs=np.concatenate((species_and_zeros, positions), axis=-1, dtype=np.float32),
        n_atoms=np.full(n_entries, n_atoms, dtype=np.int32),
        forces=forces[:].reshape(-1, 3, 1).astype(np.float32),
    )


class ANI1(BaseDataset):
"""
The ANI-1 dataset is a collection of 22 x 10^6 structural conformations from 57,000 distinct small
Expand Down Expand Up @@ -176,3 +206,51 @@ class ANI1CCX_V2(ANI1CCX):

__energy_methods__ = ANI1CCX.__energy_methods__ + [PotentialMethod.PM6, PotentialMethod.GFN2_XTB]
energy_target_names = ANI1CCX.energy_target_names + ["PM6", "GFN2"]


class ANI2(ANI1):
    """
    ANI-2x dataset, restricted to the wB97X/6-31G(d) level of theory.

    The raw archive is listed in ``__links__`` and comes from Zenodo record
    10108942. The same record hosts further levels of theory; they are kept
    below as commented-out entries and are not loaded. Declared units are
    "hartree" for energies, "ang" for distances and "hartree/ang" for forces.
    """

    __name__ = "ani2"
    __energy_unit__ = "hartree"
    __distance_unit__ = "ang"
    __forces_unit__ = "hartree/ang"

    # Levels of theory available in the Zenodo record but currently disabled:
    #   "b973c/def2mtzvp", "wb97md3bj/def2tzvpp", "wb97mv/def2tzvpp",
    #   "wb97x/def2tzvpp" (each would map to PotentialMethod.NONE).
    # Only the dataset used in the paper is enabled.
    __energy_methods__ = [PotentialMethod.WB97X_6_31G_D]

    # Must stay aligned, entry by entry, with __energy_methods__.
    energy_target_names = ["wb97x/631gd"]

    # "b973c/def2mtzvp" forces are also disabled.
    force_target_names = ["wb97x/631gd"]

    __force_mask__ = [True]

    __links__ = {
        # "ANI-2x-B973c-def2mTZVP.tar.gz": "https://zenodo.org/records/10108942/files/ANI-2x-B973c-def2mTZVP.tar.gz?download=1", # noqa
        # "ANI-2x-wB97MD3BJ-def2TZVPP.tar.gz": "https://zenodo.org/records/10108942/files/ANI-2x-wB97MD3BJ-def2TZVPP.tar.gz?download=1", # noqa
        # "ANI-2x-wB97MV-def2TZVPP.tar.gz": "https://zenodo.org/records/10108942/files/ANI-2x-wB97MV-def2TZVPP.tar.gz?download=1", # noqa
        "ANI-2x-wB97X-631Gd.tar.gz": "https://zenodo.org/records/10108942/files/ANI-2x-wB97X-631Gd.tar.gz?download=1",  # noqa
        # "ANI-2x-wB97X-def2TZVPP.tar.gz": "https://zenodo.org/records/10108942/files/ANI-2x-wB97X-def2TZVPP.tar.gz?download=1", # noqa
    }

    def __smiles_converter__(self, x):
        """Return ``x`` unchanged.

        Hook for converting a stored string into a SMILES when the stored
        encoding differs from the display format; no conversion is applied
        here.
        """
        return x

    def read_raw_entries(self):
        """Load the samples of every archive listed in ``__links__``.

        For each archive name, the text before its first ``.`` is used as the
        stem of an ``.h5`` file expected under ``<root>/final_h5``; the sample
        dicts read from all files are returned as one flat list.
        """
        samples = []
        for archive_name in self.__links__:
            stem = archive_name.partition(".")[0]
            h5_path = p_join(self.root, "final_h5", f"{stem}.h5")
            samples += read_ani2_h5(h5_path)
        return samples

0 comments on commit 032aa71

Please sign in to comment.