Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ANI2 #96

Merged
merged 1 commit into from
Jun 8, 2024
Merged

ANI2 #96

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion openqdc/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ def get_project_root():
"ANI1CCX": "openqdc.datasets.potential.ani",
"ANI1CCX_V2": "openqdc.datasets.potential.ani",
"ANI1X": "openqdc.datasets.potential.ani",
"ANI2": "openqdc.datasets.potential.ani",
"Spice": "openqdc.datasets.potential.spice",
"SpiceV2": "openqdc.datasets.potential.spice",
"SpiceVL2": "openqdc.datasets.potential.spice",
Expand Down Expand Up @@ -100,7 +101,7 @@ def __dir__():
from .datasets.interaction.metcalf import Metcalf
from .datasets.interaction.splinter import Splinter
from .datasets.interaction.x40 import X40
from .datasets.potential.ani import ANI1, ANI1CCX, ANI1CCX_V2, ANI1X
from .datasets.potential.ani import ANI1, ANI1CCX, ANI1CCX_V2, ANI1X, ANI2
from .datasets.potential.comp6 import COMP6
from .datasets.potential.dummy import Dummy
from .datasets.potential.gdml import GDML
Expand Down
3 changes: 2 additions & 1 deletion openqdc/datasets/potential/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from .ani import ANI1, ANI1CCX, ANI1CCX_V2, ANI1X
from .ani import ANI1, ANI1CCX, ANI1CCX_V2, ANI1X, ANI2
from .comp6 import COMP6
from .dummy import Dummy
from .gdml import GDML
Expand All @@ -24,6 +24,7 @@
"ani1": ANI1,
"ani1ccx": ANI1CCX,
"ani1ccxv2": ANI1CCX_V2,
"ani2": ANI2,
"ani1x": ANI1X,
"comp6": COMP6,
"gdml": GDML,
Expand Down
80 changes: 79 additions & 1 deletion openqdc/datasets/potential/ani.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,42 @@
import os
from os.path import join as p_join

import numpy as np

from openqdc.datasets.base import BaseDataset
from openqdc.methods import PotentialMethod
from openqdc.utils import read_qc_archive_h5
from openqdc.utils import load_hdf5_file, read_qc_archive_h5
from openqdc.utils.io import get_local_cache


def read_ani2_h5(raw_path):
    """Read an ANI2 HDF5 file and return one sample dict per top-level item.

    Parameters
    ----------
    raw_path:
        Location of the HDF5 file, passed unchanged to ``load_hdf5_file``.

    Returns
    -------
    list
        The results of ``extract_ani2_entries`` for every value yielded by
        ``.items()`` on the loaded file, in iteration order. The keys of the
        file are ignored.
    """
    archive = load_hdf5_file(raw_path)
    return [extract_ani2_entries(group) for _, group in archive.items()]


def extract_ani2_entries(properties):
    """Convert one group of ANI2 arrays into a flat dictionary of numpy arrays.

    Parameters
    ----------
    properties:
        Mapping that provides the keys ``"coordinates"``, ``"species"``,
        ``"forces"`` and ``"energies"``. ``coordinates`` must expose a
        ``shape`` whose first two entries are read as the number of entries
        and the number of atoms per entry. Every value must support ``[:]``.

    Returns
    -------
    dict
        ``name``          : the string ``"ANI2"`` repeated once per entry.
        ``subset``        : ``str(n_atoms)`` repeated once per entry.
        ``energies``      : float64 column vector, shape ``(-1, 1)``.
        ``atomic_inputs`` : float32 rows of
                            ``[species, 0, coord_0, coord_1, coord_2]``.
        ``n_atoms``       : int32 array holding ``n_atoms`` once per entry.
        ``forces``        : float32 array reshaped to ``(-1, 3, 1)``.
    """
    # Look the four datasets up first, in the same order as before, so that a
    # missing key is reported before any array work starts.
    coords = properties["coordinates"]
    atom_types = properties["species"]
    gradients = properties["forces"]
    total_energies = properties["energies"]

    n_entries, n_atoms = coords.shape[0], coords.shape[1]

    # One row of three components per atom, across all entries.
    positions = coords[:].reshape((-1, 3))

    # Pair every species value with a zero-filled second column.
    # NOTE(review): the zero column is presumably a per-atom charge
    # placeholder -- confirm against the consumer of ``atomic_inputs``.
    zero_column = np.zeros(positions.shape[0])
    species_block = np.stack((atom_types[:].flatten(), zero_column), axis=-1)

    subset_label = str(n_atoms)

    return {
        "name": np.array(n_entries * ["ANI2"]),
        "subset": np.array(n_entries * [subset_label]),
        "energies": total_energies[:].reshape((-1, 1)).astype(np.float64),
        "atomic_inputs": np.concatenate((species_block, positions), axis=-1, dtype=np.float32),
        "n_atoms": np.full(n_entries, n_atoms, dtype=np.int32),
        "forces": gradients[:].reshape(-1, 3, 1).astype(np.float32),
    }


class ANI1(BaseDataset):
"""
The ANI-1 dataset is a collection of 22 x 10^6 structural conformations from 57,000 distinct small
Expand Down Expand Up @@ -176,3 +206,51 @@ class ANI1CCX_V2(ANI1CCX):

__energy_methods__ = ANI1CCX.__energy_methods__ + [PotentialMethod.PM6, PotentialMethod.GFN2_XTB]
energy_target_names = ANI1CCX.energy_target_names + ["PM6", "GFN2"]


class ANI2(ANI1):
    """ANI2 dataset definition, built on top of :class:`ANI1`.

    Only the ``wb97x/631gd`` level of theory is enabled: it is the single
    entry in the energy methods, the energy targets, the force targets and
    the download links. The other archives published in the same Zenodo
    record are kept below as comments and are currently switched off.

    Energies are declared in hartree, distances in angstrom and forces in
    hartree/angstrom.
    """

    __name__ = "ani2"
    __energy_unit__ = "hartree"
    __distance_unit__ = "ang"
    __forces_unit__ = "hartree/ang"

    # Disabled levels of theory (no matching PotentialMethod is wired in):
    #   b973c/def2mtzvp, wb97md3bj/def2tzvpp, wb97mv/def2tzvpp, wb97x/def2tzvpp
    __energy_methods__ = [
        PotentialMethod.WB97X_6_31G_D,  # "wb97x/631gd", the paper dataset
    ]

    # Must stay aligned, entry by entry, with ``__energy_methods__``.
    energy_target_names = [
        "wb97x/631gd",
    ]

    force_target_names = ["wb97x/631gd"]

    __force_mask__ = [True]

    # Archive file name -> download URL. Commented entries are the disabled
    # levels of theory from the same record.
    __links__ = {
        # "ANI-2x-B973c-def2mTZVP.tar.gz": "https://zenodo.org/records/10108942/files/ANI-2x-B973c-def2mTZVP.tar.gz?download=1",  # noqa
        # "ANI-2x-wB97MD3BJ-def2TZVPP.tar.gz": "https://zenodo.org/records/10108942/files/ANI-2x-wB97MD3BJ-def2TZVPP.tar.gz?download=1",  # noqa
        # "ANI-2x-wB97MV-def2TZVPP.tar.gz": "https://zenodo.org/records/10108942/files/ANI-2x-wB97MV-def2TZVPP.tar.gz?download=1",  # noqa
        "ANI-2x-wB97X-631Gd.tar.gz": "https://zenodo.org/records/10108942/files/ANI-2x-wB97X-631Gd.tar.gz?download=1",  # noqa
        # "ANI-2x-wB97X-def2TZVPP.tar.gz": "https://zenodo.org/records/10108942/files/ANI-2x-wB97X-def2TZVPP.tar.gz?download=1",  # noqa
    }

    def __smiles_converter__(self, x):
        """Return ``x`` unchanged.

        Hook for converting a stored string into a SMILES string when the
        stored encoding differs from the display format; this class applies
        no conversion.
        """
        return x

    def read_raw_entries(self):
        """Collect the raw samples of every archive listed in ``__links__``.

        For each archive name, the text before its first ``.`` is used as the
        stem of an ``.h5`` file inside ``<self.root>/final_h5``; that file is
        read with ``read_ani2_h5`` and its samples are appended to the result.

        Returns
        -------
        list
            All samples, in the iteration order of ``__links__``.
        """
        return [
            sample
            for archive_name in self.__links__
            for sample in read_ani2_h5(
                p_join(self.root, "final_h5", f"{archive_name.partition('.')[0]}.h5")
            )
        ]
Loading