Skip to content

Commit

Permalink
Merge branch 'downloader_add' into issue_90
Browse files Browse the repository at this point in the history
  • Loading branch information
prtos authored Jun 8, 2024
2 parents f0a499b + 032aa71 commit 0566260
Show file tree
Hide file tree
Showing 9 changed files with 101 additions and 20 deletions.
3 changes: 2 additions & 1 deletion openqdc/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ def get_project_root():
"ANI1CCX": "openqdc.datasets.potential.ani",
"ANI1CCX_V2": "openqdc.datasets.potential.ani",
"ANI1X": "openqdc.datasets.potential.ani",
"ANI2": "openqdc.datasets.potential.ani",
"Spice": "openqdc.datasets.potential.spice",
"SpiceV2": "openqdc.datasets.potential.spice",
"SpiceVL2": "openqdc.datasets.potential.spice",
Expand Down Expand Up @@ -100,7 +101,7 @@ def __dir__():
from .datasets.interaction.metcalf import Metcalf
from .datasets.interaction.splinter import Splinter
from .datasets.interaction.x40 import X40
from .datasets.potential.ani import ANI1, ANI1CCX, ANI1CCX_V2, ANI1X
from .datasets.potential.ani import ANI1, ANI1CCX, ANI1CCX_V2, ANI1X, ANI2
from .datasets.potential.comp6 import COMP6
from .datasets.potential.dummy import Dummy
from .datasets.potential.gdml import GDML
Expand Down
2 changes: 1 addition & 1 deletion openqdc/datasets/potential/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from .ani import ANI1, ANI1CCX, ANI1CCX_V2, ANI1X
from .ani import ANI1, ANI1CCX, ANI1CCX_V2, ANI1X, ANI2
from .comp6 import COMP6
from .dummy import Dummy
from .gdml import GDML
Expand Down
80 changes: 79 additions & 1 deletion openqdc/datasets/potential/ani.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,42 @@
import os
from os.path import join as p_join

import numpy as np

from openqdc.datasets.base import BaseDataset
from openqdc.methods import PotentialMethod
from openqdc.utils import read_qc_archive_h5
from openqdc.utils import load_hdf5_file, read_qc_archive_h5
from openqdc.utils.io import get_local_cache


def read_ani2_h5(raw_path):
    """Load an ANI-2 HDF5 file and convert each top-level entry to a sample dict.

    Parameters
    ----------
    raw_path:
        Path handed unchanged to ``load_hdf5_file``.

    Returns
    -------
    list
        One ``extract_ani2_entries`` result per top-level entry of the file,
        in the file's own iteration order.
    """
    # NOTE(review): the object returned by load_hdf5_file is never closed here;
    # confirm whether the helper (or its caller) owns the handle's lifetime.
    h5_file = load_hdf5_file(raw_path)
    # The top-level keys are not used; only the property groups matter.
    return [extract_ani2_entries(group) for group in h5_file.values()]


def extract_ani2_entries(properties):
    """Flatten one ANI-2 property group into the per-conformer sample dict.

    Parameters
    ----------
    properties:
        Mapping with the keys ``"coordinates"``, ``"species"``, ``"forces"``
        and ``"energies"``. Each value must support ``[:]`` slicing that yields
        a numpy array; ``coordinates`` must also expose ``.shape``.
        (assumes coordinates is (n_entries, n_atoms, 3) and species is
        (n_entries, n_atoms) -- TODO confirm against the raw files)

    Returns
    -------
    dict
        ``name``          -- ``"ANI2"`` repeated once per entry
        ``subset``        -- ``str(n_atoms)`` repeated once per entry
        ``energies``      -- float64, reshaped to ``(-1, 1)``
        ``atomic_inputs`` -- float32 rows of ``[species, 0.0, x, y, z]``
        ``n_atoms``       -- int32, ``n_atoms`` repeated once per entry
        ``forces``        -- float32, reshaped to ``(-1, 3, 1)``
    """
    coords = properties["coordinates"]
    atomic_numbers = properties["species"]
    force_data = properties["forces"]
    energy_data = properties["energies"]

    n_entries, n_atoms = coords.shape[0], coords.shape[1]

    # One row per atom across all entries.
    flat_xyz = coords[:].reshape((-1, 3))
    flat_species = atomic_numbers[:].flatten()
    # The second column is filled with zeros for every atom.
    zero_column = np.zeros(flat_xyz.shape[0])
    species_and_zeros = np.stack([flat_species, zero_column], axis=-1)
    atomic_inputs = np.concatenate([species_and_zeros, flat_xyz], axis=-1, dtype=np.float32)

    return {
        "name": np.array(["ANI2" for _ in range(n_entries)]),
        "subset": np.array([str(n_atoms) for _ in range(n_entries)]),
        "energies": energy_data[:].reshape((-1, 1)).astype(np.float64),
        "atomic_inputs": atomic_inputs,
        "n_atoms": np.full(n_entries, n_atoms, dtype=np.int32),
        "forces": force_data[:].reshape(-1, 3, 1).astype(np.float32),
    }


class ANI1(BaseDataset):
"""
The ANI-1 dataset is a collection of 22 x 10^6 structural conformations from 57,000 distinct small
Expand Down Expand Up @@ -176,3 +206,51 @@ class ANI1CCX_V2(ANI1CCX):

__energy_methods__ = ANI1CCX.__energy_methods__ + [PotentialMethod.PM6, PotentialMethod.GFN2_XTB]
energy_target_names = ANI1CCX.energy_target_names + ["PM6", "GFN2"]


class ANI2(ANI1):
    """
    ANI-2x dataset definition built on top of :class:`ANI1`.

    Only the wB97X/6-31G(d) level of theory is enabled: it is the single active
    entry in ``__energy_methods__``, ``energy_target_names``,
    ``force_target_names`` and ``__links__``. The other levels of theory from
    the same Zenodo record are kept as commented-out entries.

    Raw entries are read from
    ``<root>/final_h5/<archive name up to its first dot>.h5``.
    """

    __name__ = "ani2"
    __energy_unit__ = "hartree"
    __distance_unit__ = "ang"
    __forces_unit__ = "hartree/ang"

    # Levels of theory: only the active line is loaded.
    __energy_methods__ = [
        # PotentialMethod.NONE,  # "b973c/def2mtzvp",
        PotentialMethod.WB97X_6_31G_D,  # "wb97x/631gd",  # PAPER DATASET
        # PotentialMethod.NONE,  # "wb97md3bj/def2tzvpp",
        # PotentialMethod.NONE,  # "wb97mv/def2tzvpp",
        # PotentialMethod.NONE,  # "wb97x/def2tzvpp",
    ]

    # Same ordering as __energy_methods__.
    energy_target_names = [
        # "b973c/def2mtzvp",
        "wb97x/631gd",
        # "wb97md3bj/def2tzvpp",
        # "wb97mv/def2tzvpp",
        # "wb97x/def2tzvpp",
    ]

    force_target_names = ["wb97x/631gd"]  # "b973c/def2mtzvp",

    # NOTE(review): presumably one flag per energy method -- confirm against BaseDataset.
    __force_mask__ = [True]

    # Archive file name -> download URL (all from Zenodo record 10108942).
    __links__ = {
        # "ANI-2x-B973c-def2mTZVP.tar.gz": "https://zenodo.org/records/10108942/files/ANI-2x-B973c-def2mTZVP.tar.gz?download=1",  # noqa
        # "ANI-2x-wB97MD3BJ-def2TZVPP.tar.gz": "https://zenodo.org/records/10108942/files/ANI-2x-wB97MD3BJ-def2TZVPP.tar.gz?download=1",  # noqa
        # "ANI-2x-wB97MV-def2TZVPP.tar.gz": "https://zenodo.org/records/10108942/files/ANI-2x-wB97MV-def2TZVPP.tar.gz?download=1",  # noqa
        "ANI-2x-wB97X-631Gd.tar.gz": "https://zenodo.org/records/10108942/files/ANI-2x-wB97X-631Gd.tar.gz?download=1",  # noqa
        # "ANI-2x-wB97X-def2TZVPP.tar.gz": "https://zenodo.org/records/10108942/files/ANI-2x-wB97X-def2TZVPP.tar.gz?download=1",  # noqa
    }

    def __smiles_converter__(self, x):
        """Return ``x`` unchanged.

        Hook for datasets whose stored smiles encoding differs from the display
        format; no conversion is applied here.
        """
        return x

    def read_raw_entries(self):
        """Read every archive listed in ``__links__`` from its ``.h5`` file.

        For each archive name, the text before the first ``"."`` is used as the
        stem of an ``.h5`` file under ``<root>/final_h5``; that file is parsed
        with ``read_ani2_h5`` and its samples are appended to the result.

        Returns
        -------
        list
            Concatenation of the ``read_ani2_h5`` results, in ``__links__`` order.
        """
        collected = []
        for archive_name in self.__links__:
            # "ANI-2x-wB97X-631Gd.tar.gz" -> "ANI-2x-wB97X-631Gd"
            stem = archive_name.split(".")[0]
            h5_path = p_join(self.root, "final_h5", f"{stem}.h5")
            collected.extend(read_ani2_h5(h5_path))
        return collected
4 changes: 2 additions & 2 deletions openqdc/datasets/potential/comp6.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,8 @@ class COMP6(BaseDataset):

# Watch out: forces are stored as -grad(E)
__energy_unit__ = "kcal/mol"
__distance_unit__ = "bohr" # bohr
__forces_unit__ = "kcal/mol/bohr"
__distance_unit__ = "ang" # angstrom
__forces_unit__ = "kcal/mol/ang"

__energy_methods__ = [
PotentialMethod.WB97X_6_31G_D, # "wb97x/6-31g*",
Expand Down
4 changes: 2 additions & 2 deletions openqdc/datasets/potential/gdml.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,8 +54,8 @@ class GDML(BaseDataset):
]

__energy_unit__ = "kcal/mol"
__distance_unit__ = "bohr"
__forces_unit__ = "kcal/mol/bohr"
__distance_unit__ = "ang"
__forces_unit__ = "kcal/mol/ang"
__links__ = {
"gdb7_9.hdf5.gz": "https://zenodo.org/record/3588361/files/208.hdf5.gz",
"gdb10_13.hdf5.gz": "https://zenodo.org/record/3588364/files/209.hdf5.gz",
Expand Down
4 changes: 2 additions & 2 deletions openqdc/datasets/potential/iso_17.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,8 @@ class ISO17(BaseDataset):
]

__energy_unit__ = "ev"
__distance_unit__ = "bohr" # bohr
__forces_unit__ = "ev/bohr"
__distance_unit__ = "ang"
__forces_unit__ = "ev/ang"
__links__ = {"iso_17.hdf5.gz": "https://zenodo.org/record/3585907/files/216.hdf5.gz"}

def __smiles_converter__(self, x):
Expand Down
2 changes: 1 addition & 1 deletion openqdc/datasets/potential/qm7x.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ class QM7X(BaseDataset):

__energy_methods__ = [PotentialMethod.PBE0_DEF2_TZVP, PotentialMethod.DFT3B] # "pbe0/def2-tzvp", "dft3b"]

energy_target_names = ["ePBE0", "eMBD"]
energy_target_names = ["ePBE0+MBD", "eDFTB+MBD"]

__force_mask__ = [True, True]

Expand Down
6 changes: 3 additions & 3 deletions openqdc/datasets/potential/solvated_peptides.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,9 +41,9 @@ class SolvatedPeptides(BaseDataset):
]

# TO CHECK
__energy_unit__ = "hartree"
__distance_unit__ = "bohr"
__forces_unit__ = "hartree/bohr"
__energy_unit__ = "ev"
__distance_unit__ = "ang"
__forces_unit__ = "ev/ang"
__links__ = {"solvated_peptides.hdf5.gz": "https://zenodo.org/record/3585804/files/213.hdf5.gz"}

def __smiles_converter__(self, x):
Expand Down
16 changes: 9 additions & 7 deletions openqdc/datasets/potential/waterclusters3_30.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import zipfile
from io import StringIO
from os.path import join as p_join

Expand All @@ -8,8 +9,6 @@
from openqdc.methods import PotentialMethod
from openqdc.utils.constants import ATOM_TABLE, MAX_ATOMIC_NUMBER

# we could use ase.io.read to read extxyz files


def content_to_xyz(content, n_waters):
content = content.strip()
Expand Down Expand Up @@ -80,12 +79,15 @@ class WaterClusters(BaseDataset):

def read_raw_entries(self):
    """Extract and parse the W3..W30 water-cluster geometry archives.

    For each ``i`` in 3..30, ``W{i}_geoms_all.zip`` under
    ``<root>/W3-W30_all_geoms_TTM2.1-F/`` is extracted into that same folder
    and the resulting ``W{i}_geoms_all.xyz`` is parsed with ``read_xyz``.

    The ``.xyz`` file is read only after its archive has been extracted.
    Reading it beforehand fails when only the ``.zip`` is present, and
    otherwise just parses the same file twice.

    Returns
    -------
    list
        All entries returned by ``read_xyz``, concatenated in order of ``i``.

    Raises
    ------
    FileNotFoundError
        If an expected ``.zip`` archive is missing (raised by ``zipfile.ZipFile``).
    zipfile.BadZipFile
        If an archive is not a valid zip file.
    """
    samples = []
    parent_folder = p_join(self.root, "W3-W30_all_geoms_TTM2.1-F/")
    for i in range(3, 31):
        name = f"W{i}_geoms_all"
        zip_path = p_join(parent_folder, f"{name}.zip")
        xyz_path = p_join(parent_folder, f"{name}.xyz")

        # Unpack next to the archive so the .xyz path below exists.
        with zipfile.ZipFile(zip_path, "r") as zip_ref:
            zip_ref.extractall(parent_folder)

        # presumably `i` is the number of water molecules per cluster
        # (cf. content_to_xyz(content, n_waters)) -- verify against read_xyz
        samples += read_xyz(xyz_path, i)

    return samples

0 comments on commit 0566260

Please sign in to comment.