Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Metcalf Dataset #41

Closed
wants to merge 14 commits into from
50 changes: 50 additions & 0 deletions src/openqdc/datasets/interaction/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
import importlib
import os
from typing import TYPE_CHECKING # noqa F401

# The lazy import logic below comes from openff-toolkit:
# https://github.com/openforcefield/openff-toolkit/blob/b52879569a0344878c40248ceb3bd0f90348076a/openff/toolkit/__init__.py#L44

# Dictionary of objects to lazily import; maps the object's name to its module path

# Each value is the dotted module path handed to importlib.import_module by
# __getattr__ the first time the corresponding name is requested.
_lazy_imports_obj = {
    "BaseInteractionDataset": "openqdc.datasets.interaction.base",
    "DES370K": "openqdc.datasets.interaction.des370k",
    "Metcalf": "openqdc.datasets.interaction.metcalf",
}

# Whole modules to expose lazily (attribute name -> module path); empty here.
_lazy_imports_mod = {}


def __getattr__(name):
    """Lazily resolve ``name`` through the lazy-import tables.

    Python calls this module-level hook only when ``name`` cannot be found
    in the current module's namespace.

    Args:
        name: Attribute requested on this module.

    Returns:
        The object called ``name`` taken from the module registered for it in
        ``_lazy_imports_obj``, or the module registered under ``name`` in
        ``_lazy_imports_mod``.

    Raises:
        AttributeError: If ``name`` is in neither table, or the module
            registered for it does not define ``name``.
    """
    obj_mod = _lazy_imports_obj.get(name)
    if obj_mod is not None:
        mod = importlib.import_module(obj_mod)
        # getattr (rather than mod.__dict__[name]) so a missing name surfaces
        # as AttributeError, which hasattr()/getattr(..., default) expect
        # from an attribute hook, instead of leaking a KeyError.
        return getattr(mod, name)

    lazy_mod = _lazy_imports_mod.get(name)
    if lazy_mod is not None:
        return importlib.import_module(lazy_mod)

    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")


def __dir__():
    """List this module's names plus every lazily importable name, sorted."""
    names = list(globals())
    names.extend(_lazy_imports_obj)
    names.extend(_lazy_imports_mod)
    names.sort()
    return names


# Eager imports: taken when a static type checker analyses the package, or
# at runtime when OPENQDC_DISABLE_LAZY_LOADING is set to "1".
if TYPE_CHECKING or os.environ.get("OPENQDC_DISABLE_LAZY_LOADING", "0") == "1":
    from .base import BaseInteractionDataset
    from .des370k import DES370K
    from .metcalf import Metcalf  # keep in sync with __all__ and _lazy_imports_obj

__all__ = [
    "BaseInteractionDataset",
    "DES370K",
    "Metcalf",
]
45 changes: 45 additions & 0 deletions src/openqdc/datasets/interaction/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
from typing import Dict, List, Optional, Union
from openqdc.utils.io import (
copy_exists,
dict_to_atoms,
get_local_cache,
load_hdf5_file,
load_pkl,
pull_locally,
push_remote,
set_cache_dir,
)
from openqdc.datasets.potential.base import BaseDataset

from loguru import logger

import numpy as np

class BaseInteractionDataset(BaseDataset):
    """Base class for interaction datasets.

    Behaves like ``BaseDataset`` except that ``collate_list`` skips fields
    whose value in the first entry is a ``dict``.
    """

    def __init__(
        self,
        energy_unit: Optional[str] = None,
        distance_unit: Optional[str] = None,
        overwrite_local_cache: bool = False,
        cache_dir: Optional[str] = None,
    ) -> None:
        """Forward every option unchanged to ``BaseDataset.__init__``."""
        super().__init__(
            energy_unit=energy_unit,
            distance_unit=distance_unit,
            overwrite_local_cache=overwrite_local_cache,
            cache_dir=cache_dir,
        )

    def collate_list(self, list_entries: List[Dict]) -> Dict:
        """Concatenate per-entry arrays and add per-entry atom index ranges.

        Args:
            list_entries: Records to merge. The keys are taken from the first
                record; ``None`` records are skipped when gathering values.

        Returns:
            A dict mapping each non-dict key of the first record to the
            concatenation (along axis 0) of that key over all records, plus
            ``"position_idx_range"``: an ``int32`` array with one
            ``[start, end)`` row per entry, derived from the cumulative sum
            of ``"n_atoms"``.
        """
        first = list_entries[0]
        res = {
            key: np.concatenate([r[key] for r in list_entries if r is not None], axis=0)
            for key in first
            if not isinstance(first[key], dict)
        }

        csum = np.cumsum(res.get("n_atoms"))
        x = np.zeros((csum.shape[0], 2), dtype=np.int32)
        # Column 1 holds each entry's end offset; column 0 is the previous
        # entry's end offset (0 for the first entry).
        x[1:, 0], x[:, 1] = csum[:-1], csum
        res["position_idx_range"] = x

        return res
98 changes: 98 additions & 0 deletions src/openqdc/datasets/interaction/des370k.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
import os
import numpy as np
import pandas as pd

from typing import Dict, List

from tqdm import tqdm
from loguru import logger
from openqdc.datasets.interaction import BaseInteractionDataset
from openqdc.utils.molecule import atom_table


class DES370K(BaseInteractionDataset):
    """DES370K interaction dataset, read from a single ``DES370K.csv`` file."""

    __name__ = "des370k_interaction"
    __energy_unit__ = "hartree"
    __distance_unit__ = "ang"
    __forces_unit__ = "hartree/ang"
    __energy_methods__ = [
        "mp2/cc-pvdz",
        "mp2/cc-pvqz",
        "mp2/cc-pvtz",
        "mp2/cbs",
        "ccsd(t)/cc-pvdz",
        "ccsd(t)/cbs",  # cbs
        "ccsd(t)/nn",  # nn
        "sapt0/aug-cc-pwcvxz",
        "sapt0/aug-cc-pwcvxz_es",
        "sapt0/aug-cc-pwcvxz_ex",
        "sapt0/aug-cc-pwcvxz_exs2",
        "sapt0/aug-cc-pwcvxz_ind",
        "sapt0/aug-cc-pwcvxz_exind",
        "sapt0/aug-cc-pwcvxz_disp",
        "sapt0/aug-cc-pwcvxz_exdisp_os",
        "sapt0/aug-cc-pwcvxz_exdisp_ss",
        "sapt0/aug-cc-pwcvxz_delta_HF",
    ]

    # CSV column names read into ``energies``, in the same order as
    # ``__energy_methods__``.
    energy_target_names = [
        "cc_MP2_all",
        "qz_MP2_all",
        "tz_MP2_all",
        "cbs_MP2_all",
        "cc_CCSD(T)_all",
        "cbs_CCSD(T)_all",
        "nn_CCSD(T)_all",
        "sapt_all",
        "sapt_es",
        "sapt_ex",
        "sapt_exs2",
        "sapt_ind",
        "sapt_exind",
        "sapt_disp",
        "sapt_exdisp_os",
        "sapt_exdisp_ss",
        "sapt_delta_HF",
    ]

    def read_raw_entries(self) -> List[Dict]:
        """Parse ``DES370K.csv`` under ``self.root`` into one record per row.

        Sets ``self.filepath`` to the CSV path as a side effect.

        Returns:
            A list of dicts, one per CSV row, with keys ``energies``
            (``float32``, shape ``(1, len(energy_target_names))``),
            ``subset``, ``n_atoms``, ``n_atoms_first``, ``atomic_inputs``
            and ``name``.
        """
        self.filepath = os.path.join(self.root, "DES370K.csv")
        logger.info(f"Reading DES370K interaction data from {self.filepath}")
        df = pd.read_csv(self.filepath)
        data = []
        for _, row in tqdm(df.iterrows(), total=df.shape[0]):
            smiles0, smiles1 = row["smiles0"], row["smiles1"]
            charge0, charge1 = row["charge0"], row["charge1"]
            natoms0, natoms1 = row["natoms0"], row["natoms1"]

            # "xyz" is a whitespace-separated flat list of coordinates.
            pos = np.array(list(map(float, row["xyz"].split()))).reshape(-1, 3)

            elements = row["elements"].split()
            atomic_nums = np.expand_dims(np.array([atom_table.GetAtomicNumber(x) for x in elements]), axis=1)

            # Every atom carries the charge value of the molecule it belongs
            # to: the first natoms0 atoms get charge0, the rest get charge1.
            charges = np.expand_dims(np.array([charge0] * natoms0 + [charge1] * natoms1), axis=1)

            # Columns: atomic number, charge, x, y, z.
            atomic_inputs = np.concatenate((atomic_nums, charges, pos), axis=-1, dtype=np.float32)

            energies = np.array(row[self.energy_target_names].values).astype(np.float32)[None, :]

            name = np.array([smiles0 + "." + smiles1])

            item = dict(
                energies=energies,
                subset=np.array(["DES370K"]),
                n_atoms=np.array([natoms0 + natoms1], dtype=np.int32),
                n_atoms_first=np.array([natoms0], dtype=np.int32),
                atomic_inputs=atomic_inputs,
                name=name,
            )
            data.append(item)
        return data
91 changes: 91 additions & 0 deletions src/openqdc/datasets/interaction/metcalf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
import os
import numpy as np

from typing import Dict, List

from tqdm import tqdm
from rdkit import Chem
from loguru import logger
from openqdc.datasets.interaction import BaseInteractionDataset
from openqdc.utils.molecule import atom_table

class Metcalf(BaseInteractionDataset):
    """Metcalf interaction dataset, read from directories of ``.xyz`` files."""

    __name__ = "metcalf"
    __energy_unit__ = "hartree"
    __distance_unit__ = "ang"
    __forces_unit__ = None
    # NOTE(review): a single method is listed while five energy columns are
    # read per entry (see energy_target_names) -- confirm whether these two
    # lists are expected to have matching lengths.
    __energy_methods__ = ["SAPT0/jun-cc-pVDZ"]
    energy_target_names = [
        "total energy",
        "electrostatic energy",
        "exchange energy",
        "induction energy",
        "dispersion energy",
    ]

    def read_raw_entries(self) -> List[Dict]:
        """Read every ``.xyz`` file found in the sub-directories of ``self.root``.

        Each file is parsed as: line 1 = number of atoms; line 2 =
        comma-separated header whose fields 1..5 are the five energies;
        remaining lines = ``element x y z``. The subset label is the part of
        the directory name before the first ``-``, lower-cased.

        SMILES strings are not inferred: ``name`` is stored as an empty
        string for every entry.

        Returns:
            A list of dicts with keys ``n_atoms``, ``subset``, ``energies``,
            ``positions``, ``atomic_inputs`` and ``name``.
        """
        data = []
        for dirname in os.listdir(self.root):
            xyz_dir = os.path.join(self.root, dirname)
            if not os.path.isdir(xyz_dir):
                continue
            subset = np.array([dirname.split("-")[0].lower()])  # training, validation, or test
            for filename in os.listdir(xyz_dir):
                if not filename.endswith(".xyz"):
                    continue
                # Context manager so the handle is closed for every file.
                with open(os.path.join(xyz_dir, filename), "r") as xyz_file:
                    lines = [line.strip() for line in xyz_file]

                line_two = lines[1].split(",")
                energies = np.array([line_two[1:6]], dtype=np.float32)
                num_atoms = np.array([int(lines[0])])
                # NOTE: the last field of line_two (previously parsed as the
                # atom count of the first molecule) is not stored.

                # Skip blank lines (e.g. a trailing newline at end of file),
                # which would otherwise make the rows ragged.
                elem_xyz = np.array([x.split() for x in lines[2:] if x])
                elements = elem_xyz[:, 0]
                xyz = elem_xyz[:, 1:].astype(np.float32)

                atomic_nums = np.expand_dims(np.array([atom_table.GetAtomicNumber(x) for x in elements]), axis=1)
                # Every atom is given a charge value of 0.
                charges = np.expand_dims(np.array([0] * num_atoms[0]), axis=1)

                # Columns: atomic number, charge, x, y, z.
                atomic_inputs = np.concatenate((atomic_nums, charges, xyz), axis=-1, dtype=np.float32)

                item = dict(
                    n_atoms=num_atoms,
                    subset=subset,
                    energies=energies,
                    positions=xyz,
                    atomic_inputs=atomic_inputs,
                    name=np.array([""]),
                )
                data.append(item)
        return data



Original file line number Diff line number Diff line change
Expand Up @@ -8,28 +8,28 @@
# Dictionary of objects to lazily import; maps the object's name to its module path

_lazy_imports_obj = {
"ANI1": "openqdc.datasets.ani",
"ANI1CCX": "openqdc.datasets.ani",
"ANI1X": "openqdc.datasets.ani",
"Spice": "openqdc.datasets.spice",
"GEOM": "openqdc.datasets.geom",
"QMugs": "openqdc.datasets.qmugs",
"ISO17": "openqdc.datasets.iso_17",
"COMP6": "openqdc.datasets.comp6",
"GDML": "openqdc.datasets.gdml",
"Molecule3D": "openqdc.datasets.molecule3d",
"OrbnetDenali": "openqdc.datasets.orbnet_denali",
"SN2RXN": "openqdc.datasets.sn2_rxn",
"QM7X": "openqdc.datasets.qm7x",
"DESS": "openqdc.datasets.dess",
"NablaDFT": "openqdc.datasets.nabladft",
"SolvatedPeptides": "openqdc.datasets.solvated_peptides",
"WaterClusters": "openqdc.datasets.waterclusters3_30",
"TMQM": "openqdc.datasets.tmqm",
"Dummy": "openqdc.datasets.dummy",
"PCQM_B3LYP": "openqdc.datasets.pcqm",
"PCQM_PM6": "openqdc.datasets.pcqm",
"Transition1X": "openqdc.datasets.transition1x",
"ANI1": "openqdc.datasets.potential.ani",
"ANI1CCX": "openqdc.datasets.potential.ani",
"ANI1X": "openqdc.datasets.potential.ani",
"Spice": "openqdc.datasets.potential.spice",
"GEOM": "openqdc.datasets.potential.geom",
"QMugs": "openqdc.datasets.potential.qmugs",
"ISO17": "openqdc.datasets.potential.iso_17",
"COMP6": "openqdc.datasets.potential.comp6",
"GDML": "openqdc.datasets.potential.gdml",
"Molecule3D": "openqdc.datasets.potential.molecule3d",
"OrbnetDenali": "openqdc.datasets.potential.orbnet_denali",
"SN2RXN": "openqdc.datasets.potential.sn2_rxn",
"QM7X": "openqdc.datasets.potential.qm7x",
"DESS": "openqdc.datasets.potential.dess",
"NablaDFT": "openqdc.datasets.potential.nabladft",
"SolvatedPeptides": "openqdc.datasets.potential.solvated_peptides",
"WaterClusters": "openqdc.datasets.potential.waterclusters3_30",
"TMQM": "openqdc.datasets.potential.tmqm",
"Dummy": "openqdc.datasets.potential.dummy",
"PCQM_B3LYP": "openqdc.datasets.potential.pcqm",
"PCQM_PM6": "openqdc.datasets.potential.pcqm",
"Transition1X": "openqdc.datasets.potential.transition1x",
}

_lazy_imports_mod = {}
Expand Down
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,10 @@ def _post_init(
self._convert_data()
self._set_isolated_atom_energies()

@classmethod
def no_init(cls):
    """Create an instance of ``cls`` through ``__new__`` only, skipping ``__init__``."""
    instance = cls.__new__(cls)
    return instance

def _convert_data(self):
logger.info(
f"Converting {self.__name__} data to the following units:\n\
Expand Down Expand Up @@ -325,6 +329,7 @@ def read_raw_entries(self):

def collate_list(self, list_entries):
# concatenate entries
logger.info(f"list entries: {type(list_entries)}")
res = {key: np.concatenate([r[key] for r in list_entries if r is not None], axis=0) for key in list_entries[0]}

csum = np.cumsum(res.get("n_atoms"))
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from os.path import join as p_join

from openqdc.datasets.base import BaseDataset, read_qc_archive_h5
from openqdc.datasets.potential.base import BaseDataset, read_qc_archive_h5


class COMP6(BaseDataset):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import pandas as pd
from tqdm import tqdm

from openqdc.datasets.base import BaseDataset
from openqdc.datasets.potential.base import BaseDataset
from openqdc.utils.molecule import get_atomic_number_and_charge


Expand Down Expand Up @@ -58,7 +58,6 @@ class DESS(BaseDataset):
"nn_CCSD(T)_all",
"sapt_all",
]
# ['qz_MP2_all', 'tz_MP2_all', 'cbs_MP2_all', 'sapt_all', 'nn_CCSD(T)_all']

partitions = ["DES370K", "DES5M"]

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from numpy import array
from sklearn.utils import Bunch

from openqdc.datasets.base import BaseDataset
from openqdc.datasets.potential.base import BaseDataset
from openqdc.utils.atomization_energies import IsolatedAtomEnergyFactory
from openqdc.utils.constants import NOT_DEFINED

Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from os.path import join as p_join

from openqdc.datasets.base import BaseDataset, read_qc_archive_h5
from openqdc.datasets.potential.base import BaseDataset, read_qc_archive_h5


class GDML(BaseDataset):
Expand Down
Loading
Loading