-
Notifications
You must be signed in to change notification settings - Fork 3
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Interaction Datasets #40
Changes from 31 commits
bd3fcf9
a800ea5
9d6fca6
794e63f
0db4765
6e5a002
8e1e003
d68bae6
5e94d67
3c9508b
768fb2e
8aeadd8
9cf6034
ce2c53b
6206665
5cb57d9
e18b710
54cadbf
7f83eb5
a922ef7
2146058
4d9a4ba
c2229e3
9349454
c3bdc64
23c0739
74f87a6
f046ea9
3c84ee9
04c81ae
11e2858
78f0423
bd58fdf
5dfcf55
4bc3a49
b046eea
fe54044
ef2528c
c0ef5b1
ad55296
0a51e7c
b6c3a6a
07f70b8
1443450
802b70b
e969b54
5725fed
6c6b286
46c5ebe
d5ec053
cb9987c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,73 @@ | ||
import os | ||
import numpy as np | ||
import pandas as pd | ||
|
||
from typing import Dict, List | ||
|
||
from tqdm import tqdm | ||
from rdkit import Chem | ||
from ruamel.yaml import YAML | ||
from loguru import logger | ||
from openqdc.datasets.interaction import BaseInteractionDataset | ||
from openqdc.utils.molecule import atom_table, molecule_groups | ||
|
||
|
||
class L7(BaseInteractionDataset): | ||
__name__ = "L7" | ||
__energy_unit__ = "hartree" | ||
__distance_unit__ = "ang" | ||
__forces_unit__ = "hartree/ang" | ||
__energy_methods__ = [ | ||
mcneela marked this conversation as resolved.
Show resolved
Hide resolved
|
||
"CSD(T) | QCISD(T)", | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why "CSD(T) | QCISD(T)"? Pick the one most used in the other datasets. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Ignore this if the comment above makes sense. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I annotated the energy this way because the paper says the energies were computed using CSD(T) or QCISD(T), but the dataset provides no labels showing which dimer pairs were computed with CSD(T) and which with QCISD(T). |
||
"DLPNO-CCSD(T)", | ||
"MP2/CBS", | ||
"MP2C/CBS", | ||
"fixed", | ||
"DLPNO-CCSD(T0)", | ||
"LNO-CCSD(T)", | ||
"FN-DMC", | ||
] | ||
|
||
energy_target_names = [] | ||
|
||
def read_raw_entries(self) -> List[Dict]:
    """Read the L7 YAML index and the per-dimer XYZ files into raw entries.

    ``l7.yaml`` is loaded from ``self.root``. For every element of its
    ``items`` list, the XYZ file named by the part of ``item["geometry"]``
    after the first ``":"`` is parsed, and the item's ``reference_value`` is
    combined with the idx-th value of every ``alternative_reference`` series.

    Returns:
        A list with one dict per item, holding the keys ``energies``
        (1 x n_methods, float32), ``subset``, ``n_atoms``,
        ``n_atoms_first``, ``atomic_inputs`` (n_atoms rows of atomic number,
        charge and the remaining XYZ columns, float32) and ``name``.
    """
    yaml_fpath = os.path.join(self.root, "l7.yaml")
    logger.info(f"Reading L7 interaction data from {self.root}")
    # Context managers make sure the file handles are closed; the previous
    # version opened the YAML file and every XYZ file without closing them.
    with open(yaml_fpath, "r") as yaml_file:
        data_dict = YAML().load(yaml_file)

    global_setup = data_dict["description"]["global_setup"]
    charge0 = int(global_setup["molecule_a"]["charge"])
    charge1 = int(global_setup["molecule_b"]["charge"])
    # Loop-invariant: one series per alternative reference method, indexed by
    # the item's position in the YAML ``items`` list.
    alternative_refs = list(data_dict["alternative_reference"].values())

    data = []
    for idx, item in enumerate(data_dict["items"]):
        fname = item["geometry"].split(":")[1]
        with open(os.path.join(self.root, f"{fname}.xyz"), "r") as xyz_file:
            rows = [line.strip().split() for line in xyz_file]

        # XYZ layout: row 0 is the atom count, row 1 a comment line, then one
        # row per atom. Slicing by the declared count keeps trailing blank
        # lines from producing a ragged array.
        n_atoms_total = int(rows[0][0])
        atom_rows = np.array(rows[2 : 2 + n_atoms_total])
        elems = atom_rows[:, 0]
        pos = atom_rows[:, 1:].astype(np.float32)

        # The selection string is split on "-" and its second field is used
        # as the atom count of the first molecule (presumably a "1-N" range
        # -- confirm against the dataset files).
        n_first = int(item["setup"]["molecule_a"]["selection"].split("-")[1])
        n_second = n_atoms_total - n_first

        energies = [float(item["reference_value"])]
        energies += [float(series[idx]) for series in alternative_refs]

        atomic_nums = np.expand_dims(np.array([atom_table.GetAtomicNumber(x) for x in elems]), axis=1)
        # Every atom row carries the charge of the molecule it belongs to.
        charges = np.expand_dims(np.array([charge0] * n_first + [charge1] * n_second), axis=1)
        atomic_inputs = np.concatenate((atomic_nums, charges, pos), axis=-1, dtype=np.float32)

        # A distinct name avoids rebinding the loop variable ``item``.
        entry = dict(
            energies=np.array([energies], dtype=np.float32),
            subset=np.array([item["group"]]),
            n_atoms=np.array([n_atoms_total], dtype=np.int32),
            n_atoms_first=np.array([n_first], dtype=np.int32),
            atomic_inputs=atomic_inputs,
            name=np.array([item["shortname"]]),
        )
        data.append(entry)
    return data
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,70 @@ | ||
import os | ||
import numpy as np | ||
import pandas as pd | ||
|
||
from typing import Dict, List | ||
|
||
from tqdm import tqdm | ||
from rdkit import Chem | ||
from ruamel.yaml import YAML | ||
from loguru import logger | ||
from openqdc.datasets.interaction import BaseInteractionDataset | ||
from openqdc.utils.molecule import atom_table, molecule_groups | ||
|
||
|
||
class X40(BaseInteractionDataset): | ||
__name__ = "X40" | ||
__energy_unit__ = "hartree" | ||
__distance_unit__ = "ang" | ||
__forces_unit__ = "hartree/ang" | ||
__energy_methods__ = [ | ||
"default", | ||
mcneela marked this conversation as resolved.
Show resolved
Hide resolved
|
||
"MP2/CBS", | ||
"dCCSD(T)/haDZ", | ||
"dCCSD(T)/haTZ", | ||
"MP2.5/CBS(aDZ)", | ||
] | ||
|
||
energy_target_names = [] | ||
|
||
def read_raw_entries(self) -> List[Dict]:
    """Read the X40 YAML index and the per-dimer XYZ files into raw entries.

    ``x40.yaml`` is loaded from ``self.root``. For every element of its
    ``items`` list, the XYZ file ``<shortname>.xyz`` is parsed, and the item's
    ``reference_value`` is combined with the idx-th value of every
    ``alternative_reference`` series.

    Returns:
        A list with one dict per item, holding the keys ``energies``
        (1 x n_methods, float32), ``subset``, ``n_atoms``,
        ``n_atoms_first``, ``atomic_inputs`` (n_atoms rows of atomic number,
        charge and the remaining XYZ columns, float32) and ``name``.
    """
    yaml_fpath = os.path.join(self.root, "x40.yaml")
    logger.info(f"Reading X40 interaction data from {self.root}")
    # Context managers make sure the file handles are closed; the previous
    # version opened the YAML file and every XYZ file without closing them.
    with open(yaml_fpath, "r") as yaml_file:
        data_dict = YAML().load(yaml_file)

    global_setup = data_dict["description"]["global_setup"]
    charge0 = int(global_setup["molecule_a"]["charge"])
    charge1 = int(global_setup["molecule_b"]["charge"])
    # Loop-invariant: one series per alternative reference method, indexed by
    # the item's position in the YAML ``items`` list.
    alternative_refs = list(data_dict["alternative_reference"].values())

    data = []
    for idx, item in enumerate(data_dict["items"]):
        with open(os.path.join(self.root, f"{item['shortname']}.xyz"), "r") as xyz_file:
            rows = [line.strip().split() for line in xyz_file]

        # XYZ layout: row 0 is the atom count, row 1 a comment line, then one
        # row per atom. Slicing by the declared count keeps trailing blank
        # lines from producing a ragged array.
        n_atoms_total = int(rows[0][0])
        setup = rows[1]
        atom_rows = np.array(rows[2 : 2 + n_atoms_total])
        elems = atom_rows[:, 0]
        pos = atom_rows[:, 1:].astype(np.float32)

        # The first token of the comment line is split on "-" and its second
        # field is used as the atom count of the first molecule (presumably a
        # "1-N" selection -- confirm against the dataset files).
        n_first = int(setup[0].split("-")[1])
        n_second = n_atoms_total - n_first

        energies = [float(item["reference_value"])]
        energies += [float(series[idx]) for series in alternative_refs]

        atomic_nums = np.expand_dims(np.array([atom_table.GetAtomicNumber(x) for x in elems]), axis=1)
        # Every atom row carries the charge of the molecule it belongs to.
        charges = np.expand_dims(np.array([charge0] * n_first + [charge1] * n_second), axis=1)
        atomic_inputs = np.concatenate((atomic_nums, charges, pos), axis=-1, dtype=np.float32)

        # A distinct name avoids rebinding the loop variable ``item``.
        entry = dict(
            energies=np.array([energies], dtype=np.float32),
            subset=np.array([item["group"]]),
            n_atoms=np.array([n_atoms_total], dtype=np.int32),
            n_atoms_first=np.array([n_first], dtype=np.int32),
            atomic_inputs=atomic_inputs,
            name=np.array([item["shortname"]]),
        )
        data.append(entry)
    return data
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,69 @@ | ||
import importlib | ||
import os | ||
from typing import TYPE_CHECKING # noqa F401 | ||
|
||
# The below lazy import logic is coming from openff-toolkit: | ||
# https://github.com/openforcefield/openff-toolkit/blob/b52879569a0344878c40248ceb3bd0f90348076a/openff/toolkit/__init__.py#L44 | ||
|
||
# Dictionary of objects to lazily import; maps the object's name to its module path | ||
|
||
# Each key is a public name served by the module-level __getattr__; each
# value is the dotted path of the module that is imported on first access and
# from whose namespace the name is then read.
_lazy_imports_obj = {
    "BaseInteractionDataset": "openqdc.datasets.interaction.base",
    "DES370K": "openqdc.datasets.interaction.des370k",
    "DES5M": "openqdc.datasets.interaction.des5m",
    "Metcalf": "openqdc.datasets.interaction.metcalf",
    "DESS66": "openqdc.datasets.interaction.dess66",
    "DESS66x8": "openqdc.datasets.interaction.dess66x8",
    "L7": "openqdc.datasets.interaction.L7",
    "X40": "openqdc.datasets.interaction.X40",
    "Splinter": "openqdc.datasets.interaction.splinter",
}

# Same idea for whole modules (name -> dotted module path); currently empty.
_lazy_imports_mod = {}
|
||
|
||
def __getattr__(name):
    """Resolve ``name`` lazily through the two lazy-import tables.

    Python only calls this hook when normal module attribute lookup fails.
    Names listed in ``_lazy_imports_obj`` are fetched from the namespace of
    their freshly imported module; names listed in ``_lazy_imports_mod``
    return the imported module itself. Anything else raises AttributeError.
    """
    object_module_path = _lazy_imports_obj.get(name)
    if object_module_path is not None:
        return importlib.import_module(object_module_path).__dict__[name]

    module_path = _lazy_imports_mod.get(name)
    if module_path is None:
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
    return importlib.import_module(module_path)
|
||
|
||
def __dir__():
    """Return the sorted module globals plus every lazily importable name."""
    names = list(globals())
    names.extend(_lazy_imports_obj)
    names.extend(_lazy_imports_mod)
    names.sort()
    return names
|
||
|
||
if TYPE_CHECKING or os.environ.get("OPENQDC_DISABLE_LAZY_LOADING", "0") == "1": | ||
from .base import BaseInteractionDataset | ||
from .des370k import DES370K | ||
from .des5m import DES5M | ||
from .metcalf import Metcalf | ||
from .dess66 import DESS66 | ||
from .dess66x8 import DESS66x8 | ||
from .L7 import L7 | ||
from .X40 import X40 | ||
from .splinter import Splinter | ||
|
||
# Names exported by a star-import of this package. The entries are the same
# names as the keys of ``_lazy_imports_obj``.
__all__ = [
    "BaseInteractionDataset",
    "DES370K",
    "DES5M",
    "Metcalf",
    "DESS66",
    "DESS66x8",
    "L7",
    "X40",
    "Splinter",
]
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
from typing import Dict, List, Optional, Union | ||
from openqdc.utils.io import ( | ||
copy_exists, | ||
dict_to_atoms, | ||
get_local_cache, | ||
load_hdf5_file, | ||
load_pkl, | ||
pull_locally, | ||
push_remote, | ||
set_cache_dir, | ||
) | ||
from openqdc.datasets.potential.base import BaseDataset | ||
from openqdc.utils.constants import ( | ||
NB_ATOMIC_FEATURES | ||
) | ||
|
||
from loguru import logger | ||
|
||
import numpy as np | ||
|
||
class BaseInteractionDataset(BaseDataset): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The functions that write and read the preprocessed data must be changed here, no? New keys have been added, so the base class must adapt those functions, no? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We must also change the logic of a few other functions to avoid the normalization of interaction energies, no @FNTwin? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yes, I still need to update the preprocessing functions to add the new keys. I'm not familiar with the normalization of the energies; Cristian will probably be able to help with that. |
||
def __init__(
    self,
    energy_unit: Optional[str] = None,
    distance_unit: Optional[str] = None,
    overwrite_local_cache: bool = False,
    cache_dir: Optional[str] = None,
) -> None:
    """Forward all four constructor options unchanged to the parent class.

    Args:
        energy_unit: Passed through as ``energy_unit``.
        distance_unit: Passed through as ``distance_unit``.
        overwrite_local_cache: Passed through as ``overwrite_local_cache``.
        cache_dir: Passed through as ``cache_dir``.
    """
    parent_options = dict(
        energy_unit=energy_unit,
        distance_unit=distance_unit,
        overwrite_local_cache=overwrite_local_cache,
        cache_dir=cache_dir,
    )
    super().__init__(**parent_options)
|
||
def collate_list(self, list_entries: List[Dict]):
    """Concatenate per-entry arrays and add the ``position_idx_range`` index.

    Args:
        list_entries: Raw entry dicts; ``None`` items are skipped. Every
            non-dict value is concatenated along axis 0 across the entries,
            and dict-valued keys are left out of the result. Each entry must
            provide ``n_atoms``.

    Returns:
        A dict of concatenated arrays plus ``position_idx_range``, an
        ``(n_entries, 2)`` int32 array whose row ``i`` holds the
        ``[start, end)`` offsets of entry ``i`` in the cumulative atom count.

    Raises:
        ValueError: If ``list_entries`` contains no non-``None`` entry.
    """
    # Filter once up front so the key template is always a real entry; the
    # previous version used list_entries[0] even when it was None.
    valid_entries = [entry for entry in list_entries if entry is not None]
    if not valid_entries:
        raise ValueError("collate_list needs at least one non-None entry")

    template = valid_entries[0]
    res = {
        key: np.concatenate([entry[key] for entry in valid_entries], axis=0)
        for key in template
        if not isinstance(template[key], dict)
    }

    # Cumulative atom counts give each entry's end offset; the previous
    # entry's end offset is its start offset (0 for the first entry).
    csum = np.cumsum(res.get("n_atoms"))
    idx_range = np.zeros((csum.shape[0], 2), dtype=np.int32)
    idx_range[1:, 0], idx_range[:, 1] = csum[:-1], csum
    res["position_idx_range"] = idx_range

    return res
|
||
@property
def data_shapes(self):
    """Map each stored array name to its shape template.

    A ``-1`` is used for the leading dimension; the trailing dimensions come
    from ``NB_ATOMIC_FEATURES``, the number of entries in
    ``__energy_methods__`` and the number of ``force_target_names``.
    """
    shapes = {}
    shapes["atomic_inputs"] = (-1, NB_ATOMIC_FEATURES)
    shapes["position_idx_range"] = (-1, 2)
    shapes["energies"] = (-1, len(self.__energy_methods__))
    shapes["forces"] = (-1, 3, len(self.force_target_names))
    return shapes
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please add the other datasets here as well
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
They've now been added! We still need to calculate the statistics for each dataset.