From 0f212626e96055e8587a79da6482481644e9495b Mon Sep 17 00:00:00 2001
From: prtos
Date: Sun, 24 Sep 2023 23:19:01 +0000
Subject: [PATCH] new datasets added

---
Review notes: file contents were revised during review, so the blob ids on
the "index" lines below are those of the original submission; regenerate the
patch with git format-patch after applying.

 openqdc/datasets/comp6.py             |  76 ++++++++++++++++++
 openqdc/datasets/gdml.py              |  64 +++++++++++++++
 openqdc/datasets/iso_17.py            |  56 +++++++++++++
 openqdc/datasets/orbnet_denali.py     | 116 +++++++++++++++++++++++++++
 openqdc/datasets/qm7x.py              |  59 ++++++++++++++
 openqdc/datasets/sn2_rxn.py           |  59 ++++++++++++++
 openqdc/datasets/solvated_peptides.py |   0
 openqdc/datasets/waterclusters3_30.py |   0
 8 files changed, 430 insertions(+)
 create mode 100644 openqdc/datasets/comp6.py
 create mode 100644 openqdc/datasets/gdml.py
 create mode 100644 openqdc/datasets/iso_17.py
 create mode 100644 openqdc/datasets/orbnet_denali.py
 create mode 100644 openqdc/datasets/qm7x.py
 create mode 100644 openqdc/datasets/sn2_rxn.py
 create mode 100644 openqdc/datasets/solvated_peptides.py
 create mode 100644 openqdc/datasets/waterclusters3_30.py

diff --git a/openqdc/datasets/comp6.py b/openqdc/datasets/comp6.py
new file mode 100644
index 0000000..cd3e41c
--- /dev/null
+++ b/openqdc/datasets/comp6.py
@@ -0,0 +1,76 @@
+import numpy as np
+from os.path import join as p_join
+from openqdc.utils.constants import MAX_ATOMIC_NUMBER
+from openqdc.datasets.base import BaseDataset, read_qc_archive_h5
+
+
+class COMP6(BaseDataset):
+    """COMP6 dataset, read from one QC-archive HDF5 file per subset."""
+
+    __name__ = 'comp6'
+
+    # Energy in hartree, all zeros by default
+    atomic_energies = np.zeros((MAX_ATOMIC_NUMBER,), dtype=np.float32)
+
+    # NOTE(review): "pbe-d3(bj)_dz" and "svwm_tz" do not match the targets they
+    # line up with below ("PBE-D3M(BJ):def2-tzvp", "SVWN:def2-tzvp"); presumably
+    # typos, left unchanged until confirmed because other code may key on them.
+    __energy_methods__ = [
+        "wb97x_6-31g*",
+        "b3lyp-d3m(bj)_tz",
+        "b3lyp_tz",
+        "hf_tz",
+        "pbe-d3(bj)_dz",
+        "pbe_tz",
+        "svwm_tz",
+        "wb97m-d3(bj)_tz",
+        "wb97m_tz",
+    ]
+
+    energy_target_names = [
+        "Energy",
+        "B3LYP-D3M(BJ):def2-tzvp",
+        "B3LYP:def2-tzvp",
+        "HF:def2-tzvp",
+        "PBE-D3M(BJ):def2-tzvp",
+        "PBE:def2-tzvp",
+        "SVWN:def2-tzvp",
+        "WB97M-D3(BJ):def2-tzvp",
+        "WB97M:def2-tzvp",
+    ]
+
+    __force_methods__ = [
+        "wb97x_6-31g*",
+    ]
+
+    force_target_names = [
+        "Gradient",
+    ]
+
+    def __init__(self) -> None:
+        super().__init__()
+
+    def read_raw_entries(self):
+        """Return the samples read from each subset's ``<subset>.h5`` file under ``self.root``."""
+        samples = []
+        for subset in ["ani_md", "drugbank", "gdb7_9", "gdb10_13", "s66x8", "tripeptides"]:
+            raw_path = p_join(self.root, f'{subset}.h5')
+            samples += read_qc_archive_h5(raw_path, subset, self.energy_target_names,
+                                          self.force_target_names)
+
+        return samples
+
+
+if __name__ == '__main__':
+    for data_class in [COMP6]:
+        data = data_class()
+        n = len(data)
+
+        for i in np.random.choice(n, 3, replace=False):
+            x = data[i]
+            print(x.name, x.subset, end=' ')
+            for k in x:
+                if x[k] is not None:
+                    print(k, x[k].shape, end=' ')
+
+            print()
diff --git a/openqdc/datasets/gdml.py b/openqdc/datasets/gdml.py
new file mode 100644
index 0000000..b53586b
--- /dev/null
+++ b/openqdc/datasets/gdml.py
@@ -0,0 +1,64 @@
+import os
+import numpy as np
+from os.path import join as p_join
+from openqdc.utils.constants import MAX_ATOMIC_NUMBER
+from openqdc.datasets.base import BaseDataset, read_qc_archive_h5
+
+
+class GDML(BaseDataset):
+    """GDML dataset, read from a single QC-archive HDF5 file (``gdml.h5``)."""
+
+    __name__ = 'gdml'
+
+    # Energy in hartree, all zeros by default
+    atomic_energies = np.zeros((MAX_ATOMIC_NUMBER,), dtype=np.float32)
+
+    __energy_methods__ = [
+        "ccsd",
+        "ccsd(t)",
+        "pbe-ts",
+    ]
+
+    energy_target_names = [
+        "CCSD Energy",
+        "CCSD(T) Energy",
+        "PBE-TS Energy",
+    ]
+
+    __force_methods__ = [
+        "ccsd",
+        "ccsd(t)",
+        "pbe-ts",
+    ]
+
+    force_target_names = [
+        "CCSD Gradient",
+        "CCSD(T) Gradient",
+        "PBE-TS Gradient",
+    ]
+
+    def __init__(self) -> None:
+        super().__init__()
+
+    def read_raw_entries(self):
+        """Return the samples read from ``gdml.h5`` under ``self.root``."""
+        raw_path = p_join(self.root, 'gdml.h5')
+        samples = read_qc_archive_h5(raw_path, "gdml", self.energy_target_names,
+                                     self.force_target_names)
+
+        return samples
+
+
+if __name__ == '__main__':
+    for data_class in [GDML]:
+        data = data_class()
+        n = len(data)
+
+        for i in np.random.choice(n, 3, replace=False):
+            x = data[i]
+            print(x.name, x.subset, end=' ')
+            for k in x:
+                if x[k] is not None:
+                    print(k, x[k].shape, end=' ')
+
+            print()
diff --git a/openqdc/datasets/iso_17.py b/openqdc/datasets/iso_17.py
new file mode 100644
index 0000000..4e86637
--- /dev/null
+++ b/openqdc/datasets/iso_17.py
@@ -0,0 +1,56 @@
+import os
+import numpy as np
+from os.path import join as p_join
+from openqdc.utils.constants import MAX_ATOMIC_NUMBER
+from openqdc.datasets.base import BaseDataset, read_qc_archive_h5
+
+
+class ISO17(BaseDataset):
+    """ISO17 dataset, read from a single QC-archive HDF5 file (``iso_17.h5``)."""
+
+    __name__ = 'iso_17'
+
+    # Energy in hartree, all zeros by default
+    atomic_energies = np.zeros((MAX_ATOMIC_NUMBER,), dtype=np.float32)
+
+    __energy_methods__ = [
+        "pbe-ts",
+    ]
+
+    energy_target_names = [
+        "PBE-TS Energy",
+    ]
+
+    __force_methods__ = [
+        "pbe-ts",
+    ]
+
+    force_target_names = [
+        "PBE-TS Gradient",
+    ]
+
+    def __init__(self) -> None:
+        super().__init__()
+
+    def read_raw_entries(self):
+        """Return the samples read from ``iso_17.h5`` under ``self.root``."""
+        raw_path = p_join(self.root, 'iso_17.h5')
+        samples = read_qc_archive_h5(raw_path, "iso_17", self.energy_target_names,
+                                     self.force_target_names)
+
+        return samples
+
+
+if __name__ == '__main__':
+    for data_class in [ISO17]:
+        data = data_class()
+        n = len(data)
+
+        for i in np.random.choice(n, 3, replace=False):
+            x = data[i]
+            print(x.name, x.subset, end=' ')
+            for k in x:
+                if x[k] is not None:
+                    print(k, x[k].shape, end=' ')
+
+            print()
diff --git a/openqdc/datasets/orbnet_denali.py b/openqdc/datasets/orbnet_denali.py
new file mode 100644
index 0000000..157f80f
--- /dev/null
+++ b/openqdc/datasets/orbnet_denali.py
@@ -0,0 +1,116 @@
+import os
+import glob
+import torch
+import pickle as pkl
+import numpy as np
+import pandas as pd
+from tqdm import tqdm
+import datamol as dm
+from sklearn.utils import Bunch
+from os.path import join as p_join
+from openqdc.utils import load_json
+from openqdc.utils.molecule import atom_table
+from openqdc.utils.io import get_local_cache
+from openqdc.utils.constants import MAX_ATOMIC_NUMBER
+from openqdc.datasets.base import BaseDataset
+
+
+def read_mol(mol_id, conf_dict, base_path, energy_target_names):
+    """Load every readable conformer of one molecule from its ``.xyz`` files.
+
+    Args:
+        mol_id: molecule identifier, also the sub-directory of ``xyz_files``.
+        conf_dict: maps a conformer id to its label dict, which must hold the
+            keys in ``energy_target_names`` plus ``"subset"``.
+        base_path: directory that contains ``xyz_files``.
+        energy_target_names: label keys to collect, in output column order.
+
+    Returns:
+        A list with one dict per conformer (``atomic_inputs``, ``name``,
+        ``energies``, ``n_atoms``, ``subset``). Conformers that raise while
+        being read are reported on stdout and left out.
+    """
+    res = []
+    for conf_id, conf_label in conf_dict.items():
+        try:
+            cf_name = p_join(base_path, "xyz_files", mol_id, f"{conf_id}.xyz")
+            # ndmin=2 keeps a one-atom file 2-D; loadtxt would otherwise squeeze
+            # it to 1-D and the column slicing below would fail.
+            d = np.loadtxt(cf_name, skiprows=2, dtype='str', ndmin=2)
+            z, positions = d[:, 0], d[:, 1:].astype(np.float32)
+            z = np.array([atom_table.GetAtomicNumber(s) for s in z])
+            # Atomic numbers paired with a zero-filled second column.
+            xs = np.stack((z, np.zeros_like(z)), axis=-1)
+
+            conf = dict(
+                atomic_inputs=np.concatenate((xs, positions), axis=-1, dtype=np.float32),
+                name=np.array([mol_id]),
+                energies=np.array([conf_label[k] for k in energy_target_names], dtype=np.float32)[None, :],
+                n_atoms=np.array([positions.shape[0]], dtype=np.int32),
+                subset=np.array([conf_label["subset"]]),
+            )
+            res.append(conf)
+        except Exception as e:
+            # Best effort: one unreadable conformer must not abort the whole import.
+            print(f'Skipping: {mol_id} {conf_id} due to {e}')
+
+    return res
+
+
+class OrbnetDenali(BaseDataset):
+    """OrbNet Denali dataset, built from ``denali_labels.csv`` and per-conformer xyz files."""
+
+    __name__ = 'orbnet_denali'
+    __energy_methods__ = ["wb97x-d3_tz", "gfn1_xtb"]
+
+    energy_target_names = ['dft_energy', 'xtb1_energy']
+
+    # Energy in hartree, all zeros by default
+    atomic_energies = np.zeros((MAX_ATOMIC_NUMBER,), dtype=np.float32)
+
+    def __init__(self) -> None:
+        super().__init__()
+
+    def read_raw_entries(self):
+        """Read the label table and load the conformers of every molecule.
+
+        Labels are grouped by ``mol_id``; within a molecule only the first row
+        of each ``sample_id`` is kept.
+        """
+        label_path = p_join(self.root, 'denali_labels.csv')
+        # Column list follows energy_target_names so the two cannot drift apart.
+        columns = ['sample_id', 'mol_id', 'subset', *self.energy_target_names]
+        df = pd.read_csv(label_path, usecols=columns)
+        labels = {
+            mol_id: (
+                group.drop(['mol_id'], axis=1)
+                .drop_duplicates("sample_id")
+                .set_index("sample_id")
+                .to_dict('index')
+            )
+            for mol_id, group in df.groupby('mol_id')
+        }
+
+        def load_molecule(item):
+            mol_id, conf_dict = item
+            return read_mol(mol_id, conf_dict, self.root, self.energy_target_names)
+
+        res = dm.parallelized(load_molecule, list(labels.items()), scheduler='threads', n_jobs=-1, progress=True)
+        # Flatten in one pass; sum(res, []) would re-copy the growing list each step.
+        samples = [conf for mol_confs in res for conf in mol_confs]
+        return samples
+
+
+if __name__ == '__main__':
+    for data_class in [OrbnetDenali]:
+        data = data_class()
+        n = len(data)
+
+        for i in np.random.choice(n, 3, replace=False):
+            x = data[i]
+            print(x.name, x.subset, end=' ')
+            for k in x:
+                if x[k] is not None:
+                    print(k, x[k].shape, end=' ')
+
+            print()
diff --git a/openqdc/datasets/qm7x.py b/openqdc/datasets/qm7x.py
new file mode 100644
index 0000000..4e86637
--- /dev/null
+++ b/openqdc/datasets/qm7x.py
@@ -0,0 +1,59 @@
+import os
+import numpy as np
+from os.path import join as p_join
+from openqdc.utils.constants import MAX_ATOMIC_NUMBER
+from openqdc.datasets.base import BaseDataset, read_qc_archive_h5
+
+
+class QM7X(BaseDataset):
+    """QM7X dataset, read from a single QC-archive HDF5 file."""
+
+    __name__ = 'qm7x'
+
+    # Energy in hartree, all zeros by default
+    atomic_energies = np.zeros((MAX_ATOMIC_NUMBER,), dtype=np.float32)
+
+    # NOTE(review): this module was submitted as a byte-for-byte copy of
+    # iso_17.py. The identifiers now follow the module name, but the 'qm7x.h5'
+    # file name and the PBE-TS labels are assumptions -- TODO confirm.
+    __energy_methods__ = [
+        "pbe-ts",
+    ]
+
+    energy_target_names = [
+        "PBE-TS Energy",
+    ]
+
+    __force_methods__ = [
+        "pbe-ts",
+    ]
+
+    force_target_names = [
+        "PBE-TS Gradient",
+    ]
+
+    def __init__(self) -> None:
+        super().__init__()
+
+    def read_raw_entries(self):
+        """Return the samples read from ``qm7x.h5`` under ``self.root``."""
+        raw_path = p_join(self.root, 'qm7x.h5')
+        samples = read_qc_archive_h5(raw_path, "qm7x", self.energy_target_names,
+                                     self.force_target_names)
+
+        return samples
+
+
+if __name__ == '__main__':
+    for data_class in [QM7X]:
+        data = data_class()
+        n = len(data)
+
+        for i in np.random.choice(n, 3, replace=False):
+            x = data[i]
+            print(x.name, x.subset, end=' ')
+            for k in x:
+                if x[k] is not None:
+                    print(k, x[k].shape, end=' ')
+
+            print()
diff --git a/openqdc/datasets/sn2_rxn.py b/openqdc/datasets/sn2_rxn.py
new file mode 100644
index 0000000..cdb6e27
--- /dev/null
+++ b/openqdc/datasets/sn2_rxn.py
@@ -0,0 +1,59 @@
+import os
+import numpy as np
+from os.path import join as p_join
+from openqdc.utils.constants import MAX_ATOMIC_NUMBER
+from openqdc.datasets.base import BaseDataset, read_qc_archive_h5
+
+
+class SN2RXN(BaseDataset):
+    """SN2RXN dataset, read from a single QC-archive HDF5 file."""
+
+    __name__ = 'sn2_rxn'
+
+    # Energy in hartree, all zeros by default
+    atomic_energies = np.zeros((MAX_ATOMIC_NUMBER,), dtype=np.float32)
+
+    # NOTE(review): this module was submitted with ISO17's identifiers
+    # ('iso_17', 'iso_17.h5'). They now follow the module name, but the
+    # 'sn2_rxn.h5' file name and the PBE-TS labels are assumptions -- TODO confirm.
+    __energy_methods__ = [
+        "pbe-ts",
+    ]
+
+    energy_target_names = [
+        "PBE-TS Energy",
+    ]
+
+    __force_methods__ = [
+        "pbe-ts",
+    ]
+
+    force_target_names = [
+        "PBE-TS Gradient",
+    ]
+
+    def __init__(self) -> None:
+        super().__init__()
+
+    def read_raw_entries(self):
+        """Return the samples read from ``sn2_rxn.h5`` under ``self.root``."""
+        raw_path = p_join(self.root, 'sn2_rxn.h5')
+        samples = read_qc_archive_h5(raw_path, "sn2_rxn", self.energy_target_names,
+                                     self.force_target_names)
+
+        return samples
+
+
+if __name__ == '__main__':
+    for data_class in [SN2RXN]:
+        data = data_class()
+        n = len(data)
+
+        for i in np.random.choice(n, 3, replace=False):
+            x = data[i]
+            print(x.name, x.subset, end=' ')
+            for k in x:
+                if x[k] is not None:
+                    print(k, x[k].shape, end=' ')
+
+            print()
diff --git a/openqdc/datasets/solvated_peptides.py b/openqdc/datasets/solvated_peptides.py
new file mode 100644
index 0000000..e69de29
diff --git a/openqdc/datasets/waterclusters3_30.py b/openqdc/datasets/waterclusters3_30.py
new file mode 100644
index 0000000..e69de29