new datasets added

valence-labs · Sep 24, 2023 · 0f21262 · 0f21262
1 parent 76fec8c
commit 0f21262
Show file tree

Hide file tree

Showing 8 changed files with 381 additions and 0 deletions.
diff --git a/openqdc/datasets/comp6.py b/openqdc/datasets/comp6.py
@@ -0,0 +1,72 @@
+import numpy as np
+from os.path import join as p_join
+from openqdc.utils.constants import MAX_ATOMIC_NUMBER
+from openqdc.datasets.base import BaseDataset, read_qc_archive_h5
+
+
+class COMP6(BaseDataset):
+    """COMP6 dataset, assembled from one ``<subset>.h5`` file per subset.
+
+    ``__energy_methods__`` holds one short method label per entry of
+    ``energy_target_names`` (same length, same order); the target names are
+    the keys handed to ``read_qc_archive_h5``. ``__force_methods__`` and
+    ``force_target_names`` are paired the same way.
+    """
+
+    __name__ = 'comp6'
+
+    # Energy in hartree, all zeros by default
+    atomic_energies = np.zeros((MAX_ATOMIC_NUMBER,), dtype=np.float32)
+
+    __energy_methods__ = [
+        "wb97x_6-31g*",
+        "b3lyp-d3m(bj)_tz",
+        "b3lyp_tz",
+        "hf_tz",
+        # Previously "pbe-d3(bj)_dz"; the paired key is PBE-D3M(BJ):def2-tzvp.
+        "pbe-d3m(bj)_tz",
+        "pbe_tz",
+        # Previously "svwm_tz"; the paired key is SVWN:def2-tzvp.
+        "svwn_tz",
+        "wb97m-d3(bj)_tz",
+        "wb97m_tz",
+    ]
+
+    energy_target_names = [
+        "Energy",
+        "B3LYP-D3M(BJ):def2-tzvp",
+        "B3LYP:def2-tzvp",
+        "HF:def2-tzvp",
+        "PBE-D3M(BJ):def2-tzvp",
+        "PBE:def2-tzvp",
+        "SVWN:def2-tzvp",
+        "WB97M-D3(BJ):def2-tzvp",
+        "WB97M:def2-tzvp",
+    ]
+
+    __force_methods__ = [
+        "wb97x_6-31g*",
+    ]
+
+    force_target_names = [
+        "Gradient",
+    ]
+
+    def __init__(self) -> None:
+        super().__init__()
+
+    def read_raw_entries(self):
+        """Read every COMP6 subset file under ``self.root``.
+
+        Returns
+        -------
+        list
+            The entries returned by ``read_qc_archive_h5`` for each subset,
+            concatenated in the order the subsets are listed below.
+        """
+        samples = []
+        for subset in ["ani_md", "drugbank", "gdb7_9", "gdb10_13", "s66x8", "tripeptides"]:
+            raw_path = p_join(self.root, f'{subset}.h5')
+            samples += read_qc_archive_h5(
+                raw_path, subset, self.energy_target_names, self.force_target_names
+            )
+
+        return samples
+
+
+
+if __name__ == '__main__':
+    # Smoke test: instantiate the dataset and print the array shapes of a few
+    # randomly chosen entries.
+    for data_class in [COMP6]:
+        data = data_class()
+        n = len(data)
+
+        # Sample at most 3 entries: requesting more than n indices without
+        # replacement makes np.random.choice raise ValueError.
+        for i in np.random.choice(n, min(3, n), replace=False):
+            x = data[i]
+            print(x.name, x.subset, end=' ')
+            for k in x:
+                if x[k] is not None:
+                    print(k, x[k].shape, end=' ')
+
+            print()
diff --git a/openqdc/datasets/gdml.py b/openqdc/datasets/gdml.py
@@ -0,0 +1,62 @@
+import os
+import numpy as np
+from os.path import join as p_join
+from openqdc.utils.constants import MAX_ATOMIC_NUMBER
+from openqdc.datasets.base import BaseDataset, read_qc_archive_h5
+
+
+class GDML(BaseDataset):
+    """Dataset wrapper that loads ``gdml.h5`` through ``read_qc_archive_h5``."""
+
+    __name__ = 'gdml'
+
+    # Energies are expressed in hartree; the table is all zeros by default.
+    atomic_energies = np.zeros((MAX_ATOMIC_NUMBER,), dtype=np.float32)
+
+    # Method labels and the target keys they describe; each pair of lists has
+    # the same length and is presumably matched by position.
+    __energy_methods__ = ["ccsd", "ccsd(t)", "pbe-ts"]
+    energy_target_names = ["CCSD Energy", "CCSD(T) Energy", "PBE-TS Energy"]
+
+    __force_methods__ = ["ccsd", "ccsd(t)", "pbe-ts"]
+    force_target_names = ["CCSD Gradient", "CCSD(T) Gradient", "PBE-TS Gradient"]
+
+    def __init__(self) -> None:
+        super().__init__()
+
+    def read_raw_entries(self):
+        """Read ``gdml.h5`` under ``self.root`` and return the parsed entries."""
+        h5_file = p_join(self.root, 'gdml.h5')
+        return read_qc_archive_h5(
+            h5_file,
+            "gdml",
+            self.energy_target_names,
+            self.force_target_names,
+        )
+
+
+
+if __name__ == '__main__':
+    # Smoke test: instantiate the dataset and print the array shapes of a few
+    # randomly chosen entries.
+    for data_class in [GDML]:
+        data = data_class()
+        n = len(data)
+
+        # Sample at most 3 entries: requesting more than n indices without
+        # replacement makes np.random.choice raise ValueError.
+        for i in np.random.choice(n, min(3, n), replace=False):
+            x = data[i]
+            print(x.name, x.subset, end=' ')
+            for k in x:
+                if x[k] is not None:
+                    print(k, x[k].shape, end=' ')
+
+            print()
diff --git a/openqdc/datasets/iso_17.py b/openqdc/datasets/iso_17.py
@@ -0,0 +1,54 @@
+import os
+import numpy as np
+from os.path import join as p_join
+from openqdc.utils.constants import MAX_ATOMIC_NUMBER
+from openqdc.datasets.base import BaseDataset, read_qc_archive_h5
+
+
+class ISO17(BaseDataset):
+    """Dataset wrapper that loads ``iso_17.h5`` through ``read_qc_archive_h5``."""
+
+    __name__ = 'iso_17'
+
+    # Energies are expressed in hartree; the table is all zeros by default.
+    atomic_energies = np.zeros((MAX_ATOMIC_NUMBER,), dtype=np.float32)
+
+    # Method labels and the target keys they describe; each pair of lists has
+    # the same length and is presumably matched by position.
+    __energy_methods__ = ["pbe-ts"]
+    energy_target_names = ["PBE-TS Energy"]
+
+    __force_methods__ = ["pbe-ts"]
+    force_target_names = ["PBE-TS Gradient"]
+
+    def __init__(self) -> None:
+        super().__init__()
+
+    def read_raw_entries(self):
+        """Read ``iso_17.h5`` under ``self.root`` and return the parsed entries."""
+        h5_file = p_join(self.root, 'iso_17.h5')
+        return read_qc_archive_h5(
+            h5_file,
+            "iso_17",
+            self.energy_target_names,
+            self.force_target_names,
+        )
+
+
+
+if __name__ == '__main__':
+    # Smoke test: instantiate the dataset and print the array shapes of a few
+    # randomly chosen entries.
+    for data_class in [ISO17]:
+        data = data_class()
+        n = len(data)
+
+        # Sample at most 3 entries: requesting more than n indices without
+        # replacement makes np.random.choice raise ValueError.
+        for i in np.random.choice(n, min(3, n), replace=False):
+            x = data[i]
+            print(x.name, x.subset, end=' ')
+            for k in x:
+                if x[k] is not None:
+                    print(k, x[k].shape, end=' ')
+
+            print()
diff --git a/openqdc/datasets/orbnet_denali.py b/openqdc/datasets/orbnet_denali.py
@@ -0,0 +1,85 @@
+import os
+import glob
+import torch
+import pickle as pkl
+import numpy as np
+import pandas as pd
+from tqdm import tqdm
+import datamol as dm
+from sklearn.utils import Bunch
+from os.path import join as p_join
+from openqdc.utils import load_json
+from openqdc.utils.molecule import atom_table
+from openqdc.utils.io import get_local_cache
+from openqdc.utils.constants import MAX_ATOMIC_NUMBER
+from openqdc.datasets.base import BaseDataset
+
+
+def read_mol(mol_id, conf_dict, base_path, energy_target_names):
+    """Load the conformers of one molecule from their ``.xyz`` files.
+
+    Parameters
+    ----------
+    mol_id
+        Molecule identifier; also the directory name under
+        ``<base_path>/xyz_files``.
+    conf_dict
+        Mapping of conformer id to a label mapping. Each label mapping is
+        indexed with every name in ``energy_target_names`` and with
+        ``"subset"``.
+    base_path
+        Directory that contains the ``xyz_files`` folder.
+    energy_target_names
+        Label keys whose values become the columns of ``energies``.
+
+    Returns
+    -------
+    list of dict
+        One dict per conformer that could be read, with the keys
+        ``atomic_inputs``, ``name``, ``energies``, ``n_atoms`` and ``subset``.
+        A conformer that raises while being read is reported on stdout and
+        left out; it does not abort the molecule.
+    """
+    res = []
+    for conf_id, conf_label in conf_dict.items():
+        try:
+            cf_name = p_join(base_path, "xyz_files", mol_id, f"{conf_id}.xyz")
+            # ndmin=2 keeps a single-atom file as one row; without it loadtxt
+            # returns a 1-D array and the column slicing below fails.
+            d = np.loadtxt(cf_name, skiprows=2, dtype='str', ndmin=2)
+            if d.shape[0] == 0:
+                # Keep rejecting files with no atom records.
+                raise ValueError("no atom records found")
+            z, positions = d[:, 0], d[:, 1:].astype(np.float32)
+            z = np.array([atom_table.GetAtomicNumber(s) for s in z])
+            # Columns: atomic number, then a zero-filled column
+            # (presumably a charge placeholder -- TODO confirm).
+            xs = np.stack((z, np.zeros_like(z)), axis=-1)
+
+            conf = dict(
+                atomic_inputs=np.concatenate((xs, positions), axis=-1, dtype=np.float32),
+                name=np.array([mol_id]),
+                energies=np.array([conf_label[k] for k in energy_target_names], dtype=np.float32)[None, :],
+                n_atoms=np.array([positions.shape[0]], dtype=np.int32),
+                subset=np.array([conf_label["subset"]]),
+            )
+            res.append(conf)
+        except Exception as e:
+            # Deliberately best-effort: one unreadable conformer must not
+            # discard the rest of the molecule.
+            print(f'Skipping: {mol_id} {conf_id} due to {e}')
+
+    return res
+
+
+class OrbnetDenali(BaseDataset):
+    """OrbNet Denali dataset built from ``denali_labels.csv`` and per-conformer xyz files."""
+
+    __name__ = 'orbnet_denali'
+    __energy_methods__ = ["wb97x-d3_tz", "gfn1_xtb"]
+
+    # CSV columns that supply the energies; same length as __energy_methods__.
+    energy_target_names = ['dft_energy', 'xtb1_energy']
+
+    # Energy in hartree, all zeros by default
+    atomic_energies = np.zeros((MAX_ATOMIC_NUMBER,), dtype=np.float32)
+
+    def __init__(self) -> None:
+        super().__init__()
+
+    def read_raw_entries(self):
+        """Read the label table and load every conformer it lists.
+
+        Returns
+        -------
+        list of dict
+            The conformer dicts produced by ``read_mol`` for every molecule,
+            flattened into a single list.
+        """
+        label_path = p_join(self.root, 'denali_labels.csv')
+        df = pd.read_csv(label_path, usecols=['sample_id', 'mol_id', 'subset', 'dft_energy', 'xtb1_energy'])
+        # {mol_id: {sample_id: {column: value}}}, keeping the first row of any
+        # repeated sample_id within a molecule.
+        labels = {
+            mol_id: group.drop(['mol_id'], axis=1).drop_duplicates("sample_id").set_index("sample_id").to_dict('index')
+            for mol_id, group in df.groupby('mol_id')
+        }
+
+        def fn(item):
+            # item is one (mol_id, conformer labels) pair from ``labels``.
+            return read_mol(item[0], item[1], self.root, self.energy_target_names)
+
+        res = dm.parallelized(fn, list(labels.items()), scheduler='threads', n_jobs=-1, progress=True)
+        # Flatten in a single pass; sum(res, []) copies the growing
+        # accumulator once per molecule.
+        samples = [conf for mol_confs in res for conf in mol_confs]
+        return samples
+
+
+if __name__ == '__main__':
+    # Smoke test: instantiate the dataset and print the array shapes of a few
+    # randomly chosen entries.
+    for data_class in [OrbnetDenali]:
+        data = data_class()
+        n = len(data)
+
+        # Sample at most 3 entries: requesting more than n indices without
+        # replacement makes np.random.choice raise ValueError.
+        for i in np.random.choice(n, min(3, n), replace=False):
+            x = data[i]
+            print(x.name, x.subset, end=' ')
+            for k in x:
+                if x[k] is not None:
+                    print(k, x[k].shape, end=' ')
+
+            print()
diff --git a/openqdc/datasets/qm7x.py b/openqdc/datasets/qm7x.py
@@ -0,0 +1,54 @@
+import os
+import numpy as np
+from os.path import join as p_join
+from openqdc.utils.constants import MAX_ATOMIC_NUMBER
+from openqdc.datasets.base import BaseDataset, read_qc_archive_h5
+
+
+class ISO17(BaseDataset):
+    """Dataset wrapper that loads ``iso_17.h5`` through ``read_qc_archive_h5``.
+
+    NOTE(review): this class is added in ``qm7x.py`` yet its class name,
+    ``__name__``, file name and method/target names are all the ISO17 ones.
+    It looks like a placeholder copy that still has to be adapted to QM7-X --
+    confirm before relying on it.
+    """
+
+    __name__ = 'iso_17'
+
+    # Energies are expressed in hartree; the table is all zeros by default.
+    atomic_energies = np.zeros((MAX_ATOMIC_NUMBER,), dtype=np.float32)
+
+    # Method labels and the target keys they describe; each pair of lists has
+    # the same length and is presumably matched by position.
+    __energy_methods__ = ["pbe-ts"]
+    energy_target_names = ["PBE-TS Energy"]
+
+    __force_methods__ = ["pbe-ts"]
+    force_target_names = ["PBE-TS Gradient"]
+
+    def __init__(self) -> None:
+        super().__init__()
+
+    def read_raw_entries(self):
+        """Read ``iso_17.h5`` under ``self.root`` and return the parsed entries."""
+        h5_file = p_join(self.root, 'iso_17.h5')
+        return read_qc_archive_h5(
+            h5_file,
+            "iso_17",
+            self.energy_target_names,
+            self.force_target_names,
+        )
+
+
+
+if __name__ == '__main__':
+    # Smoke test: instantiate the dataset and print the array shapes of a few
+    # randomly chosen entries.
+    for data_class in [ISO17]:
+        data = data_class()
+        n = len(data)
+
+        # Sample at most 3 entries: requesting more than n indices without
+        # replacement makes np.random.choice raise ValueError.
+        for i in np.random.choice(n, min(3, n), replace=False):
+            x = data[i]
+            print(x.name, x.subset, end=' ')
+            for k in x:
+                if x[k] is not None:
+                    print(k, x[k].shape, end=' ')
+
+            print()
diff --git a/openqdc/datasets/sn2_rxn.py b/openqdc/datasets/sn2_rxn.py
@@ -0,0 +1,54 @@
+import os
+import numpy as np
+from os.path import join as p_join
+from openqdc.utils.constants import MAX_ATOMIC_NUMBER
+from openqdc.datasets.base import BaseDataset, read_qc_archive_h5
+
+
+class SN2RXN(BaseDataset):
+    """SN2 reactions dataset, loaded from ``sn2_rxn.h5`` via ``read_qc_archive_h5``.
+
+    The dataset name, raw file name and subset tag were previously all
+    ``iso_17``, so this class was indistinguishable from the ISO17 dataset.
+
+    NOTE(review): the method labels and target names below are still the
+    PBE-TS ones used by ISO17; verify them against the keys actually stored
+    in ``sn2_rxn.h5``.
+    """
+
+    __name__ = 'sn2_rxn'
+
+    # Energy in hartree, all zeros by default
+    atomic_energies = np.zeros((MAX_ATOMIC_NUMBER,), dtype=np.float32)
+
+    __energy_methods__ = [
+        "pbe-ts",
+    ]
+
+    energy_target_names = [
+        "PBE-TS Energy",
+    ]
+
+    __force_methods__ = [
+        "pbe-ts",
+    ]
+
+    force_target_names = [
+        "PBE-TS Gradient",
+    ]
+
+    def __init__(self) -> None:
+        super().__init__()
+
+    def read_raw_entries(self):
+        """Read ``sn2_rxn.h5`` under ``self.root`` and return the parsed entries."""
+        raw_path = p_join(self.root, 'sn2_rxn.h5')
+        samples = read_qc_archive_h5(
+            raw_path, "sn2_rxn", self.energy_target_names, self.force_target_names
+        )
+
+        return samples
+
+
+
+if __name__ == '__main__':
+    # Smoke test: instantiate the dataset and print the array shapes of a few
+    # randomly chosen entries.
+    for data_class in [SN2RXN]:
+        data = data_class()
+        n = len(data)
+
+        # Sample at most 3 entries: requesting more than n indices without
+        # replacement makes np.random.choice raise ValueError.
+        for i in np.random.choice(n, min(3, n), replace=False):
+            x = data[i]
+            print(x.name, x.subset, end=' ')
+            for k in x:
+                if x[k] is not None:
+                    print(k, x[k].shape, end=' ')
+
+            print()
diff --git a/openqdc/datasets/solvated_peptides.py b/openqdc/datasets/solvated_peptides.py
diff --git a/openqdc/datasets/waterclusters3_30.py b/openqdc/datasets/waterclusters3_30.py