Skip to content

Commit

Permalink
new datasets added
Browse files Browse the repository at this point in the history
  • Loading branch information
prtos committed Sep 24, 2023
1 parent 76fec8c commit 0f21262
Show file tree
Hide file tree
Showing 8 changed files with 381 additions and 0 deletions.
72 changes: 72 additions & 0 deletions openqdc/datasets/comp6.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
import numpy as np
from os.path import join as p_join
from openqdc.utils.constants import MAX_ATOMIC_NUMBER
from openqdc.datasets.base import BaseDataset, read_qc_archive_h5


class COMP6(BaseDataset):
    """Dataset definition named ``comp6``, read from one HDF5 archive per subset.

    ``__energy_methods__`` / ``energy_target_names`` and
    ``__force_methods__`` / ``force_target_names`` are kept in matching order:
    the first list holds the method label, the second the column name that is
    passed to ``read_qc_archive_h5``.
    """

    __name__ = 'comp6'

    # Energy in hartree, all zeros by default
    atomic_energies = np.zeros((MAX_ATOMIC_NUMBER,), dtype=np.float32)

    __energy_methods__ = [
        "wb97x_6-31g*",
        "b3lyp-d3m(bj)_tz",
        "b3lyp_tz",
        "hf_tz",
        # Basis suffix fixed from "_dz": the column at the same position is a
        # def2-tzvp one, like every other "_tz" entry in this list.
        # NOTE(review): the column is named "PBE-D3M(BJ)" while this label says
        # "d3(bj)" - confirm which dispersion variant is intended.
        "pbe-d3(bj)_tz",
        "pbe_tz",
        # Spelling fixed from "svwm": the column at the same position is SVWN.
        "svwn_tz",
        "wb97m-d3(bj)_tz",
        "wb97m_tz",
    ]

    energy_target_names = [
        "Energy",
        "B3LYP-D3M(BJ):def2-tzvp",
        "B3LYP:def2-tzvp",
        "HF:def2-tzvp",
        "PBE-D3M(BJ):def2-tzvp",
        "PBE:def2-tzvp",
        "SVWN:def2-tzvp",
        "WB97M-D3(BJ):def2-tzvp",
        "WB97M:def2-tzvp",
    ]

    __force_methods__ = [
        "wb97x_6-31g*",
    ]

    force_target_names = [
        "Gradient",
    ]

    def __init__(self) -> None:
        super().__init__()

    def read_raw_entries(self):
        """Read every subset archive under ``self.root`` and return all samples.

        Each subset ``<name>`` is loaded from ``<self.root>/<name>.h5`` and the
        per-subset results are concatenated in the order listed below.
        """
        samples = []
        for subset in ["ani_md", "drugbank", "gdb7_9", "gdb10_13", "s66x8", "tripeptides"]:
            raw_path = p_join(self.root, f'{subset}.h5')
            samples += read_qc_archive_h5(raw_path, subset, self.energy_target_names,
                                          self.force_target_names)

        return samples



if __name__ == '__main__':
    # Smoke test: build each dataset, draw three distinct random samples and
    # print, for every field that is not None, the field name and its shape.
    for dataset_cls in (COMP6,):
        dataset = dataset_cls()
        total = len(dataset)
        chosen = np.random.choice(total, 3, replace=False)

        for idx in chosen:
            sample = dataset[idx]
            print(sample.name, sample.subset, end=' ')
            for key in sample:
                if sample[key] is None:
                    continue
                print(key, sample[key].shape, end=' ')

            # Terminate the line that the prints above left open.
            print()
62 changes: 62 additions & 0 deletions openqdc/datasets/gdml.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
import os
import numpy as np
from os.path import join as p_join
from openqdc.utils.constants import MAX_ATOMIC_NUMBER
from openqdc.datasets.base import BaseDataset, read_qc_archive_h5


class GDML(BaseDataset):
    """Dataset definition named ``gdml``, read from a single ``gdml.h5`` archive.

    The method-label lists and the target-name lists are kept in matching
    order; the target names are what gets passed to ``read_qc_archive_h5``.
    """

    __name__ = "gdml"

    # Atomic energies in hartree; zero-filled by default.
    atomic_energies = np.zeros((MAX_ATOMIC_NUMBER,), dtype=np.float32)

    __energy_methods__ = ["ccsd", "ccsd(t)", "pbe-ts"]
    energy_target_names = ["CCSD Energy", "CCSD(T) Energy", "PBE-TS Energy"]

    __force_methods__ = ["ccsd", "ccsd(t)", "pbe-ts"]
    force_target_names = ["CCSD Gradient", "CCSD(T) Gradient", "PBE-TS Gradient"]

    def __init__(self) -> None:
        super().__init__()

    def read_raw_entries(self):
        """Return the samples read from ``<self.root>/gdml.h5``."""
        archive_path = p_join(self.root, "gdml.h5")
        return read_qc_archive_h5(
            archive_path,
            "gdml",
            self.energy_target_names,
            self.force_target_names,
        )



if __name__ == '__main__':
    # Smoke test: build each dataset, draw three distinct random samples and
    # print, for every field that is not None, the field name and its shape.
    for dataset_cls in (GDML,):
        dataset = dataset_cls()
        total = len(dataset)
        chosen = np.random.choice(total, 3, replace=False)

        for idx in chosen:
            sample = dataset[idx]
            print(sample.name, sample.subset, end=' ')
            for key in sample:
                if sample[key] is None:
                    continue
                print(key, sample[key].shape, end=' ')

            # Terminate the line that the prints above left open.
            print()
54 changes: 54 additions & 0 deletions openqdc/datasets/iso_17.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
import os
import numpy as np
from os.path import join as p_join
from openqdc.utils.constants import MAX_ATOMIC_NUMBER
from openqdc.datasets.base import BaseDataset, read_qc_archive_h5


class ISO17(BaseDataset):
    """Dataset definition named ``iso_17``, read from a single ``iso_17.h5`` archive.

    One energy target and one force target are declared; the target names are
    what gets passed to ``read_qc_archive_h5``.
    """

    __name__ = "iso_17"

    # Atomic energies in hartree; zero-filled by default.
    atomic_energies = np.zeros((MAX_ATOMIC_NUMBER,), dtype=np.float32)

    __energy_methods__ = ["pbe-ts"]
    energy_target_names = ["PBE-TS Energy"]

    __force_methods__ = ["pbe-ts"]
    force_target_names = ["PBE-TS Gradient"]

    def __init__(self) -> None:
        super().__init__()

    def read_raw_entries(self):
        """Return the samples read from ``<self.root>/iso_17.h5``."""
        archive_path = p_join(self.root, "iso_17.h5")
        return read_qc_archive_h5(
            archive_path,
            "iso_17",
            self.energy_target_names,
            self.force_target_names,
        )



if __name__ == '__main__':
    # Smoke test: build each dataset, draw three distinct random samples and
    # print, for every field that is not None, the field name and its shape.
    for dataset_cls in (ISO17,):
        dataset = dataset_cls()
        total = len(dataset)
        chosen = np.random.choice(total, 3, replace=False)

        for idx in chosen:
            sample = dataset[idx]
            print(sample.name, sample.subset, end=' ')
            for key in sample:
                if sample[key] is None:
                    continue
                print(key, sample[key].shape, end=' ')

            # Terminate the line that the prints above left open.
            print()
85 changes: 85 additions & 0 deletions openqdc/datasets/orbnet_denali.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
import os
import glob
import torch
import pickle as pkl
import numpy as np
import pandas as pd
from tqdm import tqdm
import datamol as dm
from sklearn.utils import Bunch
from os.path import join as p_join
from openqdc.utils import load_json
from openqdc.utils.molecule import atom_table
from openqdc.utils.io import get_local_cache
from openqdc.utils.constants import MAX_ATOMIC_NUMBER
from openqdc.datasets.base import BaseDataset


def read_mol(mol_id, conf_dict, base_path, energy_target_names):
    """Load every conformer of one molecule from its ``.xyz`` files.

    Args:
        mol_id: Molecule identifier; also the name of the directory under
            ``<base_path>/xyz_files`` that holds the conformer files.
        conf_dict: Mapping of conformer id -> label dict. Each label dict must
            provide a ``"subset"`` entry and one entry per name in
            ``energy_target_names``.
        base_path: Directory that contains the ``xyz_files`` folder.
        energy_target_names: Label keys to collect, in output column order.

    Returns:
        A list with one dict per conformer that could be read. Each dict holds
        ``atomic_inputs`` (float32, one row per atom: atomic number, a
        zero-filled column, then the coordinate columns of the file),
        ``name``, ``energies`` (float32, shape
        ``(1, len(energy_target_names))``), ``n_atoms`` (int32) and ``subset``.
        Conformers that fail to load are reported on stdout and left out.
    """
    res = []
    for conf_id, conf_label in conf_dict.items():
        try:
            cf_name = p_join(base_path, "xyz_files", mol_id, f"{conf_id}.xyz")
            # ndmin=2 keeps a one-atom file as a one-row table. Without it
            # loadtxt collapses the single row to 1-D, the column slicing
            # below raises, and the conformer is dropped.
            d = np.loadtxt(cf_name, skiprows=2, dtype='str', ndmin=2)
            if d.size == 0:
                # Keep skipping files without atom records instead of emitting
                # a zero-atom conformer.
                raise ValueError("no atom records found")
            z, positions = d[:, 0], d[:, 1:].astype(np.float32)
            z = np.array([atom_table.GetAtomicNumber(s) for s in z])
            xs = np.stack((z, np.zeros_like(z)), axis=-1)

            conf = dict(
                atomic_inputs=np.concatenate((xs, positions), axis=-1, dtype=np.float32),
                name=np.array([mol_id]),
                energies=np.array([conf_label[k] for k in energy_target_names], dtype=np.float32)[None, :],
                n_atoms=np.array([positions.shape[0]], dtype=np.int32),
                subset=np.array([conf_label["subset"]]),
            )
            res.append(conf)
        except Exception as e:
            # Deliberately best-effort: one unreadable conformer must not
            # abort the whole import, so report it and carry on.
            print (f'Skipping: {mol_id} {conf_id} due to {e}')

    return res


class OrbnetDenali(BaseDataset):
    """Dataset definition named ``orbnet_denali``.

    Labels come from ``denali_labels.csv`` under ``self.root``; geometries are
    read by ``read_mol`` from the ``xyz_files`` folder under the same root.
    """

    __name__ = 'orbnet_denali'
    __energy_methods__ = ["wb97x-d3_tz", "gfn1_xtb"]

    energy_target_names = ['dft_energy', 'xtb1_energy']

    # Energy in hartree, all zeros by default
    atomic_energies = np.zeros((MAX_ATOMIC_NUMBER,), dtype=np.float32)

    def __init__(self) -> None:
        super().__init__()

    def read_raw_entries(self):
        """Read the label table and all conformer files; return one flat list.

        Returns:
            The conformer dicts produced by ``read_mol`` for every molecule in
            the label table, concatenated in the order the per-molecule
            results are returned by ``dm.parallelized``.
        """
        label_path = p_join(self.root, 'denali_labels.csv')
        # Load only the columns that are used: the ids, the subset tag and
        # the energy targets.
        wanted_cols = ['sample_id', 'mol_id', 'subset'] + list(self.energy_target_names)
        df = pd.read_csv(label_path, usecols=wanted_cols)

        # mol_id -> {sample_id -> {"subset": ..., <energy columns>...}},
        # keeping the first row for any repeated sample_id.
        labels = {
            mol_id: group.drop(['mol_id'], axis=1)
            .drop_duplicates("sample_id")
            .set_index("sample_id")
            .to_dict('index')
            for mol_id, group in df.groupby('mol_id')
        }

        def fn(item):
            mol_id, conf_dict = item
            return read_mol(mol_id, conf_dict, self.root, self.energy_target_names)

        res = dm.parallelized(fn, list(labels.items()), scheduler='threads', n_jobs=-1, progress=True)

        # Flatten the per-molecule lists in one pass; sum(res, []) would copy
        # the growing result list once per molecule (quadratic time).
        samples = [conf for mol_confs in res for conf in mol_confs]
        return samples


if __name__ == '__main__':
    # Smoke test: build each dataset, draw three distinct random samples and
    # print, for every field that is not None, the field name and its shape.
    for dataset_cls in (OrbnetDenali,):
        dataset = dataset_cls()
        total = len(dataset)
        chosen = np.random.choice(total, 3, replace=False)

        for idx in chosen:
            sample = dataset[idx]
            print(sample.name, sample.subset, end=' ')
            for key in sample:
                if sample[key] is None:
                    continue
                print(key, sample[key].shape, end=' ')

            # Terminate the line that the prints above left open.
            print()
54 changes: 54 additions & 0 deletions openqdc/datasets/qm7x.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
import os
import numpy as np
from os.path import join as p_join
from openqdc.utils.constants import MAX_ATOMIC_NUMBER
from openqdc.datasets.base import BaseDataset, read_qc_archive_h5


# NOTE(review): this module is qm7x.py, yet the class name, dataset name,
# method/target lists and archive file below are all the ISO17 ones, identical
# to the definition in iso_17.py. This looks like an unfinished copy - confirm
# the intended QM7-X definition before relying on this class.
class ISO17(BaseDataset):
    """Dataset definition named ``iso_17``, read from a single ``iso_17.h5`` archive.

    One energy target and one force target are declared; the target names are
    what gets passed to ``read_qc_archive_h5``.
    """

    __name__ = "iso_17"

    # Atomic energies in hartree; zero-filled by default.
    atomic_energies = np.zeros((MAX_ATOMIC_NUMBER,), dtype=np.float32)

    __energy_methods__ = ["pbe-ts"]
    energy_target_names = ["PBE-TS Energy"]

    __force_methods__ = ["pbe-ts"]
    force_target_names = ["PBE-TS Gradient"]

    def __init__(self) -> None:
        super().__init__()

    def read_raw_entries(self):
        """Return the samples read from ``<self.root>/iso_17.h5``."""
        archive_path = p_join(self.root, "iso_17.h5")
        return read_qc_archive_h5(
            archive_path,
            "iso_17",
            self.energy_target_names,
            self.force_target_names,
        )



if __name__ == '__main__':
    # Smoke test: build each dataset, draw three distinct random samples and
    # print, for every field that is not None, the field name and its shape.
    for dataset_cls in (ISO17,):
        dataset = dataset_cls()
        total = len(dataset)
        chosen = np.random.choice(total, 3, replace=False)

        for idx in chosen:
            sample = dataset[idx]
            print(sample.name, sample.subset, end=' ')
            for key in sample:
                if sample[key] is None:
                    continue
                print(key, sample[key].shape, end=' ')

            # Terminate the line that the prints above left open.
            print()
54 changes: 54 additions & 0 deletions openqdc/datasets/sn2_rxn.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
import os
import numpy as np
from os.path import join as p_join
from openqdc.utils.constants import MAX_ATOMIC_NUMBER
from openqdc.datasets.base import BaseDataset, read_qc_archive_h5


class SN2RXN(BaseDataset):
    """Dataset definition named ``sn2_rxn``, read from a single ``sn2_rxn.h5`` archive.

    The dataset name, archive file and subset tag previously all said
    ``iso_17`` (copied from the ISO17 definition), which made this class load
    and label the ISO17 data instead of its own.
    """

    __name__ = 'sn2_rxn'

    # Energy in hartree, all zeros by default
    atomic_energies = np.zeros((MAX_ATOMIC_NUMBER,), dtype=np.float32)

    # NOTE(review): the four lists below are identical to the ISO17 ones and
    # were probably copied along with the rest - confirm the method labels and
    # column names against the contents of sn2_rxn.h5.
    __energy_methods__ = [
        "pbe-ts",
    ]

    energy_target_names = [
        "PBE-TS Energy",
    ]

    __force_methods__ = [
        "pbe-ts",
    ]

    force_target_names = [
        "PBE-TS Gradient",
    ]

    def __init__(self) -> None:
        super().__init__()

    def read_raw_entries(self):
        """Return the samples read from ``<self.root>/sn2_rxn.h5``."""
        raw_path = p_join(self.root, 'sn2_rxn.h5')
        samples = read_qc_archive_h5(raw_path, "sn2_rxn", self.energy_target_names,
                                     self.force_target_names)

        return samples



if __name__ == '__main__':
    # Smoke test: build each dataset, draw three distinct random samples and
    # print, for every field that is not None, the field name and its shape.
    for dataset_cls in (SN2RXN,):
        dataset = dataset_cls()
        total = len(dataset)
        chosen = np.random.choice(total, 3, replace=False)

        for idx in chosen:
            sample = dataset[idx]
            print(sample.name, sample.subset, end=' ')
            for key in sample:
                if sample[key] is None:
                    continue
                print(key, sample[key].shape, end=' ')

            # Terminate the line that the prints above left open.
            print()
Empty file.
Empty file.

0 comments on commit 0f21262

Please sign in to comment.