diff --git a/.github/SECURITY.md b/.github/SECURITY.md index c9d0753..128d21b 100644 --- a/.github/SECURITY.md +++ b/.github/SECURITY.md @@ -1,3 +1,3 @@ # Security Policy -Please report any security-related issues directly to prudencio@valencediscovery.com. +Please report any security-related issues directly to prudencio@valencediscovery.com. diff --git a/.github/workflows/pre-commit-ci.yml b/.github/workflows/pre-commit-ci.yml new file mode 100644 index 0000000..b597281 --- /dev/null +++ b/.github/workflows/pre-commit-ci.yml @@ -0,0 +1,15 @@ +# Based on https://github.com/pre-commit/action +name: pre-commit + +on: + pull_request: + push: + branches: [main] + +jobs: + pre-commit: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - uses: actions/setup-python@v3 + - uses: pre-commit/action@v3.0.0 diff --git a/.gitignore b/.gitignore index bc693a8..c1eefd0 100644 --- a/.gitignore +++ b/.gitignore @@ -140,10 +140,9 @@ cache/ *.hdf5 nohup.out *.out -*.crt +*.crt *.key *.dat *.xyz *.csv *.txt - diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..a6b322d --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,26 @@ +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.4.0 + hooks: + - id: end-of-file-fixer + - id: trailing-whitespace + - id: check-yaml + - id: check-toml + - id: check-json + - id: check-merge-conflict + - id: requirements-txt-fixer + - id: detect-private-key + - repo: https://github.com/psf/black + rev: 23.3.0 + hooks: + - id: black + - repo: https://github.com/pycqa/isort + rev: 5.12.0 + hooks: + - id: isort + args: ["--profile", "black"] + - repo: https://github.com/charliermarsh/ruff-pre-commit + # Ruff version. + rev: 'v0.0.241' + hooks: + - id: ruff diff --git a/env.yml b/env.yml index 8eb195c..932af25 100644 --- a/env.yml +++ b/env.yml @@ -64,6 +64,6 @@ dependencies: - ruff - ipykernel - pydantic <= 2.0 - + - pip: - torch-nl diff --git a/openqdc/datasets/ani.py b/openqdc/datasets/ani.py index 3683bc6..f0b3335 100644 --- a/openqdc/datasets/ani.py +++ b/openqdc/datasets/ani.py @@ -1,14 +1,15 @@ import os -import numpy as np from os.path import join as p_join -from openqdc.utils.constants import MAX_ATOMIC_NUMBER + +import numpy as np + from openqdc.datasets.base import BaseDataset, read_qc_archive_h5 +from openqdc.utils.constants import MAX_ATOMIC_NUMBER from openqdc.utils.io import get_local_cache class ANI1(BaseDataset): - __name__ = 'ani1' - + __name__ = "ani1" # Energy in hartree, all zeros by default atomic_energies = np.zeros((MAX_ATOMIC_NUMBER,), dtype=np.float32) @@ -21,29 +22,27 @@ class ANI1(BaseDataset): "ωB97x:6-31G(d) Energy", ] - def __init__(self) -> None: super().__init__() @property def root(self): - return p_join(get_local_cache(), 'ani') - + return p_join(get_local_cache(), "ani") + @property def preprocess_path(self): - path = p_join(self.root, 'preprocessed', self.__name__) + path = p_join(self.root, "preprocessed", self.__name__) os.makedirs(path, exist_ok=True) return path - + def read_raw_entries(self): - raw_path = p_join(self.root, f'{self.__name__}.h5') - samples = read_qc_archive_h5(raw_path, self.__name__, self.energy_target_names, - self.force_target_names) + raw_path = p_join(self.root, f"{self.__name__}.h5") + samples = read_qc_archive_h5(raw_path, self.__name__, self.energy_target_names, self.force_target_names) return samples class ANI1CCX(ANI1): - __name__ = 'ani1ccx' + __name__ = "ani1ccx" # Energy in hartree, all zeros by default atomic_energies = np.zeros((MAX_ATOMIC_NUMBER,), dtype=np.float32) @@ -67,10 +66,10 @@ class ANI1CCX(ANI1): def __init__(self) -> None: super().__init__() - + class ANI1X(ANI1): - __name__ = 'ani1x' + __name__ = "ani1x" # Energy in hartree, all zeros by default atomic_energies = np.zeros((MAX_ATOMIC_NUMBER,), dtype=np.float32) @@ -94,10 +93,10 @@ class ANI1X(ANI1): "MP2:cc-pVQZ Correlation Energy", "MP2:cc-pVTZ Correlation Energy", "wB97x:6-31G(d) Total Energy", - "wB97x:def2-TZVPP Total Energy" + "wB97x:def2-TZVPP Total Energy", ] - force_target_names = [ + force_target_names = [ "wB97x:6-31G(d) Atomic Forces", "wB97x:def2-TZVPP Atomic Forces", ] @@ -111,21 +110,21 @@ def __init__(self) -> None: super().__init__() -if __name__ == '__main__': +if __name__ == "__main__": for data_class in [ - ANI1, - # ANI1CCX, - # ANI1X - ]: + ANI1, + # ANI1CCX, + # ANI1X + ]: data = data_class() n = len(data) for i in np.random.choice(n, 3, replace=False): x = data[i] - print(x.name, x.subset, end=' ') + print(x.name, x.subset, end=" ") for k in x: if x[k] is not None: - print(k, x[k].shape, end=' ') - + print(k, x[k].shape, end=" ") + print() - exit() \ No newline at end of file + exit() diff --git a/openqdc/datasets/base.py b/openqdc/datasets/base.py index 954c026..254ff0d 100644 --- a/openqdc/datasets/base.py +++ b/openqdc/datasets/base.py @@ -1,27 +1,34 @@ import os -import torch -import numpy as np -import pickle as pkl -from tqdm import tqdm from os.path import join as p_join + +import numpy as np +import torch from sklearn.utils import Bunch -from openqdc.utils.io import get_local_cache, pull_locally, push_remote, load_hdf5_file, copy_exists +from tqdm import tqdm + +from openqdc.utils.constants import NB_ATOMIC_FEATURES +from openqdc.utils.io import ( + copy_exists, + get_local_cache, + load_hdf5_file, + pull_locally, + push_remote, +) from openqdc.utils.molecule import atom_table -from openqdc.utils.constants import BOHR2ANG, MAX_ATOMIC_NUMBER, NB_ATOMIC_FEATURES -def extract_entry(df, i, subset, energy_target_names, force_target_names=None): +def extract_entry(df, i, subset, energy_target_names, force_target_names=None): x = np.array([atom_table.GetAtomicNumber(s) for s in df["symbols"][i]]) xs = np.stack((x, np.zeros_like(x)), axis=-1) - positions= df["geometry"][i].reshape((-1, 3)) - energies= np.array([df[k][i] for k in energy_target_names]) - + positions = df["geometry"][i].reshape((-1, 3)) + energies = np.array([df[k][i] for k in energy_target_names]) + res = dict( - name= np.array([df["name"][i]]), - subset= np.array([subset]), - energies= energies.reshape((1, -1)).astype(np.float32), - atomic_inputs = np.concatenate((xs, positions), axis=-1, dtype=np.float32), - n_atoms = np.array([x.shape[0]], dtype=np.int32), + name=np.array([df["name"][i]]), + subset=np.array([subset]), + energies=energies.reshape((1, -1)).astype(np.float32), + atomic_inputs=np.concatenate((xs, positions), axis=-1, dtype=np.float32), + n_atoms=np.array([x.shape[0]], dtype=np.int32), ) if force_target_names is not None and len(force_target_names) > 0: forces = np.zeros((positions.shape[0], 3, len(force_target_names)), dtype=np.float32) @@ -47,8 +54,7 @@ def read_qc_archive_h5(raw_path, subset, energy_target_names, force_target_names # print('\n'*3) # exit() - samples = [extract_entry(data_t, i, subset, energy_target_names, force_target_names) - for i in tqdm(range(n))] + samples = [extract_entry(data_t, i, subset, energy_target_names, force_target_names) for i in tqdm(range(n))] return samples @@ -71,45 +77,44 @@ def __init__(self) -> None: @property def root(self): return p_join(get_local_cache(), self.__name__) - + @property def preprocess_path(self): - path = p_join(self.root, 'preprocessed') + path = p_join(self.root, "preprocessed") os.makedirs(path, exist_ok=True) return path - + @property def data_keys(self): keys = list(self.data_types.keys()) if len(self.__force_methods__) == 0: keys.remove("forces") return keys - + @property def data_types(self): return { - "atomic_inputs": np.float32, - "position_idx_range": np.int32, + "atomic_inputs": np.float32, + "position_idx_range": np.int32, "energies": np.float32, - "forces": np.float32 + "forces": np.float32, } - + @property def data_shapes(self): return { - "atomic_inputs": (-1, NB_ATOMIC_FEATURES), - "position_idx_range": (-1, 2), + "atomic_inputs": (-1, NB_ATOMIC_FEATURES), + "position_idx_range": (-1, 2), "energies": (-1, len(self.energy_target_names)), - "forces": (-1, 3, len(self.force_target_names)) + "forces": (-1, 3, len(self.force_target_names)), } - + def read_raw_entries(self): raise NotImplementedError - + def collate_list(self, list_entries): # concatenate entries - res = {key: np.concatenate([r[key] for r in list_entries if r is not None], axis=0) - for key in list_entries[0]} + res = {key: np.concatenate([r[key] for r in list_entries if r is not None], axis=0) for key in list_entries[0]} csum = np.cumsum(res.pop("n_atoms")) x = np.zeros((csum.shape[0], 2), dtype=np.int32) @@ -121,14 +126,11 @@ def save_preprocess(self, data_dict): # save memmaps for key in self.data_keys: local_path = p_join(self.preprocess_path, f"{key}.mmap") - out = np.memmap(local_path, - mode="w+", - dtype=data_dict[key].dtype, - shape=data_dict[key].shape) + out = np.memmap(local_path, mode="w+", dtype=data_dict[key].dtype, shape=data_dict[key].shape) out[:] = data_dict.pop(key)[:] out.flush() push_remote(local_path) - + # save smiles and subset for key in ["name", "subset"]: local_path = p_join(self.preprocess_path, f"{key}.npz") @@ -136,19 +138,20 @@ def save_preprocess(self, data_dict): with open(local_path, "wb") as f: np.savez_compressed(f, uniques=uniques, inv_indices=inv_indices) push_remote(local_path) - - def read_preprocess(self): + + def read_preprocess(self): self.data = {} for key in self.data_keys: filename = p_join(self.preprocess_path, f"{key}.mmap") pull_locally(filename) self.data[key] = np.memmap( - filename, mode='r', + filename, + mode="r", dtype=self.data_types[key], ).reshape(self.data_shapes[key]) - + for key in self.data: - print(f'Loaded {key} with shape {self.data[key].shape}, dtype {self.data[key].dtype}') + print(f"Loaded {key} with shape {self.data[key].shape}, dtype {self.data[key].dtype}") for key in ["name", "subset"]: filename = p_join(self.preprocess_path, f"{key}.npz") @@ -156,17 +159,15 @@ def read_preprocess(self): # with open(filename, "rb") as f: self.data[key] = np.load(open(filename, "rb")) for k in self.data[key]: - print(f'Loaded {key}_{k} with shape {self.data[key][k].shape}, dtype {self.data[key][k].dtype}') + print(f"Loaded {key}_{k} with shape {self.data[key][k].shape}, dtype {self.data[key][k].dtype}") def is_preprocessed(self): - predicats = [copy_exists(p_join(self.preprocess_path, f"{key}.mmap")) - for key in self.data_keys] - predicats += [copy_exists(p_join(self.preprocess_path, f"{x}.npz")) - for x in ["name", "subset"]] + predicats = [copy_exists(p_join(self.preprocess_path, f"{key}.mmap")) for key in self.data_keys] + predicats += [copy_exists(p_join(self.preprocess_path, f"{x}.npz")) for x in ["name", "subset"]] return all(predicats) def __len__(self): - return self.data['energies'].shape[0] + return self.data["energies"].shape[0] def __getitem__(self, idx: int): p_start, p_end = self.data["position_idx_range"][idx] @@ -190,5 +191,5 @@ def __getitem__(self, idx: int): energies=energies, name=name, subset=subset, - forces=forces + forces=forces, ) diff --git a/openqdc/datasets/comp6.py b/openqdc/datasets/comp6.py index cd3e41c..96811c9 100644 --- a/openqdc/datasets/comp6.py +++ b/openqdc/datasets/comp6.py @@ -1,12 +1,13 @@ -import numpy as np from os.path import join as p_join -from openqdc.utils.constants import MAX_ATOMIC_NUMBER + +import numpy as np + from openqdc.datasets.base import BaseDataset, read_qc_archive_h5 +from openqdc.utils.constants import MAX_ATOMIC_NUMBER class COMP6(BaseDataset): - __name__ = 'comp6' - + __name__ = "comp6" # Energy in hartree, all zeros by default atomic_energies = np.zeros((MAX_ATOMIC_NUMBER,), dtype=np.float32) @@ -40,33 +41,31 @@ class COMP6(BaseDataset): ] force_target_names = [ - "Gradient", + "Gradient", ] def __init__(self) -> None: super().__init__() - + def read_raw_entries(self): samples = [] for subset in ["ani_md", "drugbank", "gdb7_9", "gdb10_13", "s66x8", "tripeptides"]: - raw_path = p_join(self.root, f'{subset}.h5') - samples += read_qc_archive_h5(raw_path, subset, self.energy_target_names, - self.force_target_names) + raw_path = p_join(self.root, f"{subset}.h5") + samples += read_qc_archive_h5(raw_path, subset, self.energy_target_names, self.force_target_names) return samples - -if __name__ == '__main__': +if __name__ == "__main__": for data_class in [COMP6]: data = data_class() n = len(data) for i in np.random.choice(n, 3, replace=False): x = data[i] - print(x.name, x.subset, end=' ') + print(x.name, x.subset, end=" ") for k in x: if x[k] is not None: - print(k, x[k].shape, end=' ') - + print(k, x[k].shape, end=" ") + print() diff --git a/openqdc/datasets/gdml.py b/openqdc/datasets/gdml.py index b53586b..c0ca093 100644 --- a/openqdc/datasets/gdml.py +++ b/openqdc/datasets/gdml.py @@ -1,12 +1,13 @@ -import os -import numpy as np from os.path import join as p_join -from openqdc.utils.constants import MAX_ATOMIC_NUMBER + +import numpy as np + from openqdc.datasets.base import BaseDataset, read_qc_archive_h5 +from openqdc.utils.constants import MAX_ATOMIC_NUMBER class GDML(BaseDataset): - __name__ = 'gdml' + __name__ = "gdml" # Energy in hartree, all zeros by default atomic_energies = np.zeros((MAX_ATOMIC_NUMBER,), dtype=np.float32) @@ -37,26 +38,24 @@ class GDML(BaseDataset): def __init__(self) -> None: super().__init__() - + def read_raw_entries(self): - raw_path = p_join(self.root, f'gdml.h5') - samples = read_qc_archive_h5(raw_path, "gdml", self.energy_target_names, - self.force_target_names) + raw_path = p_join(self.root, "gdml.h5") + samples = read_qc_archive_h5(raw_path, "gdml", self.energy_target_names, self.force_target_names) return samples - -if __name__ == '__main__': +if __name__ == "__main__": for data_class in [GDML]: data = data_class() n = len(data) for i in np.random.choice(n, 3, replace=False): x = data[i] - print(x.name, x.subset, end=' ') + print(x.name, x.subset, end=" ") for k in x: if x[k] is not None: - print(k, x[k].shape, end=' ') - + print(k, x[k].shape, end=" ") + print() diff --git a/openqdc/datasets/geom.py b/openqdc/datasets/geom.py index c489f58..c2f895a 100644 --- a/openqdc/datasets/geom.py +++ b/openqdc/datasets/geom.py @@ -1,15 +1,16 @@ +from os.path import join as p_join -import numpy as np import datamol as dm -from os.path import join as p_join -from openqdc.utils import load_pkl, load_json -from openqdc.utils.molecule import get_atomic_numuber_and_charge -from openqdc.utils.constants import MAX_ATOMIC_NUMBER +import numpy as np + from openqdc.datasets.base import BaseDataset +from openqdc.utils import load_json, load_pkl +from openqdc.utils.constants import MAX_ATOMIC_NUMBER +from openqdc.utils.molecule import get_atomic_numuber_and_charge def read_mol(mol_id, mol_dict, base_path, partition): - """ Read molecule from pickle file and return dict with conformers and energies + """Read molecule from pickle file and return dict with conformers and energies Parameters ---------- @@ -31,31 +32,31 @@ def read_mol(mol_id, mol_dict, base_path, partition): """ try: - d = load_pkl(p_join(base_path, mol_dict['pickle_path']), False) - confs = d['conformers'] - x = get_atomic_numuber_and_charge(confs[0]['rd_mol']) - positions = np.array([cf['rd_mol'].GetConformer().GetPositions() for cf in confs]) + d = load_pkl(p_join(base_path, mol_dict["pickle_path"]), False) + confs = d["conformers"] + x = get_atomic_numuber_and_charge(confs[0]["rd_mol"]) + positions = np.array([cf["rd_mol"].GetConformer().GetPositions() for cf in confs]) n_confs = positions.shape[0] res = dict( - atomic_inputs = np.concatenate(( - x[None, ...].repeat(n_confs, axis=0), - positions), axis=-1, dtype=np.float32).reshape(-1, 5), - name = np.array([d['smiles'] for _ in confs]), - energies = np.array([cf['totalenergy'] for cf in confs], dtype=np.float32)[:, None], - n_atoms = np.array([positions.shape[1]] * n_confs, dtype=np.int32), - subset = np.array([partition] * n_confs), + atomic_inputs=np.concatenate( + (x[None, ...].repeat(n_confs, axis=0), positions), axis=-1, dtype=np.float32 + ).reshape(-1, 5), + name=np.array([d["smiles"] for _ in confs]), + energies=np.array([cf["totalenergy"] for cf in confs], dtype=np.float32)[:, None], + n_atoms=np.array([positions.shape[1]] * n_confs, dtype=np.int32), + subset=np.array([partition] * n_confs), ) except Exception as e: - print (f'Skipping: {mol_id} due to {e}') + print(f"Skipping: {mol_id} due to {e}") res = None return res class GEOM(BaseDataset): - __name__ = 'geom' + __name__ = "geom" __energy_methods__ = ["gfn2_xtb"] energy_target_names = ["gfn2_xtb.energy"] @@ -64,36 +65,36 @@ class GEOM(BaseDataset): # Energy in hartree, all zeros by default atomic_energies = np.zeros((MAX_ATOMIC_NUMBER,), dtype=np.float32) - partitions = ['qm9', 'drugs'] + partitions = ["qm9", "drugs"] def __init__(self) -> None: super().__init__() def _read_raw_(self, partition): - raw_path = p_join(self.root, 'rdkit_folder') + raw_path = p_join(self.root, "rdkit_folder") - mols = load_json(p_join(raw_path, f'summary_{partition}.json')) + mols = load_json(p_join(raw_path, f"summary_{partition}.json")) mols = list(mols.items()) - fn = lambda x: read_mol(x[0], x[1], raw_path, partition) - samples = dm.parallelized(fn, mols, n_jobs=1, progress=True) # don't use more than 1 job + fn = lambda x: read_mol(x[0], x[1], raw_path, partition) # noqa E731 + samples = dm.parallelized(fn, mols, n_jobs=1, progress=True) # don't use more than 1 job return samples - + def read_raw_entries(self): samples = sum([self._read_raw_(partition) for partition in self.partitions], []) return samples -if __name__ == '__main__': +if __name__ == "__main__": for data_class in [GEOM]: data = data_class() n = len(data) for i in np.random.choice(n, 3, replace=False): x = data[i] - print(x.name, x.subset, end=' ') + print(x.name, x.subset, end=" ") for k in x: if x[k] is not None: - print(k, x[k].shape, end=' ') - - print() \ No newline at end of file + print(k, x[k].shape, end=" ") + + print() diff --git a/openqdc/datasets/iso_17.py b/openqdc/datasets/iso_17.py index 4e86637..55f395c 100644 --- a/openqdc/datasets/iso_17.py +++ b/openqdc/datasets/iso_17.py @@ -1,12 +1,13 @@ -import os -import numpy as np from os.path import join as p_join -from openqdc.utils.constants import MAX_ATOMIC_NUMBER + +import numpy as np + from openqdc.datasets.base import BaseDataset, read_qc_archive_h5 +from openqdc.utils.constants import MAX_ATOMIC_NUMBER class ISO17(BaseDataset): - __name__ = 'iso_17' + __name__ = "iso_17" # Energy in hartree, all zeros by default atomic_energies = np.zeros((MAX_ATOMIC_NUMBER,), dtype=np.float32) @@ -29,26 +30,24 @@ class ISO17(BaseDataset): def __init__(self) -> None: super().__init__() - + def read_raw_entries(self): - raw_path = p_join(self.root, f'iso_17.h5') - samples = read_qc_archive_h5(raw_path, "iso_17", self.energy_target_names, - self.force_target_names) + raw_path = p_join(self.root, "iso_17.h5") + samples = read_qc_archive_h5(raw_path, "iso_17", self.energy_target_names, self.force_target_names) return samples - -if __name__ == '__main__': +if __name__ == "__main__": for data_class in [ISO17]: data = data_class() n = len(data) for i in np.random.choice(n, 3, replace=False): x = data[i] - print(x.name, x.subset, end=' ') + print(x.name, x.subset, end=" ") for k in x: if x[k] is not None: - print(k, x[k].shape, end=' ') - + print(k, x[k].shape, end=" ") + print() diff --git a/openqdc/datasets/molecule3d.py b/openqdc/datasets/molecule3d.py index da15cd9..ac4f348 100644 --- a/openqdc/datasets/molecule3d.py +++ b/openqdc/datasets/molecule3d.py @@ -1,41 +1,41 @@ +from glob import glob +from os.path import join as p_join +import datamol as dm import numpy as np import pandas as pd -import datamol as dm -from tqdm import tqdm -from glob import glob from rdkit import Chem -from os.path import join as p_join -from openqdc.utils.molecule import get_atomic_numuber_and_charge -from openqdc.utils.constants import BOHR2ANG, MAX_ATOMIC_NUMBER +from tqdm import tqdm + from openqdc.datasets.base import BaseDataset +from openqdc.utils.constants import BOHR2ANG, MAX_ATOMIC_NUMBER +from openqdc.utils.molecule import get_atomic_numuber_and_charge def read_mol(mol, energy): smiles = dm.to_smiles(mol, explicit_hs=False) # subset = dm.to_smiles(dm.to_scaffold_murcko(mol, make_generic=True), explicit_hs=False) x = get_atomic_numuber_and_charge(mol) - positions= mol.GetConformer().GetPositions() * BOHR2ANG - + positions = mol.GetConformer().GetPositions() * BOHR2ANG + res = dict( - name= np.array([smiles]), - subset= np.array(["molecule3d"]), - energies= np.array([energy]).astype(np.float32)[:, None], - atomic_inputs = np.concatenate((x, positions), axis=-1, dtype=np.float32), - n_atoms = np.array([x.shape[0]], dtype=np.int32), + name=np.array([smiles]), + subset=np.array(["molecule3d"]), + energies=np.array([energy]).astype(np.float32)[:, None], + atomic_inputs=np.concatenate((x, positions), axis=-1, dtype=np.float32), + n_atoms=np.array([x.shape[0]], dtype=np.int32), ) return res def _read_sdf(sdf_path, properties_path): - properties = pd.read_csv(properties_path, dtype={"cid": str}) properties.drop_duplicates(subset="cid", inplace=True, keep="first") xys = properties[["cid", "scf energy"]] properties = dict(zip(xys.cid.values, xys["scf energy"].values)) - - get_e = lambda mol: properties[mol.GetProp('_Name').split(" ")[1]] + + get_e = lambda mol: properties[mol.GetProp("_Name").split(" ")[1]] fn = lambda x: read_mol(x, get_e(x)) suppl = Chem.SDMolSupplier(sdf_path, removeHs=False, sanitize=True) @@ -43,8 +43,9 @@ def _read_sdf(sdf_path, properties_path): return tmp + class Molecule3D(BaseDataset): - __name__ = 'molecule3d' + __name__ = "molecule3d" __energy_methods__ = ["b3lyp_6-31g*"] energy_target_names = ["b3lyp/6-31g*.energy"] @@ -56,26 +57,26 @@ def __init__(self) -> None: super().__init__() def read_raw_entries(self): - raw = p_join(self.root, 'data', 'raw') - sdf_paths = glob(p_join(raw, '*.sdf')) - properties_path = p_join(raw, 'properties.csv') + raw = p_join(self.root, "data", "raw") + sdf_paths = glob(p_join(raw, "*.sdf")) + properties_path = p_join(raw, "properties.csv") fn = lambda x: _read_sdf(x, properties_path) - res = dm.parallelized(fn, sdf_paths, n_jobs=1) # don't use more than 1 job + res = dm.parallelized(fn, sdf_paths, n_jobs=1) # don't use more than 1 job samples = sum(res, []) return samples -if __name__ == '__main__': +if __name__ == "__main__": for data_class in [Molecule3D]: data = data_class() n = len(data) for i in np.random.choice(n, 3, replace=False): x = data[i] - print(x.name, x.subset, end=' ') + print(x.name, x.subset, end=" ") for k in x: if x[k] is not None: - print(k, x[k].shape, end=' ') - - print() \ No newline at end of file + print(k, x[k].shape, end=" ") + + print() diff --git a/openqdc/datasets/nabladft.py b/openqdc/datasets/nabladft.py index 25b4932..3234011 100644 --- a/openqdc/datasets/nabladft.py +++ b/openqdc/datasets/nabladft.py @@ -1,12 +1,13 @@ - import os -import numpy as np +from os.path import join as p_join + import datamol as dm +import numpy as np +from nablaDFT.dataset import HamiltonianDatabase from tqdm import tqdm -from os.path import join as p_join -from openqdc.utils.constants import MAX_ATOMIC_NUMBER + from openqdc.datasets.base import BaseDataset -from nablaDFT.dataset import HamiltonianDatabase +from openqdc.utils.constants import MAX_ATOMIC_NUMBER def to_mol(entry): @@ -14,29 +15,29 @@ def to_mol(entry): C = np.zeros_like(Z) res = dict( - atomic_inputs = np.concatenate((Z[:, None], C[:, None], R), axis=-1).astype(np.float32), - name = np.array(['']), - energies = E[:, None].astype(np.float32), - forces = F[:, :, None].astype(np.float32), - n_atoms = np.array([Z.shape[0]], dtype=np.int32), - subset = np.array(['nabla']), + atomic_inputs=np.concatenate((Z[:, None], C[:, None], R), axis=-1).astype(np.float32), + name=np.array([""]), + energies=E[:, None].astype(np.float32), + forces=F[:, :, None].astype(np.float32), + n_atoms=np.array([Z.shape[0]], dtype=np.int32), + subset=np.array(["nabla"]), ) return res def read_chunk_from_db(raw_path, start_idx, stop_idx, step_size=1000): - print(f'Loading from {start_idx} to {stop_idx}') + print(f"Loading from {start_idx} to {stop_idx}") db = HamiltonianDatabase(raw_path) idxs = list(np.arange(start_idx, stop_idx)) n, s = len(idxs), step_size - samples = [to_mol(entry) for i in tqdm(range(0, n, s)) for entry in db[idxs[i:i + s]]] + samples = [to_mol(entry) for i in tqdm(range(0, n, s)) for entry in db[idxs[i : i + s]]] return samples - + class NablaDFT(BaseDataset): - __name__ = 'nabladft' + __name__ = "nabladft" __energy_methods__ = ["wb97x-d_svp"] energy_target_names = ["ωB97X-D/def2-SVP"] @@ -47,30 +48,30 @@ class NablaDFT(BaseDataset): def __init__(self) -> None: super().__init__() - def read_raw_entries(self): - raw_path = p_join(self.root, 'dataset_full.db') + raw_path = p_join(self.root, "dataset_full.db") train = HamiltonianDatabase(raw_path) n, c = len(train), 20 step_size = int(np.ceil(n / os.cpu_count())) - fn = lambda i: read_chunk_from_db(raw_path, i*step_size, min((i + 1) * step_size, n)) - samples = dm.parallelized(fn, list(range(c)), n_jobs=c, progress=False, scheduler="threads") # don't use more than 1 job - + fn = lambda i: read_chunk_from_db(raw_path, i * step_size, min((i + 1) * step_size, n)) + samples = dm.parallelized( + fn, list(range(c)), n_jobs=c, progress=False, scheduler="threads" + ) # don't use more than 1 job + return sum(samples, []) - -if __name__ == '__main__': + +if __name__ == "__main__": for data_class in [NablaDFT]: data = data_class() n = len(data) for i in np.random.choice(n, 3, replace=False): x = data[i] - print(x.name, x.subset, end=' ') + print(x.name, x.subset, end=" ") for k in x: if x[k] is not None: - print(k, x[k].shape, end=' ') - - print() + print(k, x[k].shape, end=" ") + print() diff --git a/openqdc/datasets/orbnet_denali.py b/openqdc/datasets/orbnet_denali.py index 157f80f..452cce1 100644 --- a/openqdc/datasets/orbnet_denali.py +++ b/openqdc/datasets/orbnet_denali.py @@ -1,18 +1,12 @@ -import os -import glob -import torch -import pickle as pkl +from os.path import join as p_join + +import datamol as dm import numpy as np import pandas as pd -from tqdm import tqdm -import datamol as dm -from sklearn.utils import Bunch -from os.path import join as p_join -from openqdc.utils import load_json -from openqdc.utils.molecule import atom_table -from openqdc.utils.io import get_local_cache -from openqdc.utils.constants import MAX_ATOMIC_NUMBER + from openqdc.datasets.base import BaseDataset +from openqdc.utils.constants import MAX_ATOMIC_NUMBER +from openqdc.utils.molecule import atom_table def read_mol(mol_id, conf_dict, base_path, energy_target_names): @@ -20,30 +14,30 @@ def read_mol(mol_id, conf_dict, base_path, energy_target_names): for conf_id, conf_label in conf_dict.items(): try: cf_name = p_join(base_path, "xyz_files", mol_id, f"{conf_id}.xyz") - d = np.loadtxt(cf_name, skiprows=2, dtype='str') + d = np.loadtxt(cf_name, skiprows=2, dtype="str") z, positions = d[:, 0], d[:, 1:].astype(np.float32) z = np.array([atom_table.GetAtomicNumber(s) for s in z]) xs = np.stack((z, np.zeros_like(z)), axis=-1) conf = dict( - atomic_inputs = np.concatenate((xs, positions), axis=-1, dtype=np.float32), - name = np.array([mol_id]), - energies = np.array([conf_label[k] for k in energy_target_names], dtype=np.float32)[None, :], - n_atoms = np.array([positions.shape[0]], dtype=np.int32), - subset = np.array([conf_label["subset"]]), + atomic_inputs=np.concatenate((xs, positions), axis=-1, dtype=np.float32), + name=np.array([mol_id]), + energies=np.array([conf_label[k] for k in energy_target_names], dtype=np.float32)[None, :], + n_atoms=np.array([positions.shape[0]], dtype=np.int32), + subset=np.array([conf_label["subset"]]), ) res.append(conf) except Exception as e: - print (f'Skipping: {mol_id} {conf_id} due to {e}') + print(f"Skipping: {mol_id} {conf_id} due to {e}") return res class OrbnetDenali(BaseDataset): - __name__ = 'orbnet_denali' + __name__ = "orbnet_denali" __energy_methods__ = ["wb97x-d3_tz", "gfn1_xtb"] - energy_target_names = ['dft_energy', 'xtb1_energy'] + energy_target_names = ["dft_energy", "xtb1_energy"] # Energy in hartree, all zeros by default atomic_energies = np.zeros((MAX_ATOMIC_NUMBER,), dtype=np.float32) @@ -52,10 +46,12 @@ def __init__(self) -> None: super().__init__() def read_raw_entries(self): - label_path = p_join(self.root, 'denali_labels.csv') - df = pd.read_csv(label_path, usecols=['sample_id', 'mol_id', 'subset', 'dft_energy', 'xtb1_energy']) - labels = {mol_id: group.drop(['mol_id'], axis=1).drop_duplicates("sample_id").set_index("sample_id").to_dict('index') - for mol_id, group in df.groupby('mol_id')} + label_path = p_join(self.root, "denali_labels.csv") + df = pd.read_csv(label_path, usecols=["sample_id", "mol_id", "subset", "dft_energy", "xtb1_energy"]) + labels = { + mol_id: group.drop(["mol_id"], axis=1).drop_duplicates("sample_id").set_index("sample_id").to_dict("index") + for mol_id, group in df.groupby("mol_id") + } # print(df.head()) # tmp = df.to_dict('index') @@ -65,21 +61,21 @@ def read_raw_entries(self): # break # exit() fn = lambda x: read_mol(x[0], x[1], self.root, self.energy_target_names) - res = dm.parallelized(fn, list(labels.items()), scheduler='threads', n_jobs=-1, progress=True) + res = dm.parallelized(fn, list(labels.items()), scheduler="threads", n_jobs=-1, progress=True) samples = sum(res, []) return samples - -if __name__ == '__main__': + +if __name__ == "__main__": for data_class in [OrbnetDenali]: data = data_class() n = len(data) for i in np.random.choice(n, 3, replace=False): x = data[i] - print(x.name, x.subset, end=' ') + print(x.name, x.subset, end=" ") for k in x: if x[k] is not None: - print(k, x[k].shape, end=' ') - - print() \ No newline at end of file + print(k, x[k].shape, end=" ") + + print() diff --git a/openqdc/datasets/qm7x.py b/openqdc/datasets/qm7x.py index 4e86637..55f395c 100644 --- a/openqdc/datasets/qm7x.py +++ b/openqdc/datasets/qm7x.py @@ -1,12 +1,13 @@ -import os -import numpy as np from os.path import join as p_join -from openqdc.utils.constants import MAX_ATOMIC_NUMBER + +import numpy as np + from openqdc.datasets.base import BaseDataset, read_qc_archive_h5 +from openqdc.utils.constants import MAX_ATOMIC_NUMBER class ISO17(BaseDataset): - __name__ = 'iso_17' + __name__ = "iso_17" # Energy in hartree, all zeros by default atomic_energies = np.zeros((MAX_ATOMIC_NUMBER,), dtype=np.float32) @@ -29,26 +30,24 @@ class ISO17(BaseDataset): def __init__(self) -> None: super().__init__() - + def read_raw_entries(self): - raw_path = p_join(self.root, f'iso_17.h5') - samples = read_qc_archive_h5(raw_path, "iso_17", self.energy_target_names, - self.force_target_names) + raw_path = p_join(self.root, "iso_17.h5") + samples = read_qc_archive_h5(raw_path, "iso_17", self.energy_target_names, self.force_target_names) return samples - -if __name__ == '__main__': +if __name__ == "__main__": for data_class in [ISO17]: data = data_class() n = len(data) for i in np.random.choice(n, 3, replace=False): x = data[i] - print(x.name, x.subset, end=' ') + print(x.name, x.subset, end=" ") for k in x: if x[k] is not None: - print(k, x[k].shape, end=' ') - + print(k, x[k].shape, end=" ") + print() diff --git a/openqdc/datasets/qmugs.py b/openqdc/datasets/qmugs.py index 83b67f1..6868f38 100644 --- a/openqdc/datasets/qmugs.py +++ b/openqdc/datasets/qmugs.py @@ -1,11 +1,13 @@ import os -import numpy as np -import datamol as dm from glob import glob from os.path import join as p_join -from openqdc.utils.molecule import get_atomic_numuber_and_charge -from openqdc.utils.constants import BOHR2ANG, MAX_ATOMIC_NUMBER + +import datamol as dm +import numpy as np + from openqdc.datasets.base import BaseDataset +from openqdc.utils.constants import MAX_ATOMIC_NUMBER +from openqdc.utils.molecule import get_atomic_numuber_and_charge def read_mol(mol_dir): @@ -18,27 +20,29 @@ def read_mol(mol_dir): smiles = dm.to_smiles(mols[0], explicit_hs=False) x = get_atomic_numuber_and_charge(mols[0])[None, ...].repeat(n_confs, axis=0) - positions= np.array([mol.GetConformer().GetPositions() for mol in mols]) + positions = np.array([mol.GetConformer().GetPositions() for mol in mols]) props = [mol.GetPropsAsDict() for mol in mols] - targets = np.array([[p[el]for el in QMugs.energy_target_names] for p in props]) - + targets = np.array([[p[el] for el in QMugs.energy_target_names] for p in props]) + res = dict( - name= np.array([smiles]*n_confs), - subset= np.array(['qmugs']*n_confs), - energies= targets.astype(np.float32), - atomic_inputs = np.concatenate((x, positions), - axis=-1, dtype=np.float32).reshape(-1, 5), - n_atoms = np.array([x.shape[1]]*n_confs, dtype=np.int32), + name=np.array([smiles] * n_confs), + subset=np.array(["qmugs"] * n_confs), + energies=targets.astype(np.float32), + atomic_inputs=np.concatenate((x, positions), axis=-1, dtype=np.float32).reshape(-1, 5), + n_atoms=np.array([x.shape[1]] * n_confs, dtype=np.int32), ) return res class QMugs(BaseDataset): - __name__ = 'qmugs' + __name__ = "qmugs" __energy_methods__ = ["gfn2_xtb", "b3lyp/6-31g*"] - energy_target_names = ["GFN2:TOTAL_ENERGY", "DFT:TOTAL_ENERGY",] + energy_target_names = [ + "GFN2:TOTAL_ENERGY", + "DFT:TOTAL_ENERGY", + ] # Energy in hartree, all zeros by default atomic_energies = np.zeros((MAX_ATOMIC_NUMBER,), dtype=np.float32) @@ -47,25 +51,23 @@ def __init__(self) -> None: super().__init__() def read_raw_entries(self): - raw_path = p_join(self.root, 'structures') + raw_path = p_join(self.root, "structures") mol_dirs = [p_join(raw_path, d) for d in os.listdir(raw_path)] - samples = dm.parallelized(read_mol, mol_dirs, n_jobs=-1, - progress=True, scheduler="threads") + samples = dm.parallelized(read_mol, mol_dirs, n_jobs=-1, progress=True, scheduler="threads") return samples - -if __name__ == '__main__': + +if __name__ == "__main__": for data_class in [QMugs]: data = data_class() n = len(data) for i in np.random.choice(n, 3, replace=False): x = data[i] - print(x.name, x.subset, end=' ') + print(x.name, x.subset, end=" ") for k in x: if x[k] is not None: - print(k, x[k].shape, end=' ') - - print() + print(k, x[k].shape, end=" ") + print() diff --git a/openqdc/datasets/sn2_rxn.py b/openqdc/datasets/sn2_rxn.py index cdb6e27..8be8281 100644 --- a/openqdc/datasets/sn2_rxn.py +++ b/openqdc/datasets/sn2_rxn.py @@ -1,12 +1,13 @@ -import os -import numpy as np from os.path import join as p_join -from openqdc.utils.constants import MAX_ATOMIC_NUMBER + +import numpy as np + from openqdc.datasets.base import BaseDataset, read_qc_archive_h5 +from openqdc.utils.constants import MAX_ATOMIC_NUMBER class SN2RXN(BaseDataset): - __name__ = 'iso_17' + __name__ = "iso_17" # Energy in hartree, all zeros by default atomic_energies = np.zeros((MAX_ATOMIC_NUMBER,), dtype=np.float32) @@ -29,26 +30,24 @@ class SN2RXN(BaseDataset): def __init__(self) -> None: super().__init__() - + def read_raw_entries(self): - raw_path = p_join(self.root, f'iso_17.h5') - samples = read_qc_archive_h5(raw_path, "iso_17", self.energy_target_names, - self.force_target_names) + raw_path = p_join(self.root, "iso_17.h5") + samples = read_qc_archive_h5(raw_path, "iso_17", self.energy_target_names, self.force_target_names) return samples - -if __name__ == '__main__': +if __name__ == "__main__": for data_class in [SN2RXN]: data = data_class() n = len(data) for i in np.random.choice(n, 3, replace=False): x = data[i] - print(x.name, x.subset, end=' ') + print(x.name, x.subset, end=" ") for k in x: if x[k] is not None: - print(k, x[k].shape, end=' ') - + print(k, x[k].shape, end=" ") + print() diff --git a/openqdc/datasets/spice.py b/openqdc/datasets/spice.py index 1042f92..0aec9b2 100644 --- a/openqdc/datasets/spice.py +++ b/openqdc/datasets/spice.py @@ -1,11 +1,13 @@ +from os.path import join as p_join + +import datamol as dm import numpy as np from tqdm import tqdm -import datamol as dm -from os.path import join as p_join + +from openqdc.datasets.base import BaseDataset from openqdc.utils import load_hdf5_file -from openqdc.utils.molecule import get_atomic_numuber_and_charge from openqdc.utils.constants import BOHR2ANG, MAX_ATOMIC_NUMBER -from openqdc.datasets.base import BaseDataset +from openqdc.utils.molecule import get_atomic_numuber_and_charge def read_record(r): @@ -13,24 +15,24 @@ def read_record(r): subset = r["subset"][0].decode("utf-8") n_confs = r["conformations"].shape[0] x = get_atomic_numuber_and_charge(dm.to_mol(smiles, add_hs=True)) - positions= r["conformations"][:] * BOHR2ANG - + positions = r["conformations"][:] * BOHR2ANG + res = dict( - smiles= np.array([smiles]*n_confs), - subset= np.array([Spice.subset_mapping[subset]]*n_confs), - energies= r[Spice.energy_target_names[0]][:][:, None].astype(np.float32), - forces= r[Spice.force_target_names[0]][:].reshape(-1, 3, 1) / BOHR2ANG, - atomic_inputs = np.concatenate(( - x[None, ...].repeat(n_confs, axis=0), - positions), axis=-1, dtype=np.float32).reshape(-1, 5), - n_atoms = np.array([x.shape[0]]*n_confs, dtype=np.int32), + smiles=np.array([smiles] * n_confs), + subset=np.array([Spice.subset_mapping[subset]] * n_confs), + energies=r[Spice.energy_target_names[0]][:][:, None].astype(np.float32), + forces=r[Spice.force_target_names[0]][:].reshape(-1, 3, 1) / BOHR2ANG, + atomic_inputs=np.concatenate( + (x[None, ...].repeat(n_confs, axis=0), positions), axis=-1, dtype=np.float32 + ).reshape(-1, 5), + n_atoms=np.array([x.shape[0]] * n_confs, dtype=np.int32), ) return res class Spice(BaseDataset): - __name__ = 'spice' + __name__ = "spice" __energy_methods__ = ["wb97x_tz"] energy_target_names = ["dft_total_energy"] @@ -78,23 +80,23 @@ def __init__(self) -> None: super().__init__() def read_raw_entries(self): - raw_path = p_join(self.root, 'SPICE-1.1.4.hdf5') - - data = load_hdf5_file(raw_path) - tmp = [read_record(data[mol_name]) for mol_name in tqdm(data)] # don't use parallelized here + raw_path = p_join(self.root, "SPICE-1.1.4.hdf5") + + data = load_hdf5_file(raw_path) + tmp = [read_record(data[mol_name]) for mol_name in tqdm(data)] # don't use parallelized here return tmp -if __name__ == '__main__': +if __name__ == "__main__": data = Spice() n = len(data) for i in np.random.choice(n, 10, replace=False): x = data[i] - print(x.smiles, x.subset, end=' ') + print(x.smiles, x.subset, end=" ") for k in x: - if k != 'smiles' and k != 'subset': - print(k, x[k].shape if x[k] is not None else None, end=' ') - - print() \ No newline at end of file + if k != "smiles" and k != "subset": + print(k, x[k].shape if x[k] is not None else None, end=" ") + + print() diff --git a/openqdc/raws/config_factory.py b/openqdc/raws/config_factory.py index df1ba44..86d1d33 100644 --- a/openqdc/raws/config_factory.py +++ b/openqdc/raws/config_factory.py @@ -1,6 +1,4 @@ - class DataConfigFactory: - ani = dict( dataset_name="ani", links={ @@ -24,100 +22,82 @@ class DataConfigFactory: gdml = dict( dataset_name="gdml", - links = { - "gdml.hdf5.gz": "https://zenodo.org/record/3585908/files/219.hdf5.gz" - }, - ) + links={"gdml.hdf5.gz": "https://zenodo.org/record/3585908/files/219.hdf5.gz"}, + ) solvated_peptides = dict( dataset_name="solvated_peptides", - links = { - "solvated_peptides.hdf5.gz": "https://zenodo.org/record/3585804/files/213.hdf5.gz" - }, + links={"solvated_peptides.hdf5.gz": "https://zenodo.org/record/3585804/files/213.hdf5.gz"}, ) iso_17 = dict( dataset_name="iso_17", - links = { - "iso_17.hdf5.gz": "https://zenodo.org/record/3585907/files/216.hdf5.gz" - }, - ) + links={"iso_17.hdf5.gz": "https://zenodo.org/record/3585907/files/216.hdf5.gz"}, + ) sn2_rxn = dict( dataset_name="sn2_rxn", - links = { - "sn2_rxn.hdf5.gz": "https://zenodo.org/record/3585800/files/212.hdf5.gz" - }, + links={"sn2_rxn.hdf5.gz": "https://zenodo.org/record/3585800/files/212.hdf5.gz"}, ) # FROM: https://sites.uw.edu/wdbase/database-of-water-clusters/ waterclusters3_30 = dict( dataset_name="waterclusters3_30", - links = { - "W3-W30_all_geoms_TTM2.1-F.zip": "https://drive.google.com/uc?id=18Y7OiZXSCTsHrQ83GCc4fyE_abbL6E_n" - }, + links={"W3-W30_all_geoms_TTM2.1-F.zip": "https://drive.google.com/uc?id=18Y7OiZXSCTsHrQ83GCc4fyE_abbL6E_n"}, ) geom = dict( dataset_name="geom", - links = { - "rdkit_folder.tar.gz": "https://dataverse.harvard.edu/api/access/datafile/4327252" - }, + links={"rdkit_folder.tar.gz": "https://dataverse.harvard.edu/api/access/datafile/4327252"}, ) molecule3d = dict( dataset_name="molecule3d", - links={ - "molecule3d.zip": "https://drive.google.com/uc?id=1C_KRf8mX-gxny7kL9ACNCEV4ceu_fUGy" - }, + links={"molecule3d.zip": "https://drive.google.com/uc?id=1C_KRf8mX-gxny7kL9ACNCEV4ceu_fUGy"}, ) orbnet_denali = dict( dataset_name="orbnet_denali", links={ "orbnet_denali.tar.gz": "https://figshare.com/ndownloader/files/28672287", - "orbnet_denali_targets.tar.gz": "https://figshare.com/ndownloader/files/28672248"}, + "orbnet_denali_targets.tar.gz": "https://figshare.com/ndownloader/files/28672248", + }, ) qm7x = dict( dataset_name="qm7x", - links={f"{i}000.xz":"https://zenodo.org/record/4288677/files/{i}000.xz" for i in range(1, 9) - } + links={f"{i}000.xz": "https://zenodo.org/record/4288677/files/{i}000.xz" for i in range(1, 9)}, ) qmugs = dict( dataset_name="qmugs", links={ - "summary.csv": "https://libdrive.ethz.ch/index.php/s/X5vOBNSITAG5vzM/download?path=%2F&files=summary.csv", - "structures.tar.gz": "https://libdrive.ethz.ch/index.php/s/X5vOBNSITAG5vzM/download?path=%2F&files=structures.tar.gz", + "summary.csv": "https://libdrive.ethz.ch/index.php/s/X5vOBNSITAG5vzM/download?path=%2F&files=summary.csv", + "structures.tar.gz": "https://libdrive.ethz.ch/index.php/s/X5vOBNSITAG5vzM/download?path=%2F&files=structures.tar.gz", }, ) spice = dict( dataset_name="spice", - links={ - "SPICE-1.1.4.hdf5": "https://zenodo.org/record/8222043/files/SPICE-1.1.4.hdf5" - }, + links={"SPICE-1.1.4.hdf5": "https://zenodo.org/record/8222043/files/SPICE-1.1.4.hdf5"}, ) misato = dict( dataset_name="misato", links={ - "MD.hdf5": "https://zenodo.org/record/7711953/files/MD.hdf5", - "QM.hdf5": "https://zenodo.org/record/7711953/files/QM.hdf5" - }, + "MD.hdf5": "https://zenodo.org/record/7711953/files/MD.hdf5", + "QM.hdf5": "https://zenodo.org/record/7711953/files/QM.hdf5", + }, ) nabladft = dict( dataset_name="nabladft", - links={ - "nabladft.db": "https://n-usr-31b1j.s3pd12.sbercloud.ru/b-usr-31b1j-qz9/data/moses_db/dataset_full.db" - }, + links={"nabladft.db": "https://n-usr-31b1j.s3pd12.sbercloud.ru/b-usr-31b1j-qz9/data/moses_db/dataset_full.db"}, cmd=[ "axel -n 10 --output=dataset_full.db https://n-usr-31b1j.s3pd12.sbercloud.ru/b-usr-31b1j-qz9/data/moses_db/dataset_full.db" - ] + ], ) - + pubchemqc = dict( dataset_name="pubchemqc", links={ @@ -126,16 +106,14 @@ class DataConfigFactory: cmd=[ 'wget "https://chibakoudai.sharepoint.com/:u:/s/stair06/EcWMtOpIEqFLrHcR1dzlZiMBLhTFY0RZ0qPaqC4lhRp51A?download=1" -O b3lyp_pm6_ver1.0.1-postgrest-docker-compose.tar.xz.rclone_chunk.001', 'wget "https://chibakoudai.sharepoint.com/:u:/s/stair06/EbJe-SlL4oNPhOpOtA8mxLsB1F3eI2l-5RS315hIZUFNwQ?download=1" -O b3lyp_pm6_ver1.0.1-postgrest-docker-compose.tar.xz.rclone_chunk.002', - 'cat b3lyp_pm6_ver1.0.1-postgrest-docker-compose.tar.xz.rclone_chunk.001 b3lyp_pm6_ver1.0.1-postgrest-docker-compose.tar.xz.rclone_chunk.002 | tar xvfJ - ' - ] + "cat b3lyp_pm6_ver1.0.1-postgrest-docker-compose.tar.xz.rclone_chunk.001 b3lyp_pm6_ver1.0.1-postgrest-docker-compose.tar.xz.rclone_chunk.002 | tar xvfJ - ", + ], ) - available_datasets = [k for k in locals().keys() if not k.startswith("__")] + available_datasets = [k for k in locals().keys() if not k.startswith("__")] def __init__(self): pass def __call__(self, dataset_name): return getattr(self, dataset_name) - - diff --git a/openqdc/raws/fetch.py b/openqdc/raws/fetch.py index 58d86d0..b7fc4e4 100644 --- a/openqdc/raws/fetch.py +++ b/openqdc/raws/fetch.py @@ -1,22 +1,22 @@ """Script to download the molecule3d dataset from Google Drive.""" -import os import gzip -import tqdm -import gdown -import fsspec -import socket +import os import shutil +import socket import tarfile -import zipfile -import requests import urllib.error import urllib.request +import zipfile + +import fsspec +import gdown +import requests +import tqdm from loguru import logger from sklearn.utils import Bunch -from openqdc.utils.io import get_local_cache + from openqdc.raws.config_factory import DataConfigFactory -from office365.runtime.auth.client_credential import ClientCredential -from office365.sharepoint.client_context import ClientContext +from openqdc.utils.io import get_local_cache # function to download large files with requests @@ -37,7 +37,6 @@ def fetch_file(url, local_filename, overwrite=False): Local file. """ try: - if os.path.exists(local_filename) and not overwrite: logger.info("File already exists, skipping download") else: @@ -53,7 +52,7 @@ def fetch_file(url, local_filename, overwrite=False): # decompress archive if necessary parent = os.path.dirname(local_filename) - if local_filename.endswith("tar.gz"): + if local_filename.endswith("tar.gz"): with tarfile.open(local_filename) as tar: logger.info(f"Verifying archive extraction states: {local_filename}") all_names = tar.getnames() @@ -81,7 +80,7 @@ def fetch_file(url, local_filename, overwrite=False): all_extracted = os.path.exists(out_filename) if not all_extracted: logger.info(f"Extracting archive: {local_filename}") - with gzip.open(local_filename, 'rb') as f_in, open(out_filename, 'wb') as f_out: + with gzip.open(local_filename, "rb") as f_in, open(out_filename, "wb") as f_out: shutil.copyfileobj(f_in, f_out) else: logger.info(f"Archive already extracted: {local_filename}") @@ -115,7 +114,7 @@ def __init__(self, cache_path=None, overwrite=False): self.cache_path = cache_path self.overwrite = overwrite - + def from_config(self, config: dict): b_config = Bunch(**config) data_path = os.path.join(self.cache_path, b_config.dataset_name) @@ -123,14 +122,14 @@ def from_config(self, config: dict): logger.info(f"Downloading the {b_config.dataset_name} dataset") for local, link in b_config.links.items(): - outfile = os.path.join(data_path, local) + outfile = os.path.join(data_path, local) fetch_file(link, outfile) def from_name(self, name): cfg = DataConfigFactory()(name) return self.from_config(cfg) - + if __name__ == "__main__": dataset_names = DataConfigFactory.available_datasets @@ -138,4 +137,3 @@ def from_name(self, name): for dataset_name in dataset_names: dd = DataDownloader() dd.from_name(dataset_name) - diff --git a/openqdc/utils/constants.py b/openqdc/utils/constants.py index b941a82..a8a8215 100644 --- a/openqdc/utils/constants.py +++ b/openqdc/utils/constants.py @@ -4,4 +4,4 @@ HAR2EV = 27.211386246 -BOHR2ANG = 0.52917721092 \ No newline at end of file +BOHR2ANG = 0.52917721092 diff --git a/openqdc/utils/io.py b/openqdc/utils/io.py index fe3c9e6..0391add 100644 --- a/openqdc/utils/io.py +++ b/openqdc/utils/io.py @@ -1,10 +1,11 @@ """IO utilities for mlip package""" -import os import json +import os +import pickle as pkl + +import fsspec import h5py import torch -import fsspec -import pickle as pkl from fsspec.implementations.local import LocalFileSystem from gcsfs import GCSFileSystem from rdkit.Chem import MolFromXYZFile @@ -14,14 +15,14 @@ def get_local_cache(): - cache_dir = os.path.expanduser(os.path.expandvars('~/.cache/openqdc')) + cache_dir = os.path.expanduser(os.path.expandvars("~/.cache/openqdc")) os.makedirs(cache_dir, exist_ok=True) return cache_dir def get_remote_cache(): remote_cache = "gs://opendatasets/openqdc" - return remote_cache + return remote_cache def push_remote(local_path, overwrite=True): @@ -148,4 +149,4 @@ def load_json(path): def load_xyz(path): - return MolFromXYZFile(path) \ No newline at end of file + return MolFromXYZFile(path) diff --git a/openqdc/utils/molecule.py b/openqdc/utils/molecule.py index 83fc4a6..e8c1c9c 100644 --- a/openqdc/utils/molecule.py +++ b/openqdc/utils/molecule.py @@ -3,6 +3,7 @@ atom_table = Chem.GetPeriodicTable() + def get_atomic_number(mol: Chem.Mol): """Returns atomic numbers for rdkit molecule""" return np.array([atom.GetAtomicNum() for atom in mol.GetAtoms()]) @@ -15,6 +16,4 @@ def get_atomic_charge(mol: Chem.Mol): def get_atomic_numuber_and_charge(mol: Chem.Mol): """Returns atoms number and charge for rdkit molecule""" - return np.array([[atom.GetAtomicNum(), atom.GetFormalCharge()] - for atom in mol.GetAtoms()]) - + return np.array([[atom.GetAtomicNum(), atom.GetFormalCharge()] for atom in mol.GetAtoms()]) diff --git a/pyproject.toml b/pyproject.toml index be78fcf..5061854 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -55,6 +55,8 @@ omit = ["setup.py", "tests/*"] [tool.ruff] line-length = 120 +ignore = ["E731"] [tool.ruff.per-file-ignores] "__init__.py" = ["F403"] +"openqdc/raws/config_factory.py" = ["E501"]