From ec670f86db54be3ba7261375ad50c84a9aa9d532 Mon Sep 17 00:00:00 2001 From: prtos Date: Thu, 12 Oct 2023 17:36:43 +0000 Subject: [PATCH 1/2] everything execept pubchemqc --- .gitignore | 2 + src/openqdc/datasets/dess.py | 102 +++++++++++++++++ src/openqdc/datasets/orbnet_denali.py | 2 +- src/openqdc/datasets/pcqm.py | 93 +++++++++++++++ src/openqdc/datasets/qm7x.py | 63 ++++++++--- src/openqdc/datasets/qmugs.py | 2 +- src/openqdc/datasets/sn2_rxn.py | 15 +-- src/openqdc/datasets/solvated_peptides.py | 54 +++++++++ src/openqdc/datasets/tmqm.py | 88 +++++++++++++++ src/openqdc/datasets/waterclusters3_30.py | 86 ++++++++++++++ src/openqdc/raws/config_factory.py | 18 ++- src/openqdc/raws/fetch.py | 131 +++++++++++++--------- src/openqdc/raws/pubchemqc.py | 117 +++++++++++++++++++ 13 files changed, 696 insertions(+), 77 deletions(-) create mode 100644 src/openqdc/datasets/dess.py create mode 100644 src/openqdc/datasets/tmqm.py create mode 100644 src/openqdc/raws/pubchemqc.py diff --git a/.gitignore b/.gitignore index c1eefd0..5fbc33b 100644 --- a/.gitignore +++ b/.gitignore @@ -146,3 +146,5 @@ nohup.out *.xyz *.csv *.txt +*.sh + diff --git a/src/openqdc/datasets/dess.py b/src/openqdc/datasets/dess.py new file mode 100644 index 0000000..96dd008 --- /dev/null +++ b/src/openqdc/datasets/dess.py @@ -0,0 +1,102 @@ +from os.path import join as p_join + +import datamol as dm +import numpy as np +import pandas as pd +from tqdm import tqdm +from rdkit.Chem import MolFromMolBlock +from openqdc.datasets.base import BaseDataset +from openqdc.utils import load_json, load_pkl +from openqdc.utils.constants import MAX_ATOMIC_NUMBER +from openqdc.utils.molecule import get_atomic_number_and_charge + + +def read_mol(mol_path, smiles, subset, targets): + try: + with open(mol_path, "r") as f: + mol_block = f.read() + mol = dm.read_molblock(mol_block, remove_hs=False, fail_if_invalid=True) + + x = get_atomic_number_and_charge(mol) + positions = mol.GetConformer().GetPositions() + + res = dict( + name=np.array([smiles]), + subset=np.array([subset]), + energies=np.array(targets).astype(np.float32)[None, :], + atomic_inputs=np.concatenate((x, positions), axis=-1, dtype=np.float32), + n_atoms=np.array([x.shape[0]], dtype=np.int32), + ) + except Exception as e: + print(f"Skipping: {mol_path} due to {e}") + res = None + + return res + + +class DESS(BaseDataset): + __name__ = "dess" + __energy_methods__ = [ + 'mp2_cc', + 'mp2_qz', + 'mp2_tz', + 'mp2_cbs', + 'ccsd(t)_cc', + 'ccsd(t)_cbs', + 'ccsd(t)_nn', + 'sapt', + ] + + energy_target_names = [ + 'cc_MP2_all', + 'qz_MP2_all', + 'tz_MP2_all', + 'cbs_MP2_all', + 'cc_CCSD(T)_all', + 'cbs_CCSD(T)_all', + 'nn_CCSD(T)_all', + 'sapt_all', + ] + # ['qz_MP2_all', 'tz_MP2_all', 'cbs_MP2_all', 'sapt_all', 'nn_CCSD(T)_all'] + + # Energy in hartree, all zeros by default + atomic_energies = np.zeros((MAX_ATOMIC_NUMBER,), dtype=np.float32) + + partitions = ["DES370K", "DES5M"] + + def __init__(self) -> None: + super().__init__() + + def _read_raw_(self, part): + df = pd.read_csv(p_join(self.root, f"{part}.csv")) + for col in self.energy_target_names: + if col not in df.columns: + df[col] = np.nan + smiles = (df['smiles0'] + '.' + df['smiles1']).tolist() + subsets = (f"{part}_" + df["group_orig"]).tolist() + targets = df[self.energy_target_names].values + paths = p_join(self.root, "geometries/") + df["system_id"].astype(str) + f"/{part}_" + df["geom_id"].astype(str) + ".mol" + + inputs = [dict(smiles=smiles[i], subset=subsets[i], targets=targets[i], mol_path=paths[i]) + for i in tqdm(range(len(smiles)))] + f = lambda xs: [read_mol(**x) for x in xs] + samples = dm.parallelized_with_batches(f, inputs, n_jobs=-1, progress=True, + batch_size=1024, scheduler= "threads") + return samples + + def read_raw_entries(self): + samples = sum([self._read_raw_(partition) for partition in self.partitions], []) + return samples + + +if __name__ == "__main__": + for data_class in [DESS]: + data = data_class() + n = len(data) + + for i in np.random.choice(n, 3, replace=False): + x = data[i] + print(x.name, x.subset, end=" ") + for k in x: + if x[k] is not None: + print(k, x[k].shape, end=" ") diff --git a/src/openqdc/datasets/orbnet_denali.py b/src/openqdc/datasets/orbnet_denali.py index 452cce1..6901737 100644 --- a/src/openqdc/datasets/orbnet_denali.py +++ b/src/openqdc/datasets/orbnet_denali.py @@ -9,7 +9,7 @@ from openqdc.utils.molecule import atom_table -def read_mol(mol_id, conf_dict, base_path, energy_target_names): +def read_archive(mol_id, conf_dict, base_path, energy_target_names): res = [] for conf_id, conf_label in conf_dict.items(): try: diff --git a/src/openqdc/datasets/pcqm.py b/src/openqdc/datasets/pcqm.py index e69de29..69970ff 100644 --- a/src/openqdc/datasets/pcqm.py +++ b/src/openqdc/datasets/pcqm.py @@ -0,0 +1,93 @@ +from os.path import join as p_join +import json +import tarfile +import datamol as dm +import numpy as np +import pandas as pd +from glob import glob +from tqdm import tqdm +from rdkit.Chem import MolFromMolBlock +from openqdc.datasets.base import BaseDataset +from openqdc.utils import load_json, load_pkl +from openqdc.utils.constants import MAX_ATOMIC_NUMBER +from openqdc.utils.molecule import get_atomic_number_and_charge + + +def flatten_dict(d, sep: str= '.'): + return pd.json_normalize(d, sep=sep).to_dict(orient='records')[0] + +def read_content(fd): + try: + f = tar.extractfile(fd) + r = flatten_dict(json.load(f)) + x = np.concatenate((r['atoms.elements.number'][:, None], + r['atoms.core electrons'][:, None], + r['atoms.coords.3d'].reshape(-1, 3)), + axis=-1).astype(np.float32) + + res = dict( + name=np.array([r['smiles']]), + subset=np.array([r['formula']]), + energies=np.array(['properties.energy.total']).astype(np.float32)[None, :], + atomic_inputs=x, + n_atoms=np.array([x.shape[0]], dtype=np.int32), + ) + except Exception as e: + res = None + + return res + +def read_archive(path): + + with tarfile.open(path) as tar: + res = [read_content(member) for member in tar.getmembers()] + # print(len(res)) + return res + + +class PubchemQC(BaseDataset): + __name__ = "pubchemqc" + __energy_methods__ = [ + 'b3lyp', + 'pm6', + ] + + energy_target_names = [ + 'b3lyp', + 'pm6', + ] + + # Energy in hartree, all zeros by default + atomic_energies = np.zeros((MAX_ATOMIC_NUMBER,), dtype=np.float32) + + partitions = ["b3lyp", "pm6"] + + def __init__(self) -> None: + super().__init__() + + def _read_raw_(self, part): + arxiv_paths = glob(p_join(self.root, f"{part}", "*.tar.gz")) + print(len(arxiv_paths)) + samples = dm.parallelized(read_archive, arxiv_paths, n_jobs=-1, progress=True, scheduler='threads') + res = sum(samples, []) + print(len(res)) + exit() + return res + + def read_raw_entries(self): + samples = sum([self._read_raw_(partition) for partition in self.partitions], []) + return samples + + +if __name__ == "__main__": + for data_class in [PubchemQC]: + data = data_class() + n = len(data) + + for i in np.random.choice(n, 3, replace=False): + x = data[i] + print(x.name, x.subset, end=" ") + for k in x: + if x[k] is not None: + print(k, x[k].shape, end=" ") + diff --git a/src/openqdc/datasets/qm7x.py b/src/openqdc/datasets/qm7x.py index 55f395c..bbcaab8 100644 --- a/src/openqdc/datasets/qm7x.py +++ b/src/openqdc/datasets/qm7x.py @@ -1,45 +1,78 @@ -from os.path import join as p_join import numpy as np - +from tqdm import tqdm +from os.path import join as p_join from openqdc.datasets.base import BaseDataset, read_qc_archive_h5 from openqdc.utils.constants import MAX_ATOMIC_NUMBER - - -class ISO17(BaseDataset): - __name__ = "iso_17" +from openqdc.utils.io import load_hdf5_file + + +def read_mol(mol_h5, mol_name, energy_target_names, force_target_names): + m = mol_h5 + cids = list(mol_h5.keys()) + + zs = [m[c]['atNUM'] for c in cids] + xyz = np.concatenate([m[c]['atXYZ'] for c in cids], axis=0) + n_atoms = np.array([len(z) for z in zs], dtype=np.int32) + n, zs = len(n_atoms), np.concatenate(zs, axis=0) + a_inputs = np.concatenate([np.stack([zs, np.zeros_like(zs)], axis=-1), xyz], axis=-1) + + forces = np.concatenate([np.stack([m[c][f_tag] + for f_tag in force_target_names], axis=-1) + for c in cids], axis=0) + energies = np.stack([np.array([m[c][e_tag][0] + for e_tag in energy_target_names]) + for c in cids], axis=0) + + res = dict( + name=np.array([mol_name]*n), + subset=np.array(["qm7x"]*n), + energies=energies.astype(np.float32), + atomic_inputs=a_inputs.astype(np.float32), + forces=forces.astype(np.float32), + n_atoms=n_atoms, + ) + + return res + +class QM7X(BaseDataset): + __name__ = "qm7x" # Energy in hartree, all zeros by default atomic_energies = np.zeros((MAX_ATOMIC_NUMBER,), dtype=np.float32) __energy_methods__ = [ - "pbe-ts", + "pbe-ts", "mbd" ] energy_target_names = [ - "PBE-TS Energy", + "ePBE0", "eMBD" ] __force_methods__ = [ - "pbe-ts", + "pbe-ts", "vdw" ] force_target_names = [ - "PBE-TS Gradient", + "pbe0FOR", "vdwFOR" ] def __init__(self) -> None: super().__init__() def read_raw_entries(self): - raw_path = p_join(self.root, "iso_17.h5") - samples = read_qc_archive_h5(raw_path, "iso_17", self.energy_target_names, self.force_target_names) - + samples = [] + for i in range(1, 9): + raw_path = p_join(self.root, f"{i}000") + data = load_hdf5_file(raw_path) + samples += [read_mol(data[k], k, self.energy_target_names, self.force_target_names) + for k in tqdm(data.keys())] + return samples if __name__ == "__main__": - for data_class in [ISO17]: + for data_class in [QM7X]: data = data_class() n = len(data) @@ -49,5 +82,3 @@ def read_raw_entries(self): for k in x: if x[k] is not None: print(k, x[k].shape, end=" ") - - print() diff --git a/src/openqdc/datasets/qmugs.py b/src/openqdc/datasets/qmugs.py index b528f42..6fc468b 100644 --- a/src/openqdc/datasets/qmugs.py +++ b/src/openqdc/datasets/qmugs.py @@ -12,7 +12,7 @@ def read_mol(mol_dir): filenames = glob(p_join(mol_dir, "*.sdf")) - mols = [dm.read_sdf(f)[0] for f in filenames] + mols = [dm.read_sdf(f, remove_hs=False)[0] for f in filenames] n_confs = len(mols) if len(mols) == 0: diff --git a/src/openqdc/datasets/sn2_rxn.py b/src/openqdc/datasets/sn2_rxn.py index 8be8281..f418f95 100644 --- a/src/openqdc/datasets/sn2_rxn.py +++ b/src/openqdc/datasets/sn2_rxn.py @@ -7,33 +7,34 @@ class SN2RXN(BaseDataset): - __name__ = "iso_17" + __name__ = "sn2_rxn" # Energy in hartree, all zeros by default atomic_energies = np.zeros((MAX_ATOMIC_NUMBER,), dtype=np.float32) __energy_methods__ = [ - "pbe-ts", + "dsd-blyp-d3(bj)_tz", ] energy_target_names = [ - "PBE-TS Energy", + "DSD-BLYP-D3(BJ):def2-TZVP Atomization Energy", ] __force_methods__ = [ - "pbe-ts", + "dsd-blyp-d3(bj)_tz", ] force_target_names = [ - "PBE-TS Gradient", + "DSD-BLYP-D3(BJ):def2-TZVP Gradient", ] def __init__(self) -> None: super().__init__() def read_raw_entries(self): - raw_path = p_join(self.root, "iso_17.h5") - samples = read_qc_archive_h5(raw_path, "iso_17", self.energy_target_names, self.force_target_names) + raw_path = p_join(self.root, "sn2_rxn.h5") + samples = read_qc_archive_h5(raw_path, "sn2_rxn", self.energy_target_names, + self.force_target_names) return samples diff --git a/src/openqdc/datasets/solvated_peptides.py b/src/openqdc/datasets/solvated_peptides.py index e69de29..1ff0ebe 100644 --- a/src/openqdc/datasets/solvated_peptides.py +++ b/src/openqdc/datasets/solvated_peptides.py @@ -0,0 +1,54 @@ +from os.path import join as p_join + +import numpy as np + +from openqdc.datasets.base import BaseDataset, read_qc_archive_h5 +from openqdc.utils.constants import MAX_ATOMIC_NUMBER + + +class SolvatedPeptides(BaseDataset): + __name__ = "solvated_peptides" + + # Energy in hartree, all zeros by default + atomic_energies = np.zeros((MAX_ATOMIC_NUMBER,), dtype=np.float32) + + __energy_methods__ = [ + "revpbe-d3(bj)_tz", + ] + + energy_target_names = [ + "revPBE-D3(BJ):def2-TZVP Atomization Energy", + ] + + __force_methods__ = [ + "revpbe-d3(bj)_tz", + ] + + force_target_names = [ + "revPBE-D3(BJ):def2-TZVP Gradient", + ] + + def __init__(self) -> None: + super().__init__() + + def read_raw_entries(self): + raw_path = p_join(self.root, "solvated_peptides.h5") + samples = read_qc_archive_h5(raw_path, "solvated_peptides", self.energy_target_names, + self.force_target_names) + + return samples + + +if __name__ == "__main__": + for data_class in [SolvatedPeptides]: + data = data_class() + n = len(data) + + for i in np.random.choice(n, 3, replace=False): + x = data[i] + print(x.name, x.subset, end=" ") + for k in x: + if x[k] is not None: + print(k, x[k].shape, end=" ") + + print() diff --git a/src/openqdc/datasets/tmqm.py b/src/openqdc/datasets/tmqm.py new file mode 100644 index 0000000..548fdf4 --- /dev/null +++ b/src/openqdc/datasets/tmqm.py @@ -0,0 +1,88 @@ + +import numpy as np +import pandas as pd +from tqdm import tqdm +from os.path import join as p_join +from openqdc.datasets.base import BaseDataset, read_qc_archive_h5 +from openqdc.utils.constants import MAX_ATOMIC_NUMBER +from openqdc.utils.molecule import atom_table +from openqdc.utils.io import load_hdf5_file +from io import StringIO + + +def content_to_xyz(content, e_map): + try: + tmp = content.split("\n")[1].split(" | ") + code = tmp[0].split(" ")[-1] + name = tmp[3].split(" ")[-1] + except: + print(content) + return None + + s = StringIO(content) + d = np.loadtxt(s, skiprows=2, dtype="str") + z, positions = d[:, 0], d[:, 1:].astype(np.float32) + z = np.array([atom_table.GetAtomicNumber(s) for s in z]) + xs = np.stack((z, np.zeros_like(z)), axis=-1) + e = e_map[code] + + conf = dict( + atomic_inputs=np.concatenate((xs, positions), axis=-1, dtype=np.float32), + name=np.array([name]), + energies=np.array([e], dtype=np.float32)[:, None] , + n_atoms=np.array([positions.shape[0]], dtype=np.int32), + subset=np.array(["tmqm"]), + ) + + return conf + + +def read_xyz(fname, e_map): + with open(fname, "r") as f: + contents = f.read().split("\n\n") + + print("toto", len(contents)) + res = [content_to_xyz(content, e_map) for content in tqdm(contents)] + return res + + +class TMQM(BaseDataset): + __name__ = "tmqm" + + # Energy in hartree, all zeros by default + atomic_energies = np.zeros((MAX_ATOMIC_NUMBER,), dtype=np.float32) + + __energy_methods__ = [ + "tpssh_tz" + ] + + energy_target_names = [ + "TPSSh/def2TZVP level" + ] + + def __init__(self) -> None: + super().__init__() + + def read_raw_entries(self): + df = pd.read_csv(p_join(self.root, "tmQM_y.csv"), sep=";", usecols=["CSD_code", "Electronic_E"]) + e_map = dict(zip(df["CSD_code"], df["Electronic_E"])) + raw_fnames = ["tmQM_X1.xyz", "tmQM_X2.xyz", "Benchmark2_TPSSh_Opt.xyz"] + samples = [] + for fname in raw_fnames: + data = read_xyz(p_join(self.root, fname), e_map) + samples += data + + return samples + + +if __name__ == "__main__": + for data_class in [TMQM]: + data = data_class() + n = len(data) + + for i in np.random.choice(n, 3, replace=False): + x = data[i] + print(x.name, x.subset, end=" ") + for k in x: + if x[k] is not None: + print(k, x[k].shape, end=" ") diff --git a/src/openqdc/datasets/waterclusters3_30.py b/src/openqdc/datasets/waterclusters3_30.py index e69de29..c711787 100644 --- a/src/openqdc/datasets/waterclusters3_30.py +++ b/src/openqdc/datasets/waterclusters3_30.py @@ -0,0 +1,86 @@ + +import numpy as np +from tqdm import tqdm +from os.path import join as p_join +from openqdc.datasets.base import BaseDataset, read_qc_archive_h5 +from openqdc.utils.constants import MAX_ATOMIC_NUMBER +from openqdc.utils.molecule import atom_table +from openqdc.utils.io import load_hdf5_file +from io import StringIO + + +def content_to_xyz(content, n_waters): + content = content.strip() + + try: + tmp = content.splitlines() + s = StringIO(content) + d = np.loadtxt(s, skiprows=2, dtype="str") + z, positions = d[:, 0], d[:, 1:].astype(np.float32) + z = np.array([atom_table.GetAtomicNumber(s) for s in z]) + xs = np.stack((z, np.zeros_like(z)), axis=-1) + e = float(tmp[1].strip().split(" ")[-1]) + except: + print("Error in reading xyz file") + print(n_waters, content) + return None + + conf = dict( + atomic_inputs=np.concatenate((xs, positions), axis=-1, dtype=np.float32), + name=np.array([f"water_{n_waters}"]), + energies=np.array([e], dtype=np.float32)[:, None] , + n_atoms=np.array([positions.shape[0]], dtype=np.int32), + subset=np.array([f"water_{n_waters}"]), + ) + + return conf + + +def read_xyz(fname, n_waters): + s = 3*n_waters+2 + with open(fname, "r") as f: + lines = f.readlines() + contents = ["".join(lines[i:i+s]) for i in range(0, len(lines), s)] + + res = [content_to_xyz(content, n_waters) for content in tqdm(contents)] + return res + + +class WaterClusters(BaseDataset): + __name__ = "waterclusters3_30" + + # Energy in hartree, all zeros by default + atomic_energies = np.zeros((MAX_ATOMIC_NUMBER,), dtype=np.float32) + + __energy_methods__ = [ + "ttm2.1-f" + ] + + energy_target_names = [ + "TTM2.1-F Potential" + ] + + def __init__(self) -> None: + super().__init__() + + def read_raw_entries(self): + samples = [] + for i in range(3, 31): + raw_path = p_join(self.root, f"W3-W30_all_geoms_TTM2.1-F/W{i}_geoms_all.xyz") + data = read_xyz(raw_path, i,) + samples += data + + return samples + + +if __name__ == "__main__": + for data_class in [WaterClusters]: + data = data_class() + n = len(data) + + for i in np.random.choice(n, 3, replace=False): + x = data[i] + print(x.name, x.subset, end=" ") + for k in x: + if x[k] is not None: + print(k, x[k].shape, end=" ") diff --git a/src/openqdc/raws/config_factory.py b/src/openqdc/raws/config_factory.py index 86d1d33..1255fa2 100644 --- a/src/openqdc/raws/config_factory.py +++ b/src/openqdc/raws/config_factory.py @@ -65,8 +65,8 @@ class DataConfigFactory: ) qm7x = dict( - dataset_name="qm7x", - links={f"{i}000.xz": "https://zenodo.org/record/4288677/files/{i}000.xz" for i in range(1, 9)}, + dataset_name="qm7x", # https://zenodo.org/record/4288677/files/1000.xz?download=1 + links={f"{i}000.xz": f"https://zenodo.org/record/4288677/files/{i}000.xz" for i in range(1, 9)}, ) qmugs = dict( @@ -82,6 +82,20 @@ class DataConfigFactory: links={"SPICE-1.1.4.hdf5": "https://zenodo.org/record/8222043/files/SPICE-1.1.4.hdf5"}, ) + dess = dict( + dataset_name="dess5m", + links={"DESS5M.zip": "https://zenodo.org/record/5706002/files/DESS5M.zip", + "DESS370.zip":"https://zenodo.org/record/5676266/files/DES370K.zip"}, + ) + + tmqm = dict( + dataset_name="tmqm", + links={ + x: f"https://raw.githubusercontent.com/bbskjelstad/tmqm/master/data/{x}" + for x in ["tmQM_X1.xyz.gz", "tmQM_X2.xyz.gz", "tmQM_y.csv", "Benchmark2_TPSSh_Opt.xyz"] + }, + ) + misato = dict( dataset_name="misato", links={ diff --git a/src/openqdc/raws/fetch.py b/src/openqdc/raws/fetch.py index b7fc4e4..a9c43b0 100644 --- a/src/openqdc/raws/fetch.py +++ b/src/openqdc/raws/fetch.py @@ -1,24 +1,86 @@ """Script to download the molecule3d dataset from Google Drive.""" -import gzip import os +import gzip +import tqdm +import gdown +import fsspec import shutil import socket import tarfile -import urllib.error -import urllib.request import zipfile - -import fsspec -import gdown import requests -import tqdm +import urllib.error +import urllib.request +import datamol as dm from loguru import logger from sklearn.utils import Bunch from openqdc.raws.config_factory import DataConfigFactory +from openqdc.raws.pubchemqc import download_b3lyp_pm6 from openqdc.utils.io import get_local_cache +def download_url(url, local_filename): + logger.info(f"Url: {url} File: {local_filename}") + if "drive.google.com" in url: + gdown.download(url, local_filename, quiet=False) + elif "raw.github" in url: + r = requests.get(url, allow_redirects=True) + with open(local_filename, 'wb') as f: + f.write(r.content) + else: + r = requests.get(url, stream=True) + with fsspec.open(local_filename, "wb") as f: + for chunk in tqdm.tqdm(r.iter_content(chunk_size=16384)): + if chunk: + f.write(chunk) + + +def decompress_tar_gz(local_filename): + parent = os.path.dirname(local_filename) + with tarfile.open(local_filename) as tar: + logger.info(f"Verifying archive extraction states: {local_filename}") + all_names = tar.getnames() + all_extracted = all([os.path.exists(os.path.join(parent, x)) for x in all_names]) + if not all_extracted: + logger.info(f"Extracting archive: {local_filename}") + tar.extractall(path=parent) + else: + logger.info(f"Archive already extracted: {local_filename}") + + +def decompress_zip(local_filename): + parent = os.path.dirname(local_filename) + + logger.info(f"Verifying archive extraction states: {local_filename}") + with zipfile.ZipFile(local_filename, "r") as zip_ref: + all_names = zip_ref.namelist() + all_extracted = all([os.path.exists(os.path.join(parent, x)) for x in all_names]) + if not all_extracted: + logger.info(f"Extracting archive: {local_filename}") + zip_ref.extractall(parent) + else: + logger.info(f"Archive already extracted: {local_filename}") + + +def decompress_gz(local_filename): + parent = os.path.dirname(local_filename) + + logger.info(f"Verifying archive extraction states: {local_filename}") + out_filename = local_filename.replace(".gz", "") + if out_filename.endswith("hdf5"): + out_filename = local_filename.replace("hdf5", "h5") + + all_extracted = os.path.exists(out_filename) + if not all_extracted: + logger.info(f"Extracting archive: {local_filename}") + with gzip.open(local_filename, "rb") as f_in, open(out_filename, "wb") as f_out: + shutil.copyfileobj(f_in, f_out) + else: + logger.info(f"Archive already extracted: {local_filename}") + + + # function to download large files with requests def fetch_file(url, local_filename, overwrite=False): """ @@ -40,50 +102,18 @@ def fetch_file(url, local_filename, overwrite=False): if os.path.exists(local_filename) and not overwrite: logger.info("File already exists, skipping download") else: - logger.info(f"File: {local_filename}") - if "drive.google.com" in url: - gdown.download(url, local_filename, quiet=False) - else: - r = requests.get(url, stream=True) - with fsspec.open(local_filename, "wb") as f: - for chunk in tqdm.tqdm(r.iter_content(chunk_size=16384)): - if chunk: - f.write(chunk) + download_url(url, local_filename) # decompress archive if necessary parent = os.path.dirname(local_filename) if local_filename.endswith("tar.gz"): - with tarfile.open(local_filename) as tar: - logger.info(f"Verifying archive extraction states: {local_filename}") - all_names = tar.getnames() - all_extracted = all([os.path.exists(os.path.join(parent, x)) for x in all_names]) - if not all_extracted: - logger.info(f"Extracting archive: {local_filename}") - tar.extractall(path=parent) - else: - logger.info(f"Archive already extracted: {local_filename}") + decompress_tar_gz(local_filename) elif local_filename.endswith("zip"): - logger.info(f"Verifying archive extraction states: {local_filename}") - with zipfile.ZipFile(local_filename, "r") as zip_ref: - all_names = zip_ref.namelist() - all_extracted = all([os.path.exists(os.path.join(parent, x)) for x in all_names]) - if not all_extracted: - logger.info(f"Extracting archive: {local_filename}") - zip_ref.extractall(parent) - else: - logger.info(f"Archive already extracted: {local_filename}") - - elif local_filename.endswith("hdf5.gz"): - logger.info(f"Verifying archive extraction states: {local_filename}") - out_filename = local_filename.replace("hdf5.gz", "h5") - all_extracted = os.path.exists(out_filename) - if not all_extracted: - logger.info(f"Extracting archive: {local_filename}") - with gzip.open(local_filename, "rb") as f_in, open(out_filename, "wb") as f_out: - shutil.copyfileobj(f_in, f_out) - else: - logger.info(f"Archive already extracted: {local_filename}") + decompress_zip(local_filename) + + elif local_filename.endswith(".gz"): + decompress_gz(local_filename) elif local_filename.endswith("xz"): logger.info(f"Extracting archive: {local_filename}") @@ -132,8 +162,9 @@ def from_name(self, name): if __name__ == "__main__": - dataset_names = DataConfigFactory.available_datasets - dataset_names = ["ani"] - for dataset_name in dataset_names: - dd = DataDownloader() - dd.from_name(dataset_name) + download_b3lyp_pm6() + # dataset_names = DataConfigFactory.available_datasets + # dataset_names = ["tmqm"] + # for dataset_name in dataset_names: + # dd = DataDownloader() + # dd.from_name(dataset_name) diff --git a/src/openqdc/raws/pubchemqc.py b/src/openqdc/raws/pubchemqc.py new file mode 100644 index 0000000..1c45d13 --- /dev/null +++ b/src/openqdc/raws/pubchemqc.py @@ -0,0 +1,117 @@ +import os +import click +import datamol as dm +from tqdm import tqdm +from openqdc.utils.io import get_local_cache + + +def download_b3lyp_pm6_item(i, method='b3lyp'): + try: + step_size = 25000 + start = str(i * step_size + 1).rjust(9, '0') + stop = str((i + 1) * step_size).rjust(9, '0') + + cmd_b3lyp = f"""wget --header="Host: chibakoudai.sharepoint.com" +--header="User-Agent: Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 +(KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36" --header="Accept: +text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image +/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7" +--header="Accept-Language: en-GB,en-US;q=0.9,en;q=0.8,fr;q=0.7" +--header="Referer: https://chibakoudai.sharepoint.com/sites/stair02/Shared +%20Documents/Forms/AllItems.aspx?ga=1&id=%2Fsites%2Fstair02%2FShared%20 +Documents%2Fdata%2FPubChemQC%2FB3LYP%5FPM6%2Fb3lyp%5Fpm6%5Fver1%2E0%2E0%2F +json%2Fall%2FCompound%5F{start}%5F{stop}%2Etar%2Exz&viewid=f6d34767%2 +D64f0%2D480e%2Dab70%2Dd8524dbdc74e&parent=%2Fsites%2Fstair02%2FShared%20 +Documents%2Fdata%2FPubChemQC%2FB3LYP%5FPM6%2Fb3lyp%5Fpm6%5Fver1%2E0%2E0%2F +json%2Fall" --header="Cookie: MicrosoftApplicationsTelemetryDeviceId=cec40b8a +-9870-4c4f-bb71-838a300c8685; MSFPC=GUID=511089efdbeb49d3923fdc7e6404bd9b& +HASH=5110&LV=202303&V=4&LU=1678287635332; WSS_FullScreenMode=false; FedAuth= +77u/PD94bWwgdmVyc2lvbj0iMS4wIiBlbmNvZGluZz0idXRmLTgiPz48U1A+VjEzLDBoLmZ8bWV +tYmVyc2hpcHx1cm4lM2FzcG8lM2Fhbm9uIzI1YTJjOTIyMjVkMWNlNzlkOWVmN2NjYmRjNzc5Y +WI1MmJhMDY2N2E5NDRmZTg3NGFmZTFhZjRjMjQ0OGE3ZTUsMCMuZnxtZW1iZXJzaGlwfHVybiU +zYXNwbyUzYWFub24jMjVhMmM5MjIyNWQxY2U3OWQ5ZWY3Y2NiZGM3NzlhYjUyYmEwNjY3YTk0N +GZlODc0YWZlMWFmNGMyNDQ4YTdlNSwxMzM0MTM0MDE0OTAwMDAwMDAsMCwxMzM0MTYwOTY0NjE +yNDYxODcsMC4wLjAuMCwyNTgsNTIyZTlhNzYtYWNiZC00MDJiLWEyZmMtN2NmNjg5ZGRmNTkwL +CwsOTA3NGUyYTAtNDAyNS0yMDAwLWE2NDEtYjdiNDU2N2JlNzI5LDc2MjNlM2EwLWQwYzEtMjA +wMC1hNjQxLWI4MGEyYmU3YmExZSxSNFZHUmtMSXdrT3RETDI0alZUSm9RLDAsMCwwLCwsLDI2N +TA0Njc3NDM5OTk5OTk5OTksMCwsLCwsLCwwLCwxOTU2NzYsR0FkeFdYM3FnLXBsUDRlOVhCUDF +5MTZpZmpVLFN2QXdUYjI3b0MrM0RKa2hsODdRNnhkVFVpQ2l5U0tqU2RxZ3EzNUFsa2lOcmczQ +0NJZWplSmNCR1dteCtWRS8zL1lacmZFYVk3eGJGVDFSWHoxREhXVE5oK0dUSzhiQ0FYOUUxQ20 +yUXpPVG5jZm5MNDdpWUVOLzRzUzVTdnFpbnZ1eDh3L2FrQmZISW01Zlpqbk02c25KOWs5V294b +24wY1F1dUgvY1d0UUNOTkJ2WmtvRkVReitUVldBSmtQRmtxNUlibXFyL2hMUzcreGlqS3FWeXd +WZldIeGp3Q25iUTlzYitjcnhqcDlYR2szLzZ1YUFUeTMyVi9MVFBBdmM4am9wL2hRdjV4bXBnZ +k95M1cvSkljNXpPTlBlbmdQVkl2MXJtb0EwS0h6QVpCNjBnY3pEM1BaYWZVZHFsdGV6RndRTTV +xSFB3Q1hqelJ3SDRyL0Vsdz09PC9TUD4=" --header="Connection: keep-alive" "https +://chibakoudai.sharepoint.com/sites/stair02/_layouts/15/download.aspx? +SourceUrl=%2Fsites%2Fstair02%2FShared%20Documents%2Fdata%2FPubChemQC%2FB3LYP +%5FPM6%2Fb3lyp%5Fpm6%5Fver1%2E0%2E0%2Fjson%2Fall%2FCompound%5F{start}%5F +{stop}%2Etar%2Exz" -c -O 'Compound_{start}_{stop}.tar.xz' > /dev/null 2>&1""" + + + cmd_pm6 = f"""wget --header="Host: chibakoudai.sharepoint.com" +--header="User-Agent: Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 +(KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36" --header="Accept: +text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/ +webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7" +--header="Accept-Language: en-GB,en-US;q=0.9,en;q=0.8,fr;q=0.7" +--header="Referer: https://chibakoudai.sharepoint.com/sites/stair01/ +Shared%20Documents/Forms/AllItems.aspx?ga=1&id=%2Fsites%2Fstair01%2F +Shared%20Documents%2Fdata%2FPubChemQC%2FPM6%2Fpm6opt%5Fver2%2E0%2E0%2F +json%2Fall%2FCompound%5F{start}%5F{stop}%2Etar%2Exz&viewid=2a7fb7f8 +%2Df3f8%2D4ad2%2D931e%2Dfc786e938ea8&parent=%2Fsites%2Fstair01%2FShared +%20Documents%2Fdata%2FPubChemQC%2FPM6%2Fpm6opt%5Fver2%2E0%2E0%2Fjson%2Fall" +--header="Cookie: MicrosoftApplicationsTelemetryDeviceId=cec40b8a-9870- +4c4f-bb71-838a300c8685; MSFPC=GUID=511089efdbeb49d3923fdc7e6404bd9b& +HASH=5110&LV=202303&V=4&LU=1678287635332; WSS_FullScreenMode=false; +FedAuth=77u/PD94bWwgdmVyc2lvbj0iMS4wIiBlbmNvZGluZz0idXRmLTgiPz48U1A+ +VjEzLDBoLmZ8bWVtYmVyc2hpcHx1cm4lM2FzcG8lM2Fhbm9uIzI1YTJjOTIyMjVkMWNl +NzlkOWVmN2NjYmRjNzc5YWI1MmJhMDY2N2E5NDRmZTg3NGFmZTFhZjRjMjQ0OGE3ZTUs +MCMuZnxtZW1iZXJzaGlwfHVybiUzYXNwbyUzYWFub24jMjVhMmM5MjIyNWQxY2U3OWQ5 +ZWY3Y2NiZGM3NzlhYjUyYmEwNjY3YTk0NGZlODc0YWZlMWFmNGMyNDQ4YTdlNSwxMzM0 +MTM0MDE0OTAwMDAwMDAsMCwxMzM0MTYyODU0MDg5NTI4NjAsMC4wLjAuMCwyNTgsNTIy +ZTlhNzYtYWNiZC00MDJiLWEyZmMtN2NmNjg5ZGRmNTkwLCwsOTA3NGUyYTAtNDAyNS0y +MDAwLWE2NDEtYjdiNDU2N2JlNzI5LDdiMzVlM2EwLWYwYmMtMjAwMC05ZmM0LWU4ODFi +NmM4NGNjZSxSNFZHUmtMSXdrT3RETDI0alZUSm9RLDAsMCwwLCwsLDI2NTA0Njc3NDM5 +OTk5OTk5OTksMCwsLCwsLCwwLCwxOTU2NzYsR0FkeFdYM3FnLXBsUDRlOVhCUDF5MTZp +ZmpVLFNRKzRNWHJYNzRaSHUxMUxVcE9adVZTT1BiK0xJTllwdHY3YTBIM2hLOEdPNThw +L1F1VDZ2K1FTWUZWekpqL3FFblp1TUhlVjFWaytxQ2lhSC9tWXNkMXlRM1N6YVRJaUtx +cHVsWkhTUEVsWmg4TmtHMDhzT3ZXN2J5dW1OMmY4dFJMUVNmekFYQnREVzdnN1hUMUgy +MUsyVlFyUys3WEtHSXpvMmFjQU5XQVNMUTQwRTJFVEd5SlhjRE9ya09HS2ZiSThDVWk4 +bHNwaFRVZTJ6UjBPbjRZaGVFSDUrYTJsSVB4bUNLdG0weXBsS1V6M2pEakxHcml0Rk5l +dWdUdEk0WUpZY3ZOcGZENmZDU0M3dGFhOXlXYmpZUU1QMlhmbXd1bGtkRCs1aUdYRjZi +SFNBNXlNY1FuUXBCVWZjSjgwcDZXSmtlbXlzMWlWZXA5RGU4UHpvZz09PC9TUD4=" +--header="Connection: keep-alive" "https://chibakoudai.sharepoint.com/ +sites/stair01/_layouts/15/download.aspx?SourceUrl=%2Fsites%2Fstair01%2F +Shared%20Documents%2Fdata%2FPubChemQC%2FPM6%2Fpm6opt%5Fver2%2E0%2E0%2F +json%2Fall%2FCompound%5F{start}%5F{stop}%2Etar%2Exz" -c -O +'Compound_{start}_{stop}.tar.xz' > /dev/null 2>&1""" + + cmd = cmd_b3lyp if method == 'b3lyp' else cmd_pm6 + cmd = cmd.replace('\n', '') + os.system(cmd) + except Exception as e: + pass + # else: + # print(f"Downloaded: Compound_{start}_{stop}.tar.xz") + + +def download_b3lyp_pm6(start=0, stop=10000, method='b3lyp'): + path = os.path.join(get_local_cache(), "pubchemqc", method) + os.makedirs(path, exist_ok=True) + os.chdir(path) + ixs = list(range(start, stop)) + for i in tqdm(ixs): + download_b3lyp_pm6_item(i, method=method) + + +@click.command() +@click.option("--id", "-i", type=int, default=0, help="chunk id starting at 0") +@click.option("--chunk-size", "-s", type=int, default=50, help="Chunk size to divide and conquer.") +@click.option("--method", "-m", type=str, default="pm6", help="QM Method used for the calculations.") +def main(id, chunk_size, method): + start = id * chunk_size + stop = (id + 1) * chunk_size + download_b3lyp_pm6(start=start, stop=stop, method=method) + +if __name__ == "__main__": + main() \ No newline at end of file From f2e1664e2b6cc24f1f31892d4e5e4fe737fc08ca Mon Sep 17 00:00:00 2001 From: prtos Date: Thu, 12 Oct 2023 17:55:59 +0000 Subject: [PATCH 2/2] precommit passed --- .gitignore | 1 - src/openqdc/datasets/dess.py | 58 +++++++++++++---------- src/openqdc/datasets/orbnet_denali.py | 2 +- src/openqdc/datasets/pcqm.py | 53 +++++++++++---------- src/openqdc/datasets/qm7x.py | 49 ++++++++----------- src/openqdc/datasets/sn2_rxn.py | 3 +- src/openqdc/datasets/solvated_peptides.py | 3 +- src/openqdc/datasets/tmqm.py | 24 ++++------ src/openqdc/datasets/waterclusters3_30.py | 35 +++++++------- src/openqdc/raws/config_factory.py | 12 +++-- src/openqdc/raws/fetch.py | 21 ++++---- src/openqdc/raws/pubchemqc.py | 49 +++++++++---------- 12 files changed, 151 insertions(+), 159 deletions(-) diff --git a/.gitignore b/.gitignore index 5fbc33b..5d3cc32 100644 --- a/.gitignore +++ b/.gitignore @@ -147,4 +147,3 @@ nohup.out *.csv *.txt *.sh - diff --git a/src/openqdc/datasets/dess.py b/src/openqdc/datasets/dess.py index 96dd008..f67ccab 100644 --- a/src/openqdc/datasets/dess.py +++ b/src/openqdc/datasets/dess.py @@ -4,9 +4,8 @@ import numpy as np import pandas as pd from tqdm import tqdm -from rdkit.Chem import MolFromMolBlock + from openqdc.datasets.base import BaseDataset -from openqdc.utils import load_json, load_pkl from openqdc.utils.constants import MAX_ATOMIC_NUMBER from openqdc.utils.molecule import get_atomic_number_and_charge @@ -37,25 +36,25 @@ def read_mol(mol_path, smiles, subset, targets): class DESS(BaseDataset): __name__ = "dess" __energy_methods__ = [ - 'mp2_cc', - 'mp2_qz', - 'mp2_tz', - 'mp2_cbs', - 'ccsd(t)_cc', - 'ccsd(t)_cbs', - 'ccsd(t)_nn', - 'sapt', + "mp2_cc", + "mp2_qz", + "mp2_tz", + "mp2_cbs", + "ccsd(t)_cc", + "ccsd(t)_cbs", + "ccsd(t)_nn", + "sapt", ] energy_target_names = [ - 'cc_MP2_all', - 'qz_MP2_all', - 'tz_MP2_all', - 'cbs_MP2_all', - 'cc_CCSD(T)_all', - 'cbs_CCSD(T)_all', - 'nn_CCSD(T)_all', - 'sapt_all', + "cc_MP2_all", + "qz_MP2_all", + "tz_MP2_all", + "cbs_MP2_all", + "cc_CCSD(T)_all", + "cbs_CCSD(T)_all", + "nn_CCSD(T)_all", + "sapt_all", ] # ['qz_MP2_all', 'tz_MP2_all', 'cbs_MP2_all', 'sapt_all', 'nn_CCSD(T)_all'] @@ -72,16 +71,25 @@ def _read_raw_(self, part): for col in self.energy_target_names: if col not in df.columns: df[col] = np.nan - smiles = (df['smiles0'] + '.' + df['smiles1']).tolist() + smiles = (df["smiles0"] + "." + df["smiles1"]).tolist() subsets = (f"{part}_" + df["group_orig"]).tolist() targets = df[self.energy_target_names].values - paths = p_join(self.root, "geometries/") + df["system_id"].astype(str) + f"/{part}_" + df["geom_id"].astype(str) + ".mol" - - inputs = [dict(smiles=smiles[i], subset=subsets[i], targets=targets[i], mol_path=paths[i]) - for i in tqdm(range(len(smiles)))] + paths = ( + p_join(self.root, "geometries/") + + df["system_id"].astype(str) + + f"/{part}_" + + df["geom_id"].astype(str) + + ".mol" + ) + + inputs = [ + dict(smiles=smiles[i], subset=subsets[i], targets=targets[i], mol_path=paths[i]) + for i in tqdm(range(len(smiles))) + ] f = lambda xs: [read_mol(**x) for x in xs] - samples = dm.parallelized_with_batches(f, inputs, n_jobs=-1, progress=True, - batch_size=1024, scheduler= "threads") + samples = dm.parallelized_with_batches( + f, inputs, n_jobs=-1, progress=True, batch_size=1024, scheduler="threads" + ) return samples def read_raw_entries(self): diff --git a/src/openqdc/datasets/orbnet_denali.py b/src/openqdc/datasets/orbnet_denali.py index 6901737..5e44263 100644 --- a/src/openqdc/datasets/orbnet_denali.py +++ b/src/openqdc/datasets/orbnet_denali.py @@ -60,7 +60,7 @@ def read_raw_entries(self): # if i > 10: # break # exit() - fn = lambda x: read_mol(x[0], x[1], self.root, self.energy_target_names) + fn = lambda x: read_archive(x[0], x[1], self.root, self.energy_target_names) res = dm.parallelized(fn, list(labels.items()), scheduler="threads", n_jobs=-1, progress=True) samples = sum(res, []) return samples diff --git a/src/openqdc/datasets/pcqm.py b/src/openqdc/datasets/pcqm.py index 69970ff..bfa0e38 100644 --- a/src/openqdc/datasets/pcqm.py +++ b/src/openqdc/datasets/pcqm.py @@ -1,46 +1,48 @@ -from os.path import join as p_join import json import tarfile +from glob import glob +from os.path import join as p_join + import datamol as dm import numpy as np import pandas as pd -from glob import glob -from tqdm import tqdm -from rdkit.Chem import MolFromMolBlock + from openqdc.datasets.base import BaseDataset -from openqdc.utils import load_json, load_pkl from openqdc.utils.constants import MAX_ATOMIC_NUMBER -from openqdc.utils.molecule import get_atomic_number_and_charge -def flatten_dict(d, sep: str= '.'): - return pd.json_normalize(d, sep=sep).to_dict(orient='records')[0] +def flatten_dict(d, sep: str = "."): + return pd.json_normalize(d, sep=sep).to_dict(orient="records")[0] + -def read_content(fd): +def read_content(f): try: - f = tar.extractfile(fd) r = flatten_dict(json.load(f)) - x = np.concatenate((r['atoms.elements.number'][:, None], - r['atoms.core electrons'][:, None], - r['atoms.coords.3d'].reshape(-1, 3)), - axis=-1).astype(np.float32) + x = np.concatenate( + ( + r["atoms.elements.number"][:, None], + r["atoms.core electrons"][:, None], + r["atoms.coords.3d"].reshape(-1, 3), + ), + axis=-1, + ).astype(np.float32) res = dict( - name=np.array([r['smiles']]), - subset=np.array([r['formula']]), - energies=np.array(['properties.energy.total']).astype(np.float32)[None, :], + name=np.array([r["smiles"]]), + subset=np.array([r["formula"]]), + energies=np.array(["properties.energy.total"]).astype(np.float32)[None, :], atomic_inputs=x, n_atoms=np.array([x.shape[0]], dtype=np.int32), ) - except Exception as e: + except Exception: res = None return res -def read_archive(path): +def read_archive(path): with tarfile.open(path) as tar: - res = [read_content(member) for member in tar.getmembers()] + res = [read_content(tar.extractfile(member)) for member in tar.getmembers()] # print(len(res)) return res @@ -48,13 +50,13 @@ def read_archive(path): class PubchemQC(BaseDataset): __name__ = "pubchemqc" __energy_methods__ = [ - 'b3lyp', - 'pm6', + "b3lyp", + "pm6", ] energy_target_names = [ - 'b3lyp', - 'pm6', + "b3lyp", + "pm6", ] # Energy in hartree, all zeros by default @@ -68,7 +70,7 @@ def __init__(self) -> None: def _read_raw_(self, part): arxiv_paths = glob(p_join(self.root, f"{part}", "*.tar.gz")) print(len(arxiv_paths)) - samples = dm.parallelized(read_archive, arxiv_paths, n_jobs=-1, progress=True, scheduler='threads') + samples = dm.parallelized(read_archive, arxiv_paths, n_jobs=-1, progress=True, scheduler="threads") res = sum(samples, []) print(len(res)) exit() @@ -90,4 +92,3 @@ def read_raw_entries(self): for k in x: if x[k] is not None: print(k, x[k].shape, end=" ") - diff --git a/src/openqdc/datasets/qm7x.py b/src/openqdc/datasets/qm7x.py index bbcaab8..da055d5 100644 --- a/src/openqdc/datasets/qm7x.py +++ b/src/openqdc/datasets/qm7x.py @@ -1,8 +1,9 @@ +from os.path import join as p_join import numpy as np from tqdm import tqdm -from os.path import join as p_join -from openqdc.datasets.base import BaseDataset, read_qc_archive_h5 + +from openqdc.datasets.base import BaseDataset from openqdc.utils.constants import MAX_ATOMIC_NUMBER from openqdc.utils.io import load_hdf5_file @@ -10,23 +11,19 @@ def read_mol(mol_h5, mol_name, energy_target_names, force_target_names): m = mol_h5 cids = list(mol_h5.keys()) - - zs = [m[c]['atNUM'] for c in cids] - xyz = np.concatenate([m[c]['atXYZ'] for c in cids], axis=0) + + zs = [m[c]["atNUM"] for c in cids] + xyz = np.concatenate([m[c]["atXYZ"] for c in cids], axis=0) n_atoms = np.array([len(z) for z in zs], dtype=np.int32) n, zs = len(n_atoms), np.concatenate(zs, axis=0) a_inputs = np.concatenate([np.stack([zs, np.zeros_like(zs)], axis=-1), xyz], axis=-1) - - forces = np.concatenate([np.stack([m[c][f_tag] - for f_tag in force_target_names], axis=-1) - for c in cids], axis=0) - energies = np.stack([np.array([m[c][e_tag][0] - for e_tag in energy_target_names]) - for c in cids], axis=0) + + forces = np.concatenate([np.stack([m[c][f_tag] for f_tag in force_target_names], axis=-1) for c in cids], axis=0) + energies = np.stack([np.array([m[c][e_tag][0] for e_tag in energy_target_names]) for c in cids], axis=0) res = dict( - name=np.array([mol_name]*n), - subset=np.array(["qm7x"]*n), + name=np.array([mol_name] * n), + subset=np.array(["qm7x"] * n), energies=energies.astype(np.float32), atomic_inputs=a_inputs.astype(np.float32), forces=forces.astype(np.float32), @@ -35,27 +32,20 @@ def read_mol(mol_h5, mol_name, energy_target_names, force_target_names): return res + class QM7X(BaseDataset): __name__ = "qm7x" # Energy in hartree, all zeros by default atomic_energies = np.zeros((MAX_ATOMIC_NUMBER,), dtype=np.float32) - __energy_methods__ = [ - "pbe-ts", "mbd" - ] + __energy_methods__ = ["pbe-ts", "mbd"] - energy_target_names = [ - "ePBE0", "eMBD" - ] + energy_target_names = ["ePBE0", "eMBD"] - __force_methods__ = [ - "pbe-ts", "vdw" - ] + __force_methods__ = ["pbe-ts", "vdw"] - force_target_names = [ - "pbe0FOR", "vdwFOR" - ] + force_target_names = ["pbe0FOR", "vdwFOR"] def __init__(self) -> None: super().__init__() @@ -65,9 +55,10 @@ def read_raw_entries(self): for i in range(1, 9): raw_path = p_join(self.root, f"{i}000") data = load_hdf5_file(raw_path) - samples += [read_mol(data[k], k, self.energy_target_names, self.force_target_names) - for k in tqdm(data.keys())] - + samples += [ + read_mol(data[k], k, self.energy_target_names, self.force_target_names) for k in tqdm(data.keys()) + ] + return samples diff --git a/src/openqdc/datasets/sn2_rxn.py b/src/openqdc/datasets/sn2_rxn.py index f418f95..88da2c5 100644 --- a/src/openqdc/datasets/sn2_rxn.py +++ b/src/openqdc/datasets/sn2_rxn.py @@ -33,8 +33,7 @@ def __init__(self) -> None: def read_raw_entries(self): raw_path = p_join(self.root, "sn2_rxn.h5") - samples = read_qc_archive_h5(raw_path, "sn2_rxn", self.energy_target_names, - self.force_target_names) + samples = read_qc_archive_h5(raw_path, "sn2_rxn", self.energy_target_names, self.force_target_names) return samples diff --git a/src/openqdc/datasets/solvated_peptides.py b/src/openqdc/datasets/solvated_peptides.py index 1ff0ebe..801d28f 100644 --- a/src/openqdc/datasets/solvated_peptides.py +++ b/src/openqdc/datasets/solvated_peptides.py @@ -33,8 +33,7 @@ def __init__(self) -> None: def read_raw_entries(self): raw_path = p_join(self.root, "solvated_peptides.h5") - samples = read_qc_archive_h5(raw_path, "solvated_peptides", self.energy_target_names, - self.force_target_names) + samples = read_qc_archive_h5(raw_path, "solvated_peptides", self.energy_target_names, self.force_target_names) return samples diff --git a/src/openqdc/datasets/tmqm.py b/src/openqdc/datasets/tmqm.py index 548fdf4..c08ccfc 100644 --- a/src/openqdc/datasets/tmqm.py +++ b/src/openqdc/datasets/tmqm.py @@ -1,13 +1,13 @@ +from io import StringIO +from os.path import join as p_join import numpy as np import pandas as pd from tqdm import tqdm -from os.path import join as p_join -from openqdc.datasets.base import BaseDataset, read_qc_archive_h5 + +from openqdc.datasets.base import BaseDataset from openqdc.utils.constants import MAX_ATOMIC_NUMBER from openqdc.utils.molecule import atom_table -from openqdc.utils.io import load_hdf5_file -from io import StringIO def content_to_xyz(content, e_map): @@ -15,7 +15,7 @@ def content_to_xyz(content, e_map): tmp = content.split("\n")[1].split(" | ") code = tmp[0].split(" ")[-1] name = tmp[3].split(" ")[-1] - except: + except Exception: print(content) return None @@ -25,11 +25,11 @@ def content_to_xyz(content, e_map): z = np.array([atom_table.GetAtomicNumber(s) for s in z]) xs = np.stack((z, np.zeros_like(z)), axis=-1) e = e_map[code] - + conf = dict( atomic_inputs=np.concatenate((xs, positions), axis=-1, dtype=np.float32), name=np.array([name]), - energies=np.array([e], dtype=np.float32)[:, None] , + energies=np.array([e], dtype=np.float32)[:, None], n_atoms=np.array([positions.shape[0]], dtype=np.int32), subset=np.array(["tmqm"]), ) @@ -40,7 +40,7 @@ def content_to_xyz(content, e_map): def read_xyz(fname, e_map): with open(fname, "r") as f: contents = f.read().split("\n\n") - + print("toto", len(contents)) res = [content_to_xyz(content, e_map) for content in tqdm(contents)] return res @@ -52,13 +52,9 @@ class TMQM(BaseDataset): # Energy in hartree, all zeros by default atomic_energies = np.zeros((MAX_ATOMIC_NUMBER,), dtype=np.float32) - __energy_methods__ = [ - "tpssh_tz" - ] + __energy_methods__ = ["tpssh_tz"] - energy_target_names = [ - "TPSSh/def2TZVP level" - ] + energy_target_names = ["TPSSh/def2TZVP level"] def __init__(self) -> None: super().__init__() diff --git a/src/openqdc/datasets/waterclusters3_30.py b/src/openqdc/datasets/waterclusters3_30.py index c711787..eff78b1 100644 --- a/src/openqdc/datasets/waterclusters3_30.py +++ b/src/openqdc/datasets/waterclusters3_30.py @@ -1,12 +1,12 @@ +from io import StringIO +from os.path import join as p_join import numpy as np from tqdm import tqdm -from os.path import join as p_join -from openqdc.datasets.base import BaseDataset, read_qc_archive_h5 + +from openqdc.datasets.base import BaseDataset from openqdc.utils.constants import MAX_ATOMIC_NUMBER from openqdc.utils.molecule import atom_table -from openqdc.utils.io import load_hdf5_file -from io import StringIO def content_to_xyz(content, n_waters): @@ -20,15 +20,15 @@ def content_to_xyz(content, n_waters): z = np.array([atom_table.GetAtomicNumber(s) for s in z]) xs = np.stack((z, np.zeros_like(z)), axis=-1) e = float(tmp[1].strip().split(" ")[-1]) - except: + except Exception: print("Error in reading xyz file") print(n_waters, content) return None - + conf = dict( atomic_inputs=np.concatenate((xs, positions), axis=-1, dtype=np.float32), name=np.array([f"water_{n_waters}"]), - energies=np.array([e], dtype=np.float32)[:, None] , + energies=np.array([e], dtype=np.float32)[:, None], n_atoms=np.array([positions.shape[0]], dtype=np.int32), subset=np.array([f"water_{n_waters}"]), ) @@ -37,11 +37,11 @@ def content_to_xyz(content, n_waters): def read_xyz(fname, n_waters): - s = 3*n_waters+2 + s = 3 * n_waters + 2 with open(fname, "r") as f: lines = f.readlines() - contents = ["".join(lines[i:i+s]) for i in range(0, len(lines), s)] - + contents = ["".join(lines[i : i + s]) for i in range(0, len(lines), s)] + res = [content_to_xyz(content, n_waters) for content in tqdm(contents)] return res @@ -52,13 +52,9 @@ class WaterClusters(BaseDataset): # Energy in hartree, all zeros by default atomic_energies = np.zeros((MAX_ATOMIC_NUMBER,), dtype=np.float32) - __energy_methods__ = [ - "ttm2.1-f" - ] + __energy_methods__ = ["ttm2.1-f"] - energy_target_names = [ - "TTM2.1-F Potential" - ] + energy_target_names = ["TTM2.1-F Potential"] def __init__(self) -> None: super().__init__() @@ -67,9 +63,12 @@ def read_raw_entries(self): samples = [] for i in range(3, 31): raw_path = p_join(self.root, f"W3-W30_all_geoms_TTM2.1-F/W{i}_geoms_all.xyz") - data = read_xyz(raw_path, i,) + data = read_xyz( + raw_path, + i, + ) samples += data - + return samples diff --git a/src/openqdc/raws/config_factory.py b/src/openqdc/raws/config_factory.py index 1255fa2..87e7620 100644 --- a/src/openqdc/raws/config_factory.py +++ b/src/openqdc/raws/config_factory.py @@ -65,7 +65,7 @@ class DataConfigFactory: ) qm7x = dict( - dataset_name="qm7x", # https://zenodo.org/record/4288677/files/1000.xz?download=1 + dataset_name="qm7x", # https://zenodo.org/record/4288677/files/1000.xz?download=1 links={f"{i}000.xz": f"https://zenodo.org/record/4288677/files/{i}000.xz" for i in range(1, 9)}, ) @@ -84,16 +84,18 @@ class DataConfigFactory: dess = dict( dataset_name="dess5m", - links={"DESS5M.zip": "https://zenodo.org/record/5706002/files/DESS5M.zip", - "DESS370.zip":"https://zenodo.org/record/5676266/files/DES370K.zip"}, + links={ + "DESS5M.zip": "https://zenodo.org/record/5706002/files/DESS5M.zip", + "DESS370.zip": "https://zenodo.org/record/5676266/files/DES370K.zip", + }, ) - + tmqm = dict( dataset_name="tmqm", links={ x: f"https://raw.githubusercontent.com/bbskjelstad/tmqm/master/data/{x}" for x in ["tmQM_X1.xyz.gz", "tmQM_X2.xyz.gz", "tmQM_y.csv", "Benchmark2_TPSSh_Opt.xyz"] - }, + }, ) misato = dict( diff --git a/src/openqdc/raws/fetch.py b/src/openqdc/raws/fetch.py index a9c43b0..5aefa10 100644 --- a/src/openqdc/raws/fetch.py +++ b/src/openqdc/raws/fetch.py @@ -1,17 +1,17 @@ """Script to download the molecule3d dataset from Google Drive.""" -import os import gzip -import tqdm -import gdown -import fsspec +import os import shutil import socket import tarfile -import zipfile -import requests import urllib.error import urllib.request -import datamol as dm +import zipfile + +import fsspec +import gdown +import requests +import tqdm from loguru import logger from sklearn.utils import Bunch @@ -26,7 +26,7 @@ def download_url(url, local_filename): gdown.download(url, local_filename, quiet=False) elif "raw.github" in url: r = requests.get(url, allow_redirects=True) - with open(local_filename, 'wb') as f: + with open(local_filename, "wb") as f: f.write(r.content) else: r = requests.get(url, stream=True) @@ -35,7 +35,7 @@ def download_url(url, local_filename): if chunk: f.write(chunk) - + def decompress_tar_gz(local_filename): parent = os.path.dirname(local_filename) with tarfile.open(local_filename) as tar: @@ -64,8 +64,6 @@ def decompress_zip(local_filename): def decompress_gz(local_filename): - parent = os.path.dirname(local_filename) - logger.info(f"Verifying archive extraction states: {local_filename}") out_filename = local_filename.replace(".gz", "") if out_filename.endswith("hdf5"): @@ -80,7 +78,6 @@ def decompress_gz(local_filename): logger.info(f"Archive already extracted: {local_filename}") - # function to download large files with requests def fetch_file(url, local_filename, overwrite=False): """ diff --git a/src/openqdc/raws/pubchemqc.py b/src/openqdc/raws/pubchemqc.py index 1c45d13..d756bd5 100644 --- a/src/openqdc/raws/pubchemqc.py +++ b/src/openqdc/raws/pubchemqc.py @@ -1,22 +1,23 @@ import os + import click -import datamol as dm from tqdm import tqdm + from openqdc.utils.io import get_local_cache -def download_b3lyp_pm6_item(i, method='b3lyp'): +def download_b3lyp_pm6_item(i, method="b3lyp"): try: step_size = 25000 - start = str(i * step_size + 1).rjust(9, '0') - stop = str((i + 1) * step_size).rjust(9, '0') + start = str(i * step_size + 1).rjust(9, "0") + stop = str((i + 1) * step_size).rjust(9, "0") - cmd_b3lyp = f"""wget --header="Host: chibakoudai.sharepoint.com" ---header="User-Agent: Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 -(KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36" --header="Accept: + cmd_b3lyp = f"""wget --header="Host: chibakoudai.sharepoint.com" +--header="User-Agent: Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 +(KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36" --header="Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image -/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7" ---header="Accept-Language: en-GB,en-US;q=0.9,en;q=0.8,fr;q=0.7" +/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7" +--header="Accept-Language: en-GB,en-US;q=0.9,en;q=0.8,fr;q=0.7" --header="Referer: https://chibakoudai.sharepoint.com/sites/stair02/Shared %20Documents/Forms/AllItems.aspx?ga=1&id=%2Fsites%2Fstair02%2FShared%20 Documents%2Fdata%2FPubChemQC%2FB3LYP%5FPM6%2Fb3lyp%5Fpm6%5Fver1%2E0%2E0%2F @@ -46,23 +47,22 @@ def download_b3lyp_pm6_item(i, method='b3lyp'): SourceUrl=%2Fsites%2Fstair02%2FShared%20Documents%2Fdata%2FPubChemQC%2FB3LYP %5FPM6%2Fb3lyp%5Fpm6%5Fver1%2E0%2E0%2Fjson%2Fall%2FCompound%5F{start}%5F {stop}%2Etar%2Exz" -c -O 'Compound_{start}_{stop}.tar.xz' > /dev/null 2>&1""" - - cmd_pm6 = f"""wget --header="Host: chibakoudai.sharepoint.com" ---header="User-Agent: Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 -(KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36" --header="Accept: + cmd_pm6 = f"""wget --header="Host: chibakoudai.sharepoint.com" +--header="User-Agent: Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 +(KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36" --header="Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/ -webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7" ---header="Accept-Language: en-GB,en-US;q=0.9,en;q=0.8,fr;q=0.7" +webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7" +--header="Accept-Language: en-GB,en-US;q=0.9,en;q=0.8,fr;q=0.7" --header="Referer: https://chibakoudai.sharepoint.com/sites/stair01/ Shared%20Documents/Forms/AllItems.aspx?ga=1&id=%2Fsites%2Fstair01%2F Shared%20Documents%2Fdata%2FPubChemQC%2FPM6%2Fpm6opt%5Fver2%2E0%2E0%2F json%2Fall%2FCompound%5F{start}%5F{stop}%2Etar%2Exz&viewid=2a7fb7f8 %2Df3f8%2D4ad2%2D931e%2Dfc786e938ea8&parent=%2Fsites%2Fstair01%2FShared -%20Documents%2Fdata%2FPubChemQC%2FPM6%2Fpm6opt%5Fver2%2E0%2E0%2Fjson%2Fall" +%20Documents%2Fdata%2FPubChemQC%2FPM6%2Fpm6opt%5Fver2%2E0%2E0%2Fjson%2Fall" --header="Cookie: MicrosoftApplicationsTelemetryDeviceId=cec40b8a-9870- 4c4f-bb71-838a300c8685; MSFPC=GUID=511089efdbeb49d3923fdc7e6404bd9b& -HASH=5110&LV=202303&V=4&LU=1678287635332; WSS_FullScreenMode=false; +HASH=5110&LV=202303&V=4&LU=1678287635332; WSS_FullScreenMode=false; FedAuth=77u/PD94bWwgdmVyc2lvbj0iMS4wIiBlbmNvZGluZz0idXRmLTgiPz48U1A+ VjEzLDBoLmZ8bWVtYmVyc2hpcHx1cm4lM2FzcG8lM2Fhbm9uIzI1YTJjOTIyMjVkMWNl NzlkOWVmN2NjYmRjNzc5YWI1MmJhMDY2N2E5NDRmZTg3NGFmZTFhZjRjMjQ0OGE3ZTUs @@ -79,23 +79,23 @@ def download_b3lyp_pm6_item(i, method='b3lyp'): MUsyVlFyUys3WEtHSXpvMmFjQU5XQVNMUTQwRTJFVEd5SlhjRE9ya09HS2ZiSThDVWk4 bHNwaFRVZTJ6UjBPbjRZaGVFSDUrYTJsSVB4bUNLdG0weXBsS1V6M2pEakxHcml0Rk5l dWdUdEk0WUpZY3ZOcGZENmZDU0M3dGFhOXlXYmpZUU1QMlhmbXd1bGtkRCs1aUdYRjZi -SFNBNXlNY1FuUXBCVWZjSjgwcDZXSmtlbXlzMWlWZXA5RGU4UHpvZz09PC9TUD4=" +SFNBNXlNY1FuUXBCVWZjSjgwcDZXSmtlbXlzMWlWZXA5RGU4UHpvZz09PC9TUD4=" --header="Connection: keep-alive" "https://chibakoudai.sharepoint.com/ sites/stair01/_layouts/15/download.aspx?SourceUrl=%2Fsites%2Fstair01%2F Shared%20Documents%2Fdata%2FPubChemQC%2FPM6%2Fpm6opt%5Fver2%2E0%2E0%2F -json%2Fall%2FCompound%5F{start}%5F{stop}%2Etar%2Exz" -c -O +json%2Fall%2FCompound%5F{start}%5F{stop}%2Etar%2Exz" -c -O 'Compound_{start}_{stop}.tar.xz' > /dev/null 2>&1""" - cmd = cmd_b3lyp if method == 'b3lyp' else cmd_pm6 - cmd = cmd.replace('\n', '') + cmd = cmd_b3lyp if method == "b3lyp" else cmd_pm6 + cmd = cmd.replace("\n", "") os.system(cmd) - except Exception as e: + except Exception: pass # else: # print(f"Downloaded: Compound_{start}_{stop}.tar.xz") -def download_b3lyp_pm6(start=0, stop=10000, method='b3lyp'): +def download_b3lyp_pm6(start=0, stop=10000, method="b3lyp"): path = os.path.join(get_local_cache(), "pubchemqc", method) os.makedirs(path, exist_ok=True) os.chdir(path) @@ -113,5 +113,6 @@ def main(id, chunk_size, method): stop = (id + 1) * chunk_size download_b3lyp_pm6(start=start, stop=stop, method=method) + if __name__ == "__main__": - main() \ No newline at end of file + main()