From 1b064376d62da2a886907cd7d42313dbeca27725 Mon Sep 17 00:00:00 2001 From: FNTwin Date: Mon, 13 Nov 2023 19:32:02 +0000 Subject: [PATCH 01/18] For nikhil --- src/openqdc/datasets/dummy.py | 29 +++++++++++++++++++---------- src/openqdc/datasets/qmugs.py | 30 ++++++++++++++++++++++++++++++ src/openqdc/datasets/spice.py | 30 ++++++++++++++++++++++++++++++ 3 files changed, 79 insertions(+), 10 deletions(-) diff --git a/src/openqdc/datasets/dummy.py b/src/openqdc/datasets/dummy.py index 4e1ff17..b535393 100644 --- a/src/openqdc/datasets/dummy.py +++ b/src/openqdc/datasets/dummy.py @@ -2,6 +2,7 @@ from sklearn.utils import Bunch from openqdc.datasets.base import BaseDataset +from openqdc.utils.atomization_energies import IsolatedAtomEnergyFactory class Dummy(BaseDataset): @@ -10,38 +11,46 @@ class Dummy(BaseDataset): """ __name__ = "dummy" - __energy_methods__ = ["I_solved_the_schrodinger_equation_by_hand"] - __force_methods__ = ["I_made_up_random_forces"] + __energy_methods__ = ["I_solved_the_schrodinger_equation_by_hand", "PM6"] + __force_methods__ = ["I_made_up_random_forces", "writing_1_to_every_coordinate"] __energy_unit__ = "kcal/mol" __distance_unit__ = "ang" __forces_unit__ = "kcal/mol/ang" - energy_target_names = ["energy"] + energy_target_names = [f"energy{i}" for i in range(len(__energy_methods__))] - force_target_names = ["forces"] + force_target_names = [f"forces{i}" for i in range(len(__force_methods__))] + __isolated_atom_energies__ = [] def __init__(self, energy_unit=None, distance_unit=None, cache_dir=None) -> None: try: super().__init__(energy_unit=energy_unit, distance_unit=distance_unit, cache_dir=cache_dir) + except: # noqa pass + self._set_isolated_atom_energies() + + def is_preprocessed(self): + return True def read_raw_entries(self): pass def __len__(self): - return 999999999 + return 9999 def __getitem__(self, idx: int): - size = np.random.randint(1, 250) + shift = IsolatedAtomEnergyFactory.max_charge + size = np.random.randint(1, 100) z = 
np.random.randint(1, 100, size) + c = np.random.randint(-1, 2, size) return Bunch( positions=np.random.rand(size, 3) * 10, atomic_numbers=z, - charges=np.random.randint(-1, 2, size), - e0=np.zeros(size), - energies=np.random.rand(1) * 100, + charges=c, + e0=self.__isolated_atom_energies__[..., z, c + shift].T, + energies=np.random.randn(len(self.__energy_methods__)), name="dummy_{}".format(idx), subset="dummy", - forces=np.random.rand(size, 3) * 100, + forces=(np.random.randn(size, 3, len(self.__force_methods__)) * 100), ) diff --git a/src/openqdc/datasets/qmugs.py b/src/openqdc/datasets/qmugs.py index c75f8b5..58d5b15 100644 --- a/src/openqdc/datasets/qmugs.py +++ b/src/openqdc/datasets/qmugs.py @@ -62,6 +62,36 @@ class QMugs(BaseDataset): "DFT:TOTAL_ENERGY", ] + @property + def average_n_atoms(self): + return 55.215926293326426 + + def get_stats(self, tp: str = "formation"): + if tp not in ["formation", "total"]: + raise ValueError(f"type must be one of 'formation' or 'total', got {tp} instead") + if tp == "formation": + return { + "energy": { + "mean": self.convert_energy(np.array([-12.94348027, -9.83037297])), + "std": self.convert_energy(np.array([4.39971409, 3.3574188])), + }, + "forces": { + "mean": np.array([0]), + "std": np.array([0]), + }, + } + else: + return { + "energy": { + "mean": self.convert_energy(np.array([-89.44242, -1740.5336])), + "std": self.convert_energy(np.array([29.599571, 791.48663])), + }, + "forces": { + "mean": np.array([0]), + "std": np.array([0]), + }, + } + def read_raw_entries(self): raw_path = p_join(self.root, "structures") mol_dirs = [p_join(raw_path, d) for d in os.listdir(raw_path)] diff --git a/src/openqdc/datasets/spice.py b/src/openqdc/datasets/spice.py index 974d45f..cc511f0 100644 --- a/src/openqdc/datasets/spice.py +++ b/src/openqdc/datasets/spice.py @@ -75,6 +75,36 @@ class Spice(BaseDataset): "SPICE Ion Pairs Single Points Dataset v1.1": "Ion Pairs", } + @property + def average_n_atoms(self): + return 
29.88387509402179 + + def get_stats(self, tp: str = "formation"): + if tp not in ["formation", "total"]: + raise ValueError(f"type must be one of 'formation' or 'total', got {tp} instead") + if tp == "formation": + return { + "energy": { + "mean": self.convert_energy(np.array([-5.6775709889409836])), + "std": self.convert_energy(np.array([2.3371488471105013])), + }, + "forces": { + "mean": self.convert_energy(np.array([-1.03868e-08])), + "std": self.convert_energy(np.array([0.021063408])), + }, + } + else: + return { + "energy": { + "mean": self.convert_energy(np.array([-1244.6562])), + "std": self.convert_energy(np.array([1219.4248])), + }, + "forces": { + "mean": self.convert_energy(np.array([-1.03868e-08])), + "std": self.convert_energy(np.array([0.021063408])), + }, + } + def convert_forces(self, x): return (-1.0) * super().convert_forces(x) From 22c8609df40fc4b1907a68b728076ea09a6a8abb Mon Sep 17 00:00:00 2001 From: FNTwin Date: Tue, 14 Nov 2023 18:13:39 +0000 Subject: [PATCH 02/18] Component values --- src/openqdc/datasets/qmugs.py | 21 ++++++++++++-------- src/openqdc/datasets/spice.py | 37 +++++++++++++++++++++++------------ 2 files changed, 37 insertions(+), 21 deletions(-) diff --git a/src/openqdc/datasets/qmugs.py b/src/openqdc/datasets/qmugs.py index 58d5b15..481a167 100644 --- a/src/openqdc/datasets/qmugs.py +++ b/src/openqdc/datasets/qmugs.py @@ -4,6 +4,7 @@ import datamol as dm import numpy as np +from numpy import array from openqdc.datasets.base import BaseDataset from openqdc.utils.molecule import get_atomic_number_and_charge @@ -72,23 +73,27 @@ def get_stats(self, tp: str = "formation"): if tp == "formation": return { "energy": { - "mean": self.convert_energy(np.array([-12.94348027, -9.83037297])), - "std": self.convert_energy(np.array([4.39971409, 3.3574188])), + "mean": self.convert_energy(array([-12.94348027, -9.83037297])), + "std": self.convert_energy(array([4.39971409, 3.3574188])), }, "forces": { - "mean": np.array([0]), - "std": 
np.array([0]), + "mean": array([0]), + "std": array([0]), + "components": { + "mean": self.convert_forces(array([[0.0, 0.0], [0.0, 0.0], [0.0, 0.0]])), + "std": self.convert_forces(array([[0.0, 0.0], [0.0, 0.0], [0.0, 0.0]])), + }, }, } else: return { "energy": { - "mean": self.convert_energy(np.array([-89.44242, -1740.5336])), - "std": self.convert_energy(np.array([29.599571, 791.48663])), + "mean": self.convert_energy(array([-89.44242, -1740.5336])), + "std": self.convert_energy(array([29.599571, 791.48663])), }, "forces": { - "mean": np.array([0]), - "std": np.array([0]), + "mean": array([[0.0, 0.0], [0.0, 0.0], [0.0, 0.0]]), + "std": array([[0.0, 0.0], [0.0, 0.0], [0.0, 0.0]]), }, } diff --git a/src/openqdc/datasets/spice.py b/src/openqdc/datasets/spice.py index cc511f0..ec848df 100644 --- a/src/openqdc/datasets/spice.py +++ b/src/openqdc/datasets/spice.py @@ -2,6 +2,7 @@ import datamol as dm import numpy as np +from numpy import array from tqdm import tqdm from openqdc.datasets.base import BaseDataset @@ -83,25 +84,35 @@ def get_stats(self, tp: str = "formation"): if tp not in ["formation", "total"]: raise ValueError(f"type must be one of 'formation' or 'total', got {tp} instead") if tp == "formation": - return { - "energy": { - "mean": self.convert_energy(np.array([-5.6775709889409836])), - "std": self.convert_energy(np.array([2.3371488471105013])), + return ( + { + "energy": { + "mean": self.convert_energy(array([-5.67757058])), + "std": self.convert_energy(array([2.33714861])), + }, + "forces": { + "mean": self.convert_forces(array([-1.0387013e-08])), + "std": self.convert_forces(array([0.021063408])), + "components": { + "mean": self.convert_forces(array([[5.7479990e-09], [-4.8940532e-08], [1.2032132e-08]])), + "std": self.convert_forces(array([[0.02017307], [0.02016141], [0.02014796]])), + }, + }, }, - "forces": { - "mean": self.convert_energy(np.array([-1.03868e-08])), - "std": self.convert_energy(np.array([0.021063408])), - }, - } + ) else: return { 
"energy": { - "mean": self.convert_energy(np.array([-1244.6562])), - "std": self.convert_energy(np.array([1219.4248])), + "mean": self.convert_energy(array([-1244.6562])), + "std": self.convert_energy(array([1219.4248])), }, "forces": { - "mean": self.convert_energy(np.array([-1.03868e-08])), - "std": self.convert_energy(np.array([0.021063408])), + "mean": self.convert_forces(array([-1.0387013e-08])), + "std": self.convert_forces(array([0.021063408])), + "components": { + "mean": self.convert_forces(array([[5.7479990e-09], [-4.8940532e-08], [1.2032132e-08]])), + "std": self.convert_forces(array([[0.02017307], [0.02016141], [0.02014796]])), + }, }, } From 28bbf005f3c3b820110ce14ecb7f9d4aac80a51e Mon Sep 17 00:00:00 2001 From: prtos Date: Wed, 15 Nov 2023 04:20:59 +0000 Subject: [PATCH 03/18] wip --- src/openqdc/datasets/base.py | 4 +-- src/openqdc/datasets/pcqm.py | 68 ++++++++++++++++++++++++++++-------- 2 files changed, 55 insertions(+), 17 deletions(-) diff --git a/src/openqdc/datasets/base.py b/src/openqdc/datasets/base.py index 598a0b3..dbca4cc 100644 --- a/src/openqdc/datasets/base.py +++ b/src/openqdc/datasets/base.py @@ -193,7 +193,7 @@ def save_preprocess(self, data_dict): uniques, inv_indices = np.unique(data_dict[key], return_inverse=True) with open(local_path, "wb") as f: np.savez_compressed(f, uniques=uniques, inv_indices=inv_indices) - push_remote(local_path) + push_remote(local_path, overwrite=True) def read_preprocess(self, overwrite_local_cache=False): logger.info("Reading preprocessed data") @@ -218,7 +218,7 @@ def read_preprocess(self, overwrite_local_cache=False): for key in ["name", "subset"]: filename = p_join(self.preprocess_path, f"{key}.npz") - pull_locally(filename) + pull_locally(filename, overwrite=overwrite_local_cache) self.data[key] = dict() with open(filename, "rb") as f: tmp = np.load(f) diff --git a/src/openqdc/datasets/pcqm.py b/src/openqdc/datasets/pcqm.py index 505eef1..555d2eb 100644 --- a/src/openqdc/datasets/pcqm.py +++ 
b/src/openqdc/datasets/pcqm.py @@ -11,7 +11,7 @@ from loguru import logger from openqdc.datasets.base import BaseDataset -from openqdc.utils.io import get_local_cache +from openqdc.utils.io import get_local_cache, push_remote def flatten_dict(d, sep: str = "."): @@ -80,27 +80,65 @@ def __init__(self, energy_unit=None, distance_unit=None) -> None: def root(self): return p_join(get_local_cache(), "pubchemqc") - def collate_list(self, list_entries, partial=False): - # default partial=False is necessary for compatibility with the base class - if partial: - predicat = list_entries is not None and len(list_entries) > 0 - list_entries = [x for x in list_entries if x is not None] - return super().collate_list(list_entries) if predicat else None - else: - n = 0 - for i in range(len(list_entries)): - list_entries[i]["position_idx_range"] += n - n += list_entries[i]["position_idx_range"].max() - res = {key: np.concatenate([r[key] for r in list_entries], axis=0) for key in list_entries[0]} - return res + @property + def preprocess_path(self): + path = p_join(self.root, "preprocessed", self.__name__) + os.makedirs(path, exist_ok=True) + return path + + def collate_list(self, list_entries): + predicat = list_entries is not None and len(list_entries) > 0 + list_entries = [x for x in list_entries if x is not None] + return super().collate_list(list_entries) if predicat else None def read_raw_entries(self): arxiv_paths = glob(p_join(self.root, f"{self.__energy_methods__[0]}", "*.pkl")) - f = lambda x: self.collate_list(read_preprocessed_archive(x), partial=True) + f = lambda x: self.collate_list(read_preprocessed_archive(x)) samples = dm.parallelized(f, arxiv_paths, n_jobs=1, progress=True) samples = [x for x in samples if x is not None] return samples + def preprocess(self): + if not self.is_preprocessed(): + logger.info("Preprocessing data and saving it to cache.") + logger.info( + f"Dataset {self.__name__} data with the following units:\n" + f"Energy: {self.energy_unit}, 
Distance: {self.distance_unit}, " + f"Forces: {self.force_unit if self.__force_methods__ else 'None'}" + ) + entries = self.read_raw_entries() + self.collate_and_save_list(entries) + + def collate_and_save_list(self, list_entries): + n_molecules, n_atoms = 0, 0 + for i in range(len(list_entries)): + list_entries[i]["position_idx_range"] += n_atoms + n_atoms += list_entries[i]["position_idx_range"].max() + n_molecules += list_entries[i]["position_idx_range"].shape[0] + + for key in self.data_keys: + first = list_entries[0][key] + shape = (n_molecules, *first.shape[1:]) + local_path = p_join(self.preprocess_path, f"{key}.mmap") + out = np.memmap(local_path, mode="w+", dtype=first.dtype, shape=shape) + + start = 0 + for i in range(len(list_entries)): + x = list_entries[i].pop(key) + n = x.shape[0] + out[start : start + n] = x + out.flush() + push_remote(local_path, overwrite=True) + + # save smiles and subset + for key in ["name", "subset"]: + local_path = p_join(self.preprocess_path, f"{key}.npz") + x = [el for i in range(len(list_entries)) for el in list_entries[i].pop(key)] + uniques, inv_indices = np.unique(x, return_inverse=True) + with open(local_path, "wb") as f: + np.savez_compressed(f, uniques=uniques, inv_indices=inv_indices) + push_remote(local_path, overwrite=True) + class PCQM_B3LYP(PCQM_PM6): __name__ = "pubchemqc_b3lyp" From 4c8bf76589362a9acf1da150db7d9f3a11df6e77 Mon Sep 17 00:00:00 2001 From: FNTwin Date: Wed, 15 Nov 2023 16:48:27 +0000 Subject: [PATCH 04/18] Precomputed stats --- src/openqdc/datasets/ani.py | 203 ++++++++++++++++++++++ src/openqdc/datasets/base.py | 28 ++- src/openqdc/datasets/comp6.py | 91 ++++++++++ src/openqdc/datasets/dess.py | 65 +++++++ src/openqdc/datasets/dummy.py | 33 ++++ src/openqdc/datasets/geom.py | 37 ++++ src/openqdc/datasets/iso_17.py | 42 +++++ src/openqdc/datasets/molecule3d.py | 37 ++++ src/openqdc/datasets/orbnet_denali.py | 37 ++++ src/openqdc/datasets/qm7x.py | 75 ++++++++ src/openqdc/datasets/qmugs.py | 30 
++-- src/openqdc/datasets/sn2_rxn.py | 3 + src/openqdc/datasets/solvated_peptides.py | 44 +++++ src/openqdc/datasets/spice.py | 52 +++--- src/openqdc/datasets/tmqm.py | 37 ++++ src/openqdc/utils/constants.py | 2 + 16 files changed, 772 insertions(+), 44 deletions(-) diff --git a/src/openqdc/datasets/ani.py b/src/openqdc/datasets/ani.py index 913fb8a..c18d254 100644 --- a/src/openqdc/datasets/ani.py +++ b/src/openqdc/datasets/ani.py @@ -1,6 +1,8 @@ import os from os.path import join as p_join +from numpy import array, float32 + from openqdc.datasets.base import BaseDataset, read_qc_archive_h5 from openqdc.utils.io import get_local_cache @@ -34,6 +36,7 @@ class ANI1(BaseDataset): __energy_unit__ = "hartree" __distance_unit__ = "bohr" __forces_unit__ = "hartree/bohr" + __average_nb_atoms__ = 15.91676229984414 @property def root(self): @@ -50,6 +53,41 @@ def read_raw_entries(self): samples = read_qc_archive_h5(raw_path, self.__name__, self.energy_target_names, self.force_target_names) return samples + @property + def _stats(self): + return { + "formation": { + "energy": { + "mean": self.convert_energy(array([-2.37376472])), + "std": self.convert_energy(array([0.50266975])), + }, + "forces": { + "mean": array([0]), + "std": array([0]), + "components": { + "mean": array([[0.0], [0.0], [0.0]]), + "std": array([[0.0], [0.0], [0.0]]), + "rms": array([[0.0], [0.0], [0.0]]), + }, + }, + }, + "total": { + "energy": { + "mean": self.convert_energy(array([-333.67322], dtype=float32)), + "std": self.convert_energy(array([61.21667], dtype=float32)), + }, + "forces": { + "mean": array([0]), + "std": array([0]), + "components": { + "mean": array([[0.0], [0.0], [0.0]]), + "std": array([[0.0], [0.0], [0.0]]), + "rms": array([[0.0], [0.0], [0.0]]), + }, + }, + }, + } + class ANI1CCX(ANI1): """ @@ -71,6 +109,7 @@ class ANI1CCX(ANI1): __energy_unit__ = "hartree" __distance_unit__ = "ang" __forces_unit__ = "hartree/ang" + __average_nb_atoms__ = 15.274685315870588 __energy_methods__ = [ 
"ccsd(t)/cbs", @@ -89,6 +128,41 @@ class ANI1CCX(ANI1): __force_methods__ = [] force_target_names = [] + @property + def _stats(self): + return { + "formation": { + "energy": { + "mean": self.convert_energy(array([-3.23959548, 500.30384627, 500.40706776, 500.76740432])), + "std": self.convert_energy(array([1.03021261, 132.52707152, 132.56092469, 132.65261362])), + }, + "forces": { + "mean": array([0]), + "std": array([0]), + "components": { + "mean": array([[0.0], [0.0], [0.0]]), + "std": array([[0.0], [0.0], [0.0]]), + "rms": array([[0.0], [0.0], [0.0]]), + }, + }, + }, + "total": { + "energy": { + "mean": self.convert_energy(array([-374.40665, -1.2378153, -1.505962, -1.2396905], dtype=float32)), + "std": self.convert_energy(array([101.63995, 0.32444745, 0.39500558, 0.3250212], dtype=float32)), + }, + "forces": { + "mean": array([0]), + "std": array([0]), + "components": { + "mean": array([[0.0], [0.0], [0.0]]), + "std": array([[0.0], [0.0], [0.0]]), + "rms": array([[0.0], [0.0], [0.0]]), + }, + }, + }, + } + class ANI1X(ANI1): """ @@ -143,5 +217,134 @@ class ANI1X(ANI1): "wb97x/cc-pvtz", ] + __average_nb_atoms__ = 15.274685315870588 + + @property + def _stats(self): + return { + "formation": { + "energy": { + "mean": self.convert_energy( + array( + [ + -2.87910686, + -2.91460298, + -2.91182519, + 500.00748497, + 500.27885605, + 500.11130961, + -3.66090173, + -4.40643278, + ] + ) + ), + "std": self.convert_energy( + array( + [ + 0.92849657, + 0.93421854, + 0.93411345, + 132.44580372, + 132.52326771, + 132.47987395, + 1.60180792, + 1.75414812, + ] + ) + ), + }, + "forces": { + "mean": self.convert_forces(array([-6.139757e-06])), + "std": self.convert_forces(array([0.07401004])), + "components": { + "mean": self.convert_forces( + array( + [ + [6.6829815e-13, 3.5682501e-07], + [-5.1223647e-13, -1.8487021e-06], + [8.1159564e-13, -3.6849189e-05], + ], + dtype=float32, + ) + ), + "std": self.convert_forces( + array( + [[0.0759203, 0.06799112], [0.07694941, 0.06652647], 
[0.06229663, 0.05442103]], + dtype=float32, + ) + ), + "rms": self.convert_forces( + array( + [[0.0759203, 0.06799113], [0.07694941, 0.06652647], [0.06229663, 0.05442095]], + dtype=float32, + ) + ), + }, + }, + }, + "total": { + "energy": { + "mean": self.convert_energy( + array( + [ + -372.68945, + -372.74274, + -372.7326, + -1.1540408, + -1.5152899, + -1.4195863, + -392.72458, + -391.208, + ], + dtype=float32, + ) + ), + "std": self.convert_energy( + array( + [ + 101.166664, + 101.19915, + 101.191895, + 0.30445468, + 0.39988872, + 0.37456134, + 136.79112, + 137.48692, + ], + dtype=float32, + ) + ), + }, + "forces": { + "mean": self.convert_forces(array([-6.139757e-06])), + "std": self.convert_forces(array([0.07401004])), + "components": { + "mean": self.convert_forces( + array( + [ + [6.6829815e-13, 3.5682501e-07], + [-5.1223647e-13, -1.8487021e-06], + [8.1159564e-13, -3.6849189e-05], + ], + dtype=float32, + ) + ), + "std": self.convert_forces( + array( + [[0.0759203, 0.06799112], [0.07694941, 0.06652647], [0.06229663, 0.05442103]], + dtype=float32, + ) + ), + "rms": self.convert_forces( + array( + [[0.0759203, 0.06799113], [0.07694941, 0.06652647], [0.06229663, 0.05442095]], + dtype=float32, + ) + ), + }, + }, + }, + } + def convert_forces(self, x): return super().convert_forces(x) * 0.529177249 # correct the Dataset error diff --git a/src/openqdc/datasets/base.py b/src/openqdc/datasets/base.py index 1de6ff1..022f880 100644 --- a/src/openqdc/datasets/base.py +++ b/src/openqdc/datasets/base.py @@ -14,7 +14,7 @@ IsolatedAtomEnergyFactory, chemical_symbols, ) -from openqdc.utils.constants import NB_ATOMIC_FEATURES +from openqdc.utils.constants import NB_ATOMIC_FEATURES, POSSIBLE_NORMALIZATION from openqdc.utils.io import ( copy_exists, dict_to_atoms, @@ -83,6 +83,7 @@ class BaseDataset(torch.utils.data.Dataset): __fn_energy__ = lambda x: x __fn_distance__ = lambda x: x __fn_forces__ = lambda x: x + __average_nb_atoms__ = None def __init__( self, @@ -425,3 +426,28 
@@ def __str__(self): def __repr__(self): return f"{self.__name__}" + + @property + def _stats(self): + return {} + + @property + def average_n_atoms(self): + if self.__average_nb_atoms__ is None: + logger.info( + "This property for this dataset not available." + + "Please open an issue on Github for the team to look into it." + ) + return 1 + return self.__average_nb_atoms__ + + def get_statistics(self, normalization: str = "formation"): + stats = self._stats + if len(stats) == 0: + logger.info( + "This property for this dataset not available." + + "Please open an issue on Github for the team to look into it." + ) + if normalization not in POSSIBLE_NORMALIZATION: + raise ValueError(f"normalization={normalization} is not valid. Must be one of {POSSIBLE_NORMALIZATION}") + return stats[normalization] diff --git a/src/openqdc/datasets/comp6.py b/src/openqdc/datasets/comp6.py index c95ec17..2c6fb83 100644 --- a/src/openqdc/datasets/comp6.py +++ b/src/openqdc/datasets/comp6.py @@ -1,5 +1,7 @@ from os.path import join as p_join +from numpy import array, float32, nan + from openqdc.datasets.base import BaseDataset, read_qc_archive_h5 @@ -59,6 +61,95 @@ class COMP6(BaseDataset): "Gradient", ] + __average_nb_atoms__ = 25.74051563378753 + + @property + def _stats(self): + return { + "formation": { + "energy": { + "mean": self.convert_energy( + array( + [ + -2579.52016333, + -2543.74519203, + -354694.46157991, + -2506.4300631, + -2616.89224817, + -3157.54118509, + -354031.62984212, + nan, + nan, + ] + ) + ), + "std": self.convert_energy( + array( + [ + 1811.03171965, + 1471.95818836, + 201545.89189168, + 1385.50993753, + 1456.09915473, + 1728.51133182, + 208097.95666257, + nan, + nan, + ] + ) + ), + }, + "forces": { + "mean": self.convert_forces(6.6065984e-13), + "std": self.convert_forces(0.056459695), + "components": { + "mean": self.convert_forces( + array([[-4.1767219e-13], [1.0024132e-12], [-9.4386771e-13]], dtype=float32) + ), + "std": 
self.convert_forces(array([[0.05781676], [0.05793402], [0.05330585]], dtype=float32)), + "rms": self.convert_forces(array([[0.05781676], [0.05793402], [0.05330585]], dtype=float32)), + }, + }, + }, + "total": { + "energy": { + "mean": self.convert_energy( + array( + [ + -360972.16, + -354729.66, + -354699.38, + -349555.7, + -351555.97, + -351530.44, + -354027.8, + nan, + nan, + ], + dtype=float32, + ), + ), + "std": self.convert_energy( + array( + [254766.0, 201559.77, 201537.8, 188725.47, 191028.78, 191016.1, 208089.4, nan, nan], + dtype=float32, + ), + ), + }, + "forces": { + "mean": self.convert_forces(array([6.6065984e-13])), + "std": self.convert_forces(array([0.056459695])), + "components": { + "mean": self.convert_forces( + array([[-4.1767219e-13], [1.0024132e-12], [-9.4386771e-13]], dtype=float32) + ), + "std": self.convert_forces(array([[0.05781676], [0.05793402], [0.05330585]], dtype=float32)), + "rms": self.convert_forces(array([[0.05781676], [0.05793402], [0.05330585]], dtype=float32)), + }, + }, + }, + } + def read_raw_entries(self): samples = [] for subset in ["ani_md", "drugbank", "gdb7_9", "gdb10_13", "s66x8", "tripeptides"]: diff --git a/src/openqdc/datasets/dess.py b/src/openqdc/datasets/dess.py index 80b1e1c..76c2355 100644 --- a/src/openqdc/datasets/dess.py +++ b/src/openqdc/datasets/dess.py @@ -3,6 +3,7 @@ import datamol as dm import numpy as np import pandas as pd +from numpy import array, float32 from tqdm import tqdm from openqdc.datasets.base import BaseDataset @@ -61,6 +62,70 @@ class DESS(BaseDataset): # ['qz_MP2_all', 'tz_MP2_all', 'cbs_MP2_all', 'sapt_all', 'nn_CCSD(T)_all'] partitions = ["DES370K", "DES5M"] + __average_nb_atoms__ = 18.944905540514252 + + @property + def _stats(self): + return { + "formation": { + "energy": { + "mean": array( + [ + 1.55021905e03, + 7.85244098e02, + 7.85022111e02, + 7.84864329e02, + 1.55072999e03, + 1.56925354e03, + -7.76931930e-01, + -2.29679257e00, + ] + ), + "std": array( + [ + 1.89341080e03, + 
9.68462074e02, + 9.68422393e02, + 9.70421004e02, + 1.89394888e03, + 1.91531274e03, + 1.38230279e00, + 1.27589212e01, + ] + ), + }, + "forces": { + "mean": array([0]), + "std": array([0]), + "components": { + "mean": array([[0.0], [0.0], [0.0]]), + "std": array([[0.0], [0.0], [0.0]]), + "rms": array([[0.0], [0.0], [0.0]]), + }, + }, + }, + "total": { + "energy": { + "mean": array( + [-0.8121756, 4.1286583, 4.3558254, 3.9754286, -0.66178626, -1.0762144, 4.4547276, 3.4555712], + dtype=float32, + ), + "std": array( + [11.138448, 17.800674, 18.148039, 17.567768, 11.290686, 11.005155, 18.162294, 16.579735], + dtype=float32, + ), + }, + "forces": { + "mean": array([0]), + "std": array([0]), + "components": { + "mean": array([[0.0], [0.0], [0.0]]), + "std": array([[0.0], [0.0], [0.0]]), + "rms": array([[0.0], [0.0], [0.0]]), + }, + }, + }, + } def _read_raw_(self, part): df = pd.read_csv(p_join(self.root, f"{part}.csv")) diff --git a/src/openqdc/datasets/dummy.py b/src/openqdc/datasets/dummy.py index b535393..38f70b4 100644 --- a/src/openqdc/datasets/dummy.py +++ b/src/openqdc/datasets/dummy.py @@ -3,6 +3,7 @@ from openqdc.datasets.base import BaseDataset from openqdc.utils.atomization_energies import IsolatedAtomEnergyFactory +from numpy import array class Dummy(BaseDataset): @@ -21,6 +22,38 @@ class Dummy(BaseDataset): force_target_names = [f"forces{i}" for i in range(len(__force_methods__))] __isolated_atom_energies__ = [] + __average_n_atoms__ = 20 + + @property + def _stats(self): + return { + "formation": { + "energy": { + "mean": array([-12.94348027, -9.83037297]), + "std": array([4.39971409, 3.3574188]), + }, + "forces": { + "mean": array([0]), + "std": array([0]), + "components": { + "mean": array([[0.0, 0.0], [0.0, 0.0], [0.0, 0.0]]), + "std": array([[0.0, 0.0], [0.0, 0.0], [0.0, 0.0]]), + "rms": array([[0.0, 0.0], [0.0, 0.0], [0.0, 0.0]]), + }, + }, + }, + "total": { + "energy": { + "mean": array([-89.44242, -1740.5336]), + "std": array([29.599571, 791.48663]), 
+ }, + "forces": { + "mean": array([[0.0], [0.0], [0.0]]), + "std": array([[0.0], [0.0], [0.0]]), + "rms": array([[0.0], [0.0], [0.0]]), + }, + }, + } def __init__(self, energy_unit=None, distance_unit=None, cache_dir=None) -> None: try: diff --git a/src/openqdc/datasets/geom.py b/src/openqdc/datasets/geom.py index c016a9f..c16f001 100644 --- a/src/openqdc/datasets/geom.py +++ b/src/openqdc/datasets/geom.py @@ -3,6 +3,7 @@ import datamol as dm import numpy as np +from numpy import array from openqdc.datasets.base import BaseDataset from openqdc.utils import load_json, load_pkl @@ -84,9 +85,45 @@ class GEOM(BaseDataset): energy_target_names = ["gfn2_xtb.energy"] force_target_names = [] + __average_nb_atoms__ = 50.07244573677837 partitions = ["qm9", "drugs"] + @property + def _stats(self): + return { + "formation": { + "energy": { + "mean": self.convert_energy(array([-8.47811605])), + "std": self.convert_energy(array([3.68015507])), + }, + "forces": { + "mean": array([0]), + "std": array([0]), + "components": { + "mean": array([[0.0], [0.0], [0.0]]), + "std": array([[0.0], [0.0], [0.0]]), + "rms": array([[0.0], [0.0], [0.0]]), + }, + }, + }, + "total": { + "energy": { + "mean": self.convert_energy(array([-78.264725])), + "std": self.convert_energy(array([20.200787])), + }, + "forces": { + "mean": array([0]), + "std": array([0]), + "components": { + "mean": array([[0.0], [0.0], [0.0]]), + "std": array([[0.0], [0.0], [0.0]]), + "rms": array([[0.0], [0.0], [0.0]]), + }, + }, + }, + } + def _read_raw_(self, partition): raw_path = p_join(self.root, "rdkit_folder") diff --git a/src/openqdc/datasets/iso_17.py b/src/openqdc/datasets/iso_17.py index 735ae67..36c09f5 100644 --- a/src/openqdc/datasets/iso_17.py +++ b/src/openqdc/datasets/iso_17.py @@ -1,5 +1,7 @@ from os.path import join as p_join +from numpy import array, float32 + from openqdc.datasets.base import BaseDataset, read_qc_archive_h5 @@ -42,6 +44,46 @@ class ISO17(BaseDataset): __energy_unit__ = "ev" 
__distance_unit__ = "bohr" # bohr __forces_unit__ = "ev/bohr" + __average_nb_atoms__ = 19.0 + + @property + def _stats(self): + return { + "formation": { + "energy": { + "mean": self.convert_energy(array([-103.58336533])), + "std": self.convert_energy(array([0.79709836])), + }, + "forces": { + "mean": self.convert_forces(array([-1.2548699e-11])), + "std": self.convert_forces(array([1.1287293])), + "components": { + "mean": self.convert_forces( + array([[-2.7712117e-11], [-1.8989450e-12], [3.9721233e-11]], dtype=float32) + ), + "std": self.convert_forces(array([[1.1013116], [1.1273879], [1.1195794]], dtype=float32)), + "rms": self.convert_forces(array([[1.1013116], [1.1273879], [1.1195794]], dtype=float32)), + }, + }, + }, + "total": { + "energy": { + "mean": self.convert_energy(array([-11503.619]), dtype=float32), + "std": self.convert_energy(array([0.79709935]), dtype=float32), + }, + "forces": { + "mean": self.convert_forces(array([-1.2548699e-11])), + "std": self.convert_forces(array([1.1287293])), + "components": { + "mean": self.convert_forces( + array([[-2.7712117e-11], [-1.8989450e-12], [3.9721233e-11]], dtype=float32) + ), + "std": self.convert_forces(array([[1.1013116], [1.1273879], [1.1195794]], dtype=float32)), + "rms": self.convert_forces(array([[1.1013116], [1.1273879], [1.1195794]], dtype=float32)), + }, + }, + }, + } def read_raw_entries(self): raw_path = p_join(self.root, "iso_17.h5") diff --git a/src/openqdc/datasets/molecule3d.py b/src/openqdc/datasets/molecule3d.py index dc47e53..ace03d4 100644 --- a/src/openqdc/datasets/molecule3d.py +++ b/src/openqdc/datasets/molecule3d.py @@ -5,6 +5,7 @@ import datamol as dm import numpy as np import pandas as pd +from numpy import array, float32 from rdkit import Chem from tqdm import tqdm @@ -89,6 +90,7 @@ class Molecule3D(BaseDataset): __forces_unit__ = "ev/ang" energy_target_names = ["b3lyp/6-31g*.energy"] + __average_nb_atoms__ = 29.111696292432697 def read_raw_entries(self): raw = p_join(self.root, 
"data", "raw") @@ -99,3 +101,38 @@ def read_raw_entries(self): res = dm.parallelized(fn, sdf_paths, n_jobs=1) # don't use more than 1 job samples = sum(res, []) return samples + + @property + def _stats(self): + return { + "formation": { + "energy": { + "mean": self.convert_energy(array([-191.66717791])), + "std": self.convert_energy(array([2005.52732443])), + }, + "forces": { + "mean": array([0]), + "std": array([0]), + "components": { + "mean": array([[0.0], [0.0], [0.0]]), + "std": array([[0.0], [0.0], [0.0]]), + "rms": array([[0.0], [0.0], [0.0]]), + }, + }, + }, + "total": { + "energy": { + "mean": self.convert_energy(array([-21100.502], dtype=float32)), + "std": self.convert_energy(array([9345.366], dtype=float32)), + }, + "forces": { + "mean": array([0]), + "std": array([0]), + "components": { + "mean": array([[0.0], [0.0], [0.0]]), + "std": array([[0.0], [0.0], [0.0]]), + "rms": array([[0.0], [0.0], [0.0]]), + }, + }, + }, + } diff --git a/src/openqdc/datasets/orbnet_denali.py b/src/openqdc/datasets/orbnet_denali.py index 614e252..e5f5953 100644 --- a/src/openqdc/datasets/orbnet_denali.py +++ b/src/openqdc/datasets/orbnet_denali.py @@ -4,6 +4,7 @@ import datamol as dm import numpy as np import pandas as pd +from numpy import array, float32 from openqdc.datasets.base import BaseDataset from openqdc.utils.molecule import atom_table @@ -57,6 +58,7 @@ class OrbnetDenali(BaseDataset): __energy_unit__ = "hartree" __distance_unit__ = "ang" __forces_unit__ = "hartree/ang" + __average_nb_atoms__ = 44.87857092772292 def read_raw_entries(self): label_path = p_join(self.root, "denali_labels.csv") @@ -77,3 +79,38 @@ def read_raw_entries(self): res = dm.parallelized(fn, list(labels.items()), scheduler="threads", n_jobs=-1, progress=True) samples = sum(res, []) return samples + + @property + def _stats(self): + return { + "formation": { + "energy": { + "mean": self.convert_energy(array([-13.87283487, -7.79357297])), + "std": self.convert_energy(array([75.34652971, 
3.3274954])), + }, + "forces": { + "mean": array([0]), + "std": array([0]), + "components": { + "mean": array([[0.0], [0.0], [0.0]]), + "std": array([[0.0], [0.0], [0.0]]), + "rms": array([[0.0], [0.0], [0.0]]), + }, + }, + }, + "total": { + "energy": { + "mean": self.convert_energy(array([-1630.8182, -74.17469], dtype=float32)), + "std": self.convert_energy(array([929.9734, 19.751446], dtype=float32)), + }, + "forces": { + "mean": array([0]), + "std": array([0]), + "components": { + "mean": array([[0.0], [0.0], [0.0]]), + "std": array([[0.0], [0.0], [0.0]]), + "rms": array([[0.0], [0.0], [0.0]]), + }, + }, + }, + } diff --git a/src/openqdc/datasets/qm7x.py b/src/openqdc/datasets/qm7x.py index eb8b015..50603eb 100644 --- a/src/openqdc/datasets/qm7x.py +++ b/src/openqdc/datasets/qm7x.py @@ -1,6 +1,7 @@ from os.path import join as p_join import numpy as np +from numpy import array, float32 from tqdm import tqdm from openqdc.datasets.base import BaseDataset @@ -46,6 +47,80 @@ class QM7X(BaseDataset): __energy_unit__ = "ev" __distance_unit__ = "ang" __forces_unit__ = "ev/ang" + __average_nb_atoms__ = 16.84668721109399 + + @property + def _stats(self): + return { + "formation": { + "energy": { + "mean": self.convert_energy(array([-82.57984067, 372.52167714])), + "std": self.convert_energy(array([9.85675539, 39.76633713])), + }, + "forces": { + "mean": self.convert_forces(array([-1.1617619e-07])), + "std": self.convert_forces(array([1.1451852])), + "components": { + "mean": self.convert_forces( + array( + [ + [-7.1192130e-07, -6.0926320e-11], + [-4.3502279e-08, -3.7376963e-11], + [5.8300976e-08, 2.9215352e-11], + ], + dtype=float32, + ) + ), + "std": self.convert_forces( + array( + [[1.4721272, 0.00549965], [1.4861498, 0.00508684], [1.4812028, 0.00496012]], + dtype=float32, + ) + ), + "rms": self.convert_forces( + array( + [[1.4721272, 0.00549965], [1.4861498, 0.00508684], [1.4812028, 0.00496012]], + dtype=float32, + ) + ), + }, + }, + }, + "total": { + "energy": { + 
"mean": self.convert_energy(array([-8.6828701e03, -2.7446982e-01], dtype=float32)), + "std": self.convert_energy(array([1.4362784e03, 5.8798514e-02], dtype=float32)), + }, + "forces": { + "mean": self.convert_forces(array([-1.1617619e-07])), + "std": self.convert_forces(array([1.1451852])), + "components": { + "mean": self.convert_forces( + array( + [ + [-7.1192130e-07, -6.0926320e-11], + [-4.3502279e-08, -3.7376963e-11], + [5.8300976e-08, 2.9215352e-11], + ], + dtype=float32, + ) + ), + "std": self.convert_forces( + array( + [[1.4721272, 0.00549965], [1.4861498, 0.00508684], [1.4812028, 0.00496012]], + dtype=float32, + ) + ), + "rms": self.convert_forces( + array( + [[1.4721272, 0.00549965], [1.4861498, 0.00508684], [1.4812028, 0.00496012]], + dtype=float32, + ) + ), + }, + }, + }, + } def read_raw_entries(self): samples = [] diff --git a/src/openqdc/datasets/qmugs.py b/src/openqdc/datasets/qmugs.py index 481a167..1ef6b22 100644 --- a/src/openqdc/datasets/qmugs.py +++ b/src/openqdc/datasets/qmugs.py @@ -57,6 +57,7 @@ class QMugs(BaseDataset): __energy_unit__ = "hartree" __distance_unit__ = "ang" __forces_unit__ = "hartree/ang" + __average_nb_atoms__ = 55.215926293326426 energy_target_names = [ "GFN2:TOTAL_ENERGY", @@ -64,14 +65,9 @@ class QMugs(BaseDataset): ] @property - def average_n_atoms(self): - return 55.215926293326426 - - def get_stats(self, tp: str = "formation"): - if tp not in ["formation", "total"]: - raise ValueError(f"type must be one of 'formation' or 'total', got {tp} instead") - if tp == "formation": - return { + def _stats(self): + return { + "formation": { "energy": { "mean": self.convert_energy(array([-12.94348027, -9.83037297])), "std": self.convert_energy(array([4.39971409, 3.3574188])), @@ -80,22 +76,24 @@ def get_stats(self, tp: str = "formation"): "mean": array([0]), "std": array([0]), "components": { - "mean": self.convert_forces(array([[0.0, 0.0], [0.0, 0.0], [0.0, 0.0]])), - "std": self.convert_forces(array([[0.0, 0.0], [0.0, 0.0], 
[0.0, 0.0]])), + "mean": array([[0.0], [0.0], [0.0]]), + "std": array([[0.0], [0.0], [0.0]]), + "rms": array([[0.0], [0.0], [0.0]]), }, }, - } - else: - return { + }, + "total": { "energy": { "mean": self.convert_energy(array([-89.44242, -1740.5336])), "std": self.convert_energy(array([29.599571, 791.48663])), }, "forces": { - "mean": array([[0.0, 0.0], [0.0, 0.0], [0.0, 0.0]]), - "std": array([[0.0, 0.0], [0.0, 0.0], [0.0, 0.0]]), + "mean": array([[0.0], [0.0], [0.0]]), + "std": array([[0.0], [0.0], [0.0]]), + "rms": array([[0.0], [0.0], [0.0]]), }, - } + }, + } def read_raw_entries(self): raw_path = p_join(self.root, "structures") diff --git a/src/openqdc/datasets/sn2_rxn.py b/src/openqdc/datasets/sn2_rxn.py index 3e75e91..d8808d5 100644 --- a/src/openqdc/datasets/sn2_rxn.py +++ b/src/openqdc/datasets/sn2_rxn.py @@ -24,6 +24,9 @@ class SN2RXN(BaseDataset): force_target_names = [ "DSD-BLYP-D3(BJ):def2-TZVP Gradient", ] + __average_nb_atoms__ = 5.444471933343348 + + # TODO: Check this def read_raw_entries(self): raw_path = p_join(self.root, "sn2_rxn.h5") diff --git a/src/openqdc/datasets/solvated_peptides.py b/src/openqdc/datasets/solvated_peptides.py index 9846bdf..4723c38 100644 --- a/src/openqdc/datasets/solvated_peptides.py +++ b/src/openqdc/datasets/solvated_peptides.py @@ -1,5 +1,7 @@ from os.path import join as p_join +from numpy import array, float32 + from openqdc.datasets.base import BaseDataset, read_qc_archive_h5 @@ -26,9 +28,51 @@ class SolvatedPeptides(BaseDataset): __energy_unit__ = "hartree" __distance_unit__ = "bohr" __forces_unit__ = "hartree/bohr" + __average_nb_atoms__ = 21.380975029465297 def read_raw_entries(self): raw_path = p_join(self.root, "solvated_peptides.h5") samples = read_qc_archive_h5(raw_path, "solvated_peptides", self.energy_target_names, self.force_target_names) return samples + + # TODO : Check the values in this + + @property + def _stats(self): + return { + "formation": { + "energy": { + "mean": 
self.convert_energy(array([423.49523618])), + "std": self.convert_energy(array([309.76172829])), + }, + "forces": { + "mean": self.convert_forces(array([-3.792959e-12])), + "std": self.convert_forces(array([1.4568169])), + "components": { + "mean": self.convert_forces( + array([[-4.1655182e-12], [-6.9530774e-12], [2.5650127e-12]], dtype=float32) + ), + "std": self.convert_forces(array([[1.3502095], [1.3478843], [1.3509929]], dtype=float32)), + "rms": self.convert_forces(array([[1.3502095], [1.3478843], [1.3509929]], dtype=float32)), + }, + }, + }, + "total": { + "energy": { + "mean": self.convert_energy(array([-79.619286], dtype=float32)), + "std": self.convert_energy(array([40.01196], dtype=float32)), + }, + "forces": { + "mean": self.convert_forces(array([-3.792959e-12])), + "std": self.convert_forces(array([1.4568169])), + "components": { + "mean": self.convert_forces( + array([[-4.1655182e-12], [-6.9530774e-12], [2.5650127e-12]], dtype=float32) + ), + "std": self.convert_forces(array([[1.3502095], [1.3478843], [1.3509929]], dtype=float32)), + "rms": self.convert_forces(array([[1.3502095], [1.3478843], [1.3509929]], dtype=float32)), + }, + }, + }, + } diff --git a/src/openqdc/datasets/spice.py b/src/openqdc/datasets/spice.py index ec848df..8153d8f 100644 --- a/src/openqdc/datasets/spice.py +++ b/src/openqdc/datasets/spice.py @@ -56,6 +56,7 @@ class Spice(BaseDataset): __energy_unit__ = "hartree" __distance_unit__ = "bohr" __forces_unit__ = "hartree/bohr" + __average_nb_atoms__ = 29.88387509402179 energy_target_names = ["dft_total_energy"] @@ -77,18 +78,28 @@ class Spice(BaseDataset): } @property - def average_n_atoms(self): - return 29.88387509402179 - - def get_stats(self, tp: str = "formation"): - if tp not in ["formation", "total"]: - raise ValueError(f"type must be one of 'formation' or 'total', got {tp} instead") - if tp == "formation": - return ( + def _stats(self): + return { + "formation": { + "energy": { + "mean": 
self.convert_energy(array([-5.67757058])), + "std": self.convert_energy(array([2.33714861])), + }, + "forces": { + "mean": self.convert_forces(array([-1.0387013e-08])), + "std": self.convert_forces(array([0.021063408])), + "components": { + "mean": self.convert_forces(array([[5.7479990e-09], [-4.8940532e-08], [1.2032132e-08]])), + "std": self.convert_forces(array([[0.02017307], [0.02016141], [0.02014796]])), + "rms": array([[0.02017307], [0.02016142], [0.02014796]]), + }, + }, + }, + "total": { { "energy": { - "mean": self.convert_energy(array([-5.67757058])), - "std": self.convert_energy(array([2.33714861])), + "mean": self.convert_energy(array([-1244.6562])), + "std": self.convert_energy(array([1219.4248])), }, "forces": { "mean": self.convert_forces(array([-1.0387013e-08])), @@ -96,25 +107,12 @@ def get_stats(self, tp: str = "formation"): "components": { "mean": self.convert_forces(array([[5.7479990e-09], [-4.8940532e-08], [1.2032132e-08]])), "std": self.convert_forces(array([[0.02017307], [0.02016141], [0.02014796]])), + "rms": array([[0.02017307], [0.02016142], [0.02014796]]), }, }, - }, - ) - else: - return { - "energy": { - "mean": self.convert_energy(array([-1244.6562])), - "std": self.convert_energy(array([1219.4248])), - }, - "forces": { - "mean": self.convert_forces(array([-1.0387013e-08])), - "std": self.convert_forces(array([0.021063408])), - "components": { - "mean": self.convert_forces(array([[5.7479990e-09], [-4.8940532e-08], [1.2032132e-08]])), - "std": self.convert_forces(array([[0.02017307], [0.02016141], [0.02014796]])), - }, - }, - } + } + }, + } def convert_forces(self, x): return (-1.0) * super().convert_forces(x) diff --git a/src/openqdc/datasets/tmqm.py b/src/openqdc/datasets/tmqm.py index 8952aaa..7160edc 100644 --- a/src/openqdc/datasets/tmqm.py +++ b/src/openqdc/datasets/tmqm.py @@ -3,6 +3,7 @@ import numpy as np import pandas as pd +from numpy import array, float32 from tqdm import tqdm from openqdc.datasets.base import BaseDataset @@ 
-54,6 +55,42 @@ class TMQM(BaseDataset): __energy_unit__ = "hartree" __distance_unit__ = "ang" __forces_unit__ = "hartree/ang" + __average_nb_atoms__ = 65.9905035539555 + + @property + def _stats(self): + return { + "formation": { + "energy": { + "mean": self.convert_energy(array([-669.31056277])), + "std": self.convert_energy(array([896.83091033])), + }, + "forces": { + "mean": array([0]), + "std": array([0]), + "components": { + "mean": array([[0.0], [0.0], [0.0]]), + "std": array([[0.0], [0.0], [0.0]]), + "rms": array([[0.0], [0.0], [0.0]]), + }, + }, + }, + "total": { + "energy": { + "mean": self.convert_energy(array([-2952.1428], dtype=float32)), + "std": self.convert_energy(array([1608.7888], dtype=float32)), + }, + "forces": { + "mean": array([0]), + "std": array([0]), + "components": { + "mean": array([[0.0], [0.0], [0.0]]), + "std": array([[0.0], [0.0], [0.0]]), + "rms": array([[0.0], [0.0], [0.0]]), + }, + }, + }, + } def read_raw_entries(self): df = pd.read_csv(p_join(self.root, "tmQM_y.csv"), sep=";", usecols=["CSD_code", "Electronic_E"]) diff --git a/src/openqdc/utils/constants.py b/src/openqdc/utils/constants.py index a8a8215..d1e1937 100644 --- a/src/openqdc/utils/constants.py +++ b/src/openqdc/utils/constants.py @@ -5,3 +5,5 @@ HAR2EV = 27.211386246 BOHR2ANG = 0.52917721092 + +POSSIBLE_NORMALIZATION = ["formation", "total"] From c349f050285cab1f29af154188523a7bd1544680 Mon Sep 17 00:00:00 2001 From: FNTwin Date: Wed, 15 Nov 2023 17:00:23 +0000 Subject: [PATCH 05/18] Fixes + black --- src/openqdc/datasets/dummy.py | 2 +- src/openqdc/datasets/spice.py | 26 ++++++++++++-------------- 2 files changed, 13 insertions(+), 15 deletions(-) diff --git a/src/openqdc/datasets/dummy.py b/src/openqdc/datasets/dummy.py index 38f70b4..4a9bfc8 100644 --- a/src/openqdc/datasets/dummy.py +++ b/src/openqdc/datasets/dummy.py @@ -1,9 +1,9 @@ import numpy as np # noqa +from numpy import array from sklearn.utils import Bunch from openqdc.datasets.base import BaseDataset 
from openqdc.utils.atomization_energies import IsolatedAtomEnergyFactory -from numpy import array class Dummy(BaseDataset): diff --git a/src/openqdc/datasets/spice.py b/src/openqdc/datasets/spice.py index 8153d8f..08f7f2b 100644 --- a/src/openqdc/datasets/spice.py +++ b/src/openqdc/datasets/spice.py @@ -96,21 +96,19 @@ def _stats(self): }, }, "total": { - { - "energy": { - "mean": self.convert_energy(array([-1244.6562])), - "std": self.convert_energy(array([1219.4248])), - }, - "forces": { - "mean": self.convert_forces(array([-1.0387013e-08])), - "std": self.convert_forces(array([0.021063408])), - "components": { - "mean": self.convert_forces(array([[5.7479990e-09], [-4.8940532e-08], [1.2032132e-08]])), - "std": self.convert_forces(array([[0.02017307], [0.02016141], [0.02014796]])), - "rms": array([[0.02017307], [0.02016142], [0.02014796]]), - }, + "energy": { + "mean": self.convert_energy(array([-1244.6562])), + "std": self.convert_energy(array([1219.4248])), + }, + "forces": { + "mean": self.convert_forces(array([-1.0387013e-08])), + "std": self.convert_forces(array([0.021063408])), + "components": { + "mean": self.convert_forces(array([[5.7479990e-09], [-4.8940532e-08], [1.2032132e-08]])), + "std": self.convert_forces(array([[0.02017307], [0.02016141], [0.02014796]])), + "rms": array([[0.02017307], [0.02016142], [0.02014796]]), }, - } + }, }, } From 6a67b791a9a214252bfb8458441a90e3ee4d277e Mon Sep 17 00:00:00 2001 From: FNTwin Date: Wed, 15 Nov 2023 17:01:57 +0000 Subject: [PATCH 06/18] update init to not call openqdc.datasets everytime --- src/openqdc/__init__.py | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/src/openqdc/__init__.py b/src/openqdc/__init__.py index 1432923..0e65ee2 100644 --- a/src/openqdc/__init__.py +++ b/src/openqdc/__init__.py @@ -7,7 +7,30 @@ # Dictionary of objects to lazily import; maps the object's name to its module path -_lazy_imports_obj = {} +_lazy_imports_obj = { + "ANI1": 
"openqdc.datasets.ani", + "ANI1CCX": "openqdc.datasets.ani", + "ANI1X": "openqdc.datasets.ani", + "Spice": "openqdc.datasets.spice", + "GEOM": "openqdc.datasets.geom", + "QMugs": "openqdc.datasets.qmugs", + "ISO17": "openqdc.datasets.iso_17", + "COMP6": "openqdc.datasets.comp6", + "GDML": "openqdc.datasets.gdml", + "Molecule3D": "openqdc.datasets.molecule3d", + "OrbnetDenali": "openqdc.datasets.orbnet_denali", + "SN2RXN": "openqdc.datasets.sn2_rxn", + "QM7X": "openqdc.datasets.qm7x", + "DESS": "openqdc.datasets.dess", + "NablaDFT": "openqdc.datasets.nabladft", + "SolvatedPeptides": "openqdc.datasets.solvated_peptides", + "WaterClusters": "openqdc.datasets.waterclusters3_30", + "TMQM": "openqdc.datasets.tmqm", + "Dummy": "openqdc.datasets.dummy", + "PCQM_B3LYP": "openqdc.datasets.pcqm", + "PCQM_PM6": "openqdc.datasets.pcqm", + "Transition1X": "openqdc.datasets.transition1x", +} _lazy_imports_mod = {"datasets": "openqdc.datamodule", "utils": "openqdc.utils"} From 0dca850673cb97013c5137fcfb4151d7eb5ae42c Mon Sep 17 00:00:00 2001 From: FNTwin Date: Wed, 15 Nov 2023 17:03:35 +0000 Subject: [PATCH 07/18] RMS Spice fix --- src/openqdc/datasets/spice.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/openqdc/datasets/spice.py b/src/openqdc/datasets/spice.py index 08f7f2b..bb2e1a6 100644 --- a/src/openqdc/datasets/spice.py +++ b/src/openqdc/datasets/spice.py @@ -91,7 +91,7 @@ def _stats(self): "components": { "mean": self.convert_forces(array([[5.7479990e-09], [-4.8940532e-08], [1.2032132e-08]])), "std": self.convert_forces(array([[0.02017307], [0.02016141], [0.02014796]])), - "rms": array([[0.02017307], [0.02016142], [0.02014796]]), + "rms": self.convert_forces(array([[0.02017307], [0.02016142], [0.02014796]])), }, }, }, @@ -106,7 +106,7 @@ def _stats(self): "components": { "mean": self.convert_forces(array([[5.7479990e-09], [-4.8940532e-08], [1.2032132e-08]])), "std": self.convert_forces(array([[0.02017307], [0.02016141], [0.02014796]])), - 
"rms": array([[0.02017307], [0.02016142], [0.02014796]]), + "rms": self.convert_forces(array([[0.02017307], [0.02016142], [0.02014796]])), }, }, }, From 32caf77a48aaeadfe3d0a3689312a869b1649e39 Mon Sep 17 00:00:00 2001 From: prtos Date: Fri, 17 Nov 2023 00:58:57 +0000 Subject: [PATCH 08/18] combine smiles and subset into one artifact --- src/openqdc/__init__.py | 2 +- src/openqdc/datasets/base.py | 52 +++++++++++++++--------- src/openqdc/datasets/pcqm.py | 19 ++++++--- src/openqdc/utils/molecule.py | 76 +++++++++++++++++++++++++++++++++++ 4 files changed, 124 insertions(+), 25 deletions(-) diff --git a/src/openqdc/__init__.py b/src/openqdc/__init__.py index 1432923..e77bc9e 100644 --- a/src/openqdc/__init__.py +++ b/src/openqdc/__init__.py @@ -9,7 +9,7 @@ _lazy_imports_obj = {} -_lazy_imports_mod = {"datasets": "openqdc.datamodule", "utils": "openqdc.utils"} +_lazy_imports_mod = {"datasets": "openqdc.datasets", "utils": "openqdc.utils"} def __getattr__(name): diff --git a/src/openqdc/datasets/base.py b/src/openqdc/datasets/base.py index 47765a2..29c89bd 100644 --- a/src/openqdc/datasets/base.py +++ b/src/openqdc/datasets/base.py @@ -1,4 +1,6 @@ +import gzip import os +import pickle as pkl from os.path import join as p_join from typing import Dict, List, Optional, Union @@ -215,6 +217,7 @@ def collate_list(self, list_entries): x = np.zeros((csum.shape[0], 2), dtype=np.int32) x[1:, 0], x[:, 1] = csum[:-1], csum res["position_idx_range"] = x + return res def save_preprocess(self, data_dict): @@ -228,12 +231,17 @@ def save_preprocess(self, data_dict): push_remote(local_path, overwrite=True) # save smiles and subset - for key in ["name", "subset"]: - local_path = p_join(self.preprocess_path, f"{key}.npz") - uniques, inv_indices = np.unique(data_dict[key], return_inverse=True) - with open(local_path, "wb") as f: - np.savez_compressed(f, uniques=uniques, inv_indices=inv_indices) - push_remote(local_path, overwrite=True) + local_path = p_join(self.preprocess_path, 
"props.pkl.gz") + with gzip.open(local_path, "wb") as f: + pkl.dump(data_dict, f) + push_remote(local_path, overwrite=True) + + # for key in ["name", "subset"]: + # local_path = p_join(self.preprocess_path, f"{key}.npz") + # uniques, inv_indices = np.unique(data_dict[key], return_inverse=True) + # with open(local_path, "wb") as f: + # np.savez_compressed(f, uniques=uniques, inv_indices=inv_indices) + # push_remote(local_path, overwrite=True) def read_preprocess(self, overwrite_local_cache=False): logger.info("Reading preprocessed data") @@ -256,19 +264,25 @@ def read_preprocess(self, overwrite_local_cache=False): for key in self.data: print(f"Loaded {key} with shape {self.data[key].shape}, dtype {self.data[key].dtype}") - for key in ["name", "subset"]: - filename = p_join(self.preprocess_path, f"{key}.npz") - pull_locally(filename, overwrite=overwrite_local_cache) - self.data[key] = dict() - with open(filename, "rb") as f: - tmp = np.load(f) - for k in tmp: - self.data[key][k] = tmp[k] - print(f"Loaded {key}_{k} with shape {self.data[key][k].shape}, dtype {self.data[key][k].dtype}") + filename = p_join(self.preprocess_path, "props.pkl.gz") + pull_locally(filename, overwrite=overwrite_local_cache) + with gzip.open(filename, "rb") as f: + tmp = pkl.load(f) + self.data.update(tmp) + + # for key in ["name", "subset"]: + # filename = p_join(self.preprocess_path, f"{key}.npz") + # pull_locally(filename, overwrite=overwrite_local_cache) + # self.data[key] = dict() + # with open(filename, "rb") as f: + # tmp = np.load(f) + # for k in tmp: + # self.data[key][k] = tmp[k] + # print(f"Loaded {key}_{k} with shape {self.data[key][k].shape}, dtype {self.data[key][k].dtype}") def is_preprocessed(self): predicats = [copy_exists(p_join(self.preprocess_path, f"{key}.mmap")) for key in self.data_keys] - predicats += [copy_exists(p_join(self.preprocess_path, f"{x}.npz")) for x in ["name", "subset"]] + predicats += [copy_exists(p_join(self.preprocess_path, "props.pkl.gz"))] return 
all(predicats) def preprocess(self): @@ -383,7 +397,7 @@ def wrapper(idx): entry = self.get_ase_atoms(idx, ext=False) return soap.create(entry, centers=entry.positions) - descr = dm.parallelized(wrapper, idxs, progress=progress, scheduler="threads") + descr = dm.parallelized(wrapper, idxs, progress=progress, scheduler="threads", n_jobs=-1) datum["soap"] = np.vstack(descr) if return_idxs: datum["idxs"] = idxs @@ -402,8 +416,8 @@ def __getitem__(self, idx: int): self.convert_distance(np.array(input[:, -3:], dtype=np.float32)), self.convert_energy(np.array(self.data["energies"][idx], dtype=np.float32)), ) - name = self.data["name"]["uniques"][self.data["name"]["inv_indices"][idx]] - subset = self.data["subset"]["uniques"][self.data["subset"]["inv_indices"][idx]] + name = self.data["name"][idx] + subset = self.data["subset"][idx] if "forces" in self.data: forces = self.convert_forces(np.array(self.data["forces"][p_start:p_end], dtype=np.float32)) diff --git a/src/openqdc/datasets/pcqm.py b/src/openqdc/datasets/pcqm.py index 555d2eb..73f1c68 100644 --- a/src/openqdc/datasets/pcqm.py +++ b/src/openqdc/datasets/pcqm.py @@ -1,3 +1,4 @@ +import gzip import json import os import pickle as pkl @@ -131,13 +132,21 @@ def collate_and_save_list(self, list_entries): push_remote(local_path, overwrite=True) # save smiles and subset + tmp = dict() + local_path = p_join(self.preprocess_path, "props.pkl.gz") for key in ["name", "subset"]: local_path = p_join(self.preprocess_path, f"{key}.npz") - x = [el for i in range(len(list_entries)) for el in list_entries[i].pop(key)] - uniques, inv_indices = np.unique(x, return_inverse=True) - with open(local_path, "wb") as f: - np.savez_compressed(f, uniques=uniques, inv_indices=inv_indices) - push_remote(local_path, overwrite=True) + tmp[key] = [el for i in range(len(list_entries)) for el in list_entries[i].pop(key)] + with gzip.open(local_path, "wb") as f: + pkl.dump(x, f) + push_remote(local_path, overwrite=True) + # for key in ["name", 
"subset"]: + # local_path = p_join(self.preprocess_path, f"{key}.npz") + # x = [el for i in range(len(list_entries)) for el in list_entries[i].pop(key)] + # uniques, inv_indices = np.unique(x, return_inverse=True) + # with open(local_path, "wb") as f: + # np.savez_compressed(f, uniques=uniques, inv_indices=inv_indices) + # push_remote(local_path, overwrite=True) class PCQM_B3LYP(PCQM_PM6): diff --git a/src/openqdc/utils/molecule.py b/src/openqdc/utils/molecule.py index cd2290f..8bd2b83 100644 --- a/src/openqdc/utils/molecule.py +++ b/src/openqdc/utils/molecule.py @@ -1,4 +1,7 @@ +from typing import Any + import numpy as np +from numpy import ndarray from rdkit import Chem atom_table = Chem.GetPeriodicTable() @@ -17,3 +20,76 @@ def get_atomic_charge(mol: Chem.Mol): def get_atomic_number_and_charge(mol: Chem.Mol): """Returns atoms number and charge for rdkit molecule""" return np.array([[atom.GetAtomicNum(), atom.GetFormalCharge()] for atom in mol.GetAtoms()]) + + +def rmsd(P: ndarray, Q: ndarray, **kwargs) -> float: + """ + Calculate Root-mean-square deviation from two sets of vectors V and W. + + Parameters + ---------- + V : array + (N,D) matrix, where N is points and D is dimension. + W : array + (N,D) matrix, where N is points and D is dimension. + + Returns + ------- + rmsd : float + Root-mean-square deviation between the two vectors + """ + diff = P - Q + return np.sqrt((diff * diff).sum() / P.shape[0]) + + +def kabsch_rmsd( + P: ndarray, + Q: ndarray, + translate: bool = False, + **kwargs: Any, +) -> float: + """ + Rotate matrix P unto Q using Kabsch algorithm and calculate the RMSD. + + Parameters + ---------- + P : array + (N,D) matrix, where N is points and D is dimension. + Q : array + (N,D) matrix, where N is points and D is dimension. + translate : bool + Use centroids to translate vector P and Q unto each other. 
+ + Returns + ------- + rmsd : float + root-mean squared deviation + """ + + if translate: + Q = Q - Q.mean(axis=0) + P = P - P.mean(axis=0) + + # Computation of the covariance matrix + C = np.dot(np.transpose(P), Q) + + # Computation of the optimal rotation matrix + # This can be done using singular value decomposition (SVD) + # Getting the sign of the det(V)*(W) to decide + # whether we need to correct our rotation matrix to ensure a + # right-handed coordinate system. + # And finally calculating the optimal rotation matrix U + # see http://en.wikipedia.org/wiki/Kabsch_algorithm + V, S, W = np.linalg.svd(C) + d = (np.linalg.det(V) * np.linalg.det(W)) < 0.0 + + if d: + S[-1] = -S[-1] + V[:, -1] = -V[:, -1] + + # Create Rotation matrix U + U = np.dot(V, W) + + # Rotate P + P_prime = np.dot(P, U) + return rmsd(P_prime, Q) From f74a6dff5b00e634f1a3d97a8c9261dd6963e9b5 Mon Sep 17 00:00:00 2001 From: FNTwin Date: Mon, 27 Nov 2023 20:44:27 +0000 Subject: [PATCH 09/18] Fix xyz save + Updated e0 matrix to fix PCQM --- src/openqdc/datasets/base.py | 5 +++-- src/openqdc/utils/atomization_energies.py | 2 +- src/openqdc/utils/units.py | 4 ++++ 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/src/openqdc/datasets/base.py b/src/openqdc/datasets/base.py index 022f880..5aae697 100644 --- a/src/openqdc/datasets/base.py +++ b/src/openqdc/datasets/base.py @@ -278,14 +278,15 @@ def preprocess(self): res = self.collate_list(entries) self.save_preprocess(res) - def save_xyz(self, idx: int, path: Optional[str] = None): + def save_xyz(self, idx: int, path: Optional[str] = None, name=None): """ Save the entry at index idx as an extxyz file. 
""" if path is None: path = os.getcwd() at = self.get_ase_atoms(idx, ext=True) - name = at.info["name"] + if name is not None: + name = at.info["name"] write_extxyz(p_join(path, f"{name}.xyz"), at) def get_ase_atoms(self, idx: int, ext=True): diff --git a/src/openqdc/utils/atomization_energies.py b/src/openqdc/utils/atomization_energies.py index 40d0d13..01b9821 100644 --- a/src/openqdc/utils/atomization_energies.py +++ b/src/openqdc/utils/atomization_energies.py @@ -131,7 +131,7 @@ class IsolatedAtomEnergyFactory: Factory method to get the isolated atom energies for a given level of theory. """ - max_charge = 4 + max_charge = 9 def __init__(self): pass diff --git a/src/openqdc/utils/units.py b/src/openqdc/utils/units.py index fb895ce..4446592 100644 --- a/src/openqdc/utils/units.py +++ b/src/openqdc/utils/units.py @@ -73,3 +73,7 @@ def get_conversion(in_unit: str, out_unit: str): Conversion("hartree/ang", "kcal/mol/ang", lambda x: get_conversion("hartree", "kcal/mol")(x)) Conversion("hartree/ang", "hartree/bohr", lambda x: get_conversion("bohr", "ang")(x)) Conversion("hartree/bohr", "hartree/ang", lambda x: get_conversion("ang", "bohr")(x)) +Conversion("kcal/mol/bohr", "Hartree/bohr", lambda x: get_conversion("kcal/mol", "hartree")(x)) +Conversion("ev/ang", "hartree/ang", lambda x: get_conversion("ev", "hartree")(x)) +Conversion("ev/bohr", "hartree/bohr", lambda x: get_conversion("ev", "hartree")(x)) +Conversion("ev/bohr", "ev/ang", lambda x: get_conversion("ang", "bohr")(x)) From 97f093ca84eb5cb96e424849829c01d45cdcc364 Mon Sep 17 00:00:00 2001 From: FNTwin Date: Tue, 28 Nov 2023 14:25:53 +0000 Subject: [PATCH 10/18] GDML Stats, Improvements, Exceptions, Forces as None if not present in the _stats --- src/openqdc/datasets/ani.py | 41 ++---------- src/openqdc/datasets/base.py | 59 +++++++++++----- src/openqdc/datasets/dess.py | 21 +----- src/openqdc/datasets/dummy.py | 17 +---- src/openqdc/datasets/gdml.py | 96 ++++++++++++++++++++++++++- 
src/openqdc/datasets/geom.py | 21 +----- src/openqdc/datasets/molecule3d.py | 21 +----- src/openqdc/datasets/orbnet_denali.py | 21 +----- src/openqdc/datasets/qmugs.py | 17 +---- src/openqdc/datasets/tmqm.py | 21 +----- src/openqdc/utils/constants.py | 12 ++++ src/openqdc/utils/exceptions.py | 63 ++++++++++++++++++ src/openqdc/utils/units.py | 6 +- 13 files changed, 241 insertions(+), 175 deletions(-) create mode 100644 src/openqdc/utils/exceptions.py diff --git a/src/openqdc/datasets/ani.py b/src/openqdc/datasets/ani.py index c18d254..0e8c751 100644 --- a/src/openqdc/datasets/ani.py +++ b/src/openqdc/datasets/ani.py @@ -4,6 +4,7 @@ from numpy import array, float32 from openqdc.datasets.base import BaseDataset, read_qc_archive_h5 +from openqdc.utils.constants import NOT_DEFINED from openqdc.utils.io import get_local_cache @@ -61,30 +62,14 @@ def _stats(self): "mean": self.convert_energy(array([-2.37376472])), "std": self.convert_energy(array([0.50266975])), }, - "forces": { - "mean": array([0]), - "std": array([0]), - "components": { - "mean": array([[0.0], [0.0], [0.0]]), - "std": array([[0.0], [0.0], [0.0]]), - "rms": array([[0.0], [0.0], [0.0]]), - }, - }, + "forces": NOT_DEFINED, }, "total": { "energy": { "mean": self.convert_energy(array([-333.67322], dtype=float32)), "std": self.convert_energy(array([61.21667], dtype=float32)), }, - "forces": { - "mean": array([0]), - "std": array([0]), - "components": { - "mean": array([[0.0], [0.0], [0.0]]), - "std": array([[0.0], [0.0], [0.0]]), - "rms": array([[0.0], [0.0], [0.0]]), - }, - }, + "forces": NOT_DEFINED, }, } @@ -136,30 +121,14 @@ def _stats(self): "mean": self.convert_energy(array([-3.23959548, 500.30384627, 500.40706776, 500.76740432])), "std": self.convert_energy(array([1.03021261, 132.52707152, 132.56092469, 132.65261362])), }, - "forces": { - "mean": array([0]), - "std": array([0]), - "components": { - "mean": array([[0.0], [0.0], [0.0]]), - "std": array([[0.0], [0.0], [0.0]]), - "rms": array([[0.0], 
[0.0], [0.0]]), - }, - }, + "forces": NOT_DEFINED, }, "total": { "energy": { "mean": self.convert_energy(array([-374.40665, -1.2378153, -1.505962, -1.2396905], dtype=float32)), "std": self.convert_energy(array([101.63995, 0.32444745, 0.39500558, 0.3250212], dtype=float32)), }, - "forces": { - "mean": array([0]), - "std": array([0]), - "components": { - "mean": array([[0.0], [0.0], [0.0]]), - "std": array([[0.0], [0.0], [0.0]]), - "rms": array([[0.0], [0.0], [0.0]]), - }, - }, + "forces": NOT_DEFINED, }, } diff --git a/src/openqdc/datasets/base.py b/src/openqdc/datasets/base.py index 5aae697..478cb37 100644 --- a/src/openqdc/datasets/base.py +++ b/src/openqdc/datasets/base.py @@ -15,6 +15,12 @@ chemical_symbols, ) from openqdc.utils.constants import NB_ATOMIC_FEATURES, POSSIBLE_NORMALIZATION +from openqdc.utils.exceptions import ( + PROPERTY_NOT_AVAILABLE_ERROR, + DatasetNotAvailableError, + NormalizationNotAvailableError, + StatisticsNotAvailableError, +) from openqdc.utils.io import ( copy_exists, dict_to_atoms, @@ -96,10 +102,7 @@ def __init__( self.data = None self._set_units(energy_unit, distance_unit) if not self.is_preprocessed(): - logger.info("This dataset not available. 
Please open an issue on Github for the team to look into it.") - # entries = self.read_raw_entries() - # res = self.collate_list(entries) - # self.save_preprocess(res) + raise DatasetNotAvailableError(self.__name__) else: self.read_preprocess(overwrite_local_cache=overwrite_local_cache) self._set_isolated_atom_energies() @@ -255,7 +258,7 @@ def read_preprocess(self, overwrite_local_cache=False): ).reshape(self.data_shapes[key]) for key in self.data: - print(f"Loaded {key} with shape {self.data[key].shape}, dtype {self.data[key].dtype}") + logger.info(f"Loaded {key} with shape {self.data[key].shape}, dtype {self.data[key].dtype}") for key in ["name", "subset"]: filename = p_join(self.preprocess_path, f"{key}.npz") @@ -265,7 +268,9 @@ def read_preprocess(self, overwrite_local_cache=False): tmp = np.load(f) for k in tmp: self.data[key][k] = tmp[k] - print(f"Loaded {key}_{k} with shape {self.data[key][k].shape}, dtype {self.data[key][k].dtype}") + logger.info( + f"Loaded {key}_{k} with shape {self.data[key][k].shape}, dtype {self.data[key][k].dtype}" + ) def is_preprocessed(self): predicats = [copy_exists(p_join(self.preprocess_path, f"{key}.mmap")) for key in self.data_keys] @@ -434,21 +439,41 @@ def _stats(self): @property def average_n_atoms(self): + """ + Average number of atoms in a molecule in the dataset. + """ if self.__average_nb_atoms__ is None: - logger.info( - "This property for this dataset not available." - + "Please open an issue on Github for the team to look into it." - ) + logger.info(PROPERTY_NOT_AVAILABLE_ERROR) return 1 return self.__average_nb_atoms__ - def get_statistics(self, normalization: str = "formation"): + def get_statistics(self, normalization: str = "formation", return_none: bool = True): + """ + Get the statistics of the dataset. 
+ normalization : str, optional + Type of energy, by default "formation", must be one of ["formation", "total"] + return_none : bool, optional + Whether to return None if the statistics for the forces are not available, by default True + Otherwise, the statistics for the forces are set to 0.0 + """ stats = self._stats if len(stats) == 0: - logger.info( - "This property for this dataset not available." - + "Please open an issue on Github for the team to look into it." - ) + raise StatisticsNotAvailableError(self.__name__) if normalization not in POSSIBLE_NORMALIZATION: - raise ValueError(f"normalization={normalization} is not valid. Must be one of {POSSIBLE_NORMALIZATION}") - return stats[normalization] + raise NormalizationNotAvailableError(normalization) + selected_stats = stats[normalization] + if len(self.__force_methods__) == 0 and not return_none: + selected_stats.update( + { + "forces": { + "mean": np.array([0.0]), + "std": np.array([0.0]), + "components": { + "mean": np.array([[0.0], [0.0], [0.0]]), + "std": np.array([[0.0], [0.0], [0.0]]), + "rms": np.array([[0.0], [0.0], [0.0]]), + }, + } + } + ) + return selected_stats diff --git a/src/openqdc/datasets/dess.py b/src/openqdc/datasets/dess.py index 76c2355..6651bf7 100644 --- a/src/openqdc/datasets/dess.py +++ b/src/openqdc/datasets/dess.py @@ -7,6 +7,7 @@ from tqdm import tqdm from openqdc.datasets.base import BaseDataset +from openqdc.utils.constants import NOT_DEFINED from openqdc.utils.molecule import get_atomic_number_and_charge @@ -94,15 +95,7 @@ def _stats(self): ] ), }, - "forces": { - "mean": array([0]), - "std": array([0]), - "components": { - "mean": array([[0.0], [0.0], [0.0]]), - "std": array([[0.0], [0.0], [0.0]]), - "rms": array([[0.0], [0.0], [0.0]]), - }, - }, + "forces": NOT_DEFINED, }, "total": { "energy": { @@ -115,15 +108,7 @@ def _stats(self): dtype=float32, ), }, - "forces": { - "mean": array([0]), - "std": array([0]), - "components": { - "mean": array([[0.0], [0.0], [0.0]]), - "std": 
array([[0.0], [0.0], [0.0]]), - "rms": array([[0.0], [0.0], [0.0]]), - }, - }, + "forces": NOT_DEFINED, }, } diff --git a/src/openqdc/datasets/dummy.py b/src/openqdc/datasets/dummy.py index 4a9bfc8..99b5106 100644 --- a/src/openqdc/datasets/dummy.py +++ b/src/openqdc/datasets/dummy.py @@ -4,6 +4,7 @@ from openqdc.datasets.base import BaseDataset from openqdc.utils.atomization_energies import IsolatedAtomEnergyFactory +from openqdc.utils.constants import NOT_DEFINED class Dummy(BaseDataset): @@ -32,26 +33,14 @@ def _stats(self): "mean": array([-12.94348027, -9.83037297]), "std": array([4.39971409, 3.3574188]), }, - "forces": { - "mean": array([0]), - "std": array([0]), - "components": { - "mean": array([[0.0, 0.0], [0.0, 0.0], [0.0, 0.0]]), - "std": array([[0.0, 0.0], [0.0, 0.0], [0.0, 0.0]]), - "rms": array([[0.0, 0.0], [0.0, 0.0], [0.0, 0.0]]), - }, - }, + "forces": NOT_DEFINED, }, "total": { "energy": { "mean": array([-89.44242, -1740.5336]), "std": array([29.599571, 791.48663]), }, - "forces": { - "mean": array([[0.0], [0.0], [0.0]]), - "std": array([[0.0], [0.0], [0.0]]), - "rms": array([[0.0], [0.0], [0.0]]), - }, + "forces": NOT_DEFINED, }, } diff --git a/src/openqdc/datasets/gdml.py b/src/openqdc/datasets/gdml.py index 789f84a..73bae17 100644 --- a/src/openqdc/datasets/gdml.py +++ b/src/openqdc/datasets/gdml.py @@ -1,5 +1,7 @@ from os.path import join as p_join +from numpy import array, float32 + from openqdc.datasets.base import BaseDataset, read_qc_archive_h5 @@ -32,7 +34,7 @@ class GDML(BaseDataset): __energy_methods__ = [ "ccsd/cc-pvdz", "ccsd(t)/cc-pvdz", - "pbe/mbd", # MD22 + # "pbe/mbd", # MD22 # "pbe+mbd/tight", #MD22 "pbe/vdw-ts", # MD17 ] @@ -46,7 +48,7 @@ class GDML(BaseDataset): __force_methods__ = [ "ccsd/cc-pvdz", "ccsd(t)/cc-pvdz", - "pbe/mbd", # MD22 + # "pbe/mbd", # MD22 # "pbe+mbd/tight", #MD22 "pbe/vdw-ts", # MD17 ] @@ -60,9 +62,99 @@ class GDML(BaseDataset): __energy_unit__ = "kcal/mol" __distance_unit__ = "bohr" __forces_unit__ = 
"kcal/mol/bohr" + __average_nb_atoms__ = 13.00299550 def read_raw_entries(self): raw_path = p_join(self.root, "gdml.h5") samples = read_qc_archive_h5(raw_path, "gdml", self.energy_target_names, self.force_target_names) return samples + + @property + def _stats(self): + return { + "formation": { + "energy": { + "mean": self.convert_energy(array([-2466.00011563, -1213.94691714, -1916.02068252])), + "std": self.convert_energy(array([6.65779492, 310.70204248, 729.2143015])), + }, + "forces": { + "mean": self.convert_forces(array(-1.42346325e-05)), + "std": self.convert_forces(array(27.009315)), + "components": { + "mean": self.convert_forces( + array( + [ + [-8.3862792e-09, -1.9758134e-07, -7.7199416e-05], + [-2.7550591e-09, -1.9665436e-08, 5.3315878e-05], + [-7.5688439e-10, 5.6149121e-09, -1.8894127e-05], + ], + dtype=float32, + ) + ), + "std": self.convert_forces( + array( + [ + [31.060509, 29.168474, 27.547812], + [31.365385, 26.67319, 26.068623], + [31.024155, 27.272366, 22.33925], + ], + dtype=float32, + ) + ), + "rms": self.convert_forces( + array( + [ + [31.060509, 29.168474, 27.547802], + [31.365385, 26.67319, 26.068628], + [31.024155, 27.272366, 22.33925], + ], + dtype=float32, + ) + ), + }, + }, + }, + "total": { + "energy": { + "mean": self.convert_energy(array([-405688.28, -141134.3, -194075.56], dtype=float32)), + "std": self.convert_energy(array([7.2360396e00, 3.0755928e04, 8.4138445e04], dtype=float32)), + }, + "forces": { + "mean": self.convert_forces(array(-1.42346325e-05)), + "std": self.convert_forces(array(27.009315)), + "components": { + "mean": self.convert_forces( + array( + [ + [-8.3862792e-09, -1.9758134e-07, -7.7199416e-05], + [-2.7550591e-09, -1.9665436e-08, 5.3315878e-05], + [-7.5688439e-10, 5.6149121e-09, -1.8894127e-05], + ], + dtype=float32, + ) + ), + "std": self.convert_forces( + array( + [ + [31.060509, 29.168474, 27.547812], + [31.365385, 26.67319, 26.068623], + [31.024155, 27.272366, 22.33925], + ], + dtype=float32, + ) + ), + "rms": 
self.convert_forces( + array( + [ + [31.060509, 29.168474, 27.547802], + [31.365385, 26.67319, 26.068628], + [31.024155, 27.272366, 22.33925], + ], + dtype=float32, + ) + ), + }, + }, + }, + } diff --git a/src/openqdc/datasets/geom.py b/src/openqdc/datasets/geom.py index c16f001..897b828 100644 --- a/src/openqdc/datasets/geom.py +++ b/src/openqdc/datasets/geom.py @@ -7,6 +7,7 @@ from openqdc.datasets.base import BaseDataset from openqdc.utils import load_json, load_pkl +from openqdc.utils.constants import NOT_DEFINED from openqdc.utils.molecule import get_atomic_number_and_charge @@ -97,30 +98,14 @@ def _stats(self): "mean": self.convert_energy(array([-8.47811605])), "std": self.convert_energy(array([3.68015507])), }, - "forces": { - "mean": array([0]), - "std": array([0]), - "components": { - "mean": array([[0.0], [0.0], [0.0]]), - "std": array([[0.0], [0.0], [0.0]]), - "rms": array([[0.0], [0.0], [0.0]]), - }, - }, + "forces": NOT_DEFINED, }, "total": { "energy": { "mean": self.convert_energy(array([-78.264725])), "std": self.convert_energy(array([20.200787])), }, - "forces": { - "mean": array([0]), - "std": array([0]), - "components": { - "mean": array([[0.0], [0.0], [0.0]]), - "std": array([[0.0], [0.0], [0.0]]), - "rms": array([[0.0], [0.0], [0.0]]), - }, - }, + "forces": NOT_DEFINED, }, } diff --git a/src/openqdc/datasets/molecule3d.py b/src/openqdc/datasets/molecule3d.py index ace03d4..3887499 100644 --- a/src/openqdc/datasets/molecule3d.py +++ b/src/openqdc/datasets/molecule3d.py @@ -10,6 +10,7 @@ from tqdm import tqdm from openqdc.datasets.base import BaseDataset +from openqdc.utils.constants import NOT_DEFINED from openqdc.utils.molecule import get_atomic_number_and_charge @@ -110,29 +111,13 @@ def _stats(self): "mean": self.convert_energy(array([-191.66717791])), "std": self.convert_energy(array([2005.52732443])), }, - "forces": { - "mean": array([0]), - "std": array([0]), - "components": { - "mean": array([[0.0], [0.0], [0.0]]), - "std": array([[0.0], 
[0.0], [0.0]]), - "rms": array([[0.0], [0.0], [0.0]]), - }, - }, + "forces": NOT_DEFINED, }, "total": { "energy": { "mean": self.convert_energy(array([-21100.502], dtype=float32)), "std": self.convert_energy(array([9345.366], dtype=float32)), }, - "forces": { - "mean": array([0]), - "std": array([0]), - "components": { - "mean": array([[0.0], [0.0], [0.0]]), - "std": array([[0.0], [0.0], [0.0]]), - "rms": array([[0.0], [0.0], [0.0]]), - }, - }, + "forces": NOT_DEFINED, }, } diff --git a/src/openqdc/datasets/orbnet_denali.py b/src/openqdc/datasets/orbnet_denali.py index e5f5953..e158410 100644 --- a/src/openqdc/datasets/orbnet_denali.py +++ b/src/openqdc/datasets/orbnet_denali.py @@ -7,6 +7,7 @@ from numpy import array, float32 from openqdc.datasets.base import BaseDataset +from openqdc.utils.constants import NOT_DEFINED from openqdc.utils.molecule import atom_table @@ -88,29 +89,13 @@ def _stats(self): "mean": self.convert_energy(array([-13.87283487, -7.79357297])), "std": self.convert_energy(array([75.34652971, 3.3274954])), }, - "forces": { - "mean": array([0]), - "std": array([0]), - "components": { - "mean": array([[0.0], [0.0], [0.0]]), - "std": array([[0.0], [0.0], [0.0]]), - "rms": array([[0.0], [0.0], [0.0]]), - }, - }, + "forces": NOT_DEFINED, }, "total": { "energy": { "mean": self.convert_energy(array([-1630.8182, -74.17469], dtype=float32)), "std": self.convert_energy(array([929.9734, 19.751446], dtype=float32)), }, - "forces": { - "mean": array([0]), - "std": array([0]), - "components": { - "mean": array([[0.0], [0.0], [0.0]]), - "std": array([[0.0], [0.0], [0.0]]), - "rms": array([[0.0], [0.0], [0.0]]), - }, - }, + "forces": NOT_DEFINED, }, } diff --git a/src/openqdc/datasets/qmugs.py b/src/openqdc/datasets/qmugs.py index 1ef6b22..a793151 100644 --- a/src/openqdc/datasets/qmugs.py +++ b/src/openqdc/datasets/qmugs.py @@ -7,6 +7,7 @@ from numpy import array from openqdc.datasets.base import BaseDataset +from openqdc.utils.constants import NOT_DEFINED 
from openqdc.utils.molecule import get_atomic_number_and_charge @@ -72,26 +73,14 @@ def _stats(self): "mean": self.convert_energy(array([-12.94348027, -9.83037297])), "std": self.convert_energy(array([4.39971409, 3.3574188])), }, - "forces": { - "mean": array([0]), - "std": array([0]), - "components": { - "mean": array([[0.0], [0.0], [0.0]]), - "std": array([[0.0], [0.0], [0.0]]), - "rms": array([[0.0], [0.0], [0.0]]), - }, - }, + "forces": NOT_DEFINED, }, "total": { "energy": { "mean": self.convert_energy(array([-89.44242, -1740.5336])), "std": self.convert_energy(array([29.599571, 791.48663])), }, - "forces": { - "mean": array([[0.0], [0.0], [0.0]]), - "std": array([[0.0], [0.0], [0.0]]), - "rms": array([[0.0], [0.0], [0.0]]), - }, + "forces": NOT_DEFINED, }, } diff --git a/src/openqdc/datasets/tmqm.py b/src/openqdc/datasets/tmqm.py index 7160edc..1b7ffc1 100644 --- a/src/openqdc/datasets/tmqm.py +++ b/src/openqdc/datasets/tmqm.py @@ -7,6 +7,7 @@ from tqdm import tqdm from openqdc.datasets.base import BaseDataset +from openqdc.utils.constants import NOT_DEFINED from openqdc.utils.molecule import atom_table @@ -65,30 +66,14 @@ def _stats(self): "mean": self.convert_energy(array([-669.31056277])), "std": self.convert_energy(array([896.83091033])), }, - "forces": { - "mean": array([0]), - "std": array([0]), - "components": { - "mean": array([[0.0], [0.0], [0.0]]), - "std": array([[0.0], [0.0], [0.0]]), - "rms": array([[0.0], [0.0], [0.0]]), - }, - }, + "forces": NOT_DEFINED, }, "total": { "energy": { "mean": self.convert_energy(array([-2952.1428], dtype=float32)), "std": self.convert_energy(array([1608.7888], dtype=float32)), }, - "forces": { - "mean": array([0]), - "std": array([0]), - "components": { - "mean": array([[0.0], [0.0], [0.0]]), - "std": array([[0.0], [0.0], [0.0]]), - "rms": array([[0.0], [0.0], [0.0]]), - }, - }, + "forces": NOT_DEFINED, }, } diff --git a/src/openqdc/utils/constants.py b/src/openqdc/utils/constants.py index d1e1937..d32c5d9 100644 --- 
a/src/openqdc/utils/constants.py +++ b/src/openqdc/utils/constants.py @@ -7,3 +7,15 @@ BOHR2ANG = 0.52917721092 POSSIBLE_NORMALIZATION = ["formation", "total"] + +NOT_DEFINED = { + "forces": { + "mean": None, + "std": None, + "components": { + "mean": None, + "std": None, + "rms": None, + }, + }, +} diff --git a/src/openqdc/utils/exceptions.py b/src/openqdc/utils/exceptions.py new file mode 100644 index 0000000..5c3ddbc --- /dev/null +++ b/src/openqdc/utils/exceptions.py @@ -0,0 +1,63 @@ +from typing import Final + +from openqdc.utils.constants import POSSIBLE_NORMALIZATION + +PROPERTY_NOT_AVAILABLE_ERROR: Final[ + str +] = """This property for this dataset not available. +Please open an issue on Github for the team to look into it.""" + + +class OpenQDCException(Exception): + """Base exception for custom exceptions raised by the openQDC""" + + def __init__(self, msg: str): + super().__init__(msg) + self.msg = msg + + def __str__(self): + return self.msg + + +class DatasetNotAvailableError(OpenQDCException): + """Raised when a dataset is not available""" + + msg = "Dataset {dataset_name} is not available. Please open an issue on Github for the team to look into it." + + def __init__(self, dataset_name): + super().__init__(self.msg.format(dataset_name=dataset_name)) + + +class StatisticsNotAvailableError(DatasetNotAvailableError): + """Raised when statistics are not available""" + + +class NormalizationNotAvailableError(OpenQDCException): + """Raised when normalization is not available""" + + def __init__(self, normalization): + msg = f"Normalization={normalization} is not valid. Must be one of {POSSIBLE_NORMALIZATION}" + super().__init__(msg) + + +class ConversionNotDefinedError(OpenQDCException, ValueError): + """Raised when a conversion is not defined""" + + _error_message = """ + Conversion from {in_unit} to {out_unit} is not defined in the conversion registry. 
+ To add a new conversion, use the following syntax or open an issue on Github for the team to look into it: + + Conversion("{in_unit}", "{out_unit}", lambda x: x * conversion_factor) + """ + + def __init__(self, in_unit, out_unit): + super().__init__(self._error_message.format(in_unit=in_unit, out_unit=out_unit)) + + +class ConversionAlreadyDefined(ConversionNotDefinedError): + """Raised when a conversion is already defined""" + + _error_message = """ + Conversion from {in_unit} to {out_unit} is alread defined in the conversion registry. + To reuse the same metric, use get_conversion({in_unit}, {out_unit}). + """ diff --git a/src/openqdc/utils/units.py b/src/openqdc/utils/units.py index 4446592..016d464 100644 --- a/src/openqdc/utils/units.py +++ b/src/openqdc/utils/units.py @@ -1,5 +1,7 @@ from typing import Callable +from openqdc.utils.exceptions import ConversionAlreadyDefined, ConversionNotDefinedError + CONVERSION_REGISTRY = {} @@ -13,7 +15,7 @@ def __init__(self, in_unit: str, out_unit: str, func: Callable[[float], float]): name = "convert_" + in_unit.lower().strip() + "_to_" + out_unit.lower().strip() if name in CONVERSION_REGISTRY: - raise ValueError(f"{name} is already registered. To reuse the same metric, use Metric.get_by_name().") + raise ConversionAlreadyDefined(in_unit, out_unit) CONVERSION_REGISTRY[name] = self self.name = name @@ -29,7 +31,7 @@ def get_conversion(in_unit: str, out_unit: str): if in_unit.lower().strip() == out_unit.lower().strip(): return lambda x: x if name not in CONVERSION_REGISTRY: - raise ValueError(f"{name} is not a valid metric. 
Valid metrics are: {list(CONVERSION_REGISTRY.keys())}") + raise ConversionNotDefinedError(in_unit, out_unit) return CONVERSION_REGISTRY[name] From 51a5191d69d094df93e56eeb1493681d6af4268e Mon Sep 17 00:00:00 2001 From: FNTwin Date: Tue, 28 Nov 2023 15:09:39 +0000 Subject: [PATCH 11/18] Fix the downloading issue and incompatibilities with new file types --- src/openqdc/datasets/base.py | 40 +++++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 14 deletions(-) diff --git a/src/openqdc/datasets/base.py b/src/openqdc/datasets/base.py index 478cb37..887fe8f 100644 --- a/src/openqdc/datasets/base.py +++ b/src/openqdc/datasets/base.py @@ -26,6 +26,7 @@ dict_to_atoms, get_local_cache, load_hdf5_file, + load_pkl, pull_locally, push_remote, set_cache_dir, @@ -102,10 +103,18 @@ def __init__( self.data = None self._set_units(energy_unit, distance_unit) if not self.is_preprocessed(): - raise DatasetNotAvailableError(self.__name__) + self._download() else: self.read_preprocess(overwrite_local_cache=overwrite_local_cache) - self._set_isolated_atom_energies() + self._set_isolated_atom_energies() + + def _download(self): + try: + self.read_preprocess(overwrite_local_cache=True) + if not self.is_preprocessed(): + raise Exception + except Exception: + raise DatasetNotAvailableError(self.__name__) @property def numbers(self): @@ -260,21 +269,23 @@ def read_preprocess(self, overwrite_local_cache=False): for key in self.data: logger.info(f"Loaded {key} with shape {self.data[key].shape}, dtype {self.data[key].dtype}") - for key in ["name", "subset"]: - filename = p_join(self.preprocess_path, f"{key}.npz") + for key in ["props"]: + filename = p_join(self.preprocess_path, f"{key}.pkl") pull_locally(filename) - self.data[key] = dict() - with open(filename, "rb") as f: - tmp = np.load(f) - for k in tmp: - self.data[key][k] = tmp[k] - logger.info( - f"Loaded {key}_{k} with shape {self.data[key][k].shape}, dtype {self.data[key][k].dtype}" - ) + for key, v in 
load_pkl(filename).items(): + self.data[key] = dict() + if key == "n_atoms": + self.data[key] = v + logger.info(f"Loaded {key} with shape {self.data[key].shape}, dtype {self.data[key].dtype}") + else: + self.data[key]["uniques"] = v[0] + self.data[key]["inv_indices"] = v[1] + logger.info(f"Loaded {key}_{'uniques'} with shape {v[0].shape}, dtype { v[0].dtype}") + logger.info(f"Loaded {key}_{'inv_indices'} with shape {v[1].shape}, dtype {v[1].dtype}") def is_preprocessed(self): predicats = [copy_exists(p_join(self.preprocess_path, f"{key}.mmap")) for key in self.data_keys] - predicats += [copy_exists(p_join(self.preprocess_path, f"{x}.npz")) for x in ["name", "subset"]] + predicats += [copy_exists(p_join(self.preprocess_path, f"{x}.pkl")) for x in ["props"]] return all(predicats) def preprocess(self): @@ -411,7 +422,7 @@ def __getitem__(self, idx: int): ) name = self.data["name"]["uniques"][self.data["name"]["inv_indices"][idx]] subset = self.data["subset"]["uniques"][self.data["subset"]["inv_indices"][idx]] - + n_atoms = self.data["n_atoms"][idx] if "forces" in self.data: forces = self.convert_forces(np.array(self.data["forces"][p_start:p_end], dtype=np.float32)) else: @@ -425,6 +436,7 @@ def __getitem__(self, idx: int): name=name, subset=subset, forces=forces, + n_atoms=n_atoms, ) def __str__(self): From 16dcb4eb7e04bb241aa5533c0c48ef472fd535c3 Mon Sep 17 00:00:00 2001 From: prtos Date: Tue, 28 Nov 2023 15:43:41 +0000 Subject: [PATCH 12/18] change format for many reasons --- src/openqdc/datasets/ani.py | 18 ++ src/openqdc/datasets/base.py | 81 ++++---- src/openqdc/datasets/comp6.py | 14 +- src/openqdc/datasets/iso_17.py | 6 + src/openqdc/datasets/nabladft.py | 26 ++- src/openqdc/datasets/pcqm.py | 31 +-- src/openqdc/datasets/sn2_rxn.py | 30 +++ src/openqdc/datasets/solvated_peptides.py | 6 + src/openqdc/raws/config_factory.py | 2 +- src/openqdc/utils/atomization_energies.py | 231 +++++++++++----------- src/openqdc/utils/molecule.py | 10 + 
src/openqdc/utils/preprocess.py | 8 +- src/openqdc/utils/units.py | 3 + 13 files changed, 283 insertions(+), 183 deletions(-) diff --git a/src/openqdc/datasets/ani.py b/src/openqdc/datasets/ani.py index 913fb8a..3f1b92b 100644 --- a/src/openqdc/datasets/ani.py +++ b/src/openqdc/datasets/ani.py @@ -39,6 +39,12 @@ class ANI1(BaseDataset): def root(self): return p_join(get_local_cache(), "ani") + def __smiles_converter__(self, x): + """util function to convert string to smiles: useful if the smiles is + encoded in a different format than its display format + """ + return "-".join(x.decode("ascii").split("-")[:-1]) + @property def preprocess_path(self): path = p_join(self.root, "preprocessed", self.__name__) @@ -89,6 +95,12 @@ class ANI1CCX(ANI1): __force_methods__ = [] force_target_names = [] + def __smiles_converter__(self, x): + """util function to convert string to smiles: useful if the smiles is + encoded in a different format than its display format + """ + return x + class ANI1X(ANI1): """ @@ -145,3 +157,9 @@ class ANI1X(ANI1): def convert_forces(self, x): return super().convert_forces(x) * 0.529177249 # correct the Dataset error + + def __smiles_converter__(self, x): + """util function to convert string to smiles: useful if the smiles is + encoded in a different format than its display format + """ + return x diff --git a/src/openqdc/datasets/base.py b/src/openqdc/datasets/base.py index 29c89bd..995b297 100644 --- a/src/openqdc/datasets/base.py +++ b/src/openqdc/datasets/base.py @@ -1,4 +1,3 @@ -import gzip import os import pickle as pkl from os.path import join as p_join @@ -26,7 +25,7 @@ push_remote, set_cache_dir, ) -from openqdc.utils.molecule import atom_table +from openqdc.utils.molecule import atom_table, z_to_formula from openqdc.utils.package_utils import requires_package from openqdc.utils.units import get_conversion @@ -45,7 +44,7 @@ def extract_entry( res = dict( name=np.array([df["name"][i]]), - subset=np.array([subset]), + subset=np.array([subset 
if subset is not None else z_to_formula(x)]), energies=energies.reshape((1, -1)).astype(np.float32), atomic_inputs=np.concatenate((xs, positions), axis=-1, dtype=np.float32), n_atoms=np.array([x.shape[0]], dtype=np.int32), @@ -66,8 +65,8 @@ def read_qc_archive_h5( ) -> List[Dict[str, np.ndarray]]: data = load_hdf5_file(raw_path) data_t = {k2: data[k1][k2][:] for k1 in data.keys() for k2 in data[k1].keys()} - n = len(data_t["molecule_id"]) + n = len(data_t["molecule_id"]) samples = [extract_entry(data_t, i, subset, energy_target_names, force_target_names) for i in tqdm(range(n))] return samples @@ -98,9 +97,6 @@ def __init__( self._set_units(energy_unit, distance_unit) if not self.is_preprocessed(): logger.info("This dataset not available. Please open an issue on Github for the team to look into it.") - # entries = self.read_raw_entries() - # res = self.collate_list(entries) - # self.save_preprocess(res) else: self.read_preprocess(overwrite_local_cache=overwrite_local_cache) self._set_isolated_atom_energies() @@ -109,12 +105,12 @@ def __init__( def numbers(self): if hasattr(self, "_numbers"): return self._numbers - self._numbers = np.array(list(set(self.data["atomic_inputs"][..., 0])), dtype=np.int32) + self._numbers = np.unique(self.data["atomic_inputs"][..., 0]).astype(np.int32) return self._numbers @property def chemical_species(self): - return [chemical_symbols[z] for z in self.numbers] + return np.array(chemical_symbols)[self.numbers] @property def energy_unit(self): @@ -213,7 +209,7 @@ def collate_list(self, list_entries): # concatenate entries res = {key: np.concatenate([r[key] for r in list_entries if r is not None], axis=0) for key in list_entries[0]} - csum = np.cumsum(res.pop("n_atoms")) + csum = np.cumsum(res.get("n_atoms")) x = np.zeros((csum.shape[0], 2), dtype=np.int32) x[1:, 0], x[:, 1] = csum[:-1], csum res["position_idx_range"] = x @@ -231,18 +227,14 @@ def save_preprocess(self, data_dict): push_remote(local_path, overwrite=True) # save smiles and 
subset - local_path = p_join(self.preprocess_path, "props.pkl.gz") - with gzip.open(local_path, "wb") as f: + local_path = p_join(self.preprocess_path, "props.pkl") + for key in ["name", "subset"]: + data_dict[key] = np.unique(data_dict[key], return_inverse=True) + + with open(local_path, "wb") as f: pkl.dump(data_dict, f) push_remote(local_path, overwrite=True) - # for key in ["name", "subset"]: - # local_path = p_join(self.preprocess_path, f"{key}.npz") - # uniques, inv_indices = np.unique(data_dict[key], return_inverse=True) - # with open(local_path, "wb") as f: - # np.savez_compressed(f, uniques=uniques, inv_indices=inv_indices) - # push_remote(local_path, overwrite=True) - def read_preprocess(self, overwrite_local_cache=False): logger.info("Reading preprocessed data") logger.info( @@ -255,38 +247,29 @@ def read_preprocess(self, overwrite_local_cache=False): for key in self.data_keys: filename = p_join(self.preprocess_path, f"{key}.mmap") pull_locally(filename, overwrite=overwrite_local_cache) - self.data[key] = np.memmap( - filename, - mode="r", - dtype=self.data_types[key], - ).reshape(self.data_shapes[key]) - - for key in self.data: - print(f"Loaded {key} with shape {self.data[key].shape}, dtype {self.data[key].dtype}") + self.data[key] = np.memmap(filename, mode="r", dtype=self.data_types[key]).reshape(self.data_shapes[key]) - filename = p_join(self.preprocess_path, "props.pkl.gz") + filename = p_join(self.preprocess_path, "props.pkl") pull_locally(filename, overwrite=overwrite_local_cache) - with gzip.open(filename, "rb") as f: + with open(filename, "rb") as f: tmp = pkl.load(f) - self.data.update(tmp) - - # for key in ["name", "subset"]: - # filename = p_join(self.preprocess_path, f"{key}.npz") - # pull_locally(filename, overwrite=overwrite_local_cache) - # self.data[key] = dict() - # with open(filename, "rb") as f: - # tmp = np.load(f) - # for k in tmp: - # self.data[key][k] = tmp[k] - # print(f"Loaded {key}_{k} with shape {self.data[key][k].shape}, 
dtype {self.data[key][k].dtype}") + for key in ["name", "subset", "n_atoms"]: + x = tmp.pop(key) + if len(x) == 2: + self.data[key] = x[0][x[1]] + else: + self.data[key] = x + + for key in self.data: + print(f"Loaded {key} with shape {self.data[key].shape}, dtype {self.data[key].dtype}") def is_preprocessed(self): predicats = [copy_exists(p_join(self.preprocess_path, f"{key}.mmap")) for key in self.data_keys] - predicats += [copy_exists(p_join(self.preprocess_path, "props.pkl.gz"))] + predicats += [copy_exists(p_join(self.preprocess_path, "props.pkl"))] return all(predicats) - def preprocess(self): - if not self.is_preprocessed(): + def preprocess(self, overwrite=False): + if overwrite or not self.is_preprocessed(): entries = self.read_raw_entries() res = self.collate_list(entries) self.save_preprocess(res) @@ -319,7 +302,7 @@ def get_ase_atoms(self, idx: int, ext=True): @requires_package("dscribe") @requires_package("datamol") - def chemical_space( + def soap_descriptors( self, n_samples: Optional[Union[List[int], int]] = None, return_idxs: bool = True, @@ -364,7 +347,7 @@ def chemical_space( idxs = list(range(len(self))) elif isinstance(n_samples, int): idxs = np.random.choice(len(self), size=n_samples, replace=False) - elif isinstance(n_samples, list): + else: # list, set, np.ndarray idxs = n_samples datum = {} r_cut = soap_kwargs.pop("r_cut", 5.0) @@ -406,6 +389,12 @@ def wrapper(idx): def __len__(self): return self.data["energies"].shape[0] + def __smiles_converter__(self, x): + """util function to convert string to smiles: useful if the smiles is + encoded in a different format than its display format + """ + return x + def __getitem__(self, idx: int): shift = IsolatedAtomEnergyFactory.max_charge p_start, p_end = self.data["position_idx_range"][idx] @@ -416,7 +405,7 @@ def __getitem__(self, idx: int): self.convert_distance(np.array(input[:, -3:], dtype=np.float32)), self.convert_energy(np.array(self.data["energies"][idx], dtype=np.float32)), ) - name = 
self.data["name"][idx] + name = self.__smiles_converter__(self.data["name"][idx]) subset = self.data["subset"][idx] if "forces" in self.data: diff --git a/src/openqdc/datasets/comp6.py b/src/openqdc/datasets/comp6.py index c95ec17..7b6890b 100644 --- a/src/openqdc/datasets/comp6.py +++ b/src/openqdc/datasets/comp6.py @@ -35,8 +35,8 @@ class COMP6(BaseDataset): "pbe-d3bj/def2-tzvp", "pbe/def2-tzvp", "svwn/def2-tzvp", - "wb97m-d3bj/def2-tzvp", - "wb97m/def2-tzvp", + # "wb97m-d3bj/def2-tzvp", + # "wb97m/def2-tzvp", ] energy_target_names = [ @@ -47,8 +47,8 @@ class COMP6(BaseDataset): "PBE-D3M(BJ):def2-tzvp", "PBE:def2-tzvp", "SVWN:def2-tzvp", - "WB97M-D3(BJ):def2-tzvp", - "WB97M:def2-tzvp", + # "WB97M-D3(BJ):def2-tzvp", + # "WB97M:def2-tzvp", ] __force_methods__ = [ @@ -59,6 +59,12 @@ class COMP6(BaseDataset): "Gradient", ] + def __smiles_converter__(self, x): + """util function to convert string to smiles: useful if the smiles is + encoded in a different format than its display format + """ + return "-".join(x.decode("ascii").split("_")[:-1]) + def read_raw_entries(self): samples = [] for subset in ["ani_md", "drugbank", "gdb7_9", "gdb10_13", "s66x8", "tripeptides"]: diff --git a/src/openqdc/datasets/iso_17.py b/src/openqdc/datasets/iso_17.py index 735ae67..4553ec1 100644 --- a/src/openqdc/datasets/iso_17.py +++ b/src/openqdc/datasets/iso_17.py @@ -43,6 +43,12 @@ class ISO17(BaseDataset): __distance_unit__ = "bohr" # bohr __forces_unit__ = "ev/bohr" + def __smiles_converter__(self, x): + """util function to convert string to smiles: useful if the smiles is + encoded in a different format than its display format + """ + return "-".join(x.decode("ascii").split("_")[:-1]) + def read_raw_entries(self): raw_path = p_join(self.root, "iso_17.h5") samples = read_qc_archive_h5(raw_path, "iso_17", self.energy_target_names, self.force_target_names) diff --git a/src/openqdc/datasets/nabladft.py b/src/openqdc/datasets/nabladft.py index e7d9eb8..0555cdc 100644 --- 
a/src/openqdc/datasets/nabladft.py +++ b/src/openqdc/datasets/nabladft.py @@ -4,30 +4,32 @@ import datamol as dm import numpy as np -from tqdm import tqdm +import pandas as pd from openqdc.datasets.base import BaseDataset +from openqdc.utils.molecule import z_to_formula from openqdc.utils.package_utils import requires_package -def to_mol(entry) -> Dict[str, np.ndarray]: +def to_mol(entry, metadata) -> Dict[str, np.ndarray]: Z, R, E, F = entry[:4] C = np.zeros_like(Z) + E[0] = metadata["DFT TOTAL ENERGY"] res = dict( atomic_inputs=np.concatenate((Z[:, None], C[:, None], R), axis=-1).astype(np.float32), - name=np.array([""]), + name=np.array([metadata["SMILES"]]), energies=E[:, None].astype(np.float32), forces=F[:, :, None].astype(np.float32), n_atoms=np.array([Z.shape[0]], dtype=np.int32), - subset=np.array(["nabla"]), + subset=np.array([z_to_formula(Z)]), ) return res @requires_package("nablaDFT") -def read_chunk_from_db(raw_path, start_idx, stop_idx, step_size=1000): +def read_chunk_from_db(raw_path, start_idx, stop_idx, labels, step_size=1000): from nablaDFT.dataset import HamiltonianDatabase print(f"Loading from {start_idx} to {stop_idx}") @@ -35,7 +37,13 @@ def read_chunk_from_db(raw_path, start_idx, stop_idx, step_size=1000): idxs = list(np.arange(start_idx, stop_idx)) n, s = len(idxs), step_size - samples = [to_mol(entry) for i in tqdm(range(0, n, s)) for entry in db[idxs[i : i + s]]] + cursor = db._get_connection().cursor() + data_idxs = cursor.execute("""SELECT * FROM dataset_ids WHERE id IN (""" + str(idxs)[1:-1] + ")").fetchall() + c_idxs = [tuple(x[1:]) for x in data_idxs] + + samples = [ + to_mol(entry, labels[c_idxs[i + j]]) for i in range(0, n, s) for j, entry in enumerate(db[idxs[i : i + s]]) + ] return samples @@ -68,12 +76,16 @@ class NablaDFT(BaseDataset): def read_raw_entries(self): from nablaDFT.dataset import HamiltonianDatabase + label_path = p_join(self.root, "summary.csv") + df = pd.read_csv(label_path, usecols=["MOSES id", "CONFORMER id", 
"SMILES", "DFT TOTAL ENERGY"]) + labels = df.set_index(keys=["MOSES id", "CONFORMER id"]).to_dict("index") + raw_path = p_join(self.root, "dataset_full.db") train = HamiltonianDatabase(raw_path) n, c = len(train), 20 step_size = int(np.ceil(n / os.cpu_count())) - fn = lambda i: read_chunk_from_db(raw_path, i * step_size, min((i + 1) * step_size, n)) + fn = lambda i: read_chunk_from_db(raw_path, i * step_size, min((i + 1) * step_size, n), labels=labels) samples = dm.parallelized( fn, list(range(c)), n_jobs=c, progress=False, scheduler="threads" ) # don't use more than 1 job diff --git a/src/openqdc/datasets/pcqm.py b/src/openqdc/datasets/pcqm.py index 73f1c68..d1a344c 100644 --- a/src/openqdc/datasets/pcqm.py +++ b/src/openqdc/datasets/pcqm.py @@ -1,4 +1,3 @@ -import gzip import json import os import pickle as pkl @@ -90,7 +89,11 @@ def preprocess_path(self): def collate_list(self, list_entries): predicat = list_entries is not None and len(list_entries) > 0 list_entries = [x for x in list_entries if x is not None] - return super().collate_list(list_entries) if predicat else None + if predicat: + res = super().collate_list(list_entries) + else: + res = None + return res def read_raw_entries(self): arxiv_paths = glob(p_join(self.root, f"{self.__energy_methods__[0]}", "*.pkl")) @@ -99,8 +102,8 @@ def read_raw_entries(self): samples = [x for x in samples if x is not None] return samples - def preprocess(self): - if not self.is_preprocessed(): + def preprocess(self, overwrite=False): + if overwrite or not self.is_preprocessed(): logger.info("Preprocessing data and saving it to cache.") logger.info( f"Dataset {self.__name__} data with the following units:\n" @@ -132,13 +135,19 @@ def collate_and_save_list(self, list_entries): push_remote(local_path, overwrite=True) # save smiles and subset - tmp = dict() - local_path = p_join(self.preprocess_path, "props.pkl.gz") - for key in ["name", "subset"]: - local_path = p_join(self.preprocess_path, f"{key}.npz") - tmp[key] = [el 
for i in range(len(list_entries)) for el in list_entries[i].pop(key)] - with gzip.open(local_path, "wb") as f: - pkl.dump(x, f) + tmp, n = dict(name=[]), len(list_entries) + local_path = p_join(self.preprocess_path, "props.pkl") + names = [list_entries[i].pop("name") for i in range(n)] + f = lambda xs: [dm.to_inchikey(x) for x in xs] + res = dm.parallelized(f, names, n_jobs=-1, progress=False) + for x in res: + tmp["name"] += x + for key in ["subset", "n_atoms"]: + tmp[key] = [] + for i in range(n): + tmp[key] += list(list_entries[i].pop(key)) + with open(local_path, "wb") as f: + pkl.dump(tmp, f) push_remote(local_path, overwrite=True) # for key in ["name", "subset"]: # local_path = p_join(self.preprocess_path, f"{key}.npz") diff --git a/src/openqdc/datasets/sn2_rxn.py b/src/openqdc/datasets/sn2_rxn.py index 3e75e91..abcbd62 100644 --- a/src/openqdc/datasets/sn2_rxn.py +++ b/src/openqdc/datasets/sn2_rxn.py @@ -25,8 +25,38 @@ class SN2RXN(BaseDataset): "DSD-BLYP-D3(BJ):def2-TZVP Gradient", ] + def __smiles_converter__(self, x): + """util function to convert string to smiles: useful if the smiles is + encoded in a different format than its display format + """ + return "-".join(x.decode("ascii").split("_")[:-1]) + def read_raw_entries(self): raw_path = p_join(self.root, "sn2_rxn.h5") + + # raw_path = p_join(self.root, "sn2_reactions.npz") + # data = np.load(raw_path) + + # # as example for accessing individual entries, print the data for entry idx=0 + # idx = 0 + # print("Data for entry " + str(idx)+":") + # print("Number of atoms") + # print(data["N"][idx]) + # print("Energy [eV]") + # print(data["E"][idx]) + # print("Total charge") + # print(data["Q"][idx]) + # print("Dipole moment vector (with respect to [0.0 0.0 0.0]) [eA]") + # print(data["D"][idx,:]) + # print("Nuclear charges") + # print(data["Z"][idx,:data["N"][idx]]) + # print("Cartesian coordinates [A]") + # print(data["R"][idx,:data["N"][idx],:]) + # print("Forces [eV/A]") + # 
print(data["F"][idx,:data["N"][idx],:]) + + # exit() + samples = read_qc_archive_h5(raw_path, "sn2_rxn", self.energy_target_names, self.force_target_names) return samples diff --git a/src/openqdc/datasets/solvated_peptides.py b/src/openqdc/datasets/solvated_peptides.py index 9846bdf..216ecdd 100644 --- a/src/openqdc/datasets/solvated_peptides.py +++ b/src/openqdc/datasets/solvated_peptides.py @@ -27,6 +27,12 @@ class SolvatedPeptides(BaseDataset): __distance_unit__ = "bohr" __forces_unit__ = "hartree/bohr" + def __smiles_converter__(self, x): + """util function to convert string to smiles: useful if the smiles is + encoded in a different format than its display format + """ + return "_".join(x.decode("ascii").split("_")[:-1]) + def read_raw_entries(self): raw_path = p_join(self.root, "solvated_peptides.h5") samples = read_qc_archive_h5(raw_path, "solvated_peptides", self.energy_target_names, self.force_target_names) diff --git a/src/openqdc/raws/config_factory.py b/src/openqdc/raws/config_factory.py index 38bec86..c8dddba 100644 --- a/src/openqdc/raws/config_factory.py +++ b/src/openqdc/raws/config_factory.py @@ -37,7 +37,7 @@ class DataConfigFactory: sn2_rxn = dict( dataset_name="sn2_rxn", - links={"sn2_rxn.hdf5.gz": "https://zenodo.org/record/3585800/files/212.hdf5.gz"}, + links={"sn2_rxn.hdf5.gz": "https://zenodo.org/records/2605341/files/sn2_reactions.npz"}, ) # FROM: https://sites.uw.edu/wdbase/database-of-water-clusters/ diff --git a/src/openqdc/utils/atomization_energies.py b/src/openqdc/utils/atomization_energies.py index 40d0d13..6a1a638 100644 --- a/src/openqdc/utils/atomization_energies.py +++ b/src/openqdc/utils/atomization_energies.py @@ -2,124 +2,126 @@ import numpy as np from loguru import logger +from rdkit import Chem from openqdc.utils.constants import MAX_ATOMIC_NUMBER +atom_table = Chem.GetPeriodicTable() + __all__ = ["chemical_symbols", "atomic_numbers", "IsolatedAtomEnergyFactory"] EF_KEY: TypeAlias = Tuple[str, int] -ATOM_SPECIES = "H", "Li", 
"B", "C", "N", "O", "F", "Na", "Mg", "Si", "P", "S", "Cl", "K", "Ca", "Br", "I" -# Energy in atomic unit/ Hartree / Ang - # didn t calculate for Pd, Pt, Mo, Ni, Fe, Cu, see DESS atomic_numbers = {} -chemical_symbols = [ - "X", - "H", - "He", - "Li", - "Be", - "B", - "C", - "N", - "O", - "F", - "Ne", - "Na", - "Mg", - "Al", - "Si", - "P", - "S", - "Cl", - "Ar", - "K", - "Ca", - "Sc", - "Ti", - "V", - "Cr", - "Mn", - "Fe", - "Co", - "Ni", - "Cu", - "Zn", - "Ga", - "Ge", - "As", - "Se", - "Br", - "Kr", - "Rb", - "Sr", - "Y", - "Zr", - "Nb", - "Mo", - "Tc", - "Ru", - "Rh", - "Pd", - "Ag", - "Cd", - "In", - "Sn", - "Sb", - "Te", - "I", - "Xe", - "Cs", - "Ba", - "La", - "Ce", - "Pr", - "Nd", - "Pm", - "Sm", - "Eu", - "Gd", - "Tb", - "Dy", - "Ho", - "Er", - "Tm", - "Yb", - "Lu", - "Hf", - "Ta", - "W", - "Re", - "Os", - "Ir", - "Pt", - "Au", - "Hg", - "Tl", - "Pb", - "Bi", - "Po", - "At", - "Rn", - "Fr", - "Ra", - "Ac", - "Th", - "Pa", - "U", - "Np", - "Pu", - "Am", - "Cm", - "Bk", - "Cf", - "Es", - "Fm", - "Md", - "No", - "Lr", -] +chemical_symbols = np.array( + [ + "X", + "H", + "He", + "Li", + "Be", + "B", + "C", + "N", + "O", + "F", + "Ne", + "Na", + "Mg", + "Al", + "Si", + "P", + "S", + "Cl", + "Ar", + "K", + "Ca", + "Sc", + "Ti", + "V", + "Cr", + "Mn", + "Fe", + "Co", + "Ni", + "Cu", + "Zn", + "Ga", + "Ge", + "As", + "Se", + "Br", + "Kr", + "Rb", + "Sr", + "Y", + "Zr", + "Nb", + "Mo", + "Tc", + "Ru", + "Rh", + "Pd", + "Ag", + "Cd", + "In", + "Sn", + "Sb", + "Te", + "I", + "Xe", + "Cs", + "Ba", + "La", + "Ce", + "Pr", + "Nd", + "Pm", + "Sm", + "Eu", + "Gd", + "Tb", + "Dy", + "Ho", + "Er", + "Tm", + "Yb", + "Lu", + "Hf", + "Ta", + "W", + "Re", + "Os", + "Ir", + "Pt", + "Au", + "Hg", + "Tl", + "Pb", + "Bi", + "Po", + "At", + "Rn", + "Fr", + "Ra", + "Ac", + "Th", + "Pa", + "U", + "Np", + "Pu", + "Am", + "Cm", + "Bk", + "Cf", + "Es", + "Fm", + "Md", + "No", + "Lr", + ] +) for Z, symbol in enumerate(chemical_symbols): @@ -131,7 +133,7 @@ class IsolatedAtomEnergyFactory: 
Factory method to get the isolated atom energies for a given level of theory. """ - max_charge = 4 + max_charge = 6 def __init__(self): pass @@ -207,7 +209,14 @@ def get_matrix(level_of_theory: str) -> np.ndarray: if tuple_hashmap is None: return matrix for key in tuple_hashmap.keys(): - matrix[atomic_numbers[key[0]], key[1] + shift] = tuple_hashmap[key] + try: + matrix[atomic_numbers[key[0]], key[1] + shift] = tuple_hashmap[key] + except KeyError: + print(key, list(tuple_hashmap.items())) + print(key[0], "?", key[1], "?", shift) + print(matrix.shape, atomic_numbers[key[0]], key[1] + shift) + logger.warning(f"Isolated atom energies not found for {key} and level of theory {level_of_theory}") + matrix[atomic_numbers[key[0]], key[1] + shift] = 0 return matrix diff --git a/src/openqdc/utils/molecule.py b/src/openqdc/utils/molecule.py index 8bd2b83..82a58d2 100644 --- a/src/openqdc/utils/molecule.py +++ b/src/openqdc/utils/molecule.py @@ -4,9 +4,19 @@ from numpy import ndarray from rdkit import Chem +from openqdc.utils.atomization_energies import chemical_symbols + atom_table = Chem.GetPeriodicTable() +def z_to_formula(z): + u, c = np.unique(z, return_counts=True) + idxs = np.argsort(u) + u, c = u[idxs], c[idxs] + + return "".join([f"{chemical_symbols[u[i]]}{c[i] if c[i] > 1 else ''}" for i in range(len(u))]) + + def get_atomic_number(mol: Chem.Mol): """Returns atomic numbers for rdkit molecule""" return np.array([atom.GetAtomicNum() for atom in mol.GetAtoms()]) diff --git a/src/openqdc/utils/preprocess.py b/src/openqdc/utils/preprocess.py index 1142dca..b34499e 100644 --- a/src/openqdc/utils/preprocess.py +++ b/src/openqdc/utils/preprocess.py @@ -36,9 +36,11 @@ def preprocess(dataset): if dataset not in options_map: dataset_id = int(dataset) + data_class = options[dataset_id] + else: + data_class = options_map[dataset] - data_class = options[dataset_id] - data_class().preprocess() + data_class().preprocess(overwrite=False) data = data_class() 
logger.info(f"Preprocessing {data.__name__}") @@ -47,7 +49,7 @@ def preprocess(dataset): x = data[i] print(x.name, x.subset, end=" ") for k in x: - if x[k] is not None: + if isinstance(x[k], np.ndarray): print(k, x[k].shape, end=" ") print() diff --git a/src/openqdc/utils/units.py b/src/openqdc/utils/units.py index fb895ce..69c8972 100644 --- a/src/openqdc/utils/units.py +++ b/src/openqdc/utils/units.py @@ -73,3 +73,6 @@ def get_conversion(in_unit: str, out_unit: str): Conversion("hartree/ang", "kcal/mol/ang", lambda x: get_conversion("hartree", "kcal/mol")(x)) Conversion("hartree/ang", "hartree/bohr", lambda x: get_conversion("bohr", "ang")(x)) Conversion("hartree/bohr", "hartree/ang", lambda x: get_conversion("ang", "bohr")(x)) +Conversion("ev/bohr", "kcal/mol/ang", lambda x: get_conversion("ang", "bohr")(get_conversion("ev", "kcal/mol")(x))) +Conversion("kcal/mol/bohr", "kcal/mol/ang", lambda x: get_conversion("ang", "bohr")(x)) +Conversion("ev/ang", "kcal/mol/ang", lambda x: get_conversion("ev", "kcal/mol")(x)) From 2bf9050a4014bbc998829a4e8940305b65ade0c9 Mon Sep 17 00:00:00 2001 From: FNTwin Date: Tue, 28 Nov 2023 19:39:49 +0000 Subject: [PATCH 13/18] On the fly calculation --- src/openqdc/datasets/ani.py | 172 ---------------------- src/openqdc/datasets/base.py | 61 ++++++-- src/openqdc/datasets/comp6.py | 91 ------------ src/openqdc/datasets/dess.py | 50 ------- src/openqdc/datasets/gdml.py | 92 ------------ src/openqdc/datasets/geom.py | 22 --- src/openqdc/datasets/iso_17.py | 42 ------ src/openqdc/datasets/molecule3d.py | 22 --- src/openqdc/datasets/orbnet_denali.py | 22 --- src/openqdc/datasets/qm7x.py | 75 ---------- src/openqdc/datasets/qmugs.py | 22 --- src/openqdc/datasets/sn2_rxn.py | 3 - src/openqdc/datasets/solvated_peptides.py | 44 ------ src/openqdc/datasets/spice.py | 37 ----- src/openqdc/datasets/tmqm.py | 22 --- 15 files changed, 52 insertions(+), 725 deletions(-) diff --git a/src/openqdc/datasets/ani.py b/src/openqdc/datasets/ani.py index 
23c890f..3f1b92b 100644 --- a/src/openqdc/datasets/ani.py +++ b/src/openqdc/datasets/ani.py @@ -1,10 +1,7 @@ import os from os.path import join as p_join -from numpy import array, float32 - from openqdc.datasets.base import BaseDataset, read_qc_archive_h5 -from openqdc.utils.constants import NOT_DEFINED from openqdc.utils.io import get_local_cache @@ -37,7 +34,6 @@ class ANI1(BaseDataset): __energy_unit__ = "hartree" __distance_unit__ = "bohr" __forces_unit__ = "hartree/bohr" - __average_nb_atoms__ = 15.91676229984414 @property def root(self): @@ -60,25 +56,6 @@ def read_raw_entries(self): samples = read_qc_archive_h5(raw_path, self.__name__, self.energy_target_names, self.force_target_names) return samples - @property - def _stats(self): - return { - "formation": { - "energy": { - "mean": self.convert_energy(array([-2.37376472])), - "std": self.convert_energy(array([0.50266975])), - }, - "forces": NOT_DEFINED, - }, - "total": { - "energy": { - "mean": self.convert_energy(array([-333.67322], dtype=float32)), - "std": self.convert_energy(array([61.21667], dtype=float32)), - }, - "forces": NOT_DEFINED, - }, - } - class ANI1CCX(ANI1): """ @@ -100,7 +77,6 @@ class ANI1CCX(ANI1): __energy_unit__ = "hartree" __distance_unit__ = "ang" __forces_unit__ = "hartree/ang" - __average_nb_atoms__ = 15.274685315870588 __energy_methods__ = [ "ccsd(t)/cbs", @@ -119,25 +95,6 @@ class ANI1CCX(ANI1): __force_methods__ = [] force_target_names = [] - @property - def _stats(self): - return { - "formation": { - "energy": { - "mean": self.convert_energy(array([-3.23959548, 500.30384627, 500.40706776, 500.76740432])), - "std": self.convert_energy(array([1.03021261, 132.52707152, 132.56092469, 132.65261362])), - }, - "forces": NOT_DEFINED, - }, - "total": { - "energy": { - "mean": self.convert_energy(array([-374.40665, -1.2378153, -1.505962, -1.2396905], dtype=float32)), - "std": self.convert_energy(array([101.63995, 0.32444745, 0.39500558, 0.3250212], dtype=float32)), - }, - "forces": 
NOT_DEFINED, - }, - } - def __smiles_converter__(self, x): """util function to convert string to smiles: useful if the smiles is encoded in a different format than its display format @@ -198,135 +155,6 @@ class ANI1X(ANI1): "wb97x/cc-pvtz", ] - __average_nb_atoms__ = 15.274685315870588 - - @property - def _stats(self): - return { - "formation": { - "energy": { - "mean": self.convert_energy( - array( - [ - -2.87910686, - -2.91460298, - -2.91182519, - 500.00748497, - 500.27885605, - 500.11130961, - -3.66090173, - -4.40643278, - ] - ) - ), - "std": self.convert_energy( - array( - [ - 0.92849657, - 0.93421854, - 0.93411345, - 132.44580372, - 132.52326771, - 132.47987395, - 1.60180792, - 1.75414812, - ] - ) - ), - }, - "forces": { - "mean": self.convert_forces(array([-6.139757e-06])), - "std": self.convert_forces(array([0.07401004])), - "components": { - "mean": self.convert_forces( - array( - [ - [6.6829815e-13, 3.5682501e-07], - [-5.1223647e-13, -1.8487021e-06], - [8.1159564e-13, -3.6849189e-05], - ], - dtype=float32, - ) - ), - "std": self.convert_forces( - array( - [[0.0759203, 0.06799112], [0.07694941, 0.06652647], [0.06229663, 0.05442103]], - dtype=float32, - ) - ), - "rms": self.convert_forces( - array( - [[0.0759203, 0.06799113], [0.07694941, 0.06652647], [0.06229663, 0.05442095]], - dtype=float32, - ) - ), - }, - }, - }, - "total": { - "energy": { - "mean": self.convert_energy( - array( - [ - -372.68945, - -372.74274, - -372.7326, - -1.1540408, - -1.5152899, - -1.4195863, - -392.72458, - -391.208, - ], - dtype=float32, - ) - ), - "std": self.convert_energy( - array( - [ - 101.166664, - 101.19915, - 101.191895, - 0.30445468, - 0.39988872, - 0.37456134, - 136.79112, - 137.48692, - ], - dtype=float32, - ) - ), - }, - "forces": { - "mean": self.convert_forces(array([-6.139757e-06])), - "std": self.convert_forces(array([0.07401004])), - "components": { - "mean": self.convert_forces( - array( - [ - [6.6829815e-13, 3.5682501e-07], - [-5.1223647e-13, -1.8487021e-06], - 
[8.1159564e-13, -3.6849189e-05], - ], - dtype=float32, - ) - ), - "std": self.convert_forces( - array( - [[0.0759203, 0.06799112], [0.07694941, 0.06652647], [0.06229663, 0.05442103]], - dtype=float32, - ) - ), - "rms": self.convert_forces( - array( - [[0.0759203, 0.06799113], [0.07694941, 0.06652647], [0.06229663, 0.05442095]], - dtype=float32, - ) - ), - }, - }, - }, - } - def convert_forces(self, x): return super().convert_forces(x) * 0.529177249 # correct the Dataset error diff --git a/src/openqdc/datasets/base.py b/src/openqdc/datasets/base.py index 34f2ac9..6c939e7 100644 --- a/src/openqdc/datasets/base.py +++ b/src/openqdc/datasets/base.py @@ -15,7 +15,11 @@ IsolatedAtomEnergyFactory, chemical_symbols, ) -from openqdc.utils.constants import NB_ATOMIC_FEATURES, POSSIBLE_NORMALIZATION +from openqdc.utils.constants import ( + NB_ATOMIC_FEATURES, + NOT_DEFINED, + POSSIBLE_NORMALIZATION, +) from openqdc.utils.exceptions import ( PROPERTY_NOT_AVAILABLE_ERROR, DatasetNotAvailableError, @@ -91,6 +95,7 @@ class BaseDataset(torch.utils.data.Dataset): __fn_distance__ = lambda x: x __fn_forces__ = lambda x: x __average_nb_atoms__ = None + __stats__ = {} def __init__( self, @@ -107,14 +112,52 @@ def __init__( else: self.read_preprocess(overwrite_local_cache=overwrite_local_cache) self._set_isolated_atom_energies() + self._precompute_statistics() + + def _precompute_statistics(self): + logger.info("Precomputing relevant statistics") + self._compute_average_nb_atoms() + (formation_E_mean, formation_E_std, total_E_mean, total_E_std) = self._precompute_E() + forces_dict = self._precompute_F() + self.__stats__ = { + "formation": {"energy": {"mean": formation_E_mean, "std": formation_E_std}, "forces": forces_dict}, + "total": {"energy": {"mean": total_E_mean, "std": total_E_std}, "forces": forces_dict}, + } - def _download(self): - try: - self.read_preprocess(overwrite_local_cache=True) - if not self.is_preprocessed(): - raise Exception - except Exception: - raise 
DatasetNotAvailableError(self.__name__) + def _compute_average_nb_atoms(self): + self.__average_nb_atoms__ = np.mean(self.data["n_atoms"]) + + def _precompute_E(self): + splits_idx = self.data["position_idx_range"][:, 1] + s = np.array(self.data["atomic_inputs"][:, :2], dtype=int) + s[:, 1] += IsolatedAtomEnergyFactory.max_charge + matrixs = [matrix[s[:, 0], s[:, 1]] for matrix in self.__isolated_atom_energies__] + matrixs = [np.split(matrix, splits_idx)[:-1] for matrix in matrixs] + converted_energy_data = self.convert_energy(self.data["energies"]) + # calculation per molecule formation energy statistics + e = [] + for i in range(len(self.__energy_methods__)): + e.append(converted_energy_data[:, i] - np.array(list(map(lambda x: x.sum(), matrixs[i])))) + E = np.array(e).T + formation_E_mean = np.nanmean(E, axis=0) + formation_E_std = np.nanstd(E, axis=0) + total_E_mean = np.nanmean(converted_energy_data, axis=0) + total_E_std = np.nanstd(converted_energy_data, axis=0) + + return formation_E_mean, formation_E_std, total_E_mean, total_E_std + + def _precompute_F(self): + if len(self.__force_methods__) == 0: + return NOT_DEFINED + converted_force_data = self.convert_forces(self.data["forces"]) + force_mean = np.nanmean(converted_force_data, axis=0) + force_std = np.nanstd(converted_force_data, axis=0) + force_rms = np.sqrt(np.nanmean(converted_force_data**2, axis=0)) + return { + "mean": force_mean, + "std": force_std, + "components": {"rms": force_rms, "std": force_std.mean(axis=0), "mean": force_mean.mean(axis=0)}, + } @property def numbers(self): @@ -447,7 +490,7 @@ def __repr__(self): @property def _stats(self): - return {} + return self.__stats__ @property def average_n_atoms(self): diff --git a/src/openqdc/datasets/comp6.py b/src/openqdc/datasets/comp6.py index 4641d7f..7b6890b 100644 --- a/src/openqdc/datasets/comp6.py +++ b/src/openqdc/datasets/comp6.py @@ -1,7 +1,5 @@ from os.path import join as p_join -from numpy import array, float32, nan - from 
openqdc.datasets.base import BaseDataset, read_qc_archive_h5 @@ -61,95 +59,6 @@ class COMP6(BaseDataset): "Gradient", ] - __average_nb_atoms__ = 25.74051563378753 - - @property - def _stats(self): - return { - "formation": { - "energy": { - "mean": self.convert_energy( - array( - [ - -2579.52016333, - -2543.74519203, - -354694.46157991, - -2506.4300631, - -2616.89224817, - -3157.54118509, - -354031.62984212, - nan, - nan, - ] - ) - ), - "std": self.convert_energy( - array( - [ - 1811.03171965, - 1471.95818836, - 201545.89189168, - 1385.50993753, - 1456.09915473, - 1728.51133182, - 208097.95666257, - nan, - nan, - ] - ) - ), - }, - "forces": { - "mean": self.convert_forces(6.6065984e-13), - "std": self.convert_forces(0.056459695), - "components": { - "mean": self.convert_forces( - array([[-4.1767219e-13], [1.0024132e-12], [-9.4386771e-13]], dtype=float32) - ), - "std": self.convert_forces(array([[0.05781676], [0.05793402], [0.05330585]], dtype=float32)), - "rms": self.convert_forces(array([[0.05781676], [0.05793402], [0.05330585]], dtype=float32)), - }, - }, - }, - "total": { - "energy": { - "mean": self.convert_energy( - array( - [ - -360972.16, - -354729.66, - -354699.38, - -349555.7, - -351555.97, - -351530.44, - -354027.8, - nan, - nan, - ], - dtype=float32, - ), - ), - "std": self.convert_energy( - array( - [254766.0, 201559.77, 201537.8, 188725.47, 191028.78, 191016.1, 208089.4, nan, nan], - dtype=float32, - ), - ), - }, - "forces": { - "mean": self.convert_forces(array([6.6065984e-13])), - "std": self.convert_forces(array([0.056459695])), - "components": { - "mean": self.convert_forces( - array([[-4.1767219e-13], [1.0024132e-12], [-9.4386771e-13]], dtype=float32) - ), - "std": self.convert_forces(array([[0.05781676], [0.05793402], [0.05330585]], dtype=float32)), - "rms": self.convert_forces(array([[0.05781676], [0.05793402], [0.05330585]], dtype=float32)), - }, - }, - }, - } - def __smiles_converter__(self, x): """util function to convert string to smiles: 
useful if the smiles is encoded in a different format than its display format diff --git a/src/openqdc/datasets/dess.py b/src/openqdc/datasets/dess.py index 6651bf7..80b1e1c 100644 --- a/src/openqdc/datasets/dess.py +++ b/src/openqdc/datasets/dess.py @@ -3,11 +3,9 @@ import datamol as dm import numpy as np import pandas as pd -from numpy import array, float32 from tqdm import tqdm from openqdc.datasets.base import BaseDataset -from openqdc.utils.constants import NOT_DEFINED from openqdc.utils.molecule import get_atomic_number_and_charge @@ -63,54 +61,6 @@ class DESS(BaseDataset): # ['qz_MP2_all', 'tz_MP2_all', 'cbs_MP2_all', 'sapt_all', 'nn_CCSD(T)_all'] partitions = ["DES370K", "DES5M"] - __average_nb_atoms__ = 18.944905540514252 - - @property - def _stats(self): - return { - "formation": { - "energy": { - "mean": array( - [ - 1.55021905e03, - 7.85244098e02, - 7.85022111e02, - 7.84864329e02, - 1.55072999e03, - 1.56925354e03, - -7.76931930e-01, - -2.29679257e00, - ] - ), - "std": array( - [ - 1.89341080e03, - 9.68462074e02, - 9.68422393e02, - 9.70421004e02, - 1.89394888e03, - 1.91531274e03, - 1.38230279e00, - 1.27589212e01, - ] - ), - }, - "forces": NOT_DEFINED, - }, - "total": { - "energy": { - "mean": array( - [-0.8121756, 4.1286583, 4.3558254, 3.9754286, -0.66178626, -1.0762144, 4.4547276, 3.4555712], - dtype=float32, - ), - "std": array( - [11.138448, 17.800674, 18.148039, 17.567768, 11.290686, 11.005155, 18.162294, 16.579735], - dtype=float32, - ), - }, - "forces": NOT_DEFINED, - }, - } def _read_raw_(self, part): df = pd.read_csv(p_join(self.root, f"{part}.csv")) diff --git a/src/openqdc/datasets/gdml.py b/src/openqdc/datasets/gdml.py index 73bae17..e40b3fa 100644 --- a/src/openqdc/datasets/gdml.py +++ b/src/openqdc/datasets/gdml.py @@ -1,7 +1,5 @@ from os.path import join as p_join -from numpy import array, float32 - from openqdc.datasets.base import BaseDataset, read_qc_archive_h5 @@ -62,99 +60,9 @@ class GDML(BaseDataset): __energy_unit__ = "kcal/mol" 
__distance_unit__ = "bohr" __forces_unit__ = "kcal/mol/bohr" - __average_nb_atoms__ = 13.00299550 def read_raw_entries(self): raw_path = p_join(self.root, "gdml.h5") samples = read_qc_archive_h5(raw_path, "gdml", self.energy_target_names, self.force_target_names) return samples - - @property - def _stats(self): - return { - "formation": { - "energy": { - "mean": self.convert_energy(array([-2466.00011563, -1213.94691714, -1916.02068252])), - "std": self.convert_energy(array([6.65779492, 310.70204248, 729.2143015])), - }, - "forces": { - "mean": self.convert_forces(array(-1.42346325e-05)), - "std": self.convert_forces(array(27.009315)), - "components": { - "mean": self.convert_forces( - array( - [ - [-8.3862792e-09, -1.9758134e-07, -7.7199416e-05], - [-2.7550591e-09, -1.9665436e-08, 5.3315878e-05], - [-7.5688439e-10, 5.6149121e-09, -1.8894127e-05], - ], - dtype=float32, - ) - ), - "std": self.convert_forces( - array( - [ - [31.060509, 29.168474, 27.547812], - [31.365385, 26.67319, 26.068623], - [31.024155, 27.272366, 22.33925], - ], - dtype=float32, - ) - ), - "rms": self.convert_forces( - array( - [ - [31.060509, 29.168474, 27.547802], - [31.365385, 26.67319, 26.068628], - [31.024155, 27.272366, 22.33925], - ], - dtype=float32, - ) - ), - }, - }, - }, - "total": { - "energy": { - "mean": self.convert_energy(array([-405688.28, -141134.3, -194075.56], dtype=float32)), - "std": self.convert_energy(array([7.2360396e00, 3.0755928e04, 8.4138445e04], dtype=float32)), - }, - "forces": { - "mean": self.convert_forces(array(-1.42346325e-05)), - "std": self.convert_forces(array(27.009315)), - "components": { - "mean": self.convert_forces( - array( - [ - [-8.3862792e-09, -1.9758134e-07, -7.7199416e-05], - [-2.7550591e-09, -1.9665436e-08, 5.3315878e-05], - [-7.5688439e-10, 5.6149121e-09, -1.8894127e-05], - ], - dtype=float32, - ) - ), - "std": self.convert_forces( - array( - [ - [31.060509, 29.168474, 27.547812], - [31.365385, 26.67319, 26.068623], - [31.024155, 27.272366, 
22.33925], - ], - dtype=float32, - ) - ), - "rms": self.convert_forces( - array( - [ - [31.060509, 29.168474, 27.547802], - [31.365385, 26.67319, 26.068628], - [31.024155, 27.272366, 22.33925], - ], - dtype=float32, - ) - ), - }, - }, - }, - } diff --git a/src/openqdc/datasets/geom.py b/src/openqdc/datasets/geom.py index 897b828..c016a9f 100644 --- a/src/openqdc/datasets/geom.py +++ b/src/openqdc/datasets/geom.py @@ -3,11 +3,9 @@ import datamol as dm import numpy as np -from numpy import array from openqdc.datasets.base import BaseDataset from openqdc.utils import load_json, load_pkl -from openqdc.utils.constants import NOT_DEFINED from openqdc.utils.molecule import get_atomic_number_and_charge @@ -86,29 +84,9 @@ class GEOM(BaseDataset): energy_target_names = ["gfn2_xtb.energy"] force_target_names = [] - __average_nb_atoms__ = 50.07244573677837 partitions = ["qm9", "drugs"] - @property - def _stats(self): - return { - "formation": { - "energy": { - "mean": self.convert_energy(array([-8.47811605])), - "std": self.convert_energy(array([3.68015507])), - }, - "forces": NOT_DEFINED, - }, - "total": { - "energy": { - "mean": self.convert_energy(array([-78.264725])), - "std": self.convert_energy(array([20.200787])), - }, - "forces": NOT_DEFINED, - }, - } - def _read_raw_(self, partition): raw_path = p_join(self.root, "rdkit_folder") diff --git a/src/openqdc/datasets/iso_17.py b/src/openqdc/datasets/iso_17.py index 1d0b39b..4553ec1 100644 --- a/src/openqdc/datasets/iso_17.py +++ b/src/openqdc/datasets/iso_17.py @@ -1,7 +1,5 @@ from os.path import join as p_join -from numpy import array, float32 - from openqdc.datasets.base import BaseDataset, read_qc_archive_h5 @@ -44,46 +42,6 @@ class ISO17(BaseDataset): __energy_unit__ = "ev" __distance_unit__ = "bohr" # bohr __forces_unit__ = "ev/bohr" - __average_nb_atoms__ = 19.0 - - @property - def _stats(self): - return { - "formation": { - "energy": { - "mean": self.convert_energy(array([-103.58336533])), - "std": 
self.convert_energy(array([0.79709836])), - }, - "forces": { - "mean": self.convert_forces(array([-1.2548699e-11])), - "std": self.convert_forces(array([1.1287293])), - "components": { - "mean": self.convert_forces( - array([[-2.7712117e-11], [-1.8989450e-12], [3.9721233e-11]], dtype=float32) - ), - "std": self.convert_forces(array([[1.1013116], [1.1273879], [1.1195794]], dtype=float32)), - "rms": self.convert_forces(array([[1.1013116], [1.1273879], [1.1195794]], dtype=float32)), - }, - }, - }, - "total": { - "energy": { - "mean": self.convert_energy(array([-11503.619]), dtype=float32), - "std": self.convert_energy(array([0.79709935]), dtype=float32), - }, - "forces": { - "mean": self.convert_forces(array([-1.2548699e-11])), - "std": self.convert_forces(array([1.1287293])), - "components": { - "mean": self.convert_forces( - array([[-2.7712117e-11], [-1.8989450e-12], [3.9721233e-11]], dtype=float32) - ), - "std": self.convert_forces(array([[1.1013116], [1.1273879], [1.1195794]], dtype=float32)), - "rms": self.convert_forces(array([[1.1013116], [1.1273879], [1.1195794]], dtype=float32)), - }, - }, - }, - } def __smiles_converter__(self, x): """util function to convert string to smiles: useful if the smiles is diff --git a/src/openqdc/datasets/molecule3d.py b/src/openqdc/datasets/molecule3d.py index 3887499..dc47e53 100644 --- a/src/openqdc/datasets/molecule3d.py +++ b/src/openqdc/datasets/molecule3d.py @@ -5,12 +5,10 @@ import datamol as dm import numpy as np import pandas as pd -from numpy import array, float32 from rdkit import Chem from tqdm import tqdm from openqdc.datasets.base import BaseDataset -from openqdc.utils.constants import NOT_DEFINED from openqdc.utils.molecule import get_atomic_number_and_charge @@ -91,7 +89,6 @@ class Molecule3D(BaseDataset): __forces_unit__ = "ev/ang" energy_target_names = ["b3lyp/6-31g*.energy"] - __average_nb_atoms__ = 29.111696292432697 def read_raw_entries(self): raw = p_join(self.root, "data", "raw") @@ -102,22 +99,3 @@ def 
read_raw_entries(self): res = dm.parallelized(fn, sdf_paths, n_jobs=1) # don't use more than 1 job samples = sum(res, []) return samples - - @property - def _stats(self): - return { - "formation": { - "energy": { - "mean": self.convert_energy(array([-191.66717791])), - "std": self.convert_energy(array([2005.52732443])), - }, - "forces": NOT_DEFINED, - }, - "total": { - "energy": { - "mean": self.convert_energy(array([-21100.502], dtype=float32)), - "std": self.convert_energy(array([9345.366], dtype=float32)), - }, - "forces": NOT_DEFINED, - }, - } diff --git a/src/openqdc/datasets/orbnet_denali.py b/src/openqdc/datasets/orbnet_denali.py index e158410..614e252 100644 --- a/src/openqdc/datasets/orbnet_denali.py +++ b/src/openqdc/datasets/orbnet_denali.py @@ -4,10 +4,8 @@ import datamol as dm import numpy as np import pandas as pd -from numpy import array, float32 from openqdc.datasets.base import BaseDataset -from openqdc.utils.constants import NOT_DEFINED from openqdc.utils.molecule import atom_table @@ -59,7 +57,6 @@ class OrbnetDenali(BaseDataset): __energy_unit__ = "hartree" __distance_unit__ = "ang" __forces_unit__ = "hartree/ang" - __average_nb_atoms__ = 44.87857092772292 def read_raw_entries(self): label_path = p_join(self.root, "denali_labels.csv") @@ -80,22 +77,3 @@ def read_raw_entries(self): res = dm.parallelized(fn, list(labels.items()), scheduler="threads", n_jobs=-1, progress=True) samples = sum(res, []) return samples - - @property - def _stats(self): - return { - "formation": { - "energy": { - "mean": self.convert_energy(array([-13.87283487, -7.79357297])), - "std": self.convert_energy(array([75.34652971, 3.3274954])), - }, - "forces": NOT_DEFINED, - }, - "total": { - "energy": { - "mean": self.convert_energy(array([-1630.8182, -74.17469], dtype=float32)), - "std": self.convert_energy(array([929.9734, 19.751446], dtype=float32)), - }, - "forces": NOT_DEFINED, - }, - } diff --git a/src/openqdc/datasets/qm7x.py b/src/openqdc/datasets/qm7x.py index 
50603eb..eb8b015 100644 --- a/src/openqdc/datasets/qm7x.py +++ b/src/openqdc/datasets/qm7x.py @@ -1,7 +1,6 @@ from os.path import join as p_join import numpy as np -from numpy import array, float32 from tqdm import tqdm from openqdc.datasets.base import BaseDataset @@ -47,80 +46,6 @@ class QM7X(BaseDataset): __energy_unit__ = "ev" __distance_unit__ = "ang" __forces_unit__ = "ev/ang" - __average_nb_atoms__ = 16.84668721109399 - - @property - def _stats(self): - return { - "formation": { - "energy": { - "mean": self.convert_energy(array([-82.57984067, 372.52167714])), - "std": self.convert_energy(array([9.85675539, 39.76633713])), - }, - "forces": { - "mean": self.convert_forces(array([-1.1617619e-07])), - "std": self.convert_forces(array([1.1451852])), - "components": { - "mean": self.convert_forces( - array( - [ - [-7.1192130e-07, -6.0926320e-11], - [-4.3502279e-08, -3.7376963e-11], - [5.8300976e-08, 2.9215352e-11], - ], - dtype=float32, - ) - ), - "std": self.convert_forces( - array( - [[1.4721272, 0.00549965], [1.4861498, 0.00508684], [1.4812028, 0.00496012]], - dtype=float32, - ) - ), - "rms": self.convert_forces( - array( - [[1.4721272, 0.00549965], [1.4861498, 0.00508684], [1.4812028, 0.00496012]], - dtype=float32, - ) - ), - }, - }, - }, - "total": { - "energy": { - "mean": self.convert_energy(array([-8.6828701e03, -2.7446982e-01], dtype=float32)), - "std": self.convert_energy(array([1.4362784e03, 5.8798514e-02], dtype=float32)), - }, - "forces": { - "mean": self.convert_forces(array([-1.1617619e-07])), - "std": self.convert_forces(array([1.1451852])), - "components": { - "mean": self.convert_forces( - array( - [ - [-7.1192130e-07, -6.0926320e-11], - [-4.3502279e-08, -3.7376963e-11], - [5.8300976e-08, 2.9215352e-11], - ], - dtype=float32, - ) - ), - "std": self.convert_forces( - array( - [[1.4721272, 0.00549965], [1.4861498, 0.00508684], [1.4812028, 0.00496012]], - dtype=float32, - ) - ), - "rms": self.convert_forces( - array( - [[1.4721272, 0.00549965], 
[1.4861498, 0.00508684], [1.4812028, 0.00496012]], - dtype=float32, - ) - ), - }, - }, - }, - } def read_raw_entries(self): samples = [] diff --git a/src/openqdc/datasets/qmugs.py b/src/openqdc/datasets/qmugs.py index a793151..c75f8b5 100644 --- a/src/openqdc/datasets/qmugs.py +++ b/src/openqdc/datasets/qmugs.py @@ -4,10 +4,8 @@ import datamol as dm import numpy as np -from numpy import array from openqdc.datasets.base import BaseDataset -from openqdc.utils.constants import NOT_DEFINED from openqdc.utils.molecule import get_atomic_number_and_charge @@ -58,32 +56,12 @@ class QMugs(BaseDataset): __energy_unit__ = "hartree" __distance_unit__ = "ang" __forces_unit__ = "hartree/ang" - __average_nb_atoms__ = 55.215926293326426 energy_target_names = [ "GFN2:TOTAL_ENERGY", "DFT:TOTAL_ENERGY", ] - @property - def _stats(self): - return { - "formation": { - "energy": { - "mean": self.convert_energy(array([-12.94348027, -9.83037297])), - "std": self.convert_energy(array([4.39971409, 3.3574188])), - }, - "forces": NOT_DEFINED, - }, - "total": { - "energy": { - "mean": self.convert_energy(array([-89.44242, -1740.5336])), - "std": self.convert_energy(array([29.599571, 791.48663])), - }, - "forces": NOT_DEFINED, - }, - } - def read_raw_entries(self): raw_path = p_join(self.root, "structures") mol_dirs = [p_join(raw_path, d) for d in os.listdir(raw_path)] diff --git a/src/openqdc/datasets/sn2_rxn.py b/src/openqdc/datasets/sn2_rxn.py index e618699..abcbd62 100644 --- a/src/openqdc/datasets/sn2_rxn.py +++ b/src/openqdc/datasets/sn2_rxn.py @@ -24,9 +24,6 @@ class SN2RXN(BaseDataset): force_target_names = [ "DSD-BLYP-D3(BJ):def2-TZVP Gradient", ] - __average_nb_atoms__ = 5.444471933343348 - - # TODO: Check this def __smiles_converter__(self, x): """util function to convert string to smiles: useful if the smiles is diff --git a/src/openqdc/datasets/solvated_peptides.py b/src/openqdc/datasets/solvated_peptides.py index 2ac1fde..216ecdd 100644 --- 
a/src/openqdc/datasets/solvated_peptides.py +++ b/src/openqdc/datasets/solvated_peptides.py @@ -1,7 +1,5 @@ from os.path import join as p_join -from numpy import array, float32 - from openqdc.datasets.base import BaseDataset, read_qc_archive_h5 @@ -28,7 +26,6 @@ class SolvatedPeptides(BaseDataset): __energy_unit__ = "hartree" __distance_unit__ = "bohr" __forces_unit__ = "hartree/bohr" - __average_nb_atoms__ = 21.380975029465297 def __smiles_converter__(self, x): """util function to convert string to smiles: useful if the smiles is @@ -41,44 +38,3 @@ def read_raw_entries(self): samples = read_qc_archive_h5(raw_path, "solvated_peptides", self.energy_target_names, self.force_target_names) return samples - - # TODO : Check the values in this - - @property - def _stats(self): - return { - "formation": { - "energy": { - "mean": self.convert_energy(array([423.49523618])), - "std": self.convert_energy(array([309.76172829])), - }, - "forces": { - "mean": self.convert_forces(array([-3.792959e-12])), - "std": self.convert_forces(array([1.4568169])), - "components": { - "mean": self.convert_forces( - array([[-4.1655182e-12], [-6.9530774e-12], [2.5650127e-12]], dtype=float32) - ), - "std": self.convert_forces(array([[1.3502095], [1.3478843], [1.3509929]], dtype=float32)), - "rms": self.convert_forces(array([[1.3502095], [1.3478843], [1.3509929]], dtype=float32)), - }, - }, - }, - "total": { - "energy": { - "mean": self.convert_energy(array([-79.619286], dtype=float32)), - "std": self.convert_energy(array([40.01196], dtype=float32)), - }, - "forces": { - "mean": self.convert_forces(array([-3.792959e-12])), - "std": self.convert_forces(array([1.4568169])), - "components": { - "mean": self.convert_forces( - array([[-4.1655182e-12], [-6.9530774e-12], [2.5650127e-12]], dtype=float32) - ), - "std": self.convert_forces(array([[1.3502095], [1.3478843], [1.3509929]], dtype=float32)), - "rms": self.convert_forces(array([[1.3502095], [1.3478843], [1.3509929]], dtype=float32)), - }, - }, - 
}, - } diff --git a/src/openqdc/datasets/spice.py b/src/openqdc/datasets/spice.py index bb2e1a6..974d45f 100644 --- a/src/openqdc/datasets/spice.py +++ b/src/openqdc/datasets/spice.py @@ -2,7 +2,6 @@ import datamol as dm import numpy as np -from numpy import array from tqdm import tqdm from openqdc.datasets.base import BaseDataset @@ -56,7 +55,6 @@ class Spice(BaseDataset): __energy_unit__ = "hartree" __distance_unit__ = "bohr" __forces_unit__ = "hartree/bohr" - __average_nb_atoms__ = 29.88387509402179 energy_target_names = ["dft_total_energy"] @@ -77,41 +75,6 @@ class Spice(BaseDataset): "SPICE Ion Pairs Single Points Dataset v1.1": "Ion Pairs", } - @property - def _stats(self): - return { - "formation": { - "energy": { - "mean": self.convert_energy(array([-5.67757058])), - "std": self.convert_energy(array([2.33714861])), - }, - "forces": { - "mean": self.convert_forces(array([-1.0387013e-08])), - "std": self.convert_forces(array([0.021063408])), - "components": { - "mean": self.convert_forces(array([[5.7479990e-09], [-4.8940532e-08], [1.2032132e-08]])), - "std": self.convert_forces(array([[0.02017307], [0.02016141], [0.02014796]])), - "rms": self.convert_forces(array([[0.02017307], [0.02016142], [0.02014796]])), - }, - }, - }, - "total": { - "energy": { - "mean": self.convert_energy(array([-1244.6562])), - "std": self.convert_energy(array([1219.4248])), - }, - "forces": { - "mean": self.convert_forces(array([-1.0387013e-08])), - "std": self.convert_forces(array([0.021063408])), - "components": { - "mean": self.convert_forces(array([[5.7479990e-09], [-4.8940532e-08], [1.2032132e-08]])), - "std": self.convert_forces(array([[0.02017307], [0.02016141], [0.02014796]])), - "rms": self.convert_forces(array([[0.02017307], [0.02016142], [0.02014796]])), - }, - }, - }, - } - def convert_forces(self, x): return (-1.0) * super().convert_forces(x) diff --git a/src/openqdc/datasets/tmqm.py b/src/openqdc/datasets/tmqm.py index 1b7ffc1..8952aaa 100644 --- 
a/src/openqdc/datasets/tmqm.py +++ b/src/openqdc/datasets/tmqm.py @@ -3,11 +3,9 @@ import numpy as np import pandas as pd -from numpy import array, float32 from tqdm import tqdm from openqdc.datasets.base import BaseDataset -from openqdc.utils.constants import NOT_DEFINED from openqdc.utils.molecule import atom_table @@ -56,26 +54,6 @@ class TMQM(BaseDataset): __energy_unit__ = "hartree" __distance_unit__ = "ang" __forces_unit__ = "hartree/ang" - __average_nb_atoms__ = 65.9905035539555 - - @property - def _stats(self): - return { - "formation": { - "energy": { - "mean": self.convert_energy(array([-669.31056277])), - "std": self.convert_energy(array([896.83091033])), - }, - "forces": NOT_DEFINED, - }, - "total": { - "energy": { - "mean": self.convert_energy(array([-2952.1428], dtype=float32)), - "std": self.convert_energy(array([1608.7888], dtype=float32)), - }, - "forces": NOT_DEFINED, - }, - } def read_raw_entries(self): df = pd.read_csv(p_join(self.root, "tmQM_y.csv"), sep=";", usecols=["CSD_code", "Electronic_E"]) From c90192a2e8430e7454ddf6cbdcc67163b3d73b1b Mon Sep 17 00:00:00 2001 From: prtos Date: Tue, 28 Nov 2023 16:10:29 -0500 Subject: [PATCH 14/18] Update base.py --- src/openqdc/datasets/base.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/openqdc/datasets/base.py b/src/openqdc/datasets/base.py index 6c939e7..16dc892 100644 --- a/src/openqdc/datasets/base.py +++ b/src/openqdc/datasets/base.py @@ -132,13 +132,16 @@ def _precompute_E(self): s = np.array(self.data["atomic_inputs"][:, :2], dtype=int) s[:, 1] += IsolatedAtomEnergyFactory.max_charge matrixs = [matrix[s[:, 0], s[:, 1]] for matrix in self.__isolated_atom_energies__] - matrixs = [np.split(matrix, splits_idx)[:-1] for matrix in matrixs] + # matrixs = [np.split(matrix, splits_idx)[:-1] for matrix in matrixs] converted_energy_data = self.convert_energy(self.data["energies"]) # calculation per molecule formation energy statistics - e = [] - for i in 
range(len(self.__energy_methods__)): - e.append(converted_energy_data[:, i] - np.array(list(map(lambda x: x.sum(), matrixs[i])))) - E = np.array(e).T + n = len(self.__energy_methods__) + E = [] + for i, matrix in enumerate(matrixs): + c = np.cumsum(np.append([0], matrix))[splits_idx] + c[1:] = c[1:] - c[:-1] + E.append(converted_energy_data[:, i] -c) + E = np.array(E).T formation_E_mean = np.nanmean(E, axis=0) formation_E_std = np.nanstd(E, axis=0) total_E_mean = np.nanmean(converted_energy_data, axis=0) From 67bf201482947b62d4b546136da22f022f2f27c8 Mon Sep 17 00:00:00 2001 From: FNTwin Date: Tue, 28 Nov 2023 22:47:13 +0000 Subject: [PATCH 15/18] raise correct Error + cleaning --- src/openqdc/datasets/base.py | 8 ++------ src/openqdc/utils/exceptions.py | 9 +++++++-- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/src/openqdc/datasets/base.py b/src/openqdc/datasets/base.py index 16dc892..a40d952 100644 --- a/src/openqdc/datasets/base.py +++ b/src/openqdc/datasets/base.py @@ -21,7 +21,6 @@ POSSIBLE_NORMALIZATION, ) from openqdc.utils.exceptions import ( - PROPERTY_NOT_AVAILABLE_ERROR, DatasetNotAvailableError, NormalizationNotAvailableError, StatisticsNotAvailableError, @@ -132,15 +131,13 @@ def _precompute_E(self): s = np.array(self.data["atomic_inputs"][:, :2], dtype=int) s[:, 1] += IsolatedAtomEnergyFactory.max_charge matrixs = [matrix[s[:, 0], s[:, 1]] for matrix in self.__isolated_atom_energies__] - # matrixs = [np.split(matrix, splits_idx)[:-1] for matrix in matrixs] converted_energy_data = self.convert_energy(self.data["energies"]) # calculation per molecule formation energy statistics - n = len(self.__energy_methods__) E = [] for i, matrix in enumerate(matrixs): c = np.cumsum(np.append([0], matrix))[splits_idx] c[1:] = c[1:] - c[:-1] - E.append(converted_energy_data[:, i] -c) + E.append(converted_energy_data[:, i] - c) E = np.array(E).T formation_E_mean = np.nanmean(E, axis=0) formation_E_std = np.nanstd(E, axis=0) @@ -501,8 +498,7 @@ def 
average_n_atoms(self): Average number of atoms in a molecule in the dataset. """ if self.__average_nb_atoms__ is None: - logger.info(PROPERTY_NOT_AVAILABLE_ERROR) - return 1 + raise StatisticsNotAvailableError(self.__name__) return self.__average_nb_atoms__ def get_statistics(self, normalization: str = "formation", return_none: bool = True): diff --git a/src/openqdc/utils/exceptions.py b/src/openqdc/utils/exceptions.py index 5c3ddbc..246d01c 100644 --- a/src/openqdc/utils/exceptions.py +++ b/src/openqdc/utils/exceptions.py @@ -31,6 +31,11 @@ def __init__(self, dataset_name): class StatisticsNotAvailableError(DatasetNotAvailableError): """Raised when statistics are not available""" + msg = ( + "Statistics for dataset {dataset_name} are not available." + + "Please open an issue on Github for the team to look into it." + ) + class NormalizationNotAvailableError(OpenQDCException): """Raised when normalization is not available""" @@ -44,7 +49,7 @@ class ConversionNotDefinedError(OpenQDCException, ValueError): """Raised when a conversion is not defined""" _error_message = """ - Conversion from {in_unit} to {out_unit} is not defined in the conversion registry. + Conversion from {in_unit} to {out_unit} is not defined in the conversion registry. To add a new conversion, use the following syntax or open an issue on Github for the team to look into it: Conversion("{in_unit}", "{out_unit}", lambda x: x * conversion_factor) @@ -58,6 +63,6 @@ class ConversionAlreadyDefined(ConversionNotDefinedError): """Raised when a conversion is not defined""" _error_message = """ - Conversion from {in_unit} to {out_unit} is alread defined in the conversion registry. + Conversion from {in_unit} to {out_unit} is alread defined in the conversion registry. To reuse the same metric, use get_conversion({in_unit}, {out_unit}). 
""" From 52f69ce0f5f1b2d525fe6ab78387c65e6f427c5b Mon Sep 17 00:00:00 2001 From: FNTwin Date: Tue, 28 Nov 2023 23:01:08 +0000 Subject: [PATCH 16/18] Local caching statistics --- src/openqdc/datasets/base.py | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/src/openqdc/datasets/base.py b/src/openqdc/datasets/base.py index a40d952..eea280d 100644 --- a/src/openqdc/datasets/base.py +++ b/src/openqdc/datasets/base.py @@ -30,6 +30,7 @@ dict_to_atoms, get_local_cache, load_hdf5_file, + load_pkl, pull_locally, push_remote, set_cache_dir, @@ -113,15 +114,23 @@ def __init__( self._set_isolated_atom_energies() self._precompute_statistics() - def _precompute_statistics(self): - logger.info("Precomputing relevant statistics") + def _precompute_statistics(self, overwrite_local_cache: bool = False): + local_path = p_join(self.preprocess_path, "stats.pkl") + if self.is_preprocessed_statistics() and not overwrite_local_cache: + stats = load_pkl(local_path) + logger.info("Loaded precomputed statistics") + else: + logger.info("Precomputing relevant statistics") + (formation_E_mean, formation_E_std, total_E_mean, total_E_std) = self._precompute_E() + forces_dict = self._precompute_F() + stats = { + "formation": {"energy": {"mean": formation_E_mean, "std": formation_E_std}, "forces": forces_dict}, + "total": {"energy": {"mean": total_E_mean, "std": total_E_std}, "forces": forces_dict}, + } + with open(local_path, "wb") as f: + pkl.dump(stats, f) self._compute_average_nb_atoms() - (formation_E_mean, formation_E_std, total_E_mean, total_E_std) = self._precompute_E() - forces_dict = self._precompute_F() - self.__stats__ = { - "formation": {"energy": {"mean": formation_E_mean, "std": formation_E_std}, "forces": forces_dict}, - "total": {"energy": {"mean": total_E_mean, "std": total_E_std}, "forces": forces_dict}, - } + self.__stats__ = stats def _compute_average_nb_atoms(self): self.__average_nb_atoms__ = np.mean(self.data["n_atoms"]) @@ 
-326,6 +335,9 @@ def is_preprocessed(self): predicats += [copy_exists(p_join(self.preprocess_path, "props.pkl"))] return all(predicats) + def is_preprocessed_statistics(self): + return bool(copy_exists(p_join(self.preprocess_path, "stats.pkl"))) + def preprocess(self, overwrite=False): if overwrite or not self.is_preprocessed(): entries = self.read_raw_entries() From f38bda12ef98372da7895e3f34576f8f6d2d27df Mon Sep 17 00:00:00 2001 From: FNTwin Date: Wed, 29 Nov 2023 13:41:50 +0000 Subject: [PATCH 17/18] Fix on incorrect unit changing, stats calculated on original units, conversion on the fly --- src/openqdc/datasets/base.py | 72 +++++++++++++++++++++++++++++++----- 1 file changed, 62 insertions(+), 10 deletions(-) diff --git a/src/openqdc/datasets/base.py b/src/openqdc/datasets/base.py index eea280d..5547df3 100644 --- a/src/openqdc/datasets/base.py +++ b/src/openqdc/datasets/base.py @@ -106,13 +106,34 @@ def __init__( ) -> None: set_cache_dir(cache_dir) self.data = None - self._set_units(energy_unit, distance_unit) if not self.is_preprocessed(): raise DatasetNotAvailableError(self.__name__) else: self.read_preprocess(overwrite_local_cache=overwrite_local_cache) + self._post_init(overwrite_local_cache, energy_unit, distance_unit) + + def _post_init( + self, + overwrite_local_cache: bool = False, + energy_unit: Optional[str] = None, + distance_unit: Optional[str] = None, + ) -> None: + self._set_units(None, None) + self._set_isolated_atom_energies() + self._precompute_statistics(overwrite_local_cache=overwrite_local_cache) + self._set_units(energy_unit, distance_unit) + self._convert_data() self._set_isolated_atom_energies() - self._precompute_statistics() + + def _convert_data(self): + logger.info( + f"Converting {self.__name__} data to the following units:\n\ + Energy: {self.energy_unit},\n\ + Distance: {self.distance_unit},\n\ + Forces: {self.force_unit if self.__force_methods__ else 'None'}" + ) + for key in self.data_keys: + self.data[key] = 
self._convert_on_loading(self.data[key], key) def _precompute_statistics(self, overwrite_local_cache: bool = False): local_path = p_join(self.preprocess_path, "stats.pkl") @@ -153,7 +174,12 @@ def _precompute_E(self): total_E_mean = np.nanmean(converted_energy_data, axis=0) total_E_std = np.nanstd(converted_energy_data, axis=0) - return formation_E_mean, formation_E_std, total_E_mean, total_E_std + return ( + np.atleast_2d(formation_E_mean), + np.atleast_2d(formation_E_std), + np.atleast_2d(total_E_mean), + np.atleast_2d(total_E_std), + ) def _precompute_F(self): if len(self.__force_methods__) == 0: @@ -163,16 +189,16 @@ def _precompute_F(self): force_std = np.nanstd(converted_force_data, axis=0) force_rms = np.sqrt(np.nanmean(converted_force_data**2, axis=0)) return { - "mean": force_mean, - "std": force_std, - "components": {"rms": force_rms, "std": force_std.mean(axis=0), "mean": force_mean.mean(axis=0)}, + "mean": np.atleast_2d(force_mean.mean(axis=0)), + "std": np.atleast_2d(force_std.mean(axis=0)), + "components": {"rms": force_rms, "std": force_std, "mean": force_mean}, } @property def numbers(self): if hasattr(self, "_numbers"): return self._numbers - self._numbers = np.unique(self.data["atomic_inputs"][..., 0]).astype(np.int32) + self._numbers = pd.unique(self.data["atomic_inputs"][..., 0]).astype(np.int32) return self._numbers @property @@ -302,6 +328,18 @@ def save_preprocess(self, data_dict): pkl.dump(data_dict, f) push_remote(local_path, overwrite=True) + def _convert_on_loading(self, x, key): + if key == "energies": + return self.convert_energy(x) + elif key == "forces": + return self.convert_forces(x) + elif key == "atomic_inputs": + x = np.array(x, dtype=np.float32) + x[:, -3:] = self.convert_distance(x[:, -3:]) + return x + else: + return x + def read_preprocess(self, overwrite_local_cache=False): logger.info("Reading preprocessed data") logger.info( @@ -473,14 +511,14 @@ def __getitem__(self, idx: int): z, c, positions, energies = ( 
np.array(input[:, 0], dtype=np.int32), np.array(input[:, 1], dtype=np.int32), - self.convert_distance(np.array(input[:, -3:], dtype=np.float32)), - self.convert_energy(np.array(self.data["energies"][idx], dtype=np.float32)), + np.array(input[:, -3:], dtype=np.float32), + np.array(self.data["energies"][idx], dtype=np.float32), ) name = self.__smiles_converter__(self.data["name"][idx]) subset = self.data["subset"][idx] if "forces" in self.data: - forces = self.convert_forces(np.array(self.data["forces"][p_start:p_end], dtype=np.float32)) + forces = np.array(self.data["forces"][p_start:p_end], dtype=np.float32) else: forces = None return Bunch( @@ -542,4 +580,18 @@ def get_statistics(self, normalization: str = "formation", return_none: bool = T } } ) + # cycle trough dict to convert units + for key in selected_stats: + if key == "forces": + for key2 in selected_stats[key]: + if key2 != "components": + selected_stats[key][key2] = self.convert_forces(selected_stats[key][key2]) + else: + for key2 in selected_stats[key]["components"]: + selected_stats[key]["components"][key2] = self.convert_forces( + selected_stats[key]["components"][key2] + ) + else: + for key2 in selected_stats[key]: + selected_stats[key][key2] = self.convert_energy(selected_stats[key][key2]) return selected_stats From 0197dd002feea4a0657b47c886393de0d26558ab Mon Sep 17 00:00:00 2001 From: FNTwin Date: Wed, 29 Nov 2023 14:43:37 +0000 Subject: [PATCH 18/18] Deepcopy dict to avoid reference issue --- src/openqdc/datasets/base.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/openqdc/datasets/base.py b/src/openqdc/datasets/base.py index 5547df3..d7c8234 100644 --- a/src/openqdc/datasets/base.py +++ b/src/openqdc/datasets/base.py @@ -1,5 +1,6 @@ import os import pickle as pkl +from copy import deepcopy from os.path import join as p_join from typing import Dict, List, Optional, Union @@ -560,7 +561,7 @@ def get_statistics(self, normalization: str = "formation", return_none: bool = T 
Whether to return None if the statistics for the forces are not available, by default True Otherwise, the statistics for the forces are set to 0.0 """ - stats = self._stats + stats = deepcopy(self._stats) if len(stats) == 0: raise StatisticsNotAvailableError(self.__name__) if normalization not in POSSIBLE_NORMALIZATION: