From 1b064376d62da2a886907cd7d42313dbeca27725 Mon Sep 17 00:00:00 2001
From: FNTwin <cris.gabellini@gmail.com>
Date: Mon, 13 Nov 2023 19:32:02 +0000
Subject: [PATCH 01/18] For nikhil

---
 src/openqdc/datasets/dummy.py | 29 +++++++++++++++++++----------
 src/openqdc/datasets/qmugs.py | 30 ++++++++++++++++++++++++++++++
 src/openqdc/datasets/spice.py | 30 ++++++++++++++++++++++++++++++
 3 files changed, 79 insertions(+), 10 deletions(-)

diff --git a/src/openqdc/datasets/dummy.py b/src/openqdc/datasets/dummy.py
index 4e1ff17..b535393 100644
--- a/src/openqdc/datasets/dummy.py
+++ b/src/openqdc/datasets/dummy.py
@@ -2,6 +2,7 @@
 from sklearn.utils import Bunch
 
 from openqdc.datasets.base import BaseDataset
+from openqdc.utils.atomization_energies import IsolatedAtomEnergyFactory
 
 
 class Dummy(BaseDataset):
@@ -10,38 +11,46 @@ class Dummy(BaseDataset):
     """
 
     __name__ = "dummy"
-    __energy_methods__ = ["I_solved_the_schrodinger_equation_by_hand"]
-    __force_methods__ = ["I_made_up_random_forces"]
+    __energy_methods__ = ["I_solved_the_schrodinger_equation_by_hand", "PM6"]
+    __force_methods__ = ["I_made_up_random_forces", "writing_1_to_every_coordinate"]
     __energy_unit__ = "kcal/mol"
     __distance_unit__ = "ang"
     __forces_unit__ = "kcal/mol/ang"
 
-    energy_target_names = ["energy"]
+    energy_target_names = [f"energy{i}" for i in range(len(__energy_methods__))]
 
-    force_target_names = ["forces"]
+    force_target_names = [f"forces{i}" for i in range(len(__force_methods__))]
+    __isolated_atom_energies__ = []
 
     def __init__(self, energy_unit=None, distance_unit=None, cache_dir=None) -> None:
         try:
             super().__init__(energy_unit=energy_unit, distance_unit=distance_unit, cache_dir=cache_dir)
+
         except:  # noqa
             pass
+        self._set_isolated_atom_energies()
+
+    def is_preprocessed(self):
+        return True
 
     def read_raw_entries(self):
         pass
 
     def __len__(self):
-        return 999999999
+        return 9999
 
     def __getitem__(self, idx: int):
-        size = np.random.randint(1, 250)
+        shift = IsolatedAtomEnergyFactory.max_charge
+        size = np.random.randint(1, 100)
         z = np.random.randint(1, 100, size)
+        c = np.random.randint(-1, 2, size)
         return Bunch(
             positions=np.random.rand(size, 3) * 10,
             atomic_numbers=z,
-            charges=np.random.randint(-1, 2, size),
-            e0=np.zeros(size),
-            energies=np.random.rand(1) * 100,
+            charges=c,
+            e0=self.__isolated_atom_energies__[..., z, c + shift].T,
+            energies=np.random.randn(len(self.__energy_methods__)),
             name="dummy_{}".format(idx),
             subset="dummy",
-            forces=np.random.rand(size, 3) * 100,
+            forces=(np.random.randn(size, 3, len(self.__force_methods__)) * 100),
         )
diff --git a/src/openqdc/datasets/qmugs.py b/src/openqdc/datasets/qmugs.py
index c75f8b5..58d5b15 100644
--- a/src/openqdc/datasets/qmugs.py
+++ b/src/openqdc/datasets/qmugs.py
@@ -62,6 +62,36 @@ class QMugs(BaseDataset):
         "DFT:TOTAL_ENERGY",
     ]
 
+    @property
+    def average_n_atoms(self):
+        return 55.215926293326426
+
+    def get_stats(self, tp: str = "formation"):
+        if tp not in ["formation", "total"]:
+            raise ValueError(f"type must be one of 'formation' or 'total', got {tp} instead")
+        if tp == "formation":
+            return {
+                "energy": {
+                    "mean": self.convert_energy(np.array([-12.94348027, -9.83037297])),
+                    "std": self.convert_energy(np.array([4.39971409, 3.3574188])),
+                },
+                "forces": {
+                    "mean": np.array([0]),
+                    "std": np.array([0]),
+                },
+            }
+        else:
+            return {
+                "energy": {
+                    "mean": self.convert_energy(np.array([-89.44242, -1740.5336])),
+                    "std": self.convert_energy(np.array([29.599571, 791.48663])),
+                },
+                "forces": {
+                    "mean": np.array([0]),
+                    "std": np.array([0]),
+                },
+            }
+
     def read_raw_entries(self):
         raw_path = p_join(self.root, "structures")
         mol_dirs = [p_join(raw_path, d) for d in os.listdir(raw_path)]
diff --git a/src/openqdc/datasets/spice.py b/src/openqdc/datasets/spice.py
index 974d45f..cc511f0 100644
--- a/src/openqdc/datasets/spice.py
+++ b/src/openqdc/datasets/spice.py
@@ -75,6 +75,36 @@ class Spice(BaseDataset):
         "SPICE Ion Pairs Single Points Dataset v1.1": "Ion Pairs",
     }
 
+    @property
+    def average_n_atoms(self):
+        return 29.88387509402179
+
+    def get_stats(self, tp: str = "formation"):
+        if tp not in ["formation", "total"]:
+            raise ValueError(f"type must be one of 'formation' or 'total', got {tp} instead")
+        if tp == "formation":
+            return {
+                "energy": {
+                    "mean": self.convert_energy(np.array([-5.6775709889409836])),
+                    "std": self.convert_energy(np.array([2.3371488471105013])),
+                },
+                "forces": {
+                    "mean": self.convert_energy(np.array([-1.03868e-08])),
+                    "std": self.convert_energy(np.array([0.021063408])),
+                },
+            }
+        else:
+            return {
+                "energy": {
+                    "mean": self.convert_energy(np.array([-1244.6562])),
+                    "std": self.convert_energy(np.array([1219.4248])),
+                },
+                "forces": {
+                    "mean": self.convert_energy(np.array([-1.03868e-08])),
+                    "std": self.convert_energy(np.array([0.021063408])),
+                },
+            }
+
     def convert_forces(self, x):
         return (-1.0) * super().convert_forces(x)
 

From 22c8609df40fc4b1907a68b728076ea09a6a8abb Mon Sep 17 00:00:00 2001
From: FNTwin <cris.gabellini@gmail.com>
Date: Tue, 14 Nov 2023 18:13:39 +0000
Subject: [PATCH 02/18] Component values

---
 src/openqdc/datasets/qmugs.py | 21 ++++++++++++--------
 src/openqdc/datasets/spice.py | 37 +++++++++++++++++++++++------------
 2 files changed, 37 insertions(+), 21 deletions(-)

diff --git a/src/openqdc/datasets/qmugs.py b/src/openqdc/datasets/qmugs.py
index 58d5b15..481a167 100644
--- a/src/openqdc/datasets/qmugs.py
+++ b/src/openqdc/datasets/qmugs.py
@@ -4,6 +4,7 @@
 
 import datamol as dm
 import numpy as np
+from numpy import array
 
 from openqdc.datasets.base import BaseDataset
 from openqdc.utils.molecule import get_atomic_number_and_charge
@@ -72,23 +73,27 @@ def get_stats(self, tp: str = "formation"):
         if tp == "formation":
             return {
                 "energy": {
-                    "mean": self.convert_energy(np.array([-12.94348027, -9.83037297])),
-                    "std": self.convert_energy(np.array([4.39971409, 3.3574188])),
+                    "mean": self.convert_energy(array([-12.94348027, -9.83037297])),
+                    "std": self.convert_energy(array([4.39971409, 3.3574188])),
                 },
                 "forces": {
-                    "mean": np.array([0]),
-                    "std": np.array([0]),
+                    "mean": array([0]),
+                    "std": array([0]),
+                    "components": {
+                        "mean": self.convert_forces(array([[0.0, 0.0], [0.0, 0.0], [0.0, 0.0]])),
+                        "std": self.convert_forces(array([[0.0, 0.0], [0.0, 0.0], [0.0, 0.0]])),
+                    },
                 },
             }
         else:
             return {
                 "energy": {
-                    "mean": self.convert_energy(np.array([-89.44242, -1740.5336])),
-                    "std": self.convert_energy(np.array([29.599571, 791.48663])),
+                    "mean": self.convert_energy(array([-89.44242, -1740.5336])),
+                    "std": self.convert_energy(array([29.599571, 791.48663])),
                 },
                 "forces": {
-                    "mean": np.array([0]),
-                    "std": np.array([0]),
+                    "mean": array([[0.0, 0.0], [0.0, 0.0], [0.0, 0.0]]),
+                    "std": array([[0.0, 0.0], [0.0, 0.0], [0.0, 0.0]]),
                 },
             }
 
diff --git a/src/openqdc/datasets/spice.py b/src/openqdc/datasets/spice.py
index cc511f0..ec848df 100644
--- a/src/openqdc/datasets/spice.py
+++ b/src/openqdc/datasets/spice.py
@@ -2,6 +2,7 @@
 
 import datamol as dm
 import numpy as np
+from numpy import array
 from tqdm import tqdm
 
 from openqdc.datasets.base import BaseDataset
@@ -83,25 +84,35 @@ def get_stats(self, tp: str = "formation"):
         if tp not in ["formation", "total"]:
             raise ValueError(f"type must be one of 'formation' or 'total', got {tp} instead")
         if tp == "formation":
-            return {
-                "energy": {
-                    "mean": self.convert_energy(np.array([-5.6775709889409836])),
-                    "std": self.convert_energy(np.array([2.3371488471105013])),
+            return (
+                {
+                    "energy": {
+                        "mean": self.convert_energy(array([-5.67757058])),
+                        "std": self.convert_energy(array([2.33714861])),
+                    },
+                    "forces": {
+                        "mean": self.convert_forces(array([-1.0387013e-08])),
+                        "std": self.convert_forces(array([0.021063408])),
+                        "components": {
+                            "mean": self.convert_forces(array([[5.7479990e-09], [-4.8940532e-08], [1.2032132e-08]])),
+                            "std": self.convert_forces(array([[0.02017307], [0.02016141], [0.02014796]])),
+                        },
+                    },
                 },
-                "forces": {
-                    "mean": self.convert_energy(np.array([-1.03868e-08])),
-                    "std": self.convert_energy(np.array([0.021063408])),
-                },
-            }
+            )
         else:
             return {
                 "energy": {
-                    "mean": self.convert_energy(np.array([-1244.6562])),
-                    "std": self.convert_energy(np.array([1219.4248])),
+                    "mean": self.convert_energy(array([-1244.6562])),
+                    "std": self.convert_energy(array([1219.4248])),
                 },
                 "forces": {
-                    "mean": self.convert_energy(np.array([-1.03868e-08])),
-                    "std": self.convert_energy(np.array([0.021063408])),
+                    "mean": self.convert_forces(array([-1.0387013e-08])),
+                    "std": self.convert_forces(array([0.021063408])),
+                    "components": {
+                        "mean": self.convert_forces(array([[5.7479990e-09], [-4.8940532e-08], [1.2032132e-08]])),
+                        "std": self.convert_forces(array([[0.02017307], [0.02016141], [0.02014796]])),
+                    },
                 },
             }
 

From 28bbf005f3c3b820110ce14ecb7f9d4aac80a51e Mon Sep 17 00:00:00 2001
From: prtos <tossouprudencio@gmail.com>
Date: Wed, 15 Nov 2023 04:20:59 +0000
Subject: [PATCH 03/18] wip

---
 src/openqdc/datasets/base.py |  4 +--
 src/openqdc/datasets/pcqm.py | 68 ++++++++++++++++++++++++++++--------
 2 files changed, 55 insertions(+), 17 deletions(-)

diff --git a/src/openqdc/datasets/base.py b/src/openqdc/datasets/base.py
index 598a0b3..dbca4cc 100644
--- a/src/openqdc/datasets/base.py
+++ b/src/openqdc/datasets/base.py
@@ -193,7 +193,7 @@ def save_preprocess(self, data_dict):
             uniques, inv_indices = np.unique(data_dict[key], return_inverse=True)
             with open(local_path, "wb") as f:
                 np.savez_compressed(f, uniques=uniques, inv_indices=inv_indices)
-            push_remote(local_path)
+            push_remote(local_path, overwrite=True)
 
     def read_preprocess(self, overwrite_local_cache=False):
         logger.info("Reading preprocessed data")
@@ -218,7 +218,7 @@ def read_preprocess(self, overwrite_local_cache=False):
 
         for key in ["name", "subset"]:
             filename = p_join(self.preprocess_path, f"{key}.npz")
-            pull_locally(filename)
+            pull_locally(filename, overwrite=overwrite_local_cache)
             self.data[key] = dict()
             with open(filename, "rb") as f:
                 tmp = np.load(f)
diff --git a/src/openqdc/datasets/pcqm.py b/src/openqdc/datasets/pcqm.py
index 505eef1..555d2eb 100644
--- a/src/openqdc/datasets/pcqm.py
+++ b/src/openqdc/datasets/pcqm.py
@@ -11,7 +11,7 @@
 from loguru import logger
 
 from openqdc.datasets.base import BaseDataset
-from openqdc.utils.io import get_local_cache
+from openqdc.utils.io import get_local_cache, push_remote
 
 
 def flatten_dict(d, sep: str = "."):
@@ -80,27 +80,65 @@ def __init__(self, energy_unit=None, distance_unit=None) -> None:
     def root(self):
         return p_join(get_local_cache(), "pubchemqc")
 
-    def collate_list(self, list_entries, partial=False):
-        # default partial=False is necessary for compatibility with the base class
-        if partial:
-            predicat = list_entries is not None and len(list_entries) > 0
-            list_entries = [x for x in list_entries if x is not None]
-            return super().collate_list(list_entries) if predicat else None
-        else:
-            n = 0
-            for i in range(len(list_entries)):
-                list_entries[i]["position_idx_range"] += n
-                n += list_entries[i]["position_idx_range"].max()
-            res = {key: np.concatenate([r[key] for r in list_entries], axis=0) for key in list_entries[0]}
-            return res
+    @property
+    def preprocess_path(self):  # directory that holds this dataset's preprocessed files
+        path = p_join(self.root, "preprocessed", self.__name__)  # <root>/preprocessed/<dataset __name__>
+        os.makedirs(path, exist_ok=True)  # side effect: the directory is (re)created on every access
+        return path
+
+    def collate_list(self, list_entries):
+        """Collate the non-None entries with the base implementation; return None when nothing usable is left."""
+        list_entries = [x for x in ([] if list_entries is None else list_entries) if x is not None]
+        return super().collate_list(list_entries) if list_entries else None
 
     def read_raw_entries(self):
         arxiv_paths = glob(p_join(self.root, f"{self.__energy_methods__[0]}", "*.pkl"))
-        f = lambda x: self.collate_list(read_preprocessed_archive(x), partial=True)
+        f = lambda x: self.collate_list(read_preprocessed_archive(x))
         samples = dm.parallelized(f, arxiv_paths, n_jobs=1, progress=True)
         samples = [x for x in samples if x is not None]
         return samples
 
+    def preprocess(self):  # no-op when is_preprocessed() is true; otherwise read, collate and save the data
+        if not self.is_preprocessed():
+            logger.info("Preprocessing data and saving it to cache.")
+            logger.info(
+                f"Dataset {self.__name__} data with the following units:\n"
+                f"Energy: {self.energy_unit}, Distance: {self.distance_unit}, "
+                f"Forces: {self.force_unit if self.__force_methods__ else 'None'}"  # NOTE(review): confirm `force_unit` exists (class constant is `__forces_unit__`)
+            )
+            entries = self.read_raw_entries()  # one collated entry per archive, None results already dropped
+            self.collate_and_save_list(entries)  # merges the entries, writes the files and calls push_remote
+
+    def collate_and_save_list(self, list_entries):
+        """Merge all entries into one memmap per data key plus name/subset .npz files; push_remote each file."""
+        n_atoms = 0
+        for entry in list_entries:
+            entry["position_idx_range"] += n_atoms
+            # The shifted range already contains the previous offset, so assign instead of accumulating.
+            n_atoms = entry["position_idx_range"].max()
+
+        for key in self.data_keys:
+            first = list_entries[0][key]
+            # Count the rows stored under this key; they need not equal the number of position_idx_range rows.
+            shape = (sum(entry[key].shape[0] for entry in list_entries), *first.shape[1:])
+            local_path = p_join(self.preprocess_path, f"{key}.mmap")
+            out = np.memmap(local_path, mode="w+", dtype=first.dtype, shape=shape)
+            start = 0
+            for entry in list_entries:
+                x = entry.pop(key)  # pop: entries are consumed as they are written
+                out[start : start + x.shape[0]] = x
+                start += x.shape[0]  # advance the write cursor so later chunks do not overwrite earlier ones
+            out.flush()
+            push_remote(local_path, overwrite=True)
+
+        for key in ["name", "subset"]:  # save smiles and subset
+            local_path = p_join(self.preprocess_path, f"{key}.npz")
+            x = [el for entry in list_entries for el in entry.pop(key)]
+            uniques, inv_indices = np.unique(x, return_inverse=True)
+            with open(local_path, "wb") as f:
+                np.savez_compressed(f, uniques=uniques, inv_indices=inv_indices)
+            push_remote(local_path, overwrite=True)
+
 
 class PCQM_B3LYP(PCQM_PM6):
     __name__ = "pubchemqc_b3lyp"

From 4c8bf76589362a9acf1da150db7d9f3a11df6e77 Mon Sep 17 00:00:00 2001
From: FNTwin <cris.gabellini@gmail.com>
Date: Wed, 15 Nov 2023 16:48:27 +0000
Subject: [PATCH 04/18] Precomputed stats

---
 src/openqdc/datasets/ani.py               | 203 ++++++++++++++++++++++
 src/openqdc/datasets/base.py              |  28 ++-
 src/openqdc/datasets/comp6.py             |  91 ++++++++++
 src/openqdc/datasets/dess.py              |  65 +++++++
 src/openqdc/datasets/dummy.py             |  33 ++++
 src/openqdc/datasets/geom.py              |  37 ++++
 src/openqdc/datasets/iso_17.py            |  42 +++++
 src/openqdc/datasets/molecule3d.py        |  37 ++++
 src/openqdc/datasets/orbnet_denali.py     |  37 ++++
 src/openqdc/datasets/qm7x.py              |  75 ++++++++
 src/openqdc/datasets/qmugs.py             |  30 ++--
 src/openqdc/datasets/sn2_rxn.py           |   3 +
 src/openqdc/datasets/solvated_peptides.py |  44 +++++
 src/openqdc/datasets/spice.py             |  52 +++---
 src/openqdc/datasets/tmqm.py              |  37 ++++
 src/openqdc/utils/constants.py            |   2 +
 16 files changed, 772 insertions(+), 44 deletions(-)

diff --git a/src/openqdc/datasets/ani.py b/src/openqdc/datasets/ani.py
index 913fb8a..c18d254 100644
--- a/src/openqdc/datasets/ani.py
+++ b/src/openqdc/datasets/ani.py
@@ -1,6 +1,8 @@
 import os
 from os.path import join as p_join
 
+from numpy import array, float32
+
 from openqdc.datasets.base import BaseDataset, read_qc_archive_h5
 from openqdc.utils.io import get_local_cache
 
@@ -34,6 +36,7 @@ class ANI1(BaseDataset):
     __energy_unit__ = "hartree"
     __distance_unit__ = "bohr"
     __forces_unit__ = "hartree/bohr"
+    __average_nb_atoms__ = 15.91676229984414
 
     @property
     def root(self):
@@ -50,6 +53,41 @@ def read_raw_entries(self):
         samples = read_qc_archive_h5(raw_path, self.__name__, self.energy_target_names, self.force_target_names)
         return samples
 
+    @property
+    def _stats(self):
+        return {
+            "formation": {
+                "energy": {
+                    "mean": self.convert_energy(array([-2.37376472])),
+                    "std": self.convert_energy(array([0.50266975])),
+                },
+                "forces": {
+                    "mean": array([0]),
+                    "std": array([0]),
+                    "components": {
+                        "mean": array([[0.0], [0.0], [0.0]]),
+                        "std": array([[0.0], [0.0], [0.0]]),
+                        "rms": array([[0.0], [0.0], [0.0]]),
+                    },
+                },
+            },
+            "total": {
+                "energy": {
+                    "mean": self.convert_energy(array([-333.67322], dtype=float32)),
+                    "std": self.convert_energy(array([61.21667], dtype=float32)),
+                },
+                "forces": {
+                    "mean": array([0]),
+                    "std": array([0]),
+                    "components": {
+                        "mean": array([[0.0], [0.0], [0.0]]),
+                        "std": array([[0.0], [0.0], [0.0]]),
+                        "rms": array([[0.0], [0.0], [0.0]]),
+                    },
+                },
+            },
+        }
+
 
 class ANI1CCX(ANI1):
     """
@@ -71,6 +109,7 @@ class ANI1CCX(ANI1):
     __energy_unit__ = "hartree"
     __distance_unit__ = "ang"
     __forces_unit__ = "hartree/ang"
+    __average_nb_atoms__ = 15.274685315870588
 
     __energy_methods__ = [
         "ccsd(t)/cbs",
@@ -89,6 +128,41 @@ class ANI1CCX(ANI1):
     __force_methods__ = []
     force_target_names = []
 
+    @property
+    def _stats(self):  # hard-coded statistics, keyed by normalization ("formation"/"total") then by target
+        return {
+            "formation": {
+                "energy": {  # NOTE(review): entries 2-4 (~500) are far from their "total" counterparts (~-1.2) — confirm
+                    "mean": self.convert_energy(array([-3.23959548, 500.30384627, 500.40706776, 500.76740432])),
+                    "std": self.convert_energy(array([1.03021261, 132.52707152, 132.56092469, 132.65261362])),
+                },
+                "forces": {  # zero placeholders: this class declares `__force_methods__ = []`
+                    "mean": array([0]),
+                    "std": array([0]),
+                    "components": {
+                        "mean": array([[0.0], [0.0], [0.0]]),
+                        "std": array([[0.0], [0.0], [0.0]]),
+                        "rms": array([[0.0], [0.0], [0.0]]),
+                    },
+                },
+            },
+            "total": {
+                "energy": {  # presumably one value per energy method, in `__energy_methods__` order — TODO confirm
+                    "mean": self.convert_energy(array([-374.40665, -1.2378153, -1.505962, -1.2396905], dtype=float32)),
+                    "std": self.convert_energy(array([101.63995, 0.32444745, 0.39500558, 0.3250212], dtype=float32)),
+                },
+                "forces": {  # zero placeholders, same as under "formation"
+                    "mean": array([0]),
+                    "std": array([0]),
+                    "components": {
+                        "mean": array([[0.0], [0.0], [0.0]]),
+                        "std": array([[0.0], [0.0], [0.0]]),
+                        "rms": array([[0.0], [0.0], [0.0]]),
+                    },
+                },
+            },
+        }
+
 
 class ANI1X(ANI1):
     """
@@ -143,5 +217,134 @@ class ANI1X(ANI1):
         "wb97x/cc-pvtz",
     ]
 
+    __average_nb_atoms__ = 15.274685315870588
+
+    @property
+    def _stats(self):
+        return {
+            "formation": {
+                "energy": {
+                    "mean": self.convert_energy(
+                        array(
+                            [
+                                -2.87910686,
+                                -2.91460298,
+                                -2.91182519,
+                                500.00748497,
+                                500.27885605,
+                                500.11130961,
+                                -3.66090173,
+                                -4.40643278,
+                            ]
+                        )
+                    ),
+                    "std": self.convert_energy(
+                        array(
+                            [
+                                0.92849657,
+                                0.93421854,
+                                0.93411345,
+                                132.44580372,
+                                132.52326771,
+                                132.47987395,
+                                1.60180792,
+                                1.75414812,
+                            ]
+                        )
+                    ),
+                },
+                "forces": {
+                    "mean": self.convert_forces(array([-6.139757e-06])),
+                    "std": self.convert_forces(array([0.07401004])),
+                    "components": {
+                        "mean": self.convert_forces(
+                            array(
+                                [
+                                    [6.6829815e-13, 3.5682501e-07],
+                                    [-5.1223647e-13, -1.8487021e-06],
+                                    [8.1159564e-13, -3.6849189e-05],
+                                ],
+                                dtype=float32,
+                            )
+                        ),
+                        "std": self.convert_forces(
+                            array(
+                                [[0.0759203, 0.06799112], [0.07694941, 0.06652647], [0.06229663, 0.05442103]],
+                                dtype=float32,
+                            )
+                        ),
+                        "rms": self.convert_forces(
+                            array(
+                                [[0.0759203, 0.06799113], [0.07694941, 0.06652647], [0.06229663, 0.05442095]],
+                                dtype=float32,
+                            )
+                        ),
+                    },
+                },
+            },
+            "total": {
+                "energy": {
+                    "mean": self.convert_energy(
+                        array(
+                            [
+                                -372.68945,
+                                -372.74274,
+                                -372.7326,
+                                -1.1540408,
+                                -1.5152899,
+                                -1.4195863,
+                                -392.72458,
+                                -391.208,
+                            ],
+                            dtype=float32,
+                        )
+                    ),
+                    "std": self.convert_energy(
+                        array(
+                            [
+                                101.166664,
+                                101.19915,
+                                101.191895,
+                                0.30445468,
+                                0.39988872,
+                                0.37456134,
+                                136.79112,
+                                137.48692,
+                            ],
+                            dtype=float32,
+                        )
+                    ),
+                },
+                "forces": {
+                    "mean": self.convert_forces(array([-6.139757e-06])),
+                    "std": self.convert_forces(array([0.07401004])),
+                    "components": {
+                        "mean": self.convert_forces(
+                            array(
+                                [
+                                    [6.6829815e-13, 3.5682501e-07],
+                                    [-5.1223647e-13, -1.8487021e-06],
+                                    [8.1159564e-13, -3.6849189e-05],
+                                ],
+                                dtype=float32,
+                            )
+                        ),
+                        "std": self.convert_forces(
+                            array(
+                                [[0.0759203, 0.06799112], [0.07694941, 0.06652647], [0.06229663, 0.05442103]],
+                                dtype=float32,
+                            )
+                        ),
+                        "rms": self.convert_forces(
+                            array(
+                                [[0.0759203, 0.06799113], [0.07694941, 0.06652647], [0.06229663, 0.05442095]],
+                                dtype=float32,
+                            )
+                        ),
+                    },
+                },
+            },
+        }
+
     def convert_forces(self, x):
         return super().convert_forces(x) * 0.529177249  # correct the Dataset error
diff --git a/src/openqdc/datasets/base.py b/src/openqdc/datasets/base.py
index 1de6ff1..022f880 100644
--- a/src/openqdc/datasets/base.py
+++ b/src/openqdc/datasets/base.py
@@ -14,7 +14,7 @@
     IsolatedAtomEnergyFactory,
     chemical_symbols,
 )
-from openqdc.utils.constants import NB_ATOMIC_FEATURES
+from openqdc.utils.constants import NB_ATOMIC_FEATURES, POSSIBLE_NORMALIZATION
 from openqdc.utils.io import (
     copy_exists,
     dict_to_atoms,
@@ -83,6 +83,7 @@ class BaseDataset(torch.utils.data.Dataset):
     __fn_energy__ = lambda x: x
     __fn_distance__ = lambda x: x
     __fn_forces__ = lambda x: x
+    __average_nb_atoms__ = None
 
     def __init__(
         self,
@@ -425,3 +426,28 @@ def __str__(self):
 
     def __repr__(self):
         return f"{self.__name__}"
+
+    @property
+    def _stats(self):  # statistics table read by get_statistics(); dataset subclasses override it
+        return {}  # empty by default, i.e. no statistics available
+
+    @property
+    def average_n_atoms(self):
+        """Return `__average_nb_atoms__`, or 1 (after logging a notice) when the dataset does not define it."""
+        if self.__average_nb_atoms__ is not None:
+            return self.__average_nb_atoms__
+        # A missing value is not an error here: log a notice and fall back to 1.
+        msg = "This property for this dataset not available. "
+        logger.info(msg + "Please open an issue on Github for the team to look into it.")
+        return 1
+
+    def get_statistics(self, normalization: str = "formation"):
+        """Return the stats for `normalization` ({} if the dataset has none); raise ValueError if it is invalid."""
+        if normalization not in POSSIBLE_NORMALIZATION:  # validate first so bad input is never masked
+            raise ValueError(f"normalization={normalization} is not valid. Must be one of {POSSIBLE_NORMALIZATION}")
+        stats = self._stats
+        if not stats:  # default `_stats` is empty: report it instead of failing with a bare KeyError
+            msg = "This property for this dataset not available. "
+            logger.info(msg + "Please open an issue on Github for the team to look into it.")
+            return {}
+        return stats[normalization]
diff --git a/src/openqdc/datasets/comp6.py b/src/openqdc/datasets/comp6.py
index c95ec17..2c6fb83 100644
--- a/src/openqdc/datasets/comp6.py
+++ b/src/openqdc/datasets/comp6.py
@@ -1,5 +1,7 @@
 from os.path import join as p_join
 
+from numpy import array, float32, nan
+
 from openqdc.datasets.base import BaseDataset, read_qc_archive_h5
 
 
@@ -59,6 +61,95 @@ class COMP6(BaseDataset):
         "Gradient",
     ]
 
+    __average_nb_atoms__ = 25.74051563378753
+
+    @property
+    def _stats(self):
+        return {
+            "formation": {
+                "energy": {
+                    "mean": self.convert_energy(
+                        array(
+                            [
+                                -2579.52016333,
+                                -2543.74519203,
+                                -354694.46157991,
+                                -2506.4300631,
+                                -2616.89224817,
+                                -3157.54118509,
+                                -354031.62984212,
+                                nan,
+                                nan,
+                            ]
+                        )
+                    ),
+                    "std": self.convert_energy(
+                        array(
+                            [
+                                1811.03171965,
+                                1471.95818836,
+                                201545.89189168,
+                                1385.50993753,
+                                1456.09915473,
+                                1728.51133182,
+                                208097.95666257,
+                                nan,
+                                nan,
+                            ]
+                        )
+                    ),
+                },
+                "forces": {
+                    "mean": self.convert_forces(6.6065984e-13),
+                    "std": self.convert_forces(0.056459695),
+                    "components": {
+                        "mean": self.convert_forces(
+                            array([[-4.1767219e-13], [1.0024132e-12], [-9.4386771e-13]], dtype=float32)
+                        ),
+                        "std": self.convert_forces(array([[0.05781676], [0.05793402], [0.05330585]], dtype=float32)),
+                        "rms": self.convert_forces(array([[0.05781676], [0.05793402], [0.05330585]], dtype=float32)),
+                    },
+                },
+            },
+            "total": {
+                "energy": {
+                    "mean": self.convert_energy(
+                        array(
+                            [
+                                -360972.16,
+                                -354729.66,
+                                -354699.38,
+                                -349555.7,
+                                -351555.97,
+                                -351530.44,
+                                -354027.8,
+                                nan,
+                                nan,
+                            ],
+                            dtype=float32,
+                        ),
+                    ),
+                    "std": self.convert_energy(
+                        array(
+                            [254766.0, 201559.77, 201537.8, 188725.47, 191028.78, 191016.1, 208089.4, nan, nan],
+                            dtype=float32,
+                        ),
+                    ),
+                },
+                "forces": {
+                    "mean": self.convert_forces(array([6.6065984e-13])),
+                    "std": self.convert_forces(array([0.056459695])),
+                    "components": {
+                        "mean": self.convert_forces(
+                            array([[-4.1767219e-13], [1.0024132e-12], [-9.4386771e-13]], dtype=float32)
+                        ),
+                        "std": self.convert_forces(array([[0.05781676], [0.05793402], [0.05330585]], dtype=float32)),
+                        "rms": self.convert_forces(array([[0.05781676], [0.05793402], [0.05330585]], dtype=float32)),
+                    },
+                },
+            },
+        }
+
     def read_raw_entries(self):
         samples = []
         for subset in ["ani_md", "drugbank", "gdb7_9", "gdb10_13", "s66x8", "tripeptides"]:
diff --git a/src/openqdc/datasets/dess.py b/src/openqdc/datasets/dess.py
index 80b1e1c..76c2355 100644
--- a/src/openqdc/datasets/dess.py
+++ b/src/openqdc/datasets/dess.py
@@ -3,6 +3,7 @@
 import datamol as dm
 import numpy as np
 import pandas as pd
+from numpy import array, float32
 from tqdm import tqdm
 
 from openqdc.datasets.base import BaseDataset
@@ -61,6 +62,70 @@ class DESS(BaseDataset):
     # ['qz_MP2_all', 'tz_MP2_all', 'cbs_MP2_all', 'sapt_all', 'nn_CCSD(T)_all']
 
     partitions = ["DES370K", "DES5M"]
+    __average_nb_atoms__ = 18.944905540514252
+
+    @property
+    def _stats(self):  # hard-coded statistics, keyed by "formation" and "total"
+        return {
+            "formation": {
+                "energy": {  # NOTE(review): no self.convert_energy applied here — confirm units are handled
+                    "mean": array(
+                        [
+                            1.55021905e03,
+                            7.85244098e02,
+                            7.85022111e02,
+                            7.84864329e02,
+                            1.55072999e03,
+                            1.56925354e03,
+                            -7.76931930e-01,
+                            -2.29679257e00,
+                        ]
+                    ),
+                    "std": array(
+                        [
+                            1.89341080e03,
+                            9.68462074e02,
+                            9.68422393e02,
+                            9.70421004e02,
+                            1.89394888e03,
+                            1.91531274e03,
+                            1.38230279e00,
+                            1.27589212e01,
+                        ]
+                    ),
+                },
+                "forces": {
+                    "mean": array([0]),
+                    "std": array([0]),
+                    "components": {
+                        "mean": array([[0.0], [0.0], [0.0]]),
+                        "std": array([[0.0], [0.0], [0.0]]),
+                        "rms": array([[0.0], [0.0], [0.0]]),
+                    },
+                },
+            },
+            "total": {
+                "energy": {  # NOTE(review): no self.convert_energy applied here either — confirm
+                    "mean": array(
+                        [-0.8121756, 4.1286583, 4.3558254, 3.9754286, -0.66178626, -1.0762144, 4.4547276, 3.4555712],
+                        dtype=float32,
+                    ),
+                    "std": array(
+                        [11.138448, 17.800674, 18.148039, 17.567768, 11.290686, 11.005155, 18.162294, 16.579735],
+                        dtype=float32,
+                    ),
+                },
+                "forces": {
+                    "mean": array([0]),
+                    "std": array([0]),
+                    "components": {
+                        "mean": array([[0.0], [0.0], [0.0]]),
+                        "std": array([[0.0], [0.0], [0.0]]),
+                        "rms": array([[0.0], [0.0], [0.0]]),
+                    },
+                },
+            },
+        }
 
     def _read_raw_(self, part):
         df = pd.read_csv(p_join(self.root, f"{part}.csv"))
diff --git a/src/openqdc/datasets/dummy.py b/src/openqdc/datasets/dummy.py
index b535393..38f70b4 100644
--- a/src/openqdc/datasets/dummy.py
+++ b/src/openqdc/datasets/dummy.py
@@ -3,6 +3,7 @@
 
 from openqdc.datasets.base import BaseDataset
 from openqdc.utils.atomization_energies import IsolatedAtomEnergyFactory
+from numpy import array
 
 
 class Dummy(BaseDataset):
@@ -21,6 +22,38 @@ class Dummy(BaseDataset):
 
     force_target_names = [f"forces{i}" for i in range(len(__force_methods__))]
     __isolated_atom_energies__ = []
+    __average_nb_atoms__ = 20
+
+    @property
+    def _stats(self):  # hard-coded statistics, keyed by "formation" and "total"
+        return {
+            "formation": {
+                "energy": {
+                    "mean": array([-12.94348027, -9.83037297]),
+                    "std": array([4.39971409, 3.3574188]),
+                },
+                "forces": {
+                    "mean": array([0]),
+                    "std": array([0]),
+                    "components": {
+                        "mean": array([[0.0, 0.0], [0.0, 0.0], [0.0, 0.0]]),
+                        "std": array([[0.0, 0.0], [0.0, 0.0], [0.0, 0.0]]),
+                        "rms": array([[0.0, 0.0], [0.0, 0.0], [0.0, 0.0]]),
+                    },
+                },
+            },
+            "total": {
+                "energy": {
+                    "mean": array([-89.44242, -1740.5336]),
+                    "std": array([29.599571, 791.48663]),
+                },
+                "forces": {  # NOTE(review): no "components" level here, unlike "formation" above — confirm consumers
+                    "mean": array([[0.0], [0.0], [0.0]]),
+                    "std": array([[0.0], [0.0], [0.0]]),
+                    "rms": array([[0.0], [0.0], [0.0]]),
+                },
+            },
+        }
 
     def __init__(self, energy_unit=None, distance_unit=None, cache_dir=None) -> None:
         try:
diff --git a/src/openqdc/datasets/geom.py b/src/openqdc/datasets/geom.py
index c016a9f..c16f001 100644
--- a/src/openqdc/datasets/geom.py
+++ b/src/openqdc/datasets/geom.py
@@ -3,6 +3,7 @@
 
 import datamol as dm
 import numpy as np
+from numpy import array
 
 from openqdc.datasets.base import BaseDataset
 from openqdc.utils import load_json, load_pkl
@@ -84,9 +85,45 @@ class GEOM(BaseDataset):
 
     energy_target_names = ["gfn2_xtb.energy"]
     force_target_names = []
+    __average_nb_atoms__ = 50.07244573677837
 
     partitions = ["qm9", "drugs"]
 
+    @property
+    def _stats(self):
+        return {
+            "formation": {
+                "energy": {
+                    "mean": self.convert_energy(array([-8.47811605])),
+                    "std": self.convert_energy(array([3.68015507])),
+                },
+                "forces": {
+                    "mean": array([0]),
+                    "std": array([0]),
+                    "components": {
+                        "mean": array([[0.0], [0.0], [0.0]]),
+                        "std": array([[0.0], [0.0], [0.0]]),
+                        "rms": array([[0.0], [0.0], [0.0]]),
+                    },
+                },
+            },
+            "total": {
+                "energy": {
+                    "mean": self.convert_energy(array([-78.264725])),
+                    "std": self.convert_energy(array([20.200787])),
+                },
+                "forces": {
+                    "mean": array([0]),
+                    "std": array([0]),
+                    "components": {
+                        "mean": array([[0.0], [0.0], [0.0]]),
+                        "std": array([[0.0], [0.0], [0.0]]),
+                        "rms": array([[0.0], [0.0], [0.0]]),
+                    },
+                },
+            },
+        }
+
     def _read_raw_(self, partition):
         raw_path = p_join(self.root, "rdkit_folder")
 
diff --git a/src/openqdc/datasets/iso_17.py b/src/openqdc/datasets/iso_17.py
index 735ae67..36c09f5 100644
--- a/src/openqdc/datasets/iso_17.py
+++ b/src/openqdc/datasets/iso_17.py
@@ -1,5 +1,7 @@
 from os.path import join as p_join
 
+from numpy import array, float32
+
 from openqdc.datasets.base import BaseDataset, read_qc_archive_h5
 
 
@@ -42,6 +44,46 @@ class ISO17(BaseDataset):
     __energy_unit__ = "ev"
     __distance_unit__ = "bohr"  # bohr
     __forces_unit__ = "ev/bohr"
+    __average_nb_atoms__ = 19.0
+
+    @property
+    def _stats(self):  # hard-coded "formation"/"total" statistics, run through convert_energy/convert_forces
+        return {
+            "formation": {
+                "energy": {
+                    "mean": self.convert_energy(array([-103.58336533])),
+                    "std": self.convert_energy(array([0.79709836])),
+                },
+                "forces": {
+                    "mean": self.convert_forces(array([-1.2548699e-11])),
+                    "std": self.convert_forces(array([1.1287293])),
+                    "components": {
+                        "mean": self.convert_forces(
+                            array([[-2.7712117e-11], [-1.8989450e-12], [3.9721233e-11]], dtype=float32)
+                        ),
+                        "std": self.convert_forces(array([[1.1013116], [1.1273879], [1.1195794]], dtype=float32)),
+                        "rms": self.convert_forces(array([[1.1013116], [1.1273879], [1.1195794]], dtype=float32)),
+                    },
+                },
+            },
+            "total": {
+                "energy": {  # dtype is an argument of array(), not of convert_energy()
+                    "mean": self.convert_energy(array([-11503.619], dtype=float32)),
+                    "std": self.convert_energy(array([0.79709935], dtype=float32)),
+                },
+                "forces": {
+                    "mean": self.convert_forces(array([-1.2548699e-11])),
+                    "std": self.convert_forces(array([1.1287293])),
+                    "components": {
+                        "mean": self.convert_forces(
+                            array([[-2.7712117e-11], [-1.8989450e-12], [3.9721233e-11]], dtype=float32)
+                        ),
+                        "std": self.convert_forces(array([[1.1013116], [1.1273879], [1.1195794]], dtype=float32)),
+                        "rms": self.convert_forces(array([[1.1013116], [1.1273879], [1.1195794]], dtype=float32)),
+                    },
+                },
+            },
+        }
 
     def read_raw_entries(self):
         raw_path = p_join(self.root, "iso_17.h5")
diff --git a/src/openqdc/datasets/molecule3d.py b/src/openqdc/datasets/molecule3d.py
index dc47e53..ace03d4 100644
--- a/src/openqdc/datasets/molecule3d.py
+++ b/src/openqdc/datasets/molecule3d.py
@@ -5,6 +5,7 @@
 import datamol as dm
 import numpy as np
 import pandas as pd
+from numpy import array, float32
 from rdkit import Chem
 from tqdm import tqdm
 
@@ -89,6 +90,7 @@ class Molecule3D(BaseDataset):
     __forces_unit__ = "ev/ang"
 
     energy_target_names = ["b3lyp/6-31g*.energy"]
+    __average_nb_atoms__ = 29.111696292432697
 
     def read_raw_entries(self):
         raw = p_join(self.root, "data", "raw")
@@ -99,3 +101,38 @@ def read_raw_entries(self):
         res = dm.parallelized(fn, sdf_paths, n_jobs=1)  # don't use more than 1 job
         samples = sum(res, [])
         return samples
+
+    @property
+    def _stats(self):
+        return {
+            "formation": {
+                "energy": {
+                    "mean": self.convert_energy(array([-191.66717791])),
+                    "std": self.convert_energy(array([2005.52732443])),
+                },
+                "forces": {
+                    "mean": array([0]),
+                    "std": array([0]),
+                    "components": {
+                        "mean": array([[0.0], [0.0], [0.0]]),
+                        "std": array([[0.0], [0.0], [0.0]]),
+                        "rms": array([[0.0], [0.0], [0.0]]),
+                    },
+                },
+            },
+            "total": {
+                "energy": {
+                    "mean": self.convert_energy(array([-21100.502], dtype=float32)),
+                    "std": self.convert_energy(array([9345.366], dtype=float32)),
+                },
+                "forces": {
+                    "mean": array([0]),
+                    "std": array([0]),
+                    "components": {
+                        "mean": array([[0.0], [0.0], [0.0]]),
+                        "std": array([[0.0], [0.0], [0.0]]),
+                        "rms": array([[0.0], [0.0], [0.0]]),
+                    },
+                },
+            },
+        }
diff --git a/src/openqdc/datasets/orbnet_denali.py b/src/openqdc/datasets/orbnet_denali.py
index 614e252..e5f5953 100644
--- a/src/openqdc/datasets/orbnet_denali.py
+++ b/src/openqdc/datasets/orbnet_denali.py
@@ -4,6 +4,7 @@
 import datamol as dm
 import numpy as np
 import pandas as pd
+from numpy import array, float32
 
 from openqdc.datasets.base import BaseDataset
 from openqdc.utils.molecule import atom_table
@@ -57,6 +58,7 @@ class OrbnetDenali(BaseDataset):
     __energy_unit__ = "hartree"
     __distance_unit__ = "ang"
     __forces_unit__ = "hartree/ang"
+    __average_nb_atoms__ = 44.87857092772292
 
     def read_raw_entries(self):
         label_path = p_join(self.root, "denali_labels.csv")
@@ -77,3 +79,38 @@ def read_raw_entries(self):
         res = dm.parallelized(fn, list(labels.items()), scheduler="threads", n_jobs=-1, progress=True)
         samples = sum(res, [])
         return samples
+
+    @property
+    def _stats(self):
+        return {
+            "formation": {
+                "energy": {
+                    "mean": self.convert_energy(array([-13.87283487, -7.79357297])),
+                    "std": self.convert_energy(array([75.34652971, 3.3274954])),
+                },
+                "forces": {
+                    "mean": array([0]),
+                    "std": array([0]),
+                    "components": {
+                        "mean": array([[0.0], [0.0], [0.0]]),
+                        "std": array([[0.0], [0.0], [0.0]]),
+                        "rms": array([[0.0], [0.0], [0.0]]),
+                    },
+                },
+            },
+            "total": {
+                "energy": {
+                    "mean": self.convert_energy(array([-1630.8182, -74.17469], dtype=float32)),
+                    "std": self.convert_energy(array([929.9734, 19.751446], dtype=float32)),
+                },
+                "forces": {
+                    "mean": array([0]),
+                    "std": array([0]),
+                    "components": {
+                        "mean": array([[0.0], [0.0], [0.0]]),
+                        "std": array([[0.0], [0.0], [0.0]]),
+                        "rms": array([[0.0], [0.0], [0.0]]),
+                    },
+                },
+            },
+        }
diff --git a/src/openqdc/datasets/qm7x.py b/src/openqdc/datasets/qm7x.py
index eb8b015..50603eb 100644
--- a/src/openqdc/datasets/qm7x.py
+++ b/src/openqdc/datasets/qm7x.py
@@ -1,6 +1,7 @@
 from os.path import join as p_join
 
 import numpy as np
+from numpy import array, float32
 from tqdm import tqdm
 
 from openqdc.datasets.base import BaseDataset
@@ -46,6 +47,80 @@ class QM7X(BaseDataset):
     __energy_unit__ = "ev"
     __distance_unit__ = "ang"
     __forces_unit__ = "ev/ang"
+    __average_nb_atoms__ = 16.84668721109399
+
+    @property
+    def _stats(self):
+        return {
+            "formation": {
+                "energy": {
+                    "mean": self.convert_energy(array([-82.57984067, 372.52167714])),
+                    "std": self.convert_energy(array([9.85675539, 39.76633713])),
+                },
+                "forces": {
+                    "mean": self.convert_forces(array([-1.1617619e-07])),
+                    "std": self.convert_forces(array([1.1451852])),
+                    "components": {
+                        "mean": self.convert_forces(
+                            array(
+                                [
+                                    [-7.1192130e-07, -6.0926320e-11],
+                                    [-4.3502279e-08, -3.7376963e-11],
+                                    [5.8300976e-08, 2.9215352e-11],
+                                ],
+                                dtype=float32,
+                            )
+                        ),
+                        "std": self.convert_forces(
+                            array(
+                                [[1.4721272, 0.00549965], [1.4861498, 0.00508684], [1.4812028, 0.00496012]],
+                                dtype=float32,
+                            )
+                        ),
+                        "rms": self.convert_forces(
+                            array(
+                                [[1.4721272, 0.00549965], [1.4861498, 0.00508684], [1.4812028, 0.00496012]],
+                                dtype=float32,
+                            )
+                        ),
+                    },
+                },
+            },
+            "total": {
+                "energy": {
+                    "mean": self.convert_energy(array([-8.6828701e03, -2.7446982e-01], dtype=float32)),
+                    "std": self.convert_energy(array([1.4362784e03, 5.8798514e-02], dtype=float32)),
+                },
+                "forces": {
+                    "mean": self.convert_forces(array([-1.1617619e-07])),
+                    "std": self.convert_forces(array([1.1451852])),
+                    "components": {
+                        "mean": self.convert_forces(
+                            array(
+                                [
+                                    [-7.1192130e-07, -6.0926320e-11],
+                                    [-4.3502279e-08, -3.7376963e-11],
+                                    [5.8300976e-08, 2.9215352e-11],
+                                ],
+                                dtype=float32,
+                            )
+                        ),
+                        "std": self.convert_forces(
+                            array(
+                                [[1.4721272, 0.00549965], [1.4861498, 0.00508684], [1.4812028, 0.00496012]],
+                                dtype=float32,
+                            )
+                        ),
+                        "rms": self.convert_forces(
+                            array(
+                                [[1.4721272, 0.00549965], [1.4861498, 0.00508684], [1.4812028, 0.00496012]],
+                                dtype=float32,
+                            )
+                        ),
+                    },
+                },
+            },
+        }
 
     def read_raw_entries(self):
         samples = []
diff --git a/src/openqdc/datasets/qmugs.py b/src/openqdc/datasets/qmugs.py
index 481a167..1ef6b22 100644
--- a/src/openqdc/datasets/qmugs.py
+++ b/src/openqdc/datasets/qmugs.py
@@ -57,6 +57,7 @@ class QMugs(BaseDataset):
     __energy_unit__ = "hartree"
     __distance_unit__ = "ang"
     __forces_unit__ = "hartree/ang"
+    __average_nb_atoms__ = 55.215926293326426
 
     energy_target_names = [
         "GFN2:TOTAL_ENERGY",
@@ -64,14 +65,9 @@ class QMugs(BaseDataset):
     ]
 
     @property
-    def average_n_atoms(self):
-        return 55.215926293326426
-
-    def get_stats(self, tp: str = "formation"):
-        if tp not in ["formation", "total"]:
-            raise ValueError(f"type must be one of 'formation' or 'total', got {tp} instead")
-        if tp == "formation":
-            return {
+    def _stats(self):
+        return {
+            "formation": {
                 "energy": {
                     "mean": self.convert_energy(array([-12.94348027, -9.83037297])),
                     "std": self.convert_energy(array([4.39971409, 3.3574188])),
@@ -80,22 +76,24 @@ def get_stats(self, tp: str = "formation"):
                     "mean": array([0]),
                     "std": array([0]),
                     "components": {
-                        "mean": self.convert_forces(array([[0.0, 0.0], [0.0, 0.0], [0.0, 0.0]])),
-                        "std": self.convert_forces(array([[0.0, 0.0], [0.0, 0.0], [0.0, 0.0]])),
+                        "mean": array([[0.0], [0.0], [0.0]]),
+                        "std": array([[0.0], [0.0], [0.0]]),
+                        "rms": array([[0.0], [0.0], [0.0]]),
                     },
                 },
-            }
-        else:
-            return {
+            },
+            "total": {
                 "energy": {
                     "mean": self.convert_energy(array([-89.44242, -1740.5336])),
                     "std": self.convert_energy(array([29.599571, 791.48663])),
                 },
                 "forces": {
-                    "mean": array([[0.0, 0.0], [0.0, 0.0], [0.0, 0.0]]),
-                    "std": array([[0.0, 0.0], [0.0, 0.0], [0.0, 0.0]]),
+                    "mean": array([[0.0], [0.0], [0.0]]),
+                    "std": array([[0.0], [0.0], [0.0]]),
+                    "rms": array([[0.0], [0.0], [0.0]]),
                 },
-            }
+            },
+        }
 
     def read_raw_entries(self):
         raw_path = p_join(self.root, "structures")
diff --git a/src/openqdc/datasets/sn2_rxn.py b/src/openqdc/datasets/sn2_rxn.py
index 3e75e91..d8808d5 100644
--- a/src/openqdc/datasets/sn2_rxn.py
+++ b/src/openqdc/datasets/sn2_rxn.py
@@ -24,6 +24,9 @@ class SN2RXN(BaseDataset):
     force_target_names = [
         "DSD-BLYP-D3(BJ):def2-TZVP Gradient",
     ]
+    __average_nb_atoms__ = 5.444471933343348
+
+    # TODO: Check this
 
     def read_raw_entries(self):
         raw_path = p_join(self.root, "sn2_rxn.h5")
diff --git a/src/openqdc/datasets/solvated_peptides.py b/src/openqdc/datasets/solvated_peptides.py
index 9846bdf..4723c38 100644
--- a/src/openqdc/datasets/solvated_peptides.py
+++ b/src/openqdc/datasets/solvated_peptides.py
@@ -1,5 +1,7 @@
 from os.path import join as p_join
 
+from numpy import array, float32
+
 from openqdc.datasets.base import BaseDataset, read_qc_archive_h5
 
 
@@ -26,9 +28,51 @@ class SolvatedPeptides(BaseDataset):
     __energy_unit__ = "hartree"
     __distance_unit__ = "bohr"
     __forces_unit__ = "hartree/bohr"
+    __average_nb_atoms__ = 21.380975029465297
 
     def read_raw_entries(self):
         raw_path = p_join(self.root, "solvated_peptides.h5")
         samples = read_qc_archive_h5(raw_path, "solvated_peptides", self.energy_target_names, self.force_target_names)
 
         return samples
+
+    # TODO: Check the values in the _stats property below
+
+    @property
+    def _stats(self):
+        return {
+            "formation": {
+                "energy": {
+                    "mean": self.convert_energy(array([423.49523618])),
+                    "std": self.convert_energy(array([309.76172829])),
+                },
+                "forces": {
+                    "mean": self.convert_forces(array([-3.792959e-12])),
+                    "std": self.convert_forces(array([1.4568169])),
+                    "components": {
+                        "mean": self.convert_forces(
+                            array([[-4.1655182e-12], [-6.9530774e-12], [2.5650127e-12]], dtype=float32)
+                        ),
+                        "std": self.convert_forces(array([[1.3502095], [1.3478843], [1.3509929]], dtype=float32)),
+                        "rms": self.convert_forces(array([[1.3502095], [1.3478843], [1.3509929]], dtype=float32)),
+                    },
+                },
+            },
+            "total": {
+                "energy": {
+                    "mean": self.convert_energy(array([-79.619286], dtype=float32)),
+                    "std": self.convert_energy(array([40.01196], dtype=float32)),
+                },
+                "forces": {
+                    "mean": self.convert_forces(array([-3.792959e-12])),
+                    "std": self.convert_forces(array([1.4568169])),
+                    "components": {
+                        "mean": self.convert_forces(
+                            array([[-4.1655182e-12], [-6.9530774e-12], [2.5650127e-12]], dtype=float32)
+                        ),
+                        "std": self.convert_forces(array([[1.3502095], [1.3478843], [1.3509929]], dtype=float32)),
+                        "rms": self.convert_forces(array([[1.3502095], [1.3478843], [1.3509929]], dtype=float32)),
+                    },
+                },
+            },
+        }
diff --git a/src/openqdc/datasets/spice.py b/src/openqdc/datasets/spice.py
index ec848df..8153d8f 100644
--- a/src/openqdc/datasets/spice.py
+++ b/src/openqdc/datasets/spice.py
@@ -56,6 +56,7 @@ class Spice(BaseDataset):
     __energy_unit__ = "hartree"
     __distance_unit__ = "bohr"
     __forces_unit__ = "hartree/bohr"
+    __average_nb_atoms__ = 29.88387509402179
 
     energy_target_names = ["dft_total_energy"]
 
@@ -77,18 +78,28 @@ class Spice(BaseDataset):
     }
 
     @property
-    def average_n_atoms(self):
-        return 29.88387509402179
-
-    def get_stats(self, tp: str = "formation"):
-        if tp not in ["formation", "total"]:
-            raise ValueError(f"type must be one of 'formation' or 'total', got {tp} instead")
-        if tp == "formation":
-            return (
+    def _stats(self):
+        return {
+            "formation": {
+                "energy": {
+                    "mean": self.convert_energy(array([-5.67757058])),
+                    "std": self.convert_energy(array([2.33714861])),
+                },
+                "forces": {
+                    "mean": self.convert_forces(array([-1.0387013e-08])),
+                    "std": self.convert_forces(array([0.021063408])),
+                    "components": {
+                        "mean": self.convert_forces(array([[5.7479990e-09], [-4.8940532e-08], [1.2032132e-08]])),
+                        "std": self.convert_forces(array([[0.02017307], [0.02016141], [0.02014796]])),
+                        "rms": array([[0.02017307], [0.02016142], [0.02014796]]),
+                    },
+                },
+            },
+            "total": {
                 {
                     "energy": {
-                        "mean": self.convert_energy(array([-5.67757058])),
-                        "std": self.convert_energy(array([2.33714861])),
+                        "mean": self.convert_energy(array([-1244.6562])),
+                        "std": self.convert_energy(array([1219.4248])),
                     },
                     "forces": {
                         "mean": self.convert_forces(array([-1.0387013e-08])),
@@ -96,25 +107,12 @@ def get_stats(self, tp: str = "formation"):
                         "components": {
                             "mean": self.convert_forces(array([[5.7479990e-09], [-4.8940532e-08], [1.2032132e-08]])),
                             "std": self.convert_forces(array([[0.02017307], [0.02016141], [0.02014796]])),
+                            "rms": array([[0.02017307], [0.02016142], [0.02014796]]),
                         },
                     },
-                },
-            )
-        else:
-            return {
-                "energy": {
-                    "mean": self.convert_energy(array([-1244.6562])),
-                    "std": self.convert_energy(array([1219.4248])),
-                },
-                "forces": {
-                    "mean": self.convert_forces(array([-1.0387013e-08])),
-                    "std": self.convert_forces(array([0.021063408])),
-                    "components": {
-                        "mean": self.convert_forces(array([[5.7479990e-09], [-4.8940532e-08], [1.2032132e-08]])),
-                        "std": self.convert_forces(array([[0.02017307], [0.02016141], [0.02014796]])),
-                    },
-                },
-            }
+                }
+            },
+        }
 
     def convert_forces(self, x):
         return (-1.0) * super().convert_forces(x)
diff --git a/src/openqdc/datasets/tmqm.py b/src/openqdc/datasets/tmqm.py
index 8952aaa..7160edc 100644
--- a/src/openqdc/datasets/tmqm.py
+++ b/src/openqdc/datasets/tmqm.py
@@ -3,6 +3,7 @@
 
 import numpy as np
 import pandas as pd
+from numpy import array, float32
 from tqdm import tqdm
 
 from openqdc.datasets.base import BaseDataset
@@ -54,6 +55,42 @@ class TMQM(BaseDataset):
     __energy_unit__ = "hartree"
     __distance_unit__ = "ang"
     __forces_unit__ = "hartree/ang"
+    __average_nb_atoms__ = 65.9905035539555
+
+    @property
+    def _stats(self):
+        return {
+            "formation": {
+                "energy": {
+                    "mean": self.convert_energy(array([-669.31056277])),
+                    "std": self.convert_energy(array([896.83091033])),
+                },
+                "forces": {
+                    "mean": array([0]),
+                    "std": array([0]),
+                    "components": {
+                        "mean": array([[0.0], [0.0], [0.0]]),
+                        "std": array([[0.0], [0.0], [0.0]]),
+                        "rms": array([[0.0], [0.0], [0.0]]),
+                    },
+                },
+            },
+            "total": {
+                "energy": {
+                    "mean": self.convert_energy(array([-2952.1428], dtype=float32)),
+                    "std": self.convert_energy(array([1608.7888], dtype=float32)),
+                },
+                "forces": {
+                    "mean": array([0]),
+                    "std": array([0]),
+                    "components": {
+                        "mean": array([[0.0], [0.0], [0.0]]),
+                        "std": array([[0.0], [0.0], [0.0]]),
+                        "rms": array([[0.0], [0.0], [0.0]]),
+                    },
+                },
+            },
+        }
 
     def read_raw_entries(self):
         df = pd.read_csv(p_join(self.root, "tmQM_y.csv"), sep=";", usecols=["CSD_code", "Electronic_E"])
diff --git a/src/openqdc/utils/constants.py b/src/openqdc/utils/constants.py
index a8a8215..d1e1937 100644
--- a/src/openqdc/utils/constants.py
+++ b/src/openqdc/utils/constants.py
@@ -5,3 +5,5 @@
 HAR2EV = 27.211386246
 
 BOHR2ANG = 0.52917721092
+
+POSSIBLE_NORMALIZATION = ["formation", "total"]

From c349f050285cab1f29af154188523a7bd1544680 Mon Sep 17 00:00:00 2001
From: FNTwin <cris.gabellini@gmail.com>
Date: Wed, 15 Nov 2023 17:00:23 +0000
Subject: [PATCH 05/18] Fixes + black

---
 src/openqdc/datasets/dummy.py |  2 +-
 src/openqdc/datasets/spice.py | 26 ++++++++++++--------------
 2 files changed, 13 insertions(+), 15 deletions(-)

diff --git a/src/openqdc/datasets/dummy.py b/src/openqdc/datasets/dummy.py
index 38f70b4..4a9bfc8 100644
--- a/src/openqdc/datasets/dummy.py
+++ b/src/openqdc/datasets/dummy.py
@@ -1,9 +1,9 @@
 import numpy as np  # noqa
+from numpy import array
 from sklearn.utils import Bunch
 
 from openqdc.datasets.base import BaseDataset
 from openqdc.utils.atomization_energies import IsolatedAtomEnergyFactory
-from numpy import array
 
 
 class Dummy(BaseDataset):
diff --git a/src/openqdc/datasets/spice.py b/src/openqdc/datasets/spice.py
index 8153d8f..08f7f2b 100644
--- a/src/openqdc/datasets/spice.py
+++ b/src/openqdc/datasets/spice.py
@@ -96,21 +96,19 @@ def _stats(self):
                 },
             },
             "total": {
-                {
-                    "energy": {
-                        "mean": self.convert_energy(array([-1244.6562])),
-                        "std": self.convert_energy(array([1219.4248])),
-                    },
-                    "forces": {
-                        "mean": self.convert_forces(array([-1.0387013e-08])),
-                        "std": self.convert_forces(array([0.021063408])),
-                        "components": {
-                            "mean": self.convert_forces(array([[5.7479990e-09], [-4.8940532e-08], [1.2032132e-08]])),
-                            "std": self.convert_forces(array([[0.02017307], [0.02016141], [0.02014796]])),
-                            "rms": array([[0.02017307], [0.02016142], [0.02014796]]),
-                        },
+                "energy": {
+                    "mean": self.convert_energy(array([-1244.6562])),
+                    "std": self.convert_energy(array([1219.4248])),
+                },
+                "forces": {
+                    "mean": self.convert_forces(array([-1.0387013e-08])),
+                    "std": self.convert_forces(array([0.021063408])),
+                    "components": {
+                        "mean": self.convert_forces(array([[5.7479990e-09], [-4.8940532e-08], [1.2032132e-08]])),
+                        "std": self.convert_forces(array([[0.02017307], [0.02016141], [0.02014796]])),
+                        "rms": array([[0.02017307], [0.02016142], [0.02014796]]),
                     },
-                }
+                },
             },
         }
 

From 6a67b791a9a214252bfb8458441a90e3ee4d277e Mon Sep 17 00:00:00 2001
From: FNTwin <cris.gabellini@gmail.com>
Date: Wed, 15 Nov 2023 17:01:57 +0000
Subject: [PATCH 06/18] update init to not call openqdc.datasets every time

---
 src/openqdc/__init__.py | 25 ++++++++++++++++++++++++-
 1 file changed, 24 insertions(+), 1 deletion(-)

diff --git a/src/openqdc/__init__.py b/src/openqdc/__init__.py
index 1432923..0e65ee2 100644
--- a/src/openqdc/__init__.py
+++ b/src/openqdc/__init__.py
@@ -7,7 +7,30 @@
 
 # Dictionary of objects to lazily import; maps the object's name to its module path
 
-_lazy_imports_obj = {}
+_lazy_imports_obj = {
+    "ANI1": "openqdc.datasets.ani",
+    "ANI1CCX": "openqdc.datasets.ani",
+    "ANI1X": "openqdc.datasets.ani",
+    "Spice": "openqdc.datasets.spice",
+    "GEOM": "openqdc.datasets.geom",
+    "QMugs": "openqdc.datasets.qmugs",
+    "ISO17": "openqdc.datasets.iso_17",
+    "COMP6": "openqdc.datasets.comp6",
+    "GDML": "openqdc.datasets.gdml",
+    "Molecule3D": "openqdc.datasets.molecule3d",
+    "OrbnetDenali": "openqdc.datasets.orbnet_denali",
+    "SN2RXN": "openqdc.datasets.sn2_rxn",
+    "QM7X": "openqdc.datasets.qm7x",
+    "DESS": "openqdc.datasets.dess",
+    "NablaDFT": "openqdc.datasets.nabladft",
+    "SolvatedPeptides": "openqdc.datasets.solvated_peptides",
+    "WaterClusters": "openqdc.datasets.waterclusters3_30",
+    "TMQM": "openqdc.datasets.tmqm",
+    "Dummy": "openqdc.datasets.dummy",
+    "PCQM_B3LYP": "openqdc.datasets.pcqm",
+    "PCQM_PM6": "openqdc.datasets.pcqm",
+    "Transition1X": "openqdc.datasets.transition1x",
+}
 
 _lazy_imports_mod = {"datasets": "openqdc.datamodule", "utils": "openqdc.utils"}
 

From 0dca850673cb97013c5137fcfb4151d7eb5ae42c Mon Sep 17 00:00:00 2001
From: FNTwin <cris.gabellini@gmail.com>
Date: Wed, 15 Nov 2023 17:03:35 +0000
Subject: [PATCH 07/18] RMS Spice fix

---
 src/openqdc/datasets/spice.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/openqdc/datasets/spice.py b/src/openqdc/datasets/spice.py
index 08f7f2b..bb2e1a6 100644
--- a/src/openqdc/datasets/spice.py
+++ b/src/openqdc/datasets/spice.py
@@ -91,7 +91,7 @@ def _stats(self):
                     "components": {
                         "mean": self.convert_forces(array([[5.7479990e-09], [-4.8940532e-08], [1.2032132e-08]])),
                         "std": self.convert_forces(array([[0.02017307], [0.02016141], [0.02014796]])),
-                        "rms": array([[0.02017307], [0.02016142], [0.02014796]]),
+                        "rms": self.convert_forces(array([[0.02017307], [0.02016142], [0.02014796]])),
                     },
                 },
             },
@@ -106,7 +106,7 @@ def _stats(self):
                     "components": {
                         "mean": self.convert_forces(array([[5.7479990e-09], [-4.8940532e-08], [1.2032132e-08]])),
                         "std": self.convert_forces(array([[0.02017307], [0.02016141], [0.02014796]])),
-                        "rms": array([[0.02017307], [0.02016142], [0.02014796]]),
+                        "rms": self.convert_forces(array([[0.02017307], [0.02016142], [0.02014796]])),
                     },
                 },
             },

From 32caf77a48aaeadfe3d0a3689312a869b1649e39 Mon Sep 17 00:00:00 2001
From: prtos <tossouprudencio@gmail.com>
Date: Fri, 17 Nov 2023 00:58:57 +0000
Subject: [PATCH 08/18] combine smiles and subset into one artifact

---
 src/openqdc/__init__.py       |  2 +-
 src/openqdc/datasets/base.py  | 52 +++++++++++++++---------
 src/openqdc/datasets/pcqm.py  | 19 ++++++---
 src/openqdc/utils/molecule.py | 76 +++++++++++++++++++++++++++++++++++
 4 files changed, 124 insertions(+), 25 deletions(-)

diff --git a/src/openqdc/__init__.py b/src/openqdc/__init__.py
index 1432923..e77bc9e 100644
--- a/src/openqdc/__init__.py
+++ b/src/openqdc/__init__.py
@@ -9,7 +9,7 @@
 
 _lazy_imports_obj = {}
 
-_lazy_imports_mod = {"datasets": "openqdc.datamodule", "utils": "openqdc.utils"}
+_lazy_imports_mod = {"datasets": "openqdc.datasets", "utils": "openqdc.utils"}
 
 
 def __getattr__(name):
diff --git a/src/openqdc/datasets/base.py b/src/openqdc/datasets/base.py
index 47765a2..29c89bd 100644
--- a/src/openqdc/datasets/base.py
+++ b/src/openqdc/datasets/base.py
@@ -1,4 +1,6 @@
+import gzip
 import os
+import pickle as pkl
 from os.path import join as p_join
 from typing import Dict, List, Optional, Union
 
@@ -215,6 +217,7 @@ def collate_list(self, list_entries):
         x = np.zeros((csum.shape[0], 2), dtype=np.int32)
         x[1:, 0], x[:, 1] = csum[:-1], csum
         res["position_idx_range"] = x
+
         return res
 
     def save_preprocess(self, data_dict):
@@ -228,12 +231,17 @@ def save_preprocess(self, data_dict):
             push_remote(local_path, overwrite=True)
 
         # save smiles and subset
-        for key in ["name", "subset"]:
-            local_path = p_join(self.preprocess_path, f"{key}.npz")
-            uniques, inv_indices = np.unique(data_dict[key], return_inverse=True)
-            with open(local_path, "wb") as f:
-                np.savez_compressed(f, uniques=uniques, inv_indices=inv_indices)
-            push_remote(local_path, overwrite=True)
+        local_path = p_join(self.preprocess_path, "props.pkl.gz")
+        with gzip.open(local_path, "wb") as f:
+            pkl.dump(data_dict, f)
+        push_remote(local_path, overwrite=True)
+
+        # for key in ["name", "subset"]:
+        #     local_path = p_join(self.preprocess_path, f"{key}.npz")
+        #     uniques, inv_indices = np.unique(data_dict[key], return_inverse=True)
+        #     with open(local_path, "wb") as f:
+        #         np.savez_compressed(f, uniques=uniques, inv_indices=inv_indices)
+        #     push_remote(local_path, overwrite=True)
 
     def read_preprocess(self, overwrite_local_cache=False):
         logger.info("Reading preprocessed data")
@@ -256,19 +264,25 @@ def read_preprocess(self, overwrite_local_cache=False):
         for key in self.data:
             print(f"Loaded {key} with shape {self.data[key].shape}, dtype {self.data[key].dtype}")
 
-        for key in ["name", "subset"]:
-            filename = p_join(self.preprocess_path, f"{key}.npz")
-            pull_locally(filename, overwrite=overwrite_local_cache)
-            self.data[key] = dict()
-            with open(filename, "rb") as f:
-                tmp = np.load(f)
-                for k in tmp:
-                    self.data[key][k] = tmp[k]
-                    print(f"Loaded {key}_{k} with shape {self.data[key][k].shape}, dtype {self.data[key][k].dtype}")
+        filename = p_join(self.preprocess_path, "props.pkl.gz")
+        pull_locally(filename, overwrite=overwrite_local_cache)
+        with gzip.open(filename, "rb") as f:
+            tmp = pkl.load(f)
+            self.data.update(tmp)
+
+        # for key in ["name", "subset"]:
+        #     filename = p_join(self.preprocess_path, f"{key}.npz")
+        #     pull_locally(filename, overwrite=overwrite_local_cache)
+        #     self.data[key] = dict()
+        #     with open(filename, "rb") as f:
+        #         tmp = np.load(f)
+        #         for k in tmp:
+        #             self.data[key][k] = tmp[k]
+        #             print(f"Loaded {key}_{k} with shape {self.data[key][k].shape}, dtype {self.data[key][k].dtype}")
 
     def is_preprocessed(self):
         predicats = [copy_exists(p_join(self.preprocess_path, f"{key}.mmap")) for key in self.data_keys]
-        predicats += [copy_exists(p_join(self.preprocess_path, f"{x}.npz")) for x in ["name", "subset"]]
+        predicats += [copy_exists(p_join(self.preprocess_path, "props.pkl.gz"))]
         return all(predicats)
 
     def preprocess(self):
@@ -383,7 +397,7 @@ def wrapper(idx):
             entry = self.get_ase_atoms(idx, ext=False)
             return soap.create(entry, centers=entry.positions)
 
-        descr = dm.parallelized(wrapper, idxs, progress=progress, scheduler="threads")
+        descr = dm.parallelized(wrapper, idxs, progress=progress, scheduler="threads", n_jobs=-1)
         datum["soap"] = np.vstack(descr)
         if return_idxs:
             datum["idxs"] = idxs
@@ -402,8 +416,8 @@ def __getitem__(self, idx: int):
             self.convert_distance(np.array(input[:, -3:], dtype=np.float32)),
             self.convert_energy(np.array(self.data["energies"][idx], dtype=np.float32)),
         )
-        name = self.data["name"]["uniques"][self.data["name"]["inv_indices"][idx]]
-        subset = self.data["subset"]["uniques"][self.data["subset"]["inv_indices"][idx]]
+        name = self.data["name"][idx]
+        subset = self.data["subset"][idx]
 
         if "forces" in self.data:
             forces = self.convert_forces(np.array(self.data["forces"][p_start:p_end], dtype=np.float32))
diff --git a/src/openqdc/datasets/pcqm.py b/src/openqdc/datasets/pcqm.py
index 555d2eb..73f1c68 100644
--- a/src/openqdc/datasets/pcqm.py
+++ b/src/openqdc/datasets/pcqm.py
@@ -1,3 +1,4 @@
+import gzip
 import json
 import os
 import pickle as pkl
@@ -131,13 +132,21 @@ def collate_and_save_list(self, list_entries):
             push_remote(local_path, overwrite=True)
 
         # save smiles and subset
+        tmp = dict()
         for key in ["name", "subset"]:
             local_path = p_join(self.preprocess_path, f"{key}.npz")
-            x = [el for i in range(len(list_entries)) for el in list_entries[i].pop(key)]
-            uniques, inv_indices = np.unique(x, return_inverse=True)
-            with open(local_path, "wb") as f:
-                np.savez_compressed(f, uniques=uniques, inv_indices=inv_indices)
-            push_remote(local_path, overwrite=True)
+            tmp[key] = [el for i in range(len(list_entries)) for el in list_entries[i].pop(key)]
+        local_path = p_join(self.preprocess_path, "props.pkl.gz")  # set after the loop, which reassigns local_path
+        with gzip.open(local_path, "wb") as f:
+            pkl.dump(tmp, f)
+        push_remote(local_path, overwrite=True)
+        # for key in ["name", "subset"]:
+        #     local_path = p_join(self.preprocess_path, f"{key}.npz")
+        #     x = [el for i in range(len(list_entries)) for el in list_entries[i].pop(key)]
+        #     uniques, inv_indices = np.unique(x, return_inverse=True)
+        #     with open(local_path, "wb") as f:
+        #         np.savez_compressed(f, uniques=uniques, inv_indices=inv_indices)
+        #     push_remote(local_path, overwrite=True)
 
 
 class PCQM_B3LYP(PCQM_PM6):
diff --git a/src/openqdc/utils/molecule.py b/src/openqdc/utils/molecule.py
index cd2290f..8bd2b83 100644
--- a/src/openqdc/utils/molecule.py
+++ b/src/openqdc/utils/molecule.py
@@ -1,4 +1,7 @@
+from typing import Any
+
 import numpy as np
+from numpy import ndarray
 from rdkit import Chem
 
 atom_table = Chem.GetPeriodicTable()
@@ -17,3 +20,76 @@ def get_atomic_charge(mol: Chem.Mol):
 def get_atomic_number_and_charge(mol: Chem.Mol):
     """Returns atoms number and charge for rdkit molecule"""
     return np.array([[atom.GetAtomicNum(), atom.GetFormalCharge()] for atom in mol.GetAtoms()])
+
+
+def rmsd(P: ndarray, Q: ndarray, **kwargs) -> float:
+    """
+    Calculate Root-mean-square deviation from two sets of vectors P and Q.
+
+    Parameters
+    ----------
+    P : array
+        (N,D) matrix, where N is points and D is dimension.
+    Q : array
+        (N,D) matrix, where N is points and D is dimension.
+
+    Returns
+    -------
+    rmsd : float
+        Root-mean-square deviation between the two vectors
+    """
+    diff = P - Q
+    return np.sqrt((diff * diff).sum() / P.shape[0])  # squared differences summed, divided by the row count of P
+
+
+def kabsch_rmsd(
+    P: ndarray,
+    Q: ndarray,
+    translate: bool = False,
+    **kwargs: Any,
+) -> float:
+    """
+    Rotate matrix P onto Q using the Kabsch algorithm and calculate the RMSD.
+
+    Parameters
+    ----------
+    P : array
+        (N,D) matrix, where N is points and D is dimension.
+    Q : array
+        (N,D) matrix, where N is points and D is dimension.
+    translate : bool
+        Use centroids to translate vector P and Q onto each other.
+
+    Returns
+    -------
+    rmsd : float
+        root-mean squared deviation
+    """
+
+    if translate:
+        Q = Q - Q.mean(axis=0)
+        P = P - P.mean(axis=0)
+
+    # Computation of the covariance matrix
+    C = np.dot(np.transpose(P), Q)
+
+    # Computation of the optimal rotation matrix
+    # This can be done using singular value decomposition (SVD)
+    # Getting the sign of det(V)*det(W) to decide
+    # whether we need to correct our rotation matrix to ensure a
+    # right-handed coordinate system.
+    # And finally calculating the optimal rotation matrix U
+    # see http://en.wikipedia.org/wiki/Kabsch_algorithm
+    V, S, W = np.linalg.svd(C)
+    d = (np.linalg.det(V) * np.linalg.det(W)) < 0.0
+
+    if d:
+        S[-1] = -S[-1]  # NOTE(review): S is not read after this point
+        V[:, -1] = -V[:, -1]
+
+    # Create Rotation matrix U
+    U = np.dot(V, W)
+
+    # Rotate P
+    P_prime = np.dot(P, U)
+    return rmsd(P_prime, Q)

From f74a6dff5b00e634f1a3d97a8c9261dd6963e9b5 Mon Sep 17 00:00:00 2001
From: FNTwin <cris.gabellini@gmail.com>
Date: Mon, 27 Nov 2023 20:44:27 +0000
Subject: [PATCH 09/18] Fix xyz save + Updated e0 matrix to fix PCQM

---
 src/openqdc/datasets/base.py              | 5 +++--
 src/openqdc/utils/atomization_energies.py | 2 +-
 src/openqdc/utils/units.py                | 4 ++++
 3 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/src/openqdc/datasets/base.py b/src/openqdc/datasets/base.py
index 022f880..5aae697 100644
--- a/src/openqdc/datasets/base.py
+++ b/src/openqdc/datasets/base.py
@@ -278,14 +278,15 @@ def preprocess(self):
             res = self.collate_list(entries)
             self.save_preprocess(res)
 
-    def save_xyz(self, idx: int, path: Optional[str] = None):
+    def save_xyz(self, idx: int, path: Optional[str] = None, name: Optional[str] = None):
         """
         Save the entry at index idx as an extxyz file.
         """
         if path is None:
             path = os.getcwd()
         at = self.get_ase_atoms(idx, ext=True)
-        name = at.info["name"]
+        if name is None:  # no explicit file stem given: fall back to the entry's own name
+            name = at.info["name"]
         write_extxyz(p_join(path, f"{name}.xyz"), at)
 
     def get_ase_atoms(self, idx: int, ext=True):
diff --git a/src/openqdc/utils/atomization_energies.py b/src/openqdc/utils/atomization_energies.py
index 40d0d13..01b9821 100644
--- a/src/openqdc/utils/atomization_energies.py
+++ b/src/openqdc/utils/atomization_energies.py
@@ -131,7 +131,7 @@ class IsolatedAtomEnergyFactory:
     Factory method to get the isolated atom energies for a given level of theory.
     """
 
-    max_charge = 4
+    max_charge = 9
 
     def __init__(self):
         pass
diff --git a/src/openqdc/utils/units.py b/src/openqdc/utils/units.py
index fb895ce..4446592 100644
--- a/src/openqdc/utils/units.py
+++ b/src/openqdc/utils/units.py
@@ -73,3 +73,7 @@ def get_conversion(in_unit: str, out_unit: str):
 Conversion("hartree/ang", "kcal/mol/ang", lambda x: get_conversion("hartree", "kcal/mol")(x))
 Conversion("hartree/ang", "hartree/bohr", lambda x: get_conversion("bohr", "ang")(x))
 Conversion("hartree/bohr", "hartree/ang", lambda x: get_conversion("ang", "bohr")(x))
+Conversion("kcal/mol/bohr", "Hartree/bohr", lambda x: get_conversion("kcal/mol", "hartree")(x))
+Conversion("ev/ang", "hartree/ang", lambda x: get_conversion("ev", "hartree")(x))
+Conversion("ev/bohr", "hartree/bohr", lambda x: get_conversion("ev", "hartree")(x))
+Conversion("ev/bohr", "ev/ang", lambda x: get_conversion("ang", "bohr")(x))

From 97f093ca84eb5cb96e424849829c01d45cdcc364 Mon Sep 17 00:00:00 2001
From: FNTwin <cris.gabellini@gmail.com>
Date: Tue, 28 Nov 2023 14:25:53 +0000
Subject: [PATCH 10/18] GDML Stats, Improvements, Exceptions, Forces as None if
 not present in the _stats

---
 src/openqdc/datasets/ani.py           | 41 ++----------
 src/openqdc/datasets/base.py          | 59 +++++++++++-----
 src/openqdc/datasets/dess.py          | 21 +-----
 src/openqdc/datasets/dummy.py         | 17 +----
 src/openqdc/datasets/gdml.py          | 96 ++++++++++++++++++++++++++-
 src/openqdc/datasets/geom.py          | 21 +-----
 src/openqdc/datasets/molecule3d.py    | 21 +-----
 src/openqdc/datasets/orbnet_denali.py | 21 +-----
 src/openqdc/datasets/qmugs.py         | 17 +----
 src/openqdc/datasets/tmqm.py          | 21 +-----
 src/openqdc/utils/constants.py        | 12 ++++
 src/openqdc/utils/exceptions.py       | 63 ++++++++++++++++++
 src/openqdc/utils/units.py            |  6 +-
 13 files changed, 241 insertions(+), 175 deletions(-)
 create mode 100644 src/openqdc/utils/exceptions.py

diff --git a/src/openqdc/datasets/ani.py b/src/openqdc/datasets/ani.py
index c18d254..0e8c751 100644
--- a/src/openqdc/datasets/ani.py
+++ b/src/openqdc/datasets/ani.py
@@ -4,6 +4,7 @@
 from numpy import array, float32
 
 from openqdc.datasets.base import BaseDataset, read_qc_archive_h5
+from openqdc.utils.constants import NOT_DEFINED
 from openqdc.utils.io import get_local_cache
 
 
@@ -61,30 +62,14 @@ def _stats(self):
                     "mean": self.convert_energy(array([-2.37376472])),
                     "std": self.convert_energy(array([0.50266975])),
                 },
-                "forces": {
-                    "mean": array([0]),
-                    "std": array([0]),
-                    "components": {
-                        "mean": array([[0.0], [0.0], [0.0]]),
-                        "std": array([[0.0], [0.0], [0.0]]),
-                        "rms": array([[0.0], [0.0], [0.0]]),
-                    },
-                },
+                "forces": NOT_DEFINED,
             },
             "total": {
                 "energy": {
                     "mean": self.convert_energy(array([-333.67322], dtype=float32)),
                     "std": self.convert_energy(array([61.21667], dtype=float32)),
                 },
-                "forces": {
-                    "mean": array([0]),
-                    "std": array([0]),
-                    "components": {
-                        "mean": array([[0.0], [0.0], [0.0]]),
-                        "std": array([[0.0], [0.0], [0.0]]),
-                        "rms": array([[0.0], [0.0], [0.0]]),
-                    },
-                },
+                "forces": NOT_DEFINED,
             },
         }
 
@@ -136,30 +121,14 @@ def _stats(self):
                     "mean": self.convert_energy(array([-3.23959548, 500.30384627, 500.40706776, 500.76740432])),
                     "std": self.convert_energy(array([1.03021261, 132.52707152, 132.56092469, 132.65261362])),
                 },
-                "forces": {
-                    "mean": array([0]),
-                    "std": array([0]),
-                    "components": {
-                        "mean": array([[0.0], [0.0], [0.0]]),
-                        "std": array([[0.0], [0.0], [0.0]]),
-                        "rms": array([[0.0], [0.0], [0.0]]),
-                    },
-                },
+                "forces": NOT_DEFINED,
             },
             "total": {
                 "energy": {
                     "mean": self.convert_energy(array([-374.40665, -1.2378153, -1.505962, -1.2396905], dtype=float32)),
                     "std": self.convert_energy(array([101.63995, 0.32444745, 0.39500558, 0.3250212], dtype=float32)),
                 },
-                "forces": {
-                    "mean": array([0]),
-                    "std": array([0]),
-                    "components": {
-                        "mean": array([[0.0], [0.0], [0.0]]),
-                        "std": array([[0.0], [0.0], [0.0]]),
-                        "rms": array([[0.0], [0.0], [0.0]]),
-                    },
-                },
+                "forces": NOT_DEFINED,
             },
         }
 
diff --git a/src/openqdc/datasets/base.py b/src/openqdc/datasets/base.py
index 5aae697..478cb37 100644
--- a/src/openqdc/datasets/base.py
+++ b/src/openqdc/datasets/base.py
@@ -15,6 +15,12 @@
     chemical_symbols,
 )
 from openqdc.utils.constants import NB_ATOMIC_FEATURES, POSSIBLE_NORMALIZATION
+from openqdc.utils.exceptions import (
+    PROPERTY_NOT_AVAILABLE_ERROR,
+    DatasetNotAvailableError,
+    NormalizationNotAvailableError,
+    StatisticsNotAvailableError,
+)
 from openqdc.utils.io import (
     copy_exists,
     dict_to_atoms,
@@ -96,10 +102,7 @@ def __init__(
         self.data = None
         self._set_units(energy_unit, distance_unit)
         if not self.is_preprocessed():
-            logger.info("This dataset not available. Please open an issue on Github for the team to look into it.")
-            # entries = self.read_raw_entries()
-            # res = self.collate_list(entries)
-            # self.save_preprocess(res)
+            raise DatasetNotAvailableError(self.__name__)
         else:
             self.read_preprocess(overwrite_local_cache=overwrite_local_cache)
             self._set_isolated_atom_energies()
@@ -255,7 +258,7 @@ def read_preprocess(self, overwrite_local_cache=False):
             ).reshape(self.data_shapes[key])
 
         for key in self.data:
-            print(f"Loaded {key} with shape {self.data[key].shape}, dtype {self.data[key].dtype}")
+            logger.info(f"Loaded {key} with shape {self.data[key].shape}, dtype {self.data[key].dtype}")
 
         for key in ["name", "subset"]:
             filename = p_join(self.preprocess_path, f"{key}.npz")
@@ -265,7 +268,9 @@ def read_preprocess(self, overwrite_local_cache=False):
                 tmp = np.load(f)
                 for k in tmp:
                     self.data[key][k] = tmp[k]
-                    print(f"Loaded {key}_{k} with shape {self.data[key][k].shape}, dtype {self.data[key][k].dtype}")
+                    logger.info(
+                        f"Loaded {key}_{k} with shape {self.data[key][k].shape}, dtype {self.data[key][k].dtype}"
+                    )
 
     def is_preprocessed(self):
         predicats = [copy_exists(p_join(self.preprocess_path, f"{key}.mmap")) for key in self.data_keys]
@@ -434,21 +439,41 @@ def _stats(self):
 
     @property
     def average_n_atoms(self):
+        """
+        Average number of atoms in a molecule in the dataset.
+        """
         if self.__average_nb_atoms__ is None:
-            logger.info(
-                "This property for this dataset not available."
-                + "Please open an issue on Github for the team to look into it."
-            )
+            logger.info(PROPERTY_NOT_AVAILABLE_ERROR)
             return 1
         return self.__average_nb_atoms__
 
-    def get_statistics(self, normalization: str = "formation"):
+    def get_statistics(self, normalization: str = "formation", return_none: bool = True):
+        """
+        Get the statistics of the dataset.
+        normalization : str, optional
+            Type of energy, by default "formation", must be one of ["formation", "total"]
+        return_none : bool, optional
+            Whether to return None if the statistics for the forces are not available, by default True
+            Otherwise, the statistics for the forces are set to 0.0
+        """
         stats = self._stats
         if len(stats) == 0:
-            logger.info(
-                "This property for this dataset not available."
-                + "Please open an issue on Github for the team to look into it."
-            )
+            raise StatisticsNotAvailableError(self.__name__)
         if normalization not in POSSIBLE_NORMALIZATION:
-            raise ValueError(f"normalization={normalization} is not valid. Must be one of {POSSIBLE_NORMALIZATION}")
-        return stats[normalization]
+            raise NormalizationNotAvailableError(normalization)
+        selected_stats = stats[normalization]
+        if len(self.__force_methods__) == 0 and not return_none:
+            selected_stats.update(
+                {
+                    "forces": {
+                        "mean": np.array([0.0]),
+                        "std": np.array([0.0]),
+                        "components": {
+                            "mean": np.array([[0.0], [0.0], [0.0]]),
+                            "std": np.array([[0.0], [0.0], [0.0]]),
+                            "rms": np.array([[0.0], [0.0], [0.0]]),
+                        },
+                    }
+                }
+            )
+        return selected_stats
diff --git a/src/openqdc/datasets/dess.py b/src/openqdc/datasets/dess.py
index 76c2355..6651bf7 100644
--- a/src/openqdc/datasets/dess.py
+++ b/src/openqdc/datasets/dess.py
@@ -7,6 +7,7 @@
 from tqdm import tqdm
 
 from openqdc.datasets.base import BaseDataset
+from openqdc.utils.constants import NOT_DEFINED
 from openqdc.utils.molecule import get_atomic_number_and_charge
 
 
@@ -94,15 +95,7 @@ def _stats(self):
                         ]
                     ),
                 },
-                "forces": {
-                    "mean": array([0]),
-                    "std": array([0]),
-                    "components": {
-                        "mean": array([[0.0], [0.0], [0.0]]),
-                        "std": array([[0.0], [0.0], [0.0]]),
-                        "rms": array([[0.0], [0.0], [0.0]]),
-                    },
-                },
+                "forces": NOT_DEFINED,
             },
             "total": {
                 "energy": {
@@ -115,15 +108,7 @@ def _stats(self):
                         dtype=float32,
                     ),
                 },
-                "forces": {
-                    "mean": array([0]),
-                    "std": array([0]),
-                    "components": {
-                        "mean": array([[0.0], [0.0], [0.0]]),
-                        "std": array([[0.0], [0.0], [0.0]]),
-                        "rms": array([[0.0], [0.0], [0.0]]),
-                    },
-                },
+                "forces": NOT_DEFINED,
             },
         }
 
diff --git a/src/openqdc/datasets/dummy.py b/src/openqdc/datasets/dummy.py
index 4a9bfc8..99b5106 100644
--- a/src/openqdc/datasets/dummy.py
+++ b/src/openqdc/datasets/dummy.py
@@ -4,6 +4,7 @@
 
 from openqdc.datasets.base import BaseDataset
 from openqdc.utils.atomization_energies import IsolatedAtomEnergyFactory
+from openqdc.utils.constants import NOT_DEFINED
 
 
 class Dummy(BaseDataset):
@@ -32,26 +33,14 @@ def _stats(self):
                     "mean": array([-12.94348027, -9.83037297]),
                     "std": array([4.39971409, 3.3574188]),
                 },
-                "forces": {
-                    "mean": array([0]),
-                    "std": array([0]),
-                    "components": {
-                        "mean": array([[0.0, 0.0], [0.0, 0.0], [0.0, 0.0]]),
-                        "std": array([[0.0, 0.0], [0.0, 0.0], [0.0, 0.0]]),
-                        "rms": array([[0.0, 0.0], [0.0, 0.0], [0.0, 0.0]]),
-                    },
-                },
+                "forces": NOT_DEFINED,
             },
             "total": {
                 "energy": {
                     "mean": array([-89.44242, -1740.5336]),
                     "std": array([29.599571, 791.48663]),
                 },
-                "forces": {
-                    "mean": array([[0.0], [0.0], [0.0]]),
-                    "std": array([[0.0], [0.0], [0.0]]),
-                    "rms": array([[0.0], [0.0], [0.0]]),
-                },
+                "forces": NOT_DEFINED,
             },
         }
 
diff --git a/src/openqdc/datasets/gdml.py b/src/openqdc/datasets/gdml.py
index 789f84a..73bae17 100644
--- a/src/openqdc/datasets/gdml.py
+++ b/src/openqdc/datasets/gdml.py
@@ -1,5 +1,7 @@
 from os.path import join as p_join
 
+from numpy import array, float32
+
 from openqdc.datasets.base import BaseDataset, read_qc_archive_h5
 
 
@@ -32,7 +34,7 @@ class GDML(BaseDataset):
     __energy_methods__ = [
         "ccsd/cc-pvdz",
         "ccsd(t)/cc-pvdz",
-        "pbe/mbd",  # MD22
+        # "pbe/mbd",  # MD22
         # "pbe+mbd/tight", #MD22
         "pbe/vdw-ts",  # MD17
     ]
@@ -46,7 +48,7 @@ class GDML(BaseDataset):
     __force_methods__ = [
         "ccsd/cc-pvdz",
         "ccsd(t)/cc-pvdz",
-        "pbe/mbd",  # MD22
+        # "pbe/mbd",  # MD22
         # "pbe+mbd/tight", #MD22
         "pbe/vdw-ts",  # MD17
     ]
@@ -60,9 +62,99 @@ class GDML(BaseDataset):
     __energy_unit__ = "kcal/mol"
     __distance_unit__ = "bohr"
     __forces_unit__ = "kcal/mol/bohr"
+    __average_nb_atoms__ = 13.00299550
 
     def read_raw_entries(self):
         raw_path = p_join(self.root, "gdml.h5")
         samples = read_qc_archive_h5(raw_path, "gdml", self.energy_target_names, self.force_target_names)
 
         return samples
+
+    @property
+    def _stats(self):
+        return {
+            "formation": {
+                "energy": {
+                    "mean": self.convert_energy(array([-2466.00011563, -1213.94691714, -1916.02068252])),
+                    "std": self.convert_energy(array([6.65779492, 310.70204248, 729.2143015])),
+                },
+                "forces": {
+                    "mean": self.convert_forces(array(-1.42346325e-05)),
+                    "std": self.convert_forces(array(27.009315)),
+                    "components": {
+                        "mean": self.convert_forces(
+                            array(
+                                [
+                                    [-8.3862792e-09, -1.9758134e-07, -7.7199416e-05],
+                                    [-2.7550591e-09, -1.9665436e-08, 5.3315878e-05],
+                                    [-7.5688439e-10, 5.6149121e-09, -1.8894127e-05],
+                                ],
+                                dtype=float32,
+                            )
+                        ),
+                        "std": self.convert_forces(
+                            array(
+                                [
+                                    [31.060509, 29.168474, 27.547812],
+                                    [31.365385, 26.67319, 26.068623],
+                                    [31.024155, 27.272366, 22.33925],
+                                ],
+                                dtype=float32,
+                            )
+                        ),
+                        "rms": self.convert_forces(
+                            array(
+                                [
+                                    [31.060509, 29.168474, 27.547802],
+                                    [31.365385, 26.67319, 26.068628],
+                                    [31.024155, 27.272366, 22.33925],
+                                ],
+                                dtype=float32,
+                            )
+                        ),
+                    },
+                },
+            },
+            "total": {
+                "energy": {
+                    "mean": self.convert_energy(array([-405688.28, -141134.3, -194075.56], dtype=float32)),
+                    "std": self.convert_energy(array([7.2360396e00, 3.0755928e04, 8.4138445e04], dtype=float32)),
+                },
+                "forces": {
+                    "mean": self.convert_forces(array(-1.42346325e-05)),
+                    "std": self.convert_forces(array(27.009315)),
+                    "components": {
+                        "mean": self.convert_forces(
+                            array(
+                                [
+                                    [-8.3862792e-09, -1.9758134e-07, -7.7199416e-05],
+                                    [-2.7550591e-09, -1.9665436e-08, 5.3315878e-05],
+                                    [-7.5688439e-10, 5.6149121e-09, -1.8894127e-05],
+                                ],
+                                dtype=float32,
+                            )
+                        ),
+                        "std": self.convert_forces(
+                            array(
+                                [
+                                    [31.060509, 29.168474, 27.547812],
+                                    [31.365385, 26.67319, 26.068623],
+                                    [31.024155, 27.272366, 22.33925],
+                                ],
+                                dtype=float32,
+                            )
+                        ),
+                        "rms": self.convert_forces(
+                            array(
+                                [
+                                    [31.060509, 29.168474, 27.547802],
+                                    [31.365385, 26.67319, 26.068628],
+                                    [31.024155, 27.272366, 22.33925],
+                                ],
+                                dtype=float32,
+                            )
+                        ),
+                    },
+                },
+            },
+        }
diff --git a/src/openqdc/datasets/geom.py b/src/openqdc/datasets/geom.py
index c16f001..897b828 100644
--- a/src/openqdc/datasets/geom.py
+++ b/src/openqdc/datasets/geom.py
@@ -7,6 +7,7 @@
 
 from openqdc.datasets.base import BaseDataset
 from openqdc.utils import load_json, load_pkl
+from openqdc.utils.constants import NOT_DEFINED
 from openqdc.utils.molecule import get_atomic_number_and_charge
 
 
@@ -97,30 +98,14 @@ def _stats(self):
                     "mean": self.convert_energy(array([-8.47811605])),
                     "std": self.convert_energy(array([3.68015507])),
                 },
-                "forces": {
-                    "mean": array([0]),
-                    "std": array([0]),
-                    "components": {
-                        "mean": array([[0.0], [0.0], [0.0]]),
-                        "std": array([[0.0], [0.0], [0.0]]),
-                        "rms": array([[0.0], [0.0], [0.0]]),
-                    },
-                },
+                "forces": NOT_DEFINED,
             },
             "total": {
                 "energy": {
                     "mean": self.convert_energy(array([-78.264725])),
                     "std": self.convert_energy(array([20.200787])),
                 },
-                "forces": {
-                    "mean": array([0]),
-                    "std": array([0]),
-                    "components": {
-                        "mean": array([[0.0], [0.0], [0.0]]),
-                        "std": array([[0.0], [0.0], [0.0]]),
-                        "rms": array([[0.0], [0.0], [0.0]]),
-                    },
-                },
+                "forces": NOT_DEFINED,
             },
         }
 
diff --git a/src/openqdc/datasets/molecule3d.py b/src/openqdc/datasets/molecule3d.py
index ace03d4..3887499 100644
--- a/src/openqdc/datasets/molecule3d.py
+++ b/src/openqdc/datasets/molecule3d.py
@@ -10,6 +10,7 @@
 from tqdm import tqdm
 
 from openqdc.datasets.base import BaseDataset
+from openqdc.utils.constants import NOT_DEFINED
 from openqdc.utils.molecule import get_atomic_number_and_charge
 
 
@@ -110,29 +111,13 @@ def _stats(self):
                     "mean": self.convert_energy(array([-191.66717791])),
                     "std": self.convert_energy(array([2005.52732443])),
                 },
-                "forces": {
-                    "mean": array([0]),
-                    "std": array([0]),
-                    "components": {
-                        "mean": array([[0.0], [0.0], [0.0]]),
-                        "std": array([[0.0], [0.0], [0.0]]),
-                        "rms": array([[0.0], [0.0], [0.0]]),
-                    },
-                },
+                "forces": NOT_DEFINED,
             },
             "total": {
                 "energy": {
                     "mean": self.convert_energy(array([-21100.502], dtype=float32)),
                     "std": self.convert_energy(array([9345.366], dtype=float32)),
                 },
-                "forces": {
-                    "mean": array([0]),
-                    "std": array([0]),
-                    "components": {
-                        "mean": array([[0.0], [0.0], [0.0]]),
-                        "std": array([[0.0], [0.0], [0.0]]),
-                        "rms": array([[0.0], [0.0], [0.0]]),
-                    },
-                },
+                "forces": NOT_DEFINED,
             },
         }
diff --git a/src/openqdc/datasets/orbnet_denali.py b/src/openqdc/datasets/orbnet_denali.py
index e5f5953..e158410 100644
--- a/src/openqdc/datasets/orbnet_denali.py
+++ b/src/openqdc/datasets/orbnet_denali.py
@@ -7,6 +7,7 @@
 from numpy import array, float32
 
 from openqdc.datasets.base import BaseDataset
+from openqdc.utils.constants import NOT_DEFINED
 from openqdc.utils.molecule import atom_table
 
 
@@ -88,29 +89,13 @@ def _stats(self):
                     "mean": self.convert_energy(array([-13.87283487, -7.79357297])),
                     "std": self.convert_energy(array([75.34652971, 3.3274954])),
                 },
-                "forces": {
-                    "mean": array([0]),
-                    "std": array([0]),
-                    "components": {
-                        "mean": array([[0.0], [0.0], [0.0]]),
-                        "std": array([[0.0], [0.0], [0.0]]),
-                        "rms": array([[0.0], [0.0], [0.0]]),
-                    },
-                },
+                "forces": NOT_DEFINED,
             },
             "total": {
                 "energy": {
                     "mean": self.convert_energy(array([-1630.8182, -74.17469], dtype=float32)),
                     "std": self.convert_energy(array([929.9734, 19.751446], dtype=float32)),
                 },
-                "forces": {
-                    "mean": array([0]),
-                    "std": array([0]),
-                    "components": {
-                        "mean": array([[0.0], [0.0], [0.0]]),
-                        "std": array([[0.0], [0.0], [0.0]]),
-                        "rms": array([[0.0], [0.0], [0.0]]),
-                    },
-                },
+                "forces": NOT_DEFINED,
             },
         }
diff --git a/src/openqdc/datasets/qmugs.py b/src/openqdc/datasets/qmugs.py
index 1ef6b22..a793151 100644
--- a/src/openqdc/datasets/qmugs.py
+++ b/src/openqdc/datasets/qmugs.py
@@ -7,6 +7,7 @@
 from numpy import array
 
 from openqdc.datasets.base import BaseDataset
+from openqdc.utils.constants import NOT_DEFINED
 from openqdc.utils.molecule import get_atomic_number_and_charge
 
 
@@ -72,26 +73,14 @@ def _stats(self):
                     "mean": self.convert_energy(array([-12.94348027, -9.83037297])),
                     "std": self.convert_energy(array([4.39971409, 3.3574188])),
                 },
-                "forces": {
-                    "mean": array([0]),
-                    "std": array([0]),
-                    "components": {
-                        "mean": array([[0.0], [0.0], [0.0]]),
-                        "std": array([[0.0], [0.0], [0.0]]),
-                        "rms": array([[0.0], [0.0], [0.0]]),
-                    },
-                },
+                "forces": NOT_DEFINED,
             },
             "total": {
                 "energy": {
                     "mean": self.convert_energy(array([-89.44242, -1740.5336])),
                     "std": self.convert_energy(array([29.599571, 791.48663])),
                 },
-                "forces": {
-                    "mean": array([[0.0], [0.0], [0.0]]),
-                    "std": array([[0.0], [0.0], [0.0]]),
-                    "rms": array([[0.0], [0.0], [0.0]]),
-                },
+                "forces": NOT_DEFINED,
             },
         }
 
diff --git a/src/openqdc/datasets/tmqm.py b/src/openqdc/datasets/tmqm.py
index 7160edc..1b7ffc1 100644
--- a/src/openqdc/datasets/tmqm.py
+++ b/src/openqdc/datasets/tmqm.py
@@ -7,6 +7,7 @@
 from tqdm import tqdm
 
 from openqdc.datasets.base import BaseDataset
+from openqdc.utils.constants import NOT_DEFINED
 from openqdc.utils.molecule import atom_table
 
 
@@ -65,30 +66,14 @@ def _stats(self):
                     "mean": self.convert_energy(array([-669.31056277])),
                     "std": self.convert_energy(array([896.83091033])),
                 },
-                "forces": {
-                    "mean": array([0]),
-                    "std": array([0]),
-                    "components": {
-                        "mean": array([[0.0], [0.0], [0.0]]),
-                        "std": array([[0.0], [0.0], [0.0]]),
-                        "rms": array([[0.0], [0.0], [0.0]]),
-                    },
-                },
+                "forces": NOT_DEFINED,
             },
             "total": {
                 "energy": {
                     "mean": self.convert_energy(array([-2952.1428], dtype=float32)),
                     "std": self.convert_energy(array([1608.7888], dtype=float32)),
                 },
-                "forces": {
-                    "mean": array([0]),
-                    "std": array([0]),
-                    "components": {
-                        "mean": array([[0.0], [0.0], [0.0]]),
-                        "std": array([[0.0], [0.0], [0.0]]),
-                        "rms": array([[0.0], [0.0], [0.0]]),
-                    },
-                },
+                "forces": NOT_DEFINED,
             },
         }
 
diff --git a/src/openqdc/utils/constants.py b/src/openqdc/utils/constants.py
index d1e1937..d32c5d9 100644
--- a/src/openqdc/utils/constants.py
+++ b/src/openqdc/utils/constants.py
@@ -7,3 +7,15 @@
 BOHR2ANG = 0.52917721092
 
 POSSIBLE_NORMALIZATION = ["formation", "total"]
+
+NOT_DEFINED = {
+    "forces": {
+        "mean": None,
+        "std": None,
+        "components": {
+            "mean": None,
+            "std": None,
+            "rms": None,
+        },
+    },
+}
diff --git a/src/openqdc/utils/exceptions.py b/src/openqdc/utils/exceptions.py
new file mode 100644
index 0000000..5c3ddbc
--- /dev/null
+++ b/src/openqdc/utils/exceptions.py
@@ -0,0 +1,63 @@
+from typing import Final
+
+from openqdc.utils.constants import POSSIBLE_NORMALIZATION
+
+PROPERTY_NOT_AVAILABLE_ERROR: Final[
+    str
+] = """This property for this dataset not available.
+Please open an issue on Github for the team to look into it."""
+
+
+class OpenQDCException(Exception):
+    """Base exception for custom exceptions raised by the openQDC"""
+
+    def __init__(self, msg: str):
+        super().__init__(msg)
+        self.msg = msg
+
+    def __str__(self):
+        return self.msg
+
+
+class DatasetNotAvailableError(OpenQDCException):
+    """Raised when a dataset is not available"""
+
+    msg = "Dataset {dataset_name} is not available. Please open an issue on Github for the team to look into it."
+
+    def __init__(self, dataset_name):
+        super().__init__(self.msg.format(dataset_name=dataset_name))
+
+
+class StatisticsNotAvailableError(DatasetNotAvailableError):
+    """Raised when statistics are not available"""
+
+
+class NormalizationNotAvailableError(OpenQDCException):
+    """Raised when normalization is not available"""
+
+    def __init__(self, normalization):
+        msg = f"Normalization={normalization} is not valid. Must be one of {POSSIBLE_NORMALIZATION}"
+        super().__init__(msg)
+
+
+class ConversionNotDefinedError(OpenQDCException, ValueError):
+    """Raised when a conversion is not defined"""
+
+    _error_message = """
+    Conversion from {in_unit} to {out_unit} is not defined in the conversion registry. 
+    To add a new conversion, use the following syntax or open an issue on Github for the team to look into it:
+
+    Conversion("{in_unit}", "{out_unit}", lambda x: x * conversion_factor)
+    """
+
+    def __init__(self, in_unit, out_unit):
+        super().__init__(self._error_message.format(in_unit=in_unit, out_unit=out_unit))
+
+
+class ConversionAlreadyDefined(ConversionNotDefinedError):
+    """Raised when a conversion is already defined"""
+
+    _error_message = """
+    Conversion from {in_unit} to {out_unit} is already defined in the conversion registry. 
+    To reuse the same metric, use get_conversion({in_unit}, {out_unit}).
+    """
diff --git a/src/openqdc/utils/units.py b/src/openqdc/utils/units.py
index 4446592..016d464 100644
--- a/src/openqdc/utils/units.py
+++ b/src/openqdc/utils/units.py
@@ -1,5 +1,7 @@
 from typing import Callable
 
+from openqdc.utils.exceptions import ConversionAlreadyDefined, ConversionNotDefinedError
+
 CONVERSION_REGISTRY = {}
 
 
@@ -13,7 +15,7 @@ def __init__(self, in_unit: str, out_unit: str, func: Callable[[float], float]):
         name = "convert_" + in_unit.lower().strip() + "_to_" + out_unit.lower().strip()
 
         if name in CONVERSION_REGISTRY:
-            raise ValueError(f"{name} is already registered. To reuse the same metric, use Metric.get_by_name().")
+            raise ConversionAlreadyDefined(in_unit, out_unit)
         CONVERSION_REGISTRY[name] = self
 
         self.name = name
@@ -29,7 +31,7 @@ def get_conversion(in_unit: str, out_unit: str):
     if in_unit.lower().strip() == out_unit.lower().strip():
         return lambda x: x
     if name not in CONVERSION_REGISTRY:
-        raise ValueError(f"{name} is not a valid metric. Valid metrics are: {list(CONVERSION_REGISTRY.keys())}")
+        raise ConversionNotDefinedError(in_unit, out_unit)
     return CONVERSION_REGISTRY[name]
 
 

From 51a5191d69d094df93e56eeb1493681d6af4268e Mon Sep 17 00:00:00 2001
From: FNTwin <cris.gabellini@gmail.com>
Date: Tue, 28 Nov 2023 15:09:39 +0000
Subject: [PATCH 11/18] Fix the downloading issue and incompatibilities with
 new file types

---
 src/openqdc/datasets/base.py | 40 +++++++++++++++++++++++-------------
 1 file changed, 26 insertions(+), 14 deletions(-)

diff --git a/src/openqdc/datasets/base.py b/src/openqdc/datasets/base.py
index 478cb37..887fe8f 100644
--- a/src/openqdc/datasets/base.py
+++ b/src/openqdc/datasets/base.py
@@ -26,6 +26,7 @@
     dict_to_atoms,
     get_local_cache,
     load_hdf5_file,
+    load_pkl,
     pull_locally,
     push_remote,
     set_cache_dir,
@@ -102,10 +103,18 @@ def __init__(
         self.data = None
         self._set_units(energy_unit, distance_unit)
         if not self.is_preprocessed():
-            raise DatasetNotAvailableError(self.__name__)
+            self._download()
         else:
             self.read_preprocess(overwrite_local_cache=overwrite_local_cache)
-            self._set_isolated_atom_energies()
+        self._set_isolated_atom_energies()
+
+    def _download(self):
+        try:
+            self.read_preprocess(overwrite_local_cache=True)
+            if not self.is_preprocessed():
+                raise Exception
+        except Exception:
+            raise DatasetNotAvailableError(self.__name__)
 
     @property
     def numbers(self):
@@ -260,21 +269,23 @@ def read_preprocess(self, overwrite_local_cache=False):
         for key in self.data:
             logger.info(f"Loaded {key} with shape {self.data[key].shape}, dtype {self.data[key].dtype}")
 
-        for key in ["name", "subset"]:
-            filename = p_join(self.preprocess_path, f"{key}.npz")
+        for key in ["props"]:
+            filename = p_join(self.preprocess_path, f"{key}.pkl")
             pull_locally(filename)
-            self.data[key] = dict()
-            with open(filename, "rb") as f:
-                tmp = np.load(f)
-                for k in tmp:
-                    self.data[key][k] = tmp[k]
-                    logger.info(
-                        f"Loaded {key}_{k} with shape {self.data[key][k].shape}, dtype {self.data[key][k].dtype}"
-                    )
+            for key, v in load_pkl(filename).items():
+                self.data[key] = dict()
+                if key == "n_atoms":
+                    self.data[key] = v
+                    logger.info(f"Loaded {key} with shape {self.data[key].shape}, dtype {self.data[key].dtype}")
+                else:
+                    self.data[key]["uniques"] = v[0]
+                    self.data[key]["inv_indices"] = v[1]
+                    logger.info(f"Loaded {key}_{'uniques'} with shape {v[0].shape}, dtype { v[0].dtype}")
+                    logger.info(f"Loaded {key}_{'inv_indices'} with shape {v[1].shape}, dtype {v[1].dtype}")
 
     def is_preprocessed(self):
         predicats = [copy_exists(p_join(self.preprocess_path, f"{key}.mmap")) for key in self.data_keys]
-        predicats += [copy_exists(p_join(self.preprocess_path, f"{x}.npz")) for x in ["name", "subset"]]
+        predicats += [copy_exists(p_join(self.preprocess_path, f"{x}.pkl")) for x in ["props"]]
         return all(predicats)
 
     def preprocess(self):
@@ -411,7 +422,7 @@ def __getitem__(self, idx: int):
         )
         name = self.data["name"]["uniques"][self.data["name"]["inv_indices"][idx]]
         subset = self.data["subset"]["uniques"][self.data["subset"]["inv_indices"][idx]]
-
+        n_atoms = self.data["n_atoms"][idx]
         if "forces" in self.data:
             forces = self.convert_forces(np.array(self.data["forces"][p_start:p_end], dtype=np.float32))
         else:
@@ -425,6 +436,7 @@ def __getitem__(self, idx: int):
             name=name,
             subset=subset,
             forces=forces,
+            n_atoms=n_atoms,
         )
 
     def __str__(self):

From 16dcb4eb7e04bb241aa5533c0c48ef472fd535c3 Mon Sep 17 00:00:00 2001
From: prtos <tossouprudencio@gmail.com>
Date: Tue, 28 Nov 2023 15:43:41 +0000
Subject: [PATCH 12/18] change format for many reasons

---
 src/openqdc/datasets/ani.py               |  18 ++
 src/openqdc/datasets/base.py              |  81 ++++----
 src/openqdc/datasets/comp6.py             |  14 +-
 src/openqdc/datasets/iso_17.py            |   6 +
 src/openqdc/datasets/nabladft.py          |  26 ++-
 src/openqdc/datasets/pcqm.py              |  31 +--
 src/openqdc/datasets/sn2_rxn.py           |  30 +++
 src/openqdc/datasets/solvated_peptides.py |   6 +
 src/openqdc/raws/config_factory.py        |   2 +-
 src/openqdc/utils/atomization_energies.py | 231 +++++++++++-----------
 src/openqdc/utils/molecule.py             |  10 +
 src/openqdc/utils/preprocess.py           |   8 +-
 src/openqdc/utils/units.py                |   3 +
 13 files changed, 283 insertions(+), 183 deletions(-)

diff --git a/src/openqdc/datasets/ani.py b/src/openqdc/datasets/ani.py
index 913fb8a..3f1b92b 100644
--- a/src/openqdc/datasets/ani.py
+++ b/src/openqdc/datasets/ani.py
@@ -39,6 +39,12 @@ class ANI1(BaseDataset):
     def root(self):
         return p_join(get_local_cache(), "ani")
 
+    def __smiles_converter__(self, x):
+        """util function to convert string to smiles: useful if the smiles is
+        encoded in a different format than its display format
+        """
+        return "-".join(x.decode("ascii").split("-")[:-1])
+
     @property
     def preprocess_path(self):
         path = p_join(self.root, "preprocessed", self.__name__)
@@ -89,6 +95,12 @@ class ANI1CCX(ANI1):
     __force_methods__ = []
     force_target_names = []
 
+    def __smiles_converter__(self, x):
+        """util function to convert string to smiles: useful if the smiles is
+        encoded in a different format than its display format
+        """
+        return x
+
 
 class ANI1X(ANI1):
     """
@@ -145,3 +157,9 @@ class ANI1X(ANI1):
 
     def convert_forces(self, x):
         return super().convert_forces(x) * 0.529177249  # correct the Dataset error
+
+    def __smiles_converter__(self, x):
+        """util function to convert string to smiles: useful if the smiles is
+        encoded in a different format than its display format
+        """
+        return x
diff --git a/src/openqdc/datasets/base.py b/src/openqdc/datasets/base.py
index 29c89bd..995b297 100644
--- a/src/openqdc/datasets/base.py
+++ b/src/openqdc/datasets/base.py
@@ -1,4 +1,3 @@
-import gzip
 import os
 import pickle as pkl
 from os.path import join as p_join
@@ -26,7 +25,7 @@
     push_remote,
     set_cache_dir,
 )
-from openqdc.utils.molecule import atom_table
+from openqdc.utils.molecule import atom_table, z_to_formula
 from openqdc.utils.package_utils import requires_package
 from openqdc.utils.units import get_conversion
 
@@ -45,7 +44,7 @@ def extract_entry(
 
     res = dict(
         name=np.array([df["name"][i]]),
-        subset=np.array([subset]),
+        subset=np.array([subset if subset is not None else z_to_formula(x)]),
         energies=energies.reshape((1, -1)).astype(np.float32),
         atomic_inputs=np.concatenate((xs, positions), axis=-1, dtype=np.float32),
         n_atoms=np.array([x.shape[0]], dtype=np.int32),
@@ -66,8 +65,8 @@ def read_qc_archive_h5(
 ) -> List[Dict[str, np.ndarray]]:
     data = load_hdf5_file(raw_path)
     data_t = {k2: data[k1][k2][:] for k1 in data.keys() for k2 in data[k1].keys()}
-    n = len(data_t["molecule_id"])
 
+    n = len(data_t["molecule_id"])
     samples = [extract_entry(data_t, i, subset, energy_target_names, force_target_names) for i in tqdm(range(n))]
     return samples
 
@@ -98,9 +97,6 @@ def __init__(
         self._set_units(energy_unit, distance_unit)
         if not self.is_preprocessed():
             logger.info("This dataset not available. Please open an issue on Github for the team to look into it.")
-            # entries = self.read_raw_entries()
-            # res = self.collate_list(entries)
-            # self.save_preprocess(res)
         else:
             self.read_preprocess(overwrite_local_cache=overwrite_local_cache)
             self._set_isolated_atom_energies()
@@ -109,12 +105,12 @@ def __init__(
     def numbers(self):
         if hasattr(self, "_numbers"):
             return self._numbers
-        self._numbers = np.array(list(set(self.data["atomic_inputs"][..., 0])), dtype=np.int32)
+        self._numbers = np.unique(self.data["atomic_inputs"][..., 0]).astype(np.int32)
         return self._numbers
 
     @property
     def chemical_species(self):
-        return [chemical_symbols[z] for z in self.numbers]
+        return np.array(chemical_symbols)[self.numbers]
 
     @property
     def energy_unit(self):
@@ -213,7 +209,7 @@ def collate_list(self, list_entries):
         # concatenate entries
         res = {key: np.concatenate([r[key] for r in list_entries if r is not None], axis=0) for key in list_entries[0]}
 
-        csum = np.cumsum(res.pop("n_atoms"))
+        csum = np.cumsum(res.get("n_atoms"))
         x = np.zeros((csum.shape[0], 2), dtype=np.int32)
         x[1:, 0], x[:, 1] = csum[:-1], csum
         res["position_idx_range"] = x
@@ -231,18 +227,14 @@ def save_preprocess(self, data_dict):
             push_remote(local_path, overwrite=True)
 
         # save smiles and subset
-        local_path = p_join(self.preprocess_path, "props.pkl.gz")
-        with gzip.open(local_path, "wb") as f:
+        local_path = p_join(self.preprocess_path, "props.pkl")
+        for key in ["name", "subset"]:
+            data_dict[key] = np.unique(data_dict[key], return_inverse=True)
+
+        with open(local_path, "wb") as f:
             pkl.dump(data_dict, f)
         push_remote(local_path, overwrite=True)
 
-        # for key in ["name", "subset"]:
-        #     local_path = p_join(self.preprocess_path, f"{key}.npz")
-        #     uniques, inv_indices = np.unique(data_dict[key], return_inverse=True)
-        #     with open(local_path, "wb") as f:
-        #         np.savez_compressed(f, uniques=uniques, inv_indices=inv_indices)
-        #     push_remote(local_path, overwrite=True)
-
     def read_preprocess(self, overwrite_local_cache=False):
         logger.info("Reading preprocessed data")
         logger.info(
@@ -255,38 +247,29 @@ def read_preprocess(self, overwrite_local_cache=False):
         for key in self.data_keys:
             filename = p_join(self.preprocess_path, f"{key}.mmap")
             pull_locally(filename, overwrite=overwrite_local_cache)
-            self.data[key] = np.memmap(
-                filename,
-                mode="r",
-                dtype=self.data_types[key],
-            ).reshape(self.data_shapes[key])
-
-        for key in self.data:
-            print(f"Loaded {key} with shape {self.data[key].shape}, dtype {self.data[key].dtype}")
+            self.data[key] = np.memmap(filename, mode="r", dtype=self.data_types[key]).reshape(self.data_shapes[key])
 
-        filename = p_join(self.preprocess_path, "props.pkl.gz")
+        filename = p_join(self.preprocess_path, "props.pkl")
         pull_locally(filename, overwrite=overwrite_local_cache)
-        with gzip.open(filename, "rb") as f:
+        with open(filename, "rb") as f:
             tmp = pkl.load(f)
-            self.data.update(tmp)
-
-        # for key in ["name", "subset"]:
-        #     filename = p_join(self.preprocess_path, f"{key}.npz")
-        #     pull_locally(filename, overwrite=overwrite_local_cache)
-        #     self.data[key] = dict()
-        #     with open(filename, "rb") as f:
-        #         tmp = np.load(f)
-        #         for k in tmp:
-        #             self.data[key][k] = tmp[k]
-        #             print(f"Loaded {key}_{k} with shape {self.data[key][k].shape}, dtype {self.data[key][k].dtype}")
+            for key in ["name", "subset", "n_atoms"]:
+                x = tmp.pop(key)
+                if len(x) == 2:
+                    self.data[key] = x[0][x[1]]
+                else:
+                    self.data[key] = x
+
+        for key in self.data:
+            print(f"Loaded {key} with shape {self.data[key].shape}, dtype {self.data[key].dtype}")
 
     def is_preprocessed(self):
         predicats = [copy_exists(p_join(self.preprocess_path, f"{key}.mmap")) for key in self.data_keys]
-        predicats += [copy_exists(p_join(self.preprocess_path, "props.pkl.gz"))]
+        predicats += [copy_exists(p_join(self.preprocess_path, "props.pkl"))]
         return all(predicats)
 
-    def preprocess(self):
-        if not self.is_preprocessed():
+    def preprocess(self, overwrite=False):
+        if overwrite or not self.is_preprocessed():
             entries = self.read_raw_entries()
             res = self.collate_list(entries)
             self.save_preprocess(res)
@@ -319,7 +302,7 @@ def get_ase_atoms(self, idx: int, ext=True):
 
     @requires_package("dscribe")
     @requires_package("datamol")
-    def chemical_space(
+    def soap_descriptors(
         self,
         n_samples: Optional[Union[List[int], int]] = None,
         return_idxs: bool = True,
@@ -364,7 +347,7 @@ def chemical_space(
             idxs = list(range(len(self)))
         elif isinstance(n_samples, int):
             idxs = np.random.choice(len(self), size=n_samples, replace=False)
-        elif isinstance(n_samples, list):
+        else:  # list, set, np.ndarray
             idxs = n_samples
         datum = {}
         r_cut = soap_kwargs.pop("r_cut", 5.0)
@@ -406,6 +389,12 @@ def wrapper(idx):
     def __len__(self):
         return self.data["energies"].shape[0]
 
+    def __smiles_converter__(self, x):
+        """util function to convert string to smiles: useful if the smiles is
+        encoded in a different format than its display format
+        """
+        return x
+
     def __getitem__(self, idx: int):
         shift = IsolatedAtomEnergyFactory.max_charge
         p_start, p_end = self.data["position_idx_range"][idx]
@@ -416,7 +405,7 @@ def __getitem__(self, idx: int):
             self.convert_distance(np.array(input[:, -3:], dtype=np.float32)),
             self.convert_energy(np.array(self.data["energies"][idx], dtype=np.float32)),
         )
-        name = self.data["name"][idx]
+        name = self.__smiles_converter__(self.data["name"][idx])
         subset = self.data["subset"][idx]
 
         if "forces" in self.data:
diff --git a/src/openqdc/datasets/comp6.py b/src/openqdc/datasets/comp6.py
index c95ec17..7b6890b 100644
--- a/src/openqdc/datasets/comp6.py
+++ b/src/openqdc/datasets/comp6.py
@@ -35,8 +35,8 @@ class COMP6(BaseDataset):
         "pbe-d3bj/def2-tzvp",
         "pbe/def2-tzvp",
         "svwn/def2-tzvp",
-        "wb97m-d3bj/def2-tzvp",
-        "wb97m/def2-tzvp",
+        # "wb97m-d3bj/def2-tzvp",
+        # "wb97m/def2-tzvp",
     ]
 
     energy_target_names = [
@@ -47,8 +47,8 @@ class COMP6(BaseDataset):
         "PBE-D3M(BJ):def2-tzvp",
         "PBE:def2-tzvp",
         "SVWN:def2-tzvp",
-        "WB97M-D3(BJ):def2-tzvp",
-        "WB97M:def2-tzvp",
+        # "WB97M-D3(BJ):def2-tzvp",
+        # "WB97M:def2-tzvp",
     ]
 
     __force_methods__ = [
@@ -59,6 +59,12 @@ class COMP6(BaseDataset):
         "Gradient",
     ]
 
+    def __smiles_converter__(self, x):
+        """util function to convert string to smiles: useful if the smiles is
+        encoded in a different format than its display format
+        """
+        return "-".join(x.decode("ascii").split("_")[:-1])
+
     def read_raw_entries(self):
         samples = []
         for subset in ["ani_md", "drugbank", "gdb7_9", "gdb10_13", "s66x8", "tripeptides"]:
diff --git a/src/openqdc/datasets/iso_17.py b/src/openqdc/datasets/iso_17.py
index 735ae67..4553ec1 100644
--- a/src/openqdc/datasets/iso_17.py
+++ b/src/openqdc/datasets/iso_17.py
@@ -43,6 +43,12 @@ class ISO17(BaseDataset):
     __distance_unit__ = "bohr"  # bohr
     __forces_unit__ = "ev/bohr"
 
+    def __smiles_converter__(self, x):
+        """util function to convert string to smiles: useful if the smiles is
+        encoded in a different format than its display format
+        """
+        return "-".join(x.decode("ascii").split("_")[:-1])
+
     def read_raw_entries(self):
         raw_path = p_join(self.root, "iso_17.h5")
         samples = read_qc_archive_h5(raw_path, "iso_17", self.energy_target_names, self.force_target_names)
diff --git a/src/openqdc/datasets/nabladft.py b/src/openqdc/datasets/nabladft.py
index e7d9eb8..0555cdc 100644
--- a/src/openqdc/datasets/nabladft.py
+++ b/src/openqdc/datasets/nabladft.py
@@ -4,30 +4,32 @@
 
 import datamol as dm
 import numpy as np
-from tqdm import tqdm
+import pandas as pd
 
 from openqdc.datasets.base import BaseDataset
+from openqdc.utils.molecule import z_to_formula
 from openqdc.utils.package_utils import requires_package
 
 
-def to_mol(entry) -> Dict[str, np.ndarray]:
+def to_mol(entry, metadata) -> Dict[str, np.ndarray]:
     Z, R, E, F = entry[:4]
     C = np.zeros_like(Z)
+    E[0] = metadata["DFT TOTAL ENERGY"]
 
     res = dict(
         atomic_inputs=np.concatenate((Z[:, None], C[:, None], R), axis=-1).astype(np.float32),
-        name=np.array([""]),
+        name=np.array([metadata["SMILES"]]),
         energies=E[:, None].astype(np.float32),
         forces=F[:, :, None].astype(np.float32),
         n_atoms=np.array([Z.shape[0]], dtype=np.int32),
-        subset=np.array(["nabla"]),
+        subset=np.array([z_to_formula(Z)]),
     )
 
     return res
 
 
 @requires_package("nablaDFT")
-def read_chunk_from_db(raw_path, start_idx, stop_idx, step_size=1000):
+def read_chunk_from_db(raw_path, start_idx, stop_idx, labels, step_size=1000):
     from nablaDFT.dataset import HamiltonianDatabase
 
     print(f"Loading from {start_idx} to {stop_idx}")
@@ -35,7 +37,13 @@ def read_chunk_from_db(raw_path, start_idx, stop_idx, step_size=1000):
     idxs = list(np.arange(start_idx, stop_idx))
     n, s = len(idxs), step_size
 
-    samples = [to_mol(entry) for i in tqdm(range(0, n, s)) for entry in db[idxs[i : i + s]]]
+    cursor = db._get_connection().cursor()
+    data_idxs = cursor.execute("""SELECT * FROM dataset_ids WHERE id IN (""" + str(idxs)[1:-1] + ")").fetchall()
+    c_idxs = [tuple(x[1:]) for x in data_idxs]
+
+    samples = [
+        to_mol(entry, labels[c_idxs[i + j]]) for i in range(0, n, s) for j, entry in enumerate(db[idxs[i : i + s]])
+    ]
     return samples
 
 
@@ -68,12 +76,16 @@ class NablaDFT(BaseDataset):
     def read_raw_entries(self):
         from nablaDFT.dataset import HamiltonianDatabase
 
+        label_path = p_join(self.root, "summary.csv")
+        df = pd.read_csv(label_path, usecols=["MOSES id", "CONFORMER id", "SMILES", "DFT TOTAL ENERGY"])
+        labels = df.set_index(keys=["MOSES id", "CONFORMER id"]).to_dict("index")
+
         raw_path = p_join(self.root, "dataset_full.db")
         train = HamiltonianDatabase(raw_path)
         n, c = len(train), 20
         step_size = int(np.ceil(n / os.cpu_count()))
 
-        fn = lambda i: read_chunk_from_db(raw_path, i * step_size, min((i + 1) * step_size, n))
+        fn = lambda i: read_chunk_from_db(raw_path, i * step_size, min((i + 1) * step_size, n), labels=labels)
         samples = dm.parallelized(
             fn, list(range(c)), n_jobs=c, progress=False, scheduler="threads"
         )  # don't use more than 1 job
diff --git a/src/openqdc/datasets/pcqm.py b/src/openqdc/datasets/pcqm.py
index 73f1c68..d1a344c 100644
--- a/src/openqdc/datasets/pcqm.py
+++ b/src/openqdc/datasets/pcqm.py
@@ -1,4 +1,3 @@
-import gzip
 import json
 import os
 import pickle as pkl
@@ -90,7 +89,11 @@ def preprocess_path(self):
     def collate_list(self, list_entries):
         predicat = list_entries is not None and len(list_entries) > 0
         list_entries = [x for x in list_entries if x is not None]
-        return super().collate_list(list_entries) if predicat else None
+        if predicat:
+            res = super().collate_list(list_entries)
+        else:
+            res = None
+        return res
 
     def read_raw_entries(self):
         arxiv_paths = glob(p_join(self.root, f"{self.__energy_methods__[0]}", "*.pkl"))
@@ -99,8 +102,8 @@ def read_raw_entries(self):
         samples = [x for x in samples if x is not None]
         return samples
 
-    def preprocess(self):
-        if not self.is_preprocessed():
+    def preprocess(self, overwrite=False):
+        if overwrite or not self.is_preprocessed():
             logger.info("Preprocessing data and saving it to cache.")
             logger.info(
                 f"Dataset {self.__name__} data with the following units:\n"
@@ -132,13 +135,19 @@ def collate_and_save_list(self, list_entries):
             push_remote(local_path, overwrite=True)
 
         # save smiles and subset
-        tmp = dict()
-        local_path = p_join(self.preprocess_path, "props.pkl.gz")
-        for key in ["name", "subset"]:
-            local_path = p_join(self.preprocess_path, f"{key}.npz")
-            tmp[key] = [el for i in range(len(list_entries)) for el in list_entries[i].pop(key)]
-        with gzip.open(local_path, "wb") as f:
-            pkl.dump(x, f)
+        tmp, n = dict(name=[]), len(list_entries)
+        local_path = p_join(self.preprocess_path, "props.pkl")
+        names = [list_entries[i].pop("name") for i in range(n)]
+        f = lambda xs: [dm.to_inchikey(x) for x in xs]
+        res = dm.parallelized(f, names, n_jobs=-1, progress=False)
+        for x in res:
+            tmp["name"] += x
+        for key in ["subset", "n_atoms"]:
+            tmp[key] = []
+            for i in range(n):
+                tmp[key] += list(list_entries[i].pop(key))
+        with open(local_path, "wb") as f:
+            pkl.dump(tmp, f)
         push_remote(local_path, overwrite=True)
         # for key in ["name", "subset"]:
         #     local_path = p_join(self.preprocess_path, f"{key}.npz")
diff --git a/src/openqdc/datasets/sn2_rxn.py b/src/openqdc/datasets/sn2_rxn.py
index 3e75e91..abcbd62 100644
--- a/src/openqdc/datasets/sn2_rxn.py
+++ b/src/openqdc/datasets/sn2_rxn.py
@@ -25,8 +25,38 @@ class SN2RXN(BaseDataset):
         "DSD-BLYP-D3(BJ):def2-TZVP Gradient",
     ]
 
+    def __smiles_converter__(self, x):
+        """util function to convert string to smiles: useful if the smiles is
+        encoded in a different format than its display format
+        """
+        return "-".join(x.decode("ascii").split("_")[:-1])
+
     def read_raw_entries(self):
         raw_path = p_join(self.root, "sn2_rxn.h5")
+
+        # raw_path = p_join(self.root, "sn2_reactions.npz")
+        # data = np.load(raw_path)
+
+        # # as example for accessing individual entries, print the data for entry idx=0
+        # idx = 0
+        # print("Data for entry " + str(idx)+":")
+        # print("Number of atoms")
+        # print(data["N"][idx])
+        # print("Energy [eV]")
+        # print(data["E"][idx])
+        # print("Total charge")
+        # print(data["Q"][idx])
+        # print("Dipole moment vector (with respect to [0.0 0.0 0.0]) [eA]")
+        # print(data["D"][idx,:])
+        # print("Nuclear charges")
+        # print(data["Z"][idx,:data["N"][idx]])
+        # print("Cartesian coordinates [A]")
+        # print(data["R"][idx,:data["N"][idx],:])
+        # print("Forces [eV/A]")
+        # print(data["F"][idx,:data["N"][idx],:])
+
+        # exit()
+
         samples = read_qc_archive_h5(raw_path, "sn2_rxn", self.energy_target_names, self.force_target_names)
 
         return samples
diff --git a/src/openqdc/datasets/solvated_peptides.py b/src/openqdc/datasets/solvated_peptides.py
index 9846bdf..216ecdd 100644
--- a/src/openqdc/datasets/solvated_peptides.py
+++ b/src/openqdc/datasets/solvated_peptides.py
@@ -27,6 +27,12 @@ class SolvatedPeptides(BaseDataset):
     __distance_unit__ = "bohr"
     __forces_unit__ = "hartree/bohr"
 
+    def __smiles_converter__(self, x):
+        """util function to convert string to smiles: useful if the smiles is
+        encoded in a different format than its display format
+        """
+        return "_".join(x.decode("ascii").split("_")[:-1])
+
     def read_raw_entries(self):
         raw_path = p_join(self.root, "solvated_peptides.h5")
         samples = read_qc_archive_h5(raw_path, "solvated_peptides", self.energy_target_names, self.force_target_names)
diff --git a/src/openqdc/raws/config_factory.py b/src/openqdc/raws/config_factory.py
index 38bec86..c8dddba 100644
--- a/src/openqdc/raws/config_factory.py
+++ b/src/openqdc/raws/config_factory.py
@@ -37,7 +37,7 @@ class DataConfigFactory:
 
     sn2_rxn = dict(
         dataset_name="sn2_rxn",
-        links={"sn2_rxn.hdf5.gz": "https://zenodo.org/record/3585800/files/212.hdf5.gz"},
+        links={"sn2_rxn.hdf5.gz": "https://zenodo.org/records/2605341/files/sn2_reactions.npz"},
     )
 
     # FROM: https://sites.uw.edu/wdbase/database-of-water-clusters/
diff --git a/src/openqdc/utils/atomization_energies.py b/src/openqdc/utils/atomization_energies.py
index 40d0d13..6a1a638 100644
--- a/src/openqdc/utils/atomization_energies.py
+++ b/src/openqdc/utils/atomization_energies.py
@@ -2,124 +2,126 @@
 
 import numpy as np
 from loguru import logger
+from rdkit import Chem
 
 from openqdc.utils.constants import MAX_ATOMIC_NUMBER
 
+atom_table = Chem.GetPeriodicTable()
+
 __all__ = ["chemical_symbols", "atomic_numbers", "IsolatedAtomEnergyFactory"]
 
 EF_KEY: TypeAlias = Tuple[str, int]
 
-ATOM_SPECIES = "H", "Li", "B", "C", "N", "O", "F", "Na", "Mg", "Si", "P", "S", "Cl", "K", "Ca", "Br", "I"
-# Energy in atomic unit/ Hartree / Ang
-
 # didn t calculate for Pd, Pt, Mo, Ni, Fe, Cu, see DESS
 atomic_numbers = {}
-chemical_symbols = [
-    "X",
-    "H",
-    "He",
-    "Li",
-    "Be",
-    "B",
-    "C",
-    "N",
-    "O",
-    "F",
-    "Ne",
-    "Na",
-    "Mg",
-    "Al",
-    "Si",
-    "P",
-    "S",
-    "Cl",
-    "Ar",
-    "K",
-    "Ca",
-    "Sc",
-    "Ti",
-    "V",
-    "Cr",
-    "Mn",
-    "Fe",
-    "Co",
-    "Ni",
-    "Cu",
-    "Zn",
-    "Ga",
-    "Ge",
-    "As",
-    "Se",
-    "Br",
-    "Kr",
-    "Rb",
-    "Sr",
-    "Y",
-    "Zr",
-    "Nb",
-    "Mo",
-    "Tc",
-    "Ru",
-    "Rh",
-    "Pd",
-    "Ag",
-    "Cd",
-    "In",
-    "Sn",
-    "Sb",
-    "Te",
-    "I",
-    "Xe",
-    "Cs",
-    "Ba",
-    "La",
-    "Ce",
-    "Pr",
-    "Nd",
-    "Pm",
-    "Sm",
-    "Eu",
-    "Gd",
-    "Tb",
-    "Dy",
-    "Ho",
-    "Er",
-    "Tm",
-    "Yb",
-    "Lu",
-    "Hf",
-    "Ta",
-    "W",
-    "Re",
-    "Os",
-    "Ir",
-    "Pt",
-    "Au",
-    "Hg",
-    "Tl",
-    "Pb",
-    "Bi",
-    "Po",
-    "At",
-    "Rn",
-    "Fr",
-    "Ra",
-    "Ac",
-    "Th",
-    "Pa",
-    "U",
-    "Np",
-    "Pu",
-    "Am",
-    "Cm",
-    "Bk",
-    "Cf",
-    "Es",
-    "Fm",
-    "Md",
-    "No",
-    "Lr",
-]
+chemical_symbols = np.array(
+    [
+        "X",
+        "H",
+        "He",
+        "Li",
+        "Be",
+        "B",
+        "C",
+        "N",
+        "O",
+        "F",
+        "Ne",
+        "Na",
+        "Mg",
+        "Al",
+        "Si",
+        "P",
+        "S",
+        "Cl",
+        "Ar",
+        "K",
+        "Ca",
+        "Sc",
+        "Ti",
+        "V",
+        "Cr",
+        "Mn",
+        "Fe",
+        "Co",
+        "Ni",
+        "Cu",
+        "Zn",
+        "Ga",
+        "Ge",
+        "As",
+        "Se",
+        "Br",
+        "Kr",
+        "Rb",
+        "Sr",
+        "Y",
+        "Zr",
+        "Nb",
+        "Mo",
+        "Tc",
+        "Ru",
+        "Rh",
+        "Pd",
+        "Ag",
+        "Cd",
+        "In",
+        "Sn",
+        "Sb",
+        "Te",
+        "I",
+        "Xe",
+        "Cs",
+        "Ba",
+        "La",
+        "Ce",
+        "Pr",
+        "Nd",
+        "Pm",
+        "Sm",
+        "Eu",
+        "Gd",
+        "Tb",
+        "Dy",
+        "Ho",
+        "Er",
+        "Tm",
+        "Yb",
+        "Lu",
+        "Hf",
+        "Ta",
+        "W",
+        "Re",
+        "Os",
+        "Ir",
+        "Pt",
+        "Au",
+        "Hg",
+        "Tl",
+        "Pb",
+        "Bi",
+        "Po",
+        "At",
+        "Rn",
+        "Fr",
+        "Ra",
+        "Ac",
+        "Th",
+        "Pa",
+        "U",
+        "Np",
+        "Pu",
+        "Am",
+        "Cm",
+        "Bk",
+        "Cf",
+        "Es",
+        "Fm",
+        "Md",
+        "No",
+        "Lr",
+    ]
+)
 
 
 for Z, symbol in enumerate(chemical_symbols):
@@ -131,7 +133,7 @@ class IsolatedAtomEnergyFactory:
     Factory method to get the isolated atom energies for a given level of theory.
     """
 
-    max_charge = 4
+    max_charge = 6
 
     def __init__(self):
         pass
@@ -207,7 +209,14 @@ def get_matrix(level_of_theory: str) -> np.ndarray:
         if tuple_hashmap is None:
             return matrix
         for key in tuple_hashmap.keys():
-            matrix[atomic_numbers[key[0]], key[1] + shift] = tuple_hashmap[key]
+            try:
+                matrix[atomic_numbers[key[0]], key[1] + shift] = tuple_hashmap[key]
+            except KeyError:
+                print(key, list(tuple_hashmap.items()))
+                print(key[0], "?", key[1], "?", shift)
+                print(matrix.shape, atomic_numbers[key[0]], key[1] + shift)
+                logger.warning(f"Isolated atom energies not found for {key} and level of theory {level_of_theory}")
+                matrix[atomic_numbers[key[0]], key[1] + shift] = 0
         return matrix
 
 
diff --git a/src/openqdc/utils/molecule.py b/src/openqdc/utils/molecule.py
index 8bd2b83..82a58d2 100644
--- a/src/openqdc/utils/molecule.py
+++ b/src/openqdc/utils/molecule.py
@@ -4,9 +4,19 @@
 from numpy import ndarray
 from rdkit import Chem
 
+from openqdc.utils.atomization_energies import chemical_symbols
+
 atom_table = Chem.GetPeriodicTable()
 
 
+def z_to_formula(z):
+    u, c = np.unique(z, return_counts=True)
+    idxs = np.argsort(u)
+    u, c = u[idxs], c[idxs]
+
+    return "".join([f"{chemical_symbols[u[i]]}{c[i] if c[i] > 1 else ''}" for i in range(len(u))])
+
+
 def get_atomic_number(mol: Chem.Mol):
     """Returns atomic numbers for rdkit molecule"""
     return np.array([atom.GetAtomicNum() for atom in mol.GetAtoms()])
diff --git a/src/openqdc/utils/preprocess.py b/src/openqdc/utils/preprocess.py
index 1142dca..b34499e 100644
--- a/src/openqdc/utils/preprocess.py
+++ b/src/openqdc/utils/preprocess.py
@@ -36,9 +36,11 @@
 def preprocess(dataset):
     if dataset not in options_map:
         dataset_id = int(dataset)
+        data_class = options[dataset_id]
+    else:
+        data_class = options_map[dataset]
 
-    data_class = options[dataset_id]
-    data_class().preprocess()
+    data_class().preprocess(overwrite=False)
     data = data_class()
     logger.info(f"Preprocessing {data.__name__}")
 
@@ -47,7 +49,7 @@ def preprocess(dataset):
         x = data[i]
         print(x.name, x.subset, end=" ")
         for k in x:
-            if x[k] is not None:
+            if isinstance(x[k], np.ndarray):
                 print(k, x[k].shape, end=" ")
         print()
 
diff --git a/src/openqdc/utils/units.py b/src/openqdc/utils/units.py
index fb895ce..69c8972 100644
--- a/src/openqdc/utils/units.py
+++ b/src/openqdc/utils/units.py
@@ -73,3 +73,6 @@ def get_conversion(in_unit: str, out_unit: str):
 Conversion("hartree/ang", "kcal/mol/ang", lambda x: get_conversion("hartree", "kcal/mol")(x))
 Conversion("hartree/ang", "hartree/bohr", lambda x: get_conversion("bohr", "ang")(x))
 Conversion("hartree/bohr", "hartree/ang", lambda x: get_conversion("ang", "bohr")(x))
+Conversion("ev/bohr", "kcal/mol/ang", lambda x: get_conversion("ang", "bohr")(get_conversion("ev", "kcal/mol")(x)))
+Conversion("kcal/mol/bohr", "kcal/mol/ang", lambda x: get_conversion("ang", "bohr")(x))
+Conversion("ev/ang", "kcal/mol/ang", lambda x: get_conversion("ev", "kcal/mol")(x))

From 2bf9050a4014bbc998829a4e8940305b65ade0c9 Mon Sep 17 00:00:00 2001
From: FNTwin <cris.gabellini@gmail.com>
Date: Tue, 28 Nov 2023 19:39:49 +0000
Subject: [PATCH 13/18] On-the-fly calculation of dataset statistics

---
 src/openqdc/datasets/ani.py               | 172 ----------------------
 src/openqdc/datasets/base.py              |  61 ++++++--
 src/openqdc/datasets/comp6.py             |  91 ------------
 src/openqdc/datasets/dess.py              |  50 -------
 src/openqdc/datasets/gdml.py              |  92 ------------
 src/openqdc/datasets/geom.py              |  22 ---
 src/openqdc/datasets/iso_17.py            |  42 ------
 src/openqdc/datasets/molecule3d.py        |  22 ---
 src/openqdc/datasets/orbnet_denali.py     |  22 ---
 src/openqdc/datasets/qm7x.py              |  75 ----------
 src/openqdc/datasets/qmugs.py             |  22 ---
 src/openqdc/datasets/sn2_rxn.py           |   3 -
 src/openqdc/datasets/solvated_peptides.py |  44 ------
 src/openqdc/datasets/spice.py             |  37 -----
 src/openqdc/datasets/tmqm.py              |  22 ---
 15 files changed, 52 insertions(+), 725 deletions(-)

diff --git a/src/openqdc/datasets/ani.py b/src/openqdc/datasets/ani.py
index 23c890f..3f1b92b 100644
--- a/src/openqdc/datasets/ani.py
+++ b/src/openqdc/datasets/ani.py
@@ -1,10 +1,7 @@
 import os
 from os.path import join as p_join
 
-from numpy import array, float32
-
 from openqdc.datasets.base import BaseDataset, read_qc_archive_h5
-from openqdc.utils.constants import NOT_DEFINED
 from openqdc.utils.io import get_local_cache
 
 
@@ -37,7 +34,6 @@ class ANI1(BaseDataset):
     __energy_unit__ = "hartree"
     __distance_unit__ = "bohr"
     __forces_unit__ = "hartree/bohr"
-    __average_nb_atoms__ = 15.91676229984414
 
     @property
     def root(self):
@@ -60,25 +56,6 @@ def read_raw_entries(self):
         samples = read_qc_archive_h5(raw_path, self.__name__, self.energy_target_names, self.force_target_names)
         return samples
 
-    @property
-    def _stats(self):
-        return {
-            "formation": {
-                "energy": {
-                    "mean": self.convert_energy(array([-2.37376472])),
-                    "std": self.convert_energy(array([0.50266975])),
-                },
-                "forces": NOT_DEFINED,
-            },
-            "total": {
-                "energy": {
-                    "mean": self.convert_energy(array([-333.67322], dtype=float32)),
-                    "std": self.convert_energy(array([61.21667], dtype=float32)),
-                },
-                "forces": NOT_DEFINED,
-            },
-        }
-
 
 class ANI1CCX(ANI1):
     """
@@ -100,7 +77,6 @@ class ANI1CCX(ANI1):
     __energy_unit__ = "hartree"
     __distance_unit__ = "ang"
     __forces_unit__ = "hartree/ang"
-    __average_nb_atoms__ = 15.274685315870588
 
     __energy_methods__ = [
         "ccsd(t)/cbs",
@@ -119,25 +95,6 @@ class ANI1CCX(ANI1):
     __force_methods__ = []
     force_target_names = []
 
-    @property
-    def _stats(self):
-        return {
-            "formation": {
-                "energy": {
-                    "mean": self.convert_energy(array([-3.23959548, 500.30384627, 500.40706776, 500.76740432])),
-                    "std": self.convert_energy(array([1.03021261, 132.52707152, 132.56092469, 132.65261362])),
-                },
-                "forces": NOT_DEFINED,
-            },
-            "total": {
-                "energy": {
-                    "mean": self.convert_energy(array([-374.40665, -1.2378153, -1.505962, -1.2396905], dtype=float32)),
-                    "std": self.convert_energy(array([101.63995, 0.32444745, 0.39500558, 0.3250212], dtype=float32)),
-                },
-                "forces": NOT_DEFINED,
-            },
-        }
-
     def __smiles_converter__(self, x):
         """util function to convert string to smiles: useful if the smiles is
         encoded in a different format than its display format
@@ -198,135 +155,6 @@ class ANI1X(ANI1):
         "wb97x/cc-pvtz",
     ]
 
-    __average_nb_atoms__ = 15.274685315870588
-
-    @property
-    def _stats(self):
-        return {
-            "formation": {
-                "energy": {
-                    "mean": self.convert_energy(
-                        array(
-                            [
-                                -2.87910686,
-                                -2.91460298,
-                                -2.91182519,
-                                500.00748497,
-                                500.27885605,
-                                500.11130961,
-                                -3.66090173,
-                                -4.40643278,
-                            ]
-                        )
-                    ),
-                    "std": self.convert_energy(
-                        array(
-                            [
-                                0.92849657,
-                                0.93421854,
-                                0.93411345,
-                                132.44580372,
-                                132.52326771,
-                                132.47987395,
-                                1.60180792,
-                                1.75414812,
-                            ]
-                        )
-                    ),
-                },
-                "forces": {
-                    "mean": self.convert_forces(array([-6.139757e-06])),
-                    "std": self.convert_forces(array([0.07401004])),
-                    "components": {
-                        "mean": self.convert_forces(
-                            array(
-                                [
-                                    [6.6829815e-13, 3.5682501e-07],
-                                    [-5.1223647e-13, -1.8487021e-06],
-                                    [8.1159564e-13, -3.6849189e-05],
-                                ],
-                                dtype=float32,
-                            )
-                        ),
-                        "std": self.convert_forces(
-                            array(
-                                [[0.0759203, 0.06799112], [0.07694941, 0.06652647], [0.06229663, 0.05442103]],
-                                dtype=float32,
-                            )
-                        ),
-                        "rms": self.convert_forces(
-                            array(
-                                [[0.0759203, 0.06799113], [0.07694941, 0.06652647], [0.06229663, 0.05442095]],
-                                dtype=float32,
-                            )
-                        ),
-                    },
-                },
-            },
-            "total": {
-                "energy": {
-                    "mean": self.convert_energy(
-                        array(
-                            [
-                                -372.68945,
-                                -372.74274,
-                                -372.7326,
-                                -1.1540408,
-                                -1.5152899,
-                                -1.4195863,
-                                -392.72458,
-                                -391.208,
-                            ],
-                            dtype=float32,
-                        )
-                    ),
-                    "std": self.convert_energy(
-                        array(
-                            [
-                                101.166664,
-                                101.19915,
-                                101.191895,
-                                0.30445468,
-                                0.39988872,
-                                0.37456134,
-                                136.79112,
-                                137.48692,
-                            ],
-                            dtype=float32,
-                        )
-                    ),
-                },
-                "forces": {
-                    "mean": self.convert_forces(array([-6.139757e-06])),
-                    "std": self.convert_forces(array([0.07401004])),
-                    "components": {
-                        "mean": self.convert_forces(
-                            array(
-                                [
-                                    [6.6829815e-13, 3.5682501e-07],
-                                    [-5.1223647e-13, -1.8487021e-06],
-                                    [8.1159564e-13, -3.6849189e-05],
-                                ],
-                                dtype=float32,
-                            )
-                        ),
-                        "std": self.convert_forces(
-                            array(
-                                [[0.0759203, 0.06799112], [0.07694941, 0.06652647], [0.06229663, 0.05442103]],
-                                dtype=float32,
-                            )
-                        ),
-                        "rms": self.convert_forces(
-                            array(
-                                [[0.0759203, 0.06799113], [0.07694941, 0.06652647], [0.06229663, 0.05442095]],
-                                dtype=float32,
-                            )
-                        ),
-                    },
-                },
-            },
-        }
-
     def convert_forces(self, x):
         return super().convert_forces(x) * 0.529177249  # correct the Dataset error
 
diff --git a/src/openqdc/datasets/base.py b/src/openqdc/datasets/base.py
index 34f2ac9..6c939e7 100644
--- a/src/openqdc/datasets/base.py
+++ b/src/openqdc/datasets/base.py
@@ -15,7 +15,11 @@
     IsolatedAtomEnergyFactory,
     chemical_symbols,
 )
-from openqdc.utils.constants import NB_ATOMIC_FEATURES, POSSIBLE_NORMALIZATION
+from openqdc.utils.constants import (
+    NB_ATOMIC_FEATURES,
+    NOT_DEFINED,
+    POSSIBLE_NORMALIZATION,
+)
 from openqdc.utils.exceptions import (
     PROPERTY_NOT_AVAILABLE_ERROR,
     DatasetNotAvailableError,
@@ -91,6 +95,7 @@ class BaseDataset(torch.utils.data.Dataset):
     __fn_distance__ = lambda x: x
     __fn_forces__ = lambda x: x
     __average_nb_atoms__ = None
+    __stats__ = {}
 
     def __init__(
         self,
@@ -107,14 +112,52 @@ def __init__(
         else:
             self.read_preprocess(overwrite_local_cache=overwrite_local_cache)
         self._set_isolated_atom_energies()
+        self._precompute_statistics()
+
+    def _precompute_statistics(self):
+        logger.info("Precomputing relevant statistics")
+        self._compute_average_nb_atoms()
+        (formation_E_mean, formation_E_std, total_E_mean, total_E_std) = self._precompute_E()
+        forces_dict = self._precompute_F()
+        self.__stats__ = {
+            "formation": {"energy": {"mean": formation_E_mean, "std": formation_E_std}, "forces": forces_dict},
+            "total": {"energy": {"mean": total_E_mean, "std": total_E_std}, "forces": forces_dict},
+        }
 
-    def _download(self):
-        try:
-            self.read_preprocess(overwrite_local_cache=True)
-            if not self.is_preprocessed():
-                raise Exception
-        except Exception:
-            raise DatasetNotAvailableError(self.__name__)
+    def _compute_average_nb_atoms(self):
+        self.__average_nb_atoms__ = np.mean(self.data["n_atoms"])
+
+    def _precompute_E(self):
+        splits_idx = self.data["position_idx_range"][:, 1]
+        s = np.array(self.data["atomic_inputs"][:, :2], dtype=int)
+        s[:, 1] += IsolatedAtomEnergyFactory.max_charge
+        matrixs = [matrix[s[:, 0], s[:, 1]] for matrix in self.__isolated_atom_energies__]
+        matrixs = [np.split(matrix, splits_idx)[:-1] for matrix in matrixs]
+        converted_energy_data = self.convert_energy(self.data["energies"])
+        # calculation per molecule formation energy statistics
+        e = []
+        for i in range(len(self.__energy_methods__)):
+            e.append(converted_energy_data[:, i] - np.array(list(map(lambda x: x.sum(), matrixs[i]))))
+        E = np.array(e).T
+        formation_E_mean = np.nanmean(E, axis=0)
+        formation_E_std = np.nanstd(E, axis=0)
+        total_E_mean = np.nanmean(converted_energy_data, axis=0)
+        total_E_std = np.nanstd(converted_energy_data, axis=0)
+
+        return formation_E_mean, formation_E_std, total_E_mean, total_E_std
+
+    def _precompute_F(self):
+        if len(self.__force_methods__) == 0:
+            return NOT_DEFINED
+        converted_force_data = self.convert_forces(self.data["forces"])
+        force_mean = np.nanmean(converted_force_data, axis=0)
+        force_std = np.nanstd(converted_force_data, axis=0)
+        force_rms = np.sqrt(np.nanmean(converted_force_data**2, axis=0))
+        return {
+            "mean": force_mean,
+            "std": force_std,
+            "components": {"rms": force_rms, "std": force_std.mean(axis=0), "mean": force_mean.mean(axis=0)},
+        }
 
     @property
     def numbers(self):
@@ -447,7 +490,7 @@ def __repr__(self):
 
     @property
     def _stats(self):
-        return {}
+        return self.__stats__
 
     @property
     def average_n_atoms(self):
diff --git a/src/openqdc/datasets/comp6.py b/src/openqdc/datasets/comp6.py
index 4641d7f..7b6890b 100644
--- a/src/openqdc/datasets/comp6.py
+++ b/src/openqdc/datasets/comp6.py
@@ -1,7 +1,5 @@
 from os.path import join as p_join
 
-from numpy import array, float32, nan
-
 from openqdc.datasets.base import BaseDataset, read_qc_archive_h5
 
 
@@ -61,95 +59,6 @@ class COMP6(BaseDataset):
         "Gradient",
     ]
 
-    __average_nb_atoms__ = 25.74051563378753
-
-    @property
-    def _stats(self):
-        return {
-            "formation": {
-                "energy": {
-                    "mean": self.convert_energy(
-                        array(
-                            [
-                                -2579.52016333,
-                                -2543.74519203,
-                                -354694.46157991,
-                                -2506.4300631,
-                                -2616.89224817,
-                                -3157.54118509,
-                                -354031.62984212,
-                                nan,
-                                nan,
-                            ]
-                        )
-                    ),
-                    "std": self.convert_energy(
-                        array(
-                            [
-                                1811.03171965,
-                                1471.95818836,
-                                201545.89189168,
-                                1385.50993753,
-                                1456.09915473,
-                                1728.51133182,
-                                208097.95666257,
-                                nan,
-                                nan,
-                            ]
-                        )
-                    ),
-                },
-                "forces": {
-                    "mean": self.convert_forces(6.6065984e-13),
-                    "std": self.convert_forces(0.056459695),
-                    "components": {
-                        "mean": self.convert_forces(
-                            array([[-4.1767219e-13], [1.0024132e-12], [-9.4386771e-13]], dtype=float32)
-                        ),
-                        "std": self.convert_forces(array([[0.05781676], [0.05793402], [0.05330585]], dtype=float32)),
-                        "rms": self.convert_forces(array([[0.05781676], [0.05793402], [0.05330585]], dtype=float32)),
-                    },
-                },
-            },
-            "total": {
-                "energy": {
-                    "mean": self.convert_energy(
-                        array(
-                            [
-                                -360972.16,
-                                -354729.66,
-                                -354699.38,
-                                -349555.7,
-                                -351555.97,
-                                -351530.44,
-                                -354027.8,
-                                nan,
-                                nan,
-                            ],
-                            dtype=float32,
-                        ),
-                    ),
-                    "std": self.convert_energy(
-                        array(
-                            [254766.0, 201559.77, 201537.8, 188725.47, 191028.78, 191016.1, 208089.4, nan, nan],
-                            dtype=float32,
-                        ),
-                    ),
-                },
-                "forces": {
-                    "mean": self.convert_forces(array([6.6065984e-13])),
-                    "std": self.convert_forces(array([0.056459695])),
-                    "components": {
-                        "mean": self.convert_forces(
-                            array([[-4.1767219e-13], [1.0024132e-12], [-9.4386771e-13]], dtype=float32)
-                        ),
-                        "std": self.convert_forces(array([[0.05781676], [0.05793402], [0.05330585]], dtype=float32)),
-                        "rms": self.convert_forces(array([[0.05781676], [0.05793402], [0.05330585]], dtype=float32)),
-                    },
-                },
-            },
-        }
-
     def __smiles_converter__(self, x):
         """util function to convert string to smiles: useful if the smiles is
         encoded in a different format than its display format
diff --git a/src/openqdc/datasets/dess.py b/src/openqdc/datasets/dess.py
index 6651bf7..80b1e1c 100644
--- a/src/openqdc/datasets/dess.py
+++ b/src/openqdc/datasets/dess.py
@@ -3,11 +3,9 @@
 import datamol as dm
 import numpy as np
 import pandas as pd
-from numpy import array, float32
 from tqdm import tqdm
 
 from openqdc.datasets.base import BaseDataset
-from openqdc.utils.constants import NOT_DEFINED
 from openqdc.utils.molecule import get_atomic_number_and_charge
 
 
@@ -63,54 +61,6 @@ class DESS(BaseDataset):
     # ['qz_MP2_all', 'tz_MP2_all', 'cbs_MP2_all', 'sapt_all', 'nn_CCSD(T)_all']
 
     partitions = ["DES370K", "DES5M"]
-    __average_nb_atoms__ = 18.944905540514252
-
-    @property
-    def _stats(self):
-        return {
-            "formation": {
-                "energy": {
-                    "mean": array(
-                        [
-                            1.55021905e03,
-                            7.85244098e02,
-                            7.85022111e02,
-                            7.84864329e02,
-                            1.55072999e03,
-                            1.56925354e03,
-                            -7.76931930e-01,
-                            -2.29679257e00,
-                        ]
-                    ),
-                    "std": array(
-                        [
-                            1.89341080e03,
-                            9.68462074e02,
-                            9.68422393e02,
-                            9.70421004e02,
-                            1.89394888e03,
-                            1.91531274e03,
-                            1.38230279e00,
-                            1.27589212e01,
-                        ]
-                    ),
-                },
-                "forces": NOT_DEFINED,
-            },
-            "total": {
-                "energy": {
-                    "mean": array(
-                        [-0.8121756, 4.1286583, 4.3558254, 3.9754286, -0.66178626, -1.0762144, 4.4547276, 3.4555712],
-                        dtype=float32,
-                    ),
-                    "std": array(
-                        [11.138448, 17.800674, 18.148039, 17.567768, 11.290686, 11.005155, 18.162294, 16.579735],
-                        dtype=float32,
-                    ),
-                },
-                "forces": NOT_DEFINED,
-            },
-        }
 
     def _read_raw_(self, part):
         df = pd.read_csv(p_join(self.root, f"{part}.csv"))
diff --git a/src/openqdc/datasets/gdml.py b/src/openqdc/datasets/gdml.py
index 73bae17..e40b3fa 100644
--- a/src/openqdc/datasets/gdml.py
+++ b/src/openqdc/datasets/gdml.py
@@ -1,7 +1,5 @@
 from os.path import join as p_join
 
-from numpy import array, float32
-
 from openqdc.datasets.base import BaseDataset, read_qc_archive_h5
 
 
@@ -62,99 +60,9 @@ class GDML(BaseDataset):
     __energy_unit__ = "kcal/mol"
     __distance_unit__ = "bohr"
     __forces_unit__ = "kcal/mol/bohr"
-    __average_nb_atoms__ = 13.00299550
 
     def read_raw_entries(self):
         raw_path = p_join(self.root, "gdml.h5")
         samples = read_qc_archive_h5(raw_path, "gdml", self.energy_target_names, self.force_target_names)
 
         return samples
-
-    @property
-    def _stats(self):
-        return {
-            "formation": {
-                "energy": {
-                    "mean": self.convert_energy(array([-2466.00011563, -1213.94691714, -1916.02068252])),
-                    "std": self.convert_energy(array([6.65779492, 310.70204248, 729.2143015])),
-                },
-                "forces": {
-                    "mean": self.convert_forces(array(-1.42346325e-05)),
-                    "std": self.convert_forces(array(27.009315)),
-                    "components": {
-                        "mean": self.convert_forces(
-                            array(
-                                [
-                                    [-8.3862792e-09, -1.9758134e-07, -7.7199416e-05],
-                                    [-2.7550591e-09, -1.9665436e-08, 5.3315878e-05],
-                                    [-7.5688439e-10, 5.6149121e-09, -1.8894127e-05],
-                                ],
-                                dtype=float32,
-                            )
-                        ),
-                        "std": self.convert_forces(
-                            array(
-                                [
-                                    [31.060509, 29.168474, 27.547812],
-                                    [31.365385, 26.67319, 26.068623],
-                                    [31.024155, 27.272366, 22.33925],
-                                ],
-                                dtype=float32,
-                            )
-                        ),
-                        "rms": self.convert_forces(
-                            array(
-                                [
-                                    [31.060509, 29.168474, 27.547802],
-                                    [31.365385, 26.67319, 26.068628],
-                                    [31.024155, 27.272366, 22.33925],
-                                ],
-                                dtype=float32,
-                            )
-                        ),
-                    },
-                },
-            },
-            "total": {
-                "energy": {
-                    "mean": self.convert_energy(array([-405688.28, -141134.3, -194075.56], dtype=float32)),
-                    "std": self.convert_energy(array([7.2360396e00, 3.0755928e04, 8.4138445e04], dtype=float32)),
-                },
-                "forces": {
-                    "mean": self.convert_forces(array(-1.42346325e-05)),
-                    "std": self.convert_forces(array(27.009315)),
-                    "components": {
-                        "mean": self.convert_forces(
-                            array(
-                                [
-                                    [-8.3862792e-09, -1.9758134e-07, -7.7199416e-05],
-                                    [-2.7550591e-09, -1.9665436e-08, 5.3315878e-05],
-                                    [-7.5688439e-10, 5.6149121e-09, -1.8894127e-05],
-                                ],
-                                dtype=float32,
-                            )
-                        ),
-                        "std": self.convert_forces(
-                            array(
-                                [
-                                    [31.060509, 29.168474, 27.547812],
-                                    [31.365385, 26.67319, 26.068623],
-                                    [31.024155, 27.272366, 22.33925],
-                                ],
-                                dtype=float32,
-                            )
-                        ),
-                        "rms": self.convert_forces(
-                            array(
-                                [
-                                    [31.060509, 29.168474, 27.547802],
-                                    [31.365385, 26.67319, 26.068628],
-                                    [31.024155, 27.272366, 22.33925],
-                                ],
-                                dtype=float32,
-                            )
-                        ),
-                    },
-                },
-            },
-        }
diff --git a/src/openqdc/datasets/geom.py b/src/openqdc/datasets/geom.py
index 897b828..c016a9f 100644
--- a/src/openqdc/datasets/geom.py
+++ b/src/openqdc/datasets/geom.py
@@ -3,11 +3,9 @@
 
 import datamol as dm
 import numpy as np
-from numpy import array
 
 from openqdc.datasets.base import BaseDataset
 from openqdc.utils import load_json, load_pkl
-from openqdc.utils.constants import NOT_DEFINED
 from openqdc.utils.molecule import get_atomic_number_and_charge
 
 
@@ -86,29 +84,9 @@ class GEOM(BaseDataset):
 
     energy_target_names = ["gfn2_xtb.energy"]
     force_target_names = []
-    __average_nb_atoms__ = 50.07244573677837
 
     partitions = ["qm9", "drugs"]
 
-    @property
-    def _stats(self):
-        return {
-            "formation": {
-                "energy": {
-                    "mean": self.convert_energy(array([-8.47811605])),
-                    "std": self.convert_energy(array([3.68015507])),
-                },
-                "forces": NOT_DEFINED,
-            },
-            "total": {
-                "energy": {
-                    "mean": self.convert_energy(array([-78.264725])),
-                    "std": self.convert_energy(array([20.200787])),
-                },
-                "forces": NOT_DEFINED,
-            },
-        }
-
     def _read_raw_(self, partition):
         raw_path = p_join(self.root, "rdkit_folder")
 
diff --git a/src/openqdc/datasets/iso_17.py b/src/openqdc/datasets/iso_17.py
index 1d0b39b..4553ec1 100644
--- a/src/openqdc/datasets/iso_17.py
+++ b/src/openqdc/datasets/iso_17.py
@@ -1,7 +1,5 @@
 from os.path import join as p_join
 
-from numpy import array, float32
-
 from openqdc.datasets.base import BaseDataset, read_qc_archive_h5
 
 
@@ -44,46 +42,6 @@ class ISO17(BaseDataset):
     __energy_unit__ = "ev"
     __distance_unit__ = "bohr"  # bohr
     __forces_unit__ = "ev/bohr"
-    __average_nb_atoms__ = 19.0
-
-    @property
-    def _stats(self):
-        return {
-            "formation": {
-                "energy": {
-                    "mean": self.convert_energy(array([-103.58336533])),
-                    "std": self.convert_energy(array([0.79709836])),
-                },
-                "forces": {
-                    "mean": self.convert_forces(array([-1.2548699e-11])),
-                    "std": self.convert_forces(array([1.1287293])),
-                    "components": {
-                        "mean": self.convert_forces(
-                            array([[-2.7712117e-11], [-1.8989450e-12], [3.9721233e-11]], dtype=float32)
-                        ),
-                        "std": self.convert_forces(array([[1.1013116], [1.1273879], [1.1195794]], dtype=float32)),
-                        "rms": self.convert_forces(array([[1.1013116], [1.1273879], [1.1195794]], dtype=float32)),
-                    },
-                },
-            },
-            "total": {
-                "energy": {
-                    "mean": self.convert_energy(array([-11503.619]), dtype=float32),
-                    "std": self.convert_energy(array([0.79709935]), dtype=float32),
-                },
-                "forces": {
-                    "mean": self.convert_forces(array([-1.2548699e-11])),
-                    "std": self.convert_forces(array([1.1287293])),
-                    "components": {
-                        "mean": self.convert_forces(
-                            array([[-2.7712117e-11], [-1.8989450e-12], [3.9721233e-11]], dtype=float32)
-                        ),
-                        "std": self.convert_forces(array([[1.1013116], [1.1273879], [1.1195794]], dtype=float32)),
-                        "rms": self.convert_forces(array([[1.1013116], [1.1273879], [1.1195794]], dtype=float32)),
-                    },
-                },
-            },
-        }
 
     def __smiles_converter__(self, x):
         """util function to convert string to smiles: useful if the smiles is
diff --git a/src/openqdc/datasets/molecule3d.py b/src/openqdc/datasets/molecule3d.py
index 3887499..dc47e53 100644
--- a/src/openqdc/datasets/molecule3d.py
+++ b/src/openqdc/datasets/molecule3d.py
@@ -5,12 +5,10 @@
 import datamol as dm
 import numpy as np
 import pandas as pd
-from numpy import array, float32
 from rdkit import Chem
 from tqdm import tqdm
 
 from openqdc.datasets.base import BaseDataset
-from openqdc.utils.constants import NOT_DEFINED
 from openqdc.utils.molecule import get_atomic_number_and_charge
 
 
@@ -91,7 +89,6 @@ class Molecule3D(BaseDataset):
     __forces_unit__ = "ev/ang"
 
     energy_target_names = ["b3lyp/6-31g*.energy"]
-    __average_nb_atoms__ = 29.111696292432697
 
     def read_raw_entries(self):
         raw = p_join(self.root, "data", "raw")
@@ -102,22 +99,3 @@ def read_raw_entries(self):
         res = dm.parallelized(fn, sdf_paths, n_jobs=1)  # don't use more than 1 job
         samples = sum(res, [])
         return samples
-
-    @property
-    def _stats(self):
-        return {
-            "formation": {
-                "energy": {
-                    "mean": self.convert_energy(array([-191.66717791])),
-                    "std": self.convert_energy(array([2005.52732443])),
-                },
-                "forces": NOT_DEFINED,
-            },
-            "total": {
-                "energy": {
-                    "mean": self.convert_energy(array([-21100.502], dtype=float32)),
-                    "std": self.convert_energy(array([9345.366], dtype=float32)),
-                },
-                "forces": NOT_DEFINED,
-            },
-        }
diff --git a/src/openqdc/datasets/orbnet_denali.py b/src/openqdc/datasets/orbnet_denali.py
index e158410..614e252 100644
--- a/src/openqdc/datasets/orbnet_denali.py
+++ b/src/openqdc/datasets/orbnet_denali.py
@@ -4,10 +4,8 @@
 import datamol as dm
 import numpy as np
 import pandas as pd
-from numpy import array, float32
 
 from openqdc.datasets.base import BaseDataset
-from openqdc.utils.constants import NOT_DEFINED
 from openqdc.utils.molecule import atom_table
 
 
@@ -59,7 +57,6 @@ class OrbnetDenali(BaseDataset):
     __energy_unit__ = "hartree"
     __distance_unit__ = "ang"
     __forces_unit__ = "hartree/ang"
-    __average_nb_atoms__ = 44.87857092772292
 
     def read_raw_entries(self):
         label_path = p_join(self.root, "denali_labels.csv")
@@ -80,22 +77,3 @@ def read_raw_entries(self):
         res = dm.parallelized(fn, list(labels.items()), scheduler="threads", n_jobs=-1, progress=True)
         samples = sum(res, [])
         return samples
-
-    @property
-    def _stats(self):
-        return {
-            "formation": {
-                "energy": {
-                    "mean": self.convert_energy(array([-13.87283487, -7.79357297])),
-                    "std": self.convert_energy(array([75.34652971, 3.3274954])),
-                },
-                "forces": NOT_DEFINED,
-            },
-            "total": {
-                "energy": {
-                    "mean": self.convert_energy(array([-1630.8182, -74.17469], dtype=float32)),
-                    "std": self.convert_energy(array([929.9734, 19.751446], dtype=float32)),
-                },
-                "forces": NOT_DEFINED,
-            },
-        }
diff --git a/src/openqdc/datasets/qm7x.py b/src/openqdc/datasets/qm7x.py
index 50603eb..eb8b015 100644
--- a/src/openqdc/datasets/qm7x.py
+++ b/src/openqdc/datasets/qm7x.py
@@ -1,7 +1,6 @@
 from os.path import join as p_join
 
 import numpy as np
-from numpy import array, float32
 from tqdm import tqdm
 
 from openqdc.datasets.base import BaseDataset
@@ -47,80 +46,6 @@ class QM7X(BaseDataset):
     __energy_unit__ = "ev"
     __distance_unit__ = "ang"
     __forces_unit__ = "ev/ang"
-    __average_nb_atoms__ = 16.84668721109399
-
-    @property
-    def _stats(self):
-        return {
-            "formation": {
-                "energy": {
-                    "mean": self.convert_energy(array([-82.57984067, 372.52167714])),
-                    "std": self.convert_energy(array([9.85675539, 39.76633713])),
-                },
-                "forces": {
-                    "mean": self.convert_forces(array([-1.1617619e-07])),
-                    "std": self.convert_forces(array([1.1451852])),
-                    "components": {
-                        "mean": self.convert_forces(
-                            array(
-                                [
-                                    [-7.1192130e-07, -6.0926320e-11],
-                                    [-4.3502279e-08, -3.7376963e-11],
-                                    [5.8300976e-08, 2.9215352e-11],
-                                ],
-                                dtype=float32,
-                            )
-                        ),
-                        "std": self.convert_forces(
-                            array(
-                                [[1.4721272, 0.00549965], [1.4861498, 0.00508684], [1.4812028, 0.00496012]],
-                                dtype=float32,
-                            )
-                        ),
-                        "rms": self.convert_forces(
-                            array(
-                                [[1.4721272, 0.00549965], [1.4861498, 0.00508684], [1.4812028, 0.00496012]],
-                                dtype=float32,
-                            )
-                        ),
-                    },
-                },
-            },
-            "total": {
-                "energy": {
-                    "mean": self.convert_energy(array([-8.6828701e03, -2.7446982e-01], dtype=float32)),
-                    "std": self.convert_energy(array([1.4362784e03, 5.8798514e-02], dtype=float32)),
-                },
-                "forces": {
-                    "mean": self.convert_forces(array([-1.1617619e-07])),
-                    "std": self.convert_forces(array([1.1451852])),
-                    "components": {
-                        "mean": self.convert_forces(
-                            array(
-                                [
-                                    [-7.1192130e-07, -6.0926320e-11],
-                                    [-4.3502279e-08, -3.7376963e-11],
-                                    [5.8300976e-08, 2.9215352e-11],
-                                ],
-                                dtype=float32,
-                            )
-                        ),
-                        "std": self.convert_forces(
-                            array(
-                                [[1.4721272, 0.00549965], [1.4861498, 0.00508684], [1.4812028, 0.00496012]],
-                                dtype=float32,
-                            )
-                        ),
-                        "rms": self.convert_forces(
-                            array(
-                                [[1.4721272, 0.00549965], [1.4861498, 0.00508684], [1.4812028, 0.00496012]],
-                                dtype=float32,
-                            )
-                        ),
-                    },
-                },
-            },
-        }
 
     def read_raw_entries(self):
         samples = []
diff --git a/src/openqdc/datasets/qmugs.py b/src/openqdc/datasets/qmugs.py
index a793151..c75f8b5 100644
--- a/src/openqdc/datasets/qmugs.py
+++ b/src/openqdc/datasets/qmugs.py
@@ -4,10 +4,8 @@
 
 import datamol as dm
 import numpy as np
-from numpy import array
 
 from openqdc.datasets.base import BaseDataset
-from openqdc.utils.constants import NOT_DEFINED
 from openqdc.utils.molecule import get_atomic_number_and_charge
 
 
@@ -58,32 +56,12 @@ class QMugs(BaseDataset):
     __energy_unit__ = "hartree"
     __distance_unit__ = "ang"
     __forces_unit__ = "hartree/ang"
-    __average_nb_atoms__ = 55.215926293326426
 
     energy_target_names = [
         "GFN2:TOTAL_ENERGY",
         "DFT:TOTAL_ENERGY",
     ]
 
-    @property
-    def _stats(self):
-        return {
-            "formation": {
-                "energy": {
-                    "mean": self.convert_energy(array([-12.94348027, -9.83037297])),
-                    "std": self.convert_energy(array([4.39971409, 3.3574188])),
-                },
-                "forces": NOT_DEFINED,
-            },
-            "total": {
-                "energy": {
-                    "mean": self.convert_energy(array([-89.44242, -1740.5336])),
-                    "std": self.convert_energy(array([29.599571, 791.48663])),
-                },
-                "forces": NOT_DEFINED,
-            },
-        }
-
     def read_raw_entries(self):
         raw_path = p_join(self.root, "structures")
         mol_dirs = [p_join(raw_path, d) for d in os.listdir(raw_path)]
diff --git a/src/openqdc/datasets/sn2_rxn.py b/src/openqdc/datasets/sn2_rxn.py
index e618699..abcbd62 100644
--- a/src/openqdc/datasets/sn2_rxn.py
+++ b/src/openqdc/datasets/sn2_rxn.py
@@ -24,9 +24,6 @@ class SN2RXN(BaseDataset):
     force_target_names = [
         "DSD-BLYP-D3(BJ):def2-TZVP Gradient",
     ]
-    __average_nb_atoms__ = 5.444471933343348
-
-    # TODO: Check this
 
     def __smiles_converter__(self, x):
         """util function to convert string to smiles: useful if the smiles is
diff --git a/src/openqdc/datasets/solvated_peptides.py b/src/openqdc/datasets/solvated_peptides.py
index 2ac1fde..216ecdd 100644
--- a/src/openqdc/datasets/solvated_peptides.py
+++ b/src/openqdc/datasets/solvated_peptides.py
@@ -1,7 +1,5 @@
 from os.path import join as p_join
 
-from numpy import array, float32
-
 from openqdc.datasets.base import BaseDataset, read_qc_archive_h5
 
 
@@ -28,7 +26,6 @@ class SolvatedPeptides(BaseDataset):
     __energy_unit__ = "hartree"
     __distance_unit__ = "bohr"
     __forces_unit__ = "hartree/bohr"
-    __average_nb_atoms__ = 21.380975029465297
 
     def __smiles_converter__(self, x):
         """util function to convert string to smiles: useful if the smiles is
@@ -41,44 +38,3 @@ def read_raw_entries(self):
         samples = read_qc_archive_h5(raw_path, "solvated_peptides", self.energy_target_names, self.force_target_names)
 
         return samples
-
-    # TODO : Check the values in this
-
-    @property
-    def _stats(self):
-        return {
-            "formation": {
-                "energy": {
-                    "mean": self.convert_energy(array([423.49523618])),
-                    "std": self.convert_energy(array([309.76172829])),
-                },
-                "forces": {
-                    "mean": self.convert_forces(array([-3.792959e-12])),
-                    "std": self.convert_forces(array([1.4568169])),
-                    "components": {
-                        "mean": self.convert_forces(
-                            array([[-4.1655182e-12], [-6.9530774e-12], [2.5650127e-12]], dtype=float32)
-                        ),
-                        "std": self.convert_forces(array([[1.3502095], [1.3478843], [1.3509929]], dtype=float32)),
-                        "rms": self.convert_forces(array([[1.3502095], [1.3478843], [1.3509929]], dtype=float32)),
-                    },
-                },
-            },
-            "total": {
-                "energy": {
-                    "mean": self.convert_energy(array([-79.619286], dtype=float32)),
-                    "std": self.convert_energy(array([40.01196], dtype=float32)),
-                },
-                "forces": {
-                    "mean": self.convert_forces(array([-3.792959e-12])),
-                    "std": self.convert_forces(array([1.4568169])),
-                    "components": {
-                        "mean": self.convert_forces(
-                            array([[-4.1655182e-12], [-6.9530774e-12], [2.5650127e-12]], dtype=float32)
-                        ),
-                        "std": self.convert_forces(array([[1.3502095], [1.3478843], [1.3509929]], dtype=float32)),
-                        "rms": self.convert_forces(array([[1.3502095], [1.3478843], [1.3509929]], dtype=float32)),
-                    },
-                },
-            },
-        }
diff --git a/src/openqdc/datasets/spice.py b/src/openqdc/datasets/spice.py
index bb2e1a6..974d45f 100644
--- a/src/openqdc/datasets/spice.py
+++ b/src/openqdc/datasets/spice.py
@@ -2,7 +2,6 @@
 
 import datamol as dm
 import numpy as np
-from numpy import array
 from tqdm import tqdm
 
 from openqdc.datasets.base import BaseDataset
@@ -56,7 +55,6 @@ class Spice(BaseDataset):
     __energy_unit__ = "hartree"
     __distance_unit__ = "bohr"
     __forces_unit__ = "hartree/bohr"
-    __average_nb_atoms__ = 29.88387509402179
 
     energy_target_names = ["dft_total_energy"]
 
@@ -77,41 +75,6 @@ class Spice(BaseDataset):
         "SPICE Ion Pairs Single Points Dataset v1.1": "Ion Pairs",
     }
 
-    @property
-    def _stats(self):
-        return {
-            "formation": {
-                "energy": {
-                    "mean": self.convert_energy(array([-5.67757058])),
-                    "std": self.convert_energy(array([2.33714861])),
-                },
-                "forces": {
-                    "mean": self.convert_forces(array([-1.0387013e-08])),
-                    "std": self.convert_forces(array([0.021063408])),
-                    "components": {
-                        "mean": self.convert_forces(array([[5.7479990e-09], [-4.8940532e-08], [1.2032132e-08]])),
-                        "std": self.convert_forces(array([[0.02017307], [0.02016141], [0.02014796]])),
-                        "rms": self.convert_forces(array([[0.02017307], [0.02016142], [0.02014796]])),
-                    },
-                },
-            },
-            "total": {
-                "energy": {
-                    "mean": self.convert_energy(array([-1244.6562])),
-                    "std": self.convert_energy(array([1219.4248])),
-                },
-                "forces": {
-                    "mean": self.convert_forces(array([-1.0387013e-08])),
-                    "std": self.convert_forces(array([0.021063408])),
-                    "components": {
-                        "mean": self.convert_forces(array([[5.7479990e-09], [-4.8940532e-08], [1.2032132e-08]])),
-                        "std": self.convert_forces(array([[0.02017307], [0.02016141], [0.02014796]])),
-                        "rms": self.convert_forces(array([[0.02017307], [0.02016142], [0.02014796]])),
-                    },
-                },
-            },
-        }
-
     def convert_forces(self, x):
         return (-1.0) * super().convert_forces(x)
 
diff --git a/src/openqdc/datasets/tmqm.py b/src/openqdc/datasets/tmqm.py
index 1b7ffc1..8952aaa 100644
--- a/src/openqdc/datasets/tmqm.py
+++ b/src/openqdc/datasets/tmqm.py
@@ -3,11 +3,9 @@
 
 import numpy as np
 import pandas as pd
-from numpy import array, float32
 from tqdm import tqdm
 
 from openqdc.datasets.base import BaseDataset
-from openqdc.utils.constants import NOT_DEFINED
 from openqdc.utils.molecule import atom_table
 
 
@@ -56,26 +54,6 @@ class TMQM(BaseDataset):
     __energy_unit__ = "hartree"
     __distance_unit__ = "ang"
     __forces_unit__ = "hartree/ang"
-    __average_nb_atoms__ = 65.9905035539555
-
-    @property
-    def _stats(self):
-        return {
-            "formation": {
-                "energy": {
-                    "mean": self.convert_energy(array([-669.31056277])),
-                    "std": self.convert_energy(array([896.83091033])),
-                },
-                "forces": NOT_DEFINED,
-            },
-            "total": {
-                "energy": {
-                    "mean": self.convert_energy(array([-2952.1428], dtype=float32)),
-                    "std": self.convert_energy(array([1608.7888], dtype=float32)),
-                },
-                "forces": NOT_DEFINED,
-            },
-        }
 
     def read_raw_entries(self):
         df = pd.read_csv(p_join(self.root, "tmQM_y.csv"), sep=";", usecols=["CSD_code", "Electronic_E"])

From c90192a2e8430e7454ddf6cbdcc67163b3d73b1b Mon Sep 17 00:00:00 2001
From: prtos <prtos@users.noreply.github.com>
Date: Tue, 28 Nov 2023 16:10:29 -0500
Subject: [PATCH 14/18] Update base.py

---
 src/openqdc/datasets/base.py | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/src/openqdc/datasets/base.py b/src/openqdc/datasets/base.py
index 6c939e7..16dc892 100644
--- a/src/openqdc/datasets/base.py
+++ b/src/openqdc/datasets/base.py
@@ -132,13 +132,16 @@ def _precompute_E(self):
         s = np.array(self.data["atomic_inputs"][:, :2], dtype=int)
         s[:, 1] += IsolatedAtomEnergyFactory.max_charge
         matrixs = [matrix[s[:, 0], s[:, 1]] for matrix in self.__isolated_atom_energies__]
-        matrixs = [np.split(matrix, splits_idx)[:-1] for matrix in matrixs]
+        # matrixs = [np.split(matrix, splits_idx)[:-1] for matrix in matrixs]
         converted_energy_data = self.convert_energy(self.data["energies"])
         # calculation per molecule formation energy statistics
-        e = []
-        for i in range(len(self.__energy_methods__)):
-            e.append(converted_energy_data[:, i] - np.array(list(map(lambda x: x.sum(), matrixs[i]))))
-        E = np.array(e).T
+        n = len(self.__energy_methods__)
+        E = []
+        for i, matrix in enumerate(matrixs):
+            c = np.cumsum(np.append([0], matrix))[splits_idx]
+            c[1:] = c[1:] - c[:-1]
+            E.append(converted_energy_data[:, i] -c)
+        E = np.array(E).T
         formation_E_mean = np.nanmean(E, axis=0)
         formation_E_std = np.nanstd(E, axis=0)
         total_E_mean = np.nanmean(converted_energy_data, axis=0)

From 67bf201482947b62d4b546136da22f022f2f27c8 Mon Sep 17 00:00:00 2001
From: FNTwin <cris.gabellini@gmail.com>
Date: Tue, 28 Nov 2023 22:47:13 +0000
Subject: [PATCH 15/18] raise correct Error + cleaning

---
 src/openqdc/datasets/base.py    | 8 ++------
 src/openqdc/utils/exceptions.py | 9 +++++++--
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/src/openqdc/datasets/base.py b/src/openqdc/datasets/base.py
index 16dc892..a40d952 100644
--- a/src/openqdc/datasets/base.py
+++ b/src/openqdc/datasets/base.py
@@ -21,7 +21,6 @@
     POSSIBLE_NORMALIZATION,
 )
 from openqdc.utils.exceptions import (
-    PROPERTY_NOT_AVAILABLE_ERROR,
     DatasetNotAvailableError,
     NormalizationNotAvailableError,
     StatisticsNotAvailableError,
@@ -132,15 +131,13 @@ def _precompute_E(self):
         s = np.array(self.data["atomic_inputs"][:, :2], dtype=int)
         s[:, 1] += IsolatedAtomEnergyFactory.max_charge
         matrixs = [matrix[s[:, 0], s[:, 1]] for matrix in self.__isolated_atom_energies__]
-        # matrixs = [np.split(matrix, splits_idx)[:-1] for matrix in matrixs]
         converted_energy_data = self.convert_energy(self.data["energies"])
         # calculation per molecule formation energy statistics
-        n = len(self.__energy_methods__)
         E = []
         for i, matrix in enumerate(matrixs):
             c = np.cumsum(np.append([0], matrix))[splits_idx]
             c[1:] = c[1:] - c[:-1]
-            E.append(converted_energy_data[:, i] -c)
+            E.append(converted_energy_data[:, i] - c)
         E = np.array(E).T
         formation_E_mean = np.nanmean(E, axis=0)
         formation_E_std = np.nanstd(E, axis=0)
@@ -501,8 +498,7 @@ def average_n_atoms(self):
         Average number of atoms in a molecule in the dataset.
         """
         if self.__average_nb_atoms__ is None:
-            logger.info(PROPERTY_NOT_AVAILABLE_ERROR)
-            return 1
+            raise StatisticsNotAvailableError(self.__name__)
         return self.__average_nb_atoms__
 
     def get_statistics(self, normalization: str = "formation", return_none: bool = True):
diff --git a/src/openqdc/utils/exceptions.py b/src/openqdc/utils/exceptions.py
index 5c3ddbc..246d01c 100644
--- a/src/openqdc/utils/exceptions.py
+++ b/src/openqdc/utils/exceptions.py
@@ -31,6 +31,11 @@ def __init__(self, dataset_name):
 class StatisticsNotAvailableError(DatasetNotAvailableError):
     """Raised when statistics are not available"""
 
+    msg = (
+        "Statistics for dataset {dataset_name} are not available. "
+        + "Please open an issue on Github for the team to look into it."
+    )
+
 
 class NormalizationNotAvailableError(OpenQDCException):
     """Raised when normalization is not available"""
@@ -44,7 +49,7 @@ class ConversionNotDefinedError(OpenQDCException, ValueError):
     """Raised when a conversion is not defined"""
 
     _error_message = """
-    Conversion from {in_unit} to {out_unit} is not defined in the conversion registry. 
+    Conversion from {in_unit} to {out_unit} is not defined in the conversion registry.
     To add a new conversion, use the following syntax or open an issue on Github for the team to look into it:
 
     Conversion("{in_unit}", "{out_unit}", lambda x: x * conversion_factor)
@@ -58,6 +63,6 @@ class ConversionAlreadyDefined(ConversionNotDefinedError):
     """Raised when a conversion is not defined"""
 
     _error_message = """
-    Conversion from {in_unit} to {out_unit} is alread defined in the conversion registry. 
+    Conversion from {in_unit} to {out_unit} is already defined in the conversion registry.
     To reuse the same metric, use get_conversion({in_unit}, {out_unit}).
     """

From 52f69ce0f5f1b2d525fe6ab78387c65e6f427c5b Mon Sep 17 00:00:00 2001
From: FNTwin <cris.gabellini@gmail.com>
Date: Tue, 28 Nov 2023 23:01:08 +0000
Subject: [PATCH 16/18] Local caching statistics

---
 src/openqdc/datasets/base.py | 28 ++++++++++++++++++++--------
 1 file changed, 20 insertions(+), 8 deletions(-)

diff --git a/src/openqdc/datasets/base.py b/src/openqdc/datasets/base.py
index a40d952..eea280d 100644
--- a/src/openqdc/datasets/base.py
+++ b/src/openqdc/datasets/base.py
@@ -30,6 +30,7 @@
     dict_to_atoms,
     get_local_cache,
     load_hdf5_file,
+    load_pkl,
     pull_locally,
     push_remote,
     set_cache_dir,
@@ -113,15 +114,23 @@ def __init__(
         self._set_isolated_atom_energies()
         self._precompute_statistics()
 
-    def _precompute_statistics(self):
-        logger.info("Precomputing relevant statistics")
+    def _precompute_statistics(self, overwrite_local_cache: bool = False):
+        local_path = p_join(self.preprocess_path, "stats.pkl")
+        if self.is_preprocessed_statistics() and not overwrite_local_cache:
+            stats = load_pkl(local_path)
+            logger.info("Loaded precomputed statistics")
+        else:
+            logger.info("Precomputing relevant statistics")
+            (formation_E_mean, formation_E_std, total_E_mean, total_E_std) = self._precompute_E()
+            forces_dict = self._precompute_F()
+            stats = {
+                "formation": {"energy": {"mean": formation_E_mean, "std": formation_E_std}, "forces": forces_dict},
+                "total": {"energy": {"mean": total_E_mean, "std": total_E_std}, "forces": forces_dict},
+            }
+            with open(local_path, "wb") as f:
+                pkl.dump(stats, f)
         self._compute_average_nb_atoms()
-        (formation_E_mean, formation_E_std, total_E_mean, total_E_std) = self._precompute_E()
-        forces_dict = self._precompute_F()
-        self.__stats__ = {
-            "formation": {"energy": {"mean": formation_E_mean, "std": formation_E_std}, "forces": forces_dict},
-            "total": {"energy": {"mean": total_E_mean, "std": total_E_std}, "forces": forces_dict},
-        }
+        self.__stats__ = stats
 
     def _compute_average_nb_atoms(self):
         self.__average_nb_atoms__ = np.mean(self.data["n_atoms"])
@@ -326,6 +335,9 @@ def is_preprocessed(self):
         predicats += [copy_exists(p_join(self.preprocess_path, "props.pkl"))]
         return all(predicats)
 
+    def is_preprocessed_statistics(self):
+        return bool(copy_exists(p_join(self.preprocess_path, "stats.pkl")))
+
     def preprocess(self, overwrite=False):
         if overwrite or not self.is_preprocessed():
             entries = self.read_raw_entries()

From f38bda12ef98372da7895e3f34576f8f6d2d27df Mon Sep 17 00:00:00 2001
From: FNTwin <cris.gabellini@gmail.com>
Date: Wed, 29 Nov 2023 13:41:50 +0000
Subject: [PATCH 17/18] Fix incorrect unit conversion: stats calculated in
 original units, conversion on the fly

---
 src/openqdc/datasets/base.py | 72 +++++++++++++++++++++++++++++++-----
 1 file changed, 62 insertions(+), 10 deletions(-)

diff --git a/src/openqdc/datasets/base.py b/src/openqdc/datasets/base.py
index eea280d..5547df3 100644
--- a/src/openqdc/datasets/base.py
+++ b/src/openqdc/datasets/base.py
@@ -106,13 +106,34 @@ def __init__(
     ) -> None:
         set_cache_dir(cache_dir)
         self.data = None
-        self._set_units(energy_unit, distance_unit)
         if not self.is_preprocessed():
             raise DatasetNotAvailableError(self.__name__)
         else:
             self.read_preprocess(overwrite_local_cache=overwrite_local_cache)
+        self._post_init(overwrite_local_cache, energy_unit, distance_unit)
+
+    def _post_init(
+        self,
+        overwrite_local_cache: bool = False,
+        energy_unit: Optional[str] = None,
+        distance_unit: Optional[str] = None,
+    ) -> None:
+        self._set_units(None, None)
+        self._set_isolated_atom_energies()
+        self._precompute_statistics(overwrite_local_cache=overwrite_local_cache)
+        self._set_units(energy_unit, distance_unit)
+        self._convert_data()
         self._set_isolated_atom_energies()
-        self._precompute_statistics()
+
+    def _convert_data(self):
+        logger.info(
+            f"Converting {self.__name__} data to the following units:\n\
+                     Energy: {self.energy_unit},\n\
+                     Distance: {self.distance_unit},\n\
+                     Forces: {self.force_unit if self.__force_methods__ else 'None'}"
+        )
+        for key in self.data_keys:
+            self.data[key] = self._convert_on_loading(self.data[key], key)
 
     def _precompute_statistics(self, overwrite_local_cache: bool = False):
         local_path = p_join(self.preprocess_path, "stats.pkl")
@@ -153,7 +174,12 @@ def _precompute_E(self):
         total_E_mean = np.nanmean(converted_energy_data, axis=0)
         total_E_std = np.nanstd(converted_energy_data, axis=0)
 
-        return formation_E_mean, formation_E_std, total_E_mean, total_E_std
+        return (
+            np.atleast_2d(formation_E_mean),
+            np.atleast_2d(formation_E_std),
+            np.atleast_2d(total_E_mean),
+            np.atleast_2d(total_E_std),
+        )
 
     def _precompute_F(self):
         if len(self.__force_methods__) == 0:
@@ -163,16 +189,16 @@ def _precompute_F(self):
         force_std = np.nanstd(converted_force_data, axis=0)
         force_rms = np.sqrt(np.nanmean(converted_force_data**2, axis=0))
         return {
-            "mean": force_mean,
-            "std": force_std,
-            "components": {"rms": force_rms, "std": force_std.mean(axis=0), "mean": force_mean.mean(axis=0)},
+            "mean": np.atleast_2d(force_mean.mean(axis=0)),
+            "std": np.atleast_2d(force_std.mean(axis=0)),
+            "components": {"rms": force_rms, "std": force_std, "mean": force_mean},
         }
 
     @property
     def numbers(self):
         if hasattr(self, "_numbers"):
             return self._numbers
-        self._numbers = np.unique(self.data["atomic_inputs"][..., 0]).astype(np.int32)
+        self._numbers = pd.unique(self.data["atomic_inputs"][..., 0]).astype(np.int32)
         return self._numbers
 
     @property
@@ -302,6 +328,18 @@ def save_preprocess(self, data_dict):
             pkl.dump(data_dict, f)
         push_remote(local_path, overwrite=True)
 
+    def _convert_on_loading(self, x, key):
+        if key == "energies":
+            return self.convert_energy(x)
+        elif key == "forces":
+            return self.convert_forces(x)
+        elif key == "atomic_inputs":
+            x = np.array(x, dtype=np.float32)
+            x[:, -3:] = self.convert_distance(x[:, -3:])
+            return x
+        else:
+            return x
+
     def read_preprocess(self, overwrite_local_cache=False):
         logger.info("Reading preprocessed data")
         logger.info(
@@ -473,14 +511,14 @@ def __getitem__(self, idx: int):
         z, c, positions, energies = (
             np.array(input[:, 0], dtype=np.int32),
             np.array(input[:, 1], dtype=np.int32),
-            self.convert_distance(np.array(input[:, -3:], dtype=np.float32)),
-            self.convert_energy(np.array(self.data["energies"][idx], dtype=np.float32)),
+            np.array(input[:, -3:], dtype=np.float32),
+            np.array(self.data["energies"][idx], dtype=np.float32),
         )
         name = self.__smiles_converter__(self.data["name"][idx])
         subset = self.data["subset"][idx]
 
         if "forces" in self.data:
-            forces = self.convert_forces(np.array(self.data["forces"][p_start:p_end], dtype=np.float32))
+            forces = np.array(self.data["forces"][p_start:p_end], dtype=np.float32)
         else:
             forces = None
         return Bunch(
@@ -542,4 +580,18 @@ def get_statistics(self, normalization: str = "formation", return_none: bool = T
                     }
                 }
             )
+        # cycle through dict to convert units
+        for key in selected_stats:
+            if key == "forces":
+                for key2 in selected_stats[key]:
+                    if key2 != "components":
+                        selected_stats[key][key2] = self.convert_forces(selected_stats[key][key2])
+                    else:
+                        for key2 in selected_stats[key]["components"]:
+                            selected_stats[key]["components"][key2] = self.convert_forces(
+                                selected_stats[key]["components"][key2]
+                            )
+            else:
+                for key2 in selected_stats[key]:
+                    selected_stats[key][key2] = self.convert_energy(selected_stats[key][key2])
         return selected_stats

From 0197dd002feea4a0657b47c886393de0d26558ab Mon Sep 17 00:00:00 2001
From: FNTwin <cris.gabellini@gmail.com>
Date: Wed, 29 Nov 2023 14:43:37 +0000
Subject: [PATCH 18/18] Deepcopy dict to avoid reference issue

---
 src/openqdc/datasets/base.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/openqdc/datasets/base.py b/src/openqdc/datasets/base.py
index 5547df3..d7c8234 100644
--- a/src/openqdc/datasets/base.py
+++ b/src/openqdc/datasets/base.py
@@ -1,5 +1,6 @@
 import os
 import pickle as pkl
+from copy import deepcopy
 from os.path import join as p_join
 from typing import Dict, List, Optional, Union
 
@@ -560,7 +561,7 @@ def get_statistics(self, normalization: str = "formation", return_none: bool = T
             Whether to return None if the statistics for the forces are not available, by default True
             Otherwise, the statistics for the forces are set to 0.0
         """
-        stats = self._stats
+        stats = deepcopy(self._stats)
         if len(stats) == 0:
             raise StatisticsNotAvailableError(self.__name__)
         if normalization not in POSSIBLE_NORMALIZATION: