From a305032c586e55228e5fac29f55b932bbd8117cb Mon Sep 17 00:00:00 2001
From: Nikhil Shenoy <nikhilshenoy98@gmail.com>
Date: Mon, 25 Sep 2023 14:46:01 +0000
Subject: [PATCH 1/3] Added .pre-commit config

---
 .pre-commit-config.yaml | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)
 create mode 100644 .pre-commit-config.yaml

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 0000000..a6b322d
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,26 @@
+repos:
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v4.4.0
+    hooks:
+      - id: end-of-file-fixer
+      - id: trailing-whitespace
+      - id: check-yaml
+      - id: check-toml
+      - id: check-json
+      - id: check-merge-conflict
+      - id: requirements-txt-fixer
+      - id: detect-private-key
+  - repo: https://github.com/psf/black
+    rev: 23.3.0
+    hooks:
+      - id: black
+  - repo: https://github.com/pycqa/isort
+    rev: 5.12.0
+    hooks:
+      - id: isort
+        args: ["--profile", "black"]
+  - repo: https://github.com/charliermarsh/ruff-pre-commit
+    # Ruff version.
+    rev: 'v0.0.241'
+    hooks:
+      - id: ruff

From cea7d35b52bcd658820ac1c8774cc76d02f617d2 Mon Sep 17 00:00:00 2001
From: Nikhil Shenoy <nikhilshenoy98@gmail.com>
Date: Mon, 25 Sep 2023 14:59:52 +0000
Subject: [PATCH 2/3] Ran pre-commit run --all

---
 .github/SECURITY.md               |  2 +-
 .gitignore                        |  3 +-
 env.yml                           |  2 +-
 openqdc/datasets/ani.py           | 51 ++++++++--------
 openqdc/datasets/base.py          | 99 ++++++++++++++++---------------
 openqdc/datasets/comp6.py         | 27 ++++-----
 openqdc/datasets/gdml.py          | 25 ++++----
 openqdc/datasets/geom.py          | 61 +++++++++----------
 openqdc/datasets/iso_17.py        | 25 ++++----
 openqdc/datasets/molecule3d.py    | 53 +++++++++--------
 openqdc/datasets/nabladft.py      | 53 +++++++++--------
 openqdc/datasets/orbnet_denali.py | 60 +++++++++----------
 openqdc/datasets/qm7x.py          | 25 ++++----
 openqdc/datasets/qmugs.py         | 50 ++++++++--------
 openqdc/datasets/sn2_rxn.py       | 25 ++++----
 openqdc/datasets/spice.py         | 52 ++++++++--------
 openqdc/raws/config_factory.py    | 70 ++++++++--------------
 openqdc/raws/fetch.py             | 32 +++++-----
 openqdc/utils/constants.py        |  2 +-
 openqdc/utils/io.py               | 13 ++--
 openqdc/utils/molecule.py         |  5 +-
 pyproject.toml                    |  2 +
 22 files changed, 356 insertions(+), 381 deletions(-)

diff --git a/.github/SECURITY.md b/.github/SECURITY.md
index c9d0753..128d21b 100644
--- a/.github/SECURITY.md
+++ b/.github/SECURITY.md
@@ -1,3 +1,3 @@
 # Security Policy
 
-Please report any security-related issues directly to prudencio@valencediscovery.com. 
+Please report any security-related issues directly to prudencio@valencediscovery.com.
diff --git a/.gitignore b/.gitignore
index bc693a8..c1eefd0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -140,10 +140,9 @@ cache/
 *.hdf5
 nohup.out
 *.out
-*.crt 
+*.crt
 *.key
 *.dat
 *.xyz
 *.csv
 *.txt
-
diff --git a/env.yml b/env.yml
index 8eb195c..932af25 100644
--- a/env.yml
+++ b/env.yml
@@ -64,6 +64,6 @@ dependencies:
   - ruff
   - ipykernel
   - pydantic <= 2.0
-  
+
   - pip:
       - torch-nl
diff --git a/openqdc/datasets/ani.py b/openqdc/datasets/ani.py
index 3683bc6..f0b3335 100644
--- a/openqdc/datasets/ani.py
+++ b/openqdc/datasets/ani.py
@@ -1,14 +1,15 @@
 import os
-import numpy as np
 from os.path import join as p_join
-from openqdc.utils.constants import MAX_ATOMIC_NUMBER
+
+import numpy as np
+
 from openqdc.datasets.base import BaseDataset, read_qc_archive_h5
+from openqdc.utils.constants import MAX_ATOMIC_NUMBER
 from openqdc.utils.io import get_local_cache
 
 
 class ANI1(BaseDataset):
-    __name__ = 'ani1'
-
+    __name__ = "ani1"
 
     # Energy in hartree, all zeros by default
     atomic_energies = np.zeros((MAX_ATOMIC_NUMBER,), dtype=np.float32)
@@ -21,29 +22,27 @@ class ANI1(BaseDataset):
         "ωB97x:6-31G(d) Energy",
     ]
 
-
     def __init__(self) -> None:
         super().__init__()
 
     @property
     def root(self):
-        return p_join(get_local_cache(), 'ani')
-    
+        return p_join(get_local_cache(), "ani")
+
     @property
     def preprocess_path(self):
-        path = p_join(self.root, 'preprocessed', self.__name__)
+        path = p_join(self.root, "preprocessed", self.__name__)
         os.makedirs(path, exist_ok=True)
         return path
-    
+
     def read_raw_entries(self):
-        raw_path = p_join(self.root, f'{self.__name__}.h5')
-        samples = read_qc_archive_h5(raw_path, self.__name__, self.energy_target_names, 
-                                     self.force_target_names)
+        raw_path = p_join(self.root, f"{self.__name__}.h5")
+        samples = read_qc_archive_h5(raw_path, self.__name__, self.energy_target_names, self.force_target_names)
         return samples
 
 
 class ANI1CCX(ANI1):
-    __name__ = 'ani1ccx'
+    __name__ = "ani1ccx"
 
     # Energy in hartree, all zeros by default
     atomic_energies = np.zeros((MAX_ATOMIC_NUMBER,), dtype=np.float32)
@@ -67,10 +66,10 @@ class ANI1CCX(ANI1):
 
     def __init__(self) -> None:
         super().__init__()
-    
+
 
 class ANI1X(ANI1):
-    __name__ = 'ani1x'
+    __name__ = "ani1x"
 
     # Energy in hartree, all zeros by default
     atomic_energies = np.zeros((MAX_ATOMIC_NUMBER,), dtype=np.float32)
@@ -94,10 +93,10 @@ class ANI1X(ANI1):
         "MP2:cc-pVQZ Correlation Energy",
         "MP2:cc-pVTZ Correlation Energy",
         "wB97x:6-31G(d) Total Energy",
-        "wB97x:def2-TZVPP Total Energy"
+        "wB97x:def2-TZVPP Total Energy",
     ]
 
-    force_target_names = [ 
+    force_target_names = [
         "wB97x:6-31G(d) Atomic Forces",
         "wB97x:def2-TZVPP Atomic Forces",
     ]
@@ -111,21 +110,21 @@ def __init__(self) -> None:
         super().__init__()
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     for data_class in [
-            ANI1, 
-            # ANI1CCX, 
-            # ANI1X
-        ]:
+        ANI1,
+        # ANI1CCX,
+        # ANI1X
+    ]:
         data = data_class()
         n = len(data)
 
         for i in np.random.choice(n, 3, replace=False):
             x = data[i]
-            print(x.name, x.subset, end=' ')
+            print(x.name, x.subset, end=" ")
             for k in x:
                 if x[k] is not None:
-                    print(k, x[k].shape, end=' ')
-                
+                    print(k, x[k].shape, end=" ")
+
             print()
-        exit()
\ No newline at end of file
+        exit()
diff --git a/openqdc/datasets/base.py b/openqdc/datasets/base.py
index 954c026..254ff0d 100644
--- a/openqdc/datasets/base.py
+++ b/openqdc/datasets/base.py
@@ -1,27 +1,34 @@
 import os
-import torch
-import numpy as np
-import pickle as pkl
-from tqdm import tqdm
 from os.path import join as p_join
+
+import numpy as np
+import torch
 from sklearn.utils import Bunch
-from openqdc.utils.io import get_local_cache, pull_locally, push_remote, load_hdf5_file, copy_exists
+from tqdm import tqdm
+
+from openqdc.utils.constants import NB_ATOMIC_FEATURES
+from openqdc.utils.io import (
+    copy_exists,
+    get_local_cache,
+    load_hdf5_file,
+    pull_locally,
+    push_remote,
+)
 from openqdc.utils.molecule import atom_table
-from openqdc.utils.constants import BOHR2ANG, MAX_ATOMIC_NUMBER, NB_ATOMIC_FEATURES
 
 
-def extract_entry(df, i, subset, energy_target_names, force_target_names=None): 
+def extract_entry(df, i, subset, energy_target_names, force_target_names=None):
     x = np.array([atom_table.GetAtomicNumber(s) for s in df["symbols"][i]])
     xs = np.stack((x, np.zeros_like(x)), axis=-1)
-    positions= df["geometry"][i].reshape((-1, 3))
-    energies= np.array([df[k][i] for k in energy_target_names])
-    
+    positions = df["geometry"][i].reshape((-1, 3))
+    energies = np.array([df[k][i] for k in energy_target_names])
+
     res = dict(
-        name= np.array([df["name"][i]]),
-        subset= np.array([subset]),     
-        energies= energies.reshape((1, -1)).astype(np.float32),
-        atomic_inputs = np.concatenate((xs, positions), axis=-1, dtype=np.float32),
-        n_atoms = np.array([x.shape[0]], dtype=np.int32),
+        name=np.array([df["name"][i]]),
+        subset=np.array([subset]),
+        energies=energies.reshape((1, -1)).astype(np.float32),
+        atomic_inputs=np.concatenate((xs, positions), axis=-1, dtype=np.float32),
+        n_atoms=np.array([x.shape[0]], dtype=np.int32),
     )
     if force_target_names is not None and len(force_target_names) > 0:
         forces = np.zeros((positions.shape[0], 3, len(force_target_names)), dtype=np.float32)
@@ -47,8 +54,7 @@ def read_qc_archive_h5(raw_path, subset, energy_target_names, force_target_names
     # print('\n'*3)
     # exit()
 
-    samples = [extract_entry(data_t, i, subset, energy_target_names, force_target_names) 
-               for i in tqdm(range(n))]
+    samples = [extract_entry(data_t, i, subset, energy_target_names, force_target_names) for i in tqdm(range(n))]
     return samples
 
 
@@ -71,45 +77,44 @@ def __init__(self) -> None:
     @property
     def root(self):
         return p_join(get_local_cache(), self.__name__)
-    
+
     @property
     def preprocess_path(self):
-        path = p_join(self.root, 'preprocessed')
+        path = p_join(self.root, "preprocessed")
         os.makedirs(path, exist_ok=True)
         return path
-    
+
     @property
     def data_keys(self):
         keys = list(self.data_types.keys())
         if len(self.__force_methods__) == 0:
             keys.remove("forces")
         return keys
-    
+
     @property
     def data_types(self):
         return {
-            "atomic_inputs": np.float32, 
-            "position_idx_range": np.int32, 
+            "atomic_inputs": np.float32,
+            "position_idx_range": np.int32,
             "energies": np.float32,
-            "forces": np.float32
+            "forces": np.float32,
         }
-    
+
     @property
     def data_shapes(self):
         return {
-            "atomic_inputs": (-1, NB_ATOMIC_FEATURES), 
-            "position_idx_range": (-1, 2), 
+            "atomic_inputs": (-1, NB_ATOMIC_FEATURES),
+            "position_idx_range": (-1, 2),
             "energies": (-1, len(self.energy_target_names)),
-            "forces": (-1, 3, len(self.force_target_names))
+            "forces": (-1, 3, len(self.force_target_names)),
         }
-    
+
     def read_raw_entries(self):
         raise NotImplementedError
-    
+
     def collate_list(self, list_entries):
         # concatenate entries
-        res = {key: np.concatenate([r[key] for r in list_entries if r is not None], axis=0) 
-               for key in list_entries[0]}
+        res = {key: np.concatenate([r[key] for r in list_entries if r is not None], axis=0) for key in list_entries[0]}
 
         csum = np.cumsum(res.pop("n_atoms"))
         x = np.zeros((csum.shape[0], 2), dtype=np.int32)
@@ -121,14 +126,11 @@ def save_preprocess(self, data_dict):
         # save memmaps
         for key in self.data_keys:
             local_path = p_join(self.preprocess_path, f"{key}.mmap")
-            out = np.memmap(local_path, 
-                            mode="w+", 
-                            dtype=data_dict[key].dtype, 
-                            shape=data_dict[key].shape)
+            out = np.memmap(local_path, mode="w+", dtype=data_dict[key].dtype, shape=data_dict[key].shape)
             out[:] = data_dict.pop(key)[:]
             out.flush()
             push_remote(local_path)
-            
+
         # save smiles and subset
         for key in ["name", "subset"]:
             local_path = p_join(self.preprocess_path, f"{key}.npz")
@@ -136,19 +138,20 @@ def save_preprocess(self, data_dict):
             with open(local_path, "wb") as f:
                 np.savez_compressed(f, uniques=uniques, inv_indices=inv_indices)
             push_remote(local_path)
-    
-    def read_preprocess(self): 
+
+    def read_preprocess(self):
         self.data = {}
         for key in self.data_keys:
             filename = p_join(self.preprocess_path, f"{key}.mmap")
             pull_locally(filename)
             self.data[key] = np.memmap(
-                filename, mode='r', 
+                filename,
+                mode="r",
                 dtype=self.data_types[key],
             ).reshape(self.data_shapes[key])
-            
+
         for key in self.data:
-            print(f'Loaded {key} with shape {self.data[key].shape}, dtype {self.data[key].dtype}')
+            print(f"Loaded {key} with shape {self.data[key].shape}, dtype {self.data[key].dtype}")
 
         for key in ["name", "subset"]:
             filename = p_join(self.preprocess_path, f"{key}.npz")
@@ -156,17 +159,15 @@ def read_preprocess(self):
             # with open(filename, "rb") as f:
             self.data[key] = np.load(open(filename, "rb"))
             for k in self.data[key]:
-                print(f'Loaded {key}_{k} with shape {self.data[key][k].shape}, dtype {self.data[key][k].dtype}')
+                print(f"Loaded {key}_{k} with shape {self.data[key][k].shape}, dtype {self.data[key][k].dtype}")
 
     def is_preprocessed(self):
-        predicats = [copy_exists(p_join(self.preprocess_path, f"{key}.mmap"))
-                     for key in self.data_keys]
-        predicats += [copy_exists(p_join(self.preprocess_path, f"{x}.npz"))
-                      for x in ["name", "subset"]]
+        predicats = [copy_exists(p_join(self.preprocess_path, f"{key}.mmap")) for key in self.data_keys]
+        predicats += [copy_exists(p_join(self.preprocess_path, f"{x}.npz")) for x in ["name", "subset"]]
         return all(predicats)
 
     def __len__(self):
-        return self.data['energies'].shape[0]
+        return self.data["energies"].shape[0]
 
     def __getitem__(self, idx: int):
         p_start, p_end = self.data["position_idx_range"][idx]
@@ -190,5 +191,5 @@ def __getitem__(self, idx: int):
             energies=energies,
             name=name,
             subset=subset,
-            forces=forces
+            forces=forces,
         )
diff --git a/openqdc/datasets/comp6.py b/openqdc/datasets/comp6.py
index cd3e41c..96811c9 100644
--- a/openqdc/datasets/comp6.py
+++ b/openqdc/datasets/comp6.py
@@ -1,12 +1,13 @@
-import numpy as np
 from os.path import join as p_join
-from openqdc.utils.constants import MAX_ATOMIC_NUMBER
+
+import numpy as np
+
 from openqdc.datasets.base import BaseDataset, read_qc_archive_h5
+from openqdc.utils.constants import MAX_ATOMIC_NUMBER
 
 
 class COMP6(BaseDataset):
-    __name__ = 'comp6'
-
+    __name__ = "comp6"
 
     # Energy in hartree, all zeros by default
     atomic_energies = np.zeros((MAX_ATOMIC_NUMBER,), dtype=np.float32)
@@ -40,33 +41,31 @@ class COMP6(BaseDataset):
     ]
 
     force_target_names = [
-        "Gradient",   
+        "Gradient",
     ]
 
     def __init__(self) -> None:
         super().__init__()
-    
+
     def read_raw_entries(self):
         samples = []
         for subset in ["ani_md", "drugbank", "gdb7_9", "gdb10_13", "s66x8", "tripeptides"]:
-            raw_path = p_join(self.root, f'{subset}.h5')
-            samples += read_qc_archive_h5(raw_path, subset, self.energy_target_names, 
-                                      self.force_target_names)
+            raw_path = p_join(self.root, f"{subset}.h5")
+            samples += read_qc_archive_h5(raw_path, subset, self.energy_target_names, self.force_target_names)
 
         return samples
 
 
-
-if __name__ == '__main__':
+if __name__ == "__main__":
     for data_class in [COMP6]:
         data = data_class()
         n = len(data)
 
         for i in np.random.choice(n, 3, replace=False):
             x = data[i]
-            print(x.name, x.subset, end=' ')
+            print(x.name, x.subset, end=" ")
             for k in x:
                 if x[k] is not None:
-                    print(k, x[k].shape, end=' ')
-                
+                    print(k, x[k].shape, end=" ")
+
             print()
diff --git a/openqdc/datasets/gdml.py b/openqdc/datasets/gdml.py
index b53586b..c0ca093 100644
--- a/openqdc/datasets/gdml.py
+++ b/openqdc/datasets/gdml.py
@@ -1,12 +1,13 @@
-import os
-import numpy as np
 from os.path import join as p_join
-from openqdc.utils.constants import MAX_ATOMIC_NUMBER
+
+import numpy as np
+
 from openqdc.datasets.base import BaseDataset, read_qc_archive_h5
+from openqdc.utils.constants import MAX_ATOMIC_NUMBER
 
 
 class GDML(BaseDataset):
-    __name__ = 'gdml'
+    __name__ = "gdml"
 
     # Energy in hartree, all zeros by default
     atomic_energies = np.zeros((MAX_ATOMIC_NUMBER,), dtype=np.float32)
@@ -37,26 +38,24 @@ class GDML(BaseDataset):
 
     def __init__(self) -> None:
         super().__init__()
-    
+
     def read_raw_entries(self):
-        raw_path = p_join(self.root, f'gdml.h5')
-        samples = read_qc_archive_h5(raw_path, "gdml", self.energy_target_names, 
-                                      self.force_target_names)
+        raw_path = p_join(self.root, "gdml.h5")
+        samples = read_qc_archive_h5(raw_path, "gdml", self.energy_target_names, self.force_target_names)
 
         return samples
 
 
-
-if __name__ == '__main__':
+if __name__ == "__main__":
     for data_class in [GDML]:
         data = data_class()
         n = len(data)
 
         for i in np.random.choice(n, 3, replace=False):
             x = data[i]
-            print(x.name, x.subset, end=' ')
+            print(x.name, x.subset, end=" ")
             for k in x:
                 if x[k] is not None:
-                    print(k, x[k].shape, end=' ')
-                
+                    print(k, x[k].shape, end=" ")
+
             print()
diff --git a/openqdc/datasets/geom.py b/openqdc/datasets/geom.py
index c489f58..c2f895a 100644
--- a/openqdc/datasets/geom.py
+++ b/openqdc/datasets/geom.py
@@ -1,15 +1,16 @@
+from os.path import join as p_join
 
-import numpy as np
 import datamol as dm
-from os.path import join as p_join
-from openqdc.utils import load_pkl, load_json
-from openqdc.utils.molecule import get_atomic_numuber_and_charge
-from openqdc.utils.constants import MAX_ATOMIC_NUMBER
+import numpy as np
+
 from openqdc.datasets.base import BaseDataset
+from openqdc.utils import load_json, load_pkl
+from openqdc.utils.constants import MAX_ATOMIC_NUMBER
+from openqdc.utils.molecule import get_atomic_numuber_and_charge
 
 
 def read_mol(mol_id, mol_dict, base_path, partition):
-    """ Read molecule from pickle file and return dict with conformers and energies
+    """Read molecule from pickle file and return dict with conformers and energies
 
     Parameters
     ----------
@@ -31,31 +32,31 @@ def read_mol(mol_id, mol_dict, base_path, partition):
     """
 
     try:
-        d = load_pkl(p_join(base_path, mol_dict['pickle_path']), False)
-        confs = d['conformers']
-        x = get_atomic_numuber_and_charge(confs[0]['rd_mol'])
-        positions = np.array([cf['rd_mol'].GetConformer().GetPositions() for cf in confs])
+        d = load_pkl(p_join(base_path, mol_dict["pickle_path"]), False)
+        confs = d["conformers"]
+        x = get_atomic_numuber_and_charge(confs[0]["rd_mol"])
+        positions = np.array([cf["rd_mol"].GetConformer().GetPositions() for cf in confs])
         n_confs = positions.shape[0]
 
         res = dict(
-            atomic_inputs = np.concatenate((
-                x[None, ...].repeat(n_confs, axis=0), 
-                positions), axis=-1, dtype=np.float32).reshape(-1, 5),
-            name = np.array([d['smiles'] for _ in confs]),
-            energies = np.array([cf['totalenergy'] for cf in confs], dtype=np.float32)[:, None],
-            n_atoms = np.array([positions.shape[1]] * n_confs, dtype=np.int32),
-            subset = np.array([partition] * n_confs),
+            atomic_inputs=np.concatenate(
+                (x[None, ...].repeat(n_confs, axis=0), positions), axis=-1, dtype=np.float32
+            ).reshape(-1, 5),
+            name=np.array([d["smiles"] for _ in confs]),
+            energies=np.array([cf["totalenergy"] for cf in confs], dtype=np.float32)[:, None],
+            n_atoms=np.array([positions.shape[1]] * n_confs, dtype=np.int32),
+            subset=np.array([partition] * n_confs),
         )
 
     except Exception as e:
-        print (f'Skipping: {mol_id} due to {e}')
+        print(f"Skipping: {mol_id} due to {e}")
         res = None
 
     return res
 
 
 class GEOM(BaseDataset):
-    __name__ = 'geom'
+    __name__ = "geom"
     __energy_methods__ = ["gfn2_xtb"]
 
     energy_target_names = ["gfn2_xtb.energy"]
@@ -64,36 +65,36 @@ class GEOM(BaseDataset):
     # Energy in hartree, all zeros by default
     atomic_energies = np.zeros((MAX_ATOMIC_NUMBER,), dtype=np.float32)
 
-    partitions = ['qm9', 'drugs']
+    partitions = ["qm9", "drugs"]
 
     def __init__(self) -> None:
         super().__init__()
 
     def _read_raw_(self, partition):
-        raw_path = p_join(self.root, 'rdkit_folder')
+        raw_path = p_join(self.root, "rdkit_folder")
 
-        mols = load_json(p_join(raw_path, f'summary_{partition}.json'))
+        mols = load_json(p_join(raw_path, f"summary_{partition}.json"))
         mols = list(mols.items())
 
-        fn = lambda x: read_mol(x[0], x[1], raw_path, partition)
-        samples = dm.parallelized(fn, mols, n_jobs=1, progress=True) # don't use more than 1 job
+        fn = lambda x: read_mol(x[0], x[1], raw_path, partition)  # noqa E731
+        samples = dm.parallelized(fn, mols, n_jobs=1, progress=True)  # don't use more than 1 job
         return samples
-    
+
     def read_raw_entries(self):
         samples = sum([self._read_raw_(partition) for partition in self.partitions], [])
         return samples
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     for data_class in [GEOM]:
         data = data_class()
         n = len(data)
 
         for i in np.random.choice(n, 3, replace=False):
             x = data[i]
-            print(x.name, x.subset, end=' ')
+            print(x.name, x.subset, end=" ")
             for k in x:
                 if x[k] is not None:
-                    print(k, x[k].shape, end=' ')
-                
-            print()
\ No newline at end of file
+                    print(k, x[k].shape, end=" ")
+
+            print()
diff --git a/openqdc/datasets/iso_17.py b/openqdc/datasets/iso_17.py
index 4e86637..55f395c 100644
--- a/openqdc/datasets/iso_17.py
+++ b/openqdc/datasets/iso_17.py
@@ -1,12 +1,13 @@
-import os
-import numpy as np
 from os.path import join as p_join
-from openqdc.utils.constants import MAX_ATOMIC_NUMBER
+
+import numpy as np
+
 from openqdc.datasets.base import BaseDataset, read_qc_archive_h5
+from openqdc.utils.constants import MAX_ATOMIC_NUMBER
 
 
 class ISO17(BaseDataset):
-    __name__ = 'iso_17'
+    __name__ = "iso_17"
 
     # Energy in hartree, all zeros by default
     atomic_energies = np.zeros((MAX_ATOMIC_NUMBER,), dtype=np.float32)
@@ -29,26 +30,24 @@ class ISO17(BaseDataset):
 
     def __init__(self) -> None:
         super().__init__()
-    
+
     def read_raw_entries(self):
-        raw_path = p_join(self.root, f'iso_17.h5')
-        samples = read_qc_archive_h5(raw_path, "iso_17", self.energy_target_names, 
-                                      self.force_target_names)
+        raw_path = p_join(self.root, "iso_17.h5")
+        samples = read_qc_archive_h5(raw_path, "iso_17", self.energy_target_names, self.force_target_names)
 
         return samples
 
 
-
-if __name__ == '__main__':
+if __name__ == "__main__":
     for data_class in [ISO17]:
         data = data_class()
         n = len(data)
 
         for i in np.random.choice(n, 3, replace=False):
             x = data[i]
-            print(x.name, x.subset, end=' ')
+            print(x.name, x.subset, end=" ")
             for k in x:
                 if x[k] is not None:
-                    print(k, x[k].shape, end=' ')
-                
+                    print(k, x[k].shape, end=" ")
+
             print()
diff --git a/openqdc/datasets/molecule3d.py b/openqdc/datasets/molecule3d.py
index da15cd9..ac4f348 100644
--- a/openqdc/datasets/molecule3d.py
+++ b/openqdc/datasets/molecule3d.py
@@ -1,41 +1,41 @@
+from glob import glob
+from os.path import join as p_join
 
+import datamol as dm
 import numpy as np
 import pandas as pd
-import datamol as dm
-from tqdm import tqdm
-from glob import glob
 from rdkit import Chem
-from os.path import join as p_join
-from openqdc.utils.molecule import get_atomic_numuber_and_charge
-from openqdc.utils.constants import BOHR2ANG, MAX_ATOMIC_NUMBER
+from tqdm import tqdm
+
 from openqdc.datasets.base import BaseDataset
+from openqdc.utils.constants import BOHR2ANG, MAX_ATOMIC_NUMBER
+from openqdc.utils.molecule import get_atomic_numuber_and_charge
 
 
 def read_mol(mol, energy):
     smiles = dm.to_smiles(mol, explicit_hs=False)
     # subset = dm.to_smiles(dm.to_scaffold_murcko(mol, make_generic=True), explicit_hs=False)
     x = get_atomic_numuber_and_charge(mol)
-    positions= mol.GetConformer().GetPositions() * BOHR2ANG
-    
+    positions = mol.GetConformer().GetPositions() * BOHR2ANG
+
     res = dict(
-        name= np.array([smiles]),
-        subset= np.array(["molecule3d"]),     
-        energies= np.array([energy]).astype(np.float32)[:, None],
-        atomic_inputs = np.concatenate((x, positions), axis=-1, dtype=np.float32),
-        n_atoms = np.array([x.shape[0]], dtype=np.int32),
+        name=np.array([smiles]),
+        subset=np.array(["molecule3d"]),
+        energies=np.array([energy]).astype(np.float32)[:, None],
+        atomic_inputs=np.concatenate((x, positions), axis=-1, dtype=np.float32),
+        n_atoms=np.array([x.shape[0]], dtype=np.int32),
     )
 
     return res
 
 
 def _read_sdf(sdf_path, properties_path):
-
     properties = pd.read_csv(properties_path, dtype={"cid": str})
     properties.drop_duplicates(subset="cid", inplace=True, keep="first")
     xys = properties[["cid", "scf energy"]]
     properties = dict(zip(xys.cid.values, xys["scf energy"].values))
-        
-    get_e = lambda mol: properties[mol.GetProp('_Name').split(" ")[1]]
+
+    get_e = lambda mol: properties[mol.GetProp("_Name").split(" ")[1]]
     fn = lambda x: read_mol(x, get_e(x))
 
     suppl = Chem.SDMolSupplier(sdf_path, removeHs=False, sanitize=True)
@@ -43,8 +43,9 @@ def _read_sdf(sdf_path, properties_path):
 
     return tmp
 
+
 class Molecule3D(BaseDataset):
-    __name__ = 'molecule3d'
+    __name__ = "molecule3d"
     __energy_methods__ = ["b3lyp_6-31g*"]
 
     energy_target_names = ["b3lyp/6-31g*.energy"]
@@ -56,26 +57,26 @@ def __init__(self) -> None:
         super().__init__()
 
     def read_raw_entries(self):
-        raw = p_join(self.root, 'data', 'raw')
-        sdf_paths = glob(p_join(raw, '*.sdf'))
-        properties_path = p_join(raw, 'properties.csv')
+        raw = p_join(self.root, "data", "raw")
+        sdf_paths = glob(p_join(raw, "*.sdf"))
+        properties_path = p_join(raw, "properties.csv")
 
         fn = lambda x: _read_sdf(x, properties_path)
-        res = dm.parallelized(fn, sdf_paths, n_jobs=1) # don't use more than 1 job
+        res = dm.parallelized(fn, sdf_paths, n_jobs=1)  # don't use more than 1 job
         samples = sum(res, [])
         return samples
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     for data_class in [Molecule3D]:
         data = data_class()
         n = len(data)
 
         for i in np.random.choice(n, 3, replace=False):
             x = data[i]
-            print(x.name, x.subset, end=' ')
+            print(x.name, x.subset, end=" ")
             for k in x:
                 if x[k] is not None:
-                    print(k, x[k].shape, end=' ')
-                
-            print()
\ No newline at end of file
+                    print(k, x[k].shape, end=" ")
+
+            print()
diff --git a/openqdc/datasets/nabladft.py b/openqdc/datasets/nabladft.py
index 25b4932..3234011 100644
--- a/openqdc/datasets/nabladft.py
+++ b/openqdc/datasets/nabladft.py
@@ -1,12 +1,13 @@
-
 import os
-import numpy as np
+from os.path import join as p_join
+
 import datamol as dm
+import numpy as np
+from nablaDFT.dataset import HamiltonianDatabase
 from tqdm import tqdm
-from os.path import join as p_join
-from openqdc.utils.constants import MAX_ATOMIC_NUMBER
+
 from openqdc.datasets.base import BaseDataset
-from nablaDFT.dataset import HamiltonianDatabase
+from openqdc.utils.constants import MAX_ATOMIC_NUMBER
 
 
 def to_mol(entry):
@@ -14,29 +15,29 @@ def to_mol(entry):
     C = np.zeros_like(Z)
 
     res = dict(
-        atomic_inputs = np.concatenate((Z[:, None], C[:, None],  R), axis=-1).astype(np.float32),
-        name = np.array(['']),
-        energies = E[:, None].astype(np.float32),
-        forces = F[:, :, None].astype(np.float32),
-        n_atoms = np.array([Z.shape[0]], dtype=np.int32),
-        subset = np.array(['nabla']),
+        atomic_inputs=np.concatenate((Z[:, None], C[:, None], R), axis=-1).astype(np.float32),
+        name=np.array([""]),
+        energies=E[:, None].astype(np.float32),
+        forces=F[:, :, None].astype(np.float32),
+        n_atoms=np.array([Z.shape[0]], dtype=np.int32),
+        subset=np.array(["nabla"]),
     )
 
     return res
 
 
 def read_chunk_from_db(raw_path, start_idx, stop_idx, step_size=1000):
-    print(f'Loading from {start_idx} to {stop_idx}')
+    print(f"Loading from {start_idx} to {stop_idx}")
     db = HamiltonianDatabase(raw_path)
     idxs = list(np.arange(start_idx, stop_idx))
     n, s = len(idxs), step_size
 
-    samples = [to_mol(entry) for i in tqdm(range(0,  n, s)) for entry in db[idxs[i:i + s]]]
+    samples = [to_mol(entry) for i in tqdm(range(0, n, s)) for entry in db[idxs[i : i + s]]]
     return samples
-    
+
 
 class NablaDFT(BaseDataset):
-    __name__ = 'nabladft'
+    __name__ = "nabladft"
     __energy_methods__ = ["wb97x-d_svp"]
 
     energy_target_names = ["ωB97X-D/def2-SVP"]
@@ -47,30 +48,30 @@ class NablaDFT(BaseDataset):
     def __init__(self) -> None:
         super().__init__()
 
-
     def read_raw_entries(self):
-        raw_path = p_join(self.root, 'dataset_full.db')
+        raw_path = p_join(self.root, "dataset_full.db")
         train = HamiltonianDatabase(raw_path)
         n, c = len(train), 20
         step_size = int(np.ceil(n / os.cpu_count()))
 
-        fn = lambda i: read_chunk_from_db(raw_path, i*step_size, min((i + 1) * step_size, n))
-        samples = dm.parallelized(fn, list(range(c)), n_jobs=c, progress=False, scheduler="threads") # don't use more than 1 job
-            
+        fn = lambda i: read_chunk_from_db(raw_path, i * step_size, min((i + 1) * step_size, n))
+        samples = dm.parallelized(
+            fn, list(range(c)), n_jobs=c, progress=False, scheduler="threads"
+        )  # NOTE: runs c parallel thread jobs (n_jobs=c), one per chunk index
+
         return sum(samples, [])
-    
 
-if __name__ == '__main__':
+
+if __name__ == "__main__":
     for data_class in [NablaDFT]:
         data = data_class()
         n = len(data)
 
         for i in np.random.choice(n, 3, replace=False):
             x = data[i]
-            print(x.name, x.subset, end=' ')
+            print(x.name, x.subset, end=" ")
             for k in x:
                 if x[k] is not None:
-                    print(k, x[k].shape, end=' ')
-                
-            print()
+                    print(k, x[k].shape, end=" ")
 
+            print()
diff --git a/openqdc/datasets/orbnet_denali.py b/openqdc/datasets/orbnet_denali.py
index 157f80f..452cce1 100644
--- a/openqdc/datasets/orbnet_denali.py
+++ b/openqdc/datasets/orbnet_denali.py
@@ -1,18 +1,12 @@
-import os
-import glob
-import torch
-import pickle as pkl
+from os.path import join as p_join
+
+import datamol as dm
 import numpy as np
 import pandas as pd
-from tqdm import tqdm
-import datamol as dm
-from sklearn.utils import Bunch
-from os.path import join as p_join
-from openqdc.utils import load_json
-from openqdc.utils.molecule import atom_table
-from openqdc.utils.io import get_local_cache
-from openqdc.utils.constants import MAX_ATOMIC_NUMBER
+
 from openqdc.datasets.base import BaseDataset
+from openqdc.utils.constants import MAX_ATOMIC_NUMBER
+from openqdc.utils.molecule import atom_table
 
 
 def read_mol(mol_id, conf_dict, base_path, energy_target_names):
@@ -20,30 +14,30 @@ def read_mol(mol_id, conf_dict, base_path, energy_target_names):
     for conf_id, conf_label in conf_dict.items():
         try:
             cf_name = p_join(base_path, "xyz_files", mol_id, f"{conf_id}.xyz")
-            d = np.loadtxt(cf_name, skiprows=2, dtype='str')
+            d = np.loadtxt(cf_name, skiprows=2, dtype="str")
             z, positions = d[:, 0], d[:, 1:].astype(np.float32)
             z = np.array([atom_table.GetAtomicNumber(s) for s in z])
             xs = np.stack((z, np.zeros_like(z)), axis=-1)
 
             conf = dict(
-                atomic_inputs = np.concatenate((xs, positions), axis=-1, dtype=np.float32),
-                name = np.array([mol_id]),
-                energies = np.array([conf_label[k] for k in energy_target_names], dtype=np.float32)[None, :],
-                n_atoms = np.array([positions.shape[0]], dtype=np.int32),
-                subset = np.array([conf_label["subset"]]),
+                atomic_inputs=np.concatenate((xs, positions), axis=-1, dtype=np.float32),
+                name=np.array([mol_id]),
+                energies=np.array([conf_label[k] for k in energy_target_names], dtype=np.float32)[None, :],
+                n_atoms=np.array([positions.shape[0]], dtype=np.int32),
+                subset=np.array([conf_label["subset"]]),
             )
             res.append(conf)
         except Exception as e:
-            print (f'Skipping: {mol_id} {conf_id} due to {e}')
+            print(f"Skipping: {mol_id} {conf_id} due to {e}")
 
     return res
 
 
 class OrbnetDenali(BaseDataset):
-    __name__ = 'orbnet_denali'
+    __name__ = "orbnet_denali"
     __energy_methods__ = ["wb97x-d3_tz", "gfn1_xtb"]
 
-    energy_target_names = ['dft_energy', 'xtb1_energy']
+    energy_target_names = ["dft_energy", "xtb1_energy"]
 
     # Energy in hartree, all zeros by default
     atomic_energies = np.zeros((MAX_ATOMIC_NUMBER,), dtype=np.float32)
@@ -52,10 +46,12 @@ def __init__(self) -> None:
         super().__init__()
 
     def read_raw_entries(self):
-        label_path = p_join(self.root, 'denali_labels.csv')
-        df = pd.read_csv(label_path, usecols=['sample_id', 'mol_id', 'subset', 'dft_energy', 'xtb1_energy'])
-        labels = {mol_id: group.drop(['mol_id'], axis=1).drop_duplicates("sample_id").set_index("sample_id").to_dict('index') 
-                  for mol_id, group in df.groupby('mol_id')}
+        label_path = p_join(self.root, "denali_labels.csv")
+        df = pd.read_csv(label_path, usecols=["sample_id", "mol_id", "subset", "dft_energy", "xtb1_energy"])
+        labels = {
+            mol_id: group.drop(["mol_id"], axis=1).drop_duplicates("sample_id").set_index("sample_id").to_dict("index")
+            for mol_id, group in df.groupby("mol_id")
+        }
 
         # print(df.head())
         # tmp = df.to_dict('index')
@@ -65,21 +61,21 @@ def read_raw_entries(self):
         #         break
         # exit()
         fn = lambda x: read_mol(x[0], x[1], self.root, self.energy_target_names)
-        res = dm.parallelized(fn, list(labels.items()), scheduler='threads', n_jobs=-1, progress=True)
+        res = dm.parallelized(fn, list(labels.items()), scheduler="threads", n_jobs=-1, progress=True)
         samples = sum(res, [])
         return samples
-    
 
-if __name__ == '__main__':
+
+if __name__ == "__main__":
     for data_class in [OrbnetDenali]:
         data = data_class()
         n = len(data)
 
         for i in np.random.choice(n, 3, replace=False):
             x = data[i]
-            print(x.name, x.subset, end=' ')
+            print(x.name, x.subset, end=" ")
             for k in x:
                 if x[k] is not None:
-                    print(k, x[k].shape, end=' ')
-                
-            print()
\ No newline at end of file
+                    print(k, x[k].shape, end=" ")
+
+            print()
diff --git a/openqdc/datasets/qm7x.py b/openqdc/datasets/qm7x.py
index 4e86637..55f395c 100644
--- a/openqdc/datasets/qm7x.py
+++ b/openqdc/datasets/qm7x.py
@@ -1,12 +1,13 @@
-import os
-import numpy as np
 from os.path import join as p_join
-from openqdc.utils.constants import MAX_ATOMIC_NUMBER
+
+import numpy as np
+
 from openqdc.datasets.base import BaseDataset, read_qc_archive_h5
+from openqdc.utils.constants import MAX_ATOMIC_NUMBER
 
 
 class ISO17(BaseDataset):
-    __name__ = 'iso_17'
+    __name__ = "iso_17"
 
     # Energy in hartree, all zeros by default
     atomic_energies = np.zeros((MAX_ATOMIC_NUMBER,), dtype=np.float32)
@@ -29,26 +30,24 @@ class ISO17(BaseDataset):
 
     def __init__(self) -> None:
         super().__init__()
-    
+
     def read_raw_entries(self):
-        raw_path = p_join(self.root, f'iso_17.h5')
-        samples = read_qc_archive_h5(raw_path, "iso_17", self.energy_target_names, 
-                                      self.force_target_names)
+        raw_path = p_join(self.root, "iso_17.h5")
+        samples = read_qc_archive_h5(raw_path, "iso_17", self.energy_target_names, self.force_target_names)
 
         return samples
 
 
-
-if __name__ == '__main__':
+if __name__ == "__main__":
     for data_class in [ISO17]:
         data = data_class()
         n = len(data)
 
         for i in np.random.choice(n, 3, replace=False):
             x = data[i]
-            print(x.name, x.subset, end=' ')
+            print(x.name, x.subset, end=" ")
             for k in x:
                 if x[k] is not None:
-                    print(k, x[k].shape, end=' ')
-                
+                    print(k, x[k].shape, end=" ")
+
             print()
diff --git a/openqdc/datasets/qmugs.py b/openqdc/datasets/qmugs.py
index 83b67f1..6868f38 100644
--- a/openqdc/datasets/qmugs.py
+++ b/openqdc/datasets/qmugs.py
@@ -1,11 +1,13 @@
 import os
-import numpy as np
-import datamol as dm
 from glob import glob
 from os.path import join as p_join
-from openqdc.utils.molecule import get_atomic_numuber_and_charge
-from openqdc.utils.constants import BOHR2ANG, MAX_ATOMIC_NUMBER
+
+import datamol as dm
+import numpy as np
+
 from openqdc.datasets.base import BaseDataset
+from openqdc.utils.constants import MAX_ATOMIC_NUMBER
+from openqdc.utils.molecule import get_atomic_numuber_and_charge
 
 
 def read_mol(mol_dir):
@@ -18,27 +20,29 @@ def read_mol(mol_dir):
 
     smiles = dm.to_smiles(mols[0], explicit_hs=False)
     x = get_atomic_numuber_and_charge(mols[0])[None, ...].repeat(n_confs, axis=0)
-    positions= np.array([mol.GetConformer().GetPositions() for mol in mols])
+    positions = np.array([mol.GetConformer().GetPositions() for mol in mols])
     props = [mol.GetPropsAsDict() for mol in mols]
-    targets = np.array([[p[el]for el in QMugs.energy_target_names] for p in props])
-    
+    targets = np.array([[p[el] for el in QMugs.energy_target_names] for p in props])
+
     res = dict(
-        name= np.array([smiles]*n_confs),
-        subset= np.array(['qmugs']*n_confs),     
-        energies= targets.astype(np.float32),
-        atomic_inputs = np.concatenate((x, positions), 
-                                    axis=-1, dtype=np.float32).reshape(-1, 5),
-        n_atoms = np.array([x.shape[1]]*n_confs, dtype=np.int32),
+        name=np.array([smiles] * n_confs),
+        subset=np.array(["qmugs"] * n_confs),
+        energies=targets.astype(np.float32),
+        atomic_inputs=np.concatenate((x, positions), axis=-1, dtype=np.float32).reshape(-1, 5),
+        n_atoms=np.array([x.shape[1]] * n_confs, dtype=np.int32),
     )
 
     return res
 
 
 class QMugs(BaseDataset):
-    __name__ = 'qmugs'
+    __name__ = "qmugs"
     __energy_methods__ = ["gfn2_xtb", "b3lyp/6-31g*"]
 
-    energy_target_names = ["GFN2:TOTAL_ENERGY", "DFT:TOTAL_ENERGY",]
+    energy_target_names = [
+        "GFN2:TOTAL_ENERGY",
+        "DFT:TOTAL_ENERGY",
+    ]
 
     # Energy in hartree, all zeros by default
     atomic_energies = np.zeros((MAX_ATOMIC_NUMBER,), dtype=np.float32)
@@ -47,25 +51,23 @@ def __init__(self) -> None:
         super().__init__()
 
     def read_raw_entries(self):
-        raw_path = p_join(self.root, 'structures')
+        raw_path = p_join(self.root, "structures")
         mol_dirs = [p_join(raw_path, d) for d in os.listdir(raw_path)]
 
-        samples = dm.parallelized(read_mol, mol_dirs, n_jobs=-1, 
-                              progress=True, scheduler="threads")
+        samples = dm.parallelized(read_mol, mol_dirs, n_jobs=-1, progress=True, scheduler="threads")
         return samples
-    
 
-if __name__ == '__main__':
+
+if __name__ == "__main__":
     for data_class in [QMugs]:
         data = data_class()
         n = len(data)
 
         for i in np.random.choice(n, 3, replace=False):
             x = data[i]
-            print(x.name, x.subset, end=' ')
+            print(x.name, x.subset, end=" ")
             for k in x:
                 if x[k] is not None:
-                    print(k, x[k].shape, end=' ')
-                
-            print()
+                    print(k, x[k].shape, end=" ")
 
+            print()
diff --git a/openqdc/datasets/sn2_rxn.py b/openqdc/datasets/sn2_rxn.py
index cdb6e27..8be8281 100644
--- a/openqdc/datasets/sn2_rxn.py
+++ b/openqdc/datasets/sn2_rxn.py
@@ -1,12 +1,13 @@
-import os
-import numpy as np
 from os.path import join as p_join
-from openqdc.utils.constants import MAX_ATOMIC_NUMBER
+
+import numpy as np
+
 from openqdc.datasets.base import BaseDataset, read_qc_archive_h5
+from openqdc.utils.constants import MAX_ATOMIC_NUMBER
 
 
 class SN2RXN(BaseDataset):
-    __name__ = 'iso_17'
+    __name__ = "iso_17"
 
     # Energy in hartree, all zeros by default
     atomic_energies = np.zeros((MAX_ATOMIC_NUMBER,), dtype=np.float32)
@@ -29,26 +30,24 @@ class SN2RXN(BaseDataset):
 
     def __init__(self) -> None:
         super().__init__()
-    
+
     def read_raw_entries(self):
-        raw_path = p_join(self.root, f'iso_17.h5')
-        samples = read_qc_archive_h5(raw_path, "iso_17", self.energy_target_names, 
-                                      self.force_target_names)
+        raw_path = p_join(self.root, "iso_17.h5")
+        samples = read_qc_archive_h5(raw_path, "iso_17", self.energy_target_names, self.force_target_names)
 
         return samples
 
 
-
-if __name__ == '__main__':
+if __name__ == "__main__":
     for data_class in [SN2RXN]:
         data = data_class()
         n = len(data)
 
         for i in np.random.choice(n, 3, replace=False):
             x = data[i]
-            print(x.name, x.subset, end=' ')
+            print(x.name, x.subset, end=" ")
             for k in x:
                 if x[k] is not None:
-                    print(k, x[k].shape, end=' ')
-                
+                    print(k, x[k].shape, end=" ")
+
             print()
diff --git a/openqdc/datasets/spice.py b/openqdc/datasets/spice.py
index 1042f92..0aec9b2 100644
--- a/openqdc/datasets/spice.py
+++ b/openqdc/datasets/spice.py
@@ -1,11 +1,13 @@
+from os.path import join as p_join
+
+import datamol as dm
 import numpy as np
 from tqdm import tqdm
-import datamol as dm
-from os.path import join as p_join
+
+from openqdc.datasets.base import BaseDataset
 from openqdc.utils import load_hdf5_file
-from openqdc.utils.molecule import get_atomic_numuber_and_charge
 from openqdc.utils.constants import BOHR2ANG, MAX_ATOMIC_NUMBER
-from openqdc.datasets.base import BaseDataset
+from openqdc.utils.molecule import get_atomic_numuber_and_charge
 
 
 def read_record(r):
@@ -13,24 +15,24 @@ def read_record(r):
     subset = r["subset"][0].decode("utf-8")
     n_confs = r["conformations"].shape[0]
     x = get_atomic_numuber_and_charge(dm.to_mol(smiles, add_hs=True))
-    positions= r["conformations"][:] * BOHR2ANG
-    
+    positions = r["conformations"][:] * BOHR2ANG
+
     res = dict(
-        smiles= np.array([smiles]*n_confs),
-        subset= np.array([Spice.subset_mapping[subset]]*n_confs),     
-        energies= r[Spice.energy_target_names[0]][:][:, None].astype(np.float32),
-        forces= r[Spice.force_target_names[0]][:].reshape(-1, 3, 1) / BOHR2ANG,
-        atomic_inputs = np.concatenate((
-            x[None, ...].repeat(n_confs, axis=0), 
-            positions), axis=-1, dtype=np.float32).reshape(-1, 5),
-        n_atoms = np.array([x.shape[0]]*n_confs, dtype=np.int32),
+        smiles=np.array([smiles] * n_confs),
+        subset=np.array([Spice.subset_mapping[subset]] * n_confs),
+        energies=r[Spice.energy_target_names[0]][:][:, None].astype(np.float32),
+        forces=r[Spice.force_target_names[0]][:].reshape(-1, 3, 1) / BOHR2ANG,
+        atomic_inputs=np.concatenate(
+            (x[None, ...].repeat(n_confs, axis=0), positions), axis=-1, dtype=np.float32
+        ).reshape(-1, 5),
+        n_atoms=np.array([x.shape[0]] * n_confs, dtype=np.int32),
     )
 
     return res
 
 
 class Spice(BaseDataset):
-    __name__ = 'spice'
+    __name__ = "spice"
     __energy_methods__ = ["wb97x_tz"]
 
     energy_target_names = ["dft_total_energy"]
@@ -78,23 +80,23 @@ def __init__(self) -> None:
         super().__init__()
 
     def read_raw_entries(self):
-        raw_path = p_join(self.root, 'SPICE-1.1.4.hdf5')
-        
-        data = load_hdf5_file(raw_path) 
-        tmp = [read_record(data[mol_name]) for mol_name in tqdm(data)] # don't use parallelized here
+        raw_path = p_join(self.root, "SPICE-1.1.4.hdf5")
+
+        data = load_hdf5_file(raw_path)
+        tmp = [read_record(data[mol_name]) for mol_name in tqdm(data)]  # don't use parallelized here
 
         return tmp
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     data = Spice()
     n = len(data)
 
     for i in np.random.choice(n, 10, replace=False):
         x = data[i]
-        print(x.smiles, x.subset, end=' ')
+        print(x.smiles, x.subset, end=" ")
         for k in x:
-            if k != 'smiles' and k != 'subset':
-                print(k, x[k].shape if x[k] is not None else None, end=' ')
-            
-        print()
\ No newline at end of file
+            if k != "smiles" and k != "subset":
+                print(k, x[k].shape if x[k] is not None else None, end=" ")
+
+        print()
diff --git a/openqdc/raws/config_factory.py b/openqdc/raws/config_factory.py
index df1ba44..86d1d33 100644
--- a/openqdc/raws/config_factory.py
+++ b/openqdc/raws/config_factory.py
@@ -1,6 +1,4 @@
-
 class DataConfigFactory:
-
     ani = dict(
         dataset_name="ani",
         links={
@@ -24,100 +22,82 @@ class DataConfigFactory:
 
     gdml = dict(
         dataset_name="gdml",
-        links = {
-            "gdml.hdf5.gz": "https://zenodo.org/record/3585908/files/219.hdf5.gz"
-        },
-    )    
+        links={"gdml.hdf5.gz": "https://zenodo.org/record/3585908/files/219.hdf5.gz"},
+    )
 
     solvated_peptides = dict(
         dataset_name="solvated_peptides",
-        links = {
-            "solvated_peptides.hdf5.gz": "https://zenodo.org/record/3585804/files/213.hdf5.gz"
-        },
+        links={"solvated_peptides.hdf5.gz": "https://zenodo.org/record/3585804/files/213.hdf5.gz"},
     )
 
     iso_17 = dict(
         dataset_name="iso_17",
-        links = {
-            "iso_17.hdf5.gz": "https://zenodo.org/record/3585907/files/216.hdf5.gz"
-        },
-    )   
+        links={"iso_17.hdf5.gz": "https://zenodo.org/record/3585907/files/216.hdf5.gz"},
+    )
 
     sn2_rxn = dict(
         dataset_name="sn2_rxn",
-        links = {
-            "sn2_rxn.hdf5.gz": "https://zenodo.org/record/3585800/files/212.hdf5.gz"
-        },
+        links={"sn2_rxn.hdf5.gz": "https://zenodo.org/record/3585800/files/212.hdf5.gz"},
     )
 
     # FROM: https://sites.uw.edu/wdbase/database-of-water-clusters/
     waterclusters3_30 = dict(
         dataset_name="waterclusters3_30",
-        links = {
-            "W3-W30_all_geoms_TTM2.1-F.zip": "https://drive.google.com/uc?id=18Y7OiZXSCTsHrQ83GCc4fyE_abbL6E_n"
-        },
+        links={"W3-W30_all_geoms_TTM2.1-F.zip": "https://drive.google.com/uc?id=18Y7OiZXSCTsHrQ83GCc4fyE_abbL6E_n"},
     )
 
     geom = dict(
         dataset_name="geom",
-        links = {
-            "rdkit_folder.tar.gz": "https://dataverse.harvard.edu/api/access/datafile/4327252"
-        },
+        links={"rdkit_folder.tar.gz": "https://dataverse.harvard.edu/api/access/datafile/4327252"},
     )
 
     molecule3d = dict(
         dataset_name="molecule3d",
-        links={
-            "molecule3d.zip": "https://drive.google.com/uc?id=1C_KRf8mX-gxny7kL9ACNCEV4ceu_fUGy"
-        },
+        links={"molecule3d.zip": "https://drive.google.com/uc?id=1C_KRf8mX-gxny7kL9ACNCEV4ceu_fUGy"},
     )
 
     orbnet_denali = dict(
         dataset_name="orbnet_denali",
         links={
             "orbnet_denali.tar.gz": "https://figshare.com/ndownloader/files/28672287",
-            "orbnet_denali_targets.tar.gz": "https://figshare.com/ndownloader/files/28672248"},
+            "orbnet_denali_targets.tar.gz": "https://figshare.com/ndownloader/files/28672248",
+        },
     )
 
     qm7x = dict(
         dataset_name="qm7x",
-        links={f"{i}000.xz":"https://zenodo.org/record/4288677/files/{i}000.xz" for i in range(1, 9)
-        }
+        links={f"{i}000.xz": f"https://zenodo.org/record/4288677/files/{i}000.xz" for i in range(1, 9)},
     )
 
     qmugs = dict(
         dataset_name="qmugs",
         links={
-                "summary.csv": "https://libdrive.ethz.ch/index.php/s/X5vOBNSITAG5vzM/download?path=%2F&files=summary.csv",
-                "structures.tar.gz": "https://libdrive.ethz.ch/index.php/s/X5vOBNSITAG5vzM/download?path=%2F&files=structures.tar.gz",
+            "summary.csv": "https://libdrive.ethz.ch/index.php/s/X5vOBNSITAG5vzM/download?path=%2F&files=summary.csv",
+            "structures.tar.gz": "https://libdrive.ethz.ch/index.php/s/X5vOBNSITAG5vzM/download?path=%2F&files=structures.tar.gz",
         },
     )
 
     spice = dict(
         dataset_name="spice",
-        links={
-            "SPICE-1.1.4.hdf5": "https://zenodo.org/record/8222043/files/SPICE-1.1.4.hdf5"
-            },
+        links={"SPICE-1.1.4.hdf5": "https://zenodo.org/record/8222043/files/SPICE-1.1.4.hdf5"},
     )
 
     misato = dict(
         dataset_name="misato",
         links={
-                "MD.hdf5": "https://zenodo.org/record/7711953/files/MD.hdf5",
-                "QM.hdf5": "https://zenodo.org/record/7711953/files/QM.hdf5"
-            },
+            "MD.hdf5": "https://zenodo.org/record/7711953/files/MD.hdf5",
+            "QM.hdf5": "https://zenodo.org/record/7711953/files/QM.hdf5",
+        },
     )
 
     nabladft = dict(
         dataset_name="nabladft",
-        links={
-            "nabladft.db": "https://n-usr-31b1j.s3pd12.sbercloud.ru/b-usr-31b1j-qz9/data/moses_db/dataset_full.db"
-        },
+        links={"nabladft.db": "https://n-usr-31b1j.s3pd12.sbercloud.ru/b-usr-31b1j-qz9/data/moses_db/dataset_full.db"},
         cmd=[
             "axel -n 10 --output=dataset_full.db https://n-usr-31b1j.s3pd12.sbercloud.ru/b-usr-31b1j-qz9/data/moses_db/dataset_full.db"
-        ]
+        ],
     )
-    
+
     pubchemqc = dict(
         dataset_name="pubchemqc",
         links={
@@ -126,16 +106,14 @@ class DataConfigFactory:
         cmd=[
             'wget "https://chibakoudai.sharepoint.com/:u:/s/stair06/EcWMtOpIEqFLrHcR1dzlZiMBLhTFY0RZ0qPaqC4lhRp51A?download=1" -O b3lyp_pm6_ver1.0.1-postgrest-docker-compose.tar.xz.rclone_chunk.001',
             'wget "https://chibakoudai.sharepoint.com/:u:/s/stair06/EbJe-SlL4oNPhOpOtA8mxLsB1F3eI2l-5RS315hIZUFNwQ?download=1" -O b3lyp_pm6_ver1.0.1-postgrest-docker-compose.tar.xz.rclone_chunk.002',
-            'cat b3lyp_pm6_ver1.0.1-postgrest-docker-compose.tar.xz.rclone_chunk.001 b3lyp_pm6_ver1.0.1-postgrest-docker-compose.tar.xz.rclone_chunk.002 | tar xvfJ - '
-        ]
+            "cat b3lyp_pm6_ver1.0.1-postgrest-docker-compose.tar.xz.rclone_chunk.001 b3lyp_pm6_ver1.0.1-postgrest-docker-compose.tar.xz.rclone_chunk.002 | tar xvfJ - ",
+        ],
     )
 
-    available_datasets = [k for k in locals().keys()  if not k.startswith("__")]
+    available_datasets = [k for k in locals().keys() if not k.startswith("__")]
 
     def __init__(self):
         pass
 
     def __call__(self, dataset_name):
         return getattr(self, dataset_name)
-    
-
diff --git a/openqdc/raws/fetch.py b/openqdc/raws/fetch.py
index 58d86d0..b7fc4e4 100644
--- a/openqdc/raws/fetch.py
+++ b/openqdc/raws/fetch.py
@@ -1,22 +1,22 @@
 """Script to download the molecule3d dataset from Google Drive."""
-import os
 import gzip
-import tqdm
-import gdown
-import fsspec
-import socket
+import os
 import shutil
+import socket
 import tarfile
-import zipfile
-import requests
 import urllib.error
 import urllib.request
+import zipfile
+
+import fsspec
+import gdown
+import requests
+import tqdm
 from loguru import logger
 from sklearn.utils import Bunch
-from openqdc.utils.io import get_local_cache
+
 from openqdc.raws.config_factory import DataConfigFactory
-from office365.runtime.auth.client_credential import ClientCredential
-from office365.sharepoint.client_context import ClientContext
+from openqdc.utils.io import get_local_cache
 
 
 # function to download large files with requests
@@ -37,7 +37,6 @@ def fetch_file(url, local_filename, overwrite=False):
         Local file.
     """
     try:
-
         if os.path.exists(local_filename) and not overwrite:
             logger.info("File already exists, skipping download")
         else:
@@ -53,7 +52,7 @@ def fetch_file(url, local_filename, overwrite=False):
 
         # decompress archive if necessary
         parent = os.path.dirname(local_filename)
-        if local_filename.endswith("tar.gz"):            
+        if local_filename.endswith("tar.gz"):
             with tarfile.open(local_filename) as tar:
                 logger.info(f"Verifying archive extraction states: {local_filename}")
                 all_names = tar.getnames()
@@ -81,7 +80,7 @@ def fetch_file(url, local_filename, overwrite=False):
             all_extracted = os.path.exists(out_filename)
             if not all_extracted:
                 logger.info(f"Extracting archive: {local_filename}")
-                with gzip.open(local_filename, 'rb') as f_in, open(out_filename, 'wb') as f_out:
+                with gzip.open(local_filename, "rb") as f_in, open(out_filename, "wb") as f_out:
                     shutil.copyfileobj(f_in, f_out)
             else:
                 logger.info(f"Archive already extracted: {local_filename}")
@@ -115,7 +114,7 @@ def __init__(self, cache_path=None, overwrite=False):
 
         self.cache_path = cache_path
         self.overwrite = overwrite
-    
+
     def from_config(self, config: dict):
         b_config = Bunch(**config)
         data_path = os.path.join(self.cache_path, b_config.dataset_name)
@@ -123,14 +122,14 @@ def from_config(self, config: dict):
 
         logger.info(f"Downloading the {b_config.dataset_name} dataset")
         for local, link in b_config.links.items():
-            outfile = os.path.join(data_path, local) 
+            outfile = os.path.join(data_path, local)
 
             fetch_file(link, outfile)
 
     def from_name(self, name):
         cfg = DataConfigFactory()(name)
         return self.from_config(cfg)
- 
+
 
 if __name__ == "__main__":
     dataset_names = DataConfigFactory.available_datasets
@@ -138,4 +137,3 @@ def from_name(self, name):
     for dataset_name in dataset_names:
         dd = DataDownloader()
         dd.from_name(dataset_name)
-
diff --git a/openqdc/utils/constants.py b/openqdc/utils/constants.py
index b941a82..a8a8215 100644
--- a/openqdc/utils/constants.py
+++ b/openqdc/utils/constants.py
@@ -4,4 +4,4 @@
 
 HAR2EV = 27.211386246
 
-BOHR2ANG = 0.52917721092
\ No newline at end of file
+BOHR2ANG = 0.52917721092
diff --git a/openqdc/utils/io.py b/openqdc/utils/io.py
index fe3c9e6..0391add 100644
--- a/openqdc/utils/io.py
+++ b/openqdc/utils/io.py
@@ -1,10 +1,11 @@
 """IO utilities for mlip package"""
-import os
 import json
+import os
+import pickle as pkl
+
+import fsspec
 import h5py
 import torch
-import fsspec
-import pickle as pkl
 from fsspec.implementations.local import LocalFileSystem
 from gcsfs import GCSFileSystem
 from rdkit.Chem import MolFromXYZFile
@@ -14,14 +15,14 @@
 
 
 def get_local_cache():
-    cache_dir = os.path.expanduser(os.path.expandvars('~/.cache/openqdc'))
+    cache_dir = os.path.expanduser(os.path.expandvars("~/.cache/openqdc"))
     os.makedirs(cache_dir, exist_ok=True)
     return cache_dir
 
 
 def get_remote_cache():
     remote_cache = "gs://opendatasets/openqdc"
-    return remote_cache 
+    return remote_cache
 
 
 def push_remote(local_path, overwrite=True):
@@ -148,4 +149,4 @@ def load_json(path):
 
 
 def load_xyz(path):
-    return MolFromXYZFile(path)
\ No newline at end of file
+    return MolFromXYZFile(path)
diff --git a/openqdc/utils/molecule.py b/openqdc/utils/molecule.py
index 83fc4a6..e8c1c9c 100644
--- a/openqdc/utils/molecule.py
+++ b/openqdc/utils/molecule.py
@@ -3,6 +3,7 @@
 
 atom_table = Chem.GetPeriodicTable()
 
+
 def get_atomic_number(mol: Chem.Mol):
     """Returns atomic numbers for rdkit molecule"""
     return np.array([atom.GetAtomicNum() for atom in mol.GetAtoms()])
@@ -15,6 +16,4 @@ def get_atomic_charge(mol: Chem.Mol):
 
 def get_atomic_numuber_and_charge(mol: Chem.Mol):
     """Returns atoms number and charge for rdkit molecule"""
-    return np.array([[atom.GetAtomicNum(), atom.GetFormalCharge()] 
-                     for atom in mol.GetAtoms()])
-
+    return np.array([[atom.GetAtomicNum(), atom.GetFormalCharge()] for atom in mol.GetAtoms()])
diff --git a/pyproject.toml b/pyproject.toml
index be78fcf..5061854 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -55,6 +55,8 @@ omit = ["setup.py", "tests/*"]
 
 [tool.ruff]
 line-length = 120
+ignore = ["E731"]
 
 [tool.ruff.per-file-ignores]
 "__init__.py" = ["F403"]
+"openqdc/raws/config_factory.py" = ["E501"]

From 7db824031f7a22539c53c50f2c8f919686985515 Mon Sep 17 00:00:00 2001
From: Nikhil Shenoy <nikhilshenoy98@gmail.com>
Date: Mon, 25 Sep 2023 15:04:52 +0000
Subject: [PATCH 3/3] Added pre-commit ci

---
 .github/workflows/pre-commit-ci.yml | 15 +++++++++++++++
 1 file changed, 15 insertions(+)
 create mode 100644 .github/workflows/pre-commit-ci.yml

diff --git a/.github/workflows/pre-commit-ci.yml b/.github/workflows/pre-commit-ci.yml
new file mode 100644
index 0000000..b597281
--- /dev/null
+++ b/.github/workflows/pre-commit-ci.yml
@@ -0,0 +1,15 @@
+# Based on https://github.com/pre-commit/action
+name: pre-commit
+
+on:
+  pull_request:
+  push:
+    branches: [main]
+
+jobs:
+  pre-commit:
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v3
+    - uses: actions/setup-python@v3
+    - uses: pre-commit/action@v3.0.0