diff --git a/README.md b/README.md
index 776b8ea..267a83a 100644
--- a/README.md
+++ b/README.md
@@ -53,7 +53,7 @@ openqdc download Spice QMugs
 6. QM Level of Theory
  -->
 
-We provide support for the following publicly available QM Datasets.
+We provide support for the following publicly available QM Potential Energy Datasets.
 
 # Potential Energy
 
@@ -78,7 +78,15 @@ We provide support for the following publicly available QM Datasets.
 
 # Interaction energy
 
-| Dataset | # Molecules | # Conformers | Average Conformers per Molecule | Force Labels | Atom Types | QM Level of Theory | Off-Equilibrium Conformations|
-| --- | --- | --- | --- | --- | --- | --- | --- |
-| [DES370K](https://www.nature.com/articles/s41597-021-00833-x) | 3,700 | 370,000 | 100 | No | 20 | CCSD(T) | Yes |
-| [DES5M](https://www.nature.com/articles/s41597-021-00833-x) | 3,700 | 5,000,000 | 1351 | No | 20 | SNS-MP2 | Yes |
+We also provide support for the following publicly available QM Noncovalent Interaction Energy Datasets.
+
+| Dataset |
+| --- |
+| [DES370K](https://www.nature.com/articles/s41597-021-00833-x) |
+| [DES5M](https://www.nature.com/articles/s41597-021-00833-x)   |
+| [Metcalf](https://pubs.aip.org/aip/jcp/article/152/7/074103/1059677/Approaches-for-machine-learning-intermolecular) |
+| [DESS66](https://www.nature.com/articles/s41597-021-00833-x) |
+| [DESS66x8](https://www.nature.com/articles/s41597-021-00833-x) |
+| [Splinter](https://www.nature.com/articles/s41597-023-02443-1) |
+| [X40](https://pubs.acs.org/doi/10.1021/ct300647k) |
+| [L7](https://pubs.acs.org/doi/10.1021/ct400036b)  |
diff --git a/openqdc/datasets/__init__.py b/openqdc/datasets/__init__.py
index c8b2469..3e7db7b 100644
--- a/openqdc/datasets/__init__.py
+++ b/openqdc/datasets/__init__.py
@@ -1,25 +1,4 @@
-from .base import BaseDataset  # noqa
 from .interaction import AVAILABLE_INTERACTION_DATASETS  # noqa
-from .interaction import DES  # noqa
 from .potential import AVAILABLE_POTENTIAL_DATASETS  # noqa
-from .potential.ani import ANI1, ANI1CCX, ANI1X  # noqa
-from .potential.comp6 import COMP6  # noqa
-from .potential.dummy import Dummy  # noqa
-from .potential.gdml import GDML  # noqa
-from .potential.geom import GEOM  # noqa
-from .potential.iso_17 import ISO17  # noqa
-from .potential.molecule3d import Molecule3D  # noqa
-from .potential.multixcqm9 import MultixcQM9  # noqa
-from .potential.nabladft import NablaDFT  # noqa
-from .potential.orbnet_denali import OrbnetDenali  # noqa
-from .potential.pcqm import PCQM_B3LYP, PCQM_PM6  # noqa
-from .potential.qm7x import QM7X  # noqa
-from .potential.qmugs import QMugs  # noqa
-from .potential.sn2_rxn import SN2RXN  # noqa
-from .potential.solvated_peptides import SolvatedPeptides  # noqa
-from .potential.spice import Spice  # noqa
-from .potential.tmqm import TMQM  # noqa
-from .potential.transition1x import Transition1X  # noqa
-from .potential.waterclusters3_30 import WaterClusters  # noqa
 
 AVAILABLE_DATASETS = {**AVAILABLE_POTENTIAL_DATASETS, **AVAILABLE_INTERACTION_DATASETS}
diff --git a/openqdc/datasets/interaction/L7.py b/openqdc/datasets/interaction/L7.py
new file mode 100644
index 0000000..4e6ad01
--- /dev/null
+++ b/openqdc/datasets/interaction/L7.py
@@ -0,0 +1,121 @@
+import os
+from typing import Dict, List
+
+import numpy as np
+import yaml
+from loguru import logger
+
+from openqdc.datasets.interaction.base import BaseInteractionDataset
+from openqdc.utils.molecule import atom_table
+
+
+class DataItemYAMLObj:
+    """Attribute bag for one `!ruby/object:ProtocolDataset::DataSetItem` YAML mapping.
+
+    Every constructor argument is stored unchanged under the attribute of the same name.
+    """
+
+    def __init__(self, name, shortname, geometry, reference_value, setup, group, tags):
+        self.name, self.shortname, self.geometry, self.reference_value = name, shortname, geometry, reference_value
+        self.setup, self.group, self.tags = setup, group, tags
+
+
+class DataSetYAMLObj:
+    """Attribute bag for the `!ruby/object:ProtocolDataset::DataSetDescription` YAML mapping.
+
+    Every constructor argument is stored unchanged under the attribute of the same name.
+    """
+
+    def __init__(self, name, references, text, method_energy, groups_by, groups, global_setup):
+        self.name, self.references, self.text, self.method_energy = name, references, text, method_energy
+        self.groups_by, self.groups, self.global_setup = groups_by, groups, global_setup
+
+
+def data_item_constructor(loader: yaml.SafeLoader, node: yaml.nodes.MappingNode):
+    """Build a DataItemYAMLObj from the key/value pairs of a YAML mapping node."""
+    return DataItemYAMLObj(**loader.construct_mapping(node))
+
+
+def dataset_constructor(loader: yaml.SafeLoader, node: yaml.nodes.MappingNode):
+    """Build a DataSetYAMLObj from the key/value pairs of a YAML mapping node."""
+    return DataSetYAMLObj(**loader.construct_mapping(node))
+
+
+def get_loader():
+    """Return a private SafeLoader subclass that can build the `!ruby/object:ProtocolDataset::*` entries."""
+    loader = type("RubyObjectSafeLoader", (yaml.SafeLoader,), {})  # subclass: leave yaml.SafeLoader untouched
+    loader.add_constructor("!ruby/object:ProtocolDataset::DataSetItem", data_item_constructor)
+    loader.add_constructor("!ruby/object:ProtocolDataset::DataSetDescription", dataset_constructor)
+    return loader
+
+
+class L7(BaseInteractionDataset):
+    """
+    The L7 interaction energy dataset as described in:
+
+    Accuracy of Quantum Chemical Methods for Large Noncovalent Complexes
+    Robert Sedlak, Tomasz Janowski, Michal Pitoňák, Jan Řezáč, Peter Pulay, and Pavel Hobza
+    Journal of Chemical Theory and Computation 2013 9 (8), 3364-3374
+    DOI: 10.1021/ct400036b
+
+    Data was downloaded and extracted from:
+    http://cuby4.molecular.cz/dataset_l7.html
+    """
+
+    __name__ = "L7"
+    __energy_unit__ = "hartree"
+    __distance_unit__ = "ang"
+    __forces_unit__ = "hartree/ang"
+    __energy_methods__ = [
+        "CSD(T) | QCISD(T)",  # NOTE(review): possibly a typo for "CCSD(T) | QCISD(T)" - confirm
+        "DLPNO-CCSD(T)",
+        "MP2/CBS",
+        "MP2C/CBS",
+        "fixed",
+        "DLPNO-CCSD(T0)",
+        "LNO-CCSD(T)",
+        "FN-DMC",
+    ]
+
+    energy_target_names = []
+
+    def read_raw_entries(self) -> List[Dict]:
+        """Read `l7.yaml` and one `.xyz` geometry per item from `self.root`; return one dict per item."""
+        yaml_fpath = os.path.join(self.root, "l7.yaml")
+        logger.info(f"Reading L7 interaction data from {self.root}")
+        data = []
+        with open(yaml_fpath, "r") as yaml_file:  # context manager: handle is closed even if parsing fails
+            data_dict = yaml.load(yaml_file, Loader=get_loader())
+        charge0 = int(data_dict["description"].global_setup["molecule_a"]["charge"])
+        charge1 = int(data_dict["description"].global_setup["molecule_b"]["charge"])
+
+        for idx, item in enumerate(data_dict["items"]):
+            name = np.array([item.shortname])
+            fname = item.geometry.split(":")[1]
+            with open(os.path.join(self.root, f"{fname}.xyz"), "r") as xyz_file:
+                lines = [line.strip().split() for line in xyz_file]
+            lines.pop(1)  # drop the second line so that lines[1:] holds only "<element> x y z" records
+            n_atoms = np.array([int(lines[0][0])], dtype=np.int32)
+            n_atoms_first = np.array([int(item.setup["molecule_a"]["selection"].split("-")[1])], dtype=np.int32)
+            subset = np.array([item.group])
+            energies = [float(item.reference_value)]  # reference value first, alternative references after
+            energies += [float(val[idx]) for val in data_dict["alternative_reference"].values()]
+            energies = np.array([energies], dtype=np.float32)
+            pos = np.array(lines[1:])[:, 1:].astype(np.float32)
+            elems = np.array(lines[1:])[:, 0]
+            atomic_nums = np.expand_dims(np.array([atom_table.GetAtomicNumber(x) for x in elems]), axis=1)
+            natoms0 = n_atoms_first[0]
+            natoms1 = n_atoms[0] - natoms0
+            charges = np.expand_dims(np.array([charge0] * natoms0 + [charge1] * natoms1), axis=1)
+            atomic_inputs = np.concatenate((atomic_nums, charges, pos), axis=-1, dtype=np.float32)
+
+            entry = dict(
+                energies=energies,
+                subset=subset,
+                n_atoms=n_atoms,
+                n_atoms_first=n_atoms_first,
+                atomic_inputs=atomic_inputs,
+                name=name,
+            )
+            data.append(entry)
+        return data
diff --git a/openqdc/datasets/interaction/X40.py b/openqdc/datasets/interaction/X40.py
new file mode 100644
index 0000000..08f4037
--- /dev/null
+++ b/openqdc/datasets/interaction/X40.py
@@ -0,0 +1,80 @@
+import os
+from typing import Dict, List
+
+import numpy as np
+import yaml
+from loguru import logger
+
+from openqdc.datasets.interaction.base import BaseInteractionDataset
+from openqdc.datasets.interaction.L7 import get_loader
+from openqdc.utils.molecule import atom_table
+
+
+class X40(BaseInteractionDataset):
+    """
+    X40 interaction dataset of 40 dimer pairs as
+    introduced in the following paper:
+
+    Benchmark Calculations of Noncovalent Interactions of Halogenated Molecules
+    Jan Řezáč, Kevin E. Riley, and Pavel Hobza
+    Journal of Chemical Theory and Computation 2012 8 (11), 4285-4292
+    DOI: 10.1021/ct300647k
+
+    Dataset retrieved and processed from:
+    http://cuby4.molecular.cz/dataset_x40.html
+    """
+
+    __name__ = "X40"
+    __energy_unit__ = "hartree"
+    __distance_unit__ = "ang"
+    __forces_unit__ = "hartree/ang"
+    __energy_methods__ = [
+        "CCSD(T)/CBS",
+        "MP2/CBS",
+        "dCCSD(T)/haDZ",
+        "dCCSD(T)/haTZ",
+        "MP2.5/CBS(aDZ)",
+    ]
+
+    energy_target_names = []
+
+    def read_raw_entries(self) -> List[Dict]:
+        """Read `x40.yaml` and one `<shortname>.xyz` per item from `self.root`; return one dict per item."""
+        yaml_fpath = os.path.join(self.root, "x40.yaml")
+        logger.info(f"Reading X40 interaction data from {self.root}")
+        data = []
+        with open(yaml_fpath, "r") as yaml_file:  # context manager: handle is closed even if parsing fails
+            data_dict = yaml.load(yaml_file, Loader=get_loader())
+        charge0 = int(data_dict["description"].global_setup["molecule_a"]["charge"])
+        charge1 = int(data_dict["description"].global_setup["molecule_b"]["charge"])
+
+        for idx, item in enumerate(data_dict["items"]):
+            name = np.array([item.shortname])
+            with open(os.path.join(self.root, f"{item.shortname}.xyz"), "r") as xyz_file:
+                lines = [line.strip().split() for line in xyz_file]
+            # second line: its first token is a "<start>-<end>" range; <end> is used as the first monomer's size
+            setup = lines.pop(1)
+            n_atoms = np.array([int(lines[0][0])], dtype=np.int32)
+            n_atoms_first = np.array([int(setup[0].split("-")[1])], dtype=np.int32)
+            subset = np.array([item.group])
+            energies = [float(item.reference_value)]  # reference value first, alternative references after
+            energies += [float(val[idx]) for val in data_dict["alternative_reference"].values()]
+            energies = np.array([energies], dtype=np.float32)
+            pos = np.array(lines[1:])[:, 1:].astype(np.float32)
+            elems = np.array(lines[1:])[:, 0]
+            atomic_nums = np.expand_dims(np.array([atom_table.GetAtomicNumber(x) for x in elems]), axis=1)
+            natoms0 = n_atoms_first[0]
+            natoms1 = n_atoms[0] - natoms0
+            charges = np.expand_dims(np.array([charge0] * natoms0 + [charge1] * natoms1), axis=1)
+            atomic_inputs = np.concatenate((atomic_nums, charges, pos), axis=-1, dtype=np.float32)
+
+            entry = dict(
+                energies=energies,
+                subset=subset,
+                n_atoms=n_atoms,
+                n_atoms_first=n_atoms_first,
+                atomic_inputs=atomic_inputs,
+                name=name,
+            )
+            data.append(entry)
+        return data
diff --git a/openqdc/datasets/interaction/__init__.py b/openqdc/datasets/interaction/__init__.py
index abf99a6..82154a5 100644
--- a/openqdc/datasets/interaction/__init__.py
+++ b/openqdc/datasets/interaction/__init__.py
@@ -1,3 +1,21 @@
-from .des import DES
+from .base import BaseInteractionDataset
+from .des5m import DES5M
+from .des370k import DES370K
+from .dess66 import DESS66
+from .dess66x8 import DESS66x8
+from .L7 import L7
+from .metcalf import Metcalf
+from .splinter import Splinter
+from .X40 import X40
 
-AVAILABLE_INTERACTION_DATASETS = {"des": DES}
+AVAILABLE_INTERACTION_DATASETS = {  # lookup table: lowercase dataset key -> dataset class
+    "base": BaseInteractionDataset,
+    "des5m": DES5M,
+    "des370k": DES370K,
+    "dess66": DESS66,
+    "dess66x8": DESS66x8,
+    "l7": L7,
+    "metcalf": Metcalf,
+    "splinter": Splinter,
+    "x40": X40,
+}
diff --git a/openqdc/datasets/interaction/base.py b/openqdc/datasets/interaction/base.py
new file mode 100644
index 0000000..bd3ab70
--- /dev/null
+++ b/openqdc/datasets/interaction/base.py
@@ -0,0 +1,141 @@
+import pickle as pkl
+from os.path import join as p_join
+from typing import Dict, List, Optional
+
+import numpy as np
+from loguru import logger
+from sklearn.utils import Bunch
+
+from openqdc.datasets.base import BaseDataset
+from openqdc.utils.atomization_energies import IsolatedAtomEnergyFactory
+from openqdc.utils.constants import NB_ATOMIC_FEATURES
+from openqdc.utils.io import pull_locally, push_remote
+
+
+class BaseInteractionDataset(BaseDataset):
+    """Base class of the interaction datasets; items from `__getitem__` also carry `n_atoms_first`."""
+
+    def __init__(
+        self,
+        energy_unit: Optional[str] = None,
+        distance_unit: Optional[str] = None,
+        overwrite_local_cache: bool = False,
+        cache_dir: Optional[str] = None,
+    ) -> None:
+        super().__init__(
+            energy_unit=energy_unit,
+            distance_unit=distance_unit,
+            overwrite_local_cache=overwrite_local_cache,
+            cache_dir=cache_dir,
+        )
+
+    def collate_list(self, list_entries: List[Dict]):
+        valid = [r for r in list_entries if r is not None]  # drop None entries before picking the key set
+        res = {
+            key: np.concatenate([r[key] for r in valid], axis=0)
+            for key in valid[0]
+            if not isinstance(valid[0][key], dict)
+        }
+
+        csum = np.cumsum(res.get("n_atoms"))
+        x = np.zeros((csum.shape[0], 2), dtype=np.int32)
+        x[1:, 0], x[:, 1] = csum[:-1], csum  # row i = (start, end) atom offsets of entry i
+        res["position_idx_range"] = x
+
+        return res
+
+    @property
+    def data_shapes(self):
+        return {
+            "atomic_inputs": (-1, NB_ATOMIC_FEATURES),
+            "position_idx_range": (-1, 2),
+            "energies": (-1, len(self.__energy_methods__)),
+            "forces": (-1, 3, len(self.force_target_names)),
+        }
+
+    @property
+    def data_types(self):
+        return {
+            "atomic_inputs": np.float32,
+            "position_idx_range": np.int32,
+            "energies": np.float32,
+            "forces": np.float32,
+        }
+
+    def __getitem__(self, idx: int):
+        """Return entry `idx` as a Bunch; `forces` is None when no "forces" array was loaded."""
+        shift = IsolatedAtomEnergyFactory.max_charge
+        p_start, p_end = self.data["position_idx_range"][idx]
+        atom_block = self.data["atomic_inputs"][p_start:p_end]
+        z, c, positions, energies = (
+            np.array(atom_block[:, 0], dtype=np.int32),
+            np.array(atom_block[:, 1], dtype=np.int32),
+            np.array(atom_block[:, -3:], dtype=np.float32),
+            np.array(self.data["energies"][idx], dtype=np.float32),
+        )
+        name = self.__smiles_converter__(self.data["name"][idx])
+        subset = self.data["subset"][idx]
+        n_atoms_first = self.data["n_atoms_first"][idx]
+
+        forces = np.array(self.data["forces"][p_start:p_end], dtype=np.float32) if "forces" in self.data else None
+        return Bunch(
+            positions=positions,
+            atomic_numbers=z,
+            charges=c,
+            e0=self.__isolated_atom_energies__[..., z, c + shift].T,
+            energies=energies,
+            name=name,
+            subset=subset,
+            forces=forces,
+            n_atoms_first=n_atoms_first,
+        )
+
+    def save_preprocess(self, data_dict):
+        # save memmaps
+        logger.info("Preprocessing data and saving it to cache.")
+        for key in self.data_keys:
+            local_path = p_join(self.preprocess_path, f"{key}.mmap")
+            out = np.memmap(local_path, mode="w+", dtype=data_dict[key].dtype, shape=data_dict[key].shape)
+            out[:] = data_dict.pop(key)[:]
+            out.flush()
+            push_remote(local_path, overwrite=True)
+
+        # save all other keys in props.pkl
+        local_path = p_join(self.preprocess_path, "props.pkl")
+        for key in data_dict:
+            if key not in self.data_keys:
+                x = data_dict[key]
+                x[x == None] = np.inf  # noqa: E711 (elementwise None mask, not an identity test)
+                data_dict[key] = np.unique(x, return_inverse=True)
+
+        with open(local_path, "wb") as f:
+            pkl.dump(data_dict, f)
+        push_remote(local_path, overwrite=True)
+
+    def read_preprocess(self, overwrite_local_cache=False):
+        logger.info("Reading preprocessed data.")
+        logger.info(
+            f"Dataset {self.__name__} with the following units:\n\
+                     Energy: {self.energy_unit},\n\
+                     Distance: {self.distance_unit},\n\
+                     Forces: {self.force_unit if self.__force_methods__ else 'None'}"
+        )
+        self.data = {}
+        for key in self.data_keys:
+            filename = p_join(self.preprocess_path, f"{key}.mmap")
+            pull_locally(filename, overwrite=overwrite_local_cache)
+            self.data[key] = np.memmap(filename, mode="r", dtype=self.data_types[key]).reshape(self.data_shapes[key])
+
+        filename = p_join(self.preprocess_path, "props.pkl")
+        pull_locally(filename, overwrite=overwrite_local_cache)
+        with open(filename, "rb") as f:
+            tmp = pkl.load(f)
+            for key in set(tmp.keys()) - set(self.data_keys):
+                x = tmp.pop(key)
+                if len(x) == 2:
+                    self.data[key] = x[0][x[1]]
+                else:
+                    self.data[key] = x
+
+        for key in self.data:
+            logger.info(f"Loaded {key} with shape {self.data[key].shape}, dtype {self.data[key].dtype}")
diff --git a/openqdc/datasets/interaction/des.py b/openqdc/datasets/interaction/des.py
deleted file mode 100644
index 337a8e8..0000000
--- a/openqdc/datasets/interaction/des.py
+++ /dev/null
@@ -1,93 +0,0 @@
-from os.path import join as p_join
-
-import datamol as dm
-import numpy as np
-import pandas as pd
-from tqdm import tqdm
-
-from openqdc.datasets.base import BaseDataset
-from openqdc.utils.molecule import get_atomic_number_and_charge
-
-
-def read_mol(mol_path, smiles, subset, targets):
-    try:
-        with open(mol_path, "r") as f:
-            mol_block = f.read()
-            mol = dm.read_molblock(mol_block, remove_hs=False, fail_if_invalid=True)
-
-        x = get_atomic_number_and_charge(mol)
-        positions = mol.GetConformer().GetPositions()
-
-        res = dict(
-            name=np.array([smiles]),
-            subset=np.array([subset]),
-            energies=np.array(targets).astype(np.float32)[None, :],
-            atomic_inputs=np.concatenate((x, positions), axis=-1, dtype=np.float32),
-            n_atoms=np.array([x.shape[0]], dtype=np.int32),
-        )
-    except Exception as e:
-        print(f"Skipping: {mol_path} due to {e}")
-        res = None
-
-    return res
-
-
-class DES(BaseDataset):
-    __name__ = "des"
-    __energy_unit__ = "hartree"
-    __distance_unit__ = "ang"
-    __forces_unit__ = "hartree/ang"
-    __energy_methods__ = [
-        "mp2/cc-pvdz",
-        "mp2/cc-pvqz",
-        "mp2/cc-pvtz",
-        "mp2/cbs",
-        "ccsd(t)/cc-pvdz",
-        "ccsd(t)/cbs",  # cbs
-        "ccsd(t)/nn",  # nn
-        "sapt0/aug-cc-pwcvxz",
-    ]
-
-    energy_target_names = [
-        "cc_MP2_all",
-        "qz_MP2_all",
-        "tz_MP2_all",
-        "cbs_MP2_all",
-        "cc_CCSD(T)_all",
-        "cbs_CCSD(T)_all",
-        "nn_CCSD(T)_all",
-        "sapt_all",
-    ]
-    # ['qz_MP2_all', 'tz_MP2_all', 'cbs_MP2_all', 'sapt_all', 'nn_CCSD(T)_all']
-
-    partitions = ["DES370K", "DES5M"]
-
-    def _read_raw_(self, part):
-        df = pd.read_csv(p_join(self.root, f"{part}.csv"))
-        for col in self.energy_target_names:
-            if col not in df.columns:
-                df[col] = np.nan
-        smiles = (df["smiles0"] + "." + df["smiles1"]).tolist()
-        subsets = (f"{part}_" + df["group_orig"]).tolist()
-        targets = df[self.energy_target_names].values
-        paths = (
-            p_join(self.root, "geometries/")
-            + df["system_id"].astype(str)
-            + f"/{part}_"
-            + df["geom_id"].astype(str)
-            + ".mol"
-        )
-
-        inputs = [
-            dict(smiles=smiles[i], subset=subsets[i], targets=targets[i], mol_path=paths[i])
-            for i in tqdm(range(len(smiles)))
-        ]
-        f = lambda xs: [read_mol(**x) for x in xs]
-        samples = dm.parallelized_with_batches(
-            f, inputs, n_jobs=-1, progress=True, batch_size=1024, scheduler="threads"
-        )
-        return samples
-
-    def read_raw_entries(self):
-        samples = sum([self._read_raw_(partition) for partition in self.partitions], [])
-        return samples
diff --git a/openqdc/datasets/interaction/des370k.py b/openqdc/datasets/interaction/des370k.py
new file mode 100644
index 0000000..75198fd
--- /dev/null
+++ b/openqdc/datasets/interaction/des370k.py
@@ -0,0 +1,122 @@
+import os
+from typing import Dict, List
+
+import numpy as np
+import pandas as pd
+from loguru import logger
+from tqdm import tqdm
+
+from openqdc.datasets.interaction.base import BaseInteractionDataset
+from openqdc.utils.io import get_local_cache
+from openqdc.utils.molecule import atom_table, molecule_groups
+
+
+class DES370K(BaseInteractionDataset):
+    """
+    DE Shaw Research interaction energy of over 370K
+    small molecule dimers as described in the paper:
+
+    Quantum chemical benchmark databases of gold-standard dimer interaction energies.
+    Donchev, A.G., Taube, A.G., Decolvenaere, E. et al.
+    Sci Data 8, 55 (2021).
+    https://doi.org/10.1038/s41597-021-00833-x
+    """
+
+    __name__ = "des370k_interaction"
+    __energy_unit__ = "hartree"
+    __distance_unit__ = "ang"
+    __forces_unit__ = "hartree/ang"
+    __energy_methods__ = [
+        "mp2/cc-pvdz",
+        "mp2/cc-pvqz",
+        "mp2/cc-pvtz",
+        "mp2/cbs",
+        "ccsd(t)/cc-pvdz",
+        "ccsd(t)/cbs",  # cbs
+        "ccsd(t)/nn",  # nn
+        "sapt0/aug-cc-pwcvxz",
+        "sapt0/aug-cc-pwcvxz_es",
+        "sapt0/aug-cc-pwcvxz_ex",
+        "sapt0/aug-cc-pwcvxz_exs2",
+        "sapt0/aug-cc-pwcvxz_ind",
+        "sapt0/aug-cc-pwcvxz_exind",
+        "sapt0/aug-cc-pwcvxz_disp",
+        "sapt0/aug-cc-pwcvxz_exdisp_os",
+        "sapt0/aug-cc-pwcvxz_exdisp_ss",
+        "sapt0/aug-cc-pwcvxz_delta_HF",
+    ]
+
+    energy_target_names = [
+        "cc_MP2_all",
+        "qz_MP2_all",
+        "tz_MP2_all",
+        "cbs_MP2_all",
+        "cc_CCSD(T)_all",
+        "cbs_CCSD(T)_all",
+        "nn_CCSD(T)_all",
+        "sapt_all",
+        "sapt_es",
+        "sapt_ex",
+        "sapt_exs2",
+        "sapt_ind",
+        "sapt_exind",
+        "sapt_disp",
+        "sapt_exdisp_os",
+        "sapt_exdisp_ss",
+        "sapt_delta_HF",
+    ]
+
+    _filename = "DES370K.csv"
+    _name = "des370k_interaction"
+
+    @classmethod
+    def _root(cls):
+        """Return the dataset folder: `get_local_cache()` joined with `cls._name`."""
+        return os.path.join(get_local_cache(), cls._name)
+
+    @classmethod
+    def _read_raw_entries(cls) -> List[Dict]:
+        """Parse `cls._filename` under `cls._root()` into one raw-entry dict per CSV row."""
+        filepath = os.path.join(cls._root(), cls._filename)
+        logger.info(f"Reading {cls._name} interaction data from {filepath}")
+        df = pd.read_csv(filepath)
+        data = []
+        for idx, row in tqdm(df.iterrows(), total=df.shape[0]):
+            smiles0, smiles1 = row["smiles0"], row["smiles1"]
+            charge0, charge1 = row["charge0"], row["charge1"]
+            natoms0, natoms1 = row["natoms0"], row["natoms1"]
+            pos = np.array(list(map(float, row["xyz"].split()))).reshape(-1, 3)
+
+            elements = row["elements"].split()
+
+            atomic_nums = np.expand_dims(np.array([atom_table.GetAtomicNumber(x) for x in elements]), axis=1)
+
+            charges = np.expand_dims(np.array([charge0] * natoms0 + [charge1] * natoms1), axis=1)
+
+            atomic_inputs = np.concatenate((atomic_nums, charges, pos), axis=-1, dtype=np.float32)
+
+            energies = np.array(row[cls.energy_target_names].values).astype(np.float32)[None, :]
+
+            name = np.array([smiles0 + "." + smiles1])
+
+            # exactly one label per monomer, so every item's `subset` has shape (1, 2) and can be concatenated
+            subsets = []
+            for smiles in [smiles0, smiles1]:
+                matches = [group for group, smiles_set in molecule_groups.items() if smiles in smiles_set]
+                if not matches:
+                    logger.info(f"molecule group lookup failed for {smiles}")
+                subsets.append(matches[0] if matches else "unknown")  # first match wins; placeholder on failure
+
+            item = dict(
+                energies=energies,
+                subset=np.array([subsets]),
+                n_atoms=np.array([natoms0 + natoms1], dtype=np.int32),
+                n_atoms_first=np.array([natoms0], dtype=np.int32),
+                atomic_inputs=atomic_inputs,
+                name=name,
+            )
+            data.append(item)
+        return data
+
+    def read_raw_entries(self) -> List[Dict]:
+        return self._read_raw_entries()  # dispatch on the instance's class so subclasses read their own CSV
diff --git a/openqdc/datasets/interaction/des5m.py b/openqdc/datasets/interaction/des5m.py
new file mode 100644
index 0000000..46d9ba1
--- /dev/null
+++ b/openqdc/datasets/interaction/des5m.py
@@ -0,0 +1,56 @@
+from typing import Dict, List
+
+from openqdc.datasets.interaction.des370k import DES370K
+
+
+class DES5M(DES370K):
+    """
+    DE Shaw Research interaction energy calculations for
+    over 5M small molecule dimers as described in the paper:
+
+    Quantum chemical benchmark databases of gold-standard dimer interaction energies.
+    Donchev, A.G., Taube, A.G., Decolvenaere, E. et al.
+    Sci Data 8, 55 (2021).
+    https://doi.org/10.1038/s41597-021-00833-x
+    """
+
+    __name__ = "des5m_interaction"
+    __energy_methods__ = [  # listed in the same order as energy_target_names below
+        "mp2/cc-pvqz",
+        "mp2/cc-pvtz",
+        "mp2/cbs",
+        "ccsd(t)/nn",  # nn
+        "sapt0/aug-cc-pwcvxz",
+        "sapt0/aug-cc-pwcvxz_es",
+        "sapt0/aug-cc-pwcvxz_ex",
+        "sapt0/aug-cc-pwcvxz_exs2",
+        "sapt0/aug-cc-pwcvxz_ind",
+        "sapt0/aug-cc-pwcvxz_exind",
+        "sapt0/aug-cc-pwcvxz_disp",
+        "sapt0/aug-cc-pwcvxz_exdisp_os",
+        "sapt0/aug-cc-pwcvxz_exdisp_ss",
+        "sapt0/aug-cc-pwcvxz_delta_HF",
+    ]
+
+    energy_target_names = [  # CSV columns selected as energies by the inherited _read_raw_entries
+        "qz_MP2_all",
+        "tz_MP2_all",
+        "cbs_MP2_all",
+        "nn_CCSD(T)_all",
+        "sapt_all",
+        "sapt_es",
+        "sapt_ex",
+        "sapt_exs2",
+        "sapt_ind",
+        "sapt_exind",
+        "sapt_disp",
+        "sapt_exdisp_os",
+        "sapt_exdisp_ss",
+        "sapt_delta_HF",
+    ]
+
+    _filename = "DES5M.csv"  # file name joined onto _root() by the inherited _read_raw_entries
+    _name = "des5m_interaction"  # sub-folder of the local cache used by the inherited _root()
+
+    def read_raw_entries(self) -> List[Dict]:
+        return DES5M._read_raw_entries()  # inherited classmethod, evaluated with cls=DES5M
diff --git a/openqdc/datasets/interaction/dess66.py b/openqdc/datasets/interaction/dess66.py
new file mode 100644
index 0000000..ae3ce81
--- /dev/null
+++ b/openqdc/datasets/interaction/dess66.py
@@ -0,0 +1,107 @@
+import os
+from typing import Dict, List
+
+import numpy as np
+import pandas as pd
+from loguru import logger
+from tqdm import tqdm
+
+from openqdc.datasets.interaction.base import BaseInteractionDataset
+from openqdc.utils.molecule import atom_table
+
+
+class DESS66(BaseInteractionDataset):
+    """
+    DE Shaw Research interaction energy
+    estimates of all 66 conformers from
+    the original S66 dataset as described
+    in the paper:
+
+    Quantum chemical benchmark databases of gold-standard dimer interaction energies.
+    Donchev, A.G., Taube, A.G., Decolvenaere, E. et al.
+    Sci Data 8, 55 (2021).
+    https://doi.org/10.1038/s41597-021-00833-x
+
+    Data was downloaded from Zenodo:
+    https://zenodo.org/records/5676284
+    """
+
+    __name__ = "des_s66"
+    __energy_unit__ = "hartree"
+    __distance_unit__ = "ang"
+    __forces_unit__ = "hartree/ang"
+    __energy_methods__ = [
+        "mp2/cc-pvdz",
+        "mp2/cc-pvqz",
+        "mp2/cc-pvtz",
+        "mp2/cbs",
+        "ccsd(t)/cc-pvdz",
+        "ccsd(t)/cbs",  # cbs
+        "ccsd(t)/nn",  # nn
+        "sapt0/aug-cc-pwcvxz",
+        "sapt0/aug-cc-pwcvxz_es",
+        "sapt0/aug-cc-pwcvxz_ex",
+        "sapt0/aug-cc-pwcvxz_exs2",
+        "sapt0/aug-cc-pwcvxz_ind",
+        "sapt0/aug-cc-pwcvxz_exind",
+        "sapt0/aug-cc-pwcvxz_disp",
+        "sapt0/aug-cc-pwcvxz_exdisp_os",
+        "sapt0/aug-cc-pwcvxz_exdisp_ss",
+        "sapt0/aug-cc-pwcvxz_delta_HF",
+    ]
+
+    energy_target_names = [
+        "cc_MP2_all",
+        "qz_MP2_all",
+        "tz_MP2_all",
+        "cbs_MP2_all",
+        "cc_CCSD(T)_all",
+        "cbs_CCSD(T)_all",
+        "nn_CCSD(T)_all",
+        "sapt_all",
+        "sapt_es",
+        "sapt_ex",
+        "sapt_exs2",
+        "sapt_ind",
+        "sapt_exind",
+        "sapt_disp",
+        "sapt_exdisp_os",
+        "sapt_exdisp_ss",
+        "sapt_delta_HF",
+    ]
+
+    def read_raw_entries(self) -> List[Dict]:
+        """
+        Convert each row of `DESS66.csv` (looked up under `self.root`) into a raw-entry dict.
+
+        The resolved CSV path is also stored on the instance as `self.filepath`.
+        """
+        self.filepath = os.path.join(self.root, "DESS66.csv")
+        logger.info(f"Reading DESS66 interaction data from {self.filepath}")
+        table = pd.read_csv(self.filepath)
+        entries = []
+        for _, record in tqdm(table.iterrows(), total=table.shape[0]):
+            first_smiles, second_smiles = record["smiles0"], record["smiles1"]
+            first_charge, second_charge = record["charge0"], record["charge1"]
+            first_size, second_size = record["natoms0"], record["natoms1"]
+
+            # per-atom feature columns: atomic number, charge value of the owning monomer, then x, y, z
+            symbols = record["elements"].split()
+            numbers = np.array([atom_table.GetAtomicNumber(sym) for sym in symbols])[:, None]
+            per_atom_charge = np.array([first_charge] * first_size + [second_charge] * second_size)[:, None]
+            coords = np.array([float(tok) for tok in record["xyz"].split()]).reshape(-1, 3)
+            features = np.concatenate((numbers, per_atom_charge, coords), axis=-1, dtype=np.float32)
+
+            targets = np.array(record[self.energy_target_names].values).astype(np.float32)
+
+            entries.append(
+                dict(
+                    energies=targets[None, :],
+                    subset=np.array([record["system_name"]]),
+                    n_atoms=np.array([first_size + second_size], dtype=np.int32),
+                    n_atoms_first=np.array([first_size], dtype=np.int32),
+                    atomic_inputs=features,
+                    name=np.array([first_smiles + "." + second_smiles]),
+                )
+            )
+        return entries
diff --git a/openqdc/datasets/interaction/dess66x8.py b/openqdc/datasets/interaction/dess66x8.py
new file mode 100644
index 0000000..2e97221
--- /dev/null
+++ b/openqdc/datasets/interaction/dess66x8.py
@@ -0,0 +1,108 @@
+import os
+from typing import Dict, List
+
+import numpy as np
+import pandas as pd
+from loguru import logger
+from tqdm import tqdm
+
+from openqdc.datasets.interaction.base import BaseInteractionDataset
+from openqdc.utils.molecule import atom_table
+
+
+class DESS66x8(BaseInteractionDataset):
+    """
+    DES S66x8: interaction energies computed by
+    DE Shaw Research for all 528 conformers of
+    the original S66x8 dataset.
+
+    Reference:
+    Donchev, A.G., Taube, A.G., Decolvenaere, E. et al.
+    Quantum chemical benchmark databases of gold-standard dimer interaction energies.
+    Sci Data 8, 55 (2021).
+    https://doi.org/10.1038/s41597-021-00833-x
+
+    The raw data is hosted on Zenodo:
+
+    https://zenodo.org/records/5676284
+    """
+
+    __name__ = "des_s66x8"
+    __energy_unit__ = "hartree"  # NOTE(review): the DES CSV energies may be in kcal/mol - confirm
+    __distance_unit__ = "ang"
+    __forces_unit__ = "hartree/ang"
+    __energy_methods__ = [  # presumably index-aligned with energy_target_names below - confirm
+        "mp2/cc-pvdz",
+        "mp2/cc-pvqz",
+        "mp2/cc-pvtz",
+        "mp2/cbs",
+        "ccsd(t)/cc-pvdz",
+        "ccsd(t)/cbs",  # cbs
+        "ccsd(t)/nn",  # nn
+        "sapt0/aug-cc-pwcvxz",
+        "sapt0/aug-cc-pwcvxz_es",
+        "sapt0/aug-cc-pwcvxz_ex",
+        "sapt0/aug-cc-pwcvxz_exs2",
+        "sapt0/aug-cc-pwcvxz_ind",
+        "sapt0/aug-cc-pwcvxz_exind",
+        "sapt0/aug-cc-pwcvxz_disp",
+        "sapt0/aug-cc-pwcvxz_exdisp_os",
+        "sapt0/aug-cc-pwcvxz_exdisp_ss",
+        "sapt0/aug-cc-pwcvxz_delta_HF",
+    ]
+
+    energy_target_names = [  # CSV columns copied, in this order, into each record's energies row
+        "cc_MP2_all",
+        "qz_MP2_all",
+        "tz_MP2_all",
+        "cbs_MP2_all",
+        "cc_CCSD(T)_all",
+        "cbs_CCSD(T)_all",
+        "nn_CCSD(T)_all",
+        "sapt_all",
+        "sapt_es",
+        "sapt_ex",
+        "sapt_exs2",
+        "sapt_ind",
+        "sapt_exind",
+        "sapt_disp",
+        "sapt_exdisp_os",
+        "sapt_exdisp_ss",
+        "sapt_delta_HF",
+    ]
+
+    def read_raw_entries(self) -> List[Dict]:
+        """Load DESS66x8.csv from self.root and convert every row into a record dict.
+
+        Returns one dict per CSV row with energies, subset, atom counts, atomic_inputs and name.
+        """
+        self.filepath = os.path.join(self.root, "DESS66x8.csv")
+        logger.info(f"Reading DESS66x8 interaction data from {self.filepath}")
+        frame = pd.read_csv(self.filepath)
+        records = []
+        for _, entry in tqdm(frame.iterrows(), total=len(frame)):
+            smiles_a, smiles_b = entry["smiles0"], entry["smiles1"]
+            charge_a, charge_b = entry["charge0"], entry["charge1"]
+            count_a, count_b = entry["natoms0"], entry["natoms1"]
+            # "xyz" holds whitespace-separated floats; regroup them as rows of three coordinates.
+            coords = np.array([float(token) for token in entry["xyz"].split()]).reshape(-1, 3)
+            # Column vector of atomic numbers looked up from the element symbols.
+            symbols = entry["elements"].split()
+            z_column = np.array([atom_table.GetAtomicNumber(symbol) for symbol in symbols])[:, None]
+            # The first count_a atoms get charge_a, the remaining count_b atoms get charge_b.
+            charge_column = np.array([charge_a] * count_a + [charge_b] * count_b)[:, None]
+            # One float32 row per atom: atomic number, charge, x, y, z.
+            packed = np.concatenate((z_column, charge_column, coords), axis=-1, dtype=np.float32)
+            # Energy columns are taken in the order given by self.energy_target_names.
+            targets = np.array(entry[self.energy_target_names].values).astype(np.float32)
+            records.append(
+                {
+                    "energies": np.expand_dims(targets, axis=0),
+                    "subset": np.array([entry["system_name"]]),
+                    "n_atoms": np.array([count_a + count_b], dtype=np.int32),
+                    "n_atoms_first": np.array([count_a], dtype=np.int32),
+                    "atomic_inputs": packed,
+                    "name": np.array([smiles_a + "." + smiles_b]),
+                }
+            )
+        return records
diff --git a/openqdc/datasets/interaction/metcalf.py b/openqdc/datasets/interaction/metcalf.py
new file mode 100644
index 0000000..041964a
--- /dev/null
+++ b/openqdc/datasets/interaction/metcalf.py
@@ -0,0 +1,80 @@
+import os
+from typing import Dict, List
+
+import numpy as np
+
+from openqdc.datasets.interaction.base import BaseInteractionDataset
+from openqdc.utils.molecule import atom_table
+
+
+class Metcalf(BaseInteractionDataset):
+    """
+    Hydrogen-bonded dimers of NMA with 126 molecules as described in:
+
+    Approaches for machine learning intermolecular interaction energies and
+    application to energy components from symmetry adapted perturbation theory.
+    Derek P. Metcalf, Alexios Koutsoukas, Steven A. Spronk, Brian L. Claus,
+    Deborah A. Loughney, Stephen R. Johnson, Daniel L. Cheney, C. David Sherrill;
+    J. Chem. Phys. 21 February 2020; 152 (7): 074103.
+    https://doi.org/10.1063/1.5142636
+
+    Further details:
+    "Hydrogen-bonded dimers involving N-methylacetamide (NMA) and 126 molecules
+    (46 donors and 80 acceptors; Figs. 2 and 3) were used. Optimized geometries
+    for the 126 individual monomers were obtained and paired with NMA in broad
+    arrays of spatial configurations to generate thousands of complexes for training."
+    """
+
+    __name__ = "metcalf"
+    __energy_unit__ = "hartree"
+    __distance_unit__ = "ang"
+    __forces_unit__ = None
+    __energy_methods__ = [
+        "SAPT0/jun-cc-pVDZ",
+        "SAPT0/jun-cc-pVDZ_es",
+        "SAPT0/jun-cc-pVDZ_ex",
+        "SAPT0/jun-cc-pVDZ_ind",
+        "SAPT0/jun-cc-pVDZ_disp",
+    ]
+    energy_target_names = [  # not read here: read_raw_entries slices fields 1-5 of each file's second line
+        "total energy",
+        "electrostatic energy",
+        "exchange energy",
+        "induction energy",
+        "dispersion energy",
+    ]
+
+    def read_raw_entries(self) -> List[Dict]:
+        """Read every .xyz file in each sub-directory of self.root into a list of record dicts."""
+        data = []
+        for dirname in os.listdir(self.root):
+            xyz_dir = os.path.join(self.root, dirname)
+            if not os.path.isdir(xyz_dir):
+                continue
+            subset = np.array([dirname.split("-")[0].lower()])  # training, validation, or test
+            for filename in os.listdir(xyz_dir):
+                if not filename.endswith(".xyz"):
+                    continue
+                # Context manager closes each handle promptly; a bare open() leaked one per file.
+                with open(os.path.join(xyz_dir, filename), "r") as xyz_file:
+                    lines = [line.strip() for line in xyz_file]
+                line_two = lines[1].split(",")
+                energies = np.array([line_two[1:6]], dtype=np.float32)
+                num_atoms = np.array([int(lines[0])])
+                elem_xyz = np.array([x.split() for x in lines[2:]])  # rows of "<element> <x> <y> <z>"
+                elements = elem_xyz[:, 0]
+                xyz = elem_xyz[:, 1:].astype(np.float32)
+                atomic_nums = np.expand_dims(np.array([atom_table.GetAtomicNumber(x) for x in elements]), axis=1)
+                charges = np.expand_dims(np.array([0] * num_atoms[0]), axis=1)  # every atom gets charge 0
+                atomic_inputs = np.concatenate((atomic_nums, charges, xyz), axis=-1, dtype=np.float32)
+                item = dict(
+                    n_atoms=num_atoms,
+                    subset=subset,
+                    energies=energies,
+                    positions=xyz,
+                    atomic_inputs=atomic_inputs,
+                    name=np.array([""]),
+                    n_atoms_first=np.array([-1]),  # constant placeholder; no monomer split is parsed here
+                )
+                data.append(item)
+        return data
diff --git a/openqdc/datasets/interaction/splinter.py b/openqdc/datasets/interaction/splinter.py
new file mode 100644
index 0000000..06b6d86
--- /dev/null
+++ b/openqdc/datasets/interaction/splinter.py
@@ -0,0 +1,126 @@
+import os
+from typing import Dict, List
+
+import numpy as np
+from loguru import logger
+from tqdm import tqdm
+
+from openqdc.datasets.interaction.base import BaseInteractionDataset
+from openqdc.utils.molecule import atom_table
+
+
+class Splinter(BaseInteractionDataset):
+    """
+    A dataset of over 1.7 million protein-ligand
+    interactions as described in the paper:
+
+    A quantum chemical interaction energy dataset for accurately modeling protein-ligand interactions.
+    Spronk, S.A., Glick, Z.L., Metcalf, D.P. et al.
+    Sci Data 10, 619 (2023).
+    https://doi.org/10.1038/s41597-023-02443-1
+    """
+
+    __name__ = "splinter"
+    __energy_methods__ = [
+        "sapt0/jun-cc-pV(D+d)Z_unscaled",
+        "sapt0/jun-cc-pV(D+d)Z_es_unscaled",
+        "sapt0/jun-cc-pV(D+d)Z_ex_unscaled",
+        "sapt0/jun-cc-pV(D+d)Z_ind_unscaled",
+        "sapt0/jun-cc-pV(D+d)Z_disp_unscaled",
+        "sapt0/jun-cc-pV(D+d)Z_scaled",
+        "sapt0/jun-cc-pV(D+d)Z_es_scaled",
+        "sapt0/jun-cc-pV(D+d)Z_ex_scaled",
+        "sapt0/jun-cc-pV(D+d)Z_ind_scaled",
+        "sapt0/jun-cc-pV(D+d)Z_disp_scaled",
+        "sapt0/aug-cc-pV(D+d)Z_unscaled",
+        "sapt0/aug-cc-pV(D+d)Z_es_unscaled",
+        "sapt0/aug-cc-pV(D+d)Z_ex_unscaled",
+        "sapt0/aug-cc-pV(D+d)Z_ind_unscaled",
+        "sapt0/aug-cc-pV(D+d)Z_disp_unscaled",
+        "sapt0/aug-cc-pV(D+d)Z_scaled",
+        "sapt0/aug-cc-pV(D+d)Z_es_scaled",
+        "sapt0/aug-cc-pV(D+d)Z_ex_scaled",
+        "sapt0/aug-cc-pV(D+d)Z_ind_scaled",
+        "sapt0/aug-cc-pV(D+d)Z_disp_scaled",
+    ]
+
+    energy_target_names = []  # not read by read_raw_entries: energies are sliced from the metadata row
+
+    def read_raw_entries(self) -> List[Dict]:
+        """Walk self.root and parse every .xyz file found into a record dict.
+
+        File layout, as consumed below: line 1 is the atom count; line 2 is a
+        comma-separated row holding a "_"-separated tag, three integer charges
+        (total, first monomer, second monomer), the energies and, last, the atom
+        count of the first monomer; every further line is "<element> <x> <y> <z>".
+
+        Returns:
+            One dict per .xyz file, in os.walk order.
+
+        Raises:
+            ValueError: if a tag has neither 11 nor 6 fields or a number fails to parse.
+        """
+        logger.info(f"Reading Splinter interaction data from {self.root}")
+        data = []
+        i = 0
+        with tqdm(total=1706343) as progress_bar:  # total is currently an approximation
+            for root, _dirs, files in os.walk(self.root):
+                # Directory name is the subset; basename also copes with non-"/" separators.
+                subset_name = os.path.basename(root)
+                for filename in files:
+                    if not filename.endswith(".xyz"):
+                        continue
+                    i += 1
+                    # Close each handle right away instead of leaving one open per visited file.
+                    with open(os.path.join(root, filename), "r") as filein:
+                        lines = [line.strip() for line in filein]
+                    n_atoms = np.array([int(lines[0])], dtype=np.int32)
+                    metadata = lines[1].split(",")
+                    tag_fields = metadata[0].split("_")
+                    (
+                        protein_monomer_name,
+                        protein_interaction_site_type,
+                        ligand_monomer_name,
+                        ligand_interaction_site_type,
+                    ) = tag_fields[:4]
+                    if len(tag_fields) == 11:
+                        index, r, theta_P, tau_P, theta_L, tau_L, tau_PL = map(float, tag_fields[4:])
+                    else:
+                        # Short tag: exactly two more fields are accepted (ValueError otherwise).
+                        # NOTE(review): index stays a str here but is a float in the branch above.
+                        index, _ = tag_fields[4:]
+                        r, theta_P, tau_P, theta_L, tau_L, tau_PL = [None] * 6
+                    energies = np.array([list(map(float, metadata[4:-1]))]).astype(np.float32)
+                    n_atoms_first = np.array([int(metadata[-1])], dtype=np.int32)
+                    _total_charge, charge0, charge1 = map(int, metadata[1:4])
+                    atom_rows = np.array([line.split() for line in lines[2:]])
+                    pos = atom_rows[:, 1:].astype(np.float32)
+                    elems = atom_rows[:, 0]
+                    atomic_nums = np.expand_dims(np.array([atom_table.GetAtomicNumber(x) for x in elems]), axis=1)
+                    natoms0 = n_atoms_first[0]
+                    natoms1 = n_atoms[0] - natoms0
+                    charges = np.expand_dims(np.array([charge0] * natoms0 + [charge1] * natoms1), axis=1)
+                    atomic_inputs = np.concatenate((atomic_nums, charges, pos), axis=-1, dtype=np.float32)
+                    item = dict(
+                        energies=energies,
+                        subset=np.array([subset_name]),
+                        n_atoms=n_atoms,
+                        n_atoms_first=n_atoms_first,
+                        atomic_inputs=atomic_inputs,
+                        protein_monomer_name=np.array([protein_monomer_name]),
+                        protein_interaction_site_type=np.array([protein_interaction_site_type]),
+                        ligand_monomer_name=np.array([ligand_monomer_name]),
+                        ligand_interaction_site_type=np.array([ligand_interaction_site_type]),
+                        index=np.array([index]),
+                        r=np.array([r]),
+                        theta_P=np.array([theta_P]),
+                        tau_P=np.array([tau_P]),
+                        theta_L=np.array([theta_L]),
+                        tau_L=np.array([tau_L]),
+                        tau_PL=np.array([tau_PL]),
+                        name=np.array([protein_monomer_name + "." + ligand_monomer_name]),
+                    )
+                    data.append(item)
+                    progress_bar.update(1)
+        logger.info(f"Processed {i} files in total")
+        return data
diff --git a/openqdc/raws/config_factory.py b/openqdc/raws/config_factory.py
index b8cd73e..6205a3e 100644
--- a/openqdc/raws/config_factory.py
+++ b/openqdc/raws/config_factory.py
@@ -90,6 +90,20 @@ class DataConfigFactory:
         },
     )
 
+    des370k_interaction = {  # DES370K archive, served from Zenodo record 5676266
+        "dataset_name": "des370k_interaction",
+        "links": {
+            "DES370K.zip": "https://zenodo.org/record/5676266/files/DES370K.zip",
+        },
+    }
+
+    des5m_interaction = {  # NOTE(review): saved as DES5M.zip but the URL fetches DESS5M.zip - confirm
+        "dataset_name": "des5m_interaction",
+        "links": {
+            "DES5M.zip": "https://zenodo.org/records/5706002/files/DESS5M.zip?download=1",
+        },
+    }
+
     tmqm = dict(
         dataset_name="tmqm",
         links={
@@ -142,6 +156,22 @@ class DataConfigFactory:
         dataset_name="transition1x",
         links={"Transition1x.h5": "https://figshare.com/ndownloader/files/36035789"},
     )
+
+    # l7 = dict(
+    #     dataset_name="l7",
+    #     links={"l7.zip": "http://www.begdb.org/moldown.php?id=40"}
+    # )
+
+    des_s66 = {  # both DES S66 archives below come from Zenodo record 5676284
+        "dataset_name": "des_s66",
+        "links": {"DESS66.zip": "https://zenodo.org/records/5676284/files/DESS66.zip?download=1"},
+    }
+
+    des_s66x8 = {
+        "dataset_name": "des_s66x8",
+        "links": {"DESS66x8.zip": "https://zenodo.org/records/5676284/files/DESS66x8.zip?download=1"},
+    }
+
     available_datasets = [k for k in locals().keys() if not k.startswith("__")]
 
     def __init__(self):
diff --git a/openqdc/utils/molecule.py b/openqdc/utils/molecule.py
index 2edadc5..6b48cc5 100644
--- a/openqdc/utils/molecule.py
+++ b/openqdc/utils/molecule.py
@@ -10,6 +10,398 @@
 
 atom_table = Chem.GetPeriodicTable()
 
+# Molecule group classification for DES datasets: maps a group name to the set of SMILES strings in that group.
+molecule_groups = {
+    "acids": set(["CCC(=O)O", "CC(=O)O", "OC=O", "OC(=O)CC(=O)O"]),
+    "alcohols": set(["CCCO", "CCC(O)C", "CCO", "CC(O)C", "CO", "OC1CCCC1", "OC1CCCCC1", "OCCCCO", "OCCCO", "OCCO"]),
+    "alkanes": set(
+        [
+            "C1CCCC1",
+            "C1CCCCC1",
+            "C",
+            "CC1CCCC1",
+            "CC1CCCCC1",
+            "CC",
+            "CCC",
+            "CC(C)C",
+            "CCCC",
+            "CC(C)(C)C",
+            "CCC(C)C",
+            "CCCCC",
+            "CCC(C)(C)C",
+            "CCCCCC",
+        ]
+    ),
+    "alkenes": set(
+        [
+            "C=C",
+            "CC=C",
+            "CC=CC",
+            "CC(=C)C",
+            "CCC=C",
+            "CC=C(C)C",
+            "CCC=CC",
+            "CCC(=C)C",
+            "CC(=C(C)C)C",
+            "CCC=C(C)C",
+            "CCC(=CC)C",
+            "CCC(=C(C)C)C",
+        ]
+    ),
+    "amides": set(
+        [
+            "CCCNC=O",
+            "CCC(=O)N",
+            "CCC(=O)NC",
+            "CCC(=O)N(C)C",
+            "CCC(=O)N(CC)C",
+            "CCNC=O",
+            "CCNC(=O)C",
+            "CCN(C=O)CC",
+            "CCN(C(=O)C)C",
+            "CCNC(=O)CC",
+            "CCN(C(=O)C)CC",
+            "CC(=O)N",
+            "CC(=O)N(C)C",
+            "CNC=O",
+            "CNC(=O)C",
+            "CN(C=O)CC",
+            "CNC(=O)CC(=O)N",
+            "CNC(=O)CC(=O)NC",
+            "CNC(=O)CNC=O",
+            "CNC(=O)CNC(=O)C",
+            "CNC(=O)C(NC(=O)C)C",
+            "NC=O",
+            "NC(=O)CC(=O)N",
+            "O=CN(C)C",
+            "O=CNCCC(=O)N",
+            "O=CNCCC(=O)NC",
+            "O=CNCCNC=O",
+            "O=CNCC(=O)N",
+        ]
+    ),
+    "amines": set(
+        [
+            "C1CCCN1",
+            "C1CCCNC1",
+            "CCCN",
+            "CCCNC",
+            "CCCN(C)C",
+            "CCN",
+            "CCN(C)C",
+            "CCNCC",
+            "CCN(CC)C",
+            "CN",
+            "CNC",
+            "CN(C)C",
+            "CNCC",
+            "CNCCCN",
+            "CNCCCNC",
+            "CNCCN",
+            "CNCCNC",
+            "N",
+            "NCCCN",
+            "NCCN",
+        ]
+    ),
+    "ammoniums": set(["CC[NH3+]", "C[N+](C)(C)C", "C[NH2+]C", "C[NH3+]", "C[NH+](C)C", "[NH4+]"]),
+    "benzene": set(["c1ccccc1", "Cc1ccccc1", "CCc1ccccc1"]),
+    "carboxylates": set(["[O-]C=O", "[O-]C(=O)C", "[O-]C(=O)CC"]),
+    "esters": set(
+        [
+            "CCCOC=O",
+            "CCC(=O)OC",
+            "CCOC(=O)CC",
+            "CCOC(=O)C",
+            "CCOC=O",
+            "COC(=O)C",
+            "COC=O",
+            "O=COCCCOC=O",
+            "O=COCCOC=O",
+            "O=COCOC=O",
+        ]
+    ),
+    "ethers": set(
+        [
+            "C1CCCO1",
+            "C1CCCOC1",
+            "C1CCOCO1",
+            "C1OCCO1",
+            "CCCOC",
+            "CCCOCOC",
+            "CCOCC",
+            "COCCCOC",
+            "COCC",
+            "COCCOC",
+            "COC",
+            "COCOCC",
+            "COCOC",
+            "O1CCOCC1",
+            "O1COCOC1",
+        ]
+    ),
+    "guanidiums": set(["CCNC(=[NH2+])N", "CNC(=[NH2+])N", "NC(=[NH2+])N"]),  # sic ("guanidiniums"?); runtime key
+    "imidazolium": set(["c1[nH]cc[nH+]1", "Cc1c[nH]c[nH+]1", "CCc1c[nH]c[nH+]1"]),
+    "ketones": set(["CCC(=O)CC", "CCC(=O)C", "CCC=O", "CC(=O)C", "CC=O", "C=O"]),  # includes aldehydes too
+    "monoatomics": set(
+        [
+            "[Ar]",
+            "[Br-]",
+            "[Ca+2]",
+            "[Cl-]",
+            "[F-]",
+            "[He]",
+            "[I-]",
+            "[K+]",
+            "[Kr]",
+            "[Li+]",
+            "[Mg+2]",
+            "[Na+]",
+            "[Ne]",
+            "[Xe]",
+        ]
+    ),
+    "other": set(
+        [
+            "Brc1ccc(cc1)Br",
+            "Brc1ccccc1",
+            "BrC(Br)Br",
+            "BrCBr",
+            "BrCCBr",
+            "CBr",
+            "CC(Br)Br",
+            "CCBr",
+            "CCCC#CC",
+            "CCCC(Cl)(Cl)Cl",
+            "CCCC(Cl)Cl",
+            "CCCCCl",
+            "CCC#CC",
+            "CCCC#C",
+            "CCCC(F)(F)F",
+            "CCCC(F)F",
+            "CCCCF",
+            "CCC(Cl)(Cl)Cl",
+            "CCC(Cl)Cl",
+            "CCCCl",
+            "CCCC#N",
+            "CC#CC",
+            "CCC#C",
+            "CCC(F)(F)F",
+            "CCC(F)F",
+            "CCCF",
+            "CC(Cl)(Cl)Cl",
+            "CC(Cl)Cl",
+            "CCCl",
+            "CCC#N",
+            "CC#C",
+            "CC(F)(F)F",
+            "CC(F)F",
+            "CCF",
+            "CC(I)I",
+            "CCI",
+            "CCl",
+            "CC#N",
+            "CCOP(=O)(OC)OC",
+            "CCOP(=O)(OC)[O-]",
+            "CCOP(=O)(OC)O",
+            "C#C",
+            "CF",
+            "CI",
+            "Clc1ccc(cc1)Cl",
+            "Clc1cccc(c1)Cl",
+            "Clc1ccccc1Cl",
+            "Clc1ccccc1",
+            "Clc1cc(Cl)c(c(c1Cl)Cl)Cl",
+            "Clc1cc(Cl)cc(c1)Cl",
+            "Clc1c(Cl)c(Cl)c(c(c1Cl)Cl)Cl",
+            "ClC(C(Cl)(Cl)Cl)(Cl)Cl",
+            "ClC(C(Cl)(Cl)Cl)Cl",
+            "ClCC(Cl)(Cl)Cl",
+            "ClCC(Cl)Cl",
+            "ClCCCl",
+            "ClC(Cl)Cl",
+            "ClCCl",
+            "CNCCCOC=O",
+            "CNCCCOC",
+            "CNCCC(=O)NC",
+            "CNCCC(=O)N",
+            "CNCCC(=O)O",
+            "CNCCCO",
+            "CNCCCSC",
+            "CNCCCS",
+            "CNCCNC=O",
+            "CNCCOC=O",
+            "CNCCOC",
+            "CNCC(=O)NC",
+            "CNCC(=O)N",
+            "CNCC(=O)O",
+            "CNCCO",
+            "CNCCSC",
+            "CNCCS",
+            "CNC(=O)CCN",
+            "CNC(=O)CC(=O)O",
+            "CNC(=O)CCO",
+            "CNC(=O)CCS",
+            "CNC(=O)CN",
+            "CNC(=O)COC=O",
+            "CNC(=O)CO",
+            "CNCOC=O",
+            "CNCOC",
+            "CNC(=O)CS",
+            "CNCSC",
+            "C#N",
+            "COCCCN",
+            "COCCCOC=O",
+            "COCCC(=O)NC",
+            "COCCC(=O)N",
+            "COCCC(=O)O",
+            "COCCCO",
+            "COCCCSC",
+            "COCCCS",
+            "COCCNC=O",
+            "COCCN",
+            "COCCOC=O",
+            "COCC(=O)NC",
+            "COCC(=O)N",
+            "COCC(=O)O",
+            "COCCO",
+            "COCCSC",
+            "COCCS",
+            "COCNC=O",
+            "COCN",
+            "COCOC=O",
+            "COCO",
+            "COCSC",
+            "COCS",
+            "COP(=O)(OC)OC",
+            "COP(=O)(OC)[O-]",
+            "COP(=O)(OC)O",
+            "COP(=O)(O)O",
+            "COP(=O)(OP(=O)(O)O)[O-]",
+            "CSCCCNC=O",
+            "CSCCCN",
+            "CSCCCOC=O",
+            "CSCCC(=O)N",
+            "CSCCC(=O)O",
+            "CSCCCO",
+            "CSCCN",
+            "CSCCOC=O",
+            "CSCC(=O)NC",
+            "CSCC(=O)N",
+            "CSCC(=O)O",
+            "CSCCO",
+            "CSCNC=O",
+            "CSCN",
+            "CSCOC=O",
+            "CSCO",
+            "Fc1ccc(cc1)F",
+            "Fc1cccc(c1)F",
+            "Fc1ccccc1F",
+            "Fc1ccccc1",
+            "Fc1cc(F)c(c(c1F)F)F",
+            "Fc1cc(F)cc(c1)F",
+            "Fc1c(F)c(F)c(c(c1F)F)F",
+            "FC(C(F)(F)F)(F)F",
+            "FC(C(F)(F)F)F",
+            "FCC(F)(F)F",
+            "FCC(F)F",
+            "FCCF",
+            "FC(F)F",
+            "FCF",
+            "ICCI",
+            "ICI",
+            "NCCCOC=O",
+            "NCCC(=O)N",
+            "NCCC(=O)O",
+            "NCCCO",
+            "NCCCS",
+            "NCCNC=O",
+            "NCCOC=O",
+            "NCC(=O)N",
+            "NCC(=O)O",
+            "NCCO",
+            "NCCS",
+            "NC(=O)CC(=O)O",
+            "NC(=O)CCO",
+            "NC(=O)CCS",
+            "NC(=O)CO",
+            "NCOC=O",
+            "NC(=O)CS",
+            "OCCCNC=O",
+            "OCCCOC=O",
+            "OCCC(=O)O",
+            "OCCCS",
+            "OCCNC=O",
+            "OCCOC=O",
+            "OCC(=O)O",
+            "OCCS",
+            "O=CNCCC(=O)O",
+            "O=CNCCOC=O",
+            "O=CNCC(=O)O",
+            "O=CNCOC=O",
+            "O=COCCC(=O)NC",
+            "O=COCCC(=O)N",
+            "O=COCCC(=O)O",
+            "O=COCC(=O)N",
+            "O=COCC(=O)O",
+            "OC(=O)CCS",
+            "OCOC=O",
+            "OC(=O)CS",
+            "OP(=O)(O)O",
+            "[O-]P(=O)(OP(=O)(OC)O)O",
+            "SCCCOC=O",
+            "SCCNC=O",
+            "SCCOC=O",
+            "SCOC=O",
+            "[H][H]",
+        ]
+    ),
+    "phenol": set(["Cc1ccc(cc1)O", "CCc1ccc(cc1)O", "Oc1ccccc1"]),
+    "pyridine": set(["c1cccnc1", "c1ccncn1", "n1ccncc1"]),  # pyridine, pyrimidine, pyrazine
+    "pyrrole": set(
+        [
+            "c1ccc2c(c1)[nH]cc2",
+            "c1ccc[nH]1",
+            "c1ncc[nH]1",
+            "Cc1cnc[nH]1",
+            "Cc1c[nH]c2c1cccc2",
+            "Cc1c[nH]cn1",
+            "CCc1cnc[nH]1",
+            "CCc1c[nH]c2c1cccc2",
+            "CCc1c[nH]cn1",
+        ]
+    ),
+    "sulfides": set(
+        [
+            "C1CCCS1",
+            "C1CCCSC1",
+            "C1CCSCS1",
+            "C1CCSSC1",
+            "C1CSSC1",
+            "C1SCCS1",
+            "CCCSCSC",
+            "CCCSC",
+            "CCCSSC",
+            "CCSCC",
+            "CCSSCC",
+            "CCSSC",
+            "CSCCCSC",
+            "CSCCSC",
+            "CSCC",
+            "CSCSCC",
+            "CSCSC",
+            "CSC",
+            "CSSC",
+            "S1CCSCC1",
+            "S1CSCSC1",
+        ]
+    ),
+    "thiols": set(["CCCSS", "CCCS", "CCSS", "CCS", "CSCCCS", "CSCCS", "CSCS", "CSS", "CS", "SCCCS", "SCCS", "SS", "S"]),
+    "water": set(["O"]),
+    "flourane": set(["F"]),  # NOTE(review): probably meant "fluorane" (HF); runtime key, so left unchanged
+    "hydrogen chloride": set(["Cl"]),
+}
+
 
 def z_to_formula(z):
     u, c = np.unique(z, return_counts=True)
diff --git a/tests/test_dummy.py b/tests/test_dummy.py
index 702427f..f82376c 100644
--- a/tests/test_dummy.py
+++ b/tests/test_dummy.py
@@ -1,6 +1,6 @@
 """Path hack to make tests work."""
 
-from openqdc.datasets import Dummy  # noqa: E402
+from openqdc.datasets.potential.dummy import Dummy  # noqa: E402
 from openqdc.utils.atomization_energies import (
     ISOLATED_ATOM_ENERGIES,
     IsolatedAtomEnergyFactory,