From bd3fcf982c5be95e363da3bbdaf83324b125910d Mon Sep 17 00:00:00 2001
From: mcneela <daniel.mcneela@valencelabs.com>
Date: Fri, 1 Mar 2024 16:07:08 -0500
Subject: [PATCH 01/47] started splitting datasets into 'interaction' and
 'potential'

---
 src/openqdc/datasets/interaction/des370k.py   | 122 ++++++++++++++++++
 .../datasets/{ => potential}/__init__.py      |   0
 src/openqdc/datasets/{ => potential}/ani.py   |   0
 src/openqdc/datasets/{ => potential}/base.py  |   0
 src/openqdc/datasets/{ => potential}/comp6.py |   0
 src/openqdc/datasets/{ => potential}/dess.py  |   0
 src/openqdc/datasets/{ => potential}/dummy.py |   0
 src/openqdc/datasets/{ => potential}/gdml.py  |   0
 src/openqdc/datasets/{ => potential}/geom.py  |   0
 .../datasets/{ => potential}/iso_17.py        |   0
 .../datasets/{ => potential}/molecule3d.py    |   0
 .../datasets/{ => potential}/nabladft.py      |   0
 .../datasets/{ => potential}/orbnet_denali.py |   0
 src/openqdc/datasets/{ => potential}/pcqm.py  |   0
 src/openqdc/datasets/{ => potential}/qm7x.py  |   0
 src/openqdc/datasets/{ => potential}/qmugs.py |   0
 .../datasets/{ => potential}/sn2_rxn.py       |   0
 .../{ => potential}/solvated_peptides.py      |   0
 src/openqdc/datasets/{ => potential}/spice.py |   0
 src/openqdc/datasets/{ => potential}/tmqm.py  |   0
 .../datasets/{ => potential}/transition1x.py  |   0
 .../{ => potential}/waterclusters3_30.py      |   0
 22 files changed, 122 insertions(+)
 create mode 100644 src/openqdc/datasets/interaction/des370k.py
 rename src/openqdc/datasets/{ => potential}/__init__.py (100%)
 rename src/openqdc/datasets/{ => potential}/ani.py (100%)
 rename src/openqdc/datasets/{ => potential}/base.py (100%)
 rename src/openqdc/datasets/{ => potential}/comp6.py (100%)
 rename src/openqdc/datasets/{ => potential}/dess.py (100%)
 rename src/openqdc/datasets/{ => potential}/dummy.py (100%)
 rename src/openqdc/datasets/{ => potential}/gdml.py (100%)
 rename src/openqdc/datasets/{ => potential}/geom.py (100%)
 rename src/openqdc/datasets/{ => potential}/iso_17.py (100%)
 rename src/openqdc/datasets/{ => potential}/molecule3d.py (100%)
 rename src/openqdc/datasets/{ => potential}/nabladft.py (100%)
 rename src/openqdc/datasets/{ => potential}/orbnet_denali.py (100%)
 rename src/openqdc/datasets/{ => potential}/pcqm.py (100%)
 rename src/openqdc/datasets/{ => potential}/qm7x.py (100%)
 rename src/openqdc/datasets/{ => potential}/qmugs.py (100%)
 rename src/openqdc/datasets/{ => potential}/sn2_rxn.py (100%)
 rename src/openqdc/datasets/{ => potential}/solvated_peptides.py (100%)
 rename src/openqdc/datasets/{ => potential}/spice.py (100%)
 rename src/openqdc/datasets/{ => potential}/tmqm.py (100%)
 rename src/openqdc/datasets/{ => potential}/transition1x.py (100%)
 rename src/openqdc/datasets/{ => potential}/waterclusters3_30.py (100%)

diff --git a/src/openqdc/datasets/interaction/des370k.py b/src/openqdc/datasets/interaction/des370k.py
new file mode 100644
index 0000000..e9b6a21
--- /dev/null
+++ b/src/openqdc/datasets/interaction/des370k.py
@@ -0,0 +1,122 @@
+import torch
+import pandas as pd
+
+from typing import Dict, List
+from collections import defaultdict, Counter
+from torch.utils.data import Dataset
+
+class Dimer:
+    """Container for one dimer record: per-monomer SMILES/charge/atom count, positions, SAPT terms."""
+
+    def __init__(
+        self,
+        smiles_0: str,
+        smiles_1: str,
+        charge_0: int,
+        charge_1: int,
+        n_atoms_0: int,
+        n_atoms_1: int,
+        pos: torch.Tensor,
+        sapt_energies: List[float],
+    ) -> None:
+        """
+        Store the per-monomer SMILES, charge and atom count, the positions
+        tensor and the SAPT terms; sapt_energies must hold exactly ten values,
+        unpacked in the order of the sapt_* attributes assigned below.
+        """
+        self.smiles_0 = smiles_0
+        self.smiles_1 = smiles_1
+        self.charge_0 = charge_0  # was written to charge_1 and then overwritten
+        self.charge_1 = charge_1
+        self.n_atoms_0 = n_atoms_0
+        self.n_atoms_1 = n_atoms_1
+        self.pos = pos
+        self.sapt_energies = sapt_energies
+        (
+            self.sapt_es, self.sapt_ex, self.sapt_exs2, self.sapt_ind, self.sapt_exind,
+            self.sapt_disp, self.sapt_exdisp_os, self.sapt_exdisp_ss, self.sapt_delta_HF, self.sapt_all,
+        ) = tuple(sapt_energies)
+
+    def __str__(self) -> str:
+        return f"Dimer(smiles_0='{self.smiles_0}', smiles_1='{self.smiles_1}')"
+
+    # repr delegates to str so both render the same SMILES summary
+    def __repr__(self) -> str:
+        return str(self)
+
+
+class DES370K(Dataset):
+    def __init__(self, filepath="data/des370k.csv") -> None:
+        self.df = pd.read_csv(filepath)
+        self._atom_types = defaultdict(int)
+        self.data = []
+        self._preprocess()
+    
+    def _preprocess(self) -> None:
+        for idx, row in self.df.iterrows():
+            smiles0, smiles1 = row["smiles0"], row["smiles1"]
+            charge0, charge1 = row["charge0"], row["charge1"]
+            natoms0, natoms1 = row["natoms0"], row["natoms1"]
+            pos = torch.tensor(list(map(float, row["xyz"].split()))).view(-1, 3)
+            sapt_energies = [row[col] for col in self.df.columns if "sapt" in col]
+            dimer = Dimer(
+                smiles0, smiles1,
+                charge0, charge1,
+                natoms0, natoms1,
+                pos, sapt_energies
+            )
+            self.data.append(dimer)
+
+            # get atom types
+            elems = row["elements"].split()
+            counts = Counter(set(elems))
+            for key in counts:
+                self._atom_types[key] += counts[key]
+
+        # convert defaultdict to regular dict
+        self._atom_types = dict(self._atom_types)
+
+    def __str__(self) -> str:
+        return f"DES370K(n_atoms={self.num_atoms}, n_molecules={self.num_molecules}, atom_types={self.species})"
+
+    def __repr__(self) -> str:
+        return str(self)
+
+    @property
+    def atom_types(self) -> Dict[str, int]:
+        """
+        Returns a dictionary of 
+        (element, count) pairs.
+        """
+        return self._atom_types
+
+    @property
+    def num_molecules(self) -> int:
+        return self.df.shape[0]
+
+    @property
+    def num_atoms(self) -> int:
+        """
+        Returns the total number of atoms in 
+        the dataset.
+        """
+        if not hasattr(self, "_num_atoms"):
+            self._num_atoms = sum(self.atom_types.values())
+        return self._num_atoms 
+
+    @property
+    def species(self) -> List[str]:
+        """
+        Returns a list of the unique atom
+        species contained in the dataset.
+        """
+        if not hasattr(self, "_species"):
+            self._species = list(self.atom_types.keys())
+        return self._species
+
+    def atom_count(self, element: str) -> int:
+        """
+        Returns the count of a given
+        element in the dataset.
+        """
+        return self.atom_types[element]
diff --git a/src/openqdc/datasets/__init__.py b/src/openqdc/datasets/potential/__init__.py
similarity index 100%
rename from src/openqdc/datasets/__init__.py
rename to src/openqdc/datasets/potential/__init__.py
diff --git a/src/openqdc/datasets/ani.py b/src/openqdc/datasets/potential/ani.py
similarity index 100%
rename from src/openqdc/datasets/ani.py
rename to src/openqdc/datasets/potential/ani.py
diff --git a/src/openqdc/datasets/base.py b/src/openqdc/datasets/potential/base.py
similarity index 100%
rename from src/openqdc/datasets/base.py
rename to src/openqdc/datasets/potential/base.py
diff --git a/src/openqdc/datasets/comp6.py b/src/openqdc/datasets/potential/comp6.py
similarity index 100%
rename from src/openqdc/datasets/comp6.py
rename to src/openqdc/datasets/potential/comp6.py
diff --git a/src/openqdc/datasets/dess.py b/src/openqdc/datasets/potential/dess.py
similarity index 100%
rename from src/openqdc/datasets/dess.py
rename to src/openqdc/datasets/potential/dess.py
diff --git a/src/openqdc/datasets/dummy.py b/src/openqdc/datasets/potential/dummy.py
similarity index 100%
rename from src/openqdc/datasets/dummy.py
rename to src/openqdc/datasets/potential/dummy.py
diff --git a/src/openqdc/datasets/gdml.py b/src/openqdc/datasets/potential/gdml.py
similarity index 100%
rename from src/openqdc/datasets/gdml.py
rename to src/openqdc/datasets/potential/gdml.py
diff --git a/src/openqdc/datasets/geom.py b/src/openqdc/datasets/potential/geom.py
similarity index 100%
rename from src/openqdc/datasets/geom.py
rename to src/openqdc/datasets/potential/geom.py
diff --git a/src/openqdc/datasets/iso_17.py b/src/openqdc/datasets/potential/iso_17.py
similarity index 100%
rename from src/openqdc/datasets/iso_17.py
rename to src/openqdc/datasets/potential/iso_17.py
diff --git a/src/openqdc/datasets/molecule3d.py b/src/openqdc/datasets/potential/molecule3d.py
similarity index 100%
rename from src/openqdc/datasets/molecule3d.py
rename to src/openqdc/datasets/potential/molecule3d.py
diff --git a/src/openqdc/datasets/nabladft.py b/src/openqdc/datasets/potential/nabladft.py
similarity index 100%
rename from src/openqdc/datasets/nabladft.py
rename to src/openqdc/datasets/potential/nabladft.py
diff --git a/src/openqdc/datasets/orbnet_denali.py b/src/openqdc/datasets/potential/orbnet_denali.py
similarity index 100%
rename from src/openqdc/datasets/orbnet_denali.py
rename to src/openqdc/datasets/potential/orbnet_denali.py
diff --git a/src/openqdc/datasets/pcqm.py b/src/openqdc/datasets/potential/pcqm.py
similarity index 100%
rename from src/openqdc/datasets/pcqm.py
rename to src/openqdc/datasets/potential/pcqm.py
diff --git a/src/openqdc/datasets/qm7x.py b/src/openqdc/datasets/potential/qm7x.py
similarity index 100%
rename from src/openqdc/datasets/qm7x.py
rename to src/openqdc/datasets/potential/qm7x.py
diff --git a/src/openqdc/datasets/qmugs.py b/src/openqdc/datasets/potential/qmugs.py
similarity index 100%
rename from src/openqdc/datasets/qmugs.py
rename to src/openqdc/datasets/potential/qmugs.py
diff --git a/src/openqdc/datasets/sn2_rxn.py b/src/openqdc/datasets/potential/sn2_rxn.py
similarity index 100%
rename from src/openqdc/datasets/sn2_rxn.py
rename to src/openqdc/datasets/potential/sn2_rxn.py
diff --git a/src/openqdc/datasets/solvated_peptides.py b/src/openqdc/datasets/potential/solvated_peptides.py
similarity index 100%
rename from src/openqdc/datasets/solvated_peptides.py
rename to src/openqdc/datasets/potential/solvated_peptides.py
diff --git a/src/openqdc/datasets/spice.py b/src/openqdc/datasets/potential/spice.py
similarity index 100%
rename from src/openqdc/datasets/spice.py
rename to src/openqdc/datasets/potential/spice.py
diff --git a/src/openqdc/datasets/tmqm.py b/src/openqdc/datasets/potential/tmqm.py
similarity index 100%
rename from src/openqdc/datasets/tmqm.py
rename to src/openqdc/datasets/potential/tmqm.py
diff --git a/src/openqdc/datasets/transition1x.py b/src/openqdc/datasets/potential/transition1x.py
similarity index 100%
rename from src/openqdc/datasets/transition1x.py
rename to src/openqdc/datasets/potential/transition1x.py
diff --git a/src/openqdc/datasets/waterclusters3_30.py b/src/openqdc/datasets/potential/waterclusters3_30.py
similarity index 100%
rename from src/openqdc/datasets/waterclusters3_30.py
rename to src/openqdc/datasets/potential/waterclusters3_30.py

From a800ea5836027297d59f10b981dc73d28d3d1df0 Mon Sep 17 00:00:00 2001
From: mcneela <daniel.mcneela@valencelabs.com>
Date: Fri, 1 Mar 2024 16:24:09 -0500
Subject: [PATCH 02/47] add num_unique_molecules property

---
 src/openqdc/datasets/interaction/des370k.py | 25 ++++++++++++++++++---
 1 file changed, 22 insertions(+), 3 deletions(-)

diff --git a/src/openqdc/datasets/interaction/des370k.py b/src/openqdc/datasets/interaction/des370k.py
index e9b6a21..2884439 100644
--- a/src/openqdc/datasets/interaction/des370k.py
+++ b/src/openqdc/datasets/interaction/des370k.py
@@ -49,6 +49,7 @@ class DES370K(Dataset):
     def __init__(self, filepath="data/des370k.csv") -> None:
         self.df = pd.read_csv(filepath)
         self._atom_types = defaultdict(int)
+        self.smiles = set()
         self.data = []
         self._preprocess()
     
@@ -67,6 +68,10 @@ def _preprocess(self) -> None:
             )
             self.data.append(dimer)
 
+            # keep track of unique smiles strings
+            self.smiles.add(smiles0)
+            self.smiles.add(smiles1)
+
             # get atom types
             elems = row["elements"].split()
             counts = Counter(set(elems))
@@ -77,7 +82,9 @@ def _preprocess(self) -> None:
         self._atom_types = dict(self._atom_types)
 
     def __str__(self) -> str:
-        return f"DES370K(n_atoms={self.num_atoms}, n_molecules={self.num_molecules}, atom_types={self.species})"
+        return (f"DES370K(n_atoms={self.num_atoms}, "  # adjacent literals: no stray indentation in output
+                f"n_molecules={self.num_dimers}, "  # num_molecules is renamed to num_dimers in this patch
+                f"atom_types={self.species})")
 
     def __repr__(self) -> str:
         return str(self)
@@ -91,8 +98,20 @@ def atom_types(self) -> Dict[str, int]:
         return self._atom_types
 
     @property
-    def num_molecules(self) -> int:
-        return self.df.shape[0]
+    def num_dimers(self) -> int:
+        """
+        Returns the number of 
+        dimers in the dataset.
+        """
+        return len(self.data)
+
+    @property
+    def num_unique_molecules(self) -> int:
+        """
+        Returns the number of unique
+        molecules in the dataset.
+        """
+        return len(self.smiles)
 
     @property
     def num_atoms(self) -> int:

From 9d6fca6ff6f514a0f7d5bcad4ecec3188c448336 Mon Sep 17 00:00:00 2001
From: mcneela <daniel.mcneela@valencelabs.com>
Date: Fri, 1 Mar 2024 16:26:58 -0500
Subject: [PATCH 03/47] added logging

---
 src/openqdc/datasets/interaction/des370k.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/openqdc/datasets/interaction/des370k.py b/src/openqdc/datasets/interaction/des370k.py
index 2884439..127ec89 100644
--- a/src/openqdc/datasets/interaction/des370k.py
+++ b/src/openqdc/datasets/interaction/des370k.py
@@ -3,6 +3,8 @@
 
 from typing import Dict, List
 from collections import defaultdict, Counter
+
+from loguru import logger
 from torch.utils.data import Dataset
 
 class Dimer:
@@ -47,6 +49,7 @@ def __repr__(self) -> str:
 
 class DES370K(Dataset):
     def __init__(self, filepath="data/des370k.csv") -> None:
+        self.filepath = filepath
         self.df = pd.read_csv(filepath)
         self._atom_types = defaultdict(int)
         self.smiles = set()
@@ -54,6 +57,7 @@ def __init__(self, filepath="data/des370k.csv") -> None:
         self._preprocess()
     
     def _preprocess(self) -> None:
+        logger.info(f"Reading data from {self.filepath}")
         for idx, row in self.df.iterrows():
             smiles0, smiles1 = row["smiles0"], row["smiles1"]
             charge0, charge1 = row["charge0"], row["charge1"]

From 794e63fc1faff3f8d1e03d485799688352bc43db Mon Sep 17 00:00:00 2001
From: mcneela <daniel.mcneela@valencelabs.com>
Date: Fri, 1 Mar 2024 16:31:20 -0500
Subject: [PATCH 04/47] started base interaction dataset

---
 src/openqdc/datasets/interaction/base.py | 553 +++++++++++++++++++++++
 1 file changed, 553 insertions(+)
 create mode 100644 src/openqdc/datasets/interaction/base.py

diff --git a/src/openqdc/datasets/interaction/base.py b/src/openqdc/datasets/interaction/base.py
new file mode 100644
index 0000000..de43887
--- /dev/null
+++ b/src/openqdc/datasets/interaction/base.py
@@ -0,0 +1,553 @@
+from typing import Dict, List, Optional, Union
+from openqdc.utils.io import (
+    copy_exists,
+    dict_to_atoms,
+    get_local_cache,
+    load_hdf5_file,
+    load_pkl,
+    pull_locally,
+    push_remote,
+    set_cache_dir,
+)
+
+from loguru import logger
+
+class BaseInteractionDataset:
+    __energy_methods__ = []
+    __force_methods__ = []
+    energy_target_names = []
+    force_target_names = []
+    __isolated_atom_energies__ = []
+
+    __energy_unit__ = "hartree"
+    __distance_unit__ = "ang"
+    __forces_unit__ = "hartree/ang"
+    __fn_energy__ = lambda x: x
+    __fn_distance__ = lambda x: x
+    __fn_forces__ = lambda x: x
+    __average_nb_atoms__ = None
+    __stats__ = {}
+
+    def __init__(
+        self,
+        energy_unit: Optional[str] = None,
+        distance_unit: Optional[str] = None,
+        overwrite_local_cache: bool = False,
+        cache_dir: Optional[str] = None,
+    ) -> None:
+        set_cache_dir(cache_dir)
+        self.data = None
+        if not self.is_preprocessed():
+            raise DatasetNotAvailableError(self.__name__)
+        else:
+            self.read_preprocess(overwrite_local_cache=overwrite_local_cache)
+        self._post_init(overwrite_local_cache, energy_unit, distance_unit)
+
+    def _post_init(
+        self,
+        overwrite_local_cache: bool = False,
+        energy_unit: Optional[str] = None,
+        distance_unit: Optional[str] = None,
+    ) -> None:
+        self._set_units(None, None)
+        self._set_isolated_atom_energies()
+        self._precompute_statistics(overwrite_local_cache=overwrite_local_cache)
+        self._set_units(energy_unit, distance_unit)
+        self._convert_data()
+        self._set_isolated_atom_energies()
+
+    def _convert_data(self):
+        logger.info(
+            f"Converting {self.__name__} data to the following units:\n\
+                     Energy: {self.energy_unit},\n\
+                     Distance: {self.distance_unit},\n\
+                     Forces: {self.force_unit if self.__force_methods__ else 'None'}"
+        )
+        for key in self.data_keys:
+            self.data[key] = self._convert_on_loading(self.data[key], key)
+
+    def _precompute_statistics(self, overwrite_local_cache: bool = False):
+        local_path = p_join(self.preprocess_path, "stats.pkl")
+        if self.is_preprocessed_statistics() and not overwrite_local_cache:
+            stats = load_pkl(local_path)
+            logger.info("Loaded precomputed statistics")
+        else:
+            logger.info("Precomputing relevant statistics")
+            (
+                inter_E_mean,
+                inter_E_std,
+                formation_E_mean,
+                formation_E_std,
+                total_E_mean,
+                total_E_std,
+            ) = self._precompute_E()
+            forces_dict = self._precompute_F()
+            stats = {
+                "formation": {"energy": {"mean": formation_E_mean, "std": formation_E_std}, "forces": forces_dict},
+                "inter": {"energy": {"mean": inter_E_mean, "std": inter_E_std}, "forces": forces_dict},
+                "total": {"energy": {"mean": total_E_mean, "std": total_E_std}, "forces": forces_dict},
+            }
+            with open(local_path, "wb") as f:
+                pkl.dump(stats, f)
+        self._compute_average_nb_atoms()
+        self.__stats__ = stats
+
+    def _compute_average_nb_atoms(self):
+        self.__average_nb_atoms__ = np.mean(self.data["n_atoms"])
+
+    def _precompute_E(self):
+        splits_idx = self.data["position_idx_range"][:, 1]
+        s = np.array(self.data["atomic_inputs"][:, :2], dtype=int)
+        s[:, 1] += IsolatedAtomEnergyFactory.max_charge
+        matrixs = [matrix[s[:, 0], s[:, 1]] for matrix in self.__isolated_atom_energies__]
+        converted_energy_data = self.data["energies"]
+        # calculation per molecule formation energy statistics
+        E = []
+        for i, matrix in enumerate(matrixs):
+            c = np.cumsum(np.append([0], matrix))[splits_idx]
+            c[1:] = c[1:] - c[:-1]
+            E.append(converted_energy_data[:, i] - c)
+        E = np.array(E).T
+        inter_E_mean = np.nanmean(E / self.data["n_atoms"][:, None], axis=0)
+        inter_E_std = np.nanstd(E / self.data["n_atoms"][:, None], axis=0)
+        formation_E_mean = np.nanmean(E, axis=0)
+        formation_E_std = np.nanstd(E, axis=0)
+        total_E_mean = np.nanmean(converted_energy_data, axis=0)
+        total_E_std = np.nanstd(converted_energy_data, axis=0)
+
+        return (
+            np.atleast_2d(inter_E_mean),
+            np.atleast_2d(inter_E_std),
+            np.atleast_2d(formation_E_mean),
+            np.atleast_2d(formation_E_std),
+            np.atleast_2d(total_E_mean),
+            np.atleast_2d(total_E_std),
+        )
+
+    def _precompute_F(self):
+        if len(self.__force_methods__) == 0:
+            return NOT_DEFINED
+        converted_force_data = self.convert_forces(self.data["forces"])
+        force_mean = np.nanmean(converted_force_data, axis=0)
+        force_std = np.nanstd(converted_force_data, axis=0)
+        force_rms = np.sqrt(np.nanmean(converted_force_data**2, axis=0))
+        return {
+            "mean": np.atleast_2d(force_mean.mean(axis=0)),
+            "std": np.atleast_2d(force_std.mean(axis=0)),
+            "components": {"rms": force_rms, "std": force_std, "mean": force_mean},
+        }
+
+    @property
+    def numbers(self):
+        if hasattr(self, "_numbers"):
+            return self._numbers
+        self._numbers = pd.unique(self.data["atomic_inputs"][..., 0]).astype(np.int32)
+        return self._numbers
+
+    @property
+    def chemical_species(self):
+        return np.array(chemical_symbols)[self.numbers]
+
+    @property
+    def energy_unit(self):
+        return self.__energy_unit__
+
+    @property
+    def distance_unit(self):
+        return self.__distance_unit__
+
+    @property
+    def force_unit(self):
+        return self.__forces_unit__
+
+    @property
+    def root(self):
+        return p_join(get_local_cache(), self.__name__)
+
+    @property
+    def preprocess_path(self):
+        path = p_join(self.root, "preprocessed")
+        os.makedirs(path, exist_ok=True)
+        return path
+
+    @property
+    def data_keys(self):
+        keys = list(self.data_types.keys())
+        if len(self.__force_methods__) == 0:
+            keys.remove("forces")
+        return keys
+
+    @property
+    def data_types(self):
+        return {
+            "atomic_inputs": np.float32,
+            "position_idx_range": np.int32,
+            "energies": np.float32,
+            "forces": np.float32,
+        }
+
+    @property
+    def data_shapes(self):
+        return {
+            "atomic_inputs": (-1, NB_ATOMIC_FEATURES),
+            "position_idx_range": (-1, 2),
+            "energies": (-1, len(self.energy_target_names)),
+            "forces": (-1, 3, len(self.force_target_names)),
+        }
+
+    @property
+    def atoms_per_molecules(self):
+        try:
+            if hasattr(self, "_n_atoms"):
+                return self._n_atoms
+            self._n_atoms = self.data["n_atoms"]
+            return self._n_atoms
+        except:  # noqa
+            return None
+
+    def _set_units(self, en, ds):
+        old_en, old_ds = self.energy_unit, self.distance_unit
+        en = en if en is not None else old_en
+        ds = ds if ds is not None else old_ds
+
+        # if en is None:
+        self.set_energy_unit(en)
+        # if ds is not None:
+        self.set_distance_unit(ds)
+        if self.__force_methods__:
+            self.__forces_unit__ = self.energy_unit + "/" + self.distance_unit
+            self.__class__.__fn_forces__ = get_conversion(old_en + "/" + old_ds, self.__forces_unit__)
+
+    def _set_isolated_atom_energies(self):
+        if self.__energy_methods__ is None:
+            logger.error("No energy methods defined for this dataset.")
+        f = get_conversion("hartree", self.__energy_unit__)
+        self.__isolated_atom_energies__ = f(
+            np.array([IsolatedAtomEnergyFactory.get_matrix(en_method) for en_method in self.__energy_methods__])
+        )
+
+    def convert_energy(self, x):
+        return self.__class__.__fn_energy__(x)
+
+    def convert_distance(self, x):
+        return self.__class__.__fn_distance__(x)
+
+    def convert_forces(self, x):
+        return self.__class__.__fn_forces__(x)
+
+    def set_energy_unit(self, value: str):
+        """
+        Set a new energy unit for the dataset.
+        """
+        old_unit = self.energy_unit
+        self.__energy_unit__ = value
+        self.__class__.__fn_energy__ = get_conversion(old_unit, value)
+
+    def set_distance_unit(self, value: str):
+        """
+        Set a new distance unit for the dataset.
+        """
+        old_unit = self.distance_unit
+        self.__distance_unit__ = value
+        self.__class__.__fn_distance__ = get_conversion(old_unit, value)
+
+    def read_raw_entries(self):
+        raise NotImplementedError
+
+    def collate_list(self, list_entries):
+        # concatenate entries
+        res = {key: np.concatenate([r[key] for r in list_entries if r is not None], axis=0) for key in list_entries[0]}
+
+        csum = np.cumsum(res.get("n_atoms"))
+        x = np.zeros((csum.shape[0], 2), dtype=np.int32)
+        x[1:, 0], x[:, 1] = csum[:-1], csum
+        res["position_idx_range"] = x
+
+        return res
+
+    def save_preprocess(self, data_dict):
+        # save memmaps
+        logger.info("Preprocessing data and saving it to cache.")
+        for key in self.data_keys:
+            local_path = p_join(self.preprocess_path, f"{key}.mmap")
+            out = np.memmap(local_path, mode="w+", dtype=data_dict[key].dtype, shape=data_dict[key].shape)
+            out[:] = data_dict.pop(key)[:]
+            out.flush()
+            push_remote(local_path, overwrite=True)
+
+        # save smiles and subset
+        local_path = p_join(self.preprocess_path, "props.pkl")
+        for key in ["name", "subset"]:
+            data_dict[key] = np.unique(data_dict[key], return_inverse=True)
+
+        with open(local_path, "wb") as f:
+            pkl.dump(data_dict, f)
+        push_remote(local_path, overwrite=True)
+
+    def _convert_on_loading(self, x, key):
+        if key == "energies":
+            return self.convert_energy(x)
+        elif key == "forces":
+            return self.convert_forces(x)
+        elif key == "atomic_inputs":
+            x = np.array(x, dtype=np.float32)
+            x[:, -3:] = self.convert_distance(x[:, -3:])
+            return x
+        else:
+            return x
+
+    def read_preprocess(self, overwrite_local_cache=False):
+        """Pull the cached memmaps and props.pkl locally and load them into self.data."""
+        logger.info("Reading preprocessed data")
+        logger.info(
+            f"{self.__name__} data with the following units:\n\
+                     Energy: {self.energy_unit},\n\
+                     Distance: {self.distance_unit},\n\
+                     Forces: {self.force_unit if self.__force_methods__ else 'None'}"
+        )
+        self.data = {}
+        for key in self.data_keys:
+            filename = p_join(self.preprocess_path, f"{key}.mmap")
+            pull_locally(filename, overwrite=overwrite_local_cache)
+            self.data[key] = np.memmap(filename, mode="r", dtype=self.data_types[key]).reshape(self.data_shapes[key])
+
+        filename = p_join(self.preprocess_path, "props.pkl")
+        pull_locally(filename, overwrite=overwrite_local_cache)
+        with open(filename, "rb") as f:
+            tmp = pkl.load(f)  # NOTE: pickle; only load caches that come from a trusted source
+        for key in ["name", "subset", "n_atoms"]:
+            x = tmp.pop(key)
+            # name/subset are stored as np.unique's (values, inverse) tuple; check the
+            # type rather than len(x) == 2 so a plain 2-entry array is kept as-is
+            self.data[key] = x[0][x[1]] if isinstance(x, tuple) and len(x) == 2 else x
+
+        for key in self.data:
+            logger.info(f"Loaded {key} with shape {self.data[key].shape}, dtype {self.data[key].dtype}")
+
+    def is_preprocessed(self):
+        predicats = [copy_exists(p_join(self.preprocess_path, f"{key}.mmap")) for key in self.data_keys]
+        predicats += [copy_exists(p_join(self.preprocess_path, "props.pkl"))]
+        return all(predicats)
+
+    def is_preprocessed_statistics(self):
+        return bool(copy_exists(p_join(self.preprocess_path, "stats.pkl")))
+
+    def preprocess(self, overwrite=False):
+        if overwrite or not self.is_preprocessed():
+            entries = self.read_raw_entries()
+            res = self.collate_list(entries)
+            self.save_preprocess(res)
+
+    def save_xyz(self, idx: int, path: Optional[str] = None, name=None):
+        """
+        Save the entry at index idx as an extxyz file "<name>.xyz" inside path
+        (path defaults to the current working directory, name to at.info["name"]).
+        """
+        at = self.get_ase_atoms(idx, ext=True)
+        if name is None:  # only fall back to the stored name when the caller gave none
+            name = at.info["name"]
+        target_dir = os.getcwd() if path is None else path
+        write_extxyz(p_join(target_dir, f"{name}.xyz"), at)
+
+    def get_ase_atoms(self, idx: int, ext=True):
+        """
+        Get the ASE atoms object for the entry at index idx.
+
+        Parameters
+        ----------
+        idx : int
+            Index of the entry.
+        ext : bool, optional
+            Whether to include additional informations
+        """
+        entry = self[idx]
+        # _ = entry.pop("forces")
+        at = dict_to_atoms(entry, ext=ext)
+        return at
+
+    @requires_package("dscribe")
+    @requires_package("datamol")
+    def soap_descriptors(
+        self,
+        n_samples: Optional[Union[List[int], int]] = None,
+        return_idxs: bool = True,
+        progress: bool = True,
+        **soap_kwargs,
+    ) -> Dict[str, np.ndarray]:
+        """
+        Compute the SOAP descriptors for the dataset.
+
+        Parameters
+        ----------
+        n_samples : Optional[Union[List[int],int]], optional
+            Number of samples to use for the computation, by default None. If None, all the dataset is used.
+            If a list of integers is provided, the descriptors are computed for each of the specified idx of samples.
+        return_idxs : bool, optional
+            Whether to return the indices of the samples used, by default True.
+        progress : bool, optional
+            Whether to show a progress bar, by default True.
+        **soap_kwargs : dict
+            Keyword arguments to pass to the SOAP descriptor.
+            By defaut, the following values are used:
+                - r_cut : 5.0
+                - n_max : 8
+                - l_max : 6
+                - average : "inner"
+                - periodic : False
+                - compression : {"mode" : "mu1nu1"}
+
+        Returns
+        -------
+        Dict[str, np.ndarray]
+            Dictionary containing the following keys:
+                - soap : np.ndarray of shape (N, M) containing the SOAP descriptors for the dataset
+                - soap_kwargs : dict containing the keyword arguments used for the SOAP descriptor
+                - idxs : np.ndarray of shape (N,) containing the indices of the samples used
+
+        """
+        import datamol as dm
+        from dscribe.descriptors import SOAP
+
+        if n_samples is None:
+            idxs = list(range(len(self)))
+        elif isinstance(n_samples, int):
+            idxs = np.random.choice(len(self), size=n_samples, replace=False)
+        else:  # list, set, np.ndarray
+            idxs = n_samples
+        datum = {}
+        r_cut = soap_kwargs.pop("r_cut", 5.0)
+        n_max = soap_kwargs.pop("n_max", 8)
+        l_max = soap_kwargs.pop("l_max", 6)
+        average = soap_kwargs.pop("average", "inner")
+        periodic = soap_kwargs.pop("periodic", False)
+        compression = soap_kwargs.pop("compression", {"mode": "mu1nu1"})
+        soap = SOAP(
+            species=self.chemical_species,
+            periodic=periodic,
+            r_cut=r_cut,
+            n_max=n_max,
+            l_max=l_max,
+            average=average,
+            compression=compression,
+        )
+        datum["soap_kwargs"] = {
+            "r_cut": r_cut,
+            "n_max": n_max,
+            "l_max": l_max,
+            "average": average,
+            "compression": compression,
+            "species": self.chemical_species,
+            "periodic": periodic,
+            **soap_kwargs,
+        }
+
+        def wrapper(idx):
+            entry = self.get_ase_atoms(idx, ext=False)
+            return soap.create(entry, centers=entry.positions)
+
+        descr = dm.parallelized(wrapper, idxs, progress=progress, scheduler="threads", n_jobs=-1)
+        datum["soap"] = np.vstack(descr)
+        if return_idxs:
+            datum["idxs"] = idxs
+        return datum
+
+    def __len__(self):
+        return self.data["energies"].shape[0]
+
+    def __smiles_converter__(self, x):
+        """util function to convert string to smiles: useful if the smiles is
+        encoded in a different format than its display format
+        """
+        return x
+
+    def __getitem__(self, idx: int):
+        shift = IsolatedAtomEnergyFactory.max_charge
+        p_start, p_end = self.data["position_idx_range"][idx]
+        input = self.data["atomic_inputs"][p_start:p_end]
+        z, c, positions, energies = (
+            np.array(input[:, 0], dtype=np.int32),
+            np.array(input[:, 1], dtype=np.int32),
+            np.array(input[:, -3:], dtype=np.float32),
+            np.array(self.data["energies"][idx], dtype=np.float32),
+        )
+        name = self.__smiles_converter__(self.data["name"][idx])
+        subset = self.data["subset"][idx]
+
+        if "forces" in self.data:
+            forces = np.array(self.data["forces"][p_start:p_end], dtype=np.float32)
+        else:
+            forces = None
+        return Bunch(
+            positions=positions,
+            atomic_numbers=z,
+            charges=c,
+            e0=self.__isolated_atom_energies__[..., z, c + shift].T,
+            energies=energies,
+            name=name,
+            subset=subset,
+            forces=forces,
+        )
+
+    def __str__(self):
+        return f"{self.__name__}"
+
+    def __repr__(self):
+        return f"{self.__name__}"
+
+    @property
+    def _stats(self):
+        return self.__stats__
+
+    @property
+    def average_n_atoms(self):
+        """
+        Average number of atoms in a molecule in the dataset.
+        """
+        if self.__average_nb_atoms__ is None:
+            raise StatisticsNotAvailableError(self.__name__)
+        return self.__average_nb_atoms__
+
+    def get_statistics(self, normalization: str = "formation", return_none: bool = True):
+        """
+        Get the statistics of the dataset.
+        normalization : str, optional
+            Type of energy, by default "formation", must be one of ["formation", "total", "inter"]
+        return_none : bool, optional
+            Whether to return None if the statistics for the forces are not available, by default True
+            Otherwise, the statistics for the forces are set to 0.0
+        """
+        stats = deepcopy(self._stats)
+        if len(stats) == 0:
+            raise StatisticsNotAvailableError(self.__name__)
+        if normalization not in POSSIBLE_NORMALIZATION:
+            raise NormalizationNotAvailableError(normalization)
+        selected_stats = stats[normalization]
+        if len(self.__force_methods__) == 0 and not return_none:
+            selected_stats.update(
+                {
+                    "forces": {
+                        "mean": np.array([0.0]),
+                        "std": np.array([0.0]),
+                        "components": {
+                            "mean": np.array([[0.0], [0.0], [0.0]]),
+                            "std": np.array([[0.0], [0.0], [0.0]]),
+                            "rms": np.array([[0.0], [0.0], [0.0]]),
+                        },
+                    }
+                }
+            )
+        # cycle trough dict to convert units
+        for key in selected_stats:
+            if key == "forces":
+                for key2 in selected_stats[key]:
+                    if key2 != "components":
+                        selected_stats[key][key2] = self.convert_forces(selected_stats[key][key2])
+                    else:
+                        for key2 in selected_stats[key]["components"]:
+                            selected_stats[key]["components"][key2] = self.convert_forces(
+                                selected_stats[key]["components"][key2]
+                            )
+            else:
+                for key2 in selected_stats[key]:
+                    selected_stats[key][key2] = self.convert_energy(selected_stats[key][key2])
+        return selected_stats
\ No newline at end of file

From 0db476578c9e7f4f9ee2c068eb9926c2df62b891 Mon Sep 17 00:00:00 2001
From: mcneela <daniel.mcneela@valencelabs.com>
Date: Mon, 4 Mar 2024 10:01:58 -0500
Subject: [PATCH 05/47] add interaction __init__ file and revise potential
 __init__ file

---
 src/openqdc/datasets/interaction/__init__.py | 46 ++++++++++++++++++++
 src/openqdc/datasets/potential/__init__.py   | 44 +++++++++----------
 2 files changed, 68 insertions(+), 22 deletions(-)
 create mode 100644 src/openqdc/datasets/interaction/__init__.py

diff --git a/src/openqdc/datasets/interaction/__init__.py b/src/openqdc/datasets/interaction/__init__.py
new file mode 100644
index 0000000..782b972
--- /dev/null
+++ b/src/openqdc/datasets/interaction/__init__.py
@@ -0,0 +1,46 @@
+import importlib
+import os
+from typing import TYPE_CHECKING  # noqa F401
+
+# The below lazy import logic is coming from openff-toolkit:
+# https://github.com/openforcefield/openff-toolkit/blob/b52879569a0344878c40248ceb3bd0f90348076a/openff/toolkit/__init__.py#L44
+
+# Dictionary of objects to lazily import; maps the object's name to its module path
+
+_lazy_imports_obj = {
+    "DES370K": "openqdc.datasets.interaction.des370k",
+}
+
+_lazy_imports_mod = {}
+
+
+def __getattr__(name):
+    """Lazily import objects from _lazy_imports_obj or _lazy_imports_mod
+
+    Note that this method is only called by Python if the name cannot be found
+    in the current module."""
+    obj_mod = _lazy_imports_obj.get(name)
+    if obj_mod is not None:
+        mod = importlib.import_module(obj_mod)
+        return mod.__dict__[name]
+
+    lazy_mod = _lazy_imports_mod.get(name)
+    if lazy_mod is not None:
+        return importlib.import_module(lazy_mod)
+
+    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
+
+
+def __dir__():
+    """Add _lazy_imports_obj and _lazy_imports_mod to dir(<module>)"""
+    keys = (*globals().keys(), *_lazy_imports_obj.keys(), *_lazy_imports_mod.keys())
+    return sorted(keys)
+
+
+if TYPE_CHECKING or os.environ.get("OPENQDC_DISABLE_LAZY_LOADING", "0") == "1":
+
+    from .des370k import DES370K
+
+    __all__ = [
+        "DES370K"
+    ]
diff --git a/src/openqdc/datasets/potential/__init__.py b/src/openqdc/datasets/potential/__init__.py
index d989935..1513c28 100644
--- a/src/openqdc/datasets/potential/__init__.py
+++ b/src/openqdc/datasets/potential/__init__.py
@@ -8,28 +8,28 @@
 # Dictionary of objects to lazily import; maps the object's name to its module path
 
 _lazy_imports_obj = {
-    "ANI1": "openqdc.datasets.ani",
-    "ANI1CCX": "openqdc.datasets.ani",
-    "ANI1X": "openqdc.datasets.ani",
-    "Spice": "openqdc.datasets.spice",
-    "GEOM": "openqdc.datasets.geom",
-    "QMugs": "openqdc.datasets.qmugs",
-    "ISO17": "openqdc.datasets.iso_17",
-    "COMP6": "openqdc.datasets.comp6",
-    "GDML": "openqdc.datasets.gdml",
-    "Molecule3D": "openqdc.datasets.molecule3d",
-    "OrbnetDenali": "openqdc.datasets.orbnet_denali",
-    "SN2RXN": "openqdc.datasets.sn2_rxn",
-    "QM7X": "openqdc.datasets.qm7x",
-    "DESS": "openqdc.datasets.dess",
-    "NablaDFT": "openqdc.datasets.nabladft",
-    "SolvatedPeptides": "openqdc.datasets.solvated_peptides",
-    "WaterClusters": "openqdc.datasets.waterclusters3_30",
-    "TMQM": "openqdc.datasets.tmqm",
-    "Dummy": "openqdc.datasets.dummy",
-    "PCQM_B3LYP": "openqdc.datasets.pcqm",
-    "PCQM_PM6": "openqdc.datasets.pcqm",
-    "Transition1X": "openqdc.datasets.transition1x",
+    "ANI1": "openqdc.datasets.potential.ani",
+    "ANI1CCX": "openqdc.datasets.potential.ani",
+    "ANI1X": "openqdc.datasets.potential.ani",
+    "Spice": "openqdc.datasets.potential.spice",
+    "GEOM": "openqdc.datasets.potential.geom",
+    "QMugs": "openqdc.datasets.potential.qmugs",
+    "ISO17": "openqdc.datasets.potential.iso_17",
+    "COMP6": "openqdc.datasets.potential.comp6",
+    "GDML": "openqdc.datasets.potential.gdml",
+    "Molecule3D": "openqdc.datasets.potential.molecule3d",
+    "OrbnetDenali": "openqdc.datasets.potential.orbnet_denali",
+    "SN2RXN": "openqdc.datasets.potential.sn2_rxn",
+    "QM7X": "openqdc.datasets.potential.qm7x",
+    "DESS": "openqdc.datasets.potential.dess",
+    "NablaDFT": "openqdc.datasets.potential.nabladft",
+    "SolvatedPeptides": "openqdc.datasets.potential.solvated_peptides",
+    "WaterClusters": "openqdc.datasets.potential.waterclusters3_30",
+    "TMQM": "openqdc.datasets.potential.tmqm",
+    "Dummy": "openqdc.datasets.potential.dummy",
+    "PCQM_B3LYP": "openqdc.datasets.potential.pcqm",
+    "PCQM_PM6": "openqdc.datasets.potential.pcqm",
+    "Transition1X": "openqdc.datasets.potential.transition1x",
 }
 
 _lazy_imports_mod = {}

From 6e5a00287c8ea3f7fe1276c27a5cd0cc409e9b5c Mon Sep 17 00:00:00 2001
From: mcneela <daniel.mcneela@valencelabs.com>
Date: Mon, 4 Mar 2024 10:02:53 -0500
Subject: [PATCH 06/47] add des370k interaction to config_factory.py

---
 src/openqdc/raws/config_factory.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/src/openqdc/raws/config_factory.py b/src/openqdc/raws/config_factory.py
index c8dddba..df54307 100644
--- a/src/openqdc/raws/config_factory.py
+++ b/src/openqdc/raws/config_factory.py
@@ -90,6 +90,13 @@ class DataConfigFactory:
         },
     )
 
+    des370k_interaction = dict(
+        dataset_name="des370k_interaction",
+        links={
+            "DES370K.zip": "https://zenodo.org/record/5676266/files/DES370K.zip",
+        }
+    )
+
     tmqm = dict(
         dataset_name="tmqm",
         links={

From 8e1e0031f08462e5ac254f2bfbe07981758ac893 Mon Sep 17 00:00:00 2001
From: mcneela <daniel.mcneela@valencelabs.com>
Date: Mon, 4 Mar 2024 10:18:29 -0500
Subject: [PATCH 07/47] have BaseInteractionDataset inherit BaseDataset

---
 src/openqdc/datasets/interaction/base.py | 538 +----------------------
 1 file changed, 8 insertions(+), 530 deletions(-)

diff --git a/src/openqdc/datasets/interaction/base.py b/src/openqdc/datasets/interaction/base.py
index de43887..cb7c812 100644
--- a/src/openqdc/datasets/interaction/base.py
+++ b/src/openqdc/datasets/interaction/base.py
@@ -9,25 +9,13 @@
     push_remote,
     set_cache_dir,
 )
+from openqdc.datasets.potential.base import BaseDataset
 
 from loguru import logger
 
-class BaseInteractionDataset:
-    __energy_methods__ = []
-    __force_methods__ = []
-    energy_target_names = []
-    force_target_names = []
-    __isolated_atom_energies__ = []
-
-    __energy_unit__ = "hartree"
-    __distance_unit__ = "ang"
-    __forces_unit__ = "hartree/ang"
-    __fn_energy__ = lambda x: x
-    __fn_distance__ = lambda x: x
-    __fn_forces__ = lambda x: x
-    __average_nb_atoms__ = None
-    __stats__ = {}
+import numpy as np
 
+class BaseInteractionDataset(BaseDataset):
     def __init__(
         self,
         energy_unit: Optional[str] = None,
@@ -35,519 +23,9 @@ def __init__(
         overwrite_local_cache: bool = False,
         cache_dir: Optional[str] = None,
     ) -> None:
-        set_cache_dir(cache_dir)
-        self.data = None
-        if not self.is_preprocessed():
-            raise DatasetNotAvailableError(self.__name__)
-        else:
-            self.read_preprocess(overwrite_local_cache=overwrite_local_cache)
-        self._post_init(overwrite_local_cache, energy_unit, distance_unit)
-
-    def _post_init(
-        self,
-        overwrite_local_cache: bool = False,
-        energy_unit: Optional[str] = None,
-        distance_unit: Optional[str] = None,
-    ) -> None:
-        self._set_units(None, None)
-        self._set_isolated_atom_energies()
-        self._precompute_statistics(overwrite_local_cache=overwrite_local_cache)
-        self._set_units(energy_unit, distance_unit)
-        self._convert_data()
-        self._set_isolated_atom_energies()
-
-    def _convert_data(self):
-        logger.info(
-            f"Converting {self.__name__} data to the following units:\n\
-                     Energy: {self.energy_unit},\n\
-                     Distance: {self.distance_unit},\n\
-                     Forces: {self.force_unit if self.__force_methods__ else 'None'}"
-        )
-        for key in self.data_keys:
-            self.data[key] = self._convert_on_loading(self.data[key], key)
-
-    def _precompute_statistics(self, overwrite_local_cache: bool = False):
-        local_path = p_join(self.preprocess_path, "stats.pkl")
-        if self.is_preprocessed_statistics() and not overwrite_local_cache:
-            stats = load_pkl(local_path)
-            logger.info("Loaded precomputed statistics")
-        else:
-            logger.info("Precomputing relevant statistics")
-            (
-                inter_E_mean,
-                inter_E_std,
-                formation_E_mean,
-                formation_E_std,
-                total_E_mean,
-                total_E_std,
-            ) = self._precompute_E()
-            forces_dict = self._precompute_F()
-            stats = {
-                "formation": {"energy": {"mean": formation_E_mean, "std": formation_E_std}, "forces": forces_dict},
-                "inter": {"energy": {"mean": inter_E_mean, "std": inter_E_std}, "forces": forces_dict},
-                "total": {"energy": {"mean": total_E_mean, "std": total_E_std}, "forces": forces_dict},
-            }
-            with open(local_path, "wb") as f:
-                pkl.dump(stats, f)
-        self._compute_average_nb_atoms()
-        self.__stats__ = stats
-
-    def _compute_average_nb_atoms(self):
-        self.__average_nb_atoms__ = np.mean(self.data["n_atoms"])
-
-    def _precompute_E(self):
-        splits_idx = self.data["position_idx_range"][:, 1]
-        s = np.array(self.data["atomic_inputs"][:, :2], dtype=int)
-        s[:, 1] += IsolatedAtomEnergyFactory.max_charge
-        matrixs = [matrix[s[:, 0], s[:, 1]] for matrix in self.__isolated_atom_energies__]
-        converted_energy_data = self.data["energies"]
-        # calculation per molecule formation energy statistics
-        E = []
-        for i, matrix in enumerate(matrixs):
-            c = np.cumsum(np.append([0], matrix))[splits_idx]
-            c[1:] = c[1:] - c[:-1]
-            E.append(converted_energy_data[:, i] - c)
-        E = np.array(E).T
-        inter_E_mean = np.nanmean(E / self.data["n_atoms"][:, None], axis=0)
-        inter_E_std = np.nanstd(E / self.data["n_atoms"][:, None], axis=0)
-        formation_E_mean = np.nanmean(E, axis=0)
-        formation_E_std = np.nanstd(E, axis=0)
-        total_E_mean = np.nanmean(converted_energy_data, axis=0)
-        total_E_std = np.nanstd(converted_energy_data, axis=0)
-
-        return (
-            np.atleast_2d(inter_E_mean),
-            np.atleast_2d(inter_E_std),
-            np.atleast_2d(formation_E_mean),
-            np.atleast_2d(formation_E_std),
-            np.atleast_2d(total_E_mean),
-            np.atleast_2d(total_E_std),
-        )
-
-    def _precompute_F(self):
-        if len(self.__force_methods__) == 0:
-            return NOT_DEFINED
-        converted_force_data = self.convert_forces(self.data["forces"])
-        force_mean = np.nanmean(converted_force_data, axis=0)
-        force_std = np.nanstd(converted_force_data, axis=0)
-        force_rms = np.sqrt(np.nanmean(converted_force_data**2, axis=0))
-        return {
-            "mean": np.atleast_2d(force_mean.mean(axis=0)),
-            "std": np.atleast_2d(force_std.mean(axis=0)),
-            "components": {"rms": force_rms, "std": force_std, "mean": force_mean},
-        }
-
-    @property
-    def numbers(self):
-        if hasattr(self, "_numbers"):
-            return self._numbers
-        self._numbers = pd.unique(self.data["atomic_inputs"][..., 0]).astype(np.int32)
-        return self._numbers
-
-    @property
-    def chemical_species(self):
-        return np.array(chemical_symbols)[self.numbers]
-
-    @property
-    def energy_unit(self):
-        return self.__energy_unit__
-
-    @property
-    def distance_unit(self):
-        return self.__distance_unit__
-
-    @property
-    def force_unit(self):
-        return self.__forces_unit__
-
-    @property
-    def root(self):
-        return p_join(get_local_cache(), self.__name__)
-
-    @property
-    def preprocess_path(self):
-        path = p_join(self.root, "preprocessed")
-        os.makedirs(path, exist_ok=True)
-        return path
-
-    @property
-    def data_keys(self):
-        keys = list(self.data_types.keys())
-        if len(self.__force_methods__) == 0:
-            keys.remove("forces")
-        return keys
-
-    @property
-    def data_types(self):
-        return {
-            "atomic_inputs": np.float32,
-            "position_idx_range": np.int32,
-            "energies": np.float32,
-            "forces": np.float32,
-        }
-
-    @property
-    def data_shapes(self):
-        return {
-            "atomic_inputs": (-1, NB_ATOMIC_FEATURES),
-            "position_idx_range": (-1, 2),
-            "energies": (-1, len(self.energy_target_names)),
-            "forces": (-1, 3, len(self.force_target_names)),
-        }
-
-    @property
-    def atoms_per_molecules(self):
-        try:
-            if hasattr(self, "_n_atoms"):
-                return self._n_atoms
-            self._n_atoms = self.data["n_atoms"]
-            return self._n_atoms
-        except:  # noqa
-            return None
-
-    def _set_units(self, en, ds):
-        old_en, old_ds = self.energy_unit, self.distance_unit
-        en = en if en is not None else old_en
-        ds = ds if ds is not None else old_ds
-
-        # if en is None:
-        self.set_energy_unit(en)
-        # if ds is not None:
-        self.set_distance_unit(ds)
-        if self.__force_methods__:
-            self.__forces_unit__ = self.energy_unit + "/" + self.distance_unit
-            self.__class__.__fn_forces__ = get_conversion(old_en + "/" + old_ds, self.__forces_unit__)
-
-    def _set_isolated_atom_energies(self):
-        if self.__energy_methods__ is None:
-            logger.error("No energy methods defined for this dataset.")
-        f = get_conversion("hartree", self.__energy_unit__)
-        self.__isolated_atom_energies__ = f(
-            np.array([IsolatedAtomEnergyFactory.get_matrix(en_method) for en_method in self.__energy_methods__])
-        )
-
-    def convert_energy(self, x):
-        return self.__class__.__fn_energy__(x)
-
-    def convert_distance(self, x):
-        return self.__class__.__fn_distance__(x)
-
-    def convert_forces(self, x):
-        return self.__class__.__fn_forces__(x)
-
-    def set_energy_unit(self, value: str):
-        """
-        Set a new energy unit for the dataset.
-        """
-        old_unit = self.energy_unit
-        self.__energy_unit__ = value
-        self.__class__.__fn_energy__ = get_conversion(old_unit, value)
-
-    def set_distance_unit(self, value: str):
-        """
-        Set a new distance unit for the dataset.
-        """
-        old_unit = self.distance_unit
-        self.__distance_unit__ = value
-        self.__class__.__fn_distance__ = get_conversion(old_unit, value)
-
-    def read_raw_entries(self):
-        raise NotImplementedError
-
-    def collate_list(self, list_entries):
-        # concatenate entries
-        res = {key: np.concatenate([r[key] for r in list_entries if r is not None], axis=0) for key in list_entries[0]}
-
-        csum = np.cumsum(res.get("n_atoms"))
-        x = np.zeros((csum.shape[0], 2), dtype=np.int32)
-        x[1:, 0], x[:, 1] = csum[:-1], csum
-        res["position_idx_range"] = x
-
-        return res
-
-    def save_preprocess(self, data_dict):
-        # save memmaps
-        logger.info("Preprocessing data and saving it to cache.")
-        for key in self.data_keys:
-            local_path = p_join(self.preprocess_path, f"{key}.mmap")
-            out = np.memmap(local_path, mode="w+", dtype=data_dict[key].dtype, shape=data_dict[key].shape)
-            out[:] = data_dict.pop(key)[:]
-            out.flush()
-            push_remote(local_path, overwrite=True)
-
-        # save smiles and subset
-        local_path = p_join(self.preprocess_path, "props.pkl")
-        for key in ["name", "subset"]:
-            data_dict[key] = np.unique(data_dict[key], return_inverse=True)
-
-        with open(local_path, "wb") as f:
-            pkl.dump(data_dict, f)
-        push_remote(local_path, overwrite=True)
-
-    def _convert_on_loading(self, x, key):
-        if key == "energies":
-            return self.convert_energy(x)
-        elif key == "forces":
-            return self.convert_forces(x)
-        elif key == "atomic_inputs":
-            x = np.array(x, dtype=np.float32)
-            x[:, -3:] = self.convert_distance(x[:, -3:])
-            return x
-        else:
-            return x
-
-    def read_preprocess(self, overwrite_local_cache=False):
-        logger.info("Reading preprocessed data")
-        logger.info(
-            f"{self.__name__} data with the following units:\n\
-                     Energy: {self.energy_unit},\n\
-                     Distance: {self.distance_unit},\n\
-                     Forces: {self.force_unit if self.__force_methods__ else 'None'}"
-        )
-        self.data = {}
-        for key in self.data_keys:
-            filename = p_join(self.preprocess_path, f"{key}.mmap")
-            pull_locally(filename, overwrite=overwrite_local_cache)
-            self.data[key] = np.memmap(filename, mode="r", dtype=self.data_types[key]).reshape(self.data_shapes[key])
-
-        filename = p_join(self.preprocess_path, "props.pkl")
-        pull_locally(filename, overwrite=overwrite_local_cache)
-        with open(filename, "rb") as f:
-            tmp = pkl.load(f)
-            for key in ["name", "subset", "n_atoms"]:
-                x = tmp.pop(key)
-                if len(x) == 2:
-                    self.data[key] = x[0][x[1]]
-                else:
-                    self.data[key] = x
-
-        for key in self.data:
-            logger.info(f"Loaded {key} with shape {self.data[key].shape}, dtype {self.data[key].dtype}")
-
-    def is_preprocessed(self):
-        predicats = [copy_exists(p_join(self.preprocess_path, f"{key}.mmap")) for key in self.data_keys]
-        predicats += [copy_exists(p_join(self.preprocess_path, "props.pkl"))]
-        return all(predicats)
-
-    def is_preprocessed_statistics(self):
-        return bool(copy_exists(p_join(self.preprocess_path, "stats.pkl")))
-
-    def preprocess(self, overwrite=False):
-        if overwrite or not self.is_preprocessed():
-            entries = self.read_raw_entries()
-            res = self.collate_list(entries)
-            self.save_preprocess(res)
-
-    def save_xyz(self, idx: int, path: Optional[str] = None, name=None):
-        """
-        Save the entry at index idx as an extxyz file.
-        """
-        if path is None:
-            path = os.getcwd()
-        at = self.get_ase_atoms(idx, ext=True)
-        if name is not None:
-            name = at.info["name"]
-        write_extxyz(p_join(path, f"{name}.xyz"), at)
-
-    def get_ase_atoms(self, idx: int, ext=True):
-        """
-        Get the ASE atoms object for the entry at index idx.
-
-        Parameters
-        ----------
-        idx : int
-            Index of the entry.
-        ext : bool, optional
-            Whether to include additional informations
-        """
-        entry = self[idx]
-        # _ = entry.pop("forces")
-        at = dict_to_atoms(entry, ext=ext)
-        return at
-
-    @requires_package("dscribe")
-    @requires_package("datamol")
-    def soap_descriptors(
-        self,
-        n_samples: Optional[Union[List[int], int]] = None,
-        return_idxs: bool = True,
-        progress: bool = True,
-        **soap_kwargs,
-    ) -> Dict[str, np.ndarray]:
-        """
-        Compute the SOAP descriptors for the dataset.
-
-        Parameters
-        ----------
-        n_samples : Optional[Union[List[int],int]], optional
-            Number of samples to use for the computation, by default None. If None, all the dataset is used.
-            If a list of integers is provided, the descriptors are computed for each of the specified idx of samples.
-        return_idxs : bool, optional
-            Whether to return the indices of the samples used, by default True.
-        progress : bool, optional
-            Whether to show a progress bar, by default True.
-        **soap_kwargs : dict
-            Keyword arguments to pass to the SOAP descriptor.
-            By defaut, the following values are used:
-                - r_cut : 5.0
-                - n_max : 8
-                - l_max : 6
-                - average : "inner"
-                - periodic : False
-                - compression : {"mode" : "mu1nu1"}
-
-        Returns
-        -------
-        Dict[str, np.ndarray]
-            Dictionary containing the following keys:
-                - soap : np.ndarray of shape (N, M) containing the SOAP descriptors for the dataset
-                - soap_kwargs : dict containing the keyword arguments used for the SOAP descriptor
-                - idxs : np.ndarray of shape (N,) containing the indices of the samples used
-
-        """
-        import datamol as dm
-        from dscribe.descriptors import SOAP
-
-        if n_samples is None:
-            idxs = list(range(len(self)))
-        elif isinstance(n_samples, int):
-            idxs = np.random.choice(len(self), size=n_samples, replace=False)
-        else:  # list, set, np.ndarray
-            idxs = n_samples
-        datum = {}
-        r_cut = soap_kwargs.pop("r_cut", 5.0)
-        n_max = soap_kwargs.pop("n_max", 8)
-        l_max = soap_kwargs.pop("l_max", 6)
-        average = soap_kwargs.pop("average", "inner")
-        periodic = soap_kwargs.pop("periodic", False)
-        compression = soap_kwargs.pop("compression", {"mode": "mu1nu1"})
-        soap = SOAP(
-            species=self.chemical_species,
-            periodic=periodic,
-            r_cut=r_cut,
-            n_max=n_max,
-            l_max=l_max,
-            average=average,
-            compression=compression,
-        )
-        datum["soap_kwargs"] = {
-            "r_cut": r_cut,
-            "n_max": n_max,
-            "l_max": l_max,
-            "average": average,
-            "compression": compression,
-            "species": self.chemical_species,
-            "periodic": periodic,
-            **soap_kwargs,
-        }
-
-        def wrapper(idx):
-            entry = self.get_ase_atoms(idx, ext=False)
-            return soap.create(entry, centers=entry.positions)
-
-        descr = dm.parallelized(wrapper, idxs, progress=progress, scheduler="threads", n_jobs=-1)
-        datum["soap"] = np.vstack(descr)
-        if return_idxs:
-            datum["idxs"] = idxs
-        return datum
-
-    def __len__(self):
-        return self.data["energies"].shape[0]
-
-    def __smiles_converter__(self, x):
-        """util function to convert string to smiles: useful if the smiles is
-        encoded in a different format than its display format
-        """
-        return x
-
-    def __getitem__(self, idx: int):
-        shift = IsolatedAtomEnergyFactory.max_charge
-        p_start, p_end = self.data["position_idx_range"][idx]
-        input = self.data["atomic_inputs"][p_start:p_end]
-        z, c, positions, energies = (
-            np.array(input[:, 0], dtype=np.int32),
-            np.array(input[:, 1], dtype=np.int32),
-            np.array(input[:, -3:], dtype=np.float32),
-            np.array(self.data["energies"][idx], dtype=np.float32),
+        super().__init__(
+            energy_unit=energy_unit,
+            distance_unit=distance_unit,
+            overwrite_local_cache=overwrite_local_cache,
+            cache_dir=cache_dir
         )
-        name = self.__smiles_converter__(self.data["name"][idx])
-        subset = self.data["subset"][idx]
-
-        if "forces" in self.data:
-            forces = np.array(self.data["forces"][p_start:p_end], dtype=np.float32)
-        else:
-            forces = None
-        return Bunch(
-            positions=positions,
-            atomic_numbers=z,
-            charges=c,
-            e0=self.__isolated_atom_energies__[..., z, c + shift].T,
-            energies=energies,
-            name=name,
-            subset=subset,
-            forces=forces,
-        )
-
-    def __str__(self):
-        return f"{self.__name__}"
-
-    def __repr__(self):
-        return f"{self.__name__}"
-
-    @property
-    def _stats(self):
-        return self.__stats__
-
-    @property
-    def average_n_atoms(self):
-        """
-        Average number of atoms in a molecule in the dataset.
-        """
-        if self.__average_nb_atoms__ is None:
-            raise StatisticsNotAvailableError(self.__name__)
-        return self.__average_nb_atoms__
-
-    def get_statistics(self, normalization: str = "formation", return_none: bool = True):
-        """
-        Get the statistics of the dataset.
-        normalization : str, optional
-            Type of energy, by default "formation", must be one of ["formation", "total", "inter"]
-        return_none : bool, optional
-            Whether to return None if the statistics for the forces are not available, by default True
-            Otherwise, the statistics for the forces are set to 0.0
-        """
-        stats = deepcopy(self._stats)
-        if len(stats) == 0:
-            raise StatisticsNotAvailableError(self.__name__)
-        if normalization not in POSSIBLE_NORMALIZATION:
-            raise NormalizationNotAvailableError(normalization)
-        selected_stats = stats[normalization]
-        if len(self.__force_methods__) == 0 and not return_none:
-            selected_stats.update(
-                {
-                    "forces": {
-                        "mean": np.array([0.0]),
-                        "std": np.array([0.0]),
-                        "components": {
-                            "mean": np.array([[0.0], [0.0], [0.0]]),
-                            "std": np.array([[0.0], [0.0], [0.0]]),
-                            "rms": np.array([[0.0], [0.0], [0.0]]),
-                        },
-                    }
-                }
-            )
-        # cycle trough dict to convert units
-        for key in selected_stats:
-            if key == "forces":
-                for key2 in selected_stats[key]:
-                    if key2 != "components":
-                        selected_stats[key][key2] = self.convert_forces(selected_stats[key][key2])
-                    else:
-                        for key2 in selected_stats[key]["components"]:
-                            selected_stats[key]["components"][key2] = self.convert_forces(
-                                selected_stats[key]["components"][key2]
-                            )
-            else:
-                for key2 in selected_stats[key]:
-                    selected_stats[key][key2] = self.convert_energy(selected_stats[key][key2])
-        return selected_stats
\ No newline at end of file

From d68bae6529c1343adb719f03a83d7fa7e9b4003e Mon Sep 17 00:00:00 2001
From: mcneela <daniel.mcneela@valencelabs.com>
Date: Mon, 4 Mar 2024 12:08:48 -0500
Subject: [PATCH 08/47] implemented read_raw_entries for DES370K

---
 src/openqdc/datasets/interaction/des370k.py | 169 +++++++++-----------
 1 file changed, 73 insertions(+), 96 deletions(-)

diff --git a/src/openqdc/datasets/interaction/des370k.py b/src/openqdc/datasets/interaction/des370k.py
index 127ec89..4c79fcc 100644
--- a/src/openqdc/datasets/interaction/des370k.py
+++ b/src/openqdc/datasets/interaction/des370k.py
@@ -1,11 +1,12 @@
-import torch
+import os
+import numpy as np
 import pandas as pd
 
+from tqdm import tqdm
 from typing import Dict, List
-from collections import defaultdict, Counter
 
 from loguru import logger
-from torch.utils.data import Dataset
+from openqdc.datasets.interaction import BaseInteractionDataset
 
 class Dimer:
     def __init__(
@@ -16,7 +17,7 @@ def __init__(
         charge_1: int,
         n_atoms_0: int,
         n_atoms_1: int,
-        pos: torch.Tensor,
+        pos: np.array,
         sapt_energies: List[float],
     ) -> None:
         self.smiles_0 = smiles_0
@@ -47,99 +48,75 @@ def __repr__(self) -> str:
         return str(self)
 
 
-class DES370K(Dataset):
-    def __init__(self, filepath="data/des370k.csv") -> None:
-        self.filepath = filepath
-        self.df = pd.read_csv(filepath)
-        self._atom_types = defaultdict(int)
-        self.smiles = set()
-        self.data = []
-        self._preprocess()
-    
-    def _preprocess(self) -> None:
+class DES370K(BaseInteractionDataset):
+    __name__ = "des370k_interaction"
+    __energy_unit__ = "hartree"
+    __distance_unit__ = "ang"
+    __forces_unit__ = "hartree/ang"
+    __energy_methods__ = [
+        "mp2/cc-pvdz",
+        "mp2/cc-pvqz",
+        "mp2/cc-pvtz",
+        "mp2/cbs",
+        "ccsd(t)/cc-pvdz",
+        "ccsd(t)/cbs",  # cbs
+        "ccsd(t)/nn",  # nn
+        "sapt0/aug-cc-pwcvxz",
+        "sapt0/aug-cc-pwcvxz",
+        "sapt0/aug-cc-pwcvxz",
+        "sapt0/aug-cc-pwcvxz",
+        "sapt0/aug-cc-pwcvxz",
+        "sapt0/aug-cc-pwcvxz",
+        "sapt0/aug-cc-pwcvxz",
+        "sapt0/aug-cc-pwcvxz",
+        "sapt0/aug-cc-pwcvxz",
+        "sapt0/aug-cc-pwcvxz",
+    ]
+
+    energy_target_names = [
+        "cc_MP2_all",
+        "qz_MP2_all",
+        "tz_MP2_all",
+        "cbs_MP2_all",
+        "cc_CCSD(T)_all",
+        "cbs_CCSD(T)_all",
+        "nn_CCSD(T)_all",
+        "sapt_all",
+        "sapt_es",
+        "sapt_ex",
+        "sapt_exs2",
+        "sapt_ind",
+        "sapt_exind",
+        "sapt_disp",
+        "sapt_exdisp_os",
+        "sapt_exdisp_ss",
+        "sapt_delta_HF",
+    ]
+
+    def read_raw_entries(self) -> List[Dict]:
+        self.filepath = os.path.join(self.root, "DES370K.csv")
         logger.info(f"Reading data from {self.filepath}")
-        for idx, row in self.df.iterrows():
+        df = pd.read_csv(self.filepath)
+        data = []
+        for idx, row in tqdm(df.iterrows(), total=df.shape[0]):
             smiles0, smiles1 = row["smiles0"], row["smiles1"]
-            charge0, charge1 = row["charge0"], row["charge1"]
             natoms0, natoms1 = row["natoms0"], row["natoms1"]
-            pos = torch.tensor(list(map(float, row["xyz"].split()))).view(-1, 3)
-            sapt_energies = [row[col] for col in self.df.columns if "sapt" in col]
-            dimer = Dimer(
-                smiles0, smiles1,
-                charge0, charge1,
-                natoms0, natoms1,
-                pos, sapt_energies
+            pos = np.array(list(map(float, row["xyz"].split()))).reshape(-1, 3)
+            pos0 = pos[:natoms0]
+            pos1 = pos[natoms0:]
+            # sapt_components = {col: row[col] for col in df.columns if "sapt" in col}
+            item = dict(
+                mol0=dict(
+                    smiles=smiles0,
+                    atomic_inputs=pos0,
+                    n_atoms=natoms0,
+                ),
+                mol1=dict(
+                    smiles=smiles1,
+                    atomic_inputs=pos1,
+                    n_atoms=natoms1,
+                ),
+                targets=row[self.energy_target_names].values,
             )
-            self.data.append(dimer)
-
-            # keep track of unique smiles strings
-            self.smiles.add(smiles0)
-            self.smiles.add(smiles1)
-
-            # get atom types
-            elems = row["elements"].split()
-            counts = Counter(set(elems))
-            for key in counts:
-                self._atom_types[key] += counts[key]
-
-        # convert defaultdict to regular dict
-        self._atom_types = dict(self._atom_types)
-
-    def __str__(self) -> str:
-        return f"DES370K(n_atoms={self.num_atoms},\
-               n_molecules={self.num_molecules},\
-               atom_types={self.species})"
-
-    def __repr__(self) -> str:
-        return str(self)
-
-    @property
-    def atom_types(self) -> Dict[str, int]:
-        """
-        Returns a dictionary of 
-        (element, count) pairs.
-        """
-        return self._atom_types
-
-    @property
-    def num_dimers(self) -> int:
-        """
-        Returns the number of 
-        dimers in the dataset.
-        """
-        return len(self.data)
-
-    @property
-    def num_unique_molecules(self) -> int:
-        """
-        Returns the number of unique
-        molecules in the dataset.
-        """
-        return len(self.smiles)
-
-    @property
-    def num_atoms(self) -> int:
-        """
-        Returns the total number of atoms in 
-        the dataset.
-        """
-        if not hasattr(self, "_num_atoms"):
-            self._num_atoms = sum(self.atom_types.values())
-        return self._num_atoms 
-
-    @property
-    def species(self) -> List[str]:
-        """
-        Returns a list of the unique atom
-        species contained in the dataset.
-        """
-        if not hasattr(self, "_species"):
-            self._species = list(self.atom_types.keys())
-        return self._species
-
-    def atom_count(self, element: str) -> int:
-        """
-        Returns the count of a given
-        element in the dataset.
-        """
-        return self.atom_types[element]
+            data.append(item)
+        return data

From 5e94d6727236dd3dd2737a516dc79b6dafbbdf27 Mon Sep 17 00:00:00 2001
From: mcneela <daniel.mcneela@valencelabs.com>
Date: Mon, 4 Mar 2024 14:04:40 -0500
Subject: [PATCH 09/47] finished implementation of DES370K interaction

---
 src/openqdc/datasets/interaction/__init__.py |  6 +-
 src/openqdc/datasets/interaction/base.py     | 12 ++++
 src/openqdc/datasets/interaction/des370k.py  | 76 +++++++++-----------
 src/openqdc/datasets/potential/base.py       |  5 ++
 src/openqdc/datasets/potential/dess.py       |  3 +-
 src/openqdc/datasets/potential/tmqm.py       |  2 +-
 6 files changed, 55 insertions(+), 49 deletions(-)

diff --git a/src/openqdc/datasets/interaction/__init__.py b/src/openqdc/datasets/interaction/__init__.py
index 782b972..62392e3 100644
--- a/src/openqdc/datasets/interaction/__init__.py
+++ b/src/openqdc/datasets/interaction/__init__.py
@@ -8,6 +8,7 @@
 # Dictionary of objects to lazily import; maps the object's name to its module path
 
 _lazy_imports_obj = {
+    "BaseInteractionDataset": "openqdc.datasets.interaction.base",
     "DES370K": "openqdc.datasets.interaction.des370k",
 }
 
@@ -38,9 +39,10 @@ def __dir__():
 
 
 if TYPE_CHECKING or os.environ.get("OPENQDC_DISABLE_LAZY_LOADING", "0") == "1":
-
+    from .base import BaseInteractionDataset
     from .des370k import DES370K
 
     __all__ = [
-        "DES370K"
+        "BaseInteractionDataset",
+        "DES370K",
     ]
diff --git a/src/openqdc/datasets/interaction/base.py b/src/openqdc/datasets/interaction/base.py
index cb7c812..e266658 100644
--- a/src/openqdc/datasets/interaction/base.py
+++ b/src/openqdc/datasets/interaction/base.py
@@ -29,3 +29,15 @@ def __init__(
             overwrite_local_cache=overwrite_local_cache,
             cache_dir=cache_dir
         )
+
+    def collate_list(self, list_entries: List[Dict]):
+        # concatenate entries
+        res = {key: np.concatenate([r[key] for r in list_entries if r is not None], axis=0) \
+               for key in list_entries[0] if not isinstance(list_entries[0][key], dict)}
+
+        csum = np.cumsum(res.get("n_atoms"))
+        x = np.zeros((csum.shape[0], 2), dtype=np.int32)
+        x[1:, 0], x[:, 1] = csum[:-1], csum
+        res["position_idx_range"] = x
+
+        return res
diff --git a/src/openqdc/datasets/interaction/des370k.py b/src/openqdc/datasets/interaction/des370k.py
index 4c79fcc..da6a9b9 100644
--- a/src/openqdc/datasets/interaction/des370k.py
+++ b/src/openqdc/datasets/interaction/des370k.py
@@ -2,50 +2,12 @@
 import numpy as np
 import pandas as pd
 
-from tqdm import tqdm
 from typing import Dict, List
 
+from tqdm import tqdm
 from loguru import logger
 from openqdc.datasets.interaction import BaseInteractionDataset
-
-class Dimer:
-    def __init__(
-        self,
-        smiles_0: str,
-        smiles_1: str,
-        charge_0: int,
-        charge_1: int,
-        n_atoms_0: int,
-        n_atoms_1: int,
-        pos: np.array,
-        sapt_energies: List[float],
-    ) -> None:
-        self.smiles_0 = smiles_0
-        self.smiles_1 = smiles_1
-        self.charge_1 = charge_0
-        self.charge_1 = charge_1
-        self.n_atoms_0 = n_atoms_0
-        self.n_atoms_1 = n_atoms_1
-        self.pos = pos
-        self.sapt_energies = sapt_energies
-        (
-            self.sapt_es,
-            self.sapt_ex,
-            self.sapt_exs2,
-            self.sapt_ind,
-            self.sapt_exind,
-            self.sapt_disp,
-            self.sapt_exdisp_os,
-            self.sapt_exdisp_ss,
-            self.sapt_delta_HF,
-            self.sapt_all
-        ) = tuple(sapt_energies)
-
-    def __str__(self) -> str:
-        return f"Dimer(smiles_0='{self.smiles_0}', smiles_1='{self.smiles_1}')"
-
-    def __repr__(self) -> str:
-        return str(self)
+from openqdc.utils.molecule import atom_table
 
 
 class DES370K(BaseInteractionDataset):
@@ -95,28 +57,54 @@ class DES370K(BaseInteractionDataset):
 
     def read_raw_entries(self) -> List[Dict]:
         self.filepath = os.path.join(self.root, "DES370K.csv")
-        logger.info(f"Reading data from {self.filepath}")
+        logger.info(f"Reading DES370K interaction data from {self.filepath}")
         df = pd.read_csv(self.filepath)
         data = []
         for idx, row in tqdm(df.iterrows(), total=df.shape[0]):
             smiles0, smiles1 = row["smiles0"], row["smiles1"]
+            charge0, charge1 = row["charge0"], row["charge1"]
             natoms0, natoms1 = row["natoms0"], row["natoms1"]
             pos = np.array(list(map(float, row["xyz"].split()))).reshape(-1, 3)
             pos0 = pos[:natoms0]
             pos1 = pos[natoms0:]
-            # sapt_components = {col: row[col] for col in df.columns if "sapt" in col}
+            
+            elements = row["elements"].split()
+            elements0 = np.array(elements[:natoms0])
+            elements1 = np.array(elements[natoms0:])
+
+            atomic_nums = np.expand_dims(np.array([atom_table.GetAtomicNumber(x) for x in elements]), axis=1)
+            atomic_nums0 = np.array(atomic_nums[:natoms0])
+            atomic_nums1 = np.array(atomic_nums[natoms0:])
+
+            charges = np.expand_dims(np.array([charge0] * natoms0 + [charge1] * natoms1), axis=1)
+
+            atomic_inputs = np.concatenate((atomic_nums, charges, pos), axis=-1, dtype=np.float32)
+            atomic_inputs0 = atomic_inputs[:natoms0, :]
+            atomic_inputs1 = atomic_inputs[natoms0:, :]
+
             item = dict(
                 mol0=dict(
                     smiles=smiles0,
-                    atomic_inputs=pos0,
+                    atomic_inputs=atomic_inputs0,
                     n_atoms=natoms0,
+                    charge=charge0,
+                    elements=elements0,
+                    atomic_nums=atomic_nums0,
+                    pos=pos0,
                 ),
                 mol1=dict(
                     smiles=smiles1,
-                    atomic_inputs=pos1,
+                    atomic_inputs=atomic_inputs1,
                     n_atoms=natoms1,
+                    charge=charge1,
+                    elements=elements1,
+                    atomic_nums=atomic_nums1,
+                    pos=pos1,
                 ),
                 targets=row[self.energy_target_names].values,
+                subset=np.array(["DES370K"]),
+                n_atoms=np.array([natoms0 + natoms1]),
+                atomic_inputs=atomic_inputs,
             )
             data.append(item)
         return data
diff --git a/src/openqdc/datasets/potential/base.py b/src/openqdc/datasets/potential/base.py
index adf9ae6..447985a 100644
--- a/src/openqdc/datasets/potential/base.py
+++ b/src/openqdc/datasets/potential/base.py
@@ -125,6 +125,10 @@ def _post_init(
         self._convert_data()
         self._set_isolated_atom_energies()
 
+    @classmethod
+    def no_init(cls):
+        return cls.__new__(cls)
+
     def _convert_data(self):
         logger.info(
             f"Converting {self.__name__} data to the following units:\n\
@@ -325,6 +329,7 @@ def read_raw_entries(self):
 
     def collate_list(self, list_entries):
         # concatenate entries
+        logger.info(f"list entries: {type(list_entries)}")
         res = {key: np.concatenate([r[key] for r in list_entries if r is not None], axis=0) for key in list_entries[0]}
 
         csum = np.cumsum(res.get("n_atoms"))
diff --git a/src/openqdc/datasets/potential/dess.py b/src/openqdc/datasets/potential/dess.py
index 80b1e1c..fd36e07 100644
--- a/src/openqdc/datasets/potential/dess.py
+++ b/src/openqdc/datasets/potential/dess.py
@@ -5,7 +5,7 @@
 import pandas as pd
 from tqdm import tqdm
 
-from openqdc.datasets.base import BaseDataset
+from openqdc.datasets.potential.base import BaseDataset
 from openqdc.utils.molecule import get_atomic_number_and_charge
 
 
@@ -58,7 +58,6 @@ class DESS(BaseDataset):
         "nn_CCSD(T)_all",
         "sapt_all",
     ]
-    # ['qz_MP2_all', 'tz_MP2_all', 'cbs_MP2_all', 'sapt_all', 'nn_CCSD(T)_all']
 
     partitions = ["DES370K", "DES5M"]
 
diff --git a/src/openqdc/datasets/potential/tmqm.py b/src/openqdc/datasets/potential/tmqm.py
index 8952aaa..b613929 100644
--- a/src/openqdc/datasets/potential/tmqm.py
+++ b/src/openqdc/datasets/potential/tmqm.py
@@ -5,7 +5,7 @@
 import pandas as pd
 from tqdm import tqdm
 
-from openqdc.datasets.base import BaseDataset
+from openqdc.datasets.potential.base import BaseDataset
 from openqdc.utils.molecule import atom_table
 
 

From 3c9508beb6e6d73e0832a553e9456baed16db5ec Mon Sep 17 00:00:00 2001
From: mcneela <daniel.mcneela@valencelabs.com>
Date: Mon, 4 Mar 2024 14:16:43 -0500
Subject: [PATCH 10/47] stored DES370K energies as float32 array, added name entry

---
 src/openqdc/datasets/interaction/des370k.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/src/openqdc/datasets/interaction/des370k.py b/src/openqdc/datasets/interaction/des370k.py
index da6a9b9..824da46 100644
--- a/src/openqdc/datasets/interaction/des370k.py
+++ b/src/openqdc/datasets/interaction/des370k.py
@@ -82,6 +82,10 @@ def read_raw_entries(self) -> List[Dict]:
             atomic_inputs0 = atomic_inputs[:natoms0, :]
             atomic_inputs1 = atomic_inputs[natoms0:, :]
 
+            energies = np.array(row[self.energy_target_names].values).astype(np.float32)[None, :]
+
+            name = np.array([smiles0 + "." + smiles1])
+
             item = dict(
                 mol0=dict(
                     smiles=smiles0,
@@ -101,10 +105,11 @@ def read_raw_entries(self) -> List[Dict]:
                     atomic_nums=atomic_nums1,
                     pos=pos1,
                 ),
-                targets=row[self.energy_target_names].values,
+                energies=energies,
                 subset=np.array(["DES370K"]),
-                n_atoms=np.array([natoms0 + natoms1]),
+                n_atoms=np.array([natoms0 + natoms1], dtype=np.int32),
                 atomic_inputs=atomic_inputs,
+                name=name,
             )
             data.append(item)
         return data

From 768fb2e39ebd56b30bac5b9b68df4dac51015c6d Mon Sep 17 00:00:00 2001
From: mcneela <daniel.mcneela@valencelabs.com>
Date: Mon, 4 Mar 2024 14:23:47 -0500
Subject: [PATCH 11/47] update BaseDataset import path

---
 src/openqdc/datasets/potential/comp6.py             | 2 +-
 src/openqdc/datasets/potential/dummy.py             | 2 +-
 src/openqdc/datasets/potential/gdml.py              | 2 +-
 src/openqdc/datasets/potential/geom.py              | 2 +-
 src/openqdc/datasets/potential/iso_17.py            | 2 +-
 src/openqdc/datasets/potential/molecule3d.py        | 2 +-
 src/openqdc/datasets/potential/nabladft.py          | 2 +-
 src/openqdc/datasets/potential/orbnet_denali.py     | 2 +-
 src/openqdc/datasets/potential/pcqm.py              | 2 +-
 src/openqdc/datasets/potential/qm7x.py              | 2 +-
 src/openqdc/datasets/potential/qmugs.py             | 2 +-
 src/openqdc/datasets/potential/sn2_rxn.py           | 2 +-
 src/openqdc/datasets/potential/solvated_peptides.py | 2 +-
 src/openqdc/datasets/potential/spice.py             | 2 +-
 src/openqdc/datasets/potential/transition1x.py      | 2 +-
 src/openqdc/datasets/potential/waterclusters3_30.py | 2 +-
 16 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/src/openqdc/datasets/potential/comp6.py b/src/openqdc/datasets/potential/comp6.py
index 7b6890b..ac37094 100644
--- a/src/openqdc/datasets/potential/comp6.py
+++ b/src/openqdc/datasets/potential/comp6.py
@@ -1,6 +1,6 @@
 from os.path import join as p_join
 
-from openqdc.datasets.base import BaseDataset, read_qc_archive_h5
+from openqdc.datasets.potential.base import BaseDataset, read_qc_archive_h5
 
 
 class COMP6(BaseDataset):
diff --git a/src/openqdc/datasets/potential/dummy.py b/src/openqdc/datasets/potential/dummy.py
index c87e03d..b4bd2e8 100644
--- a/src/openqdc/datasets/potential/dummy.py
+++ b/src/openqdc/datasets/potential/dummy.py
@@ -2,7 +2,7 @@
 from numpy import array
 from sklearn.utils import Bunch
 
-from openqdc.datasets.base import BaseDataset
+from openqdc.datasets.potential.base import BaseDataset
 from openqdc.utils.atomization_energies import IsolatedAtomEnergyFactory
 from openqdc.utils.constants import NOT_DEFINED
 
diff --git a/src/openqdc/datasets/potential/gdml.py b/src/openqdc/datasets/potential/gdml.py
index e40b3fa..67f6bde 100644
--- a/src/openqdc/datasets/potential/gdml.py
+++ b/src/openqdc/datasets/potential/gdml.py
@@ -1,6 +1,6 @@
 from os.path import join as p_join
 
-from openqdc.datasets.base import BaseDataset, read_qc_archive_h5
+from openqdc.datasets.potential.base import BaseDataset, read_qc_archive_h5
 
 
 class GDML(BaseDataset):
diff --git a/src/openqdc/datasets/potential/geom.py b/src/openqdc/datasets/potential/geom.py
index c016a9f..8b87ead 100644
--- a/src/openqdc/datasets/potential/geom.py
+++ b/src/openqdc/datasets/potential/geom.py
@@ -4,7 +4,7 @@
 import datamol as dm
 import numpy as np
 
-from openqdc.datasets.base import BaseDataset
+from openqdc.datasets.potential.base import BaseDataset
 from openqdc.utils import load_json, load_pkl
 from openqdc.utils.molecule import get_atomic_number_and_charge
 
diff --git a/src/openqdc/datasets/potential/iso_17.py b/src/openqdc/datasets/potential/iso_17.py
index 4553ec1..1dd5d56 100644
--- a/src/openqdc/datasets/potential/iso_17.py
+++ b/src/openqdc/datasets/potential/iso_17.py
@@ -1,6 +1,6 @@
 from os.path import join as p_join
 
-from openqdc.datasets.base import BaseDataset, read_qc_archive_h5
+from openqdc.datasets.potential.base import BaseDataset, read_qc_archive_h5
 
 
 class ISO17(BaseDataset):
diff --git a/src/openqdc/datasets/potential/molecule3d.py b/src/openqdc/datasets/potential/molecule3d.py
index dc47e53..834525a 100644
--- a/src/openqdc/datasets/potential/molecule3d.py
+++ b/src/openqdc/datasets/potential/molecule3d.py
@@ -8,7 +8,7 @@
 from rdkit import Chem
 from tqdm import tqdm
 
-from openqdc.datasets.base import BaseDataset
+from openqdc.datasets.potential.base import BaseDataset
 from openqdc.utils.molecule import get_atomic_number_and_charge
 
 
diff --git a/src/openqdc/datasets/potential/nabladft.py b/src/openqdc/datasets/potential/nabladft.py
index 0555cdc..703fdc5 100644
--- a/src/openqdc/datasets/potential/nabladft.py
+++ b/src/openqdc/datasets/potential/nabladft.py
@@ -6,7 +6,7 @@
 import numpy as np
 import pandas as pd
 
-from openqdc.datasets.base import BaseDataset
+from openqdc.datasets.potential.base import BaseDataset
 from openqdc.utils.molecule import z_to_formula
 from openqdc.utils.package_utils import requires_package
 
diff --git a/src/openqdc/datasets/potential/orbnet_denali.py b/src/openqdc/datasets/potential/orbnet_denali.py
index 614e252..14c19ef 100644
--- a/src/openqdc/datasets/potential/orbnet_denali.py
+++ b/src/openqdc/datasets/potential/orbnet_denali.py
@@ -5,7 +5,7 @@
 import numpy as np
 import pandas as pd
 
-from openqdc.datasets.base import BaseDataset
+from openqdc.datasets.potential.base import BaseDataset
 from openqdc.utils.molecule import atom_table
 
 
diff --git a/src/openqdc/datasets/potential/pcqm.py b/src/openqdc/datasets/potential/pcqm.py
index 543c494..c86d2b6 100644
--- a/src/openqdc/datasets/potential/pcqm.py
+++ b/src/openqdc/datasets/potential/pcqm.py
@@ -10,7 +10,7 @@
 import pandas as pd
 from loguru import logger
 
-from openqdc.datasets.base import BaseDataset
+from openqdc.datasets.potential.base import BaseDataset
 from openqdc.utils.io import get_local_cache, push_remote
 
 
diff --git a/src/openqdc/datasets/potential/qm7x.py b/src/openqdc/datasets/potential/qm7x.py
index eb8b015..361707f 100644
--- a/src/openqdc/datasets/potential/qm7x.py
+++ b/src/openqdc/datasets/potential/qm7x.py
@@ -3,7 +3,7 @@
 import numpy as np
 from tqdm import tqdm
 
-from openqdc.datasets.base import BaseDataset
+from openqdc.datasets.potential.base import BaseDataset
 from openqdc.utils.io import load_hdf5_file
 
 
diff --git a/src/openqdc/datasets/potential/qmugs.py b/src/openqdc/datasets/potential/qmugs.py
index c75f8b5..b7e0a86 100644
--- a/src/openqdc/datasets/potential/qmugs.py
+++ b/src/openqdc/datasets/potential/qmugs.py
@@ -5,7 +5,7 @@
 import datamol as dm
 import numpy as np
 
-from openqdc.datasets.base import BaseDataset
+from openqdc.datasets.potential.base import BaseDataset
 from openqdc.utils.molecule import get_atomic_number_and_charge
 
 
diff --git a/src/openqdc/datasets/potential/sn2_rxn.py b/src/openqdc/datasets/potential/sn2_rxn.py
index abcbd62..f8e35cc 100644
--- a/src/openqdc/datasets/potential/sn2_rxn.py
+++ b/src/openqdc/datasets/potential/sn2_rxn.py
@@ -1,6 +1,6 @@
 from os.path import join as p_join
 
-from openqdc.datasets.base import BaseDataset, read_qc_archive_h5
+from openqdc.datasets.potential.base import BaseDataset, read_qc_archive_h5
 
 
 class SN2RXN(BaseDataset):
diff --git a/src/openqdc/datasets/potential/solvated_peptides.py b/src/openqdc/datasets/potential/solvated_peptides.py
index 216ecdd..77d8318 100644
--- a/src/openqdc/datasets/potential/solvated_peptides.py
+++ b/src/openqdc/datasets/potential/solvated_peptides.py
@@ -1,6 +1,6 @@
 from os.path import join as p_join
 
-from openqdc.datasets.base import BaseDataset, read_qc_archive_h5
+from openqdc.datasets.potential.base import BaseDataset, read_qc_archive_h5
 
 
 class SolvatedPeptides(BaseDataset):
diff --git a/src/openqdc/datasets/potential/spice.py b/src/openqdc/datasets/potential/spice.py
index 0b90912..d891f6b 100644
--- a/src/openqdc/datasets/potential/spice.py
+++ b/src/openqdc/datasets/potential/spice.py
@@ -4,7 +4,7 @@
 import numpy as np
 from tqdm import tqdm
 
-from openqdc.datasets.base import BaseDataset
+from openqdc.datasets.potential.base import BaseDataset
 from openqdc.utils import load_hdf5_file
 from openqdc.utils.molecule import get_atomic_number_and_charge
 
diff --git a/src/openqdc/datasets/potential/transition1x.py b/src/openqdc/datasets/potential/transition1x.py
index 0285ec9..c266325 100644
--- a/src/openqdc/datasets/potential/transition1x.py
+++ b/src/openqdc/datasets/potential/transition1x.py
@@ -3,7 +3,7 @@
 import numpy as np
 from tqdm import tqdm
 
-from openqdc.datasets.base import BaseDataset
+from openqdc.datasets.potential.base import BaseDataset
 from openqdc.utils.constants import NB_ATOMIC_FEATURES
 from openqdc.utils.io import load_hdf5_file
 
diff --git a/src/openqdc/datasets/potential/waterclusters3_30.py b/src/openqdc/datasets/potential/waterclusters3_30.py
index 6aa5748..ce0e5de 100644
--- a/src/openqdc/datasets/potential/waterclusters3_30.py
+++ b/src/openqdc/datasets/potential/waterclusters3_30.py
@@ -4,7 +4,7 @@
 import numpy as np
 from tqdm import tqdm
 
-from openqdc.datasets.base import BaseDataset
+from openqdc.datasets.potential.base import BaseDataset
 from openqdc.utils.constants import MAX_ATOMIC_NUMBER
 from openqdc.utils.molecule import atom_table
 

From 8aeadd84259223d1fc7daa48d83f6e869122c010 Mon Sep 17 00:00:00 2001
From: mcneela <daniel.mcneela@valencelabs.com>
Date: Tue, 5 Mar 2024 10:36:34 -0500
Subject: [PATCH 12/47] added Metcalf dataset

---
 src/openqdc/datasets/interaction/__init__.py |  2 +
 src/openqdc/datasets/interaction/base.py     |  2 +
 src/openqdc/datasets/interaction/metcalf.py  | 91 ++++++++++++++++++++
 3 files changed, 95 insertions(+)
 create mode 100644 src/openqdc/datasets/interaction/metcalf.py

diff --git a/src/openqdc/datasets/interaction/__init__.py b/src/openqdc/datasets/interaction/__init__.py
index 62392e3..a920396 100644
--- a/src/openqdc/datasets/interaction/__init__.py
+++ b/src/openqdc/datasets/interaction/__init__.py
@@ -10,6 +10,7 @@
 _lazy_imports_obj = {
     "BaseInteractionDataset": "openqdc.datasets.interaction.base",
     "DES370K": "openqdc.datasets.interaction.des370k",
+    "Metcalf": "openqdc.datasets.interaction.metcalf",
 }
 
 _lazy_imports_mod = {}
@@ -45,4 +46,5 @@ def __dir__():
     __all__ = [
         "BaseInteractionDataset",
         "DES370K",
+        "Metcalf",
     ]
diff --git a/src/openqdc/datasets/interaction/base.py b/src/openqdc/datasets/interaction/base.py
index e266658..c73ce7c 100644
--- a/src/openqdc/datasets/interaction/base.py
+++ b/src/openqdc/datasets/interaction/base.py
@@ -32,10 +32,12 @@ def __init__(
 
     def collate_list(self, list_entries: List[Dict]):
         # concatenate entries
+        print(list_entries[0])
         res = {key: np.concatenate([r[key] for r in list_entries if r is not None], axis=0) \
                for key in list_entries[0] if not isinstance(list_entries[0][key], dict)}
 
         csum = np.cumsum(res.get("n_atoms"))
+        print(csum)
         x = np.zeros((csum.shape[0], 2), dtype=np.int32)
         x[1:, 0], x[:, 1] = csum[:-1], csum
         res["position_idx_range"] = x
diff --git a/src/openqdc/datasets/interaction/metcalf.py b/src/openqdc/datasets/interaction/metcalf.py
new file mode 100644
index 0000000..3a8e714
--- /dev/null
+++ b/src/openqdc/datasets/interaction/metcalf.py
@@ -0,0 +1,91 @@
+import os
+import numpy as np
+
+from typing import Dict, List
+
+from tqdm import tqdm
+from rdkit import Chem
+from loguru import logger
+from openqdc.datasets.interaction import BaseInteractionDataset
+from openqdc.utils.molecule import atom_table
+
+class Metcalf(BaseInteractionDataset):
+    __name__ = "metcalf"
+    __energy_unit__ = "hartree"
+    __distance_unit__ = "ang"
+    __forces_unit__ = None
+    __energy_methods__ = [
+        "SAPT0/jun-cc-pVDZ"
+    ]
+    energy_target_names = [
+        "total energy",
+        "electrostatic energy",
+        "exchange energy",
+        "induction energy",
+        "dispersion energy",
+    ]
+
+    def read_raw_entries(self) -> List[Dict]:
+        """
+        SMILES strings are inferred from the
+        .xyz files using the RDKit xyz2mol function.
+        More details here:
+
+        https://github.com/jensengroup/xyz2mol
+
+        """
+        data = []
+        for dirname in os.listdir(self.root):
+            xyz_dir = os.path.join(self.root, dirname)
+            if not os.path.isdir(xyz_dir):
+                continue
+            subset = np.array([dirname.split("-")[0].lower()]) # training, validation, or test
+            for filename in os.listdir(xyz_dir):
+                if not filename.endswith(".xyz"):
+                    continue
+                lines = list(map(lambda x: x.strip(), open(os.path.join(xyz_dir, filename), "r").readlines()))
+                line_two = lines[1].split(",")
+                energies = np.array([line_two[1:6]], dtype=np.float32)
+                num_atoms = np.array([int(lines[0])])
+                num_atoms0 = int(line_two[-1])
+                num_atoms1 = num_atoms[0] - num_atoms0
+
+                elem_xyz = np.array([x.split() for x in lines[2:]])
+                # elements = np.expand_dims(elem_xyz[:, 0], axis=0)
+                elements = elem_xyz[:, 0]
+                xyz = elem_xyz[:, 1:].astype(np.float32)
+                # xyz0_fname = os.path.join(xyz_dir, f"{filename}_0_tmp.xyz")
+                # with open(xyz0_fname, "w") as xyz_0_file:
+                #     lines_to_write = [str(num_atoms0) + "\n"] + ["charge=0=\n"] + list(map(lambda x: " ".join(x) + "\n", elem_xyz[:num_atoms0].tolist()))
+                #     lines_to_write[-1] = lines_to_write[-1][:-1]
+                #     print(lines_to_write)
+                #     xyz_0_file.writelines(lines_to_write) # writelines doesn't actually add a newline to each string (weird)
+
+                # xyz1_fname = os.path.join(xyz_dir, f"{filename}_1_tmp.xyz")
+                # with open(xyz1_fname, "w") as xyz_1_file:
+                #     lines_to_write = [str(num_atoms1) + "\n"] + ["charge=0=\n"] + list(map(lambda x: " ".join(x) + "\n", elem_xyz[num_atoms0:].tolist()))
+                #     lines_to_write[-1] = lines_to_write[-1][:-1]
+                #     xyz_1_file.writelines(lines_to_write)
+
+                # smiles0 = Chem.MolToSmiles(Chem.MolFromXYZFile(xyz0_fname))
+                # smiles1 = Chem.MolToSmiles(Chem.MolFromXYZFile(xyz1_fname))
+                #            
+                atomic_nums = np.expand_dims(np.array([atom_table.GetAtomicNumber(x) for x in elements]), axis=1)
+                charges = np.expand_dims(np.array([0] * num_atoms[0]), axis=1)
+
+                atomic_inputs = np.concatenate((atomic_nums, charges, xyz), axis=-1, dtype=np.float32)
+
+                item = dict(
+                    # elements=elements,
+                    n_atoms=num_atoms,
+                    subset=subset,
+                    energies=energies,
+                    positions=xyz,
+                    atomic_inputs=atomic_inputs,
+                    name=np.array([""])
+                )
+                data.append(item)
+        return data
+
+
+

From 9cf6034b43cfa6fd8989bc58a849453d1671872a Mon Sep 17 00:00:00 2001
From: mcneela <daniel.mcneela@valencelabs.com>
Date: Tue, 5 Mar 2024 10:59:06 -0500
Subject: [PATCH 13/47] updated DES370K based on Prudencio's comments

---
 src/openqdc/datasets/interaction/des370k.py | 37 ++++++---------------
 1 file changed, 10 insertions(+), 27 deletions(-)

diff --git a/src/openqdc/datasets/interaction/des370k.py b/src/openqdc/datasets/interaction/des370k.py
index 824da46..78b3bde 100644
--- a/src/openqdc/datasets/interaction/des370k.py
+++ b/src/openqdc/datasets/interaction/des370k.py
@@ -24,15 +24,15 @@ class DES370K(BaseInteractionDataset):
         "ccsd(t)/cbs",  # cbs
         "ccsd(t)/nn",  # nn
         "sapt0/aug-cc-pwcvxz",
-        "sapt0/aug-cc-pwcvxz",
-        "sapt0/aug-cc-pwcvxz",
-        "sapt0/aug-cc-pwcvxz",
-        "sapt0/aug-cc-pwcvxz",
-        "sapt0/aug-cc-pwcvxz",
-        "sapt0/aug-cc-pwcvxz",
-        "sapt0/aug-cc-pwcvxz",
-        "sapt0/aug-cc-pwcvxz",
-        "sapt0/aug-cc-pwcvxz",
+        "sapt0/aug-cc-pwcvxz_es",
+        "sapt0/aug-cc-pwcvxz_ex",
+        "sapt0/aug-cc-pwcvxz_exs2",
+        "sapt0/aug-cc-pwcvxz_ind",
+        "sapt0/aug-cc-pwcvxz_exind",
+        "sapt0/aug-cc-pwcvxz_disp",
+        "sapt0/aug-cc-pwcvxz_exdisp_os",
+        "sapt0/aug-cc-pwcvxz_exdisp_ss",
+        "sapt0/aug-cc-pwcvxz_delta_HF",
     ]
 
     energy_target_names = [
@@ -87,27 +87,10 @@ def read_raw_entries(self) -> List[Dict]:
             name = np.array([smiles0 + "." + smiles1])
 
             item = dict(
-                mol0=dict(
-                    smiles=smiles0,
-                    atomic_inputs=atomic_inputs0,
-                    n_atoms=natoms0,
-                    charge=charge0,
-                    elements=elements0,
-                    atomic_nums=atomic_nums0,
-                    pos=pos0,
-                ),
-                mol1=dict(
-                    smiles=smiles1,
-                    atomic_inputs=atomic_inputs1,
-                    n_atoms=natoms1,
-                    charge=charge1,
-                    elements=elements1,
-                    atomic_nums=atomic_nums1,
-                    pos=pos1,
-                ),
                 energies=energies,
                 subset=np.array(["DES370K"]),
                 n_atoms=np.array([natoms0 + natoms1], dtype=np.int32),
+                n_atoms_first=np.array([natoms0], dtype=np.int32),
                 atomic_inputs=atomic_inputs,
                 name=name,
             )

From 6206665a601d2ba4192ccc6292fc85086b3cf989 Mon Sep 17 00:00:00 2001
From: mcneela <daniel.mcneela@valencelabs.com>
Date: Tue, 5 Mar 2024 15:09:08 -0500
Subject: [PATCH 14/47] added const molecule_groups lookup for DES370K dataset

---
 src/openqdc/utils/molecule.py | 43 +++++++++++++++++++++++++++++++++++
 1 file changed, 43 insertions(+)

diff --git a/src/openqdc/utils/molecule.py b/src/openqdc/utils/molecule.py
index 82a58d2..4aeea1b 100644
--- a/src/openqdc/utils/molecule.py
+++ b/src/openqdc/utils/molecule.py
@@ -8,6 +8,49 @@
 
 atom_table = Chem.GetPeriodicTable()
 
+# molecule group classification for DES datasets
+molecule_groups = {
+    "acids": set(["CCC(=O)O", "CC(=O)O", "OC=O", "OC(=O)CC(=O)O"]),
+    "alcohols": set(["CCCO", "CCC(O)C", "CCO", "CC(O)C", "CO", "OC1CCCC1", "OC1CCCCC1", "OCCCCO", "OCCCO", "OCCO"]),
+    "alkanes": set(["C1CCCC1", "C1CCCCC1", "C", "CC1CCCC1", "CC1CCCCC1", "CC", "CCC", "CC(C)C", "CCCC", "CC(C)(C)C", "CCC(C)C", "CCCCC", "CCC(C)(C)C", "CCCCCC"]),
+    "alkenes": set(["C=C", "CC=C", "CC=CC", "CC(=C)C", "CCC=C", "CC=C(C)C", "CCC=CC", "CCC(=C)C", "CC(=C(C)C)C", "CCC=C(C)C", "CCC(=CC)C", "CCC(=C(C)C)C"]),
+    "amides": set(["CCCNC=O", "CCC(=O)N", "CCC(=O)NC", "CCC(=O)N(C)C", "CCC(=O)N(CC)C", "CCNC=O", "CCNC(=O)C", "CCN(C=O)CC", "CCN(C(=O)C)C", "CCNC(=O)CC", \
+                   "CCN(C(=O)C)CC", "CC(=O)N", "CC(=O)N(C)C", "CNC=O", "CNC(=O)C", "CN(C=O)CC", "CNC(=O)CC(=O)N", "CNC(=O)CC(=O)NC", \
+                   "CNC(=O)CNC=O", "CNC(=O)CNC(=O)C", "CNC(=O)C(NC(=O)C)C", "NC=O, NC(=O)CC(=O)N", "O=CN(C)C", "O=CNCCC(=O)N", "O=CNCCC(=O)NC", "O=CNCCNC=O", "O=CNCC(=O)N"]),
+    "amines": set(["C1CCCN1", "C1CCCNC1", "CCCN", "CCCNC", "CCCN(C)C", "CCN", "CCN(C)C", "CCNCC", "CCN(CC)C", "CN", "CNC", "CN(C)C", "CNCC", "CNCCCN", "CNCCCNC", "CNCCN", "CNCCNC", "N", "NCCCN", "NCCN"]),
+    "ammoniums": set(["CC[NH3+]", "C[N+](C)(C)C", "C[NH2+]C", "C[NH3+]", "C[NH+](C)C", "[NH4+]"]),
+    "benzene": set(["c1ccccc1", "Cc1ccccc1", "CCc1ccccc1"]),
+    "carboxylates": set(["[O-]C=O", "[O-]C(=O)C", "[O-]C(=O)CC"]),
+    "esters": set(["CCCOC=O", "CCC(=O)OC", "CCOC(=O)CC", "CCOC(=O)C", "CCOC=O", "COC(=O)C", "COC=O", "O=COCCCOC=O", "O=COCCOC=O", "O=COCOC=O"]),
+    "ethers": set(["C1CCCO1", "C1CCCOC1", "C1CCOCO1", "C1OCCO1", "CCCOC", "CCCOCOC", "CCOCC", "COCCCOC", "COCC", "COCCOC", "COC", "COCOCC", "COCOC", "O1CCOCC1", "O1COCOC1"]),
+    "guanidiums": set(["CCNC(=[NH2+])N", "CNC(=[NH2+])N", "NC(=[NH2+])N"]),
+    "imidazolium": set(["c1[nH]cc[nH+]1", "Cc1c[nH]c[nH+]1", "CCc1c[nH]c[nH+]1"]),
+    "ketones": set(["CCC(=O)CC", "CCC(=O)C", "CCC=O", "CC(=O)C", "CC=O", "C=O"]),
+    "monoatomics": set(["[Ar]", "[Br-]", "[Ca+2]", "[Cl-]", "[F-]", "[He]", "[I-]", "[K+]", "[Kr]", "[Li+]", "[Mg+2]", "[Na+]", "[Ne]", "[Xe]"]),
+    "other": set(["Brc1ccc(cc1)Br", "Brc1ccccc1", "BrC(Br)Br", "BrCBr", "BrCCBr", "CBr", "CC(Br)Br", "CCBr", "CCCC#CC", "CCCC(Cl)(Cl)Cl", "CCCC(Cl)Cl", \
+                  "CCCCCl", "CCC#CC", "CCCC#C", "CCCC(F)(F)F", "CCCC(F)F", "CCCCF", "CCC(Cl)(Cl)Cl", "CCC(Cl)Cl", "CCCCl", "CCCC#N", "CC#CC", "CCC#C", "CCC(F)(F)F", \
+                  "CCC(F)F", "CCCF", "CC(Cl)(Cl)Cl", "CC(Cl)Cl", "CCCl", "CCC#N", "CC#C", "CC(F)(F)F", "CC(F)F", "CCF", "CC(I)I", "CCI", "CCl", "CC#N", "CCOP(=O)(OC)OC", \
+                  "CCOP(=O)(OC)[O-]", "CCOP(=O)(OC)O", "C#C", "CF", "CI", "Clc1ccc(cc1)Cl", "Clc1cccc(c1)Cl", "Clc1ccccc1Cl", "Clc1ccccc1", "Clc1cc(Cl)c(c(c1Cl)Cl)Cl", \
+                  "Clc1cc(Cl)cc(c1)Cl", "Clc1c(Cl)c(Cl)c(c(c1Cl)Cl)Cl", "ClC(C(Cl)(Cl)Cl)(Cl)Cl", "ClC(C(Cl)(Cl)Cl)Cl", "ClCC(Cl)(Cl)Cl", "ClCC(Cl)Cl", "ClCCCl", "ClC(Cl)Cl", \
+                  "ClCCl", "CNCCCOC=O", "CNCCCOC", "CNCCC(=O)NC", "CNCCC(=O)N", "CNCCC(=O)O", "CNCCCO", "CNCCCSC", "CNCCCS", "CNCCNC=O", "CNCCOC=O", "CNCCOC", "CNCC(=O)NC", \
+                  "CNCC(=O)N", "CNCC(=O)O", "CNCCO", "CNCCSC", "CNCCS", "CNC(=O)CCN", "CNC(=O)CC(=O)O", "CNC(=O)CCO", "CNC(=O)CCS", "CNC(=O)CN", "CNC(=O)COC=O", "CNC(=O)CO", \
+                  "CNCOC=O", "CNCOC", "CNC(=O)CS", "CNCSC", "C#N", "COCCCN", "COCCCOC=O", "COCCC(=O)NC", "COCCC(=O)N", "COCCC(=O)O", "COCCCO", "COCCCSC", "COCCCS", "COCCNC=O", \
+                  "COCCN", "COCCOC=O", "COCC(=O)NC", "COCC(=O)N", "COCC(=O)O", "COCCO", "COCCSC", "COCCS", "COCNC=O", "COCN", "COCOC=O", "COCO", "COCSC", "COCS", "COP(=O)(OC)OC", \
+                  "COP(=O)(OC)[O-]", "COP(=O)(OC)O", "COP(=O)(O)O", "COP(=O)(OP(=O)(O)O)[O-]", "CSCCCNC=O", "CSCCCN", "CSCCCOC=O", "CSCCC(=O)N", "CSCCC(=O)O", "CSCCCO", "CSCCN", "CSCCOC=O", \
+                  "CSCC(=O)NC", "CSCC(=O)N", "CSCC(=O)O", "CSCCO", "CSCNC=O", "CSCN", "CSCOC=O", "CSCO", "Fc1ccc(cc1)F", "Fc1cccc(c1)F", "Fc1ccccc1F", "Fc1ccccc1", "Fc1cc(F)c(c(c1F)F)F", \
+                  "Fc1cc(F)cc(c1)F", "Fc1c(F)c(F)c(c(c1F)F)F", "FC(C(F)(F)F)(F)F", "FC(C(F)(F)F)F", "FCC(F)(F)F", "FCC(F)F", "FCCF", "FC(F)F", "FCF", "ICCI", "ICI", "NCCCOC=O", "NCCC(=O)N", \
+                  "NCCC(=O)O", "NCCCO", "NCCCS", "NCCNC=O", "NCCOC=O", "NCC(=O)N", "NCC(=O)O", "NCCO", "NCCS", "NC(=O)CC(=O)O","NC(=O)CCO", "NC(=O)CCS", "NC(=O)CO", "NCOC=O", "NC(=O)CS", \
+                  "OCCCNC=O", "OCCCOC=O", "OCCC(=O)O", "OCCCS", "OCCNC=O", "OCCOC=O", "OCC(=O)O", "OCCS", "O=CNCCC(=O)O", "O=CNCCOC=O", "O=CNCC(=O)O", "O=CNCOC=O", "O=COCCC(=O)NC", \
+                  "O=COCCC(=O)N", "O=COCCC(=O)O", "O=COCC(=O)N", "O=COCC(=O)O", "OC(=O)CCS", "OCOC=O", "OC(=O)CS", "OP(=O)(O)O", "[O-]P(=O)(OP(=O)(OC)O)O", "SCCCOC=O", "SCCNC=O", "SCCOC=O", \
+                  "SCOC=O", "[H][H]"]),
+    "phenol": set(["Cc1ccc(cc1)O", "CCc1ccc(cc1)O", "Oc1ccccc1"]),
+    "pyridine": set(["c1cccnc1", "c1ccncn1", "n1ccncc1"]),
+    "pyrrole": set(["c1ccc2c(c1)[nH]cc2", "c1ccc[nH]1", "c1ncc[nH]1", "Cc1cnc[nH]1", "Cc1c[nH]c2c1cccc2", "Cc1c[nH]cn1", "CCc1cnc[nH]1", "CCc1c[nH]c2c1cccc2", "CCc1c[nH]cn1"]),
+    "sulfides": set(["C1CCCS1", "C1CCCSC1", "C1CCSCS1", "C1CCSSC1", "C1CSSC1", "C1SCCS1", "CCCSCSC", "CCCSC", "CCCSSC", "CCSCC", "CCSSCC", "CCSSC", "CSCCCSC", "CSCCSC", "CSCC", \
+                     "CSCSCC", "CSCSC", "CSC", "CSSC", "S1CCSCC1", "S1CSCSC1"]),
+    "thiols": set(["CCCSS", "CCCS", "CCSS", "CCS", "CSCCCS", "CSCCS", "CSCS", "CSS", "CS", "SCCCS", "SCCS", "SS", "S"]),
+    "water": set(["O"]),
+}
 
 def z_to_formula(z):
     u, c = np.unique(z, return_counts=True)

From 5cb57d95a2e70ba59cb90748eecda6bda15b3547 Mon Sep 17 00:00:00 2001
From: mcneela <daniel.mcneela@valencelabs.com>
Date: Tue, 5 Mar 2024 15:31:18 -0500
Subject: [PATCH 15/47] updated subsets for DES370K

---
 src/openqdc/datasets/interaction/des370k.py | 19 +++++++++++++++++--
 src/openqdc/utils/molecule.py               |  2 +-
 2 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/src/openqdc/datasets/interaction/des370k.py b/src/openqdc/datasets/interaction/des370k.py
index 78b3bde..ed5b7fb 100644
--- a/src/openqdc/datasets/interaction/des370k.py
+++ b/src/openqdc/datasets/interaction/des370k.py
@@ -5,9 +5,10 @@
 from typing import Dict, List
 
 from tqdm import tqdm
+from rdkit import Chem
 from loguru import logger
 from openqdc.datasets.interaction import BaseInteractionDataset
-from openqdc.utils.molecule import atom_table
+from openqdc.utils.molecule import atom_table, molecule_groups
 
 
 class DES370K(BaseInteractionDataset):
@@ -85,10 +86,24 @@ def read_raw_entries(self) -> List[Dict]:
             energies = np.array(row[self.energy_target_names].values).astype(np.float32)[None, :]
 
             name = np.array([smiles0 + "." + smiles1])
+            canon_smiles0 = Chem.MolToSmiles(Chem.MolFromSmiles(smiles0))
+            canon_smiles1 = Chem.MolToSmiles(Chem.MolFromSmiles(smiles1))
+
+            subsets = []
+            # for smiles in [canon_smiles0, canon_smiles1]:
+            for smiles in [smiles0, smiles1]:
+                found = False
+                for functional_group, smiles_set in molecule_groups.items():
+                    if smiles in smiles_set:
+                        subsets.append(functional_group)
+                        found = True
+                if not found:
+                    logger.info(f"molecule group lookup failed for {smiles}")
+
 
             item = dict(
                 energies=energies,
-                subset=np.array(["DES370K"]),
+                subset=np.array([subsets]),
                 n_atoms=np.array([natoms0 + natoms1], dtype=np.int32),
                 n_atoms_first=np.array([natoms0], dtype=np.int32),
                 atomic_inputs=atomic_inputs,
diff --git a/src/openqdc/utils/molecule.py b/src/openqdc/utils/molecule.py
index 4aeea1b..e7a3710 100644
--- a/src/openqdc/utils/molecule.py
+++ b/src/openqdc/utils/molecule.py
@@ -16,7 +16,7 @@
     "alkenes": set(["C=C", "CC=C", "CC=CC", "CC(=C)C", "CCC=C", "CC=C(C)C", "CCC=CC", "CCC(=C)C", "CC(=C(C)C)C", "CCC=C(C)C", "CCC(=CC)C", "CCC(=C(C)C)C"]),
     "amides": set(["CCCNC=O", "CCC(=O)N", "CCC(=O)NC", "CCC(=O)N(C)C", "CCC(=O)N(CC)C", "CCNC=O", "CCNC(=O)C", "CCN(C=O)CC", "CCN(C(=O)C)C", "CCNC(=O)CC", \
                    "CCN(C(=O)C)CC", "CC(=O)N", "CC(=O)N(C)C", "CNC=O", "CNC(=O)C", "CN(C=O)CC", "CNC(=O)CC(=O)N", "CNC(=O)CC(=O)NC", \
-                   "CNC(=O)CNC=O", "CNC(=O)CNC(=O)C", "CNC(=O)C(NC(=O)C)C", "NC=O, NC(=O)CC(=O)N", "O=CN(C)C", "O=CNCCC(=O)N", "O=CNCCC(=O)NC", "O=CNCCNC=O", "O=CNCC(=O)N"]),
+                   "CNC(=O)CNC=O", "CNC(=O)CNC(=O)C", "CNC(=O)C(NC(=O)C)C", "NC=O", "NC(=O)CC(=O)N", "O=CN(C)C", "O=CNCCC(=O)N", "O=CNCCC(=O)NC", "O=CNCCNC=O", "O=CNCC(=O)N"]),
     "amines": set(["C1CCCN1", "C1CCCNC1", "CCCN", "CCCNC", "CCCN(C)C", "CCN", "CCN(C)C", "CCNCC", "CCN(CC)C", "CN", "CNC", "CN(C)C", "CNCC", "CNCCCN", "CNCCCNC", "CNCCN", "CNCCNC", "N", "NCCCN", "NCCN"]),
     "ammoniums": set(["CC[NH3+]", "C[N+](C)(C)C", "C[NH2+]C", "C[NH3+]", "C[NH+](C)C", "[NH4+]"]),
     "benzene": set(["c1ccccc1", "Cc1ccccc1", "CCc1ccccc1"]),

From e18b7104c6b9bc07580eb030b2b7dc18029641a3 Mon Sep 17 00:00:00 2001
From: mcneela <daniel.mcneela@valencelabs.com>
Date: Tue, 5 Mar 2024 15:41:35 -0500
Subject: [PATCH 16/47] added download url for des5m_interaction

---
 src/openqdc/raws/config_factory.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/src/openqdc/raws/config_factory.py b/src/openqdc/raws/config_factory.py
index df54307..28f115e 100644
--- a/src/openqdc/raws/config_factory.py
+++ b/src/openqdc/raws/config_factory.py
@@ -97,6 +97,13 @@ class DataConfigFactory:
         }
     )
 
+    des5m_interaction = dict(
+        dataset_name="des5m_interaction",
+        links={
+            "DES5M.zip": "https://zenodo.org/records/5706002/files/DESS5M.zip?download=1",
+        }
+    )
+
     tmqm = dict(
         dataset_name="tmqm",
         links={

From 54cadbf8e21b46fe0467b41c9ba1b2af13c59fc8 Mon Sep 17 00:00:00 2001
From: mcneela <daniel.mcneela@valencelabs.com>
Date: Tue, 5 Mar 2024 15:48:37 -0500
Subject: [PATCH 17/47] updated README with new datasets

---
 README.md | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index c143e95..4e3b6e2 100644
--- a/README.md
+++ b/README.md
@@ -30,7 +30,7 @@ pytest
 6. QM Level of Theory
  -->
 
-We provide support for the following publicly available QM Datasets.
+We provide support for the following publicly available QM Potential Energy Datasets.
 
 | Dataset | # Molecules | # Conformers | Average Conformers per Molecule | Force Labels | Atom Types | QM Level of Theory | Off-Equilibrium Conformations|
 | --- | --- | --- | --- | --- | --- | --- | --- |
@@ -46,3 +46,11 @@ We provide support for the following publicly available QM Datasets.
 | [OrbNet Denali](https://arxiv.org/abs/2107.00299) | 212,905 | 2,300,000 | 11 | No | 16 | GFN1-xTB | Yes |
 | [SN2RXN](https://pubs.acs.org/doi/10.1021/acs.jctc.9b00181) | 39 | 452709 | 11,600 | Yes | 6 | DSD-BLYP-D3(BJ)/def2-TZVP | |
 | [QM7X](https://www.nature.com/articles/s41597-021-00812-2) |  6,950 | 4,195,237 | 603 | Yes | 7 | PBE0+MBD | Yes |
+
+We also provide support for the following publicly available QM Noncovalent Interaction Energy Datasets.
+
+| Dataset | 
+| --- | 
+| [DES370K](https://www.nature.com/articles/s41597-021-00833-x) |
+| [DES5M](https://www.nature.com/articles/s41597-021-00833-x)   |
+| [Metcalf](https://pubs.aip.org/aip/jcp/article/152/7/074103/1059677/Approaches-for-machine-learning-intermolecular) |
\ No newline at end of file

From a922ef7568013236d852a366eb5e5bc1e579aae8 Mon Sep 17 00:00:00 2001
From: mcneela <daniel.mcneela@valencelabs.com>
Date: Tue, 5 Mar 2024 17:05:01 -0500
Subject: [PATCH 18/47] Added DES5M dataset

---
 src/openqdc/datasets/interaction/__init__.py |   2 +
 src/openqdc/datasets/interaction/des5m.py    | 104 +++++++++++++++++++
 src/openqdc/utils/molecule.py                |   2 +
 3 files changed, 108 insertions(+)
 create mode 100644 src/openqdc/datasets/interaction/des5m.py

diff --git a/src/openqdc/datasets/interaction/__init__.py b/src/openqdc/datasets/interaction/__init__.py
index a920396..aa33b66 100644
--- a/src/openqdc/datasets/interaction/__init__.py
+++ b/src/openqdc/datasets/interaction/__init__.py
@@ -10,6 +10,7 @@
 _lazy_imports_obj = {
     "BaseInteractionDataset": "openqdc.datasets.interaction.base",
     "DES370K": "openqdc.datasets.interaction.des370k",
+    "DES5M": "openqdc.datasets.interaction.des5m",
     "Metcalf": "openqdc.datasets.interaction.metcalf",
 }
 
@@ -46,5 +47,6 @@ def __dir__():
     __all__ = [
         "BaseInteractionDataset",
         "DES370K",
+        "DES5M",
         "Metcalf",
     ]
diff --git a/src/openqdc/datasets/interaction/des5m.py b/src/openqdc/datasets/interaction/des5m.py
new file mode 100644
index 0000000..7fb5d50
--- /dev/null
+++ b/src/openqdc/datasets/interaction/des5m.py
@@ -0,0 +1,104 @@
+import os
+import numpy as np
+import pandas as pd
+
+from typing import Dict, List
+
+from tqdm import tqdm
+from rdkit import Chem
+from loguru import logger
+from openqdc.datasets.interaction import DES370K
+from openqdc.utils.molecule import atom_table, molecule_groups
+
+
+class DES5M(DES370K):
+    __name__ = "des5m_interaction"
+    __energy_methods__ = [
+        "mp2/cc-pvqz",
+        "mp2/cc-pvtz",
+        "mp2/cbs",
+        "ccsd(t)/nn",  # nn
+        "sapt0/aug-cc-pwcvxz",
+        "sapt0/aug-cc-pwcvxz_es",
+        "sapt0/aug-cc-pwcvxz_ex",
+        "sapt0/aug-cc-pwcvxz_exs2",
+        "sapt0/aug-cc-pwcvxz_ind",
+        "sapt0/aug-cc-pwcvxz_exind",
+        "sapt0/aug-cc-pwcvxz_disp",
+        "sapt0/aug-cc-pwcvxz_exdisp_os",
+        "sapt0/aug-cc-pwcvxz_exdisp_ss",
+        "sapt0/aug-cc-pwcvxz_delta_HF",
+    ]
+
+    energy_target_names = [
+        "qz_MP2_all",
+        "tz_MP2_all",
+        "cbs_MP2_all",
+        "nn_CCSD(T)_all",
+        "sapt_all",
+        "sapt_es",
+        "sapt_ex",
+        "sapt_exs2",
+        "sapt_ind",
+        "sapt_exind",
+        "sapt_disp",
+        "sapt_exdisp_os",
+        "sapt_exdisp_ss",
+        "sapt_delta_HF",
+    ]
+
+    def read_raw_entries(self) -> List[Dict]:
+        self.filepath = os.path.join(self.root, "DES5M.csv")
+        logger.info(f"Reading DES5M interaction data from {self.filepath}")
+        df = pd.read_csv(self.filepath)
+        data = []
+        for idx, row in tqdm(df.iterrows(), total=df.shape[0]):
+            smiles0, smiles1 = row["smiles0"], row["smiles1"]
+            charge0, charge1 = row["charge0"], row["charge1"]
+            natoms0, natoms1 = row["natoms0"], row["natoms1"]
+            pos = np.array(list(map(float, row["xyz"].split()))).reshape(-1, 3)
+            pos0 = pos[:natoms0]
+            pos1 = pos[natoms0:]
+            
+            elements = row["elements"].split()
+            elements0 = np.array(elements[:natoms0])
+            elements1 = np.array(elements[natoms0:])
+
+            atomic_nums = np.expand_dims(np.array([atom_table.GetAtomicNumber(x) for x in elements]), axis=1)
+            atomic_nums0 = np.array(atomic_nums[:natoms0])
+            atomic_nums1 = np.array(atomic_nums[natoms0:])
+
+            charges = np.expand_dims(np.array([charge0] * natoms0 + [charge1] * natoms1), axis=1)
+
+            atomic_inputs = np.concatenate((atomic_nums, charges, pos), axis=-1, dtype=np.float32)
+            atomic_inputs0 = atomic_inputs[:natoms0, :]
+            atomic_inputs1 = atomic_inputs[natoms0:, :]
+
+            energies = np.array(row[self.energy_target_names].values).astype(np.float32)[None, :]
+
+            name = np.array([smiles0 + "." + smiles1])
+            canon_smiles0 = Chem.MolToSmiles(Chem.MolFromSmiles(smiles0))
+            canon_smiles1 = Chem.MolToSmiles(Chem.MolFromSmiles(smiles1))
+
+            subsets = []
+            # for smiles in [canon_smiles0, canon_smiles1]:
+            for smiles in [smiles0, smiles1]:
+                found = False
+                for functional_group, smiles_set in molecule_groups.items():
+                    if smiles in smiles_set:
+                        subsets.append(functional_group)
+                        found = True
+                if not found:
+                    logger.info(f"molecule group lookup failed for {smiles}")
+
+
+            item = dict(
+                energies=energies,
+                subset=np.array([subsets]),
+                n_atoms=np.array([natoms0 + natoms1], dtype=np.int32),
+                n_atoms_first=np.array([natoms0], dtype=np.int32),
+                atomic_inputs=atomic_inputs,
+                name=name,
+            )
+            data.append(item)
+        return data
diff --git a/src/openqdc/utils/molecule.py b/src/openqdc/utils/molecule.py
index e7a3710..bddaaab 100644
--- a/src/openqdc/utils/molecule.py
+++ b/src/openqdc/utils/molecule.py
@@ -50,6 +50,8 @@
                      "CSCSCC", "CSCSC", "CSC", "CSSC", "S1CCSCC1", "S1CSCSC1"]),
     "thiols": set(["CCCSS", "CCCS", "CCSS", "CCS", "CSCCCS", "CSCCS", "CSCS", "CSS", "CS", "SCCCS", "SCCS", "SS", "S"]),
     "water": set(["O"]),
+    "flourane": set(["F"]),
+    "hydrogen chloride": set(["Cl"]),
 }
 
 def z_to_formula(z):

From 2146058503c91a7b6685618b6a9084050179f106 Mon Sep 17 00:00:00 2001
From: mcneela <daniel.mcneela@valencelabs.com>
Date: Tue, 5 Mar 2024 21:56:37 -0500
Subject: [PATCH 19/47] added des_s66 dataset

---
 src/openqdc/datasets/interaction/__init__.py |   2 +
 src/openqdc/datasets/interaction/des5m.py    |   2 -
 src/openqdc/datasets/interaction/dess66.py   | 102 +++++++++++++++++++
 src/openqdc/raws/config_factory.py           |  11 ++
 4 files changed, 115 insertions(+), 2 deletions(-)
 create mode 100644 src/openqdc/datasets/interaction/dess66.py

diff --git a/src/openqdc/datasets/interaction/__init__.py b/src/openqdc/datasets/interaction/__init__.py
index aa33b66..c7cddb4 100644
--- a/src/openqdc/datasets/interaction/__init__.py
+++ b/src/openqdc/datasets/interaction/__init__.py
@@ -12,6 +12,7 @@
     "DES370K": "openqdc.datasets.interaction.des370k",
     "DES5M": "openqdc.datasets.interaction.des5m",
     "Metcalf": "openqdc.datasets.interaction.metcalf",
+    "DESS66": "openqdc.datasets.interaction.dess66",
 }
 
 _lazy_imports_mod = {}
@@ -49,4 +50,5 @@ def __dir__():
         "DES370K",
         "DES5M",
         "Metcalf",
+        "DESS66",
     ]
diff --git a/src/openqdc/datasets/interaction/des5m.py b/src/openqdc/datasets/interaction/des5m.py
index 7fb5d50..e79d690 100644
--- a/src/openqdc/datasets/interaction/des5m.py
+++ b/src/openqdc/datasets/interaction/des5m.py
@@ -77,8 +77,6 @@ def read_raw_entries(self) -> List[Dict]:
             energies = np.array(row[self.energy_target_names].values).astype(np.float32)[None, :]
 
             name = np.array([smiles0 + "." + smiles1])
-            canon_smiles0 = Chem.MolToSmiles(Chem.MolFromSmiles(smiles0))
-            canon_smiles1 = Chem.MolToSmiles(Chem.MolFromSmiles(smiles1))
 
             subsets = []
             # for smiles in [canon_smiles0, canon_smiles1]:
diff --git a/src/openqdc/datasets/interaction/dess66.py b/src/openqdc/datasets/interaction/dess66.py
new file mode 100644
index 0000000..13f5b9c
--- /dev/null
+++ b/src/openqdc/datasets/interaction/dess66.py
@@ -0,0 +1,102 @@
+import os
+import numpy as np
+import pandas as pd
+
+from typing import Dict, List
+
+from tqdm import tqdm
+from rdkit import Chem
+from loguru import logger
+from openqdc.datasets.interaction import BaseInteractionDataset
+from openqdc.utils.molecule import atom_table, molecule_groups
+
+
+class DESS66(BaseInteractionDataset):
+    __name__ = "des_s66"
+    __energy_unit__ = "hartree"
+    __distance_unit__ = "ang"
+    __forces_unit__ = "hartree/ang"
+    __energy_methods__ = [
+        "mp2/cc-pvdz",
+        "mp2/cc-pvqz",
+        "mp2/cc-pvtz",
+        "mp2/cbs",
+        "ccsd(t)/cc-pvdz",
+        "ccsd(t)/cbs",  # cbs
+        "ccsd(t)/nn",  # nn
+        "sapt0/aug-cc-pwcvxz",
+        "sapt0/aug-cc-pwcvxz_es",
+        "sapt0/aug-cc-pwcvxz_ex",
+        "sapt0/aug-cc-pwcvxz_exs2",
+        "sapt0/aug-cc-pwcvxz_ind",
+        "sapt0/aug-cc-pwcvxz_exind",
+        "sapt0/aug-cc-pwcvxz_disp",
+        "sapt0/aug-cc-pwcvxz_exdisp_os",
+        "sapt0/aug-cc-pwcvxz_exdisp_ss",
+        "sapt0/aug-cc-pwcvxz_delta_HF",
+    ]
+
+    energy_target_names = [
+        "cc_MP2_all",
+        "qz_MP2_all",
+        "tz_MP2_all",
+        "cbs_MP2_all",
+        "cc_CCSD(T)_all",
+        "cbs_CCSD(T)_all",
+        "nn_CCSD(T)_all",
+        "sapt_all",
+        "sapt_es",
+        "sapt_ex",
+        "sapt_exs2",
+        "sapt_ind",
+        "sapt_exind",
+        "sapt_disp",
+        "sapt_exdisp_os",
+        "sapt_exdisp_ss",
+        "sapt_delta_HF",
+    ]
+
+    def read_raw_entries(self) -> List[Dict]:
+        self.filepath = os.path.join(self.root, "DESS66.csv")
+        logger.info(f"Reading DESS66 interaction data from {self.filepath}")
+        df = pd.read_csv(self.filepath)
+        data = []
+        for idx, row in tqdm(df.iterrows(), total=df.shape[0]):
+            smiles0, smiles1 = row["smiles0"], row["smiles1"]
+            charge0, charge1 = row["charge0"], row["charge1"]
+            natoms0, natoms1 = row["natoms0"], row["natoms1"]
+            pos = np.array(list(map(float, row["xyz"].split()))).reshape(-1, 3)
+            pos0 = pos[:natoms0]
+            pos1 = pos[natoms0:]
+            
+            elements = row["elements"].split()
+            elements0 = np.array(elements[:natoms0])
+            elements1 = np.array(elements[natoms0:])
+
+            atomic_nums = np.expand_dims(np.array([atom_table.GetAtomicNumber(x) for x in elements]), axis=1)
+            atomic_nums0 = np.array(atomic_nums[:natoms0])
+            atomic_nums1 = np.array(atomic_nums[natoms0:])
+
+            charges = np.expand_dims(np.array([charge0] * natoms0 + [charge1] * natoms1), axis=1)
+
+            atomic_inputs = np.concatenate((atomic_nums, charges, pos), axis=-1, dtype=np.float32)
+            atomic_inputs0 = atomic_inputs[:natoms0, :]
+            atomic_inputs1 = atomic_inputs[natoms0:, :]
+
+            energies = np.array(row[self.energy_target_names].values).astype(np.float32)[None, :]
+
+            name = np.array([smiles0 + "." + smiles1])
+
+
+            subset = row["system_name"]
+
+            item = dict(
+                energies=energies,
+                subset=np.array([subset]),
+                n_atoms=np.array([natoms0 + natoms1], dtype=np.int32),
+                n_atoms_first=np.array([natoms0], dtype=np.int32),
+                atomic_inputs=atomic_inputs,
+                name=name,
+            )
+            data.append(item)
+        return data
diff --git a/src/openqdc/raws/config_factory.py b/src/openqdc/raws/config_factory.py
index 28f115e..5e01a91 100644
--- a/src/openqdc/raws/config_factory.py
+++ b/src/openqdc/raws/config_factory.py
@@ -144,6 +144,17 @@ class DataConfigFactory:
         dataset_name="transition1x",
         links={"Transition1x.h5": "https://figshare.com/ndownloader/files/36035789"},
     )
+
+    # l7 = dict(
+    #     dataset_name="l7",
+    #     links={"l7.zip": "http://www.begdb.org/moldown.php?id=40"}
+    # )
+
+    des_s66 = dict(
+        dataset_name="des_s66",
+        links={"DESS66.zip": "https://zenodo.org/records/5676284/files/DESS66.zip?download=1"},
+    )
+
     available_datasets = [k for k in locals().keys() if not k.startswith("__")]
 
     def __init__(self):

From 4d9a4bab09a5a32770e4190da40bef28bc0fd0c3 Mon Sep 17 00:00:00 2001
From: mcneela <daniel.mcneela@valencelabs.com>
Date: Tue, 5 Mar 2024 22:02:08 -0500
Subject: [PATCH 20/47] added DESS66x8 dataset

---
 src/openqdc/datasets/interaction/__init__.py |   2 +
 src/openqdc/datasets/interaction/dess66x8.py | 102 +++++++++++++++++++
 src/openqdc/raws/config_factory.py           |   5 +
 3 files changed, 109 insertions(+)
 create mode 100644 src/openqdc/datasets/interaction/dess66x8.py

diff --git a/src/openqdc/datasets/interaction/__init__.py b/src/openqdc/datasets/interaction/__init__.py
index c7cddb4..b848fc7 100644
--- a/src/openqdc/datasets/interaction/__init__.py
+++ b/src/openqdc/datasets/interaction/__init__.py
@@ -13,6 +13,7 @@
     "DES5M": "openqdc.datasets.interaction.des5m",
     "Metcalf": "openqdc.datasets.interaction.metcalf",
     "DESS66": "openqdc.datasets.interaction.dess66",
+    "DESS66x8": "openqdc.datasets.interaction.dess66x8",
 }
 
 _lazy_imports_mod = {}
@@ -51,4 +52,5 @@ def __dir__():
         "DES5M",
         "Metcalf",
         "DESS66",
+        "DESS66x8",
     ]
diff --git a/src/openqdc/datasets/interaction/dess66x8.py b/src/openqdc/datasets/interaction/dess66x8.py
new file mode 100644
index 0000000..80e3bad
--- /dev/null
+++ b/src/openqdc/datasets/interaction/dess66x8.py
@@ -0,0 +1,102 @@
+import os
+import numpy as np
+import pandas as pd
+
+from typing import Dict, List
+
+from tqdm import tqdm
+from rdkit import Chem
+from loguru import logger
+from openqdc.datasets.interaction import BaseInteractionDataset
+from openqdc.utils.molecule import atom_table, molecule_groups
+
+
+class DESS66x8(BaseInteractionDataset):
+    __name__ = "des_s66x8"
+    __energy_unit__ = "hartree"
+    __distance_unit__ = "ang"
+    __forces_unit__ = "hartree/ang"
+    __energy_methods__ = [
+        "mp2/cc-pvdz",
+        "mp2/cc-pvqz",
+        "mp2/cc-pvtz",
+        "mp2/cbs",
+        "ccsd(t)/cc-pvdz",
+        "ccsd(t)/cbs",  # cbs
+        "ccsd(t)/nn",  # nn
+        "sapt0/aug-cc-pwcvxz",
+        "sapt0/aug-cc-pwcvxz_es",
+        "sapt0/aug-cc-pwcvxz_ex",
+        "sapt0/aug-cc-pwcvxz_exs2",
+        "sapt0/aug-cc-pwcvxz_ind",
+        "sapt0/aug-cc-pwcvxz_exind",
+        "sapt0/aug-cc-pwcvxz_disp",
+        "sapt0/aug-cc-pwcvxz_exdisp_os",
+        "sapt0/aug-cc-pwcvxz_exdisp_ss",
+        "sapt0/aug-cc-pwcvxz_delta_HF",
+    ]
+
+    energy_target_names = [
+        "cc_MP2_all",
+        "qz_MP2_all",
+        "tz_MP2_all",
+        "cbs_MP2_all",
+        "cc_CCSD(T)_all",
+        "cbs_CCSD(T)_all",
+        "nn_CCSD(T)_all",
+        "sapt_all",
+        "sapt_es",
+        "sapt_ex",
+        "sapt_exs2",
+        "sapt_ind",
+        "sapt_exind",
+        "sapt_disp",
+        "sapt_exdisp_os",
+        "sapt_exdisp_ss",
+        "sapt_delta_HF",
+    ]
+
+    def read_raw_entries(self) -> List[Dict]:
+        self.filepath = os.path.join(self.root, "DESS66x8.csv")
+        logger.info(f"Reading DESS66x8 interaction data from {self.filepath}")
+        df = pd.read_csv(self.filepath)
+        data = []
+        for idx, row in tqdm(df.iterrows(), total=df.shape[0]):
+            smiles0, smiles1 = row["smiles0"], row["smiles1"]
+            charge0, charge1 = row["charge0"], row["charge1"]
+            natoms0, natoms1 = row["natoms0"], row["natoms1"]
+            pos = np.array(list(map(float, row["xyz"].split()))).reshape(-1, 3)
+            pos0 = pos[:natoms0]
+            pos1 = pos[natoms0:]
+            
+            elements = row["elements"].split()
+            elements0 = np.array(elements[:natoms0])
+            elements1 = np.array(elements[natoms0:])
+
+            atomic_nums = np.expand_dims(np.array([atom_table.GetAtomicNumber(x) for x in elements]), axis=1)
+            atomic_nums0 = np.array(atomic_nums[:natoms0])
+            atomic_nums1 = np.array(atomic_nums[natoms0:])
+
+            charges = np.expand_dims(np.array([charge0] * natoms0 + [charge1] * natoms1), axis=1)
+
+            atomic_inputs = np.concatenate((atomic_nums, charges, pos), axis=-1, dtype=np.float32)
+            atomic_inputs0 = atomic_inputs[:natoms0, :]
+            atomic_inputs1 = atomic_inputs[natoms0:, :]
+
+            energies = np.array(row[self.energy_target_names].values).astype(np.float32)[None, :]
+
+            name = np.array([smiles0 + "." + smiles1])
+
+
+            subset = row["system_name"]
+
+            item = dict(
+                energies=energies,
+                subset=np.array([subset]),
+                n_atoms=np.array([natoms0 + natoms1], dtype=np.int32),
+                n_atoms_first=np.array([natoms0], dtype=np.int32),
+                atomic_inputs=atomic_inputs,
+                name=name,
+            )
+            data.append(item)
+        return data
diff --git a/src/openqdc/raws/config_factory.py b/src/openqdc/raws/config_factory.py
index 5e01a91..312faf0 100644
--- a/src/openqdc/raws/config_factory.py
+++ b/src/openqdc/raws/config_factory.py
@@ -155,6 +155,11 @@ class DataConfigFactory:
         links={"DESS66.zip": "https://zenodo.org/records/5676284/files/DESS66.zip?download=1"},
     )
 
+    des_s66x8 = dict(
+        dataset_name="des_s66x8",
+        links={"DESS66x8.zip": "https://zenodo.org/records/5676284/files/DESS66x8.zip?download=1"}
+    )
+
     available_datasets = [k for k in locals().keys() if not k.startswith("__")]
 
     def __init__(self):

From c2229e38be0d76680155323250ef95d5ce78143d Mon Sep 17 00:00:00 2001
From: mcneela <daniel.mcneela@valencelabs.com>
Date: Wed, 6 Mar 2024 09:56:24 -0500
Subject: [PATCH 21/47] small update to __init__ file

---
 src/openqdc/datasets/interaction/__init__.py | 4 ++++
 src/openqdc/datasets/interaction/des370k.py  | 2 --
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/openqdc/datasets/interaction/__init__.py b/src/openqdc/datasets/interaction/__init__.py
index b848fc7..bf266c2 100644
--- a/src/openqdc/datasets/interaction/__init__.py
+++ b/src/openqdc/datasets/interaction/__init__.py
@@ -45,6 +45,10 @@ def __dir__():
 if TYPE_CHECKING or os.environ.get("OPENQDC_DISABLE_LAZY_LOADING", "0") == "1":
     from .base import BaseInteractionDataset
     from .des370k import DES370K
+    from .des5m import DES5M
+    from .metcalf import Metcalf
+    from .dess66 import DESS66
+    from .dess66x8 import DESS66x8
 
     __all__ = [
         "BaseInteractionDataset",
diff --git a/src/openqdc/datasets/interaction/des370k.py b/src/openqdc/datasets/interaction/des370k.py
index ed5b7fb..65a5835 100644
--- a/src/openqdc/datasets/interaction/des370k.py
+++ b/src/openqdc/datasets/interaction/des370k.py
@@ -86,8 +86,6 @@ def read_raw_entries(self) -> List[Dict]:
             energies = np.array(row[self.energy_target_names].values).astype(np.float32)[None, :]
 
             name = np.array([smiles0 + "." + smiles1])
-            canon_smiles0 = Chem.MolToSmiles(Chem.MolFromSmiles(smiles0))
-            canon_smiles1 = Chem.MolToSmiles(Chem.MolFromSmiles(smiles1))
 
             subsets = []
             # for smiles in [canon_smiles0, canon_smiles1]:

From 9349454ec47a76fc9bd07109189f6edfd1fc42fe Mon Sep 17 00:00:00 2001
From: mcneela <daniel.mcneela@valencelabs.com>
Date: Wed, 6 Mar 2024 10:52:40 -0500
Subject: [PATCH 22/47] added L7 dataset

---
 src/openqdc/datasets/interaction/L7.py | 73 ++++++++++++++++++++++++++
 1 file changed, 73 insertions(+)
 create mode 100644 src/openqdc/datasets/interaction/L7.py

diff --git a/src/openqdc/datasets/interaction/L7.py b/src/openqdc/datasets/interaction/L7.py
new file mode 100644
index 0000000..a0d657f
--- /dev/null
+++ b/src/openqdc/datasets/interaction/L7.py
@@ -0,0 +1,73 @@
+import os
+import numpy as np
+import pandas as pd
+
+from typing import Dict, List
+
+from tqdm import tqdm
+from rdkit import Chem
+from ruamel.yaml import YAML
+from loguru import logger
+from openqdc.datasets.interaction import BaseInteractionDataset
+from openqdc.utils.molecule import atom_table, molecule_groups
+
+
+class L7(BaseInteractionDataset):
+    __name__ = "L7"
+    __energy_unit__ = "hartree"
+    __distance_unit__ = "ang"
+    __forces_unit__ = "hartree/ang"
+    __energy_methods__ = [
+        "CSD(T) | QCISD(T)",
+        "DLPNO-CCSD(T)",
+        "MP2/CBS",
+        "MP2C/CBS",
+        "fixed",
+        "DLPNO-CCSD(T0)",
+        "LNO-CCSD(T)",
+        "FN-DMC",
+    ]
+
+    energy_target_names = []
+
+    def read_raw_entries(self) -> List[Dict]:
+        yaml_fpath = os.path.join(self.root, "l7.yaml")
+        logger.info(f"Reading L7 interaction data from {self.root}")
+        yaml_file = open(yaml_fpath, "r")
+        yaml = YAML()
+        data = []
+        data_dict = yaml.load(yaml_file)
+        charge0 = int(data_dict["description"]["global_setup"]["molecule_a"]["charge"])
+        charge1 = int(data_dict["description"]["global_setup"]["molecule_b"]["charge"])
+
+        for idx, item in enumerate(data_dict["items"]):
+            energies = []
+            name = np.array([item["shortname"]])
+            fname = item["geometry"].split(":")[1]
+            energies.append(item["reference_value"])
+            xyz_file = open(os.path.join(self.root, f"{fname}.xyz"), "r")
+            lines = list(map(lambda x: x.strip().split(), xyz_file.readlines()))
+            lines.pop(1) 
+            n_atoms = np.array([int(lines[0][0])], dtype=np.int32)
+            n_atoms_first = np.array([int(item["setup"]["molecule_a"]["selection"].split("-")[1])], dtype=np.int32)
+            subset = np.array([item["group"]])
+            energies += [float(val[idx]) for val in list(data_dict["alternative_reference"].values())]
+            energies = np.array([energies], dtype=np.float32)
+            pos = np.array(lines[1:])[:, 1:].astype(np.float32)
+            elems = np.array(lines[1:])[:, 0]
+            atomic_nums = np.expand_dims(np.array([atom_table.GetAtomicNumber(x) for x in elems]), axis=1)
+            natoms0 = n_atoms_first[0]
+            natoms1 = n_atoms[0] - natoms0
+            charges = np.expand_dims(np.array([charge0] * natoms0 + [charge1] * natoms1), axis=1)
+            atomic_inputs = np.concatenate((atomic_nums, charges, pos), axis=-1, dtype=np.float32)
+
+            item = dict(
+                energies=energies,
+                subset=subset,
+                n_atoms=n_atoms,
+                n_atoms_first=n_atoms_first,
+                atomic_inputs=atomic_inputs,
+                name=name,
+            )
+            data.append(item)
+        return data

From c3bdc640d7039d00350fb6fc26f0449b81e9133b Mon Sep 17 00:00:00 2001
From: mcneela <daniel.mcneela@valencelabs.com>
Date: Wed, 6 Mar 2024 11:10:52 -0500
Subject: [PATCH 23/47] added X40 dataset

---
 src/openqdc/datasets/interaction/X40.py | 70 +++++++++++++++++++++++++
 1 file changed, 70 insertions(+)
 create mode 100644 src/openqdc/datasets/interaction/X40.py

diff --git a/src/openqdc/datasets/interaction/X40.py b/src/openqdc/datasets/interaction/X40.py
new file mode 100644
index 0000000..f89e6dd
--- /dev/null
+++ b/src/openqdc/datasets/interaction/X40.py
@@ -0,0 +1,70 @@
+import os
+import numpy as np
+import pandas as pd
+
+from typing import Dict, List
+
+from tqdm import tqdm
+from rdkit import Chem
+from ruamel.yaml import YAML
+from loguru import logger
+from openqdc.datasets.interaction import BaseInteractionDataset
+from openqdc.utils.molecule import atom_table, molecule_groups
+
+
+class X40(BaseInteractionDataset):
+    __name__ = "X40"
+    __energy_unit__ = "hartree"
+    __distance_unit__ = "ang"
+    __forces_unit__ = "hartree/ang"
+    __energy_methods__ = [
+        "default",
+        "MP2/CBS",
+        "dCCSD(T)/haDZ",
+        "dCCSD(T)/haTZ",
+        "MP2.5/CBS(aDZ)",
+    ]
+
+    energy_target_names = []
+
+    def read_raw_entries(self) -> List[Dict]:
+        yaml_fpath = os.path.join(self.root, "x40.yaml")
+        logger.info(f"Reading X40 interaction data from {self.root}")
+        yaml_file = open(yaml_fpath, "r")
+        yaml = YAML()
+        data = []
+        data_dict = yaml.load(yaml_file)
+        charge0 = int(data_dict["description"]["global_setup"]["molecule_a"]["charge"])
+        charge1 = int(data_dict["description"]["global_setup"]["molecule_b"]["charge"])
+
+        for idx, item in enumerate(data_dict["items"]):
+            energies = []
+            name = np.array([item["shortname"]])
+            energies.append(float(item["reference_value"]))
+            xyz_file = open(os.path.join(self.root, f"{item['shortname']}.xyz"), "r")
+            lines = list(map(lambda x: x.strip().split(), xyz_file.readlines()))
+            setup = lines.pop(1) 
+            n_atoms = np.array([int(lines[0][0])], dtype=np.int32)
+            n_atoms_first = setup[0].split("-")[1]
+            n_atoms_first = np.array([int(n_atoms_first)], dtype=np.int32)
+            subset = np.array([item["group"]])
+            energies += [float(val[idx]) for val in list(data_dict["alternative_reference"].values())]
+            energies = np.array([energies], dtype=np.float32)
+            pos = np.array(lines[1:])[:, 1:].astype(np.float32)
+            elems = np.array(lines[1:])[:, 0]
+            atomic_nums = np.expand_dims(np.array([atom_table.GetAtomicNumber(x) for x in elems]), axis=1)
+            natoms0 = n_atoms_first[0]
+            natoms1 = n_atoms[0] - natoms0
+            charges = np.expand_dims(np.array([charge0] * natoms0 + [charge1] * natoms1), axis=1)
+            atomic_inputs = np.concatenate((atomic_nums, charges, pos), axis=-1, dtype=np.float32)
+
+            item = dict(
+                energies=energies,
+                subset=subset,
+                n_atoms=n_atoms,
+                n_atoms_first=n_atoms_first,
+                atomic_inputs=atomic_inputs,
+                name=name,
+            )
+            data.append(item)
+        return data

From 23c0739c177b56168d84039e95b12e257f60de7a Mon Sep 17 00:00:00 2001
From: mcneela <daniel.mcneela@valencelabs.com>
Date: Wed, 6 Mar 2024 11:13:28 -0500
Subject: [PATCH 24/47] add new datasets to __init__.py

---
 src/openqdc/datasets/interaction/__init__.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/openqdc/datasets/interaction/__init__.py b/src/openqdc/datasets/interaction/__init__.py
index bf266c2..3d6f17e 100644
--- a/src/openqdc/datasets/interaction/__init__.py
+++ b/src/openqdc/datasets/interaction/__init__.py
@@ -14,6 +14,8 @@
     "Metcalf": "openqdc.datasets.interaction.metcalf",
     "DESS66": "openqdc.datasets.interaction.dess66",
     "DESS66x8": "openqdc.datasets.interaction.dess66x8",
+    "L7": "openqdc.datasets.interaction.L7",
+    "X40": "openqdc.datasets.interaction.X40",
 }
 
 _lazy_imports_mod = {}
@@ -49,6 +51,8 @@ def __dir__():
     from .metcalf import Metcalf
     from .dess66 import DESS66
     from .dess66x8 import DESS66x8
+    from .L7 import L7
+    from .X40 import X40
 
     __all__ = [
         "BaseInteractionDataset",
@@ -57,4 +61,6 @@ def __dir__():
         "Metcalf",
         "DESS66",
         "DESS66x8",
+        "L7",
+        "X40",
     ]

From 74f87a6934c9ddcf847e80bda884ba764a934b8d Mon Sep 17 00:00:00 2001
From: mcneela <daniel.mcneela@valencelabs.com>
Date: Thu, 7 Mar 2024 10:35:00 -0500
Subject: [PATCH 25/47] added splinter dataset

---
 src/openqdc/datasets/interaction/__init__.py |   3 +
 src/openqdc/datasets/interaction/splinter.py | 107 +++++++++++++++++++
 2 files changed, 110 insertions(+)
 create mode 100644 src/openqdc/datasets/interaction/splinter.py

diff --git a/src/openqdc/datasets/interaction/__init__.py b/src/openqdc/datasets/interaction/__init__.py
index 3d6f17e..1906b7f 100644
--- a/src/openqdc/datasets/interaction/__init__.py
+++ b/src/openqdc/datasets/interaction/__init__.py
@@ -16,6 +16,7 @@
     "DESS66x8": "openqdc.datasets.interaction.dess66x8",
     "L7": "openqdc.datasets.interaction.L7",
     "X40": "openqdc.datasets.interaction.X40",
+    "Splinter": "openqdc.datasets.interaction.splinter",
 }
 
 _lazy_imports_mod = {}
@@ -53,6 +54,7 @@ def __dir__():
     from .dess66x8 import DESS66x8
     from .L7 import L7
     from .X40 import X40
+    from .splinter import Splinter
 
     __all__ = [
         "BaseInteractionDataset",
@@ -63,4 +65,5 @@ def __dir__():
         "DESS66x8",
         "L7",
         "X40",
+        "Splinter",
     ]
diff --git a/src/openqdc/datasets/interaction/splinter.py b/src/openqdc/datasets/interaction/splinter.py
new file mode 100644
index 0000000..ff39268
--- /dev/null
+++ b/src/openqdc/datasets/interaction/splinter.py
@@ -0,0 +1,107 @@
+import os
+import numpy as np
+import pandas as pd
+
+from typing import Dict, List
+
+from tqdm import tqdm
+from rdkit import Chem
+from loguru import logger
+from openqdc.datasets.interaction import BaseInteractionDataset
+from openqdc.utils.molecule import atom_table, molecule_groups
+
+
+class Splinter(BaseInteractionDataset):
+    __name__ = "splinter"
+    __energy_methods__ = [
+        "sapt0/jun-cc-pV(D+d)Z_unscaled",
+        "sapt0/jun-cc-pV(D+d)Z_es_unscaled",
+        "sapt0/jun-cc-pV(D+d)Z_ex_unscaled",
+        "sapt0/jun-cc-pV(D+d)Z_ind_unscaled",
+        "sapt0/jun-cc-pV(D+d)Z_disp_unscaled",
+        "sapt0/jun-cc-pV(D+d)Z_scaled",
+        "sapt0/jun-cc-pV(D+d)Z_es_scaled",
+        "sapt0/jun-cc-pV(D+d)Z_ex_scaled",
+        "sapt0/jun-cc-pV(D+d)Z_ind_scaled",
+        "sapt0/jun-cc-pV(D+d)Z_disp_scaled",
+        "sapt0/aug-cc-pV(D+d)Z_unscaled",
+        "sapt0/aug-cc-pV(D+d)Z_es_unscaled",
+        "sapt0/aug-cc-pV(D+d)Z_ex_unscaled",
+        "sapt0/aug-cc-pV(D+d)Z_ind_unscaled",
+        "sapt0/aug-cc-pV(D+d)Z_disp_unscaled",
+        "sapt0/aug-cc-pV(D+d)Z_scaled",
+        "sapt0/aug-cc-pV(D+d)Z_es_scaled",
+        "sapt0/aug-cc-pV(D+d)Z_ex_scaled",
+        "sapt0/aug-cc-pV(D+d)Z_ind_scaled",
+        "sapt0/aug-cc-pV(D+d)Z_disp_scaled",
+    ]
+
+    energy_target_names = []
+
+    def read_raw_entries(self) -> List[Dict]:
+        logger.info(f"Reading Splinter interaction data from {self.root}")
+        data = []
+        i = 0
+        for root, dirs, files in tqdm(os.walk(self.root), total=1709042): # total is currently an approximation
+            i += 1
+            if not files:
+                continue
+            for filename in files:
+                if not filename.endswith(".xyz"):
+                    continue
+                filepath = os.path.join(root, filename)
+                filein = open(filepath, "r")                
+                lines = list(map(lambda x: x.strip(), filein.readlines()))
+                n_atoms = np.array([int(lines[0])], dtype=np.int32)
+                metadata = lines[1].split(",")
+                try:
+                    (
+                        protein_monomer_name, 
+                        protein_interaction_site_type, 
+                        ligand_monomer_name, 
+                        ligand_interaction_site_type,
+                        index, r, theta_P, tau_P, theta_L,
+                        tau_L, tau_PL
+                    ) = metadata[0].split("_")
+                    index, r, theta_P, tau_P, theta_L, tau_L, tau_PL = list(map(float, [index, r, theta_P, tau_P, theta_L, tau_L, tau_PL]))
+                except:
+                        (protein_monomer_name, 
+                        protein_interaction_site_type, 
+                        ligand_monomer_name, 
+                        ligand_interaction_site_type,
+                        index, _) = metadata[0].split("_")
+                        r, theta_P, tau_P, theta_L, tau_L, tau_PL = [None] * 6
+                energies = np.array([list(map(float, metadata[4:-1]))]).astype(np.float32)
+                n_atoms_first = np.array([int(metadata[-1])], dtype=np.int32)
+                total_charge, charge0, charge1 = list(map(int, metadata[1:4]))
+                lines = list(map(lambda x: x.split(), lines[2:]))
+                pos = np.array(lines)[:, 1:].astype(np.float32)
+                elems = np.array(lines)[:, 0]
+                atomic_nums = np.expand_dims(np.array([atom_table.GetAtomicNumber(x) for x in elems]), axis=1)
+                natoms0 = n_atoms_first[0]
+                natoms1 = n_atoms[0] - natoms0
+                charges = np.expand_dims(np.array([charge0] * natoms0 + [charge1] * natoms1), axis=1)
+                atomic_inputs = np.concatenate((atomic_nums, charges, pos), axis=-1, dtype=np.float32)
+                subset = np.array([root.split("/")[-1]])
+
+                item = dict(
+                    energies=energies,
+                    subset=subset,
+                    n_atoms=n_atoms,
+                    n_atoms_first=n_atoms_first,
+                    atomic_inputs=atomic_inputs,
+                    protein_monomer_name= np.array([protein_monomer_name]),
+                    protein_interaction_site_type=np.array([protein_interaction_site_type]),
+                    ligand_monomer_name=np.array([ligand_monomer_name]),
+                    ligand_interaction_site_type=np.array([ligand_interaction_site_type]),
+                    index = np.array([index]),
+                    r=np.array([r]),
+                    theta_P=np.array([theta_P]),
+                    tau_P=np.array([tau_P]),
+                    theta_L=np.array([theta_L]),
+                    tau_L=np.array([tau_L]),
+                    tau_PL=np.array([tau_PL]),
+                )
+                data.append(item)
+        logger.info(f"Processed {i} files in total")
+        return data

From f046ea9b56076066b420eeedba12341039e42174 Mon Sep 17 00:00:00 2001
From: Danny McNeela <daniel.mcneela@valencelabs.com>
Date: Thu, 7 Mar 2024 16:29:31 +0000
Subject: [PATCH 26/47] splinter: track progress per .xyz file and add name field to entries

---
 src/openqdc/datasets/interaction/splinter.py | 121 ++++++++++---------
 1 file changed, 61 insertions(+), 60 deletions(-)

diff --git a/src/openqdc/datasets/interaction/splinter.py b/src/openqdc/datasets/interaction/splinter.py
index ff39268..7d7dbd0 100644
--- a/src/openqdc/datasets/interaction/splinter.py
+++ b/src/openqdc/datasets/interaction/splinter.py
@@ -42,66 +42,67 @@ def read_raw_entries(self) -> List[Dict]:
         logger.info(f"Reading Splinter interaction data from {self.root}")
         data = []
         i = 0
-        for root, dirs, files in tqdm(os.walk(self.root), total=1709042): # total is currently an approximation
-            i += 1
-            if not files:
-                continue
-            for filename in files:
-                if not filename.endswith(".xyz"):
-                    continue
-                filepath = os.path.join(root, filename)
-                filein = open(filepath, "r")                
-                lines = list(map(lambda x: x.strip(), filein.readlines()))
-                n_atoms = np.array([int(lines[0])], dtype=np.int32)
-                metadata = lines[1].split(",")
-                try:
-                    (
-                        protein_monomer_name, 
-                        protein_interaction_site_type, 
-                        ligand_monomer_name, 
-                        ligand_interaction_site_type,
-                        index, r, theta_P, tau_P, theta_L,
-                        tau_L, tau_PL
-                    ) = metadata[0].split("_")
-                    index, r, theta_P, tau_P, theta_L, tau_L, tau_PL = list(map(float, [index, r, theta_P, tau_P, theta_L, tau_L, tau_PL]))
-                except:
-                        (protein_monomer_name, 
-                        protein_interaction_site_type, 
-                        ligand_monomer_name, 
-                        ligand_interaction_site_type,
-                        index, _) = metadata[0].split("_")
-                        r, theta_P, tau_P, theta_L, tau_L, tau_PL = [None] * 6
-                energies = np.array([list(map(float, metadata[4:-1]))]).astype(np.float32)
-                n_atoms_first = np.array([int(metadata[-1])], dtype=np.int32)
-                total_charge, charge0, charge1 = list(map(int, metadata[1:4]))
-                lines = list(map(lambda x: x.split(), lines[2:]))
-                pos = np.array(lines)[:, 1:].astype(np.float32)
-                elems = np.array(lines)[:, 0]
-                atomic_nums = np.expand_dims(np.array([atom_table.GetAtomicNumber(x) for x in elems]), axis=1)
-                natoms0 = n_atoms_first[0]
-                natoms1 = n_atoms[0] - natoms0
-                charges = np.expand_dims(np.array([charge0] * natoms0 + [charge1] * natoms1), axis=1)
-                atomic_inputs = np.concatenate((atomic_nums, charges, pos), axis=-1, dtype=np.float32)
-                subset = np.array([root.split("/")[-1]])
+        with tqdm(total=1706343) as progress_bar:
+            for root, dirs, files in os.walk(self.root): # total is currently an approximation
+                for filename in files:
+                    if not filename.endswith(".xyz"):
+                        continue
+                    i += 1
+                    filepath = os.path.join(root, filename)
+                    filein = open(filepath, "r")                
+                    lines = list(map(lambda x: x.strip(), filein.readlines()))
+                    n_atoms = np.array([int(lines[0])], dtype=np.int32)
+                    metadata = lines[1].split(",")
+                    try:
+                        (
+                            protein_monomer_name, 
+                            protein_interaction_site_type, 
+                            ligand_monomer_name, 
+                            ligand_interaction_site_type,
+                            index, r, theta_P, tau_P, theta_L,
+                            tau_L, tau_PL
+                        ) = metadata[0].split("_")
+                        index, r, theta_P, tau_P, theta_L, tau_L, tau_PL = list(map(float, [index, r, theta_P, tau_P, theta_L, tau_L, tau_PL]))
+                    except:
+                            (protein_monomer_name, 
+                            protein_interaction_site_type, 
+                            ligand_monomer_name, 
+                            ligand_interaction_site_type,
+                            index, _) = metadata[0].split("_")
+                            r, theta_P, tau_P, theta_L, tau_L, tau_PL = [None] * 6
+                    energies = np.array([list(map(float, metadata[4:-1]))]).astype(np.float32)
+                    n_atoms_first = np.array([int(metadata[-1])], dtype=np.int32)
+                    total_charge, charge0, charge1 = list(map(int, metadata[1:4]))
+                    lines = list(map(lambda x: x.split(), lines[2:]))
+                    pos = np.array(lines)[:, 1:].astype(np.float32)
+                    elems = np.array(lines)[:, 0]
+                    atomic_nums = np.expand_dims(np.array([atom_table.GetAtomicNumber(x) for x in elems]), axis=1)
+                    natoms0 = n_atoms_first[0]
+                    natoms1 = n_atoms[0] - natoms0
+                    charges = np.expand_dims(np.array([charge0] * natoms0 + [charge1] * natoms1), axis=1)
+                    atomic_inputs = np.concatenate((atomic_nums, charges, pos), axis=-1, dtype=np.float32)
+                    subset = np.array([root.split("/")[-1]])
 
-                item = dict(
-                    energies=energies,
-                    subset=subset,
-                    n_atoms=n_atoms,
-                    n_atoms_first=n_atoms_first,
-                    atomic_inputs=atomic_inputs,
-                    protein_monomer_name= np.array([protein_monomer_name]),
-                    protein_interaction_site_type=np.array([protein_interaction_site_type]),
-                    ligand_monomer_name=np.array([ligand_monomer_name]),
-                    ligand_interaction_site_type=np.array([ligand_interaction_site_type]),
-                    index = np.array([index]),
-                    r=np.array([r]),
-                    theta_P=np.array([theta_P]),
-                    tau_P=np.array([tau_P]),
-                    theta_L=np.array([theta_L]),
-                    tau_L=np.array([tau_L]),
-                    tau_PL=np.array([tau_PL]),
-                )
-                data.append(item)
+                    item = dict(
+                        energies=energies,
+                        subset=subset,
+                        n_atoms=n_atoms,
+                        n_atoms_first=n_atoms_first,
+                        atomic_inputs=atomic_inputs,
+                        protein_monomer_name= np.array([protein_monomer_name]),
+                        protein_interaction_site_type=np.array([protein_interaction_site_type]),
+                        ligand_monomer_name=np.array([ligand_monomer_name]),
+                        ligand_interaction_site_type=np.array([ligand_interaction_site_type]),
+                        index = np.array([index]),
+                        r=np.array([r]),
+                        theta_P=np.array([theta_P]),
+                        tau_P=np.array([tau_P]),
+                        theta_L=np.array([theta_L]),
+                        tau_L=np.array([tau_L]),
+                        tau_PL=np.array([tau_PL]),
+                        name=np.array([protein_monomer_name + "." + ligand_monomer_name])
+                    )
+                    data.append(item)
+                    progress_bar.update(1)
         logger.info(f"Processed {i} files in total")
         return data

From 3c84ee910ecf1731ef51529fbdeed0103e3dcc71 Mon Sep 17 00:00:00 2001
From: Danny McNeela <daniel.mcneela@valencelabs.com>
Date: Thu, 7 Mar 2024 16:44:31 +0000
Subject: [PATCH 27/47] update default data shapes for interaction datasets

---
 src/openqdc/datasets/interaction/base.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/src/openqdc/datasets/interaction/base.py b/src/openqdc/datasets/interaction/base.py
index c73ce7c..8de065e 100644
--- a/src/openqdc/datasets/interaction/base.py
+++ b/src/openqdc/datasets/interaction/base.py
@@ -10,6 +10,9 @@
     set_cache_dir,
 )
 from openqdc.datasets.potential.base import BaseDataset
+from openqdc.utils.constants import (
+    NB_ATOMIC_FEATURES
+)
 
 from loguru import logger
 
@@ -43,3 +46,12 @@ def collate_list(self, list_entries: List[Dict]):
         res["position_idx_range"] = x
 
         return res
+
+    @property
+    def data_shapes(self):
+        return {
+            "atomic_inputs": (-1, NB_ATOMIC_FEATURES),
+            "position_idx_range": (-1, 2),
+            "energies": (-1, len(self.__energy_methods__)),
+            "forces": (-1, 3, len(self.force_target_names)),
+        }

From 04c81aea5c4745cfd36ef53c24a1e516fe2b14fc Mon Sep 17 00:00:00 2001
From: mcneela <daniel.mcneela@valencelabs.com>
Date: Thu, 7 Mar 2024 11:51:06 -0500
Subject: [PATCH 28/47] updated test_dummy.py with new import structure

---
 tests/test_dummy.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_dummy.py b/tests/test_dummy.py
index 65fe9b6..f82376c 100644
--- a/tests/test_dummy.py
+++ b/tests/test_dummy.py
@@ -1,6 +1,6 @@
 """Path hack to make tests work."""
 
-from openqdc.datasets.dummy import Dummy  # noqa: E402
+from openqdc.datasets.potential.dummy import Dummy  # noqa: E402
 from openqdc.utils.atomization_energies import (
     ISOLATED_ATOM_ENERGIES,
     IsolatedAtomEnergyFactory,

From 11e28581713ce2c584a6c7e8caa9ae7bb963babd Mon Sep 17 00:00:00 2001
From: mcneela <daniel.mcneela@valencelabs.com>
Date: Thu, 7 Mar 2024 11:53:02 -0500
Subject: [PATCH 29/47] fix test_import.py

---
 tests/test_import.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_import.py b/tests/test_import.py
index 0736e30..0d3344a 100644
--- a/tests/test_import.py
+++ b/tests/test_import.py
@@ -1,2 +1,2 @@
 def test_open_qdc():
-    import openQDC  # noqa
+    import openqdc # noqa

From 78f042338dd5c0e8dd8760dd1da7abd0d7bbf409 Mon Sep 17 00:00:00 2001
From: mcneela <daniel.mcneela@valencelabs.com>
Date: Fri, 8 Mar 2024 10:00:56 -0500
Subject: [PATCH 30/47] code cleanup for the linter

---
 src/openqdc/datasets/interaction/L7.py       | 13 ++---
 src/openqdc/datasets/interaction/X40.py      | 13 ++---
 src/openqdc/datasets/interaction/__init__.py |  6 +--
 src/openqdc/datasets/interaction/base.py     | 31 ++++-------
 src/openqdc/datasets/interaction/des370k.py  | 20 ++------
 src/openqdc/datasets/interaction/des5m.py    | 20 ++------
 src/openqdc/datasets/interaction/dess66.py   | 22 +++-----
 src/openqdc/datasets/interaction/dess66x8.py | 22 +++-----
 src/openqdc/datasets/interaction/metcalf.py  | 39 +++-----------
 src/openqdc/datasets/interaction/splinter.py | 54 +++++++++++---------
 10 files changed, 83 insertions(+), 157 deletions(-)

diff --git a/src/openqdc/datasets/interaction/L7.py b/src/openqdc/datasets/interaction/L7.py
index a0d657f..db6fa3c 100644
--- a/src/openqdc/datasets/interaction/L7.py
+++ b/src/openqdc/datasets/interaction/L7.py
@@ -1,15 +1,12 @@
 import os
-import numpy as np
-import pandas as pd
-
 from typing import Dict, List
 
-from tqdm import tqdm
-from rdkit import Chem
-from ruamel.yaml import YAML
+import numpy as np
 from loguru import logger
+from ruamel.yaml import YAML
+
 from openqdc.datasets.interaction import BaseInteractionDataset
-from openqdc.utils.molecule import atom_table, molecule_groups
+from openqdc.utils.molecule import atom_table
 
 
 class L7(BaseInteractionDataset):
@@ -47,7 +44,7 @@ def read_raw_entries(self) -> List[Dict]:
             energies.append(item["reference_value"])
             xyz_file = open(os.path.join(self.root, f"{fname}.xyz"), "r")
             lines = list(map(lambda x: x.strip().split(), xyz_file.readlines()))
-            lines.pop(1) 
+            lines.pop(1)
             n_atoms = np.array([int(lines[0][0])], dtype=np.int32)
             n_atoms_first = np.array([int(item["setup"]["molecule_a"]["selection"].split("-")[1])], dtype=np.int32)
             subset = np.array([item["group"]])
diff --git a/src/openqdc/datasets/interaction/X40.py b/src/openqdc/datasets/interaction/X40.py
index f89e6dd..826e455 100644
--- a/src/openqdc/datasets/interaction/X40.py
+++ b/src/openqdc/datasets/interaction/X40.py
@@ -1,15 +1,12 @@
 import os
-import numpy as np
-import pandas as pd
-
 from typing import Dict, List
 
-from tqdm import tqdm
-from rdkit import Chem
-from ruamel.yaml import YAML
+import numpy as np
 from loguru import logger
+from ruamel.yaml import YAML
+
 from openqdc.datasets.interaction import BaseInteractionDataset
-from openqdc.utils.molecule import atom_table, molecule_groups
+from openqdc.utils.molecule import atom_table
 
 
 class X40(BaseInteractionDataset):
@@ -43,7 +40,7 @@ def read_raw_entries(self) -> List[Dict]:
             energies.append(float(item["reference_value"]))
             xyz_file = open(os.path.join(self.root, f"{item['shortname']}.xyz"), "r")
             lines = list(map(lambda x: x.strip().split(), xyz_file.readlines()))
-            setup = lines.pop(1) 
+            setup = lines.pop(1)
             n_atoms = np.array([int(lines[0][0])], dtype=np.int32)
             n_atoms_first = setup[0].split("-")[1]
             n_atoms_first = np.array([int(n_atoms_first)], dtype=np.int32)
diff --git a/src/openqdc/datasets/interaction/__init__.py b/src/openqdc/datasets/interaction/__init__.py
index 1906b7f..f3cdfa0 100644
--- a/src/openqdc/datasets/interaction/__init__.py
+++ b/src/openqdc/datasets/interaction/__init__.py
@@ -47,14 +47,14 @@ def __dir__():
 
 if TYPE_CHECKING or os.environ.get("OPENQDC_DISABLE_LAZY_LOADING", "0") == "1":
     from .base import BaseInteractionDataset
-    from .des370k import DES370K
     from .des5m import DES5M
-    from .metcalf import Metcalf
+    from .des370k import DES370K
     from .dess66 import DESS66
     from .dess66x8 import DESS66x8
     from .L7 import L7
-    from .X40 import X40
+    from .metcalf import Metcalf
     from .splinter import Splinter
+    from .X40 import X40
 
     __all__ = [
         "BaseInteractionDataset",
diff --git a/src/openqdc/datasets/interaction/base.py b/src/openqdc/datasets/interaction/base.py
index 8de065e..c06c4bc 100644
--- a/src/openqdc/datasets/interaction/base.py
+++ b/src/openqdc/datasets/interaction/base.py
@@ -1,23 +1,11 @@
-from typing import Dict, List, Optional, Union
-from openqdc.utils.io import (
-    copy_exists,
-    dict_to_atoms,
-    get_local_cache,
-    load_hdf5_file,
-    load_pkl,
-    pull_locally,
-    push_remote,
-    set_cache_dir,
-)
-from openqdc.datasets.potential.base import BaseDataset
-from openqdc.utils.constants import (
-    NB_ATOMIC_FEATURES
-)
-
-from loguru import logger
+from typing import Dict, List, Optional
 
 import numpy as np
 
+from openqdc.datasets.potential.base import BaseDataset
+from openqdc.utils.constants import NB_ATOMIC_FEATURES
+
+
 class BaseInteractionDataset(BaseDataset):
     def __init__(
         self,
@@ -30,14 +18,17 @@ def __init__(
             energy_unit=energy_unit,
             distance_unit=distance_unit,
             overwrite_local_cache=overwrite_local_cache,
-            cache_dir=cache_dir
+            cache_dir=cache_dir,
         )
 
     def collate_list(self, list_entries: List[Dict]):
         # concatenate entries
         print(list_entries[0])
-        res = {key: np.concatenate([r[key] for r in list_entries if r is not None], axis=0) \
-               for key in list_entries[0] if not isinstance(list_entries[0][key], dict)}
+        res = {
+            key: np.concatenate([r[key] for r in list_entries if r is not None], axis=0)
+            for key in list_entries[0]
+            if not isinstance(list_entries[0][key], dict)
+        }
 
         csum = np.cumsum(res.get("n_atoms"))
         print(csum)
diff --git a/src/openqdc/datasets/interaction/des370k.py b/src/openqdc/datasets/interaction/des370k.py
index 65a5835..4b866e7 100644
--- a/src/openqdc/datasets/interaction/des370k.py
+++ b/src/openqdc/datasets/interaction/des370k.py
@@ -1,12 +1,11 @@
 import os
-import numpy as np
-import pandas as pd
-
 from typing import Dict, List
 
-from tqdm import tqdm
-from rdkit import Chem
+import numpy as np
+import pandas as pd
 from loguru import logger
+from tqdm import tqdm
+
 from openqdc.datasets.interaction import BaseInteractionDataset
 from openqdc.utils.molecule import atom_table, molecule_groups
 
@@ -66,22 +65,14 @@ def read_raw_entries(self) -> List[Dict]:
             charge0, charge1 = row["charge0"], row["charge1"]
             natoms0, natoms1 = row["natoms0"], row["natoms1"]
             pos = np.array(list(map(float, row["xyz"].split()))).reshape(-1, 3)
-            pos0 = pos[:natoms0]
-            pos1 = pos[natoms0:]
-            
+
             elements = row["elements"].split()
-            elements0 = np.array(elements[:natoms0])
-            elements1 = np.array(elements[natoms0:])
 
             atomic_nums = np.expand_dims(np.array([atom_table.GetAtomicNumber(x) for x in elements]), axis=1)
-            atomic_nums0 = np.array(atomic_nums[:natoms0])
-            atomic_nums1 = np.array(atomic_nums[natoms0:])
 
             charges = np.expand_dims(np.array([charge0] * natoms0 + [charge1] * natoms1), axis=1)
 
             atomic_inputs = np.concatenate((atomic_nums, charges, pos), axis=-1, dtype=np.float32)
-            atomic_inputs0 = atomic_inputs[:natoms0, :]
-            atomic_inputs1 = atomic_inputs[natoms0:, :]
 
             energies = np.array(row[self.energy_target_names].values).astype(np.float32)[None, :]
 
@@ -98,7 +89,6 @@ def read_raw_entries(self) -> List[Dict]:
                 if not found:
                     logger.info(f"molecule group lookup failed for {smiles}")
 
-
             item = dict(
                 energies=energies,
                 subset=np.array([subsets]),
diff --git a/src/openqdc/datasets/interaction/des5m.py b/src/openqdc/datasets/interaction/des5m.py
index e79d690..256df0b 100644
--- a/src/openqdc/datasets/interaction/des5m.py
+++ b/src/openqdc/datasets/interaction/des5m.py
@@ -1,12 +1,11 @@
 import os
-import numpy as np
-import pandas as pd
-
 from typing import Dict, List
 
-from tqdm import tqdm
-from rdkit import Chem
+import numpy as np
+import pandas as pd
 from loguru import logger
+from tqdm import tqdm
+
 from openqdc.datasets.interaction import DES370K
 from openqdc.utils.molecule import atom_table, molecule_groups
 
@@ -57,22 +56,14 @@ def read_raw_entries(self) -> List[Dict]:
             charge0, charge1 = row["charge0"], row["charge1"]
             natoms0, natoms1 = row["natoms0"], row["natoms1"]
             pos = np.array(list(map(float, row["xyz"].split()))).reshape(-1, 3)
-            pos0 = pos[:natoms0]
-            pos1 = pos[natoms0:]
-            
+
             elements = row["elements"].split()
-            elements0 = np.array(elements[:natoms0])
-            elements1 = np.array(elements[natoms0:])
 
             atomic_nums = np.expand_dims(np.array([atom_table.GetAtomicNumber(x) for x in elements]), axis=1)
-            atomic_nums0 = np.array(atomic_nums[:natoms0])
-            atomic_nums1 = np.array(atomic_nums[natoms0:])
 
             charges = np.expand_dims(np.array([charge0] * natoms0 + [charge1] * natoms1), axis=1)
 
             atomic_inputs = np.concatenate((atomic_nums, charges, pos), axis=-1, dtype=np.float32)
-            atomic_inputs0 = atomic_inputs[:natoms0, :]
-            atomic_inputs1 = atomic_inputs[natoms0:, :]
 
             energies = np.array(row[self.energy_target_names].values).astype(np.float32)[None, :]
 
@@ -89,7 +80,6 @@ def read_raw_entries(self) -> List[Dict]:
                 if not found:
                     logger.info(f"molecule group lookup failed for {smiles}")
 
-
             item = dict(
                 energies=energies,
                 subset=np.array([subsets]),
diff --git a/src/openqdc/datasets/interaction/dess66.py b/src/openqdc/datasets/interaction/dess66.py
index 13f5b9c..a0d361b 100644
--- a/src/openqdc/datasets/interaction/dess66.py
+++ b/src/openqdc/datasets/interaction/dess66.py
@@ -1,14 +1,13 @@
 import os
-import numpy as np
-import pandas as pd
-
 from typing import Dict, List
 
-from tqdm import tqdm
-from rdkit import Chem
+import numpy as np
+import pandas as pd
 from loguru import logger
+from tqdm import tqdm
+
 from openqdc.datasets.interaction import BaseInteractionDataset
-from openqdc.utils.molecule import atom_table, molecule_groups
+from openqdc.utils.molecule import atom_table
 
 
 class DESS66(BaseInteractionDataset):
@@ -66,28 +65,19 @@ def read_raw_entries(self) -> List[Dict]:
             charge0, charge1 = row["charge0"], row["charge1"]
             natoms0, natoms1 = row["natoms0"], row["natoms1"]
             pos = np.array(list(map(float, row["xyz"].split()))).reshape(-1, 3)
-            pos0 = pos[:natoms0]
-            pos1 = pos[natoms0:]
-            
+
             elements = row["elements"].split()
-            elements0 = np.array(elements[:natoms0])
-            elements1 = np.array(elements[natoms0:])
 
             atomic_nums = np.expand_dims(np.array([atom_table.GetAtomicNumber(x) for x in elements]), axis=1)
-            atomic_nums0 = np.array(atomic_nums[:natoms0])
-            atomic_nums1 = np.array(atomic_nums[natoms0:])
 
             charges = np.expand_dims(np.array([charge0] * natoms0 + [charge1] * natoms1), axis=1)
 
             atomic_inputs = np.concatenate((atomic_nums, charges, pos), axis=-1, dtype=np.float32)
-            atomic_inputs0 = atomic_inputs[:natoms0, :]
-            atomic_inputs1 = atomic_inputs[natoms0:, :]
 
             energies = np.array(row[self.energy_target_names].values).astype(np.float32)[None, :]
 
             name = np.array([smiles0 + "." + smiles1])
 
-
             subset = row["system_name"]
 
             item = dict(
diff --git a/src/openqdc/datasets/interaction/dess66x8.py b/src/openqdc/datasets/interaction/dess66x8.py
index 80e3bad..575d601 100644
--- a/src/openqdc/datasets/interaction/dess66x8.py
+++ b/src/openqdc/datasets/interaction/dess66x8.py
@@ -1,14 +1,13 @@
 import os
-import numpy as np
-import pandas as pd
-
 from typing import Dict, List
 
-from tqdm import tqdm
-from rdkit import Chem
+import numpy as np
+import pandas as pd
 from loguru import logger
+from tqdm import tqdm
+
 from openqdc.datasets.interaction import BaseInteractionDataset
-from openqdc.utils.molecule import atom_table, molecule_groups
+from openqdc.utils.molecule import atom_table
 
 
 class DESS66x8(BaseInteractionDataset):
@@ -66,28 +65,19 @@ def read_raw_entries(self) -> List[Dict]:
             charge0, charge1 = row["charge0"], row["charge1"]
             natoms0, natoms1 = row["natoms0"], row["natoms1"]
             pos = np.array(list(map(float, row["xyz"].split()))).reshape(-1, 3)
-            pos0 = pos[:natoms0]
-            pos1 = pos[natoms0:]
-            
+
             elements = row["elements"].split()
-            elements0 = np.array(elements[:natoms0])
-            elements1 = np.array(elements[natoms0:])
 
             atomic_nums = np.expand_dims(np.array([atom_table.GetAtomicNumber(x) for x in elements]), axis=1)
-            atomic_nums0 = np.array(atomic_nums[:natoms0])
-            atomic_nums1 = np.array(atomic_nums[natoms0:])
 
             charges = np.expand_dims(np.array([charge0] * natoms0 + [charge1] * natoms1), axis=1)
 
             atomic_inputs = np.concatenate((atomic_nums, charges, pos), axis=-1, dtype=np.float32)
-            atomic_inputs0 = atomic_inputs[:natoms0, :]
-            atomic_inputs1 = atomic_inputs[natoms0:, :]
 
             energies = np.array(row[self.energy_target_names].values).astype(np.float32)[None, :]
 
             name = np.array([smiles0 + "." + smiles1])
 
-
             subset = row["system_name"]
 
             item = dict(
diff --git a/src/openqdc/datasets/interaction/metcalf.py b/src/openqdc/datasets/interaction/metcalf.py
index 3a8e714..3553d90 100644
--- a/src/openqdc/datasets/interaction/metcalf.py
+++ b/src/openqdc/datasets/interaction/metcalf.py
@@ -1,22 +1,18 @@
 import os
-import numpy as np
-
 from typing import Dict, List
 
-from tqdm import tqdm
-from rdkit import Chem
-from loguru import logger
+import numpy as np
+
 from openqdc.datasets.interaction import BaseInteractionDataset
 from openqdc.utils.molecule import atom_table
 
+
 class Metcalf(BaseInteractionDataset):
     __name__ = "metcalf"
     __energy_unit__ = "hartree"
     __distance_unit__ = "ang"
     __forces_unit__ = None
-    __energy_methods__ = [
-        "SAPT0/jun-cc-pVDZ"
-    ]
+    __energy_methods__ = ["SAPT0/jun-cc-pVDZ"]
     energy_target_names = [
         "total energy",
         "electrostatic energy",
@@ -39,7 +35,7 @@ def read_raw_entries(self) -> List[Dict]:
             xyz_dir = os.path.join(self.root, dirname)
             if not os.path.isdir(xyz_dir):
                 continue
-            subset = np.array([dirname.split("-")[0].lower()]) # training, validation, or test
+            subset = np.array([dirname.split("-")[0].lower()])  # training, validation, or test
             for filename in os.listdir(xyz_dir):
                 if not filename.endswith(".xyz"):
                     continue
@@ -47,45 +43,22 @@ def read_raw_entries(self) -> List[Dict]:
                 line_two = lines[1].split(",")
                 energies = np.array([line_two[1:6]], dtype=np.float32)
                 num_atoms = np.array([int(lines[0])])
-                num_atoms0 = int(line_two[-1])
-                num_atoms1 = num_atoms[0] - num_atoms0
 
                 elem_xyz = np.array([x.split() for x in lines[2:]])
-                # elements = np.expand_dims(elem_xyz[:, 0], axis=0)
                 elements = elem_xyz[:, 0]
                 xyz = elem_xyz[:, 1:].astype(np.float32)
-                # xyz0_fname = os.path.join(xyz_dir, f"{filename}_0_tmp.xyz")
-                # with open(xyz0_fname, "w") as xyz_0_file:
-                #     lines_to_write = [str(num_atoms0) + "\n"] + ["charge=0=\n"] + list(map(lambda x: " ".join(x) + "\n", elem_xyz[:num_atoms0].tolist()))
-                #     lines_to_write[-1] = lines_to_write[-1][:-1]
-                #     print(lines_to_write)
-                #     xyz_0_file.writelines(lines_to_write) # writelines doesn't actually add a newline to each string (weird)
-
-                # xyz1_fname = os.path.join(xyz_dir, f"{filename}_1_tmp.xyz")
-                # with open(xyz1_fname, "w") as xyz_1_file:
-                #     lines_to_write = [str(num_atoms1) + "\n"] + ["charge=0=\n"] + list(map(lambda x: " ".join(x) + "\n", elem_xyz[num_atoms0:].tolist()))
-                #     lines_to_write[-1] = lines_to_write[-1][:-1]
-                #     xyz_1_file.writelines(lines_to_write)
-
-                # smiles0 = Chem.MolToSmiles(Chem.MolFromXYZFile(xyz0_fname))
-                # smiles1 = Chem.MolToSmiles(Chem.MolFromXYZFile(xyz1_fname))
-                #            
                 atomic_nums = np.expand_dims(np.array([atom_table.GetAtomicNumber(x) for x in elements]), axis=1)
                 charges = np.expand_dims(np.array([0] * num_atoms[0]), axis=1)
 
                 atomic_inputs = np.concatenate((atomic_nums, charges, xyz), axis=-1, dtype=np.float32)
 
                 item = dict(
-                    # elements=elements,
                     n_atoms=num_atoms,
                     subset=subset,
                     energies=energies,
                     positions=xyz,
                     atomic_inputs=atomic_inputs,
-                    name=np.array([""])
+                    name=np.array([""]),
                 )
                 data.append(item)
         return data
-
-
-
diff --git a/src/openqdc/datasets/interaction/splinter.py b/src/openqdc/datasets/interaction/splinter.py
index 7d7dbd0..2841cf7 100644
--- a/src/openqdc/datasets/interaction/splinter.py
+++ b/src/openqdc/datasets/interaction/splinter.py
@@ -1,14 +1,12 @@
 import os
-import numpy as np
-import pandas as pd
-
 from typing import Dict, List
 
-from tqdm import tqdm
-from rdkit import Chem
+import numpy as np
 from loguru import logger
+from tqdm import tqdm
+
 from openqdc.datasets.interaction import BaseInteractionDataset
-from openqdc.utils.molecule import atom_table, molecule_groups
+from openqdc.utils.molecule import atom_table
 
 
 class Splinter(BaseInteractionDataset):
@@ -43,33 +41,43 @@ def read_raw_entries(self) -> List[Dict]:
         data = []
         i = 0
         with tqdm(total=1706343) as progress_bar:
-            for root, dirs, files in os.walk(self.root): # total is currently an approximation
+            for root, dirs, files in os.walk(self.root):  # total is currently an approximation
                 for filename in files:
                     if not filename.endswith(".xyz"):
                         continue
                     i += 1
                     filepath = os.path.join(root, filename)
-                    filein = open(filepath, "r")                
+                    filein = open(filepath, "r")
                     lines = list(map(lambda x: x.strip(), filein.readlines()))
                     n_atoms = np.array([int(lines[0])], dtype=np.int32)
                     metadata = lines[1].split(",")
                     try:
                         (
-                            protein_monomer_name, 
-                            protein_interaction_site_type, 
-                            ligand_monomer_name, 
+                            protein_monomer_name,
+                            protein_interaction_site_type,
+                            ligand_monomer_name,
                             ligand_interaction_site_type,
-                            index, r, theta_P, tau_P, theta_L,
-                            tau_L, tau_PL
+                            index,
+                            r,
+                            theta_P,
+                            tau_P,
+                            theta_L,
+                            tau_L,
+                            tau_PL,
                         ) = metadata[0].split("_")
-                        index, r, theta_P, tau_P, theta_L, tau_L, tau_PL = list(map(float, [index, r, theta_P, tau_P, theta_L, tau_L, tau_PL]))
-                    except:
-                            (protein_monomer_name, 
-                            protein_interaction_site_type, 
-                            ligand_monomer_name, 
+                        index, r, theta_P, tau_P, theta_L, tau_L, tau_PL = list(
+                            map(float, [index, r, theta_P, tau_P, theta_L, tau_L, tau_PL])
+                        )
+                    except ValueError:
+                        (
+                            protein_monomer_name,
+                            protein_interaction_site_type,
+                            ligand_monomer_name,
                             ligand_interaction_site_type,
-                            index, _) = metadata[0].split("_")
-                            r, theta_P, tau_P, theta_L, tau_L, tau_PL = [None] * 6
+                            index,
+                            _,
+                        ) = metadata[0].split("_")
+                        r, theta_P, tau_P, theta_L, tau_L, tau_PL = [None] * 6
                     energies = np.array([list(map(float, metadata[4:-1]))]).astype(np.float32)
                     n_atoms_first = np.array([int(metadata[-1])], dtype=np.int32)
                     total_charge, charge0, charge1 = list(map(int, metadata[1:4]))
@@ -89,18 +97,18 @@ def read_raw_entries(self) -> List[Dict]:
                         n_atoms=n_atoms,
                         n_atoms_first=n_atoms_first,
                         atomic_inputs=atomic_inputs,
-                        protein_monomer_name= np.array([protein_monomer_name]),
+                        protein_monomer_name=np.array([protein_monomer_name]),
                         protein_interaction_site_type=np.array([protein_interaction_site_type]),
                         ligand_monomer_name=np.array([ligand_monomer_name]),
                         ligand_interaction_site_type=np.array([ligand_interaction_site_type]),
-                        index = np.array([index]),
+                        index=np.array([index]),
                         r=np.array([r]),
                         theta_P=np.array([theta_P]),
                         tau_P=np.array([tau_P]),
                         theta_L=np.array([theta_L]),
                         tau_L=np.array([tau_L]),
                         tau_PL=np.array([tau_PL]),
-                        name=np.array([protein_monomer_name + "." + ligand_monomer_name])
+                        name=np.array([protein_monomer_name + "." + ligand_monomer_name]),
                     )
                     data.append(item)
                     progress_bar.update(1)

From bd58fdf627b272e0e3fe7aac669c4f1a209a8555 Mon Sep 17 00:00:00 2001
From: mcneela <daniel.mcneela@valencelabs.com>
Date: Fri, 8 Mar 2024 10:18:40 -0500
Subject: [PATCH 31/47] fix ani import

---
 src/openqdc/datasets/potential/ani.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/openqdc/datasets/potential/ani.py b/src/openqdc/datasets/potential/ani.py
index 3f1b92b..34ab904 100644
--- a/src/openqdc/datasets/potential/ani.py
+++ b/src/openqdc/datasets/potential/ani.py
@@ -1,7 +1,7 @@
 import os
 from os.path import join as p_join
 
-from openqdc.datasets.base import BaseDataset, read_qc_archive_h5
+from openqdc.datasets.potential.base import BaseDataset, read_qc_archive_h5
 from openqdc.utils.io import get_local_cache
 
 

From 4bc3a49268baab66fac7f93037fb07bf288940d1 Mon Sep 17 00:00:00 2001
From: mcneela <daniel.mcneela@valencelabs.com>
Date: Fri, 8 Mar 2024 10:40:50 -0500
Subject: [PATCH 32/47] fix base dataset import

---
 openqdc/datasets/interaction/base.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/openqdc/datasets/interaction/base.py b/openqdc/datasets/interaction/base.py
index c06c4bc..71c8e84 100644
--- a/openqdc/datasets/interaction/base.py
+++ b/openqdc/datasets/interaction/base.py
@@ -2,7 +2,7 @@
 
 import numpy as np
 
-from openqdc.datasets.potential.base import BaseDataset
+from openqdc.datasets.base import BaseDataset
 from openqdc.utils.constants import NB_ATOMIC_FEATURES
 
 

From b046eeac3d52f79549c8bb99c924ae883c9dc8ff Mon Sep 17 00:00:00 2001
From: mcneela <daniel.mcneela@valencelabs.com>
Date: Fri, 8 Mar 2024 10:42:33 -0500
Subject: [PATCH 33/47] black formatting

---
 openqdc/raws/config_factory.py |   6 +-
 openqdc/utils/molecule.py      | 403 ++++++++++++++++++++++++++++++---
 tests/test_import.py           |   2 +-
 3 files changed, 379 insertions(+), 32 deletions(-)

diff --git a/openqdc/raws/config_factory.py b/openqdc/raws/config_factory.py
index 312faf0..c9530cf 100644
--- a/openqdc/raws/config_factory.py
+++ b/openqdc/raws/config_factory.py
@@ -94,14 +94,14 @@ class DataConfigFactory:
         dataset_name="des370k_interaction",
         links={
             "DES370K.zip": "https://zenodo.org/record/5676266/files/DES370K.zip",
-        }
+        },
     )
 
     des5m_interaction = dict(
         dataset_name="des5m_interaction",
         links={
             "DES5M.zip": "https://zenodo.org/records/5706002/files/DESS5M.zip?download=1",
-        }
+        },
     )
 
     tmqm = dict(
@@ -157,7 +157,7 @@ class DataConfigFactory:
 
     des_s66x8 = dict(
         dataset_name="des_s66x8",
-        links={"DESS66x8.zip": "https://zenodo.org/records/5676284/files/DESS66x8.zip?download=1"}
+        links={"DESS66x8.zip": "https://zenodo.org/records/5676284/files/DESS66x8.zip?download=1"},
     )
 
     available_datasets = [k for k in locals().keys() if not k.startswith("__")]
diff --git a/openqdc/utils/molecule.py b/openqdc/utils/molecule.py
index 47f9ded..6b48cc5 100644
--- a/openqdc/utils/molecule.py
+++ b/openqdc/utils/molecule.py
@@ -14,48 +14,395 @@
 molecule_groups = {
     "acids": set(["CCC(=O)O", "CC(=O)O", "OC=O", "OC(=O)CC(=O)O"]),
     "alcohols": set(["CCCO", "CCC(O)C", "CCO", "CC(O)C", "CO", "OC1CCCC1", "OC1CCCCC1", "OCCCCO", "OCCCO", "OCCO"]),
-    "alkanes": set(["C1CCCC1", "C1CCCCC1", "C", "CC1CCCC1", "CC1CCCCC1", "CC", "CCC", "CC(C)C", "CCCC", "CC(C)(C)C", "CCC(C)C", "CCCCC", "CCC(C)(C)C", "CCCCCC"]),
-    "alkenes": set(["C=C", "CC=C", "CC=CC", "CC(=C)C", "CCC=C", "CC=C(C)C", "CCC=CC", "CCC(=C)C", "CC(=C(C)C)C", "CCC=C(C)C", "CCC(=CC)C", "CCC(=C(C)C)C"]),
-    "amides": set(["CCCNC=O", "CCC(=O)N", "CCC(=O)NC", "CCC(=O)N(C)C", "CCC(=O)N(CC)C", "CCNC=O", "CCNC(=O)C", "CCN(C=O)CC", "CCN(C(=O)C)C", "CCNC(=O)CC", \
-                   "CCN(C(=O)C)CC", "CC(=O)N", "CC(=O)N(C)C", "CNC=O", "CNC(=O)C", "CN(C=O)CC", "CNC(=O)CC(=O)N", "CNC(=O)CC(=O)NC", \
-                   "CNC(=O)CNC=O", "CNC(=O)CNC(=O)C", "CNC(=O)C(NC(=O)C)C", "NC=O", "NC(=O)CC(=O)N", "O=CN(C)C", "O=CNCCC(=O)N", "O=CNCCC(=O)NC", "O=CNCCNC=O", "O=CNCC(=O)N"]),
-    "amines": set(["C1CCCN1", "C1CCCNC1", "CCCN", "CCCNC", "CCCN(C)C", "CCN", "CCN(C)C", "CCNCC", "CCN(CC)C", "CN", "CNC", "CN(C)C", "CNCC", "CNCCCN", "CNCCCNC", "CNCCN", "CNCCNC", "N", "NCCCN", "NCCN"]),
+    "alkanes": set(
+        [
+            "C1CCCC1",
+            "C1CCCCC1",
+            "C",
+            "CC1CCCC1",
+            "CC1CCCCC1",
+            "CC",
+            "CCC",
+            "CC(C)C",
+            "CCCC",
+            "CC(C)(C)C",
+            "CCC(C)C",
+            "CCCCC",
+            "CCC(C)(C)C",
+            "CCCCCC",
+        ]
+    ),
+    "alkenes": set(
+        [
+            "C=C",
+            "CC=C",
+            "CC=CC",
+            "CC(=C)C",
+            "CCC=C",
+            "CC=C(C)C",
+            "CCC=CC",
+            "CCC(=C)C",
+            "CC(=C(C)C)C",
+            "CCC=C(C)C",
+            "CCC(=CC)C",
+            "CCC(=C(C)C)C",
+        ]
+    ),
+    "amides": set(
+        [
+            "CCCNC=O",
+            "CCC(=O)N",
+            "CCC(=O)NC",
+            "CCC(=O)N(C)C",
+            "CCC(=O)N(CC)C",
+            "CCNC=O",
+            "CCNC(=O)C",
+            "CCN(C=O)CC",
+            "CCN(C(=O)C)C",
+            "CCNC(=O)CC",
+            "CCN(C(=O)C)CC",
+            "CC(=O)N",
+            "CC(=O)N(C)C",
+            "CNC=O",
+            "CNC(=O)C",
+            "CN(C=O)CC",
+            "CNC(=O)CC(=O)N",
+            "CNC(=O)CC(=O)NC",
+            "CNC(=O)CNC=O",
+            "CNC(=O)CNC(=O)C",
+            "CNC(=O)C(NC(=O)C)C",
+            "NC=O",
+            "NC(=O)CC(=O)N",
+            "O=CN(C)C",
+            "O=CNCCC(=O)N",
+            "O=CNCCC(=O)NC",
+            "O=CNCCNC=O",
+            "O=CNCC(=O)N",
+        ]
+    ),
+    "amines": set(
+        [
+            "C1CCCN1",
+            "C1CCCNC1",
+            "CCCN",
+            "CCCNC",
+            "CCCN(C)C",
+            "CCN",
+            "CCN(C)C",
+            "CCNCC",
+            "CCN(CC)C",
+            "CN",
+            "CNC",
+            "CN(C)C",
+            "CNCC",
+            "CNCCCN",
+            "CNCCCNC",
+            "CNCCN",
+            "CNCCNC",
+            "N",
+            "NCCCN",
+            "NCCN",
+        ]
+    ),
     "ammoniums": set(["CC[NH3+]", "C[N+](C)(C)C", "C[NH2+]C", "C[NH3+]", "C[NH+](C)C", "[NH4+]"]),
     "benzene": set(["c1ccccc1", "Cc1ccccc1", "CCc1ccccc1"]),
     "carboxylates": set(["[O-]C=O", "[O-]C(=O)C", "[O-]C(=O)CC"]),
-    "esters": set(["CCCOC=O", "CCC(=O)OC", "CCOC(=O)CC", "CCOC(=O)C", "CCOC=O", "COC(=O)C", "COC=O", "O=COCCCOC=O", "O=COCCOC=O", "O=COCOC=O"]),
-    "ethers": set(["C1CCCO1", "C1CCCOC1", "C1CCOCO1", "C1OCCO1", "CCCOC", "CCCOCOC", "CCOCC", "COCCCOC", "COCC", "COCCOC", "COC", "COCOCC", "COCOC", "O1CCOCC1", "O1COCOC1"]),
+    "esters": set(
+        [
+            "CCCOC=O",
+            "CCC(=O)OC",
+            "CCOC(=O)CC",
+            "CCOC(=O)C",
+            "CCOC=O",
+            "COC(=O)C",
+            "COC=O",
+            "O=COCCCOC=O",
+            "O=COCCOC=O",
+            "O=COCOC=O",
+        ]
+    ),
+    "ethers": set(
+        [
+            "C1CCCO1",
+            "C1CCCOC1",
+            "C1CCOCO1",
+            "C1OCCO1",
+            "CCCOC",
+            "CCCOCOC",
+            "CCOCC",
+            "COCCCOC",
+            "COCC",
+            "COCCOC",
+            "COC",
+            "COCOCC",
+            "COCOC",
+            "O1CCOCC1",
+            "O1COCOC1",
+        ]
+    ),
     "guanidiums": set(["CCNC(=[NH2+])N", "CNC(=[NH2+])N", "NC(=[NH2+])N"]),
     "imidazolium": set(["c1[nH]cc[nH+]1", "Cc1c[nH]c[nH+]1", "CCc1c[nH]c[nH+]1"]),
     "ketones": set(["CCC(=O)CC", "CCC(=O)C", "CCC=O", "CC(=O)C", "CC=O", "C=O"]),
-    "monoatomics": set(["[Ar]", "[Br-]", "[Ca+2]", "[Cl-]", "[F-]", "[He]", "[I-]", "[K+]", "[Kr]", "[Li+]", "[Mg+2]", "[Na+]", "[Ne]", "[Xe]"]),
-    "other": set(["Brc1ccc(cc1)Br", "Brc1ccccc1", "BrC(Br)Br", "BrCBr", "BrCCBr", "CBr", "CC(Br)Br", "CCBr", "CCCC#CC", "CCCC(Cl)(Cl)Cl", "CCCC(Cl)Cl", \
-                  "CCCCCl", "CCC#CC", "CCCC#C", "CCCC(F)(F)F", "CCCC(F)F", "CCCCF", "CCC(Cl)(Cl)Cl", "CCC(Cl)Cl", "CCCCl", "CCCC#N", "CC#CC", "CCC#C", "CCC(F)(F)F", \
-                  "CCC(F)F", "CCCF", "CC(Cl)(Cl)Cl", "CC(Cl)Cl", "CCCl", "CCC#N", "CC#C", "CC(F)(F)F", "CC(F)F", "CCF", "CC(I)I", "CCI", "CCl", "CC#N", "CCOP(=O)(OC)OC", \
-                  "CCOP(=O)(OC)[O-]", "CCOP(=O)(OC)O", "C#C", "CF", "CI", "Clc1ccc(cc1)Cl", "Clc1cccc(c1)Cl", "Clc1ccccc1Cl", "Clc1ccccc1", "Clc1cc(Cl)c(c(c1Cl)Cl)Cl", \
-                  "Clc1cc(Cl)cc(c1)Cl", "Clc1c(Cl)c(Cl)c(c(c1Cl)Cl)Cl", "ClC(C(Cl)(Cl)Cl)(Cl)Cl", "ClC(C(Cl)(Cl)Cl)Cl", "ClCC(Cl)(Cl)Cl", "ClCC(Cl)Cl", "ClCCCl", "ClC(Cl)Cl", \
-                  "ClCCl", "CNCCCOC=O", "CNCCCOC", "CNCCC(=O)NC", "CNCCC(=O)N", "CNCCC(=O)O", "CNCCCO", "CNCCCSC", "CNCCCS", "CNCCNC=O", "CNCCOC=O", "CNCCOC", "CNCC(=O)NC", \
-                  "CNCC(=O)N", "CNCC(=O)O", "CNCCO", "CNCCSC", "CNCCS", "CNC(=O)CCN", "CNC(=O)CC(=O)O", "CNC(=O)CCO", "CNC(=O)CCS", "CNC(=O)CN", "CNC(=O)COC=O", "CNC(=O)CO", \
-                  "CNCOC=O", "CNCOC", "CNC(=O)CS", "CNCSC", "C#N", "COCCCN", "COCCCOC=O", "COCCC(=O)NC", "COCCC(=O)N", "COCCC(=O)O", "COCCCO", "COCCCSC", "COCCCS", "COCCNC=O", \
-                  "COCCN", "COCCOC=O", "COCC(=O)NC", "COCC(=O)N", "COCC(=O)O", "COCCO", "COCCSC", "COCCS", "COCNC=O", "COCN", "COCOC=O", "COCO", "COCSC", "COCS", "COP(=O)(OC)OC", \
-                  "COP(=O)(OC)[O-]", "COP(=O)(OC)O", "COP(=O)(O)O", "COP(=O)(OP(=O)(O)O)[O-]", "CSCCCNC=O", "CSCCCN", "CSCCCOC=O", "CSCCC(=O)N", "CSCCC(=O)O", "CSCCCO", "CSCCN", "CSCCOC=O", \
-                  "CSCC(=O)NC", "CSCC(=O)N", "CSCC(=O)O", "CSCCO", "CSCNC=O", "CSCN", "CSCOC=O", "CSCO", "Fc1ccc(cc1)F", "Fc1cccc(c1)F", "Fc1ccccc1F", "Fc1ccccc1", "Fc1cc(F)c(c(c1F)F)F", \
-                  "Fc1cc(F)cc(c1)F", "Fc1c(F)c(F)c(c(c1F)F)F", "FC(C(F)(F)F)(F)F", "FC(C(F)(F)F)F", "FCC(F)(F)F", "FCC(F)F", "FCCF", "FC(F)F", "FCF", "ICCI", "ICI", "NCCCOC=O", "NCCC(=O)N", \
-                  "NCCC(=O)O", "NCCCO", "NCCCS", "NCCNC=O", "NCCOC=O", "NCC(=O)N", "NCC(=O)O", "NCCO", "NCCS", "NC(=O)CC(=O)O","NC(=O)CCO", "NC(=O)CCS", "NC(=O)CO", "NCOC=O", "NC(=O)CS", \
-                  "OCCCNC=O", "OCCCOC=O", "OCCC(=O)O", "OCCCS", "OCCNC=O", "OCCOC=O", "OCC(=O)O", "OCCS", "O=CNCCC(=O)O", "O=CNCCOC=O", "O=CNCC(=O)O", "O=CNCOC=O", "O=COCCC(=O)NC", \
-                  "O=COCCC(=O)N", "O=COCCC(=O)O", "O=COCC(=O)N", "O=COCC(=O)O", "OC(=O)CCS", "OCOC=O", "OC(=O)CS", "OP(=O)(O)O", "[O-]P(=O)(OP(=O)(OC)O)O", "SCCCOC=O", "SCCNC=O", "SCCOC=O", \
-                  "SCOC=O", "[H][H]"]),
+    "monoatomics": set(
+        [
+            "[Ar]",
+            "[Br-]",
+            "[Ca+2]",
+            "[Cl-]",
+            "[F-]",
+            "[He]",
+            "[I-]",
+            "[K+]",
+            "[Kr]",
+            "[Li+]",
+            "[Mg+2]",
+            "[Na+]",
+            "[Ne]",
+            "[Xe]",
+        ]
+    ),
+    "other": set(
+        [
+            "Brc1ccc(cc1)Br",
+            "Brc1ccccc1",
+            "BrC(Br)Br",
+            "BrCBr",
+            "BrCCBr",
+            "CBr",
+            "CC(Br)Br",
+            "CCBr",
+            "CCCC#CC",
+            "CCCC(Cl)(Cl)Cl",
+            "CCCC(Cl)Cl",
+            "CCCCCl",
+            "CCC#CC",
+            "CCCC#C",
+            "CCCC(F)(F)F",
+            "CCCC(F)F",
+            "CCCCF",
+            "CCC(Cl)(Cl)Cl",
+            "CCC(Cl)Cl",
+            "CCCCl",
+            "CCCC#N",
+            "CC#CC",
+            "CCC#C",
+            "CCC(F)(F)F",
+            "CCC(F)F",
+            "CCCF",
+            "CC(Cl)(Cl)Cl",
+            "CC(Cl)Cl",
+            "CCCl",
+            "CCC#N",
+            "CC#C",
+            "CC(F)(F)F",
+            "CC(F)F",
+            "CCF",
+            "CC(I)I",
+            "CCI",
+            "CCl",
+            "CC#N",
+            "CCOP(=O)(OC)OC",
+            "CCOP(=O)(OC)[O-]",
+            "CCOP(=O)(OC)O",
+            "C#C",
+            "CF",
+            "CI",
+            "Clc1ccc(cc1)Cl",
+            "Clc1cccc(c1)Cl",
+            "Clc1ccccc1Cl",
+            "Clc1ccccc1",
+            "Clc1cc(Cl)c(c(c1Cl)Cl)Cl",
+            "Clc1cc(Cl)cc(c1)Cl",
+            "Clc1c(Cl)c(Cl)c(c(c1Cl)Cl)Cl",
+            "ClC(C(Cl)(Cl)Cl)(Cl)Cl",
+            "ClC(C(Cl)(Cl)Cl)Cl",
+            "ClCC(Cl)(Cl)Cl",
+            "ClCC(Cl)Cl",
+            "ClCCCl",
+            "ClC(Cl)Cl",
+            "ClCCl",
+            "CNCCCOC=O",
+            "CNCCCOC",
+            "CNCCC(=O)NC",
+            "CNCCC(=O)N",
+            "CNCCC(=O)O",
+            "CNCCCO",
+            "CNCCCSC",
+            "CNCCCS",
+            "CNCCNC=O",
+            "CNCCOC=O",
+            "CNCCOC",
+            "CNCC(=O)NC",
+            "CNCC(=O)N",
+            "CNCC(=O)O",
+            "CNCCO",
+            "CNCCSC",
+            "CNCCS",
+            "CNC(=O)CCN",
+            "CNC(=O)CC(=O)O",
+            "CNC(=O)CCO",
+            "CNC(=O)CCS",
+            "CNC(=O)CN",
+            "CNC(=O)COC=O",
+            "CNC(=O)CO",
+            "CNCOC=O",
+            "CNCOC",
+            "CNC(=O)CS",
+            "CNCSC",
+            "C#N",
+            "COCCCN",
+            "COCCCOC=O",
+            "COCCC(=O)NC",
+            "COCCC(=O)N",
+            "COCCC(=O)O",
+            "COCCCO",
+            "COCCCSC",
+            "COCCCS",
+            "COCCNC=O",
+            "COCCN",
+            "COCCOC=O",
+            "COCC(=O)NC",
+            "COCC(=O)N",
+            "COCC(=O)O",
+            "COCCO",
+            "COCCSC",
+            "COCCS",
+            "COCNC=O",
+            "COCN",
+            "COCOC=O",
+            "COCO",
+            "COCSC",
+            "COCS",
+            "COP(=O)(OC)OC",
+            "COP(=O)(OC)[O-]",
+            "COP(=O)(OC)O",
+            "COP(=O)(O)O",
+            "COP(=O)(OP(=O)(O)O)[O-]",
+            "CSCCCNC=O",
+            "CSCCCN",
+            "CSCCCOC=O",
+            "CSCCC(=O)N",
+            "CSCCC(=O)O",
+            "CSCCCO",
+            "CSCCN",
+            "CSCCOC=O",
+            "CSCC(=O)NC",
+            "CSCC(=O)N",
+            "CSCC(=O)O",
+            "CSCCO",
+            "CSCNC=O",
+            "CSCN",
+            "CSCOC=O",
+            "CSCO",
+            "Fc1ccc(cc1)F",
+            "Fc1cccc(c1)F",
+            "Fc1ccccc1F",
+            "Fc1ccccc1",
+            "Fc1cc(F)c(c(c1F)F)F",
+            "Fc1cc(F)cc(c1)F",
+            "Fc1c(F)c(F)c(c(c1F)F)F",
+            "FC(C(F)(F)F)(F)F",
+            "FC(C(F)(F)F)F",
+            "FCC(F)(F)F",
+            "FCC(F)F",
+            "FCCF",
+            "FC(F)F",
+            "FCF",
+            "ICCI",
+            "ICI",
+            "NCCCOC=O",
+            "NCCC(=O)N",
+            "NCCC(=O)O",
+            "NCCCO",
+            "NCCCS",
+            "NCCNC=O",
+            "NCCOC=O",
+            "NCC(=O)N",
+            "NCC(=O)O",
+            "NCCO",
+            "NCCS",
+            "NC(=O)CC(=O)O",
+            "NC(=O)CCO",
+            "NC(=O)CCS",
+            "NC(=O)CO",
+            "NCOC=O",
+            "NC(=O)CS",
+            "OCCCNC=O",
+            "OCCCOC=O",
+            "OCCC(=O)O",
+            "OCCCS",
+            "OCCNC=O",
+            "OCCOC=O",
+            "OCC(=O)O",
+            "OCCS",
+            "O=CNCCC(=O)O",
+            "O=CNCCOC=O",
+            "O=CNCC(=O)O",
+            "O=CNCOC=O",
+            "O=COCCC(=O)NC",
+            "O=COCCC(=O)N",
+            "O=COCCC(=O)O",
+            "O=COCC(=O)N",
+            "O=COCC(=O)O",
+            "OC(=O)CCS",
+            "OCOC=O",
+            "OC(=O)CS",
+            "OP(=O)(O)O",
+            "[O-]P(=O)(OP(=O)(OC)O)O",
+            "SCCCOC=O",
+            "SCCNC=O",
+            "SCCOC=O",
+            "SCOC=O",
+            "[H][H]",
+        ]
+    ),
     "phenol": set(["Cc1ccc(cc1)O", "CCc1ccc(cc1)O", "Oc1ccccc1"]),
     "pyridine": set(["c1cccnc1", "c1ccncn1", "n1ccncc1"]),
-    "pyrrole": set(["c1ccc2c(c1)[nH]cc2", "c1ccc[nH]1", "c1ncc[nH]1", "Cc1cnc[nH]1", "Cc1c[nH]c2c1cccc2", "Cc1c[nH]cn1", "CCc1cnc[nH]1", "CCc1c[nH]c2c1cccc2", "CCc1c[nH]cn1"]),
-    "sulfides": set(["C1CCCS1", "C1CCCSC1", "C1CCSCS1", "C1CCSSC1", "C1CSSC1", "C1SCCS1", "CCCSCSC", "CCCSC", "CCCSSC", "CCSCC", "CCSSCC", "CCSSC", "CSCCCSC", "CSCCSC", "CSCC", \
-                     "CSCSCC", "CSCSC", "CSC", "CSSC", "S1CCSCC1", "S1CSCSC1"]),
+    "pyrrole": set(
+        [
+            "c1ccc2c(c1)[nH]cc2",
+            "c1ccc[nH]1",
+            "c1ncc[nH]1",
+            "Cc1cnc[nH]1",
+            "Cc1c[nH]c2c1cccc2",
+            "Cc1c[nH]cn1",
+            "CCc1cnc[nH]1",
+            "CCc1c[nH]c2c1cccc2",
+            "CCc1c[nH]cn1",
+        ]
+    ),
+    "sulfides": set(
+        [
+            "C1CCCS1",
+            "C1CCCSC1",
+            "C1CCSCS1",
+            "C1CCSSC1",
+            "C1CSSC1",
+            "C1SCCS1",
+            "CCCSCSC",
+            "CCCSC",
+            "CCCSSC",
+            "CCSCC",
+            "CCSSCC",
+            "CCSSC",
+            "CSCCCSC",
+            "CSCCSC",
+            "CSCC",
+            "CSCSCC",
+            "CSCSC",
+            "CSC",
+            "CSSC",
+            "S1CCSCC1",
+            "S1CSCSC1",
+        ]
+    ),
     "thiols": set(["CCCSS", "CCCS", "CCSS", "CCS", "CSCCCS", "CSCCS", "CSCS", "CSS", "CS", "SCCCS", "SCCS", "SS", "S"]),
     "water": set(["O"]),
     "flourane": set(["F"]),
     "hydrogen chloride": set(["Cl"]),
 }
 
+
 def z_to_formula(z):
     u, c = np.unique(z, return_counts=True)
     idxs = np.argsort(u)
diff --git a/tests/test_import.py b/tests/test_import.py
index 0d3344a..8c272b1 100644
--- a/tests/test_import.py
+++ b/tests/test_import.py
@@ -1,2 +1,2 @@
 def test_open_qdc():
-    import openqdc # noqa
+    import openqdc  # noqa

From fe540444dd97ae2cf07ce1a0b815cdb1da71c51c Mon Sep 17 00:00:00 2001
From: mcneela <daniel.mcneela@valencelabs.com>
Date: Fri, 8 Mar 2024 10:45:34 -0500
Subject: [PATCH 34/47] ran precommit

---
 README.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 405bd7b..c0db26d 100644
--- a/README.md
+++ b/README.md
@@ -85,8 +85,8 @@ We provide support for the following publicly available QM Potential Energy Data
 
 We also provide support for the following publicly available QM Noncovalent Interaction Energy Datasets.
 
-| Dataset | 
-| --- | 
+| Dataset |
+| --- |
 | [DES370K](https://www.nature.com/articles/s41597-021-00833-x) |
 | [DES5M](https://www.nature.com/articles/s41597-021-00833-x)   |
-| [Metcalf](https://pubs.aip.org/aip/jcp/article/152/7/074103/1059677/Approaches-for-machine-learning-intermolecular) |
\ No newline at end of file
+| [Metcalf](https://pubs.aip.org/aip/jcp/article/152/7/074103/1059677/Approaches-for-machine-learning-intermolecular) |

From ef2528ca3c57f33874277dbe076ffb7d1e21e3dd Mon Sep 17 00:00:00 2001
From: mcneela <daniel.mcneela@valencelabs.com>
Date: Fri, 8 Mar 2024 10:48:47 -0500
Subject: [PATCH 35/47] removed DES from datasets/__init__.py

---
 openqdc/datasets/__init__.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/openqdc/datasets/__init__.py b/openqdc/datasets/__init__.py
index 9291d77..c86b8b3 100644
--- a/openqdc/datasets/__init__.py
+++ b/openqdc/datasets/__init__.py
@@ -24,7 +24,6 @@
     "ani1ccx": ANI1CCX,
     "ani1x": ANI1X,
     "comp6": COMP6,
-    "des": DES,
     "gdml": GDML,
     "geom": GEOM,
     "iso17": ISO17,

From c0ef5b168e96ef10b0e3f4e8c5bbac85842ae22e Mon Sep 17 00:00:00 2001
From: mcneela <daniel.mcneela@valencelabs.com>
Date: Fri, 8 Mar 2024 10:50:39 -0500
Subject: [PATCH 36/47] removed DES from datasets/__init__.py

---
 openqdc/datasets/__init__.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/openqdc/datasets/__init__.py b/openqdc/datasets/__init__.py
index c86b8b3..b8ac166 100644
--- a/openqdc/datasets/__init__.py
+++ b/openqdc/datasets/__init__.py
@@ -1,5 +1,4 @@
 from .base import BaseDataset  # noqa
-from .interaction import DES  # noqa
 from .potential.ani import ANI1, ANI1CCX, ANI1X  # noqa
 from .potential.comp6 import COMP6  # noqa
 from .potential.dummy import Dummy  # noqa

From ad552962c7bc84132e72b280bf1d18f45eac724a Mon Sep 17 00:00:00 2001
From: mcneela <daniel.mcneela@valencelabs.com>
Date: Fri, 8 Mar 2024 11:04:13 -0500
Subject: [PATCH 37/47] fix X40 energy methods

---
 openqdc/datasets/interaction/X40.py     | 2 +-
 openqdc/datasets/interaction/des370k.py | 1 -
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/openqdc/datasets/interaction/X40.py b/openqdc/datasets/interaction/X40.py
index 826e455..09d65aa 100644
--- a/openqdc/datasets/interaction/X40.py
+++ b/openqdc/datasets/interaction/X40.py
@@ -15,7 +15,7 @@ class X40(BaseInteractionDataset):
     __distance_unit__ = "ang"
     __forces_unit__ = "hartree/ang"
     __energy_methods__ = [
-        "default",
+        "CCSD(T)/CBS",
         "MP2/CBS",
         "dCCSD(T)/haDZ",
         "dCCSD(T)/haTZ",
diff --git a/openqdc/datasets/interaction/des370k.py b/openqdc/datasets/interaction/des370k.py
index 4b866e7..9cbf736 100644
--- a/openqdc/datasets/interaction/des370k.py
+++ b/openqdc/datasets/interaction/des370k.py
@@ -79,7 +79,6 @@ def read_raw_entries(self) -> List[Dict]:
             name = np.array([smiles0 + "." + smiles1])
 
             subsets = []
-            # for smiles in [canon_smiles0, canon_smiles1]:
             for smiles in [smiles0, smiles1]:
                 found = False
                 for functional_group, smiles_set in molecule_groups.items():

From 0a51e7c2fe43341b85f12187158bbbe6ae7d084e Mon Sep 17 00:00:00 2001
From: mcneela <daniel.mcneela@valencelabs.com>
Date: Fri, 8 Mar 2024 11:24:24 -0500
Subject: [PATCH 38/47] added interaction dataset docstrings

---
 openqdc/datasets/interaction/L7.py       | 12 ++++++++++++
 openqdc/datasets/interaction/X40.py      | 13 ++++++++++++
 openqdc/datasets/interaction/des370k.py  | 10 ++++++++++
 openqdc/datasets/interaction/des5m.py    | 10 ++++++++++
 openqdc/datasets/interaction/dess66.py   | 15 ++++++++++++++
 openqdc/datasets/interaction/dess66x8.py | 16 +++++++++++++++
 openqdc/datasets/interaction/metcalf.py  | 25 ++++++++++++++++--------
 openqdc/datasets/interaction/splinter.py | 10 ++++++++++
 8 files changed, 103 insertions(+), 8 deletions(-)

diff --git a/openqdc/datasets/interaction/L7.py b/openqdc/datasets/interaction/L7.py
index db6fa3c..8c76c03 100644
--- a/openqdc/datasets/interaction/L7.py
+++ b/openqdc/datasets/interaction/L7.py
@@ -10,6 +10,18 @@
 
 
 class L7(BaseInteractionDataset):
+    """
+    The L7 interaction energy dataset as described in:
+
+    Accuracy of Quantum Chemical Methods for Large Noncovalent Complexes
+    Robert Sedlak, Tomasz Janowski, Michal Pitoňák, Jan Řezáč, Peter Pulay, and Pavel Hobza
+    Journal of Chemical Theory and Computation 2013 9 (8), 3364-3374
+    DOI: 10.1021/ct400036b
+
+    Data was downloaded and extracted from:
+    http://cuby4.molecular.cz/dataset_l7.html
+    """
+
     __name__ = "L7"
     __energy_unit__ = "hartree"
     __distance_unit__ = "ang"
diff --git a/openqdc/datasets/interaction/X40.py b/openqdc/datasets/interaction/X40.py
index 09d65aa..f38f22a 100644
--- a/openqdc/datasets/interaction/X40.py
+++ b/openqdc/datasets/interaction/X40.py
@@ -10,6 +10,19 @@
 
 
 class X40(BaseInteractionDataset):
+    """
+    X40 interaction dataset of 40 dimer pairs as
+    introduced in the following paper:
+
+    Benchmark Calculations of Noncovalent Interactions of Halogenated Molecules
+    Jan Řezáč, Kevin E. Riley, and Pavel Hobza
+    Journal of Chemical Theory and Computation 2012 8 (11), 4285-4292
+    DOI: 10.1021/ct300647k
+
+    Dataset retrieved and processed from:
+    http://cuby4.molecular.cz/dataset_x40.html
+    """
+
     __name__ = "X40"
     __energy_unit__ = "hartree"
     __distance_unit__ = "ang"
diff --git a/openqdc/datasets/interaction/des370k.py b/openqdc/datasets/interaction/des370k.py
index 9cbf736..b98b296 100644
--- a/openqdc/datasets/interaction/des370k.py
+++ b/openqdc/datasets/interaction/des370k.py
@@ -11,6 +11,16 @@
 
 
 class DES370K(BaseInteractionDataset):
+    """
+    DE Shaw Research interaction energy calculations for over 370K
+    small molecule dimers as described in the paper:
+
+    Quantum chemical benchmark databases of gold-standard dimer interaction energies.
+    Donchev, A.G., Taube, A.G., Decolvenaere, E. et al.
+    Sci Data 8, 55 (2021).
+    https://doi.org/10.1038/s41597-021-00833-x
+    """
+
     __name__ = "des370k_interaction"
     __energy_unit__ = "hartree"
     __distance_unit__ = "ang"
diff --git a/openqdc/datasets/interaction/des5m.py b/openqdc/datasets/interaction/des5m.py
index 256df0b..40e2ca1 100644
--- a/openqdc/datasets/interaction/des5m.py
+++ b/openqdc/datasets/interaction/des5m.py
@@ -11,6 +11,16 @@
 
 
 class DES5M(DES370K):
+    """
+    DE Shaw Research interaction energy calculations for
+    over 5M small molecule dimers as described in the paper:
+
+    Quantum chemical benchmark databases of gold-standard dimer interaction energies.
+    Donchev, A.G., Taube, A.G., Decolvenaere, E. et al.
+    Sci Data 8, 55 (2021).
+    https://doi.org/10.1038/s41597-021-00833-x
+    """
+
     __name__ = "des5m_interaction"
     __energy_methods__ = [
         "mp2/cc-pvqz",
diff --git a/openqdc/datasets/interaction/dess66.py b/openqdc/datasets/interaction/dess66.py
index a0d361b..7b53057 100644
--- a/openqdc/datasets/interaction/dess66.py
+++ b/openqdc/datasets/interaction/dess66.py
@@ -11,6 +11,21 @@
 
 
 class DESS66(BaseInteractionDataset):
+    """
+    DE Shaw Research interaction energy
+    estimates of all 66 conformers from
+    the original S66 dataset as described
+    in the paper:
+
+    Quantum chemical benchmark databases of gold-standard dimer interaction energies.
+    Donchev, A.G., Taube, A.G., Decolvenaere, E. et al.
+    Sci Data 8, 55 (2021).
+    https://doi.org/10.1038/s41597-021-00833-x
+
+    Data was downloaded from Zenodo:
+    https://zenodo.org/records/5676284
+    """
+
     __name__ = "des_s66"
     __energy_unit__ = "hartree"
     __distance_unit__ = "ang"
diff --git a/openqdc/datasets/interaction/dess66x8.py b/openqdc/datasets/interaction/dess66x8.py
index 575d601..1291c7a 100644
--- a/openqdc/datasets/interaction/dess66x8.py
+++ b/openqdc/datasets/interaction/dess66x8.py
@@ -11,6 +11,22 @@
 
 
 class DESS66x8(BaseInteractionDataset):
+    """
+    DE Shaw Research interaction energy
+    estimates of all 528 conformers from
+    the original S66x8 dataset as described
+    in the paper:
+
+    Quantum chemical benchmark databases of gold-standard dimer interaction energies.
+    Donchev, A.G., Taube, A.G., Decolvenaere, E. et al.
+    Sci Data 8, 55 (2021).
+    https://doi.org/10.1038/s41597-021-00833-x
+
+    Data was downloaded from Zenodo:
+
+    https://zenodo.org/records/5676284
+    """
+
     __name__ = "des_s66x8"
     __energy_unit__ = "hartree"
     __distance_unit__ = "ang"
diff --git a/openqdc/datasets/interaction/metcalf.py b/openqdc/datasets/interaction/metcalf.py
index 3553d90..caddc8f 100644
--- a/openqdc/datasets/interaction/metcalf.py
+++ b/openqdc/datasets/interaction/metcalf.py
@@ -8,6 +8,23 @@
 
 
 class Metcalf(BaseInteractionDataset):
+    """
+    Hydrogen-bonded dimers of NMA with 126 molecules as described in:
+
+    Approaches for machine learning intermolecular interaction energies and
+    application to energy components from symmetry adapted perturbation theory.
+    Derek P. Metcalf, Alexios Koutsoukas, Steven A. Spronk, Brian L. Claus,
+    Deborah A. Loughney, Stephen R. Johnson, Daniel L. Cheney, C. David Sherrill;
+    J. Chem. Phys. 21 February 2020; 152 (7): 074103.
+    https://doi.org/10.1063/1.5142636
+
+    Further details:
+    "Hydrogen-bonded dimers involving N-methylacetamide (NMA) and 126 molecules
+    (46 donors and 80 acceptors; Figs. 2 and 3) were used. Optimized geometries
+    for the 126 individual monomers were obtained and paired with NMA in broad
+    arrays of spatial configurations to generate thousands of complexes for training."
+    """
+
     __name__ = "metcalf"
     __energy_unit__ = "hartree"
     __distance_unit__ = "ang"
@@ -22,14 +39,6 @@ class Metcalf(BaseInteractionDataset):
     ]
 
     def read_raw_entries(self) -> List[Dict]:
-        """
-        SMILES strings are inferred from the
-        .xyz files using the RDKit xyz2mol function.
-        More details here:
-
-        https://github.com/jensengroup/xyz2mol
-
-        """
         data = []
         for dirname in os.listdir(self.root):
             xyz_dir = os.path.join(self.root, dirname)
diff --git a/openqdc/datasets/interaction/splinter.py b/openqdc/datasets/interaction/splinter.py
index 2841cf7..25611c2 100644
--- a/openqdc/datasets/interaction/splinter.py
+++ b/openqdc/datasets/interaction/splinter.py
@@ -10,6 +10,16 @@
 
 
 class Splinter(BaseInteractionDataset):
+    """
+    A dataset of over 1.7 million protein-ligand
+    interactions as described in the paper:
+
+    A quantum chemical interaction energy dataset for accurately modeling protein-ligand interactions.
+    Spronk, S.A., Glick, Z.L., Metcalf, D.P. et al.
+    Sci Data 10, 619 (2023).
+    https://doi.org/10.1038/s41597-023-02443-1
+    """
+
     __name__ = "splinter"
     __energy_methods__ = [
         "sapt0/jun-cc-pV(D+d)Z_unscaled",

From b6c3a6a581312f3cd5e08e474eec3e3af7e366ce Mon Sep 17 00:00:00 2001
From: mcneela <daniel.mcneela@valencelabs.com>
Date: Fri, 8 Mar 2024 11:29:55 -0500
Subject: [PATCH 39/47] update readme with all interaction datasets

---
 README.md | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index c0db26d..267a83a 100644
--- a/README.md
+++ b/README.md
@@ -78,11 +78,6 @@ We provide support for the following publicly available QM Potential Energy Data
 
 # Interaction energy
 
-| Dataset | # Molecules | # Conformers | Average Conformers per Molecule | Force Labels | Atom Types | QM Level of Theory | Off-Equilibrium Conformations|
-| --- | --- | --- | --- | --- | --- | --- | --- |
-| [DES370K](https://www.nature.com/articles/s41597-021-00833-x) | 3,700 | 370,000 | 100 | No | 20 | CCSD(T) | Yes |
-| [DES5M](https://www.nature.com/articles/s41597-021-00833-x) | 3,700 | 5,000,000 | 1351 | No | 20 | SNS-MP2 | Yes |
-
 We also provide support for the following publicly available QM Noncovalent Interaction Energy Datasets.
 
 | Dataset |
@@ -90,3 +85,8 @@ We also provide support for the following publicly available QM Noncovalent Inte
 | [DES370K](https://www.nature.com/articles/s41597-021-00833-x) |
 | [DES5M](https://www.nature.com/articles/s41597-021-00833-x)   |
 | [Metcalf](https://pubs.aip.org/aip/jcp/article/152/7/074103/1059677/Approaches-for-machine-learning-intermolecular) |
+| [DESS66](https://www.nature.com/articles/s41597-021-00833-x) |
+| [DESS66x8](https://www.nature.com/articles/s41597-021-00833-x) |
+| [Splinter](https://www.nature.com/articles/s41597-023-02443-1) |
+| [X40](https://pubs.acs.org/doi/10.1021/ct300647k) |
+| [L7](https://pubs.acs.org/doi/10.1021/ct400036b)  |

From 07f70b8c51cbf20a813bd9c85b5e0e563524c2ff Mon Sep 17 00:00:00 2001
From: mcneela <daniel.mcneela@valencelabs.com>
Date: Fri, 8 Mar 2024 11:34:27 -0500
Subject: [PATCH 40/47] update metcalf __energy_methods__

---
 openqdc/datasets/interaction/metcalf.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/openqdc/datasets/interaction/metcalf.py b/openqdc/datasets/interaction/metcalf.py
index caddc8f..5e1cd73 100644
--- a/openqdc/datasets/interaction/metcalf.py
+++ b/openqdc/datasets/interaction/metcalf.py
@@ -29,7 +29,13 @@ class Metcalf(BaseInteractionDataset):
     __energy_unit__ = "hartree"
     __distance_unit__ = "ang"
     __forces_unit__ = None
-    __energy_methods__ = ["SAPT0/jun-cc-pVDZ"]
+    __energy_methods__ = [
+        "SAPT0/jun-cc-pVDZ",
+        "SAPT0/jun-cc-pVDZ_es",
+        "SAPT0/jun-cc-pVDZ_ex",
+        "SAPT0/jun-cc-pVDZ_ind",
+        "SAPT0/jun-cc-pVDZ_disp",
+    ]
     energy_target_names = [
         "total energy",
         "electrostatic energy",

From 1443450de7f036238b1793c85ec4a1c4b03f4838 Mon Sep 17 00:00:00 2001
From: mcneela <daniel.mcneela@valencelabs.com>
Date: Fri, 8 Mar 2024 11:44:39 -0500
Subject: [PATCH 41/47] refactored des370k and des5m

---
 openqdc/datasets/interaction/des370k.py | 17 +++++---
 openqdc/datasets/interaction/des5m.py   | 54 ++-----------------------
 2 files changed, 16 insertions(+), 55 deletions(-)

diff --git a/openqdc/datasets/interaction/des370k.py b/openqdc/datasets/interaction/des370k.py
index b98b296..e97710c 100644
--- a/openqdc/datasets/interaction/des370k.py
+++ b/openqdc/datasets/interaction/des370k.py
@@ -65,10 +65,14 @@ class DES370K(BaseInteractionDataset):
         "sapt_delta_HF",
     ]
 
-    def read_raw_entries(self) -> List[Dict]:
-        self.filepath = os.path.join(self.root, "DES370K.csv")
-        logger.info(f"Reading DES370K interaction data from {self.filepath}")
-        df = pd.read_csv(self.filepath)
+    _filename = "DES370K.csv"
+    _short_name = "DES370K"
+
+    @classmethod
+    def _read_raw_entries(cls) -> List[Dict]:
+        filepath = os.path.join(cls.root, cls._filename)
+        logger.info(f"Reading {cls._short_name} interaction data from {filepath}")
+        df = pd.read_csv(filepath)
         data = []
         for idx, row in tqdm(df.iterrows(), total=df.shape[0]):
             smiles0, smiles1 = row["smiles0"], row["smiles1"]
@@ -84,7 +88,7 @@ def read_raw_entries(self) -> List[Dict]:
 
             atomic_inputs = np.concatenate((atomic_nums, charges, pos), axis=-1, dtype=np.float32)
 
-            energies = np.array(row[self.energy_target_names].values).astype(np.float32)[None, :]
+            energies = np.array(row[cls.energy_target_names].values).astype(np.float32)[None, :]
 
             name = np.array([smiles0 + "." + smiles1])
 
@@ -108,3 +112,6 @@ def read_raw_entries(self) -> List[Dict]:
             )
             data.append(item)
         return data
+
+    def read_raw_entries(self) -> List[Dict]:
+        return DES370K._read_raw_entries()
diff --git a/openqdc/datasets/interaction/des5m.py b/openqdc/datasets/interaction/des5m.py
index 40e2ca1..ea0d929 100644
--- a/openqdc/datasets/interaction/des5m.py
+++ b/openqdc/datasets/interaction/des5m.py
@@ -1,13 +1,6 @@
-import os
 from typing import Dict, List
 
-import numpy as np
-import pandas as pd
-from loguru import logger
-from tqdm import tqdm
-
 from openqdc.datasets.interaction import DES370K
-from openqdc.utils.molecule import atom_table, molecule_groups
 
 
 class DES5M(DES370K):
@@ -56,47 +49,8 @@ class DES5M(DES370K):
         "sapt_delta_HF",
     ]
 
-    def read_raw_entries(self) -> List[Dict]:
-        self.filepath = os.path.join(self.root, "DES5M.csv")
-        logger.info(f"Reading DES5M interaction data from {self.filepath}")
-        df = pd.read_csv(self.filepath)
-        data = []
-        for idx, row in tqdm(df.iterrows(), total=df.shape[0]):
-            smiles0, smiles1 = row["smiles0"], row["smiles1"]
-            charge0, charge1 = row["charge0"], row["charge1"]
-            natoms0, natoms1 = row["natoms0"], row["natoms1"]
-            pos = np.array(list(map(float, row["xyz"].split()))).reshape(-1, 3)
-
-            elements = row["elements"].split()
-
-            atomic_nums = np.expand_dims(np.array([atom_table.GetAtomicNumber(x) for x in elements]), axis=1)
-
-            charges = np.expand_dims(np.array([charge0] * natoms0 + [charge1] * natoms1), axis=1)
+    _filename = "DES5M.csv"
+    _short_name = "DES5M"
 
-            atomic_inputs = np.concatenate((atomic_nums, charges, pos), axis=-1, dtype=np.float32)
-
-            energies = np.array(row[self.energy_target_names].values).astype(np.float32)[None, :]
-
-            name = np.array([smiles0 + "." + smiles1])
-
-            subsets = []
-            # for smiles in [canon_smiles0, canon_smiles1]:
-            for smiles in [smiles0, smiles1]:
-                found = False
-                for functional_group, smiles_set in molecule_groups.items():
-                    if smiles in smiles_set:
-                        subsets.append(functional_group)
-                        found = True
-                if not found:
-                    logger.info(f"molecule group lookup failed for {smiles}")
-
-            item = dict(
-                energies=energies,
-                subset=np.array([subsets]),
-                n_atoms=np.array([natoms0 + natoms1], dtype=np.int32),
-                n_atoms_first=np.array([natoms0], dtype=np.int32),
-                atomic_inputs=atomic_inputs,
-                name=name,
-            )
-            data.append(item)
-        return data
+    def read_raw_entries(self) -> List[Dict]:
+        return DES5M._read_raw_entries()

From 802b70b5d7028e4d2e7bfa92867ec1429f535b92 Mon Sep 17 00:00:00 2001
From: mcneela <daniel.mcneela@valencelabs.com>
Date: Fri, 8 Mar 2024 12:23:29 -0500
Subject: [PATCH 42/47] update base interaction dataset to add n_atoms_first
 property

---
 openqdc/datasets/interaction/base.py    | 43 +++++++++++++++++++++++++
 openqdc/datasets/interaction/des370k.py | 11 +++++--
 openqdc/datasets/interaction/des5m.py   |  2 +-
 openqdc/datasets/interaction/metcalf.py |  1 +
 4 files changed, 53 insertions(+), 4 deletions(-)

diff --git a/openqdc/datasets/interaction/base.py b/openqdc/datasets/interaction/base.py
index 71c8e84..27c2f88 100644
--- a/openqdc/datasets/interaction/base.py
+++ b/openqdc/datasets/interaction/base.py
@@ -1,8 +1,10 @@
 from typing import Dict, List, Optional
 
 import numpy as np
+from sklearn.utils import Bunch
 
 from openqdc.datasets.base import BaseDataset
+from openqdc.utils.atomization_energies import IsolatedAtomEnergyFactory
 from openqdc.utils.constants import NB_ATOMIC_FEATURES
 
 
@@ -45,4 +47,45 @@ def data_shapes(self):
             "position_idx_range": (-1, 2),
             "energies": (-1, len(self.__energy_methods__)),
             "forces": (-1, 3, len(self.force_target_names)),
+            "n_atoms_first": (-1,),
         }
+
+    @property
+    def data_types(self):
+        return {
+            "atomic_inputs": np.float32,
+            "position_idx_range": np.int32,
+            "energies": np.float32,
+            "forces": np.float32,
+            "n_atoms_first": np.int32,
+        }
+
+    def __getitem__(self, idx: int):
+        shift = IsolatedAtomEnergyFactory.max_charge
+        p_start, p_end = self.data["position_idx_range"][idx]
+        input = self.data["atomic_inputs"][p_start:p_end]
+        z, c, positions, energies = (
+            np.array(input[:, 0], dtype=np.int32),
+            np.array(input[:, 1], dtype=np.int32),
+            np.array(input[:, -3:], dtype=np.float32),
+            np.array(self.data["energies"][idx], dtype=np.float32),
+        )
+        name = self.__smiles_converter__(self.data["name"][idx])
+        subset = self.data["subset"][idx]
+        n_atoms_first = self.data["n_atoms_first"][idx]
+
+        if "forces" in self.data:
+            forces = np.array(self.data["forces"][p_start:p_end], dtype=np.float32)
+        else:
+            forces = None
+        return Bunch(
+            positions=positions,
+            atomic_numbers=z,
+            charges=c,
+            e0=self.__isolated_atom_energies__[..., z, c + shift].T,
+            energies=energies,
+            name=name,
+            subset=subset,
+            forces=forces,
+            n_atoms_first=n_atoms_first,
+        )
diff --git a/openqdc/datasets/interaction/des370k.py b/openqdc/datasets/interaction/des370k.py
index e97710c..382b84c 100644
--- a/openqdc/datasets/interaction/des370k.py
+++ b/openqdc/datasets/interaction/des370k.py
@@ -7,6 +7,7 @@
 from tqdm import tqdm
 
 from openqdc.datasets.interaction import BaseInteractionDataset
+from openqdc.utils.io import get_local_cache
 from openqdc.utils.molecule import atom_table, molecule_groups
 
 
@@ -66,12 +67,16 @@ class DES370K(BaseInteractionDataset):
     ]
 
     _filename = "DES370K.csv"
-    _short_name = "DES370K"
+    _name = "des370k_interaction"
+
+    @classmethod
+    def _root(cls):
+        return os.path.join(get_local_cache(), cls._name)
 
     @classmethod
     def _read_raw_entries(cls) -> List[Dict]:
-        filepath = os.path.join(cls.root, cls._filename)
-        logger.info(f"Reading {cls._short_name} interaction data from {filepath}")
+        filepath = os.path.join(cls._root(), cls._filename)
+        logger.info(f"Reading {cls._name} interaction data from {filepath}")
         df = pd.read_csv(filepath)
         data = []
         for idx, row in tqdm(df.iterrows(), total=df.shape[0]):
diff --git a/openqdc/datasets/interaction/des5m.py b/openqdc/datasets/interaction/des5m.py
index ea0d929..5b027f4 100644
--- a/openqdc/datasets/interaction/des5m.py
+++ b/openqdc/datasets/interaction/des5m.py
@@ -50,7 +50,7 @@ class DES5M(DES370K):
     ]
 
     _filename = "DES5M.csv"
-    _short_name = "DES5M"
+    _name = "des5m_interaction"
 
     def read_raw_entries(self) -> List[Dict]:
         return DES5M._read_raw_entries()
diff --git a/openqdc/datasets/interaction/metcalf.py b/openqdc/datasets/interaction/metcalf.py
index 5e1cd73..c9921da 100644
--- a/openqdc/datasets/interaction/metcalf.py
+++ b/openqdc/datasets/interaction/metcalf.py
@@ -74,6 +74,7 @@ def read_raw_entries(self) -> List[Dict]:
                     positions=xyz,
                     atomic_inputs=atomic_inputs,
                     name=np.array([""]),
+                    n_atoms_first=np.array([-1]),
                 )
                 data.append(item)
         return data

From e969b54b0dab138285edc1d68cc05a1fb0dc574d Mon Sep 17 00:00:00 2001
From: mcneela <daniel.mcneela@valencelabs.com>
Date: Tue, 12 Mar 2024 13:56:12 -0400
Subject: [PATCH 43/47] update L7 and X40 to use python base yaml package

---
 openqdc/datasets/interaction/L7.py  | 59 ++++++++++++++++++++++++-----
 openqdc/datasets/interaction/X40.py | 18 ++++-----
 2 files changed, 58 insertions(+), 19 deletions(-)

diff --git a/openqdc/datasets/interaction/L7.py b/openqdc/datasets/interaction/L7.py
index 8c76c03..fc2f899 100644
--- a/openqdc/datasets/interaction/L7.py
+++ b/openqdc/datasets/interaction/L7.py
@@ -2,13 +2,53 @@
 from typing import Dict, List
 
 import numpy as np
+import yaml
 from loguru import logger
-from ruamel.yaml import YAML
 
 from openqdc.datasets.interaction import BaseInteractionDataset
 from openqdc.utils.molecule import atom_table
 
 
+class DataItemYAMLObj:
+    def __init__(self, name, shortname, geometry, reference_value, setup, group, tags):
+        self.name = name
+        self.shortname = shortname
+        self.geometry = geometry
+        self.reference_value = reference_value
+        self.setup = setup
+        self.group = group
+        self.tags = tags
+
+
+class DataSetYAMLObj:
+    def __init__(self, name, references, text, method_energy, groups_by, groups, global_setup):
+        self.name = name
+        self.references = references
+        self.text = text
+        self.method_energy = method_energy
+        self.groups_by = groups_by
+        self.groups = groups
+        self.global_setup = global_setup
+
+
+def data_item_constructor(loader: yaml.SafeLoader, node: yaml.nodes.MappingNode):
+    """Construct a DataItemYAMLObj from a YAML mapping node."""
+    return DataItemYAMLObj(**loader.construct_mapping(node))
+
+
+def dataset_constructor(loader: yaml.SafeLoader, node: yaml.nodes.MappingNode):
+    """Construct a DataSetYAMLObj from a YAML mapping node."""
+    return DataSetYAMLObj(**loader.construct_mapping(node))
+
+
+def get_loader():
+    """Add constructors to PyYAML loader."""
+    loader = yaml.SafeLoader
+    loader.add_constructor("!ruby/object:ProtocolDataset::DataSetItem", data_item_constructor)
+    loader.add_constructor("!ruby/object:ProtocolDataset::DataSetDescription", dataset_constructor)
+    return loader
+
+
 class L7(BaseInteractionDataset):
     """
     The L7 interaction energy dataset as described in:
@@ -43,23 +83,22 @@ def read_raw_entries(self) -> List[Dict]:
         yaml_fpath = os.path.join(self.root, "l7.yaml")
         logger.info(f"Reading L7 interaction data from {self.root}")
         yaml_file = open(yaml_fpath, "r")
-        yaml = YAML()
         data = []
-        data_dict = yaml.load(yaml_file)
-        charge0 = int(data_dict["description"]["global_setup"]["molecule_a"]["charge"])
-        charge1 = int(data_dict["description"]["global_setup"]["molecule_b"]["charge"])
+        data_dict = yaml.load(yaml_file, Loader=get_loader())
+        charge0 = int(data_dict["description"].global_setup["molecule_a"]["charge"])
+        charge1 = int(data_dict["description"].global_setup["molecule_b"]["charge"])
 
         for idx, item in enumerate(data_dict["items"]):
             energies = []
-            name = np.array([item["shortname"]])
-            fname = item["geometry"].split(":")[1]
-            energies.append(item["reference_value"])
+            name = np.array([item.shortname])
+            fname = item.geometry.split(":")[1]
+            energies.append(item.reference_value)
             xyz_file = open(os.path.join(self.root, f"{fname}.xyz"), "r")
             lines = list(map(lambda x: x.strip().split(), xyz_file.readlines()))
             lines.pop(1)
             n_atoms = np.array([int(lines[0][0])], dtype=np.int32)
-            n_atoms_first = np.array([int(item["setup"]["molecule_a"]["selection"].split("-")[1])], dtype=np.int32)
-            subset = np.array([item["group"]])
+            n_atoms_first = np.array([int(item.setup["molecule_a"]["selection"].split("-")[1])], dtype=np.int32)
+            subset = np.array([item.group])
             energies += [float(val[idx]) for val in list(data_dict["alternative_reference"].values())]
             energies = np.array([energies], dtype=np.float32)
             pos = np.array(lines[1:])[:, 1:].astype(np.float32)
diff --git a/openqdc/datasets/interaction/X40.py b/openqdc/datasets/interaction/X40.py
index f38f22a..3888469 100644
--- a/openqdc/datasets/interaction/X40.py
+++ b/openqdc/datasets/interaction/X40.py
@@ -2,10 +2,11 @@
 from typing import Dict, List
 
 import numpy as np
+import yaml
 from loguru import logger
-from ruamel.yaml import YAML
 
 from openqdc.datasets.interaction import BaseInteractionDataset
+from openqdc.datasets.interaction.L7 import get_loader
 from openqdc.utils.molecule import atom_table
 
 
@@ -41,23 +42,22 @@ def read_raw_entries(self) -> List[Dict]:
         yaml_fpath = os.path.join(self.root, "x40.yaml")
         logger.info(f"Reading X40 interaction data from {self.root}")
         yaml_file = open(yaml_fpath, "r")
-        yaml = YAML()
         data = []
-        data_dict = yaml.load(yaml_file)
-        charge0 = int(data_dict["description"]["global_setup"]["molecule_a"]["charge"])
-        charge1 = int(data_dict["description"]["global_setup"]["molecule_b"]["charge"])
+        data_dict = yaml.load(yaml_file, Loader=get_loader())
+        charge0 = int(data_dict["description"].global_setup["molecule_a"]["charge"])
+        charge1 = int(data_dict["description"].global_setup["molecule_b"]["charge"])
 
         for idx, item in enumerate(data_dict["items"]):
             energies = []
-            name = np.array([item["shortname"]])
-            energies.append(float(item["reference_value"]))
-            xyz_file = open(os.path.join(self.root, f"{item['shortname']}.xyz"), "r")
+            name = np.array([item.shortname])
+            energies.append(float(item.reference_value))
+            xyz_file = open(os.path.join(self.root, f"{item.shortname}.xyz"), "r")
             lines = list(map(lambda x: x.strip().split(), xyz_file.readlines()))
             setup = lines.pop(1)
             n_atoms = np.array([int(lines[0][0])], dtype=np.int32)
             n_atoms_first = setup[0].split("-")[1]
             n_atoms_first = np.array([int(n_atoms_first)], dtype=np.int32)
-            subset = np.array([item["group"]])
+            subset = np.array([item.group])
             energies += [float(val[idx]) for val in list(data_dict["alternative_reference"].values())]
             energies = np.array([energies], dtype=np.float32)
             pos = np.array(lines[1:])[:, 1:].astype(np.float32)

From 5725fed90b5ec5c00df1992d433f1009e2cfb664 Mon Sep 17 00:00:00 2001
From: mcneela <daniel.mcneela@valencelabs.com>
Date: Wed, 13 Mar 2024 11:23:16 -0400
Subject: [PATCH 44/47] modify interaction/base.py to save keys other than
 force/energy in props.pkl

---
 openqdc/datasets/interaction/base.py | 52 ++++++++++++++++++++++++++++
 1 file changed, 52 insertions(+)

diff --git a/openqdc/datasets/interaction/base.py b/openqdc/datasets/interaction/base.py
index 27c2f88..3abe01f 100644
--- a/openqdc/datasets/interaction/base.py
+++ b/openqdc/datasets/interaction/base.py
@@ -1,11 +1,15 @@
+import pickle as pkl
+from os.path import join as p_join
 from typing import Dict, List, Optional
 
 import numpy as np
+from loguru import logger
 from sklearn.utils import Bunch
 
 from openqdc.datasets.base import BaseDataset
 from openqdc.utils.atomization_energies import IsolatedAtomEnergyFactory
 from openqdc.utils.constants import NB_ATOMIC_FEATURES
+from openqdc.utils.io import pull_locally, push_remote
 
 
 class BaseInteractionDataset(BaseDataset):
@@ -89,3 +93,51 @@ def __getitem__(self, idx: int):
             forces=forces,
             n_atoms_first=n_atoms_first,
         )
+
+    def save_preprocess(self, data_dict):
+        # save memmaps
+        logger.info("Preprocessing data and saving it to cache.")
+        for key in self.data_keys:
+            local_path = p_join(self.preprocess_path, f"{key}.mmap")
+            out = np.memmap(local_path, mode="w+", dtype=data_dict[key].dtype, shape=data_dict[key].shape)
+            out[:] = data_dict.pop(key)[:]
+            out.flush()
+            push_remote(local_path, overwrite=True)
+
+        # save all other keys in props.pkl
+        local_path = p_join(self.preprocess_path, "props.pkl")
+        for key in data_dict:
+            if key not in self.data_keys:
+                data_dict[key] = np.unique(data_dict[key], return_inverse=True)
+
+        with open(local_path, "wb") as f:
+            pkl.dump(data_dict, f)
+        push_remote(local_path, overwrite=True)
+
+    def read_preprocess(self, overwrite_local_cache=False):
+        logger.info("Reading preprocessed data.")
+        logger.info(
+            f"Dataset {self.__name__} with the following units:\n\
+                     Energy: {self.energy_unit},\n\
+                     Distance: {self.distance_unit},\n\
+                     Forces: {self.force_unit if self.__force_methods__ else 'None'}"
+        )
+        self.data = {}
+        for key in self.data_keys:
+            filename = p_join(self.preprocess_path, f"{key}.mmap")
+            pull_locally(filename, overwrite=overwrite_local_cache)
+            self.data[key] = np.memmap(filename, mode="r", dtype=self.data_types[key]).reshape(self.data_shapes[key])
+
+        filename = p_join(self.preprocess_path, "props.pkl")
+        pull_locally(filename, overwrite=overwrite_local_cache)
+        with open(filename, "rb") as f:
+            tmp = pkl.load(f)
+            for key in set(tmp.keys()) - set(self.data_keys):
+                x = tmp.pop(key)
+                if len(x) == 2:
+                    self.data[key] = x[0][x[1]]
+                else:
+                    self.data[key] = x
+
+        for key in self.data:
+            logger.info(f"Loaded {key} with shape {self.data[key].shape}, dtype {self.data[key].dtype}")

From 6c6b2867425938018b10e81769d56ad8bf1dae89 Mon Sep 17 00:00:00 2001
From: mcneela <daniel.mcneela@valencelabs.com>
Date: Wed, 13 Mar 2024 12:59:42 -0400
Subject: [PATCH 45/47] fix base dataset issue

---
 openqdc/datasets/interaction/base.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/openqdc/datasets/interaction/base.py b/openqdc/datasets/interaction/base.py
index 3abe01f..42bf573 100644
--- a/openqdc/datasets/interaction/base.py
+++ b/openqdc/datasets/interaction/base.py
@@ -51,7 +51,6 @@ def data_shapes(self):
             "position_idx_range": (-1, 2),
             "energies": (-1, len(self.__energy_methods__)),
             "forces": (-1, 3, len(self.force_target_names)),
-            "n_atoms_first": (-1,),
         }
 
     @property
@@ -61,7 +60,6 @@ def data_types(self):
             "position_idx_range": np.int32,
             "energies": np.float32,
             "forces": np.float32,
-            "n_atoms_first": np.int32,
         }
 
     def __getitem__(self, idx: int):
@@ -108,7 +106,9 @@ def save_preprocess(self, data_dict):
         local_path = p_join(self.preprocess_path, "props.pkl")
         for key in data_dict:
             if key not in self.data_keys:
-                data_dict[key] = np.unique(data_dict[key], return_inverse=True)
+                x = data_dict[key]
+                x[x == None] = np.inf  # elementwise mask; `x is None` is one scalar False  # noqa: E711
+                data_dict[key] = np.unique(x, return_inverse=True)
 
         with open(local_path, "wb") as f:
             pkl.dump(data_dict, f)

From 46c5ebe0235707b0f075dbba7cea961c4337f3a4 Mon Sep 17 00:00:00 2001
From: mcneela <daniel.mcneela@valencelabs.com>
Date: Wed, 13 Mar 2024 14:36:43 -0400
Subject: [PATCH 46/47] fix circular imports

---
 openqdc/datasets/interaction/L7.py       | 2 +-
 openqdc/datasets/interaction/X40.py      | 2 +-
 openqdc/datasets/interaction/des370k.py  | 2 +-
 openqdc/datasets/interaction/des5m.py    | 2 +-
 openqdc/datasets/interaction/dess66.py   | 2 +-
 openqdc/datasets/interaction/dess66x8.py | 2 +-
 openqdc/datasets/interaction/metcalf.py  | 2 +-
 openqdc/datasets/interaction/splinter.py | 2 +-
 8 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/openqdc/datasets/interaction/L7.py b/openqdc/datasets/interaction/L7.py
index fc2f899..4e6ad01 100644
--- a/openqdc/datasets/interaction/L7.py
+++ b/openqdc/datasets/interaction/L7.py
@@ -5,7 +5,7 @@
 import yaml
 from loguru import logger
 
-from openqdc.datasets.interaction import BaseInteractionDataset
+from openqdc.datasets.interaction.base import BaseInteractionDataset
 from openqdc.utils.molecule import atom_table
 
 
diff --git a/openqdc/datasets/interaction/X40.py b/openqdc/datasets/interaction/X40.py
index 3888469..08f4037 100644
--- a/openqdc/datasets/interaction/X40.py
+++ b/openqdc/datasets/interaction/X40.py
@@ -5,7 +5,7 @@
 import yaml
 from loguru import logger
 
-from openqdc.datasets.interaction import BaseInteractionDataset
+from openqdc.datasets.interaction.base import BaseInteractionDataset
 from openqdc.datasets.interaction.L7 import get_loader
 from openqdc.utils.molecule import atom_table
 
diff --git a/openqdc/datasets/interaction/des370k.py b/openqdc/datasets/interaction/des370k.py
index 382b84c..75198fd 100644
--- a/openqdc/datasets/interaction/des370k.py
+++ b/openqdc/datasets/interaction/des370k.py
@@ -6,7 +6,7 @@
 from loguru import logger
 from tqdm import tqdm
 
-from openqdc.datasets.interaction import BaseInteractionDataset
+from openqdc.datasets.interaction.base import BaseInteractionDataset
 from openqdc.utils.io import get_local_cache
 from openqdc.utils.molecule import atom_table, molecule_groups
 
diff --git a/openqdc/datasets/interaction/des5m.py b/openqdc/datasets/interaction/des5m.py
index 5b027f4..46d9ba1 100644
--- a/openqdc/datasets/interaction/des5m.py
+++ b/openqdc/datasets/interaction/des5m.py
@@ -1,6 +1,6 @@
 from typing import Dict, List
 
-from openqdc.datasets.interaction import DES370K
+from openqdc.datasets.interaction.des370k import DES370K
 
 
 class DES5M(DES370K):
diff --git a/openqdc/datasets/interaction/dess66.py b/openqdc/datasets/interaction/dess66.py
index 7b53057..ae3ce81 100644
--- a/openqdc/datasets/interaction/dess66.py
+++ b/openqdc/datasets/interaction/dess66.py
@@ -6,7 +6,7 @@
 from loguru import logger
 from tqdm import tqdm
 
-from openqdc.datasets.interaction import BaseInteractionDataset
+from openqdc.datasets.interaction.base import BaseInteractionDataset
 from openqdc.utils.molecule import atom_table
 
 
diff --git a/openqdc/datasets/interaction/dess66x8.py b/openqdc/datasets/interaction/dess66x8.py
index 1291c7a..2e97221 100644
--- a/openqdc/datasets/interaction/dess66x8.py
+++ b/openqdc/datasets/interaction/dess66x8.py
@@ -6,7 +6,7 @@
 from loguru import logger
 from tqdm import tqdm
 
-from openqdc.datasets.interaction import BaseInteractionDataset
+from openqdc.datasets.interaction.base import BaseInteractionDataset
 from openqdc.utils.molecule import atom_table
 
 
diff --git a/openqdc/datasets/interaction/metcalf.py b/openqdc/datasets/interaction/metcalf.py
index c9921da..041964a 100644
--- a/openqdc/datasets/interaction/metcalf.py
+++ b/openqdc/datasets/interaction/metcalf.py
@@ -3,7 +3,7 @@
 
 import numpy as np
 
-from openqdc.datasets.interaction import BaseInteractionDataset
+from openqdc.datasets.interaction.base import BaseInteractionDataset
 from openqdc.utils.molecule import atom_table
 
 
diff --git a/openqdc/datasets/interaction/splinter.py b/openqdc/datasets/interaction/splinter.py
index 25611c2..06b6d86 100644
--- a/openqdc/datasets/interaction/splinter.py
+++ b/openqdc/datasets/interaction/splinter.py
@@ -5,7 +5,7 @@
 from loguru import logger
 from tqdm import tqdm
 
-from openqdc.datasets.interaction import BaseInteractionDataset
+from openqdc.datasets.interaction.base import BaseInteractionDataset
 from openqdc.utils.molecule import atom_table
 
 

From cb9987cdca2d45eb83464e3ba229059865c4b7d5 Mon Sep 17 00:00:00 2001
From: Daniel McNeela <daniel.mcneela@gmail.com>
Date: Thu, 14 Mar 2024 17:01:59 -0500
Subject: [PATCH 47/47] removed print statements

---
 openqdc/datasets/interaction/base.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/openqdc/datasets/interaction/base.py b/openqdc/datasets/interaction/base.py
index 42bf573..bd3ab70 100644
--- a/openqdc/datasets/interaction/base.py
+++ b/openqdc/datasets/interaction/base.py
@@ -29,7 +29,6 @@ def __init__(
 
     def collate_list(self, list_entries: List[Dict]):
         # concatenate entries
-        print(list_entries[0])
         res = {
             key: np.concatenate([r[key] for r in list_entries if r is not None], axis=0)
             for key in list_entries[0]
@@ -37,7 +36,6 @@ def collate_list(self, list_entries: List[Dict]):
         }
 
         csum = np.cumsum(res.get("n_atoms"))
-        print(csum)
         x = np.zeros((csum.shape[0], 2), dtype=np.int32)
         x[1:, 0], x[:, 1] = csum[:-1], csum
         res["position_idx_range"] = x