valence-labs · mcneela · Mar 14, 2024 · Mar 1, 2024 · Mar 1, 2024 · Mar 1, 2024
diff --git a/README.md b/README.md
@@ -30,7 +30,7 @@ pytest
 6. QM Level of Theory
  -->
 
-We provide support for the following publicly available QM Datasets.
+We provide support for the following publicly available QM Potential Energy Datasets.
 
 | Dataset | # Molecules | # Conformers | Average Conformers per Molecule | Force Labels | Atom Types | QM Level of Theory | Off-Equilibrium Conformations|
 | --- | --- | --- | --- | --- | --- | --- | --- |
@@ -46,3 +46,11 @@ We provide support for the following publicly available QM Datasets.
 | [OrbNet Denali](https://arxiv.org/abs/2107.00299) | 212,905 | 2,300,000 | 11 | No | 16 | GFN1-xTB | Yes |
 | [SN2RXN](https://pubs.acs.org/doi/10.1021/acs.jctc.9b00181) | 39 | 452709 | 11,600 | Yes | 6 | DSD-BLYP-D3(BJ)/def2-TZVP | |
 | [QM7X](https://www.nature.com/articles/s41597-021-00812-2) |  6,950 | 4,195,237 | 603 | Yes | 7 | PBE0+MBD | Yes |
+
+We also provide support for the following publicly available QM Noncovalent Interaction Energy Datasets.
+
+| Dataset | 
+| --- | 
+| [DES370K](https://www.nature.com/articles/s41597-021-00833-x) |
+| [DES5M](https://www.nature.com/articles/s41597-021-00833-x)   |
+| [Metcalf](https://pubs.aip.org/aip/jcp/article/152/7/074103/1059677/Approaches-for-machine-learning-intermolecular) |
diff --git a/src/openqdc/datasets/interaction/L7.py b/src/openqdc/datasets/interaction/L7.py
@@ -0,0 +1,73 @@
+import os
+import numpy as np
+import pandas as pd
+
+from typing import Dict, List
+
+from tqdm import tqdm
+from rdkit import Chem
+from ruamel.yaml import YAML
+from loguru import logger
+from openqdc.datasets.interaction import BaseInteractionDataset
+from openqdc.utils.molecule import atom_table, molecule_groups
+
+
+class L7(BaseInteractionDataset):
+    """L7 interaction-energy dataset.
+
+    Raw data is read from ``l7.yaml`` plus one ``.xyz`` geometry file per
+    YAML item, all located under ``self.root``.
+    """
+
+    __name__ = "L7"
+    __energy_unit__ = "hartree"
+    __distance_unit__ = "ang"
+    __forces_unit__ = "hartree/ang"
+    # Each entry stores the item's ``reference_value`` followed by one value per
+    # ``alternative_reference`` column of the YAML file; presumably these line up
+    # with the labels below -- TODO confirm against l7.yaml.
+    # NOTE(review): "CSD(T) | QCISD(T)" looks like it may be missing a "C" -- confirm.
+    __energy_methods__ = [
+        "CSD(T) | QCISD(T)",
+        "DLPNO-CCSD(T)",
+        "MP2/CBS",
+        "MP2C/CBS",
+        "fixed",
+        "DLPNO-CCSD(T0)",
+        "LNO-CCSD(T)",
+        "FN-DMC",
+    ]
+
+    energy_target_names = []
+
+    def read_raw_entries(self) -> List[Dict]:
+        """Parse ``l7.yaml`` and the referenced XYZ files into raw entries.
+
+        Returns:
+            One dict per YAML item with the keys ``energies``, ``subset``,
+            ``n_atoms``, ``n_atoms_first``, ``atomic_inputs`` and ``name``.
+            ``atomic_inputs`` holds, per atom, the atomic number, the charge
+            of the molecule the atom belongs to, and the three coordinates.
+        """
+        yaml_fpath = os.path.join(self.root, "l7.yaml")
+        logger.info(f"Reading L7 interaction data from {self.root}")
+        # Context manager: the handle is released even if parsing raises.
+        with open(yaml_fpath, "r") as yaml_file:
+            data_dict = YAML().load(yaml_file)
+
+        global_setup = data_dict["description"]["global_setup"]
+        charge0 = int(global_setup["molecule_a"]["charge"])
+        charge1 = int(global_setup["molecule_b"]["charge"])
+        # Loop-invariant: one sequence of per-item values for each alternative method.
+        alternative_refs = list(data_dict["alternative_reference"].values())
+
+        data = []
+        for idx, item in enumerate(data_dict["items"]):
+            name = np.array([item["shortname"]])
+            subset = np.array([item["group"]])
+
+            # The geometry field has the form "<prefix>:<file stem>".
+            fname = item["geometry"].split(":")[1]
+            with open(os.path.join(self.root, f"{fname}.xyz"), "r") as xyz_file:
+                lines = [line.strip().split() for line in xyz_file]
+            lines.pop(1)  # discard the XYZ comment line
+
+            total_atoms = int(lines[0][0])
+            # Upper bound of the "a-b" selection is the size of the first molecule.
+            natoms0 = int(item["setup"]["molecule_a"]["selection"].split("-")[1])
+            natoms1 = total_atoms - natoms0
+            n_atoms = np.array([total_atoms], dtype=np.int32)
+            n_atoms_first = np.array([natoms0], dtype=np.int32)
+
+            energies = [float(item["reference_value"])]
+            energies += [float(val[idx]) for val in alternative_refs]
+            energies = np.array([energies], dtype=np.float32)
+
+            # Keep exactly the declared number of atom rows so that trailing
+            # blank lines in the file cannot produce a ragged array.
+            atom_rows = np.array(lines[1 : 1 + total_atoms])
+            elems = atom_rows[:, 0]
+            pos = atom_rows[:, 1:].astype(np.float32)
+            atomic_nums = np.expand_dims(np.array([atom_table.GetAtomicNumber(x) for x in elems]), axis=1)
+            # Every atom is tagged with the total charge of its parent molecule.
+            charges = np.expand_dims(np.array([charge0] * natoms0 + [charge1] * natoms1), axis=1)
+            atomic_inputs = np.concatenate((atomic_nums, charges, pos), axis=-1, dtype=np.float32)
+
+            entry = dict(
+                energies=energies,
+                subset=subset,
+                n_atoms=n_atoms,
+                n_atoms_first=n_atoms_first,
+                atomic_inputs=atomic_inputs,
+                name=name,
+            )
+            data.append(entry)
+        return data
diff --git a/src/openqdc/datasets/interaction/X40.py b/src/openqdc/datasets/interaction/X40.py
@@ -0,0 +1,70 @@
+import os
+import numpy as np
+import pandas as pd
+
+from typing import Dict, List
+
+from tqdm import tqdm
+from rdkit import Chem
+from ruamel.yaml import YAML
+from loguru import logger
+from openqdc.datasets.interaction import BaseInteractionDataset
+from openqdc.utils.molecule import atom_table, molecule_groups
+
+
+class X40(BaseInteractionDataset):
+    """X40 interaction-energy dataset.
+
+    Raw data is read from ``x40.yaml`` plus one ``<shortname>.xyz`` geometry
+    file per YAML item, all located under ``self.root``.
+    """
+
+    __name__ = "X40"
+    __energy_unit__ = "hartree"
+    __distance_unit__ = "ang"
+    __forces_unit__ = "hartree/ang"
+    # Each entry stores the item's ``reference_value`` followed by one value per
+    # ``alternative_reference`` column of the YAML file; presumably these line up
+    # with the labels below -- TODO confirm against x40.yaml.
+    __energy_methods__ = [
+        "default",
+        "MP2/CBS",
+        "dCCSD(T)/haDZ",
+        "dCCSD(T)/haTZ",
+        "MP2.5/CBS(aDZ)",
+    ]
+
+    energy_target_names = []
+
+    def read_raw_entries(self) -> List[Dict]:
+        """Parse ``x40.yaml`` and the referenced XYZ files into raw entries.
+
+        Returns:
+            One dict per YAML item with the keys ``energies``, ``subset``,
+            ``n_atoms``, ``n_atoms_first``, ``atomic_inputs`` and ``name``.
+            ``atomic_inputs`` holds, per atom, the atomic number, the charge
+            of the molecule the atom belongs to, and the three coordinates.
+        """
+        yaml_fpath = os.path.join(self.root, "x40.yaml")
+        logger.info(f"Reading X40 interaction data from {self.root}")
+        # Context manager: the handle is released even if parsing raises.
+        with open(yaml_fpath, "r") as yaml_file:
+            data_dict = YAML().load(yaml_file)
+
+        global_setup = data_dict["description"]["global_setup"]
+        charge0 = int(global_setup["molecule_a"]["charge"])
+        charge1 = int(global_setup["molecule_b"]["charge"])
+        # Loop-invariant: one sequence of per-item values for each alternative method.
+        alternative_refs = list(data_dict["alternative_reference"].values())
+
+        data = []
+        for idx, item in enumerate(data_dict["items"]):
+            name = np.array([item["shortname"]])
+            subset = np.array([item["group"]])
+
+            with open(os.path.join(self.root, f"{item['shortname']}.xyz"), "r") as xyz_file:
+                lines = [line.strip().split() for line in xyz_file]
+            # The XYZ comment line carries the monomer split: the first token is
+            # an "a-b" range whose upper bound is the size of the first molecule.
+            setup = lines.pop(1)
+
+            total_atoms = int(lines[0][0])
+            natoms0 = int(setup[0].split("-")[1])
+            natoms1 = total_atoms - natoms0
+            n_atoms = np.array([total_atoms], dtype=np.int32)
+            n_atoms_first = np.array([natoms0], dtype=np.int32)
+
+            energies = [float(item["reference_value"])]
+            energies += [float(val[idx]) for val in alternative_refs]
+            energies = np.array([energies], dtype=np.float32)
+
+            # Keep exactly the declared number of atom rows so that trailing
+            # blank lines in the file cannot produce a ragged array.
+            atom_rows = np.array(lines[1 : 1 + total_atoms])
+            elems = atom_rows[:, 0]
+            pos = atom_rows[:, 1:].astype(np.float32)
+            atomic_nums = np.expand_dims(np.array([atom_table.GetAtomicNumber(x) for x in elems]), axis=1)
+            # Every atom is tagged with the total charge of its parent molecule.
+            charges = np.expand_dims(np.array([charge0] * natoms0 + [charge1] * natoms1), axis=1)
+            atomic_inputs = np.concatenate((atomic_nums, charges, pos), axis=-1, dtype=np.float32)
+
+            entry = dict(
+                energies=energies,
+                subset=subset,
+                n_atoms=n_atoms,
+                n_atoms_first=n_atoms_first,
+                atomic_inputs=atomic_inputs,
+                name=name,
+            )
+            data.append(entry)
+        return data
diff --git a/src/openqdc/datasets/interaction/__init__.py b/src/openqdc/datasets/interaction/__init__.py
@@ -0,0 +1,69 @@
+import importlib
+import os
+from typing import TYPE_CHECKING  # noqa F401
+
+# The below lazy import logic is coming from openff-toolkit:
+# https://github.com/openforcefield/openff-toolkit/blob/b52879569a0344878c40248ceb3bd0f90348076a/openff/toolkit/__init__.py#L44
+
+# Dictionary of objects to lazily import; maps the object's name to its module path
+
+# Built from (public name, module file stem) pairs; every module lives in the
+# ``openqdc.datasets.interaction`` package.
+_lazy_imports_obj = {
+    public_name: f"openqdc.datasets.interaction.{module_stem}"
+    for public_name, module_stem in (
+        ("BaseInteractionDataset", "base"),
+        ("DES370K", "des370k"),
+        ("DES5M", "des5m"),
+        ("Metcalf", "metcalf"),
+        ("DESS66", "dess66"),
+        ("DESS66x8", "dess66x8"),
+        ("L7", "L7"),
+        ("X40", "X40"),
+        ("Splinter", "splinter"),
+    )
+}
+
+# No whole modules are exposed lazily.
+_lazy_imports_mod = dict()
+
+
+def __getattr__(name):
+    """Resolve ``name`` on first access via the lazy-import tables.
+
+    Python only calls this hook when normal module attribute lookup fails.
+    Names listed in ``_lazy_imports_obj`` yield the object of that name from
+    the mapped module; names listed in ``_lazy_imports_mod`` yield the mapped
+    module itself. Anything else raises ``AttributeError``.
+    """
+    module_path = _lazy_imports_obj.get(name)
+    if module_path is None:
+        submodule_path = _lazy_imports_mod.get(name)
+        if submodule_path is None:
+            raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
+        return importlib.import_module(submodule_path)
+
+    return vars(importlib.import_module(module_path))[name]
+
+
+def __dir__():
+    """Return the sorted module globals plus every lazily importable name."""
+    names = list(globals())
+    names.extend(_lazy_imports_obj)
+    names.extend(_lazy_imports_mod)
+    names.sort()
+    return names
+
+
+# Eager-import path: taken when a static type checker analyses this module
+# (TYPE_CHECKING) or when the OPENQDC_DISABLE_LAZY_LOADING environment variable
+# is set to "1". Otherwise these names are resolved on demand by __getattr__.
+if TYPE_CHECKING or os.environ.get("OPENQDC_DISABLE_LAZY_LOADING", "0") == "1":
+    from .base import BaseInteractionDataset
+    from .des370k import DES370K
+    from .des5m import DES5M
+    from .metcalf import Metcalf
+    from .dess66 import DESS66
+    from .dess66x8 import DESS66x8
+    from .L7 import L7
+    from .X40 import X40
+    from .splinter import Splinter
+
+    # NOTE: __all__ is only defined on this eager path; it lists the same names
+    # as the keys of _lazy_imports_obj.
+    __all__ = [
+        "BaseInteractionDataset",
+        "DES370K",
+        "DES5M",
+        "Metcalf",
+        "DESS66",
+        "DESS66x8",
+        "L7",
+        "X40",
+        "Splinter",
+    ]
diff --git a/src/openqdc/datasets/interaction/base.py b/src/openqdc/datasets/interaction/base.py
@@ -0,0 +1,57 @@
+from typing import Dict, List, Optional, Union
+from openqdc.utils.io import (
+    copy_exists,
+    dict_to_atoms,
+    get_local_cache,
+    load_hdf5_file,
+    load_pkl,
+    pull_locally,
+    push_remote,
+    set_cache_dir,
+)
+from openqdc.datasets.potential.base import BaseDataset
+from openqdc.utils.constants import (
+    NB_ATOMIC_FEATURES
+)
+
+from loguru import logger
+
+import numpy as np
+
+class BaseInteractionDataset(BaseDataset):
+    """Base class for interaction-energy datasets, layered on ``BaseDataset``."""
+
+    def __init__(
+        self,
+        energy_unit: Optional[str] = None,
+        distance_unit: Optional[str] = None,
+        overwrite_local_cache: bool = False,
+        cache_dir: Optional[str] = None,
+    ) -> None:
+        """Forward all arguments unchanged to ``BaseDataset.__init__``."""
+        super().__init__(
+            energy_unit=energy_unit,
+            distance_unit=distance_unit,
+            overwrite_local_cache=overwrite_local_cache,
+            cache_dir=cache_dir,
+        )
+
+    def collate_list(self, list_entries: List[Dict]):
+        """Concatenate per-sample entries into one dict of stacked arrays.
+
+        ``None`` entries are skipped. Keys are taken from the first remaining
+        entry; keys whose value there is a ``dict`` are left out. All other
+        values are concatenated along axis 0.
+
+        Args:
+            list_entries: Raw entries, e.g. as returned by ``read_raw_entries``.
+
+        Returns:
+            The concatenated arrays plus ``position_idx_range``, an ``(N, 2)``
+            int32 array holding for each entry the ``[start, end)`` offsets
+            obtained from the cumulative sum of ``n_atoms``.
+
+        Raises:
+            IndexError: If ``list_entries`` holds no non-``None`` entry.
+        """
+        # Drop missing entries first so the key template is never ``None``.
+        entries = [entry for entry in list_entries if entry is not None]
+        template = entries[0]
+        res = {
+            key: np.concatenate([entry[key] for entry in entries], axis=0)
+            for key, value in template.items()
+            if not isinstance(value, dict)
+        }
+
+        csum = np.cumsum(res.get("n_atoms"))
+        x = np.zeros((csum.shape[0], 2), dtype=np.int32)
+        # Start of entry i is the end of entry i - 1; the first start stays 0.
+        x[1:, 0], x[:, 1] = csum[:-1], csum
+        res["position_idx_range"] = x
+
+        return res
+
+    @property
+    def data_shapes(self):
+        """Map each stored array name to its shape; ``-1`` marks the free axis."""
+        return {
+            "atomic_inputs": (-1, NB_ATOMIC_FEATURES),
+            "position_idx_range": (-1, 2),
+            "energies": (-1, len(self.__energy_methods__)),
+            "forces": (-1, 3, len(self.force_target_names)),
+        }