valence-labs · FNTwin · Apr 3, 2024 · Mar 28, 2024 · Mar 28, 2024 · Mar 28, 2024
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -47,8 +47,11 @@ jobs:
       - name: Install library
         run: python -m pip install --no-deps .
 
+      - name: Check directory
+        run: ls
+
       - name: Run tests
-        run: pytest
+        run: python -m pytest
 
       - name: Test building the doc
         run: mkdocs build
diff --git a/README.md b/README.md
@@ -90,3 +90,8 @@ We also provide support for the following publicly available QM Noncovalent Inte
 | [Splinter](https://www.nature.com/articles/s41597-023-02443-1) |
 | [X40](https://pubs.acs.org/doi/10.1021/ct300647k) |
 | [L7](https://pubs.acs.org/doi/10.1021/ct400036b)  |
+
+# How to cite
+All data presented in OpenQDC have already been published in scientific journals; the full reference to the respective paper is attached to each dataset class. When citing data obtained from OpenQDC, you should cite both the original paper(s) the data come from and our paper on OpenQDC itself. The reference is:
+
+ADD REF HERE LATER
diff --git a/docs/API/isolated_atom_energies.md b/docs/API/isolated_atom_energies.md
diff --git a/docs/API/methods.md b/docs/API/methods.md
@@ -0,0 +1,3 @@
+# QM Methods
+
+::: openqdc.methods
diff --git a/docs/tutorials/usage.ipynb b/docs/tutorials/usage.ipynb
@@ -657,7 +657,7 @@
     "\n",
     "$U(A_1, A_2, ...) = \\sum_{i_1}^N e_0(A_i) + e(A_1, A_2, ...)$\n",
     "\n",
-    "The isolated atoms energies are automatically used inside the datasets for the correct level of theory, but you can also use them directly by accessing the IsolatedAtomEnergyFactor class."
+    "The isolated atom energies are automatically associated with the correct level of theory, and you can access them as follows:"
    ]
   },
   {
@@ -715,10 +715,11 @@
     }
    ],
    "source": [
-    "from openqdc.utils.atomization_energies import IsolatedAtomEnergyFactory\n",
+    "from openqdc.methods import QmMethod\n",
     "\n",
-    "# Get the hasmap of isolated atom energies for the b3lyp/6-31g* method\n",
-    "IsolatedAtomEnergyFactory.get(\"b3lyp/6-31g*\")"
+    "# Get the b3lyp/6-31g* method\n",
+    "method = QmMethod.B3LYP_6_31G_D\n",
+    "method.atom_energies_dict"
    ]
   },
   {
@@ -745,7 +746,7 @@
    ],
    "source": [
     "# Get the matrix of atomization energies for the b3lyp/6-31g* method\n",
-    "IsolatedAtomEnergyFactory.get_matrix(\"b3lyp/6-31g*\")"
+    "method.atom_energies_matrix"
    ]
   },
   {

diff --git a/openqdc/cli.py b/openqdc/cli.py
@@ -20,10 +20,11 @@ def exist_dataset(dataset):
 
 
 def format_entry(empty_dataset):
-    if len(empty_dataset.__energy_methods__) > 10:
-        entry = ",".join(empty_dataset.__energy_methods__[:10]) + "..."
+    energy_methods = [str(x) for x in empty_dataset.__energy_methods__]
+    if len(energy_methods) > 10:
+        entry = ",".join(energy_methods[:10]) + "..."
     else:
-        entry = ",".join(empty_dataset.__energy_methods__[:10])
+        entry = ",".join(energy_methods[:10])
     return entry
 
 

diff --git a/openqdc/datasets/base.py b/openqdc/datasets/base.py
@@ -15,11 +15,9 @@
 from sklearn.utils import Bunch
 from tqdm import tqdm
 
-from openqdc.utils.atomization_energies import (
-    IsolatedAtomEnergyFactory,
-    chemical_symbols,
-)
 from openqdc.utils.constants import (
+    ATOM_SYMBOLS,
+    MAX_CHARGE,
     NB_ATOMIC_FEATURES,
     NOT_DEFINED,
     POSSIBLE_NORMALIZATION,
@@ -135,7 +133,7 @@ def __force_methods__(self):
 
     @property
     def energy_methods(self):
-        return self.__energy_methods__
+        return [str(i) for i in self.__energy_methods__]
 
     @property
     def force_methods(self):
@@ -205,7 +203,7 @@ def _set_linear_e0s(self):
     def _precompute_E(self):
         splits_idx = self.data["position_idx_range"][:, 1]
         s = np.array(self.data["atomic_inputs"][:, :2], dtype=int)
-        s[:, 1] += IsolatedAtomEnergyFactory.max_charge
+        s[:, 1] += MAX_CHARGE
         matrixs = [matrix[s[:, 0], s[:, 1]] for matrix in self.__isolated_atom_energies__]
         REGRESSOR_SUCCESS = False
         try:
@@ -279,7 +277,7 @@ def numbers(self):
 
     @property
     def chemical_species(self):
-        return np.array(chemical_symbols)[self.numbers]
+        return np.array(ATOM_SYMBOLS)[self.numbers]
 
     @property
     def energy_unit(self):
@@ -347,7 +345,7 @@ def _set_new_e0s_unit(self, en):
     @property
     def force_mask(self):
         if len(self.__class__.__force_mask__) == 0:
-            self.__class__.__force_mask__ = [False] * len(self.energy_methods)
+            self.__class__.__force_mask__ = [False] * len(self.__energy_methods__)
         return self.__class__.__force_mask__
 
     def _set_units(self, en, ds):
@@ -364,11 +362,11 @@ def _set_units(self, en, ds):
             self.__class__.__fn_forces__ = get_conversion(old_en + "/" + old_ds, self.__forces_unit__)
 
     def _set_isolated_atom_energies(self):
-        if self.energy_methods is None:
+        if self.__energy_methods__ is None:
             logger.error("No energy methods defined for this dataset.")
         f = get_conversion("hartree", self.__energy_unit__)
         self.__isolated_atom_energies__ = f(
-            np.array([IsolatedAtomEnergyFactory.get_matrix(energy_method) for energy_method in self.energy_methods])
+            np.array([en_method.atom_energies_matrix for en_method in self.__energy_methods__])
         )
 
     def convert_energy(self, x):
@@ -703,7 +701,7 @@ def __smiles_converter__(self, x):
         return x
 
     def __getitem__(self, idx: int):
-        shift = IsolatedAtomEnergyFactory.max_charge
+        shift = MAX_CHARGE
         p_start, p_end = self.data["position_idx_range"][idx]
         input = self.data["atomic_inputs"][p_start:p_end]
         z, c, positions, energies = (

diff --git a/openqdc/datasets/interaction/L7.py b/openqdc/datasets/interaction/L7.py
@@ -6,7 +6,8 @@
 from loguru import logger
 
 from openqdc.datasets.interaction.base import BaseInteractionDataset
-from openqdc.utils.molecule import atom_table
+from openqdc.methods import InteractionMethod, InterEnergyType
+from openqdc.utils.constants import ATOM_TABLE
 
 
 class DataItemYAMLObj:
@@ -66,16 +67,18 @@ class L7(BaseInteractionDataset):
     __distance_unit__ = "ang"
     __forces_unit__ = "hartree/ang"
     __energy_methods__ = [
-        "CSD(T) | QCISD(T)",
-        "DLPNO-CCSD(T)",
-        "MP2/CBS",
-        "MP2C/CBS",
-        "fixed",
-        "DLPNO-CCSD(T0)",
-        "LNO-CCSD(T)",
-        "FN-DMC",
+        InteractionMethod.QCISDT_CBS,  # "QCISD(T)/CBS",
+        InteractionMethod.DLPNO_CCSDT,  # "DLPNO-CCSD(T)",
+        InteractionMethod.MP2_CBS,  # "MP2/CBS",
+        InteractionMethod.MP2C_CBS,  # "MP2C/CBS",
+        InteractionMethod.FIXED,  # "fixed", TODO: we should remove this level of theory because unless we have a pro
+        InteractionMethod.DLPNO_CCSDT0,  # "DLPNO-CCSD(T0)",
+        InteractionMethod.LNO_CCSDT,  # "LNO-CCSD(T)",
+        InteractionMethod.FN_DMC,  # "FN-DMC",
     ]
 
+    __energy_type__ = [InterEnergyType.TOTAL] * 8
+
     energy_target_names = []
 
     def read_raw_entries(self) -> List[Dict]:
@@ -102,7 +105,7 @@ def read_raw_entries(self) -> List[Dict]:
             energies = np.array([energies], dtype=np.float32)
             pos = np.array(lines[1:])[:, 1:].astype(np.float32)
             elems = np.array(lines[1:])[:, 0]
-            atomic_nums = np.expand_dims(np.array([atom_table.GetAtomicNumber(x) for x in elems]), axis=1)
+            atomic_nums = np.expand_dims(np.array([ATOM_TABLE.GetAtomicNumber(x) for x in elems]), axis=1)
             natoms0 = n_atoms_first[0]
             natoms1 = n_atoms[0] - natoms0
             charges = np.expand_dims(np.array([charge0] * natoms0 + [charge1] * natoms1), axis=1)

diff --git a/openqdc/datasets/interaction/X40.py b/openqdc/datasets/interaction/X40.py
@@ -7,7 +7,8 @@
 
 from openqdc.datasets.interaction.base import BaseInteractionDataset
 from openqdc.datasets.interaction.L7 import get_loader
-from openqdc.utils.molecule import atom_table
+from openqdc.methods import InteractionMethod, InterEnergyType
+from openqdc.utils.constants import ATOM_TABLE
 
 
 class X40(BaseInteractionDataset):
@@ -29,12 +30,15 @@ class X40(BaseInteractionDataset):
     __distance_unit__ = "ang"
     __forces_unit__ = "hartree/ang"
     __energy_methods__ = [
-        "CCSD(T)/CBS",
-        "MP2/CBS",
-        "dCCSD(T)/haDZ",
-        "dCCSD(T)/haTZ",
-        "MP2.5/CBS(aDZ)",
+        InteractionMethod.CCSD_T_CBS,  # "CCSD(T)/CBS",
+        InteractionMethod.MP2_CBS,  # "MP2/CBS",
+        InteractionMethod.DCCSDT_HA_DZ,  # "dCCSD(T)/haDZ",
+        InteractionMethod.DCCSDT_HA_TZ,  # "dCCSD(T)/haTZ",
+        InteractionMethod.MP2_5_CBS_ADZ,  # "MP2.5/CBS(aDZ)",
     ]
+    __energy_type__ = [
+        InterEnergyType.TOTAL,
+    ] * 5
 
     energy_target_names = []
 
@@ -62,7 +66,7 @@ def read_raw_entries(self) -> List[Dict]:
             energies = np.array([energies], dtype=np.float32)
             pos = np.array(lines[1:])[:, 1:].astype(np.float32)
             elems = np.array(lines[1:])[:, 0]
-            atomic_nums = np.expand_dims(np.array([atom_table.GetAtomicNumber(x) for x in elems]), axis=1)
+            atomic_nums = np.expand_dims(np.array([ATOM_TABLE.GetAtomicNumber(x) for x in elems]), axis=1)
             natoms0 = n_atoms_first[0]
             natoms1 = n_atoms[0] - natoms0
             charges = np.expand_dims(np.array([charge0] * natoms0 + [charge1] * natoms1), axis=1)

diff --git a/openqdc/datasets/interaction/base.py b/openqdc/datasets/interaction/base.py
@@ -9,25 +9,12 @@
 from sklearn.utils import Bunch
 
 from openqdc.datasets.base import BaseDataset
-from openqdc.utils.atomization_energies import IsolatedAtomEnergyFactory
-from openqdc.utils.constants import NB_ATOMIC_FEATURES
+from openqdc.utils.constants import MAX_CHARGE, NB_ATOMIC_FEATURES
 from openqdc.utils.io import pull_locally, push_remote, to_atoms
 
 
 class BaseInteractionDataset(BaseDataset):
-    def __init__(
-        self,
-        energy_unit: Optional[str] = None,
-        distance_unit: Optional[str] = None,
-        overwrite_local_cache: bool = False,
-        cache_dir: Optional[str] = None,
-    ) -> None:
-        super().__init__(
-            energy_unit=energy_unit,
-            distance_unit=distance_unit,
-            overwrite_local_cache=overwrite_local_cache,
-            cache_dir=cache_dir,
-        )
+    __energy_type__ = []
 
     def collate_list(self, list_entries: List[Dict]):
         # concatenate entries
@@ -63,7 +50,7 @@ def data_types(self):
         }
 
     def __getitem__(self, idx: int):
-        shift = IsolatedAtomEnergyFactory.max_charge
+        shift = MAX_CHARGE
         p_start, p_end = self.data["position_idx_range"][idx]
         input = self.data["atomic_inputs"][p_start:p_end]
         z, c, positions, energies = (

diff --git a/openqdc/datasets/interaction/des370k.py b/openqdc/datasets/interaction/des370k.py
@@ -7,8 +7,10 @@
 from tqdm import tqdm
 
 from openqdc.datasets.interaction.base import BaseInteractionDataset
+from openqdc.methods import InteractionMethod, InterEnergyType
+from openqdc.utils.constants import ATOM_TABLE
 from openqdc.utils.io import get_local_cache
-from openqdc.utils.molecule import atom_table, molecule_groups
+from openqdc.utils.molecule import molecule_groups
 
 
 class DES370K(BaseInteractionDataset):
@@ -27,23 +29,43 @@ class DES370K(BaseInteractionDataset):
     __distance_unit__ = "ang"
     __forces_unit__ = "hartree/ang"
     __energy_methods__ = [
-        "mp2/cc-pvdz",
-        "mp2/cc-pvqz",
-        "mp2/cc-pvtz",
-        "mp2/cbs",
-        "ccsd(t)/cc-pvdz",
-        "ccsd(t)/cbs",  # cbs
-        "ccsd(t)/nn",  # nn
-        "sapt0/aug-cc-pwcvxz",
-        "sapt0/aug-cc-pwcvxz_es",
-        "sapt0/aug-cc-pwcvxz_ex",
-        "sapt0/aug-cc-pwcvxz_exs2",
-        "sapt0/aug-cc-pwcvxz_ind",
-        "sapt0/aug-cc-pwcvxz_exind",
-        "sapt0/aug-cc-pwcvxz_disp",
-        "sapt0/aug-cc-pwcvxz_exdisp_os",
-        "sapt0/aug-cc-pwcvxz_exdisp_ss",
-        "sapt0/aug-cc-pwcvxz_delta_HF",
+        InteractionMethod.MP2_CC_PVDZ,
+        InteractionMethod.MP2_CC_PVQZ,
+        InteractionMethod.MP2_CC_PVTZ,
+        InteractionMethod.MP2_CBS,
+        InteractionMethod.CCSD_T_CC_PVDZ,
+        InteractionMethod.CCSD_T_CBS,
+        InteractionMethod.CCSD_T_NN,
+        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,
+        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,
+        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,
+        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,
+        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,
+        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,
+        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,
+        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,
+        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,
+        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,
+    ]
+
+    __energy_type__ = [
+        InterEnergyType.TOTAL,
+        InterEnergyType.TOTAL,
+        InterEnergyType.TOTAL,
+        InterEnergyType.TOTAL,
+        InterEnergyType.TOTAL,
+        InterEnergyType.TOTAL,
+        InterEnergyType.TOTAL,
+        InterEnergyType.TOTAL,
+        InterEnergyType.ES,
+        InterEnergyType.EX,
+        InterEnergyType.EX_S2,
+        InterEnergyType.IND,
+        InterEnergyType.EX_IND,
+        InterEnergyType.DISP,
+        InterEnergyType.EX_DISP_OS,
+        InterEnergyType.EX_DISP_SS,
+        InterEnergyType.DELTA_HF,
     ]
 
     energy_target_names = [
@@ -87,7 +109,7 @@ def _read_raw_entries(cls) -> List[Dict]:
 
             elements = row["elements"].split()
 
-            atomic_nums = np.expand_dims(np.array([atom_table.GetAtomicNumber(x) for x in elements]), axis=1)
+            atomic_nums = np.expand_dims(np.array([ATOM_TABLE.GetAtomicNumber(x) for x in elements]), axis=1)
 
             charges = np.expand_dims(np.array([charge0] * natoms0 + [charge1] * natoms1), axis=1)