valence-labs · prtos · Jun 8, 2024 · Mar 22, 2024 · Mar 22, 2024 · Mar 22, 2024
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -47,8 +47,11 @@ jobs:
       - name: Install library
         run: python -m pip install --no-deps .
 
+      - name: Check directory
+        run: ls
+
       - name: Run tests
-        run: pytest
+        run: python -m pytest
 
-      - name: Test building the doc
-        run: mkdocs build
+      #- name: Test building the doc
+      #  run: mkdocs build
diff --git a/README.md b/README.md
@@ -90,3 +90,8 @@ We also provide support for the following publicly available QM Noncovalent Inte
 | [Splinter](https://www.nature.com/articles/s41597-023-02443-1) |
 | [X40](https://pubs.acs.org/doi/10.1021/ct300647k) |
 | [L7](https://pubs.acs.org/doi/10.1021/ct400036b)  |
+
+# How to cite
+All data presented in OpenQDC are already published in scientific journals; the full reference to the respective paper is attached to each dataset class. When citing data obtained from OpenQDC, you should cite both the original paper(s) the data come from and our paper on OpenQDC itself. The reference is:
+
+ADD REF HERE LATER
diff --git a/docs/API/isolated_atom_energies.md b/docs/API/isolated_atom_energies.md
diff --git a/docs/API/methods.md b/docs/API/methods.md
@@ -0,0 +1,3 @@
+# QM Methods
+
+::: openqdc.methods
diff --git a/docs/tutorials/usage.ipynb b/docs/tutorials/usage.ipynb
@@ -657,7 +657,7 @@
     "\n",
     "$U(A_1, A_2, ...) = \\sum_{i_1}^N e_0(A_i) + e(A_1, A_2, ...)$\n",
     "\n",
-    "The isolated atoms energies are automatically used inside the datasets for the correct level of theory, but you can also use them directly by accessing the IsolatedAtomEnergyFactor class."
+    "The isolated atom energies are automatically associated with the correct level of theory, and you can access them as follows:"
    ]
   },
   {
@@ -715,10 +715,11 @@
     }
    ],
    "source": [
-    "from openqdc.utils.atomization_energies import IsolatedAtomEnergyFactory\n",
+    "from openqdc.methods import QmMethod\n",
     "\n",
-    "# Get the hasmap of isolated atom energies for the b3lyp/6-31g* method\n",
-    "IsolatedAtomEnergyFactory.get(\"b3lyp/6-31g*\")"
+    "# Get the b3lyp/6-31g* method\n",
+    "method = QmMethod.B3LYP_6_31G_D\n",
+    "method.atom_energies_dict"
    ]
   },
   {
@@ -745,7 +746,7 @@
    ],
    "source": [
     "# Get the matrix of atomization energies for the b3lyp/6-31g* method\n",
-    "IsolatedAtomEnergyFactory.get_matrix(\"b3lyp/6-31g*\")"
+    "method.atom_energies_matrix"
    ]
   },
   {

diff --git a/mkdocs.yml b/mkdocs.yml
@@ -12,8 +12,8 @@ docs_dir: "docs"
 nav:
   - Overview: index.md
   - Available Datasets: datasets.md
-  - Tutorials:
-    - Really hard example: tutorials/usage.ipynb
+  #- Tutorials:
+  #  #- Really hard example: tutorials/usage.ipynb
   - API:
     - Datasets: API/available_datasets.md
     - Isolated Atoms Energies: API/isolated_atom_energies.md

diff --git a/openqdc/__init__.py b/openqdc/__init__.py
@@ -1,39 +1,61 @@
 import importlib
 import os
-from typing import TYPE_CHECKING  # noqa F401
+from typing import TYPE_CHECKING
 
 # The below lazy import logic is coming from openff-toolkit:
 # https://github.com/openforcefield/openff-toolkit/blob/b52879569a0344878c40248ceb3bd0f90348076a/openff/toolkit/__init__.py#L44
 
+
 # Dictionary of objects to lazily import; maps the object's name to its module path
+def get_project_root():
+    """Return the absolute path of the parent of the directory holding this file."""
+    this_file = os.path.abspath(__file__)
+    package_dir = os.path.dirname(this_file)
+    return os.path.dirname(package_dir)
+
 
 _lazy_imports_obj = {
     "__version__": "openqdc._version",
     "BaseDataset": "openqdc.datasets.base",
+    # POTENTIAL
     "ANI1": "openqdc.datasets.potential.ani",
     "ANI1CCX": "openqdc.datasets.potential.ani",
+    "ANI1CCX_V2": "openqdc.datasets.potential.ani",
     "ANI1X": "openqdc.datasets.potential.ani",
     "Spice": "openqdc.datasets.potential.spice",
     "SpiceV2": "openqdc.datasets.potential.spice",
+    "SpiceVL2": "openqdc.datasets.potential.spice",
     "GEOM": "openqdc.datasets.potential.geom",
     "QMugs": "openqdc.datasets.potential.qmugs",
+    "QMugs_V2": "openqdc.datasets.potential.qmugs",
     "ISO17": "openqdc.datasets.potential.iso_17",
     "COMP6": "openqdc.datasets.potential.comp6",
     "GDML": "openqdc.datasets.potential.gdml",
     "Molecule3D": "openqdc.datasets.potential.molecule3d",
     "OrbnetDenali": "openqdc.datasets.potential.orbnet_denali",
     "SN2RXN": "openqdc.datasets.potential.sn2_rxn",
     "QM7X": "openqdc.datasets.potential.qm7x",
+    "QM7X_V2": "openqdc.datasets.potential.qm7x",
     "NablaDFT": "openqdc.datasets.potential.nabladft",
     "SolvatedPeptides": "openqdc.datasets.potential.solvated_peptides",
     "WaterClusters": "openqdc.datasets.potential.waterclusters3_30",
     "TMQM": "openqdc.datasets.potential.tmqm",
-    "Dummy": "openqdc.datasets.potential.dummy",
     "PCQM_B3LYP": "openqdc.datasets.potential.pcqm",
     "PCQM_PM6": "openqdc.datasets.potential.pcqm",
     "RevMD17": "openqdc.datasets.potential.revmd17",
+    "MD22": "openqdc.datasets.potential.md22",
     "Transition1X": "openqdc.datasets.potential.transition1x",
     "MultixcQM9": "openqdc.datasets.potential.multixcqm9",
+    "MultixcQM9_V2": "openqdc.datasets.potential.multixcqm9",
+    # INTERACTION
+    "DES5M": "openqdc.datasets.interaction.des",
+    "DES370K": "openqdc.datasets.interaction.des",
+    "DESS66": "openqdc.datasets.interaction.des",
+    "DESS66x8": "openqdc.datasets.interaction.des",
+    "L7": "openqdc.datasets.interaction.l7",
+    "X40": "openqdc.datasets.interaction.x40",
+    "Metcalf": "openqdc.datasets.interaction.metcalf",
+    "Splinter": "openqdc.datasets.interaction.splinter",
+    # DEBUG
+    "Dummy": "openqdc.datasets.potential.dummy",
+    # ALL
     "AVAILABLE_DATASETS": "openqdc.datasets",
     "AVAILABLE_POTENTIAL_DATASETS": "openqdc.datasets.potential",
     "AVAILABLE_INTERACTION_DATASETS": "openqdc.datasets.interaction",
@@ -68,26 +90,34 @@ def __dir__():
 if TYPE_CHECKING or os.environ.get("OPENQDC_DISABLE_LAZY_LOADING", "0") == "1":
     # These types are imported lazily at runtime, but we need to tell type
     # checkers what they are.
-    from ._version import __version__  # noqa
-    from .datasets import AVAILABLE_DATASETS  # noqa
-    from .datasets.base import BaseDataset  # noqa
-    from .datasets.potential.ani import ANI1, ANI1CCX, ANI1X  # noqa
-    from .datasets.potential.comp6 import COMP6  # noqa
-    from .datasets.potential.dummy import Dummy  # noqa
-    from .datasets.potential.gdml import GDML  # noqa
-    from .datasets.potential.geom import GEOM  # noqa
-    from .datasets.potential.iso_17 import ISO17  # noqa
-    from .datasets.potential.molecule3d import Molecule3D  # noqa
-    from .datasets.potential.multixcqm9 import MultixcQM9  # noqa
-    from .datasets.potential.nabladft import NablaDFT  # noqa
-    from .datasets.potential.orbnet_denali import OrbnetDenali  # noqa
-    from .datasets.potential.pcqm import PCQM_B3LYP, PCQM_PM6  # noqa
-    from .datasets.potential.qm7x import QM7X  # noqa
-    from .datasets.potential.qmugs import QMugs  # noqa
-    from .datasets.potential.revmd17 import RevMD17  # noqa
-    from .datasets.potential.sn2_rxn import SN2RXN  # noqa
-    from .datasets.potential.solvated_peptides import SolvatedPeptides  # noqa
-    from .datasets.potential.spice import Spice, SpiceV2  # noqa
-    from .datasets.potential.tmqm import TMQM  # noqa
-    from .datasets.potential.transition1x import Transition1X  # noqa
-    from .datasets.potential.waterclusters3_30 import WaterClusters  # noqa
+    from ._version import __version__
+    from .datasets import AVAILABLE_DATASETS
+    from .datasets.base import BaseDataset
+
+    # INTERACTION
+    from .datasets.interaction.des import DES5M, DES370K, DESS66, DESS66x8
+    from .datasets.interaction.l7 import L7
+    from .datasets.interaction.metcalf import Metcalf
+    from .datasets.interaction.splinter import Splinter
+    from .datasets.interaction.x40 import X40
+    from .datasets.potential.ani import ANI1, ANI1CCX, ANI1CCX_V2, ANI1X
+    from .datasets.potential.comp6 import COMP6
+    from .datasets.potential.dummy import Dummy
+    from .datasets.potential.gdml import GDML
+    from .datasets.potential.geom import GEOM
+    from .datasets.potential.iso_17 import ISO17
+    from .datasets.potential.md22 import MD22
+    from .datasets.potential.molecule3d import Molecule3D
+    from .datasets.potential.multixcqm9 import MultixcQM9, MultixcQM9_V2
+    from .datasets.potential.nabladft import NablaDFT
+    from .datasets.potential.orbnet_denali import OrbnetDenali
+    from .datasets.potential.pcqm import PCQM_B3LYP, PCQM_PM6
+    from .datasets.potential.qm7x import QM7X, QM7X_V2
+    from .datasets.potential.qmugs import QMugs, QMugs_V2
+    from .datasets.potential.revmd17 import RevMD17
+    from .datasets.potential.sn2_rxn import SN2RXN
+    from .datasets.potential.solvated_peptides import SolvatedPeptides
+    from .datasets.potential.spice import Spice, SpiceV2, SpiceVL2
+    from .datasets.potential.tmqm import TMQM
+    from .datasets.potential.transition1x import Transition1X
+    from .datasets.potential.waterclusters3_30 import WaterClusters
diff --git a/openqdc/cli.py b/openqdc/cli.py
@@ -3,11 +3,15 @@
 import typer
 from loguru import logger
 from prettytable import PrettyTable
+from rich import print
 from typing_extensions import Annotated
 
-from openqdc import AVAILABLE_DATASETS, AVAILABLE_POTENTIAL_DATASETS
-from openqdc.raws.config_factory import DataConfigFactory
-from openqdc.raws.fetch import DataDownloader
+from openqdc.datasets import COMMON_MAP_POTENTIALS  # noqa
+from openqdc.datasets import (
+    AVAILABLE_DATASETS,
+    AVAILABLE_INTERACTION_DATASETS,
+    AVAILABLE_POTENTIAL_DATASETS,
+)
 
 app = typer.Typer(help="OpenQDC CLI")
 
@@ -20,10 +24,12 @@ def exist_dataset(dataset):
 
 
 def format_entry(empty_dataset):
-    if len(empty_dataset.__energy_methods__) > 10:
-        entry = ",".join(empty_dataset.__energy_methods__[:10]) + "..."
+    """Return a comma-separated preview of the dataset's energy methods.
+
+    Each entry of ``empty_dataset.__energy_methods__`` is converted with
+    ``str``. At most ``max_num_to_display`` entries are listed, and "..." is
+    appended when the dataset declares more than that.
+    """
+    energy_methods = [str(x) for x in empty_dataset.__energy_methods__]
+    max_num_to_display = 6
+    # Compare against the same limit used for slicing so the two cannot drift.
+    if len(energy_methods) > max_num_to_display:
+        entry = ",".join(energy_methods[:max_num_to_display]) + "..."
     else:
-        entry = ",".join(empty_dataset.__energy_methods__[:10])
+        entry = ",".join(energy_methods[:max_num_to_display])
     return entry
 
 
@@ -65,7 +71,7 @@ def datasets():
     table = PrettyTable(["Name", "Type of Energy", "Forces", "Level of theory"])
     for dataset in AVAILABLE_DATASETS:
         empty_dataset = AVAILABLE_DATASETS[dataset].no_init()
-        has_forces = False if not empty_dataset.__force_methods__ else True
+        has_forces = False if not any(empty_dataset.force_mask) else True
         en_type = "Potential" if dataset in AVAILABLE_POTENTIAL_DATASETS else "Interaction"
         table.add_row(
             [
@@ -80,22 +86,78 @@ def datasets():
 
 
 @app.command()
-def fetch(datasets: List[str]):
+def fetch(
+    datasets: List[str],
+    overwrite: Annotated[
+        bool,
+        typer.Option(
+            help="Whether to overwrite or force the re-download of the files.",
+        ),
+    ] = False,
+    cache_dir: Annotated[
+        Optional[str],
+        typer.Option(
+            help="Path to the cache. If not provided, the default cache directory (.cache/openqdc/) will be used.",
+        ),
+    ] = None,
+):
     """
     Download the raw datasets files from the main openQDC hub.
-    Special case: if the dataset is "all", all available datasets will be downloaded.
-
+    overwrite: bool = False,
+        If True, the files will be re-downloaded and overwritten.
+    cache_dir: Optional[str] = None,
+        Path to the cache. If not provided, the default cache directory will be used.
+    Special case: if the dataset is "all", "potential", "interaction".
+        all: all available datasets will be downloaded.
+        potential: all the potential datasets will be downloaded
+        interaction: all the interaction datasets will be downloaded
     Example:
         openqdc fetch Spice
     """
-    if datasets[0] == "all":
-        dataset_names = DataConfigFactory.available_datasets
+    # Only the first argument is inspected for the group keywords; compute its
+    # lower-cased form once instead of on every branch.
+    selector = datasets[0].lower()
+    if selector == "all":
+        dataset_names = AVAILABLE_DATASETS
+    elif selector == "potential":
+        dataset_names = AVAILABLE_POTENTIAL_DATASETS
+    elif selector == "interaction":
+        dataset_names = AVAILABLE_INTERACTION_DATASETS
     else:
         dataset_names = datasets
 
-    for dataset_name in dataset_names:
-        dd = DataDownloader()
-        dd.from_name(dataset_name)
+    # Names are lower-cased and stripped of underscores before the lookup.
+    normalized_names = [name.lower().replace("_", "") for name in dataset_names]
+    for dataset in normalized_names:
+        if exist_dataset(dataset):
+            try:
+                AVAILABLE_DATASETS[dataset].fetch(cache_dir, overwrite)
+            except Exception as e:
+                # Best effort: log the failure and carry on with the next dataset.
+                logger.error(f"Something unexpected happened while fetching {dataset}: {e!r}")
+
+
+@app.command()
+def preprocess(
+    datasets: List[str],
+    overwrite: Annotated[
+        bool,
+        typer.Option(
+            help="Whether to overwrite or force the re-download of the datasets.",
+        ),
+    ] = True,
+    upload: Annotated[
+        bool,
+        typer.Option(
+            help="Whether to try the upload to the remote storage.",
+        ),
+    ] = False,
+):
+    """
+    Preprocess a raw dataset (previously fetched) into a openqdc dataset and optionally push it to remote.
+    """
+    # Names are lower-cased and stripped of underscores before the lookup.
+    for dataset in [name.lower().replace("_", "") for name in datasets]:
+        if not exist_dataset(dataset):
+            continue
+        dataset_cls = AVAILABLE_DATASETS[dataset]
+        logger.info(f"Preprocessing {dataset_cls.__name__}")
+        try:
+            dataset_cls.no_init().preprocess(upload=upload, overwrite=overwrite)
+        except Exception as e:
+            # Log with a hint for the user, then propagate so the command stops.
+            logger.error(f"Error while preprocessing {dataset}. {e}. Did you fetch the dataset first?")
+            raise e
 
 
 if __name__ == "__main__":

diff --git a/openqdc/datasets/__init__.py b/openqdc/datasets/__init__.py
@@ -1,4 +1,28 @@
-from .interaction import AVAILABLE_INTERACTION_DATASETS  # noqa
-from .potential import AVAILABLE_POTENTIAL_DATASETS  # noqa
+from .interaction import *
+from .potential import *
 
 AVAILABLE_DATASETS = {**AVAILABLE_POTENTIAL_DATASETS, **AVAILABLE_INTERACTION_DATASETS}
+
+
+def _level_of_theory_overlap(dataset_collection):
+    """Map every energy method shared by several datasets to those datasets.
+
+    Args:
+        dataset_collection: mapping from dataset name to an object whose
+            ``no_init()`` result exposes an ``energy_methods`` iterable.
+
+    Returns:
+        A dict keyed by energy method. Each value is the list of dataset names
+        (lower-cased, underscores removed) that declare that method; methods
+        collected fewer than twice are left out.
+    """
+    methods_by_dataset = {
+        name.lower().replace("_", ""): dataset_cls.no_init().energy_methods
+        for name, dataset_cls in dataset_collection.items()
+    }
+
+    # Invert the mapping: method -> datasets declaring it, in insertion order.
+    datasets_by_method = {}
+    for dataset_key, methods in methods_by_dataset.items():
+        for method in methods:
+            datasets_by_method.setdefault(method, []).append(dataset_key)
+
+    return {method: keys for method, keys in datasets_by_method.items() if len(keys) > 1}
+
+
+COMMON_MAP_POTENTIALS = _level_of_theory_overlap(AVAILABLE_POTENTIAL_DATASETS)
+COMMON_MAP_INTERACTIONS = _level_of_theory_overlap(AVAILABLE_INTERACTION_DATASETS)