choderalab · chrisiacovella · Sep 20, 2023 · Sep 21, 2023 · Sep 21, 2023 · Sep 21, 2023
diff --git a/devtools/conda-envs/espaloma.yaml b/devtools/conda-envs/espaloma.yaml
@@ -20,7 +20,7 @@ dependencies:
   - tqdm
   - pydantic <2  # We need our deps to fix this
   - dgl =1.1.2
-  - qcportal =0.15.8
+  - qcportal>=0.5
   # Testing
   - pytest
   - pytest-cov
@@ -30,6 +30,5 @@ dependencies:
   - nose
   - nose-timer
   - coverage
-  - qcportal>=0.15.0
   - sphinx
   - sphinx_rtd_theme
diff --git a/espaloma/data/qcarchive_utils.py b/espaloma/data/qcarchive_utils.py
@@ -5,7 +5,7 @@
 from typing import Tuple
 
 import numpy as np
-import qcportal as ptl
+import qcportal
 import torch
 from openmm import unit
 from openmm.unit import Quantity
@@ -21,35 +21,69 @@
 # =============================================================================
 # UTILITY FUNCTIONS
 # =============================================================================
-def get_client():
-    return ptl.FractalClient()
+def get_client(url: str = "api.qcarchive.molssi.org") -> qcportal.client.PortalClient:
+    """
+    Returns a instance of the qcportal client.
+
+    Parameters
+    ----------
+    url: str, default="api.qcarchive.molssi.org"
+        qcportal instance to connect
+
+    Returns
+    -------
+    qcportal.client.PortalClient
+        qcportal client instance.
+    """
+    # Note, this may need to be modified to include username/password for non-public servers
+    return qcportal.PortalClient(url)
 
 
 def get_collection(
-    client,
-    collection_type="OptimizationDataset",
-    name="OpenFF Full Optimization Benchmark 1",
+        client,
+        collection_type="optimization",
+        name="OpenFF Full Optimization Benchmark 1",
 ):
-    collection = client.get_collection(
-        collection_type,
-        name,
+    """
+    Connects to a specific dataset on qcportal
+
+    Parameters
+    ----------
+    client: qcportal.client, required
+        The qcportal client instance
+    collection_type: str, default="optimization"
+        The type of qcarchive collection, options are
+        "torsiondrive", "optimization", "gridoptimization", "reaction", "singlepoint" "manybody"
+    name: str, default="OpenFF Full Optimization Benchmark 1"
+        Name of the dataset
+
+    Returns
+    -------
+    (qcportal dataset, list(str))
+        Tuple with an instance of qcportal dataset and list of record names
+
+    """
+    collection = client.get_dataset(
+        dataset_type=collection_type,
+        dataset_name=name,
     )
 
-    record_names = list(collection.data.records)
+    record_names = collection.entry_names
 
     return collection, record_names
 
 
-def get_graph(collection, record_name):
-    # get record and trajectory
-    record = collection.get_record(record_name, specification="default")
-    entry = collection.get_entry(record_name)
+def process_record(record, entry):
+    """
+    Processes a given record/entry pair from a dataset and returns the graph
+    """
+
     from openff.toolkit.topology import Molecule
 
-    mol = Molecule.from_qcschema(entry)
+    mol = Molecule.from_qcschema(entry.dict())
 
     try:
-        trajectory = record.get_trajectory()
+        trajectory = record.trajectory
     except:
         return None
 
@@ -62,7 +96,7 @@ def get_graph(collection, record_name):
     g.nodes["g"].data["u_ref"] = torch.tensor(
         [
             Quantity(
-                snapshot.properties.scf_total_energy,
+                snapshot.properties["scf_total_energy"],
                 esp.units.HARTREE_PER_PARTICLE,
             ).value_in_unit(esp.units.ENERGY_UNIT)
             for snapshot in trajectory
@@ -74,7 +108,7 @@ def get_graph(collection, record_name):
         np.stack(
             [
                 Quantity(
-                    snapshot.get_molecule().geometry,
+                    snapshot.molecule.geometry,
                     unit.bohr,
                 ).value_in_unit(esp.units.DISTANCE_UNIT)
                 for snapshot in trajectory
@@ -89,7 +123,7 @@ def get_graph(collection, record_name):
         [
             torch.tensor(
                 Quantity(
-                    snapshot.dict()["return_result"],
+                    np.array(snapshot.properties["return_result"]).reshape((-1, 3)),
                     esp.units.HARTREE_PER_PARTICLE / unit.bohr,
                 ).value_in_unit(esp.units.FORCE_UNIT),
                 dtype=torch.get_default_dtype(),
@@ -102,21 +136,83 @@ def get_graph(collection, record_name):
     return g
 
 
-def fetch_td_record(record: ptl.models.torsiondrive.TorsionDriveRecord):
-    final_molecules = record.get_final_molecules()
-    final_results = record.get_final_results()
+def get_graph(collection, record_name, spec_name="default"):
+    """
+    Processes the qcportal data for a given record name.
+
+    Parameters
+    ----------
+    collection, qcportal dataset, required
+        The instance of the qcportal dataset
+    record_name, str, required
+        The name of a give record
+    spec_name, str, default="default"
+        Retrieve data for a given qcportal specification.
+    Returns
+    -------
+        Graph
+    """
+    # get record and trajectory
+    record = collection.get_record(record_name, specification_name=spec_name)
+    entry = collection.get_entry(record_name)
+
+    g = process_record(record, entry)
+
+    return g
+
 
-    angle_keys = list(final_molecules.keys())
+def get_graphs(collection, record_names, spec_name="default"):
+    """
+    Processes the qcportal data for a given set of record names.
+    This uses the qcportal iteration functions which are faster than processing
+    records one at a time
+
+    Parameters
+    ----------
+    collection, qcportal dataset, required
+        The instance of the qcportal dataset
+    record_name, str, required
+        The name of a give record
+    spec_name, str, default="default"
+        Retrieve data for a given qcportal specification.
+    Returns
+    -------
+    list(graph)
+        Returns a list of the corresponding graph for each record name
+    """
+    g_list = []
+    for record, entry in zip(
+            collection.iterate_records(record_names, specification_names=[spec_name]),
+            collection.iterate_entries(record_names),
+    ):
+        g = process_record(record, entry)
+        g_list.append(g)
+
+    return g_list
+
+
+def fetch_td_record(record: qcportal.torsiondrive.record_models.TorsiondriveRecord):
+    molecule_optimization = record.optimizations
+
+    angle_keys = list(molecule_optimization.keys())
 
     xyzs = []
     energies = []
     gradients = []
 
     for angle in angle_keys:
-        result = final_results[angle]
-        mol = final_molecules[angle]
+        # NOTE: this is calling the first index of the optimization array
+        # this gives the same value as the prior implementation, but I wonder if it
+        # should be molecule_optimization[angle][-1] in both cases
+        mol = molecule_optimization[angle][0].final_molecule
+        result = molecule_optimization[angle][0].trajectory[-1].properties
+
+        """Note: force = - gradient"""
 
-        e, g = get_energy_and_gradient(result)
+        # TODO: attach units here? or later?
+
+        e = result["current energy"]
+        g = np.array(result["current gradient"]).reshape(-1, 3)
 
         xyzs.append(mol.geometry)
         energies.append(e)
@@ -146,22 +242,6 @@ def fetch_td_record(record: ptl.models.torsiondrive.TorsionDriveRecord):
     return flat_angles, xyz_in_order, energies_in_order, gradients_in_order
 
 
-def get_energy_and_gradient(
-    snapshot: ptl.models.records.ResultRecord,
-) -> Tuple[float, np.ndarray]:
-    """Note: force = - gradient"""
-
-    # TODO: attach units here? or later?
-
-    d = snapshot.dict()
-    qcvars = d["extras"]["qcvars"]
-    energy = qcvars["CURRENT ENERGY"]
-    flat_gradient = np.array(qcvars["CURRENT GRADIENT"])
-    num_atoms = len(flat_gradient) // 3
-    gradient = flat_gradient.reshape((num_atoms, 3))
-    return energy, gradient
-
-
 MolWithTargets = namedtuple(
     "MolWithTargets", ["offmol", "xyz", "energies", "gradients"]
 )
@@ -184,9 +264,9 @@ def get_smiles(x):
         g = esp.Graph(mol_ref)
 
         u_ref = np.concatenate(group["energies"].values)
-        u_ref_prime = np.concatenate(
-            group["gradients"].values, axis=0
-        ).transpose(1, 0, 2)
+        u_ref_prime = np.concatenate(group["gradients"].values, axis=0).transpose(
+            1, 0, 2
+        )
         xyz = np.concatenate(group["xyz"].values, axis=0).transpose(1, 0, 2)
 
         assert u_ref_prime.shape[0] == xyz.shape[0] == mol_ref.n_atoms
@@ -229,7 +309,7 @@ def breakdown_along_time_axis(g, batch_size=32):
 
     shuffle(idxs)
     chunks = [
-        idxs[_idx * batch_size : (_idx + 1) * batch_size]
+        idxs[_idx * batch_size: (_idx + 1) * batch_size]
         for _idx in range(n_snapshots // batch_size)
     ]
 
@@ -259,10 +339,7 @@ def make_batch_size_consistent(ds, batch_size=32):
     return esp.data.dataset.GraphDataset(
         list(
             itertools.chain.from_iterable(
-                [
-                    breakdown_along_time_axis(g, batch_size=batch_size)
-                    for g in ds
-                ]
+                [breakdown_along_time_axis(g, batch_size=batch_size) for g in ds]
             )
         )
     )

diff --git a/espaloma/data/tests/test_qcarchive.py b/espaloma/data/tests/test_qcarchive.py
@@ -12,3 +12,67 @@ def test_get_graph():
     collection, record_names = qcarchive_utils.get_collection(client)
     record_name = record_names[0]
     graph = qcarchive_utils.get_graph(collection, record_name)
+
+
+def test_get_torsiondrive():
+    from espaloma.data import qcarchive_utils
+    import numpy as np
+
+    record_name = "[h]c1c(c(c(c([c:1]1[n:2]([c:3](=[o:4])c(=c([h])[h])[h])c([h])([h])[h])[h])[h])n(=o)=o)[h]"
+
+    # example dataset
+    name = "OpenFF Amide Torsion Set v1.0"
+    collection_type = "torsiondrive"
+
+    collection, record_names = qcarchive_utils.get_collection(
+        qcarchive_utils.get_client(), collection_type, name
+    )
+    record_info = collection.get_record(record_name, specification_name="default")
+
+    (
+        flat_angles,
+        xyz_in_order,
+        energies_in_order,
+        gradients_in_order,
+    ) = qcarchive_utils.fetch_td_record(record_info)
+
+    assert flat_angles.shape == (24,)
+    assert energies_in_order.shape == (24,)
+    assert gradients_in_order.shape == (24, 25, 3)
+    assert xyz_in_order.shape == (24, 25, 3)
+
+    assert np.isclose(energies_in_order[0], -722.2850260791969)
+    assert np.all(
+        flat_angles
+        == np.array(
+            [
+                -165,
+                -150,
+                -135,
+                -120,
+                -105,
+                -90,
+                -75,
+                -60,
+                -45,
+                -30,
+                -15,
+                0,
+                15,
+                30,
+                45,
+                60,
+                75,
+                90,
+                105,
+                120,
+                135,
+                150,
+                165,
+                180,
+            ]
+        )
+    )
+    assert np.all(
+        xyz_in_order[0][0] == np.array([-0.66407807, -8.59922225, -0.02685972])
+    )