From 88c66cbc3db10af934d114b5b00d8b93d50cfbfc Mon Sep 17 00:00:00 2001 From: chrisiacovella Date: Wed, 20 Sep 2023 14:58:01 -0700 Subject: [PATCH 01/23] updated qcharcive code to fetch OpenFF Full Optimization Benchmark 1 to be compatible with qcportal >=0.5 --- devtools/conda-envs/espaloma.yaml | 2 +- espaloma/data/qcarchive_utils.py | 41 ++++++++++++++----------------- 2 files changed, 20 insertions(+), 23 deletions(-) diff --git a/devtools/conda-envs/espaloma.yaml b/devtools/conda-envs/espaloma.yaml index 4865f11c..642761e2 100644 --- a/devtools/conda-envs/espaloma.yaml +++ b/devtools/conda-envs/espaloma.yaml @@ -29,6 +29,6 @@ dependencies: - nose - nose-timer - coverage - - qcportal>=0.15.0 + - qcportal>=0.50 - sphinx - sphinx_rtd_theme diff --git a/espaloma/data/qcarchive_utils.py b/espaloma/data/qcarchive_utils.py index 14cd76ae..fff59c7d 100644 --- a/espaloma/data/qcarchive_utils.py +++ b/espaloma/data/qcarchive_utils.py @@ -22,34 +22,34 @@ # UTILITY FUNCTIONS # ============================================================================= def get_client(): - return ptl.FractalClient() + return ptl.PortalClient("api.qcarchive.molssi.org") def get_collection( - client, - collection_type="OptimizationDataset", - name="OpenFF Full Optimization Benchmark 1", + client, + collection_type="OptimizationDataset", + name="OpenFF Full Optimization Benchmark 1", ): - collection = client.get_collection( - collection_type, - name, + collection = client.get_dataset( + dataset_type=collection_type, + dataset_name=name, ) - record_names = list(collection.data.records) + record_names = collection.entry_names return collection, record_names def get_graph(collection, record_name): # get record and trajectory - record = collection.get_record(record_name, specification="default") + record = collection.get_record(record_name, specification_name="default") entry = collection.get_entry(record_name) from openff.toolkit.topology import Molecule mol = Molecule.from_qcschema(entry) try: - trajectory = record.get_trajectory() + trajectory = record.trajectory except: return None @@ -62,7 +62,7 @@ def get_graph(collection, record_name): g.nodes["g"].data["u_ref"] = torch.tensor( [ Quantity( - snapshot.properties.scf_total_energy, + snapshot.properties["scf_total_energy"], esp.units.HARTREE_PER_PARTICLE, ).value_in_unit(esp.units.ENERGY_UNIT) for snapshot in trajectory @@ -74,7 +74,7 @@ def get_graph(collection, record_name): np.stack( [ Quantity( - snapshot.get_molecule().geometry, + snapshot.molecule.geometry, unit.bohr, ).value_in_unit(esp.units.DISTANCE_UNIT) for snapshot in trajectory @@ -89,7 +89,7 @@ def get_graph(collection, record_name): [ torch.tensor( Quantity( - snapshot.dict()["return_result"], + np.array(snapshot.properties["return_result"]).reshape((-1, 3)), esp.units.HARTREE_PER_PARTICLE / unit.bohr, ).value_in_unit(esp.units.FORCE_UNIT), dtype=torch.get_default_dtype(), @@ -147,7 +147,7 @@ def fetch_td_record(record: ptl.models.torsiondrive.TorsionDriveRecord): def get_energy_and_gradient( - snapshot: ptl.models.records.ResultRecord, + snapshot: ptl.models.records.ResultRecord, ) -> Tuple[float, np.ndarray]: """Note: force = - gradient""" @@ -184,9 +184,9 @@ def get_smiles(x): g = esp.Graph(mol_ref) u_ref = np.concatenate(group["energies"].values) - u_ref_prime = np.concatenate( - group["gradients"].values, axis=0 - ).transpose(1, 0, 2) + u_ref_prime = np.concatenate(group["gradients"].values, axis=0).transpose( + 1, 0, 2 + ) xyz = np.concatenate(group["xyz"].values, axis=0).transpose(1, 0, 2) assert u_ref_prime.shape[0] == xyz.shape[0] == mol_ref.n_atoms @@ -229,7 +229,7 @@ def breakdown_along_time_axis(g, batch_size=32): shuffle(idxs) chunks = [ - idxs[_idx * batch_size : (_idx + 1) * batch_size] + idxs[_idx * batch_size: (_idx + 1) * batch_size] for _idx in range(n_snapshots // batch_size) ] @@ -259,10 +259,7 @@ def make_batch_size_consistent(ds, batch_size=32): return esp.data.dataset.GraphDataset( list( itertools.chain.from_iterable( - [ - breakdown_along_time_axis(g, batch_size=batch_size) - for g in ds - ] + [breakdown_along_time_axis(g, batch_size=batch_size) for g in ds] ) ) ) From 6f8198e2a0e9d3bbf9316da6443aadda9dbc06ae Mon Sep 17 00:00:00 2001 From: chrisiacovella Date: Wed, 20 Sep 2023 17:53:14 -0700 Subject: [PATCH 02/23] Updated torsiondrive parsing. I'm not sure this has sufficient testing. --- espaloma/data/qcarchive_utils.py | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/espaloma/data/qcarchive_utils.py b/espaloma/data/qcarchive_utils.py index fff59c7d..d105016c 100644 --- a/espaloma/data/qcarchive_utils.py +++ b/espaloma/data/qcarchive_utils.py @@ -103,18 +103,20 @@ def get_graph(collection, record_name): def fetch_td_record(record: ptl.models.torsiondrive.TorsionDriveRecord): - final_molecules = record.get_final_molecules() - final_results = record.get_final_results() + molecule_optimization = record_info.optimizations - angle_keys = list(final_molecules.keys()) + angle_keys = list(record_info.optimizations.keys()) xyzs = [] energies = [] gradients = [] for angle in angle_keys: - result = final_results[angle] - mol = final_molecules[angle] + # NOTE: this is calling the first index of the optimization array + # this gives the same value as the prior implementation, but I wonder if it + # should be molecule_optimization[angle][-1] in both cases + mol = molecule_optimization[angle][0].final_molecule + result = molecule_optimization[angle[0]][0].trajectory[-1].properties e, g = get_energy_and_gradient(result) @@ -153,12 +155,9 @@ def get_energy_and_gradient( # TODO: attach units here? or later? - d = snapshot.dict() - qcvars = d["extras"]["qcvars"] - energy = qcvars["CURRENT ENERGY"] - flat_gradient = np.array(qcvars["CURRENT GRADIENT"]) - num_atoms = len(flat_gradient) // 3 - gradient = flat_gradient.reshape((num_atoms, 3)) + energy = snapshot["current energy"] + gradient = np.array(snapshot["current gradient"]).reshape(-1, 3) + return energy, gradient From 3e5a58fab4a6272dd2e3824adb83cc3f68827bb1 Mon Sep 17 00:00:00 2001 From: chrisiacovella Date: Wed, 20 Sep 2023 20:27:28 -0700 Subject: [PATCH 03/23] Adding in some testing of the torsion function --- espaloma/data/qcarchive_utils.py | 28 +++++------- espaloma/data/tests/test_qcarchive.py | 63 +++++++++++++++++++++++++++ 2 files changed, 73 insertions(+), 18 deletions(-) diff --git a/espaloma/data/qcarchive_utils.py b/espaloma/data/qcarchive_utils.py index d105016c..e310d63b 100644 --- a/espaloma/data/qcarchive_utils.py +++ b/espaloma/data/qcarchive_utils.py @@ -102,10 +102,10 @@ def get_graph(collection, record_name): return g -def fetch_td_record(record: ptl.models.torsiondrive.TorsionDriveRecord): - molecule_optimization = record_info.optimizations +def fetch_td_record(record: ptl.torsiondrive.record_models.TorsiondriveRecord): + molecule_optimization = record.optimizations - angle_keys = list(record_info.optimizations.keys()) + angle_keys = list(molecule_optimization.keys()) xyzs = [] energies = [] @@ -116,9 +116,14 @@ def fetch_td_record(record: ptl.models.torsiondrive.TorsionDriveRecord): # this gives the same value as the prior implementation, but I wonder if it # should be molecule_optimization[angle][-1] in both cases mol = molecule_optimization[angle][0].final_molecule - result = molecule_optimization[angle[0]][0].trajectory[-1].properties + result = molecule_optimization[angle][0].trajectory[-1].properties - e, g = get_energy_and_gradient(result) + """Note: force = - gradient""" + + # TODO: attach units here? or later? + + e = result["current energy"] + g = np.array(result["current gradient"]).reshape(-1, 3) xyzs.append(mol.geometry) energies.append(e) @@ -148,19 +153,6 @@ def fetch_td_record(record: ptl.models.torsiondrive.TorsionDriveRecord): return flat_angles, xyz_in_order, energies_in_order, gradients_in_order -def get_energy_and_gradient( - snapshot: ptl.models.records.ResultRecord, -) -> Tuple[float, np.ndarray]: - """Note: force = - gradient""" - - # TODO: attach units here? or later? - - energy = snapshot["current energy"] - gradient = np.array(snapshot["current gradient"]).reshape(-1, 3) - - return energy, gradient - - MolWithTargets = namedtuple( "MolWithTargets", ["offmol", "xyz", "energies", "gradients"] ) diff --git a/espaloma/data/tests/test_qcarchive.py b/espaloma/data/tests/test_qcarchive.py index 9a2b8b81..8902ca36 100644 --- a/espaloma/data/tests/test_qcarchive.py +++ b/espaloma/data/tests/test_qcarchive.py @@ -12,3 +12,66 @@ def test_get_graph(): collection, record_names = qcarchive_utils.get_collection(client) record_name = record_names[0] graph = qcarchive_utils.get_graph(collection, record_name) + + +def test_get_torsiondrive(): + import espaloma.data.qcarchive_utils + import numpy as np + + record_name = "[h]c1c(c(c(c([c:1]1[n:2]([c:3](=[o:4])c(=c([h])[h])[h])c([h])([h])[h])[h])[h])n(=o)=o)[h]" + + name = "OpenFF Amide Torsion Set v1.0" + collection_type = "torsiondrive" + + collection, record_names = qcarchive_utils.get_collection( + get_client(), collection_type, name + ) + record_info = collection.get_record(record_names, specification_name="default") + + ( + flat_angles, + xyz_in_order, + energies_in_order, + gradients_in_order, + ) = qcarchive_utils.fetch_td_record(record_info) + + assert flat_angles.shape == (24,) + assert energies_in_order.shape == (24,) + assert gradients_in_order.shape == (24, 25, 3) + assert xyz_in_order.shape == (24, 25, 3) + + assert np.isclose(energies_in_order[0], -722.2850260791969) + assert np.all( + flat_angles + == np.array( + [ + -165, + -150, + -135, + -120, + -105, + -90, + -75, + -60, + -45, + -30, + -15, + 0, + 15, + 30, + 45, + 60, + 75, + 90, + 105, + 120, + 135, + 150, + 165, + 180, + ] + ) + ) + assert np.all( + xyz_in_order[0][0] == np.array([-0.66407807, -8.59922225, -0.02685972]) + ) From 94add0cc8a78a425cdc706ab8b4a3b3a06570513 Mon Sep 17 00:00:00 2001 From: chrisiacovella Date: Wed, 20 Sep 2023 20:31:09 -0700 Subject: [PATCH 04/23] Adding in some testing of the torsion function. --- espaloma/data/tests/test_qcarchive.py | 1 + 1 file changed, 1 insertion(+) diff --git a/espaloma/data/tests/test_qcarchive.py b/espaloma/data/tests/test_qcarchive.py index 8902ca36..3887abf9 100644 --- a/espaloma/data/tests/test_qcarchive.py +++ b/espaloma/data/tests/test_qcarchive.py @@ -20,6 +20,7 @@ def test_get_torsiondrive(): record_name = "[h]c1c(c(c(c([c:1]1[n:2]([c:3](=[o:4])c(=c([h])[h])[h])c([h])([h])[h])[h])[h])n(=o)=o)[h]" + # example dataset name = "OpenFF Amide Torsion Set v1.0" collection_type = "torsiondrive" From 908c35f164a48f03effea1419f50f1e0c44e671d Mon Sep 17 00:00:00 2001 From: chrisiacovella Date: Wed, 20 Sep 2023 22:00:25 -0700 Subject: [PATCH 05/23] Updated collection type name. --- espaloma/data/qcarchive_utils.py | 2 +- espaloma/data/tests/test_qcarchive.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/espaloma/data/qcarchive_utils.py b/espaloma/data/qcarchive_utils.py index e310d63b..72c9a2fb 100644 --- a/espaloma/data/qcarchive_utils.py +++ b/espaloma/data/qcarchive_utils.py @@ -27,7 +27,7 @@ def get_client(): def get_collection( client, - collection_type="OptimizationDataset", + collection_type="optimization", name="OpenFF Full Optimization Benchmark 1", ): collection = client.get_dataset( diff --git a/espaloma/data/tests/test_qcarchive.py b/espaloma/data/tests/test_qcarchive.py index 3887abf9..0e7805ee 100644 --- a/espaloma/data/tests/test_qcarchive.py +++ b/espaloma/data/tests/test_qcarchive.py @@ -15,7 +15,7 @@ def test_get_graph(): def test_get_torsiondrive(): - import espaloma.data.qcarchive_utils + from espaloma.data import qcarchive_utils import numpy as np record_name = "[h]c1c(c(c(c([c:1]1[n:2]([c:3](=[o:4])c(=c([h])[h])[h])c([h])([h])[h])[h])[h])n(=o)=o)[h]" From 2afcd1dd028b6a3b1ed611e2c5741d5214cbcb95 Mon Sep 17 00:00:00 2001 From: chrisiacovella Date: Wed, 20 Sep 2023 23:00:58 -0700 Subject: [PATCH 06/23] fixed import issue in test --- espaloma/data/tests/test_qcarchive.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/espaloma/data/tests/test_qcarchive.py b/espaloma/data/tests/test_qcarchive.py index 0e7805ee..f86736aa 100644 --- a/espaloma/data/tests/test_qcarchive.py +++ b/espaloma/data/tests/test_qcarchive.py @@ -25,7 +25,7 @@ def test_get_torsiondrive(): collection_type = "torsiondrive" collection, record_names = qcarchive_utils.get_collection( - get_client(), collection_type, name + qcarchive_utils.get_client(), collection_type, name ) record_info = collection.get_record(record_names, specification_name="default") From aa41891e255898a25ed989e6eff688b78e5234b9 Mon Sep 17 00:00:00 2001 From: chrisiacovella Date: Thu, 21 Sep 2023 08:15:15 -0700 Subject: [PATCH 07/23] fixed parsing of the schema. --- espaloma/data/qcarchive_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/espaloma/data/qcarchive_utils.py b/espaloma/data/qcarchive_utils.py index 72c9a2fb..82cde67c 100644 --- a/espaloma/data/qcarchive_utils.py +++ b/espaloma/data/qcarchive_utils.py @@ -46,7 +46,7 @@ def get_graph(collection, record_name): entry = collection.get_entry(record_name) from openff.toolkit.topology import Molecule - mol = Molecule.from_qcschema(entry) + mol = Molecule.from_qcschema(entry.dict()) try: trajectory = record.trajectory From dc7b10cf619649ef2312b67d941426b72173e216 Mon Sep 17 00:00:00 2001 From: chrisiacovella Date: Thu, 21 Sep 2023 09:08:05 -0700 Subject: [PATCH 08/23] fixing a typo that was causing failure of torsion test. --- espaloma/data/tests/test_qcarchive.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/espaloma/data/tests/test_qcarchive.py b/espaloma/data/tests/test_qcarchive.py index f86736aa..13c07c0a 100644 --- a/espaloma/data/tests/test_qcarchive.py +++ b/espaloma/data/tests/test_qcarchive.py @@ -27,7 +27,7 @@ def test_get_torsiondrive(): collection, record_names = qcarchive_utils.get_collection( qcarchive_utils.get_client(), collection_type, name ) - record_info = collection.get_record(record_names, specification_name="default") + record_info = collection.get_record(record_name, specification_name="default") ( flat_angles, From 9cb3189979d13679569dd5f1f6287782141fe33d Mon Sep 17 00:00:00 2001 From: chrisiacovella Date: Thu, 21 Sep 2023 10:01:18 -0700 Subject: [PATCH 09/23] Slight change to code to add in a function that uses the iterate_records and iterates_entries functions --- espaloma/data/qcarchive_utils.py | 47 ++++++++++++++++++++++++++------ 1 file changed, 39 insertions(+), 8 deletions(-) diff --git a/espaloma/data/qcarchive_utils.py b/espaloma/data/qcarchive_utils.py index 82cde67c..6c57043b 100644 --- a/espaloma/data/qcarchive_utils.py +++ b/espaloma/data/qcarchive_utils.py @@ -26,9 +26,9 @@ def get_client(): def get_collection( - client, - collection_type="optimization", - name="OpenFF Full Optimization Benchmark 1", + client, + collection_type="optimization", + name="OpenFF Full Optimization Benchmark 1", ): collection = client.get_dataset( dataset_type=collection_type, @@ -40,10 +40,7 @@ def get_collection( return collection, record_names -def get_graph(collection, record_name): - # get record and trajectory - record = collection.get_record(record_name, specification_name="default") - entry = collection.get_entry(record_name) +def process_record(record, entry): from openff.toolkit.topology import Molecule mol = Molecule.from_qcschema(entry.dict()) @@ -102,6 +99,40 @@ def get_graph(collection, record_name): return g +def get_graph(collection, record_name, spec_name="default")): + # get record and trajectory + record = collection.get_record(record_name, specification_name=spec_name) + entry = collection.get_entry(record_name) + + g = process_record(record, entry) + + return g + + +def get_graphs(collection, record_names, spec_name="default"): + """ + + Parameters + ---------- + collection : + record_names + + Returns + ------- + + """ + g_list = [] + for record, entry in zip( + + collection.iterate_records(record_names, specification_names=[spec_name]), + collection.iterate_entries(record_names), + ): + g = process_record(record, entry) + g_list.append(g) + + return g_list + + def fetch_td_record(record: ptl.torsiondrive.record_models.TorsiondriveRecord): molecule_optimization = record.optimizations @@ -220,7 +251,7 @@ def breakdown_along_time_axis(g, batch_size=32): shuffle(idxs) chunks = [ - idxs[_idx * batch_size: (_idx + 1) * batch_size] + idxs[_idx * batch_size : (_idx + 1) * batch_size] for _idx in range(n_snapshots // batch_size) ] From cb28a488e8f01661b5a68c7c5aae08b43876e30b Mon Sep 17 00:00:00 2001 From: chrisiacovella Date: Thu, 21 Sep 2023 10:03:51 -0700 Subject: [PATCH 10/23] merged with updated dgl update; removing qcportal pinning to the old version. --- devtools/conda-envs/espaloma.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/devtools/conda-envs/espaloma.yaml b/devtools/conda-envs/espaloma.yaml index 686497da..2bfe5287 100644 --- a/devtools/conda-envs/espaloma.yaml +++ b/devtools/conda-envs/espaloma.yaml @@ -20,7 +20,7 @@ dependencies: - tqdm - pydantic <2 # We need our deps to fix this - dgl =1.1.2 - - qcportal =0.15.8 + - qcportal>=0.5 # Testing - pytest - pytest-cov From f48ae7cceadb08f58c88e59d7a54bb9540c8cb30 Mon Sep 17 00:00:00 2001 From: chrisiacovella Date: Thu, 21 Sep 2023 10:22:35 -0700 Subject: [PATCH 11/23] Made spec_name be a variable --- espaloma/data/qcarchive_utils.py | 26 +++++++------------------- 1 file changed, 7 insertions(+), 19 deletions(-) diff --git a/espaloma/data/qcarchive_utils.py b/espaloma/data/qcarchive_utils.py index 6c57043b..b8b3d0f8 100644 --- a/espaloma/data/qcarchive_utils.py +++ b/espaloma/data/qcarchive_utils.py @@ -26,9 +26,9 @@ def get_client(): def get_collection( - client, - collection_type="optimization", - name="OpenFF Full Optimization Benchmark 1", + client, + collection_type="optimization", + name="OpenFF Full Optimization Benchmark 1", ): collection = client.get_dataset( dataset_type=collection_type, @@ -99,7 +99,7 @@ def process_record(record, entry): return g -def get_graph(collection, record_name, spec_name="default")): +def get_graph(collection, record_name, spec_name="default"): # get record and trajectory record = collection.get_record(record_name, specification_name=spec_name) entry = collection.get_entry(record_name) @@ -110,22 +110,10 @@ def get_graph(collection, record_name, spec_name="default")): def get_graphs(collection, record_names, spec_name="default"): - """ - - Parameters - ---------- - collection : - record_names - - Returns - ------- - - """ g_list = [] for record, entry in zip( - - collection.iterate_records(record_names, specification_names=[spec_name]), - collection.iterate_entries(record_names), + collection.iterate_records(record_names, specification_names=[spec_name]), + collection.iterate_entries(record_names), ): g = process_record(record, entry) g_list.append(g) @@ -251,7 +239,7 @@ def breakdown_along_time_axis(g, batch_size=32): shuffle(idxs) chunks = [ - idxs[_idx * batch_size : (_idx + 1) * batch_size] + idxs[_idx * batch_size: (_idx + 1) * batch_size] for _idx in range(n_snapshots // batch_size) ] From 5f638fd4ea8f7b126738be8ea5782f4f6c9032f4 Mon Sep 17 00:00:00 2001 From: chrisiacovella Date: Thu, 21 Sep 2023 11:29:42 -0700 Subject: [PATCH 12/23] Adding in some docstrings --- espaloma/data/qcarchive_utils.py | 78 ++++++++++++++++++++++++++++++-- 1 file changed, 74 insertions(+), 4 deletions(-) diff --git a/espaloma/data/qcarchive_utils.py b/espaloma/data/qcarchive_utils.py index b8b3d0f8..7ed98101 100644 --- a/espaloma/data/qcarchive_utils.py +++ b/espaloma/data/qcarchive_utils.py @@ -5,7 +5,7 @@ from typing import Tuple import numpy as np -import qcportal as ptl +import qcportal import torch from openmm import unit from openmm.unit import Quantity @@ -21,8 +21,22 @@ # ============================================================================= # UTILITY FUNCTIONS # ============================================================================= -def get_client(): - return ptl.PortalClient("api.qcarchive.molssi.org") +def get_client(url: str = "api.qcarchive.molssi.org") -> qcportal.client.PortalClient: + """ + Returns a instance of the qcportal client. + + Parameters + ---------- + url: str, default="api.qcarchive.molssi.org" + qcportal instance to connect + + Returns + ------- + qcportal.client.PortalClient + qcportal client instance. + """ + # Note, this may need to be modified to include username/password for non-public servers + return qcportal.PortalClient(url) def get_collection( @@ -30,6 +44,25 @@ def get_collection( collection_type="optimization", name="OpenFF Full Optimization Benchmark 1", ): + """ + Connects to a specific dataset on qcportal + + Parameters + ---------- + client: qcportal.client, required + The qcportal client instance + collection_type: str, default="optimization" + The type of qcarchive collection, options are + "torsiondrive", "optimization", "gridoptimization", "reaction", "singlepoint" "manybody" + name: str, default="OpenFF Full Optimization Benchmark 1" + Name of the dataset + + Returns + ------- + (qcportal dataset, list(str)) + Tuple with an instance of qcportal dataset and list of record names + + """ collection = client.get_dataset( dataset_type=collection_type, dataset_name=name, @@ -41,6 +74,10 @@ def get_collection( def process_record(record, entry): + """ + Processes a given record/entry pair from a dataset and returns the graph + """ + from openff.toolkit.topology import Molecule mol = Molecule.from_qcschema(entry.dict()) @@ -100,6 +137,21 @@ def process_record(record, entry): def get_graph(collection, record_name, spec_name="default"): + """ + Processes the qcportal data for a given record name. + + Parameters + ---------- + collection, qcportal dataset, required + The instance of the qcportal dataset + record_name, str, required + The name of a give record + spec_name, str, default="default" + Retrieve data for a given qcportal specification. + Returns + ------- + Graph + """ # get record and trajectory record = collection.get_record(record_name, specification_name=spec_name) entry = collection.get_entry(record_name) @@ -110,6 +162,24 @@ def get_graph(collection, record_name, spec_name="default"): def get_graphs(collection, record_names, spec_name="default"): + """ + Processes the qcportal data for a given set of record names. + This uses the qcportal iteration functions which are faster than processing + records one at a time + + Parameters + ---------- + collection, qcportal dataset, required + The instance of the qcportal dataset + record_name, str, required + The name of a give record + spec_name, str, default="default" + Retrieve data for a given qcportal specification. + Returns + ------- + list(graph) + Returns a list of the corresponding graph for each record name + """ g_list = [] for record, entry in zip( collection.iterate_records(record_names, specification_names=[spec_name]), @@ -121,7 +191,7 @@ def get_graphs(collection, record_names, spec_name="default"): return g_list -def fetch_td_record(record: ptl.torsiondrive.record_models.TorsiondriveRecord): +def fetch_td_record(record: qcportal.torsiondrive.record_models.TorsiondriveRecord): molecule_optimization = record.optimizations angle_keys = list(molecule_optimization.keys()) From 86042f331b493519bda32b99fbda65c4232d5ccb Mon Sep 17 00:00:00 2001 From: chrisiacovella Date: Thu, 21 Sep 2023 12:41:37 -0700 Subject: [PATCH 13/23] Addressed Mike's comment.s --- devtools/conda-envs/espaloma.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/devtools/conda-envs/espaloma.yaml b/devtools/conda-envs/espaloma.yaml index 2bfe5287..320837c0 100644 --- a/devtools/conda-envs/espaloma.yaml +++ b/devtools/conda-envs/espaloma.yaml @@ -30,6 +30,5 @@ dependencies: - nose - nose-timer - coverage - - qcportal>=0.50 - sphinx - sphinx_rtd_theme From 3861bd70fc2bf038550f77a5dda1af99a87d6e67 Mon Sep 17 00:00:00 2001 From: chrisiacovella Date: Thu, 21 Sep 2023 16:26:45 -0700 Subject: [PATCH 14/23] Added additional basic docstring. --- espaloma/data/qcarchive_utils.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/espaloma/data/qcarchive_utils.py b/espaloma/data/qcarchive_utils.py index 7ed98101..15a6806b 100644 --- a/espaloma/data/qcarchive_utils.py +++ b/espaloma/data/qcarchive_utils.py @@ -192,6 +192,18 @@ def get_graphs(collection, record_names, spec_name="default"): def fetch_td_record(record: qcportal.torsiondrive.record_models.TorsiondriveRecord): + """ + Fetches configuration, energy, and gradients for a given torsiondrive record as a function of different angles. + + Parameters + ---------- + record: qcportal.torsiondrive.record_models.TorsiondriveRecord, required + Torsiondrive record of interest + Returns + ------- + tuple, ( numpy.array, numpy.array, numpy.array,numpy.array) + Returned data represents flat_angles, xyz_in_order, energies_in_order, gradients_in_order + """ molecule_optimization = record.optimizations angle_keys = list(molecule_optimization.keys()) @@ -202,8 +214,9 @@ def fetch_td_record(record: qcportal.torsiondrive.record_models.TorsiondriveReco for angle in angle_keys: # NOTE: this is calling the first index of the optimization array - # this gives the same value as the prior implementation, but I wonder if it - # should be molecule_optimization[angle][-1] in both cases + # this gives the same value as the prior implementation. + # however it seems to be that this contains multiple different initial configurations + # that have been optimized. Should all conformers and energies/gradients be considered? mol = molecule_optimization[angle][0].final_molecule result = molecule_optimization[angle][0].trajectory[-1].properties From 4821aade2d68e785039794d603fb79ff789f2f9e Mon Sep 17 00:00:00 2001 From: chrisiacovella Date: Thu, 21 Sep 2023 16:29:14 -0700 Subject: [PATCH 15/23] Added additional basic docstring for torsion parsing --- espaloma/data/qcarchive_utils.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/espaloma/data/qcarchive_utils.py b/espaloma/data/qcarchive_utils.py index 15a6806b..3fcf013f 100644 --- a/espaloma/data/qcarchive_utils.py +++ b/espaloma/data/qcarchive_utils.py @@ -202,7 +202,10 @@ def fetch_td_record(record: qcportal.torsiondrive.record_models.TorsiondriveReco Returns ------- tuple, ( numpy.array, numpy.array, numpy.array,numpy.array) - Returned data represents flat_angles, xyz_in_order, energies_in_order, gradients_in_order + Returned data is a tuple of numpy arrays. + The first index contains angles and subsequent arrays represent + molecule coordinate, energy and gradients associated with each angle. + """ molecule_optimization = record.optimizations From a6a4cdf6a0ab087dffd15351f47417dd546430a1 Mon Sep 17 00:00:00 2001 From: chrisiacovella Date: Thu, 21 Sep 2023 17:41:57 -0700 Subject: [PATCH 16/23] Fixed bug in iterate function; added test to catch that bug . --- espaloma/data/qcarchive_utils.py | 3 ++- espaloma/data/tests/test_qcarchive.py | 5 +++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/espaloma/data/qcarchive_utils.py b/espaloma/data/qcarchive_utils.py index 3fcf013f..497f94a4 100644 --- a/espaloma/data/qcarchive_utils.py +++ b/espaloma/data/qcarchive_utils.py @@ -185,7 +185,8 @@ def get_graphs(collection, record_names, spec_name="default"): collection.iterate_records(record_names, specification_names=[spec_name]), collection.iterate_entries(record_names), ): - g = process_record(record, entry) + # note interate records returns a tuple of lenth 3 (name, spec_name, actual record) + g = process_record(record[2], entry) g_list.append(g) return g_list diff --git a/espaloma/data/tests/test_qcarchive.py b/espaloma/data/tests/test_qcarchive.py index 13c07c0a..367a2c6a 100644 --- a/espaloma/data/tests/test_qcarchive.py +++ b/espaloma/data/tests/test_qcarchive.py @@ -12,6 +12,11 @@ def test_get_graph(): collection, record_names = qcarchive_utils.get_collection(client) record_name = record_names[0] graph = qcarchive_utils.get_graph(collection, record_name) + assert graph is not None + + graphs = qcarchive_utils.get_graphs(collection, record_names[0:2]) + assert len(graphs) == 2 + assert graphs[0] is not None def test_get_torsiondrive(): From 54ce4647c23ec59ab05a5715744104d9aba59d9e Mon Sep 17 00:00:00 2001 From: chrisiacovella Date: Fri, 22 Sep 2023 11:55:00 -0700 Subject: [PATCH 17/23] Added support for singlepoint datasets --- espaloma/data/qcarchive_utils.py | 70 ++++++++++++++++++--------- espaloma/data/tests/test_qcarchive.py | 28 +++++++++++ 2 files changed, 76 insertions(+), 22 deletions(-) diff --git a/espaloma/data/qcarchive_utils.py b/espaloma/data/qcarchive_utils.py index 497f94a4..c4b7b750 100644 --- a/espaloma/data/qcarchive_utils.py +++ b/espaloma/data/qcarchive_utils.py @@ -82,13 +82,17 @@ def process_record(record, entry): mol = Molecule.from_qcschema(entry.dict()) - try: + if record.record_type == "optimization": trajectory = record.trajectory - except: - return None - - if trajectory is None: - return None + if trajectory is None: + return None + elif record.record_type == "singlepoint": + # the syntax for energy would be the same if we just place + trajectory = [record] + else: + raise Exception( + f"{record.record_type} is not supported: only optimization and singlepoint datasets can be processed." + ) g = esp.Graph(mol) @@ -104,21 +108,38 @@ def process_record(record, entry): dtype=torch.get_default_dtype(), )[None, :] - g.nodes["n1"].data["xyz"] = torch.tensor( - np.stack( - [ - Quantity( - snapshot.molecule.geometry, - unit.bohr, - ).value_in_unit(esp.units.DISTANCE_UNIT) - for snapshot in trajectory - ], - axis=1, - ), - requires_grad=True, - dtype=torch.get_default_dtype(), - ) - + if record.record_type == "optimization": + g.nodes["n1"].data["xyz"] = torch.tensor( + np.stack( + [ + Quantity( + snapshot.molecule.geometry, + unit.bohr, + ).value_in_unit(esp.units.DISTANCE_UNIT) + for snapshot in trajectory + ], + axis=1, + ), + requires_grad=True, + dtype=torch.get_default_dtype(), + ) + elif record.record_type == "singlepoint": + # singlepoint datasets have configuration stored in entry + # rather than in the trajectory. + g.nodes["n1"].data["xyz"] = torch.tensor( + np.stack( + [ + Quantity( + entry.molecule.geometry, + unit.bohr, + ).value_in_unit(esp.units.DISTANCE_UNIT) + for snapshot in trajectory + ], + axis=1, + ), + requires_grad=True, + dtype=torch.get_default_dtype(), + ) g.nodes["n1"].data["u_ref_prime"] = torch.stack( [ torch.tensor( @@ -140,6 +161,8 @@ def get_graph(collection, record_name, spec_name="default"): """ Processes the qcportal data for a given record name. + This supports optimization and singlepoint datasets. + Parameters ---------- collection, qcportal dataset, required @@ -165,7 +188,10 @@ def get_graphs(collection, record_names, spec_name="default"): """ Processes the qcportal data for a given set of record names. This uses the qcportal iteration functions which are faster than processing - records one at a time + records one at a time. + + This supports optimization and singlepoint datasets. + Parameters ---------- diff --git a/espaloma/data/tests/test_qcarchive.py b/espaloma/data/tests/test_qcarchive.py index 367a2c6a..b10be20a 100644 --- a/espaloma/data/tests/test_qcarchive.py +++ b/espaloma/data/tests/test_qcarchive.py @@ -19,6 +19,34 @@ def test_get_graph(): assert graphs[0] is not None +def test_singlepoint(): + from espaloma.data import qcarchive_utils + + name = "QM9" + collection_type = "singlepoint" + collection, record_names = get_collection( + get_client("ml.qcarchive.molssi.org"), collection_type, name + ) + + record_name = record_names[0] + + with pytest.raises(Exception): + graph = qcarchive_utils.get_graph(collection, record_name, spec_name="spec_2") + + +def test_notsupported_dataset(): + from espaloma.data import qcarchive_utils + + name = "DBH24" + collection_type = "reaction" + collection, record_names = get_collection( + get_client("ml.qcarchive.molssi.org"), collection_type, name + ) + record_name = record_names[0] + + graph = qcarchive_utils.get_graph(collection, record_name, spec_name="spec_2") + + def test_get_torsiondrive(): from espaloma.data import qcarchive_utils import numpy as np From a05bff70e4c36bf635ab2fa3cc8c2e45dd7f7988 Mon Sep 17 00:00:00 2001 From: chrisiacovella Date: Fri, 22 Sep 2023 13:42:38 -0700 Subject: [PATCH 18/23] fixing error in test. --- espaloma/data/tests/test_qcarchive.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/espaloma/data/tests/test_qcarchive.py b/espaloma/data/tests/test_qcarchive.py index b10be20a..4a639699 100644 --- a/espaloma/data/tests/test_qcarchive.py +++ b/espaloma/data/tests/test_qcarchive.py @@ -24,8 +24,8 @@ def test_singlepoint(): name = "QM9" collection_type = "singlepoint" - collection, record_names = get_collection( - get_client("ml.qcarchive.molssi.org"), collection_type, name + collection, record_names = qcarchive_utils.get_collection( + qcarchive_utils.get_client("ml.qcarchive.molssi.org"), collection_type, name ) record_name = record_names[0] @@ -39,8 +39,8 @@ def test_notsupported_dataset(): name = "DBH24" collection_type = "reaction" - collection, record_names = get_collection( - get_client("ml.qcarchive.molssi.org"), collection_type, name + collection, record_names = qcarchive_utils.get_collection( + qcarchive_utils.get_client("ml.qcarchive.molssi.org"), collection_type, name ) record_name = record_names[0] From 87e9d5ac91d3ef4816dfb406618e01cf20c8dbc2 Mon Sep 17 00:00:00 2001 From: chrisiacovella Date: Fri, 22 Sep 2023 14:10:23 -0700 Subject: [PATCH 19/23] Changing the dataset for singlepoint testing as we need to ensure the dataset has the smiles encoded for converting to openff.molecule --- espaloma/data/tests/test_qcarchive.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/espaloma/data/tests/test_qcarchive.py b/espaloma/data/tests/test_qcarchive.py index 4a639699..8de878f2 100644 --- a/espaloma/data/tests/test_qcarchive.py +++ b/espaloma/data/tests/test_qcarchive.py @@ -22,7 +22,7 @@ def test_get_graph(): def test_singlepoint(): from espaloma.data import qcarchive_utils - name = "QM9" + name = "SPICE PubChem Set 1 Single Points Dataset v1.2" collection_type = "singlepoint" collection, record_names = qcarchive_utils.get_collection( qcarchive_utils.get_client("ml.qcarchive.molssi.org"), collection_type, name @@ -31,7 +31,7 @@ def test_singlepoint(): record_name = record_names[0] with pytest.raises(Exception): - graph = qcarchive_utils.get_graph(collection, record_name, spec_name="spec_2") + graph = qcarchive_utils.get_graph(collection, record_name, spec_name="spec_1") def test_notsupported_dataset(): From 384bcc192c9b1d3d9edee70b26ae9015cb1fd0df Mon Sep 17 00:00:00 2001 From: chrisiacovella Date: Fri, 22 Sep 2023 14:16:47 -0700 Subject: [PATCH 20/23] Changing the dataset for singlepoint testing as we need to ensure the dataset has the smiles encoded for converting to openff.molecule --- espaloma/data/tests/test_qcarchive.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/espaloma/data/tests/test_qcarchive.py b/espaloma/data/tests/test_qcarchive.py index 8de878f2..ecac33ba 100644 --- a/espaloma/data/tests/test_qcarchive.py +++ b/espaloma/data/tests/test_qcarchive.py @@ -31,7 +31,7 @@ def test_singlepoint(): record_name = record_names[0] with pytest.raises(Exception): - graph = qcarchive_utils.get_graph(collection, record_name, spec_name="spec_1") + graph = qcarchive_utils.get_graph(collection, record_name, spec_name="spec_2") def test_notsupported_dataset(): From cac1776cf86fd0959a4e717204405c3b8b397217 Mon Sep 17 00:00:00 2001 From: chrisiacovella Date: Fri, 22 Sep 2023 15:43:21 -0700 Subject: [PATCH 21/23] Move the schema conversion to after checking if a dataset is supported so that it will raise the desired exception rather than failing. --- espaloma/data/qcarchive_utils.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/espaloma/data/qcarchive_utils.py b/espaloma/data/qcarchive_utils.py index c4b7b750..3cfd289b 100644 --- a/espaloma/data/qcarchive_utils.py +++ b/espaloma/data/qcarchive_utils.py @@ -80,8 +80,6 @@ def process_record(record, entry): from openff.toolkit.topology import Molecule - mol = Molecule.from_qcschema(entry.dict()) - if record.record_type == "optimization": trajectory = record.trajectory if trajectory is None: @@ -93,6 +91,7 @@ def process_record(record, entry): raise Exception( f"{record.record_type} is not supported: only optimization and singlepoint datasets can be processed." ) + mol = Molecule.from_qcschema(entry.dict()) g = esp.Graph(mol) From 01c9de577c8106d9259a5a41841718fd89270800 Mon Sep 17 00:00:00 2001 From: chrisiacovella Date: Fri, 22 Sep 2023 16:59:31 -0700 Subject: [PATCH 22/23] Removed support for singlepoint dataset, as openff.molecule cannot parse the singlepoint records properly at this point. Other issues need to be resolved with singlepoint energy beyond this (i.e., summation of dispersion corrections). --- espaloma/data/qcarchive_utils.py | 63 ++++++++++++--------------- espaloma/data/tests/test_qcarchive.py | 18 +------- 2 files changed, 29 insertions(+), 52 deletions(-) diff --git a/espaloma/data/qcarchive_utils.py b/espaloma/data/qcarchive_utils.py index 3cfd289b..d6fda817 100644 --- a/espaloma/data/qcarchive_utils.py +++ b/espaloma/data/qcarchive_utils.py @@ -76,6 +76,17 @@ def get_collection( def process_record(record, entry): """ Processes a given record/entry pair from a dataset and returns the graph + + Parameters + ---------- + record: qcportal.optimization.record_models.OptimizationRecord + qcportal record + entry: cportal.optimization.dataset_models.OptimizationDatasetEntry + qcportal entry + + Returns + ------- + esp.Graph """ from openff.toolkit.topology import Molecule @@ -84,12 +95,9 @@ def process_record(record, entry): trajectory = record.trajectory if trajectory is None: return None - elif record.record_type == "singlepoint": - # the syntax for energy would be the same if we just place - trajectory = [record] else: raise Exception( - f"{record.record_type} is not supported: only optimization and singlepoint datasets can be processed." + f"{record.record_type} is not supported: only optimization datasets can be processed." ) mol = Molecule.from_qcschema(entry.dict()) @@ -107,38 +115,21 @@ def process_record(record, entry): dtype=torch.get_default_dtype(), )[None, :] - if record.record_type == "optimization": - g.nodes["n1"].data["xyz"] = torch.tensor( - np.stack( - [ - Quantity( - snapshot.molecule.geometry, - unit.bohr, - ).value_in_unit(esp.units.DISTANCE_UNIT) - for snapshot in trajectory - ], - axis=1, - ), - requires_grad=True, - dtype=torch.get_default_dtype(), - ) - elif record.record_type == "singlepoint": - # singlepoint datasets have configuration stored in entry - # rather than in the trajectory. - g.nodes["n1"].data["xyz"] = torch.tensor( - np.stack( - [ - Quantity( - entry.molecule.geometry, - unit.bohr, - ).value_in_unit(esp.units.DISTANCE_UNIT) - for snapshot in trajectory - ], - axis=1, - ), - requires_grad=True, - dtype=torch.get_default_dtype(), - ) + g.nodes["n1"].data["xyz"] = torch.tensor( + np.stack( + [ + Quantity( + snapshot.molecule.geometry, + unit.bohr, + ).value_in_unit(esp.units.DISTANCE_UNIT) + for snapshot in trajectory + ], + axis=1, + ), + requires_grad=True, + dtype=torch.get_default_dtype(), + ) + g.nodes["n1"].data["u_ref_prime"] = torch.stack( [ torch.tensor( diff --git a/espaloma/data/tests/test_qcarchive.py b/espaloma/data/tests/test_qcarchive.py index ecac33ba..2eb1f163 100644 --- a/espaloma/data/tests/test_qcarchive.py +++ b/espaloma/data/tests/test_qcarchive.py @@ -19,21 +19,6 @@ def test_get_graph(): assert graphs[0] is not None -def test_singlepoint(): - from espaloma.data import qcarchive_utils - - name = "SPICE PubChem Set 1 Single Points Dataset v1.2" - collection_type = "singlepoint" - collection, record_names = qcarchive_utils.get_collection( - qcarchive_utils.get_client("ml.qcarchive.molssi.org"), collection_type, name - ) - - record_name = record_names[0] - - with pytest.raises(Exception): - graph = qcarchive_utils.get_graph(collection, record_name, spec_name="spec_2") - - def test_notsupported_dataset(): from espaloma.data import qcarchive_utils @@ -44,7 +29,8 @@ def test_notsupported_dataset(): ) record_name = record_names[0] - graph = qcarchive_utils.get_graph(collection, record_name, spec_name="spec_2") + with pytest.raises(Exception): + graph = qcarchive_utils.get_graph(collection, record_name, spec_name="spec_2") def test_get_torsiondrive(): From 3cf37d08611a79456687dc4d0b9df912a8a1f311 Mon Sep 17 00:00:00 2001 From: chrisiacovella Date: Fri, 22 Sep 2023 17:00:56 -0700 Subject: [PATCH 23/23] Removed support for singlepoint dataset, as openff.molecule cannot parse the singlepoint records properly at this point. Other issues need to be resolved with singlepoint energy beyond this (i.e., summation of dispersion corrections). This PR should sufficiently reproduce the prior behavior, but with new qcportal. --- espaloma/data/tests/test_qcarchive.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/espaloma/data/tests/test_qcarchive.py b/espaloma/data/tests/test_qcarchive.py index 2eb1f163..31a17c55 100644 --- a/espaloma/data/tests/test_qcarchive.py +++ b/espaloma/data/tests/test_qcarchive.py @@ -39,7 +39,7 @@ def test_get_torsiondrive(): record_name = "[h]c1c(c(c(c([c:1]1[n:2]([c:3](=[o:4])c(=c([h])[h])[h])c([h])([h])[h])[h])[h])n(=o)=o)[h]" - # example dataset + # example dataset name = "OpenFF Amide Torsion Set v1.0" collection_type = "torsiondrive"