From ef0420006ee673bbddc11ba70418ef33ae46f7d9 Mon Sep 17 00:00:00 2001 From: kt Date: Fri, 9 Feb 2024 15:25:57 -0500 Subject: [PATCH 01/59] use pint.UnitRegistry to convert units --- espfit/utils/units.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/espfit/utils/units.py b/espfit/utils/units.py index 011dadc..4e480ab 100644 --- a/espfit/utils/units.py +++ b/espfit/utils/units.py @@ -1,3 +1,13 @@ -# Constants for unit conversions -HARTEE_TO_KCALPERMOL = 627.509 -BOHR_TO_ANGSTROMS = 0.529177 \ No newline at end of file +from pint import UnitRegistry + +# Define pint unit registry +ureg = UnitRegistry() +hartree = 1 * ureg.hartree +bohr = 1 * ureg.bohr +angstrom = 1 * ureg.angstrom + +# Conversion factors +#HARTEE_TO_KCALPERMOL = 627.509 +#BOHR_TO_ANGSTROMS = 0.529177 +HARTREE_TO_KCALMOL = hartree.to(ureg.kilocalorie/(ureg.avogadro_constant*ureg.mole)).magnitude +BOHR_TO_ANGSTROMS = bohr.to(ureg.angstrom).magnitude From b63147fec68dd4339f9fdd893833260d766d9fe9 Mon Sep 17 00:00:00 2001 From: kt Date: Mon, 12 Feb 2024 13:44:09 -0500 Subject: [PATCH 02/59] change units.py --- espfit/utils/units.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/espfit/utils/units.py b/espfit/utils/units.py index 4e480ab..ccc236a 100644 --- a/espfit/utils/units.py +++ b/espfit/utils/units.py @@ -9,5 +9,5 @@ # Conversion factors #HARTEE_TO_KCALPERMOL = 627.509 #BOHR_TO_ANGSTROMS = 0.529177 -HARTREE_TO_KCALMOL = hartree.to(ureg.kilocalorie/(ureg.avogadro_constant*ureg.mole)).magnitude +HARTREE_TO_KCALPERMOL = hartree.to(ureg.kilocalorie/(ureg.avogadro_constant*ureg.mole)).magnitude BOHR_TO_ANGSTROMS = bohr.to(ureg.angstrom).magnitude From 3176da2d1f4c07b9aba3703d7d069f344523590c Mon Sep 17 00:00:00 2001 From: kt Date: Mon, 12 Feb 2024 13:46:28 -0500 Subject: [PATCH 03/59] add implementation error for other systems in experiment.py --- espfit/app/experiment.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/espfit/app/experiment.py b/espfit/app/experiment.py index 86165a7..c574fcd 100644 --- a/espfit/app/experiment.py +++ b/espfit/app/experiment.py @@ -242,4 +242,15 @@ def get_available_couplings(self): import barnaba as bb available_coupling_names = list(bb.definitions.couplings_idx.keys()) return available_coupling_names - \ No newline at end of file + + +class ProteinSystem(BaseDataLoader): + def __init__(self, **kwargs): + super(ProteinSystem, self).__init__(**kwargs) + raise NotImplementedError("ProteinSystem class is not implemented yet.") + + +class ProteinLigandSystem(BaseDataLoader): + def __init__(self, **kwargs): + super(ProteinLigandSystem, self).__init__(**kwargs) + raise NotImplementedError("ProteinLigandSystem class is not implemented yet.") From 12a749e7ea0b7478ea0df0371a1100441317ce78 Mon Sep 17 00:00:00 2001 From: kt Date: Mon, 12 Feb 2024 13:46:58 -0500 Subject: [PATCH 04/59] add draft for train.py --- espfit/app/train.py | 173 +++++++++++++++++++++++++++++++++----------- 1 file changed, 129 insertions(+), 44 deletions(-) diff --git a/espfit/app/train.py b/espfit/app/train.py index aa35a08..a44e03f 100644 --- a/espfit/app/train.py +++ b/espfit/app/train.py @@ -70,7 +70,7 @@ def __init__(self, net=None, dataset_train=None, dataset_validation=None, datase self.dataset_validation = dataset_validation self.dataset_test = dataset_test self.random_seed = random_seed - self.config = None # TODO: Better way to handle this? 
+ self.config = config if output_directory_path is None: import os self.output_directory_path = os.getcwd() @@ -103,12 +103,7 @@ def from_toml(cls, filename): except FileNotFoundError as e: print(e) raise - #model = cls.create_model(config['espaloma']) - - # TODO: Better way to handle this? - #model = cls(model) - #model.config = config - + model = cls() net = model.create_model(config['espaloma']) model.net = net @@ -234,49 +229,14 @@ def _restart_checkpoint(self, output_directory_path): restart_epoch = 0 return restart_epoch - - - def train(self, epochs=1000, batch_size=128, learning_rate=1e-4, checkpoint_frequency=10, output_directory_path=None): - """ - Train the Espaloma network model. - - This method trains the Espaloma network model using the training dataset. The training process can be customized - by specifying the number of epochs, batch size, learning rate, checkpoint frequency, and an output directory. - The method also supports restarting the training from a checkpoint. - Parameters - ---------- - epochs : int, default=1000 - The number of epochs to train the model for. - - batch_size : int, default=128 - The number of samples per batch. - - learning_rate : float, default=1e-4 - The learning rate for the optimizer. - - checkpoint_frequency : int, default=10 - The frequency at which the model should be saved. - output_directory_path : str, default=None - The directory where the model checkpoints should be saved. If None, the default output directory is used. - - Returns - ------- - None - """ + def _get_train_parameters(self, output_directory_path): import os import torch - from pathlib import Path if torch.cuda.is_available(): _logger.info('GPU is available for training.') - - # Change default device to GPU if available - # Will this map all data onto GPU and cause memory error if the data is too large? - # https://pytorch.org/tutorials/recipes/recipes/changing_default_device.html - - #torch.set_default_device('cuda') else: _logger.info('GPU is not available for training.') @@ -305,10 +265,48 @@ def train(self, epochs=1000, batch_size=128, learning_rate=1e-4, checkpoint_freq else: _logger.info(f'Training from scratch for {epochs} epochs.') + return restart_epoch, epochs, batch_size, learning_rate, checkpoint_frequency + + + def train(self, epochs=1000, batch_size=128, learning_rate=1e-4, checkpoint_frequency=10, output_directory_path=None): + """ + Train the Espaloma network model. + + This method trains the Espaloma network model using the training dataset. The training process can be customized + by specifying the number of epochs, batch size, learning rate, checkpoint frequency, and an output directory. + The method also supports restarting the training from a checkpoint. + + Parameters + ---------- + epochs : int, default=1000 + The number of epochs to train the model for. + + batch_size : int, default=128 + The number of samples per batch. + + learning_rate : float, default=1e-4 + The learning rate for the optimizer. + + checkpoint_frequency : int, default=10 + The frequency at which the model should be saved. + + output_directory_path : str, default=None + The directory where the model checkpoints should be saved. If None, the default output directory is used. 
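+            If the directory does not exist, it is created.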
+ + Returns + ------- + None + """ + import os + import torch + from espfit.utils.units import HARTREE_TO_KCALPERMOL + + # Get training parameters + restart_epoch, epochs, batch_size, learning_rate, checkpoint_frequency = self._get_train_parameters(epochs=1000, batch_size=128, learning_rate=1e-4, checkpoint_frequency=10, output_directory_path) + # Train # https://github.com/choderalab/espaloma/blob/main/espaloma/app/train.py#L33 # https://github.com/choderalab/espaloma/blob/main/espaloma/data/dataset.py#L310 - from espfit.utils.units import HARTEE_TO_KCALPERMOL ds_tr_loader = self.dataset_train.view(collate_fn='graph', batch_size=batch_size, shuffle=True) optimizer = torch.optim.Adam(self.net.parameters(), lr=learning_rate) with torch.autograd.set_detect_anomaly(True): @@ -332,7 +330,94 @@ def train(self, epochs=1000, batch_size=128, learning_rate=1e-4, checkpoint_freq _logger.info(f'epoch {epoch}: {_loss:.3f}') checkpoint_file = os.path.join(output_directory_path, f"net{epoch}.pt") torch.save(self.net.state_dict(), checkpoint_file) + + + def train_reweight(self, epochs=1000, batch_size=128, learning_rate=1e-4, checkpoint_frequency=10, output_directory_path=None, + system_type='RNA', neff_threshold=0.2, filename=None, maxIterations=10, nsteps=10): + import os + import torch + from espfit.utils.units import HARTREE_TO_KCALPERMOL + from espfit.app.sampler import SetupSampler + + if system_type == 'RNA': + from espfit.app.experiment import RNASystem + else: + raise NotImplementedError("Only RNA system is supported at the moment.") + + # Get training parameters + restart_epoch, epochs, batch_size, learning_rate, checkpoint_frequency = self._get_train_parameters(epochs=1000, batch_size=128, learning_rate=1e-4, checkpoint_frequency=10, output_directory_path) + + # Run MD simulation + if restart_epoch == 0: + _logger.info('Running MD simulation for the first time.') + sampler = SetupSampler() + sampler.create_system(biopolymer_file=filename) + sampler.minimize(maxIterations) + sampler.run(nsteps) + + sampler_output_directory_path = os.path.join(output_directory_path, "md", restart_epoch) + sampler.export_xml(output_directory_path=sampler_output_directory_path) + + # Bookkeep last output directory + old_sampler_output_directory_path = sampler_output_directory_path + + target = RNASystem() + target.load_traj(input_directory_path=sampler_output_directory_path) + obs = target.compute_jcouplings() + _logger.info(f'Computed observable: {obs}') + + # Train + ds_tr_loader = self.dataset_train.view(collate_fn='graph', batch_size=batch_size, shuffle=True) + optimizer = torch.optim.Adam(self.net.parameters(), lr=learning_rate) + with torch.autograd.set_detect_anomaly(True): + for i in range(restart_epoch, epochs): + epoch = i + 1 # Start from epoch 1 (not zero-indexing) + for g in ds_tr_loader: + optimizer.zero_grad() + + if torch.cuda.is_available(): + g = g.to("cuda:0") + + g.nodes["n1"].data["xyz"].requires_grad = True + + # QC loss + loss += self.net(g) + + + # Compute MD loss + neff = 0.5 + if neff < neff_threshold: + # Re-run MD simulation + sampler_output_directory_path = os.path.join(output_directory_path, "md", restart_epoch) + sampler = SetupSampler.from_xml(input_directory_path=old_sampler_output_directory_path, output_directory_path=sampler_output_directory_path) + sampler.minimize(maxIterations) + sampler.run(nsteps) + # Check if new observable is computed + target = RNASystem() + target.load_traj(input_directory_path=sampler_output_directory_path) + obs = target.compute_jcouplings() + # Update 
directory + old_sampler_output_directory_path = sampler_output_directory_path + else: + pass + + + # Update weights + loss.backward() + optimizer.step() + + if epoch % checkpoint_frequency == 0: + # Note: returned loss is a joint loss of different units. + _loss = HARTEE_TO_KCALPERMOL * loss.pow(0.5).item() + _logger.info(f'epoch {epoch}: {_loss:.3f}') + checkpoint_file = os.path.join(output_directory_path, f"net{epoch}.pt") + torch.save(self.net.state_dict(), checkpoint_file) + + + + def compute_md_loss(self, couplings): + pass def validate(): raise NotImplementedError From 47f00ce0b9d42b101943cc5267dc5180cd719bd5 Mon Sep 17 00:00:00 2001 From: kt Date: Thu, 15 Feb 2024 15:33:34 -0500 Subject: [PATCH 05/59] change logging info in espfit.utils.graphs.drop_and_merge_duplicates --- espfit/utils/graphs.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/espfit/utils/graphs.py b/espfit/utils/graphs.py index 914efb3..3ab1fd0 100644 --- a/espfit/utils/graphs.py +++ b/espfit/utils/graphs.py @@ -459,7 +459,7 @@ def reshape_conformation_size(self, n_confs=50): ------- None """ - _logger.info(f'Reshape graphs size') + _logger.info(f'Reshape graph size') import random import copy @@ -473,13 +473,13 @@ def reshape_conformation_size(self, n_confs=50): n = g.nodes['n1'].data['xyz'].shape[1] if n == n_confs: - _logger.info(f"Molecule #{i} ({n} conformations)") + _logger.info(f"Mol #{i} ({n} conformations)") new_graphs.append(g) elif n < n_confs: random.seed(self.random_seed) index_random = random.choices(range(0, n), k=n_confs-n) - _logger.info(f"Molecule #{i} ({n} conformations). Randomly select {len(index_random)} conformations") + _logger.info(f"Randomly select {len(index_random)} conformations from Mol #{i} ({n} conformations)") _g = copy.deepcopy(g) _g.nodes["g"].data["u_ref"] = torch.cat((_g.nodes['g'].data['u_ref'], _g.nodes['g'].data['u_ref'][:, index_random]), dim=-1) @@ -488,7 +488,7 @@ def reshape_conformation_size(self, n_confs=50): new_graphs.append(_g) else: - _logger.info(f"Molecule #{i} ({n} conformations). 
Shuffle indices and split data into chunks") + _logger.info(f"Shuffling Mol #{i} ({n} conformations) and splitting into {n_confs}") random.seed(self.random_seed) idx_range = random.sample(range(n), k=n) for j in range(n // n_confs + 1): From f62bfd4ada10e87e7d2baf9eab7ca745aab04e6f Mon Sep 17 00:00:00 2001 From: kt Date: Thu, 15 Feb 2024 15:58:10 -0500 Subject: [PATCH 06/59] first commit espfit/utils/sampler --- espfit/utils/sampler/__init__.py | 0 espfit/utils/sampler/module.py | 48 ++++++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+) create mode 100644 espfit/utils/sampler/__init__.py create mode 100644 espfit/utils/sampler/module.py diff --git a/espfit/utils/sampler/__init__.py b/espfit/utils/sampler/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/espfit/utils/sampler/module.py b/espfit/utils/sampler/module.py new file mode 100644 index 0000000..d6e2492 --- /dev/null +++ b/espfit/utils/sampler/module.py @@ -0,0 +1,48 @@ +import logging + +_logger = logging.getLogger(__name__) + + +def check_effective_sample_size(): + # Compute effective sample size + neff = 0.5 + + return neff + + +def run_sampler(sampler_output_directory_path, biopolymer_file, ligand_file, maxIterations, nsteps, small_molecule_forcefield): + import os + from espfit.app.sampler import SetupSampler + + c = SetupSampler(output_directory_path=sampler_output_directory_path, small_molecule_forcefield=small_molecule_forcefield) + c.create_system(biopolymer_file, ligand_file) + c.minimize(maxIterations) + c.run(nsteps=nsteps) + c.export_xml() + + +def compute_observable(input_directory_path): + from espfit.app.experiment import RNASystem + target = RNASystem() + target.load_traj(input_directory_path=input_directory_path) + val = target.compute_jcouplings() + _logger.info(f'Computed observable: {val}') + + import os + import yaml + with open(os.path.join(input_directory_path, 'observable.yaml'), 'w') as f: + yaml.dump(val, f, allow_unicode=True) + + return val + + +def compute_loss(input_directory_path): + # Compute observable + val = compute_observable(input_directory_path) + _logger.info(f'Computed observable: {val}') + + # Compute loss + import torch + loss = torch.tensor(0.0) + + return loss From cd23fff4b792da1382bf62f804d7977778da459a Mon Sep 17 00:00:00 2001 From: kt Date: Thu, 15 Feb 2024 15:59:12 -0500 Subject: [PATCH 07/59] use pint.UnitRegistry to define units --- espfit/utils/units.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/espfit/utils/units.py b/espfit/utils/units.py index ccc236a..7082d9a 100644 --- a/espfit/utils/units.py +++ b/espfit/utils/units.py @@ -8,6 +8,6 @@ # Conversion factors #HARTEE_TO_KCALPERMOL = 627.509 -#BOHR_TO_ANGSTROMS = 0.529177 +#BOHR_TO_ANGSTROMS = 0.529 HARTREE_TO_KCALPERMOL = hartree.to(ureg.kilocalorie/(ureg.avogadro_constant*ureg.mole)).magnitude BOHR_TO_ANGSTROMS = bohr.to(ureg.angstrom).magnitude From 91ac11803a1d41a77657909db1235723101d74a3 Mon Sep 17 00:00:00 2001 From: kt Date: Thu, 15 Feb 2024 16:11:22 -0500 Subject: [PATCH 08/59] remove espaloma train settings and add sampler settings to config.toml --- espfit/data/config/config.toml | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/espfit/data/config/config.toml b/espfit/data/config/config.toml index 1cab4a8..d020d18 100644 --- a/espfit/data/config/config.toml +++ b/espfit/data/config/config.toml @@ -1,7 +1,7 @@ # configuration for gnn [espaloma.gnn] method = "SAGEConv" -aggregator_type = 'mean' +aggregator_type = "mean" 
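+# aggregator_type: how SAGEConv pools neighbor features; besides "mean",
+# DGL's SAGEConv also accepts "gcn", "pool", and "lstm".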
feat_drop = 0.1 # configuration for stage 1 (gnn) & 2 (janossy pooling) @@ -9,13 +9,6 @@ feat_drop = 0.1 stage1 = [ 512, "relu", 0.1, 512, "relu", 0.1, 512, "relu", 0.1 ] # (units, activation, dropout) stage2 = [ 512, "relu", 0.1, 512, "relu", 0.1, 512, "relu", 0.1, 512, "relu", 0.1 ] # (units, activation, dropout) -# training -[espaloma.train] -epochs = 20 -batch_size = 128 -learning_rate = 1e-4 -checkpoint_frequency = 10 - # loss weights [espaloma.weights] energy = 1.0 @@ -23,3 +16,22 @@ force = 1.0 charge = 1.0 torsion = 1.0 improper = 1.0 + +# system setup parameters +[sampler.setup] +water_model = tip3p +solvent_padding = 9.0 * angstroms +ionic_strength = 0.15 * molar +temperature = 300 * kelvin + +# sampler production +[sampler.run] +maxIterations = 100 +nsteps = 100 +checkpoint_frequency = 25000 +logging_frequency = 250000 +netcdf_frequency = 250000 + +# reweighting settings +[sampler.reweight] +neff = 0.2 From 08f43192cd2c57987a68dbfeed86fe7cd262db1e Mon Sep 17 00:00:00 2001 From: kt Date: Thu, 15 Feb 2024 16:23:34 -0500 Subject: [PATCH 09/59] use local variables instead of instance attributes --- espfit/app/sampler.py | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/espfit/app/sampler.py b/espfit/app/sampler.py index 1eb1de7..19d1ead 100644 --- a/espfit/app/sampler.py +++ b/espfit/app/sampler.py @@ -113,7 +113,7 @@ def minimize(self, maxIterations=100): ------- None """ - _logger.info(f"Minimizing system...") + _logger.info(f"Minimizing system for maximum {maxIterations} steps.") self.simulation.minimizeEnergy(maxIterations) @@ -144,23 +144,18 @@ def run(self, checkpoint_frequency=25000, logging_frequency=250000, netcdf_frequ ------- None """ - self.checkpoint_frequency = checkpoint_frequency - self.logging_frequency = logging_frequency - self.netcdf_frequency = netcdf_frequency - self.nsteps = nsteps - self.atom_indices = atom_indices if output_directory_path is not None: self.output_directory_path = output_directory_path # property decorator is called # Select atoms to save import mdtraj - if self.atom_indices is None: - self.atom_indices = [] + if atom_indices is None: + atom_indices = [] mdtop = mdtraj.Topology.from_openmm(self.simulation.topology) res = [ r for r in mdtop.residues if r.name not in ('HOH', 'NA', 'CL', 'K') ] for r in res: for a in r.atoms: - self.atom_indices.append(a.index) + atom_indices.append(a.index) # Define reporter from mdtraj.reporters import NetCDFReporter @@ -168,22 +163,22 @@ def run(self, checkpoint_frequency=25000, logging_frequency=250000, netcdf_frequ self._check_file_exists("traj.nc") self.simulation.reporters.append(NetCDFReporter(os.path.join(self.output_directory_path, f"traj.nc"), - min(self.netcdf_frequency, self.nsteps), - atomSubset=self.atom_indices)) + min(netcdf_frequency, nsteps), + atomSubset=atom_indices)) self._check_file_exists("checkpoint.chk") self.simulation.reporters.append(CheckpointReporter(os.path.join(self.output_directory_path, f"checkpoint.chk"), - min(self.checkpoint_frequency, self.nsteps))) + min(checkpoint_frequency, nsteps))) self._check_file_exists("reporter.log") self.simulation.reporters.append(StateDataReporter(os.path.join(self.output_directory_path, f"reporter.log"), - min(self.logging_frequency, self.nsteps), + min(logging_frequency, nsteps), step=True, potentialEnergy=True, kineticEnergy=True, totalEnergy=True, temperature=True, volume=True, density=True, speed=True)) # Run - _logger.info(f"Run MD simulation for {self.nsteps} steps") - 
self.simulation.step(self.nsteps) + _logger.info(f"Run MD simulation for {nsteps} steps") + self.simulation.step(nsteps) def export_xml(self, exportSystem=True, exportState=True, exportIntegrator=True, output_directory_path=None): From cef64974b558730e33efe063f0b4a94715f39b42 Mon Sep 17 00:00:00 2001 From: kt Date: Thu, 15 Feb 2024 16:24:39 -0500 Subject: [PATCH 10/59] add train_weight method --- espfit/app/train.py | 254 +++++++++++++++++++------------------------- 1 file changed, 112 insertions(+), 142 deletions(-) diff --git a/espfit/app/train.py b/espfit/app/train.py index a44e03f..99e2a53 100644 --- a/espfit/app/train.py +++ b/espfit/app/train.py @@ -6,6 +6,7 @@ * Add support to use multiple GPUs * Add support to validate model? (or use independent script?) * Add support to save model? (or use independent script?) +* Improve how data are parsed using dataclasses or pydantic """ import logging @@ -33,7 +34,8 @@ class EspalomaModel(object): >>> model.train() """ - def __init__(self, net=None, dataset_train=None, dataset_validation=None, dataset_test=None, random_seed=2666, config=None, output_directory_path=None): + def __init__(self, net=None, dataset_train=None, dataset_validation=None, dataset_test=None, random_seed=2666, output_directory_path=None, + epochs=1000, batch_size=128, learning_rate=1e-4, checkpoint_frequency=10): """Initialize an instance of the class with an Espaloma network model and a random seed. This constructor method sets up the Espaloma network model, the training, validation, test datasets, @@ -44,8 +46,8 @@ def __init__(self, net=None, dataset_train=None, dataset_validation=None, datase Parameters ---------- net : torch.nn.Sequential, default=None - The Espaloma network model to be used for training. - + The Espaloma network model to be used for training. + dataset_train : espfit.utils.data.graphs.CustomGraphDataset or espaloma.data.dataset.GraphDataset, default=None The training dataset. espaloma.graphs.graph.Graph. If not provided, the `train_data` attribute will be set to None. @@ -58,22 +60,46 @@ def __init__(self, net=None, dataset_train=None, dataset_validation=None, datase random_seed : int, default=2666 The random seed used throughout the espaloma training. - config : dict, default=None - The configuration for the espaloma model. If not provided, the `config` attribute will be set to None. - output_directory_path : str, default=None The directory where the model checkpoints should be saved. If not provided, the checkpoints will be saved in the current working directory. + + epochs : int, default=1000 + The number of epochs to train the model for. + + batch_size : int, default=128 + The number of samples per batch. + + learning_rate : float, default=1e-4 + The learning rate for the optimizer. + + checkpoint_frequency : int, default=10 + The frequency at which the model should be saved. 
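+
+        Examples
+        --------
+        >>> # Hypothetical usage; `net` and `ds` are assumed to be built elsewhere.
+        >>> model = EspalomaModel(net=net, dataset_train=ds, epochs=100, batch_size=64)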
""" - self.net = net + import os + import torch self.dataset_train = dataset_train self.dataset_validation = dataset_validation self.dataset_test = dataset_test + self.net = net self.random_seed = random_seed - self.config = config + self.epochs = epochs + self.batch_size = batch_size + self.learning_rate = learning_rate + self.checkpoint_frequency = checkpoint_frequency if output_directory_path is None: - import os self.output_directory_path = os.getcwd() + else: + self.output_directory_path = output_directory_path + + # Check if GPU is available + if torch.cuda.is_available(): + _logger.info('GPU is available for training.') + else: + _logger.info('GPU is not available for training.') + + # Check torch data type + _logger.info(f'Torch data type is {torch.get_default_dtype()}') @classmethod @@ -107,7 +133,6 @@ def from_toml(cls, filename): model = cls() net = model.create_model(config['espaloma']) model.net = net - model.config = config return model @@ -195,17 +220,12 @@ def create_model(espaloma_config): return net - def _restart_checkpoint(self, output_directory_path): + def _load_checkpoint(self): """Load the last checkpoint and restart the training process. - This method finds all the checkpoint files in the directory specified by `output_directory_path`, - loads the last checkpoint (e.g. net100.pt), and restarts the training process from the next step. If no - checkpoint files are found, the training process starts from the first step. - - Parameters - ---------- - output_directory_path : str - The directory where the checkpoint files are stored. + This method finds all the checkpoint files in the output directory, loads the + last checkpoint (e.g. net100.pt), and restarts the training process from the next step. + If no checkpoint files are found, the training process starts from the first step. Returns ------- @@ -213,83 +233,39 @@ def _restart_checkpoint(self, output_directory_path): The step from which the training process should be restarted. 
""" import os + import sys import glob import torch - checkpoints = glob.glob("{}/*.pt".format(output_directory_path)) + checkpoints = glob.glob("{}/*.pt".format(self.output_directory_path)) if checkpoints: n = [ int(c.split('net')[1].split('.')[0]) for c in checkpoints ] n.sort() restart_epoch = n[-1] - restart_checkpoint = os.path.join(output_directory_path, f"net{restart_epoch}.pt") + restart_checkpoint = os.path.join(self.output_directory_path, f"net{restart_epoch}.pt") self.net.load_state_dict(torch.load(restart_checkpoint)) logging.info(f'Restarting from ({restart_checkpoint}).') else: restart_epoch = 0 - return restart_epoch - - - def _get_train_parameters(self, output_directory_path): - import os - import torch - - if torch.cuda.is_available(): - _logger.info('GPU is available for training.') - else: - _logger.info('GPU is not available for training.') - - # Check if training dataset is provided - if self.dataset_train is None: - raise ValueError('Training dataset is not provided.') - - # Espaloma settings for training - config = self.config['espaloma']['train'] - epochs = config.get('epochs', epochs) - batch_size = config.get('batch_size', batch_size) - learning_rate = config.get('learning_rate', learning_rate) - checkpoint_frequency = config.get('checkpoint_frequency', checkpoint_frequency) - if output_directory_path is not None: - self.output_directory_path = output_directory_path - # Create output directory if not exists - os.makedirs(output_directory_path, exist_ok=True) - - # Restart from checkpoint if exists - restart_epoch = self._restart_checkpoint(output_directory_path) - if restart_epoch >= epochs: - _logger.info(f'Already trained for {epochs} epochs.') - return + if restart_epoch >= self.epochs: + _logger.info(f'Already trained for {self.epochs} epochs.') + sys.exit(0) elif restart_epoch > 0: - _logger.info(f'Training for additional {epochs-restart_epoch} epochs.') + _logger.info(f'Training for additional {self.epochs-restart_epoch} epochs.') else: - _logger.info(f'Training from scratch for {epochs} epochs.') + _logger.info(f'Training from scratch for {self.epochs} epochs.') + + return restart_epoch - return restart_epoch, epochs, batch_size, learning_rate, checkpoint_frequency - - def train(self, epochs=1000, batch_size=128, learning_rate=1e-4, checkpoint_frequency=10, output_directory_path=None): + def train(self, output_directory_path=None): """ Train the Espaloma network model. - This method trains the Espaloma network model using the training dataset. The training process can be customized - by specifying the number of epochs, batch size, learning rate, checkpoint frequency, and an output directory. - The method also supports restarting the training from a checkpoint. - Parameters ---------- - epochs : int, default=1000 - The number of epochs to train the model for. - - batch_size : int, default=128 - The number of samples per batch. - - learning_rate : float, default=1e-4 - The learning rate for the optimizer. - - checkpoint_frequency : int, default=10 - The frequency at which the model should be saved. - output_directory_path : str, default=None The directory where the model checkpoints should be saved. If None, the default output directory is used. 
@@ -301,16 +277,23 @@ def train(self, epochs=1000, batch_size=128, learning_rate=1e-4, checkpoint_freq import torch from espfit.utils.units import HARTREE_TO_KCALPERMOL - # Get training parameters - restart_epoch, epochs, batch_size, learning_rate, checkpoint_frequency = self._get_train_parameters(epochs=1000, batch_size=128, learning_rate=1e-4, checkpoint_frequency=10, output_directory_path) + if self.dataset_train is None: + raise ValueError('Training dataset is not provided.') + + if output_directory_path is not None: + self.output_directory_path = output_directory_path + os.makedirs(self.output_directory_path, exist_ok=True) + + # Load checkpoint + restart_epoch = self._load_checkpoint() # Train # https://github.com/choderalab/espaloma/blob/main/espaloma/app/train.py#L33 # https://github.com/choderalab/espaloma/blob/main/espaloma/data/dataset.py#L310 - ds_tr_loader = self.dataset_train.view(collate_fn='graph', batch_size=batch_size, shuffle=True) - optimizer = torch.optim.Adam(self.net.parameters(), lr=learning_rate) + ds_tr_loader = self.dataset_train.view(collate_fn='graph', batch_size=self.batch_size, shuffle=True) + optimizer = torch.optim.Adam(self.net.parameters(), lr=self.learning_rate) with torch.autograd.set_detect_anomaly(True): - for i in range(restart_epoch, epochs): + for i in range(restart_epoch, self.epochs): epoch = i + 1 # Start from epoch 1 (not zero-indexing) for g in ds_tr_loader: optimizer.zero_grad() @@ -324,101 +307,88 @@ def train(self, epochs=1000, batch_size=128, learning_rate=1e-4, checkpoint_freq loss.backward() optimizer.step() - if epoch % checkpoint_frequency == 0: + if epoch % self.checkpoint_frequency == 0: # Note: returned loss is a joint loss of different units. - _loss = HARTEE_TO_KCALPERMOL * loss.pow(0.5).item() + _loss = HARTREE_TO_KCALPERMOL * loss.pow(0.5).item() _logger.info(f'epoch {epoch}: {_loss:.3f}') checkpoint_file = os.path.join(output_directory_path, f"net{epoch}.pt") torch.save(self.net.state_dict(), checkpoint_file) - - def train_reweight(self, epochs=1000, batch_size=128, learning_rate=1e-4, checkpoint_frequency=10, output_directory_path=None, - system_type='RNA', neff_threshold=0.2, filename=None, maxIterations=10, nsteps=10): + + def train_sampler(self, output_directory_path=None, + biopolymer_file=None, ligand_file=None, small_molecule_forcefield=None, + sampler_patience=800, maxIterations=10, nsteps=10, neff_threshold=0.2): import os import torch from espfit.utils.units import HARTREE_TO_KCALPERMOL - from espfit.app.sampler import SetupSampler + from espfit.utils.sampler import module - if system_type == 'RNA': - from espfit.app.experiment import RNASystem - else: - raise NotImplementedError("Only RNA system is supported at the moment.") - - # Get training parameters - restart_epoch, epochs, batch_size, learning_rate, checkpoint_frequency = self._get_train_parameters(epochs=1000, batch_size=128, learning_rate=1e-4, checkpoint_frequency=10, output_directory_path) - - # Run MD simulation - if restart_epoch == 0: - _logger.info('Running MD simulation for the first time.') - sampler = SetupSampler() - sampler.create_system(biopolymer_file=filename) - sampler.minimize(maxIterations) - sampler.run(nsteps) - - sampler_output_directory_path = os.path.join(output_directory_path, "md", restart_epoch) - sampler.export_xml(output_directory_path=sampler_output_directory_path) - - # Bookkeep last output directory - old_sampler_output_directory_path = sampler_output_directory_path - - target = RNASystem() - 
target.load_traj(input_directory_path=sampler_output_directory_path) - obs = target.compute_jcouplings() - _logger.info(f'Computed observable: {obs}') + # Parameters for sampling and reweighting + self.biopolymer_file = biopolymer_file + self.ligand_file = ligand_file + self.sampler_patience = sampler_patience + self.maxIterations = maxIterations + self.nsteps = nsteps + self.neff_threshold = neff_threshold + self.small_molecule_forcefield = small_molecule_forcefield + + if self.dataset_train is None: + raise ValueError('Training dataset is not provided.') + + if output_directory_path is not None: + self.output_directory_path = output_directory_path + os.makedirs(self.output_directory_path, exist_ok=True) + + # Load checkpoint + restart_epoch = self._load_checkpoint() + + # Initialize neff to -1 to trigger the first sampling + neff = -1 # Train - ds_tr_loader = self.dataset_train.view(collate_fn='graph', batch_size=batch_size, shuffle=True) - optimizer = torch.optim.Adam(self.net.parameters(), lr=learning_rate) + ds_tr_loader = self.dataset_train.view(collate_fn='graph', batch_size=self.batch_size, shuffle=True) + optimizer = torch.optim.Adam(self.net.parameters(), lr=self.learning_rate) with torch.autograd.set_detect_anomaly(True): - for i in range(restart_epoch, epochs): + for i in range(restart_epoch, self.epochs): epoch = i + 1 # Start from epoch 1 (not zero-indexing) + loss = torch.tensor(0.0) for g in ds_tr_loader: optimizer.zero_grad() - + if torch.cuda.is_available(): g = g.to("cuda:0") - - g.nodes["n1"].data["xyz"].requires_grad = True - # QC loss + g.nodes["n1"].data["xyz"].requires_grad = True loss += self.net(g) + # Run sampling + if epoch > self.sampler_patience: + if neff < self.neff_threshold: + _logger.info(f'Effective sample size ({neff}) below threshold ({self.neff_threshold}).') + # Create system and run sampling, instead of restarting from previous checkpoint + _logger.info(f'Run simulation...') + sampler_output_directory_path = os.path.join(self.output_directory_path, "sampler", str(epoch)) + module.run_sampler(sampler_output_directory_path, self.biopolymer_file, self.ligand_file, self.maxIterations, self.nsteps, self.small_molecule_forcefield) - # Compute MD loss - neff = 0.5 - if neff < neff_threshold: - # Re-run MD simulation - sampler_output_directory_path = os.path.join(output_directory_path, "md", restart_epoch) - sampler = SetupSampler.from_xml(input_directory_path=old_sampler_output_directory_path, output_directory_path=sampler_output_directory_path) - sampler.minimize(maxIterations) - sampler.run(nsteps) - # Check if new observable is computed - target = RNASystem() - target.load_traj(input_directory_path=sampler_output_directory_path) - obs = target.compute_jcouplings() - # Update directory - old_sampler_output_directory_path = sampler_output_directory_path - else: - pass + # Compute MD loss + _logger.info(f'Compute sampler loss.') + sampler_loss = module.compute_loss(input_directory_path=sampler_output_directory_path) + # Add MD loss to the joint loss + loss += sampler_loss # Update weights loss.backward() optimizer.step() - if epoch % checkpoint_frequency == 0: + if epoch % self.checkpoint_frequency == 0: # Note: returned loss is a joint loss of different units. 
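+                    # loss.pow(0.5) takes a square root of the joint loss before
+                    # converting to kcal/mol for logging purposes.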
-                    _loss = HARTEE_TO_KCALPERMOL * loss.pow(0.5).item()
+                    _loss = HARTREE_TO_KCALPERMOL * loss.pow(0.5).item()
                     _logger.info(f'epoch {epoch}: {_loss:.3f}')
-                    checkpoint_file = os.path.join(output_directory_path, f"net{epoch}.pt")
+                    checkpoint_file = os.path.join(self.output_directory_path, f"net{epoch}.pt")
                     torch.save(self.net.state_dict(), checkpoint_file)
 
-
-    def compute_md_loss(self, couplings):
-
-        pass
-
     def validate():
         raise NotImplementedError

From d72a53148812f174d18c7f6c76e30abd04e4d03c Mon Sep 17 00:00:00 2001
From: kt
Date: Thu, 15 Feb 2024 16:25:33 -0500
Subject: [PATCH 11/59] first commit test_app_train_sampler.py

---
 espfit/tests/test_app_train_sampler.py | 67 ++++++++++++++++++++++++++
 1 file changed, 67 insertions(+)
 create mode 100644 espfit/tests/test_app_train_sampler.py

diff --git a/espfit/tests/test_app_train_sampler.py b/espfit/tests/test_app_train_sampler.py
new file mode 100644
index 0000000..222f86c
--- /dev/null
+++ b/espfit/tests/test_app_train_sampler.py
@@ -0,0 +1,67 @@
+import pytest
+from importlib.resources import files
+from espfit.utils.graphs import CustomGraphDataset
+from espfit.app.train import EspalomaModel
+
+
+@pytest.fixture
+def test_create_espaloma_model():
+    """Test function to load a TOML configuration file and create an EspalomaModel object.
+
+    Returns
+    -------
+    model : espfit.app.train.EspalomaModel
+        The created EspalomaModel object.
+    """
+    filename = files('espfit').joinpath('data/config/config.toml')   # PosixPath
+    model = EspalomaModel.from_toml(str(filename))
+
+    return model
+
+
+@pytest.fixture
+def test_load_dataset(tmpdir):
+    """Test function to load a dataset and prepare it for training.
+
+    Parameters
+    ----------
+    tmpdir : py.path.local
+        Temporary directory fixture provided by pytest.
+
+    Notes
+    -----
+    This function is not intended for production use. It is a minimal example for testing purposes.
+
+    Returns
+    -------
+    ds : espfit.utils.graphs.CustomGraphDataset
+        The loaded dataset.
+ """ + # load dataset + path = 'data/qcdata/openff-toolkit-0.10.6/dgl2/gen2-torsion-sm' + mydata = files('espfit').joinpath(path) + ds = CustomGraphDataset.load(str(mydata)) + + # Prepare input dataset ready for training + temporary_directory = tmpdir.mkdir('misc') + ds.drop_and_merge_duplicates(save_merged_dataset=True, dataset_name='misc', output_directory_path=str(temporary_directory)) + ds.reshape_conformation_size(n_confs=50) + ds.compute_relative_energy() + + return ds + + +def test_train_sampler(test_load_dataset, test_create_espaloma_model): + + # Load dataset and model + ds = test_load_dataset + model = test_create_espaloma_model + model.epochs = 50 + model.dataset_train = ds + + # Train + small_molecule_forcefield=files('espfit').joinpath('data/forcefield/espaloma-0.3.2.pt') + biopolymer_file = files('espfit').joinpath('data/target/testsystems/nucleoside/pdbfixer_min.pdb') # PosixPath + output_directory_path = 'examples/checkpoints_sampler' + model.train_sampler(biopolymer_file=biopolymer_file, sampler_patience=3, maxIterations=100, nsteps=1000, neff_threshold=0.2, output_directory_path=output_directory_path, small_molecule_forcefield=str(small_molecule_forcefield)) + raise ValueError('This test is not yet implemented.') From ab0a2513316749a53e1cd0b2bfbc1b0c9acf3e0d Mon Sep 17 00:00:00 2001 From: kt Date: Fri, 16 Feb 2024 10:34:14 -0500 Subject: [PATCH 12/59] fix python to 3.11 and add pytest-cov in test_env.yaml --- devtools/conda-envs/test_env.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/devtools/conda-envs/test_env.yaml b/devtools/conda-envs/test_env.yaml index 5fac361..b683519 100644 --- a/devtools/conda-envs/test_env.yaml +++ b/devtools/conda-envs/test_env.yaml @@ -3,13 +3,15 @@ channels: - conda-forge - openeye dependencies: + - python=3.11 - espaloma=0.3.2 - pdbfixer - ca-certificates - certifi - openssl - pytest + - pytest-cov - openeye-toolkits - openmmtools - barnaba -prefix: /lila/home/takabak/mambaforge/envs/espfit +prefix: /lila/home/takabak/mambaforge/envs/espfit \ No newline at end of file From ed42b0c28565727f8bfc170c3f341498d3ed93d7 Mon Sep 17 00:00:00 2001 From: kt Date: Fri, 16 Feb 2024 10:40:53 -0500 Subject: [PATCH 13/59] add space to indent lines --- devtools/conda-envs/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/devtools/conda-envs/README.md b/devtools/conda-envs/README.md index 83704b9..58cc1cb 100644 --- a/devtools/conda-envs/README.md +++ b/devtools/conda-envs/README.md @@ -6,7 +6,7 @@ >#uninstall openff-toolkit and install a customized version to support dgl graphs created using openff-toolkit=0.10.6 >conda uninstall --force openff-toolkit >pip install git+https://github.com/kntkb/openff-toolkit.git@7e9d0225782ef723083407a1cbf1f4f70631f934 ->#uninstall openmmforcefields if < 0.12.0 +>#uninstall openmmforcefields if < 0.12.0 >#use pip instead of mamba to avoid dependency issues with ambertools and python >conda uninstall --force openmmforcefields >pip install git+https://github.com/openmm/openmmforcefields@0.12.0 From f546b7feb97f43e658df043e8e2d3f6374ff2df6 Mon Sep 17 00:00:00 2001 From: kt Date: Fri, 16 Feb 2024 10:53:16 -0500 Subject: [PATCH 14/59] add space to indent line --- devtools/conda-envs/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/devtools/conda-envs/README.md b/devtools/conda-envs/README.md index 58cc1cb..83776c6 100644 --- a/devtools/conda-envs/README.md +++ b/devtools/conda-envs/README.md @@ -2,7 +2,7 @@ >conda activate espfit >conda env export 
--from-history > test_env.yaml ->conda env create -f test_env.yaml -n test_env +>conda env create -f test_env.yaml -n test_env >#uninstall openff-toolkit and install a customized version to support dgl graphs created using openff-toolkit=0.10.6 >conda uninstall --force openff-toolkit >pip install git+https://github.com/kntkb/openff-toolkit.git@7e9d0225782ef723083407a1cbf1f4f70631f934 From 628440fe4fdc77cf597aae287e2d1ec4afc0fbcb Mon Sep 17 00:00:00 2001 From: kt Date: Mon, 19 Feb 2024 15:58:43 -0500 Subject: [PATCH 15/59] convert unit string into openmm unit --- espfit/utils/units.py | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/espfit/utils/units.py b/espfit/utils/units.py index 7082d9a..3d5c2c4 100644 --- a/espfit/utils/units.py +++ b/espfit/utils/units.py @@ -1,3 +1,4 @@ +import openmm.unit as unit from pint import UnitRegistry # Define pint unit registry @@ -11,3 +12,41 @@ #BOHR_TO_ANGSTROMS = 0.529 HARTREE_TO_KCALPERMOL = hartree.to(ureg.kilocalorie/(ureg.avogadro_constant*ureg.mole)).magnitude BOHR_TO_ANGSTROMS = bohr.to(ureg.angstrom).magnitude + + +def convert_string_to_unit(unit_string): + """Convert a unit string to a openmm unit object. + + Parameters + ---------- + unit_string : str + The string representation of the unit. + + Returns + ------- + openmm.unit + The openmm unit object. + """ + unit_mapping = { + "nanometer": unit.nanometer, + "angstrom": unit.angstrom, + "nanometers": unit.nanometers, + "angstroms": unit.angstroms, + "kelvin": unit.kelvin, + "molar": unit.molar, + "millimolar": unit.millimolar, + "micromolar": unit.micromolar, + "atomsphere": unit.atmosphere, + "bar": unit.bar, + "nanoseconds": unit.nanoseconds, + "picoseconds": unit.picoseconds, + "femtoseconds": unit.femtoseconds, + "nanosecond": unit.nanosecond, + "picosecond": unit.picosecond, + "femtosecond": unit.femtosecond, + # Add more units as needed + } + if unit_string in unit_mapping: + return unit_mapping[unit_string] + else: + raise ValueError(f"Unit '{unit_string}' is not recognized.") \ No newline at end of file From 245ab2ca4b6b9b49d9633eb7a94c02714fb3fabf Mon Sep 17 00:00:00 2001 From: kt Date: Mon, 19 Feb 2024 17:09:15 -0500 Subject: [PATCH 16/59] create sampler system from toml file --- espfit/app/sampler.py | 153 +++++++++++++++++++++++++++++++----------- 1 file changed, 114 insertions(+), 39 deletions(-) diff --git a/espfit/app/sampler.py b/espfit/app/sampler.py index 19d1ead..45e113b 100644 --- a/espfit/app/sampler.py +++ b/espfit/app/sampler.py @@ -29,20 +29,43 @@ class BaseSimulation(object): Methods ------- - minimize(maxIterations=100): + minimize(output_directory_path=None): Minimize solvated system. - run(checkpoint_frequency=25000, logging_frequency=250000, netcdf_frequency=250000, nsteps=250000, atom_indices=None): + run(output_directory_path=None): Run standard MD simulation. - export_xml(exportSystem=True, exportState=True, exportIntegrator=True): + export_xml(exportSystem=True, exportState=True, exportIntegrator=True, output_directory_path=None): Export serialized system XML file and solvated pdb file. """ - def __init__(self, output_directory_path=None, input_directory_path=None): + def __init__(self, maxIterations=100, nsteps=250000, atom_indices=None, neff_threshold=0.2, + checkpoint_frequency=25000, logging_frequency=250000, netcdf_frequency=250000, + output_directory_path=None, input_directory_path=None): """Initialize base simulation object. 
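+
+        All step-count and frequency parameters below are expressed in
+        integrator steps; the parenthetical times assume a 4 fs timestep.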
        Parameters
        ----------
+        maxIterations : int, default=100
+            Maximum number of iterations to perform minimization.
+
+        nsteps : int, default=250000 (1 ns using 4 fs timestep)
+            Number of steps to run the simulation.
+
+        atom_indices : list, default=None
+            List of atom indices to save. If None, save all atoms except water and ions.
+
+        neff_threshold : float, default=0.2
+            Effective sample size threshold to rerun the simulation.
+
+        checkpoint_frequency : int, default=25000 (0.1 ns)
+            Frequency (in steps) at which to write checkpoint files.
+
+        logging_frequency : int, default=250000 (1 ns)
+            Frequency (in steps) at which to write logging files.
+
+        netcdf_frequency : int, default=250000 (1 ns)
+            Frequency (in steps) at which to write netcdf files.
+
         output_directory_path : str, optional
            Output directory path. Default is None.
            If None, the current working directory will be used.
@@ -51,6 +74,14 @@ def __init__(self, output_directory_path=None, input_directory_path=None):
            Input directory path to restart simulation. Default is None.
            If None, the current working directory will be used.
        """
+        self.maxIterations = maxIterations
+        self.nsteps = nsteps
+        self.atom_indices = atom_indices
+        self.neff_threshold = neff_threshold
+        self.checkpoint_frequency = checkpoint_frequency
+        self.logging_frequency = logging_frequency
+        self.netcdf_frequency = netcdf_frequency
+
        if output_directory_path is None:
            output_directory_path = os.getcwd()   # Is this right?
        if input_directory_path is None:
@@ -101,42 +132,29 @@ def _get_platform(self):
        return platform


-    def minimize(self, maxIterations=100):
+    def minimize(self, output_directory_path=None):
        """Minimize solvated system.

        Parameters
        ----------
-        maxIterations : int, default=100
-            Maximum number of iterations to perform.
+        output_directory_path : str, default=None
+            The path to the output directory. If None, the default output directory is used.
@@ -149,13 +167,13 @@ def run(self, checkpoint_frequency=25000, logging_frequency=250000, netcdf_frequ # Select atoms to save import mdtraj - if atom_indices is None: - atom_indices = [] + if self.atom_indices is None: + self.atom_indices = [] mdtop = mdtraj.Topology.from_openmm(self.simulation.topology) res = [ r for r in mdtop.residues if r.name not in ('HOH', 'NA', 'CL', 'K') ] for r in res: for a in r.atoms: - atom_indices.append(a.index) + self.atom_indices.append(a.index) # Define reporter from mdtraj.reporters import NetCDFReporter @@ -163,22 +181,22 @@ def run(self, checkpoint_frequency=25000, logging_frequency=250000, netcdf_frequ self._check_file_exists("traj.nc") self.simulation.reporters.append(NetCDFReporter(os.path.join(self.output_directory_path, f"traj.nc"), - min(netcdf_frequency, nsteps), - atomSubset=atom_indices)) + min(self.netcdf_frequency, self.nsteps), + atomSubset=self.atom_indices)) self._check_file_exists("checkpoint.chk") self.simulation.reporters.append(CheckpointReporter(os.path.join(self.output_directory_path, f"checkpoint.chk"), - min(checkpoint_frequency, nsteps))) + min(self.checkpoint_frequency, self.nsteps))) self._check_file_exists("reporter.log") self.simulation.reporters.append(StateDataReporter(os.path.join(self.output_directory_path, f"reporter.log"), - min(logging_frequency, nsteps), + min(self.logging_frequency, self.nsteps), step=True, potentialEnergy=True, kineticEnergy=True, totalEnergy=True, temperature=True, volume=True, density=True, speed=True)) # Run - _logger.info(f"Run MD simulation for {nsteps} steps") - self.simulation.step(nsteps) + _logger.info(f"Run MD simulation for {self.nsteps} steps") + self.simulation.step(self.nsteps) def export_xml(self, exportSystem=True, exportState=True, exportIntegrator=True, output_directory_path=None): @@ -296,8 +314,10 @@ class SetupSampler(BaseSimulation): >>> from espfit.app.sampler import SetupSampler >>> c = SetupSampler() >>> c.create_system(biopolymer_file='protein.pdb', ligand_file='ligand.sdf') - >>> c.minimize(maxIterations=10) - >>> c.run(nsteps=10) + >>> c.maxIterations = 10 # change default setting + >>> c.minimize() + >>> c.nsteps = 100 # change default setting + >>> c.run() Notes ----- @@ -374,6 +394,61 @@ def __init__(self, self.barostat_period = barostat_period self.timestep = timestep self.override_with_espaloma = override_with_espaloma + self.target_class = None + self.target_name = None + + + @classmethod + def from_toml(cls, filename): + import tomllib + from espfit.utils.units import convert_string_to_unit + from importlib.resources import files + + try: + with open(filename, 'rb') as f: + config = tomllib.load(f) + except FileNotFoundError as e: + print(e) + raise + + config = config['sampler']['setup'] # list + if config is None: + raise ValueError("target is not specified in the configuration file") + + systems = [] + _logger.info(f'Found {len(config)} systems in the configuration file') + for _config in config: + system = cls() + # Target information + target_class = _config['target_class'] + target_name = _config['target_name'] + + system.target_class = target_class + system.target_name = target_name + + biopolymer_file = files('espfit').joinpath(f'data/target/{target_class}/{target_name}/target.pdb') + ligand_file = files('espfit').joinpath(f'data/target/{target_class}/{target_name}/ligand.sdf') + if not ligand_file.exists(): + ligand_file = None + + # System settings + for key, value in _config.items(): + if key not in ['target_class', 'target_name']: + if "*" in value: + 
_value = float(value.split('*')[0].strip()) + unit_string = value.split('*')[1].strip() + unit_mapping = convert_string_to_unit(unit_string) + value = _value * unit_mapping + + # All key should be instance variable of the class + setattr(system, key, value) + + # Create system + system.create_system(biopolymer_file=biopolymer_file, ligand_file=ligand_file) + systems.append(system) + del system + + return systems def _update_forcefield_files(self, forcefield_files): @@ -486,7 +561,7 @@ def _get_complex(self): return complex_topology, complex_positions - + def create_system(self, biopolymer_file=None, ligand_file=None): """Create biopolymer-ligand system and export serialized system XML file and solvated pdb file. From e4ecdf40135fdb5a44faeba5870e4bdc6858938d Mon Sep 17 00:00:00 2001 From: kt Date: Mon, 19 Feb 2024 17:10:13 -0500 Subject: [PATCH 17/59] allow creating multiple sampler systems --- espfit/data/config/config.toml | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/espfit/data/config/config.toml b/espfit/data/config/config.toml index d020d18..99caf42 100644 --- a/espfit/data/config/config.toml +++ b/espfit/data/config/config.toml @@ -18,20 +18,21 @@ torsion = 1.0 improper = 1.0 # system setup parameters -[sampler.setup] -water_model = tip3p -solvent_padding = 9.0 * angstroms -ionic_strength = 0.15 * molar -temperature = 300 * kelvin +# Note: only target class/name and variables related to `openmm.unit` are supported +[[sampler.setup]] +target_class = "nucleoside" +target_name = "cytidine" +water_model = "tip3p" +solvent_padding = "10.0 * angstroms" +ionic_strength = "0.08 * molar" # 80 mM NaCl +temperature = "303.15 * kelvin" -# sampler production -[sampler.run] -maxIterations = 100 -nsteps = 100 -checkpoint_frequency = 25000 -logging_frequency = 250000 -netcdf_frequency = 250000 - -# reweighting settings -[sampler.reweight] -neff = 0.2 +# system setup parameters +# Note: only target class/name and variables related to `openmm.unit` are supported +[[sampler.setup]] +target_class = "nucleoside" +target_name = "cytidine" +water_model = "tip3p" +solvent_padding = "10.0 * angstroms" +ionic_strength = "0.08 * molar" # 80 mM NaCl +temperature = "303.15 * kelvin" \ No newline at end of file From 04145ffad640f7fe818c96810e1d1bba26851258 Mon Sep 17 00:00:00 2001 From: kt Date: Mon, 19 Feb 2024 17:11:50 -0500 Subject: [PATCH 18/59] rename pdbfixer_min.pdb to target.pdb for nucleoside systems --- .../{pdbfixer_min.pdb => target.pdb} | 0 .../cytidine/{pdbfixer_min.pdb => target.pdb} | 0 .../{pdbfixer_min.pdb => target.pdb} | 0 .../uridine/{pdbfixer_min.pdb => target.pdb} | 0 .../testsystems/nucleoside/pdbfixer_min.pdb | 35 ------------------- 5 files changed, 35 deletions(-) rename espfit/data/target/nucleoside/adenosine/{pdbfixer_min.pdb => target.pdb} (100%) rename espfit/data/target/nucleoside/cytidine/{pdbfixer_min.pdb => target.pdb} (100%) rename espfit/data/target/nucleoside/guanosine/{pdbfixer_min.pdb => target.pdb} (100%) rename espfit/data/target/nucleoside/uridine/{pdbfixer_min.pdb => target.pdb} (100%) delete mode 100644 espfit/data/target/testsystems/nucleoside/pdbfixer_min.pdb diff --git a/espfit/data/target/nucleoside/adenosine/pdbfixer_min.pdb b/espfit/data/target/nucleoside/adenosine/target.pdb similarity index 100% rename from espfit/data/target/nucleoside/adenosine/pdbfixer_min.pdb rename to espfit/data/target/nucleoside/adenosine/target.pdb diff --git a/espfit/data/target/nucleoside/cytidine/pdbfixer_min.pdb 
b/espfit/data/target/nucleoside/cytidine/target.pdb similarity index 100% rename from espfit/data/target/nucleoside/cytidine/pdbfixer_min.pdb rename to espfit/data/target/nucleoside/cytidine/target.pdb diff --git a/espfit/data/target/nucleoside/guanosine/pdbfixer_min.pdb b/espfit/data/target/nucleoside/guanosine/target.pdb similarity index 100% rename from espfit/data/target/nucleoside/guanosine/pdbfixer_min.pdb rename to espfit/data/target/nucleoside/guanosine/target.pdb diff --git a/espfit/data/target/nucleoside/uridine/pdbfixer_min.pdb b/espfit/data/target/nucleoside/uridine/target.pdb similarity index 100% rename from espfit/data/target/nucleoside/uridine/pdbfixer_min.pdb rename to espfit/data/target/nucleoside/uridine/target.pdb diff --git a/espfit/data/target/testsystems/nucleoside/pdbfixer_min.pdb b/espfit/data/target/testsystems/nucleoside/pdbfixer_min.pdb deleted file mode 100644 index c80385e..0000000 --- a/espfit/data/target/testsystems/nucleoside/pdbfixer_min.pdb +++ /dev/null @@ -1,35 +0,0 @@ -REMARK 1 CREATED WITH OPENMM 8.1, 2024-01-26 -ATOM 1 C5' A A 1 5.609 7.878 -3.616 1.00 0.00 C -ATOM 2 H5' A A 1 5.926 8.406 -4.515 1.00 0.00 H -ATOM 3 H5'' A A 1 5.488 8.603 -2.812 1.00 0.00 H -ATOM 4 O5' A A 1 4.377 7.214 -3.875 1.00 0.00 O -ATOM 5 HO5' A A 1 3.732 7.873 -4.143 1.00 0.00 H -ATOM 6 C4' A A 1 6.701 6.879 -3.233 1.00 0.00 C -ATOM 7 H4' A A 1 7.663 7.391 -3.262 1.00 0.00 H -ATOM 8 O4' A A 1 6.494 6.375 -1.913 1.00 0.00 O -ATOM 9 C3' A A 1 6.778 5.649 -4.144 1.00 0.00 C -ATOM 10 H3' A A 1 5.784 5.235 -4.319 1.00 0.00 H -ATOM 11 O3' A A 1 7.428 5.896 -5.380 1.00 0.00 O -ATOM 12 HO3' A A 1 8.365 5.865 -5.178 1.00 0.00 H -ATOM 13 C2' A A 1 7.561 4.713 -3.219 1.00 0.00 C -ATOM 14 H2' A A 1 7.428 3.673 -3.519 1.00 0.00 H -ATOM 15 C1' A A 1 6.878 5.002 -1.872 1.00 0.00 C -ATOM 16 H1' A A 1 7.580 4.816 -1.059 1.00 0.00 H -ATOM 17 N1 A A 1 5.124 0.467 -0.163 1.00 0.00 N -ATOM 18 C2 A A 1 6.351 0.978 -0.252 1.00 0.00 C -ATOM 19 H2 A A 1 7.149 0.341 0.097 1.00 0.00 H -ATOM 20 N3 A A 1 6.723 2.170 -0.712 1.00 0.00 N -ATOM 21 C4 A A 1 5.650 2.896 -1.124 1.00 0.00 C -ATOM 22 C5 A A 1 4.335 2.511 -1.094 1.00 0.00 C -ATOM 23 C6 A A 1 4.098 1.219 -0.579 1.00 0.00 C -ATOM 24 N6 A A 1 2.894 0.673 -0.481 1.00 0.00 N -ATOM 25 H61 A A 1 2.824 -0.276 -0.142 1.00 0.00 H -ATOM 26 H62 A A 1 2.092 1.188 -0.815 1.00 0.00 H -ATOM 27 N7 A A 1 3.506 3.517 -1.595 1.00 0.00 N -ATOM 28 C8 A A 1 4.352 4.463 -1.905 1.00 0.00 C -ATOM 29 H8 A A 1 4.040 5.410 -2.321 1.00 0.00 H -ATOM 30 N9 A A 1 5.670 4.162 -1.657 1.00 0.00 N -ATOM 31 O2' A A 1 8.945 5.071 -3.169 1.00 0.00 O -ATOM 32 HO2' A A 1 9.415 4.404 -2.663 1.00 0.00 H -TER 33 A A 1 -END From 1ada151293c00f8f511be2cfa65aab727c26baeb Mon Sep 17 00:00:00 2001 From: kt Date: Mon, 19 Feb 2024 17:12:48 -0500 Subject: [PATCH 19/59] add reference fig, table information --- espfit/data/target/nucleoside/adenosine/experiment.yml | 1 + espfit/data/target/nucleoside/cytidine/experiment.yml | 1 + espfit/data/target/nucleoside/guanosine/experiment.yml | 1 + espfit/data/target/nucleoside/uridine/experiment.yml | 1 + 4 files changed, 4 insertions(+) diff --git a/espfit/data/target/nucleoside/adenosine/experiment.yml b/espfit/data/target/nucleoside/adenosine/experiment.yml index 5d300f9..c4ffa09 100644 --- a/espfit/data/target/nucleoside/adenosine/experiment.yml +++ b/espfit/data/target/nucleoside/adenosine/experiment.yml @@ -9,6 +9,7 @@ experiment_1: name: nmr concentration: 0.2 mM temperature: 303.15 * kelvin + reference: table S12 comment: sequence: a smiles: diff 
--git a/espfit/data/target/nucleoside/cytidine/experiment.yml b/espfit/data/target/nucleoside/cytidine/experiment.yml index ecd73f8..b0354a3 100644 --- a/espfit/data/target/nucleoside/cytidine/experiment.yml +++ b/espfit/data/target/nucleoside/cytidine/experiment.yml @@ -9,6 +9,7 @@ experiment_1: name: nmr concentration: 5.0 mM temperature: 303.15 * kelvin + reference: table S11 comment: sequence: c smiles: diff --git a/espfit/data/target/nucleoside/guanosine/experiment.yml b/espfit/data/target/nucleoside/guanosine/experiment.yml index 8adf911..a313122 100644 --- a/espfit/data/target/nucleoside/guanosine/experiment.yml +++ b/espfit/data/target/nucleoside/guanosine/experiment.yml @@ -9,6 +9,7 @@ experiment_1: name: nmr concentration: 0.2 mM temperature: 303.15 * kelvin + reference: table S12 comment: sequence: g smiles: diff --git a/espfit/data/target/nucleoside/uridine/experiment.yml b/espfit/data/target/nucleoside/uridine/experiment.yml index 181d12c..a75185c 100644 --- a/espfit/data/target/nucleoside/uridine/experiment.yml +++ b/espfit/data/target/nucleoside/uridine/experiment.yml @@ -9,6 +9,7 @@ experiment_1: name: nmr concentration: 5.0 mM temperature: 303.15 * kelvin + reference: table S11 comment: sequence: u smiles: From 9d9bbe253b8afce7bd51b58adcceb5513dfba201 Mon Sep 17 00:00:00 2001 From: kt Date: Mon, 19 Feb 2024 17:13:46 -0500 Subject: [PATCH 20/59] change output file name from pdbfixer_min.pdb to target.pdb --- espfit/data/target/nucleoside/pdbfixer_setup.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/espfit/data/target/nucleoside/pdbfixer_setup.py b/espfit/data/target/nucleoside/pdbfixer_setup.py index 1269915..bde251b 100644 --- a/espfit/data/target/nucleoside/pdbfixer_setup.py +++ b/espfit/data/target/nucleoside/pdbfixer_setup.py @@ -46,7 +46,8 @@ def prep(inputfile): # minimize: fix hydrogen positions simulation.minimizeEnergy(maxIterations=50) positions = simulation.context.getState(getPositions=True).getPositions() - PDBFile.writeFile(model.topology, positions, open("pdbfixer_min.pdb", 'w')) + #PDBFile.writeFile(model.topology, positions, open("pdbfixer_min.pdb", 'w')) + PDBFile.writeFile(model.topology, positions, open("target.pdb", 'w')) @click.command() From f337f55c5be5df130adf3c5e342c00df98d7e266 Mon Sep 17 00:00:00 2001 From: kt Date: Tue, 20 Feb 2024 21:27:31 -0500 Subject: [PATCH 21/59] replace module.py with reweight.py --- espfit/utils/sampler/module.py | 48 -------------------------------- espfit/utils/sampler/reweight.py | 47 +++++++++++++++++++++++++++++++ 2 files changed, 47 insertions(+), 48 deletions(-) delete mode 100644 espfit/utils/sampler/module.py create mode 100644 espfit/utils/sampler/reweight.py diff --git a/espfit/utils/sampler/module.py b/espfit/utils/sampler/module.py deleted file mode 100644 index d6e2492..0000000 --- a/espfit/utils/sampler/module.py +++ /dev/null @@ -1,48 +0,0 @@ -import logging - -_logger = logging.getLogger(__name__) - - -def check_effective_sample_size(): - # Compute effective sample size - neff = 0.5 - - return neff - - -def run_sampler(sampler_output_directory_path, biopolymer_file, ligand_file, maxIterations, nsteps, small_molecule_forcefield): - import os - from espfit.app.sampler import SetupSampler - - c = SetupSampler(output_directory_path=sampler_output_directory_path, small_molecule_forcefield=small_molecule_forcefield) - c.create_system(biopolymer_file, ligand_file) - c.minimize(maxIterations) - c.run(nsteps=nsteps) - c.export_xml() - - -def compute_observable(input_directory_path): - from 
espfit.app.experiment import RNASystem - target = RNASystem() - target.load_traj(input_directory_path=input_directory_path) - val = target.compute_jcouplings() - _logger.info(f'Computed observable: {val}') - - import os - import yaml - with open(os.path.join(input_directory_path, 'observable.yaml'), 'w') as f: - yaml.dump(val, f, allow_unicode=True) - - return val - - -def compute_loss(input_directory_path): - # Compute observable - val = compute_observable(input_directory_path) - _logger.info(f'Computed observable: {val}') - - # Compute loss - import torch - loss = torch.tensor(0.0) - - return loss diff --git a/espfit/utils/sampler/reweight.py b/espfit/utils/sampler/reweight.py new file mode 100644 index 0000000..69c3c5e --- /dev/null +++ b/espfit/utils/sampler/reweight.py @@ -0,0 +1,47 @@ +import os +import logging +from espfit.app.sampler import SetupSampler, BaseSimulation + +_logger = logging.getLogger(__name__) + + +class SamplerReweight(SetupSampler, BaseSimulation): + + def __init__(self, weight=1, **kwargs): + super().__init__(**kwargs) + self.weight = weight + + + def get_effective_sample_size(self): + # Compute effective sample size + neff = 0.5 + return neff + + + def _compute_observable(self): + if self.target_class == 'nucleoside': + from espfit.app.analysis import RNASystem + target = RNASystem() + target.load_traj(input_directory_path=self.output_directory_path) + obs_calc = target.compute_jcouplings() + _logger.info(f'Computed observable: {obs_calc}') + else: + raise NotImplementedError(f'Observable for {self.target_class} is not implemented.') + + import yaml + with open(os.path.join(self.output_directory_path, 'observable.yaml'), 'w') as f: + yaml.dump(obs_calc, f, allow_unicode=True) + + return obs_calc + + + def compute_loss(self): + # Compute experimental observable + obs_calc = self._compute_observable() + _logger.info(f'Computed observable: {obs_calc}') + + # Compute loss + import torch + loss = torch.tensor(0.0) + + return loss \ No newline at end of file From 585a6850e281660e3bb5ac8849214ece216a2f04 Mon Sep 17 00:00:00 2001 From: kt Date: Tue, 20 Feb 2024 21:28:04 -0500 Subject: [PATCH 22/59] rename experiment.py to analysis.py --- espfit/app/{experiment.py => analysis.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename espfit/app/{experiment.py => analysis.py} (100%) diff --git a/espfit/app/experiment.py b/espfit/app/analysis.py similarity index 100% rename from espfit/app/experiment.py rename to espfit/app/analysis.py From 1e5b510a5ee1a4e30d50558beb6e50da799a4307 Mon Sep 17 00:00:00 2001 From: kt Date: Tue, 20 Feb 2024 21:29:00 -0500 Subject: [PATCH 23/59] add simulation settings to config --- espfit/data/config/config.toml | 30 +++++++++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/espfit/data/config/config.toml b/espfit/data/config/config.toml index 99caf42..9088025 100644 --- a/espfit/data/config/config.toml +++ b/espfit/data/config/config.toml @@ -17,7 +17,16 @@ charge = 1.0 torsion = 1.0 improper = 1.0 -# system setup parameters +# training settings +[espaloma.train] +epochs = 10 +batch_size = 128 +learning_rate = 1e-4 +checkpoint_frequency = 1 +output_directory_path = "checkpoints" + + +# System setup parameters # Note: only target class/name and variables related to `openmm.unit` are supported [[sampler.setup]] target_class = "nucleoside" @@ -26,13 +35,28 @@ water_model = "tip3p" solvent_padding = "10.0 * angstroms" ionic_strength = "0.08 * molar" # 80 mM NaCl temperature = "303.15 * kelvin" +maxIterations 
= 100 +nsteps = 1000 +neff_threshold = 0.2 +checkpoint_frequency = 10 +logging_frequency = 1 +netcdf_frequency = 10 +weight = 1 + # system setup parameters # Note: only target class/name and variables related to `openmm.unit` are supported [[sampler.setup]] target_class = "nucleoside" -target_name = "cytidine" +target_name = "adenosine" water_model = "tip3p" solvent_padding = "10.0 * angstroms" ionic_strength = "0.08 * molar" # 80 mM NaCl -temperature = "303.15 * kelvin" \ No newline at end of file +temperature = "303.15 * kelvin" +maxIterations = 100 +nsteps = 1000 +neff_threshold = 0.2 +checkpoint_frequency = 10 +logging_frequency = 1 +netcdf_frequency = 10 +weight = 1 \ No newline at end of file From cf39f68a23f34d2c7d0f63d9ac0da57946889018 Mon Sep 17 00:00:00 2001 From: kt Date: Tue, 20 Feb 2024 21:30:30 -0500 Subject: [PATCH 24/59] remove constraint and nonbonded method from instance variable --- espfit/app/sampler.py | 52 ++++++++++++++++++++++++++++--------------- 1 file changed, 34 insertions(+), 18 deletions(-) diff --git a/espfit/app/sampler.py b/espfit/app/sampler.py index 45e113b..7e232bf 100644 --- a/espfit/app/sampler.py +++ b/espfit/app/sampler.py @@ -335,12 +335,12 @@ def __init__(self, water_model='tip3p', solvent_padding=9.0 * unit.angstroms, ionic_strength=0.15 * unit.molar, - constraints=app.HBonds, + #constraints=app.HBonds, hmass=3.0 * unit.amu, temperature=300.0 * unit.kelvin, pressure=1.0 * unit.atmosphere, pme_tol=2.5e-04, - nonbonded_method=app.PME, + #nonbonded_method=app.PME, barostat_period=50, timestep=4 * unit.femtoseconds, override_with_espaloma=True, @@ -385,12 +385,12 @@ def __init__(self, self.forcefield_files = self._update_forcefield_files(forcefield_files) self.solvent_padding = solvent_padding self.ionic_strength = ionic_strength - self.constraints = constraints + #self.constraints = constraints self.hmass = hmass self.temperature = temperature self.pressure = pressure self.pme_tol = pme_tol - self.nonbonded_method = nonbonded_method + #self.nonbonded_method = nonbonded_method self.barostat_period = barostat_period self.timestep = timestep self.override_with_espaloma = override_with_espaloma @@ -399,7 +399,10 @@ def __init__(self, @classmethod - def from_toml(cls, filename): + def from_toml(cls, filename, *epoch, **override_sampler_kwargs): + + # kwargs: force setting updates for SetupSampler and BaseSimulation + import tomllib from espfit.utils.units import convert_string_to_unit from importlib.resources import files @@ -415,16 +418,17 @@ def from_toml(cls, filename): if config is None: raise ValueError("target is not specified in the configuration file") - systems = [] + samplers = [] _logger.info(f'Found {len(config)} systems in the configuration file') for _config in config: - system = cls() + sampler = cls() + # Target information target_class = _config['target_class'] target_name = _config['target_name'] - system.target_class = target_class - system.target_name = target_name + sampler.target_class = target_class + sampler.target_name = target_name biopolymer_file = files('espfit').joinpath(f'data/target/{target_class}/{target_name}/target.pdb') ligand_file = files('espfit').joinpath(f'data/target/{target_class}/{target_name}/ligand.sdf') @@ -434,21 +438,31 @@ def from_toml(cls, filename): # System settings for key, value in _config.items(): if key not in ['target_class', 'target_name']: - if "*" in value: + if isinstance(value, str) and "*" in value: _value = float(value.split('*')[0].strip()) unit_string = value.split('*')[1].strip() 
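                    # Illustrative sketch (not part of this patch): the TOML values arrive as
                    # strings such as "10.0 * angstroms" or "303.15 * kelvin"; the line above
                    # splits off the unit name, and the call below maps it onto an openmm.unit
                    # object. The body of convert_string_to_unit is not shown in this series;
                    # it is assumed to behave roughly like:
                    #
                    #     from openmm import unit
                    #     def convert_string_to_unit(unit_string):
                    #         # e.g. "angstroms" -> unit.angstroms, "molar" -> unit.molar
                    #         return getattr(unit, unit_string)
                    #
                    # so "0.08 * molar" becomes 0.08 * unit.molar when multiplied back together.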
unit_mapping = convert_string_to_unit(unit_string) value = _value * unit_mapping - # All key should be instance variable of the class - setattr(system, key, value) + setattr(sampler, key, value) + + # Override system settings by kwargs + for key, value in override_sampler_kwargs.items(): + setattr(sampler, key, value) + + # Update output directory path if epoch is given + if epoch is not None: + if len(epoch) == 1 and isinstance(epoch[0], int): + sampler.output_directory_path = os.path.join(sampler.output_directory_path, f'{epoch[0]}') + else: + raise ValueError("epoch should be a single value or a list of a single value") # Create system - system.create_system(biopolymer_file=biopolymer_file, ligand_file=ligand_file) - systems.append(system) - del system + sampler.create_system(biopolymer_file=biopolymer_file, ligand_file=ligand_file) + samplers.append(sampler) + del sampler - return systems + return samplers def _update_forcefield_files(self, forcefield_files): @@ -601,8 +615,10 @@ def create_system(self, biopolymer_file=None, ligand_file=None): # Initialize system generator. _logger.debug("Initialize system generator") - forcefield_kwargs = {'removeCMMotion': True, 'ewaldErrorTolerance': self.pme_tol, 'constraints' : self.constraints, 'rigidWater': True, 'hydrogenMass' : self.hmass} - periodic_forcefield_kwargs = {'nonbondedMethod': self.nonbonded_method} + #forcefield_kwargs = {'removeCMMotion': True, 'ewaldErrorTolerance': self.pme_tol, 'constraints' : self.constraints, 'rigidWater': True, 'hydrogenMass' : self.hmass} + #periodic_forcefield_kwargs = {'nonbondedMethod': self.nonbonded_method} + forcefield_kwargs = {'removeCMMotion': True, 'ewaldErrorTolerance': self.pme_tol, 'constraints' : app.HBonds, 'rigidWater': True, 'hydrogenMass' : self.hmass} + periodic_forcefield_kwargs = {'nonbondedMethod': app.PME} barostat = MonteCarloBarostat(self.pressure, self.temperature, self.barostat_period) # SystemGenerator will automatically load the TemplateGenerator based on the given `small_molecule_forcefield`. From eb234c0d90d21d36e47eb5d9e2d73cbf46ea709c Mon Sep 17 00:00:00 2001 From: kt Date: Tue, 20 Feb 2024 21:31:24 -0500 Subject: [PATCH 25/59] improve running sampler during espaloma training --- espfit/app/train.py | 110 ++++++++++++++++++++++++-------------------- 1 file changed, 60 insertions(+), 50 deletions(-) diff --git a/espfit/app/train.py b/espfit/app/train.py index 99e2a53..386e4c4 100644 --- a/espfit/app/train.py +++ b/espfit/app/train.py @@ -8,6 +8,7 @@ * Add support to save model? (or use independent script?) * Improve how data are parsed using dataclasses or pydantic """ +import os import logging _logger = logging.getLogger(__name__) @@ -76,8 +77,6 @@ def __init__(self, net=None, dataset_train=None, dataset_validation=None, datase checkpoint_frequency : int, default=10 The frequency at which the model should be saved. 
""" - import os - import torch self.dataset_train = dataset_train self.dataset_validation = dataset_validation self.dataset_test = dataset_test @@ -87,12 +86,14 @@ def __init__(self, net=None, dataset_train=None, dataset_validation=None, datase self.batch_size = batch_size self.learning_rate = learning_rate self.checkpoint_frequency = checkpoint_frequency + self.restart_epoch = 0 + self.configfile = None if output_directory_path is None: - self.output_directory_path = os.getcwd() - else: - self.output_directory_path = output_directory_path - + output_directory_path = os.getcwd() + self.output_directory_path = output_directory_path + # Check if GPU is available + import torch if torch.cuda.is_available(): _logger.info('GPU is available for training.') else: @@ -102,6 +103,20 @@ def __init__(self, net=None, dataset_train=None, dataset_validation=None, datase _logger.info(f'Torch data type is {torch.get_default_dtype()}') + @property + def output_directory_path(self): + """Get output directory path.""" + return self._output_directory_path + + + @output_directory_path.setter + def output_directory_path(self, value): + """Set output directory path.""" + self._output_directory_path = value + # Create output directory if it does not exist + os.makedirs(value, exist_ok=True) + + @classmethod def from_toml(cls, filename): """Create an instance of the class from a TOML configuration file. @@ -133,6 +148,11 @@ def from_toml(cls, filename): model = cls() net = model.create_model(config['espaloma']) model.net = net + model.configfile = filename + + # Update training settings + for key, value in config['espaloma']['train'].items(): + setattr(model, key, value) return model @@ -195,7 +215,7 @@ def create_model(espaloma_config): readout_improper = esp.nn.readout.janossy.JanossyPoolingWithSmirnoffImproper(in_features=units, config=config_2, out_features={"k": 2}) # Get loss weights - # TODO: Better way to handle this? + # TODO: Better way to initialize weights? weights = { 'energy': 1.0, 'force': 1.0, 'charge': 1.0, 'torsion': 1.0, 'improper': 1.0 } if 'weights' in espaloma_config.keys(): for key in espaloma_config['weights'].keys(): @@ -260,14 +280,13 @@ def _load_checkpoint(self): return restart_epoch - def train(self, output_directory_path=None): + def train(self): """ Train the Espaloma network model. - Parameters - ---------- - output_directory_path : str, default=None - The directory where the model checkpoints should be saved. If None, the default output directory is used. + TODO + ---- + * Export training settings to a file? 
Returns ------- @@ -279,13 +298,9 @@ def train(self, output_directory_path=None): if self.dataset_train is None: raise ValueError('Training dataset is not provided.') - - if output_directory_path is not None: - self.output_directory_path = output_directory_path - os.makedirs(self.output_directory_path, exist_ok=True) - + # Load checkpoint - restart_epoch = self._load_checkpoint() + self.restart_epoch = self._load_checkpoint() # Train # https://github.com/choderalab/espaloma/blob/main/espaloma/app/train.py#L33 @@ -293,15 +308,13 @@ def train(self, output_directory_path=None): ds_tr_loader = self.dataset_train.view(collate_fn='graph', batch_size=self.batch_size, shuffle=True) optimizer = torch.optim.Adam(self.net.parameters(), lr=self.learning_rate) with torch.autograd.set_detect_anomaly(True): - for i in range(restart_epoch, self.epochs): + for i in range(self.restart_epoch, self.epochs): epoch = i + 1 # Start from epoch 1 (not zero-indexing) for g in ds_tr_loader: optimizer.zero_grad() - # TODO: Better way to handle this? if torch.cuda.is_available(): g = g.to("cuda:0") - g.nodes["n1"].data["xyz"].requires_grad = True loss = self.net(g) loss.backward() @@ -311,36 +324,27 @@ def train(self, output_directory_path=None): # Note: returned loss is a joint loss of different units. _loss = HARTREE_TO_KCALPERMOL * loss.pow(0.5).item() _logger.info(f'epoch {epoch}: {_loss:.3f}') - checkpoint_file = os.path.join(output_directory_path, f"net{epoch}.pt") + checkpoint_file = os.path.join(self.output_directory_path, f"net{epoch}.pt") torch.save(self.net.state_dict(), checkpoint_file) - def train_sampler(self, output_directory_path=None, - biopolymer_file=None, ligand_file=None, small_molecule_forcefield=None, - sampler_patience=800, maxIterations=10, nsteps=10, neff_threshold=0.2): + def train_sampler(self, sampler_patience=800, neff_threshold=0.2): + + # sampler_kwargs: attributes supported by BaseSimulation + import os import torch from espfit.utils.units import HARTREE_TO_KCALPERMOL - from espfit.utils.sampler import module + from espfit.utils.sampler.reweight import SamplerReweight - # Parameters for sampling and reweighting - self.biopolymer_file = biopolymer_file - self.ligand_file = ligand_file self.sampler_patience = sampler_patience - self.maxIterations = maxIterations - self.nsteps = nsteps self.neff_threshold = neff_threshold - self.small_molecule_forcefield = small_molecule_forcefield if self.dataset_train is None: raise ValueError('Training dataset is not provided.') - if output_directory_path is not None: - self.output_directory_path = output_directory_path - os.makedirs(self.output_directory_path, exist_ok=True) - # Load checkpoint - restart_epoch = self._load_checkpoint() + self.restart_epoch = self._load_checkpoint() # Initialize neff to -1 to trigger the first sampling neff = -1 @@ -349,33 +353,39 @@ def train_sampler(self, output_directory_path=None, ds_tr_loader = self.dataset_train.view(collate_fn='graph', batch_size=self.batch_size, shuffle=True) optimizer = torch.optim.Adam(self.net.parameters(), lr=self.learning_rate) with torch.autograd.set_detect_anomaly(True): - for i in range(restart_epoch, self.epochs): - epoch = i + 1 # Start from epoch 1 (not zero-indexing) + for i in range(self.restart_epoch, self.epochs): + epoch = i + 1 # Start from 1 (not zero-indexing) loss = torch.tensor(0.0) + if torch.cuda.is_available(): + loss = loss.cuda("cuda:0") for g in ds_tr_loader: optimizer.zero_grad() - if torch.cuda.is_available(): g = g.to("cuda:0") - 
g.nodes["n1"].data["xyz"].requires_grad = True
                    loss += self.net(g)
 
                # Run sampling
                if epoch > self.sampler_patience:
                    if neff < self.neff_threshold:
-                        _logger.info(f'Effective sample size ({neff}) below threshold ({self.neff_threshold}).')
-                        # Create system and run sampling, instead of restarting from previous checkpoint
-                        _logger.info(f'Run simulation...')
-                        sampler_output_directory_path = os.path.join(self.output_directory_path, "sampler", str(epoch))
-                        module.run_sampler(sampler_output_directory_path, self.biopolymer_file, self.ligand_file, self.maxIterations, self.nsteps, self.small_molecule_forcefield)
+                        # Get effective sample size
+                        if neff < 0:
+                            _logger.info(f'Reached sampler patience {self.sampler_patience}. Run sampler for the first time.')
+                        else:
+                            _logger.info(f'Effective sample size ({neff}) below threshold ({self.neff_threshold}).')
+
+                        # Create sampler system from configuration file. Returns list of systems.
+                        override_sampler_kwargs = { "small_molecule_forcefield": "espfit/data/forcefield/espaloma-0.3.2.pt" }   # change this to local espaloma model
+                        samplers = SamplerReweight.from_toml(self.configfile, epoch, override_sampler_kwargs)
+                        for sampler in samplers:
+                            _logger.info(f'Running simulation for {sampler.target_name} for {sampler.nsteps} steps...')
+                            sampler.minimize()
+                            sampler.run()
 
                         # Compute MD loss
                         _logger.info(f'Compute sampler loss.')
-                        sampler_loss = module.compute_loss(input_directory_path=sampler_output_directory_path)
-
-                        # Add MD loss to the joint loss
-                        loss += sampler_loss
+                        for sampler in samplers:
+                            loss += sampler.compute_loss() * sampler.weight
 
                 # Update weights
                 loss.backward()

From 56570cbc065cc94db57405f3f7b8f2b0457a9d8b Mon Sep 17 00:00:00 2001
From: kt
Date: Wed, 21 Feb 2024 14:13:13 -0500
Subject: [PATCH 26/59] update usage in README.md

---
 README.md | 38 ++++++++++++++++++++++++++++----------
 1 file changed, 28 insertions(+), 10 deletions(-)

diff --git a/README.md b/README.md
index 47b67ca..b012b38 100644
--- a/README.md
+++ b/README.md
@@ -28,10 +28,26 @@ Infrastruture to train espaloma with experimental observables
 
 ### Quick Usage
 
+```python
+from espfit.utils.graphs import CustomGraphDataset
+path = 'espfit/data/qcdata/openff-toolkit-0.10.6/dgl2/protein-torsion-sm/'
+ds = CustomGraphDataset.load(path)
+ds.reshape_conformation_size(n_confs=50)
+ds.compute_relative_energy()
+# Create espaloma model
+from espfit.app.train import EspalomaModel
+filename = 'espfit/data/config/config.toml'
+model = EspalomaModel.from_toml(filename)
+model.dataset_train = ds
+# Change default training settings
+model.epochs = 100
+# Set sampler settings
+model.train_sampler(sampler_patience=800, neff_threshold=0.2)
+```
+
+### Standalone Usage
 
 #### Change logging
 ```python
-# load dgl graph data
 from espfit.utils import logging
 logging.get_logging_level()
 #>'INFO'
@@ -51,12 +67,11 @@ from espfit.app.train import EspalomaModel
 filename = 'espfit/data/config/config.toml'
 model = EspalomaModel.from_toml(filename)
 model.dataset_train = ds
-# Train
-model.train(output_directory_path='path/to/output')
-# To extend training, update the `epoch` in config.toml
-# Alternatively, do the following:
-model.config['espaloma']['train']['epochs'] = 50
-model.train(output_directory_path='path/to/output')
+# Change default training settings
+model.epochs = 100
+model.output_directory_path = 'path/to/output'
+# Train (default output directory is current path)
+model.train()
 ```
 
 #### Standard MD (default: espaloma-0.3.2 force field for solute molecules)
 ```python
 from
espfit.app.sampler import SetupSampler c = SetupSampler() filename = 'espfit/data/target/testsystems/nucleoside/pdbfixer_min.pdb' c.create_system(biopolymer_file=filename) -c.minimize(maxIterations=10) -c.run(nsteps=10, output_directory_path='path/to/output') +c.minimize() +# Change default settings +c.nsteps = 1000 +c.run() # Export to XML c.export_xml(exportSystem=True, exportState=True, exportIntegrator=True, output_directory_path='path/to/output') ``` @@ -76,7 +93,8 @@ c.export_xml(exportSystem=True, exportState=True, exportIntegrator=True, output_ ```python from espfit.app.sampler import SetupSampler c = SetupSampler.from_xml(input_directory_path='path/to/input') -c.run(nsteps=10, output_directory_path='path/to/output') +c.nsteps = 1000 +c.run() ``` #### Compute RNA J-couplings from MD trajectory From 7db4dba73d5dc62b5949d779536b98564ae64bc5 Mon Sep 17 00:00:00 2001 From: kt Date: Wed, 21 Feb 2024 14:14:24 -0500 Subject: [PATCH 27/59] create new samplers during espaloma training --- espfit/app/sampler.py | 57 ++++++++++++++++++++++---------- espfit/app/train.py | 7 ++-- espfit/utils/sampler/reweight.py | 2 +- 3 files changed, 46 insertions(+), 20 deletions(-) diff --git a/espfit/app/sampler.py b/espfit/app/sampler.py index 7e232bf..5c1fe75 100644 --- a/espfit/app/sampler.py +++ b/espfit/app/sampler.py @@ -399,10 +399,27 @@ def __init__(self, @classmethod - def from_toml(cls, filename, *epoch, **override_sampler_kwargs): + def from_toml(cls, filename, *args, **override_sampler_kwargs): + """Create SetupSampler from a TOML configuration file. + + Parameters + ---------- + filename : str + The path to the TOML configuration file. - # kwargs: force setting updates for SetupSampler and BaseSimulation - + *args : list + This is used to update the output directory path during espaloma training. + The list should contain a single integer value, corresponding to the epoch number. + + **override_sampler_kwargs : dict + The dictionary of keyword arguments to override the default settings of the + BaseSimulation and SetupSampler classes. This option is intended for creating + new systems with temporary espaloma models generated during espaloma training. 
+
+        Returns
+        -------
+        samplers : list of SetupSampler instances
+        """
         import tomllib
         from espfit.utils.units import convert_string_to_unit
         from importlib.resources import files
@@ -438,25 +455,31 @@ def from_toml(cls, filename, *args, **override_sampler_kwargs):
             # System settings
             for key, value in _config.items():
                 if key not in ['target_class', 'target_name']:
-                    if isinstance(value, str) and "*" in value:
-                        _value = float(value.split('*')[0].strip())
-                        unit_string = value.split('*')[1].strip()
-                        unit_mapping = convert_string_to_unit(unit_string)
-                        value = _value * unit_mapping
-                    # All key should be instance variable of the class
-                    setattr(system, key, value)
+                    if hasattr(sampler, key):
+                        if isinstance(value, str) and "*" in value:
+                            _value = float(value.split('*')[0].strip())
+                            unit_string = value.split('*')[1].strip()
+                            unit_mapping = convert_string_to_unit(unit_string)
+                            value = _value * unit_mapping
+                        setattr(sampler, key, value)
+                    else:
+                        raise ValueError(f"Invalid keyword argument: {key}")
 
-            # Override system settings by kwargs
+            # Expected kwargs: output_directory_path
+            # Pass temporary espaloma model to the sampler if kwargs are given
             for key, value in override_sampler_kwargs.items():
-                setattr(sampler, key, value)
+                if hasattr(sampler, key):
+                    setattr(sampler, key, value)
+                else:
+                    raise ValueError(f"Invalid keyword argument: {key}")
 
             # Update output directory path if epoch is given
-            if epoch is not None:
-                if len(epoch) == 1 and isinstance(epoch[0], int):
-                    sampler.output_directory_path = os.path.join(sampler.output_directory_path, f'{epoch[0]}')
+            if args:  # args is a tuple; an empty tuple means no epoch was given
+                if len(args) == 1 and isinstance(args[0], int):
+                    sampler.output_directory_path = os.path.join(sampler.output_directory_path, sampler.target_name, f'{args[0]}')
                 else:
-                    raise ValueError("epoch should be a single value or a list of a single value")
-
+                    raise ValueError(f"Invalid argument: {args}. Expected a single integer value for the epoch number.")
+
             # Create system
             sampler.create_system(biopolymer_file=biopolymer_file, ligand_file=ligand_file)
             samplers.append(sampler)

diff --git a/espfit/app/train.py b/espfit/app/train.py
index 386e4c4..bf5ec99 100644
--- a/espfit/app/train.py
+++ b/espfit/app/train.py
@@ -375,8 +375,11 @@ def train_sampler(self, sampler_patience=800, neff_threshold=0.2):
                             _logger.info(f'Effective sample size ({neff}) below threshold ({self.neff_threshold}).')
 
                         # Create sampler system from configuration file. Returns list of systems.
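                        # Illustrative sketch (not part of this patch): with the *args/**kwargs
                        # API introduced above, the call below is expected to behave like
                        #
                        #     samplers = SamplerReweight.from_toml(
                        #         'config.toml',
                        #         5,                                     # *args: epoch number
                        #         output_directory_path='checkpoints',   # **override_sampler_kwargs
                        #     )
                        #
                        # returning one sampler per [[sampler.setup]] table in the TOML file,
                        # each writing its trajectory under checkpoints/<target_name>/5/.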
- override_sampler_kwargs = { "small_molecule_forcefield": "espfit/data/forcefield/espaloma-0.3.2.pt" } # change this to local espaloma model - samplers = SamplerReweight.from_toml(self.configfile, epoch, override_sampler_kwargs) + args = [epoch] + override_sampler_kwargs = { + "small_molecule_forcefield": "espfit/data/forcefield/espaloma-0.3.2.pt", # change this to local espaloma model + "output_directory_path": self.output_directory_path } + samplers = SamplerReweight.from_toml(self.configfile, *args, **override_sampler_kwargs) for sampler in samplers: _logger.info(f'Running simulation for {sampler.target_name} for {sampler.nsteps} steps...') sampler.minimize() diff --git a/espfit/utils/sampler/reweight.py b/espfit/utils/sampler/reweight.py index 69c3c5e..f1d7a53 100644 --- a/espfit/utils/sampler/reweight.py +++ b/espfit/utils/sampler/reweight.py @@ -38,7 +38,7 @@ def _compute_observable(self): def compute_loss(self): # Compute experimental observable obs_calc = self._compute_observable() - _logger.info(f'Computed observable: {obs_calc}') + _logger.info(f'Compute loss') # Compute loss import torch From f7dbd7a805fea717a8a5dcf13d4ed62033cbd023 Mon Sep 17 00:00:00 2001 From: kt Date: Wed, 21 Feb 2024 14:14:43 -0500 Subject: [PATCH 28/59] remove old comment --- espfit/data/config/config.toml | 2 -- 1 file changed, 2 deletions(-) diff --git a/espfit/data/config/config.toml b/espfit/data/config/config.toml index 9088025..0a219ed 100644 --- a/espfit/data/config/config.toml +++ b/espfit/data/config/config.toml @@ -27,7 +27,6 @@ output_directory_path = "checkpoints" # System setup parameters -# Note: only target class/name and variables related to `openmm.unit` are supported [[sampler.setup]] target_class = "nucleoside" target_name = "cytidine" @@ -45,7 +44,6 @@ weight = 1 # system setup parameters -# Note: only target class/name and variables related to `openmm.unit` are supported [[sampler.setup]] target_class = "nucleoside" target_name = "adenosine" From c6b0c3e7f1850da1ed8cb1cc8a66189319b7664d Mon Sep 17 00:00:00 2001 From: kt Date: Thu, 22 Feb 2024 14:17:02 -0500 Subject: [PATCH 29/59] rename test_app_experiment.py to test_app_analysis.py --- ...app_experiment.py => test_app_analysis.py} | 35 +++++++++++-------- 1 file changed, 21 insertions(+), 14 deletions(-) rename espfit/tests/{test_app_experiment.py => test_app_analysis.py} (51%) diff --git a/espfit/tests/test_app_experiment.py b/espfit/tests/test_app_analysis.py similarity index 51% rename from espfit/tests/test_app_experiment.py rename to espfit/tests/test_app_analysis.py index ecea804..9c783d7 100644 --- a/espfit/tests/test_app_experiment.py +++ b/espfit/tests/test_app_analysis.py @@ -1,33 +1,40 @@ import pytest from importlib.resources import files -from espfit.app.experiment import RNASystem +from espfit.app.analysis import RNASystem -def test_load_traj(): +@pytest.fixture +def _get_input_directory_path(): input_directory_path = files('espfit').joinpath('data/sampler') # PosixPath + return input_directory_path + + +def test_load_traj(_get_input_directory_path): + # TODO: Better test + input_directory_path = _get_input_directory_path data = RNASystem(input_directory_path=input_directory_path) data.load_traj(reference_pdb='solvated.pdb', trajectory_netcdf='traj.nc') - # TODO: Better test - return data + assert data.traj is not None -def test_compute_jcouplings_1(): - input_directory_path = files('espfit').joinpath('data/sampler') # PosixPath +def test_compute_jcouplings(_get_input_directory_path): + # TODO: Better test + 
input_directory_path = _get_input_directory_path
     data = RNASystem(input_directory_path=input_directory_path)
     data.load_traj(reference_pdb='solvated.pdb', trajectory_netcdf='traj.nc')
-    # TODO: Better test
-    return data
+    assert data.traj is not None
 
 
-def test_compute_jcouplings_1():
-    input_directory_path = files('espfit').joinpath('data/sampler')   # PosixPath
+def test_compute_jcouplings(_get_input_directory_path):
+    # TODO: Better test
+    input_directory_path = _get_input_directory_path
     data = RNASystem(input_directory_path=input_directory_path)
     data.load_traj()
     couplings = data.compute_jcouplings(couplings=['H1H2', 'H2H3', 'H3H4'])
-
-    # TODO: Better test
-    return couplings
+    assert couplings is not None
 
-def test_compute_jcouplings_2():
-    input_directory_path = files('espfit').joinpath('data/sampler')   # PosixPath
+
+def test_compute_jcouplings_all(_get_input_directory_path):
+    # TODO: Better test
+    input_directory_path = _get_input_directory_path
     data = RNASystem()
     data.input_directory_path = str(input_directory_path)
     data.load_traj()
-    couplings = data.compute_jcouplings(couplings=None)
+    couplings = data.compute_jcouplings()
 
-    # TODO: Better test
-    return couplings
\ No newline at end of file
+    assert couplings is not None
+    
\ No newline at end of file

From 7d67a3b4a6f4763a65f14217abc802cac15f163b Mon Sep 17 00:00:00 2001
From: kt
Date: Thu, 22 Feb 2024 14:18:00 -0500
Subject: [PATCH 30/59] fix minor bug to pass all tests

---
 espfit/app/analysis.py                 | 14 ++++++-
 espfit/app/sampler.py                  |  8 +---
 espfit/app/train.py                    | 11 +++++-
 espfit/data/config/config.toml         |  4 --
 espfit/tests/test_app_sampler.py       | 54 +++++++++++++-------------
 espfit/tests/test_app_train.py         | 50 +++++++++++++++---------
 espfit/tests/test_app_train_sampler.py | 29 +++++++++-----
 espfit/utils/sampler/reweight.py       | 12 +++---
 8 files changed, 107 insertions(+), 75 deletions(-)

diff --git a/espfit/app/analysis.py b/espfit/app/analysis.py
index c574fcd..5234e53 100644
--- a/espfit/app/analysis.py
+++ b/espfit/app/analysis.py
@@ -222,8 +222,18 @@ def compute_jcouplings(self, couplings=None, residues=None):
                 _values = values[:,i,:]   # Coupling values of i-th residue
                 values_by_names = dict()
                 for j, coupling_name in enumerate(couplings):
-                    avg = _values[:,j].mean()   # Mean value of H1H2 coupling of i-th residue
-                    std = _values[:,j].std()    # Standard deviation of H1H2 coupling of i-th residue
+                    avg = np.round(_values[:,j].mean(), 5)  # Mean value of the j-th coupling of the i-th residue
+                    std = np.round(_values[:,j].std(), 5)   # Standard deviation of the j-th coupling of the i-th residue
+
+                    # Function to replace np.nan with None
+                    replace_nan_with_none = lambda x: None if np.isscalar(x) and np.isnan(x) else x
+                    avg = replace_nan_with_none(avg)
+                    std = replace_nan_with_none(std)
+                    # Convert numpy.float to float to avoid serialization issues
+                    if avg is not None:
+                        avg = avg.item()
+                    if std is not None:
+                        std = std.item()
                     values_by_names[coupling_name] = {'avg': avg, 'std': std}
                 coupling_dict[resname] = values_by_names
 
diff --git a/espfit/app/sampler.py b/espfit/app/sampler.py
index 5c1fe75..c75a82c 100644
--- a/espfit/app/sampler.py
+++ b/espfit/app/sampler.py
@@ -38,7 +38,7 @@ class BaseSimulation(object):
     export_xml(exportSystem=True, exportState=True, exportIntegrator=True, output_directory_path=None):
         Export serialized system XML file and solvated pdb file.
     """
-    def __init__(self, maxIterations=100, nsteps=250000, atom_indices=None, neff_threshold=0.2,
+    def __init__(self, maxIterations=100, nsteps=250000, atom_indices=None,
                  checkpoint_frequency=25000, logging_frequency=250000, netcdf_frequency=250000,
                  output_directory_path=None, input_directory_path=None):
         """Initialize base simulation object.
@@ -54,9 +54,6 @@ def __init__(self, maxIterations=100, nsteps=250000, atom_indices=None, neff_thr
         atom_indices : list, default=None
             List of atom indices to save. If None, save all atoms except water and ions.
 
-        neff_threshold : float, default=0.2
-            Effective sample size threshold to rerun the simulation.
- checkpoint_frequency : int, default=25000 (1 ns) Frequency (in steps) at which to write checkpoint files. @@ -77,7 +74,6 @@ def __init__(self, maxIterations=100, nsteps=250000, atom_indices=None, neff_thr self.maxIterations = maxIterations self.nsteps = nsteps self.atom_indices = atom_indices - self.neff_threshold = neff_threshold self.checkpoint_frequency = checkpoint_frequency self.logging_frequency = logging_frequency self.netcdf_frequency = netcdf_frequency @@ -419,7 +415,7 @@ def from_toml(cls, filename, *args, **override_sampler_kwargs): Returns ------- samplers : list of SetupSampler instances - """ + """ import tomllib from espfit.utils.units import convert_string_to_unit from importlib.resources import files diff --git a/espfit/app/train.py b/espfit/app/train.py index bf5ec99..824b9c1 100644 --- a/espfit/app/train.py +++ b/espfit/app/train.py @@ -329,9 +329,15 @@ def train(self): def train_sampler(self, sampler_patience=800, neff_threshold=0.2): + """ + Train the Espaloma network model with sampler. - # sampler_kwargs: attributes supported by BaseSimulation + TODO + ---- + * Should `nsteps` be a variable when calling train_sampler? + * Should `sampler_patience` and `neff_threshold` be an instance variable of sampler.BaseSimulation? + """ import os import torch from espfit.utils.units import HARTREE_TO_KCALPERMOL @@ -378,7 +384,8 @@ def train_sampler(self, sampler_patience=800, neff_threshold=0.2): args = [epoch] override_sampler_kwargs = { "small_molecule_forcefield": "espfit/data/forcefield/espaloma-0.3.2.pt", # change this to local espaloma model - "output_directory_path": self.output_directory_path } + "output_directory_path": self.output_directory_path + } samplers = SamplerReweight.from_toml(self.configfile, *args, **override_sampler_kwargs) for sampler in samplers: _logger.info(f'Running simulation for {sampler.target_name} for {sampler.nsteps} steps...') diff --git a/espfit/data/config/config.toml b/espfit/data/config/config.toml index 0a219ed..31ec82c 100644 --- a/espfit/data/config/config.toml +++ b/espfit/data/config/config.toml @@ -23,7 +23,6 @@ epochs = 10 batch_size = 128 learning_rate = 1e-4 checkpoint_frequency = 1 -output_directory_path = "checkpoints" # System setup parameters @@ -36,13 +35,11 @@ ionic_strength = "0.08 * molar" # 80 mM NaCl temperature = "303.15 * kelvin" maxIterations = 100 nsteps = 1000 -neff_threshold = 0.2 checkpoint_frequency = 10 logging_frequency = 1 netcdf_frequency = 10 weight = 1 - # system setup parameters [[sampler.setup]] target_class = "nucleoside" @@ -53,7 +50,6 @@ ionic_strength = "0.08 * molar" # 80 mM NaCl temperature = "303.15 * kelvin" maxIterations = 100 nsteps = 1000 -neff_threshold = 0.2 checkpoint_frequency = 10 logging_frequency = 1 netcdf_frequency = 10 diff --git a/espfit/tests/test_app_sampler.py b/espfit/tests/test_app_sampler.py index b7ab1d3..09a44b1 100644 --- a/espfit/tests/test_app_sampler.py +++ b/espfit/tests/test_app_sampler.py @@ -15,7 +15,7 @@ def test_create_test_espaloma_system(tmpdir): ------- c : espfit.app.sampler.SetupSampler """ - biopolymer_file = files('espfit').joinpath('data/target/testsystems/nucleoside/pdbfixer_min.pdb') # PosixPath + biopolymer_file = files('espfit').joinpath('data/target/testsystems/nucleoside/target.pdb') # PosixPath c = SetupSampler(small_molecule_forcefield=ESPALOMA_FORCEFIELD, output_directory_path=str(tmpdir)) c.create_system(biopolymer_file=biopolymer_file) # Exports solvated system as pdb file automatically. 
@@ -31,7 +31,7 @@ def test_create_nucleoside_espaloma_system(tmpdir): ------- None """ - biopolymer_file = files('espfit').joinpath('data/target/testsystems/nucleoside/pdbfixer_min.pdb') + biopolymer_file = files('espfit').joinpath('data/target/testsystems/nucleoside/target.pdb') c = SetupSampler(small_molecule_forcefield=ESPALOMA_FORCEFIELD, output_directory_path=str(tmpdir)) c.create_system(biopolymer_file=biopolymer_file) @@ -100,7 +100,7 @@ def test_create_multi_protein_ligand_espaloma_system(tmpdir): c.create_system(biopolymer_file=biopolymer_file, ligand_file=ligand_file) -def test_export_system(test_create_test_espaloma_system, tmpdir): +def test_export_system(test_create_test_espaloma_system): """Test exporting the system to xml files. Parameters @@ -108,18 +108,15 @@ def test_export_system(test_create_test_espaloma_system, tmpdir): test_create_test_espaloma_system : espfit.app.sampler.SetupSampler Test system instance. - tmpdir : tmpdir fixture from pytest - Returns ------- None """ c = test_create_test_espaloma_system - c.output_directory_path = str(tmpdir) c.export_xml() + - -def test_export_system_change_outdir(test_create_test_espaloma_system, tmpdir): +def test_export_system_change_outdir(test_create_test_espaloma_system): """Test exporting the system to xml files. Change the output directory path and check if the new directory is created. @@ -129,18 +126,16 @@ def test_export_system_change_outdir(test_create_test_espaloma_system, tmpdir): test_create_test_espaloma_system : espfit.app.sampler.SetupSampler Test system instance. - tmpdir : tmpdir fixture from pytest - Returns ------- None """ + import os c = test_create_test_espaloma_system - old_outdir = c.output_directory_path - c.export_xml(output_directory_path=str(tmpdir.join('newdir'))) - new_outdir = c.output_directory_path + old_output_directory_path = c.output_directory_path + c.export_xml(output_directory_path=os.path.join(old_output_directory_path, 'newdir')) - assert old_outdir != new_outdir + assert old_output_directory_path != c.output_directory_path def test_minimize(test_create_test_espaloma_system): @@ -156,10 +151,14 @@ def test_minimize(test_create_test_espaloma_system): None """ c = test_create_test_espaloma_system - c.minimize(maxIterations=10) + old_maxIterations = c.maxIterations + c.maxIterations = 9 # change default + c.minimize() + + assert old_maxIterations != c.maxIterations -def test_standard_md(test_create_test_espaloma_system, tmpdir): +def test_standard_md(test_create_test_espaloma_system): """Test standard md simulation. Parameters @@ -172,12 +171,13 @@ def test_standard_md(test_create_test_espaloma_system, tmpdir): None """ c = test_create_test_espaloma_system - c.output_directory_path = str(tmpdir) - c.minimize(maxIterations=10) # Minimize the system before running the simulation to avoid Energy NaN. - c.run(nsteps=10) + c.maxIterations = 10 # update maxIterations to speed up the test + c.nsteps = 10 + c.minimize() # minimize the system before running the simulation to avoid Energy NaN. + c.run() -def test_create_system_from_xml(test_create_test_espaloma_system, tmpdir): +def test_create_system_from_xml(test_create_test_espaloma_system): """Test creating a system from loading existing xml files. Parameters @@ -185,20 +185,20 @@ def test_create_system_from_xml(test_create_test_espaloma_system, tmpdir): test_create_test_espaloma_system : espfit.app.sampler.SetupSampler Test system instance. 
- tmpdir : tmpdir fixture from pytest - Returns ------- None """ + import os + import glob + c = test_create_test_espaloma_system - c.output_directory_path = str(tmpdir) c.export_xml() - c2 = SetupSampler.from_xml(input_directory_path=str(tmpdir)) - c2.export_xml(output_directory_path=str(tmpdir)) + c2 = SetupSampler.from_xml(input_directory_path=c.output_directory_path) + c2.export_xml(output_directory_path=c.output_directory_path) # Check number of exported files. Check state.xml as a representative file. - import glob - n_files = len(glob.glob(str(tmpdir.join('state*.xml')))) + # If the same file exists, then suffix number will be added to the file name. + n_files = len(glob.glob(os.path.join(c.output_directory_path, 'state*.xml'))) assert n_files == 2 diff --git a/espfit/tests/test_app_train.py b/espfit/tests/test_app_train.py index d24fbd7..05d29d5 100644 --- a/espfit/tests/test_app_train.py +++ b/espfit/tests/test_app_train.py @@ -74,14 +74,17 @@ def test_train(test_load_dataset, test_create_espaloma_model, tmpdir): # Create temporary checkpoint directory checkpoint_directory = tmpdir.mkdir('checkpoints') # PosixPath + model.output_directory_path=str(checkpoint_directory) - # Train model - model.train(output_directory_path=str(checkpoint_directory)) + # Train model with arbitrary number of epochs and checkpoint frequency + model.epochs = 20 + model.checkpoint_frequency = 5 + model.train() # Test if the model has been trained n_checkpoints = len(glob.glob(str(checkpoint_directory.join('*.pt')))) - expected_n_checkpoints = int(model.config['espaloma']['train']['epochs']/model.config['espaloma']['train']['checkpoint_frequency']) - assert expected_n_checkpoints == n_checkpoints + expected_n_checkpoints = int(model.epochs/model.checkpoint_frequency) + assert expected_n_checkpoints == n_checkpoints == 4 # 20/5 = 4 def test_train_extend(test_load_dataset, test_create_espaloma_model, tmpdir): @@ -107,20 +110,24 @@ def test_train_extend(test_load_dataset, test_create_espaloma_model, tmpdir): # Create temporary checkpoint directory checkpoint_directory = tmpdir.mkdir('checkpoints') # PosixPath + model.output_directory_path=str(checkpoint_directory) - # Train model - model.train(output_directory_path=str(checkpoint_directory)) + # Train model with arbitrary number of epochs and checkpoint frequency + model.epochs = 10 + model.checkpoint_frequency = 2 + model.train() # Test if the model has been trained n_checkpoints = len(glob.glob(str(checkpoint_directory.join('*.pt')))) - expected_n_checkpoints = int(model.config['espaloma']['train']['epochs']/model.config['espaloma']['train']['checkpoint_frequency']) - assert n_checkpoints == expected_n_checkpoints + expected_n_checkpoints = int(model.epochs/model.checkpoint_frequency) + assert n_checkpoints == expected_n_checkpoints == 5 # 10/2 = 5 # Extend training - model.config['espaloma']['train']['epochs'] = 40 - model.train(output_directory_path=str(checkpoint_directory)) + model.epochs = 40 + model.train() n_checkpoints = len(glob.glob(str(checkpoint_directory.join('*.pt')))) - assert n_checkpoints == 4 + expected_n_checkpoints = int(model.epochs/model.checkpoint_frequency) + assert n_checkpoints == expected_n_checkpoints == 20 # 40/2 = 20 def test_train_extend_failure(test_load_dataset, test_create_espaloma_model, tmpdir): @@ -146,19 +153,24 @@ def test_train_extend_failure(test_load_dataset, test_create_espaloma_model, tmp # Create temporary checkpoint directory checkpoint_directory = tmpdir.mkdir('checkpoints') # PosixPath + 
model.output_directory_path=str(checkpoint_directory) # Train model - model.train(output_directory_path=str(checkpoint_directory)) + model.epochs = 20 + model.checkpoint_frequency = 10 + model.train() # Test if the model has been trained n_checkpoints = len(glob.glob(str(checkpoint_directory.join('*.pt')))) - expected_n_checkpoints = int(model.config['espaloma']['train']['epochs']/model.config['espaloma']['train']['checkpoint_frequency']) - assert n_checkpoints == expected_n_checkpoints + expected_n_checkpoints = int(model.epochs/model.checkpoint_frequency) + assert n_checkpoints == expected_n_checkpoints == 2 # 20/10 = 2 # Extend training - # This should fail to extend the training because the given new number of epoch (i.e. 10) is less than the + # The training should not extend because the given new number of epoch (i.e. 10) is less than the # last epoch of the checkpoint file (i.e. 20). - model.config['espaloma']['train']['epochs'] = 10 - model.train(output_directory_path=str(checkpoint_directory)) - n_checkpoints = len(glob.glob(str(checkpoint_directory.join('*.pt')))) - assert n_checkpoints == expected_n_checkpoints \ No newline at end of file + with pytest.raises(SystemExit) as excinfo: + model.epochs = 10 + model.train() + assert excinfo.value.code == 0 + #n_checkpoints = len(glob.glob(str(checkpoint_directory.join('*.pt')))) + #assert n_checkpoints == expected_n_checkpoints == 2 # 20/10 = 2 \ No newline at end of file diff --git a/espfit/tests/test_app_train_sampler.py b/espfit/tests/test_app_train_sampler.py index 222f86c..babf1bf 100644 --- a/espfit/tests/test_app_train_sampler.py +++ b/espfit/tests/test_app_train_sampler.py @@ -5,7 +5,7 @@ @pytest.fixture -def test_create_espaloma_model(): +def test_create_espaloma_from_toml(tmpdir): """Test function to load a TOML configuration file and create an EspalomaModel object. 
Returns @@ -15,6 +15,7 @@ def test_create_espaloma_model(): """ filename = files('espfit').joinpath('data/config/config.toml') # PosixPath model = EspalomaModel.from_toml(str(filename)) + model.output_directory_path = str(tmpdir) # Update output directory path return model @@ -51,17 +52,27 @@ def test_load_dataset(tmpdir): return ds -def test_train_sampler(test_load_dataset, test_create_espaloma_model): +def test_train_sampler(test_load_dataset, test_create_espaloma_from_toml): + + """ + TODO + ---- + + * sampler.py needs to support loading temporary espaloma model during training + """ # Load dataset and model ds = test_load_dataset - model = test_create_espaloma_model - model.epochs = 50 + model = test_create_espaloma_from_toml + + # Set espaloma parameters model.dataset_train = ds + model.epochs = 10 # Train - small_molecule_forcefield=files('espfit').joinpath('data/forcefield/espaloma-0.3.2.pt') - biopolymer_file = files('espfit').joinpath('data/target/testsystems/nucleoside/pdbfixer_min.pdb') # PosixPath - output_directory_path = 'examples/checkpoints_sampler' - model.train_sampler(biopolymer_file=biopolymer_file, sampler_patience=3, maxIterations=100, nsteps=1000, neff_threshold=0.2, output_directory_path=output_directory_path, small_molecule_forcefield=str(small_molecule_forcefield)) - raise ValueError('This test is not yet implemented.') + model.train_sampler(sampler_patience=3, neff_threshold=0.2) # fails if sampler_patience is < epochs + + # Check outputs + import glob + #assert len(glob.glob(model.output_directory_path + '/*')) > 0 + #assert model.sampler is not None \ No newline at end of file diff --git a/espfit/utils/sampler/reweight.py b/espfit/utils/sampler/reweight.py index f1d7a53..47a6983 100644 --- a/espfit/utils/sampler/reweight.py +++ b/espfit/utils/sampler/reweight.py @@ -23,21 +23,21 @@ def _compute_observable(self): from espfit.app.analysis import RNASystem target = RNASystem() target.load_traj(input_directory_path=self.output_directory_path) - obs_calc = target.compute_jcouplings() - _logger.info(f'Computed observable: {obs_calc}') + val = target.compute_jcouplings() + _logger.info(f'Computed observable: {val}') else: raise NotImplementedError(f'Observable for {self.target_class} is not implemented.') import yaml - with open(os.path.join(self.output_directory_path, 'observable.yaml'), 'w') as f: - yaml.dump(obs_calc, f, allow_unicode=True) + with open(os.path.join(self.output_directory_path, 'pred.yaml'), 'w') as f: + yaml.dump(val, f, allow_unicode=True) - return obs_calc + return val def compute_loss(self): # Compute experimental observable - obs_calc = self._compute_observable() + val = self._compute_observable() _logger.info(f'Compute loss') # Compute loss From cbcd34385858883e0608cc60eba5b2af9637a370 Mon Sep 17 00:00:00 2001 From: kt Date: Fri, 23 Feb 2024 15:23:26 -0500 Subject: [PATCH 31/59] run sampler using on-the-fly espaloma model created during training --- espfit/app/train.py | 403 ++++++++++++++++--------- espfit/tests/test_app_train_sampler.py | 4 +- 2 files changed, 268 insertions(+), 139 deletions(-) diff --git a/espfit/app/train.py b/espfit/app/train.py index 824b9c1..f64ecfb 100644 --- a/espfit/app/train.py +++ b/espfit/app/train.py @@ -3,118 +3,27 @@ TODO ---- +* Export loss to a file (e.g. LossReporter class?) * Add support to use multiple GPUs -* Add support to validate model? (or use independent script?) -* Add support to save model? (or use independent script?) 
* Improve how data are parsed using dataclasses or pydantic """ import os +import torch import logging _logger = logging.getLogger(__name__) -class EspalomaModel(object): - """Espaloma network model and training modules. - - Methods - ------- - from_toml(filename): - Load espaloma configuration file in TOML format. - - Examples - -------- - >>> from espfit.app.train import EspalomaModel - >>> filename = 'espfit/data/config/config.toml' - >>> # create espaloma network model from toml file - >>> model = EspalomaModel.from_toml(filename) - >>> # check espaloma network model - >>> model.net - >>> # load training dataset - >>> model.dataset_train = ds - >>> model.train() - """ - - def __init__(self, net=None, dataset_train=None, dataset_validation=None, dataset_test=None, random_seed=2666, output_directory_path=None, - epochs=1000, batch_size=128, learning_rate=1e-4, checkpoint_frequency=10): - """Initialize an instance of the class with an Espaloma network model and a random seed. - - This constructor method sets up the Espaloma network model, the training, validation, test datasets, - a configuratino file, and the random seed that will be used throughout the training process. - If no model or datasets are provided, the corresponding attributes will be set to None. If no random seed is - provided, the `random_seed` attribute will be set to 2666. - - Parameters - ---------- - net : torch.nn.Sequential, default=None - The Espaloma network model to be used for training. - - dataset_train : espfit.utils.data.graphs.CustomGraphDataset or espaloma.data.dataset.GraphDataset, default=None - The training dataset. espaloma.graphs.graph.Graph. If not provided, the `train_data` attribute will be set to None. - - dataset_validation : espfit.utils.data.graphs.CustomGraphDataset or espaloma.data.dataset.GraphDataset, default=None - The validation dataset. If not provided, the `validation_data` attribute will be set to None. - - dataset_test : Dataset, espfit.utils.data.graphs.CustomGraphDataset or espaloma.data.dataset.GraphDataset, default=None - The test dataset. If not provided, the `test_data` attribute will be set to None. - - random_seed : int, default=2666 - The random seed used throughout the espaloma training. - - output_directory_path : str, default=None - The directory where the model checkpoints should be saved. - If not provided, the checkpoints will be saved in the current working directory. - - epochs : int, default=1000 - The number of epochs to train the model for. - - batch_size : int, default=128 - The number of samples per batch. - - learning_rate : float, default=1e-4 - The learning rate for the optimizer. - - checkpoint_frequency : int, default=10 - The frequency at which the model should be saved. 
- """ - self.dataset_train = dataset_train - self.dataset_validation = dataset_validation - self.dataset_test = dataset_test - self.net = net - self.random_seed = random_seed - self.epochs = epochs - self.batch_size = batch_size - self.learning_rate = learning_rate - self.checkpoint_frequency = checkpoint_frequency - self.restart_epoch = 0 - self.configfile = None - if output_directory_path is None: - output_directory_path = os.getcwd() - self.output_directory_path = output_directory_path - +class EspalomaBase(object): + def __init__(self): # Check if GPU is available - import torch if torch.cuda.is_available(): _logger.info('GPU is available for training.') else: _logger.info('GPU is not available for training.') # Check torch data type - _logger.info(f'Torch data type is {torch.get_default_dtype()}') - - - @property - def output_directory_path(self): - """Get output directory path.""" - return self._output_directory_path - - - @output_directory_path.setter - def output_directory_path(self, value): - """Set output directory path.""" - self._output_directory_path = value - # Create output directory if it does not exist - os.makedirs(value, exist_ok=True) + _logger.debug(f'Torch data type is {torch.get_default_dtype()}') @classmethod @@ -158,14 +67,9 @@ def from_toml(cls, filename): @staticmethod - def create_model(espaloma_config): - """Create an Espaloma network model using the provided configuration. - - This function constructs a PyTorch Sequential model with two stages of Graph Neural Network (GNN) layers, - JanossyPooling readout layers for various features, and additional layers for energy computation and loss calculation. - The specifics of the GNN layers and the readout layers are controlled by the `espaloma_config` dictionary. - If a CUDA-compatible GPU is available, the model is moved to the GPU before being returned. - + def _get_base_module(espaloma_config): + """Create base modules for Espaloma network model. + Parameters ---------- espaloma_config : dict @@ -175,11 +79,12 @@ def create_model(espaloma_config): Returns ------- - torch.nn.Sequential - The constructed Espaloma network model. + list + A list of modules for the Espaloma network model. """ + import espaloma as esp - + # GNN gnn_method = 'SAGEConv' gnn_options = {} @@ -214,30 +119,203 @@ def create_model(espaloma_config): # Improper torsions (multiplicity n=2) readout_improper = esp.nn.readout.janossy.JanossyPoolingWithSmirnoffImproper(in_features=units, config=config_2, out_features={"k": 2}) - # Get loss weights - # TODO: Better way to initialize weights? + # Initialize loss weights and update if provided weights = { 'energy': 1.0, 'force': 1.0, 'charge': 1.0, 'torsion': 1.0, 'improper': 1.0 } if 'weights' in espaloma_config.keys(): for key in espaloma_config['weights'].keys(): weights[key] = espaloma_config['weights'][key] - # Define espaloma architecture - import torch + # Append base modules + modules = [] + modules.append(representation) + modules.append(readout) + modules.append(readout_improper) + modules.append(esp.nn.readout.janossy.ExpCoefficients()) + modules.append(esp.nn.readout.charge_equilibrium.ChargeEquilibrium()) + + return modules, weights + + + @staticmethod + def create_model(espaloma_config): + """Create an Espaloma network model using the provided configuration. + + This function constructs a PyTorch Sequential model with two stages of Graph Neural Network (GNN) layers, + JanossyPooling readout layers for various features, and additional layers for energy computation and loss calculation. 
+ The specifics of the GNN layers and the readout layers are controlled by the `espaloma_config` dictionary. + If a CUDA-compatible GPU is available, the model is moved to the GPU before being returned. + + Parameters + ---------- + espaloma_config : dict + A dictionary containing the configuration for the Espaloma network. + This includes the method and options for the GNN layers, the configurations for the two stages of the network, + and optionally the weights for different loss components. + + Returns + ------- + torch.nn.Sequential + The constructed Espaloma network model. + """ + import espaloma as esp from espfit.utils.espaloma.module import GetLoss - net = torch.nn.Sequential( - representation, - readout, - readout_improper, - esp.nn.readout.janossy.ExpCoefficients(), - esp.nn.readout.charge_equilibrium.ChargeEquilibrium(), - esp.mm.geometry.GeometryInGraph(), - esp.mm.energy.EnergyInGraph(terms=["n2", "n3", "n4", "n4_improper"]), - GetLoss(weights), - ) + + # Get base model + modules, weights = EspalomaBase._get_base_module(espaloma_config) + + # Define espaloma architecture + modules.append(esp.mm.geometry.GeometryInGraph()) + modules.append(esp.mm.energy.EnergyInGraph(terms=["n2", "n3", "n4", "n4_improper"])) + modules.append(GetLoss(weights)) + + # Create model + net = torch.nn.Sequential(*modules) if torch.cuda.is_available(): return net.cuda() else: return net + + + def save_model(self, net=None, best_model=None, model_name='espaloma.pt', output_directory_path=None): + """Save the Espaloma network model to a file. + + This method saves the Espaloma network model to a file in the specified output directory. + + Parameters + ---------- + net : torch.nn.Sequential + The Espaloma network model to be saved. + + best_model : str + The path to the best model file. + + model_name : str, default='espaloma.pt' + The name of the file to save the model to. + + output_directory_path : str, default=None + The directory where the model should be saved. + If not provided, the model will be saved in the current working directory. + + Returns + ------- + None + """ + import espaloma as esp + + if output_directory_path is not None: + os.makedirs(output_directory_path, exist_ok=True) + else: + output_directory_path = os.getcwd() + + if net: + modules = [] + for module in net: + if isinstance(module, esp.mm.geometry.GeometryInGraph): + break + modules.append(module) + modules.append(esp.nn.readout.janossy.LinearMixtureToOriginal()) + net = torch.nn.Sequential(*modules) + else: + raise ValueError('No model provided.') + + # Save model + state_dict = torch.load(best_model, map_location=torch.device('cpu')) + net.load_state_dict(state_dict) + torch.save(net, os.path.join(output_directory_path, model_name)) + + +class EspalomaModel(EspalomaBase): + """Espaloma network model and training modules. + + Methods + ------- + from_toml(filename): + Load espaloma configuration file in TOML format. 
+
+    Examples
+    --------
+    >>> from espfit.app.train import EspalomaModel
+    >>> filename = 'espfit/data/config/config.toml'
+    >>> # create espaloma network model from toml file
+    >>> model = EspalomaModel.from_toml(filename)
+    >>> # check espaloma network model
+    >>> model.net
+    >>> # load training dataset
+    >>> model.dataset_train = ds
+    >>> model.train()
+    """
+
+    def __init__(self, net=None, dataset_train=None, dataset_validation=None, dataset_test=None,
+                 epochs=1000, batch_size=128, learning_rate=1e-4, checkpoint_frequency=10,
+                 random_seed=2666, output_directory_path=None):
+        """Initialize an instance of the class with an Espaloma network model and a random seed.
+
+        This constructor method sets up the Espaloma network model, the training, validation, test datasets,
+        a configuration file, and the random seed that will be used throughout the training process.
+        If no model or datasets are provided, the corresponding attributes will be set to None. If no random seed is
+        provided, the `random_seed` attribute will be set to 2666.
+
+        Parameters
+        ----------
+        net : torch.nn.Sequential, default=None
+            The Espaloma network model to be used for training.
+
+        dataset_train : espfit.utils.data.graphs.CustomGraphDataset or espaloma.data.dataset.GraphDataset, default=None
+            The training dataset. espaloma.graphs.graph.Graph. If not provided, the `train_data` attribute will be set to None.
+
+        dataset_validation : espfit.utils.data.graphs.CustomGraphDataset or espaloma.data.dataset.GraphDataset, default=None
+            The validation dataset. If not provided, the `validation_data` attribute will be set to None.
+
+        dataset_test : Dataset, espfit.utils.data.graphs.CustomGraphDataset or espaloma.data.dataset.GraphDataset, default=None
+            The test dataset. If not provided, the `test_data` attribute will be set to None.
+
+        epochs : int, default=1000
+            The number of epochs to train the model for.
+
+        batch_size : int, default=128
+            The number of samples per batch.
+
+        learning_rate : float, default=1e-4
+            The learning rate for the optimizer.
+
+        checkpoint_frequency : int, default=10
+            The frequency at which the model should be saved.
+
+        random_seed : int, default=2666
+            The random seed used throughout the espaloma training.
+
+        output_directory_path : str, default=None
+            The directory where the model checkpoints should be saved.
+            If not provided, the checkpoints will be saved in the current working directory.
+        """
+        super(EspalomaModel, self).__init__()   # run EspalomaBase.__init__ (GPU availability check)
+        self.net = net
+        self.dataset_train = dataset_train
+        self.dataset_validation = dataset_validation
+        self.dataset_test = dataset_test
+        self.epochs = epochs
+        self.batch_size = batch_size
+        self.learning_rate = learning_rate
+        self.checkpoint_frequency = checkpoint_frequency
+        self.restart_epoch = 0
+        self.random_seed = random_seed
+        if output_directory_path is None:
+            output_directory_path = os.getcwd()
+        self.output_directory_path = output_directory_path
+
+
+    @property
+    def output_directory_path(self):
+        """Get output directory path."""
+        return self._output_directory_path
+
+
+    @output_directory_path.setter
+    def output_directory_path(self, value):
+        """Set output directory path."""
+        self._output_directory_path = value
+        # Create output directory if it does not exist
+        os.makedirs(value, exist_ok=True)
 
 
     def _load_checkpoint(self):
         """Load the last checkpoint and restart the training process.
 
         This method finds all the checkpoint files in the output directory, loads the
         last checkpoint (e.g. net100.pt), and restarts the training process from the next step.
         If no checkpoint files are found, the training process starts from the first step.
 
         Returns
         -------
         int
             The step from which the training process should be restarted.
""" - import os import sys import glob - import torch checkpoints = glob.glob("{}/*.pt".format(self.output_directory_path)) @@ -292,8 +368,6 @@ def train(self): ------- None """ - import os - import torch from espfit.utils.units import HARTREE_TO_KCALPERMOL if self.dataset_train is None: @@ -328,21 +402,29 @@ def train(self): torch.save(self.net.state_dict(), checkpoint_file) - def train_sampler(self, sampler_patience=800, neff_threshold=0.2): + def train_sampler(self, sampler_patience=800, neff_threshold=0.2, debug=False): """ Train the Espaloma network model with sampler. TODO ---- + * Export loss to a file (e.g. LossReporter class?) * Should `nsteps` be a variable when calling train_sampler? * Should `sampler_patience` and `neff_threshold` be an instance variable of sampler.BaseSimulation? """ - import os - import torch from espfit.utils.units import HARTREE_TO_KCALPERMOL from espfit.utils.sampler.reweight import SamplerReweight + + # Note: RuntimeError will be raised if copy.deepcopy is used. + # RuntimeError: one of the variables needed for gradient computation has been modified by an inplace + # operation: [torch.cuda.FloatTensor [512, 1]], which is output 0 of AsStridedBackward0, is at version 2; + # expected version 1 instead. Hint: the backtrace further above shows the operation that failed to + # compute its gradient. The variable in question was changed in there or anywhere later. Good luck! + import copy + net_local = copy.deepcopy(self.net) + self.sampler_patience = sampler_patience self.neff_threshold = neff_threshold @@ -361,18 +443,23 @@ def train_sampler(self, sampler_patience=800, neff_threshold=0.2): with torch.autograd.set_detect_anomaly(True): for i in range(self.restart_epoch, self.epochs): epoch = i + 1 # Start from 1 (not zero-indexing) + loss = torch.tensor(0.0) if torch.cuda.is_available(): loss = loss.cuda("cuda:0") + for g in ds_tr_loader: optimizer.zero_grad() if torch.cuda.is_available(): g = g.to("cuda:0") g.nodes["n1"].data["xyz"].requires_grad = True - loss += self.net(g) + loss += self.net(g) # Return each loss component? # Run sampling if epoch > self.sampler_patience: + # Compute effective sample size + #neff = xxx + if neff < self.neff_threshold: # Get Effective sample size if neff < 0: @@ -380,12 +467,26 @@ def train_sampler(self, sampler_patience=800, neff_threshold=0.2): else: _logger.info(f'Effective sample size ({neff}) below threshold ({self.neff_threshold}).') - # Create sampler system from configuration file. Returns list of systems. + # Save espaloma model + self._save_local_model(epoch) + local_model = os.path.join(self.output_directory_path, f"checkpoint{epoch}.pt") + self.save_model(net=net_local, best_model=local_model, model_name=f"net{epoch}.pt", output_directory_path=self.output_directory_path) + + # Define sampler settings to force the use of local espaloma model args = [epoch] + if debug == True: + # DEBUG PURPOSE + from importlib.resources import files + small_molecule_forcefield = str(files('espfit').joinpath("data/forcefield/espaloma-0.3.2.pt")) + else: + small_molecule_forcefield = os.path.join(self.output_directory_path, f"net{epoch}.pt") + override_sampler_kwargs = { - "small_molecule_forcefield": "espfit/data/forcefield/espaloma-0.3.2.pt", # change this to local espaloma model + "small_molecule_forcefield": small_molecule_forcefield, "output_directory_path": self.output_directory_path } + + # Create sampler system from configuration file. Returns list of systems. 
samplers = SamplerReweight.from_toml(self.configfile, *args, **override_sampler_kwargs) for sampler in samplers: _logger.info(f'Running simulation for {sampler.target_name} for {sampler.nsteps} steps...') @@ -401,17 +502,45 @@ def train_sampler(self, sampler_patience=800, neff_threshold=0.2): loss.backward() optimizer.step() + # Report loss? + #self.export_loss(loss, epoch) + if epoch % self.checkpoint_frequency == 0: # Note: returned loss is a joint loss of different units. _loss = HARTREE_TO_KCALPERMOL * loss.pow(0.5).item() _logger.info(f'epoch {epoch}: {_loss:.3f}') - checkpoint_file = os.path.join(self.output_directory_path, f"net{epoch}.pt") - torch.save(self.net.state_dict(), checkpoint_file) + self._save_local_model(epoch) - def validate(): - raise NotImplementedError + def _save_local_model(self, epoch): + """Save local model. + Parameters + ---------- + epoch : int + The epoch number. + + Returns + ------- + None + """ + checkpoint_file = os.path.join(self.output_directory_path, f"checkpoint{epoch}.pt") + torch.save(self.net.state_dict(), checkpoint_file) + + + def report_loss(self, loss, epoch): + """Report loss. - def save_model(): - raise NotImplementedError \ No newline at end of file + Parameters + ---------- + loss : float + The loss value. + + epoch : int + The epoch number. + + Returns + ------- + None + """ + pass \ No newline at end of file diff --git a/espfit/tests/test_app_train_sampler.py b/espfit/tests/test_app_train_sampler.py index babf1bf..e0747a1 100644 --- a/espfit/tests/test_app_train_sampler.py +++ b/espfit/tests/test_app_train_sampler.py @@ -70,9 +70,9 @@ def test_train_sampler(test_load_dataset, test_create_espaloma_from_toml): model.epochs = 10 # Train - model.train_sampler(sampler_patience=3, neff_threshold=0.2) # fails if sampler_patience is < epochs + model.train_sampler(sampler_patience=3, neff_threshold=0.2, debug=True) # fails if sampler_patience is < epochs # Check outputs - import glob + #import glob #assert len(glob.glob(model.output_directory_path + '/*')) > 0 #assert model.sampler is not None \ No newline at end of file From ff0468213ada48b9744933f06247ebbb94a29e4a Mon Sep 17 00:00:00 2001 From: kt Date: Mon, 26 Feb 2024 10:06:37 -0500 Subject: [PATCH 32/59] export loss per epoch to reporter.log --- espfit/app/train.py | 53 +++++++++++++++++++++++---------- espfit/utils/espaloma/module.py | 16 ++++++++-- 2 files changed, 51 insertions(+), 18 deletions(-) diff --git a/espfit/app/train.py b/espfit/app/train.py index f64ecfb..d0053b6 100644 --- a/espfit/app/train.py +++ b/espfit/app/train.py @@ -27,7 +27,7 @@ def __init__(self): @classmethod - def from_toml(cls, filename): + def from_toml(cls, filename, **override_espalomamodel_kwargs): """Create an instance of the class from a TOML configuration file. 
This method reads a TOML file specified by `filename`, extracts the 'espaloma'
@@ -61,7 +61,17 @@ def from_toml(cls, filename):
 
         # Update training settings
         for key, value in config['espaloma']['train'].items():
-            setattr(model, key, value)
+            if hasattr(model, key):
+                setattr(model, key, value)
+            else:
+                raise ValueError(f'Invalid attribute {key}.')
+
+        # Override training settings
+        for key, value in override_espalomamodel_kwargs.items():
+            if hasattr(model, key):
+                setattr(model, key, value)
+            else:
+                raise ValueError(f'Invalid attribute {key}.')
 
         return model
 
@@ -438,6 +448,7 @@ def train_sampler(self, sampler_patience=800, neff_threshold=0.2, debug=False):
         neff = -1
 
         # Train
+        loss_trajectory = {}
         ds_tr_loader = self.dataset_train.view(collate_fn='graph', batch_size=self.batch_size, shuffle=True)
         optimizer = torch.optim.Adam(self.net.parameters(), lr=self.learning_rate)
         with torch.autograd.set_detect_anomaly(True):
@@ -452,8 +463,13 @@ def train_sampler(self, sampler_patience=800, neff_threshold=0.2, debug=False):
                 optimizer.zero_grad()
                 if torch.cuda.is_available():
                     g = g.to("cuda:0")
-                g.nodes["n1"].data["xyz"].requires_grad = True
-                loss += self.net(g)   # Return each loss component?
+                g.nodes["n1"].data["xyz"].requires_grad = True
+
+                # Forward pass
+                # Note that returned values are weighted losses.
+                _loss, loss_dict = self.net(g)
+                # Append loss
+                loss += _loss
 
             # Run sampling
             if epoch > self.sampler_patience:
@@ -495,22 +511,28 @@ def train_sampler(self, sampler_patience=800, neff_threshold=0.2, debug=False):
 
                 # Compute MD loss
                 _logger.info(f'Compute sampler loss.')
-                for sampler in samplers:
-                    loss += sampler.compute_loss() * sampler.weight
+                for sampler_index, sampler in enumerate(samplers):
+                    loss_sampler = sampler.compute_loss() * sampler.weight
+                    loss += loss_sampler
+                    loss_dict[f'sampler{sampler_index}'] = loss_sampler.item()
+
+                # Append individual loss to loss_trajectory
+                loss_trajectory[epoch] = loss_dict
 
             # Update weights
             loss.backward()
             optimizer.step()
 
-            # Report loss?
-            #self.export_loss(loss, epoch)
-
             if epoch % self.checkpoint_frequency == 0:
                 # Note: returned loss is a joint loss of different units.
                 _loss = HARTREE_TO_KCALPERMOL * loss.pow(0.5).item()
                 _logger.info(f'epoch {epoch}: {_loss:.3f}')
                 self._save_local_model(epoch)
 
+        # Export loss trajectory
+        _logger.info(f'Export loss trajectory to a file.')
+        self.report_loss(loss_trajectory)
+
 
     def _save_local_model(self, epoch):
         """Save local model.
 
@@ -528,19 +550,18 @@ def _save_local_model(self, epoch):
         torch.save(self.net.state_dict(), checkpoint_file)
 
 
-    def report_loss(self, loss, epoch):
+    def report_loss(self, loss_trajectory):
         """Report loss.
 
         Parameters
         ----------
-        loss : float
-            The loss value.
-
-        epoch : int
-            The epoch number.
+        loss_trajectory : dict
+            The loss trajectory that stores individual weighted losses for each epoch.
 
         Returns
         -------
         None
         """
-        pass
\ No newline at end of file
+        import pandas as pd
+        df = pd.DataFrame.from_dict(loss_trajectory, orient='index')
+        df.to_csv(os.path.join(self.output_directory_path, 'reporter.log'), sep='\t', float_format='%.4f')
diff --git a/espfit/utils/espaloma/module.py b/espfit/utils/espaloma/module.py
index e3d27b4..1e5ea18 100644
--- a/espfit/utils/espaloma/module.py
+++ b/espfit/utils/espaloma/module.py
@@ -166,7 +166,11 @@ def forward(self, g):
 
         Returns
         -------
-        loss : torch.Tensor
+        loss : torch.Tensor
+            Total weighted loss
+
+        loss_dict : dict
+            Dictionary of individual weighted losses
         """
         loss_energy = self.compute_energy_loss(g) * self.weights['energy']
         loss_force = self.compute_force_loss(g) * self.weights['force']
@@ -180,5 +184,13 @@ def forward(self, g):
         _logger.debug(f"energy: {loss_energy:.5f}, force: {loss_force:.5f}, charge: {loss_charge:.5f}, torsion: {loss_torsion:.5f}, improper: {loss_improper:.5f}")
 
         loss = loss_energy + loss_force + loss_charge + loss_torsion + loss_improper
+
+        loss_dict = {
+            'energy': loss_energy.item(),
+            'force': loss_force.item(),
+            'charge': loss_charge.item(),
+            'torsion': loss_torsion.item(),
+            'improper': loss_improper.item(),
+        }
 
-        return loss
\ No newline at end of file
+        return loss, loss_dict
\ No newline at end of file
From 43f7e9940c458982f8ec04b711f1d86b723a06d7 Mon Sep 17 00:00:00 2001
From: kt
Date: Wed, 28 Feb 2024 13:09:24 -0500
Subject: [PATCH 33/59] use j-coupling names for keyname

---
 .../nucleoside/adenosine/experiment.yml       | 24 ++++++++++++-------
 .../target/nucleoside/cytidine/experiment.yml | 24 ++++++++++++-------
 .../nucleoside/guanosine/experiment.yml       | 24 ++++++++++++-------
 .../target/nucleoside/uridine/experiment.yml  | 24 ++++++++++++-------
 4 files changed, 64 insertions(+), 32 deletions(-)

diff --git a/espfit/data/target/nucleoside/adenosine/experiment.yml b/espfit/data/target/nucleoside/adenosine/experiment.yml
index c4ffa09..b7efedf 100644
--- a/espfit/data/target/nucleoside/adenosine/experiment.yml
+++ b/espfit/data/target/nucleoside/adenosine/experiment.yml
@@ -15,35 +15,43 @@ experiment_1:
   smiles:
   measurement:
     resi_1:
-      beta_1:
+      1H5P:
+        name: beta_1
         value:
         operator:
         error:
-      beta_2:
+      2H5P:
+        name: beta_2
         value:
         operator:
         error:
-      gamma_1:
+      1H5H4:
+        name: gamma_1
         value:
         operator:
         error:
-      gamma_2:
+      2H5H4:
+        name: gamma_2
         value:
         operator:
         error:
-      epsilon:
+      H3P:
+        name: epsilon
         value:
         operator:
         error:
-      nu_1:
+      H1H2:
+        name: nu_1
         value: 6.0
         operator:
         error:
-      nu_2:
+      H2H3:
+        name: nu_2
         value: 5.1
         operator:
         error:
-      nu_3:
+      H3H4:
+        name: nu_3
         value: 3.5
         operator:
         error:
diff --git a/espfit/data/target/nucleoside/cytidine/experiment.yml b/espfit/data/target/nucleoside/cytidine/experiment.yml
index b0354a3..e6bbbe6 100644
--- a/espfit/data/target/nucleoside/cytidine/experiment.yml
+++ b/espfit/data/target/nucleoside/cytidine/experiment.yml
@@ -15,35 +15,43 @@ experiment_1:
   smiles:
   measurement:
     resi_1:
-      beta_1:
+      1H5P:
+        name: beta_1
         value:
         operator:
         error:
-      beta_2:
+      2H5P:
+        name: beta_2
         value:
         operator:
         error:
-      gamma_1:
+      1H5H4:
+        name: gamma_1
         value:
         operator:
         error:
-      gamma_2:
+      2H5H4:
+        name: gamma_2
         value:
         operator:
         error:
-      epsilon:
+      H3P:
+        name: epsilon
         value:
         operator:
         error:
-      nu_1:
+      H1H2:
+        name: nu_1
         value: 4.02
         operator:
         error:
-      nu_2:
+      H2H3:
+        name: nu_2
         value: 5.49
         operator:
         error:
-      nu_3:
+      H3H4:
+        name: nu_3
         value: 6.15
         operator:
         error:
diff --git a/espfit/data/target/nucleoside/guanosine/experiment.yml b/espfit/data/target/nucleoside/guanosine/experiment.yml
index a313122..46bcbc4 100644
--- a/espfit/data/target/nucleoside/guanosine/experiment.yml
+++ b/espfit/data/target/nucleoside/guanosine/experiment.yml
@@ -15,35 +15,43 @@ experiment_1:
   smiles:
   measurement:
     resi_1:
-      beta_1:
+      1H5P:
+        name: beta_1
         value:
         operator:
         error:
-      beta_2:
+      2H5P:
+        name: beta_2
         value:
         operator:
         error:
-      gamma_1:
+      1H5H4:
+        name: gamma_1
         value:
         operator:
         error:
-      gamma_2:
+      2H5H4:
+        name: gamma_2
         value:
         operator:
         error:
-      epsilon:
+      H3P:
+        name: epsilon
         value:
         operator:
         error:
-      nu_1:
+      H1H2:
+        name: nu_1
         value: 5.9
         operator:
         error:
-      nu_2:
+      H2H3:
+        name: nu_2
         value: 5.3
         operator:
         error:
-      nu_3:
+      H3H4:
+        name: nu_3
         value: 4.1
         operator:
         error:
diff --git a/espfit/data/target/nucleoside/uridine/experiment.yml b/espfit/data/target/nucleoside/uridine/experiment.yml
index a75185c..47ba261 100644
--- a/espfit/data/target/nucleoside/uridine/experiment.yml
+++ b/espfit/data/target/nucleoside/uridine/experiment.yml
@@ -15,35 +15,43 @@ experiment_1:
   smiles:
   measurement:
     resi_1:
-      beta_1:
+      1H5P:
+        name: beta_1
         value:
         operator:
         error:
-      beta_2:
+      2H5P:
+        name: beta_2
         value:
         operator:
         error:
-      gamma_1:
+      1H5H4:
+        name: gamma_1
         value:
         operator:
         error:
-      gamma_2:
+      2H5H4:
+        name: gamma_2
         value:
         operator:
         error:
-      epsilon:
+      H3P:
+        name: epsilon
         value:
         operator:
         error:
-      nu_1:
+      H1H2:
+        name: nu_1
         value: 4.59
         operator:
         error:
-      nu_2:
+      H2H3:
+        name: nu_2
         value: 5.45
         operator:
         error:
-      nu_3:
+      H3H4:
+        name: nu_3
         value: 5.77
         operator:
         error:
From bdb143384a5ff46ae6c4593e00f6dc4b19e3b7da Mon Sep 17 00:00:00 2001
From: kt
Date: Wed, 28 Feb 2024 13:10:08 -0500
Subject: [PATCH 34/59] add support to compute sampler loss

---
 espfit/utils/sampler/reweight.py | 60 ++++++++++++++++++++++++++------
 1 file changed, 50 insertions(+), 10 deletions(-)

diff --git a/espfit/utils/sampler/reweight.py b/espfit/utils/sampler/reweight.py
index 47a6983..672002d 100644
--- a/espfit/utils/sampler/reweight.py
+++ b/espfit/utils/sampler/reweight.py
@@ -5,16 +5,28 @@
 _logger = logging.getLogger(__name__)
 
 
-class SamplerReweight(SetupSampler, BaseSimulation):
+class SetupSamplerReweight(SetupSampler, BaseSimulation):
 
     def __init__(self, weight=1, **kwargs):
         super().__init__(**kwargs)
         self.weight = weight
+
+
+    def _get_experiment_data(self, target_class, target_name):
+        import yaml
+        from importlib.resources import files
+
+        yaml_file = str(files('espfit').joinpath(f'data/target/{target_class}/{target_name}/experiment.yml'))
+        with open(yaml_file, 'r') as f:
+            d = yaml.safe_load(f)
+
+        # {'resi_1': {'1H5P': {'name': 'beta_1', 'value': None, 'operator': None, 'error': None}}}
+        return d['experiment_1']['measurement']
 
 
     def get_effective_sample_size(self):
         # Compute effective sample size
-        neff = 0.5
+        neff = 0.5
         return neff
 
 
@@ -23,25 +35,53 @@ def _compute_observable(self):
         if self.target_class == 'nucleoside':
             from espfit.app.analysis import RNASystem
             target = RNASystem()
             target.load_traj(input_directory_path=self.output_directory_path)
-            val = target.compute_jcouplings()
-            _logger.info(f'Computed observable: {val}')
+            pred = target.compute_jcouplings()
+            #_logger.debug(f'Computed observable: {pred}')
         else:
             raise NotImplementedError(f'Observable for {self.target_class} is not implemented.')
 
         import yaml
         with open(os.path.join(self.output_directory_path, 'pred.yaml'), 'w') as f:
-            yaml.dump(val, f, allow_unicode=True)
+            yaml.dump(pred, f, allow_unicode=True)
 
-        return val
+        return pred
 
 
     def compute_loss(self):
         # Compute experimental observable
-        val = self._compute_observable()
-        _logger.info(f'Compute loss')
+        exp = self._get_experiment_data(self.target_class, self.target_name)
+        pred = self._compute_observable()
+
+        loss = []
+        for resi_index, exp_dict in enumerate(exp.values()):
+            for key, value in exp_dict.items():
+                # {'1H5P': {'name': 'beta_1', 'value': None, 'operator': None, 'error': None}}
+                if value['operator'] in ['>', '<', '>=', '<=', '~'] or value['value'] is None:
+                    # Don't use uncertain data
+                    pass
+                else:
+                    exp_value = value['value']
+                    exp_error = value['error']
+                    if exp_error is None:
+                        exp_error = 0.5   # TODO: Check experimental error
+
+                    resi_index = int(resi_index)
+                    pred_value = list(pred.values())[resi_index][key]['avg']
+                    pred_error = list(pred.values())[resi_index][key]['std']   # standard deviation
+
+                    # TODO: change to debug
+                    _logger.info(f'Exp ({resi_index}-{key}): {exp}')
+                    _logger.info(f'Pred ({resi_index}-{key}): {pred}')
+
+                    # Compute loss
+                    numerator = (pred_value - exp_value) ** 2
+                    denominator = (exp_error ** 2 + pred_error ** 2)
+                    loss.append(numerator / denominator)
 
+        # Compute loss
         import torch
-        loss = torch.tensor(0.0)
+        loss_avg = torch.mean(torch.tensor(loss))
+        _logger.info(f'Computed sampler loss: {loss_avg.item()}')
 
-        return loss
\ No newline at end of file
+        return loss_avg
\ No newline at end of file
From 095c87353be204d6c7d646ea839ec90a0c93aeb0 Mon Sep 17 00:00:00 2001
From: kt
Date: Mon, 4 Mar 2024 16:48:03 -0500
Subject: [PATCH 35/59] refactor reweight.py

---
 espfit/app/analysis.py           |  38 ++++--
 espfit/app/sampler.py            |  41 +++++-
 espfit/app/train.py              | 212 ++++++++++++++++---------------
 espfit/data/config/config.toml   |   2 -
 espfit/utils/espaloma/module.py  |   1 +
 espfit/utils/sampler/reweight.py | 110 ++++++++++------
 6 files changed, 241 insertions(+), 163 deletions(-)

diff --git a/espfit/app/analysis.py b/espfit/app/analysis.py
index 5234e53..1b6d97d 100644
--- a/espfit/app/analysis.py
+++ b/espfit/app/analysis.py
@@ -26,11 +26,14 @@ class BaseDataLoader(object):
     load_traj(reference_pdb='solvated.pdb', trajectory_netcdf='traj.nc', atom_indices=None, stride=1):
         Load MD trajectory.
     """
-    def __init__(self, input_directory_path=None, output_directory_path=None):
+    def __init__(self, atomSubset='solute', input_directory_path=None, output_directory_path=None):
         """Initialize base data loader object.
 
         Parameters
         ----------
+        atomSubset : str, default='solute'
+            Subset of atoms to save. Default is 'solute'. Other options 'all' and 'not water'.
+
         input_directory_path : str, optional
             Input directory path. Default is None.
             If None, the current working directory will be used.
@@ -39,6 +42,10 @@ def __init__(self, input_directory_path=None, output_directory_path=None):
             Output directory path. Default is None.
             If None, the current working directory will be used.
         """
+        self.atomSubset = atomSubset
+        if self.atomSubset not in ['solute', 'all', 'not water']:
+            raise ValueError(f"Invalid atomSubset: {self.atomSubset}. Expected 'solute', 'all', or 'not water'.")
+
         if input_directory_path is None:
             input_directory_path = os.getcwd()
         if output_directory_path is None:
@@ -62,7 +69,7 @@ def output_directory_path(self, value):
 
 
     # Should this be a classmethod?
-    def load_traj(self, reference_pdb='solvated.pdb', trajectory_netcdf='traj.nc', atom_indices=None, stride=1, input_directory_path=None):
+    def load_traj(self, reference_pdb='solvated.pdb', trajectory_netcdf='traj.nc', stride=1, input_directory_path=None):
         """Load MD trajectory.
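 
         Loads the reference PDB and the NetCDF trajectory with mdtraj and, unless
         atomSubset is 'all', slices both down to the atoms selected by atomSubset.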
 
         Parameters
@@ -73,10 +80,6 @@ def load_traj(self, reference_pdb='solvated.pdb', trajectory_netcdf='traj.nc', a
         trajectory_netcdf : str, optional
             Trajectory netcdf file name. Default is 'traj.nc'.
 
-        atom_indices : list, optional
-            List of atom indices to load from trajectory. Default is None.
-            If None, all atoms will be loaded.
-
         stride : int, optional
             Stride to load the trajectory. Default is 1.
 
@@ -92,22 +95,31 @@ def load_traj(self, reference_pdb='solvated.pdb', trajectory_netcdf='traj.nc', a
         # Load reference pdb (solvated system)
         pdb = os.path.join(self.input_directory_path, reference_pdb)
         ref_traj = mdtraj.load(pdb)
+
         # Select atoms to load from trajectory
-        if atom_indices is None:
+        if self.atomSubset == 'all':
+            self.atom_indices = None
+            self.ref_traj = ref_traj
+        else:
             self.atom_indices = []
             mdtop = ref_traj.topology
-            res = [ r for r in mdtop.residues if r.name not in ('HOH', 'NA', 'CL', 'K') ]
+            if self.atomSubset == 'solute':
+                res = [ r for r in mdtop.residues if r.name not in ('HOH', 'NA', 'CL', 'K') ]
+            elif self.atomSubset == 'not water':
+                res = [ r for r in mdtop.residues if r.name not in ('HOH',) ]
+
+            # Get atom indices
             for r in res:
                 for a in r.atoms:
                     self.atom_indices.append(a.index)
-        else:
-            self.atom_indices = atom_indices
-        self.ref_traj = ref_traj.atom_slice(self.atom_indices)
-
+            self.ref_traj = ref_traj.atom_slice(self.atom_indices)
+
         # Load trajectory
         netcdf = os.path.join(self.input_directory_path, trajectory_netcdf)
         traj = mdtraj.load(netcdf, top=self.ref_traj.topology, stride=stride)
-        self.traj = traj.atom_slice(self.atom_indices)
+        if self.atom_indices:
+            self.traj = traj.atom_slice(self.atom_indices)
+        else:
+            self.traj = traj
 
 
 class RNASystem(BaseDataLoader):
diff --git a/espfit/app/sampler.py b/espfit/app/sampler.py
index c75a82c..4d431d3 100644
--- a/espfit/app/sampler.py
+++ b/espfit/app/sampler.py
@@ -38,7 +38,7 @@ class BaseSimulation(object):
     export_xml(exportSystem=True, exportState=True, exportIntegrator=True, output_directory_path=None):
         Export serialized system XML file and solvated pdb file.
     """
-    def __init__(self, maxIterations=100, nsteps=250000, atom_indices=None,
+    def __init__(self, maxIterations=100, nsteps=250000, atomSubset='solute',
                  checkpoint_frequency=25000, logging_frequency=250000, netcdf_frequency=250000,
                  output_directory_path=None, input_directory_path=None):
         """Initialize base simulation object.
@@ -51,9 +51,9 @@ def __init__(self, maxIterations=100, nsteps=250000, atom_indices=None,
         nsteps : int, default=250000 (10 ns using 4 fs timestep)
             Number of steps to run the simulation.
 
-        atom_indices : list, default=None
-            List of atom indices to save. If None, save all atoms except water and ions.
-
+        atomSubset : str, default='solute'
+            Subset of atoms to save. Default is 'solute'. Other options 'all' and 'not water'.
+
         checkpoint_frequency : int, default=25000 (1 ns)
             Frequency (in steps) at which to write checkpoint files.
 
@@ -73,11 +73,14 @@ def __init__(self, maxIterations=100, nsteps=250000, atom_indices=None,
         """
         self.maxIterations = maxIterations
         self.nsteps = nsteps
-        self.atom_indices = atom_indices
+        self.atomSubset = atomSubset
        self.checkpoint_frequency = checkpoint_frequency
         self.logging_frequency = logging_frequency
         self.netcdf_frequency = netcdf_frequency
 
+        if self.atomSubset not in ['solute', 'all', 'not water']:
+            raise ValueError(f"Invalid atomSubset: {self.atomSubset}. Expected 'solute', 'all', or 'not water'.")
+
         if output_directory_path is None:
             output_directory_path = os.getcwd()   # Is this right?
         if input_directory_path is None:
@@ -163,10 +166,34 @@ def run(self, output_directory_path=None):
 
         # Select atoms to save
         import mdtraj
-        if self.atom_indices is None:
+        #if self.atomSubset == 'solute':
+        #    self.atom_indices = []
+        #    mdtop = mdtraj.Topology.from_openmm(self.simulation.topology)
+        #    res = [ r for r in mdtop.residues if r.name not in ('HOH', 'NA', 'CL', 'K') ]
+        #    for r in res:
+        #        for a in r.atoms:
+        #            self.atom_indices.append(a.index)
+        #elif self.atomSubset == 'all':
+        #    self.atom_indices = None
+        #elif self.atomSubset == 'not water':
+        #    self.atom_indices = []
+        #    mdtop = mdtraj.Topology.from_openmm(self.simulation.topology)
+        #    res = [ r for r in mdtop.residues if r.name not in ('HOH') ]
+        #    for r in res:
+        #        for a in r.atoms:
+        #            self.atom_indices.append(a.index)
+        #else:
+        #    raise ValueError(f"Invalid atomSubset: {self.atomSubset}. Expected 'solute', 'all', or 'not water'.")
+
+        if self.atomSubset == 'all':
+            self.atom_indices = None
+        else:
             self.atom_indices = []
             mdtop = mdtraj.Topology.from_openmm(self.simulation.topology)
-            res = [ r for r in mdtop.residues if r.name not in ('HOH', 'NA', 'CL', 'K') ]
+            if self.atomSubset == 'solute':
+                res = [ r for r in mdtop.residues if r.name not in ('HOH', 'NA', 'CL', 'K') ]
+            elif self.atomSubset == 'not water':
+                res = [ r for r in mdtop.residues if r.name not in ('HOH',) ]
             for r in res:
                 for a in r.atoms:
                     self.atom_indices.append(a.index)
diff --git a/espfit/app/train.py b/espfit/app/train.py
index d0053b6..3b4d007 100644
--- a/espfit/app/train.py
+++ b/espfit/app/train.py
@@ -328,42 +328,21 @@ def output_directory_path(self, value):
         os.makedirs(value, exist_ok=True)
 
 
-    def _load_checkpoint(self):
-        """Load the last checkpoint and restart the training process.
+    def report_loss(self, loss_trajectory):
+        """Report loss.
 
-        This method finds all the checkpoint files in the output directory, loads the
-        last checkpoint (e.g. net100.pt), and restarts the training process from the next step.
-        If no checkpoint files are found, the training process starts from the first step.
+        Parameters
+        ----------
+        loss_trajectory : dict
+            The loss trajectory that stores individual weighted losses for each epoch.
 
         Returns
         -------
-        int
-            The step from which the training process should be restarted.
+        None
         """
-        import sys
-        import glob
-
-        checkpoints = glob.glob("{}/*.pt".format(self.output_directory_path))
-
-        if checkpoints:
-            n = [ int(c.split('net')[1].split('.')[0]) for c in checkpoints ]
-            n.sort()
-            restart_epoch = n[-1]
-            restart_checkpoint = os.path.join(self.output_directory_path, f"net{restart_epoch}.pt")
-            self.net.load_state_dict(torch.load(restart_checkpoint))
-            logging.info(f'Restarting from ({restart_checkpoint}).')
-        else:
-            restart_epoch = 0
-
-        if restart_epoch >= self.epochs:
-            _logger.info(f'Already trained for {self.epochs} epochs.')
-            sys.exit(0)
-        elif restart_epoch > 0:
-            _logger.info(f'Training for additional {self.epochs-restart_epoch} epochs.')
-        else:
-            _logger.info(f'Training from scratch for {self.epochs} epochs.')
-
-        return restart_epoch
+        import pandas as pd
+        df = pd.DataFrame.from_dict(loss_trajectory, orient='index')
+        df.to_csv(os.path.join(self.output_directory_path, 'reporter.log'), sep='\t', float_format='%.4f')
 
 
     def train(self):
@@ -408,11 +387,10 @@ def train(self):
         # Note: returned loss is a joint loss of different units.
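         # The sqrt and Hartree->kcal/mol scaling below give a kcal/mol-sized
         # number for logging only; it is heuristic because the joint loss sums
         # terms of different units.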
_loss = HARTREE_TO_KCALPERMOL * loss.pow(0.5).item() _logger.info(f'epoch {epoch}: {_loss:.3f}') - checkpoint_file = os.path.join(self.output_directory_path, f"net{epoch}.pt") - torch.save(self.net.state_dict(), checkpoint_file) + self._save_checkpoint(epoch) - def train_sampler(self, sampler_patience=800, neff_threshold=0.2, debug=False): + def train_sampler(self, sampler_patience=800, neff_threshold=0.2, sampler_weight=1.0, debug=False): """ Train the Espaloma network model with sampler. @@ -424,8 +402,7 @@ def train_sampler(self, sampler_patience=800, neff_threshold=0.2, debug=False): """ from espfit.utils.units import HARTREE_TO_KCALPERMOL - from espfit.utils.sampler.reweight import SamplerReweight - + from espfit.utils.sampler.reweight import SetupSamplerReweight # Note: RuntimeError will be raised if copy.deepcopy is used. # RuntimeError: one of the variables needed for gradient computation has been modified by an inplace @@ -433,7 +410,7 @@ def train_sampler(self, sampler_patience=800, neff_threshold=0.2, debug=False): # expected version 1 instead. Hint: the backtrace further above shows the operation that failed to # compute its gradient. The variable in question was changed in there or anywhere later. Good luck! import copy - net_local = copy.deepcopy(self.net) + net_copy = copy.deepcopy(self.net) self.sampler_patience = sampler_patience self.neff_threshold = neff_threshold @@ -444,10 +421,8 @@ def train_sampler(self, sampler_patience=800, neff_threshold=0.2, debug=False): # Load checkpoint self.restart_epoch = self._load_checkpoint() - # Initialize neff to -1 to trigger the first sampling - neff = -1 - # Train + neff = -1 loss_trajectory = {} ds_tr_loader = self.dataset_train.view(collate_fn='graph', batch_size=self.batch_size, shuffle=True) optimizer = torch.optim.Adam(self.net.parameters(), lr=self.learning_rate) @@ -471,52 +446,37 @@ def train_sampler(self, sampler_patience=800, neff_threshold=0.2, debug=False): # Append loss loss += _loss - # Run sampling + # Include sampler loss after certain epochs if epoch > self.sampler_patience: - # Compute effective sample size - #neff = xxx - - if neff < self.neff_threshold: - # Get Effective sample size - if neff < 0: - _logger.info(f'Reached sampler patience {self.sampler_patience}. Run sampler for the first time.') - else: - _logger.info(f'Effective sample size ({neff}) below threshold ({self.neff_threshold}).') - - # Save espaloma model - self._save_local_model(epoch) - local_model = os.path.join(self.output_directory_path, f"checkpoint{epoch}.pt") - self.save_model(net=net_local, best_model=local_model, model_name=f"net{epoch}.pt", output_directory_path=self.output_directory_path) - - # Define sampler settings to force the use of local espaloma model - args = [epoch] - if debug == True: - # DEBUG PURPOSE - from importlib.resources import files - small_molecule_forcefield = str(files('espfit').joinpath("data/forcefield/espaloma-0.3.2.pt")) - else: - small_molecule_forcefield = os.path.join(self.output_directory_path, f"net{epoch}.pt") - - override_sampler_kwargs = { - "small_molecule_forcefield": small_molecule_forcefield, - "output_directory_path": self.output_directory_path - } - - # Create sampler system from configuration file. Returns list of systems. 
- samplers = SamplerReweight.from_toml(self.configfile, *args, **override_sampler_kwargs) - for sampler in samplers: - _logger.info(f'Running simulation for {sampler.target_name} for {sampler.nsteps} steps...') - sampler.minimize() - sampler.run() - - # Compute MD loss + # Run sampling for the first time + if neff == -1: + _logger.info(f'Reached sampler patience epoch={self.sampler_patience}. Run sampler for the first time.') + # Initialize + SamplerReweight = SetupSamplerReweight() + # Create new sampler system using local espaloma model + samplers = self._setup_local_samplers(epoch, net_copy, debug) + SamplerReweight.update(samplers) + SamplerReweight.run() + else: + # If effective sample size is below threshold, re-run sampler + neff = SamplerReweight.get_effective_sample_size() + if neff < self.neff_threshold: + _logger.info(f'Effective sample size ({neff}) below threshold ({self.neff_threshold}).') + samplers = self._setup_local_samplers(epoch, net_copy, debug) + SamplerReweight.update(samplers) + SamplerReweight.run() + + # Compute sampler loss _logger.info(f'Compute sampler loss.') - for sampler_index, sampler in enumerate(samplers): - loss_sampler = sampler.compute_loss() * sampler.weight - loss += loss_sampler - loss_dict[f'sampler{sampler_index}'] = loss_sampler.item() - - # Append individual loss to loss_trajectory + loss_list = SamplerReweight.compute_loss() # list of torch.tensor + for sampler_index, _loss in enumerate(loss_list): + #loss_dict[f'sampler{sampler_index}'] = _loss.item() + _sampler = SamplerReweight.samplers[sampler_index] + loss_dict[f'{_sampler.target_name}'] = _loss.item() + loss += _loss * sampler_weight + + # Append total and individual loss to loss_trajectory + loss_dict['loss'] = loss.item() loss_trajectory[epoch] = loss_dict # Update weights @@ -527,14 +487,53 @@ def train_sampler(self, sampler_patience=800, neff_threshold=0.2, debug=False): # Note: returned loss is a joint loss of different units. _loss = HARTREE_TO_KCALPERMOL * loss.pow(0.5).item() _logger.info(f'epoch {epoch}: {_loss:.3f}') - self._save_local_model(epoch) + self._save_checkpoint(epoch) # Export loss trajectory + # TODO: Report losses at every epoch _logger.info(f'Export loss trajectory to a file.') self.report_loss(loss_trajectory) - def _save_local_model(self, epoch): + def _load_checkpoint(self): + """Load the last checkpoint and restart the training process. + + This method finds all the checkpoint files in the output directory, loads the + last checkpoint (e.g. net100.pt), and restarts the training process from the next step. + If no checkpoint files are found, the training process starts from the first step. + + Returns + ------- + int + The step from which the training process should be restarted. 
+ """ + import sys + import glob + + checkpoints = glob.glob("{}/*.pt".format(self.output_directory_path)) + + if checkpoints: + n = [ int(c.split('net')[1].split('.')[0]) for c in checkpoints ] + n.sort() + restart_epoch = n[-1] + restart_checkpoint = os.path.join(self.output_directory_path, f"net{restart_epoch}.pt") + self.net.load_state_dict(torch.load(restart_checkpoint)) + logging.info(f'Restarting from ({restart_checkpoint}).') + else: + restart_epoch = 0 + + if restart_epoch >= self.epochs: + _logger.info(f'Already trained for {self.epochs} epochs.') + sys.exit(0) + elif restart_epoch > 0: + _logger.info(f'Training for additional {self.epochs-restart_epoch} epochs.') + else: + _logger.info(f'Training from scratch for {self.epochs} epochs.') + + return restart_epoch + + + def _save_checkpoint(self, epoch): """Save local model. Parameters @@ -549,19 +548,32 @@ def _save_local_model(self, epoch): checkpoint_file = os.path.join(self.output_directory_path, f"checkpoint{epoch}.pt") torch.save(self.net.state_dict(), checkpoint_file) - - def report_loss(self, loss_trajecotry): - """Report loss. - Parameters - ---------- - loss : dict - The loss trajectory that stores individual weighted losses for each epoch. + def _setup_local_samplers(self, epoch, net_copy, debug): + from espfit.app.sampler import SetupSampler - Returns - ------- - None - """ - import pandas as pd - df = pd.DataFrame.from_dict(loss_trajecotry, orient='index') - df.to_csv(os.path.join(self.output_directory_path, 'report.log'), sep='\t', float_format='%.4f') + # Save espaloma checkpoint models + self._save_checkpoint(epoch) + # Save checkpoint as temporary espaloma model (force field) + local_model = os.path.join(self.output_directory_path, f"checkpoint{epoch}.pt") + self.save_model(net=net_copy, best_model=local_model, model_name=f"net{epoch}.pt", output_directory_path=self.output_directory_path) + + # Define sampler settings with override arguments + args = [epoch] + if debug == True: + from importlib.resources import files + small_molecule_forcefield = str(files('espfit').joinpath("data/forcefield/espaloma-0.3.2.pt")) + else: + small_molecule_forcefield = os.path.join(self.output_directory_path, f"net{epoch}.pt") + + override_sampler_kwargs = { + "atomSubset": 'all', + "small_molecule_forcefield": small_molecule_forcefield, + "output_directory_path": self.output_directory_path + } + + # Create sampler system from configuration file. Returns list of systems. 
+ samplers = SetupSampler.from_toml(self.configfile, *args, **override_sampler_kwargs) + + return samplers + \ No newline at end of file diff --git a/espfit/data/config/config.toml b/espfit/data/config/config.toml index 31ec82c..a3f9138 100644 --- a/espfit/data/config/config.toml +++ b/espfit/data/config/config.toml @@ -38,7 +38,6 @@ nsteps = 1000 checkpoint_frequency = 10 logging_frequency = 1 netcdf_frequency = 10 -weight = 1 # system setup parameters [[sampler.setup]] @@ -53,4 +52,3 @@ nsteps = 1000 checkpoint_frequency = 10 logging_frequency = 1 netcdf_frequency = 10 -weight = 1 \ No newline at end of file diff --git a/espfit/utils/espaloma/module.py b/espfit/utils/espaloma/module.py index 1e5ea18..6f3fb34 100644 --- a/espfit/utils/espaloma/module.py +++ b/espfit/utils/espaloma/module.py @@ -186,6 +186,7 @@ def forward(self, g): loss = loss_energy + loss_force + loss_charge + loss_torsion + loss_improper loss_dict = { + 'loss': None, 'energy': loss_energy.item(), 'force': loss_force.item(), 'charge': loss_charge.item(), diff --git a/espfit/utils/sampler/reweight.py b/espfit/utils/sampler/reweight.py index 672002d..917ef8d 100644 --- a/espfit/utils/sampler/reweight.py +++ b/espfit/utils/sampler/reweight.py @@ -5,52 +5,58 @@ _logger = logging.getLogger(__name__) -class SetupSamplerReweight(SetupSampler, BaseSimulation): +class SetupSamplerReweight(object): - def __init__(self, weight=1, **kwargs): - super().__init__(**kwargs) - self.weight = weight + def __init__(self): + self.samplers = None + self.samplers_old = None + self.weights = None # list - - def _get_experiment_data(self, target_class, target_name): - import yaml - from importlib.resources import files - yaml_file = str(files('espfit').joinpath(f'data/target/{target_class}/{target_name}/experiment.yml')) - with open(yaml_file, 'r') as f: - d = yaml.safe_load(f) + def run(self): + for sampler in self.samplers: + _logger.info(f'Running simulation for {sampler.target_name} for {sampler.nsteps} steps...') + sampler.minimize() + sampler.run() - # {'resi_1': {'1H5P': {'name': 'beta_1', 'value': None, 'operator': None, 'error': None}}} - return d['experiment_1']['measurement'] + + def update(self, samplers): + # Update sampler + self.samplers_old = self.samplers + self.samplers = samplers def get_effective_sample_size(self): # Compute effective sample size - neff = 0.5 - return neff + # U(x0, theta0) + old_potential_energy = 0 - def _compute_observable(self): - if self.target_class == 'nucleoside': - from espfit.app.analysis import RNASystem - target = RNASystem() - target.load_traj(input_directory_path=self.output_directory_path) - pred = target.compute_jcouplings() - #_logger.debug(f'Computed observable: {pred}') - else: - raise NotImplementedError(f'Observable for {self.target_class} is not implemented.') + # U(x0, theta1) + reduced_potential_energy = 0 - import yaml - with open(os.path.join(self.output_directory_path, 'pred.yaml'), 'w') as f: - yaml.dump(pred, f, allow_unicode=True) + neff = 0.5 + return neff + + + def compute_loss(self): - return pred + loss_list = [] + for sampler in self.samplers: + loss = self._compute_loss_per_system(sampler) # torch.tensor + loss_list.append(loss) + # list of torch.tensor + return loss_list + + + def _compute_loss_per_system(self, sampler): + + import torch - def compute_loss(self): # Compute experimental observable - exp = self._get_experiment_data(self.target_class, self.target_name) - pred = self._compute_observable() + exp = self._get_experiment_data(sampler.target_class, 
sampler.target_name)
+        pred = self._compute_observable(sampler.atomSubset, sampler.target_class, sampler.output_directory_path)
 
         loss = []
         for resi_index, exp_dict in enumerate(exp.values()):
             for key, value in exp_dict.items():
@@ -64,24 +70,46 @@ def compute_loss(self):
                 exp_error = value['error']
                 if exp_error is None:
                     exp_error = 0.5   # TODO: Check experimental error
-
-                resi_index = int(resi_index)
                 pred_value = list(pred.values())[resi_index][key]['avg']
                 pred_error = list(pred.values())[resi_index][key]['std']   # standard deviation
-
-                # TODO: change to debug
-                _logger.info(f'Exp ({resi_index}-{key}): {exp}')
-                _logger.info(f'Pred ({resi_index}-{key}): {pred}')
-
+                _logger.debug(f'Exp ({resi_index}-{key}): {exp}')
+                _logger.debug(f'Pred ({resi_index}-{key}): {pred}')
                 # Compute loss
                 numerator = (pred_value - exp_value) ** 2
                 denominator = (exp_error ** 2 + pred_error ** 2)
                 loss.append(numerator / denominator)
-
-        # Compute loss
-        import torch
         loss_avg = torch.mean(torch.tensor(loss))
         _logger.info(f'Computed sampler loss: {loss_avg.item()}')
-
-        return loss_avg
\ No newline at end of file
+        return loss_avg
+
+
+    def _get_experiment_data(self, target_class, target_name):
+        import yaml
+        from importlib.resources import files
+
+        yaml_file = str(files('espfit').joinpath(f'data/target/{target_class}/{target_name}/experiment.yml'))
+        with open(yaml_file, 'r', encoding='utf8') as f:
+            d = yaml.safe_load(f)
+
+        # {'resi_1': {'1H5P': {'name': 'beta_1', 'value': None, 'operator': None, 'error': None}}}
+        return d['experiment_1']['measurement']
+
+
+    def _compute_observable(self, atomSubset, target_class, output_directory_path):
+        if target_class == 'nucleoside':
+            from espfit.app.analysis import RNASystem
+            target = RNASystem(atomSubset=atomSubset)
+            target.load_traj(input_directory_path=output_directory_path)
+            pred = target.compute_jcouplings()
+            _logger.debug(f'Computed observable: {pred}')
+        else:
+            raise NotImplementedError(f'Observable for {target_class} is not implemented.')
+
+        import yaml
+        with open(os.path.join(output_directory_path, 'pred.yaml'), 'w') as f:
+            yaml.dump(pred, f, allow_unicode=True)
+
+        return pred
 
From 09565a9a6203ec1ada510859c42f3f19b68dd4e4 Mon Sep 17 00:00:00 2001
From: kt
Date: Mon, 4 Mar 2024 21:01:21 -0500
Subject: [PATCH 36/59] fix SetupSampler.from_toml when args is not defined

---
 espfit/app/train.py | 52 ++++++++++++++++++++++-----------------------
 1 file changed, 26 insertions(+), 26 deletions(-)

diff --git a/espfit/app/train.py b/espfit/app/train.py
index 3b4d007..49aeea8 100644
--- a/espfit/app/train.py
+++ b/espfit/app/train.py
@@ -421,8 +421,10 @@ def train_sampler(self, sampler_patience=800, neff_threshold=0.2, sampler_weight
         # Load checkpoint
         self.restart_epoch = self._load_checkpoint()
 
+        # Initialize
+        SamplerReweight = SetupSamplerReweight()
+
         # Train
-        neff = -1
         loss_trajectory = {}
         ds_tr_loader = self.dataset_train.view(collate_fn='graph', batch_size=self.batch_size, shuffle=True)
         optimizer = torch.optim.Adam(self.net.parameters(), lr=self.learning_rate)
@@ -448,29 +450,23 @@ def train_sampler(self, sampler_patience=800, neff_threshold=0.2, sampler_weight
 
             # Include sampler loss after certain epochs
             if epoch > self.sampler_patience:
-                # Run sampling for the first time
-                if neff == -1:
-                    _logger.info(f'Reached sampler patience epoch={self.sampler_patience}. Run sampler for the first time.')
-                    # Initialize
-                    SamplerReweight = SetupSamplerReweight()
-                    # Create new sampler system using local espaloma model
-                    samplers = self._setup_local_samplers(epoch, net_copy, debug)
-                    SamplerReweight.update(samplers)
-                    SamplerReweight.run()
-                else:
-                    # If effective sample size is below threshold, re-run sampler
-                    neff = SamplerReweight.get_effective_sample_size()
-                    if neff < self.neff_threshold:
-                        _logger.info(f'Effective sample size ({neff}) below threshold ({self.neff_threshold}).')
-                        samplers = self._setup_local_samplers(epoch, net_copy, debug)
-                        SamplerReweight.update(samplers)
-                        SamplerReweight.run()
+                # Save checkpoint as local model (force field)
+                _samplers = self._setup_local_samplers(epoch, net_copy, debug)
+                neff = SamplerReweight.get_effective_sample_size(temporary_samplers=_samplers)   # returns -1 if SamplerReweight.samplers is None
+
+                # If effective sample size is below threshold, update SamplerReweight.samplers and re-run simulation
+                if neff < self.neff_threshold:
+                    _logger.info(f'Effective sample size ({neff}) below threshold ({self.neff_threshold}).')
+                    SamplerReweight.samplers = _samplers
+                    SamplerReweight.run()
+
+                # Delete temporary_samplers
+                del _samplers
 
                 # Compute sampler loss
                 _logger.info(f'Compute sampler loss.')
                 loss_list = SamplerReweight.compute_loss()   # list of torch.tensor
                 for sampler_index, _loss in enumerate(loss_list):
-                    #loss_dict[f'sampler{sampler_index}'] = _loss.item()
                     _sampler = SamplerReweight.samplers[sampler_index]
                     loss_dict[f'{_sampler.target_name}'] = _loss.item()
                     loss += _loss * sampler_weight
@@ -549,27 +545,31 @@ def _save_checkpoint(self, epoch):
         torch.save(self.net.state_dict(), checkpoint_file)
 
 
-    def _setup_local_samplers(self, epoch, net_copy, debug):
-        from espfit.app.sampler import SetupSampler
-
-        # Save espaloma checkpoint models
-        self._save_checkpoint(epoch)
+    def _save_local_model(self, epoch, net_copy):
         # Save checkpoint as temporary espaloma model (force field)
+        _logger.info(f'Save checkpoint{epoch}.pt as temporary espaloma model (force field).')
+        self._save_checkpoint(epoch)
         local_model = os.path.join(self.output_directory_path, f"checkpoint{epoch}.pt")
-        self.save_model(net=net_copy, best_model=local_model, model_name=f"net{epoch}.pt", output_directory_path=self.output_directory_path)
+        self.save_model(net=net_copy, best_model=local_model, model_name=f"net.pt", output_directory_path=self.output_directory_path)
+
+
+    def _setup_local_samplers(self, epoch, net_copy, debug):
         from espfit.app.sampler import SetupSampler
+        self._save_local_model(epoch, net_copy)
+
         # Define sampler settings with override arguments
         args = [epoch]
         if debug == True:
             from importlib.resources import files
             small_molecule_forcefield = str(files('espfit').joinpath("data/forcefield/espaloma-0.3.2.pt"))
         else:
-            small_molecule_forcefield = os.path.join(self.output_directory_path, f"net{epoch}.pt")
+            small_molecule_forcefield = os.path.join(self.output_directory_path, f"net.pt")
 
         override_sampler_kwargs = {
             "atomSubset": 'all',
             "small_molecule_forcefield": small_molecule_forcefield,
             "output_directory_path": self.output_directory_path
         }
 
         # Create sampler system from configuration file. Returns list of systems.
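The next patch wires up per-frame potential-energy differences but leaves the effective
sample size hard-coded at 0.5. For reference, a minimal sketch of the importance-weight
(Kish) estimate that such energy differences support; the helper name, the 300 K default,
and the assumption that each entry holds U_old - U_new in kcal/mol are illustrative and
not part of the patch itself:

    import numpy as np

    def effective_sample_size(potential_energy_diff, temperature=300.0):
        # Importance weights w_i ~ exp((U_old - U_new) / kBT), computed in log
        # space for numerical stability, then normalized to sum to one.
        kBT = 0.0019872041 * temperature   # Boltzmann constant in kcal/mol/K
        log_w = np.asarray(potential_energy_diff, dtype=float) / kBT
        log_w -= log_w.max()
        w = np.exp(log_w)
        w /= w.sum()
        # Kish effective sample size, reported as a fraction of the frame count
        # so it is comparable to neff_threshold (1.0 when all weights are equal).
        return 1.0 / (len(w) * np.sum(w ** 2))
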
From f70140f039a1d78eeb6716eaa7b6d5e70270d93a Mon Sep 17 00:00:00 2001 From: kt Date: Mon, 4 Mar 2024 21:02:12 -0500 Subject: [PATCH 37/59] add pseudo code for get_effective_sampler_size --- espfit/app/sampler.py | 4 ++-- espfit/utils/sampler/reweight.py | 33 ++++++++++++++++++++------------ 2 files changed, 23 insertions(+), 14 deletions(-) diff --git a/espfit/app/sampler.py b/espfit/app/sampler.py index 4d431d3..6ec6d5e 100644 --- a/espfit/app/sampler.py +++ b/espfit/app/sampler.py @@ -496,8 +496,8 @@ def from_toml(cls, filename, *args, **override_sampler_kwargs): else: raise ValueError(f"Invalid keyword argument: {key}") - # Update output directory path if epoch is given - if args is not None: + # Update output directory path if args (epoch) is given + if args: if len(args) == 1 and isinstance(args[0], int): sampler.output_directory_path = os.path.join(sampler.output_directory_path, sampler.target_name, f'{args[0]}') else: diff --git a/espfit/utils/sampler/reweight.py b/espfit/utils/sampler/reweight.py index 917ef8d..3142932 100644 --- a/espfit/utils/sampler/reweight.py +++ b/espfit/utils/sampler/reweight.py @@ -9,7 +9,6 @@ class SetupSamplerReweight(object): def __init__(self): self.samplers = None - self.samplers_old = None self.weights = None # list @@ -20,20 +19,30 @@ def run(self): sampler.run() - def update(self, samplers): - # Update sampler - self.samplers_old = self.samplers - self.samplers = samplers - + def get_effective_sample_size(self, temporary_samplers): + + # Check if sampler is None + if self.samplers is None: + return -1 - def get_effective_sample_size(self): # Compute effective sample size + import mdtraj + from openmm.unit import kilocalories_per_mole as kcalpermol + potential_energy_diff = [] + for sampler, temporary_sampler in zip(self.samplers, temporary_samplers): + traj = mdtraj.load(sampler.output_directory_path + '/traj.nc', top=sampler.output_directory_path + '/solvated.pdb') + for i in range(traj.n_frames): + # U(x0, theta0) + sampler.simulation.context.setPositions(traj.openmm_positions(i)) + potential_energy = sampler.simulation.context.getState(getEnergy=True).getPotentialEnergy() + # U(x0, theta1) + temporary_sampler.simulation.context.setPositions(traj.openmm_positions(i)) + reduced_potential_energy = temporary_sampler.simulation.context.getState(getEnergy=True).getPotentialEnergy() + + delta = (potential_energy - reduced_potential_energy).value_in_unit(kcalpermol) + potential_energy_diff.append(delta) - # U(x0, theta0) - old_potential_energy = 0 - - # U(x0, theta1) - reduced_potential_energy = 0 + _logger.info(f'{potential_energy._value}, {reduced_potential_energy._value}, {delta}') neff = 0.5 return neff From 52d32b5cc6cb0463bbbf68b40df04a8864d3c385 Mon Sep 17 00:00:00 2001 From: kt Date: Tue, 5 Mar 2024 18:18:30 -0500 Subject: [PATCH 38/59] clean up --- README.md | 8 +- espfit/app/analysis.py | 62 ++++++----- espfit/app/sampler.py | 76 ++++--------- espfit/app/train.py | 145 ++++++++++++++++--------- espfit/utils/espaloma/module.py | 2 + espfit/utils/graphs.py | 2 - espfit/utils/sampler/reweight.py | 181 ++++++++++++++++++++++++++----- espfit/utils/units.py | 7 +- 8 files changed, 311 insertions(+), 172 deletions(-) diff --git a/README.md b/README.md index b012b38..d8b18a2 100644 --- a/README.md +++ b/README.md @@ -37,12 +37,12 @@ ds.compute_relative_energy() # Create esplama model from espfit.app.train import EspalomaModel filename = 'espfit/data/config/config.toml' -model = EspalomaModel.from_toml(filename) +# Override training settings in 
config.toml +kwargs = {'output_directory_path': 'checkpoints', 'epochs': 100} +model = EspalomaModel.from_toml(filename, **kwargs) model.dataset_train = ds -# Change default training settings -model.epochs = 100 # Set sampler settings -model.train_sampler(sampler_patience=800, neff_threshold=0.2) +model.train_sampler(sampler_patience=800, neff_threshold=0.2, sampler_weight=1) ``` ### Standalone Usage diff --git a/espfit/app/analysis.py b/espfit/app/analysis.py index 1b6d97d..5235935 100644 --- a/espfit/app/analysis.py +++ b/espfit/app/analysis.py @@ -1,11 +1,5 @@ """ Compute experimental observables from MD simulations. - -Notes ------ - -TODO ----- """ import os import numpy as np @@ -17,10 +11,6 @@ class BaseDataLoader(object): """Base class for data loader. - TODO - ---- - * Add more methods to check trajectory information (e.g. number of frames, number of atoms, etc.) - Methods ------- load_traj(reference_pdb='solvated.pdb', trajectory_netcdf='traj.nc', atom_indices=None, stride=1): @@ -68,7 +58,6 @@ def output_directory_path(self, value): os.makedirs(value, exist_ok=True) - # Should this be a classmethod? def load_traj(self, reference_pdb='solvated.pdb', trajectory_netcdf='traj.nc', stride=1, input_directory_path=None): """Load MD trajectory. @@ -86,6 +75,10 @@ def load_traj(self, reference_pdb='solvated.pdb', trajectory_netcdf='traj.nc', s input_directory_path : str, optional Input directory path. Default is None. If None, the current working directory will be used. + + Returns + ------- + None """ import mdtraj @@ -182,15 +175,14 @@ def radian_to_degree(self, a): return a - def compute_jcouplings(self, couplings=None, residues=None): + def compute_jcouplings(self, weights=None, couplings=None, residues=None): """Compute J-couplings from MD trajectory. - TODO - ---- - * Compute confidence interval. - Parameters ---------- + weights : numpy.ndarray, optional + Weights to compute the J-couplings. Default is None. + couplings : str, optional Name of the couplings to compute. Default is None. 
If a list of couplings to be chosen from [H1H2,H2H3,H3H4,1H5P,2H5P,C4Pb,1H5H4,2H5H4,H3P,C4Pe,H1C2/4,H1C6/8] @@ -216,7 +208,7 @@ def compute_jcouplings(self, couplings=None, residues=None): """ import barnaba as bb - _logger.info("Computing J-couplings from MD trajectory...") + _logger.info("Compute J-couplings from MD trajectory") if couplings is not None: # Check if the provided coupling names are valid @@ -228,25 +220,32 @@ def compute_jcouplings(self, couplings=None, residues=None): # residue_list: list of M nucleobases values, resname_list = bb.jcouplings_traj(self.traj, couplings=couplings, residues=residues) + # Convert numpy.float to float to avoid serialization issues + replace_nan_with_none = lambda x: None if np.isscalar(x) and np.isnan(x) else x.item() + # Loop over residues and couplings to store the computed values coupling_dict = dict() for i, resname in enumerate(resname_list): _values = values[:,i,:] # Coupling values of i-th residue values_by_names = dict() for j, coupling_name in enumerate(couplings): - # Function to replace np.nan with None - avg = np.round(_values[:,j].mean(), 5) # Mean value of H1H2 coupling of i-th residue - std = np.round(_values[:,j].std(), 5) # Standard deviation of H1H2 coupling of i-th residue - - replace_nan_with_none = lambda x: None if np.isscalar(x) and np.isnan(x) else x - avg = replace_nan_with_none(avg) - std = replace_nan_with_none(std) - if avg: - avg = avg.item() - if std: - std = std.item() - # Convert numpy.float to float to avoid serialization issues - values_by_names[coupling_name] = {'avg': avg, 'std': std} + avg_raw = np.round(_values[:,j].mean(), 5) # e.g. mean value of H1H2 coupling of i-th residue + std_raw = np.round(_values[:,j].std(), 5) # e.g. standard deviation of H1H2 coupling of i-th residue + avg_raw = replace_nan_with_none(avg_raw) + std_raw = replace_nan_with_none(std_raw) + if weights is not None: + arr = _values[:,j] * weights + #_logger.info(f'non-weighted: {_values[:,j]}') + #_logger.info(f'weights: {weights}') + #_logger.info(f'weighted: {arr}') + avg = np.round(arr.mean(), 5) + std = np.round(arr.std(), 5) + avg = replace_nan_with_none(avg) + std = replace_nan_with_none(std) + else: + avg = avg_raw + std = std_raw + values_by_names[coupling_name] = {'avg': avg, 'std': std, 'avg_raw': avg_raw, 'std_raw': std_raw} coupling_dict[resname] = values_by_names return coupling_dict @@ -266,6 +265,9 @@ def get_available_couplings(self): return available_coupling_names +# +# Future work? 
+# class ProteinSystem(BaseDataLoader): def __init__(self, **kwargs): super(ProteinSystem, self).__init__(**kwargs) diff --git a/espfit/app/sampler.py b/espfit/app/sampler.py index 6ec6d5e..eb211e5 100644 --- a/espfit/app/sampler.py +++ b/espfit/app/sampler.py @@ -123,7 +123,7 @@ def _get_platform(self): from openmmtools.utils import get_fastest_platform platform = get_fastest_platform() platform_name = platform.getName() - _logger.info(f"Fastest platform: {platform_name}") + _logger.debug(f"Fastest platform: {platform_name}") if platform_name == "CUDA": platform.setPropertyDefaultValue('DeterministicForces', 'true') # default is false platform.setPropertyDefaultValue('Precision', 'mixed') # default is single @@ -141,11 +141,10 @@ def minimize(self, output_directory_path=None): ------- None """ - if output_directory_path is not None: self.output_directory_path = output_directory_path # property decorator is called - _logger.info(f"Minimizing system for maximum {self.maxIterations} steps.") + _logger.debug(f"Minimizing system for maximum {self.maxIterations} steps") self.simulation.minimizeEnergy(self.maxIterations) @@ -161,30 +160,14 @@ def run(self, output_directory_path=None): ------- None """ + import mdtraj + from mdtraj.reporters import NetCDFReporter + from openmm.app import CheckpointReporter, StateDataReporter + if output_directory_path is not None: self.output_directory_path = output_directory_path # property decorator is called # Select atoms to save - import mdtraj - #if self.atomSubset == 'solute': - # self.atom_indices = [] - # mdtop = mdtraj.Topology.from_openmm(self.simulation.topology) - # res = [ r for r in mdtop.residues if r.name not in ('HOH', 'NA', 'CL', 'K') ] - # for r in res: - # for a in r.atoms: - # self.atom_indices.append(a.index) - #elif self.atomSubset == 'all': - # self.atom_indices = None - #elif self.atomSubset == 'not water': - # self.atom_indices = [] - # mdtop = mdtraj.Topology.from_openmm(self.simulation.topology) - # res = [ r for r in mdtop.residues if r.name not in ('HOH') ] - # for r in res: - # for a in r.atoms: - # self.atom_indices.append(a.index) - #else: - # raise ValueError(f"Invalid atomSubset: {self.atomSubset}. 
Expected 'solute', 'all', or 'not water'.") - if self.atomSubset == 'all': self.atom_indices = None else: @@ -199,18 +182,13 @@ def run(self, output_directory_path=None): self.atom_indices.append(a.index) # Define reporter - from mdtraj.reporters import NetCDFReporter - from openmm.app import CheckpointReporter, StateDataReporter - self._check_file_exists("traj.nc") self.simulation.reporters.append(NetCDFReporter(os.path.join(self.output_directory_path, f"traj.nc"), min(self.netcdf_frequency, self.nsteps), atomSubset=self.atom_indices)) - self._check_file_exists("checkpoint.chk") self.simulation.reporters.append(CheckpointReporter(os.path.join(self.output_directory_path, f"checkpoint.chk"), min(self.checkpoint_frequency, self.nsteps))) - self._check_file_exists("reporter.log") self.simulation.reporters.append(StateDataReporter(os.path.join(self.output_directory_path, f"reporter.log"), min(self.logging_frequency, self.nsteps), @@ -218,7 +196,7 @@ def run(self, output_directory_path=None): totalEnergy=True, temperature=True, volume=True, density=True, speed=True)) # Run - _logger.info(f"Run MD simulation for {self.nsteps} steps") + _logger.info(f"Running simulation for {self.nsteps} steps...") self.simulation.step(self.nsteps) @@ -248,7 +226,7 @@ def export_xml(self, exportSystem=True, exportState=True, exportIntegrator=True, None """ from openmm import XmlSerializer - _logger.info(f"Serialize and export system") + _logger.debug(f"Serialize and export system") if output_directory_path is not None: # Create a new output directory different from the one specified when the SetupSampler instance was created. @@ -332,6 +310,12 @@ class SetupSampler(BaseSimulation): create_system(biopolymer_file=None, ligand_file=None): Create biopolymer-ligand system and export serialized system XML file and solvated pdb file. + from_toml(filename, *args, **override_sampler_kwargs): + Create SetupSampler from a TOML configuration file. + + from_xml(filename): + Create SetupSampler from a serialized system XML file. + Examples -------- >>> from espfit.app.sampler import SetupSampler @@ -352,18 +336,15 @@ class SetupSampler(BaseSimulation): ['amber/protein.ff14SB.xml', 'amber/RNA.OL3.xml'] : pl-multi (TPO): NG, pl-single: OK, RNA: OK """ def __init__(self, - #small_molecule_forcefield='openff-2.1.0', small_molecule_forcefield='espfit/data/forcefield/espaloma-0.3.2.pt', forcefield_files = ['amber/ff14SB.xml', 'amber/phosaa14SB.xml'], water_model='tip3p', solvent_padding=9.0 * unit.angstroms, ionic_strength=0.15 * unit.molar, - #constraints=app.HBonds, hmass=3.0 * unit.amu, temperature=300.0 * unit.kelvin, pressure=1.0 * unit.atmosphere, pme_tol=2.5e-04, - #nonbonded_method=app.PME, barostat_period=50, timestep=4 * unit.femtoseconds, override_with_espaloma=True, @@ -383,8 +364,6 @@ def __init__(self, The padding distance around the solute in the solvent box. Default is 9.0 * unit.angstroms. ionic_strength : Quantity, optional The ionic strength of the solvent. Default is 0.15 * unit.molar. - constraints : object, optional - The type of constraints to be applied to the system. Default is app.HBonds. hmass : Quantity, optional The mass of the hydrogen atoms. Default is 3.0 * unit.amu. temperature : Quantity, optional @@ -393,8 +372,6 @@ def __init__(self, The pressure of the system. Default is 1.0 * unit.atmosphere. pme_tol : float, optional The Ewald error tolerance for PME electrostatics. Default is 2.5e-04. - nonbonded_method : object, optional - The nonbonded method to be used for the system. Default is app.PME. 
barostat_period : int, optional The frequency at which the barostat is applied. Default is 50. timestep : Quantity, optional @@ -408,12 +385,10 @@ def __init__(self, self.forcefield_files = self._update_forcefield_files(forcefield_files) self.solvent_padding = solvent_padding self.ionic_strength = ionic_strength - #self.constraints = constraints self.hmass = hmass self.temperature = temperature self.pressure = pressure self.pme_tol = pme_tol - #self.nonbonded_method = nonbonded_method self.barostat_period = barostat_period self.timestep = timestep self.override_with_espaloma = override_with_espaloma @@ -459,7 +434,7 @@ def from_toml(cls, filename, *args, **override_sampler_kwargs): raise ValueError("target is not specified in the configuration file") samplers = [] - _logger.info(f'Found {len(config)} systems in the configuration file') + _logger.debug(f'Found {len(config)} systems in the configuration file') for _config in config: sampler = cls() @@ -488,7 +463,6 @@ def from_toml(cls, filename, *args, **override_sampler_kwargs): else: raise ValueError(f"Invalid keyword argument: {key}") - # Expected kwargs: output_directory_path # Pass temporary espaloma model to the sampler if kwargs are given for key, value in override_sampler_kwargs.items(): if hasattr(sampler, key): @@ -661,8 +635,6 @@ def create_system(self, biopolymer_file=None, ligand_file=None): # Initialize system generator. _logger.debug("Initialize system generator") - #forcefield_kwargs = {'removeCMMotion': True, 'ewaldErrorTolerance': self.pme_tol, 'constraints' : self.constraints, 'rigidWater': True, 'hydrogenMass' : self.hmass} - #periodic_forcefield_kwargs = {'nonbondedMethod': self.nonbonded_method} forcefield_kwargs = {'removeCMMotion': True, 'ewaldErrorTolerance': self.pme_tol, 'constraints' : app.HBonds, 'rigidWater': True, 'hydrogenMass' : self.hmass} periodic_forcefield_kwargs = {'nonbondedMethod': app.PME} barostat = MonteCarloBarostat(self.pressure, self.temperature, self.barostat_period) @@ -682,11 +654,11 @@ def create_system(self, biopolymer_file=None, ligand_file=None): template_generator_kwargs=template_generator_kwargs) if ligand_file is not None: - _logger.info("Add molecules to system generator") + _logger.debug("Add molecules to system generator") self._system_generator.template_generator.add_molecules(self._ligand_offmol) # Solvate system - _logger.info("Solvating system...") + _logger.debug("Solvating system...") modeller = app.Modeller(self._complex_topology, self._complex_positions) modeller.addSolvent(self._system_generator.forcefield, model=self.water_model, padding=self.solvent_padding, ionicStrength=self.ionic_strength) @@ -704,7 +676,7 @@ def create_system(self, biopolymer_file=None, ligand_file=None): # (espfit/data/target/testsystems/nucleoside/pdbfixer_min.pdb). # No explicit error message was given. It failed to show the following logging information: # - # _logger.info(f'Requested to generate parameters for residue {residue}') + # _logger.debug(f'Requested to generate parameters for residue {residue}') # https://github.com/openmm/openmmforcefields/blob/main/openmmforcefields/generators/template_generators.py#L285 # # However, it works for protein test systems (espfit/data/target/testsystems/protein-ligand/target.pdb). @@ -712,7 +684,7 @@ def create_system(self, biopolymer_file=None, ligand_file=None): # As a workaround, we will delete the original `self._system_generator` and create a new one to regenerate the system with espaloma. 
# Only water and ion forcefield files will be used to regenerate the system. Solute molecules will be parametrized with espaloma. # - _logger.info("Regenerate system with espaloma.") + _logger.debug("Regenerate system with espaloma.") # Re-create system generator del self._system_generator @@ -757,13 +729,13 @@ def _regenerate_espaloma_system(self): import mdtraj from openff.toolkit import Molecule - _logger.info("Regenerate system with espaloma") + _logger.debug("Regenerate system with espaloma") # Check biopolymer chains mdtop = mdtraj.Topology.from_openmm(self.modeller_solvated_topology) chain_indices = [ chain.index for chain in self.modeller_solvated_topology.chains() ] biopolymer_chain_indices = [ chain_index for chain_index in chain_indices if mdtop.select(f"not (water or resname NA or resname K or resname CL or resname UNK) and chainid == {chain_index}").any() ] - _logger.info(f"Biopolymer chain indices: {biopolymer_chain_indices}") + _logger.debug(f"Biopolymer chain indices: {biopolymer_chain_indices}") # Get OpenMM topology of solute with one residue per molecule. # Espaloma will use residue name "XX". Check conflicting residue names. @@ -772,14 +744,14 @@ def _regenerate_espaloma_system(self): raise Exception('Found conflict residue name in biopolymer.') # Initilize espaloma topology - # TODO: From software engineering point of view, should this be `self.new_solvated_topology` or `new_solvated_topology`? + # TODO: Should this be `self.new_solvated_topology` or `new_solvated_topology`? self.new_solvated_topology = app.Topology() self.new_solvated_topology.setPeriodicBoxVectors(self.modeller_solvated_topology.getPeriodicBoxVectors()) new_atoms = {} # Regenerate biopolymer topology chain_index = 0 - _logger.info(f"Regenerating biopolymer topology...") + _logger.debug(f"Regenerating biopolymer topology...") for chain in self.modeller_solvated_topology.chains(): new_chain = self.new_solvated_topology.addChain(chain.id) # Convert biopolymer into a single residue @@ -844,7 +816,7 @@ def _update_espaloma_topology(self): ------- app.Topology : The updated topology reflecting the new system. """ - _logger.info("Update residue names in espaloma topology.") + _logger.debug("Update residue names in espaloma topology.") # Get original residue names. atom_name_lookup = [] diff --git a/espfit/app/train.py b/espfit/app/train.py index 49aeea8..1f3d977 100644 --- a/espfit/app/train.py +++ b/espfit/app/train.py @@ -3,12 +3,12 @@ TODO ---- -* Export loss to a file (e.g. LossReporter class?) * Add support to use multiple GPUs * Improve how data are parsed using dataclasses or pydantic """ import os import torch +import espaloma as esp import logging _logger = logging.getLogger(__name__) @@ -18,9 +18,9 @@ class EspalomaBase(object): def __init__(self): # Check if GPU is available if torch.cuda.is_available(): - _logger.info('GPU is available for training.') + _logger.debug('GPU is available for training.') else: - _logger.info('GPU is not available for training.') + _logger.debug('GPU is not available for training.') # Check torch data type _logger.debug(f'Torch data type is {torch.get_default_dtype()}') @@ -40,6 +40,10 @@ def from_toml(cls, filename, **override_espalomamodel_kwargs): filename : str Path to the TOML file containing the configuration for the espaloma model. + override_espalomamodel_kwargs : dict + A dictionary of keyword arguments to override the default settings for the + espaloma model. 
+ Returns ------- object @@ -92,9 +96,6 @@ def _get_base_module(espaloma_config): list A list of modules for the Espaloma network model. """ - - import espaloma as esp - # GNN gnn_method = 'SAGEConv' gnn_options = {} @@ -167,7 +168,6 @@ def create_model(espaloma_config): torch.nn.Sequential The constructed Espaloma network model. """ - import espaloma as esp from espfit.utils.espaloma.module import GetLoss # Get base model @@ -210,8 +210,6 @@ def save_model(self, net=None, best_model=None, model_name='espaloma.pt', output ------- None """ - import espaloma as esp - if output_directory_path is not None: os.makedirs(output_directory_path, exist_ok=True) else: @@ -328,31 +326,39 @@ def output_directory_path(self, value): os.makedirs(value, exist_ok=True) - def report_loss(self, loss_trajecotry): + def report_loss(self, epoch, loss_dict): """Report loss. Parameters ---------- - loss : dict - The loss trajectory that stores individual weighted losses for each epoch. + loss_dict : dict + The loss trajectory that stores individual weighted losses at a given epoch. Returns ------- None """ import pandas as pd - df = pd.DataFrame.from_dict(loss_trajecotry, orient='index') - df.to_csv(os.path.join(self.output_directory_path, 'reporter.log'), sep='\t', float_format='%.4f') + df = pd.DataFrame.from_dict(loss_dict, orient='index').T + df.insert(0, 'epoch', epoch) + + log_file_path = os.path.join(self.output_directory_path, 'reporter.log') + + if os.path.exists(log_file_path): + existing_headers = pd.read_csv(log_file_path, sep='\t', nrows=0).columns.tolist() + if set(df.columns) != set(existing_headers): + df_old = pd.read_csv(log_file_path, sep='\t') + df = pd.concat([df_old, df], ignore_index=True) + else: + df.to_csv(log_file_path, sep='\t', float_format='%.4f', index=False, header=False, mode='a') + else: + df.to_csv(log_file_path, sep='\t', float_format='%.4f', index=False) def train(self): """ Train the Espaloma network model. - TODO - ---- - * Export training settings to a file? - Returns ------- None @@ -394,12 +400,23 @@ def train_sampler(self, sampler_patience=800, neff_threshold=0.2, sampler_weight """ Train the Espaloma network model with sampler. - TODO - ---- - * Export loss to a file (e.g. LossReporter class?) - * Should `nsteps` be a variable when calling train_sampler? - * Should `sampler_patience` and `neff_threshold` be an instance variable of sampler.BaseSimulation? + Parameters + ---------- + sampler_patience : int, default=800 + The number of epochs to wait before using sampler. + + neff_threshold : float, default=0.2 + The minimum effective sample size threshold. + sampler_weight : float, default=1.0 + The weight for the sampler loss. + + debug : bool, default=False + If True, use espaloma-0.3.pt for debugging. 
+ + Returns + ------- + None """ from espfit.utils.units import HARTREE_TO_KCALPERMOL from espfit.utils.sampler.reweight import SetupSamplerReweight @@ -425,7 +442,6 @@ def train_sampler(self, sampler_patience=800, neff_threshold=0.2, sampler_weight SamplerReweight = SetupSamplerReweight() # Train - loss_trajectory = {} ds_tr_loader = self.dataset_train.view(collate_fn='graph', batch_size=self.batch_size, shuffle=True) optimizer = torch.optim.Adam(self.net.parameters(), lr=self.learning_rate) with torch.autograd.set_detect_anomaly(True): @@ -441,55 +457,44 @@ def train_sampler(self, sampler_patience=800, neff_threshold=0.2, sampler_weight if torch.cuda.is_available(): g = g.to("cuda:0") g.nodes["n1"].data["xyz"].requires_grad = True - - # Forward pass - # Note that returned values are weighted losses. + _loss, loss_dict = self.net(g) - # Append loss loss += _loss - # Include sampler loss after certain epochs if epoch > self.sampler_patience: - # Save checkpoint as local model (force field) - _samplers = self._setup_local_samplers(epoch, net_copy, debug) - neff = SamplerReweight.get_effective_sample_size(temporary_samplers=_samplers) # returns -1 if SamplerReweight.samplers is None + # Save checkpoint as local model (net.pt) + samplers = self._setup_local_samplers(epoch, net_copy, debug) + # neff_min is -1 if SamplerReweight.samplers is None + neff_min = SamplerReweight.get_effective_sample_size(temporary_samplers=samplers) # If effective sample size is below threshold, update SamplerReweight.samplers and re-run simulaton - if neff < self.neff_threshold: - _logger.info(f'Effective sample size ({neff}) below threshold ({self.neff_threshold}).') - SamplerReweight.samplers = _samplers + if neff_min < self.neff_threshold: + _logger.info(f'Minimum effective sample size ({neff_min:.3f}) below threshold ({self.neff_threshold})') + SamplerReweight.samplers = samplers SamplerReweight.run() - - # Delete temporary_samplers - del _samplers + del samplers # Compute sampler loss - _logger.info(f'Compute sampler loss.') loss_list = SamplerReweight.compute_loss() # list of torch.tensor - for sampler_index, _loss in enumerate(loss_list): - _sampler = SamplerReweight.samplers[sampler_index] - loss_dict[f'{_sampler.target_name}'] = _loss.item() - loss += _loss * sampler_weight + for sampler_index, sampler_loss in enumerate(loss_list): + sampler = SamplerReweight.samplers[sampler_index] + loss += sampler_loss * sampler_weight + loss_dict[f'{sampler.target_name}'] = sampler_loss.item() + loss_dict['neff'] = neff_min - # Append total and individual loss to loss_trajectory loss_dict['loss'] = loss.item() - loss_trajectory[epoch] = loss_dict + self.report_loss(epoch, loss_dict) - # Update weights + # Back propagate loss.backward() optimizer.step() if epoch % self.checkpoint_frequency == 0: # Note: returned loss is a joint loss of different units. - _loss = HARTREE_TO_KCALPERMOL * loss.pow(0.5).item() - _logger.info(f'epoch {epoch}: {_loss:.3f}') + #_loss = HARTREE_TO_KCALPERMOL * loss.pow(0.5).item() + _logger.info(f'Epoch {epoch}: loss={loss.item():.3f}') self._save_checkpoint(epoch) - # Export loss trajectory - # TODO: Report losses at every epoch - _logger.info(f'Export loss trajectory to a file.') - self.report_loss(loss_trajectory) - def _load_checkpoint(self): """Load the last checkpoint and restart the training process. @@ -546,14 +551,46 @@ def _save_checkpoint(self, epoch): def _save_local_model(self, epoch, net_copy): + """Save local model (force field). 
+
+        Parameters
+        ----------
+        epoch : int
+            The epoch number.
+
+        net_copy : torch.nn.Sequential
+            A deep copy of the Espaloma network model.
+
+        Returns
+        -------
+        None
+        """
         # Save checkpoint as temporary espaloma model (force field)
-        _logger.info(f'Save checkpoint{epoch}.pt as temporary espaloma model (force field).')
+        _logger.info(f'Save checkpoint{epoch}.pt as temporary espaloma model (net.pt)')
         self._save_checkpoint(epoch)
         local_model = os.path.join(self.output_directory_path, f"checkpoint{epoch}.pt")
         self.save_model(net=net_copy, best_model=local_model, model_name=f"net.pt", output_directory_path=self.output_directory_path)
 
 
     def _setup_local_samplers(self, epoch, net_copy, debug):
+        """Setup local samplers.
+
+        Parameters
+        ----------
+        epoch : int
+            The epoch number.
+
+        net_copy : torch.nn.Sequential
+            A deep copy of the Espaloma network model.
+
+        debug : bool
+            If True, use espaloma-0.3.2.pt for debugging.
+
+        Returns
+        -------
+        list
+            A list of sampler systems.
+        """
         from espfit.app.sampler import SetupSampler
 
         self._save_local_model(epoch, net_copy)
diff --git a/espfit/utils/espaloma/module.py b/espfit/utils/espaloma/module.py
index 6f3fb34..642d91e 100644
--- a/espfit/utils/espaloma/module.py
+++ b/espfit/utils/espaloma/module.py
@@ -56,6 +56,8 @@ class GetLoss(torch.nn.Module):
     compute_improper_loss(g):
         Compute improper l2 regularization
 
+    forward(g):
+        Compute joint loss
     """
     def __init__(self, weights={'energy': 1.0, 'force': 1.0, 'charge': 1.0, 'torsion': 1.0, 'improper': 1.0}):
         """Define loss function.
diff --git a/espfit/utils/graphs.py b/espfit/utils/graphs.py
index 3ab1fd0..32adeb1 100644
--- a/espfit/utils/graphs.py
+++ b/espfit/utils/graphs.py
@@ -585,8 +585,6 @@ def _merge_graphs(ds):
         for key in g.nodes['n1'].data.keys():
             if key not in ["q_ref", "idxs", "h0"]:
                 for i in range(1, len(ds)):
-                    if key == "xyz":
-                        n_confs = ds[i].nodes['n1'].data['xyz'].shape[1]
                     g.nodes['n1'].data[key] = torch.cat((g.nodes['n1'].data[key], ds[i].nodes['n1'].data[key]), dim=1)
 
     return g
diff --git a/espfit/utils/sampler/reweight.py b/espfit/utils/sampler/reweight.py
index 3142932..58b65ad 100644
--- a/espfit/utils/sampler/reweight.py
+++ b/espfit/utils/sampler/reweight.py
@@ -1,18 +1,46 @@
+"""
+Compute effective sample size and weights for each simulation.
+
+TODO
+----
+* Check J-coupling experimental error. Currently, fixed to 0.5 Hz.
+"""
 import os
 import logging
-from espfit.app.sampler import SetupSampler, BaseSimulation
 
 _logger = logging.getLogger(__name__)
 
 
 class SetupSamplerReweight(object):
+    """Setup sampler for reweighting simulation.
+
+    This class is responsible for setting up the sampler for reweighting simulation.
+    It provides methods to run the simulation, compute the effective sample size,
+    compute the loss, and compute the weighted observable.
+
+    Methods
+    -------
+    run():
+        Runs the simulation for each sampler.
+
+    get_effective_sample_size(temporary_samplers):
+        Computes the effective sample size and sampling weights for each sampler.
+
+    compute_loss():
+        Computes the loss for each sampler.
+    """
     def __init__(self):
        self.samplers = None
-        self.weights = None   # list
+        self.weights = dict()   # {'target_name': {'neff': neff, 'weights': w_i}}
 
 
     def run(self):
+        """Runs the simulation for each sampler.
+ + Returns + ------- + None + """ for sampler in self.samplers: _logger.info(f'Running simulation for {sampler.target_name} for {sampler.nsteps} steps...') sampler.minimize() @@ -20,17 +48,42 @@ def run(self): def get_effective_sample_size(self, temporary_samplers): + """Computes the effective sample size and sampling weights for each sampler. + + Parameters + ---------- + temporary_samplers : list + List of temporary samplers. + + Returns + ------- + float + The minimum effective sample size among all samplers. + """ + import mdtraj + import numpy as np + from openmm.unit import kilocalories_per_mole as kcalpermol + from espfit.utils.units import KB_T_KCALPERMOL - # Check if sampler is None if self.samplers is None: return -1 - # Compute effective sample size - import mdtraj - from openmm.unit import kilocalories_per_mole as kcalpermol - potential_energy_diff = [] for sampler, temporary_sampler in zip(self.samplers, temporary_samplers): + _logger.info(f'Compute effective sample size and sampling weights for {sampler.target_name}') + + # Get temperature + temp0 = sampler.temperature._value + temp1 = temporary_sampler.temperature._value + assert temp0 == temp1, f'Temperature should be equivalent but got sampler {temp0} K and temporary sampler {temp1} K' + beta = 1 / (KB_T_KCALPERMOL * temp0) + _logger.debug(f'beta temperature in kcal/mol: {beta}') + + # Get position from trajectory traj = mdtraj.load(sampler.output_directory_path + '/traj.nc', top=sampler.output_directory_path + '/solvated.pdb') + _logger.info(f'Found {traj.n_frames} frames in trajectory') + + # Compute weights and effective sample size + log_w = [] for i in range(traj.n_frames): # U(x0, theta0) sampler.simulation.context.setPositions(traj.openmm_positions(i)) @@ -38,34 +91,65 @@ def get_effective_sample_size(self, temporary_samplers): # U(x0, theta1) temporary_sampler.simulation.context.setPositions(traj.openmm_positions(i)) reduced_potential_energy = temporary_sampler.simulation.context.getState(getEnergy=True).getPotentialEnergy() - - delta = (potential_energy - reduced_potential_energy).value_in_unit(kcalpermol) - potential_energy_diff.append(delta) - - _logger.info(f'{potential_energy._value}, {reduced_potential_energy._value}, {delta}') - - neff = 0.5 - return neff + # deltaU = U(x0, theta1) - U(x0, theta0) + delta = (reduced_potential_energy - potential_energy).value_in_unit(kcalpermol) + # log_w = ln(exp(-beta * delta)) + w = -1 * beta * delta + log_w.append(w) + + #_logger.debug(f'U(x0, theta0): {potential_energy.value_in_unit(kcalpermol):10.3f} kcal/mol') + #_logger.debug(f'U(x0, theta1): {reduced_potential_energy.value_in_unit(kcalpermol):10.3f} kcal/mol') + #_logger.debug(f'deltaU: {delta:10.3f} kcal/mol') + #_logger.debug(f'log_w: {w:10.3f}') + + # Compute weights and effective sample size (ratio: 0 to 1) + w_i = np.exp(log_w) / np.sum(np.exp(log_w)) + neff = np.sum(w_i) ** 2 / np.sum(w_i ** 2) / len(w_i) + #_logger.debug(f'w_i_sum: {np.sum(w_i):10.3f}') + #_logger.debug(f'neff: {neff:10.3f}') + + self.weights[f'{sampler.target_name}'] = {'neff': neff, 'weights': w_i} + #_logger.info(f'{self.weights}') + neffs = [self.weights[key]['neff'] for key in self.weights.keys()] + + return min(neffs) def compute_loss(self): + """Computes the loss for each sampler. + Returns + ------- + list + List of torch tensors representing the loss for each sampler. 
+ """ loss_list = [] for sampler in self.samplers: + _logger.info(f'Compute loss for {sampler.target_name}') loss = self._compute_loss_per_system(sampler) # torch.tensor loss_list.append(loss) - # list of torch.tensor return loss_list def _compute_loss_per_system(self, sampler): - + """Computes the loss per system for a given sampler. + + Parameters + ---------- + sampler : object + The sampler object. + + Returns + ------- + torch.Tensor + The loss per system as a torch tensor. + """ import torch # Compute experimental observable exp = self._get_experiment_data(sampler.target_class, sampler.target_name) - pred = self._compute_observable(sampler.atomSubset, sampler.target_class, sampler.output_directory_path) + pred = self._compute_weighted_observable(sampler.atomSubset, sampler.target_name, sampler.output_directory_path) loss = [] for resi_index, exp_dict in enumerate(exp.values()): @@ -84,18 +168,32 @@ def _compute_loss_per_system(self, sampler): pred_error = list(pred.values())[resi_index][key]['std'] # standard deviation _logger.debug(f'Exp ({resi_index}-{key}): {exp}') _logger.debug(f'Pred ({resi_index}-{key}): {pred}') - # Compute loss + numerator = (pred_value - exp_value) ** 2 dominator = (exp_error ** 2 + pred_error ** 2) loss.append(numerator / dominator) # Compute loss loss_avg = torch.mean(torch.tensor(loss)) - _logger.info(f'Computed sampler loss: {loss_avg.item()}') + _logger.info(f'Sampler loss: {loss_avg.item():.3f}') return loss_avg def _get_experiment_data(self, target_class, target_name): + """Retrieves the experimental data for a given target. + + Parameters + ---------- + target_class : str + The class of the target. + + target_name : str + The name of the target. + + Returns + ------- + dict : The experimental data for the target. + """ import yaml from importlib.resources import files @@ -107,17 +205,42 @@ def _get_experiment_data(self, target_class, target_name): return d['experiment_1']['measurement'] - def _compute_observable(self, atomSubset, target_class, output_directory_path): - if target_class == 'nucleoside': - from espfit.app.analysis import RNASystem - target = RNASystem(atomSubset=atomSubset) - target.load_traj(input_directory_path=output_directory_path) - pred = target.compute_jcouplings() - _logger.debug(f'Computed observable: {pred}') - else: - raise NotImplementedError(f'Observable for {target_class} is not implemented.') + def _compute_weighted_observable(self, atomSubset, target_name, output_directory_path): + """Computes the weighted observable for a given target. + + Parameters + ---------- + atomSubset : str + The atom subset. + target_name : str + The name of the target. + + output_directory_path : str + The output directory path. + + Returns + ------- + dict : The computed weighted observable. 
+        """
         import yaml
+        from espfit.app.analysis import RNASystem
+
+        # Load trajectory
+        target = RNASystem(atomSubset=atomSubset)
+        target.load_traj(input_directory_path=output_directory_path)
+
+        # Compute observable
+        if self.weights.keys():
+            #print('weights key found')
+            #print(f'{self.weights}')
+            pred = target.compute_jcouplings(weights=self.weights[target_name]['weights'])
+        else:
+            #print('weights key not found')
+            pred = target.compute_jcouplings(weights=None)
+        _logger.debug(f'Computed observable: {pred}')
+
+        # Export observable
         with open(os.path.join(output_directory_path, 'pred.yaml'), 'w') as f:
             yaml.dump(pred, f, allow_unicode=True)
diff --git a/espfit/utils/units.py b/espfit/utils/units.py
index 3d5c2c4..9db83d5 100644
--- a/espfit/utils/units.py
+++ b/espfit/utils/units.py
@@ -6,12 +6,17 @@
 hartree = 1 * ureg.hartree
 bohr = 1 * ureg.bohr
 angstrom = 1 * ureg.angstrom
+kelvin = 1 * ureg.kelvin
+kB = ureg.boltzmann_constant
+kBT = kB * kelvin
+kcalpermol = ureg.kilocalorie/(ureg.avogadro_constant*ureg.mole)
 
 # Conversion factors
 #HARTEE_TO_KCALPERMOL = 627.509
-#BOHR_TO_ANGSTROMS = 0.529
-HARTREE_TO_KCALPERMOL = hartree.to(ureg.kilocalorie/(ureg.avogadro_constant*ureg.mole)).magnitude
+HARTREE_TO_KCALPERMOL = hartree.to(kcalpermol).magnitude
 BOHR_TO_ANGSTROMS = bohr.to(ureg.angstrom).magnitude
+KB_T_KCALPERMOL = kBT.to(kcalpermol).magnitude
 
 
 def convert_string_to_unit(unit_string):
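For reference, the quantities this patch wires together, `KB_T_KCALPERMOL` from `espfit/utils/units.py` and the per-frame energy differences in `SetupSamplerReweight.get_effective_sample_size`, amount to a standard self-normalized importance-weight calculation. The following is a minimal numpy-only sketch of that arithmetic, not espfit code: the hard-coded Boltzmann constant and the log-sum-exp shift (added here for numerical stability) are assumptions of this sketch.

```python
import numpy as np

KB_KCALPERMOL = 0.0019872041  # Boltzmann constant in kcal/(mol K); plays the role of KB_T_KCALPERMOL

def effective_sample_size(delta_u, temperature=300.0):
    """Kish effective sample size (a ratio in (0, 1]) and weights from per-frame
    energy differences delta_u = U(x0, theta1) - U(x0, theta0) in kcal/mol."""
    beta = 1.0 / (KB_KCALPERMOL * temperature)
    log_w = -beta * np.asarray(delta_u)  # log importance weight per frame
    log_w -= log_w.max()                 # stabilize the exponentials (not in the patch)
    w = np.exp(log_w)
    w /= w.sum()                         # self-normalized weights, sum to 1
    neff = 1.0 / (np.sum(w ** 2) * len(w))
    return neff, w

# Small energy perturbations keep neff near 1; large ones collapse it toward 1/n_frames.
neff, w = effective_sample_size(np.random.normal(0.0, 0.5, size=100))
```

The returned weights are what `_compute_weighted_observable` passes to `compute_jcouplings(weights=...)`, so a low `neff` signals that a handful of frames dominate the reweighted observable and the simulation should be re-run with the updated force field.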
From 6a0f43db4517c59d273743a09bf6d8202deae96d Mon Sep 17 00:00:00 2001
From: kt
Date: Wed, 6 Mar 2024 13:17:54 -0500
Subject: [PATCH 39/59] fix docstring

---
 espfit/utils/logging.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/espfit/utils/logging.py b/espfit/utils/logging.py
index 97b825a..0fd0f22 100644
--- a/espfit/utils/logging.py
+++ b/espfit/utils/logging.py
@@ -11,8 +11,8 @@ def set_logging_level(level):
 
     Parameters
     ----------
-    level : int
-        The logging level. For example, logging.INFO.
+    level : str
+        The logging level. Options are [NOTSET, DEBUG, INFO, WARNING, ERROR, CRITICAL, FATAL].
 
     Returns
     -------
From fd052d53b8c00e0c1f5a86173397ff90e6e7af5e Mon Sep 17 00:00:00 2001
From: kt
Date: Wed, 6 Mar 2024 14:34:01 -0500
Subject: [PATCH 40/59] remove old comment

---
 espfit/utils/sampler/reweight.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/espfit/utils/sampler/reweight.py b/espfit/utils/sampler/reweight.py
index 58b65ad..cfb14bc 100644
--- a/espfit/utils/sampler/reweight.py
+++ b/espfit/utils/sampler/reweight.py
@@ -165,7 +165,7 @@ def _compute_loss_per_system(self, sampler):
                 exp_error = 0.5   # TODO: Check experimental error
                 resi_index = int(resi_index)
                 pred_value = list(pred.values())[resi_index][key]['avg']
-                pred_error = list(pred.values())[resi_index][key]['std']  # standard deviation
+                pred_error = list(pred.values())[resi_index][key]['std']
                 _logger.debug(f'Exp ({resi_index}-{key}): {exp}')
                 _logger.debug(f'Pred ({resi_index}-{key}): {pred}')
 
@@ -232,11 +232,8 @@ def _compute_weighted_observable(self, atomSubset, target_name, output_directory
 
         # Compute observable
         if self.weights.keys():
-            #print('weights key found')
-            #print(f'{self.weights}')
             pred = target.compute_jcouplings(weights=self.weights[target_name]['weights'])
         else:
-            #print('weights key not found')
             pred = target.compute_jcouplings(weights=None)
         _logger.debug(f'Computed observable: {pred}')
 
From 3ad0aa926d2bd2e13b66348cddb95ae2c136daeb Mon Sep 17 00:00:00 2001
From: kt
Date: Wed, 6 Mar 2024 14:34:20 -0500
Subject: [PATCH 41/59] uncomment version

---
 espfit/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/espfit/__init__.py b/espfit/__init__.py
index d312d34..564e5cb 100644
--- a/espfit/__init__.py
+++ b/espfit/__init__.py
@@ -25,4 +25,4 @@
 #from .espfit import *
 
 
-#from ._version import __version__
+from ._version import __version__
From 19f442f232871b2cf3a2d91d51fa5842e03ec77f Mon Sep 17 00:00:00 2001
From: kt
Date: Wed, 6 Mar 2024 14:36:25 -0500
Subject: [PATCH 42/59] support minimum energy conformer for each chunk when
 reshaping graphs

---
 espfit/utils/graphs.py | 31 +++++++++++++++++++++++++------
 1 file changed, 25 insertions(+), 6 deletions(-)

diff --git a/espfit/utils/graphs.py b/espfit/utils/graphs.py
index 32adeb1..75d47f4 100644
--- a/espfit/utils/graphs.py
+++ b/espfit/utils/graphs.py
@@ -440,7 +440,7 @@ def compute_relative_energy(self):
         del new_graphs
 
 
-    def reshape_conformation_size(self, n_confs=50):
+    def reshape_conformation_size(self, n_confs=50, include_min_energy_conf=False):
         """Reshape conformation size.
 
         This is a work around to handle different graph size (shape). DGL requires at least one dimension with same size.
@@ -469,17 +469,18 @@
         self._remove_node_features()
 
         new_graphs = []
+        n_confs_cache = n_confs
         for i, g in enumerate(self.graphs):
             n = g.nodes['n1'].data['xyz'].shape[1]
 
             if n == n_confs:
-                _logger.info(f"Mol #{i} ({n} conformations)")
+                _logger.info(f"Mol #{i} ({n} conformers)")
                 new_graphs.append(g)
 
             elif n < n_confs:
                 random.seed(self.random_seed)
                 index_random = random.choices(range(0, n), k=n_confs-n)
-                _logger.info(f"Randomly select {len(index_random)} conformations from Mol #{i} ({n} conformations)")
+                _logger.info(f"Randomly select {len(index_random)} conformers from Mol #{i} ({n} conformers)")
 
                 _g = copy.deepcopy(g)
                 _g.nodes["g"].data["u_ref"] = torch.cat((_g.nodes['g'].data['u_ref'], _g.nodes['g'].data['u_ref'][:, index_random]), dim=-1)
@@ -488,9 +489,17 @@
                 new_graphs.append(_g)
 
             else:
-                _logger.info(f"Shuffling Mol #{i} ({n} conformations) and splitting into {n_confs}")
                 random.seed(self.random_seed)
                 idx_range = random.sample(range(n), k=n)
+
+                # Get index for minimum energy conformer
+                if include_min_energy_conf:
+                    index_min = [g.nodes['g'].data['u_ref'].argmin().item()]
+                    n_confs = n_confs_cache - 1
+                    _logger.info(f"Shuffle Mol #{i} ({n} conformers) and split into {n_confs} conformers and add minimum energy conformer (index #{index_min[0]})")
+                else:
+                    _logger.info(f"Shuffle Mol #{i} ({n} conformers) and split into {n_confs} conformers")
+
                 for j in range(n // n_confs + 1):
                     _g = copy.deepcopy(g)
 
@@ -498,7 +507,12 @@
                         index = range(j*n_confs, n)
                         random.seed(self.random_seed)
                         index_random = random.choices(range(0, n), k=(j+1)*n_confs-n)
-                        _logger.debug(f"Iteration {j}: Randomly select {len(index_random)} conformers")
+
+                        if include_min_energy_conf:
+                            index_random = index_random + index_min
+                            _logger.debug(f"Iteration {j}: Randomly select {len(index_random)} conformers and add minimum energy conformer")
+                        else:
+                            _logger.debug(f"Iteration {j}: Randomly select {len(index_random)} conformers")
 
                         _g.nodes["g"].data["u_ref"] = torch.cat((_g.nodes['g'].data['u_ref'][:, index], _g.nodes['g'].data['u_ref'][:, index_random]), dim=-1)
                         _g.nodes["n1"].data["xyz"] = torch.cat((_g.nodes['n1'].data['xyz'][:, index, :], _g.nodes['n1'].data['xyz'][:, index_random, :]), dim=1)
@@ -507,7 +521,12 @@
                         idx1 = j*n_confs
                         idx2 = (j+1)*n_confs
                         index = idx_range[idx1:idx2]
-                        _logger.debug(f"Iteration {j}: Extract indice from {idx1} to {idx2}")
+
+                        if include_min_energy_conf:
+                            index = index + index_min
+                            _logger.debug(f"Iteration {j}: Extract indices from {idx1} to {idx2} and add minimum energy conformer")
+                        else:
+                            _logger.debug(f"Iteration {j}: Extract indices from {idx1} to {idx2}")
 
                         _g.nodes["g"].data["u_ref"] = _g.nodes['g'].data['u_ref'][:, index]
                         _g.nodes["n1"].data["xyz"] = _g.nodes['n1'].data['xyz'][:, index, :]
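As a concrete illustration of the chunking introduced in the patch above: with `include_min_energy_conf=True`, each chunk is built from `n_confs - 1` shuffled conformer indices plus the index of the minimum-energy conformer, so every split graph still has a uniform conformer count. The toy sketch below (hypothetical tensor sizes, standalone rather than espfit code) mirrors that shuffle/split/pad logic without calling `reshape_conformation_size` itself:

```python
import random
import torch

u_ref = torch.randn(1, 7)   # energies for 7 conformers of one molecule (toy data)
n, n_confs, seed = 7, 3, 2666

random.seed(seed)
idx_range = random.sample(range(n), k=n)   # shuffled conformer indices
index_min = [u_ref.argmin().item()]        # minimum-energy conformer index
n_confs = n_confs - 1                      # reserve one slot per chunk for it

chunks = []
for j in range(n // n_confs + 1):
    index = idx_range[j * n_confs:(j + 1) * n_confs]
    if len(index) < n_confs:               # last chunk: pad by resampling with replacement
        index = index + random.choices(range(n), k=n_confs - len(index))
    chunks.append(index + index_min)       # every chunk carries the minimum-energy conformer

assert all(len(chunk) == 3 for chunk in chunks)   # uniform size, as DGL batching requires
```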
From 252012e59e30e2a4b0cd8cc79e8499b934a3bf20 Mon Sep 17 00:00:00 2001
From: kt
Date: Wed, 6 Mar 2024 14:38:23 -0500
Subject: [PATCH 43/59] fix report_loss to properly append loss data at each
 epoch

---
 espfit/app/train.py | 36 ++++++++++++++++++------------------
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/espfit/app/train.py b/espfit/app/train.py
index 1f3d977..2190baa 100644
--- a/espfit/app/train.py
+++ b/espfit/app/train.py
@@ -339,20 +339,17 @@ def report_loss(self, epoch, loss_dict):
         None
         """
         import pandas as pd
-        df = pd.DataFrame.from_dict(loss_dict, orient='index').T
-        df.insert(0, 'epoch', epoch)
 
         log_file_path = os.path.join(self.output_directory_path, 'reporter.log')
+        df_new = pd.DataFrame.from_dict(loss_dict, orient='index').T
+        df_new.insert(0, 'epoch', epoch)
 
         if os.path.exists(log_file_path):
-            existing_headers = pd.read_csv(log_file_path, sep='\t', nrows=0).columns.tolist()
-            if set(df.columns) != set(existing_headers):
-                df_old = pd.read_csv(log_file_path, sep='\t')
-                df = pd.concat([df_old, df], ignore_index=True)
-            else:
-                df.to_csv(log_file_path, sep='\t', float_format='%.4f', index=False, header=False, mode='a')
+            df_old = pd.read_csv(log_file_path, sep='\t')
+            df = pd.concat([df_old, df_new])
         else:
-            df.to_csv(log_file_path, sep='\t', float_format='%.4f', index=False)
+            df = df_new
+        df.to_csv(log_file_path, sep='\t', float_format='%.4f', index=False)
 
 
     def train(self):
@@ -385,14 +382,18 @@ def train(self):
                     if torch.cuda.is_available():
                         g = g.to("cuda:0")
                     g.nodes["n1"].data["xyz"].requires_grad = True
-                    loss = self.net(g)
+                    #loss = self.net(g)
+                    loss, loss_dict = self.net(g)
                     loss.backward()
                     optimizer.step()
-
+
+                loss_dict['loss'] = loss.item()
+                self.report_loss(epoch, loss_dict)
+
                 if epoch % self.checkpoint_frequency == 0:
                     # Note: returned loss is a joint loss of different units.
-                    _loss = HARTREE_TO_KCALPERMOL * loss.pow(0.5).item()
-                    _logger.info(f'epoch {epoch}: {_loss:.3f}')
+                    loss = HARTREE_TO_KCALPERMOL * loss.pow(0.5).item()
+                    _logger.info(f'Epoch {epoch}: loss={loss.item():.3f}')
                     self._save_checkpoint(epoch)
 
@@ -418,7 +419,6 @@ def train_sampler(self, sampler_patience=800, neff_threshold=0.2, sampler_weight
         -------
         None
         """
-        from espfit.utils.units import HARTREE_TO_KCALPERMOL
         from espfit.utils.sampler.reweight import SetupSamplerReweight
 
         # Note: RuntimeError will be raised if copy.deepcopy is used.
@@ -463,8 +463,8 @@ def train_sampler(self, sampler_patience=800, neff_threshold=0.2, sampler_weight
 
                 if epoch > self.sampler_patience:
                     # Save checkpoint as local model (net.pt)
+                    # `neff_min` is -1 if SamplerReweight.samplers is None
                     samplers = self._setup_local_samplers(epoch, net_copy, debug)
-                    # neff_min is -1 if SamplerReweight.samplers is None
                     neff_min = SamplerReweight.get_effective_sample_size(temporary_samplers=samplers)
 
                     # If effective sample size is below threshold, update SamplerReweight.samplers and re-run simulaton
@@ -546,7 +546,7 @@ def _save_checkpoint(self, epoch):
         -------
         None
         """
-        checkpoint_file = os.path.join(self.output_directory_path, f"checkpoint{epoch}.pt")
+        checkpoint_file = os.path.join(self.output_directory_path, f"ckpt{epoch}.pt")
         torch.save(self.net.state_dict(), checkpoint_file)
 
 
@@ -566,9 +566,9 @@ def _save_local_model(self, epoch, net_copy):
         None
         """
         # Save checkpoint as temporary espaloma model (force field)
-        _logger.info(f'Save checkpoint{epoch}.pt as temporary espaloma model (net.pt)')
+        _logger.info(f'Save ckpt{epoch}.pt as temporary espaloma model (net.pt)')
         self._save_checkpoint(epoch)
-        local_model = os.path.join(self.output_directory_path, f"checkpoint{epoch}.pt")
+        local_model = os.path.join(self.output_directory_path, f"ckpt{epoch}.pt")
         self.save_model(net=net_copy, best_model=local_model, model_name=f"net.pt", output_directory_path=self.output_directory_path)
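The practical consequence of the `report_loss` change: `reporter.log` gains one row per epoch even when the column set changes mid-run (per-target sampler losses and `neff` only appear once the sampler patience is exceeded), because `pd.concat` aligns differing headers and leaves NaN in the gaps. A small standalone illustration of the same append pattern, with hypothetical loss values and file name:

```python
import os
import pandas as pd

log_file_path = 'reporter.log'  # hypothetical path; espfit writes it under output_directory_path

# Epoch 2 introduces new 'adenosine' and 'neff' columns; concat aligns the
# headers and fills epoch 1 with NaN instead of corrupting the table.
for epoch, loss_dict in [(1, {'loss': 0.91}), (2, {'loss': 0.74, 'adenosine': 1.23, 'neff': 0.85})]:
    df_new = pd.DataFrame.from_dict(loss_dict, orient='index').T
    df_new.insert(0, 'epoch', epoch)
    if os.path.exists(log_file_path):
        df = pd.concat([pd.read_csv(log_file_path, sep='\t'), df_new])
    else:
        df = df_new
    df.to_csv(log_file_path, sep='\t', float_format='%.4f', index=False)
```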
From 5bf8717f85d3057891ac64db618c5b8a3fb8d Mon Sep 17 00:00:00 2001
From: kt
Date: Wed, 6 Mar 2024 15:22:08 -0500
Subject: [PATCH 44/59] update docstring for reshape_conformation_size

---
 espfit/utils/graphs.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/espfit/utils/graphs.py b/espfit/utils/graphs.py
index 75d47f4..d6b0d43 100644
--- a/espfit/utils/graphs.py
+++ b/espfit/utils/graphs.py
@@ -455,6 +455,9 @@ def reshape_conformation_size(self, n_confs=50, include_min_energy_conf=False):
         n_confs : int, default=50
             Number of conformations per graph (molecule).
 
+        include_min_energy_conf : boolean, default=False
+            If True, the minimum energy conformer will be included in all split graphs.
+
         Returns
         -------
         None
From 8d2871f1405a08155406791616376fe83ab07a88 Mon Sep 17 00:00:00 2001
From: kt
Date: Wed, 6 Mar 2024 15:23:17 -0500
Subject: [PATCH 45/59] update docstring in CustomGraphDataset

---
 espfit/utils/graphs.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/espfit/utils/graphs.py b/espfit/utils/graphs.py
index d6b0d43..359dbff 100644
--- a/espfit/utils/graphs.py
+++ b/espfit/utils/graphs.py
@@ -33,7 +33,7 @@ class CustomGraphDataset(GraphDataset):
     compute_baseline_energy_force(forcefield_list=['openff-2.0.0']):
         Compute energies and forces using other force fields.
 
-    reshape_conformation_size(n_confs=50):
+    reshape_conformation_size(n_confs=50, include_min_energy_conf=False):
         Reshape conformation size.
 
     compute_relative_energy():
From ee1503e62568a54434bc1a18563cebfd610f1ac6 Mon Sep 17 00:00:00 2001
From: kt
Date: Wed, 6 Mar 2024 16:58:24 -0500
Subject: [PATCH 46/59] fix bug in training when restarting from existing
 checkpoint file

---
 espfit/app/train.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/espfit/app/train.py b/espfit/app/train.py
index 2190baa..e4cbfdb 100644
--- a/espfit/app/train.py
+++ b/espfit/app/train.py
@@ -393,7 +393,7 @@ def train(self):
                 if epoch % self.checkpoint_frequency == 0:
                     # Note: returned loss is a joint loss of different units.
                     loss = HARTREE_TO_KCALPERMOL * loss.pow(0.5).item()
-                    _logger.info(f'Epoch {epoch}: loss={loss.item():.3f}')
+                    _logger.info(f'Epoch {epoch}: loss={loss:.3f}')
                     self._save_checkpoint(epoch)
 
 
@@ -514,10 +514,10 @@ def _load_checkpoint(self):
 
         checkpoints = glob.glob("{}/*.pt".format(self.output_directory_path))
 
         if checkpoints:
-            n = [ int(c.split('net')[1].split('.')[0]) for c in checkpoints ]
+            n = [ int(c.split('ckpt')[1].split('.')[0]) for c in checkpoints ]
             n.sort()
             restart_epoch = n[-1]
-            restart_checkpoint = os.path.join(self.output_directory_path, f"net{restart_epoch}.pt")
+            restart_checkpoint = os.path.join(self.output_directory_path, f"ckpt{restart_epoch}.pt")
             self.net.load_state_dict(torch.load(restart_checkpoint))
             logging.info(f'Restarting from ({restart_checkpoint}).')
         else:
From c757c73efa03dd7da9da93f0a90e08a8750cc77e Mon Sep 17 00:00:00 2001
From: kt
Date: Wed, 6 Mar 2024 17:00:13 -0500
Subject: [PATCH 47/59] check output files after running test in
 test_train_sampler

---
 espfit/tests/test_app_train_sampler.py | 27 ++++++++++++++------------
 1 file changed, 15 insertions(+), 12 deletions(-)

diff --git a/espfit/tests/test_app_train_sampler.py b/espfit/tests/test_app_train_sampler.py
index e0747a1..8d39e32 100644
--- a/espfit/tests/test_app_train_sampler.py
+++ b/espfit/tests/test_app_train_sampler.py
@@ -53,13 +53,9 @@ def test_load_dataset(tmpdir):
 
 
 def test_train_sampler(test_load_dataset, test_create_espaloma_from_toml):
-
-    """
-    TODO
-    ----
-
-    * sampler.py needs to support loading temporary espaloma model during training
-    """
+    """Test function to train a sampler."""
+    import os
+    import glob
 
     # Load dataset and model
     ds = test_load_dataset
 
     # Set espaloma parameters
     model.dataset_train = ds
-    model.epochs = 10
+
model.epochs = 15 # Train - model.train_sampler(sampler_patience=3, neff_threshold=0.2, debug=True) # fails if sampler_patience is < epochs + sampler_patience = 10 + # Force sampler to run after reaching sampler patience by setting neff_threshold to 1.0 + model.train_sampler(sampler_patience=sampler_patience, neff_threshold=1.0, sampler_weight=1) # Check outputs - #import glob - #assert len(glob.glob(model.output_directory_path + '/*')) > 0 - #assert model.sampler is not None \ No newline at end of file + n_ckpt = len(glob.glob(os.path.join(model.output_directory_path, 'ckpt*pt'))) + assert n_ckpt == int(model.epochs / model.checkpoint_frequency) + + n_adenosine_pred_yaml = len(glob.glob(os.path.join(model.output_directory_path, 'adenosine/*/pred.yaml'))) + assert n_adenosine_pred_yaml == int(model.epochs - sampler_patience) + + n_cytidine_pred_yaml = len(glob.glob(os.path.join(model.output_directory_path, 'cytidine/*/pred.yaml'))) + assert n_cytidine_pred_yaml == int(model.epochs - sampler_patience) From 72946c49f32cfc13bfc6a2cb53f6e414165b6724 Mon Sep 17 00:00:00 2001 From: kt Date: Wed, 6 Mar 2024 17:00:26 -0500 Subject: [PATCH 48/59] minor update --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index d8b18a2..f23f56d 100644 --- a/README.md +++ b/README.md @@ -32,7 +32,7 @@ Infrastruture to train espaloma with experimental observables from espfit.utils.graphs import CustomGraphDataset path = 'espfit/data/qcdata/openff-toolkit-0.10.6/dgl2/protein-torsion-sm/' ds = CustomGraphDataset.load(path) -ds.reshape_conformation_size(n_confs=50) +ds.reshape_conformation_size(n_confs=50, include_min_energy_conf=True) ds.compute_relative_energy() # Create esplama model from espfit.app.train import EspalomaModel From 9b9b96305656b1a3b25fda0f37f96b2e0ce2b093 Mon Sep 17 00:00:00 2001 From: kt Date: Wed, 6 Mar 2024 17:07:22 -0500 Subject: [PATCH 49/59] remove mockcode.ipynb --- examples/mockcode/mockcode.ipynb | 911 ------------------------------- 1 file changed, 911 deletions(-) delete mode 100644 examples/mockcode/mockcode.ipynb diff --git a/examples/mockcode/mockcode.ipynb b/examples/mockcode/mockcode.ipynb deleted file mode 100644 index 490e58b..0000000 --- a/examples/mockcode/mockcode.ipynb +++ /dev/null @@ -1,911 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "8425a3a5", - "metadata": {}, - "source": [ - "# Mock code for preparing and loading data for training espaloma" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6db9e281", - "metadata": {}, - "outputs": [], - "source": [ - "import espaloma\n", - "import espfit" - ] - }, - { - "cell_type": "markdown", - "id": "fa0f1027", - "metadata": {}, - "source": [ - "## Download QC datasets from QCArchive as HDF5 (SKIP IMPLEMENTATION) \n", - "\n", - "This functionality will not be implemented at the moment and alternatively rely on external scripts (e.g. https://github.com/choderalab/download-qca-datasets)." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b69087c6", - "metadata": {}, - "outputs": [], - "source": [ - "# place holder\n", - "\n", - "outdir='/DATASET_HDF_PATH/MYDATA' \n", - "outfile='small_basic.hdf5'\n", - "\n", - "espfit.utils.data.download_qcarchive(workflow='Datataset', \n", - " qc_specification='default', \n", - " outdir=outdir,\n", - " outfile=outfile\n", - " )\n", - "#> raise NotImplemented Error" - ] - }, - { - "cell_type": "markdown", - "id": "aae11b77", - "metadata": {}, - "source": [ - "## Convert HDF5 to DGL graphs (SKIP IMPLEMENTATION)\n", - "\n", - "This function will not be implemented at the moment and alternatively rely on external scripts (e.g. https://github.com/choderalab/refit-espaloma/blob/main/openff-default/01-create-dataset/script/getgraph_hdf5.py)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "45c152ff", - "metadata": {}, - "outputs": [], - "source": [ - "# place holder\n", - "\n", - "indir = '/DATASET_HDF_PATH/MYDATA'\n", - "outdir = '/DATASET_DGL_PATH/MYDATA'\n", - "\n", - "_filenames = [ 'small_basic.hdf5', 'small_optimize.hdf5', 'small_torsiondrive.hdf5', 'peptide_basic.hdf5', 'peptide_optimize.hdf5', 'peptide_torsiondrive.hdf5' ]\n", - "filenames = [ os.path.join(indir, filename) for filename in _filenames ]\n", - "\n", - "for filename in filenames:\n", - " ds += espfit.utils.data.hdf5_to_dgl(infile=filename,outdir=outdir)\n", - " \n", - "#> raise NotImplemented Error" - ] - }, - { - "cell_type": "markdown", - "id": "013f75f8", - "metadata": {}, - "source": [ - "## Filter DGL graphs (SKIP IMPLEMENTATION)\n", - "\n", - "This function will not be implemented at the moment and rely on external scripts (e.g. https://github.com/choderalab/refit-espaloma/tree/main/openff-default/02-train/merge-data/script)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e6f81b71", - "metadata": {}, - "outputs": [], - "source": [ - "# place holder\n", - "\n", - "outdir = '/DATASET_DGL_PATH/MYDATA/FILTERED'\n", - "ds.filter(min_energy=0.1,\n", - " min_conformer=3,\n", - " compute_am1bcc='AM1BCC-ELF10', \n", - " compute_baseline_forcefields=forcefield_list, \n", - " compute_relative_energy=True,\n", - " subtract_nonbonded=True,\n", - " base_forcefiled='openff-2.0.0',\n", - " inplace=False,\n", - " outdir=outdir\n", - " )\n", - " \n", - "#> raise NotImplemented Error" - ] - }, - { - "cell_type": "markdown", - "id": "a5da73a3", - "metadata": {}, - "source": [ - "## Load preprocessed DGL graphs\n", - "\n", - "We are going to start from here." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f402f1a1", - "metadata": {}, - "outputs": [], - "source": [ - "indir = '/DATASET_DGL_PATH/MYDATA/FILTERED/*' # single path or list of paths\n", - "ds = espfit.utils.data.load(in_prefix)" - ] - }, - { - "cell_type": "markdown", - "id": "c06051b0", - "metadata": {}, - "source": [ - "#### Check properties" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "26b1b096", - "metadata": {}, - "outputs": [], - "source": [ - "ds.n_data # number of data (entries)\n", - "#> 100" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "97fca213", - "metadata": {}, - "outputs": [], - "source": [ - "ds.n_conf # number of conformations\n", - "#> 10000" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "edd853ac", - "metadata": {}, - "outputs": [], - "source": [ - "ds.elements # elements\n", - "#> H,B,Br,C,N,O,I" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0d118175", - "metadata": {}, - "outputs": [], - "source": [ - "ds.duplicate_isomeric_smiles # isomeric smiles\n", - "#> returns list of duplicate isomeric smiles" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "05bdc5b7", - "metadata": {}, - "outputs": [], - "source": [ - "ds.duplicate_nonisomeric_smiles # nonisomeric smiles\n", - "#> returns list of duplicate nonisomeric smiles" - ] - }, - { - "cell_type": "markdown", - "id": "e0af7103", - "metadata": {}, - "source": [ - "#### Drop/merge duplicate smiles and filter datasets\n", - "\n", - "Ensure the datasets loaded from different sources have no duplicated smiles. \n", - "Drop duplicate isomeric (nonisomeric) smiles across different sources of datasets. \n", - "Merge duplicate dgl graphs with same smiles into a single dgl graph and create a new dataset called 'misc'." 
- ] - }, - { - "cell_type": "markdown", - "id": "15340bae", - "metadata": {}, - "source": [ - "##### drop and merge smiles" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "47cb8e2b", - "metadata": {}, - "outputs": [], - "source": [ - "outdir = '/DATASET_DGL_PATH/MYDATA'\n", - "ds.drop_merge_nonisomeric_smiles(outdir=outdir, outname='misc') # miscellaneous\n", - "\n", - "# Alteratively,\n", - "ds.drop_merge_isomeric_smiles(outdir=outdir, outname='misc')" - ] - }, - { - "cell_type": "markdown", - "id": "1a84aa3d", - "metadata": {}, - "source": [ - "##### filter dataset" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "958112bf", - "metadata": {}, - "outputs": [], - "source": [ - "# Add misc dataset that was just created\n", - "ds += espfit.utils.data.load('/DATASET_DGL_PATH/MYDATA/misc')\n", - "\n", - "# Filter all dataset\n", - "outdir = '/DATASET_DGL_PATH/MYDATA/FILTERED'\n", - "ds.filter(min_energy=0.1,\n", - " min_conformer=3,\n", - " inplace=False,\n", - " outdir=outdir\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "457c7c7c-bd74-4009-a748-b014a9e21e6a", - "metadata": {}, - "outputs": [], - "source": [ - "# Compute all dataset\n", - "ds.compute(compute_am1bcc=None, \n", - " compute_baseline_forcefields=None, \n", - " compute_relative_energy=True,\n", - " subtract_nonbonded=True,\n", - " base_forcefiled='openff-2.0.0',\n", - " inplace=False,\n", - " outdir=outdir\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f96faf73", - "metadata": {}, - "outputs": [], - "source": [ - "# Alternatively, we could just filter the misc data and reload all filtered dataset later\n", - "\n", - "outdir = '/DATASET_DGL_PATH/MYDATA/FILTERED'\n", - "misc_data = espfit.utils.data.load('/DATASET_DGL_PATH/MYDATA/misc')\n", - "misc_data.filter(min_energy=0.1,\n", - " min_conformer=3,\n", - " inplace=False,\n", - " outdir=outdir\n", - " )\n", - "misc_data.compute(compute_am1bcc=None, \n", - " compute_baseline_forcefields=None, \n", - " compute_relative_energy=True,\n", - " subtract_nonbonded=True,\n", - " base_forcefiled='openff-2.0.0',\n", - " inplace=False,\n", - " outdir=outdir\n", - " )\n", - "\n", - "# load filtered\n", - "input_dirs = glob.glob('/DATASET_DGL_PATH/MYDATA/FILTERED/*') # list of paths\n", - "ds = espfit.utils.data.load(input_dirs)" - ] - }, - { - "cell_type": "markdown", - "id": "a132c68d", - "metadata": {}, - "source": [ - "## Prepare for training" - ] - }, - { - "cell_type": "markdown", - "id": "8174a95f", - "metadata": {}, - "source": [ - "#### Split datasets" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f8b64d22", - "metadata": {}, - "outputs": [], - "source": [ - "RANDOM_SEED = 2666\n", - "ds.shuffle(RANDOM_SEED)\n", - "\n", - "ds_tr, ds_vl_te = ds.split(0.8, 0.2)\n", - "ds_vl, ds_te = ds_vl_te.split(0.5, 0.5)" - ] - }, - { - "cell_type": "markdown", - "id": "06de6513", - "metadata": {}, - "source": [ - "#### Augment conformations to handle heterographs\n", - "\n", - "This is a work around to handle different graph size (shape). DGL requires at least one dimension with same size. \n", - "Here, we will modify the graphs so that each graph has the same number of conformations instead fo concatenating \n", - "graphs into heterogenous graphs with the same number of conformations. This will allow batching and shuffling \n", - "during the training. 
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "28ce7bd3", - "metadata": {}, - "outputs": [], - "source": [ - "# Remove unnecessary data from graph in backend? (will this speed up training?)\n", - "# e.g. g.nodes['g'].data.pop('u_qm')\n", - "\n", - "outdir = '/DATASET_DGL_PATH/MYDATA/FILTERED/RESHAPE'\n", - "ds_tr.reshape(n_conf=50,\n", - " preserve_min=True,\n", - " inplace=True,\n", - " outdir=outdir,\n", - " verbose=1,\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a9deeabf", - "metadata": {}, - "outputs": [], - "source": [ - "# regenerate impropers (forgot why we need to do this)\n", - "ds_tr.apply(regenerate_impropers, in_place=True)" - ] - }, - { - "cell_type": "markdown", - "id": "d45feb3f", - "metadata": {}, - "source": [ - "## Train espaloma" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "eeb8b4f9", - "metadata": {}, - "outputs": [], - "source": [ - "# initialize\n", - "model = espfit.app.experiment()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c71d7e80", - "metadata": {}, - "outputs": [], - "source": [ - "# define espaloma architecture\n", - "\n", - "# use toml\n", - "import yaml\n", - "with open('config.yml', 'r') as file:\n", - " config = yaml.safe_load(file) \n", - " \n", - "# Possible methods\n", - "# 1. call predefined model?\n", - "model.call(model_name='model1')\n", - "# 2. create model using yaml config\n", - "model.create(config=config)\n", - "# 3. from file\n", - "model.from_file('config.toml')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "32760aff", - "metadata": {}, - "outputs": [], - "source": [ - "# check neural network model\n", - "\n", - "model.net\n", - "#> returns neural network architecture" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "69a9e8d2", - "metadata": {}, - "outputs": [], - "source": [ - "# load dataset\n", - "\n", - "model.train_data = ds_tr\n", - "model.validation_data = ds_vl\n", - "model.test_data = ds_te" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "adeeb626", - "metadata": {}, - "outputs": [], - "source": [ - "# check data property\n", - "\n", - "model.train_data.n_data\n", - "model.train_data.n_conf\n", - "model.train_data.elements" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "20caa2bf", - "metadata": {}, - "outputs": [], - "source": [ - "# save checkpoint file to `checkpoints` every 10 epochs\n", - "# restart training from checkpoint file\n", - "# validation is excluded from the training to decrease inference time\n", - "\n", - "model.train(steps, lr, batch_size, restart=checkpoint, checkpoint_frequency=10, log_file=logfile, log_level='debug')" - ] - }, - { - "cell_type": "markdown", - "id": "fd25f33e", - "metadata": {}, - "source": [ - "#### Validate and find best model\n", - "\n", - "Use job array to speed up this process using external scripts (e.g. 
https://github.com/choderalab/refit-espaloma/tree/main/openff-default/02-train/joint-improper-charge/charge-weight-1.0/eval)" - ] - }, - { - "cell_type": "markdown", - "id": "b94b4be6", - "metadata": {}, - "source": [ - "## Alternatively, train and validate simultaneously\n", - "\n", - "Not sure how much slower this will be compared to just doing training" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "dc585ee8", - "metadata": {}, - "outputs": [], - "source": [ - "model.train_val(steps, lr, batch_size, restart=checkpoint, checkpoint_frequency=10, logfile=logfile, verbose=1, early_stopping=800, patience=5)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "80498680", - "metadata": {}, - "outputs": [], - "source": [ - "# save model\n", - "model.save() # saves best model as 'model.pt'\n", - "\n", - "# plot validation loss\n", - "model.plot_loss()" - ] - }, - { - "cell_type": "markdown", - "id": "245006e3", - "metadata": {}, - "source": [ - "## Benchmark" - ] - }, - { - "cell_type": "markdown", - "id": "9861df48", - "metadata": {}, - "source": [ - "#### RMSE metric" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2ff42837", - "metadata": {}, - "outputs": [], - "source": [ - "RANDOM_SEED = 2666\n", - "indir='/DATASET_DGL_PATH/MYDATA/FILTERED/RESHAPE'\n", - "data_split_size = [0.8, 0.1, 0.1]\n", - "best_model = 'model.pt'\n", - "\n", - "df = espfit.utils.rmse_metric(best_model, indir, data_split_size, RANDOM_SEED) # pandas dataframe\n", - "df.to_csv('rmse_metric.csv', index=False, sep='\\t', float_format='%.3f')" - ] - }, - { - "cell_type": "markdown", - "id": "dc75ac51", - "metadata": {}, - "source": [ - "#### Run other benchmarks independently.\n", - "\n", - "- Small molecule geometry optimization (https://github.com/choderalab/geometry-benchmark-espaloma/tree/main/qc-opt-geo)\n", - "- ESP benchmark" - ] - }, - { - "cell_type": "markdown", - "id": "379ad7fb", - "metadata": {}, - "source": [ - "## Train espaloma with experimental observable refitting\n", - "\n", - "- `espfit_experiment/`\n", - " - `data/`: Cached dataset ready for training\n", - " - `utils/`: Stores scripts to run external benchmarks\n", - " - `small_molecule_geometry`\n", - " - geo.py\n", - " - `partial_charge_esp`\n", - " - ele.py\n", - " - `rna_nucleoside`\n", - " - rna_nucleoside.py\n", - " - `rna_tetramer`:\n", - " - rna_tetramer.py\n", - " - `experiment/`\n", - " - `001/`: Create new directory for each refitting experiment\n", - " - `xml/`: Stores openmm xml\n", - " - `refit/`: Espaloma training\n", - " - `checkpoints/`: Stores checkpoint files\n", - " - `sampling/`: MD simulation\n", - " - `iter_0`: Initial MD sampling\n", - " - `iter_n`: MD sampling at epoch-n when necessary\n", - " - `train.log`: Log file during espaloma training\n", - " - `benchmark/`\n", - " - `rmse_metric`\n", - " - `small_molecule_geometry`\n", - " - `partial_charge_esp`\n", - " - `rna_nucleoside`\n", - " - `rna_tetramer`" - ] - }, - { - "cell_type": "markdown", - "id": "67893219", - "metadata": {}, - "source": [ - "#### Basic usage to run simulations for registered systems" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "343b1a35", - "metadata": {}, - "outputs": [], - "source": [ - "# check registered systems\n", - "registered_systems = espfit.system.available()\n", - "\n", - "registered_systems.get_names\n", - "#> ['A', 'G', 'C', 'U', 'ApA']\n", - "\n", - "registered_systems.get('name').observables\n", - "#> returns pandas dataframe with all experimental 
observables and corresponding literature" - ] - }, - { - "cell_type": "markdown", - "id": "c5daf12a", - "metadata": {}, - "source": [ - "##### Prepare system" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c74f2962", - "metadata": {}, - "outputs": [], - "source": [ - "system = registered_systems.get('name')\n", - "simulation = system.setup(system_name=name, espaloma_model='model.pt', config=config, outdir=outdir) # save xml\n", - "\n", - "# minimize\n", - "simulation.min()" - ] - }, - { - "cell_type": "markdown", - "id": "66d56fba", - "metadata": {}, - "source": [ - "##### Load a system already prepared" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e673e6f4", - "metadata": {}, - "outputs": [], - "source": [ - "system = espfit.system.load()" - ] - }, - { - "cell_type": "markdown", - "id": "5fc790e6", - "metadata": {}, - "source": [ - "##### Run simulation" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0a1528dc", - "metadata": {}, - "outputs": [], - "source": [ - "simulation.run(steps=100) # standard MD?" - ] - }, - { - "cell_type": "markdown", - "id": "bc77fe98", - "metadata": {}, - "source": [ - "##### Compute loss" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e0657b0a", - "metadata": {}, - "outputs": [], - "source": [ - "obs_exp = system.get_experimental_value()\n", - "obs_calc = simulation.compute_observable()\n", - "loss = simulation.compute_loss(obs_exp, obs_calc)" - ] - }, - { - "cell_type": "markdown", - "id": "a358db78", - "metadata": {}, - "source": [ - "##### Reweight observable using updated espaloma model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "18099bb4", - "metadata": {}, - "outputs": [], - "source": [ - "result = simulation.compute_reweighted_observable(update_espaloma_model='new.pt')\n", - "\n", - "# reweighted observable\n", - "obs_calc = result.observable\n", - "\n", - "# effective sample size\n", - "n_eff = result.effective_sample_size\n", - "\n", - "# loss with reweighted observable\n", - "loss = simulation.compute_loss(obs_exp, obs_calc)" - ] - }, - { - "cell_type": "markdown", - "id": "98996eed", - "metadata": {}, - "source": [ - "## Pseudo code for training espaloma with reweighting on the fly" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f73acfab", - "metadata": {}, - "outputs": [], - "source": [ - "RANDOM_SEED = 2666\n", - "\n", - "input_dirs = glob.glob('/DATASET_DGL_PATH/MYDATA/FILTERED/RESHAPE/*') # list of paths\n", - "ds = espfit.utils.data.load(input_dirs)\n", - "ds.shuffle(RANDOM_SEED)\n", - "\n", - "ds_tr, ds_vl_te = ds.split(0.8, 0.2)\n", - "ds_vl, ds_te = ds_vl_te.split(0.5, 0.5)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "35e48171", - "metadata": {}, - "outputs": [], - "source": [ - "model = espfit.app.experiment()\n", - "\n", - "with open('config.yml', 'r') as file:\n", - " config = yaml.safe_load(file) \n", - "model.create(config=config)" - ] - }, - { - "cell_type": "markdown", - "id": "6a835423", - "metadata": {}, - "source": [ - "##### Run simulation" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4c7e206e", - "metadata": {}, - "outputs": [], - "source": [ - "system = registered_systems.get('A')\n", - "simulation = system.setup(system_name=name, espaloma_model='model.pt', config=config, outdir=outdir) # save xml\n", - "simulation.min()\n", - "simulation.run(1000)" - ] - }, - { - "cell_type": "markdown", - "id": "fe5fcd47", - 
"metadata": {}, - "source": [ - "##### Get experimental observables" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1ff3ca80", - "metadata": {}, - "outputs": [], - "source": [ - "obs_exp = system.get_experimental_value()" - ] - }, - { - "cell_type": "markdown", - "id": "2915bb00", - "metadata": {}, - "source": [ - "##### Train with MD reweighting\n", - "\n", - "[Iterative Optimization of Molecular Mechanics Force Fields from NMR Data of Full-Length Proteins, JCTC, 2011](https://pubs.acs.org/doi/full/10.1021/ct200094b) \n", - "[Automatic Learning of Hydrogen-Bond Fixes in the AMBER RNA Force Field, JCTC, 2022](https://pubs.acs.org/doi/10.1021/acs.jctc.2c00200) \n", - "[Enhanced sampling methods for molecular dynamics simulations, arXiv, 2022](https://arxiv.org/abs/2202.04164) " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5c4b766c", - "metadata": {}, - "outputs": [], - "source": [ - "ds_tr_loader = dgl.dataloading.GraphDataLoader(ds_tr, batch_size=batch_size, shuffle=True)\n", - "optimizer = torch.optim.Adam(model.net().parameters(), lr=learning_rate)\n", - "\n", - "with torch.autograd.set_detect_anomaly(True):\n", - " for idx in range(steps):\n", - " n_eff = [] # store effective sample size\n", - " for g in ds_tr_loader:\n", - " optimizer.zero_grad()\n", - " g = g.to(\"cuda:0\")\n", - " g.nodes[\"n1\"].data[\"xyz\"].requires_grad = True \n", - " \n", - " # Original espaloma loss\n", - " loss = net(g)\n", - "\n", - " # Reweighting \n", - " result = simulation.compute_reweighted_observable(net) # return: (reweighted observable, effective sample size)\n", - " obs_calc = result.observable\n", - " loss_md = simulation.compute_loss(obs_exp, obs_calc) \n", - " \n", - " n_eff += result.n_eff\n", - " \n", - " # Joint loss\n", - " loss += weight * loss_md\n", - " \n", - " loss.backward()\n", - " optimizer.step()\n", - " \n", - " # save checkpoint file \n", - " if idx % 10 == 0:\n", - " if not os.path.exists(output_prefix):\n", - " os.mkdir(output_prefix)\n", - " torch.save(net.state_dict(), output_prefix + \"/net%s.pth\" % idx)\n", - " \n", - " # Averaged effective samples\n", - " if n_eff.mean() < effective_sample_size_tolerance:\n", - " # rebuild system with current net model\n", - " # rerun simulation\n", - " # cache new trajectory\n", - " simulation.rebuild()\n", - " simulation.run()" - ] - }, - { - "cell_type": "raw", - "id": "3900fc9f", - "metadata": {}, - "source": [] - }, - { - "cell_type": "raw", - "id": "5735fb72", - "metadata": {}, - "source": [] - }, - { - "cell_type": "raw", - "id": "ec7b85df", - "metadata": {}, - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.13" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} From 58036dd7592ba6ba381e37707612e8b05523c228 Mon Sep 17 00:00:00 2001 From: kt Date: Thu, 7 Mar 2024 12:58:51 -0500 Subject: [PATCH 50/59] average q_ref if different isomeric smiles are found in merging nonisomeric molecules --- espfit/utils/graphs.py | 53 +++++++++++++++++++++++++++++++++--------- 1 file changed, 42 insertions(+), 11 deletions(-) diff --git a/espfit/utils/graphs.py b/espfit/utils/graphs.py index 359dbff..32776c3 100644 --- a/espfit/utils/graphs.py +++ 
b/espfit/utils/graphs.py @@ -96,8 +96,8 @@ def __init__(self, graphs=[], reference_forcefield='openff-2.0.0', random_seed=2 self.random_seed = random_seed - def drop_and_merge_duplicates(self, save_merged_dataset=True, dataset_name='misc', output_directory_path=None): - """Drop and merge duplicate nonisomeric smiles across different data sources. + def drop_and_merge_duplicates(self, save_merged_dataset=True, dataset_name='misc', output_directory_path=None, isomeric=True): + """Drop and merge duplicate (non)isomeric smiles within the dataset. Modifies list of esp.Graph's in place. @@ -112,6 +112,14 @@ def drop_and_merge_duplicates(self, save_merged_dataset=True, dataset_name='misc output_directory_path : str, default=None Output directory path to save the merged dataset. If None, then the current working directory is used. + + isomeric : boolean, default=True + If True, then isomeric smiles will be used to identify unique molecules. + + If False, then nonisomeric smiles will be used to identify unique molecules. + Note that partial charges will be averaged for the same nonisomeric smiles. + This is because different 3D structures can have different partial charges + due to different conformations. Returns ------- @@ -123,8 +131,11 @@ def drop_and_merge_duplicates(self, save_merged_dataset=True, dataset_name='misc if output_directory_path == None: output_directory_path = os.getcwd() - _logger.info(f'Drop and merge duplicate smiles') - smiles = [ g.mol.to_smiles(isomeric=False, explicit_hydrogens=True, mapped=False) for g in self.graphs ] + if isomeric == True: + _logger.info(f'Drop and merge duplicate isomeric smiles') + else: + _logger.info(f'Drop and merge duplicate nonisomeric smiles') + smiles = [ g.mol.to_smiles(isomeric=isomeric, explicit_hydrogens=True, mapped=False) for g in self.graphs ] _logger.info(f'Found {len(smiles)} molecules') # Unique entries @@ -160,8 +171,8 @@ def drop_and_merge_duplicates(self, save_merged_dataset=True, dataset_name='misc # Temporary directory needs to be created beforehand for `test_drop_and_merge_duplicates`. _output_directory_path = os.path.join(output_directory_path, dataset_name) os.makedirs(_output_directory_path, exist_ok=True) - output_directory_path = os.path.join(_output_directory_path, molname) - g.save(output_directory_path) + new_output_directory_path = os.path.join(_output_directory_path, molname) + g.save(new_output_directory_path) # Update in place new_graphs = unique_graphs + duplicated_graphs @@ -344,7 +355,7 @@ def compute_baseline_energy_force(self, forcefield_list=['openff-2.0.0']): COLLISION_RATE = 1.0 / unit.picosecond if not all(_ in self.available_forcefields for _ in forcefield_list): - raise Exception(f'{forcefield} force field not supported. Supported force fields are {SUPPORTED_FORCEFIELD_LIST}.') + raise Exception(f'Unsupported force field in {forcefield_list}. Supported force fields are {self.available_forcefields}.') new_graphs = [] for i, g in enumerate(self.graphs): @@ -573,7 +584,7 @@ def _merge_graphs(ds): Parameters ---------- - ds : list of espaloma.graphs.graph.Graph + ds : list of espaloma.graphs.graph.Graph, default=None The list of Graph instances to be merged. All Graphs in the list must be equivalent. 
Returns @@ -586,12 +597,32 @@ def _merge_graphs(ds): import copy import torch + # Check if all inputs are equivalent (isomeric smiles) + # If not, get average partial charges across different isomeric smiles (molecules) + isomeric_smiles = [g.mol.to_smiles(isomeric=True, explicit_hydrogens=True, mapped=False) for g in ds] + unique_isomeric_smiles = set(isomeric_smiles) + if len(unique_isomeric_smiles) != 1: + n_atoms = ds[0].nodes['n1'].data['q_ref'].shape[0] + q_ref = torch.zeros(n_atoms, 1) + for unique_isomeric_smile in unique_isomeric_smiles: + index = [i for i, isomeric_smile in enumerate(isomeric_smiles) if isomeric_smile in unique_isomeric_smile][0] + q_ref += ds[index].nodes['n1'].data['q_ref'] + q_ref = q_ref / len(set(isomeric_smiles)) + # Update partial charges in-place + for i in range(len(ds)): + ds[i].nodes['n1'].data['q_ref'] = q_ref + # Check if graphs are equivalent for i in range(1, len(ds)): - # Openff molecule - assert ds[0].mol == ds[i].mol # Mapped isomeric smiles - assert ds[0].mol.to_smiles(isomeric=True, explicit_hydrogens=True, mapped=True) == ds[i].mol.to_smiles(isomeric=True, explicit_hydrogens=True, mapped=True) + if len(unique_isomeric_smiles) != 1: + mapped_smiles = ds[0].mol.to_smiles(isomeric=False, explicit_hydrogens=True, mapped=True) + mapped_smiles_i = ds[i].mol.to_smiles(isomeric=False, explicit_hydrogens=True, mapped=True) + assert mapped_smiles == mapped_smiles_i, f"Mapped nonisomeric smiles are not equivalent: {mapped_smiles} != {mapped_smiles_i}" + else: + mapped_smiles = ds[0].mol.to_smiles(isomeric=True, explicit_hydrogens=True, mapped=True) + mapped_smiles_i = ds[i].mol.to_smiles(isomeric=True, explicit_hydrogens=True, mapped=True) + assert mapped_smiles == mapped_smiles_i, f"Mapped isomeric smiles are not equivalent: {mapped_smiles} != {mapped_smiles_i}" # Other node features for key in ["sum_q"]: np.testing.assert_array_equal(ds[0].nodes['g'].data[key].flatten().numpy(), ds[i].nodes['g'].data[key].flatten().numpy()) From 5be247c6d754b16dce81df6872f679f9b0db94fe Mon Sep 17 00:00:00 2001 From: kt Date: Thu, 7 Mar 2024 13:45:28 -0500 Subject: [PATCH 51/59] add testsystems/nucleoside/target.pdb --- .../target/testsystems/nucleoside/target.pdb | 35 +++++++++++++++++++ 1 file changed, 35 insertions(+) create mode 100644 espfit/data/target/testsystems/nucleoside/target.pdb diff --git a/espfit/data/target/testsystems/nucleoside/target.pdb b/espfit/data/target/testsystems/nucleoside/target.pdb new file mode 100644 index 0000000..c80385e --- /dev/null +++ b/espfit/data/target/testsystems/nucleoside/target.pdb @@ -0,0 +1,35 @@ +REMARK 1 CREATED WITH OPENMM 8.1, 2024-01-26 +ATOM 1 C5' A A 1 5.609 7.878 -3.616 1.00 0.00 C +ATOM 2 H5' A A 1 5.926 8.406 -4.515 1.00 0.00 H +ATOM 3 H5'' A A 1 5.488 8.603 -2.812 1.00 0.00 H +ATOM 4 O5' A A 1 4.377 7.214 -3.875 1.00 0.00 O +ATOM 5 HO5' A A 1 3.732 7.873 -4.143 1.00 0.00 H +ATOM 6 C4' A A 1 6.701 6.879 -3.233 1.00 0.00 C +ATOM 7 H4' A A 1 7.663 7.391 -3.262 1.00 0.00 H +ATOM 8 O4' A A 1 6.494 6.375 -1.913 1.00 0.00 O +ATOM 9 C3' A A 1 6.778 5.649 -4.144 1.00 0.00 C +ATOM 10 H3' A A 1 5.784 5.235 -4.319 1.00 0.00 H +ATOM 11 O3' A A 1 7.428 5.896 -5.380 1.00 0.00 O +ATOM 12 HO3' A A 1 8.365 5.865 -5.178 1.00 0.00 H +ATOM 13 C2' A A 1 7.561 4.713 -3.219 1.00 0.00 C +ATOM 14 H2' A A 1 7.428 3.673 -3.519 1.00 0.00 H +ATOM 15 C1' A A 1 6.878 5.002 -1.872 1.00 0.00 C +ATOM 16 H1' A A 1 7.580 4.816 -1.059 1.00 0.00 H +ATOM 17 N1 A A 1 5.124 0.467 -0.163 1.00 0.00 N +ATOM 18 C2 A A 1 6.351 0.978 -0.252 1.00 0.00 C +ATOM 
19 H2 A A 1 7.149 0.341 0.097 1.00 0.00 H +ATOM 20 N3 A A 1 6.723 2.170 -0.712 1.00 0.00 N +ATOM 21 C4 A A 1 5.650 2.896 -1.124 1.00 0.00 C +ATOM 22 C5 A A 1 4.335 2.511 -1.094 1.00 0.00 C +ATOM 23 C6 A A 1 4.098 1.219 -0.579 1.00 0.00 C +ATOM 24 N6 A A 1 2.894 0.673 -0.481 1.00 0.00 N +ATOM 25 H61 A A 1 2.824 -0.276 -0.142 1.00 0.00 H +ATOM 26 H62 A A 1 2.092 1.188 -0.815 1.00 0.00 H +ATOM 27 N7 A A 1 3.506 3.517 -1.595 1.00 0.00 N +ATOM 28 C8 A A 1 4.352 4.463 -1.905 1.00 0.00 C +ATOM 29 H8 A A 1 4.040 5.410 -2.321 1.00 0.00 H +ATOM 30 N9 A A 1 5.670 4.162 -1.657 1.00 0.00 N +ATOM 31 O2' A A 1 8.945 5.071 -3.169 1.00 0.00 O +ATOM 32 HO2' A A 1 9.415 4.404 -2.663 1.00 0.00 H +TER 33 A A 1 +END From d53595a4cf7f84f4b89ab5433f5c57fcce45f018 Mon Sep 17 00:00:00 2001 From: kt Date: Thu, 7 Mar 2024 19:23:14 -0500 Subject: [PATCH 52/59] compare sorted number of conformers to prevent github CI failure --- espfit/tests/test_utils_graphs.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/espfit/tests/test_utils_graphs.py b/espfit/tests/test_utils_graphs.py index 17f097b..ac94543 100644 --- a/espfit/tests/test_utils_graphs.py +++ b/espfit/tests/test_utils_graphs.py @@ -68,7 +68,9 @@ def test_load_dataset(mydata_gen2_torsion_sm): """ ds = mydata_gen2_torsion_sm nconfs = [g.nodes['g'].data['u_ref'].shape[1] for g in ds] - assert nconfs == [24, 24, 24, 13, 24, 24, 24, 24], 'Number of molecular conformers does not match' + # Sort the list of nconfs. For some reason, the order of the list is not consistent when running the test locally and on GitHub CI. + #assert nconfs == [24, 24, 24, 13, 24, 24, 24, 24], 'Number of molecular conformers does not match' + assert sorted(nconfs) == sorted([24, 24, 24, 13, 24, 24, 24, 24]), 'Number of molecular conformers does not match' def test_load_dataset_multiple(mydata_gen2_torsion_sm, mydata_protein_torsion_sm, mydata_rna_diverse_sm): @@ -119,7 +121,9 @@ def test_drop_and_merge_duplicates(mydata_gen2_torsion_sm, tmpdir): temporary_directory = tmpdir.mkdir('misc') ds.drop_and_merge_duplicates(save_merged_dataset=True, dataset_name='misc', output_directory_path=str(temporary_directory)) nconfs = [ g.nodes['g'].data['u_ref'].shape[1] for g in ds ] - assert nconfs == [24, 13, 24, 24, 24, 72], 'Number of molecular conformers does not match' + # Sort the list of nconfs. For some reason, the order of the list is not consistent when running the test locally and on GitHub CI. + #assert nconfs == [24, 13, 24, 24, 24, 72], 'Number of molecular conformers does not match' + assert sorted(nconfs) == sorted([24, 13, 24, 24, 24, 72]), 'Number of molecular conformers does not match' def test_subtract_nonbonded_interactions(mydata_gen2_torsion_sm): @@ -165,7 +169,9 @@ def test_filter_high_energy_conformers(mydata_gen2_torsion_sm): # set relative_energy_threshold very small to ensure some conformers will be filtered ds.filter_high_energy_conformers(relative_energy_threshold=0.01, node_feature='u_ref') nconfs = [ g.nodes['g'].data['u_ref'].shape[1] for g in ds ] - assert nconfs == [14, 19, 19, 5, 14, 19, 24, 24], 'Number of molecular conformers does not match' + # Sort the list of nconfs. For some reason, the order of the list is not consistent when running the test locally and on GitHub CI. 
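A note on the comparison pattern in these tests: list.sort() sorts in place and returns None, so an expression like nconfs.sort() == expected.sort() compares None with None and holds for any two lists, silently weakening the assertion, whereas sorted() returns a new list whose contents are actually compared. A minimal standalone illustration (editorial sketch in plain Python, not part of the patch):

a, b = [3, 1], [2, 7]
assert (a.sort() == b.sort()) is True      # both calls return None, so this holds for any two lists
assert sorted([3, 1]) == sorted([1, 3])    # sorted() returns new lists; the contents are compared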
+ #assert nconfs == [14, 19, 19, 5, 14, 19, 24, 24], 'Number of molecular conformers does not match' + assert sorted(nconfs) == sorted([14, 19, 19, 5, 14, 19, 24, 24]), 'Number of molecular conformers does not match' def test_filter_minimum_conformers(mydata_gen2_torsion_sm): From 41555f2242e628edc2b817663ae763166440efc1 Mon Sep 17 00:00:00 2001 From: kt Date: Thu, 7 Mar 2024 19:25:09 -0500 Subject: [PATCH 53/59] add try-except to prevent UnassignedProperTorsionParameterException --- espfit/utils/graphs.py | 165 ++++++++++++++++++++++----------- 1 file changed, 88 insertions(+), 77 deletions(-) diff --git a/espfit/utils/graphs.py b/espfit/utils/graphs.py index 32776c3..2ac2798 100644 --- a/espfit/utils/graphs.py +++ b/espfit/utils/graphs.py @@ -27,10 +27,10 @@ class CustomGraphDataset(GraphDataset): filter_high_energy_conformers(relative_energy_threshold=0.1, node_feature='u_ref'): Filter high energy conformers and ensure minimum number of conformers. - filter_minimum_conformers(n_conformer_threshold=3): + filter_minimum_conformers(n_conformer_threshold=5): Filter molecules with conformers below given threshold. - compute_baseline_energy_force(forcefield_list=['openff-2.0.0']): + compute_baseline_energy_force(forcefield_list=['openff-2.1.0']): Compute energies and forces using other force fields. reshape_conformation_size(n_confs=50, include_min_energy_conf=False): @@ -62,9 +62,9 @@ class CustomGraphDataset(GraphDataset): >>> # Filter high energy conformers (u_ref: QM reference after nonbonded interactions are subtracted) >>> ds.filter_high_energy_conformers(relative_energy_threshold=0.1, node_feature='u_ref') >>> # Filter conformers below certain number - >>> ds.filter_minimum_conformers(n_conformer_threshold=3) + >>> ds.filter_minimum_conformers(n_conformer_threshold=5) >>> # Compute energies and forces using other force fields - >>> ds.compute_baseline_energy_force(forcefield_list=['openff-2.0.0']) + >>> ds.compute_baseline_energy_force(forcefield_list=['openff-2.1.0']) >>> # Regenerate improper torsions in-place >>> from espaloma.graphs.utils.regenerate_impropers import regenerate_impropers >>> ds.apply(regenerate_impropers, in_place=True) @@ -75,7 +75,7 @@ class CustomGraphDataset(GraphDataset): """ - def __init__(self, graphs=[], reference_forcefield='openff-2.0.0', random_seed=2666): + def __init__(self, graphs=[], reference_forcefield='openff-2.1.0', random_seed=2666): """Construct custom GraphDataset instance to prepare QC dataset for espaloma training. Parameters @@ -83,7 +83,7 @@ def __init__(self, graphs=[], reference_forcefield='openff-2.0.0', random_seed=2 graphs : list of espaloma.graphs.graph.Graph, default=[] DGL graphs loaded from `espaloma.data.dataset.GraphDataset.load`. - reference_forcefield : str, default=openff-2.0.0 + reference_forcefield : str, default=openff-2.1.0 Reference force field used to compute force field parameters if not present in espaloma. The default behavior is to compute the LJ parameters with `reference_forcefield`. @@ -215,6 +215,7 @@ def subtract_nonbonded_interactions(self, subtract_vdw=False, subtract_ele=True) ------- None """ + _logger.info(f'Subtract nonbonded interactions from QC reference') new_graphs = [] from espaloma.data.md import subtract_nonbonded_force @@ -234,8 +235,8 @@ def subtract_nonbonded_interactions(self, subtract_vdw=False, subtract_ele=True) # subtract_nonbonded_force() will return the coulomb interactions using the predefined partial charges. 
# # Reference: - # [1] https://github.com/choderalab/espaloma/blob/main/espaloma/data/md.py#L503C19-L503C19 - # [2] https://github.com/openmm/openmmforcefields/blob/637d551a4408cc6145529cd9dc30e267f4178367/openmmforcefields/generators/template_generators.py#L1432 + # [1] https://github.com/choderalab/espaloma/blob/main/espaloma/data/md.py#L503 + # [2] https://github.com/openmm/openmmforcefields/blob/637d551a4408cc6145529cd9dc30e267f4178367/openmmforcefields/generators/template_generators.py#L607 g = subtract_nonbonded_force(g, forcefield=self.reference_forcefield, subtract_charges=True) elif subtract_vdw == False and subtract_ele == False: g = subtract_nonbonded_force(g, forcefield=self.reference_forcefield, subtract_charges=False) @@ -266,6 +267,7 @@ def filter_high_energy_conformers(self, relative_energy_threshold=0.1, node_feat ------- None """ + _logger.info(f'Filter high energy conformers with relative energy threshold {relative_energy_threshold}') if node_feature == None: raise Exception(f'Please specify the node feature name under node type `g`') @@ -289,20 +291,21 @@ def filter_high_energy_conformers(self, relative_energy_threshold=0.1, node_feat del new_graphs - def filter_minimum_conformers(self, n_conformer_threshold=3): + def filter_minimum_conformers(self, n_conformer_threshold=5): """Filter molecules with conformers below given threshold. Modifies list of esp.Graph's in place. Parameters ---------- - n_conformer_threshold : int, default=3 + n_conformer_threshold : int, default=5 The minimum number of conformers per entry. Returns ------- None """ + _logger.info(f'Filter molecules with conformers below {n_conformer_threshold} conformers') new_graphs = [] for i, g in enumerate(self.graphs): n_confs = g.nodes['n1'].data['xyz'].shape[1] @@ -314,15 +317,15 @@ def filter_minimum_conformers(self, n_conformer_threshold=3): - def compute_baseline_energy_force(self, forcefield_list=['openff-2.0.0']): + def compute_baseline_energy_force(self, forcefield_list=['openff-2.1.0']): """Compute energies and forces using other force fields. - New node features are added to g.nodes['g']. For example, g.nodes['g'].data['u_openff-2.0.0'] and - g.nodes['n1'].data['u_openff-2.0.0_prime'] will be created for energies and forces, respectively. + New node features are added to g.nodes['g']. For example, g.nodes['g'].data['u_openff-2.1.0'] and + g.nodes['n1'].data['u_openff-2.1.0_prime'] will be created for energies and forces, respectively. Parameters ---------- - forcefield_list : list, default=['openff-2.0.0'] + forcefield_list : list, default=['openff-2.1.0'] Currently supports the following force fields: 'gaff-1.81', 'gaff-2.11', 'openff-1.2.0', 'openff-2.0.0', 'openff-2.1.0', 'amber14-all.xml', 'amber/protein.ff14SBonlysc.xml' @@ -349,6 +352,8 @@ def compute_baseline_energy_force(self, forcefield_list=['openff-2.0.0']): from openmm.unit import Quantity from openmmforcefields.generators import SystemGenerator + _logger.info(f'Compute energies and forces using other force fields') + # Simulation Specs (not important, just place holders) TEMPERATURE = 350 * unit.kelvin STEP_SIZE = 1.0 * unit.femtosecond @@ -358,73 +363,78 @@ def compute_baseline_energy_force(self, forcefield_list=['openff-2.0.0']): raise Exception(f'Unsupported force field in {forcefield_list}. 
Supported force fields are {self.available_forcefields}.') new_graphs = [] - for i, g in enumerate(self.graphs): - for forcefield in forcefield_list: - if forcefield.startswith('gaff') or forcefield.startswith('openff'): - generator = SystemGenerator( - small_molecule_forcefield=forcefield, - molecules=[g.mol], - forcefield_kwargs={"constraints": None, "removeCMMotion": False}, - ) - name = forcefield - elif forcefield.startswith('amber') or forcefield.startswith('protein'): - generator = SystemGenerator( - forcefields=[forcefield], - molecules=[g.mol], - forcefield_kwargs={"constraints": None, "removeCMMotion": False}, - ) - if forcefield == 'amber14-all.xml': - name = 'amber14sb' - elif forcefield == 'amber/protein.ff14SBonlysc.xml': - name = 'amber14sb_onlysc' - else: - import warnings - warnings.warn(f'{forcefield} not supported for molecule {g.mol.to_smiles()}') - - suffix = name - - # Parameterize topology - topology = g.mol.to_topology().to_openmm() - # Create openmm system - system = generator.create_system(topology) - # Use langevin integrator, although it's not super useful here - integrator = openmm.LangevinIntegrator(TEMPERATURE, COLLISION_RATE, STEP_SIZE) - # Create simulation - simulation = Simulation(topology=topology, system=system, integrator=integrator) - # Get energy - us = [] - us_prime = [] - xs = ( - Quantity( - g.nodes["n1"].data["xyz"].detach().numpy(), - espunits.DISTANCE_UNIT, - ) - .value_in_unit(unit.nanometer) - .transpose((1, 0, 2)) - ) - for x in xs: - simulation.context.setPositions(x) - us.append( - simulation.context.getState(getEnergy=True) - .getPotentialEnergy() - .value_in_unit(espunits.ENERGY_UNIT) + for g in self.graphs: + try: + for forcefield in forcefield_list: + if forcefield.startswith('gaff') or forcefield.startswith('openff'): + generator = SystemGenerator( + small_molecule_forcefield=forcefield, + molecules=[g.mol], + forcefield_kwargs={"constraints": None, "removeCMMotion": False}, + ) + name = forcefield + elif forcefield.startswith('amber') or forcefield.startswith('protein'): + generator = SystemGenerator( + forcefields=[forcefield], + molecules=[g.mol], + forcefield_kwargs={"constraints": None, "removeCMMotion": False}, + ) + if forcefield == 'amber14-all.xml': + name = 'amber14sb' + elif forcefield == 'amber/protein.ff14SBonlysc.xml': + name = 'amber14sb_onlysc' + else: + import warnings + warnings.warn(f'{forcefield} not supported for molecule {g.mol.to_smiles()}') + + suffix = name + + # Parameterize topology + topology = g.mol.to_topology().to_openmm() + # Create openmm system + system = generator.create_system(topology) + # Use langevin integrator, although it's not super useful here + integrator = openmm.LangevinIntegrator(TEMPERATURE, COLLISION_RATE, STEP_SIZE) + # Create simulation + simulation = Simulation(topology=topology, system=system, integrator=integrator) + # Get energy + us = [] + us_prime = [] + xs = ( + Quantity( + g.nodes["n1"].data["xyz"].detach().numpy(), + espunits.DISTANCE_UNIT, + ) + .value_in_unit(unit.nanometer) + .transpose((1, 0, 2)) ) - us_prime.append( - simulation.context.getState(getForces=True) - .getForces(asNumpy=True) - .value_in_unit(espunits.FORCE_UNIT) * -1 + for x in xs: + simulation.context.setPositions(x) + us.append( + simulation.context.getState(getEnergy=True) + .getPotentialEnergy() + .value_in_unit(espunits.ENERGY_UNIT) + ) + us_prime.append( + simulation.context.getState(getForces=True) + .getForces(asNumpy=True) + .value_in_unit(espunits.FORCE_UNIT) * -1 + ) + + us = torch.tensor(us, 
dtype=torch.float64)[None, :] + us_prime = torch.tensor( + np.stack(us_prime, axis=1), + dtype=torch.get_default_dtype(), ) - us = torch.tensor(us, dtype=torch.float64)[None, :] - us_prime = torch.tensor( - np.stack(us_prime, axis=1), - dtype=torch.get_default_dtype(), - ) - - g.nodes['g'].data['u_%s' % suffix] = us - g.nodes['n1'].data['u_%s_prime' % suffix] = us_prime + g.nodes['g'].data['u_%s' % suffix] = us + g.nodes['n1'].data['u_%s_prime' % suffix] = us_prime - new_graphs.append(g) + new_graphs.append(g) + except Exception as e: + mol_err = g.mol.to_smiles(isomeric=True, explicit_hydrogens=True, mapped=True) + _logger.warning(f'Error occurred during processing {mol_err}: {e}') + continue # Update in place self.graphs = new_graphs @@ -440,6 +450,7 @@ def compute_relative_energy(self): ------- None """ + _logger.info(f'Compute relative energy') new_graphs = [] for g in self.graphs: g.nodes['g'].data['u_ref_relative'] = g.nodes['g'].data['u_ref'].detach().clone() From 716425314dc08741270410a020a506243d9276b9 Mon Sep 17 00:00:00 2001 From: kt Date: Fri, 8 Mar 2024 11:01:56 -0500 Subject: [PATCH 54/59] deprecate isomeric=False in drop_and_merge_duplicates --- espfit/utils/graphs.py | 55 ++++++++++++++++++++++++++++++------ 1 file changed, 48 insertions(+), 7 deletions(-) diff --git a/espfit/utils/graphs.py b/espfit/utils/graphs.py index 2ac2798..c886d07 100644 --- a/espfit/utils/graphs.py +++ b/espfit/utils/graphs.py @@ -116,11 +116,31 @@ def drop_and_merge_duplicates(self, save_merged_dataset=True, dataset_name='misc isomeric : boolean, default=True If True, then isomeric smiles will be used to identify unique molecules. - If False, then nonisomeric smiles will be used to identify unique molecules. - Note that partial charges will be averaged for the same nonisomeric smiles. - This is because different 3D structures can have different partial charges - due to different conformations. - + Note: isomeric=False is deprecated at the moment. + There is no guarantee that the atom order is consistent across different molecules + with the same nonisomeric smiles. Inconsistent atom order results in different + mapped smiles (atom mapping). To concatenate graphs with different mapped smiles + but with the same nonisomeric smiles, the atom order needs to be fixed. + This is not implemented yet. + + For example, different mapped smiles for the same nonisomeric smiles: + + [H:21][c:1]1[c:2]([c:4]([c:7]([c:5]([c:3]1[H:23])[H:25])[N:14]=[N:15][C:8]2=[C:10]3[N:16]\ + ([C:9](=[C:6]([C:11](=[O:19])[N:18]3[N:17]([C:12]2=[O:20])[H:31])[H:26])[C:13]([H:27])\ + ([H:28])[H:29])[H:30])[H:24])[H:22] + + [H:22][c:11]1[c:12]([c:14]([c:16]([c:15]([c:13]1[H:24])[H:26])[N:4]=[N:3][C:8]2=[C:9]3[N:17]\ + ([C:10](=[C:7]([C:6](=[O:2])[N:19]3[N:18]([C:5]2=[O:1])[H:28])[H:21])[C:20]([H:29])\ + ([H:30])[H:31])[H:27])[H:25])[H:23] + + This will give you different g.nodes['n2'].data['idxs'] which is problematic when + concatenating graphs with different mapped smiles but the same nonisomeric smiles. + + #If False, then nonisomeric smiles will be used to identify unique molecules. + #Note that partial charges will be averaged for the same nonisomeric smiles. + #This is because different 3D structures can have different partial charges + #due to different conformations. 
+ Returns ------- None @@ -134,7 +154,8 @@ def drop_and_merge_duplicates(self, save_merged_dataset=True, dataset_name='misc if isomeric == True: _logger.info(f'Drop and merge duplicate isomeric smiles') else: - _logger.info(f'Drop and merge duplicate nonisomeric smiles') + #_logger.info(f'Drop and merge duplicate nonisomeric smiles') + raise ImportError(f'isomeric=False is deprecated at the moment') smiles = [ g.mol.to_smiles(isomeric=isomeric, explicit_hydrogens=True, mapped=False) for g in self.graphs ] _logger.info(f'Found {len(smiles)} molecules') @@ -219,7 +240,7 @@ def subtract_nonbonded_interactions(self, subtract_vdw=False, subtract_ele=True) new_graphs = [] from espaloma.data.md import subtract_nonbonded_force - for i, g in enumerate(self.graphs): + for g in self.graphs: # `espaloma.data.md.subtract_nonbonded_force` will update g.nodes['g'].data['u_ref'] and g.nodes['g'].data['u_ref_prime'] in place. # Clone QM reference into g.nodes['g'].data['u_qm'] and g.nodes['g'].data['u_qm_prime'], if not exist if 'u_qm' not in g.nodes['g'].data.keys(): @@ -608,6 +629,14 @@ def _merge_graphs(ds): import copy import torch + """ + # + # NOTE + # ---- + # THIS IS DEPRECATED AT THE MOMENT. SEE LINE 116 FOR MORE DETAILS. + # KEEPING THIS FOR FUTURE REFERENCE. + # + # Check if all inputs are equivalent (isomeric smiles) # If not, get average partial charges across different isomeric smiles (molecules) isomeric_smiles = [g.mol.to_smiles(isomeric=True, explicit_hydrogens=True, mapped=False) for g in ds] @@ -639,6 +668,18 @@ def _merge_graphs(ds): np.testing.assert_array_equal(ds[0].nodes['g'].data[key].flatten().numpy(), ds[i].nodes['g'].data[key].flatten().numpy()) for key in ["q_ref", "idxs", "h0"]: np.testing.assert_array_equal(ds[0].nodes['n1'].data[key].flatten().numpy(), ds[i].nodes['n1'].data[key].flatten().numpy()) + """ + + # Check if graphs are equivalent + for i in range(1, len(ds)): + mapped_smiles = ds[0].mol.to_smiles(isomeric=True, explicit_hydrogens=True, mapped=True) + mapped_smiles_i = ds[i].mol.to_smiles(isomeric=True, explicit_hydrogens=True, mapped=True) + assert mapped_smiles == mapped_smiles_i, f"Mapped isomeric smiles are not equivalent: {mapped_smiles} != {mapped_smiles_i}" + # Other node features + for key in ["sum_q"]: + np.testing.assert_array_equal(ds[0].nodes['g'].data[key].flatten().numpy(), ds[i].nodes['g'].data[key].flatten().numpy()) + for key in ["q_ref", "idxs", "h0"]: + np.testing.assert_array_equal(ds[0].nodes['n1'].data[key].flatten().numpy(), ds[i].nodes['n1'].data[key].flatten().numpy()) # Merge graphs g = copy.deepcopy(ds[0]) From 7490db5f2545789690cf9878da316953b27e7c33 Mon Sep 17 00:00:00 2001 From: kt Date: Fri, 8 Mar 2024 21:40:17 -0500 Subject: [PATCH 55/59] reflect changes in drop_duplicates in graphs.py --- espfit/tests/test_utils_graphs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/espfit/tests/test_utils_graphs.py b/espfit/tests/test_utils_graphs.py index ac94543..0446290 100644 --- a/espfit/tests/test_utils_graphs.py +++ b/espfit/tests/test_utils_graphs.py @@ -104,7 +104,7 @@ def test_load_dataset_multiple(mydata_gen2_torsion_sm, mydata_protein_torsion_sm assert sum(nconfs) == 5636, 'Total number of conformations does not match' -def test_drop_and_merge_duplicates(mydata_gen2_torsion_sm, tmpdir): +def test_drop_duplicates(mydata_gen2_torsion_sm, tmpdir): """Test function to drop and merge duplicate molecules. 
Parameters @@ -119,7 +119,7 @@ def test_drop_and_merge_duplicates(mydata_gen2_torsion_sm, tmpdir): """ ds = mydata_gen2_torsion_sm temporary_directory = tmpdir.mkdir('misc') - ds.drop_and_merge_duplicates(save_merged_dataset=True, dataset_name='misc', output_directory_path=str(temporary_directory)) + ds.drop_duplicates(isomeric=False, keep=True, save_merged_dataset=True, dataset_name='misc', output_directory_path=str(temporary_directory)) nconfs = [ g.nodes['g'].data['u_ref'].shape[1] for g in ds ] # Sort the list of nconfs. For some reason, the order of the list is not consistent when running the test locally and on GitHub CI. #assert nconfs == [24, 13, 24, 24, 24, 72], 'Number of molecular conformers does not match' From 1c844106e19ace56d036a41e5837a3ec5a7cc03c Mon Sep 17 00:00:00 2001 From: kt Date: Fri, 8 Mar 2024 21:40:43 -0500 Subject: [PATCH 56/59] use nonisomeric smiles to detect unique molecules --- espfit/utils/graphs.py | 248 ++++++++++++++++++++--------------------- 1 file changed, 124 insertions(+), 124 deletions(-) diff --git a/espfit/utils/graphs.py b/espfit/utils/graphs.py index c886d07..7a00451 100644 --- a/espfit/utils/graphs.py +++ b/espfit/utils/graphs.py @@ -18,7 +18,7 @@ class CustomGraphDataset(GraphDataset): Methods ------- - drop_and_merge_duplicates(save_merged_dataset=True, dataset_name='misc', output_directory_path=None): + drop_duplicates(isomeric=False, keep=True, save_merged_dataset=True, dataset_name='misc', output_directory_path=None): Drop and merge duplicate nonisomeric smiles across different data sources. subtract_nonbonded_interactions(subtract_vdw=False, subtract_ele=True): @@ -53,7 +53,7 @@ class CustomGraphDataset(GraphDataset): >>> ds = GraphDataset.load(path) >>> # Drop and merge duplicate molecules. Save merged dataset as a new dataset. >>> # If `output_directory_path` is None, then the current working directory is used. - >>> ds.drop_and_merge_duplicates(save_merged_dataset=True, dataset_name='misc', output_directory_path=None) + >>> ds.drop_duplicates(isomeric=False, keep=True, save_merged_dataset=True, dataset_name='misc', output_directory_path=None) >>> # Subtract nonbonded energies and forces from QC reference (e.g. subtract all valence and ele interactions) >>> # This will update u_ref and u_ref_relative in-place. copy of raw u_ref (QM reference) will be copied to u_qm. >>> ds.subtract_nonbonded_interactions(subtract_vdw=False, subtract_ele=True) @@ -96,34 +96,25 @@ def __init__(self, graphs=[], reference_forcefield='openff-2.1.0', random_seed=2 self.random_seed = random_seed - def drop_and_merge_duplicates(self, save_merged_dataset=True, dataset_name='misc', output_directory_path=None, isomeric=True): - """Drop and merge duplicate (non)isomeric smiles within the dataset. + def drop_duplicates(self, isomeric=False, keep=True, save_merged_dataset=True, dataset_name='misc', output_directory_path=None): + """Drop duplicate (non)isomeric smiles within the dataset. Modifies list of esp.Graph's in place. Parameters ---------- - save_merged_datest : boolean, default=True - If True, then merged datasets will be saved as a new dataset. - - dataset_name : str, default=misc - Name of the merged dataset. + isomeric : boolean, default=False + If True, then duplicated molecules are merged based on isomeric mapped smiles. + If False, then duplicated molecules are merged based on nonisomeric mapped smiles. - output_directory_path : str, default=None - Output directory path to save the merged dataset. - If None, then the current working directory is used. 
+ Unique molecules are identified by nonisomeric non-mapped smiles. + Duplicated molecules (nonisomeric smiles) are merged into a single molecule based on + the isomeric mapped smiles (isomeric=True) or nonisomeric mapped smiles (isomeric=False). - isoemric : boolean, default=True - If True, then isomeric smiles will be used to identify unique molecules. - - Note: isomeric=False is deprecated at the moment. - There is no guarantee that the atom order is consistent across different molecules - with the same nonisomeric smiles. In consistent atom order, results in different - mapped smiles (atom mapping). To concatenate graphs with different mapped smiles - but with the same nonisomeric smiles, atom orders needs to be fixed. - This is not implemented yet. - - For example, different mapped smiles for the same nonisomeric smiles: + Note that there is no guarantee that the atom order (mapping) is consistent across different + molecules with the same (non)isomeric smiles. + + For example, molecules with same nonisomeric smiles could have different mapped smiles: [H:21][c:1]1[c:2]([c:4]([c:7]([c:5]([c:3]1[H:23])[H:25])[N:14]=[N:15][C:8]2=[C:10]3[N:16]\ ([C:9](=[C:6]([C:11](=[O:19])[N:18]3[N:17]([C:12]2=[O:20])[H:31])[H:26])[C:13]([H:27])\ @@ -133,13 +124,28 @@ def drop_and_merge_duplicates(self, save_merged_dataset=True, dataset_name='misc ([C:10](=[C:7]([C:6](=[O:2])[N:19]3[N:18]([C:5]2=[O:1])[H:28])[H:21])[C:20]([H:29])\ ([H:30])[H:31])[H:27])[H:25])[H:23] - This will give you different g.nodes['n2'].data['idxs'] which is problematic when - concatenating graphs with different mapped smiles but the same nonisomeric smiles. + This will give different atom ordering, leading to, for example, different + bond atom index (g.nodes['n2'].data['idxs']). - #If False, then nonisomeric smiles will be used to identify unique molecules. - #Note that partial charges will be averaged for the same nonisomeric smiles. - #This is because different 3D structures can have different partial charges - #due to different conformations. + To alleviate this issue, nonisomeric smiles without atom mapping is used to identify + unique molecules and remove any duplicated molecules. Then, duplicated molecules are + merged into a single molecule based on the isomeric mapped smiles (isomeric=True) or + nonisomeric mapped smiles (isomeric=False). + + keep : boolean, default=True + If True, then duplicate entries dropped from the dataset will be added back to the unique entries + after the dropped duplicated entries are merged into a single molecule. If False, then duplicated + entries dropped will be removed. + + save_merged_datest : boolean, default=True + If True, then duplicated molecules are merged into a single molecule and saved as a new dataset. + + dataset_name : str, default=misc + Name of the merged dataset. + + output_directory_path : str, default=None + Output directory path to save the merged dataset. + If None, then the current working directory is used. 
Returns ------- None """ import os if output_directory_path == None: output_directory_path = os.getcwd() + _logger.info(f'Remove duplicated nonisomeric smiles from dataset') if isomeric == True: - _logger.info(f'Drop and merge duplicate isomeric smiles') + _logger.info(f'Merge duplicated nonisomeric smiles into unique isomeric mapped smiles') else: - #_logger.info(f'Drop and merge duplicate nonisomeric smiles') - raise ImportError(f'isomeric=False is deprecated at the moment') - smiles = [ g.mol.to_smiles(isomeric=isomeric, explicit_hydrogens=True, mapped=False) for g in self.graphs ] - _logger.info(f'Found {len(smiles)} molecules') + _logger.info(f'Merge duplicated nonisomeric smiles into unique nonisomeric mapped smiles') + + # Get smiles + nonisomeric_smiles = [ g.mol.to_smiles(isomeric=False, explicit_hydrogens=True, mapped=False) for g in self.graphs ] + nonisomeric_mapped_smiles = [ g.mol.to_smiles(isomeric=False, explicit_hydrogens=True, mapped=True) for g in self.graphs ] + isomeric_mapped_smiles = [ g.mol.to_smiles(isomeric=True, explicit_hydrogens=True, mapped=True) for g in self.graphs ] + _logger.info(f'Found {len(self.graphs)} graph entries') + + # Create pandas dataframe + df = pd.DataFrame.from_dict({'nonisomeric_smiles': nonisomeric_smiles, 'nonisomeric_mapped_smiles': nonisomeric_mapped_smiles, 'isomeric_mapped_smiles': isomeric_mapped_smiles}) + _logger.info(f'Unique nonisomeric smiles: {len(df.nonisomeric_smiles.unique())}') + _logger.info(f'Unique nonisomeric mapped smiles: {len(df.nonisomeric_mapped_smiles.unique())}') + _logger.info(f'Unique isomeric mapped smiles: {len(df.isomeric_mapped_smiles.unique())}') + + # Get unique and duplicated entries using nonisomeric smiles (non-mapped) + unique_index = df.nonisomeric_smiles.drop_duplicates(keep=False).index.to_list() unique_graphs = [self.graphs[_idx] for _idx in unique_index] - _logger.info(f'Found {len(unique_index)} unique molecules') + _logger.info(f'Drop all duplicated nonisomeric smiles from the dataset (unique nonisomeric smiles: {len(unique_index)})') - # Duplicated entries - index = df.duplicated(keep=False) # Mark all duplicate entries True + index = df.nonisomeric_smiles.duplicated(keep=False) # Mark all duplicate entries True duplicated_index = df[index].index.to_list() - _logger.info(f'Found {len(duplicated_index)} duplicated molecules') + assert len(unique_index) + len(duplicated_index) == len(self.graphs), \ + f'Unique + duplicated nonisomeric smiles: {len(unique_index)} + {len(duplicated_index)} != total dataset ({len(self.graphs)})' - # Get unique smiles and assign new molecule name `e.g. 
mol0001` - duplicated_df = df.iloc[duplicated_index] - duplicated_smiles = duplicated_df.smiles.unique().tolist() - molnames = [ f'mol{i:04d}' for i in range(len(duplicated_smiles)) ] - _logger.info(f'Found {len(molnames)} unique molecules within duplicate entries') - - # Merge duplicate entries into a new single graph - duplicated_graphs = [] - molnames_dict = {} - for molname, duplicated_smile in zip(molnames, duplicated_smiles): - # Map new molecule name with its unique smiles and dataframe indices - index = duplicated_df[duplicated_df['smiles'] == duplicated_smile].index.tolist() - molnames_dict[molname] = {'smiles': duplicated_smiles, 'index': index} - # Merge graphs - g = self._merge_graphs([self.graphs[_idx] for _idx in index]) - duplicated_graphs.append(g) - # Save graphs (optional) - if save_merged_dataset == True: - # Notes: Create a temporary directory, `_output_directory_path`, to support pytest in test_utils_graphs.py. - # Temporary directory needs to be created beforehand for `test_drop_and_merge_duplicates`. - _output_directory_path = os.path.join(output_directory_path, dataset_name) - os.makedirs(_output_directory_path, exist_ok=True) - new_output_directory_path = os.path.join(_output_directory_path, molname) - g.save(new_output_directory_path) - - # Update in place - new_graphs = unique_graphs + duplicated_graphs - _logger.info(f'Graph dataset reconstructed: {len(new_graphs)} unique molecules') + if keep == True: + if isomeric == True: + _logger.info(f'Merge dropped duplicated nonisomeric smiles into unique isomeric mapped smiles') + else: + _logger.info(f'Merge dropped duplicated nonisomeric smiles into unique nonisomeric mapped smiles') + + # Get unique (non)isomeric mapped smiles from duplicated nonisomeric smiles (non-mapped) and assign new molecule name `e.g. mol0001` + # Use copy() to prevent SettingWithCopyWarning when assigning new values to a new column + duplicated_df = df.iloc[duplicated_index].copy() + if isomeric == True: + duplicated_smiles = duplicated_df.isomeric_mapped_smiles.unique().tolist() + duplicated_df['smiles'] = duplicated_df.isomeric_mapped_smiles + _logger.info(f'Found {len(duplicated_smiles)} unique isomeric mapped smiles within duplicated {len(duplicated_index)} nonisomeric smiles') + else: + duplicated_smiles = duplicated_df.nonisomeric_mapped_smiles.unique().tolist() + duplicated_df['smiles'] = duplicated_df.nonisomeric_mapped_smiles + _logger.info(f'Found {len(duplicated_smiles)} unique nonisomeric mapped smiles within duplicated {len(duplicated_index)} nonisomeric smiles') + molnames = [ f'mol{i:04d}' for i in range(len(duplicated_smiles)) ] + + # Merge duplicate entries into a new single graph + duplicated_graphs = [] + #molnames_dict = {} # This is never used but keep this to export the dictionary? + for molname, duplicated_smile in zip(molnames, duplicated_smiles): + # Map new molecule name with its unique smiles and dataframe indices + index = duplicated_df[duplicated_df['smiles'] == duplicated_smile].index.tolist() + #molnames_dict[molname] = {'smiles': duplicated_smiles, 'index': index} + # Merge graphs + g = self._merge_graphs(subset=[self.graphs[_idx] for _idx in index], isomeric_flag=isomeric) + duplicated_graphs.append(g) + # Save graphs (optional) + if save_merged_dataset == True: + # Notes: Create a temporary directory, `_output_directory_path`, to support pytest in test_utils_graphs.py. + # Temporary directory needs to be created beforehand for `test_drop_and_merge_duplicates`. 
+ _output_directory_path = os.path.join(output_directory_path, dataset_name) + os.makedirs(_output_directory_path, exist_ok=True) + new_output_directory_path = os.path.join(_output_directory_path, molname) + g.save(new_output_directory_path) + + new_graphs = unique_graphs + duplicated_graphs + _logger.info(f'Add back {len(duplicated_graphs)} merged duplicated (non)isomeric mapped smiles into the dataset') + _logger.info(f'Dataset reconstructed: {len(new_graphs)} unique molecules') + else: + new_graphs = unique_graphs + _logger.info(f'Dataset reconstructed: {len(new_graphs)} unique molecules') self.graphs = new_graphs del unique_graphs, duplicated_graphs, df, duplicated_df @@ -611,12 +642,12 @@ def _remove_node_features(self): @staticmethod - def _merge_graphs(ds): + def _merge_graphs(subset, isomeric_flag): """Merge multiple Graph instances into a single Graph. Parameters ---------- - ds : list of espaloma.graphs.graph.Graph, default=None + subset : list of espaloma.graphs.graph.Graph, default=None The list of Graph instances to be merged. All Graphs in the list must be equivalent. Returns @@ -629,68 +660,37 @@ def _merge_graphs(ds): import copy import torch - """ - # - # NOTE - # ---- - # THIS IS DEPRECATED AT THE MOMENT. SEE LINE 116 FOR MORE DETAILS. - # KEEPING THIS FOR FUTURE REFERENCE. - # - - # Check if all inputs are equivalent (isomeric smiles) - # If not, get average partial charges across different isomeric smiles (molecules) - isomeric_smiles = [g.mol.to_smiles(isomeric=True, explicit_hydrogens=True, mapped=False) for g in ds] - unique_isomeric_smiles = set(isomeric_smiles) - if len(unique_isomeric_smiles) != 1: - n_atoms = ds[0].nodes['n1'].data['q_ref'].shape[0] - q_ref = torch.zeros(n_atoms, 1) - for unique_isomeric_smile in unique_isomeric_smiles: - index = [i for i, isomeric_smile in enumerate(isomeric_smiles) if isomeric_smile in unique_isomeric_smile][0] - q_ref += ds[index].nodes['n1'].data['q_ref'] - q_ref = q_ref / len(set(isomeric_smiles)) - # Update partial charges in-place - for i in range(len(ds)): - ds[i].nodes['n1'].data['q_ref'] = q_ref - # Check if graphs are equivalent - for i in range(1, len(ds)): - # Mapped isomeric smiles - if len(unique_isomeric_smiles) != 1: - mapped_smiles = ds[0].mol.to_smiles(isomeric=False, explicit_hydrogens=True, mapped=True) - mapped_smiles_i = ds[i].mol.to_smiles(isomeric=False, explicit_hydrogens=True, mapped=True) - assert mapped_smiles == mapped_smiles_i, f"Mapped nonisomeric smiles are not equivalent: {mapped_smiles} != {mapped_smiles_i}" + for i in range(1, len(subset)): + if isomeric_flag == True: + mapped_smiles = subset[0].mol.to_smiles(isomeric=True, explicit_hydrogens=True, mapped=True) + mapped_smiles_i = subset[i].mol.to_smiles(isomeric=True, explicit_hydrogens=True, mapped=True) + assert mapped_smiles == mapped_smiles_i, f"Isomeric mapped smiles are not equivalent: {mapped_smiles} != {mapped_smiles_i}" else: - mapped_smiles = ds[0].mol.to_smiles(isomeric=True, explicit_hydrogens=True, mapped=True) - mapped_smiles_i = ds[i].mol.to_smiles(isomeric=True, explicit_hydrogens=True, mapped=True) - assert mapped_smiles == mapped_smiles_i, f"Mapped isomeric smiles are not equivalent: {mapped_smiles} != {mapped_smiles_i}" + mapped_smiles = subset[0].mol.to_smiles(isomeric=False, explicit_hydrogens=True, mapped=True) + mapped_smiles_i = subset[i].mol.to_smiles(isomeric=False, explicit_hydrogens=True, mapped=True) + assert mapped_smiles == mapped_smiles_i, f"Nonisomeric mapped smiles are not equivalent: {mapped_smiles} != 
{mapped_smiles_i}" # Other node features for key in ["sum_q"]: - np.testing.assert_array_equal(ds[0].nodes['g'].data[key].flatten().numpy(), ds[i].nodes['g'].data[key].flatten().numpy()) - for key in ["q_ref", "idxs", "h0"]: - np.testing.assert_array_equal(ds[0].nodes['n1'].data[key].flatten().numpy(), ds[i].nodes['n1'].data[key].flatten().numpy()) - """ + np.testing.assert_array_equal(subset[0].nodes['g'].data[key].flatten().numpy(), subset[i].nodes['g'].data[key].flatten().numpy()) + for key in ["q_ref", "h0"]: + np.testing.assert_array_equal(subset[0].nodes['n1'].data[key].flatten().numpy(), subset[i].nodes['n1'].data[key].flatten().numpy()) + # As long as mapped smiles are the same, we don't need to compare n1, n2, n3 nodes. Maybe we don't need the above either? + #np.testing.assert_array_equal(subset[0].nodes['n1'].data['idxs'].flatten().numpy(), subset[i].nodes['n1'].data['idxs'].flatten().numpy()) + #np.testing.assert_array_equal(subset[0].nodes['n2'].data['idxs'].flatten().numpy(), subset[i].nodes['n2'].data['idxs'].flatten().numpy()) + #np.testing.assert_array_equal(subset[0].nodes['n3'].data['idxs'].flatten().numpy(), subset[i].nodes['n3'].data['idxs'].flatten().numpy()) - # Check if graphs are equivalent - for i in range(1, len(ds)): - mapped_smiles = ds[0].mol.to_smiles(isomeric=True, explicit_hydrogens=True, mapped=True) - mapped_smiles_i = ds[i].mol.to_smiles(isomeric=True, explicit_hydrogens=True, mapped=True) - assert mapped_smiles == mapped_smiles_i, f"Mapped isomeric smiles are not equivalent: {mapped_smiles} != {mapped_smiles_i}" - # Other node features - for key in ["sum_q"]: - np.testing.assert_array_equal(ds[0].nodes['g'].data[key].flatten().numpy(), ds[i].nodes['g'].data[key].flatten().numpy()) - for key in ["q_ref", "idxs", "h0"]: - np.testing.assert_array_equal(ds[0].nodes['n1'].data[key].flatten().numpy(), ds[i].nodes['n1'].data[key].flatten().numpy()) # Merge graphs - g = copy.deepcopy(ds[0]) + g = copy.deepcopy(subset[0]) for key in g.nodes['g'].data.keys(): if key not in ["sum_q"]: - for i in range(1, len(ds)): - g.nodes['g'].data[key] = torch.cat((g.nodes['g'].data[key], ds[i].nodes['g'].data[key]), dim=-1) + for i in range(1, len(subset)): + g.nodes['g'].data[key] = torch.cat((g.nodes['g'].data[key], subset[i].nodes['g'].data[key]), dim=-1) for key in g.nodes['n1'].data.keys(): if key not in ["q_ref", "idxs", "h0"]: - for i in range(1, len(ds)): - g.nodes['n1'].data[key] = torch.cat((g.nodes['n1'].data[key], ds[i].nodes['n1'].data[key]), dim=1) + for i in range(1, len(subset)): + g.nodes['n1'].data[key] = torch.cat((g.nodes['n1'].data[key], subset[i].nodes['n1'].data[key]), dim=1) return g From cf5a36db15f951a4a6e6175602c5c65270366c27 Mon Sep 17 00:00:00 2001 From: kt Date: Sun, 10 Mar 2024 15:01:44 -0400 Subject: [PATCH 57/59] fix bug in averaging partial charges for duplicate entries --- espfit/utils/graphs.py | 65 +++++++++++++++++++++++++++++++++--------- 1 file changed, 52 insertions(+), 13 deletions(-) diff --git a/espfit/utils/graphs.py b/espfit/utils/graphs.py index 7a00451..a9f7c20 100644 --- a/espfit/utils/graphs.py +++ b/espfit/utils/graphs.py @@ -660,26 +660,65 @@ def _merge_graphs(subset, isomeric_flag): import copy import torch + if isomeric_flag == True: + mapped_smiles = subset[0].mol.to_smiles(isomeric=True, explicit_hydrogens=True, mapped=True) + else: + mapped_smiles = subset[0].mol.to_smiles(isomeric=False, explicit_hydrogens=True, mapped=True) + _logger.info(f'Merge {len(subset)} graphs: {mapped_smiles}') + # Check if graphs are 
From cf5a36db15f951a4a6e6175602c5c65270366c27 Mon Sep 17 00:00:00 2001
From: kt
Date: Sun, 10 Mar 2024 15:01:44 -0400
Subject: [PATCH 57/59] fix bug in averaging partial charges for duplicate entries

---
 espfit/utils/graphs.py | 65 +++++++++++++++++++++++++++++++++---------
 1 file changed, 52 insertions(+), 13 deletions(-)

diff --git a/espfit/utils/graphs.py b/espfit/utils/graphs.py
index 7a00451..a9f7c20 100644
--- a/espfit/utils/graphs.py
+++ b/espfit/utils/graphs.py
@@ -660,26 +660,65 @@ def _merge_graphs(subset, isomeric_flag):
         import copy
         import torch
 
+        if isomeric_flag == True:
+            mapped_smiles = subset[0].mol.to_smiles(isomeric=True, explicit_hydrogens=True, mapped=True)
+        else:
+            mapped_smiles = subset[0].mol.to_smiles(isomeric=False, explicit_hydrogens=True, mapped=True)
+        _logger.info(f'Merge {len(subset)} graphs: {mapped_smiles}')
+        # Check if graphs are equivalent
+        charge_index = []  # book keep indices with inconsistent partial charges
+        atol = rtol = 1e-2  # charge tolerance
         for i in range(1, len(subset)):
             if isomeric_flag == True:
-                mapped_smiles = subset[0].mol.to_smiles(isomeric=True, explicit_hydrogens=True, mapped=True)
                 mapped_smiles_i = subset[i].mol.to_smiles(isomeric=True, explicit_hydrogens=True, mapped=True)
                 assert mapped_smiles == mapped_smiles_i, f"Isomeric mapped smiles are not equivalent: {mapped_smiles} != {mapped_smiles_i}"
             else:
-                mapped_smiles = subset[0].mol.to_smiles(isomeric=False, explicit_hydrogens=True, mapped=True)
                 mapped_smiles_i = subset[i].mol.to_smiles(isomeric=False, explicit_hydrogens=True, mapped=True)
-                assert mapped_smiles == mapped_smiles_i, f"Nonisomeric mapped smiles are not equivalent: {mapped_smiles} != {mapped_smiles_i}"
+                assert mapped_smiles == mapped_smiles_i, f"Nonisomeric mapped smiles are not equivalent: {mapped_smiles} != {mapped_smiles_i}"
+            # Net charge
+            np.testing.assert_array_equal(subset[0].nodes['g'].data['sum_q'].flatten().numpy(), subset[i].nodes['g'].data['sum_q'].flatten().numpy())
+            # Input node features
+            np.testing.assert_array_equal(subset[0].nodes['n1'].data['h0'].flatten().numpy(), subset[i].nodes['n1'].data['h0'].flatten().numpy())
+            # Atom ordering: As long as mapped smiles are the same, we don't need to compare n1, n2, n3 nodes?
+            np.testing.assert_array_equal(subset[0].nodes['n1'].data['idxs'].flatten().numpy(), subset[i].nodes['n1'].data['idxs'].flatten().numpy())
+            np.testing.assert_array_equal(subset[0].nodes['n2'].data['idxs'].flatten().numpy(), subset[i].nodes['n2'].data['idxs'].flatten().numpy())
+            np.testing.assert_array_equal(subset[0].nodes['n3'].data['idxs'].flatten().numpy(), subset[i].nodes['n3'].data['idxs'].flatten().numpy())
+            # Partial charges: There could be inconsistency due to different 3D conformers generated during partial charge calculation process.
+            charge_boolean = np.allclose(subset[0].nodes['n1'].data['q_ref'].flatten().numpy(), subset[i].nodes['n1'].data['q_ref'].flatten().numpy(), rtol=rtol, atol=atol)
+            if charge_boolean == False:
+                charge_diff = np.abs(subset[0].nodes['n1'].data['q_ref'].flatten().numpy() - subset[i].nodes['n1'].data['q_ref'].flatten().numpy())
+                _logger.warning(f"Entry {i}: Maximum charge difference {charge_diff.max()} is higher than {atol} when compared to the first graph")
+                charge_index.append(i)
+
+        # Handle partial charges if inconsistent
+        if charge_index:
+            # Get indices with unique partial charges
+            # Book keep indices with unique partial charges starting from the first graph
+            unique_charge_index = [0]
+            for i in charge_index:
+                is_equal = []
+                for j in unique_charge_index:
+                    # Extract the arrays to compare
+                    arr_i = subset[i].nodes['n1'].data['q_ref'].flatten().numpy()
+                    arr_j = subset[j].nodes['n1'].data['q_ref'].flatten().numpy()
+                    is_equal.append(np.array_equal(arr_i, arr_j))
+                # Check if all False
+                if not any(is_equal):
+                    unique_charge_index.append(i)
+            # Average partial charges
+            _logger.info(f'Average partial charges ({unique_charge_index})...')
+            q_ref = subset[0].nodes['n1'].data['q_ref']
+            _logger.info(f'Entry #0: {q_ref.flatten().numpy()}')
+            for index in unique_charge_index[1:]:
+                _q_ref = subset[index].nodes['n1'].data['q_ref']
+                _logger.info(f'Entry #{index}: {_q_ref.flatten().numpy()}')
+                q_ref += _q_ref
+            q_ref = q_ref / len(unique_charge_index)
+            # Update partial charges in-place
+            for i in range(len(subset)):
+                subset[i].nodes['n1'].data['q_ref'] = q_ref
+            _logger.info(f'Averaged partial charges: {subset[0].nodes["n1"].data["q_ref"].flatten().numpy()}')
 
         # Merge graphs
         g = copy.deepcopy(subset[0])
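
Illustrative sketch (not part of the patch series): the fix above averages `q_ref` only over charge sets that are genuinely distinct. Entries whose charges differ from the first graph beyond the tolerance are flagged, then deduplicated by exact comparison before averaging, so bitwise-identical copies are not double-counted. A simplified sketch of that deduplicate-then-average logic with plain numpy (charge values invented for illustration):

    import numpy as np

    # One charge array per duplicate graph of a two-atom molecule
    charge_sets = [
        np.array([0.10, -0.10]),  # entry 0 (reference)
        np.array([0.10, -0.10]),  # identical to entry 0 -> not counted again
        np.array([0.16, -0.16]),  # differs by > atol from entry 0 -> flagged
    ]

    atol = rtol = 1e-2  # same tolerance as in the patch
    flagged = [i for i in range(1, len(charge_sets))
               if not np.allclose(charge_sets[0], charge_sets[i], rtol=rtol, atol=atol)]

    # Keep only charge sets not already seen, starting from entry 0
    unique = [charge_sets[0]]
    for i in flagged:
        if not any(np.array_equal(charge_sets[i], u) for u in unique):
            unique.append(charge_sets[i])

    # Out-of-place sum, so entry 0's array is left untouched
    q_avg = np.sum(unique, axis=0) / len(unique)
    print(q_avg)  # [ 0.13 -0.13]
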
From 0963d0fc8b130c8c05bed1f8c552419a0ac3cfb0 Mon Sep 17 00:00:00 2001
From: kt
Date: Tue, 12 Mar 2024 10:05:04 -0400
Subject: [PATCH 58/59] rename drop_and_merge_duplicates to drop_duplicates

---
 espfit/tests/test_app_train.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/espfit/tests/test_app_train.py b/espfit/tests/test_app_train.py
index 05d29d5..1f671c6 100644
--- a/espfit/tests/test_app_train.py
+++ b/espfit/tests/test_app_train.py
@@ -44,7 +44,7 @@ def test_load_dataset(tmpdir):
 
     # Prepare input dataset ready for training
     temporary_directory = tmpdir.mkdir('misc')
-    ds.drop_and_merge_duplicates(save_merged_dataset=True, dataset_name='misc', output_directory_path=str(temporary_directory))
+    ds.drop_duplicates(isomeric=False, keep=True, save_merged_dataset=True, dataset_name='misc', output_directory_path=str(temporary_directory))
     ds.reshape_conformation_size(n_confs=50)
     ds.compute_relative_energy()
 

From fb97040f2e2ecaf948f52afdd5919fa3f1406d38 Mon Sep 17 00:00:00 2001
From: kt
Date: Tue, 12 Mar 2024 14:59:38 -0400
Subject: [PATCH 59/59] rename drop_and_merge_duplicates to drop_duplicates

---
 espfit/tests/test_app_train_sampler.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/espfit/tests/test_app_train_sampler.py b/espfit/tests/test_app_train_sampler.py
index 8d39e32..1292f4c 100644
--- a/espfit/tests/test_app_train_sampler.py
+++ b/espfit/tests/test_app_train_sampler.py
@@ -45,7 +45,7 @@ def test_load_dataset(tmpdir):
 
     # Prepare input dataset ready for training
     temporary_directory = tmpdir.mkdir('misc')
-    ds.drop_and_merge_duplicates(save_merged_dataset=True, dataset_name='misc', output_directory_path=str(temporary_directory))
+    ds.drop_duplicates(save_merged_dataset=True, dataset_name='misc', output_directory_path=str(temporary_directory))
     ds.reshape_conformation_size(n_confs=50)
     ds.compute_relative_energy()
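
Illustrative sketch (not part of the patch series): patches 58 and 59 update the tests to the renamed API, and the surrounding dataset-preparation sequence then reads as below. The loader class name, its `load()` classmethod, and the input/output paths are assumptions for illustration only; the tests construct `ds` earlier, outside the excerpts shown here.

    from espfit.utils.graphs import CustomGraphDataset  # assumed class name

    ds = CustomGraphDataset.load('/path/to/dgl/graphs')  # hypothetical input path

    # Merge duplicate entries into unique (non)isomeric mapped smiles
    ds.drop_duplicates(isomeric=False, keep=True,
                       save_merged_dataset=True,
                       dataset_name='misc',
                       output_directory_path='/tmp/misc')

    # Reshape to 50 conformations per entry and train on relative energies
    ds.reshape_conformation_size(n_confs=50)
    ds.compute_relative_energy()
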