From 33bbababddac14c48af5728522a23b59f6a43c68 Mon Sep 17 00:00:00 2001
From: kt
Date: Fri, 22 Mar 2024 10:00:31 -0400
Subject: [PATCH 01/11] add data/target and data/forcefield to package

---
 pyproject.toml | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index bffcfc8..4247bea 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -63,7 +63,10 @@ where = ["."]
 # Ref https://setuptools.pypa.io/en/latest/userguide/datafiles.html#package-data
 [tool.setuptools.package-data]
 espfit = [
-    "py.typed"
+    "py.typed",
+    "data/target/nucleoside/*/target.pdb",
+    "data/target/nucleoside/*/experiment.yml",
+    "data/forcefield/espaloma-0.3.2.pt"
 ]

 [tool.versioningit]

From dedf30d6eaaca8f29a7a194f4035d904321cbe80 Mon Sep 17 00:00:00 2001
From: kt
Date: Fri, 22 Mar 2024 10:02:13 -0400
Subject: [PATCH 02/11] fix #13 and #14

---
 espfit/app/sampler.py | 67 +++++++++++++++++++++++++++++--------------
 1 file changed, 46 insertions(+), 21 deletions(-)

diff --git a/espfit/app/sampler.py b/espfit/app/sampler.py
index eb211e5..8efc348 100644
--- a/espfit/app/sampler.py
+++ b/espfit/app/sampler.py
@@ -38,8 +38,8 @@ class BaseSimulation(object):
     export_xml(exportSystem=True, exportState=True, exportIntegrator=True, output_directory_path=None):
         Export serialized system XML file and solvated pdb file.
     """
-    def __init__(self, maxIterations=100, nsteps=250000, atomSubset='solute',
-                 checkpoint_frequency=25000, logging_frequency=250000, netcdf_frequency=250000,
+    def __init__(self, maxIterations=100, nsteps=2500000, atomSubset='solute',
+                 checkpoint_frequency=250000, logging_frequency=250000, netcdf_frequency=250000,
                  output_directory_path=None, input_directory_path=None):
         """Initialize base simulation object.

@@ -48,19 +48,19 @@ def __init__(self, maxIterations=100, nsteps=250000, atomSubset='solute',
         maxIterations : int, default=100
             Maximum number of iterations to perform minimization.

-        nsteps : int, default=250000 (10 ns using 4 fs timestep)
+        nsteps : int, default=2500000 (10 ns using 4 fs timestep)
             Number of steps to run the simulation.

         atomSubset : str, default='solute'
             Subset of atoms to save. Default is 'solute'. Other options 'all' and 'not water'.

-        checkpoint_frequency : int, default=25000 (1 ns)
+        checkpoint_frequency : int, default=250000 (1 ns)
             Frequency (in steps) at which to write checkpoint files.

-        logging_frequency : int, default=250000 (10 ns)
+        logging_frequency : int, default=250000 (1 ns)
             Frequency (in steps) at which to write logging files.

-        netcdf_frequency : int, default=250000 (10 ns)
+        netcdf_frequency : int, default=250000 (1 ns)
             Frequency (in steps) at which to write netcdf files.

         output_directory_path : str, optional
@@ -336,7 +336,7 @@ class SetupSampler(BaseSimulation):
     ['amber/protein.ff14SB.xml', 'amber/RNA.OL3.xml'] : pl-multi (TPO): NG, pl-single: OK, RNA: OK
     """
     def __init__(self,
-                 small_molecule_forcefield='espfit/data/forcefield/espaloma-0.3.2.pt',
+                 small_molecule_forcefield='espaloma-0.3.2',
                  forcefield_files = ['amber/ff14SB.xml', 'amber/phosaa14SB.xml'],
                  water_model='tip3p',
                  solvent_padding=9.0 * unit.angstroms,
@@ -355,7 +355,8 @@ def __init__(self,
         Parameters
         ----------
         small_molecule_forcefield : str, optional
-            The force field to be used for small molecules. Default is 'openff-2.1.0'.
+            The force field to be used for small molecules. Default is 'espaloma-0.3.2'.
+            Alternative recommended choice is 'openff-2.1.0'.
         forcefield_files : list, optional
             List of force field files. Default is ['amber14-all.xml'].
         water_model : str, optional
@@ -378,8 +379,12 @@ def __init__(self,
             The integration timestep. Default is 4 * unit.femtoseconds.
         override_with_espaloma : bool, optional
             Whether to override the original parameters with espaloma. Default is True.
+            This will override all solute molecules with espaloma parameters.
         """
         super(SetupSampler, self).__init__(**kwargs)
+        if small_molecule_forcefield == 'espaloma-0.3.2':
+            from importlib.resources import files
+            small_molecule_forcefield = str(files('espfit').joinpath("data/forcefield/espaloma-0.3.2.pt"))
         self.small_molecule_forcefield = small_molecule_forcefield
         self.water_model = water_model
         self.forcefield_files = self._update_forcefield_files(forcefield_files)
@@ -397,9 +402,13 @@ def __init__(self,


     @classmethod
-    def from_toml(cls, filename, *args, **override_sampler_kwargs):
+    def _from_toml(cls, filename, *args, **override_sampler_kwargs):
         """Create SetupSampler from a TOML configuration file.

+        Note that this is designed for creating new systems with temporary espaloma models generated during
+        espaloma training. It supports multiple systems in the configuration file and returns a list of
+        SetupSampler instances.
+
         Parameters
         ----------
         filename : str
@@ -429,30 +438,48 @@ def from_toml(cls, filename, *args, **override_sampler_kwargs):
             print(e)
             raise

-        config = config['sampler']['setup'] # list
+        config = config['sampler']['setup'] # this could be list to support multiple systems
         if config is None:
             raise ValueError("target is not specified in the configuration file")

+        # If the configuration file contains a single system, convert it to a list
+        if not isinstance(config, list):
+            config = [config]
+
         samplers = []
         _logger.debug(f'Found {len(config)} systems in the configuration file')
         for _config in config:
             sampler = cls()

             # Target information
-            target_class = _config['target_class']
-            target_name = _config['target_name']
-
+            target_class = _config.get('target_class', None)
+            target_name = _config.get('target_name', None)
             sampler.target_class = target_class
             sampler.target_name = target_name

-            biopolymer_file = files('espfit').joinpath(f'data/target/{target_class}/{target_name}/target.pdb')
-            ligand_file = files('espfit').joinpath(f'data/target/{target_class}/{target_name}/ligand.sdf')
-            if not ligand_file.exists():
-                ligand_file = None
+            # Get biopolymer and ligand file if given
+            # Priority 1: Use input files if given
+            biopolymer_file, ligand_file = None, None
+            if _config.get('biopolymer_file'):
+                biopolymer_file = _config['biopolymer_file']
+                if not os.path.exists(biopolymer_file):
+                    raise FileNotFoundError(f"File not found: {biopolymer_file}")
+            if _config.get('ligand_file'):
+                ligand_file = _config['ligand_file']
+                if not os.path.exists(ligand_file):
+                    raise FileNotFoundError(f"File not found: {ligand_file}")
+            # Priority 2: Search espfit/data/target directory if input files are not given
+            if biopolymer_file is None and ligand_file is None:
+                biopolymer_file = files('espfit').joinpath(f'data/target/{target_class}/{target_name}/target.pdb')
+                ligand_file = files('espfit').joinpath(f'data/target/{target_class}/{target_name}/ligand.sdf')
+                if not biopolymer_file.exists():
+                    raise FileNotFoundError(f"File not found: {biopolymer_file}")
+                if not ligand_file.exists():
+                    ligand_file = None

             # System settings
             for key, value in _config.items():
-                if key not in ['target_class', 'target_name']:
+                if key not in ['target_class', 'target_name', 'biopolymer_file', 'ligand_file']:
                     if hasattr(sampler, key):
                         if isinstance(value, str) and "*" in value:
                            _value = float(value.split('*')[0].strip())
@@ -684,7 +711,7 @@ def create_system(self, biopolymer_file=None, ligand_file=None):
             # As a workaround, we will delete the original `self._system_generator` and create a new one to regenerate the system with espaloma.
             # Only water and ion forcefield files will be used to regenerate the system. Solute molecules will be parametrized with espaloma.
             #
-            _logger.debug("Regenerate system with espaloma.")
+            _logger.info("Regenerate system with espaloma")

             # Re-create system generator
             del self._system_generator
@@ -729,8 +756,6 @@ def _regenerate_espaloma_system(self):
         import mdtraj
         from openff.toolkit import Molecule

-        _logger.debug("Regenerate system with espaloma")
-
         # Check biopolymer chains
         mdtop = mdtraj.Topology.from_openmm(self.modeller_solvated_topology)
         chain_indices = [ chain.index for chain in self.modeller_solvated_topology.chains() ]

From 8529b32c5ce13016c38cf351901c00315c67b51f Mon Sep 17 00:00:00 2001
From: kt
Date: Fri, 22 Mar 2024 10:03:40 -0400
Subject: [PATCH 03/11] add retain_graph in train_sampler during espaloma training

---
 espfit/app/train.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/espfit/app/train.py b/espfit/app/train.py
index 0dd37a6..047e96a 100644
--- a/espfit/app/train.py
+++ b/espfit/app/train.py
@@ -515,7 +515,7 @@ def train_sampler(self, sampler_patience=800, neff_threshold=0.2, sampler_weight

                 loss, loss_dict = self.net(g)
                 loss = loss/accumulation_steps
-                loss.backward()
+                loss.backward(retain_graph=True)

                 if epoch > self.sampler_patience:
                     # Save checkpoint as local model (net.pt)
@@ -666,7 +666,7 @@ def _setup_local_samplers(self, epoch, net_copy, debug):
         }

         # Create sampler system from configuration file. Returns list of systems.
-        samplers = SetupSampler.from_toml(self.configfile, *args, **override_sampler_kwargs)
+        samplers = SetupSampler._from_toml(self.configfile, *args, **override_sampler_kwargs)

         return samplers
\ No newline at end of file

From 177ce743565d43072865578f27a50ad7204faa38 Mon Sep 17 00:00:00 2001
From: kt
Date: Fri, 22 Mar 2024 19:58:55 -0400
Subject: [PATCH 04/11] fix bug to support water models other than tip3p

---
 espfit/app/sampler.py | 61 ++++++++++++++++++++++++++++++-----------
 1 file changed, 46 insertions(+), 15 deletions(-)

diff --git a/espfit/app/sampler.py b/espfit/app/sampler.py
index 8efc348..863e6c7 100644
--- a/espfit/app/sampler.py
+++ b/espfit/app/sampler.py
@@ -387,7 +387,7 @@ def __init__(self,
             small_molecule_forcefield = str(files('espfit').joinpath("data/forcefield/espaloma-0.3.2.pt"))
         self.small_molecule_forcefield = small_molecule_forcefield
         self.water_model = water_model
-        self.forcefield_files = self._update_forcefield_files(forcefield_files)
+        self.forcefield_files = forcefield_files
         self.solvent_padding = solvent_padding
         self.ionic_strength = ionic_strength
         self.hmass = hmass
@@ -399,6 +399,10 @@ def __init__(self,
         self.override_with_espaloma = override_with_espaloma
         self.target_class = None
         self.target_name = None
+        # Update forcefield file list to add water model files
+        self._update_forcefield_files()
+        # Get water class (3-site: tip3p, 4-site model: tip4pew)
+        self._get_water_class()


     @classmethod
@@ -438,7 +442,7 @@ def _from_toml(cls, filename, *args, **override_sampler_kwargs):
             print(e)
             raise

-        config = config['sampler']['setup'] # this could be list to support multiple systems
+        config = config['sampler']['setup'] # list of multiple setups
         if config is None:
             raise ValueError("target is not specified in the configuration file")

@@ -449,6 +453,8 @@ def _from_toml(cls, filename, *args, **override_sampler_kwargs):
         samplers = []
         _logger.debug(f'Found {len(config)} systems in the configuration file')
         for _config in config:
+            # Create SetupSampler instance with default settings
+            # Note that this automatically adds the default water model (tip3p) to `self.forcefield_files`.
             sampler = cls()

             # Target information
@@ -489,7 +495,7 @@ def _from_toml(cls, filename, *args, **override_sampler_kwargs):
                         setattr(sampler, key, value)
                     else:
                         raise ValueError(f"Invalid keyword argument: {key}")
-
+
             # Pass temporary espaloma model to the sampler if kwargs are given
             for key, value in override_sampler_kwargs.items():
                 if hasattr(sampler, key):
@@ -503,7 +509,18 @@ def _from_toml(cls, filename, *args, **override_sampler_kwargs):
                     sampler.output_directory_path = os.path.join(sampler.output_directory_path, sampler.target_name, f'{args[0]}')
                 else:
                     raise ValueError(f"Invalid argument: {args}. Expected a single integer value for the epoch number.")
-
+
+            # Empty and recreate `forcefield_files` to avoid appending multiple water model forcefields.
+            # Use list of forcefield files if given in the `override_sampler_kwargs`. If not given, use the default forcefield_files.
+            if 'forcefield_files' in override_sampler_kwargs.keys():
+                pass
+            else:
+                forcefield_files = ['amber/ff14SB.xml', 'amber/phosaa14SB.xml']
+                sampler.forcefield_files = forcefield_files
+                sampler._update_forcefield_files()
+                # Update water class (3-site: tip3p, 4-site model: tip4pew)
+                sampler._get_water_class()
+
             # Create system
             sampler.create_system(biopolymer_file=biopolymer_file, ligand_file=ligand_file)
             samplers.append(sampler)
@@ -512,7 +529,7 @@ def _from_toml(cls, filename, *args, **override_sampler_kwargs):

         return samplers

-    def _update_forcefield_files(self, forcefield_files):
+    def _update_forcefield_files(self):
         """Get forcefield files.

         Update `forcefield_files` depending on the type of water model.
@@ -526,28 +543,23 @@ def _update_forcefield_files(self, forcefield_files):
         # For some reason, the original forcefield_files keeps appending when SetupSampler is called.
         # TODO: Is this the right way to handle this issue?
        import copy
-        _forcefield_files = copy.deepcopy(forcefield_files)
+        _forcefield_files = copy.deepcopy(self.forcefield_files)

         # 3-site water models
         if self.water_model == 'tip3p':
             _forcefield_files.append(['amber/tip3p_standard.xml', 'amber/tip3p_HFE_multivalent.xml'])
         elif self.water_model == 'tip3pfb':
-            self.water_model = 'tip3p'
             _forcefield_files.append(['amber/tip3pfb_standard.xml', 'amber/tip3pfb_HFE_multivalent.xml'])
         elif self.water_model == 'spce':
-            self.water_model = 'tip3p'
             _forcefield_files.append(['amber/spce_standard.xml', 'amber/spce_HFE_multivalent.xml'])
         elif self.water_model == 'opc3':
             raise NotImplementedError('see https://github.com/choderalab/rna-espaloma/blob/main/experiment/nucleoside/script/create_system_espaloma.py#L366')
         # 4-site water models
         elif self.water_model == 'tip4pew':
-            self.water_model = 'tip4pew'
             _forcefield_files.append(['amber/tip4pew_standard.xml', 'amber/tip4pew_HFE_multivalent.xml'])
         elif self.water_model == 'tip4pfb':
-            self.water_model = 'tip4pew'
             _forcefield_files.append(['amber/tip4pfb_standard.xml', 'amber/tip4pfb_HFE_multivalent.xml'])
         elif self.water_model == 'opc':
-            self.water_model = 'tip4pew'
             _forcefield_files.append(['amber/opc_standard.xml'])
         else:
             raise NotImplementedError(f'Water model {self.water_model} is not supported.')
@@ -559,7 +571,24 @@ def _update_forcefield_files(self, forcefield_files):
             else:
                 new_forcefield_files.append(f)

-        return new_forcefield_files
+        #return new_forcefield_files
+        self.forcefield_files = new_forcefield_files
+
+
+    def _get_water_class(self):
+        """Get water class (3-site or 4-site) based on the water model.
+
+        Set self.water_class to tip3p or tip4pew based on the water model.
+
+        3-site water models: tip3p, tip3pfb, spce
+        4-site water models: tip4pew, tip4pfb, opc
+        """
+        if self.water_model in ['tip3p', 'tip3pfb', 'spce']:
+            self.water_class = 'tip3p'
+        elif self.water_model in ['tip4pew', 'tip4pfb', 'opc']:
+            self.water_class = 'tip4pew'
+        else:
+            raise NotImplementedError(f'Water model {self.water_model} is not supported.')


     def _load_biopolymer_file(self):
@@ -642,10 +671,10 @@ def create_system(self, biopolymer_file=None, ligand_file=None):
         from openmm import MonteCarloBarostat
         from openmm import LangevinMiddleIntegrator

+        # Load biopolymer and ligand files
         self._biopolymer_file = biopolymer_file
         self._ligand_file = ligand_file

-        # Load biopolymer and ligand files
         if self._biopolymer_file is None and self._ligand_file is None:
             raise ValueError("At least one biopolymer (.pdb) or ligand (.sdf) file must be provided")
         if self._biopolymer_file is not None:
@@ -687,7 +716,7 @@ def create_system(self, biopolymer_file=None, ligand_file=None):
         # Solvate system
         _logger.debug("Solvating system...")
         modeller = app.Modeller(self._complex_topology, self._complex_positions)
-        modeller.addSolvent(self._system_generator.forcefield, model=self.water_model, padding=self.solvent_padding, ionicStrength=self.ionic_strength)
+        modeller.addSolvent(self._system_generator.forcefield, model=self.water_class, padding=self.solvent_padding, ionicStrength=self.ionic_strength)

         # Create system
         self.modeller_solvated_topology = modeller.getTopology()
@@ -715,7 +744,9 @@ def create_system(self, biopolymer_file=None, ligand_file=None):

         # Re-create system generator
         del self._system_generator
-        self.forcefield_files = self._update_forcefield_files(forcefield_files=[]) # Get water and ion forcefield files
+        #self.forcefield_files = self._update_forcefield_files(forcefield_files=[]) # Get water and ion forcefield files
+        self.forcefield_files = [] # Initialize forcefield_files to empty list and get water and ion forcefield files
+        self._update_forcefield_files()
         self._system_generator = SystemGenerator(
             forcefields=self.forcefield_files, forcefield_kwargs=forcefield_kwargs, periodic_forcefield_kwargs = periodic_forcefield_kwargs,
             barostat=barostat, small_molecule_forcefield=self.small_molecule_forcefield, cache=None, template_generator_kwargs=template_generator_kwargs)

From 5d8fe9812c46cf537d58bfb4e016ea444c165fee Mon Sep 17 00:00:00 2001
From: kt
Date: Fri, 22 Mar 2024 20:00:13 -0400
Subject: [PATCH 05/11] add working draft of train_sampler for backup purpose

---
 espfit/app/train.py | 87 +++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 84 insertions(+), 3 deletions(-)

diff --git a/espfit/app/train.py b/espfit/app/train.py
index 047e96a..e397a37 100644
--- a/espfit/app/train.py
+++ b/espfit/app/train.py
@@ -405,7 +405,7 @@ def train(self):
             self._save_checkpoint(epoch)


-    def train_sampler(self, sampler_patience=800, neff_threshold=0.2, sampler_weight=1.0, debug=False):
+    def _train_sampler_old(self, sampler_patience=800, neff_threshold=0.2, sampler_weight=1.0, debug=False):
         """
         Train the Espaloma network model with sampler.

@@ -456,7 +456,6 @@ def train_sampler(self, sampler_patience=800, neff_threshold=0.2, sampler_weight
         for i in range(self.restart_epoch, self.epochs):
             epoch = i + 1 # Start from 1 (not zero-indexing)

-            """
             # torch.cuda.OutOfMemoryError: CUDA out of memory.
             # Tried to allocate 80.00 MiB (GPU 0; 10.75 GiB total capacity;
             # 9.76 GiB already allocated; 7.62 MiB free; 10.40 GiB reserved in total by PyTorch)
@@ -503,7 +502,89 @@ def train_sampler(self, sampler_patience=800, neff_threshold=0.2, sampler_weight
             # Back propagate
             loss.backward()
             optimizer.step()
-            """
+
             if epoch % self.checkpoint_frequency == 0:
                 # Note: returned loss is a joint loss of different units.
                 #_loss = HARTREE_TO_KCALPERMOL * loss.pow(0.5).item()
                 _logger.info(f'Epoch {epoch}: loss={loss.item():.3f}')
                 self._save_checkpoint(epoch)
+
+
+    def train_sampler(self, sampler_patience=800, neff_threshold=0.2, sampler_weight=1.0, debug=False):
+        """
+        Train the Espaloma network model with sampler.
+
+        Parameters
+        ----------
+        sampler_patience : int, default=800
+            The number of epochs to wait before using sampler.
+
+        neff_threshold : float, default=0.2
+            The minimum effective sample size threshold.
+
+        sampler_weight : float, default=1.0
+            The weight for the sampler loss.
+
+        debug : bool, default=False
+            If True, use espaloma-0.3.pt for debugging.
+
+        Returns
+        -------
+        None
+        """
+        from espfit.utils.sampler.reweight import SetupSamplerReweight
+
+        # Note: RuntimeError will be raised if copy.deepcopy is used.
+        # RuntimeError: one of the variables needed for gradient computation has been modified by an inplace
+        # operation: [torch.cuda.FloatTensor [512, 1]], which is output 0 of AsStridedBackward0, is at version 2;
+        # expected version 1 instead. Hint: the backtrace further above shows the operation that failed to
+        # compute its gradient. The variable in question was changed in there or anywhere later. Good luck!
+        import copy
+        net_copy = copy.deepcopy(self.net)
+
+        self.sampler_patience = sampler_patience
+        self.neff_threshold = neff_threshold
+
+        if self.dataset_train is None:
+            raise ValueError('Training dataset is not provided.')
+
+        # Load checkpoint
+        self.restart_epoch = self._load_checkpoint()
+
+        # Initialize
+        SamplerReweight = SetupSamplerReweight()
+
+        # Train
+        ds_tr_loader = self.dataset_train.view(collate_fn='graph', batch_size=self.batch_size, shuffle=True)
+        optimizer = torch.optim.Adam(self.net.parameters(), lr=self.learning_rate)
+        with torch.autograd.set_detect_anomaly(True):
+            for i in range(self.restart_epoch, self.epochs):
+                epoch = i + 1 # Start from 1 (not zero-indexing)
+
+                #
+                # Accumulate gradients to avoid CUDA out of memory error.
+                # ---
+                # torch.cuda.OutOfMemoryError: CUDA out of memory.
+                # Tried to allocate 80.00 MiB (GPU 0; 10.75 GiB total capacity;
+                # 9.76 GiB already allocated; 7.62 MiB free; 10.40 GiB reserved in total by PyTorch)
+                # If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.
+                # See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
+
+                #
+                if epoch > self.sampler_patience:
+                    # Save checkpoint as local model (net.pt)
+                    # `neff_min` is -1 if SamplerReweight.samplers is None
+                    samplers = self._setup_local_samplers(epoch, net_copy, debug)
+                    neff_min = SamplerReweight.get_effective_sample_size(temporary_samplers=samplers)
+
+                    # If effective sample size is below threshold, update SamplerReweight.samplers and re-run simulation
+                    if neff_min < self.neff_threshold:
+                        _logger.info(f'Minimum effective sample size ({neff_min:.3f}) below threshold ({self.neff_threshold})')
+                        SamplerReweight.samplers = samplers
+                        SamplerReweight.run()
+                    del samplers
+
                 # Gradient accumulation
                 accumulation_steps = len(ds_tr_loader)

From 83cbf0d5efe184c76cf5b1c75254e841676b3f49 Mon Sep 17 00:00:00 2001
From: kt
Date: Fri, 22 Mar 2024 20:47:21 -0400
Subject: [PATCH 06/11] update train_sampler

---
 espfit/app/train.py | 42 +++++++++++-------------------------------
 1 file changed, 11 insertions(+), 31 deletions(-)

diff --git a/espfit/app/train.py b/espfit/app/train.py
index e397a37..193d7aa 100644
--- a/espfit/app/train.py
+++ b/espfit/app/train.py
@@ -571,21 +571,18 @@ def train_sampler(self, sampler_patience=800, neff_threshold=0.2, sampler_weight
                 # If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.
                 # See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

-                #
+                # Check effective sample size and re-run simulation if necessary
                 if epoch > self.sampler_patience:
                     # Save checkpoint as local model (net.pt)
                     # `neff_min` is -1 if SamplerReweight.samplers is None
                     samplers = self._setup_local_samplers(epoch, net_copy, debug)
                     neff_min = SamplerReweight.get_effective_sample_size(temporary_samplers=samplers)
-
-                    # If effective sample size is below threshold, update SamplerReweight.samplers and re-run simulation
                     if neff_min < self.neff_threshold:
                         _logger.info(f'Minimum effective sample size ({neff_min:.3f}) below threshold ({self.neff_threshold})')
                         SamplerReweight.samplers = samplers
                         SamplerReweight.run()
                     del samplers
-
                 # Gradient accumulation
                 accumulation_steps = len(ds_tr_loader)
                 for g in ds_tr_loader:
@@ -593,39 +590,22 @@ def train_sampler(self, sampler_patience=800, neff_threshold=0.2, sampler_weight
                     if torch.cuda.is_available():
                         g = g.to("cuda:0")
                     g.nodes["n1"].data["xyz"].requires_grad = True
-
+
                     # Compute loss
                     loss, loss_dict = self.net(g)
                     loss = loss/accumulation_steps
+                    # MD sampler loss
+                    if epoch > self.sampler_patience:
+                        loss_list = SamplerReweight.compute_loss() # list of torch.tensor
+                        for sampler_index, sampler_loss in enumerate(loss_list):
+                            sampler = SamplerReweight.samplers[sampler_index]
+                            loss += (sampler_loss/accumulation_steps) * sampler_weight
+                            loss_dict[f'{sampler.target_name}'] = sampler_loss.item()
+                        loss_dict['neff'] = neff_min
                     loss.backward(retain_graph=True)
-
-                    if epoch > self.sampler_patience:
-                        # Save checkpoint as local model (net.pt)
-                        # `neff_min` is -1 if SamplerReweight.samplers is None
-                        samplers = self._setup_local_samplers(epoch, net_copy, debug)
-                        neff_min = SamplerReweight.get_effective_sample_size(temporary_samplers=samplers)
-
-                        # If effective sample size is below threshold, update SamplerReweight.samplers and re-run simulation
-                        if neff_min < self.neff_threshold:
-                            _logger.info(f'Minimum effective sample size ({neff_min:.3f}) below threshold ({self.neff_threshold})')
-                            SamplerReweight.samplers = samplers
-                            SamplerReweight.run()
-                        del samplers
-
-                        # Compute sampler loss
-                        loss_list = SamplerReweight.compute_loss() # list of torch.tensor
-                        for sampler_index, sampler_loss in enumerate(loss_list):
-                            sampler = SamplerReweight.samplers[sampler_index]
-                            loss += sampler_loss * sampler_weight
-                            loss_dict[f'{sampler.target_name}'] = sampler_loss.item()
-                        loss.backward()
-                        loss_dict['neff'] = neff_min
-
                     loss_dict['loss'] = loss.item()
                     self.report_loss(epoch, loss_dict)
-
-                    # Update
                     optimizer.step()
-
+
                 if epoch % self.checkpoint_frequency == 0:
                     # Note: returned loss is a joint loss of different units.
                    #_loss = HARTREE_TO_KCALPERMOL * loss.pow(0.5).item()
                     _logger.info(f'Epoch {epoch}: loss={loss.item():.3f}')
                     self._save_checkpoint(epoch)

From 9c8e8afd85c3316fca68266cdd8cd9a2bc13298f Mon Sep 17 00:00:00 2001
From: kt
Date: Fri, 22 Mar 2024 21:05:02 -0400
Subject: [PATCH 07/11] change test_train_sampler to debug mode

---
 espfit/tests/test_app_train_sampler.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/espfit/tests/test_app_train_sampler.py b/espfit/tests/test_app_train_sampler.py
index 1f71247..2de9625 100644
--- a/espfit/tests/test_app_train_sampler.py
+++ b/espfit/tests/test_app_train_sampler.py
@@ -68,7 +68,9 @@ def test_train_sampler(test_load_dataset, test_create_espaloma_from_toml):
     # Train
     sampler_patience = 10
     # Force sampler to run after reaching sampler patience by setting neff_threshold to 1.0
-    model.train_sampler(sampler_patience=sampler_patience, neff_threshold=1.0, sampler_weight=1)
+    # Set debug=True to use espaloma-0.3.2.pt instead of on-the-fly intermediate espaloma model.
+    # Epochs are too short for a stable espaloma model.
+    model.train_sampler(sampler_patience=sampler_patience, neff_threshold=1.0, sampler_weight=1, debug=True)

     # Check outputs
     n_ckpt = len(glob.glob(os.path.join(model.output_directory_path, 'ckpt*pt')))

From 63e17dcf2d6b1519a770c8c1c35cb7648629d866 Mon Sep 17 00:00:00 2001
From: kt
Date: Wed, 3 Apr 2024 16:44:32 -0400
Subject: [PATCH 08/11] dump backup

---
 espfit/app/analysis.py           | 24 +++++++++--
 espfit/app/sampler.py            |  7 ++++
 espfit/app/train.py              | 56 +++++++++++++++-----------
 espfit/utils/sampler/reweight.py | 68 ++++++++++++++++++++------------
 4 files changed, 103 insertions(+), 52 deletions(-)

diff --git a/espfit/app/analysis.py b/espfit/app/analysis.py
index 5235935..e512424 100644
--- a/espfit/app/analysis.py
+++ b/espfit/app/analysis.py
@@ -58,7 +58,7 @@ def output_directory_path(self, value):
         os.makedirs(value, exist_ok=True)


-    def load_traj(self, reference_pdb='solvated.pdb', trajectory_netcdf='traj.nc', stride=1, input_directory_path=None):
+    def load_traj(self, reference_pdb='solvated.pdb', trajectory_netcdf='traj.nc', stride=1, exclude_n_frames=0, input_directory_path=None):
         """Load MD trajectory.

         Parameters
@@ -71,6 +71,11 @@ def load_traj(self, reference_pdb='solvated.pdb', trajectory_netcdf='traj.nc', s

         stride : int, optional
             Stride to load the trajectory. Default is 1.
+
+        exclude_n_frames : float or int, optional
+            Exclude the first n frames from the trajectory. Default is 0.
+            If float, the first n percentage of frames will be excluded.
+            If int, the first n frames will be excluded.

         input_directory_path : str, optional
             Input directory path. Default is None.
@@ -114,6 +119,17 @@ def load_traj(self, reference_pdb='solvated.pdb', trajectory_netcdf='traj.nc', s
         else:
             self.traj = traj

+        # Exclude the first n frames from the trajectory
+        if isinstance(exclude_n_frames, int):
+            if 0 < exclude_n_frames <= traj.n_frames:
+                self.traj = self.traj[exclude_n_frames:]
+        elif isinstance(exclude_n_frames, float):
+            if 0 < exclude_n_frames < 1:
+                exclude_n_frames = int(exclude_n_frames * traj.n_frames)
+                self.traj = self.traj[exclude_n_frames:]
+        else:
+            raise ValueError(f"Invalid exclude_n_frames: {exclude_n_frames}. Expected int or float.")
+

 class RNASystem(BaseDataLoader):
     """RNA system class to compute experimental observables from MD simulations.
@@ -235,9 +251,9 @@ def compute_jcouplings(self, weights=None, couplings=None, residues=None):
                 std_raw = replace_nan_with_none(std_raw)
                 if weights is not None:
                     arr = _values[:,j] * weights
-                    #_logger.info(f'non-weighted: {_values[:,j]}')
-                    #_logger.info(f'weights: {weights}')
-                    #_logger.info(f'weighted: {arr}')
+                    _logger.info(f'non-weighted: {_values[:,j]}')
+                    _logger.info(f'weights: {weights}')
+                    _logger.info(f'weighted: {arr}')
                     avg = np.round(arr.mean(), 5)
                     std = np.round(arr.std(), 5)
                     avg = replace_nan_with_none(avg)
diff --git a/espfit/app/sampler.py b/espfit/app/sampler.py
index 863e6c7..387b684 100644
--- a/espfit/app/sampler.py
+++ b/espfit/app/sampler.py
@@ -757,6 +757,13 @@ def create_system(self, biopolymer_file=None, ligand_file=None):
             self.new_solvated_system = self.modeller_solvated_system
             self.new_solvated_topology = self.modeller_solvated_topology

+        # Set force groups
+        # https://github.com/openmm/openmm-cookbook/blob/main/notebooks/cookbook/Analyzing%20Energy%20Contributions.ipynb
+        # OpenMM does not have a way to directly query the energy of a Force object.
+        # Instead, it lets you query the energy of a force group. We therefore need each Force object to be in a different group.
+        for i, f in enumerate(self.new_solvated_system.getForces()):
+            f.setForceGroup(i)
+
         # Save solvated pdb file
         outfile = os.path.join(self.output_directory_path, f"solvated.pdb")
         with open(f"{outfile}", "w") as wf:
diff --git a/espfit/app/train.py b/espfit/app/train.py
index 193d7aa..0338b20 100644
--- a/espfit/app/train.py
+++ b/espfit/app/train.py
@@ -532,6 +532,17 @@ def train_sampler(self, sampler_patience=800, neff_threshold=0.2, sampler_weight
         Returns
         -------
         None
+
+        Notes
+        -----
+        Back-propagation is performed against accumulated gradients to avoid CUDA out of memory error.
+        Following error is raised if back-propagation is performed without gradient accumulation:
+
+        > torch.cuda.OutOfMemoryError: CUDA out of memory.
+        > Tried to allocate 80.00 MiB (GPU 0; 10.75 GiB total capacity;
+        > 9.76 GiB already allocated; 7.62 MiB free; 10.40 GiB reserved in total by PyTorch)
+        > If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.
+        > See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
         """
         from espfit.utils.sampler.reweight import SetupSamplerReweight

@@ -557,34 +568,38 @@ def train_sampler(self, sampler_patience=800, neff_threshold=0.2, sampler_weight

         # Train
         ds_tr_loader = self.dataset_train.view(collate_fn='graph', batch_size=self.batch_size, shuffle=True)
+        accumulation_steps = len(ds_tr_loader)
         optimizer = torch.optim.Adam(self.net.parameters(), lr=self.learning_rate)
         with torch.autograd.set_detect_anomaly(True):
             for i in range(self.restart_epoch, self.epochs):
                 epoch = i + 1 # Start from 1 (not zero-indexing)
-
-                #
-                # Accumulate gradients to avoid CUDA out of memory error.
-                # ---
-                # torch.cuda.OutOfMemoryError: CUDA out of memory.
-                # Tried to allocate 80.00 MiB (GPU 0; 10.75 GiB total capacity;
-                # 9.76 GiB already allocated; 7.62 MiB free; 10.40 GiB reserved in total by PyTorch)
-                # If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.
-                # See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

-                # Check effective sample size and re-run simulation if necessary
+                # Compute sampler loss if epoch > sampler_patience
                 if epoch > self.sampler_patience:
                     # Save checkpoint as local model (net.pt) and create list of sampler instances
                     samplers = self._setup_local_samplers(epoch, net_copy, debug)
+                    # Check effective sample size (neff). neff is computed for each sampler (sampler).
+                    # Re-run the simulations based on the minimum neff value. `neff_min` is -1 if SamplerReweight.samplers is None,
+                    # which is the default value when SamplerReweight is initialized.
                     neff_min = SamplerReweight.get_effective_sample_size(temporary_samplers=samplers)
+                    # Re-run simulation if neff is below threshold
                     if neff_min < self.neff_threshold:
                         _logger.info(f'Minimum effective sample size ({neff_min:.3f}) below threshold ({self.neff_threshold})')
                         SamplerReweight.samplers = samplers
                         SamplerReweight.run()
                     del samplers
+                    # Compute sampler loss
+                    _loss = torch.tensor(0.0)
+                    _loss_dict = dict()
+                    if SamplerReweight.samplers:
+                        sampler_loss_list = SamplerReweight.compute_loss() # list of torch.tensor
+                        for sampler_index, sampler_loss in enumerate(sampler_loss_list):
+                            sampler = SamplerReweight.samplers[sampler_index]
+                            _loss += (sampler_loss/accumulation_steps) * sampler_weight
+                            _loss_dict[f'{sampler.target_name}'] = sampler_loss.item()
+                        _loss_dict['neff'] = neff_min

-                # Gradient accumulation
-                accumulation_steps = len(ds_tr_loader)
+                # Accumulate gradients
                 for g in ds_tr_loader:
                     optimizer.zero_grad()
                     if torch.cuda.is_available():
                         g = g.to("cuda:0")
                     g.nodes["n1"].data["xyz"].requires_grad = True

                     # Compute loss
                     loss, loss_dict = self.net(g)
                     loss = loss/accumulation_steps
-                    # MD sampler loss
+                    # Add sampler loss
                     if epoch > self.sampler_patience:
-                        loss_list = SamplerReweight.compute_loss() # list of torch.tensor
-                        for sampler_index, sampler_loss in enumerate(loss_list):
-                            sampler = SamplerReweight.samplers[sampler_index]
-                            loss += (sampler_loss/accumulation_steps) * sampler_weight
-                            loss_dict[f'{sampler.target_name}'] = sampler_loss.item()
-                        loss_dict['neff'] = neff_min
+                        loss += _loss
                     loss.backward(retain_graph=True)
                     loss_dict['loss'] = loss.item()
+                    if _loss_dict:
+                        loss_dict.update(_loss_dict)
                     self.report_loss(epoch, loss_dict)
                     optimizer.step()
diff --git a/espfit/utils/sampler/reweight.py b/espfit/utils/sampler/reweight.py
index cfb14bc..e91f1ec 100644
--- a/espfit/utils/sampler/reweight.py
+++ b/espfit/utils/sampler/reweight.py
@@ -31,7 +31,9 @@ class SetupSamplerReweight(object):
     """
     def __init__(self):
         self.samplers = None
-        self.weights = dict() # {'target_name': {'weights': w_i, 'neff': neff}}
+        self.weights_neff_dict = dict() # {'target_name': {'weights': w_i, 'neff': neff}}
+        self.force_group_names = ['HarmonicBondForce', 'HarmonicAngleForce', 'PeriodicTorsionForce']
+        self.exclude_n_frames = 0.1


     def run(self):
@@ -60,10 +62,10 @@ def get_effective_sample_size(self, temporary_samplers):
         float
             The minimum effective sample size among all samplers.
""" - import mdtraj import numpy as np from openmm.unit import kilocalories_per_mole as kcalpermol from espfit.utils.units import KB_T_KCALPERMOL + from espfit.app.analysis import BaseDataLoader if self.samplers is None: return -1 @@ -76,41 +78,55 @@ def get_effective_sample_size(self, temporary_samplers): temp1 = temporary_sampler.temperature._value assert temp0 == temp1, f'Temperature should be equivalent but got sampler {temp0} K and temporary sampler {temp1} K' beta = 1 / (KB_T_KCALPERMOL * temp0) - _logger.debug(f'beta temperature in kcal/mol: {beta}') + _logger.info(f'beta temperature in kcal/mol: {beta}') # Get position from trajectory - traj = mdtraj.load(sampler.output_directory_path + '/traj.nc', top=sampler.output_directory_path + '/solvated.pdb') - _logger.info(f'Found {traj.n_frames} frames in trajectory') + #traj = mdtraj.load(sampler.output_directory_path + '/traj.nc', top=sampler.output_directory_path + '/solvated.pdb') + baseloader = BaseDataLoader() + baseloader.load_traj(input_directory_path=sampler.output_directory_path, exclude_n_frames=self.exclude_n_frames) + _logger.info(f'Found {baseloader.traj.n_frames} frames in trajectory') # Compute weights and effective sample size - log_w = [] - for i in range(traj.n_frames): + w_arr = [] + for i in range(baseloader.traj.n_frames): # U(x0, theta0) - sampler.simulation.context.setPositions(traj.openmm_positions(i)) - potential_energy = sampler.simulation.context.getState(getEnergy=True).getPotentialEnergy() + sampler.simulation.context.setPositions(baseloader.traj.openmm_positions(i)) + for gid, force in enumerate(sampler.simulation.system.getForces()): + if force.getName() in self.force_group_names: + print(f'{force.getName()}') + try: + potential_energy += sampler.simulation.context.getState(getEnergy=True, groups={gid}).getPotentialEnergy() + except: + potential_energy = sampler.simulation.context.getState(getEnergy=True, groups={gid}).getPotentialEnergy() # U(x0, theta1) - temporary_sampler.simulation.context.setPositions(traj.openmm_positions(i)) - reduced_potential_energy = temporary_sampler.simulation.context.getState(getEnergy=True).getPotentialEnergy() + temporary_sampler.simulation.context.setPositions(baseloader.traj.openmm_positions(i)) + for gid, force in enumerate(temporary_sampler.simulation.system.getForces()): + if force.getName() in self.force_group_names: + print(f'{force.getName()}') + try: + reduced_potential_energy += sampler.simulation.context.getState(getEnergy=True, groups={gid}).getPotentialEnergy() + except: + reduced_potential_energy = sampler.simulation.context.getState(getEnergy=True, groups={gid}).getPotentialEnergy() # deltaU = U(x0, theta1) - U(x0, theta0) delta = (reduced_potential_energy - potential_energy).value_in_unit(kcalpermol) - # log_w = ln(exp(-beta * delta)) + # w = ln(exp(-beta * delta)) w = -1 * beta * delta - log_w.append(w) + w_arr.append(w) - #_logger.debug(f'U(x0, theta0): {potential_energy.value_in_unit(kcalpermol):10.3f} kcal/mol') - #_logger.debug(f'U(x0, theta1): {reduced_potential_energy.value_in_unit(kcalpermol):10.3f} kcal/mol') - #_logger.debug(f'deltaU: {delta:10.3f} kcal/mol') - #_logger.debug(f'log_w: {w:10.3f}') + _logger.info(f'U(x0, theta0): {potential_energy.value_in_unit(kcalpermol):10.3f} kcal/mol') + _logger.info(f'U(x0, theta1): {reduced_potential_energy.value_in_unit(kcalpermol):10.3f} kcal/mol') + _logger.info(f'deltaU: {delta:10.3f} kcal/mol') + _logger.info(f'ln_w: {w:10.3f}') # Compute weights and effective sample size (ratio: 0 to 1) - w_i = 
np.exp(log_w) / np.sum(np.exp(log_w)) + w_i = np.exp(w_arr) / np.sum(np.exp(w_arr)) neff = np.sum(w_i) ** 2 / np.sum(w_i ** 2) / len(w_i) - #_logger.debug(f'w_i_sum: {np.sum(w_i):10.3f}') - #_logger.debug(f'neff: {neff:10.3f}') + _logger.info(f'w_i_sum: {np.sum(w_i):10.3f}') + _logger.info(f'neff: {neff:10.3f}') - self.weights[f'{sampler.target_name}'] = {'neff': neff, 'weights': w_i} - #_logger.info(f'{self.weights}') - neffs = [self.weights[key]['neff'] for key in self.weights.keys()] + self.weights_neff_dict[f'{sampler.target_name}'] = {'neff': neff, 'weights': w_i} + _logger.info(f'{self.weights_neff_dict}') + neffs = [self.weights_neff_dict[key]['neff'] for key in self.weights_neff_dict.keys()] return min(neffs) @@ -228,11 +244,11 @@ def _compute_weighted_observable(self, atomSubset, target_name, output_directory # Load trajectory target = RNASystem(atomSubset=atomSubset) - target.load_traj(input_directory_path=output_directory_path) + target.load_traj(input_directory_path=output_directory_path, exclude_n_frames=self.exclude_n_frames) # Compute observable - if self.weights.keys(): - pred = target.compute_jcouplings(weights=self.weights[target_name]['weights']) + if self.weights_neff_dict.keys(): + pred = target.compute_jcouplings(weights=self.weights_neff_dict[target_name]['weights']) else: pred = target.compute_jcouplings(weights=None) _logger.debug(f'Computed observable: {pred}') From 73dae5840a0dc202ef4f6b45a11029444ba4c45f Mon Sep 17 00:00:00 2001 From: kt Date: Sun, 7 Apr 2024 16:16:07 -0400 Subject: [PATCH 09/11] compute sugar couplings as default instead of all possible couplings --- espfit/app/analysis.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/espfit/app/analysis.py b/espfit/app/analysis.py index e512424..296cf4d 100644 --- a/espfit/app/analysis.py +++ b/espfit/app/analysis.py @@ -84,6 +84,10 @@ def load_traj(self, reference_pdb='solvated.pdb', trajectory_netcdf='traj.nc', s Returns ------- None + + Notes + ----- + * Should `stride` and `exclude_n_frames` be set as class attributes? """ import mdtraj @@ -191,7 +195,7 @@ def radian_to_degree(self, a): return a - def compute_jcouplings(self, weights=None, couplings=None, residues=None): + def compute_jcouplings(self, weights=None, couplings=['H1H2', 'H2H3', 'H3H4'], residues=None): """Compute J-couplings from MD trajectory. Parameters @@ -200,7 +204,7 @@ def compute_jcouplings(self, weights=None, couplings=None, residues=None): Weights to compute the J-couplings. Default is None. couplings : str, optional - Name of the couplings to compute. Default is None. + Name of the couplings to compute. Default is ['H1H2', 'H2H3', 'H3H4']. If a list of couplings to be chosen from [H1H2,H2H3,H3H4,1H5P,2H5P,C4Pb,1H5H4,2H5H4,H3P,C4Pe,H1C2/4,H1C6/8] is provided, only the selected couplings will be computed. Otherwise, all couplings will be computed. 
@@ -224,7 +228,7 @@ def compute_jcouplings(self, weights=None, couplings=None, residues=None):
         """
         import barnaba as bb

-        _logger.info("Compute J-couplings from MD trajectory")
+        _logger.debug("Compute J-couplings from MD trajectory")

         if couplings is not None:
             # Check if the provided coupling names are valid
@@ -244,23 +248,25 @@ def compute_jcouplings(self, weights=None, couplings=None, residues=None):
         for i, resname in enumerate(resname_list):
             _values = values[:,i,:] # Coupling values of i-th residue
             values_by_names = dict()
-            for j, coupling_name in enumerate(couplings):
+            for j, coupling_name in enumerate(couplings):
                 avg_raw = np.round(_values[:,j].mean(), 5) # e.g. mean value of H1H2 coupling of i-th residue
                 std_raw = np.round(_values[:,j].std(), 5) # e.g. standard deviation of H1H2 coupling of i-th residue
                 avg_raw = replace_nan_with_none(avg_raw)
                 std_raw = replace_nan_with_none(std_raw)
                 if weights is not None:
                     arr = _values[:,j] * weights
-                    _logger.info(f'non-weighted: {_values[:,j]}')
-                    _logger.info(f'weights: {weights}')
-                    _logger.info(f'weighted: {arr}')
+                    _logger.info(f'weights:\n{weights}')
+                    _logger.info(f'non-weighted observable:\n{_values[:,j]}')
+                    _logger.info(f'weighted observable:\n{arr}')
                     avg = np.round(arr.mean(), 5)
                     std = np.round(arr.std(), 5)
                     avg = replace_nan_with_none(avg)
                     std = replace_nan_with_none(std)
                 else:
+                    _logger.info(f'No weights are provided. Using predicted values without reweighting.')
                     avg = avg_raw
                     std = std_raw
+                _logger.info(f'J-scalar ({coupling_name}-{resname}): avg={avg}, std={std}')
                 values_by_names[coupling_name] = {'avg': avg, 'std': std, 'avg_raw': avg_raw, 'std_raw': std_raw}
             coupling_dict[resname] = values_by_names

From 1591436bc604d1448772db79ccc0f8483bdf412f Mon Sep 17 00:00:00 2001
From: kt
Date: Sun, 7 Apr 2024 16:18:08 -0400
Subject: [PATCH 10/11] remove old train_sampler and update logic in train_sampler

---
 espfit/app/train.py | 146 ++++++++------------------------------------
 1 file changed, 27 insertions(+), 119 deletions(-)

diff --git a/espfit/app/train.py b/espfit/app/train.py
index 0338b20..1d90360 100644
--- a/espfit/app/train.py
+++ b/espfit/app/train.py
@@ -384,6 +384,8 @@ def train(self):
         with torch.autograd.set_detect_anomaly(True):
             for i in range(self.restart_epoch, self.epochs):
                 epoch = i + 1 # Start from epoch 1 (not zero-indexing)
+                _logger.info(f'Epoch {epoch}')
+
                 for g in ds_tr_loader:
                     optimizer.zero_grad()
                     # TODO: Better way to handle this?
@@ -401,115 +403,9 @@ def train(self):

                 if epoch % self.checkpoint_frequency == 0:
                     # Note: returned loss is a joint loss of different units.
-                    loss = HARTREE_TO_KCALPERMOL * loss.pow(0.5).item()
-                    _logger.info(f'Epoch {epoch}: loss={loss:.3f}')
+                    loss = HARTREE_TO_KCALPERMOL * loss.pow(0.5).item()
+                    _logger.info(f'Total loss: {loss:.3f}')
                     self._save_checkpoint(epoch)

-
-    def _train_sampler_old(self, sampler_patience=800, neff_threshold=0.2, sampler_weight=1.0, debug=False):
-        """
-        Train the Espaloma network model with sampler.
-
-        Parameters
-        ----------
-        sampler_patience : int, default=800
-            The number of epochs to wait before using sampler.
-
-        neff_threshold : float, default=0.2
-            The minimum effective sample size threshold.
-
-        sampler_weight : float, default=1.0
-            The weight for the sampler loss.
-
-        debug : bool, default=False
-            If True, use espaloma-0.3.pt for debugging.
-
-        Returns
-        -------
-        None
-        """
-        from espfit.utils.sampler.reweight import SetupSamplerReweight
-
-        # Note: RuntimeError will be raised if copy.deepcopy is used.
-        # RuntimeError: one of the variables needed for gradient computation has been modified by an inplace
-        # operation: [torch.cuda.FloatTensor [512, 1]], which is output 0 of AsStridedBackward0, is at version 2;
-        # expected version 1 instead. Hint: the backtrace further above shows the operation that failed to
-        # compute its gradient. The variable in question was changed in there or anywhere later. Good luck!
-        import copy
-        net_copy = copy.deepcopy(self.net)
-
-        self.sampler_patience = sampler_patience
-        self.neff_threshold = neff_threshold
-
-        if self.dataset_train is None:
-            raise ValueError('Training dataset is not provided.')
-
-        # Load checkpoint
-        self.restart_epoch = self._load_checkpoint()
-
-        # Initialize
-        SamplerReweight = SetupSamplerReweight()
-
-        # Train
-        ds_tr_loader = self.dataset_train.view(collate_fn='graph', batch_size=self.batch_size, shuffle=True)
-        optimizer = torch.optim.Adam(self.net.parameters(), lr=self.learning_rate)
-        with torch.autograd.set_detect_anomaly(True):
-            for i in range(self.restart_epoch, self.epochs):
-                epoch = i + 1 # Start from 1 (not zero-indexing)
-
-                # torch.cuda.OutOfMemoryError: CUDA out of memory.
-                # Tried to allocate 80.00 MiB (GPU 0; 10.75 GiB total capacity;
-                # 9.76 GiB already allocated; 7.62 MiB free; 10.40 GiB reserved in total by PyTorch)
-                # If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.
-                # See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
-
-                loss = torch.tensor(0.0)
-                if torch.cuda.is_available():
-                    loss = loss.cuda("cuda:0")
-
-                for g in ds_tr_loader:
-                    optimizer.zero_grad()
-                    if torch.cuda.is_available():
-                        g = g.to("cuda:0")
-                    g.nodes["n1"].data["xyz"].requires_grad = True
-
-                    _loss, loss_dict = self.net(g)
-                    loss += _loss
-
-                if epoch > self.sampler_patience:
-                    # Save checkpoint as local model (net.pt)
-                    # `neff_min` is -1 if SamplerReweight.samplers is None
-                    samplers = self._setup_local_samplers(epoch, net_copy, debug)
-                    neff_min = SamplerReweight.get_effective_sample_size(temporary_samplers=samplers)
-
-                    # If effective sample size is below threshold, update SamplerReweight.samplers and re-run simulation
-                    if neff_min < self.neff_threshold:
-                        _logger.info(f'Minimum effective sample size ({neff_min:.3f}) below threshold ({self.neff_threshold})')
-                        SamplerReweight.samplers = samplers
-                        SamplerReweight.run()
-                    del samplers
-
-                    # Compute sampler loss
-                    loss_list = SamplerReweight.compute_loss() # list of torch.tensor
-                    for sampler_index, sampler_loss in enumerate(loss_list):
-                        sampler = SamplerReweight.samplers[sampler_index]
-                        loss += sampler_loss * sampler_weight
-                        loss_dict[f'{sampler.target_name}'] = sampler_loss.item()
-                    loss_dict['neff'] = neff_min
-
-                loss_dict['loss'] = loss.item()
-                self.report_loss(epoch, loss_dict)
-
-                # Back propagate
-                loss.backward()
-                optimizer.step()
-
-                if epoch % self.checkpoint_frequency == 0:
-                    # Note: returned loss is a joint loss of different units.
-                    #_loss = HARTREE_TO_KCALPERMOL * loss.pow(0.5).item()
-                    _logger.info(f'Epoch {epoch}: loss={loss.item():.3f}')
-                    self._save_checkpoint(epoch)
-

     def train_sampler(self, sampler_patience=800, neff_threshold=0.2, sampler_weight=1.0, debug=False):
         """
         Train the Espaloma network model with sampler.
@@ -573,21 +469,33 @@ def train_sampler(self, sampler_patience=800, neff_threshold=0.2, sampler_weight
         with torch.autograd.set_detect_anomaly(True):
             for i in range(self.restart_epoch, self.epochs):
                 epoch = i + 1 # Start from 1 (not zero-indexing)
+                _logger.info(f'# Epoch {epoch}')

                 # Compute sampler loss if epoch > sampler_patience
                 if epoch > self.sampler_patience:
                     # Save checkpoint as local model (net.pt) and create list of sampler instances
-                    samplers = self._setup_local_samplers(epoch, net_copy, debug)
+                    temporary_samplers = self._setup_temporary_samplers(epoch, net_copy, debug)
+
+                    # Check the effective sample size (neff), computed for each sampler stored in `SamplerReweight.samplers`.
+                    # Coordinates from `SamplerReweight.samplers` are used to compute the effective sample size and Boltzmann weights
+                    # using force field parameters (Espaloma models) stored in `SamplerReweight.samplers` and `temporary_samplers`,
+                    # which are the updated force field parameters from the current epoch.
+                    # Re-run the simulations based on the minimum neff value. `neff_min` is set to -1 if `SamplerReweight.samplers` is None,
+                    # which is the default state when `SetupSamplerReweight()` is initialized.
+                    # Note that `SamplerReweight.get_effective_sample_size` updates the `SamplerReweight.weights_neff_dict` attribute.
+                    neff_min = SamplerReweight.get_effective_sample_size(temporary_samplers=temporary_samplers)
+
+                    # Re-run simulation if `neff_min` is below threshold
                     if neff_min < self.neff_threshold:
-                        _logger.info(f'Minimum effective sample size ({neff_min:.3f}) below threshold ({self.neff_threshold})')
+                        _logger.info(f'Minimum effective sample size ({neff_min:.3f}) below threshold ({self.neff_threshold:.3f})')
+                        # Update SamplerReweight.samplers with the temporary_samplers
-                        SamplerReweight.samplers = samplers
+                        SamplerReweight.samplers = temporary_samplers
                         SamplerReweight.run()
-                    del samplers
+                        # Initialize `SamplerReweight.weights_neff_dict` to empty dictionary
+                        _logger.info(f'Reset sampler weights')
+                        SamplerReweight.weights_neff_dict = dict()
+                    del temporary_samplers
+
                     # Compute sampler loss
                     _loss = torch.tensor(0.0)
                     _loss_dict = dict()
@@ -621,7 +529,7 @@ def train_sampler(self, sampler_patience=800, neff_threshold=0.2, sampler_weight
                 if epoch % self.checkpoint_frequency == 0:
                     # Note: returned loss is a joint loss of different units.
                     #_loss = HARTREE_TO_KCALPERMOL * loss.pow(0.5).item()
-                    _logger.info(f'Epoch {epoch}: loss={loss.item():.3f}')
+                    _logger.info(f'Total loss: {loss.item():.3f}')
                     self._save_checkpoint(epoch)

@@ -701,8 +609,8 @@ def _save_local_model(self, epoch, net_copy):
         self.save_model(net=net_copy, checkpoint_file=local_model, output_model=f"net.pt", output_directory_path=self.output_directory_path)

-    def _setup_local_samplers(self, epoch, net_copy, debug):
-        """Setup local samplers.
+    def _setup_temporary_samplers(self, epoch, net_copy, debug):
+        """Setup samplers using temporary espaloma models generated during training.

         Parameters
         ----------

From 3455a6fe7bb13015762c47fff429e0b1ddaef197 Mon Sep 17 00:00:00 2001
From: kt
Date: Sun, 7 Apr 2024 16:20:40 -0400
Subject: [PATCH 11/11] use espfit.analysis.BaseDataLoader to load trajectories

---
 espfit/utils/sampler/reweight.py | 38 ++++++++++++++++++--------------
 1 file changed, 22 insertions(+), 16 deletions(-)

diff --git a/espfit/utils/sampler/reweight.py b/espfit/utils/sampler/reweight.py
index e91f1ec..ba47a73 100644
--- a/espfit/utils/sampler/reweight.py
+++ b/espfit/utils/sampler/reweight.py
@@ -32,7 +32,7 @@ class SetupSamplerReweight(object):
     def __init__(self):
         self.samplers = None
         self.weights_neff_dict = dict() # {'target_name': {'weights': w_i, 'neff': neff}}
-        self.force_group_names = ['HarmonicBondForce', 'HarmonicAngleForce', 'PeriodicTorsionForce']
+        self.force_group_names = ['HarmonicBondForce', 'HarmonicAngleForce', 'PeriodicTorsionForce', 'NonbondedForce']
         self.exclude_n_frames = 0.1


     def run(self):
@@ -44,7 +44,7 @@ def run(self):
             None
         """
         for sampler in self.samplers:
-            _logger.info(f'Running simulation for {sampler.target_name} for {sampler.nsteps} steps...')
+            _logger.info(f'Re-run simulation for {sampler.target_name}')
             sampler.minimize()
             sampler.run()

@@ -67,24 +67,26 @@ def get_effective_sample_size(self, temporary_samplers):
         from espfit.utils.units import KB_T_KCALPERMOL
         from espfit.app.analysis import BaseDataLoader

+        _logger.info(f'Compute effective sample size and sampling weights')
+
         if self.samplers is None:
+            _logger.info('No samplers found. Return effective sample size -1.')
             return -1

         for sampler, temporary_sampler in zip(self.samplers, temporary_samplers):
-            _logger.info(f'Compute effective sample size and sampling weights for {sampler.target_name}')
+            _logger.info(f'Analyzing {sampler.target_name}...')

             # Get temperature
             temp0 = sampler.temperature._value
             temp1 = temporary_sampler.temperature._value
             assert temp0 == temp1, f'Temperature should be equivalent but got sampler {temp0} K and temporary sampler {temp1} K'
             beta = 1 / (KB_T_KCALPERMOL * temp0)
-            _logger.info(f'beta temperature in kcal/mol: {beta}')
+            #_logger.debug(f'beta temperature in kcal/mol: {beta}')

             # Get position from trajectory
-            baseloader = BaseDataLoader()
+            baseloader = BaseDataLoader(atomSubset=sampler.atomSubset)
             baseloader.load_traj(input_directory_path=sampler.output_directory_path, exclude_n_frames=self.exclude_n_frames)
-            _logger.info(f'Found {baseloader.traj.n_frames} frames in trajectory')
+            _logger.info(f'Found {baseloader.traj.n_frames} frames from {sampler.output_directory_path}')

             # Compute weights and effective sample size
             w_arr = []
             for i in range(baseloader.traj.n_frames):
                 # U(x0, theta0)
                 sampler.simulation.context.setPositions(baseloader.traj.openmm_positions(i))
                 for gid, force in enumerate(sampler.simulation.system.getForces()):
                     if force.getName() in self.force_group_names:
-                        print(f'{force.getName()}')
                         try:
                             potential_energy += sampler.simulation.context.getState(getEnergy=True, groups={gid}).getPotentialEnergy()
                         except:
                             potential_energy = sampler.simulation.context.getState(getEnergy=True, groups={gid}).getPotentialEnergy()
                 # U(x0, theta1)
                 temporary_sampler.simulation.context.setPositions(baseloader.traj.openmm_positions(i))
                 for gid, force in enumerate(temporary_sampler.simulation.system.getForces()):
                     if force.getName() in self.force_group_names:
-                        print(f'{force.getName()}')
                         try:
-                            reduced_potential_energy += sampler.simulation.context.getState(getEnergy=True, groups={gid}).getPotentialEnergy()
+                            reduced_potential_energy += temporary_sampler.simulation.context.getState(getEnergy=True, groups={gid}).getPotentialEnergy()
                         except:
-                            reduced_potential_energy = sampler.simulation.context.getState(getEnergy=True, groups={gid}).getPotentialEnergy()
+                            reduced_potential_energy = temporary_sampler.simulation.context.getState(getEnergy=True, groups={gid}).getPotentialEnergy()
                 # deltaU = U(x0, theta1) - U(x0, theta0)
                 delta = (reduced_potential_energy - potential_energy).value_in_unit(kcalpermol)
                 # w = ln(exp(-beta * delta))
                 w = -1 * beta * delta
                 w_arr.append(w)

                 _logger.info(f'U(x0, theta0): {potential_energy.value_in_unit(kcalpermol):10.3f} kcal/mol')
                 _logger.info(f'U(x0, theta1): {reduced_potential_energy.value_in_unit(kcalpermol):10.3f} kcal/mol')
                 _logger.info(f'deltaU: {delta:10.3f} kcal/mol')
-                _logger.info(f'ln_w: {w:10.3f}')
+                _logger.info(f'w: {w:10.3f}')

             # Compute weights and effective sample size (ratio: 0 to 1)
+            # Prevent RuntimeWarning: overflow encountered in exp
+            w_arr = np.float128(w_arr)
             w_i = np.exp(w_arr) / np.sum(np.exp(w_arr))
             neff = np.sum(w_i) ** 2 / np.sum(w_i ** 2) / len(w_i)
-            _logger.info(f'w_i_sum: {np.sum(w_i):10.3f}')
-            _logger.info(f'neff: {neff:10.3f}')
+            _logger.debug(f'w_i_sum: {np.sum(w_i):10.3f}')
+            _logger.debug(f'neff: {neff:10.3f}')
+
+            # Check if sum of weights is 1
+            sum_w = abs(np.sum(w_i))
+            assert abs(1 - sum_w) <= 0.001, f"Weight sum {sum_w} deviates from 1 by more than tolerance 0.001"

             self.weights_neff_dict[f'{sampler.target_name}'] = {'neff': neff, 'weights': w_i}
-            _logger.info(f'{self.weights_neff_dict}')
+            _logger.debug(f'{self.weights_neff_dict}')
         neffs = [self.weights_neff_dict[key]['neff'] for key in self.weights_neff_dict.keys()]

         return min(neffs)
@@ -141,7 +147,7 @@ def compute_loss(self):
         """
         loss_list = []
         for sampler in self.samplers:
-            _logger.info(f'Compute loss for {sampler.target_name}')
+            _logger.debug(f'Compute loss for {sampler.target_name}')
             loss = self._compute_loss_per_system(sampler) # torch.tensor
             loss_list.append(loss)
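
Note: the sketch below is a standalone editorial illustration, not part of the patches. It shows the per-force-group energy decomposition that patch 08 wires into create_system and that patch 11 queries in get_effective_sample_size, assuming an existing openmm.app.Simulation object named `simulation`. Unlike create_system, which assigns the groups before the Simulation is built, here the groups are changed on a live Context, so reinitialize() is needed for the change to take effect.

    # Sketch: per-force-group potential energies in OpenMM (the cookbook trick
    # referenced in patch 08). All names here are illustrative assumptions.
    for i, force in enumerate(simulation.system.getForces()):
        force.setForceGroup(i)  # one group per Force object
    simulation.context.reinitialize(preserveState=True)  # apply group changes to the live Context
    for i, force in enumerate(simulation.system.getForces()):
        state = simulation.context.getState(getEnergy=True, groups={i})
        print(force.getName(), state.getPotentialEnergy())

Summing only the valence groups (plus NonbondedForce after patch 11) reproduces the U(x0, theta0) and U(x0, theta1) accumulation in get_effective_sample_size.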
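A minimal NumPy-only sketch of the reweighting arithmetic implemented in get_effective_sample_size (patches 08 and 11): Zwanzig-style log-weights w = -beta * deltaU, normalized importance weights, and the Kish effective sample size scaled to the 0-1 range. The function name and inputs are illustrative; the log-sum-exp shift is a standard alternative to the np.float128 cast that patch 11 uses against exp overflow.

    import numpy as np

    def effective_sample_size(delta_u_kcalpermol, temperature_kelvin=300.0):
        """Importance weights and normalized effective sample size.

        delta_u_kcalpermol : per-frame U(x0, theta1) - U(x0, theta0) in kcal/mol.
        Returns (w_i, neff), where w_i sums to 1 and neff lies in (0, 1].
        """
        kb = 0.0019872041  # Boltzmann constant in kcal/mol/K
        beta = 1.0 / (kb * temperature_kelvin)
        log_w = -beta * np.asarray(delta_u_kcalpermol, dtype=np.float64)
        log_w -= log_w.max()  # shift before exponentiating to avoid overflow
        w = np.exp(log_w)
        w_i = w / w.sum()  # normalized weights, sum to 1
        neff = w_i.sum() ** 2 / (w_i ** 2).sum() / len(w_i)  # Kish ESS / N
        return w_i, neff

    w_i, neff = effective_sample_size([0.1, -0.3, 0.8, 0.0])
    print(w_i.sum(), neff)  # ~1.0, and neff near 1 for near-uniform weights

When the parameters barely change, deltaU is near zero, the weights are near uniform, and neff approaches 1; train_sampler re-runs the simulations once neff_min drops below neff_threshold.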
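Finally, a sketch of the gradient-accumulation pattern that train_sampler relies on to avoid the CUDA out-of-memory error documented in patch 08's Notes. This is a simplified standalone example, not the package's API; the only detail taken from the source is that the espaloma net returns a (loss, loss_dict) pair. Note that for accumulated gradients to survive until the update, zero_grad() must run once per epoch rather than once per batch.

    import torch

    def train_one_epoch(net, loader, optimizer):
        """Accumulate per-batch gradients, then take a single optimizer step."""
        optimizer.zero_grad()  # clear once so per-batch gradients accumulate
        accumulation_steps = len(loader)
        for batch in loader:
            loss, _ = net(batch)  # espaloma nets return (loss, loss_dict)
            # Scale so the summed gradients match a full-batch gradient
            (loss / accumulation_steps).backward()
        optimizer.step()  # one parameter update per epoch

Dividing each batch loss by the number of batches keeps the accumulated gradient equal (in expectation) to the gradient of the mean loss, which is why the patches divide both the data loss and the sampler loss by accumulation_steps.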