diff --git a/espfit/app/analysis.py b/espfit/app/analysis.py index 5235935..296cf4d 100644 --- a/espfit/app/analysis.py +++ b/espfit/app/analysis.py @@ -58,7 +58,7 @@ def output_directory_path(self, value): os.makedirs(value, exist_ok=True) - def load_traj(self, reference_pdb='solvated.pdb', trajectory_netcdf='traj.nc', stride=1, input_directory_path=None): + def load_traj(self, reference_pdb='solvated.pdb', trajectory_netcdf='traj.nc', stride=1, exclude_n_frames=0, input_directory_path=None): """Load MD trajectory. Parameters @@ -71,6 +71,11 @@ def load_traj(self, reference_pdb='solvated.pdb', trajectory_netcdf='traj.nc', s stride : int, optional Stride to load the trajectory. Default is 1. + + exclude_n_frames : float or int, optional + Exclude the first n frames from the trajectory. Default is 0. + If float, the first n percentange of frames will be excluded. + If int, the first n frames will be excluded. input_directory_path : str, optional Input directory path. Default is None. @@ -79,6 +84,10 @@ def load_traj(self, reference_pdb='solvated.pdb', trajectory_netcdf='traj.nc', s Returns ------- None + + Notes + ----- + * Should `stride` and `exclude_n_frames` be set as class attributes? """ import mdtraj @@ -114,6 +123,17 @@ def load_traj(self, reference_pdb='solvated.pdb', trajectory_netcdf='traj.nc', s else: self.traj = traj + # Exclude the first n frames from the trajectory + if isinstance(exclude_n_frames, int): + if 0 < exclude_n_frames <= traj.n_frames: + self.traj = self.traj[exclude_n_frames:] + elif isinstance(exclude_n_frames, float): + if 0 < exclude_n_frames < 1: + exclude_n_frames = int(exclude_n_frames * traj.n_frames) + self.traj = self.traj[exclude_n_frames:] + else: + raise ValueError(f"Invalid exclude_n_frames: {exclude_n_frames}. Expected int or float.") + class RNASystem(BaseDataLoader): """RNA system class to compute experimental observables from MD simulations. @@ -175,7 +195,7 @@ def radian_to_degree(self, a): return a - def compute_jcouplings(self, weights=None, couplings=None, residues=None): + def compute_jcouplings(self, weights=None, couplings=['H1H2', 'H2H3', 'H3H4'], residues=None): """Compute J-couplings from MD trajectory. Parameters @@ -184,7 +204,7 @@ def compute_jcouplings(self, weights=None, couplings=None, residues=None): Weights to compute the J-couplings. Default is None. couplings : str, optional - Name of the couplings to compute. Default is None. + Name of the couplings to compute. Default is ['H1H2', 'H2H3', 'H3H4']. If a list of couplings to be chosen from [H1H2,H2H3,H3H4,1H5P,2H5P,C4Pb,1H5H4,2H5H4,H3P,C4Pe,H1C2/4,H1C6/8] is provided, only the selected couplings will be computed. Otherwise, all couplings will be computed. @@ -208,7 +228,7 @@ def compute_jcouplings(self, weights=None, couplings=None, residues=None): """ import barnaba as bb - _logger.info("Compute J-couplings from MD trajectory") + _logger.debug("Compute J-couplings from MD trajectory") if couplings is not None: # Check if the provided coupling names are valid @@ -228,23 +248,25 @@ def compute_jcouplings(self, weights=None, couplings=None, residues=None): for i, resname in enumerate(resname_list): _values = values[:,i,:] # Coupling values of i-th residue values_by_names = dict() - for j, coupling_name in enumerate(couplings): + for j, coupling_name in enumerate(couplings): avg_raw = np.round(_values[:,j].mean(), 5) # e.g. mean value of H1H2 coupling of i-th residue std_raw = np.round(_values[:,j].std(), 5) # e.g. standard deviation of H1H2 coupling of i-th residue avg_raw = replace_nan_with_none(avg_raw) std_raw = replace_nan_with_none(std_raw) if weights is not None: arr = _values[:,j] * weights - #_logger.info(f'non-weighted: {_values[:,j]}') - #_logger.info(f'weights: {weights}') - #_logger.info(f'weighted: {arr}') + _logger.info(f'weights:\n{weights}') + _logger.info(f'non-weighted observalble:\n{_values[:,j]}') + _logger.info(f'weighted observable:\n{arr}') avg = np.round(arr.mean(), 5) std = np.round(arr.std(), 5) avg = replace_nan_with_none(avg) std = replace_nan_with_none(std) else: + _logger.info(f'No weights are provided. Using predicted values without reweighting.') avg = avg_raw std = std_raw + _logger.info(f'J-scalar ({coupling_name}-{resname}): avg={avg}, std={std}') values_by_names[coupling_name] = {'avg': avg, 'std': std, 'avg_raw': avg_raw, 'std_raw': std_raw} coupling_dict[resname] = values_by_names diff --git a/espfit/app/sampler.py b/espfit/app/sampler.py index eb211e5..387b684 100644 --- a/espfit/app/sampler.py +++ b/espfit/app/sampler.py @@ -38,8 +38,8 @@ class BaseSimulation(object): export_xml(exportSystem=True, exportState=True, exportIntegrator=True, output_directory_path=None): Export serialized system XML file and solvated pdb file. """ - def __init__(self, maxIterations=100, nsteps=250000, atomSubset='solute', - checkpoint_frequency=25000, logging_frequency=250000, netcdf_frequency=250000, + def __init__(self, maxIterations=100, nsteps=2500000, atomSubset='solute', + checkpoint_frequency=250000, logging_frequency=250000, netcdf_frequency=250000, output_directory_path=None, input_directory_path=None): """Initialize base simulation object. @@ -48,19 +48,19 @@ def __init__(self, maxIterations=100, nsteps=250000, atomSubset='solute', maxIterations : int, default=100 Maximum number of iterations to perform minimization. - nsteps : int, default=250000 (10 ns using 4 fs timestep) + nsteps : int, default=2500000 (10 ns using 4 fs timestep) Number of steps to run the simulation. atomSubset : str, default='solute' Subset of atoms to save. Default is 'solute'. Other options 'all' and 'not water'. - checkpoint_frequency : int, default=25000 (1 ns) + checkpoint_frequency : int, default=250000 (1 ns) Frequency (in steps) at which to write checkpoint files. - logging_frequency : int, default=250000 (10 ns) + logging_frequency : int, default=250000 (1 ns) Frequency (in steps) at which to write logging files. - netcdf_frequency : int, default=250000 (10 ns) + netcdf_frequency : int, default=250000 (1 ns) Frequency (in steps) at which to write netcdf files. output_directory_path : str, optional @@ -336,7 +336,7 @@ class SetupSampler(BaseSimulation): ['amber/protein.ff14SB.xml', 'amber/RNA.OL3.xml'] : pl-multi (TPO): NG, pl-single: OK, RNA: OK """ def __init__(self, - small_molecule_forcefield='espfit/data/forcefield/espaloma-0.3.2.pt', + small_molecule_forcefield='espaloma-0.3.2', forcefield_files = ['amber/ff14SB.xml', 'amber/phosaa14SB.xml'], water_model='tip3p', solvent_padding=9.0 * unit.angstroms, @@ -355,7 +355,8 @@ def __init__(self, Parameters ---------- small_molecule_forcefield : str, optional - The force field to be used for small molecules. Default is 'openff-2.1.0'. + The force field to be used for small molecules. Default is 'espaloma-0.3.2'. + Alternative recommended choice is 'openff-2.1.0'. forcefield_files : list, optional List of force field files. Default is ['amber14-all.xml']. water_model : str, optional @@ -378,11 +379,15 @@ def __init__(self, The integration timestep. Default is 4 * unit.femtoseconds. override_with_espaloma : bool, optional Whether to override the original parameters with espaloma. Default is True. + This will override all solute molecules with espaloma parameters. """ super(SetupSampler, self).__init__(**kwargs) + if small_molecule_forcefield == 'espaloma-0.3.2': + from importlib.resources import files + small_molecule_forcefield = str(files('espfit').joinpath("data/forcefield/espaloma-0.3.2.pt")) self.small_molecule_forcefield = small_molecule_forcefield self.water_model = water_model - self.forcefield_files = self._update_forcefield_files(forcefield_files) + self.forcefield_files = forcefield_files self.solvent_padding = solvent_padding self.ionic_strength = ionic_strength self.hmass = hmass @@ -394,12 +399,20 @@ def __init__(self, self.override_with_espaloma = override_with_espaloma self.target_class = None self.target_name = None + # Update forcefield file list to add water model files + self._update_forcefield_files() + # Get water class (3-site: tip3p, 4-site model: tip4pew) + self._get_water_class() @classmethod - def from_toml(cls, filename, *args, **override_sampler_kwargs): + def _from_toml(cls, filename, *args, **override_sampler_kwargs): """Create SetupSampler from a TOML configuration file. + Note that this is designed for creating new systems with temporary espaloma models generated during + espaloma training. It supports multiple systems in the configuration file and returns a list of + SetupSampler instances. + Parameters ---------- filename : str @@ -429,30 +442,50 @@ def from_toml(cls, filename, *args, **override_sampler_kwargs): print(e) raise - config = config['sampler']['setup'] # list + config = config['sampler']['setup'] # list of multiple setups if config is None: raise ValueError("target is not specified in the configuration file") + # If the configuration file contains a single system, convert it to a list + if not isinstance(config, list): + config = [config] + samplers = [] _logger.debug(f'Found {len(config)} systems in the configuration file') for _config in config: + # Create SetupSampler instance with default settings + # Note that this automatically adds the default water model (tip3p) to `self.forcefield_files`. sampler = cls() # Target information - target_class = _config['target_class'] - target_name = _config['target_name'] - + target_class = _config.get('target_class', None) + target_name = _config.get('target_name', None) sampler.target_class = target_class sampler.target_name = target_name - biopolymer_file = files('espfit').joinpath(f'data/target/{target_class}/{target_name}/target.pdb') - ligand_file = files('espfit').joinpath(f'data/target/{target_class}/{target_name}/ligand.sdf') - if not ligand_file.exists(): - ligand_file = None + # Get biopolymer and ligand file if given + # Priority 1: Use input files if given + biopolymer_file, ligand_file = None, None + if _config.get('biopolymer_file'): + biopolymer_file = _config['biopolymer_file'] + if not os.path.exists(biopolymer_file): + raise FileNotFoundError(f"File not found: {biopolymer_file}") + if _config.get('ligand_file'): + ligand_file = _config['ligand_file'] + if not os.path.exists(ligand_file): + raise FileNotFoundError(f"File not found: {ligand_file}") + # Priority 2: Search espfit/data/target directory if input files are not given + if biopolymer_file is None and ligand_file is None: + biopolymer_file = files('espfit').joinpath(f'data/target/{target_class}/{target_name}/target.pdb') + ligand_file = files('espfit').joinpath(f'data/target/{target_class}/{target_name}/ligand.sdf') + if not biopolymer_file.exists(): + raise FileNotFoundError(f"File not found: {biopolymer_file}") + if not ligand_file.exists(): + ligand_file = None # System settings for key, value in _config.items(): - if key not in ['target_class', 'target_name']: + if key not in ['target_class', 'target_name', 'biopolymer_file', 'ligand_file']: if hasattr(sampler, key): if isinstance(value, str) and "*" in value: _value = float(value.split('*')[0].strip()) @@ -462,7 +495,7 @@ def from_toml(cls, filename, *args, **override_sampler_kwargs): setattr(sampler, key, value) else: raise ValueError(f"Invalid keyword argument: {key}") - + # Pass temporary espaloma model to the sampler if kwargs are given for key, value in override_sampler_kwargs.items(): if hasattr(sampler, key): @@ -476,7 +509,18 @@ def from_toml(cls, filename, *args, **override_sampler_kwargs): sampler.output_directory_path = os.path.join(sampler.output_directory_path, sampler.target_name, f'{args[0]}') else: raise ValueError(f"Invalid argument: {args}. Expected a single integer value for the epoch number.") - + + # Empty and recreate `forcefield_files` to avoid appending multiple water model forcefields. + # Use list of forcefield files if given in the `override_sampler_kwargs`. If not given, use the default forcefield_files. + if 'forcefield_files' in override_sampler_kwargs.keys(): + pass + else: + forcefield_files = ['amber/ff14SB.xml', 'amber/phosaa14SB.xml'] + sampler.forcefield_files = forcefield_files + sampler._update_forcefield_files() + # Update water class (3-site: tip3p, 4-site model: tip4pew) + sampler._get_water_class() + # Create system sampler.create_system(biopolymer_file=biopolymer_file, ligand_file=ligand_file) samplers.append(sampler) @@ -485,7 +529,7 @@ def from_toml(cls, filename, *args, **override_sampler_kwargs): return samplers - def _update_forcefield_files(self, forcefield_files): + def _update_forcefield_files(self): """Get forcefield files. Update `forcefield_files` depending on the type of water model. @@ -499,28 +543,23 @@ def _update_forcefield_files(self, forcefield_files): # For some reason, the original forcefield_files keeps appending when SetupSampler is called. # TODO: Is this the right way to handle this issue? import copy - _forcefield_files = copy.deepcopy(forcefield_files) + _forcefield_files = copy.deepcopy(self.forcefield_files) # 3-site water models if self.water_model == 'tip3p': _forcefield_files.append(['amber/tip3p_standard.xml', 'amber/tip3p_HFE_multivalent.xml']) elif self.water_model == 'tip3pfb': - self.water_model = 'tip3p' _forcefield_files.append(['amber/tip3pfb_standard.xml', 'amber/tip3pfb_HFE_multivalent.xml']) elif self.water_model == 'spce': - self.water_model = 'tip3p' _forcefield_files.append(['amber/spce_standard.xml', 'amber/spce_HFE_multivalent.xml']) elif self.water_model == 'opc3': raise NotImplementedError('see https://github.com/choderalab/rna-espaloma/blob/main/experiment/nucleoside/script/create_system_espaloma.py#L366') # 4-site water models elif self.water_model == 'tip4pew': - self.water_model = 'tip4pew' _forcefield_files.append(['amber/tip4pew_standard.xml', 'amber/tip4pew_HFE_multivalent.xml']) elif self.water_model == 'tip4pfb': - self.water_model = 'tip4pew' _forcefield_files.append(['amber/tip4pfb_standard.xml', 'amber/tip4pfb_HFE_multivalent.xml']) elif self.water_model == 'opc': - self.water_model = 'tip4pew' _forcefield_files.append(['amber/opc_standard.xml']) else: raise NotImplementedError(f'Water model {self.water_model} is not supported.') @@ -532,7 +571,24 @@ def _update_forcefield_files(self, forcefield_files): else: new_forcefield_files.append(f) - return new_forcefield_files + #return new_forcefield_files + self.forcefield_files = new_forcefield_files + + + def _get_water_class(self): + """Get water class (3-site or 4-site) based on the water model. + + Set self.water_class to tip3p or tip4pew based on the water model. + + 3-site water models: tip3p, tip3pfb, spce + 4-site water models: tip4pew, tip4pfb, opc + """ + if self.water_model in ['tip3p', 'tip3pfb', 'spce']: + self.water_class = 'tip3p' + elif self.water_model in ['tip4pew', 'tip4pfb', 'opc']: + self.water_class = 'tip4pew' + else: + raise NotImplementedError(f'Water model {self.water_model} is not supported.') def _load_biopolymer_file(self): @@ -615,10 +671,10 @@ def create_system(self, biopolymer_file=None, ligand_file=None): from openmm import MonteCarloBarostat from openmm import LangevinMiddleIntegrator + # Load biopolymer and ligand files self._biopolymer_file = biopolymer_file self._ligand_file = ligand_file - # Load biopolymer and ligand files if self._biopolymer_file is None and self._ligand_file is None: raise ValueError("At least one biopolymer (.pdb) or ligand (.sdf) file must be provided") if self._biopolymer_file is not None: @@ -660,7 +716,7 @@ def create_system(self, biopolymer_file=None, ligand_file=None): # Solvate system _logger.debug("Solvating system...") modeller = app.Modeller(self._complex_topology, self._complex_positions) - modeller.addSolvent(self._system_generator.forcefield, model=self.water_model, padding=self.solvent_padding, ionicStrength=self.ionic_strength) + modeller.addSolvent(self._system_generator.forcefield, model=self.water_class, padding=self.solvent_padding, ionicStrength=self.ionic_strength) # Create system self.modeller_solvated_topology = modeller.getTopology() @@ -684,11 +740,13 @@ def create_system(self, biopolymer_file=None, ligand_file=None): # As a workaround, we will delete the original `self._system_generator` and create a new one to regenerate the system with espaloma. # Only water and ion forcefield files will be used to regenerate the system. Solute molecules will be parametrized with espaloma. # - _logger.debug("Regenerate system with espaloma.") + _logger.info("Regenerate system with espaloma") # Re-create system generator del self._system_generator - self.forcefield_files = self._update_forcefield_files(forcefield_files=[]) # Get water and ion forcefield files + #self.forcefield_files = self._update_forcefield_files(forcefield_files=[]) # Get water and ion forcefield files + self.forcefield_files = [] # Initialize forcefield_files to empty list and get water and ion forcefield files + self._update_forcefield_files() self._system_generator = SystemGenerator( forcefields=self.forcefield_files, forcefield_kwargs=forcefield_kwargs, periodic_forcefield_kwargs = periodic_forcefield_kwargs, barostat=barostat, small_molecule_forcefield=self.small_molecule_forcefield, cache=None, template_generator_kwargs=template_generator_kwargs) @@ -699,6 +757,13 @@ def create_system(self, biopolymer_file=None, ligand_file=None): self.new_solvated_system = self.modeller_solvated_system self.new_solvated_topology = self.modeller_solvated_topology + # Set force groups + # https://github.com/openmm/openmm-cookbook/blob/main/notebooks/cookbook/Analyzing%20Energy%20Contributions.ipynb + # OpenMM does not have a way to directly query the energy of a Force object. + # Instead, it lets you query the energy of a force group. We therefore need each Force object to be in a different group. + for i, f in enumerate(self.new_solvated_system.getForces()): + f.setForceGroup(i) + # Save solvated pdb file outfile = os.path.join(self.output_directory_path, f"solvated.pdb") with open(f"{outfile}", "w") as wf: @@ -729,8 +794,6 @@ def _regenerate_espaloma_system(self): import mdtraj from openff.toolkit import Molecule - _logger.debug("Regenerate system with espaloma") - # Check biopolymer chains mdtop = mdtraj.Topology.from_openmm(self.modeller_solvated_topology) chain_indices = [ chain.index for chain in self.modeller_solvated_topology.chains() ] diff --git a/espfit/app/train.py b/espfit/app/train.py index 0dd37a6..1d90360 100644 --- a/espfit/app/train.py +++ b/espfit/app/train.py @@ -384,6 +384,8 @@ def train(self): with torch.autograd.set_detect_anomaly(True): for i in range(self.restart_epoch, self.epochs): epoch = i + 1 # Start from epoch 1 (not zero-indexing) + _logger.info(f'Epoch {epoch}') + for g in ds_tr_loader: optimizer.zero_grad() # TODO: Better way to handle this? @@ -401,10 +403,10 @@ def train(self): if epoch % self.checkpoint_frequency == 0: # Note: returned loss is a joint loss of different units. loss = HARTREE_TO_KCALPERMOL * loss.pow(0.5).item() - _logger.info(f'Epoch {epoch}: loss={loss:.3f}') + _logger.info(f'Total loss: {loss:.3f}') self._save_checkpoint(epoch) - + def train_sampler(self, sampler_patience=800, neff_threshold=0.2, sampler_weight=1.0, debug=False): """ Train the Espaloma network model with sampler. @@ -426,6 +428,17 @@ def train_sampler(self, sampler_patience=800, neff_threshold=0.2, sampler_weight Returns ------- None + + Notes + ----- + Back-propagation is performed against accumulated gradients to avoid CUDA out of memory error. + Following error is raised if back-propagation is performed without gradient accumulation: + + > torch.cuda.OutOfMemoryError: CUDA out of memory. + > Tried to allocate 80.00 MiB (GPU 0; 10.75 GiB total capacity; + > 9.76 GiB already allocated; 7.62 MiB free; 10.40 GiB reserved in total by PyTorch) + > If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation. + > See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF """ from espfit.utils.sampler.reweight import SetupSamplerReweight @@ -451,104 +464,72 @@ def train_sampler(self, sampler_patience=800, neff_threshold=0.2, sampler_weight # Train ds_tr_loader = self.dataset_train.view(collate_fn='graph', batch_size=self.batch_size, shuffle=True) + accumulation_steps = len(ds_tr_loader) optimizer = torch.optim.Adam(self.net.parameters(), lr=self.learning_rate) with torch.autograd.set_detect_anomaly(True): for i in range(self.restart_epoch, self.epochs): epoch = i + 1 # Start from 1 (not zero-indexing) - - """ - # torch.cuda.OutOfMemoryError: CUDA out of memory. - # Tried to allocate 80.00 MiB (GPU 0; 10.75 GiB total capacity; - # 9.76 GiB already allocated; 7.62 MiB free; 10.40 GiB reserved in total by PyTorch) - # If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation. - # See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF - - loss = torch.tensor(0.0) - if torch.cuda.is_available(): - loss = loss.cuda("cuda:0") - - for g in ds_tr_loader: - optimizer.zero_grad() - if torch.cuda.is_available(): - g = g.to("cuda:0") - g.nodes["n1"].data["xyz"].requires_grad = True - - _loss, loss_dict = self.net(g) - loss += _loss + _logger.info(f'# Epoch {epoch}') + # Compute sampler loss if epoch > sampler_patience if epoch > self.sampler_patience: - # Save checkpoint as local model (net.pt) - # `neff_min` is -1 if SamplerReweight.samplers is None - samplers = self._setup_local_samplers(epoch, net_copy, debug) - neff_min = SamplerReweight.get_effective_sample_size(temporary_samplers=samplers) - - # If effective sample size is below threshold, update SamplerReweight.samplers and re-run simulaton + # Save checkpoint as local model (net.pt) and create list of sampler instances + temporary_samplers = self._setup_temporary_samplers(epoch, net_copy, debug) + + # Check the effective sample size (neff), computed for each sampler stored in `SamplerReweight.samplers`. + # Coordinates from `SamplerReweight.samplers` are used to compute the effective sample size and Boltzmann weights + # using force field parameters (Espaloma models) stored in `SamplerReweight.samplers` and `temporary_samplers`, + # which are the updated force field parameters from the current epoch. + # Re-run the simulations based on the minimum neff value. `neff_min` is set to -1 if `SamplerReweight.samplers` is None, + # which is the default state when `SetupSamplerReweight()` is initialized. + # Note that `SamplerReweight.get_effective_sample_size` updates the `SamplerReweight.weights_neff_dict` attribute. + neff_min = SamplerReweight.get_effective_sample_size(temporary_samplers=temporary_samplers) + + # Re-run simulation if `neff_min` is below threshold if neff_min < self.neff_threshold: - _logger.info(f'Minimum effective sample size ({neff_min:.3f}) below threshold ({self.neff_threshold})') - SamplerReweight.samplers = samplers + _logger.info(f'Minimum effective sample size ({neff_min:.3f}) below threshold ({self.neff_threshold:.3f})') + # Update SamplerReweight.sampler with the temporary_samplers + SamplerReweight.samplers = temporary_samplers SamplerReweight.run() - del samplers - + # Initiliaze `SamplerReweight.weights_neff_dict` to empty dictionary + _logger.info(f'Reset sampler weights') + SamplerReweight.weights_neff_dict = dict() + del temporary_samplers + # Compute sampler loss - loss_list = SamplerReweight.compute_loss() # list of torch.tensor - for sampler_index, sampler_loss in enumerate(loss_list): - sampler = SamplerReweight.samplers[sampler_index] - loss += sampler_loss * sampler_weight - loss_dict[f'{sampler.target_name}'] = sampler_loss.item() - loss_dict['neff'] = neff_min - - loss_dict['loss'] = loss.item() - self.report_loss(epoch, loss_dict) - - # Back propagate - loss.backward() - optimizer.step() - """ - - # Gradient accumulation - accumulation_steps = len(ds_tr_loader) + _loss = torch.tensor(0.0) + _loss_dict = dict() + if SamplerReweight.samplers: + sampler_loss_list = SamplerReweight.compute_loss() # list of torch.tensor + for sampler_index, sampler_loss in enumerate(sampler_loss_list): + sampler = SamplerReweight.samplers[sampler_index] + _loss += (sampler_loss/accumulation_steps) * sampler_weight + _loss_dict[f'{sampler.target_name}'] = sampler_loss.item() + _loss_dict['neff'] = neff_min + + # Accumulate gradiants for g in ds_tr_loader: optimizer.zero_grad() if torch.cuda.is_available(): g = g.to("cuda:0") g.nodes["n1"].data["xyz"].requires_grad = True - + # Compute loss loss, loss_dict = self.net(g) loss = loss/accumulation_steps - loss.backward() - - if epoch > self.sampler_patience: - # Save checkpoint as local model (net.pt) - # `neff_min` is -1 if SamplerReweight.samplers is None - samplers = self._setup_local_samplers(epoch, net_copy, debug) - neff_min = SamplerReweight.get_effective_sample_size(temporary_samplers=samplers) - - # If effective sample size is below threshold, update SamplerReweight.samplers and re-run simulaton - if neff_min < self.neff_threshold: - _logger.info(f'Minimum effective sample size ({neff_min:.3f}) below threshold ({self.neff_threshold})') - SamplerReweight.samplers = samplers - SamplerReweight.run() - del samplers - - # Compute sampler loss - loss_list = SamplerReweight.compute_loss() # list of torch.tensor - for sampler_index, sampler_loss in enumerate(loss_list): - sampler = SamplerReweight.samplers[sampler_index] - loss += sampler_loss * sampler_weight - loss_dict[f'{sampler.target_name}'] = sampler_loss.item() - loss.backward() - loss_dict['neff'] = neff_min - + # Add sampler loss + if epoch > self.sampler_patience: + loss += _loss + loss.backward(retain_graph=True) loss_dict['loss'] = loss.item() + if _loss_dict: + loss_dict.update(_loss_dict) self.report_loss(epoch, loss_dict) - - # Update optimizer.step() - + if epoch % self.checkpoint_frequency == 0: # Note: returned loss is a joint loss of different units. #_loss = HARTREE_TO_KCALPERMOL * loss.pow(0.5).item() - _logger.info(f'Epoch {epoch}: loss={loss.item():.3f}') + _logger.info(f'Total loss: {loss.item():.3f}') self._save_checkpoint(epoch) @@ -628,8 +609,8 @@ def _save_local_model(self, epoch, net_copy): self.save_model(net=net_copy, checkpoint_file=local_model, output_model=f"net.pt", output_directory_path=self.output_directory_path) - def _setup_local_samplers(self, epoch, net_copy, debug): - """Setup local samplers. + def _setup_temporary_samplers(self, epoch, net_copy, debug): + """Setup samplers using temporary espaloma models generated during training. Parameters ---------- @@ -666,7 +647,7 @@ def _setup_local_samplers(self, epoch, net_copy, debug): } # Create sampler system from configuration file. Returns list of systems. - samplers = SetupSampler.from_toml(self.configfile, *args, **override_sampler_kwargs) + samplers = SetupSampler._from_toml(self.configfile, *args, **override_sampler_kwargs) return samplers \ No newline at end of file diff --git a/espfit/tests/test_app_train_sampler.py b/espfit/tests/test_app_train_sampler.py index 1f71247..2de9625 100644 --- a/espfit/tests/test_app_train_sampler.py +++ b/espfit/tests/test_app_train_sampler.py @@ -68,7 +68,9 @@ def test_train_sampler(test_load_dataset, test_create_espaloma_from_toml): # Train sampler_patience = 10 # Force sampler to run after reaching sampler patience by setting neff_threshold to 1.0 - model.train_sampler(sampler_patience=sampler_patience, neff_threshold=1.0, sampler_weight=1) + # Set debug=True to use espaloma-0.3.2.pt instead of on-the-fly intermediate espaloma model. + # Epochs are too short for stable espaloma model. + model.train_sampler(sampler_patience=sampler_patience, neff_threshold=1.0, sampler_weight=1, debug=True) # Check outputs n_ckpt = len(glob.glob(os.path.join(model.output_directory_path, 'ckpt*pt'))) diff --git a/espfit/utils/sampler/reweight.py b/espfit/utils/sampler/reweight.py index cfb14bc..ba47a73 100644 --- a/espfit/utils/sampler/reweight.py +++ b/espfit/utils/sampler/reweight.py @@ -31,7 +31,9 @@ class SetupSamplerReweight(object): """ def __init__(self): self.samplers = None - self.weights = dict() # {'target_name': {'weights': w_i}, {'neff': neff}} + self.weights_neff_dict = dict() # {'target_name': {'weights': w_i}, {'neff': neff}} + self.force_group_names = ['HarmonicBondForce', 'HarmonicAngleForce', 'PeriodicTorsionForce', 'NonbondedForce'] + self.exclude_n_frames = 0.1 def run(self): @@ -42,7 +44,7 @@ def run(self): None """ for sampler in self.samplers: - _logger.info(f'Running simulation for {sampler.target_name} for {sampler.nsteps} steps...') + _logger.info(f'Re-run simulation for {sampler.target_name}') sampler.minimize() sampler.run() @@ -60,57 +62,77 @@ def get_effective_sample_size(self, temporary_samplers): float The minimum effective sample size among all samplers. """ - import mdtraj import numpy as np from openmm.unit import kilocalories_per_mole as kcalpermol from espfit.utils.units import KB_T_KCALPERMOL + from espfit.app.analysis import BaseDataLoader + _logger.info(f'Compute effective sample size and sampling weights') + if self.samplers is None: + _logger.info('No samplers found. Return effective sample size -1.') return -1 for sampler, temporary_sampler in zip(self.samplers, temporary_samplers): - _logger.info(f'Compute effective sample size and sampling weights for {sampler.target_name}') + _logger.info(f'Analyzing {sampler.target_name}...') # Get temperature temp0 = sampler.temperature._value temp1 = temporary_sampler.temperature._value assert temp0 == temp1, f'Temperature should be equivalent but got sampler {temp0} K and temporary sampler {temp1} K' beta = 1 / (KB_T_KCALPERMOL * temp0) - _logger.debug(f'beta temperature in kcal/mol: {beta}') + #_logger.debug(f'beta temperature in kcal/mol: {beta}') # Get position from trajectory - traj = mdtraj.load(sampler.output_directory_path + '/traj.nc', top=sampler.output_directory_path + '/solvated.pdb') - _logger.info(f'Found {traj.n_frames} frames in trajectory') + baseloader = BaseDataLoader(atomSubset=sampler.atomSubset) + baseloader.load_traj(input_directory_path=sampler.output_directory_path, exclude_n_frames=self.exclude_n_frames) + _logger.info(f'Found {baseloader.traj.n_frames} frames from {sampler.output_directory_path}') # Compute weights and effective sample size - log_w = [] - for i in range(traj.n_frames): + w_arr = [] + for i in range(baseloader.traj.n_frames): # U(x0, theta0) - sampler.simulation.context.setPositions(traj.openmm_positions(i)) - potential_energy = sampler.simulation.context.getState(getEnergy=True).getPotentialEnergy() + sampler.simulation.context.setPositions(baseloader.traj.openmm_positions(i)) + for gid, force in enumerate(sampler.simulation.system.getForces()): + if force.getName() in self.force_group_names: + try: + potential_energy += sampler.simulation.context.getState(getEnergy=True, groups={gid}).getPotentialEnergy() + except: + potential_energy = sampler.simulation.context.getState(getEnergy=True, groups={gid}).getPotentialEnergy() # U(x0, theta1) - temporary_sampler.simulation.context.setPositions(traj.openmm_positions(i)) - reduced_potential_energy = temporary_sampler.simulation.context.getState(getEnergy=True).getPotentialEnergy() + temporary_sampler.simulation.context.setPositions(baseloader.traj.openmm_positions(i)) + for gid, force in enumerate(temporary_sampler.simulation.system.getForces()): + if force.getName() in self.force_group_names: + try: + reduced_potential_energy += temporary_sampler.simulation.context.getState(getEnergy=True, groups={gid}).getPotentialEnergy() + except: + reduced_potential_energy = temporary_sampler.simulation.context.getState(getEnergy=True, groups={gid}).getPotentialEnergy() # deltaU = U(x0, theta1) - U(x0, theta0) delta = (reduced_potential_energy - potential_energy).value_in_unit(kcalpermol) - # log_w = ln(exp(-beta * delta)) + # w = ln(exp(-beta * delta)) w = -1 * beta * delta - log_w.append(w) + w_arr.append(w) - #_logger.debug(f'U(x0, theta0): {potential_energy.value_in_unit(kcalpermol):10.3f} kcal/mol') - #_logger.debug(f'U(x0, theta1): {reduced_potential_energy.value_in_unit(kcalpermol):10.3f} kcal/mol') - #_logger.debug(f'deltaU: {delta:10.3f} kcal/mol') - #_logger.debug(f'log_w: {w:10.3f}') + _logger.info(f'U(x0, theta0): {potential_energy.value_in_unit(kcalpermol):10.3f} kcal/mol') + _logger.info(f'U(x0, theta1): {reduced_potential_energy.value_in_unit(kcalpermol):10.3f} kcal/mol') + _logger.info(f'deltaU: {delta:10.3f} kcal/mol') + _logger.info(f'w: {w:10.3f}') # Compute weights and effective sample size (ratio: 0 to 1) - w_i = np.exp(log_w) / np.sum(np.exp(log_w)) + # Prevent RuntimeWarning: overflow encountered in exp + w_arr = np.float128(w_arr) + w_i = np.exp(w_arr) / np.sum(np.exp(w_arr)) neff = np.sum(w_i) ** 2 / np.sum(w_i ** 2) / len(w_i) - #_logger.debug(f'w_i_sum: {np.sum(w_i):10.3f}') - #_logger.debug(f'neff: {neff:10.3f}') + _logger.debug(f'w_i_sum: {np.sum(w_i):10.3f}') + _logger.debug(f'neff: {neff:10.3f}') + + # Check if sum of weights is 1 + sum_w = abs(np.sum(w_i)) + assert abs(1 - sum_w) <= 0.001, f"Weight sum {sum_w} is greater than tolerance 0.001" - self.weights[f'{sampler.target_name}'] = {'neff': neff, 'weights': w_i} - #_logger.info(f'{self.weights}') - neffs = [self.weights[key]['neff'] for key in self.weights.keys()] + self.weights_neff_dict[f'{sampler.target_name}'] = {'neff': neff, 'weights': w_i} + _logger.debug(f'{self.weights_neff_dict}') + neffs = [self.weights_neff_dict[key]['neff'] for key in self.weights_neff_dict.keys()] return min(neffs) @@ -125,7 +147,7 @@ def compute_loss(self): """ loss_list = [] for sampler in self.samplers: - _logger.info(f'Compute loss for {sampler.target_name}') + _logger.debug(f'Compute loss for {sampler.target_name}') loss = self._compute_loss_per_system(sampler) # torch.tensor loss_list.append(loss) @@ -228,11 +250,11 @@ def _compute_weighted_observable(self, atomSubset, target_name, output_directory # Load trajectory target = RNASystem(atomSubset=atomSubset) - target.load_traj(input_directory_path=output_directory_path) + target.load_traj(input_directory_path=output_directory_path, exclude_n_frames=self.exclude_n_frames) # Compute observable - if self.weights.keys(): - pred = target.compute_jcouplings(weights=self.weights[target_name]['weights']) + if self.weights_neff_dict.keys(): + pred = target.compute_jcouplings(weights=self.weights_neff_dict[target_name]['weights']) else: pred = target.compute_jcouplings(weights=None) _logger.debug(f'Computed observable: {pred}') diff --git a/pyproject.toml b/pyproject.toml index bffcfc8..4247bea 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -63,7 +63,10 @@ where = ["."] # Ref https://setuptools.pypa.io/en/latest/userguide/datafiles.html#package-data [tool.setuptools.package-data] espfit = [ - "py.typed" + "py.typed", + "data/target/nucleoside/*/target.pdb", + "data/target/nucleoside/*/experiment.yml", + "data/forcefield/espaloma-0.3.2.pt" ] [tool.versioningit]