diff --git a/README.md b/README.md
index 47b67ca..f23f56d 100644
--- a/README.md
+++ b/README.md
@@ -28,10 +28,26 @@ Infrastructure to train espaloma with experimental observables
 
 ### Quick Usage
+```python
+from espfit.utils.graphs import CustomGraphDataset
+path = 'espfit/data/qcdata/openff-toolkit-0.10.6/dgl2/protein-torsion-sm/'
+ds = CustomGraphDataset.load(path)
+ds.reshape_conformation_size(n_confs=50, include_min_energy_conf=True)
+ds.compute_relative_energy()
+# Create espaloma model
+from espfit.app.train import EspalomaModel
+filename = 'espfit/data/config/config.toml'
+# Override training settings in config.toml
+kwargs = {'output_directory_path': 'checkpoints', 'epochs': 100}
+model = EspalomaModel.from_toml(filename, **kwargs)
+model.dataset_train = ds
+# Set sampler settings
+model.train_sampler(sampler_patience=800, neff_threshold=0.2, sampler_weight=1)
+```
+### Standalone Usage
 
 #### Change logging
 ```python
-# load dgl graph data
 from espfit.utils import logging
 logging.get_logging_level()
 #>'INFO'
@@ -51,12 +67,11 @@ from espfit.app.train import EspalomaModel
 filename = 'espfit/data/config/config.toml'
 model = EspalomaModel.from_toml(filename)
 model.dataset_train = ds
-# Train
-model.train(output_directory_path='path/to/output')
-# To extend training, update the `epoch` in config.toml
-# Alternatively, do the following:
-model.config['espaloma']['train']['epochs'] = 50
-model.train(output_directory_path='path/to/output')
+# Change default training settings
+model.epochs = 100
+model.output_directory_path = 'path/to/output'
+# Train (default output directory is current path)
+model.train()
 ```
 
 #### Standard MD (default: espaloma-0.3.2 force field for solute molecules)
@@ -66,8 +81,10 @@
 ```python
 from espfit.app.sampler import SetupSampler
 c = SetupSampler()
-filename = 'espfit/data/target/testsystems/nucleoside/pdbfixer_min.pdb'
+filename = 'espfit/data/target/testsystems/nucleoside/target.pdb'
 c.create_system(biopolymer_file=filename)
-c.minimize(maxIterations=10)
-c.run(nsteps=10, output_directory_path='path/to/output')
+c.minimize()
+# Change default settings
+c.nsteps = 1000
+c.run()
 # Export to XML
 c.export_xml(exportSystem=True, exportState=True, exportIntegrator=True, output_directory_path='path/to/output')
 ```
@@ -76,7 +93,8 @@ c.export_xml(exportSystem=True, exportState=True, exportIntegrator=True, output_
 ```python
 from espfit.app.sampler import SetupSampler
 c = SetupSampler.from_xml(input_directory_path='path/to/input')
-c.run(nsteps=10, output_directory_path='path/to/output')
+c.nsteps = 1000
+c.run()
 ```
 
 #### Compute RNA J-couplings from MD trajectory
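For orientation, the J-couplings heading that the README hunk above ends on pairs with the `espfit.app.analysis` module renamed later in this patch. A minimal sketch of that workflow, assuming a directory containing the `solvated.pdb`/`traj.nc` files produced by the sampler (the input path here is a placeholder, and the coupling names are taken from the test suite in this patch):

```python
# Sketch: compute RNA J-couplings from an existing MD trajectory.
from espfit.app.analysis import RNASystem

data = RNASystem(input_directory_path='path/to/input')  # placeholder path
data.load_traj(reference_pdb='solvated.pdb', trajectory_netcdf='traj.nc')
# Pass a subset of coupling names, or couplings=None to compute all available ones
couplings = data.compute_jcouplings(couplings=['H1H2', 'H2H3', 'H3H4'])
print(couplings)  # nested dict: {resname: {coupling: {'avg': ..., 'std': ..., ...}}}
```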
diff --git a/devtools/conda-envs/README.md b/devtools/conda-envs/README.md
index 83704b9..83776c6 100644
--- a/devtools/conda-envs/README.md
+++ b/devtools/conda-envs/README.md
@@ -2,11 +2,11 @@
 >conda activate espfit
 >conda env export --from-history > test_env.yaml
->conda env create -f test_env.yaml -n test_env
+>conda env create -f test_env.yaml -n test_env
 >#uninstall openff-toolkit and install a customized version to support dgl graphs created using openff-toolkit=0.10.6
 >conda uninstall --force openff-toolkit
 >pip install git+https://github.com/kntkb/openff-toolkit.git@7e9d0225782ef723083407a1cbf1f4f70631f934
->#uninstall openmmforcefields if < 0.12.0
+>#uninstall openmmforcefields if < 0.12.0
 >#use pip instead of mamba to avoid dependency issues with ambertools and python
 >conda uninstall --force openmmforcefields
 >pip install git+https://github.com/openmm/openmmforcefields@0.12.0
diff --git a/espfit/__init__.py b/espfit/__init__.py
index d312d34..564e5cb 100644
--- a/espfit/__init__.py
+++ b/espfit/__init__.py
@@ -25,4 +25,4 @@
 #from .espfit import *
 
-#from ._version import __version__
+from ._version import __version__
diff --git a/espfit/app/experiment.py b/espfit/app/analysis.py
similarity index 70%
rename from espfit/app/experiment.py
rename to espfit/app/analysis.py
index 86165a7..5235935 100644
--- a/espfit/app/experiment.py
+++ b/espfit/app/analysis.py
@@ -1,11 +1,5 @@
 """
 Compute experimental observables from MD simulations.
-
-Notes
------
-
-TODO
-----
 """
 import os
 import numpy as np
@@ -17,20 +11,19 @@ class BaseDataLoader(object):
     """Base class for data loader.
 
-    TODO
-    ----
-    * Add more methods to check trajectory information (e.g. number of frames, number of atoms, etc.)
-
     Methods
     -------
-    load_traj(reference_pdb='solvated.pdb', trajectory_netcdf='traj.nc', atom_indices=None, stride=1):
+    load_traj(reference_pdb='solvated.pdb', trajectory_netcdf='traj.nc', stride=1):
         Load MD trajectory.
     """
-    def __init__(self, input_directory_path=None, output_directory_path=None):
+    def __init__(self, atomSubset='solute', input_directory_path=None, output_directory_path=None):
         """Initialize base data loader object.
 
         Parameters
         ----------
+        atomSubset : str, default='solute'
+            Subset of atoms to save. Default is 'solute'. Other options are 'all' and 'not water'.
+
         input_directory_path : str, optional
             Input directory path. Default is None.
             If None, the current working directory will be used.
@@ -39,6 +32,10 @@ def __init__(self, input_directory_path=None, output_directory_path=None):
             Output directory path. Default is None.
             If None, the current working directory will be used.
         """
+        self.atomSubset = atomSubset
+        if self.atomSubset not in ['solute', 'all', 'not water']:
+            raise ValueError(f"Invalid atomSubset: {self.atomSubset}. Expected 'solute', 'all', or 'not water'.")
+
         if input_directory_path is None:
             input_directory_path = os.getcwd()
         if output_directory_path is None:
@@ -61,8 +58,7 @@ def output_directory_path(self, value):
         os.makedirs(value, exist_ok=True)
 
-    # Should this be a classmethod?
-    def load_traj(self, reference_pdb='solvated.pdb', trajectory_netcdf='traj.nc', atom_indices=None, stride=1, input_directory_path=None):
+    def load_traj(self, reference_pdb='solvated.pdb', trajectory_netcdf='traj.nc', stride=1, input_directory_path=None):
         """Load MD trajectory.
 
         Parameters
         ----------
         reference_pdb : str, optional
             Reference pdb file name. Default is 'solvated.pdb'.
 
         trajectory_netcdf : str, optional
             Trajectory netcdf file name. Default is 'traj.nc'.
 
-        atom_indices : list, optional
-            List of atom indices to load from trajectory. Default is None.
-            If None, all atoms will be loaded.
-
         stride : int, optional
             Stride to load the trajectory. Default is 1.
 
         input_directory_path : str, optional
             Input directory path. Default is None.
             If None, the current working directory will be used.
+
+        Returns
+        -------
+        None
         """
         import mdtraj
 
@@ -92,22 +88,31 @@ def load_traj(self, reference_pdb='solvated.pdb', trajectory_netcdf='traj.nc', a
         # Load reference pdb (solvated system)
         pdb = os.path.join(self.input_directory_path, reference_pdb)
         ref_traj = mdtraj.load(pdb)
+
         # Select atoms to load from trajectory
-        if atom_indices is None:
+        if self.atomSubset == 'all':
+            self.atom_indices = None
+            self.ref_traj = ref_traj
+        else:
             self.atom_indices = []
             mdtop = ref_traj.topology
-            res = [ r for r in mdtop.residues if r.name not in ('HOH', 'NA', 'CL', 'K') ]
+            if self.atomSubset == 'solute':
+                res = [ r for r in mdtop.residues if r.name not in ('HOH', 'NA', 'CL', 'K') ]
+            elif self.atomSubset == 'not water':
+                res = [ r for r in mdtop.residues if r.name not in ('HOH',) ]
+            # Get atom indices
             for r in res:
                 for a in r.atoms:
                     self.atom_indices.append(a.index)
-        else:
-            self.atom_indices = atom_indices
-        self.ref_traj = ref_traj.atom_slice(self.atom_indices)
-
+            self.ref_traj = ref_traj.atom_slice(self.atom_indices)
+
         # Load trajectory
         netcdf = os.path.join(self.input_directory_path, trajectory_netcdf)
         traj = mdtraj.load(netcdf, top=self.ref_traj.topology, stride=stride)
-        self.traj = traj.atom_slice(self.atom_indices)
+        if self.atom_indices:
+            self.traj = traj.atom_slice(self.atom_indices)
+        else:
+            self.traj = traj
 
 
 class RNASystem(BaseDataLoader):
@@ -170,15 +175,14 @@ def radian_to_degree(self, a):
         return a
 
-    def compute_jcouplings(self, couplings=None, residues=None):
+    def compute_jcouplings(self, weights=None, couplings=None, residues=None):
         """Compute J-couplings from MD trajectory.
 
-        TODO
-        ----
-        * Compute confidence interval.
-
         Parameters
         ----------
+        weights : numpy.ndarray, optional
+            Weights to compute the J-couplings. Default is None.
+
         couplings : str, optional
             Name of the couplings to compute. Default is None.
             If a list of couplings to be chosen from [H1H2,H2H3,H3H4,1H5P,2H5P,C4Pb,1H5H4,2H5H4,H3P,C4Pe,H1C2/4,H1C6/8]
@@ -204,7 +208,7 @@
         """
         import barnaba as bb
 
-        _logger.info("Computing J-couplings from MD trajectory...")
+        _logger.info("Compute J-couplings from MD trajectory")
 
         if couplings is not None:
             # Check if the provided coupling names are valid
@@ -216,15 +220,32 @@
         # residue_list: list of M nucleobases
         values, resname_list = bb.jcouplings_traj(self.traj, couplings=couplings, residues=residues)
 
+        # Convert numpy.float to float to avoid serialization issues
+        replace_nan_with_none = lambda x: None if np.isscalar(x) and np.isnan(x) else x.item()
+
         # Loop over residues and couplings to store the computed values
         coupling_dict = dict()
         for i, resname in enumerate(resname_list):
             _values = values[:,i,:]   # Coupling values of i-th residue
             values_by_names = dict()
             for j, coupling_name in enumerate(couplings):
-                avg = _values[:,j].mean()   # Mean value of H1H2 coupling of i-th residue
-                std = _values[:,j].std()    # Standard deviation of H1H2 coupling of i-th residue
-                values_by_names[coupling_name] = {'avg': avg, 'std': std}
+                avg_raw = np.round(_values[:,j].mean(), 5)   # e.g. mean value of H1H2 coupling of i-th residue
+                std_raw = np.round(_values[:,j].std(), 5)    # e.g. standard deviation of H1H2 coupling of i-th residue
+                avg_raw = replace_nan_with_none(avg_raw)
+                std_raw = replace_nan_with_none(std_raw)
+                if weights is not None:
+                    arr = _values[:,j] * weights
+                    #_logger.info(f'non-weighted: {_values[:,j]}')
+                    #_logger.info(f'weights: {weights}')
+                    #_logger.info(f'weighted: {arr}')
+                    avg = np.round(arr.mean(), 5)
+                    std = np.round(arr.std(), 5)
+                    avg = replace_nan_with_none(avg)
+                    std = replace_nan_with_none(std)
+                else:
+                    avg = avg_raw
+                    std = std_raw
+                values_by_names[coupling_name] = {'avg': avg, 'std': std, 'avg_raw': avg_raw, 'std_raw': std_raw}
             coupling_dict[resname] = values_by_names
 
         return coupling_dict
@@ -242,4 +263,18 @@ def get_available_couplings(self):
         import barnaba as bb
         available_coupling_names = list(bb.definitions.couplings_idx.keys())
         return available_coupling_names
-    
\ No newline at end of file
+
+
+#
+# Future work?
+#
+class ProteinSystem(BaseDataLoader):
+    def __init__(self, **kwargs):
+        super(ProteinSystem, self).__init__(**kwargs)
+        raise NotImplementedError("ProteinSystem class is not implemented yet.")
+
+
+class ProteinLigandSystem(BaseDataLoader):
+    def __init__(self, **kwargs):
+        super(ProteinLigandSystem, self).__init__(**kwargs)
+        raise NotImplementedError("ProteinLigandSystem class is not implemented yet.")
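A note on the `weights` branch of `compute_jcouplings` above: the textbook importance-weighted mean and standard deviation over frames look like the sketch below, assuming the weights are normalized to sum to one. The patch computes `(values * weights).mean()`, which matches `np.sum(weights * values)` only up to a factor of the frame count, so the weight convention passed in matters; this sketch is a reference point, not the espfit implementation.

```python
import numpy as np

def weighted_stats(values, weights):
    """Importance-weighted mean/std over trajectory frames.

    Assumes `weights` is a 1D array over frames with weights.sum() == 1.
    """
    avg = np.sum(weights * values)
    var = np.sum(weights * (values - avg) ** 2)
    return avg, np.sqrt(var)

# Example: 4 frames with uniform weights reduce to the plain mean/std
values = np.array([6.1, 5.9, 6.3, 5.7])
weights = np.full(4, 0.25)
print(weighted_stats(values, weights))  # (6.0, ~0.2236)
```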
""" + self.maxIterations = maxIterations + self.nsteps = nsteps + self.atomSubset = atomSubset + self.checkpoint_frequency = checkpoint_frequency + self.logging_frequency = logging_frequency + self.netcdf_frequency = netcdf_frequency + + if self.atomSubset not in ['solute', 'all', 'not water']: + raise ValueError(f"Invalid atomSubset: {self.atomSubset}. Expected 'solute', 'all', or 'not water'.") + if output_directory_path is None: output_directory_path = os.getcwd() # Is this right? if input_directory_path is None: @@ -93,7 +123,7 @@ def _get_platform(self): from openmmtools.utils import get_fastest_platform platform = get_fastest_platform() platform_name = platform.getName() - _logger.info(f"Fastest platform: {platform_name}") + _logger.debug(f"Fastest platform: {platform_name}") if platform_name == "CUDA": platform.setPropertyDefaultValue('DeterministicForces', 'true') # default is false platform.setPropertyDefaultValue('Precision', 'mixed') # default is single @@ -101,42 +131,28 @@ def _get_platform(self): return platform - def minimize(self, maxIterations=100): + def minimize(self, output_directory_path=None): """Minimize solvated system. - Parameters - ---------- - maxIterations : int, default=100 - Maximum number of iterations to perform. + output_directory_path : str, default=None + The path to the output directory. If None, the default output directory is used. Returns ------- None """ - _logger.info(f"Minimizing system...") - self.simulation.minimizeEnergy(maxIterations) + if output_directory_path is not None: + self.output_directory_path = output_directory_path # property decorator is called + + _logger.debug(f"Minimizing system for maximum {self.maxIterations} steps") + self.simulation.minimizeEnergy(self.maxIterations) - def run(self, checkpoint_frequency=25000, logging_frequency=250000, netcdf_frequency=250000, nsteps=250000, atom_indices=None, output_directory_path=None): + def run(self, output_directory_path=None): """Run standard MD simulation. Parameters ---------- - checkpoint_frequency : int, default=25000 (1 ns) - Frequency (in steps) at which to write checkpoint files. - - logging_frequency : int, default=250000 (10 ns) - Frequency (in steps) at which to write logging files. - - netcdf_frequency : int, default=250000 (10 ns) - Frequency (in steps) at which to write netcdf files. - - nsteps : int, default=250000 (10 ns) - Number of steps to run the simulation. - - atom_indices : list, default=None - List of atom indices to save. If None, save all atoms except water and ions. - output_directory_path : str, default=None The path to the output directory. If None, the default output directory is used. 
@@ -144,37 +160,35 @@ def run(self, checkpoint_frequency=25000, logging_frequency=250000, netcdf_frequ
         Returns
         -------
         None
         """
-        self.checkpoint_frequency = checkpoint_frequency
-        self.logging_frequency = logging_frequency
-        self.netcdf_frequency = netcdf_frequency
-        self.nsteps = nsteps
-        self.atom_indices = atom_indices
+        import mdtraj
+        from mdtraj.reporters import NetCDFReporter
+        from openmm.app import CheckpointReporter, StateDataReporter
+
         if output_directory_path is not None:
             self.output_directory_path = output_directory_path   # property decorator is called
 
         # Select atoms to save
-        import mdtraj
-        if self.atom_indices is None:
+        if self.atomSubset == 'all':
+            self.atom_indices = None
+        else:
             self.atom_indices = []
             mdtop = mdtraj.Topology.from_openmm(self.simulation.topology)
-            res = [ r for r in mdtop.residues if r.name not in ('HOH', 'NA', 'CL', 'K') ]
+            if self.atomSubset == 'solute':
+                res = [ r for r in mdtop.residues if r.name not in ('HOH', 'NA', 'CL', 'K') ]
+            elif self.atomSubset == 'not water':
+                res = [ r for r in mdtop.residues if r.name not in ('HOH',) ]
             for r in res:
                 for a in r.atoms:
                     self.atom_indices.append(a.index)
 
         # Define reporter
-        from mdtraj.reporters import NetCDFReporter
-        from openmm.app import CheckpointReporter, StateDataReporter
-
         self._check_file_exists("traj.nc")
         self.simulation.reporters.append(NetCDFReporter(os.path.join(self.output_directory_path, f"traj.nc"),
                                                         min(self.netcdf_frequency, self.nsteps),
                                                         atomSubset=self.atom_indices))
-
         self._check_file_exists("checkpoint.chk")
         self.simulation.reporters.append(CheckpointReporter(os.path.join(self.output_directory_path, f"checkpoint.chk"),
                                                             min(self.checkpoint_frequency, self.nsteps)))
-
         self._check_file_exists("reporter.log")
         self.simulation.reporters.append(StateDataReporter(os.path.join(self.output_directory_path, f"reporter.log"),
                                                            min(self.logging_frequency, self.nsteps),
@@ -182,7 +196,7 @@
                                                            totalEnergy=True, temperature=True, volume=True, density=True, speed=True))
 
         # Run
-        _logger.info(f"Run MD simulation for {self.nsteps} steps")
+        _logger.info(f"Running simulation for {self.nsteps} steps...")
         self.simulation.step(self.nsteps)
 
@@ -212,7 +226,7 @@ def export_xml(self, exportSystem=True, exportState=True, exportIntegrator=True,
         None
         """
         from openmm import XmlSerializer
-        _logger.info(f"Serialize and export system")
+        _logger.debug(f"Serialize and export system")
 
         if output_directory_path is not None:
             # Create a new output directory different from the one specified when the SetupSampler instance was created.
@@ -296,13 +310,21 @@ class SetupSampler(BaseSimulation):
     create_system(biopolymer_file=None, ligand_file=None):
         Create biopolymer-ligand system and export serialized system XML file and solvated pdb file.
 
+    from_toml(filename, *args, **override_sampler_kwargs):
+        Create SetupSampler from a TOML configuration file.
+
+    from_xml(filename):
+        Create SetupSampler from a serialized system XML file.
+
     Examples
     --------
     >>> from espfit.app.sampler import SetupSampler
     >>> c = SetupSampler()
     >>> c.create_system(biopolymer_file='protein.pdb', ligand_file='ligand.sdf')
-    >>> c.minimize(maxIterations=10)
-    >>> c.run(nsteps=10)
+    >>> c.maxIterations = 10   # change default setting
+    >>> c.minimize()
+    >>> c.nsteps = 100   # change default setting
+    >>> c.run()
 
     Notes
     -----
@@ -314,18 +336,15 @@ class SetupSampler(BaseSimulation):
     ['amber/protein.ff14SB.xml', 'amber/RNA.OL3.xml']   : pl-multi (TPO): NG, pl-single: OK, RNA: OK
     """
     def __init__(self,
-                 #small_molecule_forcefield='openff-2.1.0',
                  small_molecule_forcefield='espfit/data/forcefield/espaloma-0.3.2.pt',
                  forcefield_files = ['amber/ff14SB.xml', 'amber/phosaa14SB.xml'],
                  water_model='tip3p',
                  solvent_padding=9.0 * unit.angstroms,
                  ionic_strength=0.15 * unit.molar,
-                 constraints=app.HBonds,
                  hmass=3.0 * unit.amu,
                  temperature=300.0 * unit.kelvin,
                  pressure=1.0 * unit.atmosphere,
                  pme_tol=2.5e-04,
-                 nonbonded_method=app.PME,
                  barostat_period=50,
                  timestep=4 * unit.femtoseconds,
                  override_with_espaloma=True,
@@ -345,8 +364,6 @@ def __init__(self,
             The padding distance around the solute in the solvent box. Default is 9.0 * unit.angstroms.
         ionic_strength : Quantity, optional
             The ionic strength of the solvent. Default is 0.15 * unit.molar.
-        constraints : object, optional
-            The type of constraints to be applied to the system. Default is app.HBonds.
         hmass : Quantity, optional
             The mass of the hydrogen atoms. Default is 3.0 * unit.amu.
         temperature : Quantity, optional
@@ -355,8 +372,6 @@ def __init__(self,
             The pressure of the system. Default is 1.0 * unit.atmosphere.
         pme_tol : float, optional
             The Ewald error tolerance for PME electrostatics. Default is 2.5e-04.
-        nonbonded_method : object, optional
-            The nonbonded method to be used for the system. Default is app.PME.
         barostat_period : int, optional
             The frequency at which the barostat is applied. Default is 50.
         timestep : Quantity, optional
@@ -370,15 +385,104 @@ def __init__(self,
         self.forcefield_files = self._update_forcefield_files(forcefield_files)
         self.solvent_padding = solvent_padding
         self.ionic_strength = ionic_strength
-        self.constraints = constraints
         self.hmass = hmass
         self.temperature = temperature
         self.pressure = pressure
         self.pme_tol = pme_tol
-        self.nonbonded_method = nonbonded_method
        self.barostat_period = barostat_period
         self.timestep = timestep
         self.override_with_espaloma = override_with_espaloma
+        self.target_class = None
+        self.target_name = None
+
+
+    @classmethod
+    def from_toml(cls, filename, *args, **override_sampler_kwargs):
+        """Create SetupSampler from a TOML configuration file.
+
+        Parameters
+        ----------
+        filename : str
+            The path to the TOML configuration file.
+
+        *args : list
+            This is used to update the output directory path during espaloma training.
+            The list should contain a single integer value, corresponding to the epoch number.
+
+        **override_sampler_kwargs : dict
+            The dictionary of keyword arguments to override the default settings of the
+            BaseSimulation and SetupSampler classes. This option is intended for creating
+            new systems with temporary espaloma models generated during espaloma training.
+
+        Returns
+        -------
+        samplers : list of SetupSampler instances
+        """
+        import tomllib
+        from espfit.utils.units import convert_string_to_unit
+        from importlib.resources import files
+
+        try:
+            with open(filename, 'rb') as f:
+                config = tomllib.load(f)
+        except FileNotFoundError as e:
+            print(e)
+            raise
+
+        config = config['sampler']['setup']   # list
+        if config is None:
+            raise ValueError("target is not specified in the configuration file")
+
+        samplers = []
+        _logger.debug(f'Found {len(config)} systems in the configuration file')
+        for _config in config:
+            sampler = cls()
+
+            # Target information
+            target_class = _config['target_class']
+            target_name = _config['target_name']
+
+            sampler.target_class = target_class
+            sampler.target_name = target_name
+
+            biopolymer_file = files('espfit').joinpath(f'data/target/{target_class}/{target_name}/target.pdb')
+            ligand_file = files('espfit').joinpath(f'data/target/{target_class}/{target_name}/ligand.sdf')
+            if not ligand_file.exists():
+                ligand_file = None
+
+            # System settings
+            for key, value in _config.items():
+                if key not in ['target_class', 'target_name']:
+                    if hasattr(sampler, key):
+                        if isinstance(value, str) and "*" in value:
+                            _value = float(value.split('*')[0].strip())
+                            unit_string = value.split('*')[1].strip()
+                            unit_mapping = convert_string_to_unit(unit_string)
+                            value = _value * unit_mapping
+                        setattr(sampler, key, value)
+                    else:
+                        raise ValueError(f"Invalid keyword argument: {key}")
+
+            # Pass temporary espaloma model to the sampler if kwargs are given
+            for key, value in override_sampler_kwargs.items():
+                if hasattr(sampler, key):
+                    setattr(sampler, key, value)
+                else:
+                    raise ValueError(f"Invalid keyword argument: {key}")
+
+            # Update output directory path if args (epoch) is given
+            if args:
+                if len(args) == 1 and isinstance(args[0], int):
+                    sampler.output_directory_path = os.path.join(sampler.output_directory_path, sampler.target_name, f'{args[0]}')
+                else:
+                    raise ValueError(f"Invalid argument: {args}. Expected a single integer value for the epoch number.")
+
+            # Create system
+            sampler.create_system(biopolymer_file=biopolymer_file, ligand_file=ligand_file)
+            samplers.append(sampler)
+            del sampler
+
+        return samplers
 
     def _update_forcefield_files(self, forcefield_files):
@@ -491,7 +595,7 @@ def _get_complex(self):
 
         return complex_topology, complex_positions
 
-    
+
     def create_system(self, biopolymer_file=None, ligand_file=None):
         """Create biopolymer-ligand system and export serialized system XML file and solvated pdb file.
@@ -531,8 +635,8 @@ def create_system(self, biopolymer_file=None, ligand_file=None):
 
         # Initialize system generator.
         _logger.debug("Initialize system generator")
-        forcefield_kwargs = {'removeCMMotion': True, 'ewaldErrorTolerance': self.pme_tol, 'constraints' : self.constraints, 'rigidWater': True, 'hydrogenMass' : self.hmass}
-        periodic_forcefield_kwargs = {'nonbondedMethod': self.nonbonded_method}
+        forcefield_kwargs = {'removeCMMotion': True, 'ewaldErrorTolerance': self.pme_tol, 'constraints' : app.HBonds, 'rigidWater': True, 'hydrogenMass' : self.hmass}
+        periodic_forcefield_kwargs = {'nonbondedMethod': app.PME}
         barostat = MonteCarloBarostat(self.pressure, self.temperature, self.barostat_period)
 
         # SystemGenerator will automatically load the TemplateGenerator based on the given `small_molecule_forcefield`.
@@ -550,11 +654,11 @@ def create_system(self, biopolymer_file=None, ligand_file=None):
                                                 template_generator_kwargs=template_generator_kwargs)
 
         if ligand_file is not None:
-            _logger.info("Add molecules to system generator")
+            _logger.debug("Add molecules to system generator")
             self._system_generator.template_generator.add_molecules(self._ligand_offmol)
 
         # Solvate system
-        _logger.info("Solvating system...")
+        _logger.debug("Solvating system...")
         modeller = app.Modeller(self._complex_topology, self._complex_positions)
         modeller.addSolvent(self._system_generator.forcefield, model=self.water_model, padding=self.solvent_padding, ionicStrength=self.ionic_strength)
 
@@ -572,7 +676,7 @@ def create_system(self, biopolymer_file=None, ligand_file=None):
         # (espfit/data/target/testsystems/nucleoside/pdbfixer_min.pdb).
         # No explicit error message was given. It failed to show the following logging information:
         #
-        # _logger.info(f'Requested to generate parameters for residue {residue}')
+        # _logger.debug(f'Requested to generate parameters for residue {residue}')
         # https://github.com/openmm/openmmforcefields/blob/main/openmmforcefields/generators/template_generators.py#L285
         #
         # However, it works for protein test systems (espfit/data/target/testsystems/protein-ligand/target.pdb).
@@ -580,7 +684,7 @@ def create_system(self, biopolymer_file=None, ligand_file=None):
         # As a workaround, we will delete the original `self._system_generator` and create a new one to regenerate the system with espaloma.
         # Only water and ion forcefield files will be used to regenerate the system. Solute molecules will be parametrized with espaloma.
         #
-        _logger.info("Regenerate system with espaloma.")
+        _logger.debug("Regenerate system with espaloma.")
 
         # Re-create system generator
         del self._system_generator
@@ -625,13 +729,13 @@ def _regenerate_espaloma_system(self):
         import mdtraj
         from openff.toolkit import Molecule
 
-        _logger.info("Regenerate system with espaloma")
+        _logger.debug("Regenerate system with espaloma")
 
         # Check biopolymer chains
         mdtop = mdtraj.Topology.from_openmm(self.modeller_solvated_topology)
         chain_indices = [ chain.index for chain in self.modeller_solvated_topology.chains() ]
         biopolymer_chain_indices = [ chain_index for chain_index in chain_indices if mdtop.select(f"not (water or resname NA or resname K or resname CL or resname UNK) and chainid == {chain_index}").any() ]
-        _logger.info(f"Biopolymer chain indices: {biopolymer_chain_indices}")
+        _logger.debug(f"Biopolymer chain indices: {biopolymer_chain_indices}")
 
         # Get OpenMM topology of solute with one residue per molecule.
         # Espaloma will use residue name "XX". Check conflicting residue names.
@@ -640,14 +744,14 @@ def _regenerate_espaloma_system(self):
             raise Exception('Found conflict residue name in biopolymer.')
 
         # Initialize espaloma topology
-        # TODO: From software engineering point of view, should this be `self.new_solvated_topology` or `new_solvated_topology`?
+        # TODO: Should this be `self.new_solvated_topology` or `new_solvated_topology`?
         self.new_solvated_topology = app.Topology()
         self.new_solvated_topology.setPeriodicBoxVectors(self.modeller_solvated_topology.getPeriodicBoxVectors())
         new_atoms = {}
 
         # Regenerate biopolymer topology
         chain_index = 0
-        _logger.info(f"Regenerating biopolymer topology...")
+        _logger.debug(f"Regenerating biopolymer topology...")
         for chain in self.modeller_solvated_topology.chains():
             new_chain = self.new_solvated_topology.addChain(chain.id)
             # Convert biopolymer into a single residue
@@ -712,7 +816,7 @@ def _update_espaloma_topology(self):
         -------
         app.Topology : The updated topology reflecting the new system.
         """
-        _logger.info("Update residue names in espaloma topology.")
+        _logger.debug("Update residue names in espaloma topology.")
 
         # Get original residue names.
         atom_name_lookup = []
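`SetupSampler.from_toml` above turns strings like `"10.0 * angstroms"` into quantities by splitting on `*` and mapping the unit name through `espfit.utils.units.convert_string_to_unit`, whose body is not part of this diff. A minimal stand-in with the same contract might look like the sketch below; the supported unit names are an assumption based on the config file later in this patch.

```python
from openmm import unit

# Hypothetical stand-in for espfit.utils.units.convert_string_to_unit:
# map a unit name from the TOML file to an openmm.unit object.
_UNIT_MAP = {
    'angstroms': unit.angstroms,
    'molar': unit.molar,
    'kelvin': unit.kelvin,
}

def convert_string_to_unit(unit_string):
    try:
        return _UNIT_MAP[unit_string]
    except KeyError:
        raise ValueError(f"Unsupported unit: {unit_string}")

# Mirrors the parsing in SetupSampler.from_toml
value = "10.0 * angstroms"
magnitude, unit_name = (s.strip() for s in value.split('*'))
quantity = float(magnitude) * convert_string_to_unit(unit_name)  # 10.0 angstroms
```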
diff --git a/espfit/app/train.py b/espfit/app/train.py
index aa35a08..e4cbfdb 100644
--- a/espfit/app/train.py
+++ b/espfit/app/train.py
@@ -4,80 +4,30 @@
 TODO
 ----
 * Add support to use multiple GPUs
-* Add support to validate model? (or use independent script?)
-* Add support to save model? (or use independent script?)
+* Improve how data are parsed using dataclasses or pydantic
 """
+import os
+import torch
+import espaloma as esp
 import logging
 
 _logger = logging.getLogger(__name__)
 
 
-class EspalomaModel(object):
-    """Espaloma network model and training modules.
-
-    Methods
-    -------
-    from_toml(filename):
-        Load espaloma configuration file in TOML format.
-
-    Examples
-    --------
-    >>> from espfit.app.train import EspalomaModel
-    >>> filename = 'espfit/data/config/config.toml'
-    >>> # create espaloma network model from toml file
-    >>> model = EspalomaModel.from_toml(filename)
-    >>> # check espaloma network model
-    >>> model.net
-    >>> # load training dataset
-    >>> model.dataset_train = ds
-    >>> model.train()
-    """
-
-    def __init__(self, net=None, dataset_train=None, dataset_validation=None, dataset_test=None, random_seed=2666, config=None, output_directory_path=None):
-        """Initialize an instance of the class with an Espaloma network model and a random seed.
-
-        This constructor method sets up the Espaloma network model, the training, validation, test datasets,
-        a configuratino file, and the random seed that will be used throughout the training process.
-        If no model or datasets are provided, the corresponding attributes will be set to None. If no random seed is
-        provided, the `random_seed` attribute will be set to 2666.
-
-        Parameters
-        ----------
-        net : torch.nn.Sequential, default=None
-            The Espaloma network model to be used for training.
-
-        dataset_train : espfit.utils.data.graphs.CustomGraphDataset or espaloma.data.dataset.GraphDataset, default=None
-            The training dataset. espaloma.graphs.graph.Graph. If not provided, the `train_data` attribute will be set to None.
-
-        dataset_validation : espfit.utils.data.graphs.CustomGraphDataset or espaloma.data.dataset.GraphDataset, default=None
-            The validation dataset. If not provided, the `validation_data` attribute will be set to None.
-
-        dataset_test : Dataset, espfit.utils.data.graphs.CustomGraphDataset or espaloma.data.dataset.GraphDataset, default=None
-            The test dataset. If not provided, the `test_data` attribute will be set to None.
+class EspalomaBase(object):
+    def __init__(self):
+        # Check if GPU is available
+        if torch.cuda.is_available():
+            _logger.debug('GPU is available for training.')
+        else:
+            _logger.debug('GPU is not available for training.')
 
-        random_seed : int, default=2666
-            The random seed used throughout the espaloma training.
-
-        config : dict, default=None
-            The configuration for the espaloma model. If not provided, the `config` attribute will be set to None.
-
-        output_directory_path : str, default=None
-            The directory where the model checkpoints should be saved.
-            If not provided, the checkpoints will be saved in the current working directory.
-        """
-        self.net = net
-        self.dataset_train = dataset_train
-        self.dataset_validation = dataset_validation
-        self.dataset_test = dataset_test
-        self.random_seed = random_seed
-        self.config = None   # TODO: Better way to handle this?
-        if output_directory_path is None:
-            import os
-            self.output_directory_path = os.getcwd()
+        # Check torch data type
+        _logger.debug(f'Torch data type is {torch.get_default_dtype()}')
 
     @classmethod
-    def from_toml(cls, filename):
+    def from_toml(cls, filename, **override_espalomamodel_kwargs):
         """Create an instance of the class from a TOML configuration file.
 
         This method reads a TOML file specified by `filename`, extracts the 'espaloma'
@@ -90,6 +40,10 @@
         filename : str
             Path to the TOML file containing the configuration for the espaloma model.
 
+        override_espalomamodel_kwargs : dict
+            A dictionary of keyword arguments to override the default settings for the
+            espaloma model.
+
         Returns
         -------
         object
@@ -103,29 +57,33 @@
         except FileNotFoundError as e:
             print(e)
             raise
-        #model = cls.create_model(config['espaloma'])
-
-        # TODO: Better way to handle this?
-        #model = cls(model)
-        #model.config = config
-
+
         model = cls()
         net = model.create_model(config['espaloma'])
         model.net = net
-        model.config = config
+        model.configfile = filename
 
-        return model
+        # Update training settings
+        for key, value in config['espaloma']['train'].items():
+            if hasattr(model, key):
+                setattr(model, key, value)
+            else:
+                raise ValueError(f'Invalid attribute {key}.')
 
+        # Override training settings
+        for key, value in override_espalomamodel_kwargs.items():
+            if hasattr(model, key):
+                setattr(model, key, value)
+            else:
+                raise ValueError(f'Invalid attribute {key}.')
 
-    @staticmethod
-    def create_model(espaloma_config):
-        """Create an Espaloma network model using the provided configuration.
+        return model
 
-        This function constructs a PyTorch Sequential model with two stages of Graph Neural Network (GNN) layers,
-        JanossyPooling readout layers for various features, and additional layers for energy computation and loss calculation.
-        The specifics of the GNN layers and the readout layers are controlled by the `espaloma_config` dictionary.
-        If a CUDA-compatible GPU is available, the model is moved to the GPU before being returned.
 
+    @staticmethod
+    def _get_base_module(espaloma_config):
+        """Create base modules for Espaloma network model.
+
         Parameters
         ----------
         espaloma_config : dict
             A dictionary containing the configuration
@@ -135,11 +93,9 @@
 
         Returns
         -------
-        torch.nn.Sequential
-            The constructed Espaloma network model.
+        list
+            A list of modules for the Espaloma network model.
         """
-        import espaloma as esp
-
         # GNN
         gnn_method = 'SAGEConv'
         gnn_options = {}
@@ -174,78 +130,153 @@
         # Improper torsions (multiplicity n=2)
         readout_improper = esp.nn.readout.janossy.JanossyPoolingWithSmirnoffImproper(in_features=units, config=config_2, out_features={"k": 2})
 
-        # Get loss weights
-        # TODO: Better way to handle this?
+        # Initialize loss weights and update if provided
         weights = { 'energy': 1.0, 'force': 1.0, 'charge': 1.0, 'torsion': 1.0, 'improper': 1.0 }
         if 'weights' in espaloma_config.keys():
             for key in espaloma_config['weights'].keys():
                 weights[key] = espaloma_config['weights'][key]
 
-        # Define espaloma architecture
-        import torch
+        # Append base modules
+        modules = []
+        modules.append(representation)
+        modules.append(readout)
+        modules.append(readout_improper)
+        modules.append(esp.nn.readout.janossy.ExpCoefficients())
+        modules.append(esp.nn.readout.charge_equilibrium.ChargeEquilibrium())
+
+        return modules, weights
+
+
+    @staticmethod
+    def create_model(espaloma_config):
+        """Create an Espaloma network model using the provided configuration.
+
+        This function constructs a PyTorch Sequential model with two stages of Graph Neural Network (GNN) layers,
+        JanossyPooling readout layers for various features, and additional layers for energy computation and loss calculation.
+        The specifics of the GNN layers and the readout layers are controlled by the `espaloma_config` dictionary.
+        If a CUDA-compatible GPU is available, the model is moved to the GPU before being returned.
+
+        Parameters
+        ----------
+        espaloma_config : dict
+            A dictionary containing the configuration for the Espaloma network.
+            This includes the method and options for the GNN layers, the configurations for the two stages of the network,
+            and optionally the weights for different loss components.
+
+        Returns
+        -------
+        torch.nn.Sequential
+            The constructed Espaloma network model.
+        """
         from espfit.utils.espaloma.module import GetLoss
-        net = torch.nn.Sequential(
-            representation,
-            readout,
-            readout_improper,
-            esp.nn.readout.janossy.ExpCoefficients(),
-            esp.nn.readout.charge_equilibrium.ChargeEquilibrium(),
-            esp.mm.geometry.GeometryInGraph(),
-            esp.mm.energy.EnergyInGraph(terms=["n2", "n3", "n4", "n4_improper"]),
-            GetLoss(weights),
-        )
+
+        # Get base model
+        modules, weights = EspalomaBase._get_base_module(espaloma_config)
+
+        # Define espaloma architecture
+        modules.append(esp.mm.geometry.GeometryInGraph())
+        modules.append(esp.mm.energy.EnergyInGraph(terms=["n2", "n3", "n4", "n4_improper"]))
+        modules.append(GetLoss(weights))
+
+        # Create model
+        net = torch.nn.Sequential(*modules)
 
         if torch.cuda.is_available():
             return net.cuda()
         else:
             return net
 
+
+    def save_model(self, net=None, best_model=None, model_name='espaloma.pt', output_directory_path=None):
+        """Save the Espaloma network model to a file.
+
+        This method saves the Espaloma network model to a file in the specified output directory.
+
+        Parameters
+        ----------
+        net : torch.nn.Sequential
+            The Espaloma network model to be saved.
 
-    def _restart_checkpoint(self, output_directory_path):
-        """Load the last checkpoint and restart the training process.
+        best_model : str
+            The path to the best model file.
 
-        This method finds all the checkpoint files in the directory specified by `output_directory_path`,
-        loads the last checkpoint (e.g. net100.pt), and restarts the training process from the next step. If no
-        checkpoint files are found, the training process starts from the first step.
+        model_name : str, default='espaloma.pt'
+            The name of the file to save the model to.
 
-        Parameters
-        ----------
-        output_directory_path : str
-            The directory where the checkpoint files are stored.
+        output_directory_path : str, default=None
+            The directory where the model should be saved.
+            If not provided, the model will be saved in the current working directory.
 
         Returns
         -------
-        int
-            The step from which the training process should be restarted.
+        None
         """
-        import os
-        import glob
-        import torch
-
-        checkpoints = glob.glob("{}/*.pt".format(output_directory_path))
-
-        if checkpoints:
-            n = [ int(c.split('net')[1].split('.')[0]) for c in checkpoints ]
-            n.sort()
-            restart_epoch = n[-1]
-            restart_checkpoint = os.path.join(output_directory_path, f"net{restart_epoch}.pt")
-            self.net.load_state_dict(torch.load(restart_checkpoint))
-            logging.info(f'Restarting from ({restart_checkpoint}).')
+        if output_directory_path is not None:
+            os.makedirs(output_directory_path, exist_ok=True)
         else:
-            restart_epoch = 0
+            output_directory_path = os.getcwd()
+
+        if net:
+            modules = []
+            for module in net:
+                if isinstance(module, esp.mm.geometry.GeometryInGraph):
+                    break
+                modules.append(module)
+            modules.append(esp.nn.readout.janossy.LinearMixtureToOriginal())
+            net = torch.nn.Sequential(*modules)
+        else:
+            raise ValueError('No model provided.')
 
-        return restart_epoch
+        # Save model
+        state_dict = torch.load(best_model, map_location=torch.device('cpu'))
+        net.load_state_dict(state_dict)
+        torch.save(net, os.path.join(output_directory_path, model_name))
+
+
+class EspalomaModel(EspalomaBase):
+    """Espaloma network model and training modules.
+
+    Methods
+    -------
+    from_toml(filename):
+        Load espaloma configuration file in TOML format.
+
+    Examples
+    --------
+    >>> from espfit.app.train import EspalomaModel
+    >>> filename = 'espfit/data/config/config.toml'
+    >>> # create espaloma network model from toml file
+    >>> model = EspalomaModel.from_toml(filename)
+    >>> # check espaloma network model
+    >>> model.net
+    >>> # load training dataset
+    >>> model.dataset_train = ds
+    >>> model.train()
+    """
 
-    def train(self, epochs=1000, batch_size=128, learning_rate=1e-4, checkpoint_frequency=10, output_directory_path=None):
-        """
-        Train the Espaloma network model.
+    def __init__(self, net=None, dataset_train=None, dataset_validation=None, dataset_test=None,
+                 epochs=1000, batch_size=128, learning_rate=1e-4, checkpoint_frequency=10,
+                 random_seed=2666, output_directory_path=None):
+        """Initialize an instance of the class with an Espaloma network model and a random seed.
 
-        This method trains the Espaloma network model using the training dataset. The training process can be customized
-        by specifying the number of epochs, batch size, learning rate, checkpoint frequency, and an output directory.
-        The method also supports restarting the training from a checkpoint.
+        This constructor method sets up the Espaloma network model, the training, validation, test datasets,
+        a configuration file, and the random seed that will be used throughout the training process.
+        If no model or datasets are provided, the corresponding attributes will be set to None. If no random seed is
+        provided, the `random_seed` attribute will be set to 2666.
 
         Parameters
         ----------
+        net : torch.nn.Sequential, default=None
+            The Espaloma network model to be used for training.
+
+        dataset_train : espfit.utils.data.graphs.CustomGraphDataset or espaloma.data.dataset.GraphDataset, default=None
+            The training dataset. espaloma.graphs.graph.Graph. If not provided, the `train_data` attribute will be set to None.
+
+        dataset_validation : espfit.utils.data.graphs.CustomGraphDataset or espaloma.data.dataset.GraphDataset, default=None
+            The validation dataset. If not provided, the `validation_data` attribute will be set to None.
+
+        dataset_test : Dataset, espfit.utils.data.graphs.CustomGraphDataset or espaloma.data.dataset.GraphDataset, default=None
+            The test dataset. If not provided, the `test_data` attribute will be set to None.
+
         epochs : int, default=1000
             The number of epochs to train the model for.
@@ -258,85 +289,328 @@
         checkpoint_frequency : int, default=10
             The frequency at which the model should be saved.
 
+        random_seed : int, default=2666
+            The random seed used throughout the espaloma training.
+
         output_directory_path : str, default=None
-            The directory where the model checkpoints should be saved. If None, the default output directory is used.
+            The directory where the model checkpoints should be saved.
+            If not provided, the checkpoints will be saved in the current working directory.
+        """
+        super().__init__()
+        self.net = net
+        self.dataset_train = dataset_train
+        self.dataset_validation = dataset_validation
+        self.dataset_test = dataset_test
+        self.epochs = epochs
+        self.batch_size = batch_size
+        self.learning_rate = learning_rate
+        self.checkpoint_frequency = checkpoint_frequency
+        self.restart_epoch = 0
+        self.random_seed = random_seed
+        if output_directory_path is None:
+            output_directory_path = os.getcwd()
+        self.output_directory_path = output_directory_path
+
+
+    @property
+    def output_directory_path(self):
+        """Get output directory path."""
+        return self._output_directory_path
+
+
+    @output_directory_path.setter
+    def output_directory_path(self, value):
+        """Set output directory path."""
+        self._output_directory_path = value
+        # Create output directory if it does not exist
+        os.makedirs(value, exist_ok=True)
+
+
+    def report_loss(self, epoch, loss_dict):
+        """Report loss.
+
+        Parameters
+        ----------
+        loss_dict : dict
+            The loss trajectory that stores individual weighted losses at a given epoch.
 
         Returns
         -------
         None
         """
-        import os
-        import torch
-        from pathlib import Path
+        import pandas as pd
+
+        log_file_path = os.path.join(self.output_directory_path, 'reporter.log')
+        df_new = pd.DataFrame.from_dict(loss_dict, orient='index').T
+        df_new.insert(0, 'epoch', epoch)
 
-        if torch.cuda.is_available():
-            _logger.info('GPU is available for training.')
+        if os.path.exists(log_file_path):
+            df_old = pd.read_csv(log_file_path, sep='\t')
+            df = pd.concat([df_old, df_new])
+        else:
+            df = df_new
+        df.to_csv(log_file_path, sep='\t', float_format='%.4f', index=False)
 
-            # Change default device to GPU if available
-            # Will this map all data onto GPU and cause memory error if the data is too large?
-            # https://pytorch.org/tutorials/recipes/recipes/changing_default_device.html
-            #torch.set_default_device('cuda')
-        else:
-            _logger.info('GPU is not available for training.')
 
+    def train(self):
+        """
+        Train the Espaloma network model.
+ + Returns + ------- + None + """ + from espfit.utils.units import HARTREE_TO_KCALPERMOL - # Check if training dataset is provided if self.dataset_train is None: raise ValueError('Training dataset is not provided.') - # Espaloma settings for training - config = self.config['espaloma']['train'] - epochs = config.get('epochs', epochs) - batch_size = config.get('batch_size', batch_size) - learning_rate = config.get('learning_rate', learning_rate) - checkpoint_frequency = config.get('checkpoint_frequency', checkpoint_frequency) - if output_directory_path is not None: - self.output_directory_path = output_directory_path - # Create output directory if not exists - os.makedirs(output_directory_path, exist_ok=True) - - # Restart from checkpoint if exists - restart_epoch = self._restart_checkpoint(output_directory_path) - if restart_epoch >= epochs: - _logger.info(f'Already trained for {epochs} epochs.') - return - elif restart_epoch > 0: - _logger.info(f'Training for additional {epochs-restart_epoch} epochs.') - else: - _logger.info(f'Training from scratch for {epochs} epochs.') + # Load checkpoint + self.restart_epoch = self._load_checkpoint() # Train # https://github.com/choderalab/espaloma/blob/main/espaloma/app/train.py#L33 # https://github.com/choderalab/espaloma/blob/main/espaloma/data/dataset.py#L310 - from espfit.utils.units import HARTEE_TO_KCALPERMOL - ds_tr_loader = self.dataset_train.view(collate_fn='graph', batch_size=batch_size, shuffle=True) - optimizer = torch.optim.Adam(self.net.parameters(), lr=learning_rate) + ds_tr_loader = self.dataset_train.view(collate_fn='graph', batch_size=self.batch_size, shuffle=True) + optimizer = torch.optim.Adam(self.net.parameters(), lr=self.learning_rate) with torch.autograd.set_detect_anomaly(True): - for i in range(restart_epoch, epochs): + for i in range(self.restart_epoch, self.epochs): epoch = i + 1 # Start from epoch 1 (not zero-indexing) for g in ds_tr_loader: optimizer.zero_grad() - # TODO: Better way to handle this? if torch.cuda.is_available(): g = g.to("cuda:0") - g.nodes["n1"].data["xyz"].requires_grad = True - loss = self.net(g) + #loss = self.net(g) + loss, loss_dict = self.net(g) loss.backward() optimizer.step() + + loss_dict['loss'] = loss.item() + self.report_loss(epoch, loss_dict) + + if epoch % self.checkpoint_frequency == 0: + # Note: returned loss is a joint loss of different units. + loss = HARTREE_TO_KCALPERMOL * loss.pow(0.5).item() + _logger.info(f'Epoch {epoch}: loss={loss:.3f}') + self._save_checkpoint(epoch) + + + def train_sampler(self, sampler_patience=800, neff_threshold=0.2, sampler_weight=1.0, debug=False): + """ + Train the Espaloma network model with sampler. + + Parameters + ---------- + sampler_patience : int, default=800 + The number of epochs to wait before using sampler. + + neff_threshold : float, default=0.2 + The minimum effective sample size threshold. + + sampler_weight : float, default=1.0 + The weight for the sampler loss. + + debug : bool, default=False + If True, use espaloma-0.3.pt for debugging. + + Returns + ------- + None + """ + from espfit.utils.sampler.reweight import SetupSamplerReweight + + # Note: RuntimeError will be raised if copy.deepcopy is used. + # RuntimeError: one of the variables needed for gradient computation has been modified by an inplace + # operation: [torch.cuda.FloatTensor [512, 1]], which is output 0 of AsStridedBackward0, is at version 2; + # expected version 1 instead. Hint: the backtrace further above shows the operation that failed to + # compute its gradient. 
+        import copy
+        net_copy = copy.deepcopy(self.net)
+
+        self.sampler_patience = sampler_patience
+        self.neff_threshold = neff_threshold
+
+        if self.dataset_train is None:
+            raise ValueError('Training dataset is not provided.')
+
+        # Load checkpoint
+        self.restart_epoch = self._load_checkpoint()
+
+        # Initialize
+        SamplerReweight = SetupSamplerReweight()
+
+        # Train
+        ds_tr_loader = self.dataset_train.view(collate_fn='graph', batch_size=self.batch_size, shuffle=True)
+        optimizer = torch.optim.Adam(self.net.parameters(), lr=self.learning_rate)
+        with torch.autograd.set_detect_anomaly(True):
+            for i in range(self.restart_epoch, self.epochs):
+                epoch = i + 1   # Start from 1 (not zero-indexing)
-
+                loss = torch.tensor(0.0)
+                if torch.cuda.is_available():
+                    loss = loss.cuda("cuda:0")
+
+                for g in ds_tr_loader:
+                    optimizer.zero_grad()
+                    if torch.cuda.is_available():
+                        g = g.to("cuda:0")
+                    g.nodes["n1"].data["xyz"].requires_grad = True
+
+                    _loss, loss_dict = self.net(g)
+                    loss += _loss
+
+                if epoch > self.sampler_patience:
+                    # Save checkpoint as local model (net.pt)
+                    # `neff_min` is -1 if SamplerReweight.samplers is None
+                    samplers = self._setup_local_samplers(epoch, net_copy, debug)
+                    neff_min = SamplerReweight.get_effective_sample_size(temporary_samplers=samplers)
+
+                    # If effective sample size is below threshold, update SamplerReweight.samplers and re-run simulation
+                    if neff_min < self.neff_threshold:
+                        _logger.info(f'Minimum effective sample size ({neff_min:.3f}) below threshold ({self.neff_threshold})')
+                        SamplerReweight.samplers = samplers
+                        SamplerReweight.run()
+                    del samplers
+
+                    # Compute sampler loss
+                    loss_list = SamplerReweight.compute_loss()   # list of torch.tensor
+                    for sampler_index, sampler_loss in enumerate(loss_list):
+                        sampler = SamplerReweight.samplers[sampler_index]
+                        loss += sampler_loss * sampler_weight
+                        loss_dict[f'{sampler.target_name}'] = sampler_loss.item()
+                    loss_dict['neff'] = neff_min
+
+                loss_dict['loss'] = loss.item()
+                self.report_loss(epoch, loss_dict)
+
+                # Back propagate
+                loss.backward()
+                optimizer.step()
+
+                if epoch % self.checkpoint_frequency == 0:
+                    # Note: returned loss is a joint loss of different units.
+                    #_loss = HARTREE_TO_KCALPERMOL * loss.pow(0.5).item()
+                    _logger.info(f'Epoch {epoch}: loss={loss.item():.3f}')
+                    self._save_checkpoint(epoch)
+
+
+    def _load_checkpoint(self):
+        """Load the last checkpoint and restart the training process.
+
+        This method finds all the checkpoint files in the output directory, loads the
+        last checkpoint (e.g. ckpt100.pt), and restarts the training process from the next step.
+        If no checkpoint files are found, the training process starts from the first step.
+
+        Returns
+        -------
+        int
+            The step from which the training process should be restarted.
+ """ + import sys + import glob + + checkpoints = glob.glob("{}/*.pt".format(self.output_directory_path)) + + if checkpoints: + n = [ int(c.split('ckpt')[1].split('.')[0]) for c in checkpoints ] + n.sort() + restart_epoch = n[-1] + restart_checkpoint = os.path.join(self.output_directory_path, f"ckpt{restart_epoch}.pt") + self.net.load_state_dict(torch.load(restart_checkpoint)) + logging.info(f'Restarting from ({restart_checkpoint}).') + else: + restart_epoch = 0 + + if restart_epoch >= self.epochs: + _logger.info(f'Already trained for {self.epochs} epochs.') + sys.exit(0) + elif restart_epoch > 0: + _logger.info(f'Training for additional {self.epochs-restart_epoch} epochs.') + else: + _logger.info(f'Training from scratch for {self.epochs} epochs.') + + return restart_epoch - def validate(): - raise NotImplementedError + def _save_checkpoint(self, epoch): + """Save local model. + + Parameters + ---------- + epoch : int + The epoch number. + Returns + ------- + None + """ + checkpoint_file = os.path.join(self.output_directory_path, f"ckpt{epoch}.pt") + torch.save(self.net.state_dict(), checkpoint_file) + + + def _save_local_model(self, epoch, net_copy): + """Save local model (force field). + + Parameters + ---------- + epoch : int + The epoch number. + + net_copy : torch.nn.Sequential + A deep copy of the Espaloma network model. + + Returns + ------- + None + """ + # Save checkpoint as temporary espaloma model (force field) + _logger.info(f'Save ckpt{epoch}.pt as temporary espaloma model (net.pt)') + self._save_checkpoint(epoch) + local_model = os.path.join(self.output_directory_path, f"ckpt{epoch}.pt") + self.save_model(net=net_copy, best_model=local_model, model_name=f"net.pt", output_directory_path=self.output_directory_path) + + + def _setup_local_samplers(self, epoch, net_copy, debug): + """Setup local samplers. + + Parameters + ---------- + epoch : int + The epoch number. + + net_copy : torch.nn.Sequential + A deep copy of the Espaloma network model. + + debug : bool + If True, use espaloma-0.3.2.pt for debugging. + + Returns + ------- + list + A list of sampler systems. + """ + from espfit.app.sampler import SetupSampler + + self._save_local_model(epoch, net_copy) + + # Define sampler settings with override arguments + args = [epoch] + if debug == True: + from importlib.resources import files + small_molecule_forcefield = str(files('espfit').joinpath("data/forcefield/espaloma-0.3.2.pt")) + else: + small_molecule_forcefield = os.path.join(self.output_directory_path, f"net.pt") + + override_sampler_kwargs = { + "atomSubset": 'all', + "small_molecule_forcefield": small_molecule_forcefield, + "output_directory_path": self.output_directory_path + } + + # Create sampler system from configuration file. Returns list of systems. 
diff --git a/espfit/data/config/config.toml b/espfit/data/config/config.toml
index 1cab4a8..a3f9138 100644
--- a/espfit/data/config/config.toml
+++ b/espfit/data/config/config.toml
@@ -1,7 +1,7 @@
 # configuration for gnn
 [espaloma.gnn]
 method = "SAGEConv"
-aggregator_type = 'mean'
+aggregator_type = "mean"
 feat_drop = 0.1
 
 # configuration for stage 1 (gnn) & 2 (janossy pooling)
@@ -9,13 +9,6 @@ feat_drop = 0.1
 stage1 = [ 512, "relu", 0.1, 512, "relu", 0.1, 512, "relu", 0.1 ]   # (units, activation, dropout)
 stage2 = [ 512, "relu", 0.1, 512, "relu", 0.1, 512, "relu", 0.1, 512, "relu", 0.1 ]   # (units, activation, dropout)
 
-# training
-[espaloma.train]
-epochs = 20
-batch_size = 128
-learning_rate = 1e-4
-checkpoint_frequency = 10
-
 # loss weights
 [espaloma.weights]
 energy = 1.0
@@ -23,3 +16,39 @@ force = 1.0
 charge = 1.0
 torsion = 1.0
 improper = 1.0
+
+# training settings
+[espaloma.train]
+epochs = 10
+batch_size = 128
+learning_rate = 1e-4
+checkpoint_frequency = 1
+
+
+# System setup parameters
+[[sampler.setup]]
+target_class = "nucleoside"
+target_name = "cytidine"
+water_model = "tip3p"
+solvent_padding = "10.0 * angstroms"
+ionic_strength = "0.08 * molar"   # 80 mM NaCl
+temperature = "303.15 * kelvin"
+maxIterations = 100
+nsteps = 1000
+checkpoint_frequency = 10
+logging_frequency = 1
+netcdf_frequency = 10
+
+# System setup parameters
+[[sampler.setup]]
+target_class = "nucleoside"
+target_name = "adenosine"
+water_model = "tip3p"
+solvent_padding = "10.0 * angstroms"
+ionic_strength = "0.08 * molar"   # 80 mM NaCl
+temperature = "303.15 * kelvin"
+maxIterations = 100
+nsteps = 1000
+checkpoint_frequency = 10
+logging_frequency = 1
+netcdf_frequency = 10
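Because `[[sampler.setup]]` is a TOML array of tables, each block above parses to one dict, which is exactly what `SetupSampler.from_toml` iterates over. A quick way to inspect the configuration (requires Python 3.11+ for `tomllib`):

```python
import tomllib

with open('espfit/data/config/config.toml', 'rb') as f:
    config = tomllib.load(f)

# One dict per [[sampler.setup]] block, in file order
for setup in config['sampler']['setup']:
    print(setup['target_class'], setup['target_name'], setup['nsteps'])
# nucleoside cytidine 1000
# nucleoside adenosine 1000
```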
operator: error: - beta_2: + 2H5P: + name: beta_2 value: operator: error: - gamma_1: + 1H5H4: + name: gamma_1 value: operator: error: - gamma_2: + 2H5H4: + name: gamma_2 value: operator: error: - epsilon: + H3P: + name: epsilon value: operator: error: - nu_1: + H1H2: + name: nu_1 value: 4.02 operator: error: - nu_2: + H2H3: + name: nu_2 value: 5.49 operator: error: - nu_3: + H3H4: + name: nu_3 value: 6.15 operator: error: diff --git a/espfit/data/target/nucleoside/cytidine/pdbfixer_min.pdb b/espfit/data/target/nucleoside/cytidine/target.pdb similarity index 100% rename from espfit/data/target/nucleoside/cytidine/pdbfixer_min.pdb rename to espfit/data/target/nucleoside/cytidine/target.pdb diff --git a/espfit/data/target/nucleoside/guanosine/experiment.yml b/espfit/data/target/nucleoside/guanosine/experiment.yml index 8adf911..46bcbc4 100644 --- a/espfit/data/target/nucleoside/guanosine/experiment.yml +++ b/espfit/data/target/nucleoside/guanosine/experiment.yml @@ -9,40 +9,49 @@ experiment_1: name: nmr concentration: 0.2 mM temperature: 303.15 * kelvin + reference: table S12 comment: sequence: g smiles: measurement: resi_1: - beta_1: + 1H5P: + name: beta_1 value: operator: error: - beta_2: + 2H5P: + name: beta_2 value: operator: error: - gamma_1: + 1H5H4: + name: gamma_1 value: operator: error: - gamma_2: + 2H5H4: + name: gamma_2 value: operator: error: - epsilon: + H3P: + name: epsilon value: operator: error: - nu_1: + H1H2: + name: nu_1 value: 5.9 operator: error: - nu_2: + H2H3: + name: nu_2 value: 5.3 operator: error: - nu_3: + H3H4: + name: nu_3 value: 4.1 operator: error: diff --git a/espfit/data/target/nucleoside/guanosine/pdbfixer_min.pdb b/espfit/data/target/nucleoside/guanosine/target.pdb similarity index 100% rename from espfit/data/target/nucleoside/guanosine/pdbfixer_min.pdb rename to espfit/data/target/nucleoside/guanosine/target.pdb diff --git a/espfit/data/target/nucleoside/pdbfixer_setup.py b/espfit/data/target/nucleoside/pdbfixer_setup.py index 1269915..bde251b 100644 --- a/espfit/data/target/nucleoside/pdbfixer_setup.py +++ b/espfit/data/target/nucleoside/pdbfixer_setup.py @@ -46,7 +46,8 @@ def prep(inputfile): # minimize: fix hydrogen positions simulation.minimizeEnergy(maxIterations=50) positions = simulation.context.getState(getPositions=True).getPositions() - PDBFile.writeFile(model.topology, positions, open("pdbfixer_min.pdb", 'w')) + #PDBFile.writeFile(model.topology, positions, open("pdbfixer_min.pdb", 'w')) + PDBFile.writeFile(model.topology, positions, open("target.pdb", 'w')) @click.command() diff --git a/espfit/data/target/nucleoside/uridine/experiment.yml b/espfit/data/target/nucleoside/uridine/experiment.yml index 181d12c..47ba261 100644 --- a/espfit/data/target/nucleoside/uridine/experiment.yml +++ b/espfit/data/target/nucleoside/uridine/experiment.yml @@ -9,40 +9,49 @@ experiment_1: name: nmr concentration: 5.0 mM temperature: 303.15 * kelvin + reference: table S11 comment: sequence: u smiles: measurement: resi_1: - beta_1: + 1H5P: + name: beta_1 value: operator: error: - beta_2: + 2H5P: + name: beta_2 value: operator: error: - gamma_1: + 1H5H4: + name: gamma_1 value: operator: error: - gamma_2: + 2H5H4: + name: gamma_2 value: operator: error: - epsilon: + H3P: + name: epsilon value: operator: error: - nu_1: + H1H2: + name: nu_1 value: 4.59 operator: error: - nu_2: + H2H3: + name: nu_2 value: 5.45 operator: error: - nu_3: + H3H4: + name: nu_3 value: 5.77 operator: error: diff --git a/espfit/data/target/nucleoside/uridine/pdbfixer_min.pdb
b/espfit/data/target/nucleoside/uridine/target.pdb similarity index 100% rename from espfit/data/target/nucleoside/uridine/pdbfixer_min.pdb rename to espfit/data/target/nucleoside/uridine/target.pdb diff --git a/espfit/data/target/testsystems/nucleoside/pdbfixer_min.pdb b/espfit/data/target/testsystems/nucleoside/target.pdb similarity index 100% rename from espfit/data/target/testsystems/nucleoside/pdbfixer_min.pdb rename to espfit/data/target/testsystems/nucleoside/target.pdb diff --git a/espfit/tests/test_app_experiment.py b/espfit/tests/test_app_analysis.py similarity index 51% rename from espfit/tests/test_app_experiment.py rename to espfit/tests/test_app_analysis.py index ecea804..9c783d7 100644 --- a/espfit/tests/test_app_experiment.py +++ b/espfit/tests/test_app_analysis.py @@ -1,33 +1,40 @@ import pytest from importlib.resources import files -from espfit.app.experiment import RNASystem +from espfit.app.analysis import RNASystem -def test_load_traj(): +@pytest.fixture +def _get_input_directory_path(): input_directory_path = files('espfit').joinpath('data/sampler') # PosixPath + return input_directory_path + + +def test_load_traj(_get_input_directory_path): + # TODO: Better test + input_directory_path = _get_input_directory_path data = RNASystem(input_directory_path=input_directory_path) data.load_traj(reference_pdb='solvated.pdb', trajectory_netcdf='traj.nc') - # TODO: Better test - return data + assert data.traj is not None -def test_compute_jcouplings_1(): - input_directory_path = files('espfit').joinpath('data/sampler') # PosixPath +def test_compute_jcouplings(_get_input_directory_path): + # TODO: Better test + input_directory_path = _get_input_directory_path data = RNASystem(input_directory_path=input_directory_path) data.load_traj() couplings = data.compute_jcouplings(couplings=['H1H2', 'H2H3', 'H3H4']) - - # TODO: Better test - return couplings + assert couplings is not None -def test_compute_jcouplings_2(): - input_directory_path = files('espfit').joinpath('data/sampler') # PosixPath + +def test_compute_jcouplings_all(_get_input_directory_path): + # TODO: Better test + input_directory_path = _get_input_directory_path data = RNASystem() data.input_directory_path = str(input_directory_path) data.load_traj() - couplings = data.compute_jcouplings(couplings=None) + couplings = data.compute_jcouplings() - # TODO: Better test - return couplings \ No newline at end of file + assert couplings is not None + \ No newline at end of file diff --git a/espfit/tests/test_app_sampler.py b/espfit/tests/test_app_sampler.py index b7ab1d3..09a44b1 100644 --- a/espfit/tests/test_app_sampler.py +++ b/espfit/tests/test_app_sampler.py @@ -15,7 +15,7 @@ def test_create_test_espaloma_system(tmpdir): ------- c : espfit.app.sampler.SetupSampler """ - biopolymer_file = files('espfit').joinpath('data/target/testsystems/nucleoside/pdbfixer_min.pdb') # PosixPath + biopolymer_file = files('espfit').joinpath('data/target/testsystems/nucleoside/target.pdb') # PosixPath c = SetupSampler(small_molecule_forcefield=ESPALOMA_FORCEFIELD, output_directory_path=str(tmpdir)) c.create_system(biopolymer_file=biopolymer_file) # Exports solvated system as pdb file automatically. 
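Editorial aside on the restructured `experiment.yml` files above: each observable is now keyed by its J-coupling name (`1H5P`, `2H5P`, `1H5H4`, `2H5H4`, `H3P`, `H1H2`, `H2H3`, `H3H4`), with the torsion label moved under `name:` and the new `reference:` field pointing at the source table. A minimal sketch of reading the new layout back with PyYAML (illustrative only; the path is one of the files shown in this diff):

```python
# Sketch: read the restructured experiment.yml layout (illustrative only).
import yaml
from importlib.resources import files

yaml_file = str(files('espfit').joinpath('data/target/nucleoside/adenosine/experiment.yml'))
with open(yaml_file, 'r', encoding='utf8') as f:
    d = yaml.safe_load(f)

# Observables are keyed by coupling name, e.g. {'H1H2': {'name': 'nu_1', 'value': 6.0, ...}}
for coupling, entry in d['experiment_1']['measurement']['resi_1'].items():
    print(coupling, entry['name'], entry['value'], entry['error'])
```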
@@ -31,7 +31,7 @@ def test_create_nucleoside_espaloma_system(tmpdir): ------- None """ - biopolymer_file = files('espfit').joinpath('data/target/testsystems/nucleoside/pdbfixer_min.pdb') + biopolymer_file = files('espfit').joinpath('data/target/testsystems/nucleoside/target.pdb') c = SetupSampler(small_molecule_forcefield=ESPALOMA_FORCEFIELD, output_directory_path=str(tmpdir)) c.create_system(biopolymer_file=biopolymer_file) @@ -100,7 +100,7 @@ def test_create_multi_protein_ligand_espaloma_system(tmpdir): c.create_system(biopolymer_file=biopolymer_file, ligand_file=ligand_file) -def test_export_system(test_create_test_espaloma_system, tmpdir): +def test_export_system(test_create_test_espaloma_system): """Test exporting the system to xml files. Parameters @@ -108,18 +108,15 @@ def test_export_system(test_create_test_espaloma_system, tmpdir): test_create_test_espaloma_system : espfit.app.sampler.SetupSampler Test system instance. - tmpdir : tmpdir fixture from pytest - Returns ------- None """ c = test_create_test_espaloma_system - c.output_directory_path = str(tmpdir) c.export_xml() + - -def test_export_system_change_outdir(test_create_test_espaloma_system, tmpdir): +def test_export_system_change_outdir(test_create_test_espaloma_system): """Test exporting the system to xml files. Change the output directory path and check if the new directory is created. @@ -129,18 +126,16 @@ def test_export_system_change_outdir(test_create_test_espaloma_system, tmpdir): test_create_test_espaloma_system : espfit.app.sampler.SetupSampler Test system instance. - tmpdir : tmpdir fixture from pytest - Returns ------- None """ + import os c = test_create_test_espaloma_system - old_outdir = c.output_directory_path - c.export_xml(output_directory_path=str(tmpdir.join('newdir'))) - new_outdir = c.output_directory_path + old_output_directory_path = c.output_directory_path + c.export_xml(output_directory_path=os.path.join(old_output_directory_path, 'newdir')) - assert old_outdir != new_outdir + assert old_output_directory_path != c.output_directory_path def test_minimize(test_create_test_espaloma_system): @@ -156,10 +151,14 @@ def test_minimize(test_create_test_espaloma_system): None """ c = test_create_test_espaloma_system - c.minimize(maxIterations=10) + old_maxIterations = c.maxIterations + c.maxIterations = 9 # change default + c.minimize() + + assert old_maxIterations != c.maxIterations -def test_standard_md(test_create_test_espaloma_system, tmpdir): +def test_standard_md(test_create_test_espaloma_system): """Test standard md simulation. Parameters @@ -172,12 +171,13 @@ def test_standard_md(test_create_test_espaloma_system, tmpdir): None """ c = test_create_test_espaloma_system - c.output_directory_path = str(tmpdir) - c.minimize(maxIterations=10) # Minimize the system before running the simulation to avoid Energy NaN. - c.run(nsteps=10) + c.maxIterations = 10 # update maxIterations to speed up the test + c.nsteps = 10 + c.minimize() # minimize the system before running the simulation to avoid Energy NaN. + c.run() -def test_create_system_from_xml(test_create_test_espaloma_system, tmpdir): +def test_create_system_from_xml(test_create_test_espaloma_system): """Test creating a system from loading existing xml files. Parameters @@ -185,20 +185,20 @@ def test_create_system_from_xml(test_create_test_espaloma_system, tmpdir): test_create_test_espaloma_system : espfit.app.sampler.SetupSampler Test system instance. 
- tmpdir : tmpdir fixture from pytest - Returns ------- None """ + import os + import glob + c = test_create_test_espaloma_system - c.output_directory_path = str(tmpdir) c.export_xml() - c2 = SetupSampler.from_xml(input_directory_path=str(tmpdir)) - c2.export_xml(output_directory_path=str(tmpdir)) + c2 = SetupSampler.from_xml(input_directory_path=c.output_directory_path) + c2.export_xml(output_directory_path=c.output_directory_path) # Check number of exported files. Check state.xml as a representative file. - import glob - n_files = len(glob.glob(str(tmpdir.join('state*.xml')))) + # If the same file exists, then suffix number will be added to the file name. + n_files = len(glob.glob(os.path.join(c.output_directory_path, 'state*.xml'))) assert n_files == 2 diff --git a/espfit/tests/test_app_train.py b/espfit/tests/test_app_train.py index d24fbd7..1f671c6 100644 --- a/espfit/tests/test_app_train.py +++ b/espfit/tests/test_app_train.py @@ -44,7 +44,7 @@ def test_load_dataset(tmpdir): # Prepare input dataset ready for training temporary_directory = tmpdir.mkdir('misc') - ds.drop_and_merge_duplicates(save_merged_dataset=True, dataset_name='misc', output_directory_path=str(temporary_directory)) + ds.drop_duplicates(isomeric=False, keep=True, save_merged_dataset=True, dataset_name='misc', output_directory_path=str(temporary_directory)) ds.reshape_conformation_size(n_confs=50) ds.compute_relative_energy() @@ -74,14 +74,17 @@ def test_train(test_load_dataset, test_create_espaloma_model, tmpdir): # Create temporary checkpoint directory checkpoint_directory = tmpdir.mkdir('checkpoints') # PosixPath + model.output_directory_path=str(checkpoint_directory) - # Train model - model.train(output_directory_path=str(checkpoint_directory)) + # Train model with arbitrary number of epochs and checkpoint frequency + model.epochs = 20 + model.checkpoint_frequency = 5 + model.train() # Test if the model has been trained n_checkpoints = len(glob.glob(str(checkpoint_directory.join('*.pt')))) - expected_n_checkpoints = int(model.config['espaloma']['train']['epochs']/model.config['espaloma']['train']['checkpoint_frequency']) - assert expected_n_checkpoints == n_checkpoints + expected_n_checkpoints = int(model.epochs/model.checkpoint_frequency) + assert expected_n_checkpoints == n_checkpoints == 4 # 20/5 = 4 def test_train_extend(test_load_dataset, test_create_espaloma_model, tmpdir): @@ -107,20 +110,24 @@ def test_train_extend(test_load_dataset, test_create_espaloma_model, tmpdir): # Create temporary checkpoint directory checkpoint_directory = tmpdir.mkdir('checkpoints') # PosixPath + model.output_directory_path=str(checkpoint_directory) - # Train model - model.train(output_directory_path=str(checkpoint_directory)) + # Train model with arbitrary number of epochs and checkpoint frequency + model.epochs = 10 + model.checkpoint_frequency = 2 + model.train() # Test if the model has been trained n_checkpoints = len(glob.glob(str(checkpoint_directory.join('*.pt')))) - expected_n_checkpoints = int(model.config['espaloma']['train']['epochs']/model.config['espaloma']['train']['checkpoint_frequency']) - assert n_checkpoints == expected_n_checkpoints + expected_n_checkpoints = int(model.epochs/model.checkpoint_frequency) + assert n_checkpoints == expected_n_checkpoints == 5 # 10/2 = 5 # Extend training - model.config['espaloma']['train']['epochs'] = 40 - model.train(output_directory_path=str(checkpoint_directory)) + model.epochs = 40 + model.train() n_checkpoints = len(glob.glob(str(checkpoint_directory.join('*.pt')))) - assert 
n_checkpoints == 4 + expected_n_checkpoints = int(model.epochs/model.checkpoint_frequency) + assert n_checkpoints == expected_n_checkpoints == 20 # 40/2 = 20 def test_train_extend_failure(test_load_dataset, test_create_espaloma_model, tmpdir): @@ -146,19 +153,24 @@ def test_train_extend_failure(test_load_dataset, test_create_espaloma_model, tmp # Create temporary checkpoint directory checkpoint_directory = tmpdir.mkdir('checkpoints') # PosixPath + model.output_directory_path=str(checkpoint_directory) # Train model - model.train(output_directory_path=str(checkpoint_directory)) + model.epochs = 20 + model.checkpoint_frequency = 10 + model.train() # Test if the model has been trained n_checkpoints = len(glob.glob(str(checkpoint_directory.join('*.pt')))) - expected_n_checkpoints = int(model.config['espaloma']['train']['epochs']/model.config['espaloma']['train']['checkpoint_frequency']) - assert n_checkpoints == expected_n_checkpoints + expected_n_checkpoints = int(model.epochs/model.checkpoint_frequency) + assert n_checkpoints == expected_n_checkpoints == 2 # 20/10 = 2 # Extend training - # This should fail to extend the training because the given new number of epoch (i.e. 10) is less than the + # The training should not extend because the given new number of epochs (i.e. 10) is less than the + # last epoch of the checkpoint file (i.e. 20). - model.config['espaloma']['train']['epochs'] = 10 - model.train(output_directory_path=str(checkpoint_directory)) - n_checkpoints = len(glob.glob(str(checkpoint_directory.join('*.pt')))) - assert n_checkpoints == expected_n_checkpoints \ No newline at end of file + with pytest.raises(SystemExit) as excinfo: + model.epochs = 10 + model.train() + assert excinfo.value.code == 0 + #n_checkpoints = len(glob.glob(str(checkpoint_directory.join('*.pt')))) + #assert n_checkpoints == expected_n_checkpoints == 2 # 20/10 = 2 \ No newline at end of file diff --git a/espfit/tests/test_app_train_sampler.py b/espfit/tests/test_app_train_sampler.py new file mode 100644 index 0000000..1292f4c --- /dev/null +++ b/espfit/tests/test_app_train_sampler.py @@ -0,0 +1,81 @@ +import pytest +from importlib.resources import files +from espfit.utils.graphs import CustomGraphDataset +from espfit.app.train import EspalomaModel + + +@pytest.fixture +def test_create_espaloma_from_toml(tmpdir): + """Test function to load a TOML configuration file and create an EspalomaModel object. + + Returns + ------- + model : espfit.app.train.EspalomaModel + The created EspalomaModel object. + """ + filename = files('espfit').joinpath('data/config/config.toml') # PosixPath + model = EspalomaModel.from_toml(str(filename)) + model.output_directory_path = str(tmpdir) # Update output directory path + + return model + + +@pytest.fixture +def test_load_dataset(tmpdir): + """Test function to load a dataset and prepare it for training. + + Parameters + ---------- + tmpdir : py._path.local.LocalPath + Temporary directory. + + Notes + ----- + This function is not intended for production use. It is a minimal example for testing purposes. + + Returns + ------- + ds : espfit.utils.graphs.CustomGraphDataset + The loaded dataset.
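A note on the checkpoint arithmetic asserted in the training tests above: checkpoints are written every `checkpoint_frequency` epochs, so a run of `epochs` epochs leaves `epochs // checkpoint_frequency` files, and extending a finished run only adds the difference. A worked example with the numbers used in these tests:

```python
# Worked example of the expected checkpoint counts asserted above.
epochs, checkpoint_frequency = 20, 10
print(epochs // checkpoint_frequency)  # 2 checkpoints after the first run

# Extending a run to 40 epochs with checkpoint_frequency = 2
epochs, checkpoint_frequency = 40, 2
print(epochs // checkpoint_frequency)  # 20 checkpoints in total
```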
+ """ + # load dataset + path = 'data/qcdata/openff-toolkit-0.10.6/dgl2/gen2-torsion-sm' + mydata = files('espfit').joinpath(path) + ds = CustomGraphDataset.load(str(mydata)) + + # Prepare input dataset ready for training + temporary_directory = tmpdir.mkdir('misc') + ds.drop_duplicates(save_merged_dataset=True, dataset_name='misc', output_directory_path=str(temporary_directory)) + ds.reshape_conformation_size(n_confs=50) + ds.compute_relative_energy() + + return ds + + +def test_train_sampler(test_load_dataset, test_create_espaloma_from_toml): + """Test function to train a sampler.""" + import os + import glob + + # Load dataset and model + ds = test_load_dataset + model = test_create_espaloma_from_toml + + # Set espaloma parameters + model.dataset_train = ds + model.epochs = 15 + + # Train + sampler_patience = 10 + # Force sampler to run after reaching sampler patience by setting neff_threshold to 1.0 + model.train_sampler(sampler_patience=sampler_patience, neff_threshold=1.0, sampler_weight=1) + + # Check outputs + n_ckpt = len(glob.glob(os.path.join(model.output_directory_path, 'ckpt*pt'))) + assert n_ckpt == int(model.epochs / model.checkpoint_frequency) + + n_adenosine_pred_yaml = len(glob.glob(os.path.join(model.output_directory_path, 'adenosine/*/pred.yaml'))) + assert n_adenosine_pred_yaml == int(model.epochs - sampler_patience) + + n_cytidine_pred_yaml = len(glob.glob(os.path.join(model.output_directory_path, 'cytidine/*/pred.yaml'))) + assert n_cytidine_pred_yaml == int(model.epochs - sampler_patience) diff --git a/espfit/tests/test_utils_graphs.py b/espfit/tests/test_utils_graphs.py index 17f097b..0446290 100644 --- a/espfit/tests/test_utils_graphs.py +++ b/espfit/tests/test_utils_graphs.py @@ -68,7 +68,9 @@ def test_load_dataset(mydata_gen2_torsion_sm): """ ds = mydata_gen2_torsion_sm nconfs = [g.nodes['g'].data['u_ref'].shape[1] for g in ds] - assert nconfs == [24, 24, 24, 13, 24, 24, 24, 24], 'Number of molecular conformers does not match' + # Sort the list of nconfs. For some reason, the order of the list is not consistent when running the test locally and on GitHub CI. + #assert nconfs == [24, 24, 24, 13, 24, 24, 24, 24], 'Number of molecular conformers does not match' + assert nconfs.sort() == [24, 24, 24, 13, 24, 24, 24, 24].sort(), 'Number of molecular conformers does not match' def test_load_dataset_multiple(mydata_gen2_torsion_sm, mydata_protein_torsion_sm, mydata_rna_diverse_sm): @@ -102,7 +104,7 @@ def test_load_dataset_multiple(mydata_gen2_torsion_sm, mydata_protein_torsion_sm assert sum(nconfs) == 5636, 'Total number of conformations does not match' -def test_drop_and_merge_duplicates(mydata_gen2_torsion_sm, tmpdir): +def test_drop_duplicates(mydata_gen2_torsion_sm, tmpdir): """Test function to drop and merge duplicate molecules. Parameters @@ -117,9 +119,11 @@ def test_drop_and_merge_duplicates(mydata_gen2_torsion_sm, tmpdir): """ ds = mydata_gen2_torsion_sm temporary_directory = tmpdir.mkdir('misc') - ds.drop_and_merge_duplicates(save_merged_dataset=True, dataset_name='misc', output_directory_path=str(temporary_directory)) + ds.drop_duplicates(isomeric=False, keep=True, save_merged_dataset=True, dataset_name='misc', output_directory_path=str(temporary_directory)) nconfs = [ g.nodes['g'].data['u_ref'].shape[1] for g in ds ] - assert nconfs == [24, 13, 24, 24, 24, 72], 'Number of molecular conformers does not match' + # Sort the list of nconfs. For some reason, the order of the list is not consistent when running the test locally and on GitHub CI. 
+ #assert nconfs == [24, 13, 24, 24, 24, 72], 'Number of molecular conformers does not match' + assert sorted(nconfs) == sorted([24, 13, 24, 24, 24, 72]), 'Number of molecular conformers does not match' def test_subtract_nonbonded_interactions(mydata_gen2_torsion_sm): @@ -165,7 +169,9 @@ def test_filter_high_energy_conformers(mydata_gen2_torsion_sm): # set relative_energy_thershold very small to ensure some conformers will be filtered ds.filter_high_energy_conformers(relative_energy_threshold=0.01, node_feature='u_ref') nconfs = [ g.nodes['g'].data['u_ref'].shape[1] for g in ds ] - assert nconfs == [14, 19, 19, 5, 14, 19, 24, 24], 'Number of molecular conformers does not match' + # Compare sorted copies of nconfs. For some reason, the ordering is not consistent when running the test locally and on GitHub CI. + #assert nconfs == [14, 19, 19, 5, 14, 19, 24, 24], 'Number of molecular conformers does not match' + assert sorted(nconfs) == sorted([14, 19, 19, 5, 14, 19, 24, 24]), 'Number of molecular conformers does not match' def test_filter_minimum_conformers(mydata_gen2_torsion_sm): diff --git a/espfit/utils/espaloma/module.py b/espfit/utils/espaloma/module.py index e3d27b4..642d91e 100644 --- a/espfit/utils/espaloma/module.py +++ b/espfit/utils/espaloma/module.py @@ -56,6 +56,8 @@ class GetLoss(torch.nn.Module): compute_improper_loss(g): Compute improper l2 regularization + forward(g): + Compute joint loss """ def __init__(self, weights={'energy': 1.0, 'force': 1.0, 'charge': 1.0, 'torsion': 1.0, 'improper': 1.0}): """Define loss function. @@ -166,7 +168,11 @@ def forward(self, g): Returns ------- - loss : torch.Tensor + loss : torch.Tensor + Total weighted loss + + loss_dict : dict + Dictionary of individual weighted losses """ loss_energy = self.compute_energy_loss(g) * self.weights['energy'] loss_force = self.compute_force_loss(g) * self.weights['force'] @@ -180,5 +186,14 @@ def forward(self, g): _logger.debug(f"energy: {loss_energy:.5f}, force: {loss_force:.5f}, charge: {loss_charge:.5f}, torsion: {loss_torsion:.5f}, improper: {loss_improper:.5f}") loss = loss_energy + loss_force + loss_charge + loss_torsion + loss_improper + + loss_dict = { + 'loss': None, + 'energy': loss_energy.item(), + 'force': loss_force.item(), + 'charge': loss_charge.item(), + 'torsion': loss_torsion.item(), + 'improper': loss_improper.item(), + } - return loss \ No newline at end of file + return loss, loss_dict \ No newline at end of file diff --git a/espfit/utils/graphs.py b/espfit/utils/graphs.py index 914efb3..a9f7c20 100644 --- a/espfit/utils/graphs.py +++ b/espfit/utils/graphs.py @@ -18,7 +18,7 @@ class CustomGraphDataset(GraphDataset): Methods ------- - drop_and_merge_duplicates(save_merged_dataset=True, dataset_name='misc', output_directory_path=None): + drop_duplicates(isomeric=False, keep=True, save_merged_dataset=True, dataset_name='misc', output_directory_path=None): Drop and merge duplicate nonisomeric smiles across different data sources. subtract_nonbonded_interactions(subtract_vdw=False, subtract_ele=True): Subtract nonbonded interactions from QC reference. filter_high_energy_conformers(relative_energy_threshold=0.1, node_feature='u_ref'): Filter high energy conformers and ensure minimum number of conformers. - filter_minimum_conformers(n_conformer_threshold=3): + filter_minimum_conformers(n_conformer_threshold=5): Filter molecules with conformers below given threshold.
- compute_baseline_energy_force(forcefield_list=['openff-2.0.0']): + compute_baseline_energy_force(forcefield_list=['openff-2.1.0']): Compute energies and forces using other force fields. - reshape_conformation_size(n_confs=50): + reshape_conformation_size(n_confs=50, include_min_energy_conf=False): Reshape conformation size. compute_relative_energy(): @@ -53,7 +53,7 @@ class CustomGraphDataset(GraphDataset): >>> ds = GraphDataset.load(path) >>> # Drop and merge duplicate molecules. Save merged dataset as a new dataset. >>> # If `output_directory_path` is None, then the current working directory is used. - >>> ds.drop_and_merge_duplicates(save_merged_dataset=True, dataset_name='misc', output_directory_path=None) + >>> ds.drop_duplicates(isomeric=False, keep=True, save_merged_dataset=True, dataset_name='misc', output_directory_path=None) >>> # Subtract nonbonded energies and forces from QC reference (e.g. subtract all valence and ele interactions) >>> # This will update u_ref and u_ref_relative in-place. copy of raw u_ref (QM reference) will be copied to u_qm. >>> ds.subtract_nonbonded_interactions(subtract_vdw=False, subtract_ele=True) @@ -62,9 +62,9 @@ class CustomGraphDataset(GraphDataset): >>> # Filter high energy conformers (u_ref: QM reference after nonbonded interactions are subtracted) >>> ds.filter_high_energy_conformers(relative_energy_threshold=0.1, node_feature='u_ref') >>> # Filter conformers below certain number - >>> ds.filter_minimum_conformers(n_conformer_threshold=3) + >>> ds.filter_minimum_conformers(n_conformer_threshold=5) >>> # Compute energies and forces using other force fields - >>> ds.compute_baseline_energy_force(forcefield_list=['openff-2.0.0']) + >>> ds.compute_baseline_energy_force(forcefield_list=['openff-2.1.0']) >>> # Regenerate improper torsions in-place >>> from espaloma.graphs.utils.regenerate_impropers import regenerate_impropers >>> ds.apply(regenerate_impropers, in_place=True) @@ -75,7 +75,7 @@ class CustomGraphDataset(GraphDataset): """ - def __init__(self, graphs=[], reference_forcefield='openff-2.0.0', random_seed=2666): + def __init__(self, graphs=[], reference_forcefield='openff-2.1.0', random_seed=2666): """Construct custom GraphDataset instance to prepare QC dataset for espaloma training. Parameters @@ -83,7 +83,7 @@ def __init__(self, graphs=[], reference_forcefield='openff-2.0.0', random_seed=2 graphs : list of espaloma.graphs.graph.Graph, default=[] DGL graphs loaded from `espaloma.data.dataset.GraphDataset.load`. - reference_forcefield : str, default=openff-2.0.0 + reference_forcefield : str, default=openff-2.1.0 Reference force field used to compute force field parameters if not present in espaloma. The default behavior is to compute the LJ parameters with `reference_forcefield`. @@ -96,15 +96,49 @@ def __init__(self, graphs=[], reference_forcefield='openff-2.0.0', random_seed=2 self.random_seed = random_seed - def drop_and_merge_duplicates(self, save_merged_dataset=True, dataset_name='misc', output_directory_path=None): - """Drop and merge duplicate nonisomeric smiles across different data sources. + def drop_duplicates(self, isomeric=False, keep=True, save_merged_dataset=True, dataset_name='misc', output_directory_path=None): + """Drop duplicate (non)isomeric smiles within the dataset. Modifies list of esp.Graph's in place. Parameters ---------- + isomeric : boolean, default=False + If True, then duplicated molecules are merged based on isomeric mapped smiles. 
+ If False, then duplicated molecules are merged based on nonisomeric mapped smiles. + + Unique molecules are identified by nonisomeric non-mapped smiles. + Duplicated molecules (nonisomeric smiles) are merged into a single molecule based on + the isomeric mapped smiles (isomeric=True) or nonisomeric mapped smiles (isomeric=False). + + Note that there is no guarantee that the atom order (mapping) is consistent across different + molecules with the same (non)isomeric smiles. + + For example, molecules with same nonisomeric smiles could have different mapped smiles: + + [H:21][c:1]1[c:2]([c:4]([c:7]([c:5]([c:3]1[H:23])[H:25])[N:14]=[N:15][C:8]2=[C:10]3[N:16]\ + ([C:9](=[C:6]([C:11](=[O:19])[N:18]3[N:17]([C:12]2=[O:20])[H:31])[H:26])[C:13]([H:27])\ + ([H:28])[H:29])[H:30])[H:24])[H:22] + + [H:22][c:11]1[c:12]([c:14]([c:16]([c:15]([c:13]1[H:24])[H:26])[N:4]=[N:3][C:8]2=[C:9]3[N:17]\ + ([C:10](=[C:7]([C:6](=[O:2])[N:19]3[N:18]([C:5]2=[O:1])[H:28])[H:21])[C:20]([H:29])\ + ([H:30])[H:31])[H:27])[H:25])[H:23] + + This will give different atom ordering, leading to, for example, different + bond atom index (g.nodes['n2'].data['idxs']). + + To alleviate this issue, nonisomeric smiles without atom mapping is used to identify + unique molecules and remove any duplicated molecules. Then, duplicated molecules are + merged into a single molecule based on the isomeric mapped smiles (isomeric=True) or + nonisomeric mapped smiles (isomeric=False). + + keep : boolean, default=True + If True, then duplicate entries dropped from the dataset will be added back to the unique entries + after the dropped duplicated entries are merged into a single molecule. If False, then duplicated + entries dropped will be removed. + save_merged_datest : boolean, default=True - If True, then merged datasets will be saved as a new dataset. + If True, then duplicated molecules are merged into a single molecule and saved as a new dataset. dataset_name : str, default=misc Name of the merged dataset. @@ -112,7 +146,7 @@ def drop_and_merge_duplicates(self, save_merged_dataset=True, dataset_name='misc output_directory_path : str, default=None Output directory path to save the merged dataset. If None, then the current working directory is used. 
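The four smiles flavors discussed in this docstring can be generated directly from an openff `Molecule`; a small sketch using the same `to_smiles` flags as the code in this diff (the alanine SMILES is an arbitrary example, not from the dataset):

```python
# Sketch: the smiles flavors used by drop_duplicates (arbitrary example molecule).
from openff.toolkit.topology import Molecule

mol = Molecule.from_smiles('C[C@H](N)C(=O)O')  # L-alanine, arbitrary example

# Nonisomeric, non-mapped: identifies unique molecules
print(mol.to_smiles(isomeric=False, explicit_hydrogens=True, mapped=False))
# Mapped variants: used to merge duplicated entries into a single graph
print(mol.to_smiles(isomeric=False, explicit_hydrogens=True, mapped=True))
print(mol.to_smiles(isomeric=True, explicit_hydrogens=True, mapped=True))
```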
- + Returns ------- None @@ -122,50 +156,79 @@ def drop_and_merge_duplicates(self, save_merged_dataset=True, dataset_name='misc if output_directory_path == None: output_directory_path = os.getcwd() - - _logger.info(f'Drop and merge duplicate smiles') - smiles = [ g.mol.to_smiles(isomeric=False, explicit_hydrogens=True, mapped=False) for g in self.graphs ] - _logger.info(f'Found {len(smiles)} molecules') - - # Unique entries - df = pd.DataFrame.from_dict({'smiles': smiles}) - unique_index = df.drop_duplicates(keep=False).index.to_list() + + _logger.info(f'Remove duplicated nonisomeric smiles from dataset') + if isomeric == True: + _logger.info(f'Merge duplicated nonisomeric smiles into unique isomeric mapped smiles') + else: + _logger.info(f'Merge duplicated nonisomeric smiles into unique nonisomeric mapped smiles') + + # Get smiles + nonisomeric_smiles = [ g.mol.to_smiles(isomeric=False, explicit_hydrogens=True, mapped=False) for g in self.graphs ] + nonisomeric_mapped_smiles = [ g.mol.to_smiles(isomeric=False, explicit_hydrogens=True, mapped=True) for g in self.graphs ] + isomeric_mapped_smiles = [ g.mol.to_smiles(isomeric=True, explicit_hydrogens=True, mapped=True) for g in self.graphs ] + _logger.info(f'Found {len(self.graphs)} graph entries') + + # Create pandas dataframe + df = pd.DataFrame.from_dict({'nonisomeric_smiles': nonisomeric_smiles, 'nonisomeric_mapped_smiles': nonisomeric_mapped_smiles, 'isomeric_mapped_smiles': isomeric_mapped_smiles}) + _logger.info(f'Unique nonisomeric smiles: {len(df.nonisomeric_smiles.unique())}') + _logger.info(f'Unique nonisomeric mapped smiles: {len(df.nonisomeric_mapped_smiles.unique())}') + _logger.info(f'Unique isomeric mapped smiles: {len(df.isomeric_mapped_smiles.unique())}') + + # Get unique and duplicated entries using nonisomeric smiles (non-mapped) + unique_index = df.nonisomeric_smiles.drop_duplicates(keep=False).index.to_list() unique_graphs = [self.graphs[_idx] for _idx in unique_index] - _logger.info(f'Found {len(unique_index)} unique molecules') + _logger.info(f'Drop all duplicated nonisomeric smiles from the dataset (unique nonisomeric smiles: {len(unique_index)})') - # Duplicated entries - index = df.duplicated(keep=False) # Mark all duplicate entries True duplicated_index = df[index].index.to_list() - _logger.info(f'Found {len(duplicated_index)} duplicated molecules') + index = df.nonisomeric_smiles.duplicated(keep=False) # Mark all duplicate entries True duplicated_index = df[index].index.to_list() + assert len(unique_index) + len(duplicated_index) == len(self.graphs), \ f'Unique + duplicated nonisomeric smiles: {len(unique_index)} + {len(duplicated_index)} != total dataset ({len(self.graphs)})' - # Get unique smiles and assign new molecule name `e.g.
mol0001` - duplicated_df = df.iloc[duplicated_index] - duplicated_smiles = duplicated_df.smiles.unique().tolist() - molnames = [ f'mol{i:04d}' for i in range(len(duplicated_smiles)) ] - _logger.info(f'Found {len(molnames)} unique molecules within duplicate entries') - - # Merge duplicate entries into a new single graph - duplicated_graphs = [] - molnames_dict = {} - for molname, duplicated_smile in zip(molnames, duplicated_smiles): - # Map new molecule name with its unique smiles and dataframe indices - index = duplicated_df[duplicated_df['smiles'] == duplicated_smile].index.tolist() - molnames_dict[molname] = {'smiles': duplicated_smiles, 'index': index} - # Merge graphs - g = self._merge_graphs([self.graphs[_idx] for _idx in index]) - duplicated_graphs.append(g) - # Save graphs (optional) - if save_merged_dataset == True: - # Notes: Create a temporary directory, `_output_directory_path`, to support pytest in test_utils_graphs.py. - # Temporary directory needs to be created beforehand for `test_drop_and_merge_duplicates`. - _output_directory_path = os.path.join(output_directory_path, dataset_name) - os.makedirs(_output_directory_path, exist_ok=True) - output_directory_path = os.path.join(_output_directory_path, molname) - g.save(output_directory_path) - - # Update in place - new_graphs = unique_graphs + duplicated_graphs - _logger.info(f'Graph dataset reconstructed: {len(new_graphs)} unique molecules') + if keep == True: + if isomeric == True: + _logger.info(f'Merge dropped duplicated nonisomeric smiles into unique isomeric mapped smiles') + else: + _logger.info(f'Merge dropped duplicated nonisomeric smiles into unique nonisomeric mapped smiles') + + # Get unique (non)isomeric mapped smiles from duplicated nonisomeric smiles (non-mapped) and assign new molecule name `e.g. mol0001` + # Use copy() to prevent SettingWithCopyWarning when assigning new values to a new column + duplicated_df = df.iloc[duplicated_index].copy() + if isomeric == True: + duplicated_smiles = duplicated_df.isomeric_mapped_smiles.unique().tolist() + duplicated_df['smiles'] = duplicated_df.isomeric_mapped_smiles + _logger.info(f'Found {len(duplicated_smiles)} unique isomeric mapped smiles within duplicated {len(duplicated_index)} nonisomeric smiles') + else: + duplicated_smiles = duplicated_df.nonisomeric_mapped_smiles.unique().tolist() + duplicated_df['smiles'] = duplicated_df.nonisomeric_mapped_smiles + _logger.info(f'Found {len(duplicated_smiles)} unique nonisomeric mapped smiles within duplicated {len(duplicated_index)} nonisomeric smiles') + molnames = [ f'mol{i:04d}' for i in range(len(duplicated_smiles)) ] + + # Merge duplicate entries into a new single graph + duplicated_graphs = [] + #molnames_dict = {} # This is never used but keep this to export the dictionary? + for molname, duplicated_smile in zip(molnames, duplicated_smiles): + # Map new molecule name with its unique smiles and dataframe indices + index = duplicated_df[duplicated_df['smiles'] == duplicated_smile].index.tolist() + #molnames_dict[molname] = {'smiles': duplicated_smiles, 'index': index} + # Merge graphs + g = self._merge_graphs(subset=[self.graphs[_idx] for _idx in index], isomeric_flag=isomeric) + duplicated_graphs.append(g) + # Save graphs (optional) + if save_merged_dataset == True: + # Notes: Create a temporary directory, `_output_directory_path`, to support pytest in test_utils_graphs.py. + # Temporary directory needs to be created beforehand for `test_drop_and_merge_duplicates`. 
+ _output_directory_path = os.path.join(output_directory_path, dataset_name) + os.makedirs(_output_directory_path, exist_ok=True) + new_output_directory_path = os.path.join(_output_directory_path, molname) + g.save(new_output_directory_path) + + new_graphs = unique_graphs + duplicated_graphs + _logger.info(f'Add back {len(duplicated_graphs)} merged duplicated (non)isomeric mapped smiles into the dataset') + _logger.info(f'Dataset reconstructed: {len(new_graphs)} unique molecules') + else: + new_graphs = unique_graphs + _logger.info(f'Dataset reconstructed: {len(new_graphs)} unique molecules') self.graphs = new_graphs del unique_graphs, duplicated_graphs, df, duplicated_df @@ -204,10 +267,11 @@ def subtract_nonbonded_interactions(self, subtract_vdw=False, subtract_ele=True) ------- None """ + _logger.info(f'Subtract nonbonded interactions from QC reference') new_graphs = [] from espaloma.data.md import subtract_nonbonded_force - for i, g in enumerate(self.graphs): + for g in self.graphs: # `espaloma.data.md.subtract_nonbonded_force` will update g.nodes['g'].data['u_ref'] and g.nodes['g'].data['u_ref_prime'] in place. # Clone QM reference into g.nodes['g'].data['u_qm'] and g.nodes['g'].data['u_qm_prime'], if not exist if 'u_qm' not in g.nodes['g'].data.keys(): @@ -223,8 +287,8 @@ def subtract_nonbonded_interactions(self, subtract_vdw=False, subtract_ele=True) # subtract_nonbonded_force() will return the coulomb interactions using the predefined partial charges. # # Reference: - # [1] https://github.com/choderalab/espaloma/blob/main/espaloma/data/md.py#L503C19-L503C19 - # [2] https://github.com/openmm/openmmforcefields/blob/637d551a4408cc6145529cd9dc30e267f4178367/openmmforcefields/generators/template_generators.py#L1432 + # [1] https://github.com/choderalab/espaloma/blob/main/espaloma/data/md.py#L503 + # [2] https://github.com/openmm/openmmforcefields/blob/637d551a4408cc6145529cd9dc30e267f4178367/openmmforcefields/generators/template_generators.py#L607 g = subtract_nonbonded_force(g, forcefield=self.reference_forcefield, subtract_charges=True) elif subtract_vdw == False and subtract_ele == False: g = subtract_nonbonded_force(g, forcefield=self.reference_forcefield, subtract_charges=False) @@ -255,6 +319,7 @@ def filter_high_energy_conformers(self, relative_energy_threshold=0.1, node_feat ------- None """ + _logger.info(f'Filter high energy conformers with relative energy threshold {relative_energy_threshold}') if node_feature == None: raise Exception(f'Please specify the node feature name under node type `g`') @@ -278,20 +343,21 @@ def filter_high_energy_conformers(self, relative_energy_threshold=0.1, node_feat del new_graphs - def filter_minimum_conformers(self, n_conformer_threshold=3): + def filter_minimum_conformers(self, n_conformer_threshold=5): """Filter molecules with conformers below given threshold. Modifies list of esp.Graph's in place. Parameters ---------- - n_conformer_threshold : int, default=3 + n_conformer_threshold : int, default=5 The minimium number of conformers per entry. 
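As an aside on the duplicate bookkeeping in `drop_duplicates` above: pandas' `keep=False` marks every member of a duplicate group, which is what makes the unique/duplicated split exhaustive and lets the code assert that the two partitions sum to the dataset size. A toy demonstration:

```python
# Toy demonstration of the pandas keep=False convention used above.
import pandas as pd

df = pd.DataFrame({'nonisomeric_smiles': ['A', 'B', 'A', 'C']})
unique_index = df.nonisomeric_smiles.drop_duplicates(keep=False).index.to_list()
duplicated_index = df[df.nonisomeric_smiles.duplicated(keep=False)].index.to_list()
print(unique_index)      # [1, 3]
print(duplicated_index)  # [0, 2]
assert len(unique_index) + len(duplicated_index) == len(df)
```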
Returns ------- None """ + _logger.info(f'Filter molecules with conformers below {n_conformer_threshold} conformers') new_graphs = [] for i, g in enumerate(self.graphs): n_confs = g.nodes['n1'].data['xyz'].shape[1] @@ -303,15 +369,15 @@ def filter_minimum_conformers(self, n_conformer_threshold=3): del new_graphs - def compute_baseline_energy_force(self, forcefield_list=['openff-2.0.0']): + def compute_baseline_energy_force(self, forcefield_list=['openff-2.1.0']): """Compute energies and forces using other force fields. - New node features are added to g.nodes['g']. For example, g.nodes['g'].data['u_openff-2.0.0'] and - g.nodes['n1'].data['u_openff-2.0.0_prime'] will be created for energies and forces, respectively. + New node features are added to g.nodes['g']. For example, g.nodes['g'].data['u_openff-2.1.0'] and + g.nodes['n1'].data['u_openff-2.1.0_prime'] will be created for energies and forces, respectively. Parameters ---------- - forcefield_list : list, default=['openff-2.0.0'] + forcefield_list : list, default=['openff-2.1.0'] Currently supports the following force fields: 'gaff-1.81', 'gaff-2.11', 'openff-1.2.0', 'openff-2.0.0', 'openff-2.1.0', 'amber14-all.xml', 'amber/protein.ff14SBonlysc.xml' @@ -338,82 +404,89 @@ from openmm.unit import Quantity from openmmforcefields.generators import SystemGenerator + _logger.info(f'Compute energies and forces using other force fields') + # Simulation Specs (not important, just place holders) TEMPERATURE = 350 * unit.kelvin STEP_SIZE = 1.0 * unit.femtosecond COLLISION_RATE = 1.0 / unit.picosecond if not all(_ in self.available_forcefields for _ in forcefield_list): - raise Exception(f'{forcefield} force field not supported. Supported force fields are {SUPPORTED_FORCEFIELD_LIST}.') + raise Exception(f'{forcefield_list} force field not supported.
Supported force fields are {self.available_forcefields}.') new_graphs = [] - for i, g in enumerate(self.graphs): - for forcefield in forcefield_list: - if forcefield.startswith('gaff') or forcefield.startswith('openff'): - generator = SystemGenerator( - small_molecule_forcefield=forcefield, - molecules=[g.mol], - forcefield_kwargs={"constraints": None, "removeCMMotion": False}, - ) - name = forcefield - elif forcefield.startswith('amber') or forcefield.startswith('protein'): - generator = SystemGenerator( - forcefields=[forcefield], - molecules=[g.mol], - forcefield_kwargs={"constraints": None, "removeCMMotion": False}, - ) - if forcefield == 'amber14-all.xml': - name = 'amber14sb' - elif forcefield == 'amber/protein.ff14SBonlysc.xml': - name = 'amber14sb_onlysc' - else: - import warnings - warnings.warn(f'{forcefield} not supported for molecule {g.mol.to_smiles()}') - - suffix = name - - # Parameterize topology - topology = g.mol.to_topology().to_openmm() - # Create openmm system - system = generator.create_system(topology) - # Use langevin integrator, although it's not super useful here - integrator = openmm.LangevinIntegrator(TEMPERATURE, COLLISION_RATE, STEP_SIZE) - # Create simulation - simulation = Simulation(topology=topology, system=system, integrator=integrator) - # Get energy - us = [] - us_prime = [] - xs = ( - Quantity( - g.nodes["n1"].data["xyz"].detach().numpy(), - espunits.DISTANCE_UNIT, - ) - .value_in_unit(unit.nanometer) - .transpose((1, 0, 2)) - ) - for x in xs: - simulation.context.setPositions(x) - us.append( - simulation.context.getState(getEnergy=True) - .getPotentialEnergy() - .value_in_unit(espunits.ENERGY_UNIT) + for g in self.graphs: + try: + for forcefield in forcefield_list: + if forcefield.startswith('gaff') or forcefield.startswith('openff'): + generator = SystemGenerator( + small_molecule_forcefield=forcefield, + molecules=[g.mol], + forcefield_kwargs={"constraints": None, "removeCMMotion": False}, + ) + name = forcefield + elif forcefield.startswith('amber') or forcefield.startswith('protein'): + generator = SystemGenerator( + forcefields=[forcefield], + molecules=[g.mol], + forcefield_kwargs={"constraints": None, "removeCMMotion": False}, + ) + if forcefield == 'amber14-all.xml': + name = 'amber14sb' + elif forcefield == 'amber/protein.ff14SBonlysc.xml': + name = 'amber14sb_onlysc' + else: + import warnings + warnings.warn(f'{forcefield} not supported for molecule {g.mol.to_smiles()}') + + suffix = name + + # Parameterize topology + topology = g.mol.to_topology().to_openmm() + # Create openmm system + system = generator.create_system(topology) + # Use langevin integrator, although it's not super useful here + integrator = openmm.LangevinIntegrator(TEMPERATURE, COLLISION_RATE, STEP_SIZE) + # Create simulation + simulation = Simulation(topology=topology, system=system, integrator=integrator) + # Get energy + us = [] + us_prime = [] + xs = ( + Quantity( + g.nodes["n1"].data["xyz"].detach().numpy(), + espunits.DISTANCE_UNIT, + ) + .value_in_unit(unit.nanometer) + .transpose((1, 0, 2)) ) - us_prime.append( - simulation.context.getState(getForces=True) - .getForces(asNumpy=True) - .value_in_unit(espunits.FORCE_UNIT) * -1 + for x in xs: + simulation.context.setPositions(x) + us.append( + simulation.context.getState(getEnergy=True) + .getPotentialEnergy() + .value_in_unit(espunits.ENERGY_UNIT) + ) + us_prime.append( + simulation.context.getState(getForces=True) + .getForces(asNumpy=True) + .value_in_unit(espunits.FORCE_UNIT) * -1 + ) + + us = torch.tensor(us, 
dtype=torch.float64)[None, :] + us_prime = torch.tensor( + np.stack(us_prime, axis=1), + dtype=torch.get_default_dtype(), ) - us = torch.tensor(us, dtype=torch.float64)[None, :] - us_prime = torch.tensor( - np.stack(us_prime, axis=1), - dtype=torch.get_default_dtype(), - ) - - g.nodes['g'].data['u_%s' % suffix] = us - g.nodes['n1'].data['u_%s_prime' % suffix] = us_prime + g.nodes['g'].data['u_%s' % suffix] = us + g.nodes['n1'].data['u_%s_prime' % suffix] = us_prime - new_graphs.append(g) + new_graphs.append(g) + except Exception as e: + mol_err = g.mol.to_smiles(isomeric=True, explicit_hydrogens=True, mapped=True) + _logger.warning(f'Error occurred during processing {mol_err}: {e}') + continue # Update in place self.graphs = new_graphs @@ -429,6 +502,7 @@ def compute_relative_energy(self): ------- None """ + _logger.info(f'Compute relative energy') new_graphs = [] for g in self.graphs: g.nodes['g'].data['u_ref_relative'] = g.nodes['g'].data['u_ref'].detach().clone() @@ -440,7 +514,7 @@ def compute_relative_energy(self): del new_graphs - def reshape_conformation_size(self, n_confs=50): + def reshape_conformation_size(self, n_confs=50, include_min_energy_conf=False): """Reshape conformation size. This is a work around to handle different graph size (shape). DGL requires at least one dimension with same size. @@ -455,11 +529,14 @@ n_confs : int, default=50 Number of conformations per graph (molecule). + include_min_energy_conf : boolean, default=False + If True, then minimum energy conformer will be included for all split graphs. + Returns ------- None """ - _logger.info(f'Reshape graphs size') + _logger.info(f'Reshape graph size') import random import copy @@ -469,17 +546,18 @@ self._remove_node_features() new_graphs = [] + n_confs_cache = n_confs for i, g in enumerate(self.graphs): n = g.nodes['n1'].data['xyz'].shape[1] if n == n_confs: - _logger.info(f"Molecule #{i} ({n} conformations)") + _logger.info(f"Mol #{i} ({n} conformers)") new_graphs.append(g) elif n < n_confs: random.seed(self.random_seed) index_random = random.choices(range(0, n), k=n_confs-n) - _logger.info(f"Molecule #{i} ({n} conformations). Randomly select {len(index_random)} conformations") + _logger.info(f"Randomly select {len(index_random)} conformers from Mol #{i} ({n} conformers)") _g = copy.deepcopy(g) _g.nodes["g"].data["u_ref"] = torch.cat((_g.nodes['g'].data['u_ref'], _g.nodes['g'].data['u_ref'][:, index_random]), dim=-1) @@ -488,9 +566,17 @@ new_graphs.append(_g) else: - _logger.info(f"Molecule #{i} ({n} conformations). Shuffle indices and split data into chunks") random.seed(self.random_seed) idx_range = random.sample(range(n), k=n) + + # Get index for minimum energy conformer + if include_min_energy_conf: + index_min = [g.nodes['g'].data['u_ref'].argmin().item()] + n_confs = n_confs_cache - 1 + _logger.info(f"Shuffle Mol #{i} ({n} conformers) and split into {n_confs} conformers and add minimum energy conformer (index #{index_min[0]})") + else: + _logger.info(f"Shuffle Mol #{i} ({n} conformers) and split into {n_confs} conformers") + for j in range(n // n_confs + 1): _g = copy.deepcopy(g) @@ -498,7 +584,12 @@ index = range(j*n_confs, n) random.seed(self.random_seed) index_random = random.choices(range(0, n), k=(j+1)*n_confs-n) - _logger.debug(f"Iteration {j}: Randomly select {len(index_random)} conformers") + + if include_min_energy_conf: + index_random = index_random + index_min + _logger.debug(f"Iteration {j}: Randomly select {len(index_random)} conformers and add minimum energy conformer") + else: + _logger.debug(f"Iteration {j}: Randomly select {len(index_random)} conformers") _g.nodes["g"].data["u_ref"] = torch.cat((_g.nodes['g'].data['u_ref'][:, index], _g.nodes['g'].data['u_ref'][:, index_random]), dim=-1) _g.nodes["n1"].data["xyz"] = torch.cat((_g.nodes['n1'].data['xyz'][:, index, :], _g.nodes['n1'].data['xyz'][:, index_random, :]), dim=1) @@ -507,7 +598,12 @@ idx1 = j*n_confs idx2 = (j+1)*n_confs index = idx_range[idx1:idx2] - _logger.debug(f"Iteration {j}: Extract indice from {idx1} to {idx2}") + + if include_min_energy_conf: + index = index + index_min + _logger.debug(f"Iteration {j}: Extract indices from {idx1} to {idx2} and add minimum energy conformer") + else: + _logger.debug(f"Iteration {j}: Extract indices from {idx1} to {idx2}") _g.nodes["g"].data["u_ref"] = _g.nodes['g'].data['u_ref'][:, index] _g.nodes["n1"].data["xyz"] = _g.nodes['n1'].data['xyz'][:, index, :] @@ -546,12 +642,12 @@ def _remove_node_features(self): @staticmethod - def _merge_graphs(ds): + def _merge_graphs(subset, isomeric_flag): """Merge multiple Graph instances into a single Graph. Parameters ---------- - ds : list of espaloma.graphs.graph.Graph + subset : list of espaloma.graphs.graph.Graph The list of Graph instances to be merged. All Graphs in the list must be equivalent.
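`_merge_graphs` stacks per-conformer data along the conformation axis; a toy sketch of the tensor bookkeeping, assuming espaloma's shape conventions of `u_ref` as `(1, n_confs)` and `xyz` as `(n_atoms, n_confs, 3)` (illustrative only):

```python
# Sketch: conformer concatenation as done when merging duplicate graphs.
import torch

# Two copies of the same 5-atom molecule with 3 and 2 conformers
u_a, u_b = torch.randn(1, 3), torch.randn(1, 2)
xyz_a, xyz_b = torch.randn(5, 3, 3), torch.randn(5, 2, 3)

u_merged = torch.cat((u_a, u_b), dim=-1)       # (1, 5): energies along the last dim
xyz_merged = torch.cat((xyz_a, xyz_b), dim=1)  # (5, 5, 3): coordinates along dim=1
print(u_merged.shape, xyz_merged.shape)
```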
Returns @@ -564,30 +660,76 @@ def _merge_graphs(ds): import copy import torch + if isomeric_flag == True: + mapped_smiles = subset[0].mol.to_smiles(isomeric=True, explicit_hydrogens=True, mapped=True) + else: + mapped_smiles = subset[0].mol.to_smiles(isomeric=False, explicit_hydrogens=True, mapped=True) + _logger.info(f'Merge {len(subset)} graphs: {mapped_smiles}') + # Check if graphs are equivalent - for i in range(1, len(ds)): - # Openff molecule - assert ds[0].mol == ds[i].mol - # Mapped isomeric smiles - assert ds[0].mol.to_smiles(isomeric=True, explicit_hydrogens=True, mapped=True) == ds[i].mol.to_smiles(isomeric=True, explicit_hydrogens=True, mapped=True) - # Other node features - for key in ["sum_q"]: - np.testing.assert_array_equal(ds[0].nodes['g'].data[key].flatten().numpy(), ds[i].nodes['g'].data[key].flatten().numpy()) - for key in ["q_ref", "idxs", "h0"]: - np.testing.assert_array_equal(ds[0].nodes['n1'].data[key].flatten().numpy(), ds[i].nodes['n1'].data[key].flatten().numpy()) + charge_index = [] # book keep indices with inconsistent partial charges + atol = rtol = 1e-2 # charge tolerance + for i in range(1, len(subset)): + if isomeric_flag == True: + mapped_smiles_i = subset[i].mol.to_smiles(isomeric=True, explicit_hydrogens=True, mapped=True) + assert mapped_smiles == mapped_smiles_i, f"Isomeric mapped smiles are not equivalent: {mapped_smiles} != {mapped_smiles_i}" + else: + mapped_smiles_i = subset[i].mol.to_smiles(isomeric=False, explicit_hydrogens=True, mapped=True) + assert mapped_smiles == mapped_smiles_i, f"Nonisomeric mapped smiles are not equivalent: {mapped_smiles} != {mapped_smiles_i}" + # Net charge + np.testing.assert_array_equal(subset[0].nodes['g'].data['sum_q'].flatten().numpy(), subset[i].nodes['g'].data['sum_q'].flatten().numpy()) + # Input node features + np.testing.assert_array_equal(subset[0].nodes['n1'].data['h0'].flatten().numpy(), subset[i].nodes['n1'].data['h0'].flatten().numpy()) + # Atom ordering: As long as mapped smiles are the same, we don't need to compare n1, n2, n3 nodes? + np.testing.assert_array_equal(subset[0].nodes['n1'].data['idxs'].flatten().numpy(), subset[i].nodes['n1'].data['idxs'].flatten().numpy()) + np.testing.assert_array_equal(subset[0].nodes['n2'].data['idxs'].flatten().numpy(), subset[i].nodes['n2'].data['idxs'].flatten().numpy()) + np.testing.assert_array_equal(subset[0].nodes['n3'].data['idxs'].flatten().numpy(), subset[i].nodes['n3'].data['idxs'].flatten().numpy()) + # Partial charges: There could be inconsistency due to different 3D conformers generated during partial charge calculation process. 
+ charge_boolean = np.allclose(subset[0].nodes['n1'].data['q_ref'].flatten().numpy(), subset[i].nodes['n1'].data['q_ref'].flatten().numpy(), rtol=rtol, atol=atol) + if charge_boolean == False: + charge_diff = np.abs(subset[0].nodes['n1'].data['q_ref'].flatten().numpy() - subset[i].nodes['n1'].data['q_ref'].flatten().numpy()) + _logger.warning(f"Entry {i}: Maximum charge difference {charge_diff.max()} is higher than {atol} when compared to the first graph") + charge_index.append(i) + + # Handle partial charges if inconsistent + if charge_index: + # Get indices with unique partial charges + # Book keep indices with unique partial charges starting from the first graph + unique_charge_index = [0] + for i in charge_index: + is_equal = [] + for j in unique_charge_index: + # Extract the arrays to compare + arr_i = subset[i].nodes['n1'].data['q_ref'].flatten().numpy() + arr_j = subset[j].nodes['n1'].data['q_ref'].flatten().numpy() + is_equal.append(np.array_equal(arr_i, arr_j)) + # Check if all False + if not any(is_equal): + unique_charge_index.append(i) + # Average partial charges + _logger.info(f'Average partial charges ({unique_charge_index})...') + q_ref = subset[0].nodes['n1'].data['q_ref'] + _logger.info(f'Entry #0: {q_ref.flatten().numpy()}') + for index in unique_charge_index[1:]: + _q_ref = subset[index].nodes['n1'].data['q_ref'] + _logger.info(f'Entry #{index}: {_q_ref.flatten().numpy()}') + q_ref += _q_ref + q_ref = q_ref / len(unique_charge_index) + # Update partial charges in-place + for i in range(len(subset)): + subset[i].nodes['n1'].data['q_ref'] = q_ref + _logger.info(f'Averaged partial charges: {subset[0].nodes["n1"].data["q_ref"].flatten().numpy()}') # Merge graphs - g = copy.deepcopy(ds[0]) + g = copy.deepcopy(subset[0]) for key in g.nodes['g'].data.keys(): if key not in ["sum_q"]: - for i in range(1, len(ds)): - g.nodes['g'].data[key] = torch.cat((g.nodes['g'].data[key], ds[i].nodes['g'].data[key]), dim=-1) + for i in range(1, len(subset)): + g.nodes['g'].data[key] = torch.cat((g.nodes['g'].data[key], subset[i].nodes['g'].data[key]), dim=-1) for key in g.nodes['n1'].data.keys(): if key not in ["q_ref", "idxs", "h0"]: - for i in range(1, len(ds)): - if key == "xyz": - n_confs = ds[i].nodes['n1'].data['xyz'].shape[1] - g.nodes['n1'].data[key] = torch.cat((g.nodes['n1'].data[key], ds[i].nodes['n1'].data[key]), dim=1) + for i in range(1, len(subset)): + g.nodes['n1'].data[key] = torch.cat((g.nodes['n1'].data[key], subset[i].nodes['n1'].data[key]), dim=1) return g diff --git a/espfit/utils/logging.py b/espfit/utils/logging.py index 97b825a..0fd0f22 100644 --- a/espfit/utils/logging.py +++ b/espfit/utils/logging.py @@ -11,8 +11,8 @@ def set_logging_level(level): Parameters ---------- - level : int - The logging level. For example, logging.INFO. + level : str + The logging level. Options are [NOTSET, DEBUG, INFO, WARNING, ERROR, CRITICAL, FATAL]. Returns ------- diff --git a/espfit/utils/sampler/__init__.py b/espfit/utils/sampler/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/espfit/utils/sampler/reweight.py b/espfit/utils/sampler/reweight.py new file mode 100644 index 0000000..cfb14bc --- /dev/null +++ b/espfit/utils/sampler/reweight.py @@ -0,0 +1,244 @@ +""" +Compute effective sample size and weights for each simulation. + +TODO +---- +* Check J-coupling experimental error. Currently, fixed to 0.5 Hz. +""" +import os +import logging + +_logger = logging.getLogger(__name__) + + +class SetupSamplerReweight(object): + """Setup sampler for reweighting simulation. 
+ + This class is responsible for setting up the sampler for reweighting simulation. + It provides methods to run the simulation, compute the effective sample size, + compute the loss, and compute the weighted observable. + + Methods + ------- + run(): + Runs the simulation for each sampler. + + get_effective_sample_size(temporary_samplers): + Computes the effective sample size and sampling weights for each sampler. + + compute_loss(): + Computes the loss for each sampler. + """ + def __init__(self): + self.samplers = None + self.weights = dict() # {'target_name': {'weights': w_i}, {'neff': neff}} + + + def run(self): + """Runs the simulation for each sampler. + + Returns + ------- + None + """ + for sampler in self.samplers: + _logger.info(f'Running simulation for {sampler.target_name} for {sampler.nsteps} steps...') + sampler.minimize() + sampler.run() + + + def get_effective_sample_size(self, temporary_samplers): + """Computes the effective sample size and sampling weights for each sampler. + + Parameters + ---------- + temporary_samplers : list + List of temporary samplers. + + Returns + ------- + float + The minimum effective sample size among all samplers. + """ + import mdtraj + import numpy as np + from openmm.unit import kilocalories_per_mole as kcalpermol + from espfit.utils.units import KB_T_KCALPERMOL + + if self.samplers is None: + return -1 + + for sampler, temporary_sampler in zip(self.samplers, temporary_samplers): + _logger.info(f'Compute effective sample size and sampling weights for {sampler.target_name}') + + # Get temperature + temp0 = sampler.temperature._value + temp1 = temporary_sampler.temperature._value + assert temp0 == temp1, f'Temperature should be equivalent but got sampler {temp0} K and temporary sampler {temp1} K' + beta = 1 / (KB_T_KCALPERMOL * temp0) + _logger.debug(f'beta temperature in kcal/mol: {beta}') + + # Get position from trajectory + traj = mdtraj.load(sampler.output_directory_path + '/traj.nc', top=sampler.output_directory_path + '/solvated.pdb') + _logger.info(f'Found {traj.n_frames} frames in trajectory') + + # Compute weights and effective sample size + log_w = [] + for i in range(traj.n_frames): + # U(x0, theta0) + sampler.simulation.context.setPositions(traj.openmm_positions(i)) + potential_energy = sampler.simulation.context.getState(getEnergy=True).getPotentialEnergy() + # U(x0, theta1) + temporary_sampler.simulation.context.setPositions(traj.openmm_positions(i)) + reduced_potential_energy = temporary_sampler.simulation.context.getState(getEnergy=True).getPotentialEnergy() + # deltaU = U(x0, theta1) - U(x0, theta0) + delta = (reduced_potential_energy - potential_energy).value_in_unit(kcalpermol) + # log_w = ln(exp(-beta * delta)) + w = -1 * beta * delta + log_w.append(w) + + #_logger.debug(f'U(x0, theta0): {potential_energy.value_in_unit(kcalpermol):10.3f} kcal/mol') + #_logger.debug(f'U(x0, theta1): {reduced_potential_energy.value_in_unit(kcalpermol):10.3f} kcal/mol') + #_logger.debug(f'deltaU: {delta:10.3f} kcal/mol') + #_logger.debug(f'log_w: {w:10.3f}') + + # Compute weights and effective sample size (ratio: 0 to 1) + w_i = np.exp(log_w) / np.sum(np.exp(log_w)) + neff = np.sum(w_i) ** 2 / np.sum(w_i ** 2) / len(w_i) + #_logger.debug(f'w_i_sum: {np.sum(w_i):10.3f}') + #_logger.debug(f'neff: {neff:10.3f}') + + self.weights[f'{sampler.target_name}'] = {'neff': neff, 'weights': w_i} + #_logger.info(f'{self.weights}') + neffs = [self.weights[key]['neff'] for key in self.weights.keys()] + + return min(neffs) + + + def compute_loss(self): + 
"""Computes the loss for each sampler. + + Returns + ------- + list + List of torch tensors representing the loss for each sampler. + """ + loss_list = [] + for sampler in self.samplers: + _logger.info(f'Compute loss for {sampler.target_name}') + loss = self._compute_loss_per_system(sampler) # torch.tensor + loss_list.append(loss) + + return loss_list + + + def _compute_loss_per_system(self, sampler): + """Computes the loss per system for a given sampler. + + Parameters + ---------- + sampler : object + The sampler object. + + Returns + ------- + torch.Tensor + The loss per system as a torch tensor. + """ + import torch + + # Compute experimental observable + exp = self._get_experiment_data(sampler.target_class, sampler.target_name) + pred = self._compute_weighted_observable(sampler.atomSubset, sampler.target_name, sampler.output_directory_path) + + loss = [] + for resi_index, exp_dict in enumerate(exp.values()): + for key, value in exp_dict.items(): + # {'1H5P': {'name': 'beta_1', 'value': None, 'operator': None, 'error': None}} + if value['operator'] in ['>', '<', '>=', '<=', '~'] or value['value'] == None: + # Dont use uncertain data + pass + else: + exp_value = value['value'] + exp_error = value['error'] + if exp_error == None: + exp_error = 0.5 # TODO: Check experimental error + resi_index = int(resi_index) + pred_value = list(pred.values())[resi_index][key]['avg'] + pred_error = list(pred.values())[resi_index][key]['std'] + _logger.debug(f'Exp ({resi_index}-{key}): {exp}') + _logger.debug(f'Pred ({resi_index}-{key}): {pred}') + + numerator = (pred_value - exp_value) ** 2 + dominator = (exp_error ** 2 + pred_error ** 2) + loss.append(numerator / dominator) + # Compute loss + loss_avg = torch.mean(torch.tensor(loss)) + _logger.info(f'Sampler loss: {loss_avg.item():.3f}') + + return loss_avg + + + def _get_experiment_data(self, target_class, target_name): + """Retrieves the experimental data for a given target. + + Parameters + ---------- + target_class : str + The class of the target. + + target_name : str + The name of the target. + + Returns + ------- + dict : The experimental data for the target. + """ + import yaml + from importlib.resources import files + + yaml_file = str(files('espfit').joinpath(f'data/target/{target_class}/{target_name}/experiment.yml')) + with open(yaml_file, 'r', encoding='utf8') as f: + d = yaml.safe_load(f) + + # {'resi_1': {'1H5P': {'name': 'beta_1', 'value': None, 'operator': None, 'error': None}}} + return d['experiment_1']['measurement'] + + + def _compute_weighted_observable(self, atomSubset, target_name, output_directory_path): + """Computes the weighted observable for a given target. + + Parameters + ---------- + atomSubset : str + The atom subset. + + target_name : str + The name of the target. + + output_directory_path : str + The output directory path. + + Returns + ------- + dict : The computed weighted observable. 
+ """ + import yaml + from espfit.app.analysis import RNASystem + + # Load trajectory + target = RNASystem(atomSubset=atomSubset) + target.load_traj(input_directory_path=output_directory_path) + + # Compute observable + if self.weights.keys(): + pred = target.compute_jcouplings(weights=self.weights[target_name]['weights']) + else: + pred = target.compute_jcouplings(weights=None) + _logger.debug(f'Computed observable: {pred}') + + # Export observable + with open(os.path.join(output_directory_path, 'pred.yaml'), 'w') as f: + yaml.dump(pred, f, allow_unicode=True) + + return pred diff --git a/espfit/utils/units.py b/espfit/utils/units.py index 011dadc..9db83d5 100644 --- a/espfit/utils/units.py +++ b/espfit/utils/units.py @@ -1,3 +1,57 @@ -# Constants for unit conversions -HARTEE_TO_KCALPERMOL = 627.509 -BOHR_TO_ANGSTROMS = 0.529177 \ No newline at end of file +import openmm.unit as unit +from pint import UnitRegistry + +# Define pint unit registry +ureg = UnitRegistry() +hartree = 1 * ureg.hartree +bohr = 1 * ureg.bohr +angstrom = 1 * ureg.angstrom +kelvin = 1 * ureg.kelvin +kB= ureg.boltzmann_constant +kBT = kB * kelvin +kcalpermol = ureg.kilocalorie/(ureg.avogadro_constant*ureg.mole) + +# Conversion factors +#HARTEE_TO_KCALPERMOL = 627.509 +#BOHR_TO_ANGSTROMS = 0.529 +HARTREE_TO_KCALPERMOL = hartree.to(kcalpermol).magnitude +BOHR_TO_ANGSTROMS = bohr.to(ureg.angstrom).magnitude +KB_T_KCALPERMOL = kBT.to(kcalpermol).magnitude + + +def convert_string_to_unit(unit_string): + """Convert a unit string to a openmm unit object. + + Parameters + ---------- + unit_string : str + The string representation of the unit. + + Returns + ------- + openmm.unit + The openmm unit object. + """ + unit_mapping = { + "nanometer": unit.nanometer, + "angstrom": unit.angstrom, + "nanometers": unit.nanometers, + "angstroms": unit.angstroms, + "kelvin": unit.kelvin, + "molar": unit.molar, + "millimolar": unit.millimolar, + "micromolar": unit.micromolar, + "atomsphere": unit.atmosphere, + "bar": unit.bar, + "nanoseconds": unit.nanoseconds, + "picoseconds": unit.picoseconds, + "femtoseconds": unit.femtoseconds, + "nanosecond": unit.nanosecond, + "picosecond": unit.picosecond, + "femtosecond": unit.femtosecond, + # Add more units as needed + } + if unit_string in unit_mapping: + return unit_mapping[unit_string] + else: + raise ValueError(f"Unit '{unit_string}' is not recognized.") \ No newline at end of file diff --git a/examples/mockcode/mockcode.ipynb b/examples/mockcode/mockcode.ipynb deleted file mode 100644 index 490e58b..0000000 --- a/examples/mockcode/mockcode.ipynb +++ /dev/null @@ -1,911 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "8425a3a5", - "metadata": {}, - "source": [ - "# Mock code for preparing and loading data for training espaloma" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6db9e281", - "metadata": {}, - "outputs": [], - "source": [ - "import espaloma\n", - "import espfit" - ] - }, - { - "cell_type": "markdown", - "id": "fa0f1027", - "metadata": {}, - "source": [ - "## Download QC datasets from QCArchive as HDF5 (SKIP IMPLEMENTATION) \n", - "\n", - "This functionality will not be implemented at the moment and alternatively rely on external scripts (e.g. https://github.com/choderalab/download-qca-datasets)." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b69087c6", - "metadata": {}, - "outputs": [], - "source": [ - "# place holder\n", - "\n", - "outdir='/DATASET_HDF_PATH/MYDATA' \n", - "outfile='small_basic.hdf5'\n", - "\n", - "espfit.utils.data.download_qcarchive(workflow='Datataset', \n", - " qc_specification='default', \n", - " outdir=outdir,\n", - " outfile=outfile\n", - " )\n", - "#> raise NotImplemented Error" - ] - }, - { - "cell_type": "markdown", - "id": "aae11b77", - "metadata": {}, - "source": [ - "## Convert HDF5 to DGL graphs (SKIP IMPLEMENTATION)\n", - "\n", - "This function will not be implemented at the moment and alternatively rely on external scripts (e.g. https://github.com/choderalab/refit-espaloma/blob/main/openff-default/01-create-dataset/script/getgraph_hdf5.py)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "45c152ff", - "metadata": {}, - "outputs": [], - "source": [ - "# place holder\n", - "\n", - "indir = '/DATASET_HDF_PATH/MYDATA'\n", - "outdir = '/DATASET_DGL_PATH/MYDATA'\n", - "\n", - "_filenames = [ 'small_basic.hdf5', 'small_optimize.hdf5', 'small_torsiondrive.hdf5', 'peptide_basic.hdf5', 'peptide_optimize.hdf5', 'peptide_torsiondrive.hdf5' ]\n", - "filenames = [ os.path.join(indir, filename) for filename in _filenames ]\n", - "\n", - "for filename in filenames:\n", - " ds += espfit.utils.data.hdf5_to_dgl(infile=filename,outdir=outdir)\n", - " \n", - "#> raise NotImplemented Error" - ] - }, - { - "cell_type": "markdown", - "id": "013f75f8", - "metadata": {}, - "source": [ - "## Filter DGL graphs (SKIP IMPLEMENTATION)\n", - "\n", - "This function will not be implemented at the moment and rely on external scripts (e.g. https://github.com/choderalab/refit-espaloma/tree/main/openff-default/02-train/merge-data/script)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e6f81b71", - "metadata": {}, - "outputs": [], - "source": [ - "# place holder\n", - "\n", - "outdir = '/DATASET_DGL_PATH/MYDATA/FILTERED'\n", - "ds.filter(min_energy=0.1,\n", - " min_conformer=3,\n", - " compute_am1bcc='AM1BCC-ELF10', \n", - " compute_baseline_forcefields=forcefield_list, \n", - " compute_relative_energy=True,\n", - " subtract_nonbonded=True,\n", - " base_forcefiled='openff-2.0.0',\n", - " inplace=False,\n", - " outdir=outdir\n", - " )\n", - " \n", - "#> raise NotImplemented Error" - ] - }, - { - "cell_type": "markdown", - "id": "a5da73a3", - "metadata": {}, - "source": [ - "## Load preprocessed DGL graphs\n", - "\n", - "We are going to start from here." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f402f1a1", - "metadata": {}, - "outputs": [], - "source": [ - "indir = '/DATASET_DGL_PATH/MYDATA/FILTERED/*' # single path or list of paths\n", - "ds = espfit.utils.data.load(in_prefix)" - ] - }, - { - "cell_type": "markdown", - "id": "c06051b0", - "metadata": {}, - "source": [ - "#### Check properties" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "26b1b096", - "metadata": {}, - "outputs": [], - "source": [ - "ds.n_data # number of data (entries)\n", - "#> 100" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "97fca213", - "metadata": {}, - "outputs": [], - "source": [ - "ds.n_conf # number of conformations\n", - "#> 10000" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "edd853ac", - "metadata": {}, - "outputs": [], - "source": [ - "ds.elements # elements\n", - "#> H,B,Br,C,N,O,I" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0d118175", - "metadata": {}, - "outputs": [], - "source": [ - "ds.duplicate_isomeric_smiles # isomeric smiles\n", - "#> returns list of duplicate isomeric smiles" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "05bdc5b7", - "metadata": {}, - "outputs": [], - "source": [ - "ds.duplicate_nonisomeric_smiles # nonisomeric smiles\n", - "#> returns list of duplicate nonisomeric smiles" - ] - }, - { - "cell_type": "markdown", - "id": "e0af7103", - "metadata": {}, - "source": [ - "#### Drop/merge duplicate smiles and filter datasets\n", - "\n", - "Ensure the datasets loaded from different sources have no duplicated smiles. \n", - "Drop duplicate isomeric (nonisomeric) smiles across different sources of datasets. \n", - "Merge duplicate dgl graphs with same smiles into a single dgl graph and create a new dataset called 'misc'." 
- ] - }, - { - "cell_type": "markdown", - "id": "15340bae", - "metadata": {}, - "source": [ - "##### drop and merge smiles" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "47cb8e2b", - "metadata": {}, - "outputs": [], - "source": [ - "outdir = '/DATASET_DGL_PATH/MYDATA'\n", - "ds.drop_merge_nonisomeric_smiles(outdir=outdir, outname='misc') # miscellaneous\n", - "\n", - "# Alteratively,\n", - "ds.drop_merge_isomeric_smiles(outdir=outdir, outname='misc')" - ] - }, - { - "cell_type": "markdown", - "id": "1a84aa3d", - "metadata": {}, - "source": [ - "##### filter dataset" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "958112bf", - "metadata": {}, - "outputs": [], - "source": [ - "# Add misc dataset that was just created\n", - "ds += espfit.utils.data.load('/DATASET_DGL_PATH/MYDATA/misc')\n", - "\n", - "# Filter all dataset\n", - "outdir = '/DATASET_DGL_PATH/MYDATA/FILTERED'\n", - "ds.filter(min_energy=0.1,\n", - " min_conformer=3,\n", - " inplace=False,\n", - " outdir=outdir\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "457c7c7c-bd74-4009-a748-b014a9e21e6a", - "metadata": {}, - "outputs": [], - "source": [ - "# Compute all dataset\n", - "ds.compute(compute_am1bcc=None, \n", - " compute_baseline_forcefields=None, \n", - " compute_relative_energy=True,\n", - " subtract_nonbonded=True,\n", - " base_forcefiled='openff-2.0.0',\n", - " inplace=False,\n", - " outdir=outdir\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f96faf73", - "metadata": {}, - "outputs": [], - "source": [ - "# Alternatively, we could just filter the misc data and reload all filtered dataset later\n", - "\n", - "outdir = '/DATASET_DGL_PATH/MYDATA/FILTERED'\n", - "misc_data = espfit.utils.data.load('/DATASET_DGL_PATH/MYDATA/misc')\n", - "misc_data.filter(min_energy=0.1,\n", - " min_conformer=3,\n", - " inplace=False,\n", - " outdir=outdir\n", - " )\n", - "misc_data.compute(compute_am1bcc=None, \n", - " compute_baseline_forcefields=None, \n", - " compute_relative_energy=True,\n", - " subtract_nonbonded=True,\n", - " base_forcefiled='openff-2.0.0',\n", - " inplace=False,\n", - " outdir=outdir\n", - " )\n", - "\n", - "# load filtered\n", - "input_dirs = glob.glob('/DATASET_DGL_PATH/MYDATA/FILTERED/*') # list of paths\n", - "ds = espfit.utils.data.load(input_dirs)" - ] - }, - { - "cell_type": "markdown", - "id": "a132c68d", - "metadata": {}, - "source": [ - "## Prepare for training" - ] - }, - { - "cell_type": "markdown", - "id": "8174a95f", - "metadata": {}, - "source": [ - "#### Split datasets" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f8b64d22", - "metadata": {}, - "outputs": [], - "source": [ - "RANDOM_SEED = 2666\n", - "ds.shuffle(RANDOM_SEED)\n", - "\n", - "ds_tr, ds_vl_te = ds.split(0.8, 0.2)\n", - "ds_vl, ds_te = ds_vl_te.split(0.5, 0.5)" - ] - }, - { - "cell_type": "markdown", - "id": "06de6513", - "metadata": {}, - "source": [ - "#### Augment conformations to handle heterographs\n", - "\n", - "This is a work around to handle different graph size (shape). DGL requires at least one dimension with same size. \n", - "Here, we will modify the graphs so that each graph has the same number of conformations instead fo concatenating \n", - "graphs into heterogenous graphs with the same number of conformations. This will allow batching and shuffling \n", - "during the training. 
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "28ce7bd3", - "metadata": {}, - "outputs": [], - "source": [ - "# Remove unnecessary data from graph in backend? (will this speed up training?)\n", - "# e.g. g.nodes['g'].data.pop('u_qm')\n", - "\n", - "outdir = '/DATASET_DGL_PATH/MYDATA/FILTERED/RESHAPE'\n", - "ds_tr.reshape(n_conf=50,\n", - " preserve_min=True,\n", - " inplace=True,\n", - " outdir=outdir,\n", - " verbose=1,\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a9deeabf", - "metadata": {}, - "outputs": [], - "source": [ - "# regenerate impropers (forgot why we need to do this)\n", - "ds_tr.apply(regenerate_impropers, in_place=True)" - ] - }, - { - "cell_type": "markdown", - "id": "d45feb3f", - "metadata": {}, - "source": [ - "## Train espaloma" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "eeb8b4f9", - "metadata": {}, - "outputs": [], - "source": [ - "# initialize\n", - "model = espfit.app.experiment()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c71d7e80", - "metadata": {}, - "outputs": [], - "source": [ - "# define espaloma architecture\n", - "\n", - "# use toml\n", - "import yaml\n", - "with open('config.yml', 'r') as file:\n", - " config = yaml.safe_load(file) \n", - " \n", - "# Possible methods\n", - "# 1. call predefined model?\n", - "model.call(model_name='model1')\n", - "# 2. create model using yaml config\n", - "model.create(config=config)\n", - "# 3. from file\n", - "model.from_file('config.toml')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "32760aff", - "metadata": {}, - "outputs": [], - "source": [ - "# check neural network model\n", - "\n", - "model.net\n", - "#> returns neural network architecture" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "69a9e8d2", - "metadata": {}, - "outputs": [], - "source": [ - "# load dataset\n", - "\n", - "model.train_data = ds_tr\n", - "model.validation_data = ds_vl\n", - "model.test_data = ds_te" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "adeeb626", - "metadata": {}, - "outputs": [], - "source": [ - "# check data property\n", - "\n", - "model.train_data.n_data\n", - "model.train_data.n_conf\n", - "model.train_data.elements" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "20caa2bf", - "metadata": {}, - "outputs": [], - "source": [ - "# save checkpoint file to `checkpoints` every 10 epochs\n", - "# restart training from checkpoint file\n", - "# validation is excluded from the training to decrease inference time\n", - "\n", - "model.train(steps, lr, batch_size, restart=checkpoint, checkpoint_frequency=10, log_file=logfile, log_level='debug')" - ] - }, - { - "cell_type": "markdown", - "id": "fd25f33e", - "metadata": {}, - "source": [ - "#### Validate and find best model\n", - "\n", - "Use job array to speed up this process using external scripts (e.g. 
https://github.com/choderalab/refit-espaloma/tree/main/openff-default/02-train/joint-improper-charge/charge-weight-1.0/eval)" - ] - }, - { - "cell_type": "markdown", - "id": "b94b4be6", - "metadata": {}, - "source": [ - "## Alternatively, train and validate simultaneously\n", - "\n", - "Not sure how slower this will be compared to just doing trainig" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "dc585ee8", - "metadata": {}, - "outputs": [], - "source": [ - "model.train_val(steps, lr, batch_size, restart=checkpoint, checkpoint_frequency=10, logfile=logfile, verbose=1, early_stopping=800, patience=5)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "80498680", - "metadata": {}, - "outputs": [], - "source": [ - "# save model\n", - "model.save() # saves best model as 'model.pt'\n", - "\n", - "# plot loss validation\n", - "model.plot_loss()" - ] - }, - { - "cell_type": "markdown", - "id": "245006e3", - "metadata": {}, - "source": [ - "## Benchmark" - ] - }, - { - "cell_type": "markdown", - "id": "9861df48", - "metadata": {}, - "source": [ - "#### RMSE metric" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2ff42837", - "metadata": {}, - "outputs": [], - "source": [ - "RANDOM_SEED = 2666\n", - "indir='/DATASET_DGL_PATH/MYDATA/FILTERED/RESHAPE'\n", - "data_split_size = [0.8, 0.1, 0.1]\n", - "best_model = 'model.pt'\n", - "\n", - "df = espfit.utils.rmse_metric(best_model, indir, data_split_size, RANDOM_SEED) # pandas dataframe\n", - "df.to_csv('rmse_metric.csv', index=False, sep='¥t', float_format='%.3f')" - ] - }, - { - "cell_type": "markdown", - "id": "dc75ac51", - "metadata": {}, - "source": [ - "#### Run other benchmarks independantly.\n", - "\n", - "- Small molecule geometry optmization (https://github.com/choderalab/geometry-benchmark-espaloma/tree/main/qc-opt-geo)\n", - "- ESP benchmark" - ] - }, - { - "cell_type": "markdown", - "id": "379ad7fb", - "metadata": {}, - "source": [ - "## Train espaloma with experimental observable refitting\n", - "\n", - "- `espfit_experiment/`\n", - " - `data/`: Cached dataset ready for training\n", - " - `utils/`: Stores scripts to run external benchmarks\n", - " - `small_molecule_geometry`\n", - " - geo.py\n", - " - `partial_charge_esp`\n", - " - ele.py\n", - " - `rna_nucleoside`\n", - " - rna_nucleoside.py\n", - " - `rna_tetramer`:\n", - " - rna_tetramer.py\n", - " - `experiment/`\n", - " - `001/`: Create new directory for each refitting experiment\n", - " - `xml/`: Stores openmm xml\n", - " - `refit/`: Espaloma training\n", - " - `checkpoints/`: Stores checkpoint files\n", - " - `sampling/`: MD simulation\n", - " - `iter_0`: Initial MD sampling\n", - " - `iter_n`: MD sampling at epoch-n when necesssary\n", - " - `train.log`: Log file during espaloma training\n", - " - `benchmark/`\n", - " - `rmse_metric`\n", - " - `small_molecule_geometry`\n", - " - `partial_charge_esp`\n", - " - `rna_nucleoside`\n", - " - `rna_tetramer`" - ] - }, - { - "cell_type": "markdown", - "id": "67893219", - "metadata": {}, - "source": [ - "#### Basic usage to run simulations for registered systems" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "343b1a35", - "metadata": {}, - "outputs": [], - "source": [ - "# check registered systems\n", - "registered_systems = espfit.system.available()\n", - "\n", - "registered_systems.get_names\n", - "#> ['A', 'G', 'C', 'U', 'ApA']\n", - "\n", - "registered_systems.get('name').observables\n", - "#> returns pandas dataframe with all experimental 
obervables and corresponding literature" - ] - }, - { - "cell_type": "markdown", - "id": "c5daf12a", - "metadata": {}, - "source": [ - "##### Prepare system" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c74f2962", - "metadata": {}, - "outputs": [], - "source": [ - "system = registered_systems.get('name')\n", - "simulation = system.setup(system_name=name, espaloma_model = 'model.pt', config=config, outdir=outdir) # save xml\n", - "\n", - "# minimize\n", - "simulation.min()" - ] - }, - { - "cell_type": "markdown", - "id": "66d56fba", - "metadata": {}, - "source": [ - "##### Load a system already prepared" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e673e6f4", - "metadata": {}, - "outputs": [], - "source": [ - "system = espfit.system.load()" - ] - }, - { - "cell_type": "markdown", - "id": "5fc790e6", - "metadata": {}, - "source": [ - "##### Run simulation" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0a1528dc", - "metadata": {}, - "outputs": [], - "source": [ - "simulation.run(steps=100) # standard MD?" - ] - }, - { - "cell_type": "markdown", - "id": "bc77fe98", - "metadata": {}, - "source": [ - "##### Compute loss" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e0657b0a", - "metadata": {}, - "outputs": [], - "source": [ - "obs_exp = system.get_experimental_value()\n", - "obs_calc = simulation.compute_observable()\n", - "loss = simulation.compute_loss(obs_exp, obs_calc)" - ] - }, - { - "cell_type": "markdown", - "id": "a358db78", - "metadata": {}, - "source": [ - "##### Reweight observable using updated espaloma model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "18099bb4", - "metadata": {}, - "outputs": [], - "source": [ - "result = simulation.compute_reweighted_observable(update_espaloma_model='new.pt')\n", - "\n", - "# reweighted observable\n", - "obs_calc = result.observable\n", - "\n", - "# effective sample size\n", - "n_eff = result.effective_sample_size\n", - "\n", - "# loss with reweighted observable\n", - "loss = simulation.compute_loss(obs_exp, obs_calc)" - ] - }, - { - "cell_type": "markdown", - "id": "98996eed", - "metadata": {}, - "source": [ - "## Pseudo code for training espaloma with reweighting on the fly" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f73acfab", - "metadata": {}, - "outputs": [], - "source": [ - "RANDOM_SEED = 2666\n", - "\n", - "input_dirs = glob.glob('/DATASET_DGL_PATH/MYDATA/FILTERED/RESHAPE/*') # list of paths\n", - "ds = espfit.utils.data.load(input_dirs)\n", - "ds.shuffle(RANDOM_SEED)\n", - "\n", - "ds_tr, ds_vl_te = ds.split(0.8, 0.2)\n", - "ds_vl, ds_te = ds_vl_te.split(0.5, 0.5)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "35e48171", - "metadata": {}, - "outputs": [], - "source": [ - "model = espfit.app.experiment()\n", - "\n", - "with open('config.yml', 'r') as file:\n", - " config = yaml.safe_load(file) \n", - "model.create(config=config)" - ] - }, - { - "cell_type": "markdown", - "id": "6a835423", - "metadata": {}, - "source": [ - "##### Run simulation" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4c7e206e", - "metadata": {}, - "outputs": [], - "source": [ - "system = registered_systems.get('A')\n", - "simulation = system.setup(system_name=name, espaloma_model = 'model.pt', config=config, outdir=outdir) # save xml\n", - "simulation.min()\n", - "simulation.run(1000)" - ] - }, - { - "cell_type": "markdown", - "id": "fe5fcd47", - 
"metadata": {}, - "source": [ - "##### Get experimental observables" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1ff3ca80", - "metadata": {}, - "outputs": [], - "source": [ - "obs_exp = system.get_experimental_value()" - ] - }, - { - "cell_type": "markdown", - "id": "2915bb00", - "metadata": {}, - "source": [ - "##### Train with MD reweighting\n", - "\n", - "[Iterative Optimization of Molecular Mechanics Force Fields from NMR Data of Full-Length Proteins, JCTC, 2011](https://pubs.acs.org/doi/full/10.1021/ct200094b) \n", - "[Automatic Learning of Hydrogen-Bond Fixes in the AMBER RNA Force Field, JCTC, 2022](https://pubs.acs.org/doi/10.1021/acs.jctc.2c00200) \n", - "[Enhanced sampling methods for molecular dynamics simulations, arXiv, 2022](https://arxiv.org/abs/2202.04164) " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5c4b766c", - "metadata": {}, - "outputs": [], - "source": [ - "ds_tr_loader = dgl.dataloading.GraphDataLoader(ds_tr, batch_size=batch_size, shuffle=True)\n", - "optimizer = torch.optim.Adam(model.net().parameters(), lr=learning_rate)\n", - "\n", - "with torch.autograd.set_detect_anomaly(True):\n", - " for idx in range(steps):\n", - " n_eff = [] # store effective sample size\n", - " for g in ds_tr_loader:\n", - " optimizer.zero_grad()\n", - " g = g.to(\"cuda:0\")\n", - " g.nodes[\"n1\"].data[\"xyz\"].requires_grad = True \n", - " \n", - " # Original espaloma loss\n", - " loss = net(g)\n", - "\n", - " # Reweighting \n", - " result = simulation.compute_reweighted_observable(net) # return: (reweighted observable, effective sample size)\n", - " obs_calc = result.observable\n", - " loss_md = simulation.compute_loss(obs_exp, obs_calc) \n", - " \n", - " n_eff += result.n_eff\n", - " \n", - " # Joint loss\n", - " loss += weight * loss_md\n", - " \n", - " loss.backward()\n", - " optimizer.step()\n", - " \n", - " # save checkpoint file \n", - " if idx % 10 == 0:\n", - " if not os.path.exists(output_prefix):\n", - " os.mkdir(output_prefix)\n", - " torch.save(net.state_dict(), output_prefix + \"/net%s.pth\" % idx)\n", - " \n", - " # Averaged effective samples\n", - " if n_eff.mean() < effective_sample_size_tolerance:\n", - " # rebuild system with current net model\n", - " # rerun simulation\n", - " # cache new trajectory\n", - " simulation.rebuild()\n", - " simulation.run()" - ] - }, - { - "cell_type": "raw", - "id": "3900fc9f", - "metadata": {}, - "source": [] - }, - { - "cell_type": "raw", - "id": "5735fb72", - "metadata": {}, - "source": [] - }, - { - "cell_type": "raw", - "id": "ec7b85df", - "metadata": {}, - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.13" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -}