From 33bbababddac14c48af5728522a23b59f6a43c68 Mon Sep 17 00:00:00 2001
From: kt
Date: Fri, 22 Mar 2024 10:00:31 -0400
Subject: [PATCH 01/11] add data/target and data/forcefield to package

---
 pyproject.toml | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index bffcfc8..4247bea 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -63,7 +63,10 @@ where = ["."]
 # Ref https://setuptools.pypa.io/en/latest/userguide/datafiles.html#package-data
 [tool.setuptools.package-data]
 espfit = [
-    "py.typed"
+    "py.typed",
+    "data/target/nucleoside/*/target.pdb",
+    "data/target/nucleoside/*/experiment.yml",
+    "data/forcefield/espaloma-0.3.2.pt"
 ]

 [tool.versioningit]

From dedf30d6eaaca8f29a7a194f4035d904321cbe80 Mon Sep 17 00:00:00 2001
From: kt
Date: Fri, 22 Mar 2024 10:02:13 -0400
Subject: [PATCH 02/11] fix #13 and #14

---
 espfit/app/sampler.py | 67 +++++++++++++++++++++++++++++--------------
 1 file changed, 46 insertions(+), 21 deletions(-)

diff --git a/espfit/app/sampler.py b/espfit/app/sampler.py
index eb211e5..8efc348 100644
--- a/espfit/app/sampler.py
+++ b/espfit/app/sampler.py
@@ -38,8 +38,8 @@ class BaseSimulation(object):
     export_xml(exportSystem=True, exportState=True, exportIntegrator=True, output_directory_path=None):
         Export serialized system XML file and solvated pdb file.
     """
-    def __init__(self, maxIterations=100, nsteps=250000, atomSubset='solute',
-                 checkpoint_frequency=25000, logging_frequency=250000, netcdf_frequency=250000,
+    def __init__(self, maxIterations=100, nsteps=2500000, atomSubset='solute',
+                 checkpoint_frequency=250000, logging_frequency=250000, netcdf_frequency=250000,
                  output_directory_path=None, input_directory_path=None):
         """Initialize base simulation object.

@@ -48,19 +48,19 @@ def __init__(self, maxIterations=100, nsteps=250000, atomSubset='solute',
         maxIterations : int, default=100
             Maximum number of iterations to perform minimization.

-        nsteps : int, default=250000 (10 ns using 4 fs timestep)
+        nsteps : int, default=2500000 (10 ns using 4 fs timestep)
             Number of steps to run the simulation.

         atomSubset : str, default='solute'
             Subset of atoms to save. Default is 'solute'. Other options 'all' and 'not water'.

-        checkpoint_frequency : int, default=25000 (1 ns)
+        checkpoint_frequency : int, default=250000 (1 ns)
             Frequency (in steps) at which to write checkpoint files.

-        logging_frequency : int, default=250000 (10 ns)
+        logging_frequency : int, default=250000 (1 ns)
             Frequency (in steps) at which to write logging files.

-        netcdf_frequency : int, default=250000 (10 ns)
+        netcdf_frequency : int, default=250000 (1 ns)
             Frequency (in steps) at which to write netcdf files.

         output_directory_path : str, optional
@@ -336,7 +336,7 @@ class SetupSampler(BaseSimulation):
     ['amber/protein.ff14SB.xml', 'amber/RNA.OL3.xml'] : pl-multi (TPO): NG, pl-single: OK, RNA: OK
     """
     def __init__(self,
-                 small_molecule_forcefield='espfit/data/forcefield/espaloma-0.3.2.pt',
+                 small_molecule_forcefield='espaloma-0.3.2',
                  forcefield_files = ['amber/ff14SB.xml', 'amber/phosaa14SB.xml'],
                  water_model='tip3p',
                  solvent_padding=9.0 * unit.angstroms,
@@ -355,7 +355,8 @@ def __init__(self,
         Parameters
         ----------
         small_molecule_forcefield : str, optional
-            The force field to be used for small molecules. Default is 'openff-2.1.0'.
+            The force field to be used for small molecules. Default is 'espaloma-0.3.2'.
+            Alternative recommended choice is 'openff-2.1.0'.
         forcefield_files : list, optional
             List of force field files. Default is ['amber14-all.xml'].
         water_model : str, optional
@@ -378,8 +379,12 @@ def __init__(self,
             The integration timestep. Default is 4 * unit.femtoseconds.
         override_with_espaloma : bool, optional
             Whether to override the original parameters with espaloma. Default is True.
+            This will override all solute molecules with espaloma parameters.
         """
         super(SetupSampler, self).__init__(**kwargs)
+        if small_molecule_forcefield == 'espaloma-0.3.2':
+            from importlib.resources import files
+            small_molecule_forcefield = str(files('espfit').joinpath("data/forcefield/espaloma-0.3.2.pt"))
         self.small_molecule_forcefield = small_molecule_forcefield
         self.water_model = water_model
         self.forcefield_files = self._update_forcefield_files(forcefield_files)
@@ -397,9 +402,13 @@ def __init__(self,


     @classmethod
-    def from_toml(cls, filename, *args, **override_sampler_kwargs):
+    def _from_toml(cls, filename, *args, **override_sampler_kwargs):
         """Create SetupSampler from a TOML configuration file.

+        Note that this is designed for creating new systems with temporary espaloma models generated during
+        espaloma training. It supports multiple systems in the configuration file and returns a list of
+        SetupSampler instances.
+
         Parameters
         ----------
         filename : str
@@ -429,30 +438,48 @@ def from_toml(cls, filename, *args, **override_sampler_kwargs):
             print(e)
             raise

-        config = config['sampler']['setup'] # list
+        config = config['sampler']['setup'] # this could be list to support multiple systems
         if config is None:
             raise ValueError("target is not specified in the configuration file")

+        # If the configuration file contains a single system, convert it to a list
+        if not isinstance(config, list):
+            config = [config]
+
         samplers = []
         _logger.debug(f'Found {len(config)} systems in the configuration file')
         for _config in config:
             sampler = cls()

             # Target information
-            target_class = _config['target_class']
-            target_name = _config['target_name']
-
+            target_class = _config.get('target_class', None)
+            target_name = _config.get('target_name', None)
             sampler.target_class = target_class
             sampler.target_name = target_name

-            biopolymer_file = files('espfit').joinpath(f'data/target/{target_class}/{target_name}/target.pdb')
-            ligand_file = files('espfit').joinpath(f'data/target/{target_class}/{target_name}/ligand.sdf')
-            if not ligand_file.exists():
-                ligand_file = None
+            # Get biopolymer and ligand file if given
+            # Priority 1: Use input files if given
+            biopolymer_file, ligand_file = None, None
+            if _config.get('biopolymer_file'):
+                biopolymer_file = _config['biopolymer_file']
+                if not os.path.exists(biopolymer_file):
+                    raise FileNotFoundError(f"File not found: {biopolymer_file}")
+            if _config.get('ligand_file'):
+                ligand_file = _config['ligand_file']
+                if not os.path.exists(ligand_file):
+                    raise FileNotFoundError(f"File not found: {ligand_file}")
+            # Priority 2: Search espfit/data/target directory if input files are not given
+            if biopolymer_file is None and ligand_file is None:
+                biopolymer_file = files('espfit').joinpath(f'data/target/{target_class}/{target_name}/target.pdb')
+                ligand_file = files('espfit').joinpath(f'data/target/{target_class}/{target_name}/ligand.sdf')
+                if not biopolymer_file.exists():
+                    raise FileNotFoundError(f"File not found: {biopolymer_file}")
+                if not ligand_file.exists():
+                    ligand_file = None

             # System settings
             for key, value in _config.items():
-                if key not in ['target_class', 'target_name']:
+                if key not in ['target_class', 'target_name', 'biopolymer_file', 'ligand_file']:
                     if hasattr(sampler, key):
                         if isinstance(value, str) and "*" in value:
                            _value = float(value.split('*')[0].strip())
@@ -684,7 +711,7 @@ def create_system(self, biopolymer_file=None, ligand_file=None):
             # As a workaround, we will delete the original `self._system_generator` and create a new one to regenerate the system with espaloma.
             # Only water and ion forcefield files will be used to regenerate the system. Solute molecules will be parametrized with espaloma.
             #
-            _logger.debug("Regenerate system with espaloma.")
+            _logger.info("Regenerate system with espaloma")

             # Re-create system generator
             del self._system_generator
@@ -729,8 +756,6 @@ def _regenerate_espaloma_system(self):
         import mdtraj
         from openff.toolkit import Molecule

-        _logger.debug("Regenerate system with espaloma")
-
         # Check biopolymer chains
         mdtop = mdtraj.Topology.from_openmm(self.modeller_solvated_topology)
         chain_indices = [ chain.index for chain in self.modeller_solvated_topology.chains() ]

From 8529b32c5ce13016c38cf351901c00315c67b51f Mon Sep 17 00:00:00 2001
From: kt
Date: Fri, 22 Mar 2024 10:03:40 -0400
Subject: [PATCH 03/11] add retain_graph in train_sampler during espaloma training

---
 espfit/app/train.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/espfit/app/train.py b/espfit/app/train.py
index 0dd37a6..047e96a 100644
--- a/espfit/app/train.py
+++ b/espfit/app/train.py
@@ -515,7 +515,7 @@ def train_sampler(self, sampler_patience=800, neff_threshold=0.2, sampler_weight

                 loss, loss_dict = self.net(g)
                 loss = loss/accumulation_steps
-                loss.backward()
+                loss.backward(retain_graph=True)

                 if epoch > self.sampler_patience:
                     # Save checkpoint as local model (net.pt)
@@ -666,7 +666,7 @@ def _setup_local_samplers(self, epoch, net_copy, debug):
         }

         # Create sampler system from configuration file. Returns list of systems.
-        samplers = SetupSampler.from_toml(self.configfile, *args, **override_sampler_kwargs)
+        samplers = SetupSampler._from_toml(self.configfile, *args, **override_sampler_kwargs)

         return samplers
\ No newline at end of file

From 177ce743565d43072865578f27a50ad7204faa38 Mon Sep 17 00:00:00 2001
From: kt
Date: Fri, 22 Mar 2024 19:58:55 -0400
Subject: [PATCH 04/11] fix bug to support water models other than tip3p

---
 espfit/app/sampler.py | 61 ++++++++++++++++++++++++++++++-----------
 1 file changed, 46 insertions(+), 15 deletions(-)

diff --git a/espfit/app/sampler.py b/espfit/app/sampler.py
index 8efc348..863e6c7 100644
--- a/espfit/app/sampler.py
+++ b/espfit/app/sampler.py
@@ -387,7 +387,7 @@ def __init__(self,
             small_molecule_forcefield = str(files('espfit').joinpath("data/forcefield/espaloma-0.3.2.pt"))
         self.small_molecule_forcefield = small_molecule_forcefield
         self.water_model = water_model
-        self.forcefield_files = self._update_forcefield_files(forcefield_files)
+        self.forcefield_files = forcefield_files
         self.solvent_padding = solvent_padding
         self.ionic_strength = ionic_strength
         self.hmass = hmass
@@ -399,6 +399,10 @@ def __init__(self,
         self.override_with_espaloma = override_with_espaloma
         self.target_class = None
         self.target_name = None
+        # Update forcefield file list to add water model files
+        self._update_forcefield_files()
+        # Get water class (3-site: tip3p, 4-site model: tip4pew)
+        self._get_water_class()


     @classmethod
@@ -438,7 +442,7 @@ def _from_toml(cls, filename, *args, **override_sampler_kwargs):
             print(e)
             raise

-        config = config['sampler']['setup'] # this could be list to support multiple systems
+        config = config['sampler']['setup'] # list of multiple setups
         if config is None:
             raise ValueError("target is not specified in the configuration file")

@@ -449,6 +453,8 @@ def _from_toml(cls, filename, *args, **override_sampler_kwargs):
         samplers = []
         _logger.debug(f'Found {len(config)} systems in the configuration file')
         for _config in config:
+            # Create SetupSampler instance with default settings
+            # Note that this automatically adds the default water model (tip3p) to `self.forcefield_files`.
             sampler = cls()

             # Target information
@@ -489,7 +495,7 @@ def _from_toml(cls, filename, *args, **override_sampler_kwargs):
                         setattr(sampler, key, value)
                     else:
                         raise ValueError(f"Invalid keyword argument: {key}")
-
+
             # Pass temporary espaloma model to the sampler if kwargs are given
             for key, value in override_sampler_kwargs.items():
                 if hasattr(sampler, key):
@@ -503,7 +509,18 @@ def _from_toml(cls, filename, *args, **override_sampler_kwargs):
                     sampler.output_directory_path = os.path.join(sampler.output_directory_path, sampler.target_name, f'{args[0]}')
                 else:
                     raise ValueError(f"Invalid argument: {args}. Expected a single integer value for the epoch number.")
-
+
+            # Empty and recreate `forcefield_files` to avoid appending multiple water model forcefields.
+            # Use list of forcefield files if given in the `override_sampler_kwargs`. If not given, use the default forcefield_files.
+            if 'forcefield_files' in override_sampler_kwargs.keys():
+                pass
+            else:
+                forcefield_files = ['amber/ff14SB.xml', 'amber/phosaa14SB.xml']
+                sampler.forcefield_files = forcefield_files
+                sampler._update_forcefield_files()
+                # Update water class (3-site: tip3p, 4-site model: tip4pew)
+                sampler._get_water_class()
+
             # Create system
             sampler.create_system(biopolymer_file=biopolymer_file, ligand_file=ligand_file)
             samplers.append(sampler)
@@ -512,7 +529,7 @@ def _from_toml(cls, filename, *args, **override_sampler_kwargs):

         return samplers

-    def _update_forcefield_files(self, forcefield_files):
+    def _update_forcefield_files(self):
         """Get forcefield files.

         Update `forcefield_files` depending on the type of water model.
@@ -526,28 +543,23 @@ def _update_forcefield_files(self, forcefield_files):
         # For some reason, the original forcefield_files keeps appending when SetupSampler is called.
         # TODO: Is this the right way to handle this issue?
        import copy
-        _forcefield_files = copy.deepcopy(forcefield_files)
+        _forcefield_files = copy.deepcopy(self.forcefield_files)

         # 3-site water models
         if self.water_model == 'tip3p':
             _forcefield_files.append(['amber/tip3p_standard.xml', 'amber/tip3p_HFE_multivalent.xml'])
         elif self.water_model == 'tip3pfb':
-            self.water_model = 'tip3p'
             _forcefield_files.append(['amber/tip3pfb_standard.xml', 'amber/tip3pfb_HFE_multivalent.xml'])
         elif self.water_model == 'spce':
-            self.water_model = 'tip3p'
             _forcefield_files.append(['amber/spce_standard.xml', 'amber/spce_HFE_multivalent.xml'])
         elif self.water_model == 'opc3':
             raise NotImplementedError('see https://github.com/choderalab/rna-espaloma/blob/main/experiment/nucleoside/script/create_system_espaloma.py#L366')
         # 4-site water models
         elif self.water_model == 'tip4pew':
-            self.water_model = 'tip4pew'
             _forcefield_files.append(['amber/tip4pew_standard.xml', 'amber/tip4pew_HFE_multivalent.xml'])
         elif self.water_model == 'tip4pfb':
-            self.water_model = 'tip4pew'
             _forcefield_files.append(['amber/tip4pfb_standard.xml', 'amber/tip4pfb_HFE_multivalent.xml'])
         elif self.water_model == 'opc':
-            self.water_model = 'tip4pew'
             _forcefield_files.append(['amber/opc_standard.xml'])
         else:
             raise NotImplementedError(f'Water model {self.water_model} is not supported.')
@@ -559,7 +571,24 @@ def _update_forcefield_files(self, forcefield_files):
             else:
                 new_forcefield_files.append(f)

-        return new_forcefield_files
+        #return new_forcefield_files
+        self.forcefield_files = new_forcefield_files
+
+
+    def _get_water_class(self):
+        """Get water class (3-site or 4-site) based on the water model.
+
+        Set self.water_class to tip3p or tip4pew based on the water model.
+
+        3-site water models: tip3p, tip3pfb, spce
+        4-site water models: tip4pew, tip4pfb, opc
+        """
+        if self.water_model in ['tip3p', 'tip3pfb', 'spce']:
+            self.water_class = 'tip3p'
+        elif self.water_model in ['tip4pew', 'tip4pfb', 'opc']:
+            self.water_class = 'tip4pew'
+        else:
+            raise NotImplementedError(f'Water model {self.water_model} is not supported.')


     def _load_biopolymer_file(self):
@@ -642,10 +671,10 @@ def create_system(self, biopolymer_file=None, ligand_file=None):
         from openmm import MonteCarloBarostat
         from openmm import LangevinMiddleIntegrator

+        # Load biopolymer and ligand files
         self._biopolymer_file = biopolymer_file
         self._ligand_file = ligand_file

-        # Load biopolymer and ligand files
         if self._biopolymer_file is None and self._ligand_file is None:
             raise ValueError("At least one biopolymer (.pdb) or ligand (.sdf) file must be provided")
         if self._biopolymer_file is not None:
@@ -687,7 +716,7 @@ def create_system(self, biopolymer_file=None, ligand_file=None):
         # Solvate system
         _logger.debug("Solvating system...")
         modeller = app.Modeller(self._complex_topology, self._complex_positions)
-        modeller.addSolvent(self._system_generator.forcefield, model=self.water_model, padding=self.solvent_padding, ionicStrength=self.ionic_strength)
+        modeller.addSolvent(self._system_generator.forcefield, model=self.water_class, padding=self.solvent_padding, ionicStrength=self.ionic_strength)

         # Create system
         self.modeller_solvated_topology = modeller.getTopology()
@@ -715,7 +744,9 @@ def create_system(self, biopolymer_file=None, ligand_file=None):

         # Re-create system generator
         del self._system_generator
-        self.forcefield_files = self._update_forcefield_files(forcefield_files=[]) # Get water and ion forcefield files
+        #self.forcefield_files = self._update_forcefield_files(forcefield_files=[]) # Get water and ion forcefield files
+        self.forcefield_files = [] # Initialize forcefield_files to empty list and get water and ion forcefield files
+        self._update_forcefield_files()
         self._system_generator = SystemGenerator(
             forcefields=self.forcefield_files, forcefield_kwargs=forcefield_kwargs, periodic_forcefield_kwargs = periodic_forcefield_kwargs,
             barostat=barostat, small_molecule_forcefield=self.small_molecule_forcefield, cache=None, template_generator_kwargs=template_generator_kwargs)

From 5d8fe9812c46cf537d58bfb4e016ea444c165fee Mon Sep 17 00:00:00 2001
From: kt
Date: Fri, 22 Mar 2024 20:00:13 -0400
Subject: [PATCH 05/11] add working draft of train_sampler for backup purpose

---
 espfit/app/train.py | 87 +++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 84 insertions(+), 3 deletions(-)

diff --git a/espfit/app/train.py b/espfit/app/train.py
index 047e96a..e397a37 100644
--- a/espfit/app/train.py
+++ b/espfit/app/train.py
@@ -405,7 +405,7 @@ def train(self):
             self._save_checkpoint(epoch)


-    def train_sampler(self, sampler_patience=800, neff_threshold=0.2, sampler_weight=1.0, debug=False):
+    def _train_sampler_old(self, sampler_patience=800, neff_threshold=0.2, sampler_weight=1.0, debug=False):
         """
         Train the Espaloma network model with sampler.

@@ -456,7 +456,6 @@ def train_sampler(self, sampler_patience=800, neff_threshold=0.2, sampler_weight
         for i in range(self.restart_epoch, self.epochs):
             epoch = i + 1 # Start from 1 (not zero-indexing)

-            """
             # torch.cuda.OutOfMemoryError: CUDA out of memory.
             # Tried to allocate 80.00 MiB (GPU 0; 10.75 GiB total capacity;
             # 9.76 GiB already allocated; 7.62 MiB free; 10.40 GiB reserved in total by PyTorch)
@@ -503,7 +502,89 @@ def train_sampler(self, sampler_patience=800, neff_threshold=0.2, sampler_weight
             # Back propagate
             loss.backward()
             optimizer.step()
-            """
+
             if epoch % self.checkpoint_frequency == 0:
                 # Note: returned loss is a joint loss of different units.
                 #_loss = HARTREE_TO_KCALPERMOL * loss.pow(0.5).item()
                 _logger.info(f'Epoch {epoch}: loss={loss.item():.3f}')
                 self._save_checkpoint(epoch)
+
+
+    def train_sampler(self, sampler_patience=800, neff_threshold=0.2, sampler_weight=1.0, debug=False):
+        """
+        Train the Espaloma network model with sampler.
+
+        Parameters
+        ----------
+        sampler_patience : int, default=800
+            The number of epochs to wait before using sampler.
+
+        neff_threshold : float, default=0.2
+            The minimum effective sample size threshold.
+
+        sampler_weight : float, default=1.0
+            The weight for the sampler loss.
+
+        debug : bool, default=False
+            If True, use espaloma-0.3.pt for debugging.
+
+        Returns
+        -------
+        None
+        """
+        from espfit.utils.sampler.reweight import SetupSamplerReweight
+
+        # Note: RuntimeError will be raised if copy.deepcopy is used.
+        # RuntimeError: one of the variables needed for gradient computation has been modified by an inplace
+        # operation: [torch.cuda.FloatTensor [512, 1]], which is output 0 of AsStridedBackward0, is at version 2;
+        # expected version 1 instead. Hint: the backtrace further above shows the operation that failed to
+        # compute its gradient. The variable in question was changed in there or anywhere later. Good luck!
+        import copy
+        net_copy = copy.deepcopy(self.net)
+
+        self.sampler_patience = sampler_patience
+        self.neff_threshold = neff_threshold
+
+        if self.dataset_train is None:
+            raise ValueError('Training dataset is not provided.')
+
+        # Load checkpoint
+        self.restart_epoch = self._load_checkpoint()
+
+        # Initialize
+        SamplerReweight = SetupSamplerReweight()
+
+        # Train
+        ds_tr_loader = self.dataset_train.view(collate_fn='graph', batch_size=self.batch_size, shuffle=True)
+        optimizer = torch.optim.Adam(self.net.parameters(), lr=self.learning_rate)
+        with torch.autograd.set_detect_anomaly(True):
+            for i in range(self.restart_epoch, self.epochs):
+                epoch = i + 1 # Start from 1 (not zero-indexing)
+
+                #
+                # Accumulate gradients to avoid CUDA out of memory error.
+                # ---
+                # torch.cuda.OutOfMemoryError: CUDA out of memory.
+                # Tried to allocate 80.00 MiB (GPU 0; 10.75 GiB total capacity;
+                # 9.76 GiB already allocated; 7.62 MiB free; 10.40 GiB reserved in total by PyTorch)
+                # If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.
+                # See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
+
+                #
+                if epoch > self.sampler_patience:
+                    # Save checkpoint as local model (net.pt)
+                    # `neff_min` is -1 if SamplerReweight.samplers is None
+                    samplers = self._setup_local_samplers(epoch, net_copy, debug)
+                    neff_min = SamplerReweight.get_effective_sample_size(temporary_samplers=samplers)
+
+                    # If effective sample size is below threshold, update SamplerReweight.samplers and re-run simulation
+                    if neff_min < self.neff_threshold:
+                        _logger.info(f'Minimum effective sample size ({neff_min:.3f}) below threshold ({self.neff_threshold})')
+                        SamplerReweight.samplers = samplers
+                        SamplerReweight.run()
+                    del samplers
+
                 # Gradient accumulation
                 accumulation_steps = len(ds_tr_loader)

From 83cbf0d5efe184c76cf5b1c75254e841676b3f49 Mon Sep 17 00:00:00 2001
From: kt
Date: Fri, 22 Mar 2024 20:47:21 -0400
Subject: [PATCH 06/11] update train_sampler

---
 espfit/app/train.py | 42 +++++++++++-------------------------------
 1 file changed, 11 insertions(+), 31 deletions(-)

diff --git a/espfit/app/train.py b/espfit/app/train.py
index e397a37..193d7aa 100644
--- a/espfit/app/train.py
+++ b/espfit/app/train.py
@@ -571,21 +571,18 @@ def train_sampler(self, sampler_patience=800, neff_threshold=0.2, sampler_weight
                 # If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.
                 # See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

-                #
+                # Check effective sample size and re-run simulation if necessary
                 if epoch > self.sampler_patience:
                     # Save checkpoint as local model (net.pt)
                     # `neff_min` is -1 if SamplerReweight.samplers is None
                     samplers = self._setup_local_samplers(epoch, net_copy, debug)
                     neff_min = SamplerReweight.get_effective_sample_size(temporary_samplers=samplers)
-
-                    # If effective sample size is below threshold, update SamplerReweight.samplers and re-run simulation
                     if neff_min < self.neff_threshold:
                         _logger.info(f'Minimum effective sample size ({neff_min:.3f}) below threshold ({self.neff_threshold})')
                         SamplerReweight.samplers = samplers
                         SamplerReweight.run()
                     del samplers
-
                 # Gradient accumulation
                 accumulation_steps = len(ds_tr_loader)
                 for g in ds_tr_loader:
@@ -593,39 +590,22 @@ def train_sampler(self, sampler_patience=800, neff_threshold=0.2, sampler_weight
                     if torch.cuda.is_available():
                         g = g.to("cuda:0")
                     g.nodes["n1"].data["xyz"].requires_grad = True
-
+
                     # Compute loss
                     loss, loss_dict = self.net(g)
                     loss = loss/accumulation_steps
+                    # MD sampler loss
+                    if epoch > self.sampler_patience:
+                        loss_list = SamplerReweight.compute_loss() # list of torch.tensor
+                        for sampler_index, sampler_loss in enumerate(loss_list):
+                            sampler = SamplerReweight.samplers[sampler_index]
+                            loss += (sampler_loss/accumulation_steps) * sampler_weight
+                            loss_dict[f'{sampler.target_name}'] = sampler_loss.item()
+                        loss_dict['neff'] = neff_min
                     loss.backward(retain_graph=True)
-
-                    if epoch > self.sampler_patience:
-                        # Save checkpoint as local model (net.pt)
-                        # `neff_min` is -1 if SamplerReweight.samplers is None
-                        samplers = self._setup_local_samplers(epoch, net_copy, debug)
-                        neff_min = SamplerReweight.get_effective_sample_size(temporary_samplers=samplers)
-
-                        # If effective sample size is below threshold, update SamplerReweight.samplers and re-run simulation
-                        if neff_min < self.neff_threshold:
-                            _logger.info(f'Minimum effective sample size ({neff_min:.3f}) below threshold ({self.neff_threshold})')
-                            SamplerReweight.samplers = samplers
-                            SamplerReweight.run()
-                        del samplers
-
-                        # Compute sampler loss
-                        loss_list = SamplerReweight.compute_loss() # list of torch.tensor
-                        for sampler_index, sampler_loss in enumerate(loss_list):
-                            sampler = SamplerReweight.samplers[sampler_index]
-                            loss += sampler_loss * sampler_weight
-                            loss_dict[f'{sampler.target_name}'] = sampler_loss.item()
-                        loss.backward()
-                        loss_dict['neff'] = neff_min
-
                     loss_dict['loss'] = loss.item()
                     self.report_loss(epoch, loss_dict)
-
-                    # Update
                     optimizer.step()
-
+
                 if epoch % self.checkpoint_frequency == 0:
                     # Note: returned loss is a joint loss of different units.
                    #_loss = HARTREE_TO_KCALPERMOL * loss.pow(0.5).item()
                     _logger.info(f'Epoch {epoch}: loss={loss.item():.3f}')
                     self._save_checkpoint(epoch)

From 9c8e8afd85c3316fca68266cdd8cd9a2bc13298f Mon Sep 17 00:00:00 2001
From: kt
Date: Fri, 22 Mar 2024 21:05:02 -0400
Subject: [PATCH 07/11] change test_train_sampler to debug mode

---
 espfit/tests/test_app_train_sampler.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/espfit/tests/test_app_train_sampler.py b/espfit/tests/test_app_train_sampler.py
index 1f71247..2de9625 100644
--- a/espfit/tests/test_app_train_sampler.py
+++ b/espfit/tests/test_app_train_sampler.py
@@ -68,7 +68,9 @@ def test_train_sampler(test_load_dataset, test_create_espaloma_from_toml):
     # Train
     sampler_patience = 10
     # Force sampler to run after reaching sampler patience by setting neff_threshold to 1.0
-    model.train_sampler(sampler_patience=sampler_patience, neff_threshold=1.0, sampler_weight=1)
+    # Set debug=True to use espaloma-0.3.2.pt instead of on-the-fly intermediate espaloma model.
+    # Epochs are too short for a stable espaloma model.
+    model.train_sampler(sampler_patience=sampler_patience, neff_threshold=1.0, sampler_weight=1, debug=True)

     # Check outputs
     n_ckpt = len(glob.glob(os.path.join(model.output_directory_path, 'ckpt*pt')))

From 63e17dcf2d6b1519a770c8c1c35cb7648629d866 Mon Sep 17 00:00:00 2001
From: kt
Date: Wed, 3 Apr 2024 16:44:32 -0400
Subject: [PATCH 08/11] dump backup

---
 espfit/app/analysis.py           | 24 +++++++++--
 espfit/app/sampler.py            |  7 ++++
 espfit/app/train.py              | 56 +++++++++++++++-----------
 espfit/utils/sampler/reweight.py | 68 ++++++++++++++++++++------------
 4 files changed, 103 insertions(+), 52 deletions(-)

diff --git a/espfit/app/analysis.py b/espfit/app/analysis.py
index 5235935..e512424 100644
--- a/espfit/app/analysis.py
+++ b/espfit/app/analysis.py
@@ -58,7 +58,7 @@ def output_directory_path(self, value):
         os.makedirs(value, exist_ok=True)


-    def load_traj(self, reference_pdb='solvated.pdb', trajectory_netcdf='traj.nc', stride=1, input_directory_path=None):
+    def load_traj(self, reference_pdb='solvated.pdb', trajectory_netcdf='traj.nc', stride=1, exclude_n_frames=0, input_directory_path=None):
         """Load MD trajectory.

         Parameters
@@ -71,6 +71,11 @@ def load_traj(self, reference_pdb='solvated.pdb', trajectory_netcdf='traj.nc', s

         stride : int, optional
             Stride to load the trajectory. Default is 1.
+
+        exclude_n_frames : float or int, optional
+            Exclude the first n frames from the trajectory. Default is 0.
+            If float, the first n percentage of frames will be excluded.
+            If int, the first n frames will be excluded.

         input_directory_path : str, optional
             Input directory path. Default is None.
@@ -114,6 +119,17 @@ def load_traj(self, reference_pdb='solvated.pdb', trajectory_netcdf='traj.nc', s
         else:
             self.traj = traj

+        # Exclude the first n frames from the trajectory
+        if isinstance(exclude_n_frames, int):
+            if 0 < exclude_n_frames <= traj.n_frames:
+                self.traj = self.traj[exclude_n_frames:]
+        elif isinstance(exclude_n_frames, float):
+            if 0 < exclude_n_frames < 1:
+                exclude_n_frames = int(exclude_n_frames * traj.n_frames)
+                self.traj = self.traj[exclude_n_frames:]
+        else:
+            raise ValueError(f"Invalid exclude_n_frames: {exclude_n_frames}. Expected int or float.")
+

 class RNASystem(BaseDataLoader):
     """RNA system class to compute experimental observables from MD simulations.
@@ -235,9 +251,9 @@ def compute_jcouplings(self, weights=None, couplings=None, residues=None):
                 std_raw = replace_nan_with_none(std_raw)
                 if weights is not None:
                     arr = _values[:,j] * weights
-                    #_logger.info(f'non-weighted: {_values[:,j]}')
-                    #_logger.info(f'weights: {weights}')
-                    #_logger.info(f'weighted: {arr}')
+                    _logger.info(f'non-weighted: {_values[:,j]}')
+                    _logger.info(f'weights: {weights}')
+                    _logger.info(f'weighted: {arr}')
                     avg = np.round(arr.mean(), 5)
                     std = np.round(arr.std(), 5)
                     avg = replace_nan_with_none(avg)
diff --git a/espfit/app/sampler.py b/espfit/app/sampler.py
index 863e6c7..387b684 100644
--- a/espfit/app/sampler.py
+++ b/espfit/app/sampler.py
@@ -757,6 +757,13 @@ def create_system(self, biopolymer_file=None, ligand_file=None):
             self.new_solvated_system = self.modeller_solvated_system
             self.new_solvated_topology = self.modeller_solvated_topology

+        # Set force groups
+        # https://github.com/openmm/openmm-cookbook/blob/main/notebooks/cookbook/Analyzing%20Energy%20Contributions.ipynb
+        # OpenMM does not have a way to directly query the energy of a Force object.
+        # Instead, it lets you query the energy of a force group. We therefore need each Force object to be in a different group.
+        for i, f in enumerate(self.new_solvated_system.getForces()):
+            f.setForceGroup(i)
+
         # Save solvated pdb file
         outfile = os.path.join(self.output_directory_path, f"solvated.pdb")
         with open(f"{outfile}", "w") as wf:
diff --git a/espfit/app/train.py b/espfit/app/train.py
index 193d7aa..0338b20 100644
--- a/espfit/app/train.py
+++ b/espfit/app/train.py
@@ -532,6 +532,17 @@ def train_sampler(self, sampler_patience=800, neff_threshold=0.2, sampler_weight
         Returns
         -------
         None
+
+        Notes
+        -----
+        Back-propagation is performed against accumulated gradients to avoid CUDA out of memory error.
+        Following error is raised if back-propagation is performed without gradient accumulation:
+
+        > torch.cuda.OutOfMemoryError: CUDA out of memory.
+        > Tried to allocate 80.00 MiB (GPU 0; 10.75 GiB total capacity;
+        > 9.76 GiB already allocated; 7.62 MiB free; 10.40 GiB reserved in total by PyTorch)
+        > If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.
+        > See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
         """
         from espfit.utils.sampler.reweight import SetupSamplerReweight

@@ -557,34 +568,38 @@ def train_sampler(self, sampler_patience=800, neff_threshold=0.2, sampler_weight

         # Train
         ds_tr_loader = self.dataset_train.view(collate_fn='graph', batch_size=self.batch_size, shuffle=True)
+        accumulation_steps = len(ds_tr_loader)
         optimizer = torch.optim.Adam(self.net.parameters(), lr=self.learning_rate)
         with torch.autograd.set_detect_anomaly(True):
             for i in range(self.restart_epoch, self.epochs):
                 epoch = i + 1 # Start from 1 (not zero-indexing)
-
-                #
-                # Accumulate gradients to avoid CUDA out of memory error.
-                # ---
-                # torch.cuda.OutOfMemoryError: CUDA out of memory.
-                # Tried to allocate 80.00 MiB (GPU 0; 10.75 GiB total capacity;
-                # 9.76 GiB already allocated; 7.62 MiB free; 10.40 GiB reserved in total by PyTorch)
-                # If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.
-                # See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

-                # Check effective sample size and re-run simulation if necessary
+                # Compute sampler loss if epoch > sampler_patience
                 if epoch > self.sampler_patience:
                     # Save checkpoint as local model (net.pt) and create list of sampler instances
                     samplers = self._setup_local_samplers(epoch, net_copy, debug)
+                    # Check effective sample size (neff). neff is computed for each sampler (sampler).
+                    # Re-run the simulations based on the minimum neff value. `neff_min` is -1 if SamplerReweight.samplers is None,
+                    # which is the default value when SamplerReweight is initialized.
                     neff_min = SamplerReweight.get_effective_sample_size(temporary_samplers=samplers)
+                    # Re-run simulation if neff is below threshold
                     if neff_min < self.neff_threshold:
                         _logger.info(f'Minimum effective sample size ({neff_min:.3f}) below threshold ({self.neff_threshold})')
                         SamplerReweight.samplers = samplers
                         SamplerReweight.run()
                     del samplers
+                    # Compute sampler loss
+                    _loss = torch.tensor(0.0)
+                    _loss_dict = dict()
+                    if SamplerReweight.samplers:
+                        sampler_loss_list = SamplerReweight.compute_loss() # list of torch.tensor
+                        for sampler_index, sampler_loss in enumerate(sampler_loss_list):
+                            sampler = SamplerReweight.samplers[sampler_index]
+                            _loss += (sampler_loss/accumulation_steps) * sampler_weight
+                            _loss_dict[f'{sampler.target_name}'] = sampler_loss.item()
+                        _loss_dict['neff'] = neff_min

-                # Gradient accumulation
-                accumulation_steps = len(ds_tr_loader)
+                # Accumulate gradients
                 for g in ds_tr_loader:
                     optimizer.zero_grad()
                     if torch.cuda.is_available():
                         g = g.to("cuda:0")
                     g.nodes["n1"].data["xyz"].requires_grad = True

                     # Compute loss
                     loss, loss_dict = self.net(g)
                     loss = loss/accumulation_steps
-                    # MD sampler loss
+                    # Add sampler loss
                     if epoch > self.sampler_patience:
-                        loss_list = SamplerReweight.compute_loss() # list of torch.tensor
-                        for sampler_index, sampler_loss in enumerate(loss_list):
-                            sampler = SamplerReweight.samplers[sampler_index]
-                            loss += (sampler_loss/accumulation_steps) * sampler_weight
-                            loss_dict[f'{sampler.target_name}'] = sampler_loss.item()
-                        loss_dict['neff'] = neff_min
+                        loss += _loss
                     loss.backward(retain_graph=True)
                     loss_dict['loss'] = loss.item()
+                    if _loss_dict:
+                        loss_dict.update(_loss_dict)
                     self.report_loss(epoch, loss_dict)
                     optimizer.step()
diff --git a/espfit/utils/sampler/reweight.py b/espfit/utils/sampler/reweight.py
index cfb14bc..e91f1ec 100644
--- a/espfit/utils/sampler/reweight.py
+++ b/espfit/utils/sampler/reweight.py
@@ -31,7 +31,9 @@ class SetupSamplerReweight(object):
     """
     def __init__(self):
         self.samplers = None
-        self.weights = dict() # {'target_name': {'weights': w_i, 'neff': neff}}
+        self.weights_neff_dict = dict() # {'target_name': {'weights': w_i, 'neff': neff}}
+        self.force_group_names = ['HarmonicBondForce', 'HarmonicAngleForce', 'PeriodicTorsionForce']
+        self.exclude_n_frames = 0.1


     def run(self):
@@ -60,10 +62,10 @@ def get_effective_sample_size(self, temporary_samplers):
         float
             The minimum effective sample size among all samplers.
""" - import mdtraj import numpy as np from openmm.unit import kilocalories_per_mole as kcalpermol from espfit.utils.units import KB_T_KCALPERMOL + from espfit.app.analysis import BaseDataLoader if self.samplers is None: return -1 @@ -76,41 +78,55 @@ def get_effective_sample_size(self, temporary_samplers): temp1 = temporary_sampler.temperature._value assert temp0 == temp1, f'Temperature should be equivalent but got sampler {temp0} K and temporary sampler {temp1} K' beta = 1 / (KB_T_KCALPERMOL * temp0) - _logger.debug(f'beta temperature in kcal/mol: {beta}') + _logger.info(f'beta temperature in kcal/mol: {beta}') # Get position from trajectory - traj = mdtraj.load(sampler.output_directory_path + '/traj.nc', top=sampler.output_directory_path + '/solvated.pdb') - _logger.info(f'Found {traj.n_frames} frames in trajectory') + #traj = mdtraj.load(sampler.output_directory_path + '/traj.nc', top=sampler.output_directory_path + '/solvated.pdb') + baseloader = BaseDataLoader() + baseloader.load_traj(input_directory_path=sampler.output_directory_path, exclude_n_frames=self.exclude_n_frames) + _logger.info(f'Found {baseloader.traj.n_frames} frames in trajectory') # Compute weights and effective sample size - log_w = [] - for i in range(traj.n_frames): + w_arr = [] + for i in range(baseloader.traj.n_frames): # U(x0, theta0) - sampler.simulation.context.setPositions(traj.openmm_positions(i)) - potential_energy = sampler.simulation.context.getState(getEnergy=True).getPotentialEnergy() + sampler.simulation.context.setPositions(baseloader.traj.openmm_positions(i)) + for gid, force in enumerate(sampler.simulation.system.getForces()): + if force.getName() in self.force_group_names: + print(f'{force.getName()}') + try: + potential_energy += sampler.simulation.context.getState(getEnergy=True, groups={gid}).getPotentialEnergy() + except: + potential_energy = sampler.simulation.context.getState(getEnergy=True, groups={gid}).getPotentialEnergy() # U(x0, theta1) - temporary_sampler.simulation.context.setPositions(traj.openmm_positions(i)) - reduced_potential_energy = temporary_sampler.simulation.context.getState(getEnergy=True).getPotentialEnergy() + temporary_sampler.simulation.context.setPositions(baseloader.traj.openmm_positions(i)) + for gid, force in enumerate(temporary_sampler.simulation.system.getForces()): + if force.getName() in self.force_group_names: + print(f'{force.getName()}') + try: + reduced_potential_energy += sampler.simulation.context.getState(getEnergy=True, groups={gid}).getPotentialEnergy() + except: + reduced_potential_energy = sampler.simulation.context.getState(getEnergy=True, groups={gid}).getPotentialEnergy() # deltaU = U(x0, theta1) - U(x0, theta0) delta = (reduced_potential_energy - potential_energy).value_in_unit(kcalpermol) - # log_w = ln(exp(-beta * delta)) + # w = ln(exp(-beta * delta)) w = -1 * beta * delta - log_w.append(w) + w_arr.append(w) - #_logger.debug(f'U(x0, theta0): {potential_energy.value_in_unit(kcalpermol):10.3f} kcal/mol') - #_logger.debug(f'U(x0, theta1): {reduced_potential_energy.value_in_unit(kcalpermol):10.3f} kcal/mol') - #_logger.debug(f'deltaU: {delta:10.3f} kcal/mol') - #_logger.debug(f'log_w: {w:10.3f}') + _logger.info(f'U(x0, theta0): {potential_energy.value_in_unit(kcalpermol):10.3f} kcal/mol') + _logger.info(f'U(x0, theta1): {reduced_potential_energy.value_in_unit(kcalpermol):10.3f} kcal/mol') + _logger.info(f'deltaU: {delta:10.3f} kcal/mol') + _logger.info(f'ln_w: {w:10.3f}') # Compute weights and effective sample size (ratio: 0 to 1) - w_i = 
np.exp(log_w) / np.sum(np.exp(log_w)) + w_i = np.exp(w_arr) / np.sum(np.exp(w_arr)) neff = np.sum(w_i) ** 2 / np.sum(w_i ** 2) / len(w_i) - #_logger.debug(f'w_i_sum: {np.sum(w_i):10.3f}') - #_logger.debug(f'neff: {neff:10.3f}') + _logger.info(f'w_i_sum: {np.sum(w_i):10.3f}') + _logger.info(f'neff: {neff:10.3f}') - self.weights[f'{sampler.target_name}'] = {'neff': neff, 'weights': w_i} - #_logger.info(f'{self.weights}') - neffs = [self.weights[key]['neff'] for key in self.weights.keys()] + self.weights_neff_dict[f'{sampler.target_name}'] = {'neff': neff, 'weights': w_i} + _logger.info(f'{self.weights_neff_dict}') + neffs = [self.weights_neff_dict[key]['neff'] for key in self.weights_neff_dict.keys()] return min(neffs) @@ -228,11 +244,11 @@ def _compute_weighted_observable(self, atomSubset, target_name, output_directory # Load trajectory target = RNASystem(atomSubset=atomSubset) - target.load_traj(input_directory_path=output_directory_path) + target.load_traj(input_directory_path=output_directory_path, exclude_n_frames=self.exclude_n_frames) # Compute observable - if self.weights.keys(): - pred = target.compute_jcouplings(weights=self.weights[target_name]['weights']) + if self.weights_neff_dict.keys(): + pred = target.compute_jcouplings(weights=self.weights_neff_dict[target_name]['weights']) else: pred = target.compute_jcouplings(weights=None) _logger.debug(f'Computed observable: {pred}') From 73dae5840a0dc202ef4f6b45a11029444ba4c45f Mon Sep 17 00:00:00 2001 From: kt Date: Sun, 7 Apr 2024 16:16:07 -0400 Subject: [PATCH 09/11] compute sugar couplings as default instead of all possible couplings --- espfit/app/analysis.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/espfit/app/analysis.py b/espfit/app/analysis.py index e512424..296cf4d 100644 --- a/espfit/app/analysis.py +++ b/espfit/app/analysis.py @@ -84,6 +84,10 @@ def load_traj(self, reference_pdb='solvated.pdb', trajectory_netcdf='traj.nc', s Returns ------- None + + Notes + ----- + * Should `stride` and `exclude_n_frames` be set as class attributes? """ import mdtraj @@ -191,7 +195,7 @@ def radian_to_degree(self, a): return a - def compute_jcouplings(self, weights=None, couplings=None, residues=None): + def compute_jcouplings(self, weights=None, couplings=['H1H2', 'H2H3', 'H3H4'], residues=None): """Compute J-couplings from MD trajectory. Parameters @@ -200,7 +204,7 @@ def compute_jcouplings(self, weights=None, couplings=None, residues=None): Weights to compute the J-couplings. Default is None. couplings : str, optional - Name of the couplings to compute. Default is None. + Name of the couplings to compute. Default is ['H1H2', 'H2H3', 'H3H4']. If a list of couplings to be chosen from [H1H2,H2H3,H3H4,1H5P,2H5P,C4Pb,1H5H4,2H5H4,H3P,C4Pe,H1C2/4,H1C6/8] is provided, only the selected couplings will be computed. Otherwise, all couplings will be computed. 
@@ -224,7 +228,7 @@ def compute_jcouplings(self, weights=None, couplings=None, residues=None):
         """
         import barnaba as bb

-        _logger.info("Compute J-couplings from MD trajectory")
+        _logger.debug("Compute J-couplings from MD trajectory")

         if couplings is not None:
             # Check if the provided coupling names are valid
@@ -244,23 +248,25 @@ def compute_jcouplings(self, weights=None, couplings=None, residues=None):
         for i, resname in enumerate(resname_list):
             _values = values[:,i,:] # Coupling values of i-th residue
             values_by_names = dict()
-            for j, coupling_name in enumerate(couplings):
+            for j, coupling_name in enumerate(couplings):
                 avg_raw = np.round(_values[:,j].mean(), 5) # e.g. mean value of H1H2 coupling of i-th residue
                 std_raw = np.round(_values[:,j].std(), 5) # e.g. standard deviation of H1H2 coupling of i-th residue
                 avg_raw = replace_nan_with_none(avg_raw)
                 std_raw = replace_nan_with_none(std_raw)
                 if weights is not None:
                     arr = _values[:,j] * weights
-                    _logger.info(f'non-weighted: {_values[:,j]}')
-                    _logger.info(f'weights: {weights}')
-                    _logger.info(f'weighted: {arr}')
+                    _logger.info(f'weights:\n{weights}')
+                    _logger.info(f'non-weighted observable:\n{_values[:,j]}')
+                    _logger.info(f'weighted observable:\n{arr}')
                     avg = np.round(arr.mean(), 5)
                     std = np.round(arr.std(), 5)
                     avg = replace_nan_with_none(avg)
                     std = replace_nan_with_none(std)
                 else:
+                    _logger.info(f'No weights are provided. Using predicted values without reweighting.')
                     avg = avg_raw
                     std = std_raw
+                _logger.info(f'J-scalar ({coupling_name}-{resname}): avg={avg}, std={std}')
                 values_by_names[coupling_name] = {'avg': avg, 'std': std, 'avg_raw': avg_raw, 'std_raw': std_raw}
             coupling_dict[resname] = values_by_names

From 1591436bc604d1448772db79ccc0f8483bdf412f Mon Sep 17 00:00:00 2001
From: kt
Date: Sun, 7 Apr 2024 16:18:08 -0400
Subject: [PATCH 10/11] remove old train_sampler and update logic in train_sampler

---
 espfit/app/train.py | 146 ++++++++------------------------------------
 1 file changed, 27 insertions(+), 119 deletions(-)

diff --git a/espfit/app/train.py b/espfit/app/train.py
index 0338b20..1d90360 100644
--- a/espfit/app/train.py
+++ b/espfit/app/train.py
@@ -384,6 +384,8 @@ def train(self):
         with torch.autograd.set_detect_anomaly(True):
             for i in range(self.restart_epoch, self.epochs):
                 epoch = i + 1 # Start from epoch 1 (not zero-indexing)
+                _logger.info(f'Epoch {epoch}')
+
                 for g in ds_tr_loader:
                     optimizer.zero_grad()
                     # TODO: Better way to handle this?
@@ -401,115 +403,9 @@ def train(self):

                 if epoch % self.checkpoint_frequency == 0:
                     # Note: returned loss is a joint loss of different units.
-                    loss = HARTREE_TO_KCALPERMOL * loss.pow(0.5).item()
-                    _logger.info(f'Epoch {epoch}: loss={loss:.3f}')
+                    loss = HARTREE_TO_KCALPERMOL * loss.pow(0.5).item()
+                    _logger.info(f'Total loss: {loss:.3f}')
                     self._save_checkpoint(epoch)

-
-    def _train_sampler_old(self, sampler_patience=800, neff_threshold=0.2, sampler_weight=1.0, debug=False):
-        """
-        Train the Espaloma network model with sampler.
-
-        Parameters
-        ----------
-        sampler_patience : int, default=800
-            The number of epochs to wait before using sampler.
-
-        neff_threshold : float, default=0.2
-            The minimum effective sample size threshold.
-
-        sampler_weight : float, default=1.0
-            The weight for the sampler loss.
-
-        debug : bool, default=False
-            If True, use espaloma-0.3.pt for debugging.
-
-        Returns
-        -------
-        None
-        """
-        from espfit.utils.sampler.reweight import SetupSamplerReweight
-
-        # Note: RuntimeError will be raised if copy.deepcopy is used.
-        # RuntimeError: one of the variables needed for gradient computation has been modified by an inplace
-        # operation: [torch.cuda.FloatTensor [512, 1]], which is output 0 of AsStridedBackward0, is at version 2;
-        # expected version 1 instead. Hint: the backtrace further above shows the operation that failed to
-        # compute its gradient. The variable in question was changed in there or anywhere later. Good luck!
-        import copy
-        net_copy = copy.deepcopy(self.net)
-
-        self.sampler_patience = sampler_patience
-        self.neff_threshold = neff_threshold
-
-        if self.dataset_train is None:
-            raise ValueError('Training dataset is not provided.')
-
-        # Load checkpoint
-        self.restart_epoch = self._load_checkpoint()
-
-        # Initialize
-        SamplerReweight = SetupSamplerReweight()
-
-        # Train
-        ds_tr_loader = self.dataset_train.view(collate_fn='graph', batch_size=self.batch_size, shuffle=True)
-        optimizer = torch.optim.Adam(self.net.parameters(), lr=self.learning_rate)
-        with torch.autograd.set_detect_anomaly(True):
-            for i in range(self.restart_epoch, self.epochs):
-                epoch = i + 1 # Start from 1 (not zero-indexing)
-
-                # torch.cuda.OutOfMemoryError: CUDA out of memory.
-                # Tried to allocate 80.00 MiB (GPU 0; 10.75 GiB total capacity;
-                # 9.76 GiB already allocated; 7.62 MiB free; 10.40 GiB reserved in total by PyTorch)
-                # If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.
-                # See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
-
-                loss = torch.tensor(0.0)
-                if torch.cuda.is_available():
-                    loss = loss.cuda("cuda:0")
-
-                for g in ds_tr_loader:
-                    optimizer.zero_grad()
-                    if torch.cuda.is_available():
-                        g = g.to("cuda:0")
-                    g.nodes["n1"].data["xyz"].requires_grad = True
-
-                    _loss, loss_dict = self.net(g)
-                    loss += _loss
-
-                if epoch > self.sampler_patience:
-                    # Save checkpoint as local model (net.pt)
-                    # `neff_min` is -1 if SamplerReweight.samplers is None
-                    samplers = self._setup_local_samplers(epoch, net_copy, debug)
-                    neff_min = SamplerReweight.get_effective_sample_size(temporary_samplers=samplers)
-
-                    # If effective sample size is below threshold, update SamplerReweight.samplers and re-run simulation
-                    if neff_min < self.neff_threshold:
-                        _logger.info(f'Minimum effective sample size ({neff_min:.3f}) below threshold ({self.neff_threshold})')
-                        SamplerReweight.samplers = samplers
-                        SamplerReweight.run()
-                    del samplers
-
-                    # Compute sampler loss
-                    loss_list = SamplerReweight.compute_loss() # list of torch.tensor
-                    for sampler_index, sampler_loss in enumerate(loss_list):
-                        sampler = SamplerReweight.samplers[sampler_index]
-                        loss += sampler_loss * sampler_weight
-                        loss_dict[f'{sampler.target_name}'] = sampler_loss.item()
-                    loss_dict['neff'] = neff_min
-
-                loss_dict['loss'] = loss.item()
-                self.report_loss(epoch, loss_dict)
-
-                # Back propagate
-                loss.backward()
-                optimizer.step()
-
-                if epoch % self.checkpoint_frequency == 0:
-                    # Note: returned loss is a joint loss of different units.
-                    #_loss = HARTREE_TO_KCALPERMOL * loss.pow(0.5).item()
-                    _logger.info(f'Epoch {epoch}: loss={loss.item():.3f}')
-                    self._save_checkpoint(epoch)
-

     def train_sampler(self, sampler_patience=800, neff_threshold=0.2, sampler_weight=1.0, debug=False):
         """
         Train the Espaloma network model with sampler.
@@ -573,21 +469,33 @@ def train_sampler(self, sampler_patience=800, neff_threshold=0.2, sampler_weight
         with torch.autograd.set_detect_anomaly(True):
             for i in range(self.restart_epoch, self.epochs):
                 epoch = i + 1 # Start from 1 (not zero-indexing)
+                _logger.info(f'# Epoch {epoch}')

                 # Compute sampler loss if epoch > sampler_patience
                 if epoch > self.sampler_patience:
                     # Save checkpoint as local model (net.pt) and create list of sampler instances
-                    samplers = self._setup_local_samplers(epoch, net_copy, debug)
+                    temporary_samplers = self._setup_temporary_samplers(epoch, net_copy, debug)
+
+                    # Check the effective sample size (neff), computed for each sampler stored in `SamplerReweight.samplers`.
+                    # Coordinates from `SamplerReweight.samplers` are used to compute the effective sample size and Boltzmann weights
+                    # using force field parameters (Espaloma models) stored in `SamplerReweight.samplers` and `temporary_samplers`,
+                    # which are the updated force field parameters from the current epoch.
+                    # Re-run the simulations based on the minimum neff value. `neff_min` is set to -1 if `SamplerReweight.samplers` is None,
+                    # which is the default state when `SetupSamplerReweight()` is initialized.
+                    # Note that `SamplerReweight.get_effective_sample_size` updates the `SamplerReweight.weights_neff_dict` attribute.
+                    neff_min = SamplerReweight.get_effective_sample_size(temporary_samplers=temporary_samplers)
+
+                    # Re-run simulation if `neff_min` is below threshold
                     if neff_min < self.neff_threshold:
-                        _logger.info(f'Minimum effective sample size ({neff_min:.3f}) below threshold ({self.neff_threshold})')
+                        _logger.info(f'Minimum effective sample size ({neff_min:.3f}) below threshold ({self.neff_threshold:.3f})')
+                        # Update SamplerReweight.samplers with the temporary_samplers
-                        SamplerReweight.samplers = samplers
+                        SamplerReweight.samplers = temporary_samplers
                         SamplerReweight.run()
-                    del samplers
+                        # Initialize `SamplerReweight.weights_neff_dict` to empty dictionary
+                        _logger.info(f'Reset sampler weights')
+                        SamplerReweight.weights_neff_dict = dict()
+                    del temporary_samplers
+
                     # Compute sampler loss
                     _loss = torch.tensor(0.0)
                     _loss_dict = dict()
@@ -621,7 +529,7 @@ def train_sampler(self, sampler_patience=800, neff_threshold=0.2, sampler_weight
                 if epoch % self.checkpoint_frequency == 0:
                     # Note: returned loss is a joint loss of different units.
                     #_loss = HARTREE_TO_KCALPERMOL * loss.pow(0.5).item()
-                    _logger.info(f'Epoch {epoch}: loss={loss.item():.3f}')
+                    _logger.info(f'Total loss: {loss.item():.3f}')
                     self._save_checkpoint(epoch)

@@ -701,8 +609,8 @@ def _save_local_model(self, epoch, net_copy):
         self.save_model(net=net_copy, checkpoint_file=local_model, output_model=f"net.pt", output_directory_path=self.output_directory_path)

-    def _setup_local_samplers(self, epoch, net_copy, debug):
-        """Setup local samplers.
+    def _setup_temporary_samplers(self, epoch, net_copy, debug):
+        """Setup samplers using temporary espaloma models generated during training.

         Parameters
         ----------

From 3455a6fe7bb13015762c47fff429e0b1ddaef197 Mon Sep 17 00:00:00 2001
From: kt
Date: Sun, 7 Apr 2024 16:20:40 -0400
Subject: [PATCH 11/11] use espfit.analysis.BaseDataLoader to load trajectories

---
 espfit/utils/sampler/reweight.py | 38 ++++++++++++++++++--------------
 1 file changed, 22 insertions(+), 16 deletions(-)

diff --git a/espfit/utils/sampler/reweight.py b/espfit/utils/sampler/reweight.py
index e91f1ec..ba47a73 100644
--- a/espfit/utils/sampler/reweight.py
+++ b/espfit/utils/sampler/reweight.py
@@ -32,7 +32,7 @@ class SetupSamplerReweight(object):
     def __init__(self):
         self.samplers = None
         self.weights_neff_dict = dict() # {'target_name': {'weights': w_i, 'neff': neff}}
-        self.force_group_names = ['HarmonicBondForce', 'HarmonicAngleForce', 'PeriodicTorsionForce']
+        self.force_group_names = ['HarmonicBondForce', 'HarmonicAngleForce', 'PeriodicTorsionForce', 'NonbondedForce']
         self.exclude_n_frames = 0.1


     def run(self):
@@ -44,7 +44,7 @@ def run(self):
             None
         """
         for sampler in self.samplers:
-            _logger.info(f'Running simulation for {sampler.target_name} for {sampler.nsteps} steps...')
+            _logger.info(f'Re-run simulation for {sampler.target_name}')
             sampler.minimize()
             sampler.run()

@@ -67,24 +67,26 @@ def get_effective_sample_size(self, temporary_samplers):
         from espfit.utils.units import KB_T_KCALPERMOL
         from espfit.app.analysis import BaseDataLoader

+        _logger.info(f'Compute effective sample size and sampling weights')
+
         if self.samplers is None:
+            _logger.info('No samplers found. Return effective sample size -1.')
             return -1

         for sampler, temporary_sampler in zip(self.samplers, temporary_samplers):
-            _logger.info(f'Compute effective sample size and sampling weights for {sampler.target_name}')
+            _logger.info(f'Analyzing {sampler.target_name}...')

             # Get temperature
             temp0 = sampler.temperature._value
             temp1 = temporary_sampler.temperature._value
             assert temp0 == temp1, f'Temperature should be equivalent but got sampler {temp0} K and temporary sampler {temp1} K'
             beta = 1 / (KB_T_KCALPERMOL * temp0)
-            _logger.info(f'beta temperature in kcal/mol: {beta}')
+            #_logger.debug(f'beta temperature in kcal/mol: {beta}')

             # Get position from trajectory
-            baseloader = BaseDataLoader()
+            baseloader = BaseDataLoader(atomSubset=sampler.atomSubset)
             baseloader.load_traj(input_directory_path=sampler.output_directory_path, exclude_n_frames=self.exclude_n_frames)
-            _logger.info(f'Found {baseloader.traj.n_frames} frames in trajectory')
+            _logger.info(f'Found {baseloader.traj.n_frames} frames from {sampler.output_directory_path}')

             # Compute weights and effective sample size
             w_arr = []
             for i in range(baseloader.traj.n_frames):
                 # U(x0, theta0)
                 sampler.simulation.context.setPositions(baseloader.traj.openmm_positions(i))
                 for gid, force in enumerate(sampler.simulation.system.getForces()):
                     if force.getName() in self.force_group_names:
-                        print(f'{force.getName()}')
                         try:
                             potential_energy += sampler.simulation.context.getState(getEnergy=True, groups={gid}).getPotentialEnergy()
                         except:
                             potential_energy = sampler.simulation.context.getState(getEnergy=True, groups={gid}).getPotentialEnergy()
                 # U(x0, theta1)
                 temporary_sampler.simulation.context.setPositions(baseloader.traj.openmm_positions(i))
                 for gid, force in enumerate(temporary_sampler.simulation.system.getForces()):
                     if force.getName() in self.force_group_names:
-                        print(f'{force.getName()}')
                         try:
-                            reduced_potential_energy += sampler.simulation.context.getState(getEnergy=True, groups={gid}).getPotentialEnergy()
+                            reduced_potential_energy += temporary_sampler.simulation.context.getState(getEnergy=True, groups={gid}).getPotentialEnergy()
                         except:
-                            reduced_potential_energy = sampler.simulation.context.getState(getEnergy=True, groups={gid}).getPotentialEnergy()
+                            reduced_potential_energy = temporary_sampler.simulation.context.getState(getEnergy=True, groups={gid}).getPotentialEnergy()
                 # deltaU = U(x0, theta1) - U(x0, theta0)
                 delta = (reduced_potential_energy - potential_energy).value_in_unit(kcalpermol)
                 # w = ln(exp(-beta * delta))
                 w = -1 * beta * delta
                 w_arr.append(w)

                 _logger.info(f'U(x0, theta0): {potential_energy.value_in_unit(kcalpermol):10.3f} kcal/mol')
                 _logger.info(f'U(x0, theta1): {reduced_potential_energy.value_in_unit(kcalpermol):10.3f} kcal/mol')
                 _logger.info(f'deltaU: {delta:10.3f} kcal/mol')
-                _logger.info(f'ln_w: {w:10.3f}')
+                _logger.info(f'w: {w:10.3f}')

             # Compute weights and effective sample size (ratio: 0 to 1)
+            # Prevent RuntimeWarning: overflow encountered in exp
+            w_arr = np.float128(w_arr)
             w_i = np.exp(w_arr) / np.sum(np.exp(w_arr))
             neff = np.sum(w_i) ** 2 / np.sum(w_i ** 2) / len(w_i)
-            _logger.info(f'w_i_sum: {np.sum(w_i):10.3f}')
-            _logger.info(f'neff: {neff:10.3f}')
+            _logger.debug(f'w_i_sum: {np.sum(w_i):10.3f}')
+            _logger.debug(f'neff: {neff:10.3f}')
+
+            # Check if sum of weights is 1
+            sum_w = abs(np.sum(w_i))
+            assert abs(1 - sum_w) <= 0.001, f"Weight sum {sum_w} deviates from 1 by more than tolerance 0.001"

             self.weights_neff_dict[f'{sampler.target_name}'] = {'neff': neff, 'weights': w_i}
-            _logger.info(f'{self.weights_neff_dict}')
+            _logger.debug(f'{self.weights_neff_dict}')
         neffs = [self.weights_neff_dict[key]['neff'] for key in self.weights_neff_dict.keys()]

         return min(neffs)
@@ -141,7 +147,7 @@ def compute_loss(self):
         """
         loss_list = []
         for sampler in self.samplers:
-            _logger.info(f'Compute loss for {sampler.target_name}')
+            _logger.debug(f'Compute loss for {sampler.target_name}')
             loss = self._compute_loss_per_system(sampler) # torch.tensor
             loss_list.append(loss)
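
Note: the sketch below is a standalone editorial illustration, not part of the patches. It shows the per-force-group energy decomposition that patch 08 wires into create_system and that patch 11 queries in get_effective_sample_size, assuming an existing openmm.app.Simulation object named `simulation`. Unlike create_system, which assigns the groups before the Simulation is built, here the groups are changed on a live Context, so reinitialize() is needed for the change to take effect.

    # Sketch: per-force-group potential energies in OpenMM (the cookbook trick
    # referenced in patch 08). All names here are illustrative assumptions.
    for i, force in enumerate(simulation.system.getForces()):
        force.setForceGroup(i)  # one group per Force object
    simulation.context.reinitialize(preserveState=True)  # apply group changes to the live Context
    for i, force in enumerate(simulation.system.getForces()):
        state = simulation.context.getState(getEnergy=True, groups={i})
        print(force.getName(), state.getPotentialEnergy())

Summing only the valence groups (plus NonbondedForce after patch 11) reproduces the U(x0, theta0) and U(x0, theta1) accumulation in get_effective_sample_size.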
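A minimal NumPy-only sketch of the reweighting arithmetic implemented in get_effective_sample_size (patches 08 and 11): Zwanzig-style log-weights w = -beta * deltaU, normalized importance weights, and the Kish effective sample size scaled to the 0-1 range. The function name and inputs are illustrative; the log-sum-exp shift is a standard alternative to the np.float128 cast that patch 11 uses against exp overflow.

    import numpy as np

    def effective_sample_size(delta_u_kcalpermol, temperature_kelvin=300.0):
        """Importance weights and normalized effective sample size.

        delta_u_kcalpermol : per-frame U(x0, theta1) - U(x0, theta0) in kcal/mol.
        Returns (w_i, neff), where w_i sums to 1 and neff lies in (0, 1].
        """
        kb = 0.0019872041  # Boltzmann constant in kcal/mol/K
        beta = 1.0 / (kb * temperature_kelvin)
        log_w = -beta * np.asarray(delta_u_kcalpermol, dtype=np.float64)
        log_w -= log_w.max()  # shift before exponentiating to avoid overflow
        w = np.exp(log_w)
        w_i = w / w.sum()  # normalized weights, sum to 1
        neff = w_i.sum() ** 2 / (w_i ** 2).sum() / len(w_i)  # Kish ESS / N
        return w_i, neff

    w_i, neff = effective_sample_size([0.1, -0.3, 0.8, 0.0])
    print(w_i.sum(), neff)  # ~1.0, and neff near 1 for near-uniform weights

When the parameters barely change, deltaU is near zero, the weights are near uniform, and neff approaches 1; train_sampler re-runs the simulations once neff_min drops below neff_threshold.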
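Finally, a sketch of the gradient-accumulation pattern that train_sampler relies on to avoid the CUDA out-of-memory error documented in patch 08's Notes. This is a simplified standalone example, not the package's API; the only detail taken from the source is that the espaloma net returns a (loss, loss_dict) pair. Note that for accumulated gradients to survive until the update, zero_grad() must run once per epoch rather than once per batch.

    import torch

    def train_one_epoch(net, loader, optimizer):
        """Accumulate per-batch gradients, then take a single optimizer step."""
        optimizer.zero_grad()  # clear once so per-batch gradients accumulate
        accumulation_steps = len(loader)
        for batch in loader:
            loss, _ = net(batch)  # espaloma nets return (loss, loss_dict)
            # Scale so the summed gradients match a full-batch gradient
            (loss / accumulation_steps).backward()
        optimizer.step()  # one parameter update per epoch

Dividing each batch loss by the number of batches keeps the accumulated gradient equal (in expectation) to the gradient of the mean loss, which is why the patches divide both the data loss and the sampler loss by accumulation_steps.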