From ef0420006ee673bbddc11ba70418ef33ae46f7d9 Mon Sep 17 00:00:00 2001 From: kt Date: Fri, 9 Feb 2024 15:25:57 -0500 Subject: [PATCH 01/59] use pint.UnitRegistry to convert units --- espfit/utils/units.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/espfit/utils/units.py b/espfit/utils/units.py index 011dadc..4e480ab 100644 --- a/espfit/utils/units.py +++ b/espfit/utils/units.py @@ -1,3 +1,13 @@ -# Constants for unit conversions -HARTEE_TO_KCALPERMOL = 627.509 -BOHR_TO_ANGSTROMS = 0.529177 \ No newline at end of file +from pint import UnitRegistry + +# Define pint unit registry +ureg = UnitRegistry() +hartree = 1 * ureg.hartree +bohr = 1 * ureg.bohr +angstrom = 1 * ureg.angstrom + +# Conversion factors +#HARTEE_TO_KCALPERMOL = 627.509 +#BOHR_TO_ANGSTROMS = 0.529177 +HARTREE_TO_KCALMOL = hartree.to(ureg.kilocalorie/(ureg.avogadro_constant*ureg.mole)).magnitude +BOHR_TO_ANGSTROMS = bohr.to(ureg.angstrom).magnitude From b63147fec68dd4339f9fdd893833260d766d9fe9 Mon Sep 17 00:00:00 2001 From: kt Date: Mon, 12 Feb 2024 13:44:09 -0500 Subject: [PATCH 02/59] change units.py --- espfit/utils/units.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/espfit/utils/units.py b/espfit/utils/units.py index 4e480ab..ccc236a 100644 --- a/espfit/utils/units.py +++ b/espfit/utils/units.py @@ -9,5 +9,5 @@ # Conversion factors #HARTEE_TO_KCALPERMOL = 627.509 #BOHR_TO_ANGSTROMS = 0.529177 -HARTREE_TO_KCALMOL = hartree.to(ureg.kilocalorie/(ureg.avogadro_constant*ureg.mole)).magnitude +HARTREE_TO_KCALPERMOL = hartree.to(ureg.kilocalorie/(ureg.avogadro_constant*ureg.mole)).magnitude BOHR_TO_ANGSTROMS = bohr.to(ureg.angstrom).magnitude From 3176da2d1f4c07b9aba3703d7d069f344523590c Mon Sep 17 00:00:00 2001 From: kt Date: Mon, 12 Feb 2024 13:46:28 -0500 Subject: [PATCH 03/59] add implementation error for other systems in experiment.py --- espfit/app/experiment.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/espfit/app/experiment.py b/espfit/app/experiment.py index 86165a7..c574fcd 100644 --- a/espfit/app/experiment.py +++ b/espfit/app/experiment.py @@ -242,4 +242,15 @@ def get_available_couplings(self): import barnaba as bb available_coupling_names = list(bb.definitions.couplings_idx.keys()) return available_coupling_names - \ No newline at end of file + + +class ProteinSystem(BaseDataLoader): + def __init__(self, **kwargs): + super(ProteinSystem, self).__init__(**kwargs) + raise NotImplementedError("ProteinSystem class is not implemented yet.") + + +class ProteinLigandSystem(BaseDataLoader): + def __init__(self, **kwargs): + super(ProteinLigandSystem, self).__init__(**kwargs) + raise NotImplementedError("ProteinLigandSystem class is not implemented yet.") From 12a749e7ea0b7478ea0df0371a1100441317ce78 Mon Sep 17 00:00:00 2001 From: kt Date: Mon, 12 Feb 2024 13:46:58 -0500 Subject: [PATCH 04/59] add draft for train.py --- espfit/app/train.py | 173 +++++++++++++++++++++++++++++++++----------- 1 file changed, 129 insertions(+), 44 deletions(-) diff --git a/espfit/app/train.py b/espfit/app/train.py index aa35a08..a44e03f 100644 --- a/espfit/app/train.py +++ b/espfit/app/train.py @@ -70,7 +70,7 @@ def __init__(self, net=None, dataset_train=None, dataset_validation=None, datase self.dataset_validation = dataset_validation self.dataset_test = dataset_test self.random_seed = random_seed - self.config = None # TODO: Better way to handle this? 
+ self.config = config if output_directory_path is None: import os self.output_directory_path = os.getcwd() @@ -103,12 +103,7 @@ def from_toml(cls, filename): except FileNotFoundError as e: print(e) raise - #model = cls.create_model(config['espaloma']) - - # TODO: Better way to handle this? - #model = cls(model) - #model.config = config - + model = cls() net = model.create_model(config['espaloma']) model.net = net @@ -234,49 +229,14 @@ def _restart_checkpoint(self, output_directory_path): restart_epoch = 0 return restart_epoch - - - def train(self, epochs=1000, batch_size=128, learning_rate=1e-4, checkpoint_frequency=10, output_directory_path=None): - """ - Train the Espaloma network model. - - This method trains the Espaloma network model using the training dataset. The training process can be customized - by specifying the number of epochs, batch size, learning rate, checkpoint frequency, and an output directory. - The method also supports restarting the training from a checkpoint. - Parameters - ---------- - epochs : int, default=1000 - The number of epochs to train the model for. - - batch_size : int, default=128 - The number of samples per batch. - - learning_rate : float, default=1e-4 - The learning rate for the optimizer. - - checkpoint_frequency : int, default=10 - The frequency at which the model should be saved. - output_directory_path : str, default=None - The directory where the model checkpoints should be saved. If None, the default output directory is used. - - Returns - ------- - None - """ + def _get_train_parameters(self, output_directory_path): import os import torch - from pathlib import Path if torch.cuda.is_available(): _logger.info('GPU is available for training.') - - # Change default device to GPU if available - # Will this map all data onto GPU and cause memory error if the data is too large? - # https://pytorch.org/tutorials/recipes/recipes/changing_default_device.html - - #torch.set_default_device('cuda') else: _logger.info('GPU is not available for training.') @@ -305,10 +265,48 @@ def train(self, epochs=1000, batch_size=128, learning_rate=1e-4, checkpoint_freq else: _logger.info(f'Training from scratch for {epochs} epochs.') + return restart_epoch, epochs, batch_size, learning_rate, checkpoint_frequency + + + def train(self, epochs=1000, batch_size=128, learning_rate=1e-4, checkpoint_frequency=10, output_directory_path=None): + """ + Train the Espaloma network model. + + This method trains the Espaloma network model using the training dataset. The training process can be customized + by specifying the number of epochs, batch size, learning rate, checkpoint frequency, and an output directory. + The method also supports restarting the training from a checkpoint. + + Parameters + ---------- + epochs : int, default=1000 + The number of epochs to train the model for. + + batch_size : int, default=128 + The number of samples per batch. + + learning_rate : float, default=1e-4 + The learning rate for the optimizer. + + checkpoint_frequency : int, default=10 + The frequency at which the model should be saved. + + output_directory_path : str, default=None + The directory where the model checkpoints should be saved. If None, the default output directory is used. 
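+            If the directory does not exist, it is created.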
+ + Returns + ------- + None + """ + import os + import torch + from espfit.utils.units import HARTREE_TO_KCALPERMOL + + # Get training parameters + restart_epoch, epochs, batch_size, learning_rate, checkpoint_frequency = self._get_train_parameters(epochs=1000, batch_size=128, learning_rate=1e-4, checkpoint_frequency=10, output_directory_path) + # Train # https://github.com/choderalab/espaloma/blob/main/espaloma/app/train.py#L33 # https://github.com/choderalab/espaloma/blob/main/espaloma/data/dataset.py#L310 - from espfit.utils.units import HARTEE_TO_KCALPERMOL ds_tr_loader = self.dataset_train.view(collate_fn='graph', batch_size=batch_size, shuffle=True) optimizer = torch.optim.Adam(self.net.parameters(), lr=learning_rate) with torch.autograd.set_detect_anomaly(True): @@ -332,7 +330,94 @@ def train(self, epochs=1000, batch_size=128, learning_rate=1e-4, checkpoint_freq _logger.info(f'epoch {epoch}: {_loss:.3f}') checkpoint_file = os.path.join(output_directory_path, f"net{epoch}.pt") torch.save(self.net.state_dict(), checkpoint_file) + + + def train_reweight(self, epochs=1000, batch_size=128, learning_rate=1e-4, checkpoint_frequency=10, output_directory_path=None, + system_type='RNA', neff_threshold=0.2, filename=None, maxIterations=10, nsteps=10): + import os + import torch + from espfit.utils.units import HARTREE_TO_KCALPERMOL + from espfit.app.sampler import SetupSampler + + if system_type == 'RNA': + from espfit.app.experiment import RNASystem + else: + raise NotImplementedError("Only RNA system is supported at the moment.") + + # Get training parameters + restart_epoch, epochs, batch_size, learning_rate, checkpoint_frequency = self._get_train_parameters(epochs=1000, batch_size=128, learning_rate=1e-4, checkpoint_frequency=10, output_directory_path) + + # Run MD simulation + if restart_epoch == 0: + _logger.info('Running MD simulation for the first time.') + sampler = SetupSampler() + sampler.create_system(biopolymer_file=filename) + sampler.minimize(maxIterations) + sampler.run(nsteps) + + sampler_output_directory_path = os.path.join(output_directory_path, "md", restart_epoch) + sampler.export_xml(output_directory_path=sampler_output_directory_path) + + # Bookkeep last output directory + old_sampler_output_directory_path = sampler_output_directory_path + + target = RNASystem() + target.load_traj(input_directory_path=sampler_output_directory_path) + obs = target.compute_jcouplings() + _logger.info(f'Computed observable: {obs}') + + # Train + ds_tr_loader = self.dataset_train.view(collate_fn='graph', batch_size=batch_size, shuffle=True) + optimizer = torch.optim.Adam(self.net.parameters(), lr=learning_rate) + with torch.autograd.set_detect_anomaly(True): + for i in range(restart_epoch, epochs): + epoch = i + 1 # Start from epoch 1 (not zero-indexing) + for g in ds_tr_loader: + optimizer.zero_grad() + + if torch.cuda.is_available(): + g = g.to("cuda:0") + + g.nodes["n1"].data["xyz"].requires_grad = True + + # QC loss + loss += self.net(g) + + + # Compute MD loss + neff = 0.5 + if neff < neff_threshold: + # Re-run MD simulation + sampler_output_directory_path = os.path.join(output_directory_path, "md", restart_epoch) + sampler = SetupSampler.from_xml(input_directory_path=old_sampler_output_directory_path, output_directory_path=sampler_output_directory_path) + sampler.minimize(maxIterations) + sampler.run(nsteps) + # Check if new observable is computed + target = RNASystem() + target.load_traj(input_directory_path=sampler_output_directory_path) + obs = target.compute_jcouplings() + # Update 
directory + old_sampler_output_directory_path = sampler_output_directory_path + else: + pass + + + # Update weights + loss.backward() + optimizer.step() + + if epoch % checkpoint_frequency == 0: + # Note: returned loss is a joint loss of different units. + _loss = HARTEE_TO_KCALPERMOL * loss.pow(0.5).item() + _logger.info(f'epoch {epoch}: {_loss:.3f}') + checkpoint_file = os.path.join(output_directory_path, f"net{epoch}.pt") + torch.save(self.net.state_dict(), checkpoint_file) + + + + def compute_md_loss(self, couplings): + pass def validate(): raise NotImplementedError From 47f00ce0b9d42b101943cc5267dc5180cd719bd5 Mon Sep 17 00:00:00 2001 From: kt Date: Thu, 15 Feb 2024 15:33:34 -0500 Subject: [PATCH 05/59] change logging info in espfit.utils.graphs.drop_and_merge_duplicates --- espfit/utils/graphs.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/espfit/utils/graphs.py b/espfit/utils/graphs.py index 914efb3..3ab1fd0 100644 --- a/espfit/utils/graphs.py +++ b/espfit/utils/graphs.py @@ -459,7 +459,7 @@ def reshape_conformation_size(self, n_confs=50): ------- None """ - _logger.info(f'Reshape graphs size') + _logger.info(f'Reshape graph size') import random import copy @@ -473,13 +473,13 @@ def reshape_conformation_size(self, n_confs=50): n = g.nodes['n1'].data['xyz'].shape[1] if n == n_confs: - _logger.info(f"Molecule #{i} ({n} conformations)") + _logger.info(f"Mol #{i} ({n} conformations)") new_graphs.append(g) elif n < n_confs: random.seed(self.random_seed) index_random = random.choices(range(0, n), k=n_confs-n) - _logger.info(f"Molecule #{i} ({n} conformations). Randomly select {len(index_random)} conformations") + _logger.info(f"Randomly select {len(index_random)} conformations from Mol #{i} ({n} conformations)") _g = copy.deepcopy(g) _g.nodes["g"].data["u_ref"] = torch.cat((_g.nodes['g'].data['u_ref'], _g.nodes['g'].data['u_ref'][:, index_random]), dim=-1) @@ -488,7 +488,7 @@ def reshape_conformation_size(self, n_confs=50): new_graphs.append(_g) else: - _logger.info(f"Molecule #{i} ({n} conformations). 
Shuffle indices and split data into chunks") + _logger.info(f"Shuffling Mol #{i} ({n} conformations) and splitting into {n_confs}") random.seed(self.random_seed) idx_range = random.sample(range(n), k=n) for j in range(n // n_confs + 1): From f62bfd4ada10e87e7d2baf9eab7ca745aab04e6f Mon Sep 17 00:00:00 2001 From: kt Date: Thu, 15 Feb 2024 15:58:10 -0500 Subject: [PATCH 06/59] first commit espfit/utils/sampler --- espfit/utils/sampler/__init__.py | 0 espfit/utils/sampler/module.py | 48 ++++++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+) create mode 100644 espfit/utils/sampler/__init__.py create mode 100644 espfit/utils/sampler/module.py diff --git a/espfit/utils/sampler/__init__.py b/espfit/utils/sampler/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/espfit/utils/sampler/module.py b/espfit/utils/sampler/module.py new file mode 100644 index 0000000..d6e2492 --- /dev/null +++ b/espfit/utils/sampler/module.py @@ -0,0 +1,48 @@ +import logging + +_logger = logging.getLogger(__name__) + + +def check_effective_sample_size(): + # Compute effective sample size + neff = 0.5 + + return neff + + +def run_sampler(sampler_output_directory_path, biopolymer_file, ligand_file, maxIterations, nsteps, small_molecule_forcefield): + import os + from espfit.app.sampler import SetupSampler + + c = SetupSampler(output_directory_path=sampler_output_directory_path, small_molecule_forcefield=small_molecule_forcefield) + c.create_system(biopolymer_file, ligand_file) + c.minimize(maxIterations) + c.run(nsteps=nsteps) + c.export_xml() + + +def compute_observable(input_directory_path): + from espfit.app.experiment import RNASystem + target = RNASystem() + target.load_traj(input_directory_path=input_directory_path) + val = target.compute_jcouplings() + _logger.info(f'Computed observable: {val}') + + import os + import yaml + with open(os.path.join(input_directory_path, 'observable.yaml'), 'w') as f: + yaml.dump(val, f, allow_unicode=True) + + return val + + +def compute_loss(input_directory_path): + # Compute observable + val = compute_observable(input_directory_path) + _logger.info(f'Computed observable: {val}') + + # Compute loss + import torch + loss = torch.tensor(0.0) + + return loss From cd23fff4b792da1382bf62f804d7977778da459a Mon Sep 17 00:00:00 2001 From: kt Date: Thu, 15 Feb 2024 15:59:12 -0500 Subject: [PATCH 07/59] use pint.UnitRegistry to define units --- espfit/utils/units.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/espfit/utils/units.py b/espfit/utils/units.py index ccc236a..7082d9a 100644 --- a/espfit/utils/units.py +++ b/espfit/utils/units.py @@ -8,6 +8,6 @@ # Conversion factors #HARTEE_TO_KCALPERMOL = 627.509 -#BOHR_TO_ANGSTROMS = 0.529177 +#BOHR_TO_ANGSTROMS = 0.529 HARTREE_TO_KCALPERMOL = hartree.to(ureg.kilocalorie/(ureg.avogadro_constant*ureg.mole)).magnitude BOHR_TO_ANGSTROMS = bohr.to(ureg.angstrom).magnitude From 91ac11803a1d41a77657909db1235723101d74a3 Mon Sep 17 00:00:00 2001 From: kt Date: Thu, 15 Feb 2024 16:11:22 -0500 Subject: [PATCH 08/59] remove espaloma train settings and add sampler settings to config.toml --- espfit/data/config/config.toml | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/espfit/data/config/config.toml b/espfit/data/config/config.toml index 1cab4a8..d020d18 100644 --- a/espfit/data/config/config.toml +++ b/espfit/data/config/config.toml @@ -1,7 +1,7 @@ # configuration for gnn [espaloma.gnn] method = "SAGEConv" -aggregator_type = 'mean' +aggregator_type = "mean" 
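+# aggregator_type: how SAGEConv pools neighbor features; besides "mean",
+# DGL's SAGEConv also accepts "gcn", "pool", and "lstm".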
feat_drop = 0.1 # configuration for stage 1 (gnn) & 2 (janossy pooling) @@ -9,13 +9,6 @@ feat_drop = 0.1 stage1 = [ 512, "relu", 0.1, 512, "relu", 0.1, 512, "relu", 0.1 ] # (units, activation, dropout) stage2 = [ 512, "relu", 0.1, 512, "relu", 0.1, 512, "relu", 0.1, 512, "relu", 0.1 ] # (units, activation, dropout) -# training -[espaloma.train] -epochs = 20 -batch_size = 128 -learning_rate = 1e-4 -checkpoint_frequency = 10 - # loss weights [espaloma.weights] energy = 1.0 @@ -23,3 +16,22 @@ force = 1.0 charge = 1.0 torsion = 1.0 improper = 1.0 + +# system setup parameters +[sampler.setup] +water_model = tip3p +solvent_padding = 9.0 * angstroms +ionic_strength = 0.15 * molar +temperature = 300 * kelvin + +# sampler production +[sampler.run] +maxIterations = 100 +nsteps = 100 +checkpoint_frequency = 25000 +logging_frequency = 250000 +netcdf_frequency = 250000 + +# reweighting settings +[sampler.reweight] +neff = 0.2 From 08f43192cd2c57987a68dbfeed86fe7cd262db1e Mon Sep 17 00:00:00 2001 From: kt Date: Thu, 15 Feb 2024 16:23:34 -0500 Subject: [PATCH 09/59] use local variables instead of instance attributes --- espfit/app/sampler.py | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/espfit/app/sampler.py b/espfit/app/sampler.py index 1eb1de7..19d1ead 100644 --- a/espfit/app/sampler.py +++ b/espfit/app/sampler.py @@ -113,7 +113,7 @@ def minimize(self, maxIterations=100): ------- None """ - _logger.info(f"Minimizing system...") + _logger.info(f"Minimizing system for maximum {maxIterations} steps.") self.simulation.minimizeEnergy(maxIterations) @@ -144,23 +144,18 @@ def run(self, checkpoint_frequency=25000, logging_frequency=250000, netcdf_frequ ------- None """ - self.checkpoint_frequency = checkpoint_frequency - self.logging_frequency = logging_frequency - self.netcdf_frequency = netcdf_frequency - self.nsteps = nsteps - self.atom_indices = atom_indices if output_directory_path is not None: self.output_directory_path = output_directory_path # property decorator is called # Select atoms to save import mdtraj - if self.atom_indices is None: - self.atom_indices = [] + if atom_indices is None: + atom_indices = [] mdtop = mdtraj.Topology.from_openmm(self.simulation.topology) res = [ r for r in mdtop.residues if r.name not in ('HOH', 'NA', 'CL', 'K') ] for r in res: for a in r.atoms: - self.atom_indices.append(a.index) + atom_indices.append(a.index) # Define reporter from mdtraj.reporters import NetCDFReporter @@ -168,22 +163,22 @@ def run(self, checkpoint_frequency=25000, logging_frequency=250000, netcdf_frequ self._check_file_exists("traj.nc") self.simulation.reporters.append(NetCDFReporter(os.path.join(self.output_directory_path, f"traj.nc"), - min(self.netcdf_frequency, self.nsteps), - atomSubset=self.atom_indices)) + min(netcdf_frequency, nsteps), + atomSubset=atom_indices)) self._check_file_exists("checkpoint.chk") self.simulation.reporters.append(CheckpointReporter(os.path.join(self.output_directory_path, f"checkpoint.chk"), - min(self.checkpoint_frequency, self.nsteps))) + min(checkpoint_frequency, nsteps))) self._check_file_exists("reporter.log") self.simulation.reporters.append(StateDataReporter(os.path.join(self.output_directory_path, f"reporter.log"), - min(self.logging_frequency, self.nsteps), + min(logging_frequency, nsteps), step=True, potentialEnergy=True, kineticEnergy=True, totalEnergy=True, temperature=True, volume=True, density=True, speed=True)) # Run - _logger.info(f"Run MD simulation for {self.nsteps} steps") - 
self.simulation.step(self.nsteps) + _logger.info(f"Run MD simulation for {nsteps} steps") + self.simulation.step(nsteps) def export_xml(self, exportSystem=True, exportState=True, exportIntegrator=True, output_directory_path=None): From cef64974b558730e33efe063f0b4a94715f39b42 Mon Sep 17 00:00:00 2001 From: kt Date: Thu, 15 Feb 2024 16:24:39 -0500 Subject: [PATCH 10/59] add train_weight method --- espfit/app/train.py | 254 +++++++++++++++++++------------------------- 1 file changed, 112 insertions(+), 142 deletions(-) diff --git a/espfit/app/train.py b/espfit/app/train.py index a44e03f..99e2a53 100644 --- a/espfit/app/train.py +++ b/espfit/app/train.py @@ -6,6 +6,7 @@ * Add support to use multiple GPUs * Add support to validate model? (or use independent script?) * Add support to save model? (or use independent script?) +* Improve how data are parsed using dataclasses or pydantic """ import logging @@ -33,7 +34,8 @@ class EspalomaModel(object): >>> model.train() """ - def __init__(self, net=None, dataset_train=None, dataset_validation=None, dataset_test=None, random_seed=2666, config=None, output_directory_path=None): + def __init__(self, net=None, dataset_train=None, dataset_validation=None, dataset_test=None, random_seed=2666, output_directory_path=None, + epochs=1000, batch_size=128, learning_rate=1e-4, checkpoint_frequency=10): """Initialize an instance of the class with an Espaloma network model and a random seed. This constructor method sets up the Espaloma network model, the training, validation, test datasets, @@ -44,8 +46,8 @@ def __init__(self, net=None, dataset_train=None, dataset_validation=None, datase Parameters ---------- net : torch.nn.Sequential, default=None - The Espaloma network model to be used for training. - + The Espaloma network model to be used for training. + dataset_train : espfit.utils.data.graphs.CustomGraphDataset or espaloma.data.dataset.GraphDataset, default=None The training dataset. espaloma.graphs.graph.Graph. If not provided, the `train_data` attribute will be set to None. @@ -58,22 +60,46 @@ def __init__(self, net=None, dataset_train=None, dataset_validation=None, datase random_seed : int, default=2666 The random seed used throughout the espaloma training. - config : dict, default=None - The configuration for the espaloma model. If not provided, the `config` attribute will be set to None. - output_directory_path : str, default=None The directory where the model checkpoints should be saved. If not provided, the checkpoints will be saved in the current working directory. + + epochs : int, default=1000 + The number of epochs to train the model for. + + batch_size : int, default=128 + The number of samples per batch. + + learning_rate : float, default=1e-4 + The learning rate for the optimizer. + + checkpoint_frequency : int, default=10 + The frequency at which the model should be saved. 
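+
+        Examples
+        --------
+        >>> # Hypothetical usage; `net` and `ds` are assumed to be built elsewhere.
+        >>> model = EspalomaModel(net=net, dataset_train=ds, epochs=100, batch_size=64)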
""" - self.net = net + import os + import torch self.dataset_train = dataset_train self.dataset_validation = dataset_validation self.dataset_test = dataset_test + self.net = net self.random_seed = random_seed - self.config = config + self.epochs = epochs + self.batch_size = batch_size + self.learning_rate = learning_rate + self.checkpoint_frequency = checkpoint_frequency if output_directory_path is None: - import os self.output_directory_path = os.getcwd() + else: + self.output_directory_path = output_directory_path + + # Check if GPU is available + if torch.cuda.is_available(): + _logger.info('GPU is available for training.') + else: + _logger.info('GPU is not available for training.') + + # Check torch data type + _logger.info(f'Torch data type is {torch.get_default_dtype()}') @classmethod @@ -107,7 +133,6 @@ def from_toml(cls, filename): model = cls() net = model.create_model(config['espaloma']) model.net = net - model.config = config return model @@ -195,17 +220,12 @@ def create_model(espaloma_config): return net - def _restart_checkpoint(self, output_directory_path): + def _load_checkpoint(self): """Load the last checkpoint and restart the training process. - This method finds all the checkpoint files in the directory specified by `output_directory_path`, - loads the last checkpoint (e.g. net100.pt), and restarts the training process from the next step. If no - checkpoint files are found, the training process starts from the first step. - - Parameters - ---------- - output_directory_path : str - The directory where the checkpoint files are stored. + This method finds all the checkpoint files in the output directory, loads the + last checkpoint (e.g. net100.pt), and restarts the training process from the next step. + If no checkpoint files are found, the training process starts from the first step. Returns ------- @@ -213,83 +233,39 @@ def _restart_checkpoint(self, output_directory_path): The step from which the training process should be restarted. 
""" import os + import sys import glob import torch - checkpoints = glob.glob("{}/*.pt".format(output_directory_path)) + checkpoints = glob.glob("{}/*.pt".format(self.output_directory_path)) if checkpoints: n = [ int(c.split('net')[1].split('.')[0]) for c in checkpoints ] n.sort() restart_epoch = n[-1] - restart_checkpoint = os.path.join(output_directory_path, f"net{restart_epoch}.pt") + restart_checkpoint = os.path.join(self.output_directory_path, f"net{restart_epoch}.pt") self.net.load_state_dict(torch.load(restart_checkpoint)) logging.info(f'Restarting from ({restart_checkpoint}).') else: restart_epoch = 0 - return restart_epoch - - - def _get_train_parameters(self, output_directory_path): - import os - import torch - - if torch.cuda.is_available(): - _logger.info('GPU is available for training.') - else: - _logger.info('GPU is not available for training.') - - # Check if training dataset is provided - if self.dataset_train is None: - raise ValueError('Training dataset is not provided.') - - # Espaloma settings for training - config = self.config['espaloma']['train'] - epochs = config.get('epochs', epochs) - batch_size = config.get('batch_size', batch_size) - learning_rate = config.get('learning_rate', learning_rate) - checkpoint_frequency = config.get('checkpoint_frequency', checkpoint_frequency) - if output_directory_path is not None: - self.output_directory_path = output_directory_path - # Create output directory if not exists - os.makedirs(output_directory_path, exist_ok=True) - - # Restart from checkpoint if exists - restart_epoch = self._restart_checkpoint(output_directory_path) - if restart_epoch >= epochs: - _logger.info(f'Already trained for {epochs} epochs.') - return + if restart_epoch >= self.epochs: + _logger.info(f'Already trained for {self.epochs} epochs.') + sys.exit(0) elif restart_epoch > 0: - _logger.info(f'Training for additional {epochs-restart_epoch} epochs.') + _logger.info(f'Training for additional {self.epochs-restart_epoch} epochs.') else: - _logger.info(f'Training from scratch for {epochs} epochs.') + _logger.info(f'Training from scratch for {self.epochs} epochs.') + + return restart_epoch - return restart_epoch, epochs, batch_size, learning_rate, checkpoint_frequency - - def train(self, epochs=1000, batch_size=128, learning_rate=1e-4, checkpoint_frequency=10, output_directory_path=None): + def train(self, output_directory_path=None): """ Train the Espaloma network model. - This method trains the Espaloma network model using the training dataset. The training process can be customized - by specifying the number of epochs, batch size, learning rate, checkpoint frequency, and an output directory. - The method also supports restarting the training from a checkpoint. - Parameters ---------- - epochs : int, default=1000 - The number of epochs to train the model for. - - batch_size : int, default=128 - The number of samples per batch. - - learning_rate : float, default=1e-4 - The learning rate for the optimizer. - - checkpoint_frequency : int, default=10 - The frequency at which the model should be saved. - output_directory_path : str, default=None The directory where the model checkpoints should be saved. If None, the default output directory is used. 
@@ -301,16 +277,23 @@ def train(self, epochs=1000, batch_size=128, learning_rate=1e-4, checkpoint_freq import torch from espfit.utils.units import HARTREE_TO_KCALPERMOL - # Get training parameters - restart_epoch, epochs, batch_size, learning_rate, checkpoint_frequency = self._get_train_parameters(epochs=1000, batch_size=128, learning_rate=1e-4, checkpoint_frequency=10, output_directory_path) + if self.dataset_train is None: + raise ValueError('Training dataset is not provided.') + + if output_directory_path is not None: + self.output_directory_path = output_directory_path + os.makedirs(self.output_directory_path, exist_ok=True) + + # Load checkpoint + restart_epoch = self._load_checkpoint() # Train # https://github.com/choderalab/espaloma/blob/main/espaloma/app/train.py#L33 # https://github.com/choderalab/espaloma/blob/main/espaloma/data/dataset.py#L310 - ds_tr_loader = self.dataset_train.view(collate_fn='graph', batch_size=batch_size, shuffle=True) - optimizer = torch.optim.Adam(self.net.parameters(), lr=learning_rate) + ds_tr_loader = self.dataset_train.view(collate_fn='graph', batch_size=self.batch_size, shuffle=True) + optimizer = torch.optim.Adam(self.net.parameters(), lr=self.learning_rate) with torch.autograd.set_detect_anomaly(True): - for i in range(restart_epoch, epochs): + for i in range(restart_epoch, self.epochs): epoch = i + 1 # Start from epoch 1 (not zero-indexing) for g in ds_tr_loader: optimizer.zero_grad() @@ -324,101 +307,88 @@ def train(self, epochs=1000, batch_size=128, learning_rate=1e-4, checkpoint_freq loss.backward() optimizer.step() - if epoch % checkpoint_frequency == 0: + if epoch % self.checkpoint_frequency == 0: # Note: returned loss is a joint loss of different units. - _loss = HARTEE_TO_KCALPERMOL * loss.pow(0.5).item() + _loss = HARTREE_TO_KCALPERMOL * loss.pow(0.5).item() _logger.info(f'epoch {epoch}: {_loss:.3f}') checkpoint_file = os.path.join(output_directory_path, f"net{epoch}.pt") torch.save(self.net.state_dict(), checkpoint_file) - - def train_reweight(self, epochs=1000, batch_size=128, learning_rate=1e-4, checkpoint_frequency=10, output_directory_path=None, - system_type='RNA', neff_threshold=0.2, filename=None, maxIterations=10, nsteps=10): + + def train_sampler(self, output_directory_path=None, + biopolymer_file=None, ligand_file=None, small_molecule_forcefield=None, + sampler_patience=800, maxIterations=10, nsteps=10, neff_threshold=0.2): import os import torch from espfit.utils.units import HARTREE_TO_KCALPERMOL - from espfit.app.sampler import SetupSampler + from espfit.utils.sampler import module - if system_type == 'RNA': - from espfit.app.experiment import RNASystem - else: - raise NotImplementedError("Only RNA system is supported at the moment.") - - # Get training parameters - restart_epoch, epochs, batch_size, learning_rate, checkpoint_frequency = self._get_train_parameters(epochs=1000, batch_size=128, learning_rate=1e-4, checkpoint_frequency=10, output_directory_path) - - # Run MD simulation - if restart_epoch == 0: - _logger.info('Running MD simulation for the first time.') - sampler = SetupSampler() - sampler.create_system(biopolymer_file=filename) - sampler.minimize(maxIterations) - sampler.run(nsteps) - - sampler_output_directory_path = os.path.join(output_directory_path, "md", restart_epoch) - sampler.export_xml(output_directory_path=sampler_output_directory_path) - - # Bookkeep last output directory - old_sampler_output_directory_path = sampler_output_directory_path - - target = RNASystem() - 
target.load_traj(input_directory_path=sampler_output_directory_path) - obs = target.compute_jcouplings() - _logger.info(f'Computed observable: {obs}') + # Parameters for sampling and reweighting + self.biopolymer_file = biopolymer_file + self.ligand_file = ligand_file + self.sampler_patience = sampler_patience + self.maxIterations = maxIterations + self.nsteps = nsteps + self.neff_threshold = neff_threshold + self.small_molecule_forcefield = small_molecule_forcefield + + if self.dataset_train is None: + raise ValueError('Training dataset is not provided.') + + if output_directory_path is not None: + self.output_directory_path = output_directory_path + os.makedirs(self.output_directory_path, exist_ok=True) + + # Load checkpoint + restart_epoch = self._load_checkpoint() + + # Initialize neff to -1 to trigger the first sampling + neff = -1 # Train - ds_tr_loader = self.dataset_train.view(collate_fn='graph', batch_size=batch_size, shuffle=True) - optimizer = torch.optim.Adam(self.net.parameters(), lr=learning_rate) + ds_tr_loader = self.dataset_train.view(collate_fn='graph', batch_size=self.batch_size, shuffle=True) + optimizer = torch.optim.Adam(self.net.parameters(), lr=self.learning_rate) with torch.autograd.set_detect_anomaly(True): - for i in range(restart_epoch, epochs): + for i in range(restart_epoch, self.epochs): epoch = i + 1 # Start from epoch 1 (not zero-indexing) + loss = torch.tensor(0.0) for g in ds_tr_loader: optimizer.zero_grad() - + if torch.cuda.is_available(): g = g.to("cuda:0") - - g.nodes["n1"].data["xyz"].requires_grad = True - # QC loss + g.nodes["n1"].data["xyz"].requires_grad = True loss += self.net(g) + # Run sampling + if epoch > self.sampler_patience: + if neff < self.neff_threshold: + _logger.info(f'Effective sample size ({neff}) below threshold ({self.neff_threshold}).') + # Create system and run sampling, instead of restarting from previous checkpoint + _logger.info(f'Run simulation...') + sampler_output_directory_path = os.path.join(self.output_directory_path, "sampler", str(epoch)) + module.run_sampler(sampler_output_directory_path, self.biopolymer_file, self.ligand_file, self.maxIterations, self.nsteps, self.small_molecule_forcefield) - # Compute MD loss - neff = 0.5 - if neff < neff_threshold: - # Re-run MD simulation - sampler_output_directory_path = os.path.join(output_directory_path, "md", restart_epoch) - sampler = SetupSampler.from_xml(input_directory_path=old_sampler_output_directory_path, output_directory_path=sampler_output_directory_path) - sampler.minimize(maxIterations) - sampler.run(nsteps) - # Check if new observable is computed - target = RNASystem() - target.load_traj(input_directory_path=sampler_output_directory_path) - obs = target.compute_jcouplings() - # Update directory - old_sampler_output_directory_path = sampler_output_directory_path - else: - pass + # Compute MD loss + _logger.info(f'Compute sampler loss.') + sampler_loss = module.compute_loss(input_directory_path=sampler_output_directory_path) + # Add MD loss to the joint loss + loss += sampler_loss # Update weights loss.backward() optimizer.step() - if epoch % checkpoint_frequency == 0: + if epoch % self.checkpoint_frequency == 0: # Note: returned loss is a joint loss of different units. 
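+                    # loss.pow(0.5) takes a square root of the joint loss before
+                    # converting to kcal/mol for logging purposes.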
-                    _loss = HARTEE_TO_KCALPERMOL * loss.pow(0.5).item()
+                    _loss = HARTREE_TO_KCALPERMOL * loss.pow(0.5).item()
                     _logger.info(f'epoch {epoch}: {_loss:.3f}')
-                    checkpoint_file = os.path.join(output_directory_path, f"net{epoch}.pt")
+                    checkpoint_file = os.path.join(self.output_directory_path, f"net{epoch}.pt")
                     torch.save(self.net.state_dict(), checkpoint_file)
 
-
-    def compute_md_loss(self, couplings):
-
-        pass
-
     def validate():
         raise NotImplementedError

From d72a53148812f174d18c7f6c76e30abd04e4d03c Mon Sep 17 00:00:00 2001
From: kt
Date: Thu, 15 Feb 2024 16:25:33 -0500
Subject: [PATCH 11/59] first commit test_app_train_sampler.py

---
 espfit/tests/test_app_train_sampler.py | 67 ++++++++++++++++++++++++++
 1 file changed, 67 insertions(+)
 create mode 100644 espfit/tests/test_app_train_sampler.py

diff --git a/espfit/tests/test_app_train_sampler.py b/espfit/tests/test_app_train_sampler.py
new file mode 100644
index 0000000..222f86c
--- /dev/null
+++ b/espfit/tests/test_app_train_sampler.py
@@ -0,0 +1,67 @@
+import pytest
+from importlib.resources import files
+from espfit.utils.graphs import CustomGraphDataset
+from espfit.app.train import EspalomaModel
+
+
+@pytest.fixture
+def test_create_espaloma_model():
+    """Test function to load a TOML configuration file and create an EspalomaModel object.
+
+    Returns
+    -------
+    model : espfit.app.train.EspalomaModel
+        The created EspalomaModel object.
+    """
+    filename = files('espfit').joinpath('data/config/config.toml')   # PosixPath
+    model = EspalomaModel.from_toml(str(filename))
+
+    return model
+
+
+@pytest.fixture
+def test_load_dataset(tmpdir):
+    """Test function to load a dataset and prepare it for training.
+
+    Parameters
+    ----------
+    tmpdir : py.path.local
+        Temporary directory fixture provided by pytest.
+
+    Notes
+    -----
+    This function is not intended for production use. It is a minimal example for testing purposes.
+
+    Returns
+    -------
+    ds : espfit.utils.graphs.CustomGraphDataset
+        The loaded dataset.
+ """ + # load dataset + path = 'data/qcdata/openff-toolkit-0.10.6/dgl2/gen2-torsion-sm' + mydata = files('espfit').joinpath(path) + ds = CustomGraphDataset.load(str(mydata)) + + # Prepare input dataset ready for training + temporary_directory = tmpdir.mkdir('misc') + ds.drop_and_merge_duplicates(save_merged_dataset=True, dataset_name='misc', output_directory_path=str(temporary_directory)) + ds.reshape_conformation_size(n_confs=50) + ds.compute_relative_energy() + + return ds + + +def test_train_sampler(test_load_dataset, test_create_espaloma_model): + + # Load dataset and model + ds = test_load_dataset + model = test_create_espaloma_model + model.epochs = 50 + model.dataset_train = ds + + # Train + small_molecule_forcefield=files('espfit').joinpath('data/forcefield/espaloma-0.3.2.pt') + biopolymer_file = files('espfit').joinpath('data/target/testsystems/nucleoside/pdbfixer_min.pdb') # PosixPath + output_directory_path = 'examples/checkpoints_sampler' + model.train_sampler(biopolymer_file=biopolymer_file, sampler_patience=3, maxIterations=100, nsteps=1000, neff_threshold=0.2, output_directory_path=output_directory_path, small_molecule_forcefield=str(small_molecule_forcefield)) + raise ValueError('This test is not yet implemented.') From ab0a2513316749a53e1cd0b2bfbc1b0c9acf3e0d Mon Sep 17 00:00:00 2001 From: kt Date: Fri, 16 Feb 2024 10:34:14 -0500 Subject: [PATCH 12/59] fix python to 3.11 and add pytest-cov in test_env.yaml --- devtools/conda-envs/test_env.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/devtools/conda-envs/test_env.yaml b/devtools/conda-envs/test_env.yaml index 5fac361..b683519 100644 --- a/devtools/conda-envs/test_env.yaml +++ b/devtools/conda-envs/test_env.yaml @@ -3,13 +3,15 @@ channels: - conda-forge - openeye dependencies: + - python=3.11 - espaloma=0.3.2 - pdbfixer - ca-certificates - certifi - openssl - pytest + - pytest-cov - openeye-toolkits - openmmtools - barnaba -prefix: /lila/home/takabak/mambaforge/envs/espfit +prefix: /lila/home/takabak/mambaforge/envs/espfit \ No newline at end of file From ed42b0c28565727f8bfc170c3f341498d3ed93d7 Mon Sep 17 00:00:00 2001 From: kt Date: Fri, 16 Feb 2024 10:40:53 -0500 Subject: [PATCH 13/59] add space to indent lines --- devtools/conda-envs/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/devtools/conda-envs/README.md b/devtools/conda-envs/README.md index 83704b9..58cc1cb 100644 --- a/devtools/conda-envs/README.md +++ b/devtools/conda-envs/README.md @@ -6,7 +6,7 @@ >#uninstall openff-toolkit and install a customized version to support dgl graphs created using openff-toolkit=0.10.6 >conda uninstall --force openff-toolkit >pip install git+https://github.com/kntkb/openff-toolkit.git@7e9d0225782ef723083407a1cbf1f4f70631f934 ->#uninstall openmmforcefields if < 0.12.0 +>#uninstall openmmforcefields if < 0.12.0 >#use pip instead of mamba to avoid dependency issues with ambertools and python >conda uninstall --force openmmforcefields >pip install git+https://github.com/openmm/openmmforcefields@0.12.0 From f546b7feb97f43e658df043e8e2d3f6374ff2df6 Mon Sep 17 00:00:00 2001 From: kt Date: Fri, 16 Feb 2024 10:53:16 -0500 Subject: [PATCH 14/59] add space to indent line --- devtools/conda-envs/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/devtools/conda-envs/README.md b/devtools/conda-envs/README.md index 58cc1cb..83776c6 100644 --- a/devtools/conda-envs/README.md +++ b/devtools/conda-envs/README.md @@ -2,7 +2,7 @@ >conda activate espfit >conda env export 
--from-history > test_env.yaml ->conda env create -f test_env.yaml -n test_env +>conda env create -f test_env.yaml -n test_env >#uninstall openff-toolkit and install a customized version to support dgl graphs created using openff-toolkit=0.10.6 >conda uninstall --force openff-toolkit >pip install git+https://github.com/kntkb/openff-toolkit.git@7e9d0225782ef723083407a1cbf1f4f70631f934 From 628440fe4fdc77cf597aae287e2d1ec4afc0fbcb Mon Sep 17 00:00:00 2001 From: kt Date: Mon, 19 Feb 2024 15:58:43 -0500 Subject: [PATCH 15/59] convert unit string into openmm unit --- espfit/utils/units.py | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/espfit/utils/units.py b/espfit/utils/units.py index 7082d9a..3d5c2c4 100644 --- a/espfit/utils/units.py +++ b/espfit/utils/units.py @@ -1,3 +1,4 @@ +import openmm.unit as unit from pint import UnitRegistry # Define pint unit registry @@ -11,3 +12,41 @@ #BOHR_TO_ANGSTROMS = 0.529 HARTREE_TO_KCALPERMOL = hartree.to(ureg.kilocalorie/(ureg.avogadro_constant*ureg.mole)).magnitude BOHR_TO_ANGSTROMS = bohr.to(ureg.angstrom).magnitude + + +def convert_string_to_unit(unit_string): + """Convert a unit string to a openmm unit object. + + Parameters + ---------- + unit_string : str + The string representation of the unit. + + Returns + ------- + openmm.unit + The openmm unit object. + """ + unit_mapping = { + "nanometer": unit.nanometer, + "angstrom": unit.angstrom, + "nanometers": unit.nanometers, + "angstroms": unit.angstroms, + "kelvin": unit.kelvin, + "molar": unit.molar, + "millimolar": unit.millimolar, + "micromolar": unit.micromolar, + "atomsphere": unit.atmosphere, + "bar": unit.bar, + "nanoseconds": unit.nanoseconds, + "picoseconds": unit.picoseconds, + "femtoseconds": unit.femtoseconds, + "nanosecond": unit.nanosecond, + "picosecond": unit.picosecond, + "femtosecond": unit.femtosecond, + # Add more units as needed + } + if unit_string in unit_mapping: + return unit_mapping[unit_string] + else: + raise ValueError(f"Unit '{unit_string}' is not recognized.") \ No newline at end of file From 245ab2ca4b6b9b49d9633eb7a94c02714fb3fabf Mon Sep 17 00:00:00 2001 From: kt Date: Mon, 19 Feb 2024 17:09:15 -0500 Subject: [PATCH 16/59] create sampler system from toml file --- espfit/app/sampler.py | 153 +++++++++++++++++++++++++++++++----------- 1 file changed, 114 insertions(+), 39 deletions(-) diff --git a/espfit/app/sampler.py b/espfit/app/sampler.py index 19d1ead..45e113b 100644 --- a/espfit/app/sampler.py +++ b/espfit/app/sampler.py @@ -29,20 +29,43 @@ class BaseSimulation(object): Methods ------- - minimize(maxIterations=100): + minimize(output_directory_path=None): Minimize solvated system. - run(checkpoint_frequency=25000, logging_frequency=250000, netcdf_frequency=250000, nsteps=250000, atom_indices=None): + run(output_directory_path=None): Run standard MD simulation. - export_xml(exportSystem=True, exportState=True, exportIntegrator=True): + export_xml(exportSystem=True, exportState=True, exportIntegrator=True, output_directory_path=None): Export serialized system XML file and solvated pdb file. """ - def __init__(self, output_directory_path=None, input_directory_path=None): + def __init__(self, maxIterations=100, nsteps=250000, atom_indices=None, neff_threshold=0.2, + checkpoint_frequency=25000, logging_frequency=250000, netcdf_frequency=250000, + output_directory_path=None, input_directory_path=None): """Initialize base simulation object. 
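+
+        All step-count and frequency parameters below are expressed in
+        integrator steps; the parenthetical times assume a 4 fs timestep.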
        Parameters
        ----------
+        maxIterations : int, default=100
+            Maximum number of iterations to perform minimization.
+
+        nsteps : int, default=250000 (1 ns using 4 fs timestep)
+            Number of steps to run the simulation.
+
+        atom_indices : list, default=None
+            List of atom indices to save. If None, save all atoms except water and ions.
+
+        neff_threshold : float, default=0.2
+            Effective sample size threshold to rerun the simulation.
+
+        checkpoint_frequency : int, default=25000 (0.1 ns)
+            Frequency (in steps) at which to write checkpoint files.
+
+        logging_frequency : int, default=250000 (1 ns)
+            Frequency (in steps) at which to write logging files.
+
+        netcdf_frequency : int, default=250000 (1 ns)
+            Frequency (in steps) at which to write netcdf files.
+
         output_directory_path : str, optional
            Output directory path. Default is None.
            If None, the current working directory will be used.
@@ -51,6 +74,14 @@ def __init__(self, output_directory_path=None, input_directory_path=None):
            Input directory path to restart simulation. Default is None.
            If None, the current working directory will be used.
        """
+        self.maxIterations = maxIterations
+        self.nsteps = nsteps
+        self.atom_indices = atom_indices
+        self.neff_threshold = neff_threshold
+        self.checkpoint_frequency = checkpoint_frequency
+        self.logging_frequency = logging_frequency
+        self.netcdf_frequency = netcdf_frequency
+
        if output_directory_path is None:
            output_directory_path = os.getcwd()   # Is this right?
        if input_directory_path is None:
@@ -101,42 +132,29 @@ def _get_platform(self):
        return platform


-    def minimize(self, maxIterations=100):
+    def minimize(self, output_directory_path=None):
        """Minimize solvated system.

        Parameters
        ----------
-        maxIterations : int, default=100
-            Maximum number of iterations to perform.
+        output_directory_path : str, default=None
+            The path to the output directory. If None, the default output directory is used.
@@ -149,13 +167,13 @@ def run(self, checkpoint_frequency=25000, logging_frequency=250000, netcdf_frequ # Select atoms to save import mdtraj - if atom_indices is None: - atom_indices = [] + if self.atom_indices is None: + self.atom_indices = [] mdtop = mdtraj.Topology.from_openmm(self.simulation.topology) res = [ r for r in mdtop.residues if r.name not in ('HOH', 'NA', 'CL', 'K') ] for r in res: for a in r.atoms: - atom_indices.append(a.index) + self.atom_indices.append(a.index) # Define reporter from mdtraj.reporters import NetCDFReporter @@ -163,22 +181,22 @@ def run(self, checkpoint_frequency=25000, logging_frequency=250000, netcdf_frequ self._check_file_exists("traj.nc") self.simulation.reporters.append(NetCDFReporter(os.path.join(self.output_directory_path, f"traj.nc"), - min(netcdf_frequency, nsteps), - atomSubset=atom_indices)) + min(self.netcdf_frequency, self.nsteps), + atomSubset=self.atom_indices)) self._check_file_exists("checkpoint.chk") self.simulation.reporters.append(CheckpointReporter(os.path.join(self.output_directory_path, f"checkpoint.chk"), - min(checkpoint_frequency, nsteps))) + min(self.checkpoint_frequency, self.nsteps))) self._check_file_exists("reporter.log") self.simulation.reporters.append(StateDataReporter(os.path.join(self.output_directory_path, f"reporter.log"), - min(logging_frequency, nsteps), + min(self.logging_frequency, self.nsteps), step=True, potentialEnergy=True, kineticEnergy=True, totalEnergy=True, temperature=True, volume=True, density=True, speed=True)) # Run - _logger.info(f"Run MD simulation for {nsteps} steps") - self.simulation.step(nsteps) + _logger.info(f"Run MD simulation for {self.nsteps} steps") + self.simulation.step(self.nsteps) def export_xml(self, exportSystem=True, exportState=True, exportIntegrator=True, output_directory_path=None): @@ -296,8 +314,10 @@ class SetupSampler(BaseSimulation): >>> from espfit.app.sampler import SetupSampler >>> c = SetupSampler() >>> c.create_system(biopolymer_file='protein.pdb', ligand_file='ligand.sdf') - >>> c.minimize(maxIterations=10) - >>> c.run(nsteps=10) + >>> c.maxIterations = 10 # change default setting + >>> c.minimize() + >>> c.nsteps = 100 # change default setting + >>> c.run() Notes ----- @@ -374,6 +394,61 @@ def __init__(self, self.barostat_period = barostat_period self.timestep = timestep self.override_with_espaloma = override_with_espaloma + self.target_class = None + self.target_name = None + + + @classmethod + def from_toml(cls, filename): + import tomllib + from espfit.utils.units import convert_string_to_unit + from importlib.resources import files + + try: + with open(filename, 'rb') as f: + config = tomllib.load(f) + except FileNotFoundError as e: + print(e) + raise + + config = config['sampler']['setup'] # list + if config is None: + raise ValueError("target is not specified in the configuration file") + + systems = [] + _logger.info(f'Found {len(config)} systems in the configuration file') + for _config in config: + system = cls() + # Target information + target_class = _config['target_class'] + target_name = _config['target_name'] + + system.target_class = target_class + system.target_name = target_name + + biopolymer_file = files('espfit').joinpath(f'data/target/{target_class}/{target_name}/target.pdb') + ligand_file = files('espfit').joinpath(f'data/target/{target_class}/{target_name}/ligand.sdf') + if not ligand_file.exists(): + ligand_file = None + + # System settings + for key, value in _config.items(): + if key not in ['target_class', 'target_name']: + if "*" in value: + 
_value = float(value.split('*')[0].strip()) + unit_string = value.split('*')[1].strip() + unit_mapping = convert_string_to_unit(unit_string) + value = _value * unit_mapping + + # All key should be instance variable of the class + setattr(system, key, value) + + # Create system + system.create_system(biopolymer_file=biopolymer_file, ligand_file=ligand_file) + systems.append(system) + del system + + return systems def _update_forcefield_files(self, forcefield_files): @@ -486,7 +561,7 @@ def _get_complex(self): return complex_topology, complex_positions - + def create_system(self, biopolymer_file=None, ligand_file=None): """Create biopolymer-ligand system and export serialized system XML file and solvated pdb file. From e4ecdf40135fdb5a44faeba5870e4bdc6858938d Mon Sep 17 00:00:00 2001 From: kt Date: Mon, 19 Feb 2024 17:10:13 -0500 Subject: [PATCH 17/59] allow creating multiple sampler systems --- espfit/data/config/config.toml | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/espfit/data/config/config.toml b/espfit/data/config/config.toml index d020d18..99caf42 100644 --- a/espfit/data/config/config.toml +++ b/espfit/data/config/config.toml @@ -18,20 +18,21 @@ torsion = 1.0 improper = 1.0 # system setup parameters -[sampler.setup] -water_model = tip3p -solvent_padding = 9.0 * angstroms -ionic_strength = 0.15 * molar -temperature = 300 * kelvin +# Note: only target class/name and variables related to `openmm.unit` are supported +[[sampler.setup]] +target_class = "nucleoside" +target_name = "cytidine" +water_model = "tip3p" +solvent_padding = "10.0 * angstroms" +ionic_strength = "0.08 * molar" # 80 mM NaCl +temperature = "303.15 * kelvin" -# sampler production -[sampler.run] -maxIterations = 100 -nsteps = 100 -checkpoint_frequency = 25000 -logging_frequency = 250000 -netcdf_frequency = 250000 - -# reweighting settings -[sampler.reweight] -neff = 0.2 +# system setup parameters +# Note: only target class/name and variables related to `openmm.unit` are supported +[[sampler.setup]] +target_class = "nucleoside" +target_name = "cytidine" +water_model = "tip3p" +solvent_padding = "10.0 * angstroms" +ionic_strength = "0.08 * molar" # 80 mM NaCl +temperature = "303.15 * kelvin" \ No newline at end of file From 04145ffad640f7fe818c96810e1d1bba26851258 Mon Sep 17 00:00:00 2001 From: kt Date: Mon, 19 Feb 2024 17:11:50 -0500 Subject: [PATCH 18/59] rename pdbfixer_min.pdb to target.pdb for nucleoside systems --- .../{pdbfixer_min.pdb => target.pdb} | 0 .../cytidine/{pdbfixer_min.pdb => target.pdb} | 0 .../{pdbfixer_min.pdb => target.pdb} | 0 .../uridine/{pdbfixer_min.pdb => target.pdb} | 0 .../testsystems/nucleoside/pdbfixer_min.pdb | 35 ------------------- 5 files changed, 35 deletions(-) rename espfit/data/target/nucleoside/adenosine/{pdbfixer_min.pdb => target.pdb} (100%) rename espfit/data/target/nucleoside/cytidine/{pdbfixer_min.pdb => target.pdb} (100%) rename espfit/data/target/nucleoside/guanosine/{pdbfixer_min.pdb => target.pdb} (100%) rename espfit/data/target/nucleoside/uridine/{pdbfixer_min.pdb => target.pdb} (100%) delete mode 100644 espfit/data/target/testsystems/nucleoside/pdbfixer_min.pdb diff --git a/espfit/data/target/nucleoside/adenosine/pdbfixer_min.pdb b/espfit/data/target/nucleoside/adenosine/target.pdb similarity index 100% rename from espfit/data/target/nucleoside/adenosine/pdbfixer_min.pdb rename to espfit/data/target/nucleoside/adenosine/target.pdb diff --git a/espfit/data/target/nucleoside/cytidine/pdbfixer_min.pdb 
b/espfit/data/target/nucleoside/cytidine/target.pdb similarity index 100% rename from espfit/data/target/nucleoside/cytidine/pdbfixer_min.pdb rename to espfit/data/target/nucleoside/cytidine/target.pdb diff --git a/espfit/data/target/nucleoside/guanosine/pdbfixer_min.pdb b/espfit/data/target/nucleoside/guanosine/target.pdb similarity index 100% rename from espfit/data/target/nucleoside/guanosine/pdbfixer_min.pdb rename to espfit/data/target/nucleoside/guanosine/target.pdb diff --git a/espfit/data/target/nucleoside/uridine/pdbfixer_min.pdb b/espfit/data/target/nucleoside/uridine/target.pdb similarity index 100% rename from espfit/data/target/nucleoside/uridine/pdbfixer_min.pdb rename to espfit/data/target/nucleoside/uridine/target.pdb diff --git a/espfit/data/target/testsystems/nucleoside/pdbfixer_min.pdb b/espfit/data/target/testsystems/nucleoside/pdbfixer_min.pdb deleted file mode 100644 index c80385e..0000000 --- a/espfit/data/target/testsystems/nucleoside/pdbfixer_min.pdb +++ /dev/null @@ -1,35 +0,0 @@ -REMARK 1 CREATED WITH OPENMM 8.1, 2024-01-26 -ATOM 1 C5' A A 1 5.609 7.878 -3.616 1.00 0.00 C -ATOM 2 H5' A A 1 5.926 8.406 -4.515 1.00 0.00 H -ATOM 3 H5'' A A 1 5.488 8.603 -2.812 1.00 0.00 H -ATOM 4 O5' A A 1 4.377 7.214 -3.875 1.00 0.00 O -ATOM 5 HO5' A A 1 3.732 7.873 -4.143 1.00 0.00 H -ATOM 6 C4' A A 1 6.701 6.879 -3.233 1.00 0.00 C -ATOM 7 H4' A A 1 7.663 7.391 -3.262 1.00 0.00 H -ATOM 8 O4' A A 1 6.494 6.375 -1.913 1.00 0.00 O -ATOM 9 C3' A A 1 6.778 5.649 -4.144 1.00 0.00 C -ATOM 10 H3' A A 1 5.784 5.235 -4.319 1.00 0.00 H -ATOM 11 O3' A A 1 7.428 5.896 -5.380 1.00 0.00 O -ATOM 12 HO3' A A 1 8.365 5.865 -5.178 1.00 0.00 H -ATOM 13 C2' A A 1 7.561 4.713 -3.219 1.00 0.00 C -ATOM 14 H2' A A 1 7.428 3.673 -3.519 1.00 0.00 H -ATOM 15 C1' A A 1 6.878 5.002 -1.872 1.00 0.00 C -ATOM 16 H1' A A 1 7.580 4.816 -1.059 1.00 0.00 H -ATOM 17 N1 A A 1 5.124 0.467 -0.163 1.00 0.00 N -ATOM 18 C2 A A 1 6.351 0.978 -0.252 1.00 0.00 C -ATOM 19 H2 A A 1 7.149 0.341 0.097 1.00 0.00 H -ATOM 20 N3 A A 1 6.723 2.170 -0.712 1.00 0.00 N -ATOM 21 C4 A A 1 5.650 2.896 -1.124 1.00 0.00 C -ATOM 22 C5 A A 1 4.335 2.511 -1.094 1.00 0.00 C -ATOM 23 C6 A A 1 4.098 1.219 -0.579 1.00 0.00 C -ATOM 24 N6 A A 1 2.894 0.673 -0.481 1.00 0.00 N -ATOM 25 H61 A A 1 2.824 -0.276 -0.142 1.00 0.00 H -ATOM 26 H62 A A 1 2.092 1.188 -0.815 1.00 0.00 H -ATOM 27 N7 A A 1 3.506 3.517 -1.595 1.00 0.00 N -ATOM 28 C8 A A 1 4.352 4.463 -1.905 1.00 0.00 C -ATOM 29 H8 A A 1 4.040 5.410 -2.321 1.00 0.00 H -ATOM 30 N9 A A 1 5.670 4.162 -1.657 1.00 0.00 N -ATOM 31 O2' A A 1 8.945 5.071 -3.169 1.00 0.00 O -ATOM 32 HO2' A A 1 9.415 4.404 -2.663 1.00 0.00 H -TER 33 A A 1 -END From 1ada151293c00f8f511be2cfa65aab727c26baeb Mon Sep 17 00:00:00 2001 From: kt Date: Mon, 19 Feb 2024 17:12:48 -0500 Subject: [PATCH 19/59] add reference fig, table information --- espfit/data/target/nucleoside/adenosine/experiment.yml | 1 + espfit/data/target/nucleoside/cytidine/experiment.yml | 1 + espfit/data/target/nucleoside/guanosine/experiment.yml | 1 + espfit/data/target/nucleoside/uridine/experiment.yml | 1 + 4 files changed, 4 insertions(+) diff --git a/espfit/data/target/nucleoside/adenosine/experiment.yml b/espfit/data/target/nucleoside/adenosine/experiment.yml index 5d300f9..c4ffa09 100644 --- a/espfit/data/target/nucleoside/adenosine/experiment.yml +++ b/espfit/data/target/nucleoside/adenosine/experiment.yml @@ -9,6 +9,7 @@ experiment_1: name: nmr concentration: 0.2 mM temperature: 303.15 * kelvin + reference: table S12 comment: sequence: a smiles: diff 
--git a/espfit/data/target/nucleoside/cytidine/experiment.yml b/espfit/data/target/nucleoside/cytidine/experiment.yml index ecd73f8..b0354a3 100644 --- a/espfit/data/target/nucleoside/cytidine/experiment.yml +++ b/espfit/data/target/nucleoside/cytidine/experiment.yml @@ -9,6 +9,7 @@ experiment_1: name: nmr concentration: 5.0 mM temperature: 303.15 * kelvin + reference: table S11 comment: sequence: c smiles: diff --git a/espfit/data/target/nucleoside/guanosine/experiment.yml b/espfit/data/target/nucleoside/guanosine/experiment.yml index 8adf911..a313122 100644 --- a/espfit/data/target/nucleoside/guanosine/experiment.yml +++ b/espfit/data/target/nucleoside/guanosine/experiment.yml @@ -9,6 +9,7 @@ experiment_1: name: nmr concentration: 0.2 mM temperature: 303.15 * kelvin + reference: table S12 comment: sequence: g smiles: diff --git a/espfit/data/target/nucleoside/uridine/experiment.yml b/espfit/data/target/nucleoside/uridine/experiment.yml index 181d12c..a75185c 100644 --- a/espfit/data/target/nucleoside/uridine/experiment.yml +++ b/espfit/data/target/nucleoside/uridine/experiment.yml @@ -9,6 +9,7 @@ experiment_1: name: nmr concentration: 5.0 mM temperature: 303.15 * kelvin + reference: table S11 comment: sequence: u smiles: From 9d9bbe253b8afce7bd51b58adcceb5513dfba201 Mon Sep 17 00:00:00 2001 From: kt Date: Mon, 19 Feb 2024 17:13:46 -0500 Subject: [PATCH 20/59] change output file name from pdbfixer_min.pdb to target.pdb --- espfit/data/target/nucleoside/pdbfixer_setup.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/espfit/data/target/nucleoside/pdbfixer_setup.py b/espfit/data/target/nucleoside/pdbfixer_setup.py index 1269915..bde251b 100644 --- a/espfit/data/target/nucleoside/pdbfixer_setup.py +++ b/espfit/data/target/nucleoside/pdbfixer_setup.py @@ -46,7 +46,8 @@ def prep(inputfile): # minimize: fix hydrogen positions simulation.minimizeEnergy(maxIterations=50) positions = simulation.context.getState(getPositions=True).getPositions() - PDBFile.writeFile(model.topology, positions, open("pdbfixer_min.pdb", 'w')) + #PDBFile.writeFile(model.topology, positions, open("pdbfixer_min.pdb", 'w')) + PDBFile.writeFile(model.topology, positions, open("target.pdb", 'w')) @click.command() From f337f55c5be5df130adf3c5e342c00df98d7e266 Mon Sep 17 00:00:00 2001 From: kt Date: Tue, 20 Feb 2024 21:27:31 -0500 Subject: [PATCH 21/59] replace module.py with reweight.py --- espfit/utils/sampler/module.py | 48 -------------------------------- espfit/utils/sampler/reweight.py | 47 +++++++++++++++++++++++++++++++ 2 files changed, 47 insertions(+), 48 deletions(-) delete mode 100644 espfit/utils/sampler/module.py create mode 100644 espfit/utils/sampler/reweight.py diff --git a/espfit/utils/sampler/module.py b/espfit/utils/sampler/module.py deleted file mode 100644 index d6e2492..0000000 --- a/espfit/utils/sampler/module.py +++ /dev/null @@ -1,48 +0,0 @@ -import logging - -_logger = logging.getLogger(__name__) - - -def check_effective_sample_size(): - # Compute effective sample size - neff = 0.5 - - return neff - - -def run_sampler(sampler_output_directory_path, biopolymer_file, ligand_file, maxIterations, nsteps, small_molecule_forcefield): - import os - from espfit.app.sampler import SetupSampler - - c = SetupSampler(output_directory_path=sampler_output_directory_path, small_molecule_forcefield=small_molecule_forcefield) - c.create_system(biopolymer_file, ligand_file) - c.minimize(maxIterations) - c.run(nsteps=nsteps) - c.export_xml() - - -def compute_observable(input_directory_path): - from 
espfit.app.experiment import RNASystem - target = RNASystem() - target.load_traj(input_directory_path=input_directory_path) - val = target.compute_jcouplings() - _logger.info(f'Computed observable: {val}') - - import os - import yaml - with open(os.path.join(input_directory_path, 'observable.yaml'), 'w') as f: - yaml.dump(val, f, allow_unicode=True) - - return val - - -def compute_loss(input_directory_path): - # Compute observable - val = compute_observable(input_directory_path) - _logger.info(f'Computed observable: {val}') - - # Compute loss - import torch - loss = torch.tensor(0.0) - - return loss diff --git a/espfit/utils/sampler/reweight.py b/espfit/utils/sampler/reweight.py new file mode 100644 index 0000000..69c3c5e --- /dev/null +++ b/espfit/utils/sampler/reweight.py @@ -0,0 +1,47 @@ +import os +import logging +from espfit.app.sampler import SetupSampler, BaseSimulation + +_logger = logging.getLogger(__name__) + + +class SamplerReweight(SetupSampler, BaseSimulation): + + def __init__(self, weight=1, **kwargs): + super().__init__(**kwargs) + self.weight = weight + + + def get_effective_sample_size(self): + # Compute effective sample size + neff = 0.5 + return neff + + + def _compute_observable(self): + if self.target_class == 'nucleoside': + from espfit.app.analysis import RNASystem + target = RNASystem() + target.load_traj(input_directory_path=self.output_directory_path) + obs_calc = target.compute_jcouplings() + _logger.info(f'Computed observable: {obs_calc}') + else: + raise NotImplementedError(f'Observable for {self.target_class} is not implemented.') + + import yaml + with open(os.path.join(self.output_directory_path, 'observable.yaml'), 'w') as f: + yaml.dump(obs_calc, f, allow_unicode=True) + + return obs_calc + + + def compute_loss(self): + # Compute experimental observable + obs_calc = self._compute_observable() + _logger.info(f'Computed observable: {obs_calc}') + + # Compute loss + import torch + loss = torch.tensor(0.0) + + return loss \ No newline at end of file From 585a6850e281660e3bb5ac8849214ece216a2f04 Mon Sep 17 00:00:00 2001 From: kt Date: Tue, 20 Feb 2024 21:28:04 -0500 Subject: [PATCH 22/59] rename experiment.py to analysis.py --- espfit/app/{experiment.py => analysis.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename espfit/app/{experiment.py => analysis.py} (100%) diff --git a/espfit/app/experiment.py b/espfit/app/analysis.py similarity index 100% rename from espfit/app/experiment.py rename to espfit/app/analysis.py From 1e5b510a5ee1a4e30d50558beb6e50da799a4307 Mon Sep 17 00:00:00 2001 From: kt Date: Tue, 20 Feb 2024 21:29:00 -0500 Subject: [PATCH 23/59] add simulation settings to config --- espfit/data/config/config.toml | 30 +++++++++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/espfit/data/config/config.toml b/espfit/data/config/config.toml index 99caf42..9088025 100644 --- a/espfit/data/config/config.toml +++ b/espfit/data/config/config.toml @@ -17,7 +17,16 @@ charge = 1.0 torsion = 1.0 improper = 1.0 -# system setup parameters +# training settings +[espaloma.train] +epochs = 10 +batch_size = 128 +learning_rate = 1e-4 +checkpoint_frequency = 1 +output_directory_path = "checkpoints" + + +# System setup parameters # Note: only target class/name and variables related to `openmm.unit` are supported [[sampler.setup]] target_class = "nucleoside" @@ -26,13 +35,28 @@ water_model = "tip3p" solvent_padding = "10.0 * angstroms" ionic_strength = "0.08 * molar" # 80 mM NaCl temperature = "303.15 * kelvin" +maxIterations 
= 100 +nsteps = 1000 +neff_threshold = 0.2 +checkpoint_frequency = 10 +logging_frequency = 1 +netcdf_frequency = 10 +weight = 1 + # system setup parameters # Note: only target class/name and variables related to `openmm.unit` are supported [[sampler.setup]] target_class = "nucleoside" -target_name = "cytidine" +target_name = "adenosine" water_model = "tip3p" solvent_padding = "10.0 * angstroms" ionic_strength = "0.08 * molar" # 80 mM NaCl -temperature = "303.15 * kelvin" \ No newline at end of file +temperature = "303.15 * kelvin" +maxIterations = 100 +nsteps = 1000 +neff_threshold = 0.2 +checkpoint_frequency = 10 +logging_frequency = 1 +netcdf_frequency = 10 +weight = 1 \ No newline at end of file From cf39f68a23f34d2c7d0f63d9ac0da57946889018 Mon Sep 17 00:00:00 2001 From: kt Date: Tue, 20 Feb 2024 21:30:30 -0500 Subject: [PATCH 24/59] remove constraint and nonbonded method from instance variable --- espfit/app/sampler.py | 52 ++++++++++++++++++++++++++++--------------- 1 file changed, 34 insertions(+), 18 deletions(-) diff --git a/espfit/app/sampler.py b/espfit/app/sampler.py index 45e113b..7e232bf 100644 --- a/espfit/app/sampler.py +++ b/espfit/app/sampler.py @@ -335,12 +335,12 @@ def __init__(self, water_model='tip3p', solvent_padding=9.0 * unit.angstroms, ionic_strength=0.15 * unit.molar, - constraints=app.HBonds, + #constraints=app.HBonds, hmass=3.0 * unit.amu, temperature=300.0 * unit.kelvin, pressure=1.0 * unit.atmosphere, pme_tol=2.5e-04, - nonbonded_method=app.PME, + #nonbonded_method=app.PME, barostat_period=50, timestep=4 * unit.femtoseconds, override_with_espaloma=True, @@ -385,12 +385,12 @@ def __init__(self, self.forcefield_files = self._update_forcefield_files(forcefield_files) self.solvent_padding = solvent_padding self.ionic_strength = ionic_strength - self.constraints = constraints + #self.constraints = constraints self.hmass = hmass self.temperature = temperature self.pressure = pressure self.pme_tol = pme_tol - self.nonbonded_method = nonbonded_method + #self.nonbonded_method = nonbonded_method self.barostat_period = barostat_period self.timestep = timestep self.override_with_espaloma = override_with_espaloma @@ -399,7 +399,10 @@ def __init__(self, @classmethod - def from_toml(cls, filename): + def from_toml(cls, filename, *epoch, **override_sampler_kwargs): + + # kwargs: force setting updates for SetupSampler and BaseSimulation + import tomllib from espfit.utils.units import convert_string_to_unit from importlib.resources import files @@ -415,16 +418,17 @@ def from_toml(cls, filename): if config is None: raise ValueError("target is not specified in the configuration file") - systems = [] + samplers = [] _logger.info(f'Found {len(config)} systems in the configuration file') for _config in config: - system = cls() + sampler = cls() + # Target information target_class = _config['target_class'] target_name = _config['target_name'] - system.target_class = target_class - system.target_name = target_name + sampler.target_class = target_class + sampler.target_name = target_name biopolymer_file = files('espfit').joinpath(f'data/target/{target_class}/{target_name}/target.pdb') ligand_file = files('espfit').joinpath(f'data/target/{target_class}/{target_name}/ligand.sdf') @@ -434,21 +438,31 @@ def from_toml(cls, filename): # System settings for key, value in _config.items(): if key not in ['target_class', 'target_name']: - if "*" in value: + if isinstance(value, str) and "*" in value: _value = float(value.split('*')[0].strip()) unit_string = value.split('*')[1].strip() 
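                    # Illustrative sketch (not part of this patch): the TOML values arrive as
                    # strings such as "10.0 * angstroms" or "303.15 * kelvin"; the line above
                    # splits off the unit name, and the call below maps it onto an openmm.unit
                    # object. The body of convert_string_to_unit is not shown in this series;
                    # it is assumed to behave roughly like:
                    #
                    #     from openmm import unit
                    #     def convert_string_to_unit(unit_string):
                    #         # e.g. "angstroms" -> unit.angstroms, "molar" -> unit.molar
                    #         return getattr(unit, unit_string)
                    #
                    # so "0.08 * molar" becomes 0.08 * unit.molar when multiplied back together.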
unit_mapping = convert_string_to_unit(unit_string) value = _value * unit_mapping - # All key should be instance variable of the class - setattr(system, key, value) + setattr(sampler, key, value) + + # Override system settings by kwargs + for key, value in override_sampler_kwargs.items(): + setattr(sampler, key, value) + + # Update output directory path if epoch is given + if epoch is not None: + if len(epoch) == 1 and isinstance(epoch[0], int): + sampler.output_directory_path = os.path.join(sampler.output_directory_path, f'{epoch[0]}') + else: + raise ValueError("epoch should be a single value or a list of a single value") # Create system - system.create_system(biopolymer_file=biopolymer_file, ligand_file=ligand_file) - systems.append(system) - del system + sampler.create_system(biopolymer_file=biopolymer_file, ligand_file=ligand_file) + samplers.append(sampler) + del sampler - return systems + return samplers def _update_forcefield_files(self, forcefield_files): @@ -601,8 +615,10 @@ def create_system(self, biopolymer_file=None, ligand_file=None): # Initialize system generator. _logger.debug("Initialize system generator") - forcefield_kwargs = {'removeCMMotion': True, 'ewaldErrorTolerance': self.pme_tol, 'constraints' : self.constraints, 'rigidWater': True, 'hydrogenMass' : self.hmass} - periodic_forcefield_kwargs = {'nonbondedMethod': self.nonbonded_method} + #forcefield_kwargs = {'removeCMMotion': True, 'ewaldErrorTolerance': self.pme_tol, 'constraints' : self.constraints, 'rigidWater': True, 'hydrogenMass' : self.hmass} + #periodic_forcefield_kwargs = {'nonbondedMethod': self.nonbonded_method} + forcefield_kwargs = {'removeCMMotion': True, 'ewaldErrorTolerance': self.pme_tol, 'constraints' : app.HBonds, 'rigidWater': True, 'hydrogenMass' : self.hmass} + periodic_forcefield_kwargs = {'nonbondedMethod': app.PME} barostat = MonteCarloBarostat(self.pressure, self.temperature, self.barostat_period) # SystemGenerator will automatically load the TemplateGenerator based on the given `small_molecule_forcefield`. From eb234c0d90d21d36e47eb5d9e2d73cbf46ea709c Mon Sep 17 00:00:00 2001 From: kt Date: Tue, 20 Feb 2024 21:31:24 -0500 Subject: [PATCH 25/59] improve running sampler during espaloma training --- espfit/app/train.py | 110 ++++++++++++++++++++++++-------------------- 1 file changed, 60 insertions(+), 50 deletions(-) diff --git a/espfit/app/train.py b/espfit/app/train.py index 99e2a53..386e4c4 100644 --- a/espfit/app/train.py +++ b/espfit/app/train.py @@ -8,6 +8,7 @@ * Add support to save model? (or use independent script?) * Improve how data are parsed using dataclasses or pydantic """ +import os import logging _logger = logging.getLogger(__name__) @@ -76,8 +77,6 @@ def __init__(self, net=None, dataset_train=None, dataset_validation=None, datase checkpoint_frequency : int, default=10 The frequency at which the model should be saved. 
""" - import os - import torch self.dataset_train = dataset_train self.dataset_validation = dataset_validation self.dataset_test = dataset_test @@ -87,12 +86,14 @@ def __init__(self, net=None, dataset_train=None, dataset_validation=None, datase self.batch_size = batch_size self.learning_rate = learning_rate self.checkpoint_frequency = checkpoint_frequency + self.restart_epoch = 0 + self.configfile = None if output_directory_path is None: - self.output_directory_path = os.getcwd() - else: - self.output_directory_path = output_directory_path - + output_directory_path = os.getcwd() + self.output_directory_path = output_directory_path + # Check if GPU is available + import torch if torch.cuda.is_available(): _logger.info('GPU is available for training.') else: @@ -102,6 +103,20 @@ def __init__(self, net=None, dataset_train=None, dataset_validation=None, datase _logger.info(f'Torch data type is {torch.get_default_dtype()}') + @property + def output_directory_path(self): + """Get output directory path.""" + return self._output_directory_path + + + @output_directory_path.setter + def output_directory_path(self, value): + """Set output directory path.""" + self._output_directory_path = value + # Create output directory if it does not exist + os.makedirs(value, exist_ok=True) + + @classmethod def from_toml(cls, filename): """Create an instance of the class from a TOML configuration file. @@ -133,6 +148,11 @@ def from_toml(cls, filename): model = cls() net = model.create_model(config['espaloma']) model.net = net + model.configfile = filename + + # Update training settings + for key, value in config['espaloma']['train'].items(): + setattr(model, key, value) return model @@ -195,7 +215,7 @@ def create_model(espaloma_config): readout_improper = esp.nn.readout.janossy.JanossyPoolingWithSmirnoffImproper(in_features=units, config=config_2, out_features={"k": 2}) # Get loss weights - # TODO: Better way to handle this? + # TODO: Better way to initialize weights? weights = { 'energy': 1.0, 'force': 1.0, 'charge': 1.0, 'torsion': 1.0, 'improper': 1.0 } if 'weights' in espaloma_config.keys(): for key in espaloma_config['weights'].keys(): @@ -260,14 +280,13 @@ def _load_checkpoint(self): return restart_epoch - def train(self, output_directory_path=None): + def train(self): """ Train the Espaloma network model. - Parameters - ---------- - output_directory_path : str, default=None - The directory where the model checkpoints should be saved. If None, the default output directory is used. + TODO + ---- + * Export training settings to a file? 
Returns ------- @@ -279,13 +298,9 @@ def train(self, output_directory_path=None): if self.dataset_train is None: raise ValueError('Training dataset is not provided.') - - if output_directory_path is not None: - self.output_directory_path = output_directory_path - os.makedirs(self.output_directory_path, exist_ok=True) - + # Load checkpoint - restart_epoch = self._load_checkpoint() + self.restart_epoch = self._load_checkpoint() # Train # https://github.com/choderalab/espaloma/blob/main/espaloma/app/train.py#L33 @@ -293,15 +308,13 @@ def train(self, output_directory_path=None): ds_tr_loader = self.dataset_train.view(collate_fn='graph', batch_size=self.batch_size, shuffle=True) optimizer = torch.optim.Adam(self.net.parameters(), lr=self.learning_rate) with torch.autograd.set_detect_anomaly(True): - for i in range(restart_epoch, self.epochs): + for i in range(self.restart_epoch, self.epochs): epoch = i + 1 # Start from epoch 1 (not zero-indexing) for g in ds_tr_loader: optimizer.zero_grad() - # TODO: Better way to handle this? if torch.cuda.is_available(): g = g.to("cuda:0") - g.nodes["n1"].data["xyz"].requires_grad = True loss = self.net(g) loss.backward() @@ -311,36 +324,27 @@ def train(self, output_directory_path=None): # Note: returned loss is a joint loss of different units. _loss = HARTREE_TO_KCALPERMOL * loss.pow(0.5).item() _logger.info(f'epoch {epoch}: {_loss:.3f}') - checkpoint_file = os.path.join(output_directory_path, f"net{epoch}.pt") + checkpoint_file = os.path.join(self.output_directory_path, f"net{epoch}.pt") torch.save(self.net.state_dict(), checkpoint_file) - def train_sampler(self, output_directory_path=None, - biopolymer_file=None, ligand_file=None, small_molecule_forcefield=None, - sampler_patience=800, maxIterations=10, nsteps=10, neff_threshold=0.2): + def train_sampler(self, sampler_patience=800, neff_threshold=0.2): + + # sampler_kwargs: attributes supported by BaseSimulation + import os import torch from espfit.utils.units import HARTREE_TO_KCALPERMOL - from espfit.utils.sampler import module + from espfit.utils.sampler.reweight import SamplerReweight - # Parameters for sampling and reweighting - self.biopolymer_file = biopolymer_file - self.ligand_file = ligand_file self.sampler_patience = sampler_patience - self.maxIterations = maxIterations - self.nsteps = nsteps self.neff_threshold = neff_threshold - self.small_molecule_forcefield = small_molecule_forcefield if self.dataset_train is None: raise ValueError('Training dataset is not provided.') - if output_directory_path is not None: - self.output_directory_path = output_directory_path - os.makedirs(self.output_directory_path, exist_ok=True) - # Load checkpoint - restart_epoch = self._load_checkpoint() + self.restart_epoch = self._load_checkpoint() # Initialize neff to -1 to trigger the first sampling neff = -1 @@ -349,33 +353,39 @@ def train_sampler(self, output_directory_path=None, ds_tr_loader = self.dataset_train.view(collate_fn='graph', batch_size=self.batch_size, shuffle=True) optimizer = torch.optim.Adam(self.net.parameters(), lr=self.learning_rate) with torch.autograd.set_detect_anomaly(True): - for i in range(restart_epoch, self.epochs): - epoch = i + 1 # Start from epoch 1 (not zero-indexing) + for i in range(self.restart_epoch, self.epochs): + epoch = i + 1 # Start from 1 (not zero-indexing) loss = torch.tensor(0.0) + if torch.cuda.is_available(): + loss = loss.cuda("cuda:0") for g in ds_tr_loader: optimizer.zero_grad() - if torch.cuda.is_available(): g = g.to("cuda:0") - 
g.nodes["n1"].data["xyz"].requires_grad = True
                    loss += self.net(g)
 
                # Run sampling
                if epoch > self.sampler_patience:
                    if neff < self.neff_threshold:
-                        _logger.info(f'Effective sample size ({neff}) below threshold ({self.neff_threshold}).')
-                        # Create system and run sampling, instead of restarting from previous checkpoint
-                        _logger.info(f'Run simulation...')
-                        sampler_output_directory_path = os.path.join(self.output_directory_path, "sampler", str(epoch))
-                        module.run_sampler(sampler_output_directory_path, self.biopolymer_file, self.ligand_file, self.maxIterations, self.nsteps, self.small_molecule_forcefield)
+                        # Get effective sample size
+                        if neff < 0:
+                            _logger.info(f'Reached sampler patience {self.sampler_patience}. Run sampler for the first time.')
+                        else:
+                            _logger.info(f'Effective sample size ({neff}) below threshold ({self.neff_threshold}).')
+
+                        # Create sampler system from configuration file. Returns list of systems.
+                        override_sampler_kwargs = { "small_molecule_forcefield": "espfit/data/forcefield/espaloma-0.3.2.pt" }   # change this to local espaloma model
+                        samplers = SamplerReweight.from_toml(self.configfile, epoch, override_sampler_kwargs)
+                        for sampler in samplers:
+                            _logger.info(f'Running simulation for {sampler.target_name} for {sampler.nsteps} steps...')
+                            sampler.minimize()
+                            sampler.run()
 
                         # Compute MD loss
                         _logger.info(f'Compute sampler loss.')
-                        sampler_loss = module.compute_loss(input_directory_path=sampler_output_directory_path)
-
-                        # Add MD loss to the joint loss
-                        loss += sampler_loss
+                        for sampler in samplers:
+                            loss += sampler.compute_loss() * sampler.weight
 
                 # Update weights
                 loss.backward()

From 56570cbc065cc94db57405f3f7b8f2b0457a9d8b Mon Sep 17 00:00:00 2001
From: kt
Date: Wed, 21 Feb 2024 14:13:13 -0500
Subject: [PATCH 26/59] update usage in README.md

---
 README.md | 38 ++++++++++++++++++++++++++++----------
 1 file changed, 28 insertions(+), 10 deletions(-)

diff --git a/README.md b/README.md
index 47b67ca..b012b38 100644
--- a/README.md
+++ b/README.md
@@ -28,10 +28,26 @@ Infrastruture to train espaloma with experimental observables
 
 ### Quick Usage
 
+```python
+from espfit.utils.graphs import CustomGraphDataset
+path = 'espfit/data/qcdata/openff-toolkit-0.10.6/dgl2/protein-torsion-sm/'
+ds = CustomGraphDataset.load(path)
+ds.reshape_conformation_size(n_confs=50)
+ds.compute_relative_energy()
+# Create espaloma model
+from espfit.app.train import EspalomaModel
+filename = 'espfit/data/config/config.toml'
+model = EspalomaModel.from_toml(filename)
+model.dataset_train = ds
+# Change default training settings
+model.epochs = 100
+# Set sampler settings
+model.train_sampler(sampler_patience=800, neff_threshold=0.2)
+```
+
+### Standalone Usage
 
 #### Change logging
 ```python
-# load dgl graph data
 from espfit.utils import logging
 logging.get_logging_level()
 #>'INFO'
@@ -51,12 +67,11 @@ from espfit.app.train import EspalomaModel
 filename = 'espfit/data/config/config.toml'
 model = EspalomaModel.from_toml(filename)
 model.dataset_train = ds
-# Train
-model.train(output_directory_path='path/to/output')
-# To extend training, update the `epoch` in config.toml
-# Alternatively, do the following:
-model.config['espaloma']['train']['epochs'] = 50
-model.train(output_directory_path='path/to/output')
+# Change default training settings
+model.epochs = 100
+model.output_directory_path = 'path/to/output'
+# Train (default output directory is current path)
+model.train()
 ```
 
 #### Standard MD (default: espaloma-0.3.2 force field for solute molecules)
 ```python
 from
espfit.app.sampler import SetupSampler c = SetupSampler() filename = 'espfit/data/target/testsystems/nucleoside/pdbfixer_min.pdb' c.create_system(biopolymer_file=filename) -c.minimize(maxIterations=10) -c.run(nsteps=10, output_directory_path='path/to/output') +c.minimize() +# Change default settings +c.nsteps = 1000 +c.run() # Export to XML c.export_xml(exportSystem=True, exportState=True, exportIntegrator=True, output_directory_path='path/to/output') ``` @@ -76,7 +93,8 @@ c.export_xml(exportSystem=True, exportState=True, exportIntegrator=True, output_ ```python from espfit.app.sampler import SetupSampler c = SetupSampler.from_xml(input_directory_path='path/to/input') -c.run(nsteps=10, output_directory_path='path/to/output') +c.nsteps = 1000 +c.run() ``` #### Compute RNA J-couplings from MD trajectory From 7db4dba73d5dc62b5949d779536b98564ae64bc5 Mon Sep 17 00:00:00 2001 From: kt Date: Wed, 21 Feb 2024 14:14:24 -0500 Subject: [PATCH 27/59] create new samplers during espaloma training --- espfit/app/sampler.py | 57 ++++++++++++++++++++++---------- espfit/app/train.py | 7 ++-- espfit/utils/sampler/reweight.py | 2 +- 3 files changed, 46 insertions(+), 20 deletions(-) diff --git a/espfit/app/sampler.py b/espfit/app/sampler.py index 7e232bf..5c1fe75 100644 --- a/espfit/app/sampler.py +++ b/espfit/app/sampler.py @@ -399,10 +399,27 @@ def __init__(self, @classmethod - def from_toml(cls, filename, *epoch, **override_sampler_kwargs): + def from_toml(cls, filename, *args, **override_sampler_kwargs): + """Create SetupSampler from a TOML configuration file. + + Parameters + ---------- + filename : str + The path to the TOML configuration file. - # kwargs: force setting updates for SetupSampler and BaseSimulation - + *args : list + This is used to update the output directory path during espaloma training. + The list should contain a single integer value, corresponding to the epoch number. + + **override_sampler_kwargs : dict + The dictionary of keyword arguments to override the default settings of the + BaseSimulation and SetupSampler classes. This option is intended for creating + new systems with temporary espaloma models generated during espaloma training. 
+
+        Returns
+        -------
+        samplers : list of SetupSampler instances
+        """
         import tomllib
         from espfit.utils.units import convert_string_to_unit
         from importlib.resources import files
@@ -438,25 +455,31 @@ def from_toml(cls, filename, *args, **override_sampler_kwargs):
             # System settings
             for key, value in _config.items():
                 if key not in ['target_class', 'target_name']:
-                    if isinstance(value, str) and "*" in value:
-                        _value = float(value.split('*')[0].strip())
-                        unit_string = value.split('*')[1].strip()
-                        unit_mapping = convert_string_to_unit(unit_string)
-                        value = _value * unit_mapping
-                    # All key should be instance variable of the class
-                    setattr(system, key, value)
+                    if hasattr(sampler, key):
+                        if isinstance(value, str) and "*" in value:
+                            _value = float(value.split('*')[0].strip())
+                            unit_string = value.split('*')[1].strip()
+                            unit_mapping = convert_string_to_unit(unit_string)
+                            value = _value * unit_mapping
+                        setattr(sampler, key, value)
+                    else:
+                        raise ValueError(f"Invalid keyword argument: {key}")
 
-            # Override system settings by kwargs
+            # Expected kwargs: output_directory_path
+            # Pass temporary espaloma model to the sampler if kwargs are given
             for key, value in override_sampler_kwargs.items():
-                setattr(sampler, key, value)
+                if hasattr(sampler, key):
+                    setattr(sampler, key, value)
+                else:
+                    raise ValueError(f"Invalid keyword argument: {key}")
 
             # Update output directory path if epoch is given
-            if epoch is not None:
-                if len(epoch) == 1 and isinstance(epoch[0], int):
-                    sampler.output_directory_path = os.path.join(sampler.output_directory_path, f'{epoch[0]}')
+            if args:  # args is a tuple; an empty tuple means no epoch was given
+                if len(args) == 1 and isinstance(args[0], int):
+                    sampler.output_directory_path = os.path.join(sampler.output_directory_path, sampler.target_name, f'{args[0]}')
                 else:
-                    raise ValueError("epoch should be a single value or a list of a single value")
-
+                    raise ValueError(f"Invalid argument: {args}. Expected a single integer value for the epoch number.")
+
             # Create system
             sampler.create_system(biopolymer_file=biopolymer_file, ligand_file=ligand_file)
             samplers.append(sampler)

diff --git a/espfit/app/train.py b/espfit/app/train.py
index 386e4c4..bf5ec99 100644
--- a/espfit/app/train.py
+++ b/espfit/app/train.py
@@ -375,8 +375,11 @@ def train_sampler(self, sampler_patience=800, neff_threshold=0.2):
                             _logger.info(f'Effective sample size ({neff}) below threshold ({self.neff_threshold}).')
 
                         # Create sampler system from configuration file. Returns list of systems.
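                        # Illustrative sketch (not part of this patch): with the *args/**kwargs
                        # API introduced above, the call below is expected to behave like
                        #
                        #     samplers = SamplerReweight.from_toml(
                        #         'config.toml',
                        #         5,                                     # *args: epoch number
                        #         output_directory_path='checkpoints',   # **override_sampler_kwargs
                        #     )
                        #
                        # returning one sampler per [[sampler.setup]] table in the TOML file,
                        # each writing its trajectory under checkpoints/<target_name>/5/.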
- override_sampler_kwargs = { "small_molecule_forcefield": "espfit/data/forcefield/espaloma-0.3.2.pt" } # change this to local espaloma model - samplers = SamplerReweight.from_toml(self.configfile, epoch, override_sampler_kwargs) + args = [epoch] + override_sampler_kwargs = { + "small_molecule_forcefield": "espfit/data/forcefield/espaloma-0.3.2.pt", # change this to local espaloma model + "output_directory_path": self.output_directory_path } + samplers = SamplerReweight.from_toml(self.configfile, *args, **override_sampler_kwargs) for sampler in samplers: _logger.info(f'Running simulation for {sampler.target_name} for {sampler.nsteps} steps...') sampler.minimize() diff --git a/espfit/utils/sampler/reweight.py b/espfit/utils/sampler/reweight.py index 69c3c5e..f1d7a53 100644 --- a/espfit/utils/sampler/reweight.py +++ b/espfit/utils/sampler/reweight.py @@ -38,7 +38,7 @@ def _compute_observable(self): def compute_loss(self): # Compute experimental observable obs_calc = self._compute_observable() - _logger.info(f'Computed observable: {obs_calc}') + _logger.info(f'Compute loss') # Compute loss import torch From f7dbd7a805fea717a8a5dcf13d4ed62033cbd023 Mon Sep 17 00:00:00 2001 From: kt Date: Wed, 21 Feb 2024 14:14:43 -0500 Subject: [PATCH 28/59] remove old comment --- espfit/data/config/config.toml | 2 -- 1 file changed, 2 deletions(-) diff --git a/espfit/data/config/config.toml b/espfit/data/config/config.toml index 9088025..0a219ed 100644 --- a/espfit/data/config/config.toml +++ b/espfit/data/config/config.toml @@ -27,7 +27,6 @@ output_directory_path = "checkpoints" # System setup parameters -# Note: only target class/name and variables related to `openmm.unit` are supported [[sampler.setup]] target_class = "nucleoside" target_name = "cytidine" @@ -45,7 +44,6 @@ weight = 1 # system setup parameters -# Note: only target class/name and variables related to `openmm.unit` are supported [[sampler.setup]] target_class = "nucleoside" target_name = "adenosine" From c6b0c3e7f1850da1ed8cb1cc8a66189319b7664d Mon Sep 17 00:00:00 2001 From: kt Date: Thu, 22 Feb 2024 14:17:02 -0500 Subject: [PATCH 29/59] rename test_app_experiment.py to test_app_analysis.py --- ...app_experiment.py => test_app_analysis.py} | 35 +++++++++++-------- 1 file changed, 21 insertions(+), 14 deletions(-) rename espfit/tests/{test_app_experiment.py => test_app_analysis.py} (51%) diff --git a/espfit/tests/test_app_experiment.py b/espfit/tests/test_app_analysis.py similarity index 51% rename from espfit/tests/test_app_experiment.py rename to espfit/tests/test_app_analysis.py index ecea804..9c783d7 100644 --- a/espfit/tests/test_app_experiment.py +++ b/espfit/tests/test_app_analysis.py @@ -1,33 +1,40 @@ import pytest from importlib.resources import files -from espfit.app.experiment import RNASystem +from espfit.app.analysis import RNASystem -def test_load_traj(): +@pytest.fixture +def _get_input_directory_path(): input_directory_path = files('espfit').joinpath('data/sampler') # PosixPath + return input_directory_path + + +def test_load_traj(_get_input_directory_path): + # TODO: Better test + input_directory_path = _get_input_directory_path data = RNASystem(input_directory_path=input_directory_path) data.load_traj(reference_pdb='solvated.pdb', trajectory_netcdf='traj.nc') - # TODO: Better test - return data + assert data.traj is not None -def test_compute_jcouplings_1(): - input_directory_path = files('espfit').joinpath('data/sampler') # PosixPath +def test_compute_jcouplings(_get_input_directory_path): + # TODO: Better test + 
input_directory_path = _get_input_directory_path
     data = RNASystem(input_directory_path=input_directory_path)
     data.load_traj(reference_pdb='solvated.pdb', trajectory_netcdf='traj.nc')
-    # TODO: Better test
-    return data
+    assert data.traj is not None
 
 
-def test_compute_jcouplings_1():
-    input_directory_path = files('espfit').joinpath('data/sampler')   # PosixPath
+def test_compute_jcouplings(_get_input_directory_path):
+    # TODO: Better test
+    input_directory_path = _get_input_directory_path
     data = RNASystem(input_directory_path=input_directory_path)
     data.load_traj()
     couplings = data.compute_jcouplings(couplings=['H1H2', 'H2H3', 'H3H4'])
-
-    # TODO: Better test
-    return couplings
+    assert couplings is not None
 
-def test_compute_jcouplings_2():
-    input_directory_path = files('espfit').joinpath('data/sampler')   # PosixPath
+
+def test_compute_jcouplings_all(_get_input_directory_path):
+    # TODO: Better test
+    input_directory_path = _get_input_directory_path
     data = RNASystem()
     data.input_directory_path = str(input_directory_path)
     data.load_traj()
-    couplings = data.compute_jcouplings(couplings=None)
+    couplings = data.compute_jcouplings()
 
-    # TODO: Better test
-    return couplings
\ No newline at end of file
+    assert couplings is not None
+    
\ No newline at end of file

From 7d67a3b4a6f4763a65f14217abc802cac15f163b Mon Sep 17 00:00:00 2001
From: kt
Date: Thu, 22 Feb 2024 14:18:00 -0500
Subject: [PATCH 30/59] fix minor bug to pass all tests

---
 espfit/app/analysis.py                 | 14 ++++++-
 espfit/app/sampler.py                  |  8 +---
 espfit/app/train.py                    | 11 +++++-
 espfit/data/config/config.toml         |  4 --
 espfit/tests/test_app_sampler.py       | 54 +++++++++++++-------------
 espfit/tests/test_app_train.py         | 50 +++++++++++++++---------
 espfit/tests/test_app_train_sampler.py | 29 +++++++++-----
 espfit/utils/sampler/reweight.py       | 12 +++---
 8 files changed, 107 insertions(+), 75 deletions(-)

diff --git a/espfit/app/analysis.py b/espfit/app/analysis.py
index c574fcd..5234e53 100644
--- a/espfit/app/analysis.py
+++ b/espfit/app/analysis.py
@@ -222,8 +222,18 @@ def compute_jcouplings(self, couplings=None, residues=None):
                 _values = values[:,i,:]   # Coupling values of i-th residue
                 values_by_names = dict()
                 for j, coupling_name in enumerate(couplings):
-                    avg = _values[:,j].mean()   # Mean value of H1H2 coupling of i-th residue
-                    std = _values[:,j].std()    # Standard deviation of H1H2 coupling of i-th residue
+                    avg = np.round(_values[:,j].mean(), 5)  # Mean value of the j-th coupling of the i-th residue
+                    std = np.round(_values[:,j].std(), 5)   # Standard deviation of the j-th coupling of the i-th residue
+
+                    # Function to replace np.nan with None
+                    replace_nan_with_none = lambda x: None if np.isscalar(x) and np.isnan(x) else x
+                    avg = replace_nan_with_none(avg)
+                    std = replace_nan_with_none(std)
+                    # Convert numpy.float to float to avoid serialization issues
+                    if avg is not None:
+                        avg = avg.item()
+                    if std is not None:
+                        std = std.item()
                     values_by_names[coupling_name] = {'avg': avg, 'std': std}
                 coupling_dict[resname] = values_by_names
 
diff --git a/espfit/app/sampler.py b/espfit/app/sampler.py
index 5c1fe75..c75a82c 100644
--- a/espfit/app/sampler.py
+++ b/espfit/app/sampler.py
@@ -38,7 +38,7 @@ class BaseSimulation(object):
     export_xml(exportSystem=True, exportState=True, exportIntegrator=True, output_directory_path=None):
         Export serialized system XML file and solvated pdb file.
     """
-    def __init__(self, maxIterations=100, nsteps=250000, atom_indices=None, neff_threshold=0.2,
+    def __init__(self, maxIterations=100, nsteps=250000, atom_indices=None,
                  checkpoint_frequency=25000, logging_frequency=250000, netcdf_frequency=250000,
                  output_directory_path=None, input_directory_path=None):
         """Initialize base simulation object.
@@ -54,9 +54,6 @@ def __init__(self, maxIterations=100, nsteps=250000, atom_indices=None, neff_thr
         atom_indices : list, default=None
             List of atom indices to save. If None, save all atoms except water and ions.
 
-        neff_threshold : float, default=0.2
-            Effective sample size threshold to rerun the simulation.
- checkpoint_frequency : int, default=25000 (1 ns) Frequency (in steps) at which to write checkpoint files. @@ -77,7 +74,6 @@ def __init__(self, maxIterations=100, nsteps=250000, atom_indices=None, neff_thr self.maxIterations = maxIterations self.nsteps = nsteps self.atom_indices = atom_indices - self.neff_threshold = neff_threshold self.checkpoint_frequency = checkpoint_frequency self.logging_frequency = logging_frequency self.netcdf_frequency = netcdf_frequency @@ -419,7 +415,7 @@ def from_toml(cls, filename, *args, **override_sampler_kwargs): Returns ------- samplers : list of SetupSampler instances - """ + """ import tomllib from espfit.utils.units import convert_string_to_unit from importlib.resources import files diff --git a/espfit/app/train.py b/espfit/app/train.py index bf5ec99..824b9c1 100644 --- a/espfit/app/train.py +++ b/espfit/app/train.py @@ -329,9 +329,15 @@ def train(self): def train_sampler(self, sampler_patience=800, neff_threshold=0.2): + """ + Train the Espaloma network model with sampler. - # sampler_kwargs: attributes supported by BaseSimulation + TODO + ---- + * Should `nsteps` be a variable when calling train_sampler? + * Should `sampler_patience` and `neff_threshold` be an instance variable of sampler.BaseSimulation? + """ import os import torch from espfit.utils.units import HARTREE_TO_KCALPERMOL @@ -378,7 +384,8 @@ def train_sampler(self, sampler_patience=800, neff_threshold=0.2): args = [epoch] override_sampler_kwargs = { "small_molecule_forcefield": "espfit/data/forcefield/espaloma-0.3.2.pt", # change this to local espaloma model - "output_directory_path": self.output_directory_path } + "output_directory_path": self.output_directory_path + } samplers = SamplerReweight.from_toml(self.configfile, *args, **override_sampler_kwargs) for sampler in samplers: _logger.info(f'Running simulation for {sampler.target_name} for {sampler.nsteps} steps...') diff --git a/espfit/data/config/config.toml b/espfit/data/config/config.toml index 0a219ed..31ec82c 100644 --- a/espfit/data/config/config.toml +++ b/espfit/data/config/config.toml @@ -23,7 +23,6 @@ epochs = 10 batch_size = 128 learning_rate = 1e-4 checkpoint_frequency = 1 -output_directory_path = "checkpoints" # System setup parameters @@ -36,13 +35,11 @@ ionic_strength = "0.08 * molar" # 80 mM NaCl temperature = "303.15 * kelvin" maxIterations = 100 nsteps = 1000 -neff_threshold = 0.2 checkpoint_frequency = 10 logging_frequency = 1 netcdf_frequency = 10 weight = 1 - # system setup parameters [[sampler.setup]] target_class = "nucleoside" @@ -53,7 +50,6 @@ ionic_strength = "0.08 * molar" # 80 mM NaCl temperature = "303.15 * kelvin" maxIterations = 100 nsteps = 1000 -neff_threshold = 0.2 checkpoint_frequency = 10 logging_frequency = 1 netcdf_frequency = 10 diff --git a/espfit/tests/test_app_sampler.py b/espfit/tests/test_app_sampler.py index b7ab1d3..09a44b1 100644 --- a/espfit/tests/test_app_sampler.py +++ b/espfit/tests/test_app_sampler.py @@ -15,7 +15,7 @@ def test_create_test_espaloma_system(tmpdir): ------- c : espfit.app.sampler.SetupSampler """ - biopolymer_file = files('espfit').joinpath('data/target/testsystems/nucleoside/pdbfixer_min.pdb') # PosixPath + biopolymer_file = files('espfit').joinpath('data/target/testsystems/nucleoside/target.pdb') # PosixPath c = SetupSampler(small_molecule_forcefield=ESPALOMA_FORCEFIELD, output_directory_path=str(tmpdir)) c.create_system(biopolymer_file=biopolymer_file) # Exports solvated system as pdb file automatically. 
@@ -31,7 +31,7 @@ def test_create_nucleoside_espaloma_system(tmpdir): ------- None """ - biopolymer_file = files('espfit').joinpath('data/target/testsystems/nucleoside/pdbfixer_min.pdb') + biopolymer_file = files('espfit').joinpath('data/target/testsystems/nucleoside/target.pdb') c = SetupSampler(small_molecule_forcefield=ESPALOMA_FORCEFIELD, output_directory_path=str(tmpdir)) c.create_system(biopolymer_file=biopolymer_file) @@ -100,7 +100,7 @@ def test_create_multi_protein_ligand_espaloma_system(tmpdir): c.create_system(biopolymer_file=biopolymer_file, ligand_file=ligand_file) -def test_export_system(test_create_test_espaloma_system, tmpdir): +def test_export_system(test_create_test_espaloma_system): """Test exporting the system to xml files. Parameters @@ -108,18 +108,15 @@ def test_export_system(test_create_test_espaloma_system, tmpdir): test_create_test_espaloma_system : espfit.app.sampler.SetupSampler Test system instance. - tmpdir : tmpdir fixture from pytest - Returns ------- None """ c = test_create_test_espaloma_system - c.output_directory_path = str(tmpdir) c.export_xml() + - -def test_export_system_change_outdir(test_create_test_espaloma_system, tmpdir): +def test_export_system_change_outdir(test_create_test_espaloma_system): """Test exporting the system to xml files. Change the output directory path and check if the new directory is created. @@ -129,18 +126,16 @@ def test_export_system_change_outdir(test_create_test_espaloma_system, tmpdir): test_create_test_espaloma_system : espfit.app.sampler.SetupSampler Test system instance. - tmpdir : tmpdir fixture from pytest - Returns ------- None """ + import os c = test_create_test_espaloma_system - old_outdir = c.output_directory_path - c.export_xml(output_directory_path=str(tmpdir.join('newdir'))) - new_outdir = c.output_directory_path + old_output_directory_path = c.output_directory_path + c.export_xml(output_directory_path=os.path.join(old_output_directory_path, 'newdir')) - assert old_outdir != new_outdir + assert old_output_directory_path != c.output_directory_path def test_minimize(test_create_test_espaloma_system): @@ -156,10 +151,14 @@ def test_minimize(test_create_test_espaloma_system): None """ c = test_create_test_espaloma_system - c.minimize(maxIterations=10) + old_maxIterations = c.maxIterations + c.maxIterations = 9 # change default + c.minimize() + + assert old_maxIterations != c.maxIterations -def test_standard_md(test_create_test_espaloma_system, tmpdir): +def test_standard_md(test_create_test_espaloma_system): """Test standard md simulation. Parameters @@ -172,12 +171,13 @@ def test_standard_md(test_create_test_espaloma_system, tmpdir): None """ c = test_create_test_espaloma_system - c.output_directory_path = str(tmpdir) - c.minimize(maxIterations=10) # Minimize the system before running the simulation to avoid Energy NaN. - c.run(nsteps=10) + c.maxIterations = 10 # update maxIterations to speed up the test + c.nsteps = 10 + c.minimize() # minimize the system before running the simulation to avoid Energy NaN. + c.run() -def test_create_system_from_xml(test_create_test_espaloma_system, tmpdir): +def test_create_system_from_xml(test_create_test_espaloma_system): """Test creating a system from loading existing xml files. Parameters @@ -185,20 +185,20 @@ def test_create_system_from_xml(test_create_test_espaloma_system, tmpdir): test_create_test_espaloma_system : espfit.app.sampler.SetupSampler Test system instance. 
- tmpdir : tmpdir fixture from pytest - Returns ------- None """ + import os + import glob + c = test_create_test_espaloma_system - c.output_directory_path = str(tmpdir) c.export_xml() - c2 = SetupSampler.from_xml(input_directory_path=str(tmpdir)) - c2.export_xml(output_directory_path=str(tmpdir)) + c2 = SetupSampler.from_xml(input_directory_path=c.output_directory_path) + c2.export_xml(output_directory_path=c.output_directory_path) # Check number of exported files. Check state.xml as a representative file. - import glob - n_files = len(glob.glob(str(tmpdir.join('state*.xml')))) + # If the same file exists, then suffix number will be added to the file name. + n_files = len(glob.glob(os.path.join(c.output_directory_path, 'state*.xml'))) assert n_files == 2 diff --git a/espfit/tests/test_app_train.py b/espfit/tests/test_app_train.py index d24fbd7..05d29d5 100644 --- a/espfit/tests/test_app_train.py +++ b/espfit/tests/test_app_train.py @@ -74,14 +74,17 @@ def test_train(test_load_dataset, test_create_espaloma_model, tmpdir): # Create temporary checkpoint directory checkpoint_directory = tmpdir.mkdir('checkpoints') # PosixPath + model.output_directory_path=str(checkpoint_directory) - # Train model - model.train(output_directory_path=str(checkpoint_directory)) + # Train model with arbitrary number of epochs and checkpoint frequency + model.epochs = 20 + model.checkpoint_frequency = 5 + model.train() # Test if the model has been trained n_checkpoints = len(glob.glob(str(checkpoint_directory.join('*.pt')))) - expected_n_checkpoints = int(model.config['espaloma']['train']['epochs']/model.config['espaloma']['train']['checkpoint_frequency']) - assert expected_n_checkpoints == n_checkpoints + expected_n_checkpoints = int(model.epochs/model.checkpoint_frequency) + assert expected_n_checkpoints == n_checkpoints == 4 # 20/5 = 4 def test_train_extend(test_load_dataset, test_create_espaloma_model, tmpdir): @@ -107,20 +110,24 @@ def test_train_extend(test_load_dataset, test_create_espaloma_model, tmpdir): # Create temporary checkpoint directory checkpoint_directory = tmpdir.mkdir('checkpoints') # PosixPath + model.output_directory_path=str(checkpoint_directory) - # Train model - model.train(output_directory_path=str(checkpoint_directory)) + # Train model with arbitrary number of epochs and checkpoint frequency + model.epochs = 10 + model.checkpoint_frequency = 2 + model.train() # Test if the model has been trained n_checkpoints = len(glob.glob(str(checkpoint_directory.join('*.pt')))) - expected_n_checkpoints = int(model.config['espaloma']['train']['epochs']/model.config['espaloma']['train']['checkpoint_frequency']) - assert n_checkpoints == expected_n_checkpoints + expected_n_checkpoints = int(model.epochs/model.checkpoint_frequency) + assert n_checkpoints == expected_n_checkpoints == 5 # 10/2 = 5 # Extend training - model.config['espaloma']['train']['epochs'] = 40 - model.train(output_directory_path=str(checkpoint_directory)) + model.epochs = 40 + model.train() n_checkpoints = len(glob.glob(str(checkpoint_directory.join('*.pt')))) - assert n_checkpoints == 4 + expected_n_checkpoints = int(model.epochs/model.checkpoint_frequency) + assert n_checkpoints == expected_n_checkpoints == 20 # 40/2 = 20 def test_train_extend_failure(test_load_dataset, test_create_espaloma_model, tmpdir): @@ -146,19 +153,24 @@ def test_train_extend_failure(test_load_dataset, test_create_espaloma_model, tmp # Create temporary checkpoint directory checkpoint_directory = tmpdir.mkdir('checkpoints') # PosixPath + 
model.output_directory_path=str(checkpoint_directory) # Train model - model.train(output_directory_path=str(checkpoint_directory)) + model.epochs = 20 + model.checkpoint_frequency = 10 + model.train() # Test if the model has been trained n_checkpoints = len(glob.glob(str(checkpoint_directory.join('*.pt')))) - expected_n_checkpoints = int(model.config['espaloma']['train']['epochs']/model.config['espaloma']['train']['checkpoint_frequency']) - assert n_checkpoints == expected_n_checkpoints + expected_n_checkpoints = int(model.epochs/model.checkpoint_frequency) + assert n_checkpoints == expected_n_checkpoints == 2 # 20/10 = 2 # Extend training - # This should fail to extend the training because the given new number of epoch (i.e. 10) is less than the + # The training should not extend because the given new number of epoch (i.e. 10) is less than the # last epoch of the checkpoint file (i.e. 20). - model.config['espaloma']['train']['epochs'] = 10 - model.train(output_directory_path=str(checkpoint_directory)) - n_checkpoints = len(glob.glob(str(checkpoint_directory.join('*.pt')))) - assert n_checkpoints == expected_n_checkpoints \ No newline at end of file + with pytest.raises(SystemExit) as excinfo: + model.epochs = 10 + model.train() + assert excinfo.value.code == 0 + #n_checkpoints = len(glob.glob(str(checkpoint_directory.join('*.pt')))) + #assert n_checkpoints == expected_n_checkpoints == 2 # 20/10 = 2 \ No newline at end of file diff --git a/espfit/tests/test_app_train_sampler.py b/espfit/tests/test_app_train_sampler.py index 222f86c..babf1bf 100644 --- a/espfit/tests/test_app_train_sampler.py +++ b/espfit/tests/test_app_train_sampler.py @@ -5,7 +5,7 @@ @pytest.fixture -def test_create_espaloma_model(): +def test_create_espaloma_from_toml(tmpdir): """Test function to load a TOML configuration file and create an EspalomaModel object. 
Returns @@ -15,6 +15,7 @@ def test_create_espaloma_model(): """ filename = files('espfit').joinpath('data/config/config.toml') # PosixPath model = EspalomaModel.from_toml(str(filename)) + model.output_directory_path = str(tmpdir) # Update output directory path return model @@ -51,17 +52,27 @@ def test_load_dataset(tmpdir): return ds -def test_train_sampler(test_load_dataset, test_create_espaloma_model): +def test_train_sampler(test_load_dataset, test_create_espaloma_from_toml): + + """ + TODO + ---- + + * sampler.py needs to support loading temporary espaloma model during training + """ # Load dataset and model ds = test_load_dataset - model = test_create_espaloma_model - model.epochs = 50 + model = test_create_espaloma_from_toml + + # Set espaloma parameters model.dataset_train = ds + model.epochs = 10 # Train - small_molecule_forcefield=files('espfit').joinpath('data/forcefield/espaloma-0.3.2.pt') - biopolymer_file = files('espfit').joinpath('data/target/testsystems/nucleoside/pdbfixer_min.pdb') # PosixPath - output_directory_path = 'examples/checkpoints_sampler' - model.train_sampler(biopolymer_file=biopolymer_file, sampler_patience=3, maxIterations=100, nsteps=1000, neff_threshold=0.2, output_directory_path=output_directory_path, small_molecule_forcefield=str(small_molecule_forcefield)) - raise ValueError('This test is not yet implemented.') + model.train_sampler(sampler_patience=3, neff_threshold=0.2) # fails if sampler_patience is < epochs + + # Check outputs + import glob + #assert len(glob.glob(model.output_directory_path + '/*')) > 0 + #assert model.sampler is not None \ No newline at end of file diff --git a/espfit/utils/sampler/reweight.py b/espfit/utils/sampler/reweight.py index f1d7a53..47a6983 100644 --- a/espfit/utils/sampler/reweight.py +++ b/espfit/utils/sampler/reweight.py @@ -23,21 +23,21 @@ def _compute_observable(self): from espfit.app.analysis import RNASystem target = RNASystem() target.load_traj(input_directory_path=self.output_directory_path) - obs_calc = target.compute_jcouplings() - _logger.info(f'Computed observable: {obs_calc}') + val = target.compute_jcouplings() + _logger.info(f'Computed observable: {val}') else: raise NotImplementedError(f'Observable for {self.target_class} is not implemented.') import yaml - with open(os.path.join(self.output_directory_path, 'observable.yaml'), 'w') as f: - yaml.dump(obs_calc, f, allow_unicode=True) + with open(os.path.join(self.output_directory_path, 'pred.yaml'), 'w') as f: + yaml.dump(val, f, allow_unicode=True) - return obs_calc + return val def compute_loss(self): # Compute experimental observable - obs_calc = self._compute_observable() + val = self._compute_observable() _logger.info(f'Compute loss') # Compute loss From cbcd34385858883e0608cc60eba5b2af9637a370 Mon Sep 17 00:00:00 2001 From: kt Date: Fri, 23 Feb 2024 15:23:26 -0500 Subject: [PATCH 31/59] run sampler using on-the-fly espaloma model created during training --- espfit/app/train.py | 403 ++++++++++++++++--------- espfit/tests/test_app_train_sampler.py | 4 +- 2 files changed, 268 insertions(+), 139 deletions(-) diff --git a/espfit/app/train.py b/espfit/app/train.py index 824b9c1..f64ecfb 100644 --- a/espfit/app/train.py +++ b/espfit/app/train.py @@ -3,118 +3,27 @@ TODO ---- +* Export loss to a file (e.g. LossReporter class?) * Add support to use multiple GPUs -* Add support to validate model? (or use independent script?) -* Add support to save model? (or use independent script?) 
* Improve how data are parsed using dataclasses or pydantic """ import os +import torch import logging _logger = logging.getLogger(__name__) -class EspalomaModel(object): - """Espaloma network model and training modules. - - Methods - ------- - from_toml(filename): - Load espaloma configuration file in TOML format. - - Examples - -------- - >>> from espfit.app.train import EspalomaModel - >>> filename = 'espfit/data/config/config.toml' - >>> # create espaloma network model from toml file - >>> model = EspalomaModel.from_toml(filename) - >>> # check espaloma network model - >>> model.net - >>> # load training dataset - >>> model.dataset_train = ds - >>> model.train() - """ - - def __init__(self, net=None, dataset_train=None, dataset_validation=None, dataset_test=None, random_seed=2666, output_directory_path=None, - epochs=1000, batch_size=128, learning_rate=1e-4, checkpoint_frequency=10): - """Initialize an instance of the class with an Espaloma network model and a random seed. - - This constructor method sets up the Espaloma network model, the training, validation, test datasets, - a configuratino file, and the random seed that will be used throughout the training process. - If no model or datasets are provided, the corresponding attributes will be set to None. If no random seed is - provided, the `random_seed` attribute will be set to 2666. - - Parameters - ---------- - net : torch.nn.Sequential, default=None - The Espaloma network model to be used for training. - - dataset_train : espfit.utils.data.graphs.CustomGraphDataset or espaloma.data.dataset.GraphDataset, default=None - The training dataset. espaloma.graphs.graph.Graph. If not provided, the `train_data` attribute will be set to None. - - dataset_validation : espfit.utils.data.graphs.CustomGraphDataset or espaloma.data.dataset.GraphDataset, default=None - The validation dataset. If not provided, the `validation_data` attribute will be set to None. - - dataset_test : Dataset, espfit.utils.data.graphs.CustomGraphDataset or espaloma.data.dataset.GraphDataset, default=None - The test dataset. If not provided, the `test_data` attribute will be set to None. - - random_seed : int, default=2666 - The random seed used throughout the espaloma training. - - output_directory_path : str, default=None - The directory where the model checkpoints should be saved. - If not provided, the checkpoints will be saved in the current working directory. - - epochs : int, default=1000 - The number of epochs to train the model for. - - batch_size : int, default=128 - The number of samples per batch. - - learning_rate : float, default=1e-4 - The learning rate for the optimizer. - - checkpoint_frequency : int, default=10 - The frequency at which the model should be saved. 
- """ - self.dataset_train = dataset_train - self.dataset_validation = dataset_validation - self.dataset_test = dataset_test - self.net = net - self.random_seed = random_seed - self.epochs = epochs - self.batch_size = batch_size - self.learning_rate = learning_rate - self.checkpoint_frequency = checkpoint_frequency - self.restart_epoch = 0 - self.configfile = None - if output_directory_path is None: - output_directory_path = os.getcwd() - self.output_directory_path = output_directory_path - +class EspalomaBase(object): + def __init__(self): # Check if GPU is available - import torch if torch.cuda.is_available(): _logger.info('GPU is available for training.') else: _logger.info('GPU is not available for training.') # Check torch data type - _logger.info(f'Torch data type is {torch.get_default_dtype()}') - - - @property - def output_directory_path(self): - """Get output directory path.""" - return self._output_directory_path - - - @output_directory_path.setter - def output_directory_path(self, value): - """Set output directory path.""" - self._output_directory_path = value - # Create output directory if it does not exist - os.makedirs(value, exist_ok=True) + _logger.debug(f'Torch data type is {torch.get_default_dtype()}') @classmethod @@ -158,14 +67,9 @@ def from_toml(cls, filename): @staticmethod - def create_model(espaloma_config): - """Create an Espaloma network model using the provided configuration. - - This function constructs a PyTorch Sequential model with two stages of Graph Neural Network (GNN) layers, - JanossyPooling readout layers for various features, and additional layers for energy computation and loss calculation. - The specifics of the GNN layers and the readout layers are controlled by the `espaloma_config` dictionary. - If a CUDA-compatible GPU is available, the model is moved to the GPU before being returned. - + def _get_base_module(espaloma_config): + """Create base modules for Espaloma network model. + Parameters ---------- espaloma_config : dict @@ -175,11 +79,12 @@ def create_model(espaloma_config): Returns ------- - torch.nn.Sequential - The constructed Espaloma network model. + list + A list of modules for the Espaloma network model. """ + import espaloma as esp - + # GNN gnn_method = 'SAGEConv' gnn_options = {} @@ -214,30 +119,203 @@ def create_model(espaloma_config): # Improper torsions (multiplicity n=2) readout_improper = esp.nn.readout.janossy.JanossyPoolingWithSmirnoffImproper(in_features=units, config=config_2, out_features={"k": 2}) - # Get loss weights - # TODO: Better way to initialize weights? + # Initialize loss weights and update if provided weights = { 'energy': 1.0, 'force': 1.0, 'charge': 1.0, 'torsion': 1.0, 'improper': 1.0 } if 'weights' in espaloma_config.keys(): for key in espaloma_config['weights'].keys(): weights[key] = espaloma_config['weights'][key] - # Define espaloma architecture - import torch + # Append base modules + modules = [] + modules.append(representation) + modules.append(readout) + modules.append(readout_improper) + modules.append(esp.nn.readout.janossy.ExpCoefficients()) + modules.append(esp.nn.readout.charge_equilibrium.ChargeEquilibrium()) + + return modules, weights + + + @staticmethod + def create_model(espaloma_config): + """Create an Espaloma network model using the provided configuration. + + This function constructs a PyTorch Sequential model with two stages of Graph Neural Network (GNN) layers, + JanossyPooling readout layers for various features, and additional layers for energy computation and loss calculation. 
+ The specifics of the GNN layers and the readout layers are controlled by the `espaloma_config` dictionary. + If a CUDA-compatible GPU is available, the model is moved to the GPU before being returned. + + Parameters + ---------- + espaloma_config : dict + A dictionary containing the configuration for the Espaloma network. + This includes the method and options for the GNN layers, the configurations for the two stages of the network, + and optionally the weights for different loss components. + + Returns + ------- + torch.nn.Sequential + The constructed Espaloma network model. + """ + import espaloma as esp from espfit.utils.espaloma.module import GetLoss - net = torch.nn.Sequential( - representation, - readout, - readout_improper, - esp.nn.readout.janossy.ExpCoefficients(), - esp.nn.readout.charge_equilibrium.ChargeEquilibrium(), - esp.mm.geometry.GeometryInGraph(), - esp.mm.energy.EnergyInGraph(terms=["n2", "n3", "n4", "n4_improper"]), - GetLoss(weights), - ) + + # Get base model + modules, weights = EspalomaBase._get_base_module(espaloma_config) + + # Define espaloma architecture + modules.append(esp.mm.geometry.GeometryInGraph()) + modules.append(esp.mm.energy.EnergyInGraph(terms=["n2", "n3", "n4", "n4_improper"])) + modules.append(GetLoss(weights)) + + # Create model + net = torch.nn.Sequential(*modules) if torch.cuda.is_available(): return net.cuda() else: return net + + + def save_model(self, net=None, best_model=None, model_name='espaloma.pt', output_directory_path=None): + """Save the Espaloma network model to a file. + + This method saves the Espaloma network model to a file in the specified output directory. + + Parameters + ---------- + net : torch.nn.Sequential + The Espaloma network model to be saved. + + best_model : str + The path to the best model file. + + model_name : str, default='espaloma.pt' + The name of the file to save the model to. + + output_directory_path : str, default=None + The directory where the model should be saved. + If not provided, the model will be saved in the current working directory. + + Returns + ------- + None + """ + import espaloma as esp + + if output_directory_path is not None: + os.makedirs(output_directory_path, exist_ok=True) + else: + output_directory_path = os.getcwd() + + if net: + modules = [] + for module in net: + if isinstance(module, esp.mm.geometry.GeometryInGraph): + break + modules.append(module) + modules.append(esp.nn.readout.janossy.LinearMixtureToOriginal()) + net = torch.nn.Sequential(*modules) + else: + raise ValueError('No model provided.') + + # Save model + state_dict = torch.load(best_model, map_location=torch.device('cpu')) + net.load_state_dict(state_dict) + torch.save(net, os.path.join(output_directory_path, model_name)) + + +class EspalomaModel(EspalomaBase): + """Espaloma network model and training modules. + + Methods + ------- + from_toml(filename): + Load espaloma configuration file in TOML format. 
+
+    Examples
+    --------
+    >>> from espfit.app.train import EspalomaModel
+    >>> filename = 'espfit/data/config/config.toml'
+    >>> # create espaloma network model from toml file
+    >>> model = EspalomaModel.from_toml(filename)
+    >>> # check espaloma network model
+    >>> model.net
+    >>> # load training dataset
+    >>> model.dataset_train = ds
+    >>> model.train()
+    """
+
+    def __init__(self, net=None, dataset_train=None, dataset_validation=None, dataset_test=None,
+                 epochs=1000, batch_size=128, learning_rate=1e-4, checkpoint_frequency=10,
+                 random_seed=2666, output_directory_path=None):
+        """Initialize an instance of the class with an Espaloma network model and a random seed.
+
+        This constructor method sets up the Espaloma network model, the training, validation, test datasets,
+        a configuration file, and the random seed that will be used throughout the training process.
+        If no model or datasets are provided, the corresponding attributes will be set to None. If no random seed is
+        provided, the `random_seed` attribute will be set to 2666.
+
+        Parameters
+        ----------
+        net : torch.nn.Sequential, default=None
+            The Espaloma network model to be used for training.
+
+        dataset_train : espfit.utils.data.graphs.CustomGraphDataset or espaloma.data.dataset.GraphDataset, default=None
+            The training dataset. espaloma.graphs.graph.Graph. If not provided, the `train_data` attribute will be set to None.
+
+        dataset_validation : espfit.utils.data.graphs.CustomGraphDataset or espaloma.data.dataset.GraphDataset, default=None
+            The validation dataset. If not provided, the `validation_data` attribute will be set to None.
+
+        dataset_test : Dataset, espfit.utils.data.graphs.CustomGraphDataset or espaloma.data.dataset.GraphDataset, default=None
+            The test dataset. If not provided, the `test_data` attribute will be set to None.
+
+        epochs : int, default=1000
+            The number of epochs to train the model for.
+
+        batch_size : int, default=128
+            The number of samples per batch.
+
+        learning_rate : float, default=1e-4
+            The learning rate for the optimizer.
+
+        checkpoint_frequency : int, default=10
+            The frequency at which the model should be saved.
+
+        random_seed : int, default=2666
+            The random seed used throughout the espaloma training.
+
+        output_directory_path : str, default=None
+            The directory where the model checkpoints should be saved.
+            If not provided, the checkpoints will be saved in the current working directory.
+        """
+        super(EspalomaModel, self).__init__()   # run EspalomaBase.__init__ (GPU availability check)
+        self.net = net
+        self.dataset_train = dataset_train
+        self.dataset_validation = dataset_validation
+        self.dataset_test = dataset_test
+        self.epochs = epochs
+        self.batch_size = batch_size
+        self.learning_rate = learning_rate
+        self.checkpoint_frequency = checkpoint_frequency
+        self.restart_epoch = 0
+        self.random_seed = random_seed
+        if output_directory_path is None:
+            output_directory_path = os.getcwd()
+        self.output_directory_path = output_directory_path
+
+
+    @property
+    def output_directory_path(self):
+        """Get output directory path."""
+        return self._output_directory_path
+
+
+    @output_directory_path.setter
+    def output_directory_path(self, value):
+        """Set output directory path."""
+        self._output_directory_path = value
+        # Create output directory if it does not exist
+        os.makedirs(value, exist_ok=True)
 
 
     def _load_checkpoint(self):
         """Load the last checkpoint and restart the training process.
 
         This method finds all the checkpoint files in the output directory, loads the
         last checkpoint (e.g. net100.pt), and restarts the training process from the next step.
         If no checkpoint files are found, the training process starts from the first step.
 
         Returns
         -------
         int
             The step from which the training process should be restarted.
""" - import os import sys import glob - import torch checkpoints = glob.glob("{}/*.pt".format(self.output_directory_path)) @@ -292,8 +368,6 @@ def train(self): ------- None """ - import os - import torch from espfit.utils.units import HARTREE_TO_KCALPERMOL if self.dataset_train is None: @@ -328,21 +402,29 @@ def train(self): torch.save(self.net.state_dict(), checkpoint_file) - def train_sampler(self, sampler_patience=800, neff_threshold=0.2): + def train_sampler(self, sampler_patience=800, neff_threshold=0.2, debug=False): """ Train the Espaloma network model with sampler. TODO ---- + * Export loss to a file (e.g. LossReporter class?) * Should `nsteps` be a variable when calling train_sampler? * Should `sampler_patience` and `neff_threshold` be an instance variable of sampler.BaseSimulation? """ - import os - import torch from espfit.utils.units import HARTREE_TO_KCALPERMOL from espfit.utils.sampler.reweight import SamplerReweight + + # Note: RuntimeError will be raised if copy.deepcopy is used. + # RuntimeError: one of the variables needed for gradient computation has been modified by an inplace + # operation: [torch.cuda.FloatTensor [512, 1]], which is output 0 of AsStridedBackward0, is at version 2; + # expected version 1 instead. Hint: the backtrace further above shows the operation that failed to + # compute its gradient. The variable in question was changed in there or anywhere later. Good luck! + import copy + net_local = copy.deepcopy(self.net) + self.sampler_patience = sampler_patience self.neff_threshold = neff_threshold @@ -361,18 +443,23 @@ def train_sampler(self, sampler_patience=800, neff_threshold=0.2): with torch.autograd.set_detect_anomaly(True): for i in range(self.restart_epoch, self.epochs): epoch = i + 1 # Start from 1 (not zero-indexing) + loss = torch.tensor(0.0) if torch.cuda.is_available(): loss = loss.cuda("cuda:0") + for g in ds_tr_loader: optimizer.zero_grad() if torch.cuda.is_available(): g = g.to("cuda:0") g.nodes["n1"].data["xyz"].requires_grad = True - loss += self.net(g) + loss += self.net(g) # Return each loss component? # Run sampling if epoch > self.sampler_patience: + # Compute effective sample size + #neff = xxx + if neff < self.neff_threshold: # Get Effective sample size if neff < 0: @@ -380,12 +467,26 @@ def train_sampler(self, sampler_patience=800, neff_threshold=0.2): else: _logger.info(f'Effective sample size ({neff}) below threshold ({self.neff_threshold}).') - # Create sampler system from configuration file. Returns list of systems. + # Save espaloma model + self._save_local_model(epoch) + local_model = os.path.join(self.output_directory_path, f"checkpoint{epoch}.pt") + self.save_model(net=net_local, best_model=local_model, model_name=f"net{epoch}.pt", output_directory_path=self.output_directory_path) + + # Define sampler settings to force the use of local espaloma model args = [epoch] + if debug == True: + # DEBUG PURPOSE + from importlib.resources import files + small_molecule_forcefield = str(files('espfit').joinpath("data/forcefield/espaloma-0.3.2.pt")) + else: + small_molecule_forcefield = os.path.join(self.output_directory_path, f"net{epoch}.pt") + override_sampler_kwargs = { - "small_molecule_forcefield": "espfit/data/forcefield/espaloma-0.3.2.pt", # change this to local espaloma model + "small_molecule_forcefield": small_molecule_forcefield, "output_directory_path": self.output_directory_path } + + # Create sampler system from configuration file. Returns list of systems. 
samplers = SamplerReweight.from_toml(self.configfile, *args, **override_sampler_kwargs) for sampler in samplers: _logger.info(f'Running simulation for {sampler.target_name} for {sampler.nsteps} steps...') @@ -401,17 +502,45 @@ def train_sampler(self, sampler_patience=800, neff_threshold=0.2): loss.backward() optimizer.step() + # Report loss? + #self.export_loss(loss, epoch) + if epoch % self.checkpoint_frequency == 0: # Note: returned loss is a joint loss of different units. _loss = HARTREE_TO_KCALPERMOL * loss.pow(0.5).item() _logger.info(f'epoch {epoch}: {_loss:.3f}') - checkpoint_file = os.path.join(self.output_directory_path, f"net{epoch}.pt") - torch.save(self.net.state_dict(), checkpoint_file) + self._save_local_model(epoch) - def validate(): - raise NotImplementedError + def _save_local_model(self, epoch): + """Save local model. + Parameters + ---------- + epoch : int + The epoch number. + + Returns + ------- + None + """ + checkpoint_file = os.path.join(self.output_directory_path, f"checkpoint{epoch}.pt") + torch.save(self.net.state_dict(), checkpoint_file) + + + def report_loss(self, loss, epoch): + """Report loss. - def save_model(): - raise NotImplementedError \ No newline at end of file + Parameters + ---------- + loss : float + The loss value. + + epoch : int + The epoch number. + + Returns + ------- + None + """ + pass \ No newline at end of file diff --git a/espfit/tests/test_app_train_sampler.py b/espfit/tests/test_app_train_sampler.py index babf1bf..e0747a1 100644 --- a/espfit/tests/test_app_train_sampler.py +++ b/espfit/tests/test_app_train_sampler.py @@ -70,9 +70,9 @@ def test_train_sampler(test_load_dataset, test_create_espaloma_from_toml): model.epochs = 10 # Train - model.train_sampler(sampler_patience=3, neff_threshold=0.2) # fails if sampler_patience is < epochs + model.train_sampler(sampler_patience=3, neff_threshold=0.2, debug=True) # fails if sampler_patience is < epochs # Check outputs - import glob + #import glob #assert len(glob.glob(model.output_directory_path + '/*')) > 0 #assert model.sampler is not None \ No newline at end of file From ff0468213ada48b9744933f06247ebbb94a29e4a Mon Sep 17 00:00:00 2001 From: kt Date: Mon, 26 Feb 2024 10:06:37 -0500 Subject: [PATCH 32/59] export loss per epoch to reporter.log --- espfit/app/train.py | 53 +++++++++++++++++++++++---------- espfit/utils/espaloma/module.py | 16 ++++++++-- 2 files changed, 51 insertions(+), 18 deletions(-) diff --git a/espfit/app/train.py b/espfit/app/train.py index f64ecfb..d0053b6 100644 --- a/espfit/app/train.py +++ b/espfit/app/train.py @@ -27,7 +27,7 @@ def __init__(self): @classmethod - def from_toml(cls, filename): + def from_toml(cls, filename, **override_espalomamodel_kwargs): """Create an instance of the class from a TOML configuration file. 
This method reads a TOML file specified by `filename`, extracts the 'espaloma'
@@ -61,7 +61,17 @@ def from_toml(cls, filename):
 
         # Update training settings
         for key, value in config['espaloma']['train'].items():
-            setattr(model, key, value)
+            if hasattr(model, key):
+                setattr(model, key, value)
+            else:
+                raise ValueError(f'Invalid attribute {key}.')
+
+        # Override training settings
+        for key, value in override_espalomamodel_kwargs.items():
+            if hasattr(model, key):
+                setattr(model, key, value)
+            else:
+                raise ValueError(f'Invalid attribute {key}.')
 
         return model
 
@@ -438,6 +448,7 @@ def train_sampler(self, sampler_patience=800, neff_threshold=0.2, debug=False):
         neff = -1
 
         # Train
+        loss_trajectory = {}
         ds_tr_loader = self.dataset_train.view(collate_fn='graph', batch_size=self.batch_size, shuffle=True)
         optimizer = torch.optim.Adam(self.net.parameters(), lr=self.learning_rate)
         with torch.autograd.set_detect_anomaly(True):
@@ -452,8 +463,13 @@ def train_sampler(self, sampler_patience=800, neff_threshold=0.2, debug=False):
                 optimizer.zero_grad()
                 if torch.cuda.is_available():
                     g = g.to("cuda:0")
-                g.nodes["n1"].data["xyz"].requires_grad = True
-                loss += self.net(g)   # Return each loss component?
+                g.nodes["n1"].data["xyz"].requires_grad = True
+
+                # Forward pass
+                # Note that returned values are weighted losses.
+                _loss, loss_dict = self.net(g)
+                # Append loss
+                loss += _loss
 
             # Run sampling
             if epoch > self.sampler_patience:
@@ -495,22 +511,28 @@ def train_sampler(self, sampler_patience=800, neff_threshold=0.2, debug=False):
 
                 # Compute MD loss
                 _logger.info(f'Compute sampler loss.')
-                for sampler in samplers:
-                    loss += sampler.compute_loss() * sampler.weight
+                for sampler_index, sampler in enumerate(samplers):
+                    loss_sampler = sampler.compute_loss() * sampler.weight
+                    loss += loss_sampler
+                    loss_dict[f'sampler{sampler_index}'] = loss_sampler.item()
+
+                # Append individual loss to loss_trajectory
+                loss_trajectory[epoch] = loss_dict
 
             # Update weights
             loss.backward()
             optimizer.step()
 
-            # Report loss?
-            #self.export_loss(loss, epoch)
-
             if epoch % self.checkpoint_frequency == 0:
                 # Note: returned loss is a joint loss of different units.
                 _loss = HARTREE_TO_KCALPERMOL * loss.pow(0.5).item()
                 _logger.info(f'epoch {epoch}: {_loss:.3f}')
                 self._save_local_model(epoch)
 
+        # Export loss trajectory
+        _logger.info(f'Export loss trajectory to a file.')
+        self.report_loss(loss_trajectory)
+
 
     def _save_local_model(self, epoch):
         """Save local model.
 
@@ -528,19 +550,18 @@ def _save_local_model(self, epoch):
         torch.save(self.net.state_dict(), checkpoint_file)
 
 
-    def report_loss(self, loss, epoch):
+    def report_loss(self, loss_trajectory):
         """Report loss.
 
         Parameters
         ----------
-        loss : float
-            The loss value.
-
-        epoch : int
-            The epoch number.
+        loss_trajectory : dict
+            The loss trajectory that stores individual weighted losses for each epoch.
 
         Returns
         -------
         None
         """
-        pass
\ No newline at end of file
+        import pandas as pd
+        df = pd.DataFrame.from_dict(loss_trajectory, orient='index')
+        df.to_csv(os.path.join(self.output_directory_path, 'reporter.log'), sep='\t', float_format='%.4f')
diff --git a/espfit/utils/espaloma/module.py b/espfit/utils/espaloma/module.py
index e3d27b4..1e5ea18 100644
--- a/espfit/utils/espaloma/module.py
+++ b/espfit/utils/espaloma/module.py
@@ -166,7 +166,11 @@ def forward(self, g):
 
         Returns
         -------
-        loss : torch.Tensor
+        loss : torch.Tensor
+            Total weighted loss
+
+        loss_dict : dict
+            Dictionary of individual weighted losses
         """
         loss_energy = self.compute_energy_loss(g) * self.weights['energy']
         loss_force = self.compute_force_loss(g) * self.weights['force']
@@ -180,5 +184,13 @@ def forward(self, g):
         _logger.debug(f"energy: {loss_energy:.5f}, force: {loss_force:.5f}, charge: {loss_charge:.5f}, torsion: {loss_torsion:.5f}, improper: {loss_improper:.5f}")
 
         loss = loss_energy + loss_force + loss_charge + loss_torsion + loss_improper
+
+        loss_dict = {
+            'energy': loss_energy.item(),
+            'force': loss_force.item(),
+            'charge': loss_charge.item(),
+            'torsion': loss_torsion.item(),
+            'improper': loss_improper.item(),
+        }
 
-        return loss
\ No newline at end of file
+        return loss, loss_dict
\ No newline at end of file
From 43f7e9940c458982f8ec04b711f1d86b723a06d7 Mon Sep 17 00:00:00 2001
From: kt
Date: Wed, 28 Feb 2024 13:09:24 -0500
Subject: [PATCH 33/59] use j-coupling names for keyname

---
 .../nucleoside/adenosine/experiment.yml       | 24 ++++++++++++-------
 .../target/nucleoside/cytidine/experiment.yml | 24 ++++++++++++-------
 .../nucleoside/guanosine/experiment.yml       | 24 ++++++++++++-------
 .../target/nucleoside/uridine/experiment.yml  | 24 ++++++++++++-------
 4 files changed, 64 insertions(+), 32 deletions(-)

diff --git a/espfit/data/target/nucleoside/adenosine/experiment.yml b/espfit/data/target/nucleoside/adenosine/experiment.yml
index c4ffa09..b7efedf 100644
--- a/espfit/data/target/nucleoside/adenosine/experiment.yml
+++ b/espfit/data/target/nucleoside/adenosine/experiment.yml
@@ -15,35 +15,43 @@ experiment_1:
   smiles:
   measurement:
     resi_1:
-      beta_1:
+      1H5P:
+        name: beta_1
         value:
         operator:
         error:
-      beta_2:
+      2H5P:
+        name: beta_2
         value:
         operator:
         error:
-      gamma_1:
+      1H5H4:
+        name: gamma_1
         value:
         operator:
         error:
-      gamma_2:
+      2H5H4:
+        name: gamma_2
         value:
         operator:
         error:
-      epsilon:
+      H3P:
+        name: epsilon
         value:
         operator:
         error:
-      nu_1:
+      H1H2:
+        name: nu_1
         value: 6.0
         operator:
         error:
-      nu_2:
+      H2H3:
+        name: nu_2
         value: 5.1
         operator:
         error:
-      nu_3:
+      H3H4:
+        name: nu_3
         value: 3.5
         operator:
         error:
diff --git a/espfit/data/target/nucleoside/cytidine/experiment.yml b/espfit/data/target/nucleoside/cytidine/experiment.yml
index b0354a3..e6bbbe6 100644
--- a/espfit/data/target/nucleoside/cytidine/experiment.yml
+++ b/espfit/data/target/nucleoside/cytidine/experiment.yml
@@ -15,35 +15,43 @@ experiment_1:
   smiles:
   measurement:
     resi_1:
-      beta_1:
+      1H5P:
+        name: beta_1
         value:
         operator:
         error:
-      beta_2:
+      2H5P:
+        name: beta_2
         value:
         operator:
         error:
-      gamma_1:
+      1H5H4:
+        name: gamma_1
         value:
         operator:
         error:
-      gamma_2:
+      2H5H4:
+        name: gamma_2
         value:
         operator:
         error:
-      epsilon:
+      H3P:
+        name: epsilon
         value:
         operator:
         error:
-      nu_1:
+      H1H2:
+        name: nu_1
         value: 4.02
         operator:
         error:
-      nu_2:
+      H2H3:
+        name: nu_2
         value: 5.49
         operator:
         error:
-      nu_3:
+      H3H4:
+        name: nu_3
         value: 6.15
         operator:
         error:
diff --git a/espfit/data/target/nucleoside/guanosine/experiment.yml b/espfit/data/target/nucleoside/guanosine/experiment.yml
index a313122..46bcbc4 100644
--- a/espfit/data/target/nucleoside/guanosine/experiment.yml
+++ b/espfit/data/target/nucleoside/guanosine/experiment.yml
@@ -15,35 +15,43 @@ experiment_1:
   smiles:
   measurement:
     resi_1:
-      beta_1:
+      1H5P:
+        name: beta_1
         value:
         operator:
         error:
-      beta_2:
+      2H5P:
+        name: beta_2
         value:
         operator:
         error:
-      gamma_1:
+      1H5H4:
+        name: gamma_1
         value:
         operator:
         error:
-      gamma_2:
+      2H5H4:
+        name: gamma_2
         value:
         operator:
         error:
-      epsilon:
+      H3P:
+        name: epsilon
         value:
         operator:
         error:
-      nu_1:
+      H1H2:
+        name: nu_1
         value: 5.9
         operator:
         error:
-      nu_2:
+      H2H3:
+        name: nu_2
         value: 5.3
         operator:
         error:
-      nu_3:
+      H3H4:
+        name: nu_3
         value: 4.1
         operator:
         error:
diff --git a/espfit/data/target/nucleoside/uridine/experiment.yml b/espfit/data/target/nucleoside/uridine/experiment.yml
index a75185c..47ba261 100644
--- a/espfit/data/target/nucleoside/uridine/experiment.yml
+++ b/espfit/data/target/nucleoside/uridine/experiment.yml
@@ -15,35 +15,43 @@ experiment_1:
   smiles:
   measurement:
     resi_1:
-      beta_1:
+      1H5P:
+        name: beta_1
         value:
         operator:
         error:
-      beta_2:
+      2H5P:
+        name: beta_2
         value:
         operator:
         error:
-      gamma_1:
+      1H5H4:
+        name: gamma_1
         value:
         operator:
         error:
-      gamma_2:
+      2H5H4:
+        name: gamma_2
         value:
         operator:
         error:
-      epsilon:
+      H3P:
+        name: epsilon
         value:
         operator:
         error:
-      nu_1:
+      H1H2:
+        name: nu_1
         value: 4.59
         operator:
         error:
-      nu_2:
+      H2H3:
+        name: nu_2
         value: 5.45
         operator:
         error:
-      nu_3:
+      H3H4:
+        name: nu_3
         value: 5.77
         operator:
         error:
From bdb143384a5ff46ae6c4593e00f6dc4b19e3b7da Mon Sep 17 00:00:00 2001
From: kt
Date: Wed, 28 Feb 2024 13:10:08 -0500
Subject: [PATCH 34/59] add support to compute sampler loss

---
 espfit/utils/sampler/reweight.py | 60 ++++++++++++++++++++++++++------
 1 file changed, 50 insertions(+), 10 deletions(-)

diff --git a/espfit/utils/sampler/reweight.py b/espfit/utils/sampler/reweight.py
index 47a6983..672002d 100644
--- a/espfit/utils/sampler/reweight.py
+++ b/espfit/utils/sampler/reweight.py
@@ -5,16 +5,28 @@
 _logger = logging.getLogger(__name__)
 
 
-class SamplerReweight(SetupSampler, BaseSimulation):
+class SetupSamplerReweight(SetupSampler, BaseSimulation):
 
     def __init__(self, weight=1, **kwargs):
         super().__init__(**kwargs)
         self.weight = weight
+
+
+    def _get_experiment_data(self, target_class, target_name):
+        import yaml
+        from importlib.resources import files
+
+        yaml_file = str(files('espfit').joinpath(f'data/target/{target_class}/{target_name}/experiment.yml'))
+        with open(yaml_file, 'r') as f:
+            d = yaml.safe_load(f)
+
+        # {'resi_1': {'1H5P': {'name': 'beta_1', 'value': None, 'operator': None, 'error': None}}}
+        return d['experiment_1']['measurement']
 
 
     def get_effective_sample_size(self):
         # Compute effective sample size
-        neff = 0.5
+        neff = 0.5
         return neff
 
 
@@ -23,25 +35,53 @@ def _compute_observable(self):
         if self.target_class == 'nucleoside':
             from espfit.app.analysis import RNASystem
             target = RNASystem()
             target.load_traj(input_directory_path=self.output_directory_path)
-            val = target.compute_jcouplings()
-            _logger.info(f'Computed observable: {val}')
+            pred = target.compute_jcouplings()
+            #_logger.debug(f'Computed observable: {pred}')
         else:
             raise NotImplementedError(f'Observable for {self.target_class} is not implemented.')
 
         import yaml
         with open(os.path.join(self.output_directory_path, 'pred.yaml'), 'w') as f:
-            yaml.dump(val, f, allow_unicode=True)
+            yaml.dump(pred, f, allow_unicode=True)
 
-        return val
+        return pred
 
 
     def compute_loss(self):
         # Compute experimental observable
-        val = self._compute_observable()
-        _logger.info(f'Compute loss')
+        exp = self._get_experiment_data(self.target_class, self.target_name)
+        pred = self._compute_observable()
+
+        loss = []
+        for resi_index, exp_dict in enumerate(exp.values()):
+            for key, value in exp_dict.items():
+                # {'1H5P': {'name': 'beta_1', 'value': None, 'operator': None, 'error': None}}
+                if value['operator'] in ['>', '<', '>=', '<=', '~'] or value['value'] is None:
+                    # Don't use uncertain data
+                    pass
+                else:
+                    exp_value = value['value']
+                    exp_error = value['error']
+                    if exp_error is None:
+                        exp_error = 0.5   # TODO: Check experimental error
+
+                    resi_index = int(resi_index)
+                    pred_value = list(pred.values())[resi_index][key]['avg']
+                    pred_error = list(pred.values())[resi_index][key]['std']   # standard deviation
+
+                    # TODO: change to debug
+                    _logger.info(f'Exp ({resi_index}-{key}): {exp}')
+                    _logger.info(f'Pred ({resi_index}-{key}): {pred}')
+
+                    # Compute loss
+                    numerator = (pred_value - exp_value) ** 2
+                    denominator = (exp_error ** 2 + pred_error ** 2)
+                    loss.append(numerator / denominator)
 
+        # Compute loss
         import torch
-        loss = torch.tensor(0.0)
+        loss_avg = torch.mean(torch.tensor(loss))
+        _logger.info(f'Computed sampler loss: {loss_avg.item()}')
 
-        return loss
\ No newline at end of file
+        return loss_avg
\ No newline at end of file
From 095c87353be204d6c7d646ea839ec90a0c93aeb0 Mon Sep 17 00:00:00 2001
From: kt
Date: Mon, 4 Mar 2024 16:48:03 -0500
Subject: [PATCH 35/59] refactor reweight.py

---
 espfit/app/analysis.py           |  38 ++++--
 espfit/app/sampler.py            |  41 +++++-
 espfit/app/train.py              | 212 ++++++++++++++++---------------
 espfit/data/config/config.toml   |   2 -
 espfit/utils/espaloma/module.py  |   1 +
 espfit/utils/sampler/reweight.py | 110 ++++++++++------
 6 files changed, 241 insertions(+), 163 deletions(-)

diff --git a/espfit/app/analysis.py b/espfit/app/analysis.py
index 5234e53..1b6d97d 100644
--- a/espfit/app/analysis.py
+++ b/espfit/app/analysis.py
@@ -26,11 +26,14 @@ class BaseDataLoader(object):
     load_traj(reference_pdb='solvated.pdb', trajectory_netcdf='traj.nc', atom_indices=None, stride=1):
         Load MD trajectory.
     """
-    def __init__(self, input_directory_path=None, output_directory_path=None):
+    def __init__(self, atomSubset='solute', input_directory_path=None, output_directory_path=None):
         """Initialize base data loader object.
 
         Parameters
         ----------
+        atomSubset : str, default='solute'
+            Subset of atoms to save. Default is 'solute'. Other options 'all' and 'not water'.
+
         input_directory_path : str, optional
             Input directory path. Default is None.
             If None, the current working directory will be used.
@@ -39,6 +42,10 @@ def __init__(self, input_directory_path=None, output_directory_path=None):
             Output directory path. Default is None.
             If None, the current working directory will be used.
         """
+        self.atomSubset = atomSubset
+        if self.atomSubset not in ['solute', 'all', 'not water']:
+            raise ValueError(f"Invalid atomSubset: {self.atomSubset}. Expected 'solute', 'all', or 'not water'.")
+
         if input_directory_path is None:
             input_directory_path = os.getcwd()
         if output_directory_path is None:
@@ -62,7 +69,7 @@ def output_directory_path(self, value):
 
 
     # Should this be a classmethod?
-    def load_traj(self, reference_pdb='solvated.pdb', trajectory_netcdf='traj.nc', atom_indices=None, stride=1, input_directory_path=None):
+    def load_traj(self, reference_pdb='solvated.pdb', trajectory_netcdf='traj.nc', stride=1, input_directory_path=None):
         """Load MD trajectory.
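 
         Loads the reference PDB and the NetCDF trajectory with mdtraj and, unless
         atomSubset is 'all', slices both down to the atoms selected by atomSubset.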
 
         Parameters
@@ -73,10 +80,6 @@ def load_traj(self, reference_pdb='solvated.pdb', trajectory_netcdf='traj.nc', a
         trajectory_netcdf : str, optional
             Trajectory netcdf file name. Default is 'traj.nc'.
 
-        atom_indices : list, optional
-            List of atom indices to load from trajectory. Default is None.
-            If None, all atoms will be loaded.
-
         stride : int, optional
             Stride to load the trajectory. Default is 1.
 
@@ -92,22 +95,31 @@ def load_traj(self, reference_pdb='solvated.pdb', trajectory_netcdf='traj.nc', a
         # Load reference pdb (solvated system)
         pdb = os.path.join(self.input_directory_path, reference_pdb)
         ref_traj = mdtraj.load(pdb)
+
         # Select atoms to load from trajectory
-        if atom_indices is None:
+        if self.atomSubset == 'all':
+            self.atom_indices = None
+            self.ref_traj = ref_traj
+        else:
             self.atom_indices = []
             mdtop = ref_traj.topology
-            res = [ r for r in mdtop.residues if r.name not in ('HOH', 'NA', 'CL', 'K') ]
+            if self.atomSubset == 'solute':
+                res = [ r for r in mdtop.residues if r.name not in ('HOH', 'NA', 'CL', 'K') ]
+            elif self.atomSubset == 'not water':
+                res = [ r for r in mdtop.residues if r.name not in ('HOH',) ]
+
+            # Get atom indices
             for r in res:
                 for a in r.atoms:
                     self.atom_indices.append(a.index)
-        else:
-            self.atom_indices = atom_indices
-        self.ref_traj = ref_traj.atom_slice(self.atom_indices)
-
+            self.ref_traj = ref_traj.atom_slice(self.atom_indices)
+
         # Load trajectory
         netcdf = os.path.join(self.input_directory_path, trajectory_netcdf)
         traj = mdtraj.load(netcdf, top=self.ref_traj.topology, stride=stride)
-        self.traj = traj.atom_slice(self.atom_indices)
+        if self.atom_indices:
+            self.traj = traj.atom_slice(self.atom_indices)
+        else:
+            self.traj = traj
 
 
 class RNASystem(BaseDataLoader):
diff --git a/espfit/app/sampler.py b/espfit/app/sampler.py
index c75a82c..4d431d3 100644
--- a/espfit/app/sampler.py
+++ b/espfit/app/sampler.py
@@ -38,7 +38,7 @@ class BaseSimulation(object):
     export_xml(exportSystem=True, exportState=True, exportIntegrator=True, output_directory_path=None):
         Export serialized system XML file and solvated pdb file.
     """
-    def __init__(self, maxIterations=100, nsteps=250000, atom_indices=None,
+    def __init__(self, maxIterations=100, nsteps=250000, atomSubset='solute',
                  checkpoint_frequency=25000, logging_frequency=250000, netcdf_frequency=250000,
                  output_directory_path=None, input_directory_path=None):
         """Initialize base simulation object.
@@ -51,9 +51,9 @@ def __init__(self, maxIterations=100, nsteps=250000, atom_indices=None,
         nsteps : int, default=250000 (10 ns using 4 fs timestep)
             Number of steps to run the simulation.
 
-        atom_indices : list, default=None
-            List of atom indices to save. If None, save all atoms except water and ions.
-
+        atomSubset : str, default='solute'
+            Subset of atoms to save. Default is 'solute'. Other options 'all' and 'not water'.
+
         checkpoint_frequency : int, default=25000 (1 ns)
             Frequency (in steps) at which to write checkpoint files.
 
@@ -73,11 +73,14 @@ def __init__(self, maxIterations=100, nsteps=250000, atom_indices=None,
         """
         self.maxIterations = maxIterations
         self.nsteps = nsteps
-        self.atom_indices = atom_indices
+        self.atomSubset = atomSubset
        self.checkpoint_frequency = checkpoint_frequency
         self.logging_frequency = logging_frequency
         self.netcdf_frequency = netcdf_frequency
 
+        if self.atomSubset not in ['solute', 'all', 'not water']:
+            raise ValueError(f"Invalid atomSubset: {self.atomSubset}. Expected 'solute', 'all', or 'not water'.")
+
         if output_directory_path is None:
             output_directory_path = os.getcwd()   # Is this right?
         if input_directory_path is None:
@@ -163,10 +166,34 @@ def run(self, output_directory_path=None):
 
         # Select atoms to save
         import mdtraj
-        if self.atom_indices is None:
+        #if self.atomSubset == 'solute':
+        #    self.atom_indices = []
+        #    mdtop = mdtraj.Topology.from_openmm(self.simulation.topology)
+        #    res = [ r for r in mdtop.residues if r.name not in ('HOH', 'NA', 'CL', 'K') ]
+        #    for r in res:
+        #        for a in r.atoms:
+        #            self.atom_indices.append(a.index)
+        #elif self.atomSubset == 'all':
+        #    self.atom_indices = None
+        #elif self.atomSubset == 'not water':
+        #    self.atom_indices = []
+        #    mdtop = mdtraj.Topology.from_openmm(self.simulation.topology)
+        #    res = [ r for r in mdtop.residues if r.name not in ('HOH') ]
+        #    for r in res:
+        #        for a in r.atoms:
+        #            self.atom_indices.append(a.index)
+        #else:
+        #    raise ValueError(f"Invalid atomSubset: {self.atomSubset}. Expected 'solute', 'all', or 'not water'.")
+
+        if self.atomSubset == 'all':
+            self.atom_indices = None
+        else:
             self.atom_indices = []
             mdtop = mdtraj.Topology.from_openmm(self.simulation.topology)
-            res = [ r for r in mdtop.residues if r.name not in ('HOH', 'NA', 'CL', 'K') ]
+            if self.atomSubset == 'solute':
+                res = [ r for r in mdtop.residues if r.name not in ('HOH', 'NA', 'CL', 'K') ]
+            elif self.atomSubset == 'not water':
+                res = [ r for r in mdtop.residues if r.name not in ('HOH',) ]
             for r in res:
                 for a in r.atoms:
                     self.atom_indices.append(a.index)
diff --git a/espfit/app/train.py b/espfit/app/train.py
index d0053b6..3b4d007 100644
--- a/espfit/app/train.py
+++ b/espfit/app/train.py
@@ -328,42 +328,21 @@ def output_directory_path(self, value):
         os.makedirs(value, exist_ok=True)
 
 
-    def _load_checkpoint(self):
-        """Load the last checkpoint and restart the training process.
+    def report_loss(self, loss_trajectory):
+        """Report loss.
 
-        This method finds all the checkpoint files in the output directory, loads the
-        last checkpoint (e.g. net100.pt), and restarts the training process from the next step.
-        If no checkpoint files are found, the training process starts from the first step.
+        Parameters
+        ----------
+        loss_trajectory : dict
+            The loss trajectory that stores individual weighted losses for each epoch.
 
         Returns
         -------
-        int
-            The step from which the training process should be restarted.
+        None
         """
-        import sys
-        import glob
-
-        checkpoints = glob.glob("{}/*.pt".format(self.output_directory_path))
-
-        if checkpoints:
-            n = [ int(c.split('net')[1].split('.')[0]) for c in checkpoints ]
-            n.sort()
-            restart_epoch = n[-1]
-            restart_checkpoint = os.path.join(self.output_directory_path, f"net{restart_epoch}.pt")
-            self.net.load_state_dict(torch.load(restart_checkpoint))
-            logging.info(f'Restarting from ({restart_checkpoint}).')
-        else:
-            restart_epoch = 0
-
-        if restart_epoch >= self.epochs:
-            _logger.info(f'Already trained for {self.epochs} epochs.')
-            sys.exit(0)
-        elif restart_epoch > 0:
-            _logger.info(f'Training for additional {self.epochs-restart_epoch} epochs.')
-        else:
-            _logger.info(f'Training from scratch for {self.epochs} epochs.')
-
-        return restart_epoch
+        import pandas as pd
+        df = pd.DataFrame.from_dict(loss_trajectory, orient='index')
+        df.to_csv(os.path.join(self.output_directory_path, 'reporter.log'), sep='\t', float_format='%.4f')
 
 
     def train(self):
@@ -408,11 +387,10 @@ def train(self):
         # Note: returned loss is a joint loss of different units.
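         # The sqrt and Hartree->kcal/mol scaling below give a kcal/mol-sized
         # number for logging only; it is heuristic because the joint loss sums
         # terms of different units.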
_loss = HARTREE_TO_KCALPERMOL * loss.pow(0.5).item() _logger.info(f'epoch {epoch}: {_loss:.3f}') - checkpoint_file = os.path.join(self.output_directory_path, f"net{epoch}.pt") - torch.save(self.net.state_dict(), checkpoint_file) + self._save_checkpoint(epoch) - def train_sampler(self, sampler_patience=800, neff_threshold=0.2, debug=False): + def train_sampler(self, sampler_patience=800, neff_threshold=0.2, sampler_weight=1.0, debug=False): """ Train the Espaloma network model with sampler. @@ -424,8 +402,7 @@ def train_sampler(self, sampler_patience=800, neff_threshold=0.2, debug=False): """ from espfit.utils.units import HARTREE_TO_KCALPERMOL - from espfit.utils.sampler.reweight import SamplerReweight - + from espfit.utils.sampler.reweight import SetupSamplerReweight # Note: RuntimeError will be raised if copy.deepcopy is used. # RuntimeError: one of the variables needed for gradient computation has been modified by an inplace @@ -433,7 +410,7 @@ def train_sampler(self, sampler_patience=800, neff_threshold=0.2, debug=False): # expected version 1 instead. Hint: the backtrace further above shows the operation that failed to # compute its gradient. The variable in question was changed in there or anywhere later. Good luck! import copy - net_local = copy.deepcopy(self.net) + net_copy = copy.deepcopy(self.net) self.sampler_patience = sampler_patience self.neff_threshold = neff_threshold @@ -444,10 +421,8 @@ def train_sampler(self, sampler_patience=800, neff_threshold=0.2, debug=False): # Load checkpoint self.restart_epoch = self._load_checkpoint() - # Initialize neff to -1 to trigger the first sampling - neff = -1 - # Train + neff = -1 loss_trajectory = {} ds_tr_loader = self.dataset_train.view(collate_fn='graph', batch_size=self.batch_size, shuffle=True) optimizer = torch.optim.Adam(self.net.parameters(), lr=self.learning_rate) @@ -471,52 +446,37 @@ def train_sampler(self, sampler_patience=800, neff_threshold=0.2, debug=False): # Append loss loss += _loss - # Run sampling + # Include sampler loss after certain epochs if epoch > self.sampler_patience: - # Compute effective sample size - #neff = xxx - - if neff < self.neff_threshold: - # Get Effective sample size - if neff < 0: - _logger.info(f'Reached sampler patience {self.sampler_patience}. Run sampler for the first time.') - else: - _logger.info(f'Effective sample size ({neff}) below threshold ({self.neff_threshold}).') - - # Save espaloma model - self._save_local_model(epoch) - local_model = os.path.join(self.output_directory_path, f"checkpoint{epoch}.pt") - self.save_model(net=net_local, best_model=local_model, model_name=f"net{epoch}.pt", output_directory_path=self.output_directory_path) - - # Define sampler settings to force the use of local espaloma model - args = [epoch] - if debug == True: - # DEBUG PURPOSE - from importlib.resources import files - small_molecule_forcefield = str(files('espfit').joinpath("data/forcefield/espaloma-0.3.2.pt")) - else: - small_molecule_forcefield = os.path.join(self.output_directory_path, f"net{epoch}.pt") - - override_sampler_kwargs = { - "small_molecule_forcefield": small_molecule_forcefield, - "output_directory_path": self.output_directory_path - } - - # Create sampler system from configuration file. Returns list of systems. 
- samplers = SamplerReweight.from_toml(self.configfile, *args, **override_sampler_kwargs) - for sampler in samplers: - _logger.info(f'Running simulation for {sampler.target_name} for {sampler.nsteps} steps...') - sampler.minimize() - sampler.run() - - # Compute MD loss + # Run sampling for the first time + if neff == -1: + _logger.info(f'Reached sampler patience epoch={self.sampler_patience}. Run sampler for the first time.') + # Initialize + SamplerReweight = SetupSamplerReweight() + # Create new sampler system using local espaloma model + samplers = self._setup_local_samplers(epoch, net_copy, debug) + SamplerReweight.update(samplers) + SamplerReweight.run() + else: + # If effective sample size is below threshold, re-run sampler + neff = SamplerReweight.get_effective_sample_size() + if neff < self.neff_threshold: + _logger.info(f'Effective sample size ({neff}) below threshold ({self.neff_threshold}).') + samplers = self._setup_local_samplers(epoch, net_copy, debug) + SamplerReweight.update(samplers) + SamplerReweight.run() + + # Compute sampler loss _logger.info(f'Compute sampler loss.') - for sampler_index, sampler in enumerate(samplers): - loss_sampler = sampler.compute_loss() * sampler.weight - loss += loss_sampler - loss_dict[f'sampler{sampler_index}'] = loss_sampler.item() - - # Append individual loss to loss_trajectory + loss_list = SamplerReweight.compute_loss() # list of torch.tensor + for sampler_index, _loss in enumerate(loss_list): + #loss_dict[f'sampler{sampler_index}'] = _loss.item() + _sampler = SamplerReweight.samplers[sampler_index] + loss_dict[f'{_sampler.target_name}'] = _loss.item() + loss += _loss * sampler_weight + + # Append total and individual loss to loss_trajectory + loss_dict['loss'] = loss.item() loss_trajectory[epoch] = loss_dict # Update weights @@ -527,14 +487,53 @@ def train_sampler(self, sampler_patience=800, neff_threshold=0.2, debug=False): # Note: returned loss is a joint loss of different units. _loss = HARTREE_TO_KCALPERMOL * loss.pow(0.5).item() _logger.info(f'epoch {epoch}: {_loss:.3f}') - self._save_local_model(epoch) + self._save_checkpoint(epoch) # Export loss trajectory + # TODO: Report losses at every epoch _logger.info(f'Export loss trajectory to a file.') self.report_loss(loss_trajectory) - def _save_local_model(self, epoch): + def _load_checkpoint(self): + """Load the last checkpoint and restart the training process. + + This method finds all the checkpoint files in the output directory, loads the + last checkpoint (e.g. net100.pt), and restarts the training process from the next step. + If no checkpoint files are found, the training process starts from the first step. + + Returns + ------- + int + The step from which the training process should be restarted. 
+ """ + import sys + import glob + + checkpoints = glob.glob("{}/*.pt".format(self.output_directory_path)) + + if checkpoints: + n = [ int(c.split('net')[1].split('.')[0]) for c in checkpoints ] + n.sort() + restart_epoch = n[-1] + restart_checkpoint = os.path.join(self.output_directory_path, f"net{restart_epoch}.pt") + self.net.load_state_dict(torch.load(restart_checkpoint)) + logging.info(f'Restarting from ({restart_checkpoint}).') + else: + restart_epoch = 0 + + if restart_epoch >= self.epochs: + _logger.info(f'Already trained for {self.epochs} epochs.') + sys.exit(0) + elif restart_epoch > 0: + _logger.info(f'Training for additional {self.epochs-restart_epoch} epochs.') + else: + _logger.info(f'Training from scratch for {self.epochs} epochs.') + + return restart_epoch + + + def _save_checkpoint(self, epoch): """Save local model. Parameters @@ -549,19 +548,32 @@ def _save_local_model(self, epoch): checkpoint_file = os.path.join(self.output_directory_path, f"checkpoint{epoch}.pt") torch.save(self.net.state_dict(), checkpoint_file) - - def report_loss(self, loss_trajecotry): - """Report loss. - Parameters - ---------- - loss : dict - The loss trajectory that stores individual weighted losses for each epoch. + def _setup_local_samplers(self, epoch, net_copy, debug): + from espfit.app.sampler import SetupSampler - Returns - ------- - None - """ - import pandas as pd - df = pd.DataFrame.from_dict(loss_trajecotry, orient='index') - df.to_csv(os.path.join(self.output_directory_path, 'report.log'), sep='\t', float_format='%.4f') + # Save espaloma checkpoint models + self._save_checkpoint(epoch) + # Save checkpoint as temporary espaloma model (force field) + local_model = os.path.join(self.output_directory_path, f"checkpoint{epoch}.pt") + self.save_model(net=net_copy, best_model=local_model, model_name=f"net{epoch}.pt", output_directory_path=self.output_directory_path) + + # Define sampler settings with override arguments + args = [epoch] + if debug == True: + from importlib.resources import files + small_molecule_forcefield = str(files('espfit').joinpath("data/forcefield/espaloma-0.3.2.pt")) + else: + small_molecule_forcefield = os.path.join(self.output_directory_path, f"net{epoch}.pt") + + override_sampler_kwargs = { + "atomSubset": 'all', + "small_molecule_forcefield": small_molecule_forcefield, + "output_directory_path": self.output_directory_path + } + + # Create sampler system from configuration file. Returns list of systems. 
+ samplers = SetupSampler.from_toml(self.configfile, *args, **override_sampler_kwargs) + + return samplers + \ No newline at end of file diff --git a/espfit/data/config/config.toml b/espfit/data/config/config.toml index 31ec82c..a3f9138 100644 --- a/espfit/data/config/config.toml +++ b/espfit/data/config/config.toml @@ -38,7 +38,6 @@ nsteps = 1000 checkpoint_frequency = 10 logging_frequency = 1 netcdf_frequency = 10 -weight = 1 # system setup parameters [[sampler.setup]] @@ -53,4 +52,3 @@ nsteps = 1000 checkpoint_frequency = 10 logging_frequency = 1 netcdf_frequency = 10 -weight = 1 \ No newline at end of file diff --git a/espfit/utils/espaloma/module.py b/espfit/utils/espaloma/module.py index 1e5ea18..6f3fb34 100644 --- a/espfit/utils/espaloma/module.py +++ b/espfit/utils/espaloma/module.py @@ -186,6 +186,7 @@ def forward(self, g): loss = loss_energy + loss_force + loss_charge + loss_torsion + loss_improper loss_dict = { + 'loss': None, 'energy': loss_energy.item(), 'force': loss_force.item(), 'charge': loss_charge.item(), diff --git a/espfit/utils/sampler/reweight.py b/espfit/utils/sampler/reweight.py index 672002d..917ef8d 100644 --- a/espfit/utils/sampler/reweight.py +++ b/espfit/utils/sampler/reweight.py @@ -5,52 +5,58 @@ _logger = logging.getLogger(__name__) -class SetupSamplerReweight(SetupSampler, BaseSimulation): +class SetupSamplerReweight(object): - def __init__(self, weight=1, **kwargs): - super().__init__(**kwargs) - self.weight = weight + def __init__(self): + self.samplers = None + self.samplers_old = None + self.weights = None # list - - def _get_experiment_data(self, target_class, target_name): - import yaml - from importlib.resources import files - yaml_file = str(files('espfit').joinpath(f'data/target/{target_class}/{target_name}/experiment.yml')) - with open(yaml_file, 'r') as f: - d = yaml.safe_load(f) + def run(self): + for sampler in self.samplers: + _logger.info(f'Running simulation for {sampler.target_name} for {sampler.nsteps} steps...') + sampler.minimize() + sampler.run() - # {'resi_1': {'1H5P': {'name': 'beta_1', 'value': None, 'operator': None, 'error': None}}} - return d['experiment_1']['measurement'] + + def update(self, samplers): + # Update sampler + self.samplers_old = self.samplers + self.samplers = samplers def get_effective_sample_size(self): # Compute effective sample size - neff = 0.5 - return neff + # U(x0, theta0) + old_potential_energy = 0 - def _compute_observable(self): - if self.target_class == 'nucleoside': - from espfit.app.analysis import RNASystem - target = RNASystem() - target.load_traj(input_directory_path=self.output_directory_path) - pred = target.compute_jcouplings() - #_logger.debug(f'Computed observable: {pred}') - else: - raise NotImplementedError(f'Observable for {self.target_class} is not implemented.') + # U(x0, theta1) + reduced_potential_energy = 0 - import yaml - with open(os.path.join(self.output_directory_path, 'pred.yaml'), 'w') as f: - yaml.dump(pred, f, allow_unicode=True) + neff = 0.5 + return neff + + + def compute_loss(self): - return pred + loss_list = [] + for sampler in self.samplers: + loss = self._compute_loss_per_system(sampler) # torch.tensor + loss_list.append(loss) + # list of torch.tensor + return loss_list + + + def _compute_loss_per_system(self, sampler): + + import torch - def compute_loss(self): # Compute experimental observable - exp = self._get_experiment_data(self.target_class, self.target_name) - pred = self._compute_observable() + exp = self._get_experiment_data(sampler.target_class, 
sampler.target_name)
+        pred = self._compute_observable(sampler.atomSubset, sampler.target_class, sampler.output_directory_path)
 
         loss = []
         for resi_index, exp_dict in enumerate(exp.values()):
             for key, value in exp_dict.items():
@@ -64,24 +70,46 @@ def compute_loss(self):
                 exp_error = value['error']
                 if exp_error is None:
                     exp_error = 0.5   # TODO: Check experimental error
-
-                resi_index = int(resi_index)
                 pred_value = list(pred.values())[resi_index][key]['avg']
                 pred_error = list(pred.values())[resi_index][key]['std']   # standard deviation
-
-                # TODO: change to debug
-                _logger.info(f'Exp ({resi_index}-{key}): {exp}')
-                _logger.info(f'Pred ({resi_index}-{key}): {pred}')
-
+                _logger.debug(f'Exp ({resi_index}-{key}): {exp}')
+                _logger.debug(f'Pred ({resi_index}-{key}): {pred}')
                 # Compute loss
                 numerator = (pred_value - exp_value) ** 2
                 denominator = (exp_error ** 2 + pred_error ** 2)
                 loss.append(numerator / denominator)
-
-        # Compute loss
-        import torch
         loss_avg = torch.mean(torch.tensor(loss))
         _logger.info(f'Computed sampler loss: {loss_avg.item()}')
-
-        return loss_avg
\ No newline at end of file
+        return loss_avg
+
+
+    def _get_experiment_data(self, target_class, target_name):
+        import yaml
+        from importlib.resources import files
+
+        yaml_file = str(files('espfit').joinpath(f'data/target/{target_class}/{target_name}/experiment.yml'))
+        with open(yaml_file, 'r', encoding='utf8') as f:
+            d = yaml.safe_load(f)
+
+        # {'resi_1': {'1H5P': {'name': 'beta_1', 'value': None, 'operator': None, 'error': None}}}
+        return d['experiment_1']['measurement']
+
+
+    def _compute_observable(self, atomSubset, target_class, output_directory_path):
+        if target_class == 'nucleoside':
+            from espfit.app.analysis import RNASystem
+            target = RNASystem(atomSubset=atomSubset)
+            target.load_traj(input_directory_path=output_directory_path)
+            pred = target.compute_jcouplings()
+            _logger.debug(f'Computed observable: {pred}')
+        else:
+            raise NotImplementedError(f'Observable for {target_class} is not implemented.')
+
+        import yaml
+        with open(os.path.join(output_directory_path, 'pred.yaml'), 'w') as f:
+            yaml.dump(pred, f, allow_unicode=True)
+
+        return pred
 
From 09565a9a6203ec1ada510859c42f3f19b68dd4e4 Mon Sep 17 00:00:00 2001
From: kt
Date: Mon, 4 Mar 2024 21:01:21 -0500
Subject: [PATCH 36/59] fix SetupSampler.from_toml when args is not defined

---
 espfit/app/train.py | 52 ++++++++++++++++++++++-----------------------
 1 file changed, 26 insertions(+), 26 deletions(-)

diff --git a/espfit/app/train.py b/espfit/app/train.py
index 3b4d007..49aeea8 100644
--- a/espfit/app/train.py
+++ b/espfit/app/train.py
@@ -421,8 +421,10 @@ def train_sampler(self, sampler_patience=800, neff_threshold=0.2, sampler_weight
         # Load checkpoint
         self.restart_epoch = self._load_checkpoint()
 
+        # Initialize
+        SamplerReweight = SetupSamplerReweight()
+
         # Train
-        neff = -1
         loss_trajectory = {}
         ds_tr_loader = self.dataset_train.view(collate_fn='graph', batch_size=self.batch_size, shuffle=True)
         optimizer = torch.optim.Adam(self.net.parameters(), lr=self.learning_rate)
@@ -448,29 +450,23 @@ def train_sampler(self, sampler_patience=800, neff_threshold=0.2, sampler_weight
 
             # Include sampler loss after certain epochs
             if epoch > self.sampler_patience:
-                # Run sampling for the first time
-                if neff == -1:
-                    _logger.info(f'Reached sampler patience epoch={self.sampler_patience}. Run sampler for the first time.')
-                    # Initialize
-                    SamplerReweight = SetupSamplerReweight()
-                    # Create new sampler system using local espaloma model
-                    samplers = self._setup_local_samplers(epoch, net_copy, debug)
-                    SamplerReweight.update(samplers)
-                    SamplerReweight.run()
-                else:
-                    # If effective sample size is below threshold, re-run sampler
-                    neff = SamplerReweight.get_effective_sample_size()
-                    if neff < self.neff_threshold:
-                        _logger.info(f'Effective sample size ({neff}) below threshold ({self.neff_threshold}).')
-                        samplers = self._setup_local_samplers(epoch, net_copy, debug)
-                        SamplerReweight.update(samplers)
-                        SamplerReweight.run()
+                # Save checkpoint as local model (force field)
+                _samplers = self._setup_local_samplers(epoch, net_copy, debug)
+                neff = SamplerReweight.get_effective_sample_size(temporary_samplers=_samplers)   # returns -1 if SamplerReweight.samplers is None
+
+                # If effective sample size is below threshold, update SamplerReweight.samplers and re-run simulation
+                if neff < self.neff_threshold:
+                    _logger.info(f'Effective sample size ({neff}) below threshold ({self.neff_threshold}).')
+                    SamplerReweight.samplers = _samplers
+                    SamplerReweight.run()
+
+                # Delete temporary_samplers
+                del _samplers
 
                 # Compute sampler loss
                 _logger.info(f'Compute sampler loss.')
                 loss_list = SamplerReweight.compute_loss()   # list of torch.tensor
                 for sampler_index, _loss in enumerate(loss_list):
-                    #loss_dict[f'sampler{sampler_index}'] = _loss.item()
                     _sampler = SamplerReweight.samplers[sampler_index]
                     loss_dict[f'{_sampler.target_name}'] = _loss.item()
                     loss += _loss * sampler_weight
@@ -549,27 +545,31 @@ def _save_checkpoint(self, epoch):
         torch.save(self.net.state_dict(), checkpoint_file)
 
 
-    def _setup_local_samplers(self, epoch, net_copy, debug):
-        from espfit.app.sampler import SetupSampler
-
-        # Save espaloma checkpoint models
-        self._save_checkpoint(epoch)
+    def _save_local_model(self, epoch, net_copy):
         # Save checkpoint as temporary espaloma model (force field)
+        _logger.info(f'Save checkpoint{epoch}.pt as temporary espaloma model (force field).')
+        self._save_checkpoint(epoch)
         local_model = os.path.join(self.output_directory_path, f"checkpoint{epoch}.pt")
-        self.save_model(net=net_copy, best_model=local_model, model_name=f"net{epoch}.pt", output_directory_path=self.output_directory_path)
+        self.save_model(net=net_copy, best_model=local_model, model_name=f"net.pt", output_directory_path=self.output_directory_path)
+
+
+    def _setup_local_samplers(self, epoch, net_copy, debug):
         from espfit.app.sampler import SetupSampler
+        self._save_local_model(epoch, net_copy)
+
         # Define sampler settings with override arguments
         args = [epoch]
         if debug == True:
             from importlib.resources import files
             small_molecule_forcefield = str(files('espfit').joinpath("data/forcefield/espaloma-0.3.2.pt"))
         else:
-            small_molecule_forcefield = os.path.join(self.output_directory_path, f"net{epoch}.pt")
+            small_molecule_forcefield = os.path.join(self.output_directory_path, f"net.pt")
 
         override_sampler_kwargs = {
             "atomSubset": 'all',
             "small_molecule_forcefield": small_molecule_forcefield,
             "output_directory_path": self.output_directory_path
         }
 
         # Create sampler system from configuration file. Returns list of systems.
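The next patch wires up per-frame potential-energy differences but leaves the effective
sample size hard-coded at 0.5. For reference, a minimal sketch of the importance-weight
(Kish) estimate that such energy differences support; the helper name, the 300 K default,
and the assumption that each entry holds U_old - U_new in kcal/mol are illustrative and
not part of the patch itself:

    import numpy as np

    def effective_sample_size(potential_energy_diff, temperature=300.0):
        # Importance weights w_i ~ exp((U_old - U_new) / kBT), computed in log
        # space for numerical stability, then normalized to sum to one.
        kBT = 0.0019872041 * temperature   # Boltzmann constant in kcal/mol/K
        log_w = np.asarray(potential_energy_diff, dtype=float) / kBT
        log_w -= log_w.max()
        w = np.exp(log_w)
        w /= w.sum()
        # Kish effective sample size, reported as a fraction of the frame count
        # so it is comparable to neff_threshold (1.0 when all weights are equal).
        return 1.0 / (len(w) * np.sum(w ** 2))
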
From f70140f039a1d78eeb6716eaa7b6d5e70270d93a Mon Sep 17 00:00:00 2001 From: kt Date: Mon, 4 Mar 2024 21:02:12 -0500 Subject: [PATCH 37/59] add pseudo code for get_effective_sampler_size --- espfit/app/sampler.py | 4 ++-- espfit/utils/sampler/reweight.py | 33 ++++++++++++++++++++------------ 2 files changed, 23 insertions(+), 14 deletions(-) diff --git a/espfit/app/sampler.py b/espfit/app/sampler.py index 4d431d3..6ec6d5e 100644 --- a/espfit/app/sampler.py +++ b/espfit/app/sampler.py @@ -496,8 +496,8 @@ def from_toml(cls, filename, *args, **override_sampler_kwargs): else: raise ValueError(f"Invalid keyword argument: {key}") - # Update output directory path if epoch is given - if args is not None: + # Update output directory path if args (epoch) is given + if args: if len(args) == 1 and isinstance(args[0], int): sampler.output_directory_path = os.path.join(sampler.output_directory_path, sampler.target_name, f'{args[0]}') else: diff --git a/espfit/utils/sampler/reweight.py b/espfit/utils/sampler/reweight.py index 917ef8d..3142932 100644 --- a/espfit/utils/sampler/reweight.py +++ b/espfit/utils/sampler/reweight.py @@ -9,7 +9,6 @@ class SetupSamplerReweight(object): def __init__(self): self.samplers = None - self.samplers_old = None self.weights = None # list @@ -20,20 +19,30 @@ def run(self): sampler.run() - def update(self, samplers): - # Update sampler - self.samplers_old = self.samplers - self.samplers = samplers - + def get_effective_sample_size(self, temporary_samplers): + + # Check if sampler is None + if self.samplers is None: + return -1 - def get_effective_sample_size(self): # Compute effective sample size + import mdtraj + from openmm.unit import kilocalories_per_mole as kcalpermol + potential_energy_diff = [] + for sampler, temporary_sampler in zip(self.samplers, temporary_samplers): + traj = mdtraj.load(sampler.output_directory_path + '/traj.nc', top=sampler.output_directory_path + '/solvated.pdb') + for i in range(traj.n_frames): + # U(x0, theta0) + sampler.simulation.context.setPositions(traj.openmm_positions(i)) + potential_energy = sampler.simulation.context.getState(getEnergy=True).getPotentialEnergy() + # U(x0, theta1) + temporary_sampler.simulation.context.setPositions(traj.openmm_positions(i)) + reduced_potential_energy = temporary_sampler.simulation.context.getState(getEnergy=True).getPotentialEnergy() + + delta = (potential_energy - reduced_potential_energy).value_in_unit(kcalpermol) + potential_energy_diff.append(delta) - # U(x0, theta0) - old_potential_energy = 0 - - # U(x0, theta1) - reduced_potential_energy = 0 + _logger.info(f'{potential_energy._value}, {reduced_potential_energy._value}, {delta}') neff = 0.5 return neff From 52d32b5cc6cb0463bbbf68b40df04a8864d3c385 Mon Sep 17 00:00:00 2001 From: kt Date: Tue, 5 Mar 2024 18:18:30 -0500 Subject: [PATCH 38/59] clean up --- README.md | 8 +- espfit/app/analysis.py | 62 ++++++----- espfit/app/sampler.py | 76 ++++--------- espfit/app/train.py | 145 ++++++++++++++++--------- espfit/utils/espaloma/module.py | 2 + espfit/utils/graphs.py | 2 - espfit/utils/sampler/reweight.py | 181 ++++++++++++++++++++++++++----- espfit/utils/units.py | 7 +- 8 files changed, 311 insertions(+), 172 deletions(-) diff --git a/README.md b/README.md index b012b38..d8b18a2 100644 --- a/README.md +++ b/README.md @@ -37,12 +37,12 @@ ds.compute_relative_energy() # Create esplama model from espfit.app.train import EspalomaModel filename = 'espfit/data/config/config.toml' -model = EspalomaModel.from_toml(filename) +# Override training settings in 
config.toml +kwargs = {'output_directory_path': 'checkpoints', 'epochs': 100} +model = EspalomaModel.from_toml(filename, **kwargs) model.dataset_train = ds -# Change default training settings -model.epochs = 100 # Set sampler settings -model.train_sampler(sampler_patience=800, neff_threshold=0.2) +model.train_sampler(sampler_patience=800, neff_threshold=0.2, sampler_weight=1) ``` ### Standalone Usage diff --git a/espfit/app/analysis.py b/espfit/app/analysis.py index 1b6d97d..5235935 100644 --- a/espfit/app/analysis.py +++ b/espfit/app/analysis.py @@ -1,11 +1,5 @@ """ Compute experimental observables from MD simulations. - -Notes ------ - -TODO ----- """ import os import numpy as np @@ -17,10 +11,6 @@ class BaseDataLoader(object): """Base class for data loader. - TODO - ---- - * Add more methods to check trajectory information (e.g. number of frames, number of atoms, etc.) - Methods ------- load_traj(reference_pdb='solvated.pdb', trajectory_netcdf='traj.nc', atom_indices=None, stride=1): @@ -68,7 +58,6 @@ def output_directory_path(self, value): os.makedirs(value, exist_ok=True) - # Should this be a classmethod? def load_traj(self, reference_pdb='solvated.pdb', trajectory_netcdf='traj.nc', stride=1, input_directory_path=None): """Load MD trajectory. @@ -86,6 +75,10 @@ def load_traj(self, reference_pdb='solvated.pdb', trajectory_netcdf='traj.nc', s input_directory_path : str, optional Input directory path. Default is None. If None, the current working directory will be used. + + Returns + ------- + None """ import mdtraj @@ -182,15 +175,14 @@ def radian_to_degree(self, a): return a - def compute_jcouplings(self, couplings=None, residues=None): + def compute_jcouplings(self, weights=None, couplings=None, residues=None): """Compute J-couplings from MD trajectory. - TODO - ---- - * Compute confidence interval. - Parameters ---------- + weights : numpy.ndarray, optional + Weights to compute the J-couplings. Default is None. + couplings : str, optional Name of the couplings to compute. Default is None. 
If a list of couplings to be chosen from [H1H2,H2H3,H3H4,1H5P,2H5P,C4Pb,1H5H4,2H5H4,H3P,C4Pe,H1C2/4,H1C6/8] @@ -216,7 +208,7 @@ def compute_jcouplings(self, couplings=None, residues=None): """ import barnaba as bb - _logger.info("Computing J-couplings from MD trajectory...") + _logger.info("Compute J-couplings from MD trajectory") if couplings is not None: # Check if the provided coupling names are valid @@ -228,25 +220,32 @@ def compute_jcouplings(self, couplings=None, residues=None): # residue_list: list of M nucleobases values, resname_list = bb.jcouplings_traj(self.traj, couplings=couplings, residues=residues) + # Convert numpy.float to float to avoid serialization issues + replace_nan_with_none = lambda x: None if np.isscalar(x) and np.isnan(x) else x.item() + # Loop over residues and couplings to store the computed values coupling_dict = dict() for i, resname in enumerate(resname_list): _values = values[:,i,:] # Coupling values of i-th residue values_by_names = dict() for j, coupling_name in enumerate(couplings): - # Function to replace np.nan with None - avg = np.round(_values[:,j].mean(), 5) # Mean value of H1H2 coupling of i-th residue - std = np.round(_values[:,j].std(), 5) # Standard deviation of H1H2 coupling of i-th residue - - replace_nan_with_none = lambda x: None if np.isscalar(x) and np.isnan(x) else x - avg = replace_nan_with_none(avg) - std = replace_nan_with_none(std) - if avg: - avg = avg.item() - if std: - std = std.item() - # Convert numpy.float to float to avoid serialization issues - values_by_names[coupling_name] = {'avg': avg, 'std': std} + avg_raw = np.round(_values[:,j].mean(), 5) # e.g. mean value of H1H2 coupling of i-th residue + std_raw = np.round(_values[:,j].std(), 5) # e.g. standard deviation of H1H2 coupling of i-th residue + avg_raw = replace_nan_with_none(avg_raw) + std_raw = replace_nan_with_none(std_raw) + if weights is not None: + arr = _values[:,j] * weights + #_logger.info(f'non-weighted: {_values[:,j]}') + #_logger.info(f'weights: {weights}') + #_logger.info(f'weighted: {arr}') + avg = np.round(arr.mean(), 5) + std = np.round(arr.std(), 5) + avg = replace_nan_with_none(avg) + std = replace_nan_with_none(std) + else: + avg = avg_raw + std = std_raw + values_by_names[coupling_name] = {'avg': avg, 'std': std, 'avg_raw': avg_raw, 'std_raw': std_raw} coupling_dict[resname] = values_by_names return coupling_dict @@ -266,6 +265,9 @@ def get_available_couplings(self): return available_coupling_names +# +# Future work? 
+# class ProteinSystem(BaseDataLoader): def __init__(self, **kwargs): super(ProteinSystem, self).__init__(**kwargs) diff --git a/espfit/app/sampler.py b/espfit/app/sampler.py index 6ec6d5e..eb211e5 100644 --- a/espfit/app/sampler.py +++ b/espfit/app/sampler.py @@ -123,7 +123,7 @@ def _get_platform(self): from openmmtools.utils import get_fastest_platform platform = get_fastest_platform() platform_name = platform.getName() - _logger.info(f"Fastest platform: {platform_name}") + _logger.debug(f"Fastest platform: {platform_name}") if platform_name == "CUDA": platform.setPropertyDefaultValue('DeterministicForces', 'true') # default is false platform.setPropertyDefaultValue('Precision', 'mixed') # default is single @@ -141,11 +141,10 @@ def minimize(self, output_directory_path=None): ------- None """ - if output_directory_path is not None: self.output_directory_path = output_directory_path # property decorator is called - _logger.info(f"Minimizing system for maximum {self.maxIterations} steps.") + _logger.debug(f"Minimizing system for maximum {self.maxIterations} steps") self.simulation.minimizeEnergy(self.maxIterations) @@ -161,30 +160,14 @@ def run(self, output_directory_path=None): ------- None """ + import mdtraj + from mdtraj.reporters import NetCDFReporter + from openmm.app import CheckpointReporter, StateDataReporter + if output_directory_path is not None: self.output_directory_path = output_directory_path # property decorator is called # Select atoms to save - import mdtraj - #if self.atomSubset == 'solute': - # self.atom_indices = [] - # mdtop = mdtraj.Topology.from_openmm(self.simulation.topology) - # res = [ r for r in mdtop.residues if r.name not in ('HOH', 'NA', 'CL', 'K') ] - # for r in res: - # for a in r.atoms: - # self.atom_indices.append(a.index) - #elif self.atomSubset == 'all': - # self.atom_indices = None - #elif self.atomSubset == 'not water': - # self.atom_indices = [] - # mdtop = mdtraj.Topology.from_openmm(self.simulation.topology) - # res = [ r for r in mdtop.residues if r.name not in ('HOH') ] - # for r in res: - # for a in r.atoms: - # self.atom_indices.append(a.index) - #else: - # raise ValueError(f"Invalid atomSubset: {self.atomSubset}. 
Expected 'solute', 'all', or 'not water'.") - if self.atomSubset == 'all': self.atom_indices = None else: @@ -199,18 +182,13 @@ def run(self, output_directory_path=None): self.atom_indices.append(a.index) # Define reporter - from mdtraj.reporters import NetCDFReporter - from openmm.app import CheckpointReporter, StateDataReporter - self._check_file_exists("traj.nc") self.simulation.reporters.append(NetCDFReporter(os.path.join(self.output_directory_path, f"traj.nc"), min(self.netcdf_frequency, self.nsteps), atomSubset=self.atom_indices)) - self._check_file_exists("checkpoint.chk") self.simulation.reporters.append(CheckpointReporter(os.path.join(self.output_directory_path, f"checkpoint.chk"), min(self.checkpoint_frequency, self.nsteps))) - self._check_file_exists("reporter.log") self.simulation.reporters.append(StateDataReporter(os.path.join(self.output_directory_path, f"reporter.log"), min(self.logging_frequency, self.nsteps), @@ -218,7 +196,7 @@ def run(self, output_directory_path=None): totalEnergy=True, temperature=True, volume=True, density=True, speed=True)) # Run - _logger.info(f"Run MD simulation for {self.nsteps} steps") + _logger.info(f"Running simulation for {self.nsteps} steps...") self.simulation.step(self.nsteps) @@ -248,7 +226,7 @@ def export_xml(self, exportSystem=True, exportState=True, exportIntegrator=True, None """ from openmm import XmlSerializer - _logger.info(f"Serialize and export system") + _logger.debug(f"Serialize and export system") if output_directory_path is not None: # Create a new output directory different from the one specified when the SetupSampler instance was created. @@ -332,6 +310,12 @@ class SetupSampler(BaseSimulation): create_system(biopolymer_file=None, ligand_file=None): Create biopolymer-ligand system and export serialized system XML file and solvated pdb file. + from_toml(filename, *args, **override_sampler_kwargs): + Create SetupSampler from a TOML configuration file. + + from_xml(filename): + Create SetupSampler from a serialized system XML file. + Examples -------- >>> from espfit.app.sampler import SetupSampler @@ -352,18 +336,15 @@ class SetupSampler(BaseSimulation): ['amber/protein.ff14SB.xml', 'amber/RNA.OL3.xml'] : pl-multi (TPO): NG, pl-single: OK, RNA: OK """ def __init__(self, - #small_molecule_forcefield='openff-2.1.0', small_molecule_forcefield='espfit/data/forcefield/espaloma-0.3.2.pt', forcefield_files = ['amber/ff14SB.xml', 'amber/phosaa14SB.xml'], water_model='tip3p', solvent_padding=9.0 * unit.angstroms, ionic_strength=0.15 * unit.molar, - #constraints=app.HBonds, hmass=3.0 * unit.amu, temperature=300.0 * unit.kelvin, pressure=1.0 * unit.atmosphere, pme_tol=2.5e-04, - #nonbonded_method=app.PME, barostat_period=50, timestep=4 * unit.femtoseconds, override_with_espaloma=True, @@ -383,8 +364,6 @@ def __init__(self, The padding distance around the solute in the solvent box. Default is 9.0 * unit.angstroms. ionic_strength : Quantity, optional The ionic strength of the solvent. Default is 0.15 * unit.molar. - constraints : object, optional - The type of constraints to be applied to the system. Default is app.HBonds. hmass : Quantity, optional The mass of the hydrogen atoms. Default is 3.0 * unit.amu. temperature : Quantity, optional @@ -393,8 +372,6 @@ def __init__(self, The pressure of the system. Default is 1.0 * unit.atmosphere. pme_tol : float, optional The Ewald error tolerance for PME electrostatics. Default is 2.5e-04. - nonbonded_method : object, optional - The nonbonded method to be used for the system. Default is app.PME. 
barostat_period : int, optional The frequency at which the barostat is applied. Default is 50. timestep : Quantity, optional @@ -408,12 +385,10 @@ def __init__(self, self.forcefield_files = self._update_forcefield_files(forcefield_files) self.solvent_padding = solvent_padding self.ionic_strength = ionic_strength - #self.constraints = constraints self.hmass = hmass self.temperature = temperature self.pressure = pressure self.pme_tol = pme_tol - #self.nonbonded_method = nonbonded_method self.barostat_period = barostat_period self.timestep = timestep self.override_with_espaloma = override_with_espaloma @@ -459,7 +434,7 @@ def from_toml(cls, filename, *args, **override_sampler_kwargs): raise ValueError("target is not specified in the configuration file") samplers = [] - _logger.info(f'Found {len(config)} systems in the configuration file') + _logger.debug(f'Found {len(config)} systems in the configuration file') for _config in config: sampler = cls() @@ -488,7 +463,6 @@ def from_toml(cls, filename, *args, **override_sampler_kwargs): else: raise ValueError(f"Invalid keyword argument: {key}") - # Expected kwargs: output_directory_path # Pass temporary espaloma model to the sampler if kwargs are given for key, value in override_sampler_kwargs.items(): if hasattr(sampler, key): @@ -661,8 +635,6 @@ def create_system(self, biopolymer_file=None, ligand_file=None): # Initialize system generator. _logger.debug("Initialize system generator") - #forcefield_kwargs = {'removeCMMotion': True, 'ewaldErrorTolerance': self.pme_tol, 'constraints' : self.constraints, 'rigidWater': True, 'hydrogenMass' : self.hmass} - #periodic_forcefield_kwargs = {'nonbondedMethod': self.nonbonded_method} forcefield_kwargs = {'removeCMMotion': True, 'ewaldErrorTolerance': self.pme_tol, 'constraints' : app.HBonds, 'rigidWater': True, 'hydrogenMass' : self.hmass} periodic_forcefield_kwargs = {'nonbondedMethod': app.PME} barostat = MonteCarloBarostat(self.pressure, self.temperature, self.barostat_period) @@ -682,11 +654,11 @@ def create_system(self, biopolymer_file=None, ligand_file=None): template_generator_kwargs=template_generator_kwargs) if ligand_file is not None: - _logger.info("Add molecules to system generator") + _logger.debug("Add molecules to system generator") self._system_generator.template_generator.add_molecules(self._ligand_offmol) # Solvate system - _logger.info("Solvating system...") + _logger.debug("Solvating system...") modeller = app.Modeller(self._complex_topology, self._complex_positions) modeller.addSolvent(self._system_generator.forcefield, model=self.water_model, padding=self.solvent_padding, ionicStrength=self.ionic_strength) @@ -704,7 +676,7 @@ def create_system(self, biopolymer_file=None, ligand_file=None): # (espfit/data/target/testsystems/nucleoside/pdbfixer_min.pdb). # No explicit error message was given. It failed to show the following logging information: # - # _logger.info(f'Requested to generate parameters for residue {residue}') + # _logger.debug(f'Requested to generate parameters for residue {residue}') # https://github.com/openmm/openmmforcefields/blob/main/openmmforcefields/generators/template_generators.py#L285 # # However, it works for protein test systems (espfit/data/target/testsystems/protein-ligand/target.pdb). @@ -712,7 +684,7 @@ def create_system(self, biopolymer_file=None, ligand_file=None): # As a workaround, we will delete the original `self._system_generator` and create a new one to regenerate the system with espaloma. 
# Only water and ion forcefield files will be used to regenerate the system. Solute molecules will be parametrized with espaloma. # - _logger.info("Regenerate system with espaloma.") + _logger.debug("Regenerate system with espaloma.") # Re-create system generator del self._system_generator @@ -757,13 +729,13 @@ def _regenerate_espaloma_system(self): import mdtraj from openff.toolkit import Molecule - _logger.info("Regenerate system with espaloma") + _logger.debug("Regenerate system with espaloma") # Check biopolymer chains mdtop = mdtraj.Topology.from_openmm(self.modeller_solvated_topology) chain_indices = [ chain.index for chain in self.modeller_solvated_topology.chains() ] biopolymer_chain_indices = [ chain_index for chain_index in chain_indices if mdtop.select(f"not (water or resname NA or resname K or resname CL or resname UNK) and chainid == {chain_index}").any() ] - _logger.info(f"Biopolymer chain indices: {biopolymer_chain_indices}") + _logger.debug(f"Biopolymer chain indices: {biopolymer_chain_indices}") # Get OpenMM topology of solute with one residue per molecule. # Espaloma will use residue name "XX". Check conflicting residue names. @@ -772,14 +744,14 @@ def _regenerate_espaloma_system(self): raise Exception('Found conflict residue name in biopolymer.') # Initilize espaloma topology - # TODO: From software engineering point of view, should this be `self.new_solvated_topology` or `new_solvated_topology`? + # TODO: Should this be `self.new_solvated_topology` or `new_solvated_topology`? self.new_solvated_topology = app.Topology() self.new_solvated_topology.setPeriodicBoxVectors(self.modeller_solvated_topology.getPeriodicBoxVectors()) new_atoms = {} # Regenerate biopolymer topology chain_index = 0 - _logger.info(f"Regenerating biopolymer topology...") + _logger.debug(f"Regenerating biopolymer topology...") for chain in self.modeller_solvated_topology.chains(): new_chain = self.new_solvated_topology.addChain(chain.id) # Convert biopolymer into a single residue @@ -844,7 +816,7 @@ def _update_espaloma_topology(self): ------- app.Topology : The updated topology reflecting the new system. """ - _logger.info("Update residue names in espaloma topology.") + _logger.debug("Update residue names in espaloma topology.") # Get original residue names. atom_name_lookup = [] diff --git a/espfit/app/train.py b/espfit/app/train.py index 49aeea8..1f3d977 100644 --- a/espfit/app/train.py +++ b/espfit/app/train.py @@ -3,12 +3,12 @@ TODO ---- -* Export loss to a file (e.g. LossReporter class?) * Add support to use multiple GPUs * Improve how data are parsed using dataclasses or pydantic """ import os import torch +import espaloma as esp import logging _logger = logging.getLogger(__name__) @@ -18,9 +18,9 @@ class EspalomaBase(object): def __init__(self): # Check if GPU is available if torch.cuda.is_available(): - _logger.info('GPU is available for training.') + _logger.debug('GPU is available for training.') else: - _logger.info('GPU is not available for training.') + _logger.debug('GPU is not available for training.') # Check torch data type _logger.debug(f'Torch data type is {torch.get_default_dtype()}') @@ -40,6 +40,10 @@ def from_toml(cls, filename, **override_espalomamodel_kwargs): filename : str Path to the TOML file containing the configuration for the espaloma model. + override_espalomamodel_kwargs : dict + A dictionary of keyword arguments to override the default settings for the + espaloma model. 
+ Returns ------- object @@ -92,9 +96,6 @@ def _get_base_module(espaloma_config): list A list of modules for the Espaloma network model. """ - - import espaloma as esp - # GNN gnn_method = 'SAGEConv' gnn_options = {} @@ -167,7 +168,6 @@ def create_model(espaloma_config): torch.nn.Sequential The constructed Espaloma network model. """ - import espaloma as esp from espfit.utils.espaloma.module import GetLoss # Get base model @@ -210,8 +210,6 @@ def save_model(self, net=None, best_model=None, model_name='espaloma.pt', output ------- None """ - import espaloma as esp - if output_directory_path is not None: os.makedirs(output_directory_path, exist_ok=True) else: @@ -328,31 +326,39 @@ def output_directory_path(self, value): os.makedirs(value, exist_ok=True) - def report_loss(self, loss_trajecotry): + def report_loss(self, epoch, loss_dict): """Report loss. Parameters ---------- - loss : dict - The loss trajectory that stores individual weighted losses for each epoch. + loss_dict : dict + The loss trajectory that stores individual weighted losses at a given epoch. Returns ------- None """ import pandas as pd - df = pd.DataFrame.from_dict(loss_trajecotry, orient='index') - df.to_csv(os.path.join(self.output_directory_path, 'reporter.log'), sep='\t', float_format='%.4f') + df = pd.DataFrame.from_dict(loss_dict, orient='index').T + df.insert(0, 'epoch', epoch) + + log_file_path = os.path.join(self.output_directory_path, 'reporter.log') + + if os.path.exists(log_file_path): + existing_headers = pd.read_csv(log_file_path, sep='\t', nrows=0).columns.tolist() + if set(df.columns) != set(existing_headers): + df_old = pd.read_csv(log_file_path, sep='\t') + df = pd.concat([df_old, df], ignore_index=True) + else: + df.to_csv(log_file_path, sep='\t', float_format='%.4f', index=False, header=False, mode='a') + else: + df.to_csv(log_file_path, sep='\t', float_format='%.4f', index=False) def train(self): """ Train the Espaloma network model. - TODO - ---- - * Export training settings to a file? - Returns ------- None @@ -394,12 +400,23 @@ def train_sampler(self, sampler_patience=800, neff_threshold=0.2, sampler_weight """ Train the Espaloma network model with sampler. - TODO - ---- - * Export loss to a file (e.g. LossReporter class?) - * Should `nsteps` be a variable when calling train_sampler? - * Should `sampler_patience` and `neff_threshold` be an instance variable of sampler.BaseSimulation? + Parameters + ---------- + sampler_patience : int, default=800 + The number of epochs to wait before using sampler. + + neff_threshold : float, default=0.2 + The minimum effective sample size threshold. + sampler_weight : float, default=1.0 + The weight for the sampler loss. + + debug : bool, default=False + If True, use espaloma-0.3.pt for debugging. 
+ + Returns + ------- + None """ from espfit.utils.units import HARTREE_TO_KCALPERMOL from espfit.utils.sampler.reweight import SetupSamplerReweight @@ -425,7 +442,6 @@ def train_sampler(self, sampler_patience=800, neff_threshold=0.2, sampler_weight SamplerReweight = SetupSamplerReweight() # Train - loss_trajectory = {} ds_tr_loader = self.dataset_train.view(collate_fn='graph', batch_size=self.batch_size, shuffle=True) optimizer = torch.optim.Adam(self.net.parameters(), lr=self.learning_rate) with torch.autograd.set_detect_anomaly(True): @@ -441,55 +457,44 @@ def train_sampler(self, sampler_patience=800, neff_threshold=0.2, sampler_weight if torch.cuda.is_available(): g = g.to("cuda:0") g.nodes["n1"].data["xyz"].requires_grad = True - - # Forward pass - # Note that returned values are weighted losses. + _loss, loss_dict = self.net(g) - # Append loss loss += _loss - # Include sampler loss after certain epochs if epoch > self.sampler_patience: - # Save checkpoint as local model (force field) - _samplers = self._setup_local_samplers(epoch, net_copy, debug) - neff = SamplerReweight.get_effective_sample_size(temporary_samplers=_samplers) # returns -1 if SamplerReweight.samplers is None + # Save checkpoint as local model (net.pt) + samplers = self._setup_local_samplers(epoch, net_copy, debug) + # neff_min is -1 if SamplerReweight.samplers is None + neff_min = SamplerReweight.get_effective_sample_size(temporary_samplers=samplers) # If effective sample size is below threshold, update SamplerReweight.samplers and re-run simulaton - if neff < self.neff_threshold: - _logger.info(f'Effective sample size ({neff}) below threshold ({self.neff_threshold}).') - SamplerReweight.samplers = _samplers + if neff_min < self.neff_threshold: + _logger.info(f'Minimum effective sample size ({neff_min:.3f}) below threshold ({self.neff_threshold})') + SamplerReweight.samplers = samplers SamplerReweight.run() - - # Delete temporary_samplers - del _samplers + del samplers # Compute sampler loss - _logger.info(f'Compute sampler loss.') loss_list = SamplerReweight.compute_loss() # list of torch.tensor - for sampler_index, _loss in enumerate(loss_list): - _sampler = SamplerReweight.samplers[sampler_index] - loss_dict[f'{_sampler.target_name}'] = _loss.item() - loss += _loss * sampler_weight + for sampler_index, sampler_loss in enumerate(loss_list): + sampler = SamplerReweight.samplers[sampler_index] + loss += sampler_loss * sampler_weight + loss_dict[f'{sampler.target_name}'] = sampler_loss.item() + loss_dict['neff'] = neff_min - # Append total and individual loss to loss_trajectory loss_dict['loss'] = loss.item() - loss_trajectory[epoch] = loss_dict + self.report_loss(epoch, loss_dict) - # Update weights + # Back propagate loss.backward() optimizer.step() if epoch % self.checkpoint_frequency == 0: # Note: returned loss is a joint loss of different units. - _loss = HARTREE_TO_KCALPERMOL * loss.pow(0.5).item() - _logger.info(f'epoch {epoch}: {_loss:.3f}') + #_loss = HARTREE_TO_KCALPERMOL * loss.pow(0.5).item() + _logger.info(f'Epoch {epoch}: loss={loss.item():.3f}') self._save_checkpoint(epoch) - # Export loss trajectory - # TODO: Report losses at every epoch - _logger.info(f'Export loss trajectory to a file.') - self.report_loss(loss_trajectory) - def _load_checkpoint(self): """Load the last checkpoint and restart the training process. @@ -546,14 +551,46 @@ def _save_checkpoint(self, epoch): def _save_local_model(self, epoch, net_copy): + """Save local model (force field). 
+
+        Parameters
+        ----------
+        epoch : int
+            The epoch number.
+
+        net_copy : torch.nn.Sequential
+            A deep copy of the Espaloma network model.
+
+        Returns
+        -------
+        None
+        """
         # Save checkpoint as temporary espaloma model (force field)
-        _logger.info(f'Save checkpoint{epoch}.pt as temporary espaloma model (force field).')
+        _logger.info(f'Save checkpoint{epoch}.pt as temporary espaloma model (net.pt)')
         self._save_checkpoint(epoch)
         local_model = os.path.join(self.output_directory_path, f"checkpoint{epoch}.pt")
         self.save_model(net=net_copy, best_model=local_model, model_name=f"net.pt", output_directory_path=self.output_directory_path)
 
 
     def _setup_local_samplers(self, epoch, net_copy, debug):
+        """Setup local samplers.
+
+        Parameters
+        ----------
+        epoch : int
+            The epoch number.
+
+        net_copy : torch.nn.Sequential
+            A deep copy of the Espaloma network model.
+
+        debug : bool
+            If True, use espaloma-0.3.2.pt for debugging.
+
+        Returns
+        -------
+        list
+            A list of sampler systems.
+        """
         from espfit.app.sampler import SetupSampler
 
         self._save_local_model(epoch, net_copy)
diff --git a/espfit/utils/espaloma/module.py b/espfit/utils/espaloma/module.py
index 6f3fb34..642d91e 100644
--- a/espfit/utils/espaloma/module.py
+++ b/espfit/utils/espaloma/module.py
@@ -56,6 +56,8 @@ class GetLoss(torch.nn.Module):
     compute_improper_loss(g):
         Compute improper l2 regularization
 
+    forward(g):
+        Compute joint loss
     """
     def __init__(self, weights={'energy': 1.0, 'force': 1.0, 'charge': 1.0, 'torsion': 1.0, 'improper': 1.0}):
         """Define loss function.
diff --git a/espfit/utils/graphs.py b/espfit/utils/graphs.py
index 3ab1fd0..32adeb1 100644
--- a/espfit/utils/graphs.py
+++ b/espfit/utils/graphs.py
@@ -585,8 +585,6 @@ def _merge_graphs(ds):
         for key in g.nodes['n1'].data.keys():
             if key not in ["q_ref", "idxs", "h0"]:
                 for i in range(1, len(ds)):
-                    if key == "xyz":
-                        n_confs = ds[i].nodes['n1'].data['xyz'].shape[1]
                     g.nodes['n1'].data[key] = torch.cat((g.nodes['n1'].data[key], ds[i].nodes['n1'].data[key]), dim=1)
 
     return g
diff --git a/espfit/utils/sampler/reweight.py b/espfit/utils/sampler/reweight.py
index 3142932..58b65ad 100644
--- a/espfit/utils/sampler/reweight.py
+++ b/espfit/utils/sampler/reweight.py
@@ -1,18 +1,46 @@
+"""
+Compute effective sample size and weights for each simulation.
+
+TODO
+----
+* Check J-coupling experimental error. Currently, fixed to 0.5 Hz.
+"""
 import os
 import logging
-from espfit.app.sampler import SetupSampler, BaseSimulation
 
 _logger = logging.getLogger(__name__)
 
 
 class SetupSamplerReweight(object):
+    """Setup sampler for reweighting simulation.
+
+    This class is responsible for setting up the sampler for reweighting simulation.
+    It provides methods to run the simulation, compute the effective sample size,
+    compute the loss, and compute the weighted observable.
+
+    Methods
+    -------
+    run():
+        Runs the simulation for each sampler.
+
+    get_effective_sample_size(temporary_samplers):
+        Computes the effective sample size and sampling weights for each sampler.
+
+    compute_loss():
+        Computes the loss for each sampler.
+    """
     def __init__(self):
        self.samplers = None
-        self.weights = None   # list
+        self.weights = dict()   # {'target_name': {'neff': neff, 'weights': w_i}}
 
 
     def run(self):
+        """Runs the simulation for each sampler.
+ + Returns + ------- + None + """ for sampler in self.samplers: _logger.info(f'Running simulation for {sampler.target_name} for {sampler.nsteps} steps...') sampler.minimize() @@ -20,17 +48,42 @@ def run(self): def get_effective_sample_size(self, temporary_samplers): + """Computes the effective sample size and sampling weights for each sampler. + + Parameters + ---------- + temporary_samplers : list + List of temporary samplers. + + Returns + ------- + float + The minimum effective sample size among all samplers. + """ + import mdtraj + import numpy as np + from openmm.unit import kilocalories_per_mole as kcalpermol + from espfit.utils.units import KB_T_KCALPERMOL - # Check if sampler is None if self.samplers is None: return -1 - # Compute effective sample size - import mdtraj - from openmm.unit import kilocalories_per_mole as kcalpermol - potential_energy_diff = [] for sampler, temporary_sampler in zip(self.samplers, temporary_samplers): + _logger.info(f'Compute effective sample size and sampling weights for {sampler.target_name}') + + # Get temperature + temp0 = sampler.temperature._value + temp1 = temporary_sampler.temperature._value + assert temp0 == temp1, f'Temperature should be equivalent but got sampler {temp0} K and temporary sampler {temp1} K' + beta = 1 / (KB_T_KCALPERMOL * temp0) + _logger.debug(f'beta temperature in kcal/mol: {beta}') + + # Get position from trajectory traj = mdtraj.load(sampler.output_directory_path + '/traj.nc', top=sampler.output_directory_path + '/solvated.pdb') + _logger.info(f'Found {traj.n_frames} frames in trajectory') + + # Compute weights and effective sample size + log_w = [] for i in range(traj.n_frames): # U(x0, theta0) sampler.simulation.context.setPositions(traj.openmm_positions(i)) @@ -38,34 +91,65 @@ def get_effective_sample_size(self, temporary_samplers): # U(x0, theta1) temporary_sampler.simulation.context.setPositions(traj.openmm_positions(i)) reduced_potential_energy = temporary_sampler.simulation.context.getState(getEnergy=True).getPotentialEnergy() - - delta = (potential_energy - reduced_potential_energy).value_in_unit(kcalpermol) - potential_energy_diff.append(delta) - - _logger.info(f'{potential_energy._value}, {reduced_potential_energy._value}, {delta}') - - neff = 0.5 - return neff + # deltaU = U(x0, theta1) - U(x0, theta0) + delta = (reduced_potential_energy - potential_energy).value_in_unit(kcalpermol) + # log_w = ln(exp(-beta * delta)) + w = -1 * beta * delta + log_w.append(w) + + #_logger.debug(f'U(x0, theta0): {potential_energy.value_in_unit(kcalpermol):10.3f} kcal/mol') + #_logger.debug(f'U(x0, theta1): {reduced_potential_energy.value_in_unit(kcalpermol):10.3f} kcal/mol') + #_logger.debug(f'deltaU: {delta:10.3f} kcal/mol') + #_logger.debug(f'log_w: {w:10.3f}') + + # Compute weights and effective sample size (ratio: 0 to 1) + w_i = np.exp(log_w) / np.sum(np.exp(log_w)) + neff = np.sum(w_i) ** 2 / np.sum(w_i ** 2) / len(w_i) + #_logger.debug(f'w_i_sum: {np.sum(w_i):10.3f}') + #_logger.debug(f'neff: {neff:10.3f}') + + self.weights[f'{sampler.target_name}'] = {'neff': neff, 'weights': w_i} + #_logger.info(f'{self.weights}') + neffs = [self.weights[key]['neff'] for key in self.weights.keys()] + + return min(neffs) def compute_loss(self): + """Computes the loss for each sampler. + Returns + ------- + list + List of torch tensors representing the loss for each sampler. 
+ """ loss_list = [] for sampler in self.samplers: + _logger.info(f'Compute loss for {sampler.target_name}') loss = self._compute_loss_per_system(sampler) # torch.tensor loss_list.append(loss) - # list of torch.tensor return loss_list def _compute_loss_per_system(self, sampler): - + """Computes the loss per system for a given sampler. + + Parameters + ---------- + sampler : object + The sampler object. + + Returns + ------- + torch.Tensor + The loss per system as a torch tensor. + """ import torch # Compute experimental observable exp = self._get_experiment_data(sampler.target_class, sampler.target_name) - pred = self._compute_observable(sampler.atomSubset, sampler.target_class, sampler.output_directory_path) + pred = self._compute_weighted_observable(sampler.atomSubset, sampler.target_name, sampler.output_directory_path) loss = [] for resi_index, exp_dict in enumerate(exp.values()): @@ -84,18 +168,32 @@ def _compute_loss_per_system(self, sampler): pred_error = list(pred.values())[resi_index][key]['std'] # standard deviation _logger.debug(f'Exp ({resi_index}-{key}): {exp}') _logger.debug(f'Pred ({resi_index}-{key}): {pred}') - # Compute loss + numerator = (pred_value - exp_value) ** 2 dominator = (exp_error ** 2 + pred_error ** 2) loss.append(numerator / dominator) # Compute loss loss_avg = torch.mean(torch.tensor(loss)) - _logger.info(f'Computed sampler loss: {loss_avg.item()}') + _logger.info(f'Sampler loss: {loss_avg.item():.3f}') return loss_avg def _get_experiment_data(self, target_class, target_name): + """Retrieves the experimental data for a given target. + + Parameters + ---------- + target_class : str + The class of the target. + + target_name : str + The name of the target. + + Returns + ------- + dict : The experimental data for the target. + """ import yaml from importlib.resources import files @@ -107,17 +205,42 @@ def _get_experiment_data(self, target_class, target_name): return d['experiment_1']['measurement'] - def _compute_observable(self, atomSubset, target_class, output_directory_path): - if target_class == 'nucleoside': - from espfit.app.analysis import RNASystem - target = RNASystem(atomSubset=atomSubset) - target.load_traj(input_directory_path=output_directory_path) - pred = target.compute_jcouplings() - _logger.debug(f'Computed observable: {pred}') - else: - raise NotImplementedError(f'Observable for {target_class} is not implemented.') + def _compute_weighted_observable(self, atomSubset, target_name, output_directory_path): + """Computes the weighted observable for a given target. + + Parameters + ---------- + atomSubset : str + The atom subset. + target_name : str + The name of the target. + + output_directory_path : str + The output directory path. + + Returns + ------- + dict : The computed weighted observable. 
+        """
         import yaml
+        from espfit.app.analysis import RNASystem
+
+        # Load trajectory
+        target = RNASystem(atomSubset=atomSubset)
+        target.load_traj(input_directory_path=output_directory_path)
+
+        # Compute observable
+        if self.weights.keys():
+            #print('weights key found')
+            #print(f'{self.weights}')
+            pred = target.compute_jcouplings(weights=self.weights[target_name]['weights'])
+        else:
+            #print('weights key not found')
+            pred = target.compute_jcouplings(weights=None)
+        _logger.debug(f'Computed observable: {pred}')
+
+        # Export observable
         with open(os.path.join(output_directory_path, 'pred.yaml'), 'w') as f:
             yaml.dump(pred, f, allow_unicode=True)
diff --git a/espfit/utils/units.py b/espfit/utils/units.py
index 3d5c2c4..9db83d5 100644
--- a/espfit/utils/units.py
+++ b/espfit/utils/units.py
@@ -6,12 +6,17 @@
 hartree = 1 * ureg.hartree
 bohr = 1 * ureg.bohr
 angstrom = 1 * ureg.angstrom
+kelvin = 1 * ureg.kelvin
+kB = ureg.boltzmann_constant
+kBT = kB * kelvin
+kcalpermol = ureg.kilocalorie/(ureg.avogadro_constant*ureg.mole)
 
 # Conversion factors
 #HARTEE_TO_KCALPERMOL = 627.509
-#BOHR_TO_ANGSTROMS = 0.529
-HARTREE_TO_KCALPERMOL = hartree.to(ureg.kilocalorie/(ureg.avogadro_constant*ureg.mole)).magnitude
+HARTREE_TO_KCALPERMOL = hartree.to(kcalpermol).magnitude
 BOHR_TO_ANGSTROMS = bohr.to(ureg.angstrom).magnitude
+KB_T_KCALPERMOL = kBT.to(kcalpermol).magnitude
 
 
 def convert_string_to_unit(unit_string):
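For reference, the quantities this patch wires together, `KB_T_KCALPERMOL` from `espfit/utils/units.py` and the per-frame energy differences in `SetupSamplerReweight.get_effective_sample_size`, amount to a standard self-normalized importance-weight calculation. The following is a minimal numpy-only sketch of that arithmetic, not espfit code: the hard-coded Boltzmann constant and the log-sum-exp shift (added here for numerical stability) are assumptions of this sketch.

```python
import numpy as np

KB_KCALPERMOL = 0.0019872041  # Boltzmann constant in kcal/(mol K); plays the role of KB_T_KCALPERMOL

def effective_sample_size(delta_u, temperature=300.0):
    """Kish effective sample size (a ratio in (0, 1]) and weights from per-frame
    energy differences delta_u = U(x0, theta1) - U(x0, theta0) in kcal/mol."""
    beta = 1.0 / (KB_KCALPERMOL * temperature)
    log_w = -beta * np.asarray(delta_u)  # log importance weight per frame
    log_w -= log_w.max()                 # stabilize the exponentials (not in the patch)
    w = np.exp(log_w)
    w /= w.sum()                         # self-normalized weights, sum to 1
    neff = 1.0 / (np.sum(w ** 2) * len(w))
    return neff, w

# Small energy perturbations keep neff near 1; large ones collapse it toward 1/n_frames.
neff, w = effective_sample_size(np.random.normal(0.0, 0.5, size=100))
```

The returned weights are what `_compute_weighted_observable` passes to `compute_jcouplings(weights=...)`, so a low `neff` signals that a handful of frames dominate the reweighted observable and the simulation should be re-run with the updated force field.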
From 6a0f43db4517c59d273743a09bf6d8202deae96d Mon Sep 17 00:00:00 2001
From: kt
Date: Wed, 6 Mar 2024 13:17:54 -0500
Subject: [PATCH 39/59] fix docstring

---
 espfit/utils/logging.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/espfit/utils/logging.py b/espfit/utils/logging.py
index 97b825a..0fd0f22 100644
--- a/espfit/utils/logging.py
+++ b/espfit/utils/logging.py
@@ -11,8 +11,8 @@ def set_logging_level(level):
 
     Parameters
     ----------
-    level : int
-        The logging level. For example, logging.INFO.
+    level : str
+        The logging level. Options are [NOTSET, DEBUG, INFO, WARNING, ERROR, CRITICAL, FATAL].
 
     Returns
     -------
From fd052d53b8c00e0c1f5a86173397ff90e6e7af5e Mon Sep 17 00:00:00 2001
From: kt
Date: Wed, 6 Mar 2024 14:34:01 -0500
Subject: [PATCH 40/59] remove old comment

---
 espfit/utils/sampler/reweight.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/espfit/utils/sampler/reweight.py b/espfit/utils/sampler/reweight.py
index 58b65ad..cfb14bc 100644
--- a/espfit/utils/sampler/reweight.py
+++ b/espfit/utils/sampler/reweight.py
@@ -165,7 +165,7 @@ def _compute_loss_per_system(self, sampler):
                 exp_error = 0.5   # TODO: Check experimental error
                 resi_index = int(resi_index)
                 pred_value = list(pred.values())[resi_index][key]['avg']
-                pred_error = list(pred.values())[resi_index][key]['std']  # standard deviation
+                pred_error = list(pred.values())[resi_index][key]['std']
                 _logger.debug(f'Exp ({resi_index}-{key}): {exp}')
                 _logger.debug(f'Pred ({resi_index}-{key}): {pred}')
 
@@ -232,11 +232,8 @@ def _compute_weighted_observable(self, atomSubset, target_name, output_directory
 
         # Compute observable
         if self.weights.keys():
-            #print('weights key found')
-            #print(f'{self.weights}')
             pred = target.compute_jcouplings(weights=self.weights[target_name]['weights'])
         else:
-            #print('weights key not found')
             pred = target.compute_jcouplings(weights=None)
         _logger.debug(f'Computed observable: {pred}')
 
From 3ad0aa926d2bd2e13b66348cddb95ae2c136daeb Mon Sep 17 00:00:00 2001
From: kt
Date: Wed, 6 Mar 2024 14:34:20 -0500
Subject: [PATCH 41/59] uncomment version

---
 espfit/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/espfit/__init__.py b/espfit/__init__.py
index d312d34..564e5cb 100644
--- a/espfit/__init__.py
+++ b/espfit/__init__.py
@@ -25,4 +25,4 @@
 #from .espfit import *
 
 
-#from ._version import __version__
+from ._version import __version__
From 19f442f232871b2cf3a2d91d51fa5842e03ec77f Mon Sep 17 00:00:00 2001
From: kt
Date: Wed, 6 Mar 2024 14:36:25 -0500
Subject: [PATCH 42/59] support minimum energy conformer for each chunk when
 reshaping graphs

---
 espfit/utils/graphs.py | 31 +++++++++++++++++++++++++------
 1 file changed, 25 insertions(+), 6 deletions(-)

diff --git a/espfit/utils/graphs.py b/espfit/utils/graphs.py
index 32adeb1..75d47f4 100644
--- a/espfit/utils/graphs.py
+++ b/espfit/utils/graphs.py
@@ -440,7 +440,7 @@ def compute_relative_energy(self):
         del new_graphs
 
 
-    def reshape_conformation_size(self, n_confs=50):
+    def reshape_conformation_size(self, n_confs=50, include_min_energy_conf=False):
         """Reshape conformation size.
 
         This is a work around to handle different graph size (shape). DGL requires at least one dimension with same size.
@@ -469,17 +469,18 @@
         self._remove_node_features()
 
         new_graphs = []
+        n_confs_cache = n_confs
         for i, g in enumerate(self.graphs):
             n = g.nodes['n1'].data['xyz'].shape[1]
 
             if n == n_confs:
-                _logger.info(f"Mol #{i} ({n} conformations)")
+                _logger.info(f"Mol #{i} ({n} conformers)")
                 new_graphs.append(g)
 
             elif n < n_confs:
                 random.seed(self.random_seed)
                 index_random = random.choices(range(0, n), k=n_confs-n)
-                _logger.info(f"Randomly select {len(index_random)} conformations from Mol #{i} ({n} conformations)")
+                _logger.info(f"Randomly select {len(index_random)} conformers from Mol #{i} ({n} conformers)")
 
                 _g = copy.deepcopy(g)
                 _g.nodes["g"].data["u_ref"] = torch.cat((_g.nodes['g'].data['u_ref'], _g.nodes['g'].data['u_ref'][:, index_random]), dim=-1)
@@ -488,9 +489,17 @@
                 new_graphs.append(_g)
 
             else:
-                _logger.info(f"Shuffling Mol #{i} ({n} conformations) and splitting into {n_confs}")
                 random.seed(self.random_seed)
                 idx_range = random.sample(range(n), k=n)
+
+                # Get index for minimum energy conformer
+                if include_min_energy_conf:
+                    index_min = [g.nodes['g'].data['u_ref'].argmin().item()]
+                    n_confs = n_confs_cache - 1
+                    _logger.info(f"Shuffle Mol #{i} ({n} conformers) and split into {n_confs} conformers and add minimum energy conformer (index #{index_min[0]})")
+                else:
+                    _logger.info(f"Shuffle Mol #{i} ({n} conformers) and split into {n_confs} conformers")
+
                 for j in range(n // n_confs + 1):
                     _g = copy.deepcopy(g)
 
@@ -498,7 +507,12 @@
                         index = range(j*n_confs, n)
                         random.seed(self.random_seed)
                         index_random = random.choices(range(0, n), k=(j+1)*n_confs-n)
-                        _logger.debug(f"Iteration {j}: Randomly select {len(index_random)} conformers")
+
+                        if include_min_energy_conf:
+                            index_random = index_random + index_min
+                            _logger.debug(f"Iteration {j}: Randomly select {len(index_random)} conformers and add minimum energy conformer")
+                        else:
+                            _logger.debug(f"Iteration {j}: Randomly select {len(index_random)} conformers")
 
                         _g.nodes["g"].data["u_ref"] = torch.cat((_g.nodes['g'].data['u_ref'][:, index], _g.nodes['g'].data['u_ref'][:, index_random]), dim=-1)
                         _g.nodes["n1"].data["xyz"] = torch.cat((_g.nodes['n1'].data['xyz'][:, index, :], _g.nodes['n1'].data['xyz'][:, index_random, :]), dim=1)
@@ -507,7 +521,12 @@
                         idx1 = j*n_confs
                         idx2 = (j+1)*n_confs
                         index = idx_range[idx1:idx2]
-                        _logger.debug(f"Iteration {j}: Extract indice from {idx1} to {idx2}")
+
+                        if include_min_energy_conf:
+                            index = index + index_min
+                            _logger.debug(f"Iteration {j}: Extract indices from {idx1} to {idx2} and add minimum energy conformer")
+                        else:
+                            _logger.debug(f"Iteration {j}: Extract indices from {idx1} to {idx2}")
 
                         _g.nodes["g"].data["u_ref"] = _g.nodes['g'].data['u_ref'][:, index]
                         _g.nodes["n1"].data["xyz"] = _g.nodes['n1'].data['xyz'][:, index, :]
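As a concrete illustration of the chunking introduced in the patch above: with `include_min_energy_conf=True`, each chunk is built from `n_confs - 1` shuffled conformer indices plus the index of the minimum-energy conformer, so every split graph still has a uniform conformer count. The toy sketch below (hypothetical tensor sizes, standalone rather than espfit code) mirrors that shuffle/split/pad logic without calling `reshape_conformation_size` itself:

```python
import random
import torch

u_ref = torch.randn(1, 7)   # energies for 7 conformers of one molecule (toy data)
n, n_confs, seed = 7, 3, 2666

random.seed(seed)
idx_range = random.sample(range(n), k=n)   # shuffled conformer indices
index_min = [u_ref.argmin().item()]        # minimum-energy conformer index
n_confs = n_confs - 1                      # reserve one slot per chunk for it

chunks = []
for j in range(n // n_confs + 1):
    index = idx_range[j * n_confs:(j + 1) * n_confs]
    if len(index) < n_confs:               # last chunk: pad by resampling with replacement
        index = index + random.choices(range(n), k=n_confs - len(index))
    chunks.append(index + index_min)       # every chunk carries the minimum-energy conformer

assert all(len(chunk) == 3 for chunk in chunks)   # uniform size, as DGL batching requires
```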
From 252012e59e30e2a4b0cd8cc79e8499b934a3bf20 Mon Sep 17 00:00:00 2001
From: kt
Date: Wed, 6 Mar 2024 14:38:23 -0500
Subject: [PATCH 43/59] fix report_loss to properly append loss data at each
 epoch

---
 espfit/app/train.py | 36 ++++++++++++++++++------------------
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/espfit/app/train.py b/espfit/app/train.py
index 1f3d977..2190baa 100644
--- a/espfit/app/train.py
+++ b/espfit/app/train.py
@@ -339,20 +339,17 @@ def report_loss(self, epoch, loss_dict):
         None
         """
         import pandas as pd
-        df = pd.DataFrame.from_dict(loss_dict, orient='index').T
-        df.insert(0, 'epoch', epoch)
 
         log_file_path = os.path.join(self.output_directory_path, 'reporter.log')
+        df_new = pd.DataFrame.from_dict(loss_dict, orient='index').T
+        df_new.insert(0, 'epoch', epoch)
 
         if os.path.exists(log_file_path):
-            existing_headers = pd.read_csv(log_file_path, sep='\t', nrows=0).columns.tolist()
-            if set(df.columns) != set(existing_headers):
-                df_old = pd.read_csv(log_file_path, sep='\t')
-                df = pd.concat([df_old, df], ignore_index=True)
-            else:
-                df.to_csv(log_file_path, sep='\t', float_format='%.4f', index=False, header=False, mode='a')
+            df_old = pd.read_csv(log_file_path, sep='\t')
+            df = pd.concat([df_old, df_new])
         else:
-            df.to_csv(log_file_path, sep='\t', float_format='%.4f', index=False)
+            df = df_new
+        df.to_csv(log_file_path, sep='\t', float_format='%.4f', index=False)
 
 
     def train(self):
@@ -385,14 +382,18 @@ def train(self):
                     if torch.cuda.is_available():
                         g = g.to("cuda:0")
                     g.nodes["n1"].data["xyz"].requires_grad = True
-                    loss = self.net(g)
+                    #loss = self.net(g)
+                    loss, loss_dict = self.net(g)
                     loss.backward()
                     optimizer.step()
-
+
+                loss_dict['loss'] = loss.item()
+                self.report_loss(epoch, loss_dict)
+
                 if epoch % self.checkpoint_frequency == 0:
                     # Note: returned loss is a joint loss of different units.
-                    _loss = HARTREE_TO_KCALPERMOL * loss.pow(0.5).item()
-                    _logger.info(f'epoch {epoch}: {_loss:.3f}')
+                    loss = HARTREE_TO_KCALPERMOL * loss.pow(0.5).item()
+                    _logger.info(f'Epoch {epoch}: loss={loss.item():.3f}')
                     self._save_checkpoint(epoch)
 
@@ -418,7 +419,6 @@ def train_sampler(self, sampler_patience=800, neff_threshold=0.2, sampler_weight
         -------
         None
         """
-        from espfit.utils.units import HARTREE_TO_KCALPERMOL
         from espfit.utils.sampler.reweight import SetupSamplerReweight
 
         # Note: RuntimeError will be raised if copy.deepcopy is used.
@@ -463,8 +463,8 @@ def train_sampler(self, sampler_patience=800, neff_threshold=0.2, sampler_weight
 
                 if epoch > self.sampler_patience:
                     # Save checkpoint as local model (net.pt)
+                    # `neff_min` is -1 if SamplerReweight.samplers is None
                     samplers = self._setup_local_samplers(epoch, net_copy, debug)
-                    # neff_min is -1 if SamplerReweight.samplers is None
                     neff_min = SamplerReweight.get_effective_sample_size(temporary_samplers=samplers)
 
                     # If effective sample size is below threshold, update SamplerReweight.samplers and re-run simulaton
@@ -546,7 +546,7 @@ def _save_checkpoint(self, epoch):
         -------
         None
         """
-        checkpoint_file = os.path.join(self.output_directory_path, f"checkpoint{epoch}.pt")
+        checkpoint_file = os.path.join(self.output_directory_path, f"ckpt{epoch}.pt")
         torch.save(self.net.state_dict(), checkpoint_file)
 
 
@@ -566,9 +566,9 @@ def _save_local_model(self, epoch, net_copy):
         None
         """
         # Save checkpoint as temporary espaloma model (force field)
-        _logger.info(f'Save checkpoint{epoch}.pt as temporary espaloma model (net.pt)')
+        _logger.info(f'Save ckpt{epoch}.pt as temporary espaloma model (net.pt)')
         self._save_checkpoint(epoch)
-        local_model = os.path.join(self.output_directory_path, f"checkpoint{epoch}.pt")
+        local_model = os.path.join(self.output_directory_path, f"ckpt{epoch}.pt")
         self.save_model(net=net_copy, best_model=local_model, model_name=f"net.pt", output_directory_path=self.output_directory_path)
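The practical consequence of the `report_loss` change: `reporter.log` gains one row per epoch even when the column set changes mid-run (per-target sampler losses and `neff` only appear once the sampler patience is exceeded), because `pd.concat` aligns differing headers and leaves NaN in the gaps. A small standalone illustration of the same append pattern, with hypothetical loss values and file name:

```python
import os
import pandas as pd

log_file_path = 'reporter.log'  # hypothetical path; espfit writes it under output_directory_path

# Epoch 2 introduces new 'adenosine' and 'neff' columns; concat aligns the
# headers and fills epoch 1 with NaN instead of corrupting the table.
for epoch, loss_dict in [(1, {'loss': 0.91}), (2, {'loss': 0.74, 'adenosine': 1.23, 'neff': 0.85})]:
    df_new = pd.DataFrame.from_dict(loss_dict, orient='index').T
    df_new.insert(0, 'epoch', epoch)
    if os.path.exists(log_file_path):
        df = pd.concat([pd.read_csv(log_file_path, sep='\t'), df_new])
    else:
        df = df_new
    df.to_csv(log_file_path, sep='\t', float_format='%.4f', index=False)
```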
From 5bf8717f85d3057891ac64db618c5b8a3fb8d Mon Sep 17 00:00:00 2001
From: kt
Date: Wed, 6 Mar 2024 15:22:08 -0500
Subject: [PATCH 44/59] update docstring for reshape_conformation_size

---
 espfit/utils/graphs.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/espfit/utils/graphs.py b/espfit/utils/graphs.py
index 75d47f4..d6b0d43 100644
--- a/espfit/utils/graphs.py
+++ b/espfit/utils/graphs.py
@@ -455,6 +455,9 @@ def reshape_conformation_size(self, n_confs=50, include_min_energy_conf=False):
         n_confs : int, default=50
             Number of conformations per graph (molecule).
 
+        include_min_energy_conf : boolean, default=False
+            If True, the minimum energy conformer will be included in all split graphs.
+
         Returns
         -------
         None
From 8d2871f1405a08155406791616376fe83ab07a88 Mon Sep 17 00:00:00 2001
From: kt
Date: Wed, 6 Mar 2024 15:23:17 -0500
Subject: [PATCH 45/59] update docstring in CustomGraphDataset

---
 espfit/utils/graphs.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/espfit/utils/graphs.py b/espfit/utils/graphs.py
index d6b0d43..359dbff 100644
--- a/espfit/utils/graphs.py
+++ b/espfit/utils/graphs.py
@@ -33,7 +33,7 @@ class CustomGraphDataset(GraphDataset):
     compute_baseline_energy_force(forcefield_list=['openff-2.0.0']):
         Compute energies and forces using other force fields.
 
-    reshape_conformation_size(n_confs=50):
+    reshape_conformation_size(n_confs=50, include_min_energy_conf=False):
         Reshape conformation size.
 
     compute_relative_energy():
From ee1503e62568a54434bc1a18563cebfd610f1ac6 Mon Sep 17 00:00:00 2001
From: kt
Date: Wed, 6 Mar 2024 16:58:24 -0500
Subject: [PATCH 46/59] fix bug in training when restarting from existing
 checkpoint file

---
 espfit/app/train.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/espfit/app/train.py b/espfit/app/train.py
index 2190baa..e4cbfdb 100644
--- a/espfit/app/train.py
+++ b/espfit/app/train.py
@@ -393,7 +393,7 @@ def train(self):
                 if epoch % self.checkpoint_frequency == 0:
                     # Note: returned loss is a joint loss of different units.
                     loss = HARTREE_TO_KCALPERMOL * loss.pow(0.5).item()
-                    _logger.info(f'Epoch {epoch}: loss={loss.item():.3f}')
+                    _logger.info(f'Epoch {epoch}: loss={loss:.3f}')
                     self._save_checkpoint(epoch)
 
 
@@ -514,10 +514,10 @@ def _load_checkpoint(self):
 
         checkpoints = glob.glob("{}/*.pt".format(self.output_directory_path))
 
         if checkpoints:
-            n = [ int(c.split('net')[1].split('.')[0]) for c in checkpoints ]
+            n = [ int(c.split('ckpt')[1].split('.')[0]) for c in checkpoints ]
             n.sort()
             restart_epoch = n[-1]
-            restart_checkpoint = os.path.join(self.output_directory_path, f"net{restart_epoch}.pt")
+            restart_checkpoint = os.path.join(self.output_directory_path, f"ckpt{restart_epoch}.pt")
             self.net.load_state_dict(torch.load(restart_checkpoint))
             logging.info(f'Restarting from ({restart_checkpoint}).')
         else:
From c757c73efa03dd7da9da93f0a90e08a8750cc77e Mon Sep 17 00:00:00 2001
From: kt
Date: Wed, 6 Mar 2024 17:00:13 -0500
Subject: [PATCH 47/59] check output files after running test in
 test_train_sampler

---
 espfit/tests/test_app_train_sampler.py | 27 ++++++++++++++------------
 1 file changed, 15 insertions(+), 12 deletions(-)

diff --git a/espfit/tests/test_app_train_sampler.py b/espfit/tests/test_app_train_sampler.py
index e0747a1..8d39e32 100644
--- a/espfit/tests/test_app_train_sampler.py
+++ b/espfit/tests/test_app_train_sampler.py
@@ -53,13 +53,9 @@ def test_load_dataset(tmpdir):
 
 
 def test_train_sampler(test_load_dataset, test_create_espaloma_from_toml):
-
-    """
-    TODO
-    ----
-
-    * sampler.py needs to support loading temporary espaloma model during training
-    """
+    """Test function to train a sampler."""
+    import os
+    import glob
 
     # Load dataset and model
     ds = test_load_dataset
 
     # Set espaloma parameters
     model.dataset_train = ds
-    model.epochs = 10
+
model.epochs = 15 # Train - model.train_sampler(sampler_patience=3, neff_threshold=0.2, debug=True) # fails if sampler_patience is < epochs + sampler_patience = 10 + # Force sampler to run after reaching sampler patience by setting neff_threshold to 1.0 + model.train_sampler(sampler_patience=sampler_patience, neff_threshold=1.0, sampler_weight=1) # Check outputs - #import glob - #assert len(glob.glob(model.output_directory_path + '/*')) > 0 - #assert model.sampler is not None \ No newline at end of file + n_ckpt = len(glob.glob(os.path.join(model.output_directory_path, 'ckpt*pt'))) + assert n_ckpt == int(model.epochs / model.checkpoint_frequency) + + n_adenosine_pred_yaml = len(glob.glob(os.path.join(model.output_directory_path, 'adenosine/*/pred.yaml'))) + assert n_adenosine_pred_yaml == int(model.epochs - sampler_patience) + + n_cytidine_pred_yaml = len(glob.glob(os.path.join(model.output_directory_path, 'cytidine/*/pred.yaml'))) + assert n_cytidine_pred_yaml == int(model.epochs - sampler_patience) From 72946c49f32cfc13bfc6a2cb53f6e414165b6724 Mon Sep 17 00:00:00 2001 From: kt Date: Wed, 6 Mar 2024 17:00:26 -0500 Subject: [PATCH 48/59] minor update --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index d8b18a2..f23f56d 100644 --- a/README.md +++ b/README.md @@ -32,7 +32,7 @@ Infrastruture to train espaloma with experimental observables from espfit.utils.graphs import CustomGraphDataset path = 'espfit/data/qcdata/openff-toolkit-0.10.6/dgl2/protein-torsion-sm/' ds = CustomGraphDataset.load(path) -ds.reshape_conformation_size(n_confs=50) +ds.reshape_conformation_size(n_confs=50, include_min_energy_conf=True) ds.compute_relative_energy() # Create esplama model from espfit.app.train import EspalomaModel From 9b9b96305656b1a3b25fda0f37f96b2e0ce2b093 Mon Sep 17 00:00:00 2001 From: kt Date: Wed, 6 Mar 2024 17:07:22 -0500 Subject: [PATCH 49/59] remove mockcode.ipynb --- examples/mockcode/mockcode.ipynb | 911 ------------------------------- 1 file changed, 911 deletions(-) delete mode 100644 examples/mockcode/mockcode.ipynb diff --git a/examples/mockcode/mockcode.ipynb b/examples/mockcode/mockcode.ipynb deleted file mode 100644 index 490e58b..0000000 --- a/examples/mockcode/mockcode.ipynb +++ /dev/null @@ -1,911 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "8425a3a5", - "metadata": {}, - "source": [ - "# Mock code for preparing and loading data for training espaloma" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6db9e281", - "metadata": {}, - "outputs": [], - "source": [ - "import espaloma\n", - "import espfit" - ] - }, - { - "cell_type": "markdown", - "id": "fa0f1027", - "metadata": {}, - "source": [ - "## Download QC datasets from QCArchive as HDF5 (SKIP IMPLEMENTATION) \n", - "\n", - "This functionality will not be implemented at the moment and alternatively rely on external scripts (e.g. https://github.com/choderalab/download-qca-datasets)." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b69087c6", - "metadata": {}, - "outputs": [], - "source": [ - "# place holder\n", - "\n", - "outdir='/DATASET_HDF_PATH/MYDATA' \n", - "outfile='small_basic.hdf5'\n", - "\n", - "espfit.utils.data.download_qcarchive(workflow='Datataset', \n", - " qc_specification='default', \n", - " outdir=outdir,\n", - " outfile=outfile\n", - " )\n", - "#> raise NotImplemented Error" - ] - }, - { - "cell_type": "markdown", - "id": "aae11b77", - "metadata": {}, - "source": [ - "## Convert HDF5 to DGL graphs (SKIP IMPLEMENTATION)\n", - "\n", - "This function will not be implemented at the moment and alternatively rely on external scripts (e.g. https://github.com/choderalab/refit-espaloma/blob/main/openff-default/01-create-dataset/script/getgraph_hdf5.py)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "45c152ff", - "metadata": {}, - "outputs": [], - "source": [ - "# place holder\n", - "\n", - "indir = '/DATASET_HDF_PATH/MYDATA'\n", - "outdir = '/DATASET_DGL_PATH/MYDATA'\n", - "\n", - "_filenames = [ 'small_basic.hdf5', 'small_optimize.hdf5', 'small_torsiondrive.hdf5', 'peptide_basic.hdf5', 'peptide_optimize.hdf5', 'peptide_torsiondrive.hdf5' ]\n", - "filenames = [ os.path.join(indir, filename) for filename in _filenames ]\n", - "\n", - "for filename in filenames:\n", - " ds += espfit.utils.data.hdf5_to_dgl(infile=filename,outdir=outdir)\n", - " \n", - "#> raise NotImplemented Error" - ] - }, - { - "cell_type": "markdown", - "id": "013f75f8", - "metadata": {}, - "source": [ - "## Filter DGL graphs (SKIP IMPLEMENTATION)\n", - "\n", - "This function will not be implemented at the moment and rely on external scripts (e.g. https://github.com/choderalab/refit-espaloma/tree/main/openff-default/02-train/merge-data/script)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e6f81b71", - "metadata": {}, - "outputs": [], - "source": [ - "# place holder\n", - "\n", - "outdir = '/DATASET_DGL_PATH/MYDATA/FILTERED'\n", - "ds.filter(min_energy=0.1,\n", - " min_conformer=3,\n", - " compute_am1bcc='AM1BCC-ELF10', \n", - " compute_baseline_forcefields=forcefield_list, \n", - " compute_relative_energy=True,\n", - " subtract_nonbonded=True,\n", - " base_forcefiled='openff-2.0.0',\n", - " inplace=False,\n", - " outdir=outdir\n", - " )\n", - " \n", - "#> raise NotImplemented Error" - ] - }, - { - "cell_type": "markdown", - "id": "a5da73a3", - "metadata": {}, - "source": [ - "## Load preprocessed DGL graphs\n", - "\n", - "We are going to start from here." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f402f1a1", - "metadata": {}, - "outputs": [], - "source": [ - "indir = '/DATASET_DGL_PATH/MYDATA/FILTERED/*' # single path or list of paths\n", - "ds = espfit.utils.data.load(in_prefix)" - ] - }, - { - "cell_type": "markdown", - "id": "c06051b0", - "metadata": {}, - "source": [ - "#### Check properties" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "26b1b096", - "metadata": {}, - "outputs": [], - "source": [ - "ds.n_data # number of data (entries)\n", - "#> 100" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "97fca213", - "metadata": {}, - "outputs": [], - "source": [ - "ds.n_conf # number of conformations\n", - "#> 10000" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "edd853ac", - "metadata": {}, - "outputs": [], - "source": [ - "ds.elements # elements\n", - "#> H,B,Br,C,N,O,I" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0d118175", - "metadata": {}, - "outputs": [], - "source": [ - "ds.duplicate_isomeric_smiles # isomeric smiles\n", - "#> returns list of duplicate isomeric smiles" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "05bdc5b7", - "metadata": {}, - "outputs": [], - "source": [ - "ds.duplicate_nonisomeric_smiles # nonisomeric smiles\n", - "#> returns list of duplicate nonisomeric smiles" - ] - }, - { - "cell_type": "markdown", - "id": "e0af7103", - "metadata": {}, - "source": [ - "#### Drop/merge duplicate smiles and filter datasets\n", - "\n", - "Ensure the datasets loaded from different sources have no duplicated smiles. \n", - "Drop duplicate isomeric (nonisomeric) smiles across different sources of datasets. \n", - "Merge duplicate dgl graphs with same smiles into a single dgl graph and create a new dataset called 'misc'." 
- ] - }, - { - "cell_type": "markdown", - "id": "15340bae", - "metadata": {}, - "source": [ - "##### drop and merge smiles" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "47cb8e2b", - "metadata": {}, - "outputs": [], - "source": [ - "outdir = '/DATASET_DGL_PATH/MYDATA'\n", - "ds.drop_merge_nonisomeric_smiles(outdir=outdir, outname='misc') # miscellaneous\n", - "\n", - "# Alteratively,\n", - "ds.drop_merge_isomeric_smiles(outdir=outdir, outname='misc')" - ] - }, - { - "cell_type": "markdown", - "id": "1a84aa3d", - "metadata": {}, - "source": [ - "##### filter dataset" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "958112bf", - "metadata": {}, - "outputs": [], - "source": [ - "# Add misc dataset that was just created\n", - "ds += espfit.utils.data.load('/DATASET_DGL_PATH/MYDATA/misc')\n", - "\n", - "# Filter all dataset\n", - "outdir = '/DATASET_DGL_PATH/MYDATA/FILTERED'\n", - "ds.filter(min_energy=0.1,\n", - " min_conformer=3,\n", - " inplace=False,\n", - " outdir=outdir\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "457c7c7c-bd74-4009-a748-b014a9e21e6a", - "metadata": {}, - "outputs": [], - "source": [ - "# Compute all dataset\n", - "ds.compute(compute_am1bcc=None, \n", - " compute_baseline_forcefields=None, \n", - " compute_relative_energy=True,\n", - " subtract_nonbonded=True,\n", - " base_forcefiled='openff-2.0.0',\n", - " inplace=False,\n", - " outdir=outdir\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f96faf73", - "metadata": {}, - "outputs": [], - "source": [ - "# Alternatively, we could just filter the misc data and reload all filtered dataset later\n", - "\n", - "outdir = '/DATASET_DGL_PATH/MYDATA/FILTERED'\n", - "misc_data = espfit.utils.data.load('/DATASET_DGL_PATH/MYDATA/misc')\n", - "misc_data.filter(min_energy=0.1,\n", - " min_conformer=3,\n", - " inplace=False,\n", - " outdir=outdir\n", - " )\n", - "misc_data.compute(compute_am1bcc=None, \n", - " compute_baseline_forcefields=None, \n", - " compute_relative_energy=True,\n", - " subtract_nonbonded=True,\n", - " base_forcefiled='openff-2.0.0',\n", - " inplace=False,\n", - " outdir=outdir\n", - " )\n", - "\n", - "# load filtered\n", - "input_dirs = glob.glob('/DATASET_DGL_PATH/MYDATA/FILTERED/*') # list of paths\n", - "ds = espfit.utils.data.load(input_dirs)" - ] - }, - { - "cell_type": "markdown", - "id": "a132c68d", - "metadata": {}, - "source": [ - "## Prepare for training" - ] - }, - { - "cell_type": "markdown", - "id": "8174a95f", - "metadata": {}, - "source": [ - "#### Split datasets" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f8b64d22", - "metadata": {}, - "outputs": [], - "source": [ - "RANDOM_SEED = 2666\n", - "ds.shuffle(RANDOM_SEED)\n", - "\n", - "ds_tr, ds_vl_te = ds.split(0.8, 0.2)\n", - "ds_vl, ds_te = ds_vl_te.split(0.5, 0.5)" - ] - }, - { - "cell_type": "markdown", - "id": "06de6513", - "metadata": {}, - "source": [ - "#### Augment conformations to handle heterographs\n", - "\n", - "This is a work around to handle different graph size (shape). DGL requires at least one dimension with same size. \n", - "Here, we will modify the graphs so that each graph has the same number of conformations instead fo concatenating \n", - "graphs into heterogenous graphs with the same number of conformations. This will allow batching and shuffling \n", - "during the training. 
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "28ce7bd3", - "metadata": {}, - "outputs": [], - "source": [ - "# Remove unnecessary data from graph in backend? (will this speed up training?)\n", - "# e.g. g.nodes['g'].data.pop('u_qm')\n", - "\n", - "outdir = '/DATASET_DGL_PATH/MYDATA/FILTERED/RESHAPE'\n", - "ds_tr.reshape(n_conf=50,\n", - " preserve_min=True,\n", - " inplace=True,\n", - " outdir=outdir,\n", - " verbose=1,\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a9deeabf", - "metadata": {}, - "outputs": [], - "source": [ - "# regenerate impropers (forgot why we need to do this)\n", - "ds_tr.apply(regenerate_impropers, in_place=True)" - ] - }, - { - "cell_type": "markdown", - "id": "d45feb3f", - "metadata": {}, - "source": [ - "## Train espaloma" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "eeb8b4f9", - "metadata": {}, - "outputs": [], - "source": [ - "# initialize\n", - "model = espfit.app.experiment()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c71d7e80", - "metadata": {}, - "outputs": [], - "source": [ - "# define espaloma architecture\n", - "\n", - "# use toml\n", - "import yaml\n", - "with open('config.yml', 'r') as file:\n", - " config = yaml.safe_load(file) \n", - " \n", - "# Possible methods\n", - "# 1. call predefined model?\n", - "model.call(model_name='model1')\n", - "# 2. create model using yaml config\n", - "model.create(config=config)\n", - "# 3. from file\n", - "model.from_file('config.toml')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "32760aff", - "metadata": {}, - "outputs": [], - "source": [ - "# check neural network model\n", - "\n", - "model.net\n", - "#> returns neural network architecture" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "69a9e8d2", - "metadata": {}, - "outputs": [], - "source": [ - "# load dataset\n", - "\n", - "model.train_data = ds_tr\n", - "model.validation_data = ds_vl\n", - "model.test_data = ds_te" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "adeeb626", - "metadata": {}, - "outputs": [], - "source": [ - "# check data property\n", - "\n", - "model.train_data.n_data\n", - "model.train_data.n_conf\n", - "model.train_data.elements" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "20caa2bf", - "metadata": {}, - "outputs": [], - "source": [ - "# save checkpoint file to `checkpoints` every 10 epochs\n", - "# restart training from checkpoint file\n", - "# validation is excluded from the training to decrease inference time\n", - "\n", - "model.train(steps, lr, batch_size, restart=checkpoint, checkpoint_frequency=10, log_file=logfile, log_level='debug')" - ] - }, - { - "cell_type": "markdown", - "id": "fd25f33e", - "metadata": {}, - "source": [ - "#### Validate and find best model\n", - "\n", - "Use job array to speed up this process using external scripts (e.g. 
https://github.com/choderalab/refit-espaloma/tree/main/openff-default/02-train/joint-improper-charge/charge-weight-1.0/eval)" - ] - }, - { - "cell_type": "markdown", - "id": "b94b4be6", - "metadata": {}, - "source": [ - "## Alternatively, train and validate simultaneously\n", - "\n", - "Not sure how much slower this will be compared to just doing training" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "dc585ee8", - "metadata": {}, - "outputs": [], - "source": [ - "model.train_val(steps, lr, batch_size, restart=checkpoint, checkpoint_frequency=10, logfile=logfile, verbose=1, early_stopping=800, patience=5)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "80498680", - "metadata": {}, - "outputs": [], - "source": [ - "# save model\n", - "model.save() # saves best model as 'model.pt'\n", - "\n", - "# plot validation loss\n", - "model.plot_loss()" - ] - }, - { - "cell_type": "markdown", - "id": "245006e3", - "metadata": {}, - "source": [ - "## Benchmark" - ] - }, - { - "cell_type": "markdown", - "id": "9861df48", - "metadata": {}, - "source": [ - "#### RMSE metric" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2ff42837", - "metadata": {}, - "outputs": [], - "source": [ - "RANDOM_SEED = 2666\n", - "indir='/DATASET_DGL_PATH/MYDATA/FILTERED/RESHAPE'\n", - "data_split_size = [0.8, 0.1, 0.1]\n", - "best_model = 'model.pt'\n", - "\n", - "df = espfit.utils.rmse_metric(best_model, indir, data_split_size, RANDOM_SEED) # pandas dataframe\n", - "df.to_csv('rmse_metric.csv', index=False, sep='\\t', float_format='%.3f')" - ] - }, - { - "cell_type": "markdown", - "id": "dc75ac51", - "metadata": {}, - "source": [ - "#### Run other benchmarks independently.\n", - "\n", - "- Small molecule geometry optimization (https://github.com/choderalab/geometry-benchmark-espaloma/tree/main/qc-opt-geo)\n", - "- ESP benchmark" - ] - }, - { - "cell_type": "markdown", - "id": "379ad7fb", - "metadata": {}, - "source": [ - "## Train espaloma with experimental observable refitting\n", - "\n", - "- `espfit_experiment/`\n", - " - `data/`: Cached dataset ready for training\n", - " - `utils/`: Stores scripts to run external benchmarks\n", - " - `small_molecule_geometry`\n", - " - geo.py\n", - " - `partial_charge_esp`\n", - " - ele.py\n", - " - `rna_nucleoside`\n", - " - rna_nucleoside.py\n", - " - `rna_tetramer`:\n", - " - rna_tetramer.py\n", - " - `experiment/`\n", - " - `001/`: Create new directory for each refitting experiment\n", - " - `xml/`: Stores openmm xml\n", - " - `refit/`: Espaloma training\n", - " - `checkpoints/`: Stores checkpoint files\n", - " - `sampling/`: MD simulation\n", - " - `iter_0`: Initial MD sampling\n", - " - `iter_n`: MD sampling at epoch-n when necessary\n", - " - `train.log`: Log file during espaloma training\n", - " - `benchmark/`\n", - " - `rmse_metric`\n", - " - `small_molecule_geometry`\n", - " - `partial_charge_esp`\n", - " - `rna_nucleoside`\n", - " - `rna_tetramer`" - ] - }, - { - "cell_type": "markdown", - "id": "67893219", - "metadata": {}, - "source": [ - "#### Basic usage to run simulations for registered systems" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "343b1a35", - "metadata": {}, - "outputs": [], - "source": [ - "# check registered systems\n", - "registered_systems = espfit.system.available()\n", - "\n", - "registered_systems.get_names\n", - "#> ['A', 'G', 'C', 'U', 'ApA']\n", - "\n", - "registered_systems.get('name').observables\n", - "#> returns pandas dataframe with all experimental 
observables and corresponding literature" - ] - }, - { - "cell_type": "markdown", - "id": "c5daf12a", - "metadata": {}, - "source": [ - "##### Prepare system" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c74f2962", - "metadata": {}, - "outputs": [], - "source": [ - "system = registered_systems.get('name')\n", - "simulation = system.setup(system_name=name, espaloma_model='model.pt', config=config, outdir=outdir) # save xml\n", - "\n", - "# minimize\n", - "simulation.min()" - ] - }, - { - "cell_type": "markdown", - "id": "66d56fba", - "metadata": {}, - "source": [ - "##### Load a system already prepared" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e673e6f4", - "metadata": {}, - "outputs": [], - "source": [ - "system = espfit.system.load()" - ] - }, - { - "cell_type": "markdown", - "id": "5fc790e6", - "metadata": {}, - "source": [ - "##### Run simulation" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0a1528dc", - "metadata": {}, - "outputs": [], - "source": [ - "simulation.run(steps=100) # standard MD?" - ] - }, - { - "cell_type": "markdown", - "id": "bc77fe98", - "metadata": {}, - "source": [ - "##### Compute loss" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e0657b0a", - "metadata": {}, - "outputs": [], - "source": [ - "obs_exp = system.get_experimental_value()\n", - "obs_calc = simulation.compute_observable()\n", - "loss = simulation.compute_loss(obs_exp, obs_calc)" - ] - }, - { - "cell_type": "markdown", - "id": "a358db78", - "metadata": {}, - "source": [ - "##### Reweight observable using updated espaloma model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "18099bb4", - "metadata": {}, - "outputs": [], - "source": [ - "result = simulation.compute_reweighted_observable(update_espaloma_model='new.pt')\n", - "\n", - "# reweighted observable\n", - "obs_calc = result.observable\n", - "\n", - "# effective sample size\n", - "n_eff = result.effective_sample_size\n", - "\n", - "# loss with reweighted observable\n", - "loss = simulation.compute_loss(obs_exp, obs_calc)" - ] - }, - { - "cell_type": "markdown", - "id": "98996eed", - "metadata": {}, - "source": [ - "## Pseudo code for training espaloma with reweighting on the fly" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f73acfab", - "metadata": {}, - "outputs": [], - "source": [ - "RANDOM_SEED = 2666\n", - "\n", - "input_dirs = glob.glob('/DATASET_DGL_PATH/MYDATA/FILTERED/RESHAPE/*') # list of paths\n", - "ds = espfit.utils.data.load(input_dirs)\n", - "ds.shuffle(RANDOM_SEED)\n", - "\n", - "ds_tr, ds_vl_te = ds.split(0.8, 0.2)\n", - "ds_vl, ds_te = ds_vl_te.split(0.5, 0.5)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "35e48171", - "metadata": {}, - "outputs": [], - "source": [ - "model = espfit.app.experiment()\n", - "\n", - "with open('config.yml', 'r') as file:\n", - " config = yaml.safe_load(file) \n", - "model.create(config=config)" - ] - }, - { - "cell_type": "markdown", - "id": "6a835423", - "metadata": {}, - "source": [ - "##### Run simulation" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4c7e206e", - "metadata": {}, - "outputs": [], - "source": [ - "system = registered_systems.get('A')\n", - "simulation = system.setup(system_name=name, espaloma_model='model.pt', config=config, outdir=outdir) # save xml\n", - "simulation.min()\n", - "simulation.run(1000)" - ] - }, - { - "cell_type": "markdown", - "id": "fe5fcd47", - 
"metadata": {}, - "source": [ - "##### Get experimental observables" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1ff3ca80", - "metadata": {}, - "outputs": [], - "source": [ - "obs_exp = system.get_experimental_value()" - ] - }, - { - "cell_type": "markdown", - "id": "2915bb00", - "metadata": {}, - "source": [ - "##### Train with MD reweighting\n", - "\n", - "[Iterative Optimization of Molecular Mechanics Force Fields from NMR Data of Full-Length Proteins, JCTC, 2011](https://pubs.acs.org/doi/full/10.1021/ct200094b) \n", - "[Automatic Learning of Hydrogen-Bond Fixes in the AMBER RNA Force Field, JCTC, 2022](https://pubs.acs.org/doi/10.1021/acs.jctc.2c00200) \n", - "[Enhanced sampling methods for molecular dynamics simulations, arXiv, 2022](https://arxiv.org/abs/2202.04164) " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5c4b766c", - "metadata": {}, - "outputs": [], - "source": [ - "ds_tr_loader = dgl.dataloading.GraphDataLoader(ds_tr, batch_size=batch_size, shuffle=True)\n", - "optimizer = torch.optim.Adam(model.net().parameters(), lr=learning_rate)\n", - "\n", - "with torch.autograd.set_detect_anomaly(True):\n", - " for idx in range(steps):\n", - " n_eff = [] # store effective sample size\n", - " for g in ds_tr_loader:\n", - " optimizer.zero_grad()\n", - " g = g.to(\"cuda:0\")\n", - " g.nodes[\"n1\"].data[\"xyz\"].requires_grad = True \n", - " \n", - " # Original espaloma loss\n", - " loss = net(g)\n", - "\n", - " # Reweighting \n", - " result = simulation.compute_reweighted_observable(net) # return: (reweighted observable, effective sample size)\n", - " obs_calc = result.observable\n", - " loss_md = simulation.compute_loss(obs_exp, obs_calc) \n", - " \n", - " n_eff += result.n_eff\n", - " \n", - " # Joint loss\n", - " loss += weight * loss_md\n", - " \n", - " loss.backward()\n", - " optimizer.step()\n", - " \n", - " # save checkpoint file \n", - " if idx % 10 == 0:\n", - " if not os.path.exists(output_prefix):\n", - " os.mkdir(output_prefix)\n", - " torch.save(net.state_dict(), output_prefix + \"/net%s.pth\" % idx)\n", - " \n", - " # Averaged effective samples\n", - " if n_eff.mean() < effective_sample_size_tolerance:\n", - " # rebuild system with current net model\n", - " # rerun simulation\n", - " # cache new trajectory\n", - " simulation.rebuild()\n", - " simulation.run()" - ] - }, - { - "cell_type": "raw", - "id": "3900fc9f", - "metadata": {}, - "source": [] - }, - { - "cell_type": "raw", - "id": "5735fb72", - "metadata": {}, - "source": [] - }, - { - "cell_type": "raw", - "id": "ec7b85df", - "metadata": {}, - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.13" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} From 58036dd7592ba6ba381e37707612e8b05523c228 Mon Sep 17 00:00:00 2001 From: kt Date: Thu, 7 Mar 2024 12:58:51 -0500 Subject: [PATCH 50/59] average q_ref if different isomeric smiles are found in merging nonisomeric molecules --- espfit/utils/graphs.py | 53 +++++++++++++++++++++++++++++++++--------- 1 file changed, 42 insertions(+), 11 deletions(-) diff --git a/espfit/utils/graphs.py b/espfit/utils/graphs.py index 359dbff..32776c3 100644 --- a/espfit/utils/graphs.py +++ 
b/espfit/utils/graphs.py @@ -96,8 +96,8 @@ def __init__(self, graphs=[], reference_forcefield='openff-2.0.0', random_seed=2 self.random_seed = random_seed - def drop_and_merge_duplicates(self, save_merged_dataset=True, dataset_name='misc', output_directory_path=None): - """Drop and merge duplicate nonisomeric smiles across different data sources. + def drop_and_merge_duplicates(self, save_merged_dataset=True, dataset_name='misc', output_directory_path=None, isomeric=True): + """Drop and merge duplicate (non)isomeric smiles within the dataset. Modifies list of esp.Graph's in place. @@ -112,6 +112,14 @@ def drop_and_merge_duplicates(self, save_merged_dataset=True, dataset_name='misc output_directory_path : str, default=None Output directory path to save the merged dataset. If None, then the current working directory is used. + + isomeric : boolean, default=True + If True, then isomeric smiles will be used to identify unique molecules. + + If False, then nonisomeric smiles will be used to identify unique molecules. + Note that partial charges will be averaged for the same nonisomeric smiles. + This is because different 3D structures can have different partial charges + due to different conformations. Returns ------- @@ -123,8 +131,11 @@ def drop_and_merge_duplicates(self, save_merged_dataset=True, dataset_name='misc if output_directory_path == None: output_directory_path = os.getcwd() - _logger.info(f'Drop and merge duplicate smiles') - smiles = [ g.mol.to_smiles(isomeric=False, explicit_hydrogens=True, mapped=False) for g in self.graphs ] + if isomeric == True: + _logger.info(f'Drop and merge duplicate isomeric smiles') + else: + _logger.info(f'Drop and merge duplicate nonisomeric smiles') + smiles = [ g.mol.to_smiles(isomeric=isomeric, explicit_hydrogens=True, mapped=False) for g in self.graphs ] _logger.info(f'Found {len(smiles)} molecules') # Unique entries @@ -160,8 +171,8 @@ def drop_and_merge_duplicates(self, save_merged_dataset=True, dataset_name='misc # Temporary directory needs to be created beforehand for `test_drop_and_merge_duplicates`. _output_directory_path = os.path.join(output_directory_path, dataset_name) os.makedirs(_output_directory_path, exist_ok=True) - output_directory_path = os.path.join(_output_directory_path, molname) - g.save(output_directory_path) + new_output_directory_path = os.path.join(_output_directory_path, molname) + g.save(new_output_directory_path) # Update in place new_graphs = unique_graphs + duplicated_graphs @@ -344,7 +355,7 @@ def compute_baseline_energy_force(self, forcefield_list=['openff-2.0.0']): COLLISION_RATE = 1.0 / unit.picosecond if not all(_ in self.available_forcefields for _ in forcefield_list): - raise Exception(f'{forcefield} force field not supported. Supported force fields are {SUPPORTED_FORCEFIELD_LIST}.') + raise Exception(f'Unsupported force field in {forcefield_list}. Supported force fields are {self.available_forcefields}.') new_graphs = [] for i, g in enumerate(self.graphs): @@ -573,7 +584,7 @@ def _merge_graphs(ds): Parameters ---------- - ds : list of espaloma.graphs.graph.Graph + ds : list of espaloma.graphs.graph.Graph, default=None The list of Graph instances to be merged. All Graphs in the list must be equivalent. 
Returns @@ -586,12 +597,32 @@ def _merge_graphs(ds): import copy import torch + # Check if all inputs are equivalent (isomeric smiles) + # If not, get average partial charges across different isomeric smiles (molecules) + isomeric_smiles = [g.mol.to_smiles(isomeric=True, explicit_hydrogens=True, mapped=False) for g in ds] + unique_isomeric_smiles = set(isomeric_smiles) + if len(unique_isomeric_smiles) != 1: + n_atoms = ds[0].nodes['n1'].data['q_ref'].shape[0] + q_ref = torch.zeros(n_atoms, 1) + for unique_isomeric_smile in unique_isomeric_smiles: + index = [i for i, isomeric_smile in enumerate(isomeric_smiles) if isomeric_smile in unique_isomeric_smile][0] + q_ref += ds[index].nodes['n1'].data['q_ref'] + q_ref = q_ref / len(set(isomeric_smiles)) + # Update partial charges in-place + for i in range(len(ds)): + ds[i].nodes['n1'].data['q_ref'] = q_ref + # Check if graphs are equivalent for i in range(1, len(ds)): - # Openff molecule - assert ds[0].mol == ds[i].mol # Mapped isomeric smiles - assert ds[0].mol.to_smiles(isomeric=True, explicit_hydrogens=True, mapped=True) == ds[i].mol.to_smiles(isomeric=True, explicit_hydrogens=True, mapped=True) + if len(unique_isomeric_smiles) != 1: + mapped_smiles = ds[0].mol.to_smiles(isomeric=False, explicit_hydrogens=True, mapped=True) + mapped_smiles_i = ds[i].mol.to_smiles(isomeric=False, explicit_hydrogens=True, mapped=True) + assert mapped_smiles == mapped_smiles_i, f"Mapped nonisomeric smiles are not equivalent: {mapped_smiles} != {mapped_smiles_i}" + else: + mapped_smiles = ds[0].mol.to_smiles(isomeric=True, explicit_hydrogens=True, mapped=True) + mapped_smiles_i = ds[i].mol.to_smiles(isomeric=True, explicit_hydrogens=True, mapped=True) + assert mapped_smiles == mapped_smiles_i, f"Mapped isomeric smiles are not equivalent: {mapped_smiles} != {mapped_smiles_i}" # Other node features for key in ["sum_q"]: np.testing.assert_array_equal(ds[0].nodes['g'].data[key].flatten().numpy(), ds[i].nodes['g'].data[key].flatten().numpy()) From 5be247c6d754b16dce81df6872f679f9b0db94fe Mon Sep 17 00:00:00 2001 From: kt Date: Thu, 7 Mar 2024 13:45:28 -0500 Subject: [PATCH 51/59] add testsystems/nucleoside/target.pdb --- .../target/testsystems/nucleoside/target.pdb | 35 +++++++++++++++++++ 1 file changed, 35 insertions(+) create mode 100644 espfit/data/target/testsystems/nucleoside/target.pdb diff --git a/espfit/data/target/testsystems/nucleoside/target.pdb b/espfit/data/target/testsystems/nucleoside/target.pdb new file mode 100644 index 0000000..c80385e --- /dev/null +++ b/espfit/data/target/testsystems/nucleoside/target.pdb @@ -0,0 +1,35 @@ +REMARK 1 CREATED WITH OPENMM 8.1, 2024-01-26 +ATOM 1 C5' A A 1 5.609 7.878 -3.616 1.00 0.00 C +ATOM 2 H5' A A 1 5.926 8.406 -4.515 1.00 0.00 H +ATOM 3 H5'' A A 1 5.488 8.603 -2.812 1.00 0.00 H +ATOM 4 O5' A A 1 4.377 7.214 -3.875 1.00 0.00 O +ATOM 5 HO5' A A 1 3.732 7.873 -4.143 1.00 0.00 H +ATOM 6 C4' A A 1 6.701 6.879 -3.233 1.00 0.00 C +ATOM 7 H4' A A 1 7.663 7.391 -3.262 1.00 0.00 H +ATOM 8 O4' A A 1 6.494 6.375 -1.913 1.00 0.00 O +ATOM 9 C3' A A 1 6.778 5.649 -4.144 1.00 0.00 C +ATOM 10 H3' A A 1 5.784 5.235 -4.319 1.00 0.00 H +ATOM 11 O3' A A 1 7.428 5.896 -5.380 1.00 0.00 O +ATOM 12 HO3' A A 1 8.365 5.865 -5.178 1.00 0.00 H +ATOM 13 C2' A A 1 7.561 4.713 -3.219 1.00 0.00 C +ATOM 14 H2' A A 1 7.428 3.673 -3.519 1.00 0.00 H +ATOM 15 C1' A A 1 6.878 5.002 -1.872 1.00 0.00 C +ATOM 16 H1' A A 1 7.580 4.816 -1.059 1.00 0.00 H +ATOM 17 N1 A A 1 5.124 0.467 -0.163 1.00 0.00 N +ATOM 18 C2 A A 1 6.351 0.978 -0.252 1.00 0.00 C +ATOM 
19 H2 A A 1 7.149 0.341 0.097 1.00 0.00 H +ATOM 20 N3 A A 1 6.723 2.170 -0.712 1.00 0.00 N +ATOM 21 C4 A A 1 5.650 2.896 -1.124 1.00 0.00 C +ATOM 22 C5 A A 1 4.335 2.511 -1.094 1.00 0.00 C +ATOM 23 C6 A A 1 4.098 1.219 -0.579 1.00 0.00 C +ATOM 24 N6 A A 1 2.894 0.673 -0.481 1.00 0.00 N +ATOM 25 H61 A A 1 2.824 -0.276 -0.142 1.00 0.00 H +ATOM 26 H62 A A 1 2.092 1.188 -0.815 1.00 0.00 H +ATOM 27 N7 A A 1 3.506 3.517 -1.595 1.00 0.00 N +ATOM 28 C8 A A 1 4.352 4.463 -1.905 1.00 0.00 C +ATOM 29 H8 A A 1 4.040 5.410 -2.321 1.00 0.00 H +ATOM 30 N9 A A 1 5.670 4.162 -1.657 1.00 0.00 N +ATOM 31 O2' A A 1 8.945 5.071 -3.169 1.00 0.00 O +ATOM 32 HO2' A A 1 9.415 4.404 -2.663 1.00 0.00 H +TER 33 A A 1 +END From d53595a4cf7f84f4b89ab5433f5c57fcce45f018 Mon Sep 17 00:00:00 2001 From: kt Date: Thu, 7 Mar 2024 19:23:14 -0500 Subject: [PATCH 52/59] compare sorted number of conformers to prevent github CI failure --- espfit/tests/test_utils_graphs.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/espfit/tests/test_utils_graphs.py b/espfit/tests/test_utils_graphs.py index 17f097b..ac94543 100644 --- a/espfit/tests/test_utils_graphs.py +++ b/espfit/tests/test_utils_graphs.py @@ -68,7 +68,9 @@ def test_load_dataset(mydata_gen2_torsion_sm): """ ds = mydata_gen2_torsion_sm nconfs = [g.nodes['g'].data['u_ref'].shape[1] for g in ds] - assert nconfs == [24, 24, 24, 13, 24, 24, 24, 24], 'Number of molecular conformers does not match' + # Sort the list of nconfs. For some reason, the order of the list is not consistent when running the test locally and on GitHub CI. + #assert nconfs == [24, 24, 24, 13, 24, 24, 24, 24], 'Number of molecular conformers does not match' + assert sorted(nconfs) == sorted([24, 24, 24, 13, 24, 24, 24, 24]), 'Number of molecular conformers does not match' def test_load_dataset_multiple(mydata_gen2_torsion_sm, mydata_protein_torsion_sm, mydata_rna_diverse_sm): @@ -119,7 +121,9 @@ def test_drop_and_merge_duplicates(mydata_gen2_torsion_sm, tmpdir): temporary_directory = tmpdir.mkdir('misc') ds.drop_and_merge_duplicates(save_merged_dataset=True, dataset_name='misc', output_directory_path=str(temporary_directory)) nconfs = [ g.nodes['g'].data['u_ref'].shape[1] for g in ds ] - assert nconfs == [24, 13, 24, 24, 24, 72], 'Number of molecular conformers does not match' + # Sort the list of nconfs. For some reason, the order of the list is not consistent when running the test locally and on GitHub CI. + #assert nconfs == [24, 13, 24, 24, 24, 72], 'Number of molecular conformers does not match' + assert sorted(nconfs) == sorted([24, 13, 24, 24, 24, 72]), 'Number of molecular conformers does not match' def test_subtract_nonbonded_interactions(mydata_gen2_torsion_sm): @@ -165,7 +169,9 @@ def test_filter_high_energy_conformers(mydata_gen2_torsion_sm): # set relative_energy_threshold very small to ensure some conformers will be filtered ds.filter_high_energy_conformers(relative_energy_threshold=0.01, node_feature='u_ref') nconfs = [ g.nodes['g'].data['u_ref'].shape[1] for g in ds ] - assert nconfs == [14, 19, 19, 5, 14, 19, 24, 24], 'Number of molecular conformers does not match' + # Sort the list of nconfs. For some reason, the order of the list is not consistent when running the test locally and on GitHub CI. 
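A note on the comparison pattern in these tests: list.sort() sorts in place and returns None, so an expression like nconfs.sort() == expected.sort() compares None with None and holds for any two lists, silently weakening the assertion, whereas sorted() returns a new list whose contents are actually compared. A minimal standalone illustration (editorial sketch in plain Python, not part of the patch):

a, b = [3, 1], [2, 7]
assert (a.sort() == b.sort()) is True      # both calls return None, so this holds for any two lists
assert sorted([3, 1]) == sorted([1, 3])    # sorted() returns new lists; the contents are compared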
+ #assert nconfs == [14, 19, 19, 5, 14, 19, 24, 24], 'Number of molecular conformers does not match' + assert sorted(nconfs) == sorted([14, 19, 19, 5, 14, 19, 24, 24]), 'Number of molecular conformers does not match' def test_filter_minimum_conformers(mydata_gen2_torsion_sm): From 41555f2242e628edc2b817663ae763166440efc1 Mon Sep 17 00:00:00 2001 From: kt Date: Thu, 7 Mar 2024 19:25:09 -0500 Subject: [PATCH 53/59] add try-except to prevent UnassignedProperTorsionParameterException --- espfit/utils/graphs.py | 165 ++++++++++++++++++++++----------- 1 file changed, 88 insertions(+), 77 deletions(-) diff --git a/espfit/utils/graphs.py b/espfit/utils/graphs.py index 32776c3..2ac2798 100644 --- a/espfit/utils/graphs.py +++ b/espfit/utils/graphs.py @@ -27,10 +27,10 @@ class CustomGraphDataset(GraphDataset): filter_high_energy_conformers(relative_energy_threshold=0.1, node_feature='u_ref'): Filter high energy conformers and ensure minimum number of conformers. - filter_minimum_conformers(n_conformer_threshold=3): + filter_minimum_conformers(n_conformer_threshold=5): Filter molecules with conformers below given threshold. - compute_baseline_energy_force(forcefield_list=['openff-2.0.0']): + compute_baseline_energy_force(forcefield_list=['openff-2.1.0']): Compute energies and forces using other force fields. reshape_conformation_size(n_confs=50, include_min_energy_conf=False): @@ -62,9 +62,9 @@ class CustomGraphDataset(GraphDataset): >>> # Filter high energy conformers (u_ref: QM reference after nonbonded interactions are subtracted) >>> ds.filter_high_energy_conformers(relative_energy_threshold=0.1, node_feature='u_ref') >>> # Filter conformers below certain number - >>> ds.filter_minimum_conformers(n_conformer_threshold=3) + >>> ds.filter_minimum_conformers(n_conformer_threshold=5) >>> # Compute energies and forces using other force fields - >>> ds.compute_baseline_energy_force(forcefield_list=['openff-2.0.0']) + >>> ds.compute_baseline_energy_force(forcefield_list=['openff-2.1.0']) >>> # Regenerate improper torsions in-place >>> from espaloma.graphs.utils.regenerate_impropers import regenerate_impropers >>> ds.apply(regenerate_impropers, in_place=True) @@ -75,7 +75,7 @@ class CustomGraphDataset(GraphDataset): """ - def __init__(self, graphs=[], reference_forcefield='openff-2.0.0', random_seed=2666): + def __init__(self, graphs=[], reference_forcefield='openff-2.1.0', random_seed=2666): """Construct custom GraphDataset instance to prepare QC dataset for espaloma training. Parameters @@ -83,7 +83,7 @@ def __init__(self, graphs=[], reference_forcefield='openff-2.0.0', random_seed=2 graphs : list of espaloma.graphs.graph.Graph, default=[] DGL graphs loaded from `espaloma.data.dataset.GraphDataset.load`. - reference_forcefield : str, default=openff-2.0.0 + reference_forcefield : str, default=openff-2.1.0 Reference force field used to compute force field parameters if not present in espaloma. The default behavior is to compute the LJ parameters with `reference_forcefield`. @@ -215,6 +215,7 @@ def subtract_nonbonded_interactions(self, subtract_vdw=False, subtract_ele=True) ------- None """ + _logger.info(f'Subtract nonbonded interactions from QC reference') new_graphs = [] from espaloma.data.md import subtract_nonbonded_force @@ -234,8 +235,8 @@ def subtract_nonbonded_interactions(self, subtract_vdw=False, subtract_ele=True) # subtract_nonbonded_force() will return the coulomb interactions using the predefined partial charges. 
# # Reference: - # [1] https://github.com/choderalab/espaloma/blob/main/espaloma/data/md.py#L503C19-L503C19 - # [2] https://github.com/openmm/openmmforcefields/blob/637d551a4408cc6145529cd9dc30e267f4178367/openmmforcefields/generators/template_generators.py#L1432 + # [1] https://github.com/choderalab/espaloma/blob/main/espaloma/data/md.py#L503 + # [2] https://github.com/openmm/openmmforcefields/blob/637d551a4408cc6145529cd9dc30e267f4178367/openmmforcefields/generators/template_generators.py#L607 g = subtract_nonbonded_force(g, forcefield=self.reference_forcefield, subtract_charges=True) elif subtract_vdw == False and subtract_ele == False: g = subtract_nonbonded_force(g, forcefield=self.reference_forcefield, subtract_charges=False) @@ -266,6 +267,7 @@ def filter_high_energy_conformers(self, relative_energy_threshold=0.1, node_feat ------- None """ + _logger.info(f'Filter high energy conformers with relative energy threshold {relative_energy_threshold}') if node_feature == None: raise Exception(f'Please specify the node feature name under node type `g`') @@ -289,20 +291,21 @@ def filter_high_energy_conformers(self, relative_energy_threshold=0.1, node_feat del new_graphs - def filter_minimum_conformers(self, n_conformer_threshold=3): + def filter_minimum_conformers(self, n_conformer_threshold=5): """Filter molecules with conformers below given threshold. Modifies list of esp.Graph's in place. Parameters ---------- - n_conformer_threshold : int, default=3 + n_conformer_threshold : int, default=5 The minimum number of conformers per entry. Returns ------- None """ + _logger.info(f'Filter molecules with conformers below {n_conformer_threshold} conformers') new_graphs = [] for i, g in enumerate(self.graphs): n_confs = g.nodes['n1'].data['xyz'].shape[1] @@ -314,15 +317,15 @@ def filter_minimum_conformers(self, n_conformer_threshold=3): - def compute_baseline_energy_force(self, forcefield_list=['openff-2.0.0']): + def compute_baseline_energy_force(self, forcefield_list=['openff-2.1.0']): """Compute energies and forces using other force fields. - New node features are added to g.nodes['g']. For example, g.nodes['g'].data['u_openff-2.0.0'] and - g.nodes['n1'].data['u_openff-2.0.0_prime'] will be created for energies and forces, respectively. + New node features are added to g.nodes['g']. For example, g.nodes['g'].data['u_openff-2.1.0'] and + g.nodes['n1'].data['u_openff-2.1.0_prime'] will be created for energies and forces, respectively. Parameters ---------- - forcefield_list : list, default=['openff-2.0.0'] + forcefield_list : list, default=['openff-2.1.0'] Currently supports the following force fields: 'gaff-1.81', 'gaff-2.11', 'openff-1.2.0', 'openff-2.0.0', 'openff-2.1.0', 'amber14-all.xml', 'amber/protein.ff14SBonlysc.xml' @@ -349,6 +352,8 @@ def compute_baseline_energy_force(self, forcefield_list=['openff-2.0.0']): from openmm.unit import Quantity from openmmforcefields.generators import SystemGenerator + _logger.info(f'Compute energies and forces using other force fields') + # Simulation Specs (not important, just place holders) TEMPERATURE = 350 * unit.kelvin STEP_SIZE = 1.0 * unit.femtosecond @@ -358,73 +363,78 @@ def compute_baseline_energy_force(self, forcefield_list=['openff-2.0.0']): raise Exception(f'Unsupported force field in {forcefield_list}. 
Supported force fields are {self.available_forcefields}.') new_graphs = [] - for i, g in enumerate(self.graphs): - for forcefield in forcefield_list: - if forcefield.startswith('gaff') or forcefield.startswith('openff'): - generator = SystemGenerator( - small_molecule_forcefield=forcefield, - molecules=[g.mol], - forcefield_kwargs={"constraints": None, "removeCMMotion": False}, - ) - name = forcefield - elif forcefield.startswith('amber') or forcefield.startswith('protein'): - generator = SystemGenerator( - forcefields=[forcefield], - molecules=[g.mol], - forcefield_kwargs={"constraints": None, "removeCMMotion": False}, - ) - if forcefield == 'amber14-all.xml': - name = 'amber14sb' - elif forcefield == 'amber/protein.ff14SBonlysc.xml': - name = 'amber14sb_onlysc' - else: - import warnings - warnings.warn(f'{forcefield} not supported for molecule {g.mol.to_smiles()}') - - suffix = name - - # Parameterize topology - topology = g.mol.to_topology().to_openmm() - # Create openmm system - system = generator.create_system(topology) - # Use langevin integrator, although it's not super useful here - integrator = openmm.LangevinIntegrator(TEMPERATURE, COLLISION_RATE, STEP_SIZE) - # Create simulation - simulation = Simulation(topology=topology, system=system, integrator=integrator) - # Get energy - us = [] - us_prime = [] - xs = ( - Quantity( - g.nodes["n1"].data["xyz"].detach().numpy(), - espunits.DISTANCE_UNIT, - ) - .value_in_unit(unit.nanometer) - .transpose((1, 0, 2)) - ) - for x in xs: - simulation.context.setPositions(x) - us.append( - simulation.context.getState(getEnergy=True) - .getPotentialEnergy() - .value_in_unit(espunits.ENERGY_UNIT) + for g in self.graphs: + try: + for forcefield in forcefield_list: + if forcefield.startswith('gaff') or forcefield.startswith('openff'): + generator = SystemGenerator( + small_molecule_forcefield=forcefield, + molecules=[g.mol], + forcefield_kwargs={"constraints": None, "removeCMMotion": False}, + ) + name = forcefield + elif forcefield.startswith('amber') or forcefield.startswith('protein'): + generator = SystemGenerator( + forcefields=[forcefield], + molecules=[g.mol], + forcefield_kwargs={"constraints": None, "removeCMMotion": False}, + ) + if forcefield == 'amber14-all.xml': + name = 'amber14sb' + elif forcefield == 'amber/protein.ff14SBonlysc.xml': + name = 'amber14sb_onlysc' + else: + import warnings + warnings.warn(f'{forcefield} not supported for molecule {g.mol.to_smiles()}') + + suffix = name + + # Parameterize topology + topology = g.mol.to_topology().to_openmm() + # Create openmm system + system = generator.create_system(topology) + # Use langevin integrator, although it's not super useful here + integrator = openmm.LangevinIntegrator(TEMPERATURE, COLLISION_RATE, STEP_SIZE) + # Create simulation + simulation = Simulation(topology=topology, system=system, integrator=integrator) + # Get energy + us = [] + us_prime = [] + xs = ( + Quantity( + g.nodes["n1"].data["xyz"].detach().numpy(), + espunits.DISTANCE_UNIT, + ) + .value_in_unit(unit.nanometer) + .transpose((1, 0, 2)) ) - us_prime.append( - simulation.context.getState(getForces=True) - .getForces(asNumpy=True) - .value_in_unit(espunits.FORCE_UNIT) * -1 + for x in xs: + simulation.context.setPositions(x) + us.append( + simulation.context.getState(getEnergy=True) + .getPotentialEnergy() + .value_in_unit(espunits.ENERGY_UNIT) + ) + us_prime.append( + simulation.context.getState(getForces=True) + .getForces(asNumpy=True) + .value_in_unit(espunits.FORCE_UNIT) * -1 + ) + + us = torch.tensor(us, 
dtype=torch.float64)[None, :] + us_prime = torch.tensor( + np.stack(us_prime, axis=1), + dtype=torch.get_default_dtype(), ) - us = torch.tensor(us, dtype=torch.float64)[None, :] - us_prime = torch.tensor( - np.stack(us_prime, axis=1), - dtype=torch.get_default_dtype(), - ) - - g.nodes['g'].data['u_%s' % suffix] = us - g.nodes['n1'].data['u_%s_prime' % suffix] = us_prime + g.nodes['g'].data['u_%s' % suffix] = us + g.nodes['n1'].data['u_%s_prime' % suffix] = us_prime - new_graphs.append(g) + new_graphs.append(g) + except Exception as e: + mol_err = g.mol.to_smiles(isomeric=True, explicit_hydrogens=True, mapped=True) + _logger.warning(f'Error occurred during processing {mol_err}: {e}') + continue # Update in place self.graphs = new_graphs @@ -440,6 +450,7 @@ def compute_relative_energy(self): ------- None """ + _logger.info(f'Compute relative energy') new_graphs = [] for g in self.graphs: g.nodes['g'].data['u_ref_relative'] = g.nodes['g'].data['u_ref'].detach().clone() From 716425314dc08741270410a020a506243d9276b9 Mon Sep 17 00:00:00 2001 From: kt Date: Fri, 8 Mar 2024 11:01:56 -0500 Subject: [PATCH 54/59] deprecate isomeric=False in drop_and_merge_duplicates --- espfit/utils/graphs.py | 55 ++++++++++++++++++++++++++++++------ 1 file changed, 48 insertions(+), 7 deletions(-) diff --git a/espfit/utils/graphs.py b/espfit/utils/graphs.py index 2ac2798..c886d07 100644 --- a/espfit/utils/graphs.py +++ b/espfit/utils/graphs.py @@ -116,11 +116,31 @@ def drop_and_merge_duplicates(self, save_merged_dataset=True, dataset_name='misc isomeric : boolean, default=True If True, then isomeric smiles will be used to identify unique molecules. - If False, then nonisomeric smiles will be used to identify unique molecules. - Note that partial charges will be averaged for the same nonisomeric smiles. - This is because different 3D structures can have different partial charges - due to different conformations. - + Note: isomeric=False is deprecated at the moment. + There is no guarantee that the atom order is consistent across different molecules + with the same nonisomeric smiles. Inconsistent atom order results in different + mapped smiles (atom mapping). To concatenate graphs with different mapped smiles + but with the same nonisomeric smiles, the atom order needs to be fixed. + This is not implemented yet. + + For example, different mapped smiles for the same nonisomeric smiles: + + [H:21][c:1]1[c:2]([c:4]([c:7]([c:5]([c:3]1[H:23])[H:25])[N:14]=[N:15][C:8]2=[C:10]3[N:16]\ + ([C:9](=[C:6]([C:11](=[O:19])[N:18]3[N:17]([C:12]2=[O:20])[H:31])[H:26])[C:13]([H:27])\ + ([H:28])[H:29])[H:30])[H:24])[H:22] + + [H:22][c:11]1[c:12]([c:14]([c:16]([c:15]([c:13]1[H:24])[H:26])[N:4]=[N:3][C:8]2=[C:9]3[N:17]\ + ([C:10](=[C:7]([C:6](=[O:2])[N:19]3[N:18]([C:5]2=[O:1])[H:28])[H:21])[C:20]([H:29])\ + ([H:30])[H:31])[H:27])[H:25])[H:23] + + This will give you different g.nodes['n2'].data['idxs'] which is problematic when + concatenating graphs with different mapped smiles but the same nonisomeric smiles. + + #If False, then nonisomeric smiles will be used to identify unique molecules. + #Note that partial charges will be averaged for the same nonisomeric smiles. + #This is because different 3D structures can have different partial charges + #due to different conformations. 
+ Returns ------- None @@ -134,7 +154,8 @@ def drop_and_merge_duplicates(self, save_merged_dataset=True, dataset_name='misc if isomeric == True: _logger.info(f'Drop and merge duplicate isomeric smiles') else: - _logger.info(f'Drop and merge duplicate nonisomeric smiles') + #_logger.info(f'Drop and merge duplicate nonisomeric smiles') + raise ImportError(f'isomeric=False is deprecated at the moment') smiles = [ g.mol.to_smiles(isomeric=isomeric, explicit_hydrogens=True, mapped=False) for g in self.graphs ] _logger.info(f'Found {len(smiles)} molecules') @@ -219,7 +240,7 @@ def subtract_nonbonded_interactions(self, subtract_vdw=False, subtract_ele=True) new_graphs = [] from espaloma.data.md import subtract_nonbonded_force - for i, g in enumerate(self.graphs): + for g in self.graphs: # `espaloma.data.md.subtract_nonbonded_force` will update g.nodes['g'].data['u_ref'] and g.nodes['g'].data['u_ref_prime'] in place. # Clone QM reference into g.nodes['g'].data['u_qm'] and g.nodes['g'].data['u_qm_prime'], if not exist if 'u_qm' not in g.nodes['g'].data.keys(): @@ -608,6 +629,14 @@ def _merge_graphs(ds): import copy import torch + """ + # + # NOTE + # ---- + # THIS IS DEPRECATED AT THE MOMENT. SEE LINE 116 FOR MORE DETAILS. + # KEEPING THIS FOR FUTURE REFERENCE. + # + # Check if all inputs are equivalent (isomeric smiles) # If not, get average partial charges across different isomeric smiles (molecules) isomeric_smiles = [g.mol.to_smiles(isomeric=True, explicit_hydrogens=True, mapped=False) for g in ds] @@ -639,6 +668,18 @@ def _merge_graphs(ds): np.testing.assert_array_equal(ds[0].nodes['g'].data[key].flatten().numpy(), ds[i].nodes['g'].data[key].flatten().numpy()) for key in ["q_ref", "idxs", "h0"]: np.testing.assert_array_equal(ds[0].nodes['n1'].data[key].flatten().numpy(), ds[i].nodes['n1'].data[key].flatten().numpy()) + """ + + # Check if graphs are equivalent + for i in range(1, len(ds)): + mapped_smiles = ds[0].mol.to_smiles(isomeric=True, explicit_hydrogens=True, mapped=True) + mapped_smiles_i = ds[i].mol.to_smiles(isomeric=True, explicit_hydrogens=True, mapped=True) + assert mapped_smiles == mapped_smiles_i, f"Mapped isomeric smiles are not equivalent: {mapped_smiles} != {mapped_smiles_i}" + # Other node features + for key in ["sum_q"]: + np.testing.assert_array_equal(ds[0].nodes['g'].data[key].flatten().numpy(), ds[i].nodes['g'].data[key].flatten().numpy()) + for key in ["q_ref", "idxs", "h0"]: + np.testing.assert_array_equal(ds[0].nodes['n1'].data[key].flatten().numpy(), ds[i].nodes['n1'].data[key].flatten().numpy()) # Merge graphs g = copy.deepcopy(ds[0]) From 7490db5f2545789690cf9878da316953b27e7c33 Mon Sep 17 00:00:00 2001 From: kt Date: Fri, 8 Mar 2024 21:40:17 -0500 Subject: [PATCH 55/59] reflect changes in drop_duplicates in graphs.py --- espfit/tests/test_utils_graphs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/espfit/tests/test_utils_graphs.py b/espfit/tests/test_utils_graphs.py index ac94543..0446290 100644 --- a/espfit/tests/test_utils_graphs.py +++ b/espfit/tests/test_utils_graphs.py @@ -104,7 +104,7 @@ def test_load_dataset_multiple(mydata_gen2_torsion_sm, mydata_protein_torsion_sm assert sum(nconfs) == 5636, 'Total number of conformations does not match' -def test_drop_and_merge_duplicates(mydata_gen2_torsion_sm, tmpdir): +def test_drop_duplicates(mydata_gen2_torsion_sm, tmpdir): """Test function to drop and merge duplicate molecules. 
Parameters @@ -119,7 +119,7 @@ def test_drop_and_merge_duplicates(mydata_gen2_torsion_sm, tmpdir): """ ds = mydata_gen2_torsion_sm temporary_directory = tmpdir.mkdir('misc') - ds.drop_and_merge_duplicates(save_merged_dataset=True, dataset_name='misc', output_directory_path=str(temporary_directory)) + ds.drop_duplicates(isomeric=False, keep=True, save_merged_dataset=True, dataset_name='misc', output_directory_path=str(temporary_directory)) nconfs = [ g.nodes['g'].data['u_ref'].shape[1] for g in ds ] # Sort the list of nconfs. For some reason, the order of the list is not consistent when running the test locally and on GitHub CI. #assert nconfs == [24, 13, 24, 24, 24, 72], 'Number of molecular conformers does not match' From 1c844106e19ace56d036a41e5837a3ec5a7cc03c Mon Sep 17 00:00:00 2001 From: kt Date: Fri, 8 Mar 2024 21:40:43 -0500 Subject: [PATCH 56/59] use nonisomeric smiles to detect unique molecules --- espfit/utils/graphs.py | 248 ++++++++++++++++++++--------------------- 1 file changed, 124 insertions(+), 124 deletions(-) diff --git a/espfit/utils/graphs.py b/espfit/utils/graphs.py index c886d07..7a00451 100644 --- a/espfit/utils/graphs.py +++ b/espfit/utils/graphs.py @@ -18,7 +18,7 @@ class CustomGraphDataset(GraphDataset): Methods ------- - drop_and_merge_duplicates(save_merged_dataset=True, dataset_name='misc', output_directory_path=None): + drop_duplicates(isomeric=False, keep=True, save_merged_dataset=True, dataset_name='misc', output_directory_path=None): Drop and merge duplicate nonisomeric smiles across different data sources. subtract_nonbonded_interactions(subtract_vdw=False, subtract_ele=True): @@ -53,7 +53,7 @@ class CustomGraphDataset(GraphDataset): >>> ds = GraphDataset.load(path) >>> # Drop and merge duplicate molecules. Save merged dataset as a new dataset. >>> # If `output_directory_path` is None, then the current working directory is used. - >>> ds.drop_and_merge_duplicates(save_merged_dataset=True, dataset_name='misc', output_directory_path=None) + >>> ds.drop_duplicates(isomeric=False, keep=True, save_merged_dataset=True, dataset_name='misc', output_directory_path=None) >>> # Subtract nonbonded energies and forces from QC reference (e.g. subtract all valence and ele interactions) >>> # This will update u_ref and u_ref_relative in-place. copy of raw u_ref (QM reference) will be copied to u_qm. >>> ds.subtract_nonbonded_interactions(subtract_vdw=False, subtract_ele=True) @@ -96,34 +96,25 @@ def __init__(self, graphs=[], reference_forcefield='openff-2.1.0', random_seed=2 self.random_seed = random_seed - def drop_and_merge_duplicates(self, save_merged_dataset=True, dataset_name='misc', output_directory_path=None, isomeric=True): - """Drop and merge duplicate (non)isomeric smiles within the dataset. + def drop_duplicates(self, isomeric=False, keep=True, save_merged_dataset=True, dataset_name='misc', output_directory_path=None): + """Drop duplicate (non)isomeric smiles within the dataset. Modifies list of esp.Graph's in place. Parameters ---------- - save_merged_datest : boolean, default=True - If True, then merged datasets will be saved as a new dataset. - - dataset_name : str, default=misc - Name of the merged dataset. + isomeric : boolean, default=False + If True, then duplicated molecules are merged based on isomeric mapped smiles. + If False, then duplicated molecules are merged based on nonisomeric mapped smiles. - output_directory_path : str, default=None - Output directory path to save the merged dataset. - If None, then the current working directory is used. 
+ Unique molecules are identified by nonisomeric non-mapped smiles. + Duplicated molecules (nonisomeric smiles) are merged into a single molecule based on + the isomeric mapped smiles (isomeric=True) or nonisomeric mapped smiles (isomeric=False). - isoemric : boolean, default=True - If True, then isomeric smiles will be used to identify unique molecules. - - Note: isomeric=False is deprecated at the moment. - There is no guarantee that the atom order is consistent across different molecules - with the same nonisomeric smiles. In consistent atom order, results in different - mapped smiles (atom mapping). To concatenate graphs with different mapped smiles - but with the same nonisomeric smiles, atom orders needs to be fixed. - This is not implemented yet. - - For example, different mapped smiles for the same nonisomeric smiles: + Note that there is no guarantee that the atom order (mapping) is consistent across different + molecules with the same (non)isomeric smiles. + + For example, molecules with same nonisomeric smiles could have different mapped smiles: [H:21][c:1]1[c:2]([c:4]([c:7]([c:5]([c:3]1[H:23])[H:25])[N:14]=[N:15][C:8]2=[C:10]3[N:16]\ ([C:9](=[C:6]([C:11](=[O:19])[N:18]3[N:17]([C:12]2=[O:20])[H:31])[H:26])[C:13]([H:27])\ @@ -133,13 +124,28 @@ def drop_and_merge_duplicates(self, save_merged_dataset=True, dataset_name='misc ([C:10](=[C:7]([C:6](=[O:2])[N:19]3[N:18]([C:5]2=[O:1])[H:28])[H:21])[C:20]([H:29])\ ([H:30])[H:31])[H:27])[H:25])[H:23] - This will give you different g.nodes['n2'].data['idxs'] which is problematic when - concatenating graphs with different mapped smiles but the same nonisomeric smiles. + This will give different atom ordering, leading to, for example, different + bond atom index (g.nodes['n2'].data['idxs']). - #If False, then nonisomeric smiles will be used to identify unique molecules. - #Note that partial charges will be averaged for the same nonisomeric smiles. - #This is because different 3D structures can have different partial charges - #due to different conformations. + To alleviate this issue, nonisomeric smiles without atom mapping is used to identify + unique molecules and remove any duplicated molecules. Then, duplicated molecules are + merged into a single molecule based on the isomeric mapped smiles (isomeric=True) or + nonisomeric mapped smiles (isomeric=False). + + keep : boolean, default=True + If True, then duplicate entries dropped from the dataset will be added back to the unique entries + after the dropped duplicated entries are merged into a single molecule. If False, then duplicated + entries dropped will be removed. + + save_merged_datest : boolean, default=True + If True, then duplicated molecules are merged into a single molecule and saved as a new dataset. + + dataset_name : str, default=misc + Name of the merged dataset. + + output_directory_path : str, default=None + Output directory path to save the merged dataset. + If None, then the current working directory is used. 
Returns ------- None """ import os if output_directory_path == None: output_directory_path = os.getcwd() + _logger.info(f'Remove duplicated nonisomeric smiles from dataset') if isomeric == True: - _logger.info(f'Drop and merge duplicate isomeric smiles') + _logger.info(f'Merge duplicated nonisomeric smiles into unique isomeric mapped smiles') else: - #_logger.info(f'Drop and merge duplicate nonisomeric smiles') - raise ImportError(f'isomeric=False is deprecated at the moment') - smiles = [ g.mol.to_smiles(isomeric=isomeric, explicit_hydrogens=True, mapped=False) for g in self.graphs ] - _logger.info(f'Found {len(smiles)} molecules') + _logger.info(f'Merge duplicated nonisomeric smiles into unique nonisomeric mapped smiles') + + # Get smiles + nonisomeric_smiles = [ g.mol.to_smiles(isomeric=False, explicit_hydrogens=True, mapped=False) for g in self.graphs ] + nonisomeric_mapped_smiles = [ g.mol.to_smiles(isomeric=False, explicit_hydrogens=True, mapped=True) for g in self.graphs ] + isomeric_mapped_smiles = [ g.mol.to_smiles(isomeric=True, explicit_hydrogens=True, mapped=True) for g in self.graphs ] + _logger.info(f'Found {len(self.graphs)} graph entries') + + # Create pandas dataframe + df = pd.DataFrame.from_dict({'nonisomeric_smiles': nonisomeric_smiles, 'nonisomeric_mapped_smiles': nonisomeric_mapped_smiles, 'isomeric_mapped_smiles': isomeric_mapped_smiles}) + _logger.info(f'Unique nonisomeric smiles: {len(df.nonisomeric_smiles.unique())}') + _logger.info(f'Unique nonisomeric mapped smiles: {len(df.nonisomeric_mapped_smiles.unique())}') + _logger.info(f'Unique isomeric mapped smiles: {len(df.isomeric_mapped_smiles.unique())}') + + # Get unique and duplicated entries using nonisomeric smiles (non-mapped) + unique_index = df.nonisomeric_smiles.drop_duplicates(keep=False).index.to_list() unique_graphs = [self.graphs[_idx] for _idx in unique_index] - _logger.info(f'Found {len(unique_index)} unique molecules') + _logger.info(f'Drop all duplicated nonisomeric smiles from the dataset (unique nonisomeric smiles: {len(unique_index)})') - # Duplicated entries - index = df.duplicated(keep=False) # Mark all duplicate entries True + index = df.nonisomeric_smiles.duplicated(keep=False) # Mark all duplicate entries True duplicated_index = df[index].index.to_list() - _logger.info(f'Found {len(duplicated_index)} duplicated molecules') + assert len(unique_index) + len(duplicated_index) == len(self.graphs), \ + f'Unique + duplicated nonisomeric smiles: {len(unique_index)} + {len(duplicated_index)} != total dataset ({len(self.graphs)})' - # Get unique smiles and assign new molecule name `e.g. 
mol0001` - duplicated_df = df.iloc[duplicated_index] - duplicated_smiles = duplicated_df.smiles.unique().tolist() - molnames = [ f'mol{i:04d}' for i in range(len(duplicated_smiles)) ] - _logger.info(f'Found {len(molnames)} unique molecules within duplicate entries') - - # Merge duplicate entries into a new single graph - duplicated_graphs = [] - molnames_dict = {} - for molname, duplicated_smile in zip(molnames, duplicated_smiles): - # Map new molecule name with its unique smiles and dataframe indices - index = duplicated_df[duplicated_df['smiles'] == duplicated_smile].index.tolist() - molnames_dict[molname] = {'smiles': duplicated_smiles, 'index': index} - # Merge graphs - g = self._merge_graphs([self.graphs[_idx] for _idx in index]) - duplicated_graphs.append(g) - # Save graphs (optional) - if save_merged_dataset == True: - # Notes: Create a temporary directory, `_output_directory_path`, to support pytest in test_utils_graphs.py. - # Temporary directory needs to be created beforehand for `test_drop_and_merge_duplicates`. - _output_directory_path = os.path.join(output_directory_path, dataset_name) - os.makedirs(_output_directory_path, exist_ok=True) - new_output_directory_path = os.path.join(_output_directory_path, molname) - g.save(new_output_directory_path) - - # Update in place - new_graphs = unique_graphs + duplicated_graphs - _logger.info(f'Graph dataset reconstructed: {len(new_graphs)} unique molecules') + if keep == True: + if isomeric == True: + _logger.info(f'Merge dropped duplicated nonisomeric smiles into unique isomeric mapped smiles') + else: + _logger.info(f'Merge dropped duplicated nonisomeric smiles into unique nonisomeric mapped smiles') + + # Get unique (non)isomeric mapped smiles from duplicated nonisomeric smiles (non-mapped) and assign new molecule name `e.g. mol0001` + # Use copy() to prevent SettingWithCopyWarning when assigning new values to a new column + duplicated_df = df.iloc[duplicated_index].copy() + if isomeric == True: + duplicated_smiles = duplicated_df.isomeric_mapped_smiles.unique().tolist() + duplicated_df['smiles'] = duplicated_df.isomeric_mapped_smiles + _logger.info(f'Found {len(duplicated_smiles)} unique isomeric mapped smiles within duplicated {len(duplicated_index)} nonisomeric smiles') + else: + duplicated_smiles = duplicated_df.nonisomeric_mapped_smiles.unique().tolist() + duplicated_df['smiles'] = duplicated_df.nonisomeric_mapped_smiles + _logger.info(f'Found {len(duplicated_smiles)} unique nonisomeric mapped smiles within duplicated {len(duplicated_index)} nonisomeric smiles') + molnames = [ f'mol{i:04d}' for i in range(len(duplicated_smiles)) ] + + # Merge duplicate entries into a new single graph + duplicated_graphs = [] + #molnames_dict = {} # This is never used but keep this to export the dictionary? + for molname, duplicated_smile in zip(molnames, duplicated_smiles): + # Map new molecule name with its unique smiles and dataframe indices + index = duplicated_df[duplicated_df['smiles'] == duplicated_smile].index.tolist() + #molnames_dict[molname] = {'smiles': duplicated_smiles, 'index': index} + # Merge graphs + g = self._merge_graphs(subset=[self.graphs[_idx] for _idx in index], isomeric_flag=isomeric) + duplicated_graphs.append(g) + # Save graphs (optional) + if save_merged_dataset == True: + # Notes: Create a temporary directory, `_output_directory_path`, to support pytest in test_utils_graphs.py. + # Temporary directory needs to be created beforehand for `test_drop_and_merge_duplicates`. 
+ _output_directory_path = os.path.join(output_directory_path, dataset_name) + os.makedirs(_output_directory_path, exist_ok=True) + new_output_directory_path = os.path.join(_output_directory_path, molname) + g.save(new_output_directory_path) + + new_graphs = unique_graphs + duplicated_graphs + _logger.info(f'Add back {len(duplicated_graphs)} merged duplicated (non)isomeric mapped smiles into the dataset') + _logger.info(f'Dataset reconstructed: {len(new_graphs)} unique molecules') + else: + new_graphs = unique_graphs + _logger.info(f'Dataset reconstructed: {len(new_graphs)} unique molecules') self.graphs = new_graphs del unique_graphs, duplicated_graphs, df, duplicated_df @@ -611,12 +642,12 @@ def _remove_node_features(self): @staticmethod - def _merge_graphs(ds): + def _merge_graphs(subset, isomeric_flag): """Merge multiple Graph instances into a single Graph. Parameters ---------- - ds : list of espaloma.graphs.graph.Graph, default=None + subset : list of espaloma.graphs.graph.Graph, default=None The list of Graph instances to be merged. All Graphs in the list must be equivalent. Returns @@ -629,68 +660,37 @@ def _merge_graphs(ds): import copy import torch - """ - # - # NOTE - # ---- - # THIS IS DEPRECATED AT THE MOMENT. SEE LINE 116 FOR MORE DETAILS. - # KEEPING THIS FOR FUTURE REFERENCE. - # - - # Check if all inputs are equivalent (isomeric smiles) - # If not, get average partial charges across different isomeric smiles (molecules) - isomeric_smiles = [g.mol.to_smiles(isomeric=True, explicit_hydrogens=True, mapped=False) for g in ds] - unique_isomeric_smiles = set(isomeric_smiles) - if len(unique_isomeric_smiles) != 1: - n_atoms = ds[0].nodes['n1'].data['q_ref'].shape[0] - q_ref = torch.zeros(n_atoms, 1) - for unique_isomeric_smile in unique_isomeric_smiles: - index = [i for i, isomeric_smile in enumerate(isomeric_smiles) if isomeric_smile in unique_isomeric_smile][0] - q_ref += ds[index].nodes['n1'].data['q_ref'] - q_ref = q_ref / len(set(isomeric_smiles)) - # Update partial charges in-place - for i in range(len(ds)): - ds[i].nodes['n1'].data['q_ref'] = q_ref - # Check if graphs are equivalent - for i in range(1, len(ds)): - # Mapped isomeric smiles - if len(unique_isomeric_smiles) != 1: - mapped_smiles = ds[0].mol.to_smiles(isomeric=False, explicit_hydrogens=True, mapped=True) - mapped_smiles_i = ds[i].mol.to_smiles(isomeric=False, explicit_hydrogens=True, mapped=True) - assert mapped_smiles == mapped_smiles_i, f"Mapped nonisomeric smiles are not equivalent: {mapped_smiles} != {mapped_smiles_i}" + for i in range(1, len(subset)): + if isomeric_flag == True: + mapped_smiles = subset[0].mol.to_smiles(isomeric=True, explicit_hydrogens=True, mapped=True) + mapped_smiles_i = subset[i].mol.to_smiles(isomeric=True, explicit_hydrogens=True, mapped=True) + assert mapped_smiles == mapped_smiles_i, f"Isomeric mapped smiles are not equivalent: {mapped_smiles} != {mapped_smiles_i}" else: - mapped_smiles = ds[0].mol.to_smiles(isomeric=True, explicit_hydrogens=True, mapped=True) - mapped_smiles_i = ds[i].mol.to_smiles(isomeric=True, explicit_hydrogens=True, mapped=True) - assert mapped_smiles == mapped_smiles_i, f"Mapped isomeric smiles are not equivalent: {mapped_smiles} != {mapped_smiles_i}" + mapped_smiles = subset[0].mol.to_smiles(isomeric=False, explicit_hydrogens=True, mapped=True) + mapped_smiles_i = subset[i].mol.to_smiles(isomeric=False, explicit_hydrogens=True, mapped=True) + assert mapped_smiles == mapped_smiles_i, f"Nonisomeric mapped smiles are not equivalent: {mapped_smiles} != 
{mapped_smiles_i}" # Other node features for key in ["sum_q"]: - np.testing.assert_array_equal(ds[0].nodes['g'].data[key].flatten().numpy(), ds[i].nodes['g'].data[key].flatten().numpy()) - for key in ["q_ref", "idxs", "h0"]: - np.testing.assert_array_equal(ds[0].nodes['n1'].data[key].flatten().numpy(), ds[i].nodes['n1'].data[key].flatten().numpy()) - """ + np.testing.assert_array_equal(subset[0].nodes['g'].data[key].flatten().numpy(), subset[i].nodes['g'].data[key].flatten().numpy()) + for key in ["q_ref", "h0"]: + np.testing.assert_array_equal(subset[0].nodes['n1'].data[key].flatten().numpy(), subset[i].nodes['n1'].data[key].flatten().numpy()) + # As long as mapped smiles are the same, we don't need to compare n1, n2, n3 nodes. Maybe we don't need the above either? + #np.testing.assert_array_equal(subset[0].nodes['n1'].data['idxs'].flatten().numpy(), subset[i].nodes['n1'].data['idxs'].flatten().numpy()) + #np.testing.assert_array_equal(subset[0].nodes['n2'].data['idxs'].flatten().numpy(), subset[i].nodes['n2'].data['idxs'].flatten().numpy()) + #np.testing.assert_array_equal(subset[0].nodes['n3'].data['idxs'].flatten().numpy(), subset[i].nodes['n3'].data['idxs'].flatten().numpy()) - # Check if graphs are equivalent - for i in range(1, len(ds)): - mapped_smiles = ds[0].mol.to_smiles(isomeric=True, explicit_hydrogens=True, mapped=True) - mapped_smiles_i = ds[i].mol.to_smiles(isomeric=True, explicit_hydrogens=True, mapped=True) - assert mapped_smiles == mapped_smiles_i, f"Mapped isomeric smiles are not equivalent: {mapped_smiles} != {mapped_smiles_i}" - # Other node features - for key in ["sum_q"]: - np.testing.assert_array_equal(ds[0].nodes['g'].data[key].flatten().numpy(), ds[i].nodes['g'].data[key].flatten().numpy()) - for key in ["q_ref", "idxs", "h0"]: - np.testing.assert_array_equal(ds[0].nodes['n1'].data[key].flatten().numpy(), ds[i].nodes['n1'].data[key].flatten().numpy()) # Merge graphs - g = copy.deepcopy(ds[0]) + g = copy.deepcopy(subset[0]) for key in g.nodes['g'].data.keys(): if key not in ["sum_q"]: - for i in range(1, len(ds)): - g.nodes['g'].data[key] = torch.cat((g.nodes['g'].data[key], ds[i].nodes['g'].data[key]), dim=-1) + for i in range(1, len(subset)): + g.nodes['g'].data[key] = torch.cat((g.nodes['g'].data[key], subset[i].nodes['g'].data[key]), dim=-1) for key in g.nodes['n1'].data.keys(): if key not in ["q_ref", "idxs", "h0"]: - for i in range(1, len(ds)): - g.nodes['n1'].data[key] = torch.cat((g.nodes['n1'].data[key], ds[i].nodes['n1'].data[key]), dim=1) + for i in range(1, len(subset)): + g.nodes['n1'].data[key] = torch.cat((g.nodes['n1'].data[key], subset[i].nodes['n1'].data[key]), dim=1) return g From cf5a36db15f951a4a6e6175602c5c65270366c27 Mon Sep 17 00:00:00 2001 From: kt Date: Sun, 10 Mar 2024 15:01:44 -0400 Subject: [PATCH 57/59] fix bug in averaging partial charges for duplicate entries --- espfit/utils/graphs.py | 65 +++++++++++++++++++++++++++++++++--------- 1 file changed, 52 insertions(+), 13 deletions(-) diff --git a/espfit/utils/graphs.py b/espfit/utils/graphs.py index 7a00451..a9f7c20 100644 --- a/espfit/utils/graphs.py +++ b/espfit/utils/graphs.py @@ -660,26 +660,65 @@ def _merge_graphs(subset, isomeric_flag): import copy import torch + if isomeric_flag == True: + mapped_smiles = subset[0].mol.to_smiles(isomeric=True, explicit_hydrogens=True, mapped=True) + else: + mapped_smiles = subset[0].mol.to_smiles(isomeric=False, explicit_hydrogens=True, mapped=True) + _logger.info(f'Merge {len(subset)} graphs: {mapped_smiles}') + # Check if graphs are 
From cf5a36db15f951a4a6e6175602c5c65270366c27 Mon Sep 17 00:00:00 2001
From: kt
Date: Sun, 10 Mar 2024 15:01:44 -0400
Subject: [PATCH 57/59] fix bug in averaging partial charges for duplicate entries

---
 espfit/utils/graphs.py | 65 +++++++++++++++++++++++++++++++++---------
 1 file changed, 52 insertions(+), 13 deletions(-)

diff --git a/espfit/utils/graphs.py b/espfit/utils/graphs.py
index 7a00451..a9f7c20 100644
--- a/espfit/utils/graphs.py
+++ b/espfit/utils/graphs.py
@@ -660,26 +660,65 @@ def _merge_graphs(subset, isomeric_flag):
         import copy
         import torch
 
+        if isomeric_flag == True:
+            mapped_smiles = subset[0].mol.to_smiles(isomeric=True, explicit_hydrogens=True, mapped=True)
+        else:
+            mapped_smiles = subset[0].mol.to_smiles(isomeric=False, explicit_hydrogens=True, mapped=True)
+        _logger.info(f'Merge {len(subset)} graphs: {mapped_smiles}')
+        # Check if graphs are equivalent
+        charge_index = []  # book keep indices with inconsistent partial charges
+        atol = rtol = 1e-2  # charge tolerance
         for i in range(1, len(subset)):
             if isomeric_flag == True:
-                mapped_smiles = subset[0].mol.to_smiles(isomeric=True, explicit_hydrogens=True, mapped=True)
                 mapped_smiles_i = subset[i].mol.to_smiles(isomeric=True, explicit_hydrogens=True, mapped=True)
                 assert mapped_smiles == mapped_smiles_i, f"Isomeric mapped smiles are not equivalent: {mapped_smiles} != {mapped_smiles_i}"
             else:
-                mapped_smiles = subset[0].mol.to_smiles(isomeric=False, explicit_hydrogens=True, mapped=True)
                 mapped_smiles_i = subset[i].mol.to_smiles(isomeric=False, explicit_hydrogens=True, mapped=True)
-                assert mapped_smiles == mapped_smiles_i, f"Nonisomeric mapped smiles are not equivalent: {mapped_smiles} != {mapped_smiles_i}"
+                assert mapped_smiles == mapped_smiles_i, f"Nonisomeric mapped smiles are not equivalent: {mapped_smiles} != {mapped_smiles_i}"
+            # Net charge
+            np.testing.assert_array_equal(subset[0].nodes['g'].data['sum_q'].flatten().numpy(), subset[i].nodes['g'].data['sum_q'].flatten().numpy())
+            # Input node features
+            np.testing.assert_array_equal(subset[0].nodes['n1'].data['h0'].flatten().numpy(), subset[i].nodes['n1'].data['h0'].flatten().numpy())
+            # Atom ordering: As long as mapped smiles are the same, we don't need to compare n1, n2, n3 nodes?
+            np.testing.assert_array_equal(subset[0].nodes['n1'].data['idxs'].flatten().numpy(), subset[i].nodes['n1'].data['idxs'].flatten().numpy())
+            np.testing.assert_array_equal(subset[0].nodes['n2'].data['idxs'].flatten().numpy(), subset[i].nodes['n2'].data['idxs'].flatten().numpy())
+            np.testing.assert_array_equal(subset[0].nodes['n3'].data['idxs'].flatten().numpy(), subset[i].nodes['n3'].data['idxs'].flatten().numpy())
+            # Partial charges: There could be inconsistency due to different 3D conformers generated during partial charge calculation process.
+            charge_boolean = np.allclose(subset[0].nodes['n1'].data['q_ref'].flatten().numpy(), subset[i].nodes['n1'].data['q_ref'].flatten().numpy(), rtol=rtol, atol=atol)
+            if charge_boolean == False:
+                charge_diff = np.abs(subset[0].nodes['n1'].data['q_ref'].flatten().numpy() - subset[i].nodes['n1'].data['q_ref'].flatten().numpy())
+                _logger.warning(f"Entry {i}: Maximum charge difference {charge_diff.max()} is higher than {atol} when compared to the first graph")
+                charge_index.append(i)
+
+        # Handle partial charges if inconsistent
+        if charge_index:
+            # Get indices with unique partial charges
+            # Book keep indices with unique partial charges starting from the first graph
+            unique_charge_index = [0]
+            for i in charge_index:
+                is_equal = []
+                for j in unique_charge_index:
+                    # Extract the arrays to compare
+                    arr_i = subset[i].nodes['n1'].data['q_ref'].flatten().numpy()
+                    arr_j = subset[j].nodes['n1'].data['q_ref'].flatten().numpy()
+                    is_equal.append(np.array_equal(arr_i, arr_j))
+                # Check if all False
+                if not any(is_equal):
+                    unique_charge_index.append(i)
+            # Average partial charges
+            _logger.info(f'Average partial charges ({unique_charge_index})...')
+            q_ref = subset[0].nodes['n1'].data['q_ref']
+            _logger.info(f'Entry #0: {q_ref.flatten().numpy()}')
+            for index in unique_charge_index[1:]:
+                _q_ref = subset[index].nodes['n1'].data['q_ref']
+                _logger.info(f'Entry #{index}: {_q_ref.flatten().numpy()}')
+                q_ref += _q_ref
+            q_ref = q_ref / len(unique_charge_index)
+            # Update partial charges in-place
+            for i in range(len(subset)):
+                subset[i].nodes['n1'].data['q_ref'] = q_ref
+            _logger.info(f'Averaged partial charges: {subset[0].nodes["n1"].data["q_ref"].flatten().numpy()}')
 
         # Merge graphs
         g = copy.deepcopy(subset[0])
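
Illustrative sketch (not part of the patch series): the fix above averages `q_ref` only over charge sets that are genuinely distinct. Entries whose charges differ from the first graph beyond the tolerance are flagged, then deduplicated by exact comparison before averaging, so bitwise-identical copies are not double-counted. A simplified sketch of that deduplicate-then-average logic with plain numpy (charge values invented for illustration):

    import numpy as np

    # One charge array per duplicate graph of a two-atom molecule
    charge_sets = [
        np.array([0.10, -0.10]),  # entry 0 (reference)
        np.array([0.10, -0.10]),  # identical to entry 0 -> not counted again
        np.array([0.16, -0.16]),  # differs by > atol from entry 0 -> flagged
    ]

    atol = rtol = 1e-2  # same tolerance as in the patch
    flagged = [i for i in range(1, len(charge_sets))
               if not np.allclose(charge_sets[0], charge_sets[i], rtol=rtol, atol=atol)]

    # Keep only charge sets not already seen, starting from entry 0
    unique = [charge_sets[0]]
    for i in flagged:
        if not any(np.array_equal(charge_sets[i], u) for u in unique):
            unique.append(charge_sets[i])

    # Out-of-place sum, so entry 0's array is left untouched
    q_avg = np.sum(unique, axis=0) / len(unique)
    print(q_avg)  # [ 0.13 -0.13]
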
From 0963d0fc8b130c8c05bed1f8c552419a0ac3cfb0 Mon Sep 17 00:00:00 2001
From: kt
Date: Tue, 12 Mar 2024 10:05:04 -0400
Subject: [PATCH 58/59] rename drop_and_merge_duplicates to drop_duplicates

---
 espfit/tests/test_app_train.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/espfit/tests/test_app_train.py b/espfit/tests/test_app_train.py
index 05d29d5..1f671c6 100644
--- a/espfit/tests/test_app_train.py
+++ b/espfit/tests/test_app_train.py
@@ -44,7 +44,7 @@ def test_load_dataset(tmpdir):
 
     # Prepare input dataset ready for training
     temporary_directory = tmpdir.mkdir('misc')
-    ds.drop_and_merge_duplicates(save_merged_dataset=True, dataset_name='misc', output_directory_path=str(temporary_directory))
+    ds.drop_duplicates(isomeric=False, keep=True, save_merged_dataset=True, dataset_name='misc', output_directory_path=str(temporary_directory))
     ds.reshape_conformation_size(n_confs=50)
     ds.compute_relative_energy()
 

From fb97040f2e2ecaf948f52afdd5919fa3f1406d38 Mon Sep 17 00:00:00 2001
From: kt
Date: Tue, 12 Mar 2024 14:59:38 -0400
Subject: [PATCH 59/59] rename drop_and_merge_duplicates to drop_duplicates

---
 espfit/tests/test_app_train_sampler.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/espfit/tests/test_app_train_sampler.py b/espfit/tests/test_app_train_sampler.py
index 8d39e32..1292f4c 100644
--- a/espfit/tests/test_app_train_sampler.py
+++ b/espfit/tests/test_app_train_sampler.py
@@ -45,7 +45,7 @@ def test_load_dataset(tmpdir):
 
     # Prepare input dataset ready for training
     temporary_directory = tmpdir.mkdir('misc')
-    ds.drop_and_merge_duplicates(save_merged_dataset=True, dataset_name='misc', output_directory_path=str(temporary_directory))
+    ds.drop_duplicates(save_merged_dataset=True, dataset_name='misc', output_directory_path=str(temporary_directory))
     ds.reshape_conformation_size(n_confs=50)
     ds.compute_relative_energy()
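
Illustrative sketch (not part of the patch series): patches 58 and 59 update the tests to the renamed API, and the surrounding dataset-preparation sequence then reads as below. The loader class name, its `load()` classmethod, and the input/output paths are assumptions for illustration only; the tests construct `ds` earlier, outside the excerpts shown here.

    from espfit.utils.graphs import CustomGraphDataset  # assumed class name

    ds = CustomGraphDataset.load('/path/to/dgl/graphs')  # hypothetical input path

    # Merge duplicate entries into unique (non)isomeric mapped smiles
    ds.drop_duplicates(isomeric=False, keep=True,
                       save_merged_dataset=True,
                       dataset_name='misc',
                       output_directory_path='/tmp/misc')

    # Reshape to 50 conformations per entry and train on relative energies
    ds.reshape_conformation_size(n_confs=50)
    ds.compute_relative_energy()
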