Merge pull request #1 from kntkb/feature/train_with_reweight

train espaloma with MD simulation
choderalab · Mar 12, 2024 · 0284c10 · 0284c10
2 parents dfe7ca1 + fb97040
commit 0284c10
Show file tree

Hide file tree

Showing 29 changed files with 1,615 additions and 1,468 deletions.
diff --git a/README.md b/README.md
@@ -28,10 +28,26 @@ Infrastruture to train espaloma with experimental observables
 
 
 ### Quick Usage
+```python
+from espfit.utils.graphs import CustomGraphDataset  
+path = 'espfit/data/qcdata/openff-toolkit-0.10.6/dgl2/protein-torsion-sm/'
+ds = CustomGraphDataset.load(path)
+ds.reshape_conformation_size(n_confs=50, include_min_energy_conf=True)
+ds.compute_relative_energy()
+# Create esplama model
+from espfit.app.train import EspalomaModel
+filename = 'espfit/data/config/config.toml'
+# Override training settings in config.toml
+kwargs = {'output_directory_path': 'checkpoints', 'epochs': 100}
+model = EspalomaModel.from_toml(filename, **kwargs)
+model.dataset_train = ds
+# Set sampler settings
+model.train_sampler(sampler_patience=800, neff_threshold=0.2, sampler_weight=1)
+```
 
+### Standalone Usage
 #### Change logging
 ```python
-# load dgl graph data
 from espfit.utils import logging
 logging.get_logging_level()
 #>'INFO'
@@ -51,12 +67,11 @@ from espfit.app.train import EspalomaModel
 filename = 'espfit/data/config/config.toml'
 model = EspalomaModel.from_toml(filename)
 model.dataset_train = ds
-# Train
-model.train(output_directory_path='path/to/output')
-# To extend training, update the `epoch` in config.toml
-# Alternatively, do the following:
-model.config['espaloma']['train']['epochs'] = 50
-model.train(output_directory_path='path/to/output')
+# Change default training settings
+model.epochs = 100
+model.output_directory_path = 'path/to/output'
+# Train (default output directory is current path)
+model.train()
 ```
 
 #### Standard MD (default: espaloma-0.3.2 force field for solute molecules)
@@ -66,8 +81,10 @@ from espfit.app.sampler import SetupSampler
 c = SetupSampler()
 filename = 'espfit/data/target/testsystems/nucleoside/pdbfixer_min.pdb'
 c.create_system(biopolymer_file=filename)
-c.minimize(maxIterations=10)
-c.run(nsteps=10, output_directory_path='path/to/output')
+c.minimize()
+# Change default settings
+c.nsteps = 1000
+c.run()
 # Export to XML
 c.export_xml(exportSystem=True, exportState=True, exportIntegrator=True, output_directory_path='path/to/output')
 ```
@@ -76,7 +93,8 @@ c.export_xml(exportSystem=True, exportState=True, exportIntegrator=True, output_
 ```python
 from espfit.app.sampler import SetupSampler
 c = SetupSampler.from_xml(input_directory_path='path/to/input')
-c.run(nsteps=10, output_directory_path='path/to/output')
+c.nsteps = 1000
+c.run()
 ```
 
 #### Compute RNA J-couplings from MD trajectory

diff --git a/devtools/conda-envs/README.md b/devtools/conda-envs/README.md
@@ -2,11 +2,11 @@
 
 >conda activate espfit  
 >conda env export --from-history > test_env.yaml  
->conda env create -f test_env.yaml -n test_env 
+>conda env create -f test_env.yaml -n test_env  
 >#uninstall openff-toolkit and install a customized version to support dgl graphs created using openff-toolkit=0.10.6  
 >conda uninstall --force openff-toolkit  
 >pip install git+https://github.com/kntkb/openff-toolkit.git@7e9d0225782ef723083407a1cbf1f4f70631f934  
->#uninstall openmmforcefields if < 0.12.0
+>#uninstall openmmforcefields if < 0.12.0  
 >#use pip instead of mamba to avoid dependency issues with ambertools and python  
 >conda uninstall --force openmmforcefields  
 >pip install git+https://github.com/openmm/[email protected]  

diff --git a/espfit/__init__.py b/espfit/__init__.py
@@ -25,4 +25,4 @@
 #from .espfit import *
 
 
-#from ._version import __version__
+from ._version import __version__
diff --git a/espfit/app/experiment.py → espfit/app/analysis.py b/espfit/app/experiment.py → espfit/app/analysis.py
@@ -1,11 +1,5 @@
 """
 Compute experimental observables from MD simulations.
-
-Notes
------
-
-TODO
-----
 """
 import os
 import numpy as np
@@ -17,20 +11,19 @@
 class BaseDataLoader(object):
     """Base class for data loader.
     
-    TODO
-    ----
-    * Add more methods to check trajectory information (e.g. number of frames, number of atoms, etc.)
-
     Methods
     -------
     load_traj(reference_pdb='solvated.pdb', trajectory_netcdf='traj.nc', atom_indices=None, stride=1):
         Load MD trajectory.
     """
-    def __init__(self, input_directory_path=None, output_directory_path=None):
+    def __init__(self, atomSubset='solute', input_directory_path=None, output_directory_path=None):
         """Initialize base data loader object.
         
         Parameters
         ----------
+        atomSubset : str, default='solute'
+            Subset of atoms to save. Default is 'solute'. Other options 'all' and 'not water'.
+
         input_directory_path : str, optional
             Input directory path. Default is None.
             If None, the current working directory will be used.
@@ -39,6 +32,10 @@ def __init__(self, input_directory_path=None, output_directory_path=None):
             Output directory path. Default is None.
             If None, the current working directory will be used.
         """
+        self.atomSubset = atomSubset
+        if self.atomSubset not in ['solute', 'all', 'not water']:
+            raise ValueError(f"Invalid atomSubset: {self.atomSubset}. Expected 'solute', 'all', or 'not water'.")
+
         if input_directory_path is None:
             input_directory_path = os.getcwd()
         if output_directory_path is None:
@@ -61,8 +58,7 @@ def output_directory_path(self, value):
         os.makedirs(value, exist_ok=True)
 
 
-    # Should this be a classmethod?
-    def load_traj(self, reference_pdb='solvated.pdb', trajectory_netcdf='traj.nc', atom_indices=None, stride=1, input_directory_path=None):
+    def load_traj(self, reference_pdb='solvated.pdb', trajectory_netcdf='traj.nc', stride=1, input_directory_path=None):
         """Load MD trajectory.
         
         Parameters
@@ -73,16 +69,16 @@ def load_traj(self, reference_pdb='solvated.pdb', trajectory_netcdf='traj.nc', a
         trajectory_netcdf : str, optional
             Trajectory netcdf file name. Default is 'traj.nc'.
 
-        atom_indices : list, optional
-            List of atom indices to load from trajectory. Default is None.
-            If None, all atoms will be loaded.
-
         stride : int, optional
             Stride to load the trajectory. Default is 1.
 
         input_directory_path : str, optional
             Input directory path. Default is None.
             If None, the current working directory will be used.
+
+        Returns
+        -------
+        None
         """
         import mdtraj
 
@@ -92,22 +88,31 @@ def load_traj(self, reference_pdb='solvated.pdb', trajectory_netcdf='traj.nc', a
         # Load reference pdb (solvated system)
         pdb = os.path.join(self.input_directory_path, reference_pdb)
         ref_traj = mdtraj.load(pdb)
+
         # Select atoms to load from trajectory
-        if atom_indices is None:
+        if self.atomSubset == 'all':
+            self.atom_indices = None
+            self.ref_traj = ref_traj
+        else:
             self.atom_indices = []
             mdtop = ref_traj.topology
-            res = [ r for r in mdtop.residues if r.name not in ('HOH', 'NA', 'CL', 'K') ]
+            if self.atomSubset == 'solute':
+                res = [ r for r in mdtop.residues if r.name not in ('HOH', 'NA', 'CL', 'K') ]
+            elif self.atomSubset == 'not water':
+                res = [ r for r in mdtop.residues if r.name not in ('HOH') ]
+            # Get atom indices
             for r in res:
                 for a in r.atoms:
                     self.atom_indices.append(a.index)
-        else:
-            self.atom_indices = atom_indices
-        self.ref_traj = ref_traj.atom_slice(self.atom_indices)
-
+            self.ref_traj = ref_traj.atom_slice(self.atom_indices)
+
         # Load trajectory
         netcdf = os.path.join(self.input_directory_path, trajectory_netcdf)
         traj = mdtraj.load(netcdf, top=self.ref_traj.topology, stride=stride)
-        self.traj = traj.atom_slice(self.atom_indices)
+        if self.atom_indices:
+            self.traj = traj.atom_slice(self.atom_indices)
+        else:
+            self.traj = traj
 
 
 class RNASystem(BaseDataLoader):
@@ -170,15 +175,14 @@ def radian_to_degree(self, a):
         return a
 
 
-    def compute_jcouplings(self, couplings=None, residues=None):
+    def compute_jcouplings(self, weights=None, couplings=None, residues=None):
         """Compute J-couplings from MD trajectory.
         
-        TODO
-        ----
-        * Compute confidence interval.
-
         Parameters
         ----------
+        weights : numpy.ndarray, optional
+            Weights to compute the J-couplings. Default is None.
+
         couplings : str, optional
             Name of the couplings to compute. Default is None. 
             If a list of couplings to be chosen from [H1H2,H2H3,H3H4,1H5P,2H5P,C4Pb,1H5H4,2H5H4,H3P,C4Pe,H1C2/4,H1C6/8] 
@@ -204,7 +208,7 @@ def compute_jcouplings(self, couplings=None, residues=None):
         """
         import barnaba as bb
 
-        _logger.info("Computing J-couplings from MD trajectory...")
+        _logger.info("Compute J-couplings from MD trajectory")
 
         if couplings is not None:
             # Check if the provided coupling names are valid
@@ -216,15 +220,32 @@ def compute_jcouplings(self, couplings=None, residues=None):
         # residue_list: list of M nucleobases
         values, resname_list = bb.jcouplings_traj(self.traj, couplings=couplings, residues=residues)
 
+        # Convert numpy.float to float to avoid serialization issues
+        replace_nan_with_none = lambda x: None if np.isscalar(x) and np.isnan(x) else x.item()
+
         # Loop over residues and couplings to store the computed values
         coupling_dict = dict()
         for i, resname in enumerate(resname_list):
             _values = values[:,i,:]  # Coupling values of i-th residue
             values_by_names = dict()
             for j, coupling_name in enumerate(couplings):
-                avg = _values[:,j].mean()  # Mean value of H1H2 coupling of i-th residue
-                std = _values[:,j].std()   # Standard deviation of H1H2 coupling of i-th residue
-                values_by_names[coupling_name] = {'avg': avg, 'std': std}
+                avg_raw = np.round(_values[:,j].mean(), 5)  # e.g. mean value of H1H2 coupling of i-th residue
+                std_raw = np.round(_values[:,j].std(), 5)   # e.g. standard deviation of H1H2 coupling of i-th residue
+                avg_raw = replace_nan_with_none(avg_raw)
+                std_raw = replace_nan_with_none(std_raw)
+                if weights is not None:
+                    arr = _values[:,j] * weights
+                    #_logger.info(f'non-weighted: {_values[:,j]}')
+                    #_logger.info(f'weights:      {weights}')
+                    #_logger.info(f'weighted:     {arr}')
+                    avg = np.round(arr.mean(), 5)
+                    std = np.round(arr.std(), 5)
+                    avg = replace_nan_with_none(avg)
+                    std = replace_nan_with_none(std)
+                else:
+                    avg = avg_raw
+                    std = std_raw
+                values_by_names[coupling_name] = {'avg': avg, 'std': std, 'avg_raw': avg_raw, 'std_raw': std_raw}
             coupling_dict[resname] =  values_by_names
 
         return coupling_dict
@@ -242,4 +263,18 @@ def get_available_couplings(self):
         import barnaba as bb
         available_coupling_names = list(bb.definitions.couplings_idx.keys())
         return available_coupling_names
-
+
+
+#
+# Future work?
+#
+class ProteinSystem(BaseDataLoader):
+    def __init__(self, **kwargs):
+        super(ProteinSystem, self).__init__(**kwargs)
+        raise NotImplementedError("ProteinSystem class is not implemented yet.")
+
+
+class ProteinLigandSystem(BaseDataLoader):
+    def __init__(self, **kwargs):
+        super(ProteinLigandSystem, self).__init__(**kwargs)
+        raise NotImplementedError("ProteinLigandSystem class is not implemented yet.")
Original file line number	Diff line number	Diff line change
Expand Up		@@ -25,4 +25,4 @@
		#from .espfit import *


		#from ._version import __version__
		from ._version import __version__