Skip to content

Commit

Permalink
Merge pull request #1 from kntkb/feature/train_with_reweight
Browse files Browse the repository at this point in the history
train espaloma with MD simulation
  • Loading branch information
kntkb authored Mar 12, 2024
2 parents dfe7ca1 + fb97040 commit 0284c10
Show file tree
Hide file tree
Showing 29 changed files with 1,615 additions and 1,468 deletions.
38 changes: 28 additions & 10 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,10 +28,26 @@ Infrastruture to train espaloma with experimental observables

### Quick Usage
```python
from espfit.utils.graphs import CustomGraphDataset
path = 'espfit/data/qcdata/openff-toolkit-0.10.6/dgl2/protein-torsion-sm/'
ds = CustomGraphDataset.load(path)
ds.reshape_conformation_size(n_confs=50, include_min_energy_conf=True)
ds.compute_relative_energy()
# Create esplama model
from espfit.app.train import EspalomaModel
filename = 'espfit/data/config/config.toml'
# Override training settings in config.toml
kwargs = {'output_directory_path': 'checkpoints', 'epochs': 100}
model = EspalomaModel.from_toml(filename, **kwargs)
model.dataset_train = ds
# Set sampler settings
model.train_sampler(sampler_patience=800, neff_threshold=0.2, sampler_weight=1)
```

### Standalone Usage
#### Change logging
```python
# load dgl graph data
from espfit.utils import logging
logging.get_logging_level()
#>'INFO'
Expand All @@ -51,12 +67,11 @@ from espfit.app.train import EspalomaModel
filename = 'espfit/data/config/config.toml'
model = EspalomaModel.from_toml(filename)
model.dataset_train = ds
# Train
model.train(output_directory_path='path/to/output')
# To extend training, update the `epoch` in config.toml
# Alternatively, do the following:
model.config['espaloma']['train']['epochs'] = 50
model.train(output_directory_path='path/to/output')
# Change default training settings
model.epochs = 100
model.output_directory_path = 'path/to/output'
# Train (default output directory is current path)
model.train()
```

#### Standard MD (default: espaloma-0.3.2 force field for solute molecules)
Expand All @@ -66,8 +81,10 @@ from espfit.app.sampler import SetupSampler
c = SetupSampler()
filename = 'espfit/data/target/testsystems/nucleoside/pdbfixer_min.pdb'
c.create_system(biopolymer_file=filename)
c.minimize(maxIterations=10)
c.run(nsteps=10, output_directory_path='path/to/output')
c.minimize()
# Change default settings
c.nsteps = 1000
c.run()
# Export to XML
c.export_xml(exportSystem=True, exportState=True, exportIntegrator=True, output_directory_path='path/to/output')
```
Expand All @@ -76,7 +93,8 @@ c.export_xml(exportSystem=True, exportState=True, exportIntegrator=True, output_
```python
from espfit.app.sampler import SetupSampler
c = SetupSampler.from_xml(input_directory_path='path/to/input')
c.run(nsteps=10, output_directory_path='path/to/output')
c.nsteps = 1000
c.run()
```

#### Compute RNA J-couplings from MD trajectory
Expand Down
4 changes: 2 additions & 2 deletions devtools/conda-envs/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,11 @@

>conda activate espfit
>conda env export --from-history > test_env.yaml
>conda env create -f test_env.yaml -n test_env
>conda env create -f test_env.yaml -n test_env
>#uninstall openff-toolkit and install a customized version to support dgl graphs created using openff-toolkit=0.10.6
>conda uninstall --force openff-toolkit
>pip install git+https://github.com/kntkb/openff-toolkit.git@7e9d0225782ef723083407a1cbf1f4f70631f934
>#uninstall openmmforcefields if < 0.12.0
>#uninstall openmmforcefields if < 0.12.0
>#use pip instead of mamba to avoid dependency issues with ambertools and python
>conda uninstall --force openmmforcefields
>pip install git+https://github.com/openmm/[email protected]
Expand Down
2 changes: 1 addition & 1 deletion espfit/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,4 +25,4 @@
#from .espfit import *


#from ._version import __version__
from ._version import __version__
103 changes: 69 additions & 34 deletions espfit/app/experiment.py → espfit/app/analysis.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,5 @@
"""
Compute experimental observables from MD simulations.
Notes
-----
TODO
----
"""
import os
import numpy as np
Expand All @@ -17,20 +11,19 @@
class BaseDataLoader(object):
"""Base class for data loader.
TODO
----
* Add more methods to check trajectory information (e.g. number of frames, number of atoms, etc.)
Methods
-------
load_traj(reference_pdb='solvated.pdb', trajectory_netcdf='traj.nc', atom_indices=None, stride=1):
Load MD trajectory.
"""
def __init__(self, input_directory_path=None, output_directory_path=None):
def __init__(self, atomSubset='solute', input_directory_path=None, output_directory_path=None):
"""Initialize base data loader object.
Parameters
----------
atomSubset : str, default='solute'
Subset of atoms to save. Default is 'solute'. Other options 'all' and 'not water'.
input_directory_path : str, optional
Input directory path. Default is None.
If None, the current working directory will be used.
Expand All @@ -39,6 +32,10 @@ def __init__(self, input_directory_path=None, output_directory_path=None):
Output directory path. Default is None.
If None, the current working directory will be used.
"""
self.atomSubset = atomSubset
if self.atomSubset not in ['solute', 'all', 'not water']:
raise ValueError(f"Invalid atomSubset: {self.atomSubset}. Expected 'solute', 'all', or 'not water'.")

if input_directory_path is None:
input_directory_path = os.getcwd()
if output_directory_path is None:
Expand All @@ -61,8 +58,7 @@ def output_directory_path(self, value):
os.makedirs(value, exist_ok=True)


# Should this be a classmethod?
def load_traj(self, reference_pdb='solvated.pdb', trajectory_netcdf='traj.nc', atom_indices=None, stride=1, input_directory_path=None):
def load_traj(self, reference_pdb='solvated.pdb', trajectory_netcdf='traj.nc', stride=1, input_directory_path=None):
"""Load MD trajectory.
Parameters
Expand All @@ -73,16 +69,16 @@ def load_traj(self, reference_pdb='solvated.pdb', trajectory_netcdf='traj.nc', a
trajectory_netcdf : str, optional
Trajectory netcdf file name. Default is 'traj.nc'.
atom_indices : list, optional
List of atom indices to load from trajectory. Default is None.
If None, all atoms will be loaded.
stride : int, optional
Stride to load the trajectory. Default is 1.
input_directory_path : str, optional
Input directory path. Default is None.
If None, the current working directory will be used.
Returns
-------
None
"""
import mdtraj

Expand All @@ -92,22 +88,31 @@ def load_traj(self, reference_pdb='solvated.pdb', trajectory_netcdf='traj.nc', a
# Load reference pdb (solvated system)
pdb = os.path.join(self.input_directory_path, reference_pdb)
ref_traj = mdtraj.load(pdb)

# Select atoms to load from trajectory
if atom_indices is None:
if self.atomSubset == 'all':
self.atom_indices = None
self.ref_traj = ref_traj
else:
self.atom_indices = []
mdtop = ref_traj.topology
res = [ r for r in mdtop.residues if r.name not in ('HOH', 'NA', 'CL', 'K') ]
if self.atomSubset == 'solute':
res = [ r for r in mdtop.residues if r.name not in ('HOH', 'NA', 'CL', 'K') ]
elif self.atomSubset == 'not water':
res = [ r for r in mdtop.residues if r.name not in ('HOH') ]
# Get atom indices
for r in res:
for a in r.atoms:
self.atom_indices.append(a.index)
else:
self.atom_indices = atom_indices
self.ref_traj = ref_traj.atom_slice(self.atom_indices)

self.ref_traj = ref_traj.atom_slice(self.atom_indices)

# Load trajectory
netcdf = os.path.join(self.input_directory_path, trajectory_netcdf)
traj = mdtraj.load(netcdf, top=self.ref_traj.topology, stride=stride)
self.traj = traj.atom_slice(self.atom_indices)
if self.atom_indices:
self.traj = traj.atom_slice(self.atom_indices)
else:
self.traj = traj


class RNASystem(BaseDataLoader):
Expand Down Expand Up @@ -170,15 +175,14 @@ def radian_to_degree(self, a):
return a


def compute_jcouplings(self, couplings=None, residues=None):
def compute_jcouplings(self, weights=None, couplings=None, residues=None):
"""Compute J-couplings from MD trajectory.
TODO
----
* Compute confidence interval.
Parameters
----------
weights : numpy.ndarray, optional
Weights to compute the J-couplings. Default is None.
couplings : str, optional
Name of the couplings to compute. Default is None.
If a list of couplings to be chosen from [H1H2,H2H3,H3H4,1H5P,2H5P,C4Pb,1H5H4,2H5H4,H3P,C4Pe,H1C2/4,H1C6/8]
Expand All @@ -204,7 +208,7 @@ def compute_jcouplings(self, couplings=None, residues=None):
"""
import barnaba as bb

_logger.info("Computing J-couplings from MD trajectory...")
_logger.info("Compute J-couplings from MD trajectory")

if couplings is not None:
# Check if the provided coupling names are valid
Expand All @@ -216,15 +220,32 @@ def compute_jcouplings(self, couplings=None, residues=None):
# residue_list: list of M nucleobases
values, resname_list = bb.jcouplings_traj(self.traj, couplings=couplings, residues=residues)

# Convert numpy.float to float to avoid serialization issues
replace_nan_with_none = lambda x: None if np.isscalar(x) and np.isnan(x) else x.item()

# Loop over residues and couplings to store the computed values
coupling_dict = dict()
for i, resname in enumerate(resname_list):
_values = values[:,i,:] # Coupling values of i-th residue
values_by_names = dict()
for j, coupling_name in enumerate(couplings):
avg = _values[:,j].mean() # Mean value of H1H2 coupling of i-th residue
std = _values[:,j].std() # Standard deviation of H1H2 coupling of i-th residue
values_by_names[coupling_name] = {'avg': avg, 'std': std}
avg_raw = np.round(_values[:,j].mean(), 5) # e.g. mean value of H1H2 coupling of i-th residue
std_raw = np.round(_values[:,j].std(), 5) # e.g. standard deviation of H1H2 coupling of i-th residue
avg_raw = replace_nan_with_none(avg_raw)
std_raw = replace_nan_with_none(std_raw)
if weights is not None:
arr = _values[:,j] * weights
#_logger.info(f'non-weighted: {_values[:,j]}')
#_logger.info(f'weights: {weights}')
#_logger.info(f'weighted: {arr}')
avg = np.round(arr.mean(), 5)
std = np.round(arr.std(), 5)
avg = replace_nan_with_none(avg)
std = replace_nan_with_none(std)
else:
avg = avg_raw
std = std_raw
values_by_names[coupling_name] = {'avg': avg, 'std': std, 'avg_raw': avg_raw, 'std_raw': std_raw}
coupling_dict[resname] = values_by_names

return coupling_dict
Expand All @@ -242,4 +263,18 @@ def get_available_couplings(self):
import barnaba as bb
available_coupling_names = list(bb.definitions.couplings_idx.keys())
return available_coupling_names



#
# Future work?
#
class ProteinSystem(BaseDataLoader):
def __init__(self, **kwargs):
super(ProteinSystem, self).__init__(**kwargs)
raise NotImplementedError("ProteinSystem class is not implemented yet.")


class ProteinLigandSystem(BaseDataLoader):
def __init__(self, **kwargs):
super(ProteinLigandSystem, self).__init__(**kwargs)
raise NotImplementedError("ProteinLigandSystem class is not implemented yet.")
Loading

0 comments on commit 0284c10

Please sign in to comment.