Update fep with alchemlyb and multiple edr files input (#132)

* Add eneconv to combine multiple edr files * use glob to gather all files * add dgdl_total_edr * add alchemlyb dependece * add collect and analyze functions for alchemlyb estimators * add function to convert a energy from kBT to kJ/mol * add test for kBT_to_kJ * Add error calculation for BAR method. fix unit problem of VDW and Coulomb components * add test for analyze_alchemlyb * add pandas dependece * remove duplicate check for method * update SI test with consevative=True * kBT_to_kJ() now always returns a copy * return a copy instead of changing it * add test for start stop stride Co-authored-by: Oliver Beckstein <[email protected]>
Becksteinlab · Mar 20, 2021 · 97f0cf6 · 97f0cf6
1 parent 544e338
commit 97f0cf6
Show file tree

Hide file tree

Showing 4 changed files with 298 additions and 10 deletions.
diff --git a/mdpow/fep.py b/mdpow/fep.py
@@ -129,18 +129,26 @@
 import warnings
 
 import numpy
+import pandas as pd
 
 import scipy.integrate
+from scipy import constants
 import numkit.integration
 import numkit.timeseries
 
+from alchemlyb.parsing.gmx import extract_dHdl, extract_u_nk
+from alchemlyb.estimators import TI, BAR, MBAR
+from alchemlyb.parsing.gmx import _extract_dataframe
+from alchemlyb.preprocessing.subsampling import statistical_inefficiency
+
 import gromacs, gromacs.utilities
 try:
     import gromacs.setup
 except (ImportError, OSError):
     raise ImportError("Gromacs installation not found, source GMXRC?")
 from gromacs.utilities import asiterable, AttributeDict, in_dir, openany
 from numkit.observables import QuantityWithError
+from glob import glob
 
 import logging
 logger = logging.getLogger('mdpow.fep')
@@ -168,6 +176,10 @@ def kJ_to_kcal(x):
     """Convert a energy in kJ to kcal."""
     return x / 4.184
 
+def kBT_to_kJ(x, T):
+    """Convert a energy in kBT to kJ/mol."""
+    return x * constants.N_A*constants.k*T*1e-3
+
 
 class FEPschedule(AttributeDict):
     """Describe mdp parameter choices as key - value pairs.
@@ -289,6 +301,12 @@ class Gsolv(Journalled):
     #: :meth:`Simulation.get_protocol` and :class:`restart.Journal`
     protocols = ["setup", "fep_run"]
 
+    #: Estimators in alchemlyb
+    estimators = {'TI': {'extract': extract_dHdl, 'estimator': TI},
+                  'BAR': {'extract': extract_u_nk, 'estimator': BAR},
+                  'MBAR': {'extract': extract_u_nk, 'estimator': MBAR}
+                  }
+
     # TODO: initialize from default cfg
     schedules_default = {'coulomb':
                          FEPschedule(name='coulomb',
@@ -373,6 +391,13 @@ def __init__(self, molecule=None, top=None, struct=None, method="BAR", **kwargs)
                the value set in a loaded pickle file. [``False``]
            *stride*
                collect every *stride* data line, see :meth:`Gsolv.collect` [1]
+           *start*
+               Start frame of data analyzed in every fep window.
+           *stop*
+               Stop frame of data analyzed in every fep window.
+           *SI*
+               Set to ``True`` if you want to perform statistical inefficiency
+               to preprocess the data.
            *kwargs*
                other undocumented arguments (see source for the moment)
 
@@ -461,6 +486,9 @@ def __init__(self, molecule=None, top=None, struct=None, method="BAR", **kwargs)
 
             # for analysis
             self.stride = kwargs.pop('stride', 1)
+            self.start = kwargs.pop('start', 0)
+            self.stop = kwargs.pop('stop', None)
+            self.SI = kwargs.pop('SI', False)
 
             # other variables
             #: Results from the analysis
@@ -689,11 +717,12 @@ def dgdl_edr(self, *args):
                  could be found
 
          """
-        fn = os.path.join(*args + (self.deffnm + '.edr',))
-        if not os.path.exists(fn):
-            logger.error("Missing dgdl.edr file %(fn)r.", vars())
-            raise IOError(errno.ENOENT, "Missing dgdl.edr file", fn)
-        return fn
+        pattern = os.path.join(*args + (self.deffnm + '*.edr',))
+        edrs = glob(pattern)
+        if not edrs:
+                    logger.error("Missing dgdl.edr file %(pattern)r.", vars())
+                    raise IOError(errno.ENOENT, "Missing dgdl.edr file", pattern)
+        return [os.path.abspath(i) for i in edrs]
 
     def dgdl_tpr(self, *args):
         """Return filename of the dgdl TPR file.
@@ -715,6 +744,24 @@ def dgdl_tpr(self, *args):
             raise IOError(errno.ENOENT, "Missing TPR file", fn)
         return fn
 
+    def dgdl_total_edr(self, *args, **kwargs):
+        """Return filename of the combined dgdl EDR file.
+
+        :Arguments:
+           *args*
+               joins the arguments into a path and adds the default
+               filename for the dvdl file
+
+        :Keywords:
+           *total_edr_name*
+               Name of the user defined total edr file.
+
+        :Returns: path to total EDR
+
+        """
+        total_edr_name = kwargs.get("total_edr_name", "total.edr")
+        fn = os.path.join(*args + (total_edr_name,))
+        return fn
 
     def convert_edr(self):
         """Convert EDR files to compressed XVG files."""
@@ -725,10 +772,13 @@ def convert_edr(self):
             edr_files = [self.dgdl_edr(self.wdir(component, l)) for l in lambdas]
             tpr_files = [self.dgdl_tpr(self.wdir(component, l)) for l in lambdas]
             for tpr, edr in zip(tpr_files, edr_files):
-                deffnm = os.path.splitext(edr)[0]
-                xvgfile = deffnm + ".xvg"  # hack
-                logger.info("  {0} --> {1}".format(edr, xvgfile))
-                gromacs.g_energy(s=tpr, f=edr, odh=xvgfile)
+                dirct = os.path.abspath(os.path.dirname(tpr))
+                total_edr = self.dgdl_total_edr(dirct)
+                logger.info("  {0} --> {1}".format('edrs', total_edr))
+                gromacs.eneconv(f=edr, o=total_edr)
+                xvgfile = os.path.join(dirct, self.deffnm + ".xvg")  # hack
+                logger.info("  {0} --> {1}".format(total_edr, xvgfile))
+                gromacs.g_energy(s=tpr, f=total_edr, odh=xvgfile)
 
     def collect(self, stride=None, autosave=True, autocompress=True):
         """Collect dV/dl from output.
@@ -953,6 +1003,87 @@ def analyze(self, force=False, stride=None, autosave=True, ncorrel=25000):
         self.logger_DeltaA0()
         return self.results.DeltaA.Gibbs
 
+    def collect_alchemlyb(self, start=0, stop=None, stride=None, autosave=True, autocompress=True):
+        extract = self.estimators[self.method]['extract']
+
+        if autocompress:
+            # must be done before adding to results.xvg or we will not find the file later
+            self.compress_dgdl_xvg()
+
+        logger.info("[%(dirname)s] Finding dgdl xvg files, reading with "
+                    "stride=%(stride)d permissive=%(permissive)r." % vars(self))
+        for component, lambdas in self.lambdas.items():
+            val = []
+            for l in lambdas:
+                xvg_file = self.dgdl_xvg(self.wdir(component, l))
+                xvg_df = extract(xvg_file, T=self.Temperature).iloc[start:stop:stride]
+                if self.SI:
+                    ts = _extract_dataframe(xvg_file).iloc[start:stop:stride]
+                    ts = pd.DataFrame({'time': ts.iloc[:,0], 'dhdl': ts.iloc[:,1]})
+                    ts = ts.set_index('time')
+                    xvg_df = statistical_inefficiency(xvg_df, series=ts, conservative=True)
+                val.append(xvg_df)
+            self.results.xvg[component] = (numpy.array(lambdas), pd.concat(val))
+
+        if autosave:
+            self.save()
+
+    def analyze_alchemlyb(self, start=0, stop=None, stride=None, force=False, autosave=True):
+        stride = stride or self.stride
+        start = start or self.start
+        stop = stop or self.stop
+
+        if self.method in ['TI', 'BAR', 'MBAR']:
+            estimator = self.estimators[self.method]['estimator']
+        else:
+            errmsg = "The method is not supported."
+            logger.error(errmsg)
+            raise ValueError(errmsg)
+
+        if force or not self.has_dVdl():
+            try:
+                self.collect_alchemlyb(start, stop, stride, autosave=False)
+            except IOError as err:
+                if err.errno == errno.ENOENT:
+                    self.convert_edr()
+                    self.collect_alchemlyb(start, stop, stride, autosave=False)
+                else:
+                    logger.exception()
+                    raise
+        else:
+            logger.info("Analyzing stored data.")
+
+        # total free energy difference at const P (all simulations are done in NPT)
+        GibbsFreeEnergy = QuantityWithError(0,0)
+
+        for component, (lambdas, xvgs) in self.results.xvg.items():
+            result = estimator().fit(xvgs)
+            if self.method == 'BAR':
+                DeltaA = QuantityWithError(0,0)
+                a_s= numpy.diagonal(result.delta_f_, offset=1)
+                da_s = numpy.diagonal(result.d_delta_f_, offset=1)
+                for a, da in zip(a_s, da_s):
+                    DeltaA += QuantityWithError(a, da)
+                self.results.DeltaA[component] = kBT_to_kJ(DeltaA, self.Temperature)
+            else:
+                a = result.delta_f_.loc[0.00, 1.00]
+                da = result.d_delta_f_.loc[0.00, 1.00]
+                self.results.DeltaA[component] = kBT_to_kJ(QuantityWithError(a, da), self.Temperature)
+            GibbsFreeEnergy += self.results.DeltaA[component]  # error propagation is automagic!
+
+        # hydration free energy Delta A = -(Delta A_coul + Delta A_vdw)
+        GibbsFreeEnergy *= -1
+        self.results.DeltaA.Gibbs = GibbsFreeEnergy
+
+        if autosave:
+            self.save()
+
+        self.logger_DeltaA0()
+        return self.results.DeltaA.Gibbs
+
+        if autosave:
+            self.save()
+
     def write_DeltaA0(self, filename, mode='w'):
         """Write free energy components to a file.
 
@@ -1087,6 +1218,7 @@ def choose_script_from(scripts):
     def __repr__(self):
         return "%s(filename=%r)" % (self.__class__.__name__, self.filename)
 
+
 class Ghyd(Gsolv):
     """Sets up and analyses MD to obtain the hydration free energy of a solute."""
     solvent_default = "water"
@@ -1138,7 +1270,7 @@ class Gwoct(Goct):
     """
     solvent_default = "wetoctanol"
     dirname_default = os.path.join(Gsolv.topdir_default, solvent_default)
-    
+
 
 def p_transfer(G1, G2, **kwargs):
     """Compute partition coefficient from two :class:`Gsolv` objects.

diff --git a/mdpow/tests/test_analysis_alchemlyb.py b/mdpow/tests/test_analysis_alchemlyb.py
@@ -0,0 +1,149 @@
+import os.path
+import pytest
+import py.path
+
+import yaml
+import pybol
+
+from numpy.testing import assert_array_almost_equal
+
+from six.moves import cPickle as pickle
+
+import mdpow.fep
+
+from pkg_resources import resource_filename
+RESOURCES = py.path.local(resource_filename(__name__, 'testing_resources'))
+MANIFEST = RESOURCES.join("manifest.yml")
+
+def fix_manifest(topdir):
+    """Create a temporary manifest with a custom `path`.
+
+    Fix manifest in a local temporary copy in existing dir topdir
+    where the `path` is an absolute path to our "states" directory. We
+    use `pkg_resources.resource_filename` to anchor the path.
+
+    Arguments
+    ---------
+    topdir : py.path.local
+        existing temporary directory (as provided by, for instance,
+        `pytest.tmpdir`)
+
+    Returns
+    -------
+    new_manifest : py.path.local
+        Path to the new manifest.
+
+    Example
+    -------
+    Use as ::
+
+        new_manifest = fix_manifest(tmpdir)
+        m = pybol.Manifest(new_manifest.strpath)
+
+    """
+    manifest = yaml.load(MANIFEST.open())
+    # simple heuristic: last element of the recorded manifest::path is the name
+    # of the states directory, typically 'states' (from .../testing_resources/states)
+    manifest['path'] = RESOURCES.join(os.path.basename(manifest['path'])).strpath
+    new_manifest = topdir.join("local_manifest.yml")
+    yaml.dump(manifest, stream=new_manifest.open("w"))
+    return new_manifest
+
+
+# session scope if read-only use
+
+@pytest.fixture(scope="function")
+def fep_benzene_directory(tmpdir_factory):
+    topdir = tmpdir_factory.mktemp('analysis')
+    m = pybol.Manifest(fix_manifest(topdir).strpath)
+    m.assemble('FEP', topdir.strpath)
+    return topdir.join("benzene")
+
+class TestAnalyze(object):
+    def get_Gsolv(self, pth):
+        gsolv = pth.join("FEP", "water", "Gsolv.fep")
+        G = pickle.load(gsolv.open())
+        # patch paths
+        G.basedir = pth.strpath
+        G.filename = gsolv.strpath
+        return G
+
+    @pytest.mark.parametrize('method, Gibbs, coulomb, vdw', [
+                ('TI',
+                 (-3.901068,  0.550272),
+                 (8.417035, 0.22289),
+                 (-4.515967,  0.50311)),
+                ('BAR',
+                 (-4.091241, 0.385413),
+                 (8.339705, 0.166802),
+                 (-4.248463, 0.347449)),
+                ('MBAR',
+                 (-6.793117,  0.475149),
+                 (8.241836, 0.219235),
+                 (-1.448719,  0.421548))
+                ])
+    def test_estimator_alchemlyb(self, fep_benzene_directory, method,
+                                 Gibbs, coulomb, vdw):
+        G = self.get_Gsolv(fep_benzene_directory)
+        G.SI = False
+        G.method = method
+        G.start = 0
+        G.stop = None
+        # ensure conversion EDR to XVG.bz2; if the fixture is session scoped
+        # then other workers will pick up these files. Make sure that only one
+        # runs convert because there is no file locking, if in doubt, make
+        # fep_benzene_directory locally scoped
+        G.convert_edr()
+        try:
+            G.analyze_alchemlyb(force=True, autosave=False)
+        except IOError as err:
+            raise AssertionError("Failed to convert edr to xvg: {0}: {1}".format(
+                err.strerror, err.filename))
+        DeltaA = G.results.DeltaA
+        assert_array_almost_equal(DeltaA.Gibbs.astuple(), Gibbs,
+                                  decimal=6)
+        assert_array_almost_equal(DeltaA.coulomb.astuple(), coulomb,
+                                  decimal=6)
+        assert_array_almost_equal(DeltaA.vdw.astuple(), vdw,
+                                  decimal=6)
+
+    def test_SI(self, fep_benzene_directory):
+        G = self.get_Gsolv(fep_benzene_directory)
+        G.SI = True
+        G.method = 'TI'
+        G.start = 0
+        G.stop = None
+        G.convert_edr()
+        try:
+            G.analyze_alchemlyb(force=True, autosave=False)
+        except IOError as err:
+            raise AssertionError("Failed to convert edr to xvg: {0}: {1}".format(
+                err.strerror, err.filename))
+        DeltaA = G.results.DeltaA
+        assert_array_almost_equal(DeltaA.Gibbs.astuple(), (-2.908885,  2.175976),
+                                  decimal=6)
+        assert_array_almost_equal(DeltaA.coulomb.astuple(), (7.755779, 0.531481),
+                                  decimal=6)
+        assert_array_almost_equal(DeltaA.vdw.astuple(), (-4.846894,  2.110071),
+                                  decimal=6)
+
+    def test_start_stop_stride(self, fep_benzene_directory):
+        G = self.get_Gsolv(fep_benzene_directory)
+        G.SI = False
+        G.method = 'TI'
+        G.start = 10
+        G.stride = 2
+        G.stop = 200
+        G.convert_edr()
+        try:
+            G.analyze_alchemlyb(force=True, autosave=False)
+        except IOError as err:
+            raise AssertionError("Failed to convert edr to xvg: {0}: {1}".format(
+                err.strerror, err.filename))
+        DeltaA = G.results.DeltaA
+        assert_array_almost_equal(DeltaA.Gibbs.astuple(), (-3.318109,  0.905128),
+                                  decimal=6)
+        assert_array_almost_equal(DeltaA.coulomb.astuple(), (8.146806, 0.348866),
+                                  decimal=6)
+        assert_array_almost_equal(DeltaA.vdw.astuple(), (-4.828696,  0.835195),
+                                  decimal=6)