diff --git a/docs/preprocessing/alchemlyb.preprocessing.subsampling.rst b/docs/preprocessing/alchemlyb.preprocessing.subsampling.rst index bbc41dd1..b36b581c 100644 --- a/docs/preprocessing/alchemlyb.preprocessing.subsampling.rst +++ b/docs/preprocessing/alchemlyb.preprocessing.subsampling.rst @@ -1,3 +1,5 @@ +.. currentmodule:: alchemlyb.preprocessing.subsampling + .. _subsampling: Subsampling @@ -5,7 +7,262 @@ Subsampling .. automodule:: alchemlyb.preprocessing.subsampling -The functions featured in this module can be used to easily subsample either :ref:`dHdl ` or :ref:`u_nk ` datasets to give less correlated timeseries. +The functions featured in this module can be used to easily subsample either :math:`\frac{dH}{d\lambda}` (:ref:`dHdl `) or :math:`u_{nk}` (:ref:`u_nk `) datasets to give uncorrelated timeseries. + +Each of these functions splits the dataset into groups based on :math:`\lambda` index values, with each group featuring all samples with the same :math:`\lambda` values. +The function then performs its core operation (described below) on each group individually, with samples sorted according to the outermost ``time`` index beforehand. +Each of these groups therefore functions as an individual *timeseries* being subsampled. +The resulting :py:class:`pandas.DataFrame` is the concatenation of all groups after they have been subsampled. + + +Slicing +------- +The :func:`~alchemlyb.preprocessing.subsampling.slicing` function only performs slicing of the dataset on the basis of ``time``. +The ``lower`` and ``upper`` keyword specify the lower and upper bounds, *inclusive*, while ``step`` indicates the degree to which rows are skipped (e.g. ``step=2`` means to keep every other sample within the bounds). + +For example, if we have:: + + >>> u_nk + 0.00 0.25 0.50 0.75 1.00 + time fep-lambda + 0.0 0.0 0.309323 3.656838 7.004353 10.351868 13.699383 + 10.0 0.0 0.308844 2.616688 4.924532 7.232376 9.540220 + 20.0 0.0 0.300940 1.626739 2.952538 4.278337 5.604136 + 30.0 0.0 0.309712 1.579647 2.849583 4.119518 5.389454 + 40.0 0.0 0.299979 2.255387 4.210794 6.166202 8.121610 + ... ... ... ... ... ... + 39960.0 1.0 3.913594 3.011197 2.108801 1.206405 0.304009 + 39970.0 1.0 -0.365724 -0.197390 -0.029055 0.139279 0.307614 + 39980.0 1.0 1.495407 1.199280 0.903152 0.607024 0.310897 + 39990.0 1.0 1.578606 1.260235 0.941863 0.623492 0.305121 + 40000.0 1.0 0.715197 0.611187 0.507178 0.403169 0.299160 + + [20005 rows x 5 columns] + +Then we can grab all samples between :math:`t = 35.0` and :math:`t = 200.0`, inclusive:: + + >>> slicing(u_nk, lower=35.0, upper=200.0) + 0.00 0.25 0.50 0.75 1.00 + time fep-lambda + 40.0 0.0 0.299979 2.255387 4.210794 6.166202 8.121610 + 50.0 0.0 0.301968 3.209507 6.117047 9.024587 11.932126 + 60.0 0.0 0.308315 2.284146 4.259977 6.235809 8.211640 + 70.0 0.0 0.311610 2.773057 5.234504 7.695950 10.157397 + 80.0 0.0 0.301432 1.397817 2.494203 3.590589 4.686975 + ... ... ... ... ... ... 
+    160.0   1.0     1.396968  1.122475  0.847982  0.573489  0.298995
+    170.0   1.0    -1.812027 -1.283715 -0.755404 -0.227092  0.301219
+    180.0   1.0     0.979355  0.810205  0.641054  0.471904  0.302754
+    190.0   1.0    -2.455231 -1.766201 -1.077171 -0.388141  0.300889
+    200.0   1.0     2.419113  1.890386  1.361659  0.832932  0.304205
+
+    [85 rows x 5 columns]
+
+
+In practice, this approach is not much different from directly using :py:attr:`pandas.DataFrame.loc`::
+
+    >>> lower, upper, step = (35.0, 200.0, 1)
+    >>> u_nk.loc[lower:upper:step]
+
+The :func:`~alchemlyb.preprocessing.subsampling.slicing` function, however, performs some additional checks,
+such as ensuring there are no duplicate time values in the dataset, which can happen if data from repeat simulations were concatenated together prior to use.
+It is generally advisable to apply subsampling functions to data from repeat simulations with overlapping ``time`` values *individually*, concatenating only afterward, just before use with an :ref:`estimator <estimators>`.
+
+
+.. _subsampling_statinef:
+
+Statistical Inefficiency
+------------------------
+The :func:`~alchemlyb.preprocessing.subsampling.statistical_inefficiency` function subsamples each timeseries in the dataset by its calculated *statistical inefficiency*, :math:`g`, defined as:
+
+.. math::
+
+    g \equiv (1 + 2\tau) \geq 1
+
+where :math:`\tau` is the autocorrelation time of the timeseries.
+:math:`g` therefore functions as the spacing between uncorrelated samples in the timeseries.
+The timeseries is sliced with :math:`\text{ceil}(g)` as its step (in the conservative case; see the API docs below).
+
+For example, if we have::
+
+    >>> u_nk
+                         0.00      0.25      0.50       0.75       1.00
+    time    fep-lambda
+    0.0     0.0      0.309323  3.656838  7.004353  10.351868  13.699383
+    10.0    0.0      0.308844  2.616688  4.924532   7.232376   9.540220
+    20.0    0.0      0.300940  1.626739  2.952538   4.278337   5.604136
+    30.0    0.0      0.309712  1.579647  2.849583   4.119518   5.389454
+    40.0    0.0      0.299979  2.255387  4.210794   6.166202   8.121610
+    ...                   ...       ...       ...        ...        ...
+    39960.0 1.0      3.913594  3.011197  2.108801   1.206405   0.304009
+    39970.0 1.0     -0.365724 -0.197390 -0.029055   0.139279   0.307614
+    39980.0 1.0      1.495407  1.199280  0.903152   0.607024   0.310897
+    39990.0 1.0      1.578606  1.260235  0.941863   0.623492   0.305121
+    40000.0 1.0      0.715197  0.611187  0.507178   0.403169   0.299160
+
+    [20005 rows x 5 columns]
+
+We can extract uncorrelated samples for each ``fep-lambda`` with::
+
+    >>> statistical_inefficiency(u_nk, how='right')
+                         0.00      0.25      0.50       0.75       1.00
+    time    fep-lambda
+    0.0     0.0      0.309323  3.656838  7.004353  10.351868  13.699383
+    20.0    0.0      0.300940  1.626739  2.952538   4.278337   5.604136
+    40.0    0.0      0.299979  2.255387  4.210794   6.166202   8.121610
+    60.0    0.0      0.308315  2.284146  4.259977   6.235809   8.211640
+    80.0    0.0      0.301432  1.397817  2.494203   3.590589   4.686975
+    ...                   ...       ...       ...        ...        ...
+    39920.0 1.0      3.175234  2.457369  1.739504   1.021640   0.303775
+    39940.0 1.0     -1.480193 -1.034104 -0.588015  -0.141925   0.304164
+    39960.0 1.0      3.913594  3.011197  2.108801   1.206405   0.304009
+    39980.0 1.0      1.495407  1.199280  0.903152   0.607024   0.310897
+    40000.0 1.0      0.715197  0.611187  0.507178   0.403169   0.299160
+
+    [12005 rows x 5 columns]
+
+The ``how`` parameter indicates the choice of observable for performing the calculation of :math:`g`.
+
+* For :math:`u_{nk}` datasets, the choice of ``'right'`` is recommended: the column immediately to the right of the column corresponding to the group's lambda index value is used as the observable.
+* For :math:`\frac{dH}{d\lambda}` datasets, the choice of ``'sum'`` is recommended: the columns are simply summed, and the resulting :py:class:`pandas.Series` is used as the observable. + +See the API documentation below on the possible values for ``how``, as well as more detailed explanations for each choice. + +It is also possible to choose a specific column, or an arbitrary :py:class:`pandas.Series` (with an *exactly* matching index to the dataset), as the basis for calculating :math:`g`:: + + >>> statistical_inefficiency(u_nk, column=0.75) + 0.00 0.25 0.50 0.75 1.00 + time fep-lambda + 0.0 0.0 0.309323 3.656838 7.004353 10.351868 13.699383 + 20.0 0.0 0.300940 1.626739 2.952538 4.278337 5.604136 + 40.0 0.0 0.299979 2.255387 4.210794 6.166202 8.121610 + 60.0 0.0 0.308315 2.284146 4.259977 6.235809 8.211640 + 80.0 0.0 0.301432 1.397817 2.494203 3.590589 4.686975 + ... ... ... ... ... ... + 39920.0 1.0 3.175234 2.457369 1.739504 1.021640 0.303775 + 39940.0 1.0 -1.480193 -1.034104 -0.588015 -0.141925 0.304164 + 39960.0 1.0 3.913594 3.011197 2.108801 1.206405 0.304009 + 39980.0 1.0 1.495407 1.199280 0.903152 0.607024 0.310897 + 40000.0 1.0 0.715197 0.611187 0.507178 0.403169 0.299160 + + [12005 rows x 5 columns] + + >>> statistical_inefficiency(u_nk, column=u_nk[0.75]) + 0.00 0.25 0.50 0.75 1.00 + time fep-lambda + 0.0 0.0 0.309323 3.656838 7.004353 10.351868 13.699383 + 20.0 0.0 0.300940 1.626739 2.952538 4.278337 5.604136 + 40.0 0.0 0.299979 2.255387 4.210794 6.166202 8.121610 + 60.0 0.0 0.308315 2.284146 4.259977 6.235809 8.211640 + 80.0 0.0 0.301432 1.397817 2.494203 3.590589 4.686975 + ... ... ... ... ... ... + 39920.0 1.0 3.175234 2.457369 1.739504 1.021640 0.303775 + 39940.0 1.0 -1.480193 -1.034104 -0.588015 -0.141925 0.304164 + 39960.0 1.0 3.913594 3.011197 2.108801 1.206405 0.304009 + 39980.0 1.0 1.495407 1.199280 0.903152 0.607024 0.310897 + 40000.0 1.0 0.715197 0.611187 0.507178 0.403169 0.299160 + + [12005 rows x 5 columns] + +See :py:func:`pymbar.timeseries.statisticalInefficiency` for more details; this is used internally by this subsampler. +Please reference the following if you use this function in your research: + + [1] J. D. Chodera, W. C. Swope, J. W. Pitera, C. Seok, and K. A. Dill. Use of the weighted histogram analysis method for the analysis of simulated and parallel tempering simulations. JCTC 3(1):26-41, 2007. + + +Equilibrium Detection +--------------------- +The :func:`~alchemlyb.preprocessing.subsampling.equilibrium_detection` function subsamples each timeseries in the dataset using the equilibration detection approach developed by John Chodera (see reference below). + +For each sorted timeseries in the dataset, and for each sample in each timeseries, the sample is treated as the starting point of the trajectory and the *statistical inefficiency*, :math:`g`, calculated. +Each of these starting points yields an *effective* number of uncorrelated samples, :math:`N_{\text{eff}}`, and the starting point with the greatest :math:`N_{\text{eff}}` is chosen as the start of the *equilibrated* region of the trajectory. + +The sorted timeseries is then subsampled by dropping the samples prior to the chosen starting point, then slicing the remaining samples with :math:`\text{ceil}(g)` as its step (in the conservative case, see the API docs below). 
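+
+Per timeseries, this is conceptually similar to the following sketch (an illustration only: the actual function additionally handles grouping by lambda values, optional pre-slicing with ``lower``/``upper``/``step``, and the choice of observable via ``how`` or ``column``; here ``series`` stands for a single group's chosen observable as a :py:class:`pandas.Series`)::
+
+    >>> from pymbar import timeseries
+    >>> # find the start of the equilibrated region and the statistical inefficiency g
+    >>> t, g, Neff_max = timeseries.detectEquilibration(series.values)
+    >>> equilibrated = series.iloc[t:]
+    >>> # indices of (approximately) uncorrelated samples in the equilibrated region
+    >>> indices = timeseries.subsampleCorrelatedData(equilibrated, g=g, conservative=True)
+    >>> uncorrelated = equilibrated.iloc[indices]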
+ +For example, if we have:: + + >>> u_nk + 0.00 0.25 0.50 0.75 1.00 + time fep-lambda + 0.0 0.0 0.309323 3.656838 7.004353 10.351868 13.699383 + 10.0 0.0 0.308844 2.616688 4.924532 7.232376 9.540220 + 20.0 0.0 0.300940 1.626739 2.952538 4.278337 5.604136 + 30.0 0.0 0.309712 1.579647 2.849583 4.119518 5.389454 + 40.0 0.0 0.299979 2.255387 4.210794 6.166202 8.121610 + ... ... ... ... ... ... + 39960.0 1.0 3.913594 3.011197 2.108801 1.206405 0.304009 + 39970.0 1.0 -0.365724 -0.197390 -0.029055 0.139279 0.307614 + 39980.0 1.0 1.495407 1.199280 0.903152 0.607024 0.310897 + 39990.0 1.0 1.578606 1.260235 0.941863 0.623492 0.305121 + 40000.0 1.0 0.715197 0.611187 0.507178 0.403169 0.299160 + + [20005 rows x 5 columns] + +We can extract uncorrelated samples for each ``fep-lambda`` with:: + + >>> equilibrium_detection(u_nk, how='right') + 0.00 0.25 0.50 0.75 1.00 + time fep-lambda + 0.0 0.0 0.309323 3.656838 7.004353 10.351868 13.699383 + 20.0 0.0 0.300940 1.626739 2.952538 4.278337 5.604136 + 40.0 0.0 0.299979 2.255387 4.210794 6.166202 8.121610 + 60.0 0.0 0.308315 2.284146 4.259977 6.235809 8.211640 + 80.0 0.0 0.301432 1.397817 2.494203 3.590589 4.686975 + ... ... ... ... ... ... + 39820.0 1.0 5.238549 4.004569 2.770589 1.536609 0.302630 + 39840.0 1.0 -1.611068 -1.133603 -0.656139 -0.178674 0.298791 + 39860.0 1.0 0.052599 0.116262 0.179924 0.243587 0.307249 + 39880.0 1.0 1.312812 1.060874 0.808936 0.556998 0.305060 + 39900.0 1.0 6.940932 5.280870 3.620806 1.960743 0.300680 + + [11968 rows x 5 columns] + +The ``how`` parameter indicates the choice of observable for performing the calculation of :math:`g`. +See the brief recommendations for ``how`` in the :ref:`subsampling_statinef` section above, or the API documentation below on the possible values for ``how`` as well as more detailed explanations for each choice. + +It is also possible to choose a specific column, or an arbitrary :py:class:`pandas.Series` (with an *exactly* matching index to the dataset), as the basis for subsampling with equilibrium detection:: + + >>> equilibrium_detection(u_nk, column=0.75) + 0.00 0.25 0.50 0.75 1.00 + time fep-lambda + 0.0 0.0 0.309323 3.656838 7.004353 10.351868 13.699383 + 20.0 0.0 0.300940 1.626739 2.952538 4.278337 5.604136 + 40.0 0.0 0.299979 2.255387 4.210794 6.166202 8.121610 + 60.0 0.0 0.308315 2.284146 4.259977 6.235809 8.211640 + 80.0 0.0 0.301432 1.397817 2.494203 3.590589 4.686975 + ... ... ... ... ... ... + 39820.0 1.0 5.238549 4.004569 2.770589 1.536609 0.302630 + 39840.0 1.0 -1.611068 -1.133603 -0.656139 -0.178674 0.298791 + 39860.0 1.0 0.052599 0.116262 0.179924 0.243587 0.307249 + 39880.0 1.0 1.312812 1.060874 0.808936 0.556998 0.305060 + 39900.0 1.0 6.940932 5.280870 3.620806 1.960743 0.300680 + + [11961 rows x 5 columns] + + >>> equilibrium_detection(u_nk, column=u_nk[0.75]) + 0.00 0.25 0.50 0.75 1.00 + time fep-lambda + 0.0 0.0 0.309323 3.656838 7.004353 10.351868 13.699383 + 20.0 0.0 0.300940 1.626739 2.952538 4.278337 5.604136 + 40.0 0.0 0.299979 2.255387 4.210794 6.166202 8.121610 + 60.0 0.0 0.308315 2.284146 4.259977 6.235809 8.211640 + 80.0 0.0 0.301432 1.397817 2.494203 3.590589 4.686975 + ... ... ... ... ... ... 
+ 39820.0 1.0 5.238549 4.004569 2.770589 1.536609 0.302630 + 39840.0 1.0 -1.611068 -1.133603 -0.656139 -0.178674 0.298791 + 39860.0 1.0 0.052599 0.116262 0.179924 0.243587 0.307249 + 39880.0 1.0 1.312812 1.060874 0.808936 0.556998 0.305060 + 39900.0 1.0 6.940932 5.280870 3.620806 1.960743 0.300680 + + [11961 rows x 5 columns] + +See :py:func:`pymbar.timeseries.detectEquilibration` for more details; this is used internally by this subsampler. +Please reference the following if you use this function in your research: + + [1] John D. Chodera. A simple method for automated equilibration detection in molecular simulations. Journal of Chemical Theory and Computation, 12:1799, 2016. + + [2] J. D. Chodera, W. C. Swope, J. W. Pitera, C. Seok, and K. A. Dill. Use of the weighted histogram analysis method for the analysis of simulated and parallel tempering simulations. JCTC 3(1):26-41, 2007. + API Reference ------------- diff --git a/src/alchemlyb/preprocessing/subsampling.py b/src/alchemlyb/preprocessing/subsampling.py index 0b1a91a1..ea0230f5 100644 --- a/src/alchemlyb/preprocessing/subsampling.py +++ b/src/alchemlyb/preprocessing/subsampling.py @@ -1,26 +1,79 @@ """Functions for subsampling datasets. """ +import random + +from collections import defaultdict import numpy as np -from pymbar.timeseries import (statisticalInefficiency, - detectEquilibration, - subsampleCorrelatedData, ) +import pandas as pd +from pymbar import timeseries +from pymbar.utils import ParameterError + + +class CorrelationError(Exception): + pass + + +def _check_multiple_times(data): + return data.index.get_level_values(0).duplicated().any() + + +def _how_lr(name, group, direction): + + # first, get column index of column `name`, if it exists + # if it does exist, increment or decrement position by one based on + # `direction` + try: + pos = group.columns.get_loc(name) + except KeyError: + raise KeyError("No column with label '{}'".format(name)) + else: + if direction == 'right': + pos += 1 + elif direction == 'left': + pos -= 1 + else: + raise ValueError("`direction` must be either 'right' or 'left'") + + # handle cases where we are dealing with the leftmost column or rightmost + # column + if pos == -1: + pos += 2 + elif pos == len(group.columns): + pos -= 2 + elif (pos < -1) or (pos > len(group.columns)): + raise IndexError("Position of selected column is outside of all expected bounds") + + return group[group.columns[pos]] + + +def _how_random(name, group, tried=None): + + candidates = set(group.columns) - (set(tried) | {name}) + + if not candidates: + raise CorrelationError("No column in the dataset could be used" + " successfully for decorrelation") + else: + selection = random.choice(list(candidates)) + return group[selection], selection -def _check_multiple_times(df): - return df.sort_index(0).reset_index(0).duplicated('time').any() +def _how_sum(name, group): + return group.sum(axis=1) -def _check_sorted(df): - return df.reset_index(0)['time'].is_monotonic_increasing +def slicing(data, lower=None, upper=None, step=None, force=False): + """Subsample an alchemlyb DataFrame using slicing on the outermost index (time). -def slicing(df, lower=None, upper=None, step=None, force=False): - """Subsample a DataFrame using simple slicing. + Slicing will be performed separately on groups of rows corresponding to + each set of lambda values present in the DataFrame's index. Each group will + be sorted on the outermost (time) index prior to slicing. 
Parameters ---------- - df : DataFrame + data : DataFrame DataFrame to subsample. lower : float Lower time to slice from. @@ -34,63 +87,198 @@ def slicing(df, lower=None, upper=None, step=None, force=False): Returns ------- DataFrame - `df` subsampled. + `data` subsampled. """ - try: - df = df.loc[lower:upper:step] - except KeyError: - raise KeyError("DataFrame rows must be sorted by time, increasing.") - - if not force and _check_multiple_times(df): - raise KeyError("Duplicate time values found; it's generally advised " - "to use slicing on DataFrames with unique time values " - "for each row. Use `force=True` to ignore this error.") - - # drop any rows that have missing values - df = df.dropna() - - return df - - -def statistical_inefficiency(df, series=None, lower=None, upper=None, step=None, - conservative=True): - """Subsample a DataFrame based on the calculated statistical inefficiency - of a timeseries. - - If `series` is ``None``, then this function will behave the same as - :func:`slicing`. + # we always start with a full index sort on the whole dataframe + data = data.sort_index() + + index_names = list(data.index.names[1:]) + resdata = list() + + for name, group in data.groupby(level=index_names): + group_s = group.loc[lower:upper:step] + + if not force and _check_multiple_times(group_s): + raise KeyError("Duplicate time values found; it's generally advised " + "to use slicing on DataFrames with unique time values " + "for each row. Use `force=True` to ignore this error.") + + resdata.append(group_s) + + return pd.concat(resdata) + + +def _decorrelator(subsample, data, column=None, how=None, lower=None, + upper=None, step=None, conservative=True, return_calculated=False, + force=False, random_state=None): + + # input checks + if column is not None: + if isinstance(column, pd.Series): + if not (data.index == column.index).all(): + raise ValueError("The index of `column` must match the index of `data` exactly" + " to be used for decorrelation.") + elif column in data.columns: + pass + else: + raise ValueError("`column` must give either an existing column label in `data`" + " or a Series object with a matching index.") + elif how is not None: + pass + else: + raise ValueError("One of `column` or `how` must be set") + + np.random.seed(random_state) + + # we always start with a full index sort on the whole dataframe + # should produce a copy + data = data.sort_index() + + index_names = list(data.index.names[1:]) + resdata = list() + + if return_calculated: + calculated = defaultdict(dict) + + def random_selection(name, group): + tried = set() + while True: + group_c, selection = _how_random(name, group, tried=tried) + try: + indices, calculated = subsample(group_c, group) + except: + tried.add(selection) + else: + break + + return indices, calculated + + for name, group in data.groupby(level=index_names): + + if not force and _check_multiple_times(group): + raise KeyError("Duplicate time values found; decorrelation " + "is only meaningful for a single, contiguous, " + "and sorted timeseries") + + if column is not None: + if isinstance(column, pd.Series): + group_c = column.groupby(level=index_names).get_group(name) + indices, calc = subsample(group_c, group) + elif column in group.columns: + group_c = group[column] + indices, calc = subsample(group_c, group) + elif how is not None: + if (how == 'right') or (how == 'left'): + try: + group_c = _how_lr(name, group, how) + except KeyError: + indices, calc = random_selection(name, group) + else: + indices, calc = subsample(group_c, group) 
+ elif how == 'random': + indices, calc = random_selection(name, group) + elif how == 'sum': + group_c = _how_sum(name, group) + indices, calc = subsample(group_c, group) + else: + raise ValueError("`how` cannot be '{}';" + " see docstring for available options".format(how)) + + group_s = slicing(group, lower=lower, upper=upper, step=step) + resdata.append(group_s.iloc[indices]) + + if return_calculated: + calculated[name] = calc + + if return_calculated: + return pd.concat(resdata), pd.DataFrame(calculated) + else: + return pd.concat(resdata) + + +def statistical_inefficiency(data, column=None, how=None, lower=None, + upper=None, step=None, conservative=True, return_calculated=False, + force=False, random_state=None): + """Subsample an alchemlyb DataFrame based on the calculated statistical + inefficiency of one of its columns. + + Calculation of statistical inefficiency and subsequent subsampling will be + performed separately on groups of rows corresponding to each set of lambda + values present in the DataFrame's index. Each group will be sorted on the + outermost (time) index prior to any calculation. + + The `how` parameter determines the observable used for calculating the + correlations within each group of samples. The options are as follows: + + 'right' + The recommended setting for 'u_nk' datasets; the column immediately + to the right of the column corresponding to the group's lambda + index value is used. If there is no column to the right, then the + column to the left is used. If there is no column corresponding to + the group's lambda index value, then 'random' is used (see below). + 'left' + The opposite of the 'right' approach; the column immediately to the + left of the column corresponding to the group's lambda index value + is used. If there is no column to the left, then the column to the + right is used. If there is no column corresponding to the group's + lambda index value, then 'random' is used for that group (see below). + 'random' + A column is chosen uniformly at random from the set of columns + available in the group, with the column corresponding to the + group's lambda index value excluded, if present. If the correlation + calculation fails, then another column is tried. This process + continues until success or until all columns have been attempted + without success (in which case, ``CorrelationError`` is raised). + 'sum' + The recommended setting for 'dHdl' datasets; the columns are simply + summed, and the resulting `Series` is used. + + Specifying the 'column' parameter overrides the behavior of 'how'. This + allows the user to use a particular column or a specially-crafted `Series` + for correlation calculation. Parameters ---------- - df : DataFrame + data : DataFrame DataFrame to subsample according statistical inefficiency of `series`. - series : Series - Series to use for calculating statistical inefficiency. If ``None``, - no statistical inefficiency-based subsampling will be performed. + column : label or `pandas.Series` + Label of column to use for calculating statistical inefficiency. + Overrides `how`; can also take a `Series` object, but the index of the + `Series` *must* match that of `data` exactly. + how : {'right', 'left', 'random', 'sum'} + The approach used to choose the observable on which correlations are + calculated. See explanation above. lower : float - Lower bound to pre-slice `series` data from. + Lower time to pre-slice data from. upper : float - Upper bound to pre-slice `series` to (inclusive). 
+ Upper time to pre-slice data to (inclusive). step : int - Step between `series` items to pre-slice by. + Step between rows to pre-slice by. conservative : bool - ``True`` use ``ceil(statistical_inefficiency)`` to slice the data in uniform - intervals (the default). ``False`` will sample at non-uniform intervals to - closely match the (fractional) statistical_inefficieny, as implemented - in :func:`pymbar.timeseries.subsampleCorrelatedData`. + ``True`` use ``ceil(statistical_inefficiency)`` to slice the data in + uniform intervals (the default). ``False`` will sample at non-uniform + intervals to closely match the (fractional) statistical_inefficieny, as + implemented in :func:`pymbar.timeseries.subsampleCorrelatedData`. + return_calculated : bool + ``True`` return a tuple, with the second item a dict giving, e.g. `statinef` + for each group. + force : bool + Ignore checks that DataFrame is in proper form for expected behavior. + random_state : int + Integer between 0 and 2**32 -1 inclusive; fed to `numpy.random.seed`. + Running this function on the same data with a specific random seed will + produce the same result each time. Returns ------- DataFrame - `df` subsampled according to subsampled `series`. + `data` subsampled according to subsampled `column`. - Warning - ------- - The `series` and the data to be sliced, `df`, need to have the same number - of elements because the statistical inefficiency is calculated based on - the index of the series (and not an associated time). At the moment there is - no automatic conversion from a time to an index. + Raises + ------ + CorrelationError + If correlation removal fails irrecoverably. Note ---- @@ -102,101 +290,208 @@ def statistical_inefficiency(df, series=None, lower=None, upper=None, step=None, decreases a false sense of accuracy and is deemed the more careful and conservative approach. + References + ---------- + [1] J. D. Chodera, W. C. Swope, J. W. Pitera, C. Seok, and K. A. Dill. Use of the weighted + histogram analysis method for the analysis of simulated and parallel tempering simulations. + JCTC 3(1):26-41, 2007. + See Also -------- pymbar.timeseries.statisticalInefficiency : detailed background pymbar.timeseries.subsampleCorrelatedData : used for subsampling + .. versionchanged:: 0.4.0 + The ``series`` keyword was replaced with the ``column`` keyword. + The function no longer takes an arbitrary series as input for + calculating statistical inefficiency. + .. versionchanged:: 0.2.0 The ``conservative`` keyword was added and the method is now using - ``pymbar.timeseries.statisticalInefficiency()``; previously, the statistical - inefficiency was _rounded_ (instead of ``ceil()``) and thus one could - end up with correlated data. + ``pymbar.timeseries.subsampleCorrelatedData()``; previously, the + statistical inefficiency was _rounded_ (instead of ``ceil()``) and thus + one could end up with correlated data. 
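+
+    Examples
+    --------
+    A minimal usage sketch; here ``u_nk`` and ``dHdl`` are assumed to be
+    DataFrames obtained from the alchemlyb parsers:
+
+    >>> from alchemlyb.preprocessing import statistical_inefficiency
+    >>> u_nk_sub = statistical_inefficiency(u_nk, how='right')
+    >>> dHdl_sub = statistical_inefficiency(dHdl, how='sum')
+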
""" - if _check_multiple_times(df): - raise KeyError("Duplicate time values found; statistical inefficiency " - "only works on a single, contiguous, " - "and sorted timeseries.") - - if not _check_sorted(df): - raise KeyError("Statistical inefficiency only works as expected if " - "values are sorted by time, increasing.") - if series is not None: - series = slicing(series, lower=lower, upper=upper, step=step) + def subsample(group_c, group): + group_cs = slicing(group_c, lower=lower, upper=upper, step=step) - if (len(series) != len(df) or - not all(series.reset_index()['time'] == df.reset_index()['time'])): - raise ValueError("series and data must be sampled at the same times") - - # calculate statistical inefficiency of series (could use fft=True but needs test) - statinef = statisticalInefficiency(series, fast=False) + # calculate statistical inefficiency of column (could use fft=True but needs test) + statinef = timeseries.statisticalInefficiency(group_cs, fast=False) # use the subsampleCorrelatedData function to get the subsample index - indices = subsampleCorrelatedData(series, g=statinef, - conservative=conservative) - df = df.iloc[indices] - else: - df = slicing(df, lower=lower, upper=upper, step=step) - - return df - - -def equilibrium_detection(df, series=None, lower=None, upper=None, step=None): - """Subsample a DataFrame using automated equilibrium detection on a timeseries. - - If `series` is ``None``, then this function will behave the same as - :func:`slicing`. + indices = timeseries.subsampleCorrelatedData(group_cs, g=statinef, + conservative=conservative) + + calculated = dict() + calculated['statinef'] = statinef + + return indices, calculated + + return _decorrelator( + subsample=subsample, + data=data, + column=column, + how=how, + lower=lower, + upper=upper, + step=step, + conservative=conservative, + return_calculated=return_calculated, + force=force, + random_state=random_state) + + +def equilibrium_detection(data, column=None, how=None, lower=None, + upper=None, step=None, conservative=True, return_calculated=False, + force=False, random_state=None, nskip=1): + """Subsample a DataFrame using automated equilibrium detection on one of + its columns. + + Equilibrium detection and subsequent subsampling will be performed + separately on groups of rows corresponding to each set of lambda values + present in the DataFrame's index. Each group will be sorted on the + outermost (time) index prior to any calculation. + + The `how` parameter determines the observable used for calculating the + correlations within each group of samples. The options are as follows: + + 'right' + The recommended setting for 'u_nk' datasets; the column immediately + to the right of the column corresponding to the group's lambda + index value is used. If there is no column to the right, then the + column to the left is used. If there is no column corresponding to + the group's lambda index value, then 'random' is used (see below). + 'left' + The opposite of the 'right' approach; the column immediately to the + left of the column corresponding to the group's lambda index value + is used. If there is no column to the left, then the column to the + right is used. If there is no column corresponding to the group's + lambda index value, then 'random' is used for that group (see below). + 'random' + A column is chosen at random from the set of columns available in + the group. If the correlation calculation fails, then another + column is tried. 
This process continues until success or until all + columns have been attempted without success. + 'sum' + The recommended setting for 'dHdl' datasets; the columns are simply + summed, and the resulting `Series` is used. + + Specifying the 'column' parameter overrides the behavior of 'how'. This + allows the user to use a particular column or a specially-crafted `Series` + for correlation calculation. Parameters ---------- - df : DataFrame + data : DataFrame DataFrame to subsample according to equilibrium detection on `series`. - series : Series - Series to detect equilibration on. If ``None``, no equilibrium - detection-based subsampling will be performed. + column : label or `pandas.Series` + Label of column to use for calculating statistical inefficiency. + Overrides `how`; can also take a `Series` object, but the index of the + `Series` *must* match that of `data` exactly. + how : {'right', 'left', 'random', 'sum'} + The approach used to choose the observable on which correlations are + calculated. See explanation above. lower : float - Lower bound to pre-slice `series` data from. + Lower time to pre-slice data from. upper : float - Upper bound to pre-slice `series` to (inclusive). + Upper time to pre-slice data to (inclusive). step : int - Step between `series` items to pre-slice by. + Step between rows to pre-slice by. + conservative : bool + ``True`` use ``ceil(statistical_inefficiency)`` to slice the data in uniform + intervals (the default). ``False`` will sample at non-uniform intervals to + closely match the (fractional) statistical_inefficieny, as implemented + in :func:`pymbar.timeseries.subsampleCorrelatedData`. + return_calculated : bool + ``True`` return a tuple, with the second item a dict giving, e.g. `statinef` + for each group. + force : bool + Ignore checks that DataFrame is in proper form for expected behavior. + random_state : int + Integer between 0 and 2**32 -1 inclusive; fed to `numpy.random.seed`. + Running this function on the same data with a specific random seed will + produce the same result each time. + nskip : int + Number of samples to skip when walking through dataset; for long + trajectories, can offer substantial speedups at the cost of possibly + discarding slightly more data. Returns ------- DataFrame - `df` subsampled according to subsampled `series`. + `data` subsampled according to subsampled `column`. + + Note + ---- + For a non-integer statistical ineffciency :math:`g`, the default value + ``conservative=True`` will provide _fewer_ data points than allowed by + :math:`g` and thus error estimates will be _higher_. For large numbers of + data points and converged free energies, the choice should not make a + difference. For small numbers of data points, ``conservative=True`` + decreases a false sense of accuracy and is deemed the more careful and + conservative approach. + + References + ---------- + [1] J. D. Chodera, W. C. Swope, J. W. Pitera, C. Seok, and K. A. Dill. Use of the weighted + histogram analysis method for the analysis of simulated and parallel tempering simulations. + JCTC 3(1):26-41, 2007. + [2] John D. Chodera. A simple method for automated + equilibration detection in molecular simulations. + Journal of Chemical Theory and Computation, + 12:1799, 2016. 
See Also -------- pymbar.timeseries.detectEquilibration : detailed background + pymbar.timeseries.subsampleCorrelatedData : used for subsampling - """ - if _check_multiple_times(df): - raise KeyError("Duplicate time values found; equilibrium detection " - "is only meaningful for a single, contiguous, " - "and sorted timeseries.") - if not _check_sorted(df): - raise KeyError("Equilibrium detection only works as expected if " - "values are sorted by time, increasing.") + .. versionchanged:: 0.4.0 + The ``series`` keyword was replaced with the ``column`` keyword. + The function no longer takes an arbitrary series as input for + calculating statistical inefficiency. - if series is not None: - series = slicing(series, lower=lower, upper=upper, step=step) + .. versionchanged:: 0.4.0 + The ``conservative`` keyword was added and the method is now using + ``pymbar.timeseries.subsampleCorrelatedData()``; previously, the statistical + inefficiency was _rounded_ (instead of ``ceil()``) and thus one could + end up with correlated data. - # calculate statistical inefficiency of series, with equilibrium detection - t, statinef, Neff_max = detectEquilibration(series.values) + """ - # we round up - statinef = int(np.rint(statinef)) + def subsample(group_c, group): + group_cs = slicing(group_c, lower=lower, upper=upper, step=step) - # subsample according to statistical inefficiency - series = series.iloc[t::statinef] + # calculate statistical inefficiency of series, with equilibrium detection + t, statinef, Neff_max = timeseries.detectEquilibration(group_cs, nskip=nskip) - df = df.loc[series.index] - else: - df = slicing(df, lower=lower, upper=upper, step=step) + # only keep values past `t` + group_cs = group_cs.iloc[t:] - return df + # use the subsampleCorrelatedData function to get the subsample index + indices = timeseries.subsampleCorrelatedData(group_cs, g=statinef, + conservative=conservative) + + calculated = dict() + calculated['t'] = t + calculated['statinef'] = statinef + calculated['Neff_max'] = Neff_max + + return indices, calculated + + return _decorrelator( + subsample=subsample, + data=data, + column=column, + how=how, + lower=lower, + upper=upper, + step=step, + conservative=conservative, + return_calculated=return_calculated, + force=force, + random_state=random_state) diff --git a/src/alchemlyb/tests/test_fep_estimators.py b/src/alchemlyb/tests/test_fep_estimators.py index f33a72fd..d1a704bc 100644 --- a/src/alchemlyb/tests/test_fep_estimators.py +++ b/src/alchemlyb/tests/test_fep_estimators.py @@ -103,6 +103,7 @@ def namd_tyr2ala(): return u_nk + class FEPestimatorMixin: """Mixin for all FEP Estimator test classes. diff --git a/src/alchemlyb/tests/test_preprocessing.py b/src/alchemlyb/tests/test_preprocessing.py index 4e895c73..6c889691 100644 --- a/src/alchemlyb/tests/test_preprocessing.py +++ b/src/alchemlyb/tests/test_preprocessing.py @@ -10,44 +10,127 @@ from alchemlyb.preprocessing import (slicing, statistical_inefficiency, equilibrium_detection,) +from . import test_ti_estimators as tti +from . 
import test_fep_estimators as tfep + import alchemtest.gmx +@pytest.fixture(scope="module", + params = [(tti.gmx_benzene_coul_dHdl, "single"), + (tti.gmx_benzene_vdw_dHdl, "single"), + (tti.gmx_expanded_ensemble_case_1_dHdl, "single"), + (tti.gmx_expanded_ensemble_case_2_dHdl, "repeat"), + (tti.gmx_expanded_ensemble_case_3_dHdl, "repeat"), + (tti.gmx_water_particle_with_total_energy_dHdl, "single"), + (tti.gmx_water_particle_with_potential_energy_dHdl, "single"), + (tti.gmx_water_particle_without_energy_dHdl, "single"), + (tti.amber_simplesolvated_charge_dHdl, "single"), + (tti.amber_simplesolvated_vdw_dHdl, "single") + ], + ids = ["tti.gmx_benzene_coul_dHdl", + "tti.gmx_benzene_vdw_dHdl", + "tti.gmx_expanded_ensemble_case_1_dHdl", + "tti.gmx_expanded_ensemble_case_2_dHdl", + "tti.gmx_expanded_ensemble_case_3_dHdl", + "tti.gmx_water_particle_with_total_energy_dHdl", + "tti.gmx_water_particle_with_potential_energy_dHdl", + "tti.gmx_water_particle_without_energy_dHdl", + "tti.amber_simplesolvated_charge_dHdl", + "tti.amber_simplesolvated_vdw_dHdl", + ]) +def dHdl(request): + get_dHdl, nsims = request.param + return get_dHdl(), nsims + + +@pytest.fixture(scope="class", + params=[(tfep.gmx_benzene_coul_u_nk, "single"), + (tfep.gmx_benzene_vdw_u_nk, "single"), + (tfep.gmx_expanded_ensemble_case_1, "single"), + (tfep.gmx_expanded_ensemble_case_2, "repeat"), + (tfep.gmx_expanded_ensemble_case_3, "repeat"), + (tfep.gmx_water_particle_with_total_energy, "single"), + (tfep.gmx_water_particle_with_potential_energy, "single"), + (tfep.gmx_water_particle_without_energy, "single"), + (tfep.amber_bace_example_complex_vdw, "single"), + (tfep.gomc_benzene_u_nk, "single"), + ], + ids = ["tfep.gmx_benzene_coul_u_nk", + "tfep.gmx_benzene_vdw_u_nk", + "tfep.gmx_expanded_ensemble_case_1", + "tfep.gmx_expanded_ensemble_case_2", + "tfep.gmx_expanded_ensemble_case_3", + "tfep.gmx_water_particle_with_total_energy", + "tfep.gmx_water_particle_with_potential_energy", + "tfep.gmx_water_particle_without_energy", + "tfep.amber_bace_example_complex_vdw", + "tfep.gomc_benzene_u_nk", + ]) +def u_nk(request): + get_unk, nsims = request.param + return get_unk(), nsims + def gmx_benzene_dHdl(): dataset = alchemtest.gmx.load_benzene() - return gmx.extract_dHdl(dataset['data']['Coulomb'][0], T=300) + dHdl = gmx.extract_dHdl(dataset['data']['Coulomb'][0], T=300) + + return dHdl + + +def gmx_benzene_dHdl_duplicated(): + dataset = alchemtest.gmx.load_benzene() + df = gmx.extract_dHdl(dataset['data']['Coulomb'][0], T=300) + dHdl = pd.concat([df, df]) + + return dHdl def gmx_benzene_u_nk(): dataset = alchemtest.gmx.load_benzene() - return gmx.extract_u_nk(dataset['data']['Coulomb'][0], T=300) + u_nk = gmx.extract_u_nk(dataset['data']['Coulomb'][0], T=300) + + return u_nk + + +def gmx_benzene_u_nk_duplicated(): + dataset = alchemtest.gmx.load_benzene() + df = gmx.extract_u_nk(dataset['data']['Coulomb'][0], T=300) + u_nk = pd.concat([df, df]) + + return u_nk def gmx_benzene_dHdl_full(): dataset = alchemtest.gmx.load_benzene() - return pd.concat([gmx.extract_dHdl(i, T=300) for i in dataset['data']['Coulomb']]) + dHdl = pd.concat([gmx.extract_dHdl(i, T=300) for i in dataset['data']['Coulomb']]) + + return dHdl def gmx_benzene_u_nk_full(): dataset = alchemtest.gmx.load_benzene() - return pd.concat([gmx.extract_u_nk(i, T=300) for i in dataset['data']['Coulomb']]) + u_nk = pd.concat([gmx.extract_u_nk(i, T=300) for i in dataset['data']['Coulomb']]) + + return u_nk + class TestSlicing: """Test slicing functionality. 
""" - def slicer(self, *args, **kwargs): + def subsampler(self, *args, **kwargs): return slicing(*args, **kwargs) @pytest.mark.parametrize(('data', 'size'), [(gmx_benzene_dHdl(), 661), (gmx_benzene_u_nk(), 661)]) def test_basic_slicing(self, data, size): - assert len(self.slicer(data, lower=1000, upper=34000, step=5)) == size + assert len(self.subsampler(data, lower=1000, upper=34000, step=5)) == size @pytest.mark.parametrize('data', [gmx_benzene_dHdl(), gmx_benzene_u_nk()]) - def test_disordered_exception(self, data): - """Test that a shuffled DataFrame yields a KeyError. + def test_disordered(self, data): + """Test that a shuffled DataFrame yields same result as unshuffled. """ indices = np.arange(len(data)) @@ -55,72 +138,156 @@ def test_disordered_exception(self, data): df = data.iloc[indices] - with pytest.raises(KeyError): - self.slicer(df, lower=200) + assert (self.subsampler(df, lower=200) == self.subsampler(data, lower=200)).all().all() - @pytest.mark.parametrize('data', [gmx_benzene_dHdl_full(), - gmx_benzene_u_nk_full()]) + @pytest.mark.parametrize('data', [gmx_benzene_dHdl_duplicated(), + gmx_benzene_u_nk_duplicated()]) def test_duplicated_exception(self, data): - """Test that a DataFrame with duplicate times yields a KeyError. + """Test that a DataFrame with duplicate times for a lambda combination + yields a KeyError. """ with pytest.raises(KeyError): - self.slicer(data.sort_index(0), lower=200) + self.subsampler(data, lower=200) + + def test_slicing_dHdl(self, dHdl): + data, nsims = dHdl + + if nsims == "single": + dHdl_s = self.subsampler(data) + elif nsims == "repeat": + with pytest.raises(KeyError): + dHdl_s = self.subsampler(data) + + def test_slicing_u_nk(self, u_nk): + data, nsims = u_nk + + if nsims == "single": + u_nk_s = self.subsampler(data) + elif nsims == "repeat": + with pytest.raises(KeyError): + u_nk_s = self.subsampler(data) class CorrelatedPreprocessors: - @pytest.mark.parametrize(('data', 'size'), [(gmx_benzene_dHdl(), 4001), - (gmx_benzene_u_nk(), 4001)]) - def test_subsampling(self, data, size): + @pytest.mark.parametrize(('data', 'size', 'how'), + [(gmx_benzene_dHdl(), 4001, 'sum',), + (gmx_benzene_u_nk(), 4001, 'right')]) + def test_subsampling(self, data, size, how): """Basic test for execution; resulting size of dataset sensitive to machine and depends on algorithm. 
""" - assert len(self.slicer(data, series=data.iloc[:, 0])) <= size + assert len(self.subsampler(data, how=how)) <= size + + @pytest.mark.parametrize(('data', 'size', 'column'), + [(gmx_benzene_dHdl(), 20005, 0), + (gmx_benzene_u_nk(), 20005, 0)]) + def test_subsampling_column(self, data, size, column): + assert len(self.subsampler(data, column=data.columns[column])) <= size + + @pytest.mark.parametrize(('data', 'size', 'column'), + [(gmx_benzene_dHdl(), 20005, 0), + (gmx_benzene_u_nk(), 20005, 0)]) + def test_subsampling_series(self, data, size, column): + assert len(self.subsampler(data, column=data[data.columns[column]])) <= size + + def test_subsampling_dHdl(self, dHdl): + data, nsims = dHdl + + if nsims == "single": + dHdl_s = self.subsampler(data, how='sum') + assert len(dHdl_s) < len(data) + elif nsims == "repeat": + with pytest.raises(KeyError): + dHdl_s = self.subsampler(data, how='sum') + + def test_subsampling_u_nk(self, u_nk): + data, nsims = u_nk + + if nsims == "single": + u_nk_s = self.subsampler(data, how='right') + assert len(u_nk_s) < len(data) + elif nsims == "repeat": + with pytest.raises(KeyError): + u_nk_s = self.subsampler(data, how='right') + + def test_subsampling_u_nk_left(self, u_nk): + data, nsims = u_nk + + if nsims == "single": + u_nk_s = self.subsampler(data, how='left') + assert len(u_nk_s) < len(data) + elif nsims == "repeat": + with pytest.raises(KeyError): + u_nk_s = self.subsampler(data, how='left') + + def test_subsampling_u_nk_random(self, u_nk): + data, nsims = u_nk + + if nsims == "single": + u_nk_s = self.subsampler(data, how='random', random_state=42) + assert len(u_nk_s) < len(data) + elif nsims == "repeat": + with pytest.raises(KeyError): + u_nk_s = self.subsampler(data, how='random', random_state=42) + + +class TestStatisticalInefficiency(CorrelatedPreprocessors): + + def subsampler(self, *args, **kwargs): + return statistical_inefficiency(*args, **kwargs) - @pytest.mark.parametrize('data', [gmx_benzene_dHdl(), - gmx_benzene_u_nk()]) - def test_no_series(self, data): - """Check that we get the same result as simple slicing with no Series. 
+ @pytest.mark.parametrize(('conservative', 'data', 'size', 'how'), + [ + (True, gmx_benzene_dHdl(), 2001, 'sum'), # 0.00: g = 1.0559445620585415 + (True, gmx_benzene_u_nk(), 2001, 'right'), # 'fep': g = 1.0560203916559594 + (False, gmx_benzene_dHdl(), 3789, 'sum'), + (False, gmx_benzene_u_nk(), 3788, 'right'), + ]) + def test_conservative(self, data, size, conservative, how): + sliced = self.subsampler(data, how=how, conservative=conservative) + # results can vary slightly with different machines + # so possibly do + # delta = 10 + # assert size - delta < len(sliced) < size + delta + assert len(sliced) == size - """ - df_sub = self.slicer(data, lower=200, upper=5000, step=2) - df_sliced = slicing(data, lower=200, upper=5000, step=2) + @pytest.mark.parametrize(('data', 'size', 'how'), + [(gmx_benzene_dHdl(), 4001, 'sum',), + (gmx_benzene_u_nk(), 4001, 'right')]) + def test_subsampling_return_calc(self, data, size, how): + data_s, calculated = self.subsampler(data, how=how, return_calculated=True) - assert np.all((df_sub == df_sliced)) + assert all(i in calculated.index for i in ['statinef']) + assert len(calculated.columns) == len(data.groupby(level=list(data.index.names[1:]))) -class TestStatisticalInefficiency(TestSlicing, CorrelatedPreprocessors): +class TestEquilibriumDetection(CorrelatedPreprocessors): - def slicer(self, *args, **kwargs): - return statistical_inefficiency(*args, **kwargs) + def subsampler(self, *args, **kwargs): + return equilibrium_detection(*args, nskip=5, **kwargs) - @pytest.mark.parametrize(('conservative', 'data', 'size'), + @pytest.mark.parametrize(('conservative', 'data', 'size', 'how'), [ - (True, gmx_benzene_dHdl(), 2001), # 0.00: g = 1.0559445620585415 - (True, gmx_benzene_u_nk(), 2001), # 'fep': g = 1.0560203916559594 - (False, gmx_benzene_dHdl(), 3789), - (False, gmx_benzene_u_nk(), 3571), + (True, gmx_benzene_dHdl(), 2001, 'sum'), # 0.00: g = 1.0559445620585415 + (True, gmx_benzene_u_nk(), 2001, 'right'), # 'fep': g = 1.0560203916559594 + (False, gmx_benzene_dHdl(), 2844, 'sum'), + (False, gmx_benzene_u_nk(), 2845, 'right'), ]) - def test_conservative(self, data, size, conservative): - sliced = self.slicer(data, series=data.iloc[:, 0], conservative=conservative) + def test_conservative(self, data, size, conservative, how): + sliced = self.subsampler(data, how=how, conservative=conservative) # results can vary slightly with different machines # so possibly do # delta = 10 # assert size - delta < len(sliced) < size + delta assert len(sliced) == size - @pytest.mark.parametrize('series', [ - gmx_benzene_dHdl()['fep'][:20], # wrong length - gmx_benzene_dHdl()['fep'][::-1], # wrong time stamps (reversed) - ]) - def test_raise_ValueError_for_mismatched_data(self, series): - data = gmx_benzene_dHdl() - with pytest.raises(ValueError): - self.slicer(data, series=series) - - -class TestEquilibriumDetection(TestSlicing, CorrelatedPreprocessors): + @pytest.mark.parametrize(('data', 'size', 'how'), + [(gmx_benzene_dHdl(), 4001, 'sum',), + (gmx_benzene_u_nk(), 4001, 'right')]) + def test_subsampling_return_calc(self, data, size, how): + data_s, calculated = self.subsampler(data, how=how, return_calculated=True) - def slicer(self, *args, **kwargs): - return equilibrium_detection(*args, **kwargs) + assert all(i in calculated.index for i in ['t', 'statinef', 'Neff_max']) + assert len(calculated.columns) == len(data.groupby(level=list(data.index.names[1:])))
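+
+    @pytest.mark.parametrize('data', [gmx_benzene_dHdl(),
+                                      gmx_benzene_u_nk()])
+    def test_raise_ValueError_for_mismatched_column(self, data):
+        """Sketch of a possible replacement for the removed mismatched-series
+        test: a `column` Series whose index does not match `data` exactly
+        should raise ValueError.
+
+        """
+        # reversing the Series keeps the same values but breaks the exact index match
+        column = data[data.columns[0]].iloc[::-1]
+        with pytest.raises(ValueError):
+            self.subsampler(data, column=column)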