Add statistical analysis and cumulative aggregation #37

Draft · wants to merge 14 commits into base: main
4 changes: 4 additions & 0 deletions apebench/__init__.py
@@ -20,6 +20,8 @@
from ._utils import (
    aggregate_gmean,
    check_for_nan,
    compute_pvalues_against_best,
    cumulative_aggregation,
    melt_data,
    melt_loss,
    melt_metrics,
@@ -57,4 +59,6 @@
"split_train",
"components",
"check_for_nan",
"cumulative_aggregation",
"compute_pvalues_against_best",
]
258 changes: 257 additions & 1 deletion apebench/_utils.py
@@ -1,8 +1,10 @@
from typing import Union
from typing import Literal, Union

import jax
import jax.numpy as jnp
import numpy as np
import pandas as pd
from scipy import stats
from scipy.stats import gmean

from ._base_scenario import BaseScenario
@@ -250,3 +252,257 @@ def check_for_nan(scene: BaseScenario):
del test_data_coarse
except NotImplementedError:
return


def cumulative_aggregation(
    df: pd.DataFrame,
    grouping_cols: Union[str, list[str]],
    rolling_col: str,
    agg_fn_name: str = "mean",
    prefix: str = "cum",
) -> pd.DataFrame:
    """
Apply a cumulative aggregation to a DataFrame. This can be used to
cumulatively aggregate a metric over time.

If you are only interested in the aggregation over some rows (but not
cumulatively), you can use the [`apebench.aggregate_gmean`][] function.

**Arguments:**

- `df`: The DataFrame to aggregate.
- `grouping_cols`: The columns to group by. Supply a list of column names
such that if `df.groupby(grouping_cols)` is called, the groups contain
only different time steps (or whatever you want to aggregate over).
- `rolling_col`: The column to aggregate. Usually, this is the name of the
metric in a long DataFrame.
- `agg_fn_name`: The aggregation function to use. Must be one of `"mean"`,
`"gmean"` (geometric mean), or `"sum"`.
- `prefix`: Prefix for the name of the added column; the result is stored in
`f"{prefix}{agg_fn_name}_{rolling_col}"`.

**Returns:**

- A DataFrame with the cumulatively aggregated columns added.

!!! example

Train a feedforward ConvNet to emulate advection, then display the mean
nRMSE rollout together with its cumulative mean and cumulative gmean. The
only column that varies across rows is the `"seed"` column, so we group by
that.

```python
import apebench
import matplotlib.pyplot as plt
import seaborn as sns

advection_scenario = apebench.scenarios.difficulty.Advection()

data, trained_net = advection_scenario(num_seeds=3)

metric_df = apebench.melt_metrics(data)

metric_df = apebench.cumulative_aggregation(
    metric_df, "seed", "mean_nRMSE", agg_fn_name="mean"
)
metric_df = apebench.cumulative_aggregation(
    metric_df, "seed", "mean_nRMSE", agg_fn_name="gmean"
)

fig, ax = plt.subplots()
sns.lineplot(
    data=metric_df, x="time_step",
    y="mean_nRMSE", ax=ax, label="original",
)
sns.lineplot(
    data=metric_df, x="time_step",
    y="cummean_mean_nRMSE", ax=ax, label="cummean",
)
sns.lineplot(
    data=metric_df, x="time_step",
    y="cumgmean_mean_nRMSE", ax=ax, label="cumgmean",
)
```

![Metric Rollout absolutely and cumulatively aggregated via mean and
gmean](https://github.com/user-attachments/assets/ba342f26-a6ef-4a67-8a2c-2b1d94ddfde1)
"""
    agg_fn = {
        "mean": np.mean,
        "gmean": stats.gmean,
        "sum": np.sum,
    }[agg_fn_name]
    return df.groupby(grouping_cols, observed=True, group_keys=False).apply(
        lambda x: x.assign(
            **{
                f"{prefix}{agg_fn_name}_{rolling_col}": x[rolling_col]
                .expanding()
                .apply(agg_fn)
            }
        )
    )
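
For intuition, here is a minimal, self-contained sketch of what the expanding
window produces (the toy DataFrame and its values are illustrative
assumptions, not part of this PR):

```python
import pandas as pd

# Toy long-format metric data: two seeds, three time steps each.
df = pd.DataFrame({
    "seed": [0, 0, 0, 1, 1, 1],
    "time_step": [1, 2, 3, 1, 2, 3],
    "mean_nRMSE": [0.1, 0.2, 0.3, 0.2, 0.4, 0.6],
})

# Mirrors cumulative_aggregation(df, "seed", "mean_nRMSE", "mean"):
# per seed, a running mean over the time steps seen so far.
out = df.groupby("seed", group_keys=False).apply(
    lambda x: x.assign(cummean_mean_nRMSE=x["mean_nRMSE"].expanding().mean())
)
print(out)
# seed 0 yields 0.1, 0.15, 0.2; seed 1 yields 0.2, 0.3, 0.4.
```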


def compute_pvalues_against_best(
    df: pd.DataFrame,
    grouping_cols: list[str],
    sorting_cols: list[str],
    value_col: str,
    performance_indicator="mean",
    alternative: Literal["two-sided", "less", "greater"] = "two-sided",
    equal_var: bool = True,
    pivot: bool = False,
):
    """
Perform a t-test of the best configuration in a group against all other
configurations and return the p-values. "Best" is defined as the
configuration with the lowest aggregated value (typically the mean).

In the returned DataFrame, the best configuration has a p-value of 1.0
against itself (or 0.5 in case `alternative` is not `"two-sided"`) and a
lower p-value against all other configurations. Only if the p-value against
another configuration is below the significance level (typically 0.05) can
the best configuration be considered significantly better than that
configuration.

**Arguments:**

- `df`: The DataFrame to compute the p-values for.
- `grouping_cols`: The columns to group by. Select the `grouping_cols` such
that when `df.groupby(grouping_cols + sorting_cols)` is called, each group
contains only the individual samples (e.g., across seeds) used for the
hypothesis test. For
example, if you want to compare network architecture and training
methodology for all time steps and investigated scenarios, then you would
use `grouping_cols = ["scenario", "time_step"]` together with
`sorting_cols = ["net", "train"]`.
- `sorting_cols`: The columns to sort by. Once the dataframe is grouped by
`grouping_cols`, the configuration out of all combinations in
`sorting_cols` with the lowest aggregated value (typically the mean) is
considered the best.
- `value_col`: The column to use for the hypothesis test. Typically, this is
the column with a test metric. (The default metric in APEBench is
`mean_nRMSE`.)
- `performance_indicator`: The aggregation used to determine the best
configuration. Typically, this is the mean, which is also what the
hypothesis test compares.
- `alternative`: The alternative hypothesis to test. Must be one of
`"two-sided"`, `"less"`, or `"greater"`.
- `equal_var`: Whether to assume equal variance in the two samples. (This is
a parameter of the t-test.)
- `pivot`: Whether to pivot the DataFrame to have the p-values in a matrix
form. This is useful for directly comparing across the `sorting_cols`.

See also [this SciPy
page](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.ttest_ind.html)
for more details on the t-test.

**Returns:**

- A DataFrame with the p-values of the hypothesis test. If `pivot` is
`True`, the DataFrame is pivoted such that the p-values are in matrix form.


!!! example

Test which architecture is significantly better when emulating advection
with `advection_gamma=2.0`.

```python
import apebench

CONFIGS = [
    {
        "scenario": "diff_adv",
        "task": "predict",
        "net": net,
        "train": "one",
        "start_seed": 0,
        "num_seeds": 10,
        "advection_gamma": 2.0,
    }
    for net in ["Conv;34;10;relu", "FNO;12;8;4;gelu", "UNet;12;2;relu"]
]

metric_df, _, _, _ = apebench.run_study_convenience(
    CONFIGS, "conv_vs_fno_vs_unet"
)

p_value_df_pivoted = apebench.compute_pvalues_against_best(
    metric_df, ["time_step"], ["net"], "mean_nRMSE", pivot=True
)

print(
    p_value_df_pivoted.query(
        "time_step in [1, 5, 10, 50, 100, 200]"
    ).round(4).to_markdown()
)
```

| time_step | Conv;34;10;relu | FNO;12;8;4;gelu | UNet;12;2;relu |
|------------:|------------------:|------------------:|-----------------:|
| 1 | 1 | 0.0017 | 0 |
| 5 | 1 | 0.0013 | 0 |
| 10 | 1 | 0.0007 | 0 |
| 50 | 1 | 0.0001 | 0.0457 |
| 100 | 1 | 0 | 0.0601 |
| 200 | 1 | 0 | 0.0479 |

We can also visualize the p-values over time:
```python
import seaborn as sns
import matplotlib.pyplot as plt

p_value_df = apebench.compute_pvalues_against_best(
    metric_df, ["time_step"], ["net"], "mean_nRMSE", "mean"
)

sns.lineplot(
    p_value_df,
    x="time_step",
    y="p_value",
    hue="net",
)
plt.hlines(0.05, 0, 200, linestyles="--", colors="black")

plt.yscale("log")
```
![pvalue rollout over time](https://github.com/user-attachments/assets/4e99ed9b-735f-499c-8374-a7d9d94d1aca)

Since the p-value of the ConvNet (the best architecture according to its
mean performance) against the FNO stays below 0.05 throughout, we can
confidently say the ConvNet is significantly better than the FNO. However,
the p-value against the UNet rises above 0.05 after ~50 time steps, so for
that temporal horizon we cannot reject the null hypothesis.
"""
    stats_df = (
        df.groupby(grouping_cols + sorting_cols, observed=True, group_keys=True)
        .agg(
            performance_indicator=(value_col, performance_indicator),
            mean=(value_col, "mean"),
            std=(value_col, "std"),
            count=(value_col, "count"),
        )
        .reset_index()
    )

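    # Second pass: within each group, locate the row with the lowest
    # performance indicator ("best") and t-test it against all rows at once;
    # `ttest_ind_from_stats` broadcasts over the arrays, so the best
    # configuration is also tested against itself.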
    stats_df = stats_df.groupby(grouping_cols, observed=True, group_keys=False).apply(
        lambda x: x.assign(
            p_value=stats.ttest_ind_from_stats(
                mean1=x["mean"].values[x["performance_indicator"].argmin()],
                std1=x["std"].values[x["performance_indicator"].argmin()],
                nobs1=x["count"].values[x["performance_indicator"].argmin()],
                mean2=x["mean"].values,
                std2=x["std"].values,
                nobs2=x["count"].values,
                alternative=alternative,
                equal_var=equal_var,
            ).pvalue
        )
    )

    if not pivot:
        return stats_df

    p_value_df = stats_df.pivot(
        index=grouping_cols,
        columns=sorting_cols,
        values="p_value",
    )

    return p_value_df
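
A minimal, self-contained sketch of the same computation on synthetic data
(the configuration labels, effect sizes, and sample counts are illustrative
assumptions, not results from this PR):

```python
import numpy as np
import pandas as pd
from scipy import stats

rng = np.random.default_rng(0)

# Three configurations with ten samples each; "a" is genuinely best.
df = pd.DataFrame(
    {
        "net": np.repeat(["a", "b", "c"], 10),
        "mean_nRMSE": np.concatenate(
            [rng.normal(loc, 0.01, 10) for loc in (0.10, 0.12, 0.20)]
        ),
    }
)

# The core of compute_pvalues_against_best, spelled out for one group:
g = df.groupby("net")["mean_nRMSE"].agg(["mean", "std", "count"])
best = g["mean"].idxmin()
p_values = stats.ttest_ind_from_stats(
    mean1=g.loc[best, "mean"],
    std1=g.loc[best, "std"],
    nobs1=g.loc[best, "count"],
    mean2=g["mean"].values,
    std2=g["std"].values,
    nobs2=g["count"].values,
).pvalue
print(dict(zip(g.index, p_values)))
# "a" gets p = 1.0 against itself and tiny p-values against "b" and "c".
```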
10 changes: 9 additions & 1 deletion docs/api/postprocess/dataframe_manipulation.md
@@ -12,4 +12,12 @@

---

::: apebench.relative_by_config
::: apebench.cumulative_aggregation

---

::: apebench.relative_by_config

---

::: apebench.compute_pvalues_against_best