From 2e8035273e6d7533c801c4dee874f178bfda1541 Mon Sep 17 00:00:00 2001
From: Edoardo Conti
Date: Wed, 7 Jul 2021 15:59:27 -0400
Subject: [PATCH] Add bootstrapping to evaluation metrics (#72)

---
 ...aluating a new fraud policy with IPS.ipynb | 13 ++++--
 ... fraud policy with the direct method.ipynb | 13 ++++--
 ...policy with the doubly robust method.ipynb | 13 ++++--
 ope/methods/direct_method.py                  | 43 +++++++++++++++---
 ope/methods/doubly_robust.py                  | 45 ++++++++++++++++---
 ope/methods/inverse_propensity_scoring.py     | 35 +++++++++++++--
 ope/utils/__init__.py                         |  0
 ope/utils/stats.py                            | 14 ++++++
 setup.py                                      |  2 +-
 9 files changed, 153 insertions(+), 25 deletions(-)
 create mode 100644 ope/utils/__init__.py
 create mode 100644 ope/utils/stats.py

diff --git a/examples/1 - Evaluating a new fraud policy with IPS.ipynb b/examples/1 - Evaluating a new fraud policy with IPS.ipynb
index f559db2..c840436 100644
--- a/examples/1 - Evaluating a new fraud policy with IPS.ipynb
+++ b/examples/1 - Evaluating a new fraud policy with IPS.ipynb
@@ -281,7 +281,12 @@
     {
      "data": {
       "text/plain": [
-       "{'expected_reward_logging_policy': 2.5, 'expected_reward_new_policy': -37.5}"
+       "{'expected_reward_logging_policy': {'mean': 2.98,\n",
+       " 'ci_low': -11.92,\n",
+       " 'ci_high': 17.87},\n",
+       " 'expected_reward_new_policy': {'mean': -37.42,\n",
+       " 'ci_low': -122.51,\n",
+       " 'ci_high': 47.66}}"
       ]
      },
      "execution_count": 5,
@@ -290,7 +295,7 @@
     }
    ],
    "source": [
-    "inverse_propensity_scoring.evaluate(logs_df, action_probabilities)"
+    "inverse_propensity_scoring.evaluate(logs_df, action_probabilities, num_bootstrap_samples=100)"
    ]
   },
   {
@@ -298,7 +303,9 @@
    "id": "062ac2e2",
    "metadata": {},
    "source": [
-    "The expected reward per observation for the new policy is much worse than the logging policy (due to the observation that allowed fraud to go through (`row: 3`)) so we wouldn't roll out this new policy into an A/B test or production and instead should test some different policies offline."
+    "The expected reward per observation for the new policy is much worse than the logging policy (due to the observation that allowed fraud to go through in `row: 3`), so we wouldn't roll out this new policy into an A/B test or production and instead should test some different policies offline.\n",
+    "\n",
+    "However, the confidence intervals around the expected rewards for our old and new policies overlap. If we want to be really certain, it might be best to gather some more data to ensure the difference is signal and not noise. In this case, fortunately, we have strong reason to suspect the new policy is worse, but these confidence intervals can be important in cases where we have less prior certainty."
    ]
   }
  ],
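
As a quick illustration of how the new nested return shape from this notebook can be consumed downstream: the snippet below is not part of this patch, it assumes the `logs_df` and `action_probabilities` objects built earlier in the example notebook, and the interval-overlap check is only a rough heuristic.

from ope.methods import inverse_propensity_scoring

results = inverse_propensity_scoring.evaluate(
    logs_df, action_probabilities, num_bootstrap_samples=100
)
logging_policy = results["expected_reward_logging_policy"]
new_policy = results["expected_reward_new_policy"]

# Two intervals overlap when each lower bound sits below the other's upper bound.
intervals_overlap = (
    logging_policy["ci_low"] <= new_policy["ci_high"]
    and new_policy["ci_low"] <= logging_policy["ci_high"]
)
print(f"point estimates: {logging_policy['mean']} vs {new_policy['mean']}")
print(f"95% intervals overlap: {intervals_overlap}")
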
diff --git a/examples/2 - Evaluating a new fraud policy with the direct method.ipynb b/examples/2 - Evaluating a new fraud policy with the direct method.ipynb
index 44e9a7e..80704f2 100644
--- a/examples/2 - Evaluating a new fraud policy with the direct method.ipynb
+++ b/examples/2 - Evaluating a new fraud policy with the direct method.ipynb
@@ -191,7 +191,12 @@
     {
      "data": {
       "text/plain": [
-       "{'expected_reward_logging_policy': 3.33, 'expected_reward_new_policy': 3.16}"
+       "{'expected_reward_logging_policy': {'mean': 3.18,\n",
+       " 'ci_low': -8.98,\n",
+       " 'ci_high': 15.35},\n",
+       " 'expected_reward_new_policy': {'mean': 1.74,\n",
+       " 'ci_low': -15.14,\n",
+       " 'ci_high': 18.63}}"
       ]
      },
      "execution_count": 4,
@@ -200,7 +205,7 @@
     }
    ],
    "source": [
-    "direct_method.evaluate(logs_df, action_probabilities)"
+    "direct_method.evaluate(logs_df, action_probabilities, num_bootstrap_samples=100)"
    ]
   },
   {
@@ -208,7 +213,9 @@
    "id": "062ac2e2",
    "metadata": {},
    "source": [
-    "The direct method estimates that the expected reward per observation for the new policy is slightly worse than the logging policy so we wouldn't roll out this new policy into an A/B test or production and instead should test some different policies offline."
+    "The direct method estimates that the expected reward per observation for the new policy is slightly worse than the logging policy, so we wouldn't roll out this new policy into an A/B test or production and instead should test some different policies offline.\n",
+    "\n",
+    "However, the confidence intervals around the expected rewards for our old and new policies overlap heavily. If we want to be really certain, it might be best to gather some more data to ensure the difference is signal and not noise. In this case, fortunately, we have strong reason to suspect the new policy is worse, but these confidence intervals can be important in cases where we have less prior certainty."
    ]
   }
  ],
diff --git a/examples/3 - Evaluating a new fraud policy with the doubly robust method.ipynb b/examples/3 - Evaluating a new fraud policy with the doubly robust method.ipynb
index ef41db8..12ed55f 100644
--- a/examples/3 - Evaluating a new fraud policy with the doubly robust method.ipynb
+++ b/examples/3 - Evaluating a new fraud policy with the doubly robust method.ipynb
@@ -191,7 +191,12 @@
     {
      "data": {
       "text/plain": [
-       "{'expected_reward_logging_policy': 3.33, 'expected_reward_new_policy': -28.47}"
+       "{'expected_reward_logging_policy': {'mean': 2.53,\n",
+       " 'ci_low': -8.42,\n",
+       " 'ci_high': 13.48},\n",
+       " 'expected_reward_new_policy': {'mean': -18.22,\n",
+       " 'ci_low': -97.56,\n",
+       " 'ci_high': 61.12}}"
       ]
      },
      "execution_count": 4,
@@ -200,7 +205,7 @@
     }
    ],
    "source": [
-    "doubly_robust.evaluate(logs_df, action_probabilities)"
+    "doubly_robust.evaluate(logs_df, action_probabilities, num_bootstrap_samples=50)"
    ]
   },
   {
@@ -208,7 +213,9 @@
    "id": "062ac2e2",
    "metadata": {},
    "source": [
-    "The doubly robust method estimates that the expected reward per observation for the new policy is much worse than the logging policy so we wouldn't roll out this new policy into an A/B test or production and instead should test some different policies offline."
+    "The doubly robust method estimates that the expected reward per observation for the new policy is much worse than the logging policy, so we wouldn't roll out this new policy into an A/B test or production and instead should test some different policies offline.\n",
+    "\n",
+    "However, the confidence intervals around the expected rewards for our old and new policies overlap heavily. If we want to be really certain, it might be best to gather some more data to ensure the difference is signal and not noise. In this case, fortunately, we have strong reason to suspect the new policy is worse, but these confidence intervals can be important in cases where we have less prior certainty."
    ]
   }
  ],
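
For readers of these notebooks, a note on the new argument: `num_bootstrap_samples` defaults to 0 in the method signatures below, in which case a single pass over the full logs is made and `compute_list_stats` returns `None` for `ci_low` and `ci_high`. A minimal sketch, again assuming the notebooks' `logs_df` and `action_probabilities`:

from ope.methods import direct_method

# Default: one pass over the logs, point estimate only, no confidence interval.
point = direct_method.evaluate(logs_df, action_probabilities)
assert point["expected_reward_new_policy"]["ci_low"] is None

# With num_bootstrap_samples > 1 the logs are resampled with replacement that many
# times and a normal-approximation 95% interval is reported over those estimates.
boot = direct_method.evaluate(logs_df, action_probabilities, num_bootstrap_samples=100)
print(boot["expected_reward_new_policy"])
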
+ "The doubly robust method estimates that the expected reward per observation for the new policy is much worse than the logging policy so we wouldn't roll out this new policy into an A/B test or production and instead should test some different policies offline.\n", + "\n", + "However, the confidence intervals around the expected rewards for our old and new policies overlap heavily. If we want to be really certain, it's might be best to gather some more data to ensure the difference is signal and not noise. In this case, fortunately, we have strong reason to suspect the new policy is worse, but these confidence intervals can be important in cases where we have less prior certainty." ] } ], diff --git a/ope/methods/direct_method.py b/ope/methods/direct_method.py index c1adcf2..909e32e 100644 --- a/ope/methods/direct_method.py +++ b/ope/methods/direct_method.py @@ -4,9 +4,12 @@ import pandas as pd from ..training.predictor import Predictor +from ..utils.stats import compute_list_stats -def evaluate(df: pd.DataFrame, action_prob_function: Callable) -> Dict[str, float]: +def evaluate( + df: pd.DataFrame, action_prob_function: Callable, num_bootstrap_samples: int = 0 +) -> Dict[str, Dict[str, float]]: """ Direct method (DM) tutorial: See section 3.2 in https://arxiv.org/pdf/1503.02834.pdf @@ -16,11 +19,41 @@ def evaluate(df: pd.DataFrame, action_prob_function: Callable) -> Dict[str, floa reward_model = Predictor() reward_model.fit(df) - context_df = df.context.apply(pd.Series) + results = [ + evaluate_raw(df, action_prob_function, sample=True, reward_model=reward_model) + for _ in range(num_bootstrap_samples) + ] + + if not results: + results = [ + evaluate_raw( + df, action_prob_function, sample=False, reward_model=reward_model + ) + ] + + logging_policy_rewards = [result["logging_policy"] for result in results] + new_policy_rewards = [result["new_policy"] for result in results] + + return { + "expected_reward_logging_policy": compute_list_stats(logging_policy_rewards), + "expected_reward_new_policy": compute_list_stats(new_policy_rewards), + } + + +def evaluate_raw( + df: pd.DataFrame, + action_prob_function: Callable, + sample: bool, + reward_model: Predictor, +) -> Dict[str, float]: + + tmp_df = df.sample(df.shape[0], replace=True) if sample else df + + context_df = tmp_df.context.apply(pd.Series) context_array = context_df[reward_model.context_column_order].values cum_reward_new_policy = 0 - for idx, row in df.iterrows(): + for idx, row in tmp_df.iterrows(): observation_expected_reward = 0 action_probabilities = action_prob_function(row["context"]) for action, action_probability in action_probabilities.items(): @@ -33,6 +66,6 @@ def evaluate(df: pd.DataFrame, action_prob_function: Callable) -> Dict[str, floa cum_reward_new_policy += observation_expected_reward return { - "expected_reward_logging_policy": round(df.reward.sum() / len(df), 2), - "expected_reward_new_policy": round(cum_reward_new_policy / len(df), 2), + "logging_policy": tmp_df.reward.sum() / len(tmp_df), + "new_policy": cum_reward_new_policy / len(tmp_df), } diff --git a/ope/methods/doubly_robust.py b/ope/methods/doubly_robust.py index 89c1ee9..e3d2fcf 100644 --- a/ope/methods/doubly_robust.py +++ b/ope/methods/doubly_robust.py @@ -4,9 +4,12 @@ import pandas as pd from ..training.predictor import Predictor +from ..utils.stats import compute_list_stats -def evaluate(df: pd.DataFrame, action_prob_function: Callable) -> Dict[str, float]: +def evaluate( + df: pd.DataFrame, action_prob_function: Callable, num_bootstrap_samples: 
diff --git a/ope/methods/doubly_robust.py b/ope/methods/doubly_robust.py
index 89c1ee9..e3d2fcf 100644
--- a/ope/methods/doubly_robust.py
+++ b/ope/methods/doubly_robust.py
@@ -4,9 +4,12 @@
 import pandas as pd
 
 from ..training.predictor import Predictor
+from ..utils.stats import compute_list_stats
 
 
-def evaluate(df: pd.DataFrame, action_prob_function: Callable) -> Dict[str, float]:
+def evaluate(
+    df: pd.DataFrame, action_prob_function: Callable, num_bootstrap_samples: int = 0
+) -> Dict[str, Dict[str, float]]:
     """
     Doubly robust (DR)
     tutorial: https://arxiv.org/pdf/1503.02834.pdf
@@ -15,11 +18,41 @@ def evaluate(df: pd.DataFrame, action_prob_function: Callable) -> Dict[str, floa
     reward_model = Predictor()
     reward_model.fit(df)
 
-    context_df = df.context.apply(pd.Series)
+    results = [
+        evaluate_raw(df, action_prob_function, sample=True, reward_model=reward_model)
+        for _ in range(num_bootstrap_samples)
+    ]
+
+    if not results:
+        results = [
+            evaluate_raw(
+                df, action_prob_function, sample=False, reward_model=reward_model
+            )
+        ]
+
+    logging_policy_rewards = [result["logging_policy"] for result in results]
+    new_policy_rewards = [result["new_policy"] for result in results]
+
+    return {
+        "expected_reward_logging_policy": compute_list_stats(logging_policy_rewards),
+        "expected_reward_new_policy": compute_list_stats(new_policy_rewards),
+    }
+
+
+def evaluate_raw(
+    df: pd.DataFrame,
+    action_prob_function: Callable,
+    sample: bool,
+    reward_model: Predictor,
+) -> Dict[str, float]:
+
+    tmp_df = df.sample(df.shape[0], replace=True).reset_index(drop=True) if sample else df
+
+    context_df = tmp_df.context.apply(pd.Series)
     context_array = context_df[reward_model.context_column_order].values
 
     cum_reward_new_policy = 0
-    for idx, row in df.iterrows():
+    for idx, row in tmp_df.iterrows():
         observation_expected_reward = 0
         processed_context = context_array[idx]
 
@@ -35,7 +68,7 @@ def evaluate(df: pd.DataFrame, action_prob_function: Callable) -> Dict[str, floa
 
         # then compute the right hand term, which is similar to IPS
         logged_action = row["action"]
-        new_action_probability = action_probabilities[row["action"]]
+        new_action_probability = action_probabilities[logged_action]
         weight = new_action_probability / row["action_prob"]
         one_hot_action = reward_model.action_preprocessor.transform(
             np.array(row["action"]).reshape(-1, 1)
@@ -47,6 +80,6 @@ def evaluate(df: pd.DataFrame, action_prob_function: Callable) -> Dict[str, floa
         cum_reward_new_policy += observation_expected_reward
 
     return {
-        "expected_reward_logging_policy": round(df.reward.sum() / len(df), 2),
-        "expected_reward_new_policy": round(cum_reward_new_policy / len(df), 2),
+        "logging_policy": tmp_df.reward.sum() / len(tmp_df),
+        "new_policy": cum_reward_new_policy / len(tmp_df),
     }
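
One practical note on the resampling introduced above: `df.sample(..., replace=True)` is called without a `random_state`, so pandas falls back to numpy's global random state and the reported intervals will differ from run to run (the notebook numbers above are one such draw). If repeatable intervals matter, seeding numpy before calling `evaluate` is one workaround; a sketch under that assumption, since the patch itself does not expose a seed parameter:

import numpy as np

from ope.methods import doubly_robust

# Seeding numpy's global RNG makes the bootstrap resamples, and therefore the
# reported confidence intervals, repeatable across runs.
np.random.seed(0)
results = doubly_robust.evaluate(
    logs_df, action_probabilities, num_bootstrap_samples=50
)
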
diff --git a/ope/methods/inverse_propensity_scoring.py b/ope/methods/inverse_propensity_scoring.py
index 66de197..2192096 100644
--- a/ope/methods/inverse_propensity_scoring.py
+++ b/ope/methods/inverse_propensity_scoring.py
@@ -2,21 +2,48 @@
 
 import pandas as pd
 
+from ..utils.stats import compute_list_stats
 
-def evaluate(df: pd.DataFrame, action_prob_function: Callable) -> Dict[str, float]:
+
+def evaluate(
+    df: pd.DataFrame, action_prob_function: Callable, num_bootstrap_samples: int = 0
+) -> Dict[str, Dict[str, float]]:
     """
     Inverse propensity scoring (IPS)
     tutorial: https://www.cs.cornell.edu/courses/cs7792/2016fa/lectures/03-counterfactualmodel_6up.pdf
     """
 
+    results = [
+        evaluate_raw(df, action_prob_function, sample=True)
+        for _ in range(num_bootstrap_samples)
+    ]
+
+    if not results:
+        results = [evaluate_raw(df, action_prob_function, sample=False)]
+
+    logging_policy_rewards = [result["logging_policy"] for result in results]
+    new_policy_rewards = [result["new_policy"] for result in results]
+
+    return {
+        "expected_reward_logging_policy": compute_list_stats(logging_policy_rewards),
+        "expected_reward_new_policy": compute_list_stats(new_policy_rewards),
+    }
+
+
+def evaluate_raw(
+    df: pd.DataFrame, action_prob_function: Callable, sample: bool
+) -> Dict[str, float]:
+
+    tmp_df = df.sample(df.shape[0], replace=True) if sample else df
+
     cum_reward_new_policy = 0
-    for _, row in df.iterrows():
+    for _, row in tmp_df.iterrows():
         action_probabilities = action_prob_function(row["context"])
         cum_reward_new_policy += (
             action_probabilities[row["action"]] / row["action_prob"]
         ) * row["reward"]
 
     return {
-        "expected_reward_logging_policy": round(df.reward.sum() / len(df), 2),
-        "expected_reward_new_policy": round(cum_reward_new_policy / len(df), 2),
+        "logging_policy": tmp_df.reward.sum() / len(tmp_df),
+        "new_policy": cum_reward_new_policy / len(tmp_df),
     }
diff --git a/ope/utils/__init__.py b/ope/utils/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/ope/utils/stats.py b/ope/utils/stats.py
new file mode 100644
index 0000000..1ebe196
--- /dev/null
+++ b/ope/utils/stats.py
@@ -0,0 +1,14 @@
+from typing import List
+import statistics
+
+P95_Z_SCORE = 1.96
+
+
+def compute_list_stats(input: List):
+    """Compute mean and P95 CI of mean for a list of floats."""
+    mean = statistics.mean(input)
+    std_dev = statistics.stdev(input) if len(input) > 1 else None
+    ci_low = round(mean - P95_Z_SCORE * std_dev, 2) if std_dev else None
+    ci_high = round(mean + P95_Z_SCORE * std_dev, 2) if std_dev else None
+
+    return {"mean": round(mean, 2), "ci_low": ci_low, "ci_high": ci_high}
diff --git a/setup.py b/setup.py
index c958f2a..9c5563f 100644
--- a/setup.py
+++ b/setup.py
@@ -13,7 +13,7 @@
 
 setuptools.setup(
     name="offline-evaluation",
-    version="0.0.5",
+    version="0.0.6",
     author="Edoardo Conti, Lionel Vital, Joseph Gilley",
     author_email="team@banditml.com",
     description="Implementations of common offline policy evaluation methods.",
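
Finally, a note on `compute_list_stats`: it reports a normal-approximation interval (mean plus or minus 1.96 standard deviations of the bootstrap estimates), which is reasonable for moderate sample counts but can misbehave when the bootstrap distribution is skewed, as it may be with heavy-tailed rewards like the fraud penalty in these examples. A percentile interval is a common alternative; the helper below is a hypothetical sketch, not part of this patch.

from typing import Dict, List, Optional
import statistics


def compute_percentile_stats(values: List[float]) -> Dict[str, Optional[float]]:
    """Mean plus a simple nearest-rank 95% percentile interval over bootstrap estimates."""
    mean = statistics.mean(values)
    if len(values) < 2:
        return {"mean": round(mean, 2), "ci_low": None, "ci_high": None}
    ordered = sorted(values)
    low_idx = int(0.025 * (len(ordered) - 1))
    high_idx = int(0.975 * (len(ordered) - 1))
    return {
        "mean": round(mean, 2),
        "ci_low": round(ordered[low_idx], 2),
        "ci_high": round(ordered[high_idx], 2),
    }

Because the return shape matches `compute_list_stats`, a helper like this could be swapped in by the evaluate functions above without changing what callers see.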