From 87026e190b6f9da7546bb0142d9378f0db3f3a09 Mon Sep 17 00:00:00 2001
From: Melinda Thielbar
Date: Thu, 25 Jan 2024 15:55:05 -0500
Subject: [PATCH] Improvements and Bug Fixes for Probabilistic Fairness (#27)

* Add test for get_all_scores
* Bug fix. EqualOpportunity should be included in get_all_scores.
* Small updates to test_utils_proba.py
* Rearrange simulation into its own class.
* Simulator is its own class. Simulator unit tests running clean.
* Small edits to test_utils_proba.py
* Fix small bug that occurs in summarizer when membership_df has a surrogate column not named 'surrogate'. Update unit tests to fix error.
* Add tests for summarizer.
* Cleanup code after merging changes to fix summarizer bug.
* run_bootstrap was using incorrect class label function call.
* Clean up print statements in is_one_dimensional.
* Clean up deprecation warning caused by cvx.Variable returning a one-dimensional numpy array in EqualizedOdds bias mitigation. Convert to float when necessary.
* Turn off user warnings where possible in test_utils_proba.py. Warnings will still print during unit tests because higher-level API will not have the option to turn warnings off. This keeps the API cleaner.
* Update to utils_proba.py
* Edit comments in simulator.
* Update minimum weight to 5 rows, according to results from simulation experiment with min counts per surrogate.
* Make simulation dataframe large enough so values are not unstable and unit tests do not fail.
* Add simulation scripts and readme.md for probabilistic fairness.
* Update comments and readme.md
* Add descriptions and citations to readme
* Add input data for simulations and supporting notebooks to create output charts from results.
* update
* update
* update
* update

---------

Co-authored-by: mfthielb
Co-authored-by: skadio
---
 jurity/mitigation/equalized_odds.py | 17 +-
 jurity/utils.py | 6 +-
 jurity/utils_proba.py | 38 +--
 tests/test_mitigation_binary.py | 38 +-
 tests/test_utils_proba.py | 375 ++++++++++++++++++----------
 5 files changed, 286 insertions(+), 188 deletions(-)

diff --git a/jurity/mitigation/equalized_odds.py b/jurity/mitigation/equalized_odds.py
index e2bb6fd..614809e 100644
--- a/jurity/mitigation/equalized_odds.py
+++ b/jurity/mitigation/equalized_odds.py
@@ -97,12 +97,25 @@ def fit(self,
         # Solve
         prob.solve()

-        # Save fairness probabilities
+        # Save fairness probabilities (cvxpy value is a numpy array or None)
         self.p2p_prob_0 = variables_0["p2p"].value
         self.n2p_prob_0 = variables_0["n2p"].value
         self.p2p_prob_1 = variables_1["p2p"].value
         self.n2p_prob_1 = variables_1["n2p"].value

+        # Get the scalar/primitive value unless it is None
+        if isinstance(self.p2p_prob_0, np.ndarray):
+            self.p2p_prob_0 = self.p2p_prob_0[0]
+
+        if isinstance(self.n2p_prob_0, np.ndarray):
+            self.n2p_prob_0 = self.n2p_prob_0[0]
+
+        if isinstance(self.p2p_prob_1, np.ndarray):
+            self.p2p_prob_1 = self.p2p_prob_1[0]
+
+        if isinstance(self.n2p_prob_1, np.ndarray):
+            self.n2p_prob_1 = self.n2p_prob_1[0]
+
     def fit_transform(self,
                       labels: Union[List, np.ndarray, pd.Series],
                       predictions: Union[List, np.ndarray, pd.Series],
@@ -227,7 +240,7 @@ def _get_variables(self, labels, likelihoods, predictions, group):
         p2p = cvx.Variable(1)
         n2p = cvx.Variable(1)
         n2n = cvx.Variable(1)  # trivially equals to 1 - n2p
-        p2n = cvx.Variable(1)  # trivially equals to 1 - p2p
+        p2n = cvx.Variable(1)  # trivially equals to 1 - p2p

         # Baseline label-wise FNR, FPR, TPR, TNR for the group
         tpr, fpr, tnr, fnr = self._get_label_wise_rates(labels, predictions)
diff --git a/jurity/utils.py
b/jurity/utils.py index ad45ebf..5e3df9d 100644 --- a/jurity/utils.py +++ b/jurity/utils.py @@ -218,9 +218,9 @@ def is_one_dimensional(array): if isinstance(array, pd.Series) and array.dtype != 'object': return True elif type(array) == list: - print(array[0]) - print(type(array[0])) - print(isinstance(array[0], np.ndarray)) + #print(array[0]) + #print(type(array[0])) + #print(isinstance(array[0], np.ndarray)) if type(array[0]) != list and (not isinstance(array[0], np.ndarray)): return True else: diff --git a/jurity/utils_proba.py b/jurity/utils_proba.py index b260c2e..711b450 100644 --- a/jurity/utils_proba.py +++ b/jurity/utils_proba.py @@ -58,9 +58,9 @@ def check_memberships_proba_df(memberships_df: pd.DataFrame, unique_surrogate_li if membership_names is None: membership_names = memberships_df.columns sum_to_one = pd.Series(memberships_df.sum(axis=1)).apply(lambda x: math.isclose(x, 1.0)) - check_true(len(unique_surrogate_list) == memberships_df.shape[0], - InputShapeError("", "Memberships dataframe must have one row per surrogate class.")) - check_true(set(memberships_df.index.values) == unique_surrogate_list, + check_true(len(unique_surrogate_list) <= memberships_df.shape[0], + InputShapeError("", "Unique surrogates in cannot exceed the number surrogate memberships.")) + check_true(unique_surrogate_list.issubset(memberships_df.index.values), InputShapeError("", "Memberships dataframe must have an index with surrogate values")) check_true(memberships_df.shape[1] == len(membership_names), InputShapeError("", "Memberships dataframe must have one column per protected class name.")) @@ -464,7 +464,7 @@ def run_bootstrap(self, bootstrap_trials: int) -> pd.DataFrame: if binary_metrics is not None: all_model_results.append(pd.concat([binary_metrics, preds], axis=1)) else: - preds['class'] = self.class_labels() + preds['class'] = self.all_class_labels() all_model_results.append(preds) out_data = pd.concat(all_model_results, axis=0).reset_index().drop(["index"], axis=1) return out_data @@ -681,7 +681,7 @@ def get_W_array(self, df: pd.DataFrame) -> np.ndarray: raise ValueError("weight name: {0} are not in dataframe.".format(self._weight_name)) return df[self._weight_name].to_numpy(dtype='f') - def get_bias_calculator(self, df: pd.DataFrame, min_weight: int = 30, weight_warnings: bool = True): + def get_bias_calculator(self, df: pd.DataFrame, min_weight: int = 5, weight_warnings: bool = True): """ Make bias calculator. Arguments: @@ -689,16 +689,16 @@ def get_bias_calculator(self, df: pd.DataFrame, min_weight: int = 30, weight_war min_weight: surrogate classes that are smaller than this value will be dropped. weight_warnings: Whether to print warnings when too many rows are dropped from surrogate class matrix """ - if min_weight < 10: + if min_weight < 5: if weight_warnings: - warnings.warn("Recommended minimum count for surrogate class is 30. " - "Minimum weights of less than 10 will give unstable results.") + warnings.warn("Recommended minimum count for surrogate class is 5. 
" + "Minimum weights of less than 5 will give unstable results.") if self.weight_name() in df.columns: subset = df[df[self._weight_name] >= min_weight] if weight_warnings: - print("{0} rows removed from datafame for insufficient weight values" \ - .format(df.shape[0] - subset.shape[0])) + warnings.warn("{0} rows removed from datafame for insufficient weight values".format( + df.shape[0] - subset.shape[0])) if subset.shape[0] < len(self.class_names()): raise WeightTooLarge("Input dataframe does not have enough rows to estimate surrogate classes " "reduce minimum weight.") @@ -735,7 +735,8 @@ def summarize(cls, memberships: Union[List, np.ndarray, pd.Series, pd.DataFrame], surrogates: Union[List, np.ndarray, pd.Series], labels: Union[List, np.ndarray, pd.Series] = None, - membership_names: List[str] = None) -> pd.DataFrame: + membership_names: List[str] = None, + warnings: bool = False) -> pd.DataFrame: """ Return a summary dataframe suitable for bootstrap calculations. Arguments: @@ -768,9 +769,8 @@ def summarize(cls, # 2. A dataframe ttehat has a row for each surrogate class value and # a column for each likelihood value. The dataframe must have surrogate class as an index. if isinstance(memberships, pd.DataFrame): - membership_surrogates = pd.Series(memberships.index.values) - membership_surrogates.name = 'surrogates' - likes_df = pd.concat([membership_surrogates, memberships], axis=1) + name = memberships.index.name + likes_df = memberships.reset_index().rename(columns={name: 'surrogates'}) else: if len(memberships) != df.shape[0]: len_predictions = len(predictions) @@ -791,7 +791,7 @@ def summarize(cls, likes_df.columns = membership_names likes_df = likes_df.reset_index() summarizer = cls("surrogates", "surrogates", "predictions", true_name=label_name, test_names=test_names) - return summarizer.make_summary_data(perf_df=df, surrogate_df=likes_df) + return summarizer.make_summary_data(perf_df=df, surrogate_df=likes_df, warnings=warnings) def __init__(self, surrogate_surrogate_col_name: str, surrogate_perf_col_name: str, @@ -896,7 +896,7 @@ def check_read_data(df: pd.DataFrame, needed_names: List[str], df_name: str, id_ n_unique_ids = df[id_col_name].nunique() if not n_rows == n_unique_ids: raise Warning(f"Number of unique ids in {df_name} is: {n_unique_ids} but number of rows is {n_rows}") - print(f"There are {n_rows} in {df_name}.") + # print(f"There are {n_rows} in {df_name}.") names = df.columns if not set(needed_names).issubset(set(names)): raise ValueError("Some necessary columns not in {0} data: {1} are missing.".format(df_name, list( @@ -981,7 +981,7 @@ def check_surrogate_confusion_matrix(self, confusion_df, merged_df): # return False return True - def make_summary_data(self, perf_df: pd.DataFrame, surrogate_df: pd.DataFrame = None): + def make_summary_data(self, perf_df: pd.DataFrame, surrogate_df: pd.DataFrame = None, warnings=True): """ Function that merges two dfs to make a surrogate-based summary file that includes confusion matrix ratios. 
Arguments: @@ -992,12 +992,13 @@ def make_summary_data(self, perf_df: pd.DataFrame, surrogate_df: pd.DataFrame = self.check_surrogate_data(surrogate_df) merged_data = perf_df.merge(surrogate_df, left_on=self.surrogate_perf_col_name(), right_on=self.surrogate_surrogate_col_name()) - self.check_merged_data(merged_data, perf_df) + self.check_merged_data(merged_data, perf_df, warnings) # Create accuracy columns that measure true positive, true negative etc accuracy_df = pd.concat([merged_data[self.surrogate_surrogate_col_name()], self.confusion_matrix_actual(merged_data, self.pred_name(), self.true_name())], axis=1) # Use calc_accuracy_metrics to create surrogate-level summary + # TODO: Accomodate cases where we don't have a binary classifier confusion_matrix_surrogate_summary = self.calc_accuracy_metrics(accuracy_df) self.check_surrogate_confusion_matrix(confusion_matrix_surrogate_summary, merged_data) return confusion_matrix_surrogate_summary.join( @@ -1068,3 +1069,4 @@ def calc_accuracy_metrics(self, test_df): Constants.false_negative_ratio, Constants.false_positive_ratio] # Return a dataframe that has the stats by group. Use these to compare to expected values return check_accuracy[out_cols] + # TODO: Needs string method diff --git a/tests/test_mitigation_binary.py b/tests/test_mitigation_binary.py index 5e5dfa9..bb6d4a8 100644 --- a/tests/test_mitigation_binary.py +++ b/tests/test_mitigation_binary.py @@ -150,21 +150,10 @@ def test_numerical_stability_mixing_rate_small(self): mitigation.fit(labels, predictions, likelihoods, is_member) - p2p_prob_0 = mitigation.p2p_prob_0 - n2p_prob_0 = mitigation.n2p_prob_0 - p2p_prob_1 = mitigation.p2p_prob_1 - n2p_prob_1 = mitigation.n2p_prob_1 - - # Convert types - p2p_prob_0 = p2p_prob_0.item() - n2p_prob_0 = n2p_prob_0.item() - p2p_prob_1 = p2p_prob_1.item() - n2p_prob_1 = n2p_prob_1.item() - - self.assertAlmostEqual(p2p_prob_0, 0.8429378) - self.assertAlmostEqual(n2p_prob_0, 1.) - self.assertAlmostEqual(p2p_prob_1, 1.) - self.assertAlmostEqual(n2p_prob_1, 0.8893096) + self.assertAlmostEqual(mitigation.p2p_prob_0, 0.8429378) + self.assertAlmostEqual(mitigation.n2p_prob_0, 1.) + self.assertAlmostEqual(mitigation.p2p_prob_1, 1.) + self.assertAlmostEqual(mitigation.n2p_prob_1, 0.8893096) def test_numerical_stability_mixing_rate_large(self): @@ -183,21 +172,10 @@ def test_numerical_stability_mixing_rate_large(self): mitigation.fit(labels, predictions, likelihoods, is_member) - p2p_prob_0 = mitigation.p2p_prob_0 - n2p_prob_0 = mitigation.n2p_prob_0 - p2p_prob_1 = mitigation.p2p_prob_1 - n2p_prob_1 = mitigation.n2p_prob_1 - - # Convert types - p2p_prob_0 = p2p_prob_0.item() - n2p_prob_0 = n2p_prob_0.item() - p2p_prob_1 = p2p_prob_1.item() - n2p_prob_1 = n2p_prob_1.item() - - self.assertAlmostEqual(p2p_prob_0, 0.819513) - self.assertAlmostEqual(n2p_prob_0, 1.) - self.assertAlmostEqual(p2p_prob_1, 0.644566) - self.assertAlmostEqual(n2p_prob_1, 1.) + self.assertAlmostEqual(mitigation.p2p_prob_0, 0.819513) + self.assertAlmostEqual(mitigation.n2p_prob_0, 1.) + self.assertAlmostEqual(mitigation.p2p_prob_1, 0.644566) + self.assertAlmostEqual(mitigation.n2p_prob_1, 1.) 
def test_numerical_stability_bias_mitigation(self): diff --git a/tests/test_utils_proba.py b/tests/test_utils_proba.py index fa1fee6..806c154 100644 --- a/tests/test_utils_proba.py +++ b/tests/test_utils_proba.py @@ -1,12 +1,11 @@ import unittest -import warnings - +import numpy.random import sklearn import pandas as pd import numpy as np -from scipy.stats import kstest +import inspect from jurity.utils_proba import BiasCalculator, BiasCalcFromDataFrame, SummaryData -from jurity.utils_proba import unpack_bootstrap +from jurity.utils_proba import unpack_bootstrap, check_memberships_proba_df from jurity.utils import Constants from jurity.utils_proba import get_bootstrap_results @@ -130,7 +129,8 @@ def test_transform_bootstrap_results_form(self): """ br = self.bc.transform_bootstrap_results(self.bc.run_bootstrap(5)) self.assertEqual(br.shape, (3, 11), "Returned bootstrap has shape: {0}. Expected (3,11).".format(br.shape)) - test_cols = [s in br.columns for s in [Constants.FPR, Constants.FNR, Constants.TPR, Constants.TNR, Constants.ACC, Constants.PRED_RATE]] + test_cols = [s in br.columns for s in + [Constants.FPR, Constants.FNR, Constants.TPR, Constants.TNR, Constants.ACC, Constants.PRED_RATE]] self.assertTrue(np.all(test_cols), "Not all tests are returned by bootstrap transform") def test_transform_bootstrap_results_answer(self): @@ -183,7 +183,7 @@ def test_make_bias_calculator_filter(self): """ Test that make_bias_calculator filters rows with small counts """ - bc_filtered = self.bcfd.get_bias_calculator(self.summarized_df, 7) + bc_filtered = self.bcfd.get_bias_calculator(self.summarized_df, 7, weight_warnings=False) self.assertEqual(bc_filtered.X().shape[0], 3) def test_make_bias_calculator_names(self): @@ -217,17 +217,17 @@ def test_bias_maker_bad_data(self): [Constants.false_positive_ratio, Constants.true_positive_ratio, Constants.false_negative_ratio, Constants.true_negative_ratio]) - self.assertRaises(ValueError, fac.get_bias_calculator, self.summarized_df, 1) + self.assertRaises(ValueError, fac.get_bias_calculator, self.summarized_df, 1, weight_warnings=False) fac = BiasCalcFromDataFrame(["W", "B", "O"], "N", [1, 2], [Constants.false_positive_ratio, Constants.true_positive_ratio, Constants.false_negative_ratio, Constants.true_negative_ratio, "hello world"]) - self.assertRaises(ValueError, fac.get_bias_calculator, self.summarized_df, 1) + self.assertRaises(ValueError, fac.get_bias_calculator, self.summarized_df, 1, weight_warnings=False) fac = BiasCalcFromDataFrame(["W", "B", "O"], "hello world", [1, 2], [Constants.false_positive_ratio, Constants.true_positive_ratio, Constants.false_negative_ratio, Constants.true_negative_ratio]) - self.assertRaises(ValueError, fac.get_bias_calculator, self.summarized_df, 1) + self.assertRaises(ValueError, fac.get_bias_calculator, self.summarized_df, 1, weight_warnings=False) def test_summary(self): predictions = [1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1] @@ -302,9 +302,11 @@ def test_unpack_bootstrap(self): for label, answer in answer_dict.items(): self.assertEqual(unpack_bootstrap(test_boot_results, label, [1]), - (answer[1], answer[0]), - f"unpack bootstrap returns unexpected answer for {label}\n" + - "expected {0}, got {1} instead.".format(unpack_bootstrap(test_boot_results,label,[1]),(answer[1],answer[0]))) + (answer[1], answer[0]), + f"unpack bootstrap returns unexpected answer for {label}\n" + + "expected {0}, got {1} instead.".format(unpack_bootstrap(test_boot_results, label, [1]), + (answer[1], answer[0]))) + def test_unpack_bootstrap_err(self): 
test_unpack = self.bc.transform_bootstrap_results(self.test_boot_results) self.assertRaises(ValueError, unpack_bootstrap, test_unpack, "FNR", [1, 2]) @@ -312,26 +314,202 @@ def test_unpack_bootstrap_err(self): def test_from_df(self): self.assertRaises(ValueError, BiasCalculator.from_df, self.summarized_df, [3], ["W", "B", "O"], weight_warnings=False) + + def test_summarizer(self): + predictions = [1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1] + surrogates = [1, 1, 1, 2, 3, 3, 4, 5, 5, 5, 5] + labels = [1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0] + memberships = pd.DataFrame(np.array([[0.5, 0.5], [0.2, 0.8], + [0.1, 0.9], + [0.25, 0.75], + [0.3, 0.7]])) + memberships.columns = ["C", "D"] + memberships["s"] = pd.Series([1, 2, 3, 4, 5]) + summary = SummaryData.summarize(predictions, memberships.set_index("s"), surrogates, labels) + self.assertTrue(summary.shape[0] == 5, "Summarizer returns dataframe with wrong shape") + self.assertTrue(np.all(~summary["C"].apply(np.isnan)), "Summarizer inserts NaN values.") + self.assertTrue(np.all(~summary["D"].apply(np.isnan)), "Summarizer inserts NaN values.") + expected_cols={'prediction_ratio', 'count', 'true_negative_ratio', + 'true_positive_ratio', 'false_negative_ratio', 'false_positive_ratio', + 'surrogates', 'C', 'D'} + returned_cols=set(summary.columns) + self.assertTrue(expected_cols==returned_cols, + f"Summary dataframe does not return correct columns. \nReturns: {returned_cols}. \nExpected: {expected_cols}") + # TODO: Write tests for check_memberships_proba +class UtilsProbaSimulator: + """ + Simulation functions used to test probabilistic fairness. + Can be used by other researchers to simulate different levels of unfairness and test their own methods. + Members: + _rates_dict: Dictionary of dictionaries with expected fairness metrics for each protected class, + has the form: {"class1":{'pct_positive':float,'fpr':float,'fnr':float}, "class2":{'pct_positive'}...} + _surrogate_name: Name of surrogate column for input dataframe + rng: numpy random number generators. Set on initialization if you want to set the seed for your simulation + """ + def __init__(self, model_rates_dict: dict, + in_rng: numpy.random.Generator = None, + surrogate_name: str ="surrogate"): + self.rates_dict(model_rates_dict) + self.surrogate_name(surrogate_name) + if in_rng is not None: + self.rng(in_rng) + else: + self._rng = numpy.random.default_rng() + + def rates_dict(self, v=None): + """ + Set and get rates_dict dictionary + """ + if v is not None: + if not isinstance(v, dict): + raise ValueError("Rates must be a dictionary. 
Input rates has type: {0}".format(type(dict))) + for k,value in v.items(): + if not set(value.keys()) == {"pct_positive", "fpr", "fnr"}: + raise ValueError("Rates must have ".format("pct_positive", "fpr", "fnr")) + if not isinstance(k,str): + raise ValueError("Keys for main dictionary must be strings") + for k2,value2 in value.items(): + if not isinstance(value2, float): + raise ValueError("Rates must be floats.") + self._rates_dict = v + return self._rates_dict + + def surrogate_name(self,v=None): + if not v is None: + if not isinstance(v,str): + raise ValueError("surrogate_name must be a string.\n {0} supplied intead.".format(type(v))) + self._surrogate_name=v + return self._surrogate_name + def rng(self, v=None): + """ + Set and get random number generator + """ + if v is not None: + if not isinstance(v, numpy.random.Generator): + raise ValueError("rng argument must be a numpy random number generator.") + self._rng = v + return self._rng + + # For the simulation, build "True" protected groups based on population + def assign_protected(self, population_data, generator, + membership_values=None): + # Passing in the global random number generator + # Lets us make sure that we're not accidentally resetting the seed + if membership_values is None: + membership_values=["W", "O", "B", "T", "A", "AI"] + surrogate_protected_prob_grouped = population_data.groupby(self.surrogate_name()) + surrogate_groups = [] + for name, group in surrogate_protected_prob_grouped: + probs = [group[v].unique()[0] for v in membership_values] + group["class"] = generator.choice(list(membership_values), len(group), p=probs) + surrogate_groups.append(group) + out_data = pd.concat(surrogate_groups) + return out_data + + def assign_protected_and_accuracy(self, input_data, rates_by_protected, generator, + protected_name="class"): + # Assign everyone a "true" race for simulation purposes + if not set(rates_by_protected.keys()).issubset(set(input_data.columns)): + raise ValueError("Input dataframe does not have same column names as keys in rates.") + protected_assignments = self.assign_protected(input_data, generator, + membership_values=list(rates_by_protected.keys())) + model_outcomes = self.model_outcome_by_protected(protected_assignments, rates_by_protected, + protected_col_name=protected_name) + return model_outcomes + + def confusion_matrix_prob(self, percent_positive, fpr, fnr, verbose=False): + """ + Calculate the probability of any given individual falling into each quadrant of the confusion matrix + percent_positive: Percent of positive cases in the training data + fpr: False Positive Rate from the hypothetical model + fnr: False Negative Rate from the hypothetical model + """ + fp_ratio = (1 - percent_positive) * fpr + fn_ratio = percent_positive * fnr + tn_ratio = (1 - percent_positive) - fp_ratio + tp_ratio = percent_positive - fn_ratio + probs = [fp_ratio, fn_ratio, tn_ratio, tp_ratio] + if verbose: + print("Expected FPR: " + str(fpr)) + print("Expected FNR: " + str(fnr)) + print("Expected TPR: " + str(tp_ratio / (tp_ratio + fn_ratio))) + print("Expected TNR: " + str(tn_ratio / (tn_ratio + fp_ratio))) + print("Expected Accuracy: " + str((tn_ratio + tp_ratio))) + # print("% of FP, FN, TN, TP among total: ") + return probs + + def model_outcome_by_protected(self, protected_assignment, rates_by_protected, + protected_col_name="class"): + """ + Assing each individual into a column of the confusion matrix based on probabilities for their class. 
+ """ + protected_prob_grouped = protected_assignment.groupby(protected_col_name) + + classified_groups = [] + for name, group in protected_prob_grouped: + rates_dict = rates_by_protected[name] + probs = self.confusion_matrix_prob(rates_dict["pct_positive"], rates_dict["fpr"], rates_dict["fnr"]) + group['pred_category'] = np.random.choice(['fp', 'fn', 'tn', 'tp'], len(group), p=probs) + group['label'] = np.where(group['pred_category'].isin(['tp', 'fn']), 1, 0) + group['prediction'] = np.where(group['pred_category'].isin(['tp', 'fp']), 1, 0) + classified_groups.append(group) + classified_data = pd.concat(classified_groups) + return classified_data + + # Add columns to a pandas dataframe flagging each row as false positive, etc. + def accuracy_columns(self, test_data: pd.DataFrame, pred_col: str, label_col:str)->pd.DataFrame: + """ + Add indicators for each confusion matrix quadrant. Simplifies calculating rates. + test_data: Input dataframe + pred_col: Name of column with predicted class + label_col: Name of column with actual class + """ + correct=(test_data[pred_col] == test_data[label_col]).astype(int) + correct.name="correct" + true_positive=(correct & (test_data[label_col] == 1)).astype(int) + true_positive.name="true_positive" + true_negative=(correct & (test_data[label_col] == 0)).astype(int) + true_negative.name="true_negative" + false_negative=(~(correct) & (test_data[pred_col] == 0)).astype(int) + false_negative.name="false_negative" + false_positive = (~(correct) & (test_data[pred_col] == 1)).astype(int) + false_positive.name="false_positive" + return pd.concat([test_data,correct,true_positive,true_negative,false_negative,false_positive],axis=1) + + def explode_dataframe(self, df, count_name="count",surrogate_name="surrogate"): + """ + Given a dataframe that has a count, produce a number of identical rows equal to that count + df: pd.DataFrame with columns: count, class_1, class_2, ... Class names must match keys from + self._rates_dict + count_name; name of count variable. + """ + names=list(self.rates_dict().keys()) + if not set(names).issubset(df.columns): + raise ValueError(f"DataFrame column names do not match keys in rates dictionary. 
Rates dict has: {names}.") + check_memberships_proba_df(df[list(self.rates_dict().keys())],set(df.index.values),names) + e_df = df.loc[df.index.repeat(df[count_name])].drop("count", axis=1) + return self.assign_protected_and_accuracy(e_df, self._rates_dict, self._rng) + + # Simulations to ensure numbers accuracy class TestWithSimulation(unittest.TestCase): """ - Helper functions + Simulation tests for whether numbers are correct based on simulated inputs """ - @classmethod def setUpClass(cls) -> None: input_df = pd.DataFrame({"surrogate": list(range(0, 99)), - "count": [83, 103, 96, 96, 102, 117, 95, 107, 106, 109, 92, 95, 105, - 87, 114, 99, 99, 85, 119, 110, 97, 87, 123, 90, 90, 107, - 85, 91, 111, 108, 89, 107, 91, 95, 119, 125, 86, 95, 121, - 103, 99, 97, 88, 106, 96, 90, 101, 102, 99, 119, 102, 93, - 105, 97, 100, 97, 88, 98, 93, 112, 91, 92, 93, 90, 109, - 99, 98, 106, 115, 97, 110, 93, 85, 116, 92, 115, 88, 108, - 106, 118, 114, 96, 97, 94, 96, 85, 96, 91, 101, 89, 97, - 99, 86, 106, 112, 112, 114, 108, 104], + "count": [473, 516, 529, 497, 476, 529, 493, 497, 503, 490, 507, 514, 524, + 485, 470, 513, 501, 505, 488, 510, 518, 501, 506, 484, 493, 504, + 477, 537, 491, 535, 517, 472, 510, 478, 518, 449, 503, 503, 509, + 537, 504, 533, 493, 482, 495, 497, 495, 465, 501, 512, 468, 470, + 549, 510, 503, 524, 496, 526, 481, 478, 557, 487, 511, 493, 486, + 517, 497, 517, 504, 472, 500, 493, 494, 504, 464, 543, 513, 486, + 488, 485, 486, 480, 519, 494, 509, 501, 494, 515, 522, 500, 532, + 512, 490, 486, 516, 495, 530, 542, 588], "W": [0.36137754, 0.83653862, 0.98303716, 0.52943704, 0.80254777, 0.86131181, 0.78572192, 0.79557292, 0.94314381, 0.98431145, 0.97623762, 0.93004508, 0.94375, 0.87960053, 0.9400488, @@ -469,109 +647,13 @@ def setUpClass(cls) -> None: 'O': {"pct_positive": 0.1, "fpr": 0.1, "fnr": 0.1}} cls.rng = np.random.default_rng(347123) - cls.test_data = cls.explode_dataframe(input_df[["surrogate", "count", "W", "B", "O"]]) + cls.sim=UtilsProbaSimulator(cls.rates_dict,in_rng=cls.rng) cls.surrogate_df = input_df[["surrogate", "W", "B", "O"]] - summary_df = SummaryData.summarize(cls.test_data["prediction"], cls.surrogate_df, + cls.test_data = cls.sim.explode_dataframe(input_df[["surrogate", "count","W", "B", "O"]].set_index("surrogate")).reset_index() + summary_df = SummaryData.summarize(cls.test_data["prediction"], cls.surrogate_df.set_index("surrogate"), cls.test_data["surrogate"], cls.test_data["label"]) cls.bc = BiasCalculator.from_df(summary_df, [1, 2], ["W", "B", "O"]) - - # For the simulation, build "True" protected groups based on population - # Note; The census data names the columns as follows: - # pct_white_zip, pct_black_zip, etc - # TODO: Need to use the census labels as keys in a dictionary or use some other method - # So we can continue to use this when the census data changes. 
- # TODO: REMOVE race terminology - - @classmethod - def assign_protected(cls, population_data, generator, surrogate_name='surrogate', - membership_values=["W", "O", "B", "T", "A", "AI"]): - # Passing in the global random number generator - # Lets us make sure that we're not accidentally resetting the seed - surrogate_protected_prob_grouped = population_data.groupby(surrogate_name) - surrogate_groups = [] - for name, group in surrogate_protected_prob_grouped: - probs = [group[v].unique()[0] for v in membership_values] - group["class"] = generator.choice(list(membership_values), len(group), p=probs) - surrogate_groups.append(group) - out_data = pd.concat(surrogate_groups) - return out_data - - @classmethod - def assign_protected_and_accuracy(cls, input_data, rates_by_protected, generator, - protected_name="class"): - # Assign everyone a "true" race for simulation purposes - protected_assignments = cls.assign_protected(input_data, generator, - membership_values=list(rates_by_protected.keys())) - # Current simulation only handles 2 categories: white or not. - # protected_assignments["w"] = np.where(protected_assignments[protected_col_name] == protected_group, protected_group, "unprotected") - # Assign each individual a quadrant in the confusion matrix based on: - # Percent of positive (not predict_pos_probs) - # probability of being a false positive - # probability of being a false negative - # These are different by race and fed into the simulation through indexes - # Index keys are the values in the race column, e.g. "White" and "Non-White" - model_outcomes = cls.model_outcome_by_protected(protected_assignments, rates_by_protected, - protected_col_name=protected_name) - return model_outcomes - - @classmethod - def confusion_matrix_prob(cls, percent_positive, fpr, fnr, verbose=False): - """ - # This is the probability that the person is labeled as positive in the data - Calculate the % of False Positive, False Negative, True Negative, and True Positive in total based on predefined inputs. - """ - fp_ratio = (1 - percent_positive) * fpr - fn_ratio = percent_positive * fnr - tn_ratio = (1 - percent_positive) - fp_ratio - tp_ratio = percent_positive - fn_ratio - probs = [fp_ratio, fn_ratio, tn_ratio, tp_ratio] - if verbose: - print("Expected FPR: " + str(fpr)) - print("Expected FNR: " + str(fnr)) - print("Expected TPR: " + str(tp_ratio / (tp_ratio + fn_ratio))) - print("Expected TNR: " + str(tn_ratio / (tn_ratio + fp_ratio))) - print("Expected Accuracy: " + str((tn_ratio + tp_ratio))) - # print("% of FP, FN, TN, TP among total: ") - return probs - - @classmethod - def model_outcome_by_protected(cls, surrogate_protected_assignment, rates_by_protected, protected_col_name="class"): - # Assign true positive, true negative, etc by race - surrogate_protected_prob_grouped = surrogate_protected_assignment.groupby(protected_col_name) - - classified_groups = [] - for name, group in surrogate_protected_prob_grouped: - rates_dict = rates_by_protected[name] - probs = cls.confusion_matrix_prob(rates_dict["pct_positive"], rates_dict["fpr"], rates_dict["fnr"]) - group['pred_category'] = np.random.choice(['fp', 'fn', 'tn', 'tp'], len(group), p=probs) - group['label'] = np.where(group['pred_category'].isin(['tp', 'fn']), 1, 0) - group['prediction'] = np.where(group['pred_category'].isin(['tp', 'fp']), 1, 0) - classified_groups.append(group) - classified_data = pd.concat(classified_groups) - return classified_data - - @classmethod - # Add columns to a pandas dataframe flagging each row as false positive, etc. 
- def accuracy_columns(cls, test_data, pred_col, label_col): - """ - Add indicators for each confusion matrix qudrant. Simplifies calculating rates. - """ - test_data["correct"] = (test_data[pred_col] == test_data[label_col]).astype(int) - test_data["true_positive"] = (test_data["correct"] & (test_data[label_col] == 1)).astype(int) - test_data["true_negative"] = (test_data["correct"] & (test_data[label_col] == 0)).astype(int) - test_data["false_negative"] = (~(test_data["correct"]) & (test_data[pred_col] == 0)).astype(int) - test_data["false_positive"] = (~(test_data["correct"]) & (test_data[pred_col] == 1)).astype(int) - return test_data - - @classmethod - def explode_dataframe(cls, df, count_name="count"): - """ - Given a dataframe that has a count, produce a number of identical rows equal to that count - """ - e_df = df.loc[df.index.repeat(df[count_name])].drop("count", axis=1) - return cls.assign_protected_and_accuracy(e_df, cls.rates_dict, cls.rng) - def test_membership_as_df(self): """ Check output from get_bootstrap_results when inputs are a surrogate dataframe @@ -579,8 +661,6 @@ def test_membership_as_df(self): results = get_bootstrap_results(self.test_data["prediction"], self.surrogate_df.set_index("surrogate"), self.test_data["surrogate"], [1, 2], self.test_data["label"]) - print(results) - self.assertTrue(isinstance(results, pd.DataFrame), "get_bootstrap_results does not return a Pandas DataFrame.") self.assertTrue( {Constants.FPR, Constants.FNR, Constants.TNR, Constants.TPR, Constants.ACC}.issubset(set(results.columns)), @@ -607,25 +687,25 @@ def test_bootstrap_ranges(self): """ Test whether bootstrap returns values that are expected based on simulated data """ - #Need to build a confidence interval where we expect values to be. - #This requires calculation of theoretical variance/covariance matrix based on linear regression + # Need to build a confidence interval where we expect values to be. + # This requires calculation of theoretical variance/covariance matrix based on linear regression n_row = self.bc.X().shape[0] x = np.hstack((np.ones((n_row, 1)), self.bc.X())) # The variance-covariance matrix of a linear estimator based on input X is: invxTx = np.linalg.inv(np.dot(x.T, x)) - pred_matrix = np.array([[1.0, 0.0, 0.0], [1.0, 1.0, 0.0], [1.0, 0.0, 1.0]]) + pred_matrix=np.array([[1.0,0.0,0.0],[1.0,1.0,0.0],[1.0,0.0,1.0]]) # The variance-covariance matrix of a linear calculation based on a prediction matrix is as follows. # Only need the diagonal for this calculation - x_portion_variance = pd.Series(np.diag(np.dot(np.dot(pred_matrix, invxTx), pred_matrix.T))) + x_portion_variance = pd.Series(np.diag(np.dot(np.dot(pred_matrix, invxTx), pred_matrix.T)), + index = self.bc.all_class_labels()) x_portion_variance.name = 'x_var' - x_portion_variance.index=self.bc.all_class_labels() - #Get confusion matrix probabilities and variances from input rates_dict. + # Get confusion matrix probabilities and variances from input rates_dict. 
in_vars_dict = {} in_means_dict = {} for k, v in self.rates_dict.items(): - a = self.confusion_matrix_prob(v['pct_positive'], v['fpr'], v['fnr']) + a = self.sim.confusion_matrix_prob(v['pct_positive'], v['fpr'], v['fnr']) # These are based on variance of a proportion: p(1-p) in_vars_dict[k] = [r * (1 - r) for r in a] in_means_dict[k] = a @@ -647,13 +727,38 @@ def test_bootstrap_ranges(self): z = 1.65 # from z-table, predictions are N(in_y,var_pred_y) for n in names: - #Prediction variance is sigma_y*pred_matrix*inv(X'X)pred_matrix.T, where sigma_y is a scalar. - #Variance of the mean of n_boots predictions is prediction_variance/n_boots - var_components[n + '_st_err'] = np.sqrt(var_components[n + '_var']*var_components['x_var']/n_boots) - #Normal apprixmation confidence limit + # Prediction variance is sigma_y*pred_matrix*inv(X'X)pred_matrix.T, where sigma_y is a scalar. + # Variance of the mean of n_boots predictions is prediction_variance/n_boots + var_components[n + '_st_err'] = np.sqrt(var_components[n + '_var'] * var_components['x_var'] / n_boots) + # Normal apprixmation confidence limit check_series = (var_components[n + '_in'] - z * var_components[n + "_st_err"] < var_components[n + '_ratio']) & ( - var_components[n + "_in"] + z * var_components[n + "_st_err"]) + var_components[n + "_in"] + z * var_components[n + "_st_err"]) check_series.name = n + "_ok" self.assertTrue(np.all(check_series.values), f"{n} is out of range, on mean of {n_loops}, of {n_boots} bootstraps.") + + def test_get_all_scores(self): + """ + Test get_all_scores to make sure it returns scores for the values it's supposed to return + """ + # get_all_scores only works for two categories + two_categories = pd.concat([self.surrogate_df[["surrogate", "W"]], + 1.0 - self.surrogate_df["W"]], axis=1).set_index("surrogate") + two_categories.columns = ["W", "NW"] + from jurity.fairness import BinaryFairnessMetrics as bfm + output_df = bfm.get_all_scores(self.test_data["label"], self.test_data["prediction"], two_categories, + self.test_data["surrogate"], [1]) + + from jurity.fairness import BinaryFairnessMetrics + + fairness_funcs = inspect.getmembers(BinaryFairnessMetrics, predicate=inspect.isclass)[:-1] + for fairness_func in fairness_funcs: + name = fairness_func[0] + class_ = getattr(BinaryFairnessMetrics, name) # grab a class which is a property of BinaryFairnessMetrics + instance = class_() # dynamically instantiate such class + v = output_df.loc[instance.name]["Value"] + if name in ["AverageOdds", "EqualOpportunity", "FNRDifference", "PredictiveEquality", "StatisticalParity"]: + self.assertFalse(np.isnan(v), f"Bootstrap returns np.nan for {name}.") + else: + self.assertTrue(np.isnan(v), f"Bootstrap not implemented for {name} but returns a value.")
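Reviewer note (not part of the patch): the sketch below mirrors the new test_summarizer test in tests/test_utils_proba.py. It builds a membership DataFrame whose surrogate index is not named 'surrogate' (the summarizer bug fixed in this PR) and produces the surrogate-level summary that feeds the bootstrap. The data values are copied from that test; it assumes a jurity installation that includes this patch.

# Minimal usage sketch for SummaryData.summarize (data copied from test_summarizer).
import numpy as np
import pandas as pd

from jurity.utils_proba import SummaryData

predictions = [1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1]
labels      = [1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0]
surrogates  = [1, 1, 1, 2, 3, 3, 4, 5, 5, 5, 5]

# One row per surrogate class; membership likelihoods sum to one in each row.
# After this patch the surrogate index no longer has to be named 'surrogate'.
memberships = pd.DataFrame(
    np.array([[0.5, 0.5], [0.2, 0.8], [0.1, 0.9], [0.25, 0.75], [0.3, 0.7]]),
    index=pd.Index([1, 2, 3, 4, 5], name="s"),
    columns=["C", "D"],
)

# Surrogate-level summary with confusion-matrix ratios, one row per surrogate.
summary = SummaryData.summarize(predictions, memberships, surrogates, labels)
print(summary[["surrogates", "C", "D", "true_positive_ratio",
               "false_positive_ratio", "true_negative_ratio",
               "false_negative_ratio", "prediction_ratio", "count"]])

As the simulation tests above illustrate, realistic per-surrogate counts (at least the new minimum weight of 5) are needed before passing such data on to get_bootstrap_results or BinaryFairnessMetrics.get_all_scores.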