diff --git a/src/aequitas/flow/methods/preprocessing/unawareness.py b/src/aequitas/flow/methods/preprocessing/correlation_suppression.py
similarity index 70%
rename from src/aequitas/flow/methods/preprocessing/unawareness.py
rename to src/aequitas/flow/methods/preprocessing/correlation_suppression.py
index 0b30721f..bcec5993 100644
--- a/src/aequitas/flow/methods/preprocessing/unawareness.py
+++ b/src/aequitas/flow/methods/preprocessing/correlation_suppression.py
@@ -1,51 +1,29 @@
-from typing import Optional, Literal
-
+from typing import Optional
 import pandas as pd
 import numpy as np
 from scipy.stats import chi2_contingency
-
 from ...utils import create_logger
 from .preprocessing import PreProcessing
 
 
-class Unawareness(PreProcessing):
-    def __init__(
-        self,
-        correlation_threshold: Optional[float] = 0.5,
-        strategy: Literal["correlation", "featureselection"] = "correlation",
-        seed: int = 0,
-    ):
+class CorrelationSuppression(PreProcessing):
+    def __init__(self, correlation_threshold: Optional[float] = 0.5):
         """Removes features that are highly correlated with the sensitive attribute.
 
         Note: For this method, the vector s (protected attribute) is assumed to be
         categorical.
-
         Parameters
         ----------
-        top_k : int, optional
-            Number of features to remove. If None, the correlation_threshold
-            must be passed by the user. Defaults to 1.
         correlation_threshold : float, optional
             Features with a correlation value higher than this threshold are
-            removed. If None, the top_k parameter is used to determine how many
-            features to remove. Defaults to None.
-        strategy : {"correlation", "featureselection"}, optional
-            Strategy to use to calculate how much each feature is related to the
-            sensitive attribute. If "correlation", correlation between features
-            is used. "featureselection" is not implemented yet. Defaults to
-            "correlation".
-        """
-        self.logger = create_logger("methods.preprocessing.Unawareness")
-        self.logger.info("Instantiating an Unawareness preprocessing method.")
+            removed. Defaults to 0.5.
+        """
+        self.logger = create_logger("methods.preprocessing.CorrelationSuppression")
+        self.logger.info(
+            "Instantiating a CorrelationSuppression preprocessing method."
+        )
         self.used_in_inference = True
-
         self.correlation_threshold = correlation_threshold
-        if strategy == "featureselection":
-            raise NotImplementedError(
-                "The feature selection strategy is not implemented yet."
-            )
-        self.strategy = strategy
-        self.seed = seed
 
     def _correlation_ratio(
         self, categorical_feature: np.ndarray, numeric_feature: np.ndarray
@@ -57,14 +35,12 @@ def _correlation_ratio(
         the numeric data is purely due to the difference within the categorical
         data. A value of 0 indicates that the variance in the numeric data is
         completely unaffected by any differences within the categorical data.
-
         Parameters
         ----------
         categorical_feature : numpy.ndarray
             Categorical column.
         numeric_feature : numpy.ndarray
             Numeric column.
-
         Returns
         -------
         float
@@ -89,14 +65,12 @@ def _cramerv(self, a: np.ndarray, b: np.ndarray):
         Cramer's V is a heavily biased estimator and tends to overestimate the
         strength of the correlation. Therefore, a bias correction is normally
         applied to the statistic.
-
         Parameters
         ----------
         a : numpy.ndarray
             First categorical column.
         b : numpy.ndarray
             Second categorical column.
-
         Returns
         -------
         float
@@ -107,16 +81,14 @@ def _cramerv(self, a: np.ndarray, b: np.ndarray):
         n = np.sum(contingency.values)
         r, k = contingency.shape
         phi2 = chi2 / n
-
         phi2_corrected = max(0, phi2 - (k - 1) * (r - 1) / (n - 1))
         r_corrected = r - (r - 1) ** 2 / (n - 1)
         k_corrected = k - (k - 1) ** 2 / (n - 1)
-
         statistic = np.sqrt(phi2_corrected / min(r_corrected - 1, k_corrected - 1))
         return statistic
 
     def fit(self, X: pd.DataFrame, y: pd.Series, s: Optional[pd.Series]) -> None:
-        """Calculates how related each feature is to the sensitive attribute.
+        """Calculates correlation between each feature and the sensitive attribute.
 
         Parameters
         ----------
@@ -128,24 +100,24 @@ def fit(self, X: pd.DataFrame, y: pd.Series, s: Optional[pd.Series]) -> None:
             Protected attribute vector.
         """
         super().fit(X, y, s)
-
-        self.logger.info("Calculating feature correlation with sensitive attribute.")
-
-        if self.strategy == "correlation":
-            self.scores = pd.Series(index=X.columns)
-            for col in X.columns:
-                if X[col].dtype.name == "category":
-                    self.scores[col] = self._cramerv(s.values, X[col].values)
-                else:
-                    self.scores[col] = self._correlation_ratio(s.values, X[col].values)
-
-            self.scores = self.scores.sort_values(ascending=False)
+        self.logger.info(
+            "Identifying features correlated with the sensitive attribute."
+        )
+        scores = pd.Series(index=X.columns)
+        for col in X.columns:
+            if X[col].dtype.name == "category":
+                scores[col] = self._cramerv(s.values, X[col].values)
+            else:
+                scores[col] = self._correlation_ratio(s.values, X[col].values)
+        scores = scores.sort_values(ascending=False)
+        self.remove_features = list(
+            scores.loc[scores >= self.correlation_threshold].index
+        )
 
     def transform(
         self, X: pd.DataFrame, y: pd.Series, s: Optional[pd.Series] = None
     ) -> tuple[pd.DataFrame, pd.Series, pd.Series]:
         """Removes the most correlated features with the sensitive attribute.
-
         Parameters
         ----------
         X : pd.DataFrame
@@ -154,22 +126,15 @@ def transform(
             Label vector.
         s : pd.Series, optional
             Protected attribute vector.
-
         Returns
        -------
         tuple[pd.DataFrame, pd.Series, pd.Series]
             The transformed input, X, y, and s.
         """
         super().transform(X, y, s)
-
-        remove_features = list(
-            self.scores.loc[self.scores >= self.correlation_threshold].index
-        )
-
         self.logger.info(
             f"Removing most correlated features with sensitive attribute: "
-            f"{remove_features}"
+            f"{self.remove_features}"
         )
-        X_transformed = X.drop(columns=remove_features)
-
+        X_transformed = X.drop(columns=self.remove_features)
         return X_transformed, y, s
diff --git a/src/aequitas/flow/methods/preprocessing/feature_importance_suppression.py b/src/aequitas/flow/methods/preprocessing/feature_importance_suppression.py
new file mode 100644
index 00000000..144f30ac
--- /dev/null
+++ b/src/aequitas/flow/methods/preprocessing/feature_importance_suppression.py
@@ -0,0 +1,139 @@
+from typing import Optional
+
+import pandas as pd
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import roc_auc_score
+
+from ...utils import create_logger
+from .preprocessing import PreProcessing
+
+
+class FeatureImportanceSuppression(PreProcessing):
+    def __init__(
+        self,
+        auc_threshold: Optional[float] = 0.5,
+        feature_importance_threshold: Optional[float] = 0.1,
+        n_estimators: Optional[int] = 10,
+        seed: int = 0,
+    ):
+        """Iteratively removes the most important features with respect to the sensitive
+        attribute.
+
+        Parameters
+        ----------
+        auc_threshold : float, optional
+            The validation AUC above which the removal of features continues.
+            Defaults to 0.5.
+        feature_importance_threshold : float, optional
+            The minimum importance the most important feature must have in order
+            to be removed. Defaults to 0.1.
+        n_estimators : int, optional
+            The number of trees in the random forest. Defaults to 10.
+        seed : int, optional
+            The seed for the random forest. Defaults to 0.
+        """
+        self.logger = create_logger(
+            "methods.preprocessing.FeatureImportanceSuppression"
+        )
+        self.logger.info(
+            "Instantiating a FeatureImportanceSuppression preprocessing method."
+        )
+        self.used_in_inference = True
+
+        self.auc_threshold = auc_threshold
+        self.feature_importance_threshold = feature_importance_threshold
+        self.n_estimators = n_estimators
+        self.seed = seed
+
+    def fit(self, X: pd.DataFrame, y: pd.Series, s: Optional[pd.Series]) -> None:
+        """Iteratively removes the features most important for predicting the sensitive
+        attribute.
+
+        Parameters
+        ----------
+        X : pandas.DataFrame
+            Feature matrix.
+        y : pandas.Series
+            Label vector.
+        s : pandas.Series
+            Protected attribute vector.
+        """
+        super().fit(X, y, s)
+
+        self.logger.info("Identifying features to remove.")
+
+        rf = RandomForestClassifier(
+            n_estimators=self.n_estimators, random_state=self.seed
+        )
+
+        features = pd.concat([X, y], axis=1)
+        features = pd.get_dummies(features)
+        target = s.copy()
+
+        features_train, features_val, target_train, target_val = train_test_split(
+            features, target
+        )
+        self.remove_features = []
+
+        while features_train.shape[1] > 1:
+            rf.fit(features_train, target_train)
+            predictions = rf.predict_proba(features_val)[:, 1]
+            auc = roc_auc_score(target_val, predictions)
+
+            if auc > self.auc_threshold:
+                scores = pd.Series(
+                    rf.feature_importances_, index=features_train.columns
+                )
+                feature = scores.sort_values(ascending=False).index[0]
+                if scores[feature] < self.feature_importance_threshold:
+                    break
+
+                # One-hot encoded columns are mapped back to their original
+                # feature so that the whole feature is removed at once.
+                i = feature.rfind("_")
+                if feature[:i] in X.columns:
+                    eliminate = [
+                        col
+                        for col in features_train.columns
+                        if col.startswith(feature[:i])
+                    ]
+                    self.remove_features.append(feature[:i])
+                else:
+                    eliminate = [feature]
+                    self.remove_features.append(feature)
+
+                features_train = features_train.drop(columns=eliminate)
+                features_val = features_val.drop(columns=eliminate)
+            else:
+                break
+
+    def transform(
+        self, X: pd.DataFrame, y: pd.Series, s: Optional[pd.Series] = None
+    ) -> tuple[pd.DataFrame, pd.Series, pd.Series]:
+        """Removes the features which are most related to the sensitive attribute
+        from the data.
+
+        Parameters
+        ----------
+        X : pd.DataFrame
+            Feature matrix.
+        y : pd.Series
+            Label vector.
+        s : pd.Series, optional
+            Protected attribute vector.
+
+        Returns
+        -------
+        tuple[pd.DataFrame, pd.Series, pd.Series]
+            The transformed input, X, y, and s.
+        """
+        super().transform(X, y, s)
+
+        self.logger.info(
+            f"Removing features most predictive of the sensitive attribute: "
+            f"{self.remove_features}"
+        )
+        X_transformed = X.drop(columns=self.remove_features)
+
+        return X_transformed, y, s
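
Reviewer note: below is a minimal, self-contained sketch of the bias-corrected Cramer's V that _cramerv computes, useful for sanity-checking the hunks above outside the class. The pd.crosstab/chi2_contingency setup is assumed from unchanged lines the diff does not show, and the toy data is invented.

    # Standalone check of the bias-corrected Cramer's V used by _cramerv.
    import numpy as np
    import pandas as pd
    from scipy.stats import chi2_contingency

    def bias_corrected_cramers_v(a: np.ndarray, b: np.ndarray) -> float:
        contingency = pd.crosstab(a, b)
        chi2 = chi2_contingency(contingency)[0]
        n = contingency.values.sum()
        r, k = contingency.shape
        phi2 = chi2 / n
        # Bergsma-style correction: shrink phi^2 and the table dimensions to
        # counter the upward bias of the raw statistic.
        phi2_corrected = max(0, phi2 - (k - 1) * (r - 1) / (n - 1))
        r_corrected = r - (r - 1) ** 2 / (n - 1)
        k_corrected = k - (k - 1) ** 2 / (n - 1)
        return np.sqrt(phi2_corrected / min(r_corrected - 1, k_corrected - 1))

    rng = np.random.default_rng(0)
    a = rng.integers(0, 2, size=1000)
    b = np.where(rng.random(1000) < 0.8, a, 1 - a)  # b agrees with a 80% of the time
    print(bias_corrected_cramers_v(a, b))  # roughly 0.6: strong association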
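Reviewer note: a hypothetical end-to-end use of the renamed CorrelationSuppression class. The import path is inferred from the new file location in this diff; the column names and toy data are invented for illustration.

    import numpy as np
    import pandas as pd
    from aequitas.flow.methods.preprocessing.correlation_suppression import (
        CorrelationSuppression,
    )

    rng = np.random.default_rng(42)
    s = pd.Series(rng.integers(0, 2, size=500))     # categorical sensitive attribute
    X = pd.DataFrame({
        "proxy": s + rng.normal(0, 0.1, size=500),  # near-duplicate of s
        "noise": rng.normal(size=500),              # unrelated feature
    })
    y = pd.Series(rng.integers(0, 2, size=500))

    method = CorrelationSuppression(correlation_threshold=0.5)
    method.fit(X, y, s)             # correlation ratio / Cramer's V per column
    X_t, y_t, s_t = method.transform(X, y, s)
    print(list(X_t.columns))        # expect "proxy" dropped, "noise" kept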
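Reviewer note: a hypothetical sketch of the new FeatureImportanceSuppression method. Removal continues only while the random forest predicts s with validation AUC above auc_threshold and the top feature importance clears feature_importance_threshold; auc_threshold is raised slightly here so the loop reliably stops once the leaky column is gone. Import path and toy data are assumptions, and the validation split is stochastic.

    import numpy as np
    import pandas as pd
    from aequitas.flow.methods.preprocessing.feature_importance_suppression import (
        FeatureImportanceSuppression,
    )

    rng = np.random.default_rng(0)
    n = 1000
    s = pd.Series(rng.integers(0, 2, size=n), name="sensitive")
    X = pd.DataFrame({
        "leaky": s * 2.0 + rng.normal(0, 0.1, size=n),  # almost encodes s exactly
        "benign": rng.normal(size=n),                   # no signal about s
    })
    y = pd.Series(rng.integers(0, 2, size=n), name="label")

    method = FeatureImportanceSuppression(
        auc_threshold=0.6,   # above 0.5 so chance-level AUC on noise stops the loop
        feature_importance_threshold=0.1,
        n_estimators=10,
        seed=0,
    )
    method.fit(X, y, s)
    X_t, y_t, s_t = method.transform(X, y, s)
    print(method.remove_features)  # expected: ["leaky"]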