diff --git a/.gitignore b/.gitignore
index f8462af..77a87f8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -11,6 +11,7 @@ __pycache__/
 # Distribution / packaging
 .Python
 env/
+.venv
 build/
 develop-eggs/
 dist/
diff --git a/examples/plot_group_explanations.py b/examples/plot_group_explanations.py
index e81c7b1..89b5418 100644
--- a/examples/plot_group_explanations.py
+++ b/examples/plot_group_explanations.py
@@ -15,14 +15,30 @@
 
 plt.style.use("seaborn-v0_8-whitegrid")
 
-#########################################################################################
-# Let's start with data preparation
+#############################################################################################
+# Let's start with data preparation. For this example, we chose the ACSIncome dataset, which
+# is one of five datasets created by Ding et al. as an improved alternative to the
+# popular UCI Adult dataset. Data is provided for all 50 states and Puerto Rico. Its initial
+# purpose was to predict whether US working adults’ yearly income is above $50,000.
+# The features are the following:
+# - AGEP: age
+# - COW: class of worker
+# - SCHL: educational attainment
+# - MAR: marital status
+# - OCCP: occupation
+# - POBP: place of birth
+# - RELP: relationship
+# - WKHP: usual hours worked per week
+# - SEX: sex
+# - RAC1P: recoded detailed race code
+
 X, y = fetch_openml(
     data_id=43141, parser="auto", return_X_y=True, read_csv_kwargs={"nrows": 150}
 )
 
 
-# Get the indices of samples that belong to each group
+# Get the indices of samples that belong to each group.
+# We will use SEX as the sensitive attribute.
 adv_idx = X[X["SEX"] == 1].index
 dis_idx = X[X["SEX"] == 2].index
 
@@ -64,7 +80,7 @@ def score_function(X):
 
 
 #########################################################################################
-# Let's take a look at contributions for both QOIs
+# Let's compute contributions for the whole dataset
 
 contributions = xai.all(X)
 
@@ -83,6 +99,17 @@ def score_function(X):
 
 plt.show()
 
+
+#########################################################################################
+# On the plots, feature contributions are shown across different strata, e.g., for the
+# top 20% of ranked individuals, for the whole dataset (i.e., `All`), and for both the
+# advantaged and disadvantaged groups. For the advantaged group, age (AGEP) exhibits
+# stable importance across all strata, while the contributions of hours worked (WKHP)
+# and educational attainment (SCHL) increase as we move toward lower-ranked individuals.
+# Conversely, for the disadvantaged group, the importance of `AGEP` decreases with lower
+# ranks, while `WKHP` and `SCHL` remain relatively stable.
+
+
 #########################################################################################
 # We can also compare contributions across groups overall:
 
@@ -91,3 +118,7 @@ def score_function(X):
 X.loc[dis_idx, "Sex"] = "Female"
 xai.plot.box(X, scores, contributions, group="Sex")
 plt.show()
+
+# We can see that for males, the importance of `WKHP` is lower than for females, while
+# `AGEP` also shows a positive contribution to the outcomes, indicating that older
+# individuals are more likely to achieve favorable results.
diff --git a/examples/plot_mixed_data.py b/examples/plot_mixed_data.py
index 8771939..65d606d 100644
--- a/examples/plot_mixed_data.py
+++ b/examples/plot_mixed_data.py
@@ -25,6 +25,10 @@
 from sklearn.pipeline import make_pipeline
 
 from sharp import ShaRP
 
+#####################################################################################
+# Let's get the data first. We will use the dataset that classifies people described
+# by a set of attributes as good or bad credit risks.
+
 df = fetch_openml(data_id=31, parser="auto")["frame"]
 df.head(5)
diff --git a/examples/plot_population_analysis.py b/examples/plot_population_analysis.py
index 5a7450d..c6ff2cc 100644
--- a/examples/plot_population_analysis.py
+++ b/examples/plot_population_analysis.py
@@ -24,6 +24,24 @@
 # This will help make our visualizations look beautiful :)
 plt.style.use("seaborn-v0_8-whitegrid")
 
+
+#############################################################################################
+# Let's start with data preparation. For this example, we chose the ACSIncome dataset, which
+# is one of five datasets created by Ding et al. as an improved alternative to the
+# popular UCI Adult dataset. Data is provided for all 50 states and Puerto Rico. Its initial
+# purpose was to predict whether US working adults’ yearly income is above $50,000.
+# The features are the following:
+# - AGEP: age
+# - COW: class of worker
+# - SCHL: educational attainment
+# - MAR: marital status
+# - OCCP: occupation
+# - POBP: place of birth
+# - RELP: relationship
+# - WKHP: usual hours worked per week
+# - SEX: sex
+# - RAC1P: recoded detailed race code
+
 X, y = fetch_openml(
     data_id=43141, parser="auto", return_X_y=True, read_csv_kwargs={"nrows": 150}
 )
diff --git a/examples/plot_qoi_comparison.py b/examples/plot_qoi_comparison.py
index 5ea022e..861b05c 100644
--- a/examples/plot_qoi_comparison.py
+++ b/examples/plot_qoi_comparison.py
@@ -107,11 +107,13 @@ def csrank_score(X):
 # Let's take a look at some contributions for both QOIs
 
 contributions_rank = xai_rank.all(X)
-contributions_rank[:5]
+print(contributions_rank[:5])
 
 contributions_score = xai_score.all(X)
-contributions_score[:5]
+print(contributions_score[:5])
 
+
+#############################################################################################
 # Now let's plot the waterfall plots for different universities and check
 # if the results for `score` and `rank` differ
 
@@ -182,3 +184,15 @@
 
 plt.tight_layout()
 plt.show()
+
+
+#############################################################################################
+# As a result, we see that for each university the importance of features for the two QOIs
+# may be very similar or differ considerably. For Stanford University, the feature
+# `Systems` is the most impactful for `rank`, followed by the feature `Interdisciplinary`,
+# while for `score` the order is reversed. For the University of Texas, the features have
+# the same order of importance for both QOIs, and all of them contribute to improving the
+# university's `score`/`rank`. In contrast, for Indiana University, the ordering of
+# features for the `rank` QOI is the same as for Stanford University, but for `score`,
+# the features `AI` and `Theory` contribute negatively, and `AI` is the most impactful
+# feature.
diff --git a/sharp/base.py b/sharp/base.py
index 3b577a1..4bd500f 100644
--- a/sharp/base.py
+++ b/sharp/base.py
@@ -12,34 +12,81 @@
 
 class ShaRP(BaseEstimator):
     """
-    Explains the contributions of features to different aspects of a ranked outcome,
-    based on Shapley values.
+    The ShaRP (Shapley for Rankings and Preferences) class provides a novel framework
+    for explaining the contributions of features to various aspects of ranked
+    outcomes. Built on Shapley values, it quantifies feature importance for rankings,
+    which is fundamentally different from feature importance in classification or
+    regression. This framework is essential for understanding, auditing,
+    and improving algorithmic ranking systems in critical domains such as
+    hiring, education, and lending.
+
+    ShaRP extends the Quantitative Input Influence (QII) framework to compute feature
+    contributions to multiple ranking-specific Quantities of Interest (QoIs).
+    These QoIs include:
+    - Score: Contribution of features to an item's score.
+    - Rank: Impact of features on an item's rank.
+    - Top-k: Influence of features on whether an item appears in the top-k positions.
+    - Pairwise Preference: Contribution of features to the relative order between
+      two items.
+
+    ShaRP uses Shapley values, a cooperative game theory concept, to distribute
+    the "value" of a ranked outcome among the features. For each QoI, the class:
+    - Constructs feature coalitions by masking subsets of features.
+    - Evaluates the impact of these coalitions on the QoI using a payoff function.
+    - Aggregates the marginal contributions of features across all possible coalitions
+      to compute their Shapley values.
 
     This algorithm is an implementation of Shapley for Rankings and Preferences
     (ShaRP), as presented in [1]_.
 
-    If QoI is None, ``target_function`` and parameters ``X`` and ``y`` need to be passed.
-    if QoI is not None, ``target_function`` is ignored.
-
     Parameters
     ----------
-    estimator : ML classifier
-
-    qoi : Quantity of interest, default: "rank"
-
-    measure : measure used to estimate feature contributions (unary, set, banzhaf, etc.)
-
-    sample_size : amount of perturbations applied per data point
-
-    replace : Whether to sample with replacement
-
-    predict_method : estimator's function that provides inference
-
-    random_state : random seed
-
-    X : reference input
-
-    y : target
+    qoi : str, optional
+        The quantity of interest to compute feature contributions for. Options include:
+        - "score" : Contribution to an item's score.
+        - "rank" : Contribution to an item's rank.
+        - "top-k" : Contribution to whether an item appears in the top-k.
+        - "pairwise" : Contribution to the relative order between two items.
+        By default, ``fit()`` uses "rank".
+        If ``qoi`` is None, ``target_function`` and the parameters ``X`` and ``y``
+        need to be passed.
+
+    target_function : function, optional
+        A custom function defining the outcome of interest for the data.
+        Ignored if `qoi` is specified.
+
+    measure : str, default="shapley"
+        The method used to compute feature contributions. Options include:
+        - "set"
+        - "marginal"
+        - "shapley"
+        - "banzhaff"
+
+    sample_size : int, optional
+        The number of perturbations to apply per data point when calculating
+        feature importance. Default is `None`, which uses all available samples.
+
+    coalition_size : int, optional
+        The maximum size of feature coalitions to consider. Default is `None`,
+        which uses all features except one.
+
+    replace : bool, default=False
+        Whether to sample feature values with replacement during perturbation.
+
+    random_state : int, RandomState instance, or None, optional
+        Seed or random number generator for reproducibility. Default is `None`.
+
+    n_jobs : int, default=1
+        Number of jobs to run in parallel for computations. Use `-1` to use all
+        available processors.
+
+    verbose : int, default=0
+        Verbosity level. Use 0 for no output and higher numbers for more verbose output.
+
+    kwargs : dict, optional
+        Additional parameters such as:
+        - ``X`` : array-like, reference input data.
+        - ``y`` : array-like, target outcomes for the reference data.
 
     Notes
     -----