Merge pull request #52 from akhynkokateryna/main
Improve documentation
joaopfonseca authored Nov 26, 2024
2 parents 10d29eb + 4ecd812 commit b4238a4
Showing 6 changed files with 143 additions and 28 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -11,6 +11,7 @@ __pycache__/
# Distribution / packaging
.Python
env/
.venv
build/
develop-eggs/
dist/
39 changes: 35 additions & 4 deletions examples/plot_group_explanations.py
@@ -15,14 +15,30 @@

plt.style.use("seaborn-v0_8-whitegrid")

#########################################################################################
# Let's start with data preparation
#############################################################################################
# Let's start with data preparation. For this example, we chose the ACSIncome dataset,
# one of five datasets created by Ding et al. as an improved alternative to the popular
# UCI Adult dataset. Data is provided for all 50 states and Puerto Rico. Its original
# purpose was to predict whether US working adults’ yearly income is above $50,000.
# The features are the following:
# - AGEP: age
# - COW: class of worker
# - SCHL: educational attainment
# - MAR: marital status
# - OCCP: occupation
# - POBP: place of birth
# - RELP: relationship
# - WKHP: usual hours worked per week
# - SEX: sex
# - RAC1P: recoded detailed race code


X, y = fetch_openml(
data_id=43141, parser="auto", return_X_y=True, read_csv_kwargs={"nrows": 150}
)

# Get the indices of samples that belong to each group
# Get the indices of samples that belong to each group.
# We will use SEX as the sensitive attribute.
adv_idx = X[X["SEX"] == 1].index
dis_idx = X[X["SEX"] == 2].index

@@ -64,7 +80,7 @@ def score_function(X):


#########################################################################################
# Let's take a look at contributions for both QOIs
# Let's compute contributions for the whole dataset

contributions = xai.all(X)

@@ -83,6 +99,17 @@ def score_function(X):

plt.show()


#########################################################################################
# On the plots, feature contributions are shown across different strata, e.g., for the
# top 20% of ranked individuals, for the whole dataset (i.e., `All`), and for both the
# advantaged and disadvantaged groups. For the advantaged group, age (`AGEP`) exhibits
# stable importance across all strata, while the contributions of hours worked (`WKHP`)
# and educational attainment (`SCHL`) increase as we move toward lower-ranked individuals.
# Conversely, for the disadvantaged group, the importance of `AGEP` decreases with lower
# ranks, while `WKHP` and `SCHL` remain relatively stable.
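
#########################################################################################
# Below is a small numeric counterpart to the reading above (a sketch only, assuming
# that ``contributions`` returned by ``xai.all(X)`` is row-aligned with ``X`` and that
# ``scores`` holds the score used for ranking, as in the plotting calls of this example).
# The quintile binning is an illustrative choice, not the library's own stratification.

import numpy as np
import pandas as pd

contrib_df = pd.DataFrame(np.asarray(contributions), columns=X.columns, index=X.index)
ranks = pd.Series(np.asarray(scores), index=X.index).rank(ascending=False)
strata = pd.qcut(ranks, q=5, labels=["top-20%", "20-40%", "40-60%", "60-80%", "bottom-20%"])

# Mean absolute contribution of each feature within each stratum, split by group
stratum_summary = (
    contrib_df.abs()
    .assign(stratum=strata, group=X["SEX"].map({1: "Male", 2: "Female"}))
    .groupby(["group", "stratum"], observed=True)
    .mean()
)
print(stratum_summary[["AGEP", "WKHP", "SCHL"]])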


#########################################################################################
# We can also compare contributions across groups overall:

@@ -91,3 +118,7 @@ def score_function(X):
X.loc[dis_idx, "Sex"] = "Female"
xai.plot.box(X, scores, contributions, group="Sex")
plt.show()

# We can see that for males the importance of `WKHP` is lower than for females, while
# `AGEP` also shows a positive contribution to the outcomes, indicating that older
# individuals are more likely to achieve favorable results.
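
#########################################################################################
# A minimal numeric counterpart to the box plot (again a sketch, assuming
# ``contributions`` is row-aligned with ``X``): signed mean contributions per group.
# Note that ``X`` now contains the extra string column "Sex" added above, which is
# excluded when naming the contribution columns.

import numpy as np
import pandas as pd

group_means = (
    pd.DataFrame(np.asarray(contributions), columns=X.columns.drop("Sex"), index=X.index)
    .assign(group=X["Sex"])
    .groupby("group")
    .mean()
)
print(group_means[["AGEP", "WKHP", "SCHL"]])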
4 changes: 4 additions & 0 deletions examples/plot_mixed_data.py
@@ -25,6 +25,10 @@
from sklearn.pipeline import make_pipeline
from sharp import ShaRP

#####################################################################################
# Let's get the data first. We will use the German Credit dataset, which classifies
# people described by a set of attributes as good or bad credit risks.

df = fetch_openml(data_id=31, parser="auto")["frame"]
df.head(5)
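
#####################################################################################
# The frame mixes categorical and numeric columns. As a sketch of one way such mixed
# data could be prepared for a scoring model (an illustration only, not the pipeline
# used later in this example), categorical columns can be one-hot encoded and numeric
# columns scaled:

from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

preprocessor = make_column_transformer(
    (OneHotEncoder(handle_unknown="ignore"), make_column_selector(dtype_include="category")),
    (StandardScaler(), make_column_selector(dtype_include="number")),
)
# ``preprocessor`` could then be combined with a scorer via ``make_pipeline`` (imported above).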

18 changes: 18 additions & 0 deletions examples/plot_population_analysis.py
@@ -24,6 +24,24 @@
# This will help make our visualizations look beautiful :)
plt.style.use("seaborn-v0_8-whitegrid")


#############################################################################################
# Let's start with data preparation. For this example, we chose the ACSIncome dataset,
# one of five datasets created by Ding et al. as an improved alternative to the popular
# UCI Adult dataset. Data is provided for all 50 states and Puerto Rico. Its original
# purpose was to predict whether US working adults’ yearly income is above $50,000.
# The features are the following:
# - AGEP: age
# - COW: class of worker
# - SCHL: educational attainment
# - MAR: marital status
# - OCCP: occupation
# - POBP: place of birth
# - RELP: relationship
# - WKHP: usual hours worked per week
# - SEX: sex
# - RAC1P: recoded detailed race code

X, y = fetch_openml(
data_id=43141, parser="auto", return_X_y=True, read_csv_kwargs={"nrows": 150}
)
18 changes: 16 additions & 2 deletions examples/plot_qoi_comparison.py
@@ -107,11 +107,13 @@ def csrank_score(X):
# Let's take a look at some contributions for both QOIs

contributions_rank = xai_rank.all(X)
contributions_rank[:5]
print(contributions_rank[:5])

contributions_score = xai_score.all(X)
contributions_score[:5]
print(contributions_score[:5])


#############################################################################################
# Now let's draw waterfall plots for different universities and check whether the
# results for `score` and `rank` differ
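
#############################################################################################
# Before the example's own waterfall plots below, here is a generic waterfall-style
# sketch for a single item (an illustration only; it assumes ``contributions_rank`` is
# row-aligned with ``X`` and uses plain matplotlib rather than sharp's plotting helpers):

import numpy as np
import matplotlib.pyplot as plt

item = 0  # index of the university to inspect
vals = np.asarray(contributions_rank)[item]
order = np.argsort(-np.abs(vals))
cumulative = np.concatenate([[0.0], np.cumsum(vals[order])])

fig, ax = plt.subplots(figsize=(6, 3))
ax.bar(
    range(len(vals)),
    vals[order],
    bottom=cumulative[:-1],
    color=np.where(vals[order] >= 0, "tab:green", "tab:red"),
)
ax.set_xticks(range(len(vals)), labels=np.array(X.columns)[order], rotation=45, ha="right")
ax.set_ylabel("Cumulative contribution (rank QoI)")
plt.tight_layout()
plt.show()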

@@ -182,3 +184,15 @@ def csrank_score(X):

plt.tight_layout()
plt.show()


#############################################################################################
# As a result, we see that for each university the feature importances for the two QoIs
# can be very similar or differ substantially. For Stanford University, the feature
# `Systems` is the most impactful for `rank`, followed by `Interdisciplinary`, while for
# `score` the order is reversed. For the University of Texas, the features have the same
# order of importance for both QoIs, and all of them contribute to improving the
# university's `score`/`rank`. In contrast, for Indiana University, the ordering of
# features by importance for the `rank` QoI matches Stanford University's, but for
# `score` the features `AI` and `Theory` contribute negatively, with `AI` being the most
# impactful feature.
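
#############################################################################################
# A small numeric check of this comparison (a sketch; it assumes the two contribution
# arrays are row-aligned with ``X``): the per-item cosine similarity between rank-based
# and score-based contribution vectors highlights the universities where the two QoIs
# disagree most.

import numpy as np

rank_c = np.asarray(contributions_rank)
score_c = np.asarray(contributions_score)
cosine = (rank_c * score_c).sum(axis=1) / (
    np.linalg.norm(rank_c, axis=1) * np.linalg.norm(score_c, axis=1) + 1e-12
)
print("Items where rank- and score-based contributions agree least:", np.argsort(cosine)[:5])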
91 changes: 69 additions & 22 deletions sharp/base.py
@@ -12,34 +12,81 @@

class ShaRP(BaseEstimator):
"""
Explains the contributions of features to different aspects of a ranked outcome,
based on Shapley values.
The ShaRP (Shapley for Rankings and Preferences) class provides a novel framework
for explaining the contributions of features to various aspects of ranked
outcomes. Built on Shapley values, it quantifies feature importance for rankings,
which is fundamentally different from feature importance in classification or
regression. This framework is essential for understanding, auditing,
and improving algorithmic ranking systems in critical domains such as
hiring, education, and lending.

ShaRP extends the Quantitative Input Influence (QII) framework to compute feature
contributions to multiple ranking-specific Quantities of Interest (QoIs).
These QoIs include:
- Score: Contribution of features to an item's score.
- Rank: Impact of features on an item's rank.
- Top-k: Influence of features on whether an item appears in the top-k positions.
- Pairwise Preference: Contribution of features to the relative order between
two items.

ShaRP uses Shapley values, a cooperative game theory concept, to distribute
the "value" of a ranked outcome among the features. For each QoI, the class:
- Constructs feature coalitions by masking subsets of features.
- Evaluates the impact of these coalitions on the QoI using a payoff function.
- Aggregates the marginal contributions of features across all possible coalitions
to compute their Shapley values.
This algorithm is an implementation of Shapley for Rankings and Preferences (ShaRP),
as presented in [1]_.

If QoI is None, ``target_function`` and parameters ``X`` and ``y`` need to be passed.
If QoI is not None, ``target_function`` is ignored.

Parameters
----------
estimator : ML classifier
qoi : Quantity of interest, default: "rank"
measure : measure used to estimate feature contributions (unary, set, banzhaf, etc.)
sample_size : amount of perturbations applied per data point
replace : Whether to sample with replacement
predict_method : estimator's function that provides inference
random_state : random seed
X : reference input
y : target
qoi : str, optional
The quantity of interest to compute feature contributions for. Options include:
- "score" : Contribution to an item's score.
- "rank" : Contribution to an item's rank.
- "top-k" : Contribution to whether an item appears in the top-k.
- "pairwise" : Contribution to the relative order between two items.
By default, in method ``fit()``, "rank" will be used.
If QoI is None, ``target_function`` and parameters ``X`` and ``y``
need to be passed.
target_function : function, optional
A custom function defining the outcome of interest for the data.
Ignored if `qoi` is specified.
measure : str, default="shapley"
The method used to compute feature contributions. Options include:
- "set"
- "marginal"
- "shapley"
- "banzhaff"
sample_size : int, optional
The number of perturbations to apply per data point when calculating
feature importance. Default is `None`, which uses all available samples.
coalition_size : int, optional
The maximum size of feature coalitions to consider. Default is `None`,
which uses all features except one.
replace : bool, default=False
Whether to sample feature values with replacement during perturbation.
random_state : int, RandomState instance, or None, optional
Seed or random number generator for reproducibility. Default is `None`.
n_jobs : int, default=1
Number of jobs to run in parallel for computations. Use `-1` to use all
available processors.
verbose : int, default=0
Verbosity level. Use 0 for no output and higher numbers for more verbose output.
kwargs : dict, optional
Additional parameters such as:
- ``X`` : array-like, reference input data.
- ``y`` : array-like, target outcomes for the reference data.

Notes
-----
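
A minimal usage sketch based on the constructor parameters documented above (the toy data
and scoring function are placeholders, and the exact call pattern follows the bundled
examples rather than a guaranteed API):

import numpy as np
from sharp import ShaRP

rng = np.random.default_rng(0)
X = rng.normal(size=(50, 3))  # toy feature matrix, illustration only (the examples use DataFrames)

def score_function(X):
    # toy linear scorer used only for this sketch
    return X @ np.array([0.5, 0.3, 0.2])

xai = ShaRP(
    qoi="rank",
    target_function=score_function,
    measure="shapley",
    sample_size=None,
    random_state=42,
)
xai.fit(X)
contributions = xai.all(X)  # per-feature contributions to each item's rank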
