DOC add example for mixed data types (#22)
1 parent dcd3063 · commit 7a8a5e5
Showing 1 changed file with 94 additions and 0 deletions.
@@ -0,0 +1,94 @@
""" | ||
ShaRP for classification on large datasets with mixed data types | ||
================================================================ | ||
This example showcases a more complex setting, where we will develop and interpret a | ||
classification model using a larger dataset with both categorical and continuous | ||
features. | ||
``sharp`` is designed to operate over the unprocessed input space, to ensure every | ||
"Frankenstein" point generated to compute feature contributions are plausible. This means | ||
that the function producing the scores (or class predictions) should take as input the | ||
raw dataset, and every preprocessing step leading to the black box predictions/scores | ||
should be included within it. | ||
We will start by downloading the German Credit dataset. | ||
""" | ||

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sharp import ShaRP

sns.set()

df = fetch_openml(data_id=31, parser="auto")["frame"]
df.head(5)
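
######################################################################################
# The "mixed data types" in the title are visible directly in the dataframe: several
# columns are ``category`` and the rest are numerical. Inspecting the dtypes (plain
# pandas, nothing ``sharp``-specific) confirms this:

df.dtypes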

#######################################################################
# Split ``X`` and ``y`` (input and target) from ``df``, flag which
# columns are categorical, and then split train and test:

X = df.drop(columns="class")
y = df["class"]

categorical_features = X.dtypes.apply(
    lambda dtype: isinstance(dtype, pd.CategoricalDtype)
).values
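
######################################################################################
# An equivalent way to build this mask (a sketch using plain pandas; the hypothetical
# ``categorical_features_alt`` holds the same boolean array, in column order):

categorical_features_alt = X.columns.isin(
    X.select_dtypes(include="category").columns
)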

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42
)

#########################################################################################
# Now we will set up the model. Here, we use a pipeline to combine all the preprocessing
# steps. However, to use ``sharp``, it is also sufficient to pass any function
# (containing all the preprocessing steps) that takes a numpy array as input and outputs
# the model's predictions; a minimal sketch of that alternative follows the pipeline
# below.

transformer = ColumnTransformer(
    transformers=[
        ("onehot", OneHotEncoder(sparse_output=False), categorical_features),
        ("minmax", MinMaxScaler(), ~categorical_features),
    ],
    remainder="passthrough",
    n_jobs=-1,
)
classifier = LogisticRegression(random_state=42)
model = make_pipeline(transformer, classifier)
model.fit(X_train.values, y_train.values)
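
#########################################################################################
# As noted above, a plain function works in place of the pipeline. A minimal sketch
# (``predict_from_raw`` is a name introduced here for illustration, not part of the
# ``sharp`` API):


def predict_from_raw(X_raw):
    # All preprocessing happens inside the fitted pipeline, so raw (unprocessed) rows
    # can be passed straight through; ``sharp`` only needs this raw-in,
    # predictions-out interface.
    return model.predict(X_raw)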

#########################################################################################
# We can now use ``sharp`` to explain our model's predictions! If the dataset is too
# large, we have a few options to reduce computational complexity, such as configuring
# the ``n_jobs`` parameter, setting a value for ``sample_size`` (sketched after the fit
# below), or setting ``measure="unary"``.

xai = ShaRP(
    qoi="flip",
    target_function=model.predict,
    measure="unary",
    sample_size=None,
    random_state=42,
    n_jobs=-1,
    verbose=1,
)
xai.fit(X_test)
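
#########################################################################################
# A sketch of the cheaper configuration mentioned above: the same ``ShaRP`` arguments,
# only with the background sample capped (100 is an arbitrary illustrative value, not a
# recommendation):

xai_sampled = ShaRP(
    qoi="flip",
    target_function=model.predict,
    measure="unary",
    sample_size=100,
    random_state=42,
    n_jobs=-1,
)
xai_sampled.fit(X_test)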

unary_values = pd.DataFrame(xai.all(X_test), columns=X.columns)
unary_values
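
######################################################################################
# To see which features matter most on average, we can rank the mean unary
# contributions (plain pandas, nothing ``sharp``-specific):

unary_values.mean().sort_values(ascending=False)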

##############################################################
# Finally, we can plot the mean contributions of each feature:

fig, ax = plt.subplots()
xai.plot.bar(unary_values.mean(), ax=ax)
ax.set_ylim(bottom=0)
ax.tick_params(labelrotation=90)
fig.tight_layout()
plt.show()