def permuted_feature_importance(model,
                                x,
                                y,
                                refit=True,
                                metric=None):
    """
    Calculates feature importance by randomly permuting features and comparing result to baseline.

    Each column of ``x`` is shuffled in turn (all other columns untouched) and the model is
    re-scored. The importance of a column is the baseline score minus the score obtained
    with that column shuffled. Shuffling is done on an internal copy, so the DataFrame passed
    in by the caller is never modified - not even if scoring raises part-way through.

    :param model:
        A sklearn-compatible estimator exposing ``fit`` and ``score`` (and ``predict`` when
        ``metric`` is given)
    :param x:
        DataFrame of features
    :param y:
        Target values matching the rows of ``x``
    :param refit:
        If True, fit the model on ``x`` and ``y`` before calculating the baseline score
    :param metric:
        Optional callable ``metric(y_true, y_pred)`` used to score predictions.
        When None, ``model.score(x, y)`` is used. Importance is always
        ``baseline - permuted``, so for a loss-style metric (lower is better) the sign of the
        result is inverted.
    :return:
        DataFrame indexed by column name with a single ``importance`` column,
        sorted by ascending importance
    :raises TypeError:
        If ``metric`` is neither None nor callable
    """
    # Validate before the (potentially expensive) refit.
    if metric is not None and not callable(metric):
        raise TypeError('metric must be None or a callable taking (y_true, y_pred)')

    if refit:
        model.fit(x, y)

    def score(features):
        if metric is None:
            return model.score(features, y)
        return metric(y, model.predict(features))

    baseline_score = score(x)

    # One working copy is reused for every column; the caller's frame stays read-only.
    shuffled = x.copy()
    importances = {}

    for column in x.columns:
        shuffled[column] = np.random.permutation(x[column])
        importances[column] = baseline_score - score(shuffled)
        # Put the pristine values back so only one column is permuted at a time.
        shuffled[column] = x[column]

    importance_df = pd.DataFrame.from_dict(importances, orient='index', columns=['importance'])
    return importance_df.sort_values(by='importance')