Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Analysis of feature importances #5

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions hela/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@

from .dataset import *
from .feature_importance import *
from .models import *
from .plot import *
from .utils import *
from .wpercentile import *
36 changes: 18 additions & 18 deletions dataset.py → hela/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,48 +8,48 @@

__all__ = ["Dataset", "load_dataset", "load_data_file"]

logger = logging.getLogger(__name__)

LOGGER = logging.getLogger(__name__)

Dataset = namedtuple("Dataset", ["training_x", "training_y",
"testing_x", "testing_y",
"names", "ranges", "colors"])


def load_data_file(data_file, num_features):

data = np.load(data_file)

if data.ndim == 1:
data = data[None, :]

x = data[:, :num_features]
y = data[:, num_features:]

return x, y


def load_dataset(dataset_file):

with open(dataset_file, "r") as f:
dataset_info = json.load(f)

metadata = dataset_info["metadata"]

base_path = os.path.dirname(dataset_file)

# Load training data
training_file = os.path.join(base_path, dataset_info["training_data"])
logger.debug("Loading training data from '{}'...".format(training_file))
training_x, training_y = load_data_file(training_file, metadata["num_features"])

training_file = os.path.join(base_path, dataset_info["training_data"])
LOGGER.debug("Loading training data from '%s'...", training_file)
training_x, training_y = load_data_file(training_file,
metadata["num_features"])

# Optionally, load testing data
testing_x, testing_y = None, None
if dataset_info["testing_data"] is not None:
testing_file = os.path.join(base_path, dataset_info["testing_data"])
logger.debug("Loading testing data from '{}'...".format(testing_file))
testing_x, testing_y = load_data_file(testing_file, metadata["num_features"])

LOGGER.debug("Loading testing data from '%s'...", testing_file)
testing_x, testing_y = load_data_file(testing_file,
metadata["num_features"])

return Dataset(training_x, training_y, testing_x, testing_y,
metadata["names"], metadata["ranges"], metadata["colors"])

183 changes: 183 additions & 0 deletions hela/feature_importance.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,183 @@

from collections import namedtuple
from multiprocessing import Pool
from typing import Optional, Tuple
import logging

import numpy as np

from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

from .models import _tree_weights
from .utils import tqdm

__all__ = [
'importances_per_output'
]

LOGGER = logging.getLogger(__name__)


def importances_per_output(
forest: RandomForestRegressor,
x: np.ndarray,
y: np.ndarray
) -> np.ndarray:
"""
Compute the feature importances with a breakdown per output variable (i.e.,
per label).

Parameters
----------
forest : RandomForestRegressor
A trained random forest from which feature importances will be
extracted.
x : array-like of shape (n_samples, n_features)
The input samples of the training dataset used to train `forest`.
y : array-like of shape (n_samples, n_outputs)

Returns
-------
importances : np.ndarray of shape (n_features, n_outputs)
The feature importances splitted according to the contributions to each
output variable. Summing along the columns of this array will provide
the global feature importances contained in
forest.feature_importances_.
"""

impurities = impurities_per_output(forest, x, y)
return importances_from_impurities(forest, impurities)


def importances_from_impurities(
forest: RandomForestRegressor,
impurities: np.ndarray
) -> np.ndarray:

LOGGER.info("Computing importances from impurities...")
with Pool(forest.n_jobs) as pool:
importances_ = list(tqdm(
pool.imap(_tree_importances_per_output,
zip(forest, impurities)),
total=len(forest)
))

importances = np.mean(importances_, axis=0)

return importances / importances.sum()


def _tree_importances_per_output(
args: Tuple[DecisionTreeRegressor, np.ndarray]
) -> np.ndarray:

tree, impurities = args

tree_ = tree.tree_
importances = np.zeros((tree_.n_features, tree_.n_outputs))

Node = namedtuple(
"Node",
['left', 'right', 'weighted_n_node_samples', 'feature', 'impurities']
)

for node in zip(tree_.children_left, tree_.children_right,
tree_.weighted_n_node_samples, tree_.feature, impurities):
node = Node(*node)
if node.left == -1:
continue

left = Node(
None, None,
tree_.weighted_n_node_samples[node.left],
tree_.feature[node.left],
impurities[node.left]
)
right = Node(
None, None,
tree_.weighted_n_node_samples[node.right],
tree_.feature[node.right], impurities[node.right]
)

importances[node.feature] += (
node.weighted_n_node_samples * node.impurities -
left.weighted_n_node_samples * left.impurities -
right.weighted_n_node_samples * right.impurities
)

return importances / importances.sum()


# Global variables to avoid pickling very large arrays with multiprocessing
_X = None
_Y = None


def impurities_per_output(
forest: RandomForestRegressor,
x: np.ndarray,
y: np.ndarray
) -> np.ndarray:

LOGGER.info("Computing impurities...")
global _X
global _Y
_X = x
_Y = y
with Pool(forest.n_jobs) as pool:
impurities = list(tqdm(
pool.imap(_tree_impurities_per_output, forest),
total=len(forest)
))

return np.array(impurities)


def _tree_impurities_per_output(tree: DecisionTreeRegressor) -> np.ndarray:

assert _X is not None and _Y is not None

tree_ = tree.tree_
x, y = _X, _Y

impurities = np.zeros((tree.tree_.node_count, y.shape[1]))
weights = _tree_weights(tree, len(x))

def __tree_impurities_per_output(node_idx, x, y, weights):

imp = _wvariance(y, weights, axis=0)
impurities[node_idx] = imp

if tree_.children_left[node_idx] == -1:
return

feature = tree_.feature[node_idx]
threshold = tree_.threshold[node_idx]
mask = x[:, feature] <= threshold
__tree_impurities_per_output(
tree_.children_left[node_idx],
x[mask],
y[mask],
weights[mask]
)
mask = np.logical_not(mask)
__tree_impurities_per_output(
tree_.children_right[node_idx],
x[mask],
y[mask],
weights[mask]
)

__tree_impurities_per_output(0, x, y, weights)
return impurities


def _wvariance(
data: np.ndarray,
weights: np.ndarray,
axis: Optional[int] = None
) -> np.ndarray:

avg = np.average(data, weights=weights, axis=axis)
return np.average((data - avg)**2, weights=weights, axis=axis)
14 changes: 5 additions & 9 deletions models.py → hela/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,7 @@
from sklearn.utils import check_random_state
from sklearn.preprocessing import MinMaxScaler

try:
from tqdm import tqdm
except ImportError:
def tqdm(x, *_, **__):
return x
from .utils import tqdm

__all__ = [
"Model",
Expand Down Expand Up @@ -75,14 +71,14 @@ def _scaler_fit(self, y):

self.scaler.fit(y)

def _scaler_transform(self, y):
def scaler_transform(self, y):
if y.ndim == 1:
y = y[:, None]
return self.scaler.transform(y)[:, 0]

return self.scaler.transform(y)

def _scaler_inverse_transform(self, y):
def scaler_inverse_transform(self, y):

if y.ndim == 1:
y = y[:, None]
Expand All @@ -92,7 +88,7 @@ def _scaler_inverse_transform(self, y):

def fit(self, x, y):
self._scaler_fit(y)
self.rf.fit(x, self._scaler_transform(y))
self.rf.fit(x, self.scaler_transform(y))

# Build the structures to quickly compute the posteriors
if self.enable_posterior:
Expand All @@ -105,7 +101,7 @@ def fit(self, x, y):

def predict(self, x):
pred = self.rf.predict(x)
return self._scaler_inverse_transform(pred)
return self.scaler_inverse_transform(pred)

def predict_median(self, x):
return self.predict_percentile(x, 50)
Expand Down
38 changes: 27 additions & 11 deletions plot.py → hela/plot.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,19 +9,15 @@
from sklearn import metrics, neighbors
from sklearn.preprocessing import MinMaxScaler

from models import resample_posterior
from wpercentile import wmedian

try:
from tqdm import tqdm
except ImportError:
def tqdm(x, *_, **__):
return x
from .models import resample_posterior
from .wpercentile import wmedian
from .utils import tqdm

__all__ = [
"predicted_vs_real",
"feature_importances",
"posterior_matrix"
'predicted_vs_real',
'feature_importances',
'stacked_feature_importances',
'posterior_matrix'
]

POSTERIOR_MAX_SIZE = 10000
Expand Down Expand Up @@ -96,6 +92,26 @@ def feature_importances(forests, names, colors):
return fig


def stacked_feature_importances(importances, names, colors):

bottoms = np.zeros(importances.shape[0])

ind = np.arange(len(importances))

fig = plt.figure()
ax = fig.gca()
for data_i, name_i, color_i in zip(importances.T, names, colors):
ax.bar(ind, data_i, bottom=bottoms, color=color_i, label=name_i)
bottoms += data_i

ax.set_xlabel("Feature index", fontsize=18)
ax.legend(fontsize=16)
ax.grid()

fig.tight_layout()
return fig


def posterior_matrix(posterior, names, ranges, colors, soft_colors=None):

samples, weights = posterior
Expand Down
Loading