Prediction of the best-fit model #6

Open: wants to merge 7 commits into master
8 changes: 8 additions & 0 deletions hela/__init__.py
@@ -0,0 +1,8 @@

from .dataset import *
from .feature_importance import *
from .models import *
from .posterior import *
from .plot import *
from .utils import *
from .wpercentile import *
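
These wildcard imports re-export each submodule's public names at the package level, so downstream code only needs a single import. A minimal sketch of what this makes available from the package root, limited to names visible in this PR:

import hela

# Both names resolve at the package root thanks to the wildcard imports above.
print(hela.load_dataset)             # defined in hela/dataset.py
print(hela.importances_per_output)   # defined in hela/feature_importance.py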
40 changes: 20 additions & 20 deletions dataset.py → hela/dataset.py
@@ -8,48 +8,48 @@

 __all__ = ["Dataset", "load_dataset", "load_data_file"]

-logger = logging.getLogger(__name__)
+LOGGER = logging.getLogger(__name__)

 Dataset = namedtuple("Dataset", ["training_x", "training_y",
                                  "testing_x", "testing_y",
                                  "names", "ranges", "colors"])


 def load_data_file(data_file, num_features):

     data = np.load(data_file)

     if data.ndim == 1:
         data = data[None, :]

     x = data[:, :num_features]
     y = data[:, num_features:]

     return x, y


-def load_dataset(dataset_file):
+def load_dataset(dataset_file, load_testing_data=True):

     with open(dataset_file, "r") as f:
         dataset_info = json.load(f)

     metadata = dataset_info["metadata"]

     base_path = os.path.dirname(dataset_file)

     # Load training data
-    training_file = os.path.join(base_path, dataset_info["training_data"])
-    logger.debug("Loading training data from '{}'...".format(training_file))
-    training_x, training_y = load_data_file(training_file, metadata["num_features"])
+    training_file = os.path.join(base_path, dataset_info["training_data"])
+    LOGGER.debug("Loading training data from '%s'...", training_file)
+    training_x, training_y = load_data_file(training_file,
+                                            metadata["num_features"])

     # Optionally, load testing data
     testing_x, testing_y = None, None
-    if dataset_info["testing_data"] is not None:
+    if load_testing_data and dataset_info["testing_data"] is not None:
         testing_file = os.path.join(base_path, dataset_info["testing_data"])
-        logger.debug("Loading testing data from '{}'...".format(testing_file))
-        testing_x, testing_y = load_data_file(testing_file, metadata["num_features"])
+        LOGGER.debug("Loading testing data from '%s'...", testing_file)
+        testing_x, testing_y = load_data_file(testing_file,
+                                              metadata["num_features"])

     return Dataset(training_x, training_y, testing_x, testing_y,
                    metadata["names"], metadata["ranges"], metadata["colors"])

183 changes: 183 additions & 0 deletions hela/feature_importance.py
@@ -0,0 +1,183 @@

from collections import namedtuple
from multiprocessing import Pool
from typing import Optional, Tuple
import logging

import numpy as np

from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

from .models import _tree_weights
from .utils import tqdm

__all__ = [
    'importances_per_output'
]

LOGGER = logging.getLogger(__name__)


def importances_per_output(
        forest: RandomForestRegressor,
        x: np.ndarray,
        y: np.ndarray
) -> np.ndarray:
    """
    Compute the feature importances with a breakdown per output variable
    (i.e., per label).

    Parameters
    ----------
    forest : RandomForestRegressor
        A trained random forest from which feature importances will be
        extracted.
    x : array-like of shape (n_samples, n_features)
        The input samples of the training dataset used to train `forest`.
    y : array-like of shape (n_samples, n_outputs)
        The output values (labels) of the training dataset used to train
        `forest`.

    Returns
    -------
    importances : np.ndarray of shape (n_features, n_outputs)
        The feature importances split according to their contributions to
        each output variable. Summing this array over its output axis
        (axis=1) recovers the global feature importances contained in
        forest.feature_importances_.
    """

    impurities = impurities_per_output(forest, x, y)
    return importances_from_impurities(forest, impurities)


def importances_from_impurities(
        forest: RandomForestRegressor,
        impurities: np.ndarray
) -> np.ndarray:

    LOGGER.info("Computing importances from impurities...")
    with Pool(forest.n_jobs) as pool:
        importances_ = list(tqdm(
            pool.imap(_tree_importances_per_output,
                      zip(forest, impurities)),
            total=len(forest)
        ))

    importances = np.mean(importances_, axis=0)

    return importances / importances.sum()


def _tree_importances_per_output(
        args: Tuple[DecisionTreeRegressor, np.ndarray]
) -> np.ndarray:

    tree, impurities = args

    tree_ = tree.tree_
    importances = np.zeros((tree_.n_features, tree_.n_outputs))

    Node = namedtuple(
        "Node",
        ['left', 'right', 'weighted_n_node_samples', 'feature', 'impurities']
    )

    for node in zip(tree_.children_left, tree_.children_right,
                    tree_.weighted_n_node_samples, tree_.feature, impurities):
        node = Node(*node)
        if node.left == -1:
            continue

        left = Node(
            None, None,
            tree_.weighted_n_node_samples[node.left],
            tree_.feature[node.left],
            impurities[node.left]
        )
        right = Node(
            None, None,
            tree_.weighted_n_node_samples[node.right],
            tree_.feature[node.right], impurities[node.right]
        )

        importances[node.feature] += (
            node.weighted_n_node_samples * node.impurities -
            left.weighted_n_node_samples * left.impurities -
            right.weighted_n_node_samples * right.impurities
        )

    return importances / importances.sum()


# Global variables to avoid pickling very large arrays with multiprocessing
_X = None
_Y = None


def impurities_per_output(
        forest: RandomForestRegressor,
        x: np.ndarray,
        y: np.ndarray
) -> np.ndarray:

    LOGGER.info("Computing impurities...")
    global _X
    global _Y
    _X = x
    _Y = y
    with Pool(forest.n_jobs) as pool:
        impurities = list(tqdm(
            pool.imap(_tree_impurities_per_output, forest),
            total=len(forest)
        ))

    return np.array(impurities)


def _tree_impurities_per_output(tree: DecisionTreeRegressor) -> np.ndarray:

    assert _X is not None and _Y is not None

    tree_ = tree.tree_
    x, y = _X, _Y

    impurities = np.zeros((tree.tree_.node_count, y.shape[1]))
    weights = _tree_weights(tree, len(x))

    def __tree_impurities_per_output(node_idx, x, y, weights):

        imp = _wvariance(y, weights, axis=0)
        impurities[node_idx] = imp

        if tree_.children_left[node_idx] == -1:
            return

        feature = tree_.feature[node_idx]
        threshold = tree_.threshold[node_idx]
        mask = x[:, feature] <= threshold
        __tree_impurities_per_output(
            tree_.children_left[node_idx],
            x[mask],
            y[mask],
            weights[mask]
        )
        mask = np.logical_not(mask)
        __tree_impurities_per_output(
            tree_.children_right[node_idx],
            x[mask],
            y[mask],
            weights[mask]
        )

    __tree_impurities_per_output(0, x, y, weights)
    return impurities


def _wvariance(
        data: np.ndarray,
        weights: np.ndarray,
        axis: Optional[int] = None
) -> np.ndarray:

    avg = np.average(data, weights=weights, axis=axis)
    return np.average((data - avg)**2, weights=weights, axis=axis)
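
For context, a minimal end-to-end sketch of how the new module might be used with the rest of the package; the dataset path and forest hyperparameters below are illustrative assumptions, not part of this PR:

import numpy as np
from sklearn.ensemble import RandomForestRegressor

import hela

# Hypothetical dataset description file in the hela JSON format.
dataset = hela.load_dataset("example_dataset/dataset.json", load_testing_data=False)

# n_jobs is passed straight to multiprocessing.Pool above, so keep it a
# positive integer rather than -1.
forest = RandomForestRegressor(n_estimators=100, n_jobs=4)
forest.fit(dataset.training_x, dataset.training_y)

# Per-output importances: shape (n_features, n_outputs).
imp = hela.importances_per_output(forest, dataset.training_x, dataset.training_y)

# Per the docstring, summing over the output axis should approximately
# recover the global importances in forest.feature_importances_.
print(imp.sum(axis=1))
print(forest.feature_importances_)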