diff --git a/hela/__init__.py b/hela/__init__.py new file mode 100644 index 0000000..844990f --- /dev/null +++ b/hela/__init__.py @@ -0,0 +1,8 @@ + +from .dataset import * +from .feature_importance import * +from .models import * +from .posterior import * +from .plot import * +from .utils import * +from .wpercentile import * diff --git a/dataset.py b/hela/dataset.py similarity index 65% rename from dataset.py rename to hela/dataset.py index 727db15..5be73a5 100644 --- a/dataset.py +++ b/hela/dataset.py @@ -8,8 +8,7 @@ __all__ = ["Dataset", "load_dataset", "load_data_file"] -logger = logging.getLogger(__name__) - +LOGGER = logging.getLogger(__name__) Dataset = namedtuple("Dataset", ["training_x", "training_y", "testing_x", "testing_y", @@ -17,39 +16,40 @@ def load_data_file(data_file, num_features): - + data = np.load(data_file) - + if data.ndim == 1: data = data[None, :] - + x = data[:, :num_features] y = data[:, num_features:] - + return x, y -def load_dataset(dataset_file): - +def load_dataset(dataset_file, load_testing_data=True): + with open(dataset_file, "r") as f: dataset_info = json.load(f) - + metadata = dataset_info["metadata"] - + base_path = os.path.dirname(dataset_file) - + # Load training data - training_file = os.path.join(base_path, dataset_info["training_data"]) - logger.debug("Loading training data from '{}'...".format(training_file)) - training_x, training_y = load_data_file(training_file, metadata["num_features"]) - + training_file = os.path.join(base_path, dataset_info["training_data"]) + LOGGER.debug("Loading training data from '%s'...", training_file) + training_x, training_y = load_data_file(training_file, + metadata["num_features"]) + # Optionally, load testing data testing_x, testing_y = None, None - if dataset_info["testing_data"] is not None: + if load_testing_data and dataset_info["testing_data"] is not None: testing_file = os.path.join(base_path, dataset_info["testing_data"]) - logger.debug("Loading testing data from '{}'...".format(testing_file)) - testing_x, testing_y = load_data_file(testing_file, metadata["num_features"]) - + LOGGER.debug("Loading testing data from '%s'...", testing_file) + testing_x, testing_y = load_data_file(testing_file, + metadata["num_features"]) + return Dataset(training_x, training_y, testing_x, testing_y, metadata["names"], metadata["ranges"], metadata["colors"]) - diff --git a/hela/feature_importance.py b/hela/feature_importance.py new file mode 100644 index 0000000..1f1537e --- /dev/null +++ b/hela/feature_importance.py @@ -0,0 +1,183 @@ + +from collections import namedtuple +from multiprocessing import Pool +from typing import Optional, Tuple +import logging + +import numpy as np + +from sklearn.ensemble import RandomForestRegressor +from sklearn.tree import DecisionTreeRegressor + +from .models import _tree_weights +from .utils import tqdm + +__all__ = [ + 'importances_per_output' +] + +LOGGER = logging.getLogger(__name__) + + +def importances_per_output( + forest: RandomForestRegressor, + x: np.ndarray, + y: np.ndarray + ) -> np.ndarray: + """ + Compute the feature importances with a breakdown per output variable (i.e., + per label). + + Parameters + ---------- + forest : RandomForestRegressor + A trained random forest from which feature importances will be + extracted. + x : array-like of shape (n_samples, n_features) + The input samples of the training dataset used to train `forest`. 
+ y : array-like of shape (n_samples, n_outputs) + The output samples (labels) used to train `forest`. + + Returns + ------- + importances : np.ndarray of shape (n_features, n_outputs) + The feature importances split according to the contributions to each + output variable. Summing this array over its columns (i.e., over the + output variables) gives the global feature importances contained in + forest.feature_importances_. + """ + + impurities = impurities_per_output(forest, x, y) + return importances_from_impurities(forest, impurities) + + +def importances_from_impurities( + forest: RandomForestRegressor, + impurities: np.ndarray + ) -> np.ndarray: + + LOGGER.info("Computing importances from impurities...") + with Pool(forest.n_jobs) as pool: + importances_ = list(tqdm( + pool.imap(_tree_importances_per_output, + zip(forest, impurities)), + total=len(forest) + )) + + importances = np.mean(importances_, axis=0) + + return importances / importances.sum() + + +def _tree_importances_per_output( + args: Tuple[DecisionTreeRegressor, np.ndarray] + ) -> np.ndarray: + + tree, impurities = args + + tree_ = tree.tree_ + importances = np.zeros((tree_.n_features, tree_.n_outputs)) + + Node = namedtuple( + "Node", + ['left', 'right', 'weighted_n_node_samples', 'feature', 'impurities'] + ) + + for node in zip(tree_.children_left, tree_.children_right, + tree_.weighted_n_node_samples, tree_.feature, impurities): + node = Node(*node) + if node.left == -1: + continue + + left = Node( + None, None, + tree_.weighted_n_node_samples[node.left], + tree_.feature[node.left], + impurities[node.left] + ) + right = Node( + None, None, + tree_.weighted_n_node_samples[node.right], + tree_.feature[node.right], impurities[node.right] + ) + + importances[node.feature] += ( + node.weighted_n_node_samples * node.impurities - + left.weighted_n_node_samples * left.impurities - + right.weighted_n_node_samples * right.impurities + ) + + return importances / importances.sum() + + +# Global variables to avoid pickling very large arrays with multiprocessing +_X = None +_Y = None + + +def impurities_per_output( + forest: RandomForestRegressor, + x: np.ndarray, + y: np.ndarray + ) -> np.ndarray: + + LOGGER.info("Computing impurities...") + global _X + global _Y + _X = x + _Y = y + with Pool(forest.n_jobs) as pool: + impurities = list(tqdm( + pool.imap(_tree_impurities_per_output, forest), + total=len(forest) + )) + + return np.array(impurities) + + +def _tree_impurities_per_output(tree: DecisionTreeRegressor) -> np.ndarray: + + assert _X is not None and _Y is not None + + tree_ = tree.tree_ + x, y = _X, _Y + + impurities = np.zeros((tree.tree_.node_count, y.shape[1])) + weights = _tree_weights(tree, len(x)) + + def __tree_impurities_per_output(node_idx, x, y, weights): + + imp = _wvariance(y, weights, axis=0) + impurities[node_idx] = imp + + if tree_.children_left[node_idx] == -1: + return + + feature = tree_.feature[node_idx] + threshold = tree_.threshold[node_idx] + mask = x[:, feature] <= threshold + __tree_impurities_per_output( + tree_.children_left[node_idx], + x[mask], + y[mask], + weights[mask] + ) + mask = np.logical_not(mask) + __tree_impurities_per_output( + tree_.children_right[node_idx], + x[mask], + y[mask], + weights[mask] + ) + + __tree_impurities_per_output(0, x, y, weights) + return impurities + + +def _wvariance( + data: np.ndarray, + weights: np.ndarray, + axis: Optional[int] = None + ) -> np.ndarray: + + avg = np.average(data, weights=weights, axis=axis) + return np.average((data - avg)**2, weights=weights, axis=axis) diff --git a/hela/models.py b/hela/models.py new file mode 100644
index 0000000..bd0c8b8 --- /dev/null +++ b/hela/models.py @@ -0,0 +1,288 @@ + +import logging +from typing import Optional, Union, Dict, Iterable, List + +import numpy as np + +from sklearn import ensemble +from sklearn.base import BaseEstimator +from sklearn.utils import check_random_state +from sklearn.preprocessing import MinMaxScaler + +from .utils import tqdm +from .posterior import Posterior, posterior_percentile + +__all__ = [ + "Model", +] + +LOGGER = logging.getLogger(__name__) + + +class Model(BaseEstimator): + + def __init__( + self, + n_estimators: int = 1000, + criterion: str = 'mse', + max_features: Union[str, int, float] = 'sqrt', + min_impurity_decrease: float = 0.0, + bootstrap: bool = True, + n_jobs: Optional[int] = None, + random_state: Union[int, np.random.RandomState, None] = None, + verbose: int = 0, + max_samples: Union[float, int, None] = None, + enable_posterior: bool = True + ): + + scaler = MinMaxScaler(feature_range=(0, 100)) + self.random_forest = ensemble.RandomForestRegressor( + n_estimators=n_estimators, + criterion=criterion, + max_features=max_features, + min_impurity_decrease=min_impurity_decrease, + bootstrap=bootstrap, + oob_score=bootstrap, # the OOB score is reported by rfretrieval.main_train + n_jobs=n_jobs, + random_state=random_state, + verbose=verbose, + max_samples=max_samples + ) + + # Store the constructor arguments as attributes so that + # BaseEstimator.get_params() and sklearn.base.clone() work, e.g. when + # this model is wrapped in a MultiOutputRegressor. + self.n_estimators = n_estimators + self.criterion = criterion + self.max_features = max_features + self.min_impurity_decrease = min_impurity_decrease + self.bootstrap = bootstrap + self.n_jobs = n_jobs + self.random_state = random_state + self.verbose = verbose + self.max_samples = max_samples + + self.scaler = scaler + + # To compute the posteriors + self.enable_posterior = enable_posterior + self.data_leaves = None + self.data_weights = None + self.data_y = None + + def _scaler_fit(self, y): + if y.ndim == 1: + y = y[:, None] + + self.scaler.fit(y) + + def scaler_transform(self, y: np.ndarray) -> np.ndarray: + if y.ndim == 1: + y = y[:, None] + return self.scaler.transform(y)[:, 0] + + return self.scaler.transform(y) + + def scaler_inverse_transform(self, y: np.ndarray) -> np.ndarray: + + if y.ndim == 1: + y = y[:, None] + # return self.scaler.inverse_transform(y)[:, 0] + + return self.scaler.inverse_transform(y) + + def fit(self, x: np.ndarray, y: np.ndarray) -> None: + self._scaler_fit(y) + self.random_forest.fit(x, self.scaler_transform(y)) + + # Build the structures to quickly compute the posteriors + if self.enable_posterior: + data_leaves = self.random_forest.apply(x).T + self.data_leaves = _as_smallest_udtype(data_leaves) + self.data_weights = np.array( + [_tree_weights(tree, len(y)) for tree in self.random_forest] + ) + self.data_y = y + + def predict(self, x: np.ndarray) -> np.ndarray: + pred = self.random_forest.predict(x) + return self.scaler_inverse_transform(pred) + + def predict_median( + self, + x: np.ndarray, + prior_samples: Optional[np.ndarray] = None + ) -> np.ndarray: + + return self.predict_percentile(x, 50, prior_samples) + + def predict_percentile( + self, + x: np.ndarray, + percentile: Union[float, Iterable[float]], + prior_samples: Optional[np.ndarray] = None + ) -> np.ndarray: + """ + Equivalent to computing the posterior for each element of `x` and then + calling `posterior_percentile`. However, this is usually more efficient + than the naive approach when `x` has many elements. + """ + + if not self.enable_posterior: + raise ValueError("Cannot compute posteriors with this model. 
" + "Set `enable_posterior` to True to enable " + "posterior computation.") + + # Find the leaves for the query points + leaves_x = self.random_forest.apply(x) + + if prior_samples is None: + prior_samples = self.data_y + + if len(x) > self.random_forest.n_estimators: + # If there are many queries, + # it is faster to find points using a cache + return _posterior_percentile_cache( + self.data_leaves, self.data_weights, + prior_samples, leaves_x, percentile + ) + + # For few queries, it is faster if we just compute the posterior + # for each element + return _posterior_percentile_nocache( + self.data_leaves, self.data_weights, + prior_samples, leaves_x, percentile + ) + + def posterior( + self, + x: np.ndarray, + prior_samples: Optional[np.ndarray] = None + ) -> Posterior: + + if not self.enable_posterior: + raise ValueError("Cannot compute posteriors with this model. " + "Set `enable_posterior` to True to enable " + "posterior computation.") + + if x.ndim > 1: + raise ValueError("x.ndim must be 1") + + leaves_x = self.random_forest.apply(x[None, :])[0] + + if prior_samples is None: + prior_samples = self.data_y + + return _posterior( + self.data_leaves, self.data_weights, + prior_samples, leaves_x + ) + + +def _posterior( + data_leaves: np.ndarray, + data_weights: np.ndarray, + data_y: np.ndarray, + query_leaves: np.ndarray + ) -> Posterior: + + weights_x = (query_leaves[:, None] == data_leaves) * data_weights + weights_x = weights_x.sum(0) + + # Remove samples with weight zero + mask = weights_x != 0 + samples = data_y[mask] + weights = weights_x[mask].astype(np.int_) + + return Posterior(samples, weights) + + +def _posterior_percentile_nocache( + data_leaves: np.ndarray, + data_weights: np.ndarray, + prior_samples: np.ndarray, + query_leaves: np.ndarray, + percentile: Union[float, Iterable[float]] + ) -> np.ndarray: + + values = [] + + LOGGER.info("Computing percentiles...") + # This can be parallelized with multiprocessing. + for leaves_x_i in tqdm(query_leaves): + posterior = _posterior( + data_leaves, data_weights, + prior_samples, leaves_x_i + ) + value = posterior_percentile(posterior, percentile) + values.append(value) + + return np.array(values) + + +def _posterior_percentile_cache( + data_leaves: np.ndarray, + data_weights: np.ndarray, + prior_samples: np.ndarray, + query_leaves: np.ndarray, + percentile: Union[float, Iterable[float]] + ) -> np.ndarray: + + # Build a dictionary for fast access of the contents of the leaves. + LOGGER.info("Building cache...") + cache = [ + _build_leaves_cache(leaves_i, weights_i) + for leaves_i, weights_i in zip(data_leaves, data_weights) + ] + + values = [] + # Check the contents of the leaves in leaves_x + LOGGER.info("Computing percentiles...") + # This can be parallelized with multiprocessing. 
+ for leaves_x_i in tqdm(query_leaves): + indices: List[int] = [] + weights: List[int] = [] + for tree, leaves_x_i_j in enumerate(leaves_x_i): + cur_indices = cache[tree][0][leaves_x_i_j] + cur_weights = cache[tree][1][leaves_x_i_j] + indices.extend(cur_indices) + weights.extend(cur_weights) + + posterior = Posterior(prior_samples[indices], weights) + value = posterior_percentile(posterior, percentile) + values.append(value) + + return np.array(values) + + +def _build_leaves_cache(data_leaves, data_weights): + + indices: Dict[int, List[int]] = {} + weights: Dict[int, List[int]] = {} + + for index, (leaf, weight) in enumerate(zip(data_leaves, data_weights)): + if weight == 0: + continue + + if leaf not in indices: + indices[leaf] = [index] + weights[leaf] = [weight] + else: + indices[leaf].append(index) + weights[leaf].append(weight) + + return indices, weights + + +def _generate_sample_indices(random_state, n_samples): + random_instance = check_random_state(random_state) + sample_indices = random_instance.randint(0, n_samples, n_samples) + + return sample_indices + + +def _tree_weights(tree, n_samples): + indices = _generate_sample_indices(tree.random_state, n_samples) + res = np.bincount(indices, minlength=n_samples) + return _as_smallest_udtype(res) + + +def _as_smallest_udtype(arr): + return arr.astype(_smallest_udtype(arr.max())) + + +def _smallest_udtype(value): + + dtypes = [np.uint8, np.uint16, np.uint32, np.uint64] + + for dtype in dtypes: + if value <= np.iinfo(dtype).max: + return dtype + + raise ValueError("value is too large for any dtype") diff --git a/plot.py b/hela/plot.py similarity index 93% rename from plot.py rename to hela/plot.py index a3434f2..e72c107 100644 --- a/plot.py +++ b/hela/plot.py @@ -9,19 +9,15 @@ from sklearn import metrics, neighbors from sklearn.preprocessing import MinMaxScaler -from models import resample_posterior -from wpercentile import wmedian - -try: - from tqdm import tqdm -except ImportError: - def tqdm(x, *_, **__): - return x +from .posterior import resample_posterior +from .wpercentile import wmedian +from .utils import tqdm __all__ = [ - "predicted_vs_real", - "feature_importances", - "posterior_matrix" + 'predicted_vs_real', + 'feature_importances', + 'stacked_feature_importances', + 'posterior_matrix' ] POSTERIOR_MAX_SIZE = 10000 @@ -96,6 +92,26 @@ def feature_importances(forests, names, colors): return fig +def stacked_feature_importances(importances, names, colors): + + bottoms = np.zeros(importances.shape[0]) + + ind = np.arange(len(importances)) + + fig = plt.figure() + ax = fig.gca() + for data_i, name_i, color_i in zip(importances.T, names, colors): + ax.bar(ind, data_i, bottom=bottoms, color=color_i, label=name_i) + bottoms += data_i + + ax.set_xlabel("Feature index", fontsize=18) + ax.legend(fontsize=16) + ax.grid() + + fig.tight_layout() + return fig + + def posterior_matrix(posterior, names, ranges, colors, soft_colors=None): samples, weights = posterior diff --git a/hela/posterior.py b/hela/posterior.py new file mode 100644 index 0000000..1302d19 --- /dev/null +++ b/hela/posterior.py @@ -0,0 +1,40 @@ + +from collections import namedtuple +import logging +from typing import Union, Iterable + +import numpy as np + +from .wpercentile import wpercentile + +__all__ = [ + "Posterior", + "resample_posterior", + "posterior_percentile" +] + +LOGGER = logging.getLogger(__name__) + +# Posteriors are represented as a collection of weighted samples +Posterior = namedtuple("Posterior", ["samples", "weights"]) + + +def 
resample_posterior(posterior: Posterior, num_draws: int) -> Posterior: + + p = posterior.weights / posterior.weights.sum() + indices = np.random.choice(len(posterior.samples), size=num_draws, p=p) + + new_weights = np.bincount(indices, minlength=len(posterior.samples)) + mask = new_weights != 0 + new_samples = posterior.samples[mask] + new_weights = posterior.weights[mask] + + return Posterior(new_samples, new_weights) + + +def posterior_percentile( + posterior: Posterior, + percentiles: Union[float, Iterable[float]]) -> np.ndarray: + + samples, weights = posterior + return wpercentile(samples, weights, percentiles, axis=0) diff --git a/utils.py b/hela/utils.py similarity index 88% rename from utils.py rename to hela/utils.py index 414cc49..2cba725 100644 --- a/utils.py +++ b/hela/utils.py @@ -1,40 +1,46 @@ -import os import logging +try: + from tqdm import tqdm +except ImportError: + def tqdm(x, *_, **__): + return x + __all__ = [ - "config_logger" + 'config_logger', + 'tqdm' ] def config_logger(log_file="/dev/null", level=logging.INFO): - + class MyFormatter(logging.Formatter): - + info_format = "\x1b[32;1m%(asctime)s [%(name)s]\x1b[0m %(message)s" error_format = "\x1b[31;1m%(asctime)s [%(name)s] [%(levelname)s]\x1b[0m %(message)s" - + def format(self, record): - + if record.levelno > logging.INFO: self._style._fmt = self.error_format else: self._style._fmt = self.info_format - + res = super(MyFormatter, self).format(record) return res - + rootLogger = logging.getLogger() - + fileHandler = logging.FileHandler(log_file) fileFormatter = logging.Formatter("%(asctime)s [%(name)s] [%(levelname)s]> %(message)s") fileHandler.setFormatter(fileFormatter) rootLogger.addHandler(fileHandler) - + consoleHandler = logging.StreamHandler() consoleFormatter = MyFormatter() consoleHandler.setFormatter(consoleFormatter) rootLogger.addHandler(consoleHandler) - + rootLogger.setLevel(level) diff --git a/hela/wpercentile.py b/hela/wpercentile.py new file mode 100644 index 0000000..8ab0d8f --- /dev/null +++ b/hela/wpercentile.py @@ -0,0 +1,78 @@ + +import numpy as np + +__all__ = [ + "wpercentile", + "wmedian" +] + + +# def _wpercentile1d(data, weights, percentiles): + +# if data.ndim > 1 or weights.ndim > 1: +# raise ValueError("data and weights must be one-dimensional arrays") + +# if data.shape != weights.shape: +# raise ValueError("data and weights must have the same shape") + +# data = np.asarray(data) +# weights = np.asarray(weights) +# percentiles = np.asarray(percentiles) + +# sort_indices = np.argsort(data) +# sorted_data = data[sort_indices] +# sorted_weights = weights[sort_indices] + +# cumsum_weights = np.cumsum(sorted_weights) +# sum_weights = cumsum_weights[-1] + +# pn = 100 * (cumsum_weights - 0.5*sorted_weights) / sum_weights + +# return np.interp(percentiles, pn, sorted_data) + + +# def wpercentile_unsafe(data, weights, percentiles, axis=None): +# """ +# Compute percentiles of a weighted sample. +# """ + +# # TODO: This code might fail with esoteric shapes of data, weights and +# # for some values of axis. Check this properly. 
+ +# if axis is None: +# data = np.ravel(data) +# weights = np.ravel(weights) +# return _wpercentile1d(data, weights, percentiles) + +# axis = np.atleast_1d(axis) + +# # Reshape the arrays for proper computation +# # Move the requested axis to the final dimensions +# dest_axis = list(range(len(axis))) +# data2 = np.moveaxis(data, axis, dest_axis) + +# ndim = len(axis) +# shape = data2.shape +# newshape = (np.prod(shape[:ndim]), np.prod(shape[ndim:])) +# newdata = np.reshape(data2, newshape) +# newweights = np.reshape(weights, newshape[0]) + +# result = np.apply_along_axis(_wpercentile1d, 0, newdata, newweights, +# percentiles) + +# final_shape = (*np.shape(percentiles), *shape[ndim:]) +# return np.reshape(result, final_shape) + + +def wpercentile(data, weights, percentiles, axis=None): + + # This is a simple, error-free but pretty inefficient implementation of + # wpercentile. + + data = np.repeat(data, weights, axis=axis) + return np.percentile(data, percentiles, axis=axis) + + +def wmedian(data, weights, axis=None): + """Compute the weighted median.""" + return wpercentile(data, weights, 50, axis) diff --git a/models.py b/models.py deleted file mode 100644 index 6dc3369..0000000 --- a/models.py +++ /dev/null @@ -1,262 +0,0 @@ - -import logging -from collections import namedtuple - -import numpy as np - -from sklearn import ensemble -from sklearn.utils import check_random_state -from sklearn.preprocessing import MinMaxScaler - -try: - from tqdm import tqdm -except ImportError: - def tqdm(x, *_, **__): - return x - -__all__ = [ - "Model", - "Posterior", - "resample_posterior" -] - -logger = logging.getLogger(__name__) - -# Posteriors are represented as a collection of weighted samples -Posterior = namedtuple("Posterior", ["samples", "weights"]) - - -def resample_posterior(posterior, num_draws): - - p = posterior.weights / posterior.weights.sum() - indices = np.random.choice(len(posterior.samples), size=num_draws, p=p) - - new_weights = np.bincount(indices, minlength=len(posterior.samples)) - mask = new_weights != 0 - new_samples = posterior.samples[mask] - new_weights = posterior.weights[mask] - - return Posterior(new_samples, new_weights) - - -class Model: - - def __init__(self, num_trees, num_jobs, - names, ranges, colors, enable_posterior=True, - verbose=1): - scaler = MinMaxScaler(feature_range=(0, 100)) - rf = ensemble.RandomForestRegressor(n_estimators=num_trees, - oob_score=True, - verbose=verbose, - n_jobs=num_jobs, - max_features="sqrt", - min_impurity_decrease=0.01) - - self.scaler = scaler - self.rf = rf - - self.num_trees = num_trees - self.num_jobs = num_jobs - self.verbose = verbose - - self.ranges = ranges - self.names = names - self.colors = colors - - # To compute the posteriors - self.enable_posterior = enable_posterior - self.data_leaves = None - self.data_weights = None - self.data_y = None - - def _scaler_fit(self, y): - if y.ndim == 1: - y = y[:, None] - - self.scaler.fit(y) - - def _scaler_transform(self, y): - if y.ndim == 1: - y = y[:, None] - return self.scaler.transform(y)[:, 0] - - return self.scaler.transform(y) - - def _scaler_inverse_transform(self, y): - - if y.ndim == 1: - y = y[:, None] - # return self.scaler.inverse_transform(y)[:, 0] - - return self.scaler.inverse_transform(y) - - def fit(self, x, y): - self._scaler_fit(y) - self.rf.fit(x, self._scaler_transform(y)) - - # Build the structures to quickly compute the posteriors - if self.enable_posterior: - data_leaves = self.rf.apply(x).T - self.data_leaves = _as_smallest_udtype(data_leaves) - 
self.data_weights = np.array( - [_tree_weights(tree, len(y)) for tree in self.rf] - ) - self.data_y = y - - def predict(self, x): - pred = self.rf.predict(x) - return self._scaler_inverse_transform(pred) - - def predict_median(self, x): - return self.predict_percentile(x, 50) - - def predict_percentile(self, x, percentile): - - if not self.enable_posterior: - raise ValueError("Cannot compute posteriors with this model. " - "Set `enable_posterior` to True to enable " - "posterior computation.") - - # Find the leaves for the query points - leaves_x = self.rf.apply(x) - - if len(x) > self.num_trees: - # If there are many queries, - # it is faster to find points using a cache - return _posterior_percentile_cache( - self.data_leaves, self.data_weights, - self.data_y, leaves_x, percentile - ) - else: - # For few queries, it is faster if we just compute the posterior - # for each element - return _posterior_percentile_nocache( - self.data_leaves, self.data_weights, - self.data_y, leaves_x, percentile - ) - - def get_params(self, deep=True): - return { - "num_trees": self.num_trees, - "num_jobs": self.num_jobs, - "names": self.names, - "ranges": self.ranges, - "colors": self.colors, - "enable_posterior": self.enable_posterior, - "verbose": self.verbose - } - - def posterior(self, x): - - if not self.enable_posterior: - raise ValueError("Cannot compute posteriors with this model. " - "Set `enable_posterior` to True to enable " - "posterior computation.") - - if x.ndim > 1: - raise ValueError("x.ndim must be 1") - - leaves_x = self.rf.apply(x[None, :])[0] - - return _posterior( - self.data_leaves, self.data_weights, - self.data_y, leaves_x - ) - - -def _posterior(data_leaves, data_weights, data_y, query_leaves): - - weights_x = (query_leaves[:, None] == data_leaves) * data_weights - weights_x = _as_smallest_udtype(weights_x.sum(0)) - - # Remove samples with weight zero - mask = weights_x != 0 - samples = data_y[mask] - weights = weights_x[mask].astype(np.uint) - - return Posterior(samples, weights) - - -def _posterior_percentile_nocache(data_leaves, data_weights, data_y, - query_leaves, percentile): - - values = [] - - logger.info("Computing percentiles...") - for leaves_x_i in tqdm(query_leaves): - posterior = _posterior( - data_leaves, data_weights, - data_y, leaves_x_i - ) - samples = np.repeat(posterior.samples, posterior.weights, axis=0) - value = np.percentile(samples, percentile, axis=0) - values.append(value) - - return np.array(values) - - -def _posterior_percentile_cache(data_leaves, data_weights, data_y, - query_leaves, percentile): - - # Build a dictionary for fast access of the contents of the leaves. 
- logger.info("Building cache...") - cache = [ - _build_leaves_cache(leaves_i, weights_i) - for leaves_i, weights_i in zip(data_leaves, data_weights) - ] - - values = [] - # Check the contents of the leaves in leaves_x - logger.info("Computing percentiles...") - for leaves_x_i in tqdm(query_leaves): - data_elements = [] - for tree, leaves_x_i_j in enumerate(leaves_x_i): - aux = cache[tree][leaves_x_i_j] - data_elements.extend(aux) - value = np.percentile(data_y[data_elements], percentile, axis=0) - values.append(value) - - return np.array(values) - - -def _build_leaves_cache(leaves, weights): - - result = {} - for index, (leaf, weight) in enumerate(zip(leaves, weights)): - if weight == 0: - continue - - if leaf not in result: - result[leaf] = [index] * weight - else: - result[leaf].extend([index] * weight) - - return result - - -def _generate_sample_indices(random_state, n_samples): - random_instance = check_random_state(random_state) - sample_indices = random_instance.randint(0, n_samples, n_samples) - - return sample_indices - - -def _tree_weights(tree, n_samples): - indices = _generate_sample_indices(tree.random_state, n_samples) - res = np.bincount(indices, minlength=n_samples) - return _as_smallest_udtype(res) - - -def _as_smallest_udtype(arr): - return arr.astype(_smallest_udtype(arr.max())) - - -def _smallest_udtype(value): - - dtypes = [np.uint8, np.uint16, np.uint32, np.uint64] - - for dtype in dtypes: - if value <= np.iinfo(dtype).max: - return dtype - - raise ValueError("value is too large for any dtype") diff --git a/rfretrieval.py b/rfretrieval.py index 662265f..bde664a 100644 --- a/rfretrieval.py +++ b/rfretrieval.py @@ -7,122 +7,204 @@ from sklearn import metrics, multioutput import joblib -from dataset import load_dataset, load_data_file -from models import Model -from utils import config_logger -from wpercentile import wpercentile -import plot +import hela -logger = logging.getLogger(__name__) +LOGGER = logging.getLogger(__name__) -def train_model(dataset, num_trees, num_jobs, verbose=1): - pipeline = Model(num_trees, num_jobs, - names=dataset.names, - ranges=dataset.ranges, - colors=dataset.colors, - verbose=verbose) +def train_model( + dataset, + num_trees, + min_impurity_decrease, + num_jobs, + verbose=1): + + pipeline = hela.Model( + num_trees=num_trees, + min_impurity_decrease=min_impurity_decrease, + num_jobs=num_jobs, + verbose=verbose + ) pipeline.fit(dataset.training_x, dataset.training_y) return pipeline def test_model(model, dataset, output_path): - + if dataset.testing_x is None: return - - logger.info("Testing model...") + + LOGGER.info("Testing model...") pred = model.predict(dataset.testing_x) # pred = model.predict_median(dataset.testing_x) - r2scores = {name_i: metrics.r2_score(real_i, pred_i) - for name_i, real_i, pred_i in zip(dataset.names, dataset.testing_y.T, pred.T)} + r2scores = { + name_i: metrics.r2_score(real_i, pred_i) + for name_i, real_i, pred_i in zip(dataset.names, + dataset.testing_y.T, + pred.T) + } print("Testing scores:") for name, values in r2scores.items(): print("\tR^2 score for {}: {:.3f}".format(name, values)) - - logger.info("Plotting testing results...") - fig = plot.predicted_vs_real(dataset.testing_y, pred, dataset.names, dataset.ranges) + + LOGGER.info("Plotting testing results...") + fig = hela.predicted_vs_real( + dataset.testing_y, + pred, + names=dataset.names, + ranges=dataset.ranges + ) fig.savefig(os.path.join(output_path, "predicted_vs_real.pdf"), bbox_inches='tight') -def compute_feature_importance(model, dataset, 
output_path): - - logger.info("Computing feature importance for individual parameters...") +def _plot_feature_importances(model, dataset, output_path): + + LOGGER.info("Computing feature importances for individual parameters...") regr = multioutput.MultiOutputRegressor(model, n_jobs=1) regr.fit(dataset.training_x, dataset.training_y) - - fig = plot.feature_importances(forests=[i.rf for i in regr.estimators_] + [model.rf], - names=dataset.names + ["joint prediction"], - colors=dataset.colors + ["C0"]) - + + fig = hela.plot.feature_importances( + forests=( + [i.random_forest for i in regr.estimators_] + + [model.random_forest] + ), + names=dataset.names + ["joint prediction"], + colors=dataset.colors + ["C0"] + ) + fig.savefig(os.path.join(output_path, "feature_importances.pdf"), bbox_inches='tight') +def _plot_feature_importances_breakdown(model, dataset, output_path): + + LOGGER.info("Computing feature importances per output...") + importances = hela.importances_per_output( + model.random_forest, + dataset.training_x, + model.scaler_transform(dataset.training_y) + ) + + fig = hela.plot.stacked_feature_importances( + importances, + dataset.names, + dataset.colors + ) + + fig.tight_layout() + fig.savefig( + os.path.join(output_path, "feature_importances_breakdown.pdf"), + bbox_inches='tight' + ) + + +def _compute_best_fit(model, training_x, query, out_filename): + + posterior_x = model.posterior(query, prior_samples=training_x) + best_fit = hela.posterior_percentile(posterior_x, [50, 16, 84]) + np.save(out_filename, best_fit) + + def data_ranges(posterior, percentiles=(50, 16, 84)): - - samples, weights = posterior - values = wpercentile(samples, weights, percentiles, axis=0) + + values = hela.posterior_percentile(posterior, percentiles) ranges = np.array([values[0], values[2]-values[0], values[0]-values[1]]) return ranges.T -def main_train(training_dataset, model_path, - num_trees, num_jobs, - feature_importance, quiet, - **_): - - logger.info("Loading dataset '{}'...".format(training_dataset)) - dataset = load_dataset(training_dataset) - - logger.info("Training model...") - model = train_model(dataset, num_trees, num_jobs, not quiet) - +def main_train( + training_dataset, + model_path, + num_trees, + min_impurity_decrease, + num_jobs, + feature_importances, + feature_importances_breakdown, + quiet, + **_): + + LOGGER.info("Loading dataset '%s'...", training_dataset) + dataset = hela.load_dataset(training_dataset) + + LOGGER.info("Training model...") + model = train_model( + dataset=dataset, + num_trees=num_trees, + min_impurity_decrease=min_impurity_decrease, + num_jobs=num_jobs, + verbose=not quiet) + os.makedirs(model_path, exist_ok=True) model_file = os.path.join(model_path, "model.pkl") - logger.info("Saving model to '{}'...".format(model_file)) + LOGGER.info("Saving model to '%s'...", model_file) joblib.dump(model, model_file) - - logger.info("Printing model information...") - print("OOB score: {:.4f}".format(model.rf.oob_score_)) - + + LOGGER.info("Printing model information...") + print("OOB score: {:.4f}".format(model.random_forest.oob_score_)) + test_model(model, dataset, model_path) - - if feature_importance: + + if feature_importances: model.enable_posterior = False - compute_feature_importance(model, dataset, model_path) + _plot_feature_importances(model, dataset, model_path) + + if feature_importances_breakdown: + _plot_feature_importances_breakdown(model, dataset, model_path) + +def main_predict( + training_dataset, + model_path, + data_file, + output_path, + plot_posterior, 
+ best_fit_model, + **_): + + LOGGER.info("Loading dataset '%s'...", training_dataset) + dataset = hela.load_dataset(training_dataset, load_testing_data=False) -def main_predict(model_path, data_file, output_path, plot_posterior, **_): - model_file = os.path.join(model_path, "model.pkl") - logger.info("Loading random forest from '{}'...".format(model_file)) - model = joblib.load(model_file) - - logger.info("Loading data from '{}'...".format(data_file)) - data, _ = load_data_file(data_file, model.rf.n_features_) - + LOGGER.info("Loading random forest from '%s'...", model_file) + model: hela.Model = joblib.load(model_file) + + LOGGER.info("Loading data from '%s'...", data_file) + data, _ = hela.load_data_file(data_file, model.random_forest.n_features_) + + os.makedirs(output_path, exist_ok=True) + posterior = model.posterior(data[0]) - + posterior_ranges = data_ranges(posterior) - for name_i, pred_range_i in zip(model.names, posterior_ranges): - print("Prediction for {}: {:.3g} [+{:.3g} -{:.3g}]".format(name_i, *pred_range_i)) - + for name_i, pred_range_i in zip(dataset.names, posterior_ranges): + format_str = "Prediction for {}: {:.3g} [+{:.3g} -{:.3g}]" + print(format_str.format(name_i, *pred_range_i)) + if plot_posterior: - logger.info("Plotting the posterior matrix...") - - fig = plot.posterior_matrix( + LOGGER.info("Plotting the posterior matrix...") + + fig = hela.plot.posterior_matrix( posterior, - names=model.names, - ranges=model.ranges, - colors=model.colors + names=dataset.names, + ranges=dataset.ranges, + colors=dataset.colors ) os.makedirs(output_path, exist_ok=True) - logger.info("Saving the figure....") + LOGGER.info("Saving the figure....") fig.savefig(os.path.join(output_path, "posterior_matrix.pdf"), bbox_inches='tight') - logger.info("Done.") + LOGGER.info("Done.") + + if best_fit_model: + LOGGER.info("Computing and saving best fit...") + _compute_best_fit( + model, + dataset.training_x, + data[0], + os.path.join(output_path, "best_fit.npy") + ) + LOGGER.info("Done.") def show_usage(parser, **_): @@ -130,38 +212,77 @@ def show_usage(parser, **_): def main(): - - parser = argparse.ArgumentParser(description="rfretrieval: Atmospheric retrieval with random forests.") + + parser = argparse.ArgumentParser( + description="rfretrieval: Atmospheric retrieval with random forests." 
+ ) parser.set_defaults(func=show_usage, parser=parser) parser.add_argument("--quiet", action='store_true') subparsers = parser.add_subparsers() - + parser_train = subparsers.add_parser('train', help="train a model") - parser_train.add_argument("training_dataset", type=str, - help="JSON file with the training dataset description") - parser_train.add_argument("model_path", type=str, - help="path where the trained model will be saved") - parser_train.add_argument("--num-trees", type=int, default=1000, - help="number of trees in the forest") - parser_train.add_argument("--num-jobs", type=int, default=5, - help="number of parallel jobs for fitting the random forest") - parser_train.add_argument("--feature-importance", action='store_true', - help="compute feature importances after training") parser_train.set_defaults(func=main_train) - - parser_test = subparsers.add_parser('predict', help="use a trained model to perform a prediction") + parser_train.add_argument( + "training_dataset", type=str, + help="JSON file with the training dataset description" + ) + parser_train.add_argument( + "model_path", type=str, + help="path where the trained model will be saved" + ) + parser_train.add_argument( + "--num-trees", type=int, default=1000, + help="number of trees in the forest" + ) + parser_train.add_argument( + "--min-impurity-decrease", type=float, default=0.01, + help="minimum impurity decrease to split a node in each tree" + ) + parser_train.add_argument( + "--num-jobs", type=int, default=5, + help="number of parallel jobs for fitting the random forest" + ) + parser_train.add_argument( + "--feature-importances", action='store_true', + help="plot feature importances after training" + ) + parser_train.add_argument( + "--feature-importances-breakdown", action='store_true', + help="plot feature importances per output after training" + ) + + parser_test = subparsers.add_parser( + 'predict', + help="use a trained model to perform a prediction" + ) parser_test.set_defaults(func=main_predict) - parser_test.add_argument("model_path", type=str, - help="path to the trained model") - parser_test.add_argument("data_file", type=str, - help="NPY file with the data for the prediction") - parser_test.add_argument("output_path", type=str, - help="path to write the results of the prediction") - parser_test.add_argument("--plot-posterior", action='store_true', - help="plot and save the scatter matrix of the posterior distribution") - + parser_test.add_argument( + "training_dataset", type=str, + help="JSON file with the training dataset description" + ) + parser_test.add_argument( + "model_path", type=str, + help="path to the trained model" + ) + parser_test.add_argument( + "data_file", type=str, + help="NPY file with the data for the prediction" + ) + parser_test.add_argument( + "output_path", type=str, + help="path to write the results of the prediction" + ) + parser_test.add_argument( + "--plot-posterior", action='store_true', + help="plot and save the scatter matrix of the posterior distribution" + ) + parser_test.add_argument( + "--best-fit-model", action='store_true', + help="save the best fit model" + ) + args = parser.parse_args() - config_logger(level=logging.WARNING if args.quiet else logging.INFO) + hela.config_logger(level=logging.WARNING if args.quiet else logging.INFO) args.func(**vars(args)) diff --git a/wpercentile.py b/wpercentile.py deleted file mode 100644 index f030db9..0000000 --- a/wpercentile.py +++ /dev/null @@ -1,64 +0,0 @@ - -import numpy as np - -__all__ = [ - "wpercentile", - "wmedian" -] - 
- -def _wpercentile1d(data, weights, percentiles): - - if data.ndim > 1 or weights.ndim > 1: - raise ValueError("data and weights must be one-dimensional arrays") - - if data.shape != weights.shape: - raise ValueError("data and weights must have the same shape") - - data = np.asarray(data) - weights = np.asarray(weights) - percentiles = np.asarray(percentiles) - - sort_indices = np.argsort(data) - sorted_data = data[sort_indices] - sorted_weights = weights[sort_indices] - - cumsum_weights = np.cumsum(sorted_weights) - sum_weights = cumsum_weights[-1] - - pn = 100 * (cumsum_weights - 0.5*sorted_weights) / sum_weights - - return np.interp(percentiles, pn, sorted_data) - - -def wpercentile(data, weights, percentiles, axis=None): - """ - Compute percentiles of a weighted sample. - """ - if axis is None: - data = np.ravel(data) - weights = np.ravel(weights) - return _wpercentile1d(data, weights, percentiles) - - axis = np.atleast_1d(axis) - - # Reshape the arrays for proper computation - # Move the requested axis to the final dimensions - dest_axis = list(range(len(axis))) - data2 = np.moveaxis(data, axis, dest_axis) - - ndim = len(axis) - shape = data2.shape - newshape = (np.prod(shape[:ndim]), np.prod(shape[ndim:])) - newdata = np.reshape(data2, newshape) - newweights = np.reshape(weights, newshape[0]) - - result = np.apply_along_axis(_wpercentile1d, 0, newdata, newweights, percentiles) - - final_shape = (*np.shape(percentiles), *shape[ndim:]) - return np.reshape(result, final_shape) - - -def wmedian(data, weights, axis=None): - """Compute the weighted median.""" - return wpercentile(data, weights, 50, axis)
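
For reference, a minimal end-to-end sketch of the refactored `hela` API as it is exercised by `rfretrieval.py`. This is illustrative, not part of the patch: the dataset path is a placeholder, and the keyword values simply mirror the defaults exposed by the `train` subcommand.

```python
import hela

# Load a dataset description (placeholder path; the JSON layout is the one
# read by hela.load_dataset in hela/dataset.py).
dataset = hela.load_dataset("example_dataset/dataset.json")

# Train the model; keywords mirror the rfretrieval.py CLI defaults
# (1000 trees, --min-impurity-decrease 0.01, 5 parallel jobs).
model = hela.Model(n_estimators=1000, min_impurity_decrease=0.01, n_jobs=5)
model.fit(dataset.training_x, dataset.training_y)

# Posterior of the parameters for a single spectrum (here simply the first
# training sample; Model.posterior expects a 1-D array).
posterior = model.posterior(dataset.training_x[0])

# Median and 16th/84th percentiles of each parameter, weighted by the
# posterior sample weights.
median, lower, upper = hela.posterior_percentile(posterior, [50, 16, 84])

# Per-output feature importances and the corresponding stacked bar plot.
importances = hela.importances_per_output(
    model.random_forest,
    dataset.training_x,
    model.scaler_transform(dataset.training_y),
)
fig = hela.plot.stacked_feature_importances(
    importances, dataset.names, dataset.colors
)
fig.savefig("feature_importances_breakdown.pdf", bbox_inches="tight")
```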
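A short aside on the rewritten `wpercentile`: the interpolation-based `_wpercentile1d` was replaced by a repeat-and-percentile approach, which in practice assumes non-negative integer weights (exactly what the bootstrap counts from `_tree_weights` and the posterior weights provide). A self-contained sketch with made-up numbers shows what the new implementation computes:

```python
import numpy as np


def wpercentile(data, weights, percentiles, axis=None):
    # Same idea as hela/wpercentile.py: repeat every sample according to its
    # integer weight, then defer to np.percentile on the expanded array.
    data = np.repeat(data, weights, axis=axis)
    return np.percentile(data, percentiles, axis=axis)


# Made-up weighted sample: the value 1.0 carries weight 3 and the value 2.0
# carries weight 1, so the expanded array is [1.0, 1.0, 1.0, 2.0].
data = np.array([1.0, 2.0])
weights = np.array([3, 1])

print(wpercentile(data, weights, 50))            # 1.0
print(wpercentile(data, weights, [16, 50, 84]))  # [1.   1.   1.52]
```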