gaussian_process.py

""" gaussian_process.py

Bayesian optimisation of loss functions.
"""

import numpy as np
import random
import sklearn.gaussian_process as gp

from scipy.stats import norm
from scipy.optimize import minimize

def expected_improvement(x, gaussian_process, evaluated_loss, greater_is_better=False, n_params=1):
    """ expected_improvement

    Expected improvement acquisition function.

    Arguments:
    ----------
        x: array-like, shape = [n_samples, n_hyperparams]
            The point for which the expected improvement needs to be computed.
            It is reshaped to `(-1, n_params)` before being passed to the GP.
        gaussian_process: GaussianProcessRegressor object.
            Gaussian process trained on previously evaluated hyperparameters.
            Only `predict(X, return_std=True)` is used.
        evaluated_loss: Numpy array.
            Numpy array that contains the values of the loss function for the previously
            evaluated hyperparameters.
        greater_is_better: Boolean.
            Boolean flag that indicates whether the loss function is to be maximised or minimised.
        n_params: int.
            Dimension of the hyperparameter space.

    Returns:
    --------
        Numpy array with one entry per predicted point, holding the *negated*
        expected improvement, so that a minimiser can be used to maximise the
        expected improvement. Points whose predictive standard deviation is
        exactly zero get an expected improvement of zero.
    """

    x_to_predict = np.asarray(x).reshape(-1, n_params)

    mu, sigma = gaussian_process.predict(x_to_predict, return_std=True)

    if greater_is_better:
        loss_optimum = np.max(evaluated_loss)
    else:
        loss_optimum = np.min(evaluated_loss)

    # +1 when maximising, -1 when minimising, so that a positive
    # `improvement` always means "better than the current optimum".
    scaling_factor = 1 if greater_is_better else -1
    improvement = scaling_factor * (mu - loss_optimum)

    # sigma == 0 produces x/0 (inf) or 0/0 (nan); silence both warnings and
    # overwrite those entries explicitly afterwards.
    with np.errstate(divide='ignore', invalid='ignore'):
        Z = improvement / sigma
        ei = improvement * norm.cdf(Z) + sigma * norm.pdf(Z)
        # This must be an assignment: a bare comparison here would leave
        # inf/nan values in the result.
        ei[sigma == 0.0] = 0.0

    return -1 * ei


def sample_next_hyperparameter(acquisition_func, gaussian_process, evaluated_loss, greater_is_better=False,
                               bounds=(0, 10), n_restarts=25):
    """ sample_next_hyperparameter

    Proposes the next hyperparameter to sample the loss function for.

    Arguments:
    ----------
        acquisition_func: function.
            Acquisition function to optimise. It is *minimised*, and is called as
            `acquisition_func(x, gaussian_process, evaluated_loss, greater_is_better, n_params)`.
        gaussian_process: GaussianProcessRegressor object.
            Gaussian process trained on previously evaluated hyperparameters.
        evaluated_loss: array-like, shape = [n_obs,]
            Numpy array that contains the values of the loss function for the
            previously evaluated hyperparameters.
        greater_is_better: Boolean.
            Boolean flag that indicates whether the loss function is to be
            maximised or minimised.
        bounds: array-like, shape = [n_params, 2].
            Lower and upper bound per parameter for the L-BFGS optimiser. A single
            `(low, high)` pair is accepted for a one-dimensional search space.
        n_restarts: integer.
            Number of times to run the minimiser with different starting points.

    Returns:
    --------
        Numpy array, shape = [n_params,], with the minimiser of the acquisition
        function over all restarts, or None if no restart produced a comparable
        (non-NaN) value.
    """
    # Accept both a plain (low, high) pair and an [n_params, 2] array.
    bounds = np.atleast_2d(np.asarray(bounds, dtype=float))
    n_params = bounds.shape[0]

    best_x = None
    # Start from +inf so that the first successful restart is always accepted,
    # whatever the scale of the acquisition function.
    best_acquisition_value = np.inf

    starting_points = np.random.uniform(bounds[:, 0], bounds[:, 1], size=(n_restarts, n_params))

    for starting_point in starting_points:

        # `minimize` expects a one-dimensional x0.
        res = minimize(fun=acquisition_func,
                       x0=starting_point,
                       bounds=bounds,
                       method='L-BFGS-B',
                       args=(gaussian_process, evaluated_loss, greater_is_better, n_params))

        if res.fun < best_acquisition_value:
            best_acquisition_value = res.fun
            best_x = res.x

    return best_x


def closest_option(params, candidates):
    """ closest_option

    Snaps a point onto a discrete search space.

    Arguments:
    ----------
        params: array-like.
            The point to look up.
        candidates: iterable of array-likes.
            The available options.

    Returns the element of `candidates` with the smallest Euclidean distance
    to `params`; on ties the first such element wins.
    """
    target = np.array(params)

    def distance_to_target(option):
        return np.linalg.norm(option - target)

    return min(candidates, key=distance_to_target)

def _remove_candidate(candidates, sample):
    """Return `candidates` as an array without the first entry equal to `sample`."""
    for index, option in enumerate(candidates):
        # np.array_equal works for tuples, lists and arrays alike, whereas
        # `all(option == sample)` fails when `==` yields a plain bool.
        if np.array_equal(option, sample):
            return np.delete(candidates, index, axis=0)
    raise ValueError("sample is not one of the remaining candidates")


def _build_gp_model(gp_params, alpha):
    """Create the Gaussian process regressor, from `gp_params` if given, else with defaults."""
    if gp_params is not None:
        return gp.GaussianProcessRegressor(**gp_params)
    return gp.GaussianProcessRegressor(kernel=gp.kernels.Matern(),
                                       alpha=alpha,
                                       n_restarts_optimizer=10,
                                       normalize_y=True)


def bayesian_optimisation(n_iters, sample_loss, candidates, x0=None, n_pre_samples=3,
                          gp_params=None, alpha=1e-5, epsilon=1e-7, fast=False):
    """ bayesian_optimisation

    Uses Gaussian Processes to optimise the loss function `sample_loss`
    over a discrete set of candidate points.

    Arguments:
    ----------
        n_iters: integer.
            Number of iterations to run the search algorithm.
        sample_loss: function.
            Function to be optimised. Called with a single candidate.
        candidates: 2d-array-like, shape = [[n_params], ...].
            The discrete search space. Every point evaluated after the initial
            samples is taken from (and then removed from) this set.
        x0: array-like, shape = [n_pre_samples, n_params].
            Array of initial points to sample the loss function for. If None (or
            empty), randomly samples from `candidates`. Points given here are
            not removed from `candidates`.
        n_pre_samples: integer.
            If x0 is None, samples `n_pre_samples` initial points from the loss function.
        gp_params: dictionary.
            Dictionary of parameters to pass on to the underlying Gaussian Process.
        alpha: double.
            Variance of the error term of the GP. Only used when `gp_params` is None.
        epsilon: double.
            Precision tolerance for floats. Currently unused; kept so existing
            callers keep working.
        fast: Boolean.
            If True, stop as soon as a loss exactly equal to 1 has been observed.

    Returns:
    --------
        xp, yp: Numpy arrays with the evaluated points and their losses, in
        evaluation order. Both are empty if `candidates` is empty.
    """
    if len(candidates) == 0:
        return np.array([]), np.array([])

    x_list = []
    y_list = []

    # Per-parameter search box spanned by the initial candidates,
    # shape = [n_params, 2].
    candidate_array = np.array(list(candidates))
    bounds = np.column_stack((candidate_array.min(axis=0), candidate_array.max(axis=0)))

    if x0 is not None and len(x0) > 0:
        for sample in x0:
            x_list.append(sample)
            y_list.append(sample_loss(sample))
    else:
        for _ in range(max(0, n_pre_samples)):
            # Fewer candidates than requested pre-samples: keep what has
            # already been evaluated instead of throwing it away.
            if len(candidates) == 0:
                break
            # Index explicitly: random.choice() rejects numpy arrays on some
            # Python versions, and `candidates` becomes an array after the
            # first removal.
            sample = candidates[random.randrange(len(candidates))]
            x_list.append(sample)
            y_list.append(sample_loss(sample))
            candidates = _remove_candidate(candidates, sample)

    xp = np.array(x_list)
    yp = np.array(y_list)

    # The GP is only built once an iteration actually needs it.
    model = None

    for _ in range(n_iters):

        if len(candidates) == 0:
            break
        if fast and 1 in yp:
            break

        if model is None:
            model = _build_gp_model(gp_params, alpha)
        model.fit(xp, yp)

        # Propose a point in the continuous box, then snap it to the nearest
        # remaining candidate of the discrete search space.
        next_sample = sample_next_hyperparameter(expected_improvement, model, yp, greater_is_better=False,
                                                 bounds=bounds, n_restarts=100)
        next_sample = closest_option(next_sample, candidates)

        # Sample loss for new set of parameters
        cv_score = sample_loss(next_sample)

        # Remove from candidates, because duplicates break the Gaussian process
        candidates = _remove_candidate(candidates, next_sample)

        x_list.append(next_sample)
        y_list.append(cv_score)

        xp = np.array(x_list)
        yp = np.array(y_list)

    return xp, yp