diff --git a/botorch/models/gpytorch.py b/botorch/models/gpytorch.py index 708f4b8ec2..82d4cbd580 100644 --- a/botorch/models/gpytorch.py +++ b/botorch/models/gpytorch.py @@ -402,6 +402,8 @@ def _apply_noise( mvn = self.likelihood(mvn, X) return mvn + # pyre-ignore[14]: Inconsistent override. Could not find parameter + # `Keywords(typing.Any)` in overriding signature. def posterior( self, X: Tensor, @@ -470,6 +472,7 @@ def posterior( return posterior_transform(posterior) return posterior + # pyre-ignore[14]: Inconsistent override. Could not find parameter `noise`. def condition_on_observations( self, X: Tensor, Y: Tensor, **kwargs: Any ) -> BatchedMultiOutputGPyTorchModel: @@ -632,7 +635,7 @@ def batch_shape(self) -> torch.Size: raise NotImplementedError(msg + " that are not broadcastble.") return next(iter(batch_shapes)) - # pyre-fixme[15]: Inconsistent override in return types + # pyre-fixme[14]: Inconsistent override in return types def posterior( self, X: Tensor, @@ -838,6 +841,8 @@ def _apply_noise( ) return self.likelihood(mvn, X) + # pyre-ignore[14]: Inconsistent override. Could not find parameter + # `Keywords(typing.Any)` in overriding signature. def posterior( self, X: Tensor, diff --git a/botorch/models/likelihoods/sparse_outlier_noise.py b/botorch/models/likelihoods/sparse_outlier_noise.py new file mode 100644 index 0000000000..2f3630a5d8 --- /dev/null +++ b/botorch/models/likelihoods/sparse_outlier_noise.py @@ -0,0 +1,480 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from typing import Any +from warnings import warn + +import torch +from botorch.exceptions.errors import UnsupportedError +from botorch.exceptions.warnings import InputDataWarning +from botorch.models.model import Model +from botorch.models.relevance_pursuit import RelevancePursuitMixin +from botorch.utils.constraints import NonTransformedInterval +from gpytorch.distributions import MultivariateNormal +from gpytorch.likelihoods import _GaussianLikelihoodBase +from gpytorch.likelihoods.noise_models import FixedGaussianNoise, Noise +from gpytorch.mlls import ExactMarginalLogLikelihood +from gpytorch.priors import Prior +from linear_operator.operators import DiagLinearOperator, LinearOperator +from linear_operator.utils.cholesky import psd_safe_cholesky +from torch import Tensor +from torch.nn.parameter import Parameter + + +class SparseOutlierGaussianLikelihood(_GaussianLikelihoodBase): + def __init__( + self, + base_noise: Noise | FixedGaussianNoise, + dim: int, + outlier_indices: list[int] | None = None, + rho_prior: Prior | None = None, + rho_constraint: NonTransformedInterval | None = None, + batch_shape: torch.Size | None = None, + convex_parameterization: bool = True, + loo: bool = True, + ) -> None: + """A likelihood that models the noise of a GP with SparseOutlierNoise, a noise + model in the Relevance Pursuit family of models, permitting additional "robust" + variance for a small set of outlier data points. Notably, the indices of the + outlier data points are inferred during the optimization of the associated log + marginal likelihood via the Relevance Pursuit algorithm. + + For details, see [Ament2024pursuit]_ or https://arxiv.org/abs/2410.24222. + + NOTE: Letting base_noise also use the non-transformed constraints, will lead + to more stable optimization, but is orthogonal implementation-wise. 
If the base + noise is a HomoskedasticNoise, one can pass the non-transformed constraint as + the `noise_constraint`. + + Example: + >>> base_noise = HomoskedasticNoise( + >>> noise_constraint=NonTransformedInterval( + >>> 1e-5, 1e-1, initial_value=1e-3 + >>> ) + >>> ) + >>> likelihood = SparseOutlierGaussianLikelihood( + >>> base_noise=base_noise, + >>> dim=X.shape[0], + >>> ) + >>> model = SingleTaskGP(train_X=X, train_Y=Y, likelihood=likelihood) + >>> mll = ExactMarginalLogLikelihood(model.likelihood, model) + >>> # NOTE: `likelihood.noise_covar` is the `RelevancePursuitMixin` + >>> sparse_module = likelihood.noise_covar + >>> backward_relevance_pursuit(sparse_module, mll) + + Args: + base_noise: The base noise model. + dim: The number of training observations, which determines the maximum + number of data-point-specific noise variances of the noise model. + outlier_indices: The indices of the outliers. + rho_prior: Prior for `self.noise_covar`'s rho parameter. + rho_constraint: Constraint for `self.noise_covar`'s rho parameter. Needs to + be a NonTransformedInterval because exact sparsity cannot be represented + using smooth transforms like a softplus or sigmoid. + batch_shape: The batch shape of the learned noise parameter (default: []). + convex_parameterization: Whether to use the convex parameterization of rho, + which generally improves optimization results and is thus recommended. + loo: Whether to use leave-one-out (LOO) update equations that can compute + the optimal values of each individual rho, keeping all else equal. + """ + noise_covar = SparseOutlierNoise( + base_noise=base_noise, + dim=dim, + outlier_indices=outlier_indices, + rho_prior=rho_prior, + rho_constraint=rho_constraint, + batch_shape=batch_shape, + convex_parameterization=convex_parameterization, + loo=loo, + ) + super().__init__(noise_covar=noise_covar) + + # pyre-ignore[14]: Inconsistent override because the super class accepts `*params` + def marginal( + self, + function_dist: MultivariateNormal, + X: Tensor | list[Tensor] | None = None, + **kwargs: Any, + ) -> MultivariateNormal: + mean, covar = function_dist.mean, function_dist.lazy_covariance_matrix + # this scales the rhos by the diagonal of the "non-robust" covariance matrix + diag_K = covar.diagonal() if self.noise_covar.convex_parameterization else None + noise_covar = self.noise_covar.forward( + X=X, shape=mean.shape, diag_K=diag_K, **kwargs + ) + full_covar = covar + noise_covar + return function_dist.__class__(mean, full_covar) + + def expected_log_prob( + self, target: Tensor, input: MultivariateNormal, *params: Any, **kwargs: Any + ) -> Tensor: + raise NotImplementedError( + "SparseOutlierGaussianLikelihood does not yet support variational inference" + ", but this is not a fundamental limitation. It will require an adding " + "the `expected_log_prob` method to SparseOutlierGaussianLikelihood." + ) + + +class SparseOutlierNoise(Noise, RelevancePursuitMixin): + def __init__( + self, + base_noise: Noise | FixedGaussianNoise, + dim: int, + outlier_indices: list[int] | None = None, + rho_prior: Prior | None = None, + rho_constraint: NonTransformedInterval | None = None, + batch_shape: torch.Size | None = None, + convex_parameterization: bool = True, + loo: bool = True, + ): + """A noise model in the Relevance Pursuit family of models, permitting + additional "robust" variance for a small set of outlier data points. + See also `SparseOutlierGaussianLikelihood`, which leverages this noise model. 
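+
+        Concretely, when evaluated on the training inputs, the noise covariance is
+        `base_noise + diag(rho)`, where `rho` is non-zero only for data points in
+        the support (and, in the convex parameterization, is additionally scaled by
+        the diagonal of the non-robust covariance, i.e. kernel plus base noise; see
+        `SparseOutlierGaussianLikelihood.marginal` and `forward`).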
+ + For details, see [Ament2024pursuit]_ or https://arxiv.org/abs/2410.24222. + + Example: + >>> base_noise = HomoskedasticNoise( + >>> noise_constraint=NonTransformedInterval( + >>> 1e-5, 1e-1, initial_value=1e-3 + >>> ) + >>> ) + >>> likelihood = SparseOutlierGaussianLikelihood( + >>> base_noise=base_noise, + >>> dim=X.shape[0], + >>> ) + >>> model = SingleTaskGP(train_X=X, train_Y=Y, likelihood=likelihood) + >>> mll = ExactMarginalLogLikelihood(model.likelihood, model) + >>> # NOTE: `likelihood.noise_covar` is the `SparseOutlierNoise` + >>> sparse_module = likelihood.noise_covar + >>> backward_relevance_pursuit(sparse_module, mll) + + Args: + base_noise: The base noise model. + dim: The number of training observations, which determines the maximum + number of data-point-specific noise variances of the noise model. + outlier_indices: The indices of the outliers. + rho_prior: Prior for the rho parameter. + rho_constraint: Constraint for the rho parameter. Needs to be a + NonTransformedInterval because exact sparsity cannot be represented + using smooth transforms like a softplus or sigmoid. + batch_shape: The batch shape of the learned noise parameter (default: []). + convex_parameterization: Whether to use the convex parameterization of rho, + which generally improves optimization results and is thus recommended. + loo: Whether to use leave-one-out (LOO) update equations that can compute + the optimal values of each individual rho, keeping all else equal. + """ + super().__init__() + RelevancePursuitMixin.__init__(self, dim=dim, support=outlier_indices) + + if batch_shape is None: + batch_shape = base_noise.noise.shape[:-1] + + self.base_noise = base_noise + device = base_noise.noise.device + if rho_constraint is None: + cvx_upper_bound = 1 - 1e-3 # < 1 to avoid singularities + rho_constraint = NonTransformedInterval( + lower_bound=0.0, + upper_bound=cvx_upper_bound if convex_parameterization else torch.inf, + initial_value=0.0, + ) + else: + if not isinstance(rho_constraint, NonTransformedInterval): + raise ValueError( + "`rho_constraint` must be a `NonTransformedInterval` if it " + "is not None." + ) + + if rho_constraint.lower_bound < 0: + raise ValueError( + "SparseOutlierNoise requires rho_constraint.lower_bound >= 0." + ) + + if convex_parameterization and rho_constraint.upper_bound > 1: + raise ValueError( + "Convex parameterization requires rho_constraint.upper_bound <= 1." + ) + + # NOTE: Prefer to keep the initialization of the sparse_parameter in the + # derived classes of the Mixin, because it might require additional logic + # that we don't want to put into RelevancePursuitMixin. + num_outliers = len(self.support) + self.register_parameter( + "raw_rho", + parameter=Parameter( + torch.zeros( + *batch_shape, + num_outliers, + dtype=base_noise.noise.dtype, + device=device, + ) + ), + ) + + if rho_prior is not None: + + def _rho_param(m): + return m.rho + + # this closure is only needed for some features, e.g. sampling from the + # prior, and requires additional thought in this case, as it will also has + # to modify the support of the RelevancePursuitMixin + _set_rho_closure = None + + self.register_prior("rho_prior", rho_prior, _rho_param, _set_rho_closure) + + self.register_constraint("raw_rho", rho_constraint) + # only publicly exposing getter of convex parameterization + # since post-hoc modification can lead to inconsistencies + # with the rho constraints. 
+ self._convex_parameterization = convex_parameterization + self.loo = loo + self._cached_train_inputs = None + + @property + def sparse_parameter(self) -> Parameter: + return self.raw_rho + + def set_sparse_parameter(self, value: Parameter) -> None: + """Sets the sparse parameter. + + NOTE: We can't use the property setter @sparse_parameter.setter because of + the special way PyTorch treats Parameter types, including custom setters. + """ + self.raw_rho = torch.nn.Parameter(value.to(self.raw_rho)) + + @property + def convex_parameterization(self) -> bool: + return self._convex_parameterization + + @staticmethod + def _from_model(model: Model) -> RelevancePursuitMixin: + sparse_module = model.likelihood.noise_covar + if not isinstance(sparse_module, SparseOutlierNoise): + raise ValueError( + "The model's likelihood does not have a SparseOutlierNoise noise " + f"as its noise_covar module, but instead a {type(sparse_module)}." + ) + return sparse_module + + @property + def _convex_rho(self) -> Tensor: + """Transforms the raw_rho parameter such that `rho ~= 1 / (1 - raw_rho) - 1`, + which is a diffeomorphism from [0, 1] to [0, inf] whose derivative is nowhere + zero. This transforms the marginal log likelihood to be a convex function of + the `self.raw_rho` Parameter, when the covariance matrix is well conditioned. + + NOTE: The convex parameterization also includes a scaling of the rho values by + the diagonal of the covariance matrix, which is carried out in the `marginal` + call in the SparseOutlierGaussianLikelihood. + """ + # pyre-ignore[7]: It is not have an incompatible return type, pyre just doesn't + # recognize that the result gets promoted to a Tensor. + return 1 / (1 - self.raw_rho) - 1 + + @property + def rho(self) -> Tensor: + """Dense representation of the data-point-specific variances, corresponding to + the latent `self.raw_rho` values, which might be represented sparsely or in the + convex parameterization. The last dimension is equal to the number of training + points `self.dim`. + + NOTE: `rho` differs from `self.sparse_parameter` in that the latter returns the + the parameter in its sparse representation when `self.is_sparse` is true, and in + its latent convex paramzeterization when `self.convex_parameterization` is true, + while `rho` always returns the data-point-specific variances, embedded in a + dense tensor. The dense representation is used to propagate gradients to the + sparse rhos in the support. + + Returns: + A `batch_shape x self.dim`-dim Tensor of robustness variances. + """ + # NOTE: don't need to do transform / untransform since we are + # enforcing NonTransformedIntervals. + rho_outlier = self._convex_rho if self.convex_parameterization else self.raw_rho + if not self.is_sparse: # in the dense representation, we're done. + return rho_outlier + + # If rho_outlier is in the sparse representation, we need to pad the + # rho values with zeros at the correct positions. The difference + # between this and calling RelevancePursuit's `to_dense` is that + # the latter will propagate gradients through all rhos, whereas + # the path here only propagates gradients to the sparse set of + # outliers, which is important for the optimization of the support. 
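+        # Hypothetical example: with dim=4 and support=[2, 0], rho_outlier holds two
+        # entries [a, b] (for data points 2 and 0, respectively), the selection
+        # indices below are [1, -1, 0, -1], and the dense result is [b, 0, a, 0].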
+ rho_inlier = torch.zeros( + 1, dtype=rho_outlier.dtype, device=rho_outlier.device + ).expand(rho_outlier.shape[:-1] + (1,)) + rho = torch.cat( + [rho_outlier, rho_inlier], dim=-1 + ) # batch_shape x (num_outliers + 1) + + return rho[..., self._rho_selection_indices] + + @property + def _rho_selection_indices(self) -> Tensor: + # num_train is cached in the forward pass in training mode + # if an index is not in the outlier indices, we get the zeros from the + # last index of "rho" + # is this related to a sparse to dense mapping used in RP? + rho_selection_indices = torch.full( + self.raw_rho.shape[:-1] + (self.dim,), + -1, + dtype=torch.long, + device=self.raw_rho.device, + ) + for i, j in enumerate(self.support): + rho_selection_indices[j] = i + + return rho_selection_indices + + # pyre-ignore[14]: Inconsistent override because the super class accepts `*params` + def forward( + self, + X: Tensor | list[Tensor] | None = None, + shape: torch.Size | None = None, + diag_K: Tensor | None = None, + **kwargs: Any, + ) -> LinearOperator | Tensor: + """Computes the covariance matrix of the sparse outlier noise model. + + Args: + X: The training inputs, used to determine if the model is applied to the + training data, in which case the outlier variances are applied, or not. + NOTE: By default, BoTorch passes the transformed training inputs to + the likelihood during both training and inference. + shape: The shape of the covariance matrix, which is used to broadcast the + rho values to the correct shape. + diag_K: The diagonal of the covariance matrix, which is used to scale the + rho values in the convex parameterization. + kwargs: Any additional parameters of the base noise model, same as for + GPyTorch's noise model. Note that this implementation does not support + non-kwarg `params` arguments, which are used in GPyTorch's noise models. + + Returns: + A `batch_shape x self.dim`-dim Tensor of robustness variances. + """ + noise_covar = self.base_noise(X, shape=shape, **kwargs) + # rho should always be applied to the training set, irrespective of whether or + # not we are in training mode, so we will check if we should apply the rhos, + # based on cached training inputs. + rho = self.rho + # NOTE: Even though it is not strictly required for many likelihoods, BoTorch + # and GPyTorch generally pass the training inputs to the likelihood, e.g.: + # 1) in fit_gpytorch_mll: + # ( + # github.com/pytorch/botorch/blob/3ca48d0ac5865a017ac6b2294807b432d6472bcf/ + # botorch/optim/closures/model_closures.py#L185 + # ) + # 2) in the exact prediction strategy: + # ( + # github.com/cornellius-gp/gpytorch/blob/ + # d501c284d05a1186868dc3fb20e0fa6ad32d32ac/ + # gpytorch/models/exact_prediction_strategies.py#L387 + # ) + # 3) In the model's `posterior` method, if `observation_noise` is True, in which + # case the test inputs will be passed to the likelihood: + # ( + # https://github.com/pytorch/botorch/blob/ + # 4190f74363757ad97bfb0b437402b749ae50ba4c/ + # botorch/models/gpytorch.py#L198 + # ) + # Note that this module will raise a warning in this case to inform the user + # that the robust rhos are not applied to the test data. + if X is not None: + if isinstance(X, list): + if len(X) != 1: + raise UnsupportedError( + "SparseOutlierNoise only supports a single training input " + f"Tensor, but got a list of length {len(X)}." 
+ ) + X = X[0] + if noise_covar.shape[-1] != rho.shape[-1]: + apply_robust_variances = False + warning_reason = ( + "the last dimension of the base noise covariance " + f"({noise_covar.shape[-1]}) " + "is not compatible with the last dimension of rho " + f"({rho.shape[-1]})." + ) + elif self.training or self._cached_train_inputs is None: + apply_robust_variances = True + self._cached_train_inputs = X + warning_reason = "" # will not warn when applying robust variances + else: + apply_robust_variances = torch.equal(X, self._cached_train_inputs) + warning_reason = ( + "the passed train_inputs are not equal to the cached ones." + ) + else: + apply_robust_variances = False + warning_reason = "the training inputs were not passed to the likelihood." + + if apply_robust_variances: + if diag_K is not None: + rho = (diag_K + noise_covar.diagonal()) * rho # convex parameterization + noise_covar = noise_covar + DiagLinearOperator(rho) + else: + warn( + f"SparseOutlierNoise: Robust rho not applied because {warning_reason} " + + "This can happen when the model posterior is evaluated on test data.", + InputDataWarning, + stacklevel=2, + ) + return noise_covar + + # relevance pursuit method expansion and contraction related methods + def expansion_objective(self, mll: ExactMarginalLogLikelihood) -> Tensor: + """Computes an objective value for all the inactive parameters, i.e. + self.sparse_parameter[~self.is_active] since we can't add already active + parameters to the support. This value will be used to select the parameters. + + Args: + mll: The marginal likelihood, containing the model to optimize. + + Returns: + The expansion objective value for all the inactive parameters. + """ + f = self._optimal_rhos if self.loo else self._sparse_parameter_gradient + return f(mll) + + def _optimal_rhos(self, mll: ExactMarginalLogLikelihood) -> Tensor: + """Computes the optimal rho deltas for the given model. + + Args: + mll: The marginal likelihood, containing the model to optimize. + + Returns: + A `batch_shape x self.dim`-dim Tensor of optimal rho deltas. + """ + # train() is important, since we want to evaluate the prior with mll.model(X), + # but in eval(), __call__ gives the posterior. + mll.train() # NOTE: this changes model.train_inputs to be unnormalized. + X, Y = mll.model.train_inputs[0], mll.model.train_targets + F = mll.model(X) + L = mll.likelihood(F, X) + S = L.covariance_matrix # (Kernel Matrix + Noise Matrix) + + # NOTE: The following computation is mathematically equivalent to the formula + # in this comment, but leverages the positive-definiteness of S via its + # Cholesky factorization. 
+ # S_inv = S.inverse() + # diag_S_inv = S_inv.diagonal(dim1=-1, dim2=-2) + # loo_var = 1 / S_inv.diagonal(dim1=-1, dim2=-2) + # loo_mean = Y - (S_inv @ Y) / diag_S_inv + + chol = psd_safe_cholesky(S, upper=True) + eye = torch.eye(chol.size(-1), device=chol.device, dtype=chol.dtype) + inv_root = torch.linalg.solve_triangular(chol, eye, upper=True) + + # test: inv_root.square().sum(dim=-1) - S.inverse().diag() + diag_S_inv = inv_root.square().sum(dim=-1) + loo_var = 1 / diag_S_inv + S_inv_Y = torch.cholesky_solve(Y.unsqueeze(-1), chol, upper=True).squeeze(-1) + loo_mean = Y - S_inv_Y / diag_S_inv + + loo_error = loo_mean - Y + optimal_rho_deltas = loo_error.square() - loo_var + return (optimal_rho_deltas - self.rho).clamp(0)[~self.is_active] diff --git a/botorch/models/relevance_pursuit.py b/botorch/models/relevance_pursuit.py new file mode 100644 index 0000000000..e1cd665b02 --- /dev/null +++ b/botorch/models/relevance_pursuit.py @@ -0,0 +1,851 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +r""" +Relevance Pursuit model structure and optimization routines for the sparse optimization +of Gaussian process hyper-parameters, see [Ament2024pursuit]_ for details. + +References + +.. [Ament2024pursuit] + S. Ament, E. Santorella, D. Eriksson, B. Letham, M. Balandat, and E. Bakshy. + Robust Gaussian Processes via Relevance Pursuit. Advances in Neural Information + Processing Systems 37, 2024. Arxiv: https://arxiv.org/abs/2410.24222. +""" + +from __future__ import annotations + +import math + +from abc import ABC, abstractmethod +from collections.abc import Callable +from copy import copy, deepcopy +from functools import partial +from typing import Any, cast, Optional +from warnings import warn + +import torch +from botorch.fit import fit_gpytorch_mll +from botorch.models.model import Model +from gpytorch.mlls.exact_marginal_log_likelihood import ExactMarginalLogLikelihood +from torch import Tensor +from torch.nn.parameter import Parameter + +MLL_ITER = 10_000 # let's take convergence seriously +MLL_TOL = 1e-8 +RESET_PARAMETERS = False + + +class RelevancePursuitMixin(ABC): + """Mixin class to convert between the sparse and dense representations of the + relevance pursuit models' sparse parameters, as well as to compute the generalized + support acquisition and support deletion criteria. + """ + + dim: int # the total number of features + _support: list[int] # indices of the features in the support, subset of range(dim) + + def __init__( + self, + dim: int, + support: list[int] | None, + ) -> None: + """Constructor for the RelevancePursuitMixin class. + + For details, see [Ament2024pursuit]_ or https://arxiv.org/abs/2410.24222. + + Args: + dim: The total number of features. + support: The indices of the features in the support, subset of range(dim). + """ + + self.dim = dim + self._support = support if support is not None else [] + # Assumption: sparse_parameter is initialized in sparse representation + self._is_sparse = True + self._expansion_modifier = None + self._contraction_modifier = None + + @property + @abstractmethod + def sparse_parameter(self) -> Parameter: + """The sparse parameter, required to have a single indexing dimension.""" + pass # pragma: no cover + + @abstractmethod + def set_sparse_parameter(self, value: Parameter) -> None: + """Sets the sparse parameter. 
+ + NOTE: We can't use the property setter @sparse_parameter.setter because of + the special way PyTorch treats Parameter types, including custom setters that + bypass the @property setters before the latter are called. + """ + pass # pragma: no cover + + @staticmethod + def _from_model(model: Model) -> RelevancePursuitMixin: + """Retrieves a RelevancePursuitMixin from a model.""" + raise NotImplementedError # pragma: no cover + + @property + def is_sparse(self) -> bool: + # Do we need to differentiate between a full support sparse representation and + # a full support dense representation? The order the of the indices could be + # different, unless we keep them sorted. + return self._is_sparse + + @property + def support(self) -> list[int]: + """The indices of the active parameters.""" + return self._support + + @property + def is_active(self) -> Tensor: + """A Boolean Tensor of length `dim`, indicating which of the `dim` indices of + `self.sparse_parameter` are in the support, i.e. active.""" + is_active = [(i in self.support) for i in range(self.dim)] + return torch.tensor( + is_active, dtype=torch.bool, device=self.sparse_parameter.device + ) + + @property + def inactive_indices(self) -> Tensor: + """An integral Tensor of length `dim - len(support)`, indicating which of the + indices of `self.sparse_parameter` are not in the support, i.e. inactive.""" + device = self.sparse_parameter.device + return torch.arange(self.dim, device=device)[~self.is_active] + + def to_sparse(self) -> RelevancePursuitMixin: + """Converts the sparse parameter to its sparse (< dim) representation. + + Returns: + The current object in its sparse representation. + """ + if not self.is_sparse: + self.set_sparse_parameter( + torch.nn.Parameter(self.sparse_parameter[self.support]) + ) + self._is_sparse = True + return self + + def to_dense(self) -> RelevancePursuitMixin: + """Converts the sparse parameter to its dense, length-`dim` representation. + + Returns: + The current object in its dense representation. + """ + if self.is_sparse: + dtype = self.sparse_parameter.dtype + device = self.sparse_parameter.device + zero = torch.tensor( + 0.0, + dtype=dtype, + device=device, + ) + dense_parameter = [ + ( + self.sparse_parameter[self.support.index(i)] + if i in self.support + else zero + ) + for i in range(self.dim) + ] + dense_parameter = torch.tensor(dense_parameter, dtype=dtype, device=device) + self.set_sparse_parameter(torch.nn.Parameter(dense_parameter)) + self._is_sparse = False + return self + + def expand_support(self, indices: list[int]) -> RelevancePursuitMixin: + """Expands the support by a number of indices. + + Args: + indices: A list of indices of `self.sparse_parameter` to add to the support. + + Returns: + The current object, updated with the expanded support. + """ + for i in indices: + if i in self.support: + raise ValueError(f"Feature {i} already in the support.") + + self.support.extend(indices) + # we need to add the parameter in the sparse representation + if self.is_sparse: + self.set_sparse_parameter( + torch.nn.Parameter( + torch.cat( + ( + self.sparse_parameter, + torch.zeros(len(indices)).to(self.sparse_parameter), + ) + ) + ) + ) + return self + + def contract_support(self, indices: list[int]) -> RelevancePursuitMixin: + """Contracts the support by a number of indices. + + Args: + indices: A list of indices of `self.sparse_parameter` to remove from + the support. + + Returns: + The current object, updated with the contracted support. 
+ """ + # indices into the sparse representation of features to *keep* + sparse_indices = list(range(len(self.support))) + original_support = copy(self.support) + for i in indices: + if i not in self.support: + raise ValueError(f"Feature {i} is not in support.") + sparse_indices.remove(original_support.index(i)) + self.support.remove(i) + + # we need to remove the parameter in the sparse representation + if self.is_sparse: + self.set_sparse_parameter(Parameter(self.sparse_parameter[sparse_indices])) + else: + requires_grad = self.sparse_parameter.requires_grad + self.sparse_parameter.requires_grad_(False) + self.sparse_parameter[indices] = 0.0 + self.sparse_parameter.requires_grad_(requires_grad) # restore + return self + + # support initialization helpers + def full_support(self) -> RelevancePursuitMixin: + """Initializes the RelevancePursuitMixin with a full, size-`dim` support. + + Returns: + The current object with full support in the dense representation. + """ + self.expand_support([i for i in range(self.dim) if i not in self.support]) + self.to_dense() # no reason to be sparse with full support + return self + + def remove_support(self) -> RelevancePursuitMixin: + """Initializes the RelevancePursuitMixin with an empty, size-zero support. + + Returns: + The current object with empty support, representation unchanged. + """ + self._support = [] + requires_grad = self.sparse_parameter.requires_grad + if self.is_sparse: + self.set_sparse_parameter( + torch.nn.Parameter(torch.tensor([]).to(self.sparse_parameter)) + ) + else: + self.sparse_parameter.requires_grad_(False) + self.sparse_parameter[:] = 0.0 + self.sparse_parameter.requires_grad_(requires_grad) + return self + + # the following two methods are the only ones that are specific to the marginal + # likelihood optimization problem + def support_expansion( + self, + mll: ExactMarginalLogLikelihood, + n: int = 1, + modifier: Callable[[Tensor], Tensor] | None = None, + ) -> bool: + """Computes the indices of the features that maximize the gradient of the sparse + parameter and that are not already in the support, and subsequently expands the + support to include the features if their gradient is positive. + + Args: + mll: The marginal likelihood, containing the model to optimize. + NOTE: Virtually all of the rest of the code is not specific to the + marginal likelihood optimization, so we could generalize this to work + with any objective. + n: The number of features to select. + modifier: A function that modifies the gradient of the inactive parameters + before computing the support expansion criterion. This can be used + to select the maximum gradient magnitude for real-valued parameters + whose gradients are not non-negative, using modifier = torch.abs. + + Returns: + True if the support was expanded, False otherwise. + """ + # can't expand if the support is already full, or if n is non-positive + if len(self.support) == self.dim or n <= 0: + return False + + g = self.expansion_objective(mll) + + modifier = modifier if modifier is not None else self._expansion_modifier + if modifier is not None: + g = modifier(g) + + # support is already removed from consideration + # gradient of the support parameters is not necessarily zero, + # even for a converged solution in the presence of constraints. + # NOTE: these indices are relative to self.inactive_indices. 
+ indices = g.argsort(descending=True)[:n] + indices = indices[g[indices] > 0] + if indices.numel() == 0: # no indices with positive gradient + return False + self.expand_support(self.inactive_indices[indices].tolist()) + return True + + def expansion_objective(self, mll: ExactMarginalLogLikelihood) -> Tensor: + """Computes an objective value for all the inactive parameters, i.e. + self.sparse_parameter[~self.is_active] since we can't add already active + parameters to the support. This value will be used to select the parameters. + + Args: + mll: The marginal likelihood, containing the model to optimize. + + Returns: + The expansion objective value for all the inactive parameters. + """ + return self._sparse_parameter_gradient(mll) + + def _sparse_parameter_gradient(self, mll: ExactMarginalLogLikelihood) -> Tensor: + """Computes the gradient of the marginal likelihood with respect to the + sparse parameter. + + Args: + mll: The marginal likelihood, containing the model to optimize. + + Returns: + The gradient of the marginal likelihood with respect to the inactive + sparse parameters. + """ + # evaluate gradient of the sparse parameter + is_sparse = self.is_sparse # in order to restore the original representation + self.to_dense() # need the parameter in its dense parameterization + + requires_grad = self.sparse_parameter.requires_grad + self.sparse_parameter.requires_grad_(True) + if self.sparse_parameter.grad is not None: + self.sparse_parameter.grad.zero_() + mll.train() # NOTE: this changes model.train_inputs + model = mll.model + X, Y = model.train_inputs[0], model.train_targets + cast( + Tensor, + mll( + mll.model(X), + Y, + *(model.transform_inputs(X=t_in) for t_in in model.train_inputs), + ), + ).backward() # evaluation + self.sparse_parameter.requires_grad_(requires_grad) + + g = self.sparse_parameter.grad + if g is None: + raise ValueError( + "The gradient of the sparse_parameter is None, most likely " + "because the passed marginal likelihood is not a function of the " + "sparse_parameter." + ) + + if is_sparse: + self.to_sparse() + + return g[~self.is_active] # only need the inactive parameters + + def support_contraction( + self, + mll: ExactMarginalLogLikelihood, + n: int = 1, + modifier: Callable[[Tensor], Tensor] | None = None, + ) -> bool: + """Computes the indices of the features that have the smallest coefficients, + and subsequently contracts the exlude the features. + + Args: + mll: The marginal likelihood, containing the model to optimize. + NOTE: Virtually all of the rest of the code is not specific to the + marginal likelihood optimization, so we could generalize this to work + with any objective. + n: The number of features to select for removal. + modifier: A function that modifies the parameter values before computing + the support contraction criterion. + + Returns: + True if the support was expanded, False otherwise. 
+ """ + # can't expand if the support is already empty, or if n is non-positive + if len(self.support) == 0 or n <= 0: + return False + + is_sparse = self.is_sparse + self.to_sparse() + x = self.sparse_parameter + + modifier = modifier if modifier is not None else self._contraction_modifier + if modifier is not None: + x = modifier(x) + + # for non-negative parameters, could break ties at zero + # based on derivative + sparse_indices = x.argsort(descending=False)[:n] + indices = [self.support[i] for i in sparse_indices] + self.contract_support(indices) + if not is_sparse: + self.to_dense() + return True + + def optimize_mll( + self, + mll: ExactMarginalLogLikelihood, + model_trace: list[Model] | None = None, + reset_parameters: bool = RESET_PARAMETERS, + reset_dense_parameters: bool = RESET_PARAMETERS, + optimizer_kwargs: dict[str, Any] | None = None, + ): + """Optimizes the marginal likelihood. + + Args: + mll: The marginal likelihood, containing the model to optimize. + model_trace: If not None, a list to which a deepcopy of the model state is + appended. NOTE This operation is *in place*. + reset_parameters: If True, initializes the sparse parameter to the all-zeros + vector before every marginal likelihood optimization step. If False, the + optimization is warm-started with the previous iteration's parameters. + reset_dense_parameters: If True, re-initializes the dense parameters, e.g. + other GP hyper-parameters that are *not* part of the Relevance Pursuit + module, to the initial values provided by their associated constraints. + optimizer_kwargs: A dictionary of keyword arguments for the optimizer. + + Returns: + The marginal likelihood after optimization. + """ + if reset_parameters: + # this might be beneficial because the parameters can + # end up at a constraint boundary, which can anecdotally make + # it more difficult to move the newly added parameters. + # should we only do this after expansion? + with torch.no_grad(): + self.sparse_parameter.zero_() + + if reset_dense_parameters: + initialize_dense_parameters(mll.model) + + # move to sparse representation for optimization + # NOTE: this function should never force the dense representation, because some + # models might never need it, and it would be inefficient. + self.to_sparse() + fit_gpytorch_mll(mll, optimizer_kwargs=optimizer_kwargs) + if model_trace is not None: + # need to record the full model here, rather than just the sparse parameter + # since other hyper-parameters are co-adapted to the sparse parameter. + model_trace.append(deepcopy(mll.model)) + return mll + + +# Optimization Algorithms +def forward_relevance_pursuit( + sparse_module: RelevancePursuitMixin, + mll: ExactMarginalLogLikelihood, + sparsity_levels: list[int] | None = None, + optimizer_kwargs: dict[str, Any] | None = None, + reset_parameters: bool = RESET_PARAMETERS, + reset_dense_parameters: bool = RESET_PARAMETERS, + record_model_trace: bool = True, + initial_support: list[int] | None = None, +) -> tuple[RelevancePursuitMixin, Optional[list[Model]]]: + """Forward Relevance Pursuit. + + NOTE: For the robust `SparseOutlierNoise` model of [Ament2024pursuit]_, the forward + algorithm is generally faster than the backward algorithm, particularly when the + maximum sparsity level is small, but it leads to less robust results when the number + of outliers is large. + + For details, see [Ament2024pursuit]_ or https://arxiv.org/abs/2410.24222. 
+ + Example: + >>> base_noise = HomoskedasticNoise( + >>> noise_constraint=NonTransformedInterval( + >>> 1e-5, 1e-1, initial_value=1e-3 + >>> ) + >>> ) + >>> likelihood = SparseOutlierGaussianLikelihood( + >>> base_noise=base_noise, + >>> dim=X.shape[0], + >>> ) + >>> model = SingleTaskGP(train_X=X, train_Y=Y, likelihood=likelihood) + >>> mll = ExactMarginalLogLikelihood(model.likelihood, model) + >>> # NOTE: `likelihood.noise_covar` is the `RelevancePursuitMixin` + >>> sparse_module = likelihood.noise_covar + >>> sparse_module, model_trace = forward_relevance_pursuit(sparse_module, mll) + + Args: + sparse_module: The relevance pursuit module. + mll: The marginal likelihood, containing the model to optimize. + sparsity_levels: The sparsity levels to expand the support to. + optimizer_kwargs: A dictionary of keyword arguments to pass to the optimizer. + By default, initializes the "options" sub-dictionary with `maxiter` and + `ftol`, `gtol` values, unless specified. + reset_parameters: If true, initializes the sparse parameter to the all zeros + after each iteration. + reset_dense_parameters: If true, re-initializes the dense parameters, e.g. + other GP hyper-parameters that are *not* part of the Relevance Pursuit + module, to the initial values provided by their associated constraints. + record_model_trace: If true, records the model state after every iteration. + initial_support: The support with which to initialize the sparse module. By + default, the support is initialized to the empty set. + + Returns: + The relevance pursuit module after forward relevance pursuit optimization, and + a list of models with different supports that were optimized. + """ + sparse_module.remove_support() + if initial_support is not None: + sparse_module.expand_support(initial_support) + + if sparsity_levels is None: + sparsity_levels = list(range(len(sparse_module.support), sparse_module.dim + 1)) + + # since this is the forward algorithm, potential sparsity levels + # must be in increasing order and unique. 
+ sparsity_levels = list(set(sparsity_levels)) + sparsity_levels.sort(reverse=False) + + optimizer_kwargs = _initialize_optimizer_kwargs(optimizer_kwargs) + + model_trace = [] if record_model_trace else None + + def optimize_mll(mll): + return sparse_module.optimize_mll( + mll=mll, + model_trace=model_trace, + reset_parameters=reset_parameters, + reset_dense_parameters=reset_dense_parameters, + optimizer_kwargs=optimizer_kwargs, + ) + + # if sparsity levels contains the initial support, remove it + if sparsity_levels[0] == len(sparse_module.support): + sparsity_levels.pop(0) + + optimize_mll(mll) # initial optimization + + for sparsity in sparsity_levels: + support_size = len(sparse_module.support) + num_expand = sparsity - support_size + expanded = sparse_module.support_expansion(mll=mll, n=num_expand) + if not expanded: # stationary support + warn( + "Terminating optimization because the expansion from sparsity " + f"{support_size} to {sparsity} was unsuccessful, usually due to " + "reaching a stationary point of the marginal likelihood.", + Warning, + stacklevel=2, + ) + break + + optimize_mll(mll) # re-optimize support + + return sparse_module, model_trace + + +def backward_relevance_pursuit( + sparse_module: RelevancePursuitMixin, + mll: ExactMarginalLogLikelihood, + sparsity_levels: list[int] | None = None, + optimizer_kwargs: dict[str, Any] | None = None, + reset_parameters: bool = RESET_PARAMETERS, + reset_dense_parameters: bool = RESET_PARAMETERS, + record_model_trace: bool = True, + initial_support: list[int] | None = None, +) -> tuple[RelevancePursuitMixin, Optional[list[Model]]]: + """Backward Relevance Pursuit. + + NOTE: For the robust `SparseOutlierNoise` model of [Ament2024pursuit]_, the backward + algorithm generally leads to more robust results than the forward algorithm, + especially when the number of outliers is large, but is more expensive unless the + support is contracted by more than one in each iteration. + + For details, see [Ament2024pursuit]_ or https://arxiv.org/abs/2410.24222. + + Example: + >>> base_noise = HomoskedasticNoise( + >>> noise_constraint=NonTransformedInterval( + >>> 1e-5, 1e-1, initial_value=1e-3 + >>> ) + >>> ) + >>> likelihood = SparseOutlierGaussianLikelihood( + >>> base_noise=base_noise, + >>> dim=X.shape[0], + >>> ) + >>> model = SingleTaskGP(train_X=X, train_Y=Y, likelihood=likelihood) + >>> mll = ExactMarginalLogLikelihood(model.likelihood, model) + >>> # NOTE: `likelihood.noise_covar` is the `RelevancePursuitMixin` + >>> sparse_module = likelihood.noise_covar + >>> sparse_module, model_trace = backward_relevance_pursuit(sparse_module, mll) + + Args: + sparse_module: The relevance pursuit module. + mll: The marginal likelihood, containing the model to optimize. + sparsity_levels: The sparsity levels to expand the support to. + optimizer_kwargs: A dictionary of keyword arguments to pass to the optimizer. + By default, initializes the "options" sub-dictionary with `maxiter` and + `ftol`, `gtol` values, unless specified. + reset_parameters: If true, initializes the sparse parameter to the all zeros + after each iteration. + reset_dense_parameters: If true, re-initializes the dense parameters, e.g. + other GP hyper-parameters that are *not* part of the Relevance Pursuit + module, to the initial values provided by their associated constraints. + record_model_trace: If true, records the model state after every iteration. + initial_support: The support with which to initialize the sparse module. 
By + default, the support is initialized to the full set. + + Returns: + The relevance pursuit module after forward relevance pursuit optimization, and + a list of models with different supports that were optimized. + """ + if initial_support is not None: + sparse_module.remove_support() + sparse_module.expand_support(initial_support) + else: + sparse_module.full_support() + + if sparsity_levels is None: + sparsity_levels = list(range(len(sparse_module.support) + 1)) + + # since this is the backward algorithm, potential sparsity levels + # must be in decreasing order, unique, and less than the initial support. + sparsity_levels = list(set(sparsity_levels)) + sparsity_levels.sort(reverse=True) + + optimizer_kwargs = _initialize_optimizer_kwargs(optimizer_kwargs) + + model_trace = [] if record_model_trace else None + + def optimize_mll(mll): + return sparse_module.optimize_mll( + mll=mll, + model_trace=model_trace, + reset_parameters=reset_parameters, + reset_dense_parameters=reset_dense_parameters, + optimizer_kwargs=optimizer_kwargs, + ) + + # if sparsity levels contains the initial support, remove it + if sparsity_levels[0] == len(sparse_module.support): + sparsity_levels.pop(0) + + optimize_mll(mll) # initial optimization + + for sparsity in sparsity_levels: + support_size = len(sparse_module.support) + num_contract = support_size - sparsity + contracted = sparse_module.support_contraction(mll=mll, n=num_contract) + if not contracted: # stationary support + warn( + "Terminating optimization because the contraction from sparsity " + f"{support_size} to {sparsity} was unsuccessful.", + Warning, + stacklevel=2, + ) + break + + optimize_mll(mll) # re-optimize support + + return sparse_module, model_trace + + +# Bayesian Model Comparison +def get_posterior_over_support( + rp_class: type[RelevancePursuitMixin], + model_trace: list[Model], + log_support_prior: Callable[[Tensor], Tensor] | None = None, + prior_mean_of_support: float | None = None, +) -> tuple[Tensor, Tensor]: + """Computes the posterior distribution over a list of models. + Assumes we are storing both likelihood and GP model in the model_trace. + + Example: + >>> likelihood = SparseOutlierGaussianLikelihood( + >>> base_noise=base_noise, + >>> dim=X.shape[0], + >>> ) + >>> model = SingleTaskGP(train_X=X, train_Y=Y, likelihood=likelihood) + >>> mll = ExactMarginalLogLikelihood(model.likelihood, model) + >>> # NOTE: `likelihood.noise_covar` is the `RelevancePursuitMixin` + >>> sparse_module = likelihood.noise_covar + >>> sparse_module, model_trace = backward_relevance_pursuit(sparse_module, mll) + >>> # NOTE: SparseOutlierNoise is the type of `sparse_module` + >>> support_size, bmc_probabilities = get_posterior_over_support( + >>> SparseOutlierNoise, model_trace, prior_mean_of_support=2.0 + >>> ) + + Args: + rp_class: The relevance pursuit class to use for computing the support size. + This is used to get the RelevancePursuitMixin from the Model via the static + method `_from_model`. We could generalize this and let the user pass this + getter instead. + model_trace: A list of models with different support sizes, usually generated + with relevance_pursuit. + log_support_prior: Callable that computes the log prior probability of a + support size. If None, uses a default exponential prior with a mean + specified by `prior_mean_of_support`. + prior_mean_of_support: A mean value for the default exponential prior + distribution over the support size. Ignored if `log_support_prior` + is passed. 
+ + Returns: + A tensor of posterior marginal likelihoods, one for each model in the trace. + """ + if log_support_prior is None: + if prior_mean_of_support is None: + raise ValueError( + "`log_support_prior` and `prior_mean_of_support` cannot both be None." + ) + log_support_prior = partial(_exp_log_pdf, mean=prior_mean_of_support) + + log_support_prior = cast(Callable[[Tensor], Tensor], log_support_prior) + + def log_prior( + model: Model, + dtype: torch.dtype, + device: torch.device, + ) -> tuple[Tensor, Tensor]: + sparse_module = rp_class._from_model(model) + num_support = torch.tensor( + len(sparse_module.support), dtype=dtype, device=device + ) + return num_support, log_support_prior(num_support) # pyre-ignore[29] + + log_mll_trace = [] + log_prior_trace = [] + support_size_trace = [] + for model in model_trace: + mll = ExactMarginalLogLikelihood(likelihood=model.likelihood, model=model) + mll.train() + X, Y = mll.model.train_inputs[0], mll.model.train_targets + F = mll.model(X) + mll_i = cast(Tensor, mll(F, Y, X)) + log_mll_trace.append(mll_i) + support_size, log_prior_i = log_prior( + model, + dtype=mll_i.dtype, + device=mll_i.device, + ) + support_size_trace.append(support_size) + log_prior_trace.append(log_prior_i) + + log_mll_trace = torch.stack(log_mll_trace) + log_prior_trace = torch.stack(log_prior_trace) + support_size_trace = torch.stack(support_size_trace) + + unnormalized_posterior_trace = log_mll_trace + log_prior_trace + evidence = unnormalized_posterior_trace.logsumexp(dim=-1) + posterior_probabilities = (unnormalized_posterior_trace - evidence).exp() + return support_size_trace, posterior_probabilities + + +def _exp_log_pdf(x: Tensor, mean: Tensor) -> Tensor: + """Compute the exponential log probability density. + + Args: + x: A tensor of values. + mean: A tensor of means. + + Returns: + A tensor of log probabilities. + """ + return -x / mean - math.log(mean) + + +def initialize_dense_parameters(model: Model) -> tuple[Model, dict[str, Any]]: + """Sets the dense parameters of a model to their initial values. Infers initial + values from the constraints, if no initial values are provided. If a parameter + does not have a constraint, it is initialized to zero. + + Args: + model: The model to initialize. + + Returns: + The re-initialized model, and a dictionary of initial values. 
+ """ + constraints = dict(model.named_constraints()) + parameters = dict(model.named_parameters()) + initial_values = { + n: getattr(constraints.get(n + "_constraint", None), "_initial_value", None) + for n in parameters + } + lower_bounds = { + n: getattr( + constraints.get(n + "_constraint", None), + "lower_bound", + torch.tensor(-torch.inf), + ) + for n in parameters + } + upper_bounds = { + n: getattr( + constraints.get(n + "_constraint", None), + "upper_bound", + torch.tensor(torch.inf), + ) + for n in parameters + } + for name, value in initial_values.items(): + initial_values[name] = _get_initial_value( + value=value, + lower=lower_bounds[name], + upper=upper_bounds[name], + ) + + # the initial values need to be converted to the transformed space + for n, v in initial_values.items(): + c = constraints.get(n + "_constraint", None) + # convert the constraint into the latent space + if c is not None: + initial_values[n] = c.inverse_transform(v) + model.initialize(**initial_values) + parameters = dict(model.named_parameters()) + return model, initial_values + + +def _get_initial_value(value: Tensor, lower: Tensor, upper: Tensor) -> Tensor: + # if no initial value is provided, or the initial value is outside the bounds, + # use a rule-based initialization. + if value is None or not ((lower <= value) and (value <= upper)): + if upper.isinf(): + value = 0.0 if lower.isinf() else lower + 1 + elif lower.isinf(): # implies u[n] is finite + value = upper - 1 + else: # both are finite + # generally keep the value close to the lower bound in this case, + # since many parameters (e.g. lengthscales) exhibit vanishing gradients + # for large values. + value = lower + torch.minimum( + torch.ones_like(lower), + (upper - lower) / 2, + ) + return torch.as_tensor(value, dtype=lower.dtype, device=lower.device) + + +def _initialize_optimizer_kwargs( + optimizer_kwargs: dict[str, Any] | None, +) -> dict[str, Any]: + """Initializes the optimizer kwargs with default values if they are not provided. + + Args: + optimizer_kwargs: The optimizer kwargs to initialize. + + Returns: + The initialized optimizer kwargs. 
+ """ + if optimizer_kwargs is None: + optimizer_kwargs = {} + + if optimizer_kwargs.get("options") is None: + optimizer_kwargs["options"] = {} + + options = optimizer_kwargs["options"] + if "maxiter" not in options: + options.update({"maxiter": MLL_ITER}) + + if ("ftol" not in options) and ("gtol" not in options): + options.update({"ftol": MLL_TOL}) + options.update({"gtol": MLL_TOL}) + + return optimizer_kwargs diff --git a/botorch/test_functions/base.py b/botorch/test_functions/base.py index 19abe18ebb..a6ed1a5c0d 100644 --- a/botorch/test_functions/base.py +++ b/botorch/test_functions/base.py @@ -11,9 +11,11 @@ from __future__ import annotations from abc import ABC, abstractmethod +from typing import Any, Iterable, Iterator, Protocol import torch from botorch.exceptions.errors import InputDataError +from pyre_extensions import none_throws from torch import Tensor from torch.nn import Module @@ -203,3 +205,127 @@ def max_hv(self) -> float: def gen_pareto_front(self, n: int) -> Tensor: r"""Generate `n` pareto optimal points.""" raise NotImplementedError + + +class SeedingMixin(ABC): + _seeds: Iterator[int] | None + _current_seed: int | None + + @property + def has_seeds(self) -> bool: + return self._seeds is not None + + def increment_seed(self) -> int: + self._current_seed = next(none_throws(self._seeds)) + return none_throws(self._current_seed) + + @property + def seed(self) -> int | None: + return self._current_seed + + +# Outlier problems +class OutlierGenerator(Protocol): + def __call__(self, problem: BaseTestProblem, X: Tensor, bounds: Tensor) -> Tensor: + """Call signature for outlier generators for single-objective problems. + + Args: + problem: The test problem. + X: The input tensor. + bounds: The bounds of the test problem. + + Returns: + A tensor of outliers with shape X.shape[:-1] (1d if unbatched). + """ + pass # pragma: no cover + + +def constant_outlier_generator( + problem: Any, X: Tensor, bounds: Any, constant: float +) -> Tensor: + """ + Generates outliers that are all the same constant. To be used in conjunction with + `partial` to fix the constant value and conform to the `OutlierGenerator` protocol. + + Example: + >>> generator = partial(constant_outlier_generator, constant=1.0) + + Args: + problem: Not used. + X: The `batch_shape x n x d`-dim inputs. Also determines the number, dtype, + and device of the returned tensor. + bounds: Not used. + constant: The constant value of the outliers. + + Returns: + Tensor of shape `batch_shape x n` (1d if unbatched). + """ + return torch.full(X.shape[:-1], constant, dtype=X.dtype, device=X.device) + + +class CorruptedTestProblem(BaseTestProblem, SeedingMixin): + def __init__( + self, + base_test_problem: BaseTestProblem, + outlier_generator: OutlierGenerator, + outlier_fraction: float, + bounds: list[tuple[float, float]] | None = None, + seeds: Iterable[int] | None = None, + ) -> None: + """A problem with outliers. + + NOTE: Both noise_std and negate will be taken from the base test problem. + + Args: + base_test_problem: The base function to be corrupted. + outlier_generator: A function that generates outliers. It will be called + with arguments `f`, `X` and `bounds`, where `f` is the + `base_test_problem`, `X` is the + argument passed to the `forward` method, and `bounds` + are as here, and it returns the values of outliers. + outlier_fraction: The fraction of outliers. + bounds: The bounds of the function. + seeds: The seeds to use for the outlier generator. 
If seeds are provided, + the problem will iterate through the list of seeds, changing the seed + with a call to `next(seeds)` with every `forward` call. If a list is + provided, it will first be converted to an iterator. + """ + self.dim: int = base_test_problem.dim + self._bounds: list[tuple[float, float]] = ( + bounds if bounds is not None else base_test_problem._bounds + ) + super().__init__( + noise_std=base_test_problem.noise_std, + negate=base_test_problem.negate, + ) + self.base_test_problem = base_test_problem + self.outlier_generator = outlier_generator + self.outlier_fraction = outlier_fraction + self._current_seed: int | None = None + self._seeds: Iterator[int] | None = None if seeds is None else iter(seeds) + + def evaluate_true(self, X: Tensor) -> Tensor: + return self.base_test_problem.evaluate_true(X) + + def forward(self, X: Tensor, noise: bool = True) -> Tensor: + """ + Generate data at X and corrupt it, if noise is True. + + Args: + X: The `batch_shape x n x d`-dim inputs. + noise: Whether to corrupt the data. + + Returns: + A `batch_shape x n`-dim tensor. + """ + Y = super().forward(X, noise=noise) + if noise: + if self.has_seeds: + self.increment_seed() + torch.manual_seed(self.seed) + corrupt = torch.rand(X.shape[:-1]) < self.outlier_fraction + outliers = self.outlier_generator( + problem=self.base_test_problem, X=X, bounds=self.bounds + ) + Y = torch.where(corrupt, outliers, Y) + return Y diff --git a/botorch/test_functions/synthetic.py b/botorch/test_functions/synthetic.py index b4efc0920c..001c5dcae5 100644 --- a/botorch/test_functions/synthetic.py +++ b/botorch/test_functions/synthetic.py @@ -160,7 +160,7 @@ def __init__( def evaluate_true(self, X: Tensor) -> Tensor: a, b, c = self.a, self.b, self.c - part1 = -a * torch.exp(-b / math.sqrt(self.dim) * torch.linalg.norm(X, dim=-1)) + part1 = -a * torch.exp(-torch.linalg.norm(X, dim=-1) * b / math.sqrt(self.dim)) part2 = -(torch.exp(torch.mean(torch.cos(c * X), dim=-1))) return part1 + part2 + a + math.e diff --git a/botorch/utils/constraints.py b/botorch/utils/constraints.py index 1ee46984d6..4af5d408ab 100644 --- a/botorch/utils/constraints.py +++ b/botorch/utils/constraints.py @@ -15,6 +15,8 @@ from functools import partial import torch +from gpytorch.constraints import Interval + from torch import Tensor @@ -96,3 +98,48 @@ def get_monotonicity_constraints( if descending: A = -A return A, b + + +class NonTransformedInterval(Interval): + """Modification of the GPyTorch interval class that does not apply transformations. + + This is generally useful, and it is a requirement for the sparse parameters of the + Relevance Pursuit model [Ament2024pursuit]_, since it is not possible to achieve + exact zeros with the sigmoid transformations that are applied by default in the + GPyTorch Interval class. The variant implemented here does not apply transformations + to the parameters, instead passing the bounds constraint to the scipy L-BFGS + optimizer. This allows for the expression of exact zeros for sparse optimization + algorithms. + + NOTE: On a high level, the cleanest solution for this would be to separate out the + 1) definition and book-keeping of parameter constraints on the one hand, and + 2) the re-parameterization of the variables with some monotonic transformation, + since the two steps are orthogonal, but this would require refactoring GPyTorch. 
+ """ + + def __init__( + self, + lower_bound: float | Tensor, + upper_bound: float | Tensor, + initial_value: float | Tensor | None = None, + ): + """Constructor of the NonTransformedInterval class. + + Args: + lower_bound: The lower bound of the interval. + upper_bound: The upper bound of the interval. + initial_value: The initial value of the parameter. + """ + super().__init__( + lower_bound=lower_bound, + upper_bound=upper_bound, + transform=None, + inv_transform=None, + initial_value=initial_value, + ) + + def transform(self, tensor: Tensor) -> Tensor: + return tensor + + def inverse_transform(self, transformed_tensor: Tensor) -> Tensor: + return transformed_tensor diff --git a/botorch/utils/testing.py b/botorch/utils/testing.py index 3c468f9e6b..effc04276b 100644 --- a/botorch/utils/testing.py +++ b/botorch/utils/testing.py @@ -28,7 +28,8 @@ from botorch.sampling.base import MCSampler from botorch.sampling.get_sampler import GetSampler from botorch.sampling.stochastic_samplers import StochasticSampler -from botorch.test_functions.base import BaseTestProblem +from botorch.test_functions.base import BaseTestProblem, CorruptedTestProblem +from botorch.test_functions.synthetic import Rosenbrock from botorch.utils.transforms import unnormalize from gpytorch.distributions import MultitaskMultivariateNormal, MultivariateNormal from linear_operator.operators import AddedDiagLinearOperator, DiagLinearOperator @@ -245,6 +246,25 @@ def test_evaluate_slack(self): self.assertTrue(is_equal.all().item()) +class TestCorruptedProblemsMixin(BotorchTestCase): + def setUp(self, suppress_input_warnings: bool = True) -> None: + super().setUp(suppress_input_warnings=suppress_input_warnings) + + def outlier_generator( + problem: torch.Tensor | BaseTestProblem, X: Any, bounds: Any + ) -> torch.Tensor: + return torch.ones(X.shape[0]) + + self.outlier_generator = outlier_generator + + self.rosenbrock_problem = CorruptedTestProblem( + base_test_problem=Rosenbrock(), + outlier_fraction=1.0, + outlier_generator=outlier_generator, + seeds=[1, 2], + ) + + class MockPosterior(Posterior): r"""Mock object that implements dummy methods and feeds through specified outputs""" diff --git a/sphinx/source/models.rst b/sphinx/source/models.rst index ec6995565f..4880d22931 100644 --- a/sphinx/source/models.rst +++ b/sphinx/source/models.rst @@ -99,6 +99,10 @@ Fully Bayesian Multitask GP Models .. automodule:: botorch.models.fully_bayesian_multitask :members: +Relevance Pursuit Models +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. automodule:: botorch.models.relevance_pursuit + :members: Model Components ------------------------------------------- @@ -134,6 +138,9 @@ Likelihoods .. automodule:: botorch.models.likelihoods.pairwise :members: +.. automodule:: botorch.models.likelihoods.sparse_outlier_noise + :members: + Transforms ------------------------------------------- diff --git a/test/models/test_relevance_pursuit.py b/test/models/test_relevance_pursuit.py new file mode 100644 index 0000000000..3a671b31d3 --- /dev/null +++ b/test/models/test_relevance_pursuit.py @@ -0,0 +1,621 @@ +#!/usr/bin/env python3 +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +from __future__ import annotations + +import itertools + +from functools import partial + +import gpytorch +import torch +from botorch.exceptions.errors import UnsupportedError +from botorch.exceptions.warnings import InputDataWarning +from botorch.models import SingleTaskGP +from botorch.models.likelihoods.sparse_outlier_noise import ( + SparseOutlierGaussianLikelihood, + SparseOutlierNoise, +) +from botorch.models.relevance_pursuit import ( + _get_initial_value, + backward_relevance_pursuit, + forward_relevance_pursuit, + get_posterior_over_support, + RelevancePursuitMixin, +) +from botorch.models.transforms.input import Normalize +from botorch.models.transforms.outcome import Standardize +from botorch.test_functions.base import constant_outlier_generator, CorruptedTestProblem + +from botorch.test_functions.synthetic import Ackley +from botorch.utils.constraints import NonTransformedInterval +from botorch.utils.testing import BotorchTestCase +from gpytorch.constraints import Interval + +from gpytorch.kernels import RBFKernel, ScaleKernel +from gpytorch.likelihoods.noise_models import HomoskedasticNoise +from gpytorch.means import ZeroMean +from gpytorch.mlls.exact_marginal_log_likelihood import ExactMarginalLogLikelihood +from pyre_extensions import none_throws +from torch import Tensor +from torch.nn import Module +from torch.nn.parameter import Parameter + + +class DummyRelevancePursuitModule(Module, RelevancePursuitMixin): + """Module to test support modification methods of RelevancePursuitMixin.""" + + def __init__( + self, + dim: int, + support: list[int] | None, + ) -> None: + """Initializes test module. + + Args: + dim: The total number of features. + support: The indices of the features in the support, subset of range(dim). + """ + Module.__init__(self) + self._par = Parameter(torch.zeros(len(support) if support is not None else 0)) + RelevancePursuitMixin.__init__(self, dim=dim, support=support) + + @property + def sparse_parameter(self) -> Parameter: + return self._par + + def set_sparse_parameter(self, value: Parameter) -> None: + self._par = torch.nn.Parameter(value.to(self._par)) + + +class TestRobustGP(BotorchTestCase): + def _make_dataset( + self, + n: int, + num_outliers: int, + dtype: torch.dtype, + seed: int = 1, + ) -> tuple[Tensor, Tensor, list[int]]: + torch.manual_seed(seed) + + X = torch.rand(n, 1, dtype=dtype, device=self.device) + F = torch.sin(2 * torch.pi * (2 * X)).sum(dim=-1, keepdim=True) + sigma = 1e-2 + Y = F + torch.randn_like(F) * sigma + outlier_indices = list(range(n - num_outliers, n)) + Y[outlier_indices] = -Y[outlier_indices] + return X, Y, outlier_indices + + def _get_robust_model( + self, + X: Tensor, + Y: Tensor, + likelihood: SparseOutlierGaussianLikelihood, + ) -> SingleTaskGP: + min_lengthscale = 0.1 + lengthscale_constraint = NonTransformedInterval( + min_lengthscale, torch.inf, initial_value=0.2 + ) + d = X.shape[-1] + + kernel = ScaleKernel( + RBFKernel(ard_num_dims=d, lengthscale_constraint=lengthscale_constraint), + outputscale_constraint=NonTransformedInterval( + 0.01, 10.0, initial_value=0.1 + ), + ).to(dtype=X.dtype, device=self.device) + + model = SingleTaskGP( + train_X=X, + train_Y=Y, + mean_module=ZeroMean(), + covar_module=kernel, + input_transform=Normalize(d=X.shape[-1]), + outcome_transform=Standardize(m=Y.shape[-1]), + likelihood=likelihood, + ) + model.to(dtype=X.dtype, device=self.device) + return model + + def test_robust_gp_end_to_end(self) -> None: + self._test_robust_gp_end_to_end(convex_parameterization=False, 
mll_tol=1e-8) + + def test_robust_convex_gp_end_to_end(self) -> None: + self._test_robust_gp_end_to_end(convex_parameterization=True, mll_tol=1e-8) + + def _test_robust_gp_end_to_end( + self, + convex_parameterization: bool, + mll_tol: float, + ) -> None: + """End-to-end robust GP test.""" + n = 32 + dtype = torch.double + num_outliers = 5 + X, Y, outlier_indices = self._make_dataset( + n=n, num_outliers=num_outliers, dtype=dtype, seed=1 + ) + min_noise = 1e-6 # minimum noise variance constraint + max_noise = 1e-2 + base_noise = HomoskedasticNoise( + noise_constraint=NonTransformedInterval( + min_noise, max_noise, initial_value=1e-3 + ) + ).to(dtype=dtype, device=self.device) + + rp_likelihood = SparseOutlierGaussianLikelihood( + base_noise=base_noise, + dim=X.shape[0], + convex_parameterization=convex_parameterization, + ) + + model = self._get_robust_model(X=X, Y=Y, likelihood=rp_likelihood) + + X_test = torch.rand(3, 1, dtype=dtype, device=self.device) + with self.assertWarnsRegex(InputDataWarning, "SparseOutlierNoise"): + model.posterior(X_test, observation_noise=True) + + # optimization via backward relevance pursuit (num_contract=1) + sparse_module = model.likelihood.noise_covar + sparse_module.full_support() + mll = ExactMarginalLogLikelihood(likelihood=model.likelihood, model=model) + optimizer_kwargs = { + "options": {"maxiter": 1024, "ftol": mll_tol, "gtol": mll_tol} + } + sparse_module, model_trace = backward_relevance_pursuit( + sparse_module=sparse_module, + mll=mll, + record_model_trace=True, + reset_parameters=False, + # When `mll_iter` > 100, runtime seems to be primarily controlled + # by the convergence tolerance + optimizer_kwargs=optimizer_kwargs, + ) + model_trace = none_throws(model_trace) + + # Bayesian model comparison + prior_mean_of_support = 2.0 + support_size, bmc_probabilities = get_posterior_over_support( + SparseOutlierNoise, model_trace, prior_mean_of_support=prior_mean_of_support + ) + self.assertEqual(len(support_size), n + 1) + self.assertEqual(len(support_size), len(bmc_probabilities)) + self.assertAlmostEqual(bmc_probabilities.sum().item(), 1.0) + map_index = torch.argmax(bmc_probabilities) + + with self.assertRaisesRegex( + ValueError, + "`log_support_prior` and `prior_mean_of_support` cannot both be None", + ): + get_posterior_over_support(SparseOutlierNoise, model_trace) + + map_model = model_trace[map_index] + sparse_module = map_model.likelihood.noise_covar + undetected_outliers = set(outlier_indices) - set(sparse_module.support) + self.assertEqual(len(undetected_outliers), 0) + + def test_robust_relevance_pursuit(self) -> None: + for optimizer, convex_parameterization, dtype in itertools.product( + [forward_relevance_pursuit, backward_relevance_pursuit], + [True, False], + [torch.float32, torch.float64], + ): + with self.subTest( + optimizer=optimizer, + convex_parameterization=convex_parameterization, + dtype=dtype, + ): + # testing the loo functionality only with the forward algorithm + # and the convex parameterization, to save test runtime. 
+ loo = ( + optimizer is forward_relevance_pursuit + ) and convex_parameterization + self._test_robust_relevance_pursuit( + optimizer=optimizer, + convex_parameterization=convex_parameterization, + dtype=dtype, + loo=loo, + ) + + def _test_robust_relevance_pursuit( + self, + optimizer: forward_relevance_pursuit | backward_relevance_pursuit, + convex_parameterization: bool, + dtype: torch.dtype, + loo: bool, + ) -> None: + """ + Test executing with different combinations of arguments, without checking the + model fit end-to-end. + """ + n = 32 + dtype = torch.double + X, Y, _ = self._make_dataset(n=n, num_outliers=6, dtype=dtype, seed=1) + min_noise = 1e-6 # minimum noise variance constraint + max_noise = 1e-2 + base_noise = HomoskedasticNoise( + noise_constraint=NonTransformedInterval( + min_noise, max_noise, initial_value=1e-3 + ) + ).to(dtype=dtype, device=self.device) + + with self.assertRaisesRegex( + ValueError, + "`rho_constraint` must be a `NonTransformedInterval` if it is not None.", + ): + SparseOutlierGaussianLikelihood( + base_noise=base_noise, + dim=X.shape[0], + convex_parameterization=convex_parameterization, + rho_constraint=Interval(0.0, 1.0), # pyre-ignore[6] + ) + + with self.assertRaisesRegex(ValueError, "rho_constraint.lower_bound >= 0"): + SparseOutlierGaussianLikelihood( + base_noise=base_noise, + dim=X.shape[0], + convex_parameterization=convex_parameterization, + rho_constraint=NonTransformedInterval(-1.0, 1.0), + ) + + if convex_parameterization: + with self.assertRaisesRegex(ValueError, "rho_constraint.upper_bound <= 1"): + SparseOutlierGaussianLikelihood( + base_noise=base_noise, + dim=X.shape[0], + convex_parameterization=convex_parameterization, + rho_constraint=NonTransformedInterval(0.0, 2.0), + loo=loo, + ) + else: # with the canonical parameterization, any upper bound on rho is valid. 
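+            # a non-unit upper bound should be propagated unchanged to the noise
+            # module's raw_rho constraint, as verified by the assertions below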
+ likelihood_with_other_bounds = SparseOutlierGaussianLikelihood( + base_noise=base_noise, + dim=X.shape[0], + convex_parameterization=convex_parameterization, + rho_constraint=NonTransformedInterval(0.0, 2.0), + loo=loo, + ) + noise_w_other_bounds = likelihood_with_other_bounds.noise_covar + self.assertEqual(noise_w_other_bounds.raw_rho_constraint.lower_bound, 0.0) + self.assertEqual(noise_w_other_bounds.raw_rho_constraint.upper_bound, 2.0) + + rp_likelihood = SparseOutlierGaussianLikelihood( + base_noise=base_noise, + dim=X.shape[0], + convex_parameterization=convex_parameterization, + loo=loo, + ) + + # testing conversion to and from dense representation + sparse_noise = rp_likelihood.noise_covar + sparse_noise.to_dense() + self.assertFalse(sparse_noise.is_sparse) + dense_rho = sparse_noise.rho + sparse_noise.to_sparse() + self.assertTrue(sparse_noise.is_sparse) + sparse_rho = sparse_noise.rho + self.assertAllClose(dense_rho, sparse_rho) + + with self.assertRaisesRegex(NotImplementedError, "variational inference"): + rp_likelihood.expected_log_prob(target=None, input=None) # pyre-ignore[6] + + # testing prior initialization + likelihood_with_prior = SparseOutlierGaussianLikelihood( + base_noise=base_noise, + dim=X.shape[0], + convex_parameterization=convex_parameterization, + rho_prior=gpytorch.priors.NormalPrior(loc=1 / 2, scale=0.1), + loo=loo, + ) + self.assertIsInstance( + likelihood_with_prior.noise_covar.rho_prior, gpytorch.priors.NormalPrior + ) + + # combining likelihood with rho prior and full GP model + # this will test the prior code paths when computing the marginal likelihood + model = self._get_robust_model(X=X, Y=Y, likelihood=likelihood_with_prior) + + # testing the _from_model method + with self.assertRaisesRegex( + ValueError, + "The model's likelihood does not have a SparseOutlierNoise noise", + ): + SparseOutlierNoise._from_model(SingleTaskGP(train_X=X, train_Y=Y)) + + self.assertEqual( + SparseOutlierNoise._from_model(model), likelihood_with_prior.noise_covar + ) + + # Test that there is a warning because + # model.likelihood.noise_covar._cached_train_inputs is None + # and the shape of the test inputs are not compatible with the noise module. 
+ X_test = torch.rand(3, 1, dtype=dtype, device=self.device) + with self.assertWarnsRegex( + InputDataWarning, + "Robust rho not applied because the last dimension of the base noise " + "covariance", + ): + model.likelihood.noise_covar.forward(X_test) + + with self.assertRaisesRegex( + UnsupportedError, "only supports a single training input Tensor" + ): + model.likelihood.noise_covar.forward([X, X]) + + # executing once successfully so that _cached_train_inputs is populated + self.assertIsNone(model.likelihood.noise_covar._cached_train_inputs) + model.posterior(X, observation_noise=True) + self.assertIsNotNone(model.likelihood.noise_covar._cached_train_inputs) + + X_test = torch.rand_like(X) # same size as training inputs but not the same + with self.assertWarnsRegex( + InputDataWarning, + "Robust rho not applied because the passed train_inputs are not equal to" + " the cached ones.", + ): + model.posterior(X_test, observation_noise=True) + + # optimization via forward or backward relevance pursuit + sparse_module = model.likelihood.noise_covar + mll = ExactMarginalLogLikelihood(likelihood=model.likelihood, model=model) + + with self.assertWarnsRegex( + InputDataWarning, + "not applied because the training inputs were not passed to the likelihood", + ): + # the training inputs would have to be passed as arguments to the mll after + # the train targets. + mll(mll.model(*mll.model.train_inputs), mll.model.train_targets) + + extra_kwargs = {} + if convex_parameterization: + # initial_support doesn't have any special effect in the convex + # parameterization, adding this here so we don't have to test every + # combination of arguments. + extra_kwargs["initial_support"] = torch.randperm( + X.shape[0], device=self.device + )[: n // 2] # initializing with half the support + + # testing the reset_parameters functionality in conjunction with the convex + # parameterization to limit test runtime, but they are orthogonal. + # NOTE: the forward algorithm can give rise to an early termination warning, + # due to reaching a stationary point of the likelihood before adding all indices + reset_parameters = convex_parameterization + sparse_module, model_trace = optimizer( + sparse_module=sparse_module, + mll=mll, + record_model_trace=True, + reset_parameters=reset_parameters, + reset_dense_parameters=reset_parameters, + optimizer_kwargs={"options": {"maxiter": 2}}, + **extra_kwargs, + ) + model_trace = none_throws(model_trace) + # Bayesian model comparison + prior_mean_of_support = 2.0 + support_size, bmc_probabilities = get_posterior_over_support( + SparseOutlierNoise, model_trace, prior_mean_of_support=prior_mean_of_support + ) + if optimizer is backward_relevance_pursuit: + expected_length = len(X) + 1 + if "initial_support" in extra_kwargs: + expected_length = len(X) // 2 + 1 + self.assertEqual(len(model_trace), expected_length) + self.assertEqual(support_size.max(), expected_length - 1) + self.assertEqual(support_size[-1].item(), 0) # includes zero + # results of forward are sorted in decreasing order of support size + self.assertAllClose(support_size, support_size.sort(descending=True).values) + + elif optimizer is forward_relevance_pursuit: + # the forward algorithm will only add until no additional rho has a + # non-negative gradient, which can happen before the full support is added. 
+ min_expected_length = 10 + self.assertGreaterEqual(len(model_trace), min_expected_length) + lower_bound = len(X) // 2 if "initial_support" in extra_kwargs else 0 + self.assertGreaterEqual(support_size.min().item(), lower_bound) + self.assertEqual( + support_size[0].item(), + n // 2 if "initial_support" in extra_kwargs else 0, + ) + + # results of forward are sorted in increasing order of support size + self.assertAllClose( + support_size, support_size.sort(descending=False).values + ) + + def test_basic_relevance_pursuit_module(self) -> None: + dim = 3 + module = DummyRelevancePursuitModule(dim=dim, support=[0]) + # testing basic sparse -> dense conversions + self.assertTrue(module.is_sparse) + module.to_dense() + self.assertFalse(module.is_sparse) + self.assertEqual(len(module.sparse_parameter), dim) + + for representation in ["sparse", "dense"]: + module = DummyRelevancePursuitModule(dim=dim, support=[0]) + getattr(module, "to_" + representation)() + + # testing that we can't remove indices that don't exist in the support + with self.assertRaisesRegex(ValueError, "is not in support"): + module.contract_support(indices=[1]) + + # testing that we can't add already existing indices to the support + with self.assertRaisesRegex(ValueError, "already in the support"): + module.expand_support(indices=[0]) + + # successfully contract in dense representation + module.contract_support(indices=[0]) + self.assertEqual(len(module.support), 0) + self.assertFalse( + module.is_sparse if representation == "dense" else not module.is_sparse + ) + self.assertAllClose( + module.sparse_parameter, torch.zeros_like(module.sparse_parameter) + ) + + module = DummyRelevancePursuitModule(dim=dim, support=[0, 2]) + getattr(module, "to_" + representation)() + module.remove_support() + self.assertEqual(len(module.support), 0) + self.assertAllClose( + module.sparse_parameter, torch.zeros_like(module.sparse_parameter) + ) + + # unrelated model + model = SingleTaskGP(train_X=torch.rand(3, 1), train_Y=torch.rand(3, 1)) + mll = ExactMarginalLogLikelihood(likelihood=model.likelihood, model=model) + with self.assertRaisesRegex( + ValueError, "The gradient of the sparse_parameter is None." + ): + module.expansion_objective(mll) + + # can't "expand" support by a negative number of indices + self.assertFalse(module.support_expansion(mll, n=-1)) + + # initalizing model so that train_X is dependent on the sparse_parameter + module = DummyRelevancePursuitModule(dim=dim, support=None) + module.to_dense() + train_X = module.sparse_parameter.unsqueeze(-1) + model = SingleTaskGP(train_X=train_X, train_Y=torch.rand_like(train_X)) + mll = ExactMarginalLogLikelihood(likelihood=model.likelihood, model=model) + # not expanding because the gradient is zero, but not None. + self.assertFalse(module.support_expansion(mll, n=1)) + + # testing support expansion with a gradient modifier, which assigns + # a positive value to the last element of the sparse_parameter. 
+ def expansion_modifier(g: Tensor | None, module=module) -> Tensor: + g = torch.zeros_like(module.sparse_parameter) + g[-1] = 1.0 # this means we are adding the last inactive element + return g + + expanded = module.support_expansion(mll, n=1, modifier=expansion_modifier) + self.assertTrue(expanded) + self.assertTrue(dim - 1 in module.support) + + # testing support contraction + # not contracting since we can't contract by a negative number of indices + self.assertFalse(module.support_contraction(mll, n=-1)) + self.assertTrue( + module.support_contraction(mll, n=1, modifier=expansion_modifier) + ) + self.assertEqual(len(module.support), 0) + + def contraction_modifier(g: Tensor | None, module) -> Tensor: + g = torch.ones_like(module.sparse_parameter) + g[-1] = 0.0 # this means we are removing the last element + return g + + module = DummyRelevancePursuitModule(dim=dim, support=None) + module.full_support() + module._contraction_modifier = partial(contraction_modifier, module=module) + + # unrelated model + model = SingleTaskGP(train_X=torch.rand(3, 1), train_Y=torch.rand(3, 1)) + mll = ExactMarginalLogLikelihood(likelihood=model.likelihood, model=model) + + _, model_trace = backward_relevance_pursuit( + sparse_module=module, + mll=mll, + record_model_trace=True, + ) + self.assertEqual(len(model_trace), dim + 1) + + module = DummyRelevancePursuitModule(dim=dim, support=None) + module.full_support() + module._contraction_modifier = partial(contraction_modifier, module=module) + + with self.assertWarnsRegex(Warning, "contraction from sparsity.*unsuccessful."): + _, model_trace = backward_relevance_pursuit( + sparse_module=module, + mll=mll, + record_model_trace=True, + sparsity_levels=[dim + 1, dim, dim - 1], + ) + self.assertEqual(len(model_trace), 1) + + # the exit condition of the forward algorithm is already tested in the tests of + # the robust model + + # testing parameter initialization helper + value = _get_initial_value( + value=None, + lower=torch.tensor(-torch.inf, device=self.device), + upper=torch.tensor(torch.inf, device=self.device), + ) + self.assertEqual(value, torch.tensor(0.0, device=self.device)) + value = _get_initial_value( + value=torch.tensor(-3.0, device=self.device), + lower=torch.tensor(-2.0, device=self.device), + upper=torch.tensor(torch.inf, device=self.device), + ) + self.assertEqual(value, torch.tensor(-1.0, device=self.device)) + value = _get_initial_value( + value=None, + lower=torch.tensor(-torch.inf, device=self.device), + upper=torch.tensor(2.0, device=self.device), + ) + self.assertEqual(value, torch.tensor(1.0, device=self.device)) + value = _get_initial_value( + value=None, + lower=torch.tensor(0.0, device=self.device), + upper=torch.tensor(torch.pi, device=self.device), + ) + self.assertAllClose(value, torch.tensor(1.0, device=self.device)) + + def test_experimental_utils(self) -> None: + base_f = Ackley(dim=3) + outlier_value = 100.0 + outlier_generator = partial(constant_outlier_generator, constant=outlier_value) + + # no outliers + f = CorruptedTestProblem( + base_test_problem=base_f, + outlier_generator=outlier_generator, + outlier_fraction=0.0, + ) + n, d = 16, base_f.dim + X = torch.randn(n, d) + Y = f(X) + self.assertAllClose(Y, base_f(X)) + + # all outliers + f = CorruptedTestProblem( + base_test_problem=base_f, + outlier_generator=outlier_generator, + outlier_fraction=1.0, + ) + n, d = 16, base_f.dim + X = torch.randn(n, d) + Y = f(X) + self.assertTrue(((Y - base_f(X)).abs() > 1).all()) + + # testing seeds + num_seeds = 3 + f = 
CorruptedTestProblem( + base_test_problem=base_f, + outlier_generator=outlier_generator, + outlier_fraction=1 / 2, + seeds=range(num_seeds), + ) + n, d = 8, base_f.dim + X = torch.randn(n, d) + Y_last = base_f(X) + for _ in range(num_seeds): + Y = f(X) + # with these seeds we should have at least 1 outlier and less than n, + # which shows that the masking works correctly. + num_outliers = (Y == outlier_value).sum() + self.assertGreater(num_outliers, 1) + self.assertLess(num_outliers, n) + # testing that the outliers are not the same, even if we evaluate on the + # same input. + self.assertTrue((Y_last - Y).norm() > 1.0) + Y_last = Y + + # after num_seeds has been exhausted, the evaluation will error. + with self.assertRaises(StopIteration): + f(X) diff --git a/test/test_functions/test_base.py b/test/test_functions/test_base.py index b39fa7a049..3ee76558c6 100644 --- a/test/test_functions/test_base.py +++ b/test/test_functions/test_base.py @@ -6,13 +6,13 @@ import torch from botorch.test_functions.base import BaseTestProblem, ConstrainedBaseTestProblem -from botorch.utils.testing import BotorchTestCase +from botorch.utils.testing import BotorchTestCase, TestCorruptedProblemsMixin from torch import Tensor class DummyTestProblem(BaseTestProblem): dim = 2 - _bounds = [(0, 1), (2, 3)] + _bounds = [(0.0, 1.0), (2.0, 3.0)] def evaluate_true(self, X: Tensor) -> Tensor: return -X.pow(2).sum(dim=-1) @@ -56,3 +56,26 @@ def test_constrained_base_test_problem(self): feas = problem.is_feasible(X=X) self.assertFalse(feas[0].item()) self.assertTrue(feas[1].item()) + + +class TestSeedingMixin(TestCorruptedProblemsMixin): + def test_seed_iteration(self) -> None: + problem = self.rosenbrock_problem + + self.assertTrue(problem.has_seeds) + self.assertIsNone(problem.seed) # increment_seed needs to be called first + problem.increment_seed() + self.assertEqual(problem.seed, 1) + problem.increment_seed() + self.assertEqual(problem.seed, 2) + with self.assertRaises(StopIteration): + problem.increment_seed() + + +class TestCorruptedTestProblem(TestCorruptedProblemsMixin): + def test_basic_rosenbrock(self) -> None: + problem = self.rosenbrock_problem + x = torch.rand(5, 1) + result = problem(x) + # the outlier_generator sets corruptions to 1 + self.assertTrue((result == 1).all()) diff --git a/test/utils/multi_objective/test_scalarization.py b/test/utils/multi_objective/test_scalarization.py index 723135eef8..4ae745b5aa 100644 --- a/test/utils/multi_objective/test_scalarization.py +++ b/test/utils/multi_objective/test_scalarization.py @@ -15,6 +15,7 @@ class TestGetChebyshevScalarization(BotorchTestCase): def test_get_chebyshev_scalarization(self): + torch.manual_seed(1234) tkwargs = {"device": self.device} Y_train = torch.rand(4, 2, **tkwargs) neg_Y_train = -Y_train