diff --git a/CHANGELOG.md b/CHANGELOG.md
index 566bab6..6a11a12 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,6 +1,13 @@
 # Changelog
 
 ## [Untracked Changes]
+### Added
+- Improved methods for fitting PyTorch surrogates, including auto-stopping by parameter value norm
+
+### Modified
+- Greatly reduced the number of samples for DNN posterior, speeding up optimization
+- Stabilized the mean estimate of ensemble surrogates by avoiding resampling
+- Disabled root caching for ensemble surrogates during optimization
 
 ## [0.8.5]
 ### Added
diff --git a/obsidian/acquisition/custom.py b/obsidian/acquisition/custom.py
index 4f41b6a..efb2495 100644
--- a/obsidian/acquisition/custom.py
+++ b/obsidian/acquisition/custom.py
@@ -64,10 +64,11 @@ class qSpaceFill(MCAcquisitionFunction):
     """
     def __init__(self,
                  model: Model,
+                 X_baseline: Tensor,
                  sampler: MCSampler | None = None,
                  objective: MCAcquisitionObjective | None = None,
                  posterior_transform: PosteriorTransform | None = None,
-                 X_pending: Tensor | None = None):
+                 X_pending: Tensor | None = None,):
 
         if sampler is None:
             sampler = SobolQMCNormalSampler(sample_shape=torch.Size([512]))
@@ -80,6 +81,8 @@ def __init__(self,
 
         super().__init__(model=model, sampler=sampler, objective=objective,
                          posterior_transform=posterior_transform, X_pending=X_pending)
+
+        self.register_buffer('X_baseline', X_baseline)
 
     @t_batch_mode_transform()
     def forward(self,
@@ -88,7 +91,7 @@ def forward(self,
         Evaluate the acquisition function on the candidate set x
         """
         # x dimensions: b * q * d
-        x_train = self.model.train_inputs[0][0]  # train_inputs is a list of tuples
+        x_train = self.X_baseline
 
         # For sequential mode, add pending data points to "train"
         if self.X_pending is not None:
diff --git a/obsidian/optimizer/bayesian.py b/obsidian/optimizer/bayesian.py
index 468199c..dff4fc5 100644
--- a/obsidian/optimizer/bayesian.py
+++ b/obsidian/optimizer/bayesian.py
@@ -3,7 +3,7 @@
 from .base import Optimizer
 
 from obsidian.parameters import ParamSpace, Target, Task
-from obsidian.surrogates import SurrogateBoTorch, DNN
+from obsidian.surrogates import SurrogateBoTorch, EnsembleModel
 from obsidian.acquisition import aq_class_dict, aq_defaults, aq_hp_defaults, valid_aqs
 from obsidian.surrogates import model_class_dict
 from obsidian.objectives import Index_Objective, Objective_Sequence
@@ -18,7 +18,7 @@
 from botorch.sampling.index_sampler import IndexSampler
 from botorch.models.model_list_gp_regression import ModelListGP
 from botorch.models.gpytorch import GPyTorchModel
-from botorch.models.model import ModelList
+from botorch.models.model import ModelList, Model
 from botorch.utils.sampling import draw_sobol_samples
 from botorch.utils.multi_objective.box_decompositions.non_dominated import NondominatedPartitioning
 
@@ -478,6 +478,7 @@ def _parse_aq_kwargs(self,
                          hps: dict,
                          m_batch: int,
                          target_locs: list[int],
+                         model: Model,
                          X_t_pending: Tensor | None = None,
                          objective: MCAcquisitionObjective | None = None) -> dict:
         """
@@ -533,7 +534,9 @@
 
         # Noisy aqs require X_train reference
         if aq in ['NEI', 'NEHVI', 'NParEGO']:
             aq_kwargs['X_baseline'] = X_baseline
-
+            if any(isinstance(m, EnsembleModel) for m in model.models):
+                aq_kwargs['cache_root'] = False
+
         # Hypervolume requires reference point
         if aq in ['EHVI', 'NEHVI']:
@@ -570,6 +573,9 @@
             w = w/torch.sum(torch.abs(w))
             aq_kwargs['scalarization_weights'] = w
 
+        if aq == 'SF':
+            aq_kwargs['X_baseline'] = X_baseline
+
         return aq_kwargs
 
     def suggest(self,
@@ -712,7 +718,7 @@ def suggest(self,
         if not isinstance(model, ModelListGP):
             samplers = []
             for m in model.models:
-                if isinstance(m, DNN):
+                if isinstance(m, EnsembleModel):
                     sampler_i = IndexSampler(sample_shape=torch.Size([optim_samples]), seed=self.seed)
                 else:
                     sampler_i = SobolQMCNormalSampler(sample_shape=torch.Size([optim_samples]), seed=self.seed)
@@ -757,7 +763,9 @@ def suggest(self,
 
             # Use aq_kwargs so that extra unnecessary ones in hps get removed for certain aq funcs
             aq_kwargs = {'model': model, 'sampler': sampler, 'X_pending': X_t_pending}
 
-            aq_kwargs.update(self._parse_aq_kwargs(aq_str, aq_hps, m_batch, target_locs, X_t_pending, objective))
+            aq_kwargs.update(self._parse_aq_kwargs(aq_str, aq_hps, m_batch,
+                                                   target_locs, model,
+                                                   X_t_pending, objective))
 
             # Raise errors related to certain constraints
             if aq_str in ['UCB', 'Mean', 'TS', 'SF', 'SR', 'NIPV']:
@@ -812,7 +820,10 @@ def suggest(self,
 
             # Hypervolume aqs fail with X_t_pending when optim_sequential=True
             if aq_str in ['NEHVI', 'EHVI']:
-                optim_sequential = False
+                if optim_sequential and X_t_pending is not None:
+                    warnings.warn('Hypervolume aqs with X_pending require joint optimization. \
+                                  Setting optim_sequential to False', UserWarning)
+                    optim_sequential = False
 
             # If it's random search, no need to do optimization; Otherwise, initialize the aq function and optimize
             if aq_str == 'RS':
@@ -978,7 +989,9 @@ def evaluate(self,
 
         # Use aq_kwargs so that extra unnecessary ones in hps get removed for certain aq funcs
         aq_kwargs = {'model': model, 'sampler': None, 'X_pending': X_t_pending}
-        aq_kwargs.update(self._parse_aq_kwargs(aq_str, aq_hps, X_suggest.shape[0], target_locs, X_t_pending, objective))
+        aq_kwargs.update(self._parse_aq_kwargs(aq_str, aq_hps, X_suggest.shape[0],
+                                               target_locs, model,
+                                               X_t_pending, objective))
 
         # If it's random search, no need to evaluate aq
         if aq_str == 'RS':
diff --git a/obsidian/plotting/plotly.py b/obsidian/plotting/plotly.py
index e753252..02f88e7 100644
--- a/obsidian/plotting/plotly.py
+++ b/obsidian/plotting/plotly.py
@@ -10,7 +10,6 @@
 import plotly.graph_objects as go
 from plotly.graph_objects import Figure
 from plotly.subplots import make_subplots
-from sklearn.manifold import MDS
 
 import pandas as pd
 import numpy as np
@@ -52,18 +51,34 @@ def visualize_inputs(campaign: Campaign) -> Figure:
                         + ['Correlation Matrix']
                         + [X.columns[i] for i in range(cols, n_dim)]
                         )
 
+    # if campaign.optimizer is fitted, then X_best_f_idx is identified
+    if 'X_best_f_idx' in dir(campaign.optimizer):
+        marker_shapes = ['diamond' if rowInd in [campaign.optimizer.X_best_f_idx] else 'circle' for rowInd in range(campaign.X.shape[0])]
+    else:
+        marker_shapes = ['circle']*campaign.X.shape[0]
     for i, param in enumerate(X.columns):
         row_i = i // cols + 1
         col_i = i % cols + 1
         fig.add_trace(go.Scatter(x=X.index, y=X[param],
                                  mode='markers', name=param,
-                                 marker=dict(color=color_list[i]),
+                                 marker=dict(color=color_list[i], symbol=marker_shapes),
                                  showlegend=False),
                       row=row_i, col=col_i)
         fig.update_xaxes(tickvals=np.around(np.linspace(0, campaign.m_exp, 5)),
                          row=row_i, col=col_i)
 
+    # Add note to explain the shape of markers
+    if hasattr(campaign.optimizer, 'X_best_f_idx'):
+        fig.add_annotation(
+            text="Note: The diamond markers denote samples that achieve the best sum of targets.",
+            showarrow=False,
+            xref="paper", yref="paper",
+            x=0,
+            y=-0.2,
+            font=dict(style="italic")
+        )
+
     # Calculate the correlation matrix
     X_u = campaign.X_space.unit_map(X)
     corr_matrix = X_u.corr()
@@ -99,6 +114,12 @@ def MDS_plot(campaign: Campaign) -> Figure:
     Returns:
         fig (Figure): The MDS plot
     """
 
+    try:
+        from sklearn.manifold import MDS
+    except ImportError:
+        raise ImportError('The `sklearn` package (>1.0) is required for the MDS plot. \
+                          Please install it using `pip install scikit-learn`')
+
     mds = MDS(n_components=2)
     X_mds = mds.fit_transform(campaign.X_space.encode(campaign.X))
@@ -320,8 +341,9 @@
         Y_mu_ref = Y_pred_ref[y_name+('_t (pred)' if f_transform else ' (pred)')].values
         fig.add_trace(go.Scatter(x=X_ref.iloc[:, feature_id].values, y=Y_mu_ref,
                                  mode='markers',
+                                 marker=dict(symbol='diamond'),
                                  line={'color': obsidian_colors.teal},
-                                 name='Ref'),
+                                 name='Reference'),
                       )
     fig.update_xaxes(title_text=X_name)
     fig.update_yaxes(title_text=y_name)
@@ -539,7 +561,19 @@ def optim_progress(campaign: Campaign,
                              marker=marker_dict,
                              customdata=campaign.data[X_names],
                              name='Data'))
-
+
+    # Highlight the best samples
+    if hasattr(campaign.optimizer, 'X_best_f_idx'):
+        fig.add_trace(go.Scatter(x=pd.Series(out_exp.iloc[campaign.optimizer.X_best_f_idx, 0]),
+                                 y=pd.Series(out_exp.iloc[campaign.optimizer.X_best_f_idx, 1]),
+                                 mode='markers',
+                                 marker=dict(symbol='diamond-open', size=14),
+                                 line={'color': 'black'},
+                                 legendgroup='marker_shape', showlegend=True,
+                                 name='Best')
+                      )
+        fig.update_layout(showlegend=True)
+
     template = ["<b>"+str(param.name)+"</b>: "+" %{customdata["+str(i)+"]" + (":.3G}"if isinstance(param, Param_Continuous) else "}") + "<br>"
                 for i, param in enumerate(campaign.X_space)]
 
" for i, param in enumerate(campaign.X_space)] diff --git a/obsidian/surrogates/__init__.py b/obsidian/surrogates/__init__.py index 3ec1726..94d42ee 100644 --- a/obsidian/surrogates/__init__.py +++ b/obsidian/surrogates/__init__.py @@ -4,3 +4,4 @@ from .custom_GP import * from .custom_torch import * from .config import * +from .utils import * diff --git a/obsidian/surrogates/botorch.py b/obsidian/surrogates/botorch.py index 25d668e..9eed9d7 100644 --- a/obsidian/surrogates/botorch.py +++ b/obsidian/surrogates/botorch.py @@ -2,6 +2,7 @@ from .base import SurrogateModel from .config import model_class_dict +from .utils import fit_pytorch from obsidian.utils import tensordict_to_dict, dict_to_tensordict from obsidian.exceptions import SurrogateFitError @@ -10,11 +11,11 @@ from botorch.fit import fit_gpytorch_mll from botorch.optim.fit import fit_gpytorch_mll_torch, fit_gpytorch_mll_scipy from botorch.models.gpytorch import GPyTorchModel +from botorch.models.ensemble import EnsembleModel from gpytorch.mlls import ExactMarginalLogLikelihood import torch import torch.nn as nn -import torch.optim as optim import numpy as np import pandas as pd import warnings @@ -156,20 +157,7 @@ def fit(self, raise SurrogateFitError('BoTorch model failed to fit') else: self.loss_fcn = nn.MSELoss() - self.optimizer = optim.Adam(self.torch_model.parameters(), lr=1e-2) - - self.torch_model.train() - for epoch in range(200): - self.optimizer.zero_grad() - output = self.torch_model(X_p) - loss = self.loss_fcn(output, y_p) - loss.backward() - self.optimizer.step() - - if (epoch % 50 == 0 and self.verbose): - print(f'Epoch {epoch}: Loss {loss.item()}') - - self.torch_model.eval() + fit_pytorch(self.torch_model, X_p, y_p, loss_fcn=self.loss_fcn, verbose=self.verbose) self.is_fit = True @@ -229,7 +217,14 @@ def predict(self, X_p = self._prepare(X) pred_posterior = self.torch_model.posterior(X_p) - mu = pred_posterior.mean.detach().cpu().squeeze(-1) + + # We would prefer to have stability in the mean of ensemble models, + # So, we will not re-sample for prediction but use forward methods + if isinstance(self.torch_model, EnsembleModel): + mu = self.torch_model.forward(X_p).detach() + else: + mu = pred_posterior.mean.detach().cpu().squeeze(-1) + if q is not None: if (q < 0) or (q > 1): raise ValueError('Quantile must be between 0 and 1') diff --git a/obsidian/surrogates/custom_torch.py b/obsidian/surrogates/custom_torch.py index 3cbdcd6..b1bb037 100644 --- a/obsidian/surrogates/custom_torch.py +++ b/obsidian/surrogates/custom_torch.py @@ -1,11 +1,21 @@ """Custom implementations of PyTorch surrogate models using BoTorch API""" -from botorch.models.model import Model +from .utils import fit_pytorch + +from obsidian.config import TORCH_DTYPE + +from botorch.models.model import FantasizeMixin +from botorch.models.ensemble import EnsembleModel, Model from botorch.posteriors.ensemble import Posterior, EnsemblePosterior +import torch import torch.nn as nn +from torch.nn import Module from torch import Tensor +from typing import TypeVar +TFantasizeMixin = TypeVar("TFantasizeMixin", bound="FantasizeMixin") + class DNNPosterior(EnsemblePosterior): @@ -17,7 +27,7 @@ def quantile(self, value: Tensor) -> Tensor: return self.values.quantile(q=value.to(self.values), dim=-3, interpolation='linear') -class DNN(Model): +class DNN(EnsembleModel, FantasizeMixin): def __init__(self, train_X: Tensor, train_Y: Tensor, @@ -33,6 +43,13 @@ def __init__(self, if p_dropout < 0 or p_dropout > 1: raise ValueError("p_dropout must be in [0, 1]") + 
+        self.register_buffer('train_X', train_X)
+        self.register_buffer('train_Y', train_Y)
+        self.register_buffer('p_dropout', torch.tensor(p_dropout, dtype=TORCH_DTYPE))
+        self.register_buffer('h_width', torch.tensor(h_width, dtype=torch.int))
+        self.register_buffer('h_layers', torch.tensor(h_layers, dtype=torch.int))
+        self.register_buffer('num_outputs', torch.tensor(num_outputs, dtype=torch.int))
+
         self.input_layer = nn.Sequential(
             nn.Linear(train_X.shape[-1], h_width),
             nn.PReLU(),
@@ -49,6 +66,7 @@ def __init__(self,
 
         self.outer_layer = nn.Linear(h_width, num_outputs)
 
         self._num_outputs = num_outputs
+        self.to(TORCH_DTYPE)
 
     def forward(self,
@@ -60,7 +78,7 @@ def forward(self,
 
     def posterior(self,
                   X: Tensor,
-                  n_sample: int = 16384,
+                  n_sample: int = 512,
                   output_indices: list[int] = None,
                   observation_noise: bool | Tensor = False) -> Posterior:
         """Calculates the posterior distribution of the model at X"""
@@ -86,3 +104,53 @@
     def num_outputs(self) -> int:
         """Number of outputs of the model"""
         return self._num_outputs
+
+    def transform_inputs(self,
+                         X: Tensor,
+                         input_transform: Module = None) -> Tensor:
+        """
+        Transform inputs.
+
+        Args:
+            X: A tensor of inputs
+            input_transform: A Module that performs the input transformation.
+
+        Returns:
+            A tensor of transformed inputs
+        """
+        if input_transform is not None:
+            input_transform.to(X)
+            return input_transform(X)
+        try:
+            return self.input_transform(X)
+        except AttributeError:
+            return X
+
+    def condition_on_observations(self,
+                                  X: Tensor,
+                                  Y: Tensor) -> TFantasizeMixin:
+        """
+        Condition the model to new observations, returning a fantasy model
+        """
+
+        X_c = torch.concat((self.train_X, X), axis=0)
+        Y_c = torch.concat((self.train_Y, Y), axis=0)
+
+        # Create a new model based on the current one
+        fantasy = self.__class__(train_X=X_c, train_Y=Y_c,
+                                 p_dropout=float(self.p_dropout),
+                                 h_width=int(self.h_width), h_layers=int(self.h_layers),
+                                 num_outputs=int(self.num_outputs))
+
+        # Fit to the new data
+        fit_pytorch(fantasy, X_c, Y_c)
+
+        return fantasy
+
+    def fantasize(self,
+                  X: Tensor) -> Model:
+
+        Y_f = self.forward(X).detach()
+        fantasy = self.condition_on_observations(X, Y_f)
+
+        return fantasy
diff --git a/obsidian/surrogates/utils.py b/obsidian/surrogates/utils.py
new file mode 100644
index 0000000..246c96c
--- /dev/null
+++ b/obsidian/surrogates/utils.py
@@ -0,0 +1,86 @@
+"""Utility functions for surrogate model handling"""
+
+import torch
+from torch import Tensor
+import torch.nn as nn
+from torch.nn import Module, Parameter
+import torch.optim as optim
+
+
+def check_parameter_change(params: list[Parameter],
+                           prev_params: list[Parameter]) -> float:
+    """Check the average change in model parameters."""
+
+    changes = []
+    for param, prev_param in zip(params, prev_params):
+        changes.append(torch.norm(param.data - prev_param.data))
+
+    total_change = sum(changes)
+    avg_change = total_change / len(list(params))
+
+    return avg_change
+
+
+def fit_pytorch(model: Module,
+                X: Tensor,
+                y: Tensor,
+                loss_fcn: Module | None = None,
+                verbose: bool = False,
+                max_iter: int = 5000) -> None:
+    """
+    Fits a PyTorch model to the given input data.
+    Args:
+        model (Module): The PyTorch model to be trained.
+        X (Tensor): The input data.
+        y (Tensor): The target data.
+        loss_fcn (Module, optional): The loss function to be used.
+            If not provided, the Mean Squared Error (MSE) loss function will
+            be used. Defaults to ``None``.
+        verbose (bool, optional): Whether to print the loss at each epoch.
+            Defaults to ``False``.
+    """
+
+    if loss_fcn is None:
+        loss_fcn = nn.MSELoss()
+
+    optimizer = optim.Adam(model.parameters(),
+                           lr=1e-2,
+                           weight_decay=1e-2)
+
+    # Set up early stopping
+    avg_change = 1e10
+    param_tol = 1e-2
+    converge_buffer = 5
+    converge_count = 0
+
+    model.train()
+    for epoch in range(max_iter):
+
+        prev_params = [param.clone().detach() for param in model.parameters()]
+        optimizer.zero_grad()
+        output = model(X)
+        loss = loss_fcn(output, y)
+        loss.backward()
+        optimizer.step()
+
+        # Check change
+        current_params = [param.clone().detach() for param in model.parameters()]
+        avg_change = check_parameter_change(current_params, prev_params)
+
+        if (epoch % 50 == 0 and verbose):
+            print(f'Epoch {epoch}: Loss {loss.item()}')
+
+        # If the parameter change is under tolerance, increment the counter
+        if avg_change < param_tol:
+            converge_count += 1
+        else:
+            converge_count = 0
+
+        # Auto-converge when the count reaches the desired length
+        if converge_count == converge_buffer:
+            if verbose:
+                print(f'Converged at epoch {epoch}')
+            break
+
+    model.eval()
+    return
diff --git a/obsidian/tests/test_optimizer_MOO.py b/obsidian/tests/test_optimizer_MOO.py
index 5740c32..167030c 100644
--- a/obsidian/tests/test_optimizer_MOO.py
+++ b/obsidian/tests/test_optimizer_MOO.py
@@ -7,6 +7,8 @@
 from obsidian.optimizer import BayesianOptimizer
 from obsidian.experiment.benchmark import two_leaves
 from obsidian.tests.utils import equal_state_dicts
+
+from botorch.models.ensemble import EnsembleModel
 
 import pandas as pd
 import numpy as np
@@ -51,10 +53,17 @@ def test_optimizer_fit(X_space, surrogate, Z0, serial_test=True):
         obj_dict2 = optimizer_2.save_state()
         assert equal_state_dicts(obj_dict, obj_dict2)
         optimizer_2.__repr__()
+
     y_pred = optimizer.predict(optimizer.X_train)
     y_pred_2 = optimizer_2.predict(optimizer.X_train)
-    y_error = ((y_pred_2-y_pred)/y_pred.max(axis=0)).values
-    assert abs(y_error).max() < tol, 'Prediction error in loading parameters of saved optimizer'
+    y_error = ((y_pred_2-y_pred)/y_pred.max(axis=0))
+
+    # Only check the mean for ensemble models, as the lb/ub are not deterministic
+    if isinstance(optimizer.surrogate[0].torch_model, EnsembleModel):
+        check_cols = [col for col in y_pred if '(pred)' in col]
+        y_error = y_error[check_cols]
+
+    assert abs(y_error.values).max() < tol, 'Prediction error in loading parameters of saved optimizer'
 
 
 # Generate a baseline optimizer to use for future tests
diff --git a/obsidian/tests/test_optimizer_SOO.py b/obsidian/tests/test_optimizer_SOO.py
index 8477709..ca961b4 100644
--- a/obsidian/tests/test_optimizer_SOO.py
+++ b/obsidian/tests/test_optimizer_SOO.py
@@ -8,6 +8,8 @@
 from obsidian.experiment.benchmark import shifted_parab
 from obsidian.tests.utils import approx_equal, equal_state_dicts
 
+from botorch.models.ensemble import EnsembleModel
+
 import pandas as pd
 import numpy as np
 import pytest
@@ -80,8 +82,14 @@ def test_optimizer_fit(X_space, surrogate, Z0, serial_test=True):
         optimizer_2.__repr__()
     y_pred = optimizer.predict(optimizer.X_train)
     y_pred_2 = optimizer_2.predict(optimizer.X_train)
-    y_error = ((y_pred_2-y_pred)/y_pred.max(axis=0)).values
-    assert abs(y_error).max() < tol, 'Prediction error in loading parameters of saved optimizer'
+    y_error = ((y_pred_2-y_pred)/y_pred.max(axis=0))
+
+    # Only check the mean for ensemble models, as the lb/ub are not deterministic
+    if isinstance(optimizer.surrogate[0].torch_model, EnsembleModel):
+        check_cols = [col for col in y_pred if '(pred)' in col]
+        y_error = y_error[check_cols]
+
+    assert abs(y_error.values).max() < tol, 'Prediction error in loading parameters of saved optimizer'
 
 
 # Generate a baseline optimizer to use for future tests
diff --git a/pyproject.toml b/pyproject.toml
index c974769..c114fb5 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -58,12 +58,12 @@
 sphinx = { version = "^7.3.7", optional = true}
 myst-parser = { version = "^3.0.1", optional = true}
 pydata-sphinx-theme = { version = "^0.15.4", optional = true}
 linkify-it-py = { version = "^2.0.3", optional = true}
-scikit-learn = "^1.5.1"
+scikit-learn = { version = "^1.5.1", optional = true}
 
 [tool.poetry.extras]
 app = ["flask", "dash", "dash-daq", "dash-bootstrap-components"]
-dev = ["pytest", "xlrd", "ipykernel", "jupyterlab", "flake8", "pytest-cov"]
+dev = ["pytest", "xlrd", "ipykernel", "jupyterlab", "flake8", "pytest-cov", "scikit-learn"]
 docs = ["sphinx", "myst-parser", "pydata-sphinx-theme", "linkify-it-py"]
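A minimal sketch of how the new `fit_pytorch` helper with auto-stopping could be exercised on its own; the toy `nn.Sequential` model and random data below are illustrative assumptions and are not part of this patch, only the `fit_pytorch(model, X, y, loss_fcn=None, verbose=False, max_iter=5000)` signature comes from `obsidian/surrogates/utils.py` above.

```python
# Illustrative sketch only: the two-layer network and synthetic data are
# assumptions for this example; fit_pytorch is the helper added in
# obsidian/surrogates/utils.py.
import torch
import torch.nn as nn

from obsidian.surrogates.utils import fit_pytorch

# Synthetic regression data: 32 samples, 3 features, single target
X = torch.rand(32, 3)
y = X.sum(dim=-1, keepdim=True) + 0.01 * torch.randn(32, 1)

model = nn.Sequential(nn.Linear(3, 16), nn.PReLU(), nn.Linear(16, 1))

# Training stops early once the average parameter change stays below the
# internal tolerance for several consecutive epochs, or after max_iter epochs.
fit_pytorch(model, X, y, verbose=True, max_iter=2000)

with torch.no_grad():
    print(model(X[:5]))
```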